rodrigomasini committed on
Commit
32333b3
·
verified ·
1 Parent(s): 17a088f

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +49 -28
mdr_pdf_parser.py CHANGED
@@ -1443,7 +1443,7 @@ class MDROcrEngine:
1443
  def _get_system(self) -> _MDR_TextSystem | None:
1444
  if self._text_system is None:
1445
  paths = {k: str(Path(self._model_dir)/Path(*p)) for k,p in _MDR_OCR_MODELS.items()}
1446
- self._onnx_params = _MDR_ONNXParams(use_gpu=(self._device=="cuda"), det_model_dir=paths["det"], cls_model_dir=paths["cls"], rec_model_dir=paths["rec"], rec_char_dict_path=paths["keys"])
1447
  try: self._text_system = _MDR_TextSystem(self._onnx_params); print(f"MDR OCR System initialized.")
1448
  except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
1449
  return self._text_system
@@ -1688,7 +1688,7 @@ class MDRLatexExtractor:
1688
  if not cp.exists():
1689
  print(f"Warn: MDR LaTeX config not found {self._model_path}")
1690
  try:
1691
- args = Munch({"config": str(cp), "checkpoint": str(wp), "device": self._device, "no_cuda": self._device == "cpu", "no_resize": False, "temperature": 0.0})
1692
  self._model = LatexOCR(args)
1693
  print(f"MDR LaTeX loaded on {self._device}.")
1694
  except Exception as e:
@@ -1712,7 +1712,7 @@ class MDRTableParser:
1712
  def __init__(self, device: Literal["cpu", "cuda"], model_path: str):
1713
  self._model: Any | None = None; self._model_path = mdr_ensure_directory(model_path)
1714
  self._device = device if torch.cuda.is_available() and device=="cuda" else "cpu"
1715
- self._disabled = self._device == "cpu"
1716
  if self._disabled: print("Warning: MDR Table parsing requires CUDA. Disabled.")
1717
 
1718
  def parse_table_image(self, image: Image, format: MDRTableLayoutParsedFormat) -> str | None:
@@ -1950,10 +1950,21 @@ class MDRExtractionEngine:
1950
  try:
1951
  raw_layouts = list(self._run_yolo_detection(optimizer.image, yolo))
1952
  print(f" Engine: {len(raw_layouts)} raw layouts found.")
1953
- except Exception as e:
1954
- print(f" Engine: YOLO error: {e}")
 
1955
  print(" Engine: Matching fragments...")
1956
  layouts = self._match_fragments_to_layouts(frags, raw_layouts)
 
 
 
 
 
 
 
 
 
 
1957
  print(" Engine: Removing overlaps...")
1958
  layouts = mdr_remove_overlap_layouts(layouts)
1959
  print(f" Engine: {len(layouts)} layouts after overlap removal.")
@@ -1974,27 +1985,37 @@ class MDRExtractionEngine:
1974
  print(" Engine: Analysis complete.")
1975
  return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image, adjusted_image=optimizer.adjusted_image)
1976
 
1977
- def _run_yolo_detection(self, img: Image, yolo: YOLOv10) -> Generator[MDRLayoutElement, None, None]:
1978
- img_rgb = img.convert("RGB")
1979
- res = yolo.predict(source=img_rgb, imgsz=1024, conf=0.2, device=self._device, verbose=False)
1980
- if not res or not hasattr(res[0], 'boxes') or res[0].boxes is None:
1981
- return
1982
- boxes = res[0].boxes
1983
- for cls_id_t, xyxy_t in zip(boxes.cls, boxes.xyxy):
1984
- cls_id = int(cls_id_t.item())
1985
- try:
1986
- cls = MDRLayoutClass(cls_id)
1987
- except ValueError:
1988
- continue
1989
- x1, y1, x2, y2 = [c.item() for c in xyxy_t]
1990
- rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
1991
- if rect.is_valid and rect.area > 10:
1992
- if cls == MDRLayoutClass.TABLE:
1993
- yield MDRTableLayoutElement(cls=cls, rect=rect, fragments=[], parsed=None)
1994
- elif cls == MDRLayoutClass.ISOLATE_FORMULA:
1995
- yield MDRFormulaLayoutElement(cls=cls, rect=rect, fragments=[], latex=None)
1996
- elif cls in MDRPlainLayoutElement.__annotations__['cls'].__args__:
1997
- yield MDRPlainLayoutElement(cls=cls, rect=rect, fragments=[])
 
 
 
 
 
 
 
 
 
 
1998
 
1999
  def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[MDRLayoutElement]:
2000
  if not frags or not layouts:
@@ -2540,7 +2561,7 @@ if __name__ == '__main__':
2540
  MDR_DEBUG_DIRECTORY = "./mdr_debug_output"
2541
 
2542
  # Specify device ('cuda' or 'cpu').
2543
- MDR_DEVICE = "cpu"
2544
 
2545
  # Specify desired table format
2546
  MDR_TABLE_FORMAT = MDRExtractedTableFormat.MARKDOWN
@@ -2637,4 +2658,4 @@ if __name__ == '__main__':
2637
  print(f"\nFATAL ERROR during processing: {e}")
2638
  import traceback
2639
  traceback.print_exc()
2640
- exit(1)
 
1443
  def _get_system(self) -> _MDR_TextSystem | None:
1444
  if self._text_system is None:
1445
  paths = {k: str(Path(self._model_dir)/Path(*p)) for k,p in _MDR_OCR_MODELS.items()}
1446
+ self._onnx_params = _MDR_ONNXParams(use_gpu=(self._device=="cpu"), det_model_dir=paths["det"], cls_model_dir=paths["cls"], rec_model_dir=paths["rec"], rec_char_dict_path=paths["keys"])
1447
  try: self._text_system = _MDR_TextSystem(self._onnx_params); print(f"MDR OCR System initialized.")
1448
  except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
1449
  return self._text_system
 
1688
  if not cp.exists():
1689
  print(f"Warn: MDR LaTeX config not found {self._model_path}")
1690
  try:
1691
+ args = Munch({"config": str(cp), "checkpoint": str(wp), "device": self._device, "no_cuda": self._device == "cuda", "no_resize": False, "temperature": 0.0})
1692
  self._model = LatexOCR(args)
1693
  print(f"MDR LaTeX loaded on {self._device}.")
1694
  except Exception as e:
 
1712
  def __init__(self, device: Literal["cpu", "cuda"], model_path: str):
1713
  self._model: Any | None = None; self._model_path = mdr_ensure_directory(model_path)
1714
  self._device = device if torch.cuda.is_available() and device=="cuda" else "cpu"
1715
+ self._disabled = self._device == "cuda"
1716
  if self._disabled: print("Warning: MDR Table parsing requires CUDA. Disabled.")
1717
 
1718
  def parse_table_image(self, image: Image, format: MDRTableLayoutParsedFormat) -> str | None:
 
1950
  try:
1951
  raw_layouts = list(self._run_yolo_detection(optimizer.image, yolo))
1952
  print(f" Engine: {len(raw_layouts)} raw layouts found.")
1953
+ except Exception:
1954
+ import traceback, sys
1955
+ traceback.print_exc(file=sys.stderr)
1956
  print(" Engine: Matching fragments...")
1957
  layouts = self._match_fragments_to_layouts(frags, raw_layouts)
1958
+ if not layouts and frags:
1959
+ # treat the whole page as one plain-text layout
1960
+ page_rect = MDRRectangle(
1961
+ lt=(0, 0), rt=(optimizer.image.width, 0),
1962
+ lb=(0, optimizer.image.height), rb=(optimizer.image.width, optimizer.image.height)
1963
+ )
1964
+ dummy = MDRPlainLayoutElement(
1965
+ cls=MDRLayoutClass.PLAIN_TEXT, rect=page_rect, fragments=frags.copy()
1966
+ )
1967
+ layouts.append(dummy)
1968
  print(" Engine: Removing overlaps...")
1969
  layouts = mdr_remove_overlap_layouts(layouts)
1970
  print(f" Engine: {len(layouts)} layouts after overlap removal.")
 
1985
  print(" Engine: Analysis complete.")
1986
  return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image, adjusted_image=optimizer.adjusted_image)
1987
 
1988
def _run_yolo_detection(self, img: Image, yolo: YOLOv10) -> Generator[MDRLayoutElement, None, None]:
    """Run YOLO layout detection on ``img`` and yield layout elements.

    The image is converted to RGB and passed to ``yolo.predict``. Each
    detected box is mapped to a layout element by its class:

    * ``TABLE`` -> ``MDRTableLayoutElement``
    * ``ISOLATE_FORMULA`` -> ``MDRFormulaLayoutElement``
    * title / plain text / abandon / caption / footnote classes ->
      ``MDRPlainLayoutElement``

    Boxes of any other class, boxes whose class id is not a member of
    ``MDRLayoutClass``, and boxes with an area below 10 yield nothing.

    Args:
        img: Page image to analyse.
        yolo: Detection model exposing ``predict``.

    Yields:
        One layout element per accepted detection, with empty ``fragments``.
    """
    img_rgb = img.convert("RGB")
    res = yolo.predict(source=img_rgb, imgsz=1024, conf=0.20,
                       device=self._device, verbose=False)

    if not res or not res[0].boxes:
        return
    boxes = res[0].boxes

    plain_classes: set[MDRLayoutClass] = {
        MDRLayoutClass.TITLE,
        MDRLayoutClass.PLAIN_TEXT,
        MDRLayoutClass.ABANDON,
        MDRLayoutClass.FIGURE_CAPTION,
        MDRLayoutClass.TABLE_CAPTION,
        MDRLayoutClass.TABLE_FOOTNOTE,
        MDRLayoutClass.FORMULA_CAPTION,
    }

    for cls_id_t, xyxy_t in zip(boxes.cls, boxes.xyxy):
        try:
            cls = MDRLayoutClass(int(cls_id_t))
        except ValueError:
            # A class id outside MDRLayoutClass must only drop this box;
            # letting the error escape would discard every detection on
            # the page.
            continue

        x1, y1, x2, y2 = map(float, xyxy_t)
        rect = MDRRectangle((x1, y1), (x2, y1), (x1, y2), (x2, y2))
        if rect.area < 10:
            continue

        if cls == MDRLayoutClass.TABLE:
            yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None)
        elif cls == MDRLayoutClass.ISOLATE_FORMULA:
            yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None)
        elif cls in plain_classes:
            yield MDRPlainLayoutElement(cls=cls, rect=rect, fragments=[])
2019
 
2020
  def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[MDRLayoutElement]:
2021
  if not frags or not layouts:
 
2561
  MDR_DEBUG_DIRECTORY = "./mdr_debug_output"
2562
 
2563
  # Specify device ('cuda' or 'cpu').
2564
+ MDR_DEVICE = "cuda"
2565
 
2566
  # Specify desired table format
2567
  MDR_TABLE_FORMAT = MDRExtractedTableFormat.MARKDOWN
 
2658
  print(f"\nFATAL ERROR during processing: {e}")
2659
  import traceback
2660
  traceback.print_exc()
2661
+ exit(1)