Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +49 -28
mdr_pdf_parser.py
CHANGED
@@ -1443,7 +1443,7 @@ class MDROcrEngine:
|
|
1443 |
def _get_system(self) -> _MDR_TextSystem | None:
|
1444 |
if self._text_system is None:
|
1445 |
paths = {k: str(Path(self._model_dir)/Path(*p)) for k,p in _MDR_OCR_MODELS.items()}
|
1446 |
-
self._onnx_params = _MDR_ONNXParams(use_gpu=(self._device=="
|
1447 |
try: self._text_system = _MDR_TextSystem(self._onnx_params); print(f"MDR OCR System initialized.")
|
1448 |
except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
|
1449 |
return self._text_system
|
@@ -1688,7 +1688,7 @@ class MDRLatexExtractor:
|
|
1688 |
if not cp.exists():
|
1689 |
print(f"Warn: MDR LaTeX config not found {self._model_path}")
|
1690 |
try:
|
1691 |
-
args = Munch({"config": str(cp), "checkpoint": str(wp), "device": self._device, "no_cuda": self._device == "
|
1692 |
self._model = LatexOCR(args)
|
1693 |
print(f"MDR LaTeX loaded on {self._device}.")
|
1694 |
except Exception as e:
|
@@ -1712,7 +1712,7 @@ class MDRTableParser:
|
|
1712 |
def __init__(self, device: Literal["cpu", "cuda"], model_path: str):
|
1713 |
self._model: Any | None = None; self._model_path = mdr_ensure_directory(model_path)
|
1714 |
self._device = device if torch.cuda.is_available() and device=="cuda" else "cpu"
|
1715 |
-
self._disabled = self._device == "
|
1716 |
if self._disabled: print("Warning: MDR Table parsing requires CUDA. Disabled.")
|
1717 |
|
1718 |
def parse_table_image(self, image: Image, format: MDRTableLayoutParsedFormat) -> str | None:
|
@@ -1950,10 +1950,21 @@ class MDRExtractionEngine:
|
|
1950 |
try:
|
1951 |
raw_layouts = list(self._run_yolo_detection(optimizer.image, yolo))
|
1952 |
print(f" Engine: {len(raw_layouts)} raw layouts found.")
|
1953 |
-
except Exception
|
1954 |
-
|
|
|
1955 |
print(" Engine: Matching fragments...")
|
1956 |
layouts = self._match_fragments_to_layouts(frags, raw_layouts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1957 |
print(" Engine: Removing overlaps...")
|
1958 |
layouts = mdr_remove_overlap_layouts(layouts)
|
1959 |
print(f" Engine: {len(layouts)} layouts after overlap removal.")
|
@@ -1974,27 +1985,37 @@ class MDRExtractionEngine:
|
|
1974 |
print(" Engine: Analysis complete.")
|
1975 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image, adjusted_image=optimizer.adjusted_image)
|
1976 |
|
1977 |
-
def _run_yolo_detection(self, img: Image, yolo: YOLOv10)
|
1978 |
-
|
1979 |
-
|
1980 |
-
|
1981 |
-
|
1982 |
-
|
1983 |
-
|
1984 |
-
|
1985 |
-
|
1986 |
-
|
1987 |
-
|
1988 |
-
|
1989 |
-
|
1990 |
-
|
1991 |
-
|
1992 |
-
|
1993 |
-
|
1994 |
-
|
1995 |
-
|
1996 |
-
|
1997 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1998 |
|
1999 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[MDRLayoutElement]:
|
2000 |
if not frags or not layouts:
|
@@ -2540,7 +2561,7 @@ if __name__ == '__main__':
|
|
2540 |
MDR_DEBUG_DIRECTORY = "./mdr_debug_output"
|
2541 |
|
2542 |
# Specify device ('cuda' or 'cpu').
|
2543 |
-
MDR_DEVICE = "
|
2544 |
|
2545 |
# Specify desired table format
|
2546 |
MDR_TABLE_FORMAT = MDRExtractedTableFormat.MARKDOWN
|
@@ -2637,4 +2658,4 @@ if __name__ == '__main__':
|
|
2637 |
print(f"\nFATAL ERROR during processing: {e}")
|
2638 |
import traceback
|
2639 |
traceback.print_exc()
|
2640 |
-
exit(1)
|
|
|
1443 |
def _get_system(self) -> _MDR_TextSystem | None:
    """Return the cached OCR text system, building it on first use.

    Model file locations are resolved under ``self._model_dir`` from the
    entries of ``_MDR_OCR_MODELS`` and stored, together with the GPU flag,
    in ``self._onnx_params``.

    Returns:
        The ``_MDR_TextSystem`` instance, or ``None`` when construction
        raised. In that case the error is printed and, because the cache
        stays ``None``, the next call tries again.
    """
    if self._text_system is None:
        paths = {
            key: str(Path(self._model_dir) / Path(*parts))
            for key, parts in _MDR_OCR_MODELS.items()
        }
        self._onnx_params = _MDR_ONNXParams(
            # GPU inference must be requested for the CUDA device only; the
            # earlier comparison against "cpu" inverted this flag.
            use_gpu=(self._device == "cuda"),
            det_model_dir=paths["det"],
            cls_model_dir=paths["cls"],
            rec_model_dir=paths["rec"],
            rec_char_dict_path=paths["keys"],
        )
        try:
            self._text_system = _MDR_TextSystem(self._onnx_params)
            print("MDR OCR System initialized.")
        except Exception as e:
            # Best-effort: report the failure and leave the system unset so
            # callers can detect it through the None return value.
            print(f"ERROR initializing MDR OCR System: {e}")
            self._text_system = None
    return self._text_system
|
|
|
1688 |
if not cp.exists():
|
1689 |
print(f"Warn: MDR LaTeX config not found {self._model_path}")
|
1690 |
try:
|
1691 |
+
args = Munch({"config": str(cp), "checkpoint": str(wp), "device": self._device, "no_cuda": self._device == "cuda", "no_resize": False, "temperature": 0.0})
|
1692 |
self._model = LatexOCR(args)
|
1693 |
print(f"MDR LaTeX loaded on {self._device}.")
|
1694 |
except Exception as e:
|
|
|
1712 |
def __init__(self, device: Literal["cpu", "cuda"], model_path: str):
    """Record the device and model directory; no model is loaded here.

    Args:
        device: Requested device. It is honoured only when it is "cuda"
            and ``torch.cuda.is_available()`` is true; otherwise the
            parser falls back to "cpu".
        model_path: Model directory, passed through
            ``mdr_ensure_directory`` before being stored.
    """
    self._model: Any | None = None
    self._model_path = mdr_ensure_directory(model_path)
    self._device = device if torch.cuda.is_available() and device == "cuda" else "cpu"
    # Table parsing requires CUDA (see the warning below), so it is the CPU
    # fallback that has to disable it. Comparing against "cuda" switched the
    # parser off on exactly the hosts that can run it.
    self._disabled = self._device == "cpu"
    if self._disabled:
        print("Warning: MDR Table parsing requires CUDA. Disabled.")
|
1717 |
|
1718 |
def parse_table_image(self, image: Image, format: MDRTableLayoutParsedFormat) -> str | None:
|
|
|
1950 |
try:
|
1951 |
raw_layouts = list(self._run_yolo_detection(optimizer.image, yolo))
|
1952 |
print(f" Engine: {len(raw_layouts)} raw layouts found.")
|
1953 |
+
except Exception:
|
1954 |
+
import traceback, sys
|
1955 |
+
traceback.print_exc(file=sys.stderr)
|
1956 |
print(" Engine: Matching fragments...")
|
1957 |
layouts = self._match_fragments_to_layouts(frags, raw_layouts)
|
1958 |
+
if not layouts and frags:
|
1959 |
+
# treat the whole page as one plain-text layout
|
1960 |
+
page_rect = MDRRectangle(
|
1961 |
+
lt=(0, 0), rt=(optimizer.image.width, 0),
|
1962 |
+
lb=(0, optimizer.image.height), rb=(optimizer.image.width, optimizer.image.height)
|
1963 |
+
)
|
1964 |
+
dummy = MDRPlainLayoutElement(
|
1965 |
+
cls=MDRLayoutClass.PLAIN_TEXT, rect=page_rect, fragments=frags.copy()
|
1966 |
+
)
|
1967 |
+
layouts.append(dummy)
|
1968 |
print(" Engine: Removing overlaps...")
|
1969 |
layouts = mdr_remove_overlap_layouts(layouts)
|
1970 |
print(f" Engine: {len(layouts)} layouts after overlap removal.")
|
|
|
1985 |
print(" Engine: Analysis complete.")
|
1986 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image, adjusted_image=optimizer.adjusted_image)
|
1987 |
|
1988 |
+
def _run_yolo_detection(self, img: Image, yolo: YOLOv10):
    """Yield one layout element per usable box that ``yolo`` detects in ``img``.

    The image is converted to RGB and passed to ``yolo.predict`` with
    ``imgsz=1024``, ``conf=0.20`` and this engine's device. Nothing is
    yielded when the prediction is empty or its first result has no boxes.
    Boxes whose rectangle area is below 10 are skipped, and so are boxes
    whose class is neither TABLE, ISOLATE_FORMULA, nor one of the
    text-like classes listed in the body.
    """
    predictions = yolo.predict(
        source=img.convert("RGB"),
        imgsz=1024,
        conf=0.20,
        device=self._device,
        verbose=False,
    )

    # Guard clauses: no result list, or a first result without any boxes.
    if not predictions:
        return
    boxes = predictions[0].boxes
    if not boxes:
        return

    # Classes that are emitted as plain (text-carrying) layout elements.
    text_like: set[MDRLayoutClass] = {
        MDRLayoutClass.TITLE,
        MDRLayoutClass.PLAIN_TEXT,
        MDRLayoutClass.ABANDON,
        MDRLayoutClass.FIGURE_CAPTION,
        MDRLayoutClass.TABLE_CAPTION,
        MDRLayoutClass.TABLE_FOOTNOTE,
        MDRLayoutClass.FORMULA_CAPTION,
    }

    for class_id, corners in zip(boxes.cls, boxes.xyxy):
        layout_cls = MDRLayoutClass(int(class_id))
        left, top, right, bottom = (float(value) for value in corners)
        # Corner order: left-top, right-top, left-bottom, right-bottom.
        box = MDRRectangle((left, top), (right, top), (left, bottom), (right, bottom))
        if box.area < 10:
            continue

        if layout_cls == MDRLayoutClass.TABLE:
            yield MDRTableLayoutElement(rect=box, fragments=[], parsed=None)
        elif layout_cls == MDRLayoutClass.ISOLATE_FORMULA:
            yield MDRFormulaLayoutElement(rect=box, fragments=[], latex=None)
        elif layout_cls in text_like:
            yield MDRPlainLayoutElement(cls=layout_cls, rect=box, fragments=[])
|
2019 |
|
2020 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[MDRLayoutElement]:
|
2021 |
if not frags or not layouts:
|
|
|
2561 |
MDR_DEBUG_DIRECTORY = "./mdr_debug_output"
|
2562 |
|
2563 |
# Specify device ('cuda' or 'cpu').
|
2564 |
+
MDR_DEVICE = "cuda"
|
2565 |
|
2566 |
# Specify desired table format
|
2567 |
MDR_TABLE_FORMAT = MDRExtractedTableFormat.MARKDOWN
|
|
|
2658 |
print(f"\nFATAL ERROR during processing: {e}")
|
2659 |
import traceback
|
2660 |
traceback.print_exc()
|
2661 |
+
exit(1)
|