Update mdr_pdf_parser.py
Browse files
- mdr_pdf_parser.py +47 -27
mdr_pdf_parser.py
CHANGED
@@ -47,6 +47,7 @@ from alphabet_detector import AlphabetDetector
|
|
47 |
from munch import Munch
|
48 |
from transformers import LayoutLMv3ForTokenClassification
|
49 |
import onnxruntime
|
|
|
50 |
# --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
|
51 |
from huggingface_hub import hf_hub_download
|
52 |
from huggingface_hub.errors import HfHubHTTPError
|
@@ -91,7 +92,6 @@ def mdr_download_model(url: str, file_path: Path):
|
|
91 |
if file_path.exists(): os.remove(file_path)
|
92 |
raise e
|
93 |
|
94 |
-
# --- MDR Utilities ---
|
95 |
def mdr_ensure_directory(path: str) -> str:
|
96 |
"""Ensures a directory exists, creating it if necessary."""
|
97 |
path = os.path.abspath(path)
|
@@ -113,7 +113,7 @@ def mdr_expand_image(image: Image, percent: float) -> Image:
|
|
113 |
else: fill = (255, 255, 255)
|
114 |
return pil_expand(image=image, border=(bw, bh), fill=fill)
|
115 |
|
116 |
-
# --- MDR Geometry
|
117 |
MDRPoint: TypeAlias = tuple[float, float]
|
118 |
@dataclass
|
119 |
class MDRRectangle:
|
@@ -181,17 +181,22 @@ class MDRBaseLayoutElement:
|
|
181 |
@dataclass
|
182 |
class MDRPlainLayoutElement(MDRBaseLayoutElement):
|
183 |
"""Layout element for plain text, titles, captions, figures, etc."""
|
184 |
-
|
|
|
185 |
|
186 |
@dataclass
|
187 |
class MDRTableLayoutElement(MDRBaseLayoutElement):
|
188 |
"""Layout element specifically for tables."""
|
189 |
-
parsed: tuple[str, MDRTableLayoutParsedFormat] | None
|
|
|
|
|
190 |
|
191 |
@dataclass
|
192 |
class MDRFormulaLayoutElement(MDRBaseLayoutElement):
|
193 |
"""Layout element specifically for formulas."""
|
194 |
-
latex: str | None
|
|
|
|
|
195 |
|
196 |
MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
|
197 |
|
@@ -218,24 +223,35 @@ class MDRTextSpan:
|
|
218 |
@dataclass
|
219 |
class MDRBasicBlock:
|
220 |
"""Base class for structured blocks extracted from the document."""
|
221 |
-
rect: MDRRectangle
|
|
|
|
|
222 |
|
223 |
@dataclass
|
224 |
class MDRTextBlock(MDRBasicBlock):
|
225 |
"""A structured block containing text content."""
|
226 |
-
kind: MDRTextKind
|
|
|
|
|
227 |
|
228 |
-
class MDRTableFormat(Enum):
|
|
|
|
|
|
|
|
|
229 |
|
230 |
@dataclass
|
231 |
class MDRTableBlock(MDRBasicBlock):
|
232 |
"""A structured block representing a table."""
|
233 |
-
content: str
|
|
|
|
|
234 |
|
235 |
@dataclass
|
236 |
class MDRFormulaBlock(MDRBasicBlock):
|
237 |
"""A structured block representing a formula."""
|
238 |
-
content: str | None
|
|
|
239 |
|
240 |
@dataclass
|
241 |
class MDRFigureBlock(MDRBasicBlock):
|
@@ -278,13 +294,20 @@ def mdr_contains_cjka(text: str):
|
|
278 |
return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
|
279 |
|
280 |
# --- MDR Text Processing ---
|
281 |
-
class _MDR_TokenPhase(Enum):
|
|
|
|
|
|
|
|
|
|
|
282 |
|
283 |
_mdr_alphabet_detector = AlphabetDetector()
|
284 |
|
285 |
def _mdr_is_letter(char: str):
|
286 |
-
if not category(char).startswith("L"):
|
287 |
-
|
|
|
|
|
288 |
except: return False
|
289 |
|
290 |
def mdr_split_into_words(text: str):
|
@@ -373,8 +396,10 @@ class MDRRotationAdjuster:
|
|
373 |
return x + self._n_off[0], y + self._n_off[1]
|
374 |
|
375 |
def mdr_normalize_vertical_rotation(rot: float) -> float:
|
376 |
-
while rot >= pi:
|
377 |
-
|
|
|
|
|
378 |
return rot
|
379 |
|
380 |
def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
|
@@ -452,11 +477,14 @@ class _MDR_PredictBase:
|
|
452 |
print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
|
453 |
raise e
|
454 |
|
455 |
-
def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
|
|
456 |
|
457 |
-
def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
|
|
458 |
|
459 |
-
def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
|
|
|
460 |
|
461 |
# --- MDR ONNX OCR Internals ---
|
462 |
class _MDR_NormalizeImage:
|
@@ -590,11 +618,9 @@ def mdr_ocr_transform(
|
|
590 |
) -> Optional[Any]:
|
591 |
"""
|
592 |
Applies a sequence of transformation operations to the input data.
|
593 |
-
|
594 |
This function iterates through a list of operations (callables) and
|
595 |
applies each one sequentially to the data. If any operation
|
596 |
returns None, the processing stops immediately, and None is returned.
|
597 |
-
|
598 |
Args:
|
599 |
data: The initial data to be transformed. Can be of any type
|
600 |
compatible with the operations.
|
@@ -603,7 +629,6 @@ def mdr_ocr_transform(
|
|
603 |
the transformed data or None to signal an early exit.
|
604 |
If None or an empty list is provided, the original data
|
605 |
is returned unchanged.
|
606 |
-
|
607 |
Returns:
|
608 |
The transformed data after applying all operations successfully,
|
609 |
or None if any operation in the sequence returned None.
|
@@ -2261,7 +2286,6 @@ class MagicPDFProcessor:
|
|
2261 |
def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
|
2262 |
"""
|
2263 |
Initializes the MagicPDFProcessor.
|
2264 |
-
|
2265 |
Args:
|
2266 |
device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Fallbacks to 'cpu' if CUDA not available.
|
2267 |
model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
|
@@ -2283,11 +2307,9 @@ class MagicPDFProcessor:
|
|
2283 |
def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
|
2284 |
"""
|
2285 |
Processes the entire PDF document and yields all extracted structured blocks.
|
2286 |
-
|
2287 |
Args:
|
2288 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
2289 |
report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
|
2290 |
-
|
2291 |
Yields:
|
2292 |
MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
|
2293 |
"""
|
@@ -2300,12 +2322,10 @@ class MagicPDFProcessor:
|
|
2300 |
"""
|
2301 |
Processes specific pages (or all if page_indexes is None) of the PDF document.
|
2302 |
Yields results page by page, including the page index, extracted blocks, and the original page image.
|
2303 |
-
|
2304 |
Args:
|
2305 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
2306 |
page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
|
2307 |
report_progress: Optional callback function for progress updates.
|
2308 |
-
|
2309 |
Yields:
|
2310 |
tuple[int, list[MDRStructuredBlock], Image]:
|
2311 |
- page_index (0-based)
|
@@ -2617,4 +2637,4 @@ if __name__ == '__main__':
|
|
2617 |
print(f"\nFATAL ERROR during processing: {e}")
|
2618 |
import traceback
|
2619 |
traceback.print_exc()
|
2620 |
-
exit(1)
|
|
|
47 |
from munch import Munch
|
48 |
from transformers import LayoutLMv3ForTokenClassification
|
49 |
import onnxruntime
|
50 |
+
from enum import auto, Enum
|
51 |
# --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
|
52 |
from huggingface_hub import hf_hub_download
|
53 |
from huggingface_hub.errors import HfHubHTTPError
|
|
|
92 |
if file_path.exists(): os.remove(file_path)
|
93 |
raise e
|
94 |
|
|
|
95 |
def mdr_ensure_directory(path: str) -> str:
|
96 |
"""Ensures a directory exists, creating it if necessary."""
|
97 |
path = os.path.abspath(path)
|
|
|
113 |
else: fill = (255, 255, 255)
|
114 |
return pil_expand(image=image, border=(bw, bh), fill=fill)
|
115 |
|
116 |
+
# --- MDR Geometry ---
|
117 |
MDRPoint: TypeAlias = tuple[float, float]
|
118 |
@dataclass
|
119 |
class MDRRectangle:
|
|
|
181 |
@dataclass
|
182 |
class MDRPlainLayoutElement(MDRBaseLayoutElement):
|
183 |
"""Layout element for plain text, titles, captions, figures, etc."""
|
184 |
+
# MODIFIED: Replaced Literal[...] with the Enum class name
|
185 |
+
cls: MDRLayoutClass # The type hint is now the Enum class itself
|
186 |
|
187 |
@dataclass
|
188 |
class MDRTableLayoutElement(MDRBaseLayoutElement):
|
189 |
"""Layout element specifically for tables."""
|
190 |
+
parsed: tuple[str, MDRTableLayoutParsedFormat] | None
|
191 |
+
# MODIFIED: Replaced Literal[EnumMember] with the Enum class name
|
192 |
+
cls: MDRLayoutClass = MDRLayoutClass.TABLE # Hint with Enum, assign default member
|
193 |
|
194 |
@dataclass
|
195 |
class MDRFormulaLayoutElement(MDRBaseLayoutElement):
|
196 |
"""Layout element specifically for formulas."""
|
197 |
+
latex: str | None
|
198 |
+
# MODIFIED: Replaced Literal[EnumMember] with the Enum class name
|
199 |
+
cls: MDRLayoutClass = MDRLayoutClass.ISOLATE_FORMULA # Hint with Enum, assign default member
|
200 |
|
201 |
MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
|
202 |
|
|
|
223 |
@dataclass
|
224 |
class MDRBasicBlock:
|
225 |
"""Base class for structured blocks extracted from the document."""
|
226 |
+
rect: MDRRectangle
|
227 |
+
texts: list[MDRTextSpan]
|
228 |
+
font_size: float # Relative font size (0-1)
|
229 |
|
230 |
@dataclass
|
231 |
class MDRTextBlock(MDRBasicBlock):
|
232 |
"""A structured block containing text content."""
|
233 |
+
kind: MDRTextKind
|
234 |
+
has_paragraph_indentation: bool = False
|
235 |
+
last_line_touch_end: bool = False
|
236 |
|
237 |
+
class MDRTableFormat(Enum):
|
238 |
+
LATEX=auto()
|
239 |
+
MARKDOWN=auto()
|
240 |
+
HTML=auto()
|
241 |
+
UNRECOGNIZABLE=auto()
|
242 |
|
243 |
@dataclass
|
244 |
class MDRTableBlock(MDRBasicBlock):
|
245 |
"""A structured block representing a table."""
|
246 |
+
content: str
|
247 |
+
format: MDRTableFormat
|
248 |
+
image: Image # Image clip of the table
|
249 |
|
250 |
@dataclass
|
251 |
class MDRFormulaBlock(MDRBasicBlock):
|
252 |
"""A structured block representing a formula."""
|
253 |
+
content: str | None
|
254 |
+
image: Image # Image clip of the formula
|
255 |
|
256 |
@dataclass
|
257 |
class MDRFigureBlock(MDRBasicBlock):
|
|
|
294 |
return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
|
295 |
|
296 |
# --- MDR Text Processing ---
|
297 |
+
class _MDR_TokenPhase(Enum):
|
298 |
+
Init=0
|
299 |
+
Letter=1
|
300 |
+
Character=2
|
301 |
+
Number=3
|
302 |
+
Space=4
|
303 |
|
304 |
_mdr_alphabet_detector = AlphabetDetector()
|
305 |
|
306 |
def _mdr_is_letter(char: str):
|
307 |
+
if not category(char).startswith("L"):
|
308 |
+
return False
|
309 |
+
try:
|
310 |
+
return _mdr_alphabet_detector.is_latin(char) or _mdr_alphabet_detector.is_cyrillic(char) or _mdr_alphabet_detector.is_greek(char) or _mdr_alphabet_detector.is_hebrew(char)
|
311 |
except: return False
|
312 |
|
313 |
def mdr_split_into_words(text: str):
|
|
|
396 |
return x + self._n_off[0], y + self._n_off[1]
|
397 |
|
398 |
def mdr_normalize_vertical_rotation(rot: float) -> float:
|
399 |
+
while rot >= pi:
|
400 |
+
rot -= pi
|
401 |
+
while rot < 0:
|
402 |
+
rot += pi
|
403 |
return rot
|
404 |
|
405 |
def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
|
|
|
477 |
print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
|
478 |
raise e
|
479 |
|
480 |
+
def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
481 |
+
return [n.name for n in sess.get_outputs()]
|
482 |
|
483 |
+
def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
484 |
+
return [n.name for n in sess.get_inputs()]
|
485 |
|
486 |
+
def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
|
487 |
+
return {name: img_np for name in names}
|
488 |
|
489 |
# --- MDR ONNX OCR Internals ---
|
490 |
class _MDR_NormalizeImage:
|
|
|
618 |
) -> Optional[Any]:
|
619 |
"""
|
620 |
Applies a sequence of transformation operations to the input data.
|
|
|
621 |
This function iterates through a list of operations (callables) and
|
622 |
applies each one sequentially to the data. If any operation
|
623 |
returns None, the processing stops immediately, and None is returned.
|
|
|
624 |
Args:
|
625 |
data: The initial data to be transformed. Can be of any type
|
626 |
compatible with the operations.
|
|
|
629 |
the transformed data or None to signal an early exit.
|
630 |
If None or an empty list is provided, the original data
|
631 |
is returned unchanged.
|
|
|
632 |
Returns:
|
633 |
The transformed data after applying all operations successfully,
|
634 |
or None if any operation in the sequence returned None.
|
|
|
2286 |
def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
|
2287 |
"""
|
2288 |
Initializes the MagicPDFProcessor.
|
|
|
2289 |
Args:
|
2290 |
device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Fallbacks to 'cpu' if CUDA not available.
|
2291 |
model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
|
|
|
2307 |
def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
|
2308 |
"""
|
2309 |
Processes the entire PDF document and yields all extracted structured blocks.
|
|
|
2310 |
Args:
|
2311 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
2312 |
report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
|
|
|
2313 |
Yields:
|
2314 |
MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
|
2315 |
"""
|
|
|
2322 |
"""
|
2323 |
Processes specific pages (or all if page_indexes is None) of the PDF document.
|
2324 |
Yields results page by page, including the page index, extracted blocks, and the original page image.
|
|
|
2325 |
Args:
|
2326 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
2327 |
page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
|
2328 |
report_progress: Optional callback function for progress updates.
|
|
|
2329 |
Yields:
|
2330 |
tuple[int, list[MDRStructuredBlock], Image]:
|
2331 |
- page_index (0-based)
|
|
|
2637 |
print(f"\nFATAL ERROR during processing: {e}")
|
2638 |
import traceback
|
2639 |
traceback.print_exc()
|
2640 |
+
exit(1)
|