rodrigomasini committed on
Commit
a65e05e
·
verified ·
1 Parent(s): c79a3f8

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +47 -27
mdr_pdf_parser.py CHANGED
@@ -47,6 +47,7 @@ from alphabet_detector import AlphabetDetector
47
  from munch import Munch
48
  from transformers import LayoutLMv3ForTokenClassification
49
  import onnxruntime
 
50
  # --- HUGGING FACE HUB IMPORT: ONLY NEEDED BECAUSE THIS RUNS IN SPACES; NOT NECESSARY IN PROD ---
51
  from huggingface_hub import hf_hub_download
52
  from huggingface_hub.errors import HfHubHTTPError
@@ -91,7 +92,6 @@ def mdr_download_model(url: str, file_path: Path):
91
  if file_path.exists(): os.remove(file_path)
92
  raise e
93
 
94
- # --- MDR Utilities ---
95
  def mdr_ensure_directory(path: str) -> str:
96
  """Ensures a directory exists, creating it if necessary."""
97
  path = os.path.abspath(path)
@@ -113,7 +113,7 @@ def mdr_expand_image(image: Image, percent: float) -> Image:
113
  else: fill = (255, 255, 255)
114
  return pil_expand(image=image, border=(bw, bh), fill=fill)
115
 
116
- # --- MDR Geometry (rectangle.py) ---
117
  MDRPoint: TypeAlias = tuple[float, float]
118
  @dataclass
119
  class MDRRectangle:
@@ -181,17 +181,22 @@ class MDRBaseLayoutElement:
181
  @dataclass
182
  class MDRPlainLayoutElement(MDRBaseLayoutElement):
183
  """Layout element for plain text, titles, captions, figures, etc."""
184
- cls: Literal[MDRLayoutClass.TITLE, MDRLayoutClass.PLAIN_TEXT, MDRLayoutClass.ABANDON, MDRLayoutClass.FIGURE, MDRLayoutClass.FIGURE_CAPTION, MDRLayoutClass.TABLE_CAPTION, MDRLayoutClass.TABLE_FOOTNOTE, MDRLayoutClass.FORMULA_CAPTION]
 
185
 
186
  @dataclass
187
  class MDRTableLayoutElement(MDRBaseLayoutElement):
188
  """Layout element specifically for tables."""
189
- parsed: tuple[str, MDRTableLayoutParsedFormat] | None; cls: Literal[MDRLayoutClass.TABLE] = MDRLayoutClass.TABLE
 
 
190
 
191
  @dataclass
192
  class MDRFormulaLayoutElement(MDRBaseLayoutElement):
193
  """Layout element specifically for formulas."""
194
- latex: str | None; cls: Literal[MDRLayoutClass.ISOLATE_FORMULA] = MDRLayoutClass.ISOLATE_FORMULA
 
 
195
 
196
  MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
197
 
@@ -218,24 +223,35 @@ class MDRTextSpan:
218
  @dataclass
219
  class MDRBasicBlock:
220
  """Base class for structured blocks extracted from the document."""
221
- rect: MDRRectangle; texts: list[MDRTextSpan]; font_size: float # Relative font size (0-1)
 
 
222
 
223
  @dataclass
224
  class MDRTextBlock(MDRBasicBlock):
225
  """A structured block containing text content."""
226
- kind: MDRTextKind; has_paragraph_indentation: bool = False; last_line_touch_end: bool = False
 
 
227
 
228
- class MDRTableFormat(Enum): LATEX=auto(); MARKDOWN=auto(); HTML=auto(); UNRECOGNIZABLE=auto()
 
 
 
 
229
 
230
  @dataclass
231
  class MDRTableBlock(MDRBasicBlock):
232
  """A structured block representing a table."""
233
- content: str; format: MDRTableFormat; image: Image # Image clip of the table
 
 
234
 
235
  @dataclass
236
  class MDRFormulaBlock(MDRBasicBlock):
237
  """A structured block representing a formula."""
238
- content: str | None; image: Image # Image clip of the formula
 
239
 
240
  @dataclass
241
  class MDRFigureBlock(MDRBasicBlock):
@@ -278,13 +294,20 @@ def mdr_contains_cjka(text: str):
278
  return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
279
 
280
  # --- MDR Text Processing ---
281
- class _MDR_TokenPhase(Enum): Init=0; Letter=1; Character=2; Number=3; Space=4
 
 
 
 
 
282
 
283
  _mdr_alphabet_detector = AlphabetDetector()
284
 
285
  def _mdr_is_letter(char: str):
286
- if not category(char).startswith("L"): return False
287
- try: return _mdr_alphabet_detector.is_latin(char) or _mdr_alphabet_detector.is_cyrillic(char) or _mdr_alphabet_detector.is_greek(char) or _mdr_alphabet_detector.is_hebrew(char)
 
 
288
  except: return False
289
 
290
  def mdr_split_into_words(text: str):
@@ -373,8 +396,10 @@ class MDRRotationAdjuster:
373
  return x + self._n_off[0], y + self._n_off[1]
374
 
375
  def mdr_normalize_vertical_rotation(rot: float) -> float:
376
- while rot >= pi: rot -= pi;
377
- while rot < 0: rot += pi;
 
 
378
  return rot
379
 
380
  def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
@@ -452,11 +477,14 @@ class _MDR_PredictBase:
452
  print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
453
  raise e
454
 
455
- def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]: return [n.name for n in sess.get_outputs()]
 
456
 
457
- def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]: return [n.name for n in sess.get_inputs()]
 
458
 
459
- def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]: return {name: img_np for name in names}
 
460
 
461
  # --- MDR ONNX OCR Internals ---
462
  class _MDR_NormalizeImage:
@@ -590,11 +618,9 @@ def mdr_ocr_transform(
590
  ) -> Optional[Any]:
591
  """
592
  Applies a sequence of transformation operations to the input data.
593
-
594
  This function iterates through a list of operations (callables) and
595
  applies each one sequentially to the data. If any operation
596
  returns None, the processing stops immediately, and None is returned.
597
-
598
  Args:
599
  data: The initial data to be transformed. Can be of any type
600
  compatible with the operations.
@@ -603,7 +629,6 @@ def mdr_ocr_transform(
603
  the transformed data or None to signal an early exit.
604
  If None or an empty list is provided, the original data
605
  is returned unchanged.
606
-
607
  Returns:
608
  The transformed data after applying all operations successfully,
609
  or None if any operation in the sequence returned None.
@@ -2261,7 +2286,6 @@ class MagicPDFProcessor:
2261
  def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
2262
  """
2263
  Initializes the MagicPDFProcessor.
2264
-
2265
  Args:
2266
  device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Falls back to 'cpu' if CUDA is not available.
2267
  model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
@@ -2283,11 +2307,9 @@ class MagicPDFProcessor:
2283
  def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
2284
  """
2285
  Processes the entire PDF document and yields all extracted structured blocks.
2286
-
2287
  Args:
2288
  pdf_input: Path to the PDF file or a loaded fitz.Document object.
2289
  report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
2290
-
2291
  Yields:
2292
  MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
2293
  """
@@ -2300,12 +2322,10 @@ class MagicPDFProcessor:
2300
  """
2301
  Processes specific pages (or all if page_indexes is None) of the PDF document.
2302
  Yields results page by page, including the page index, extracted blocks, and the original page image.
2303
-
2304
  Args:
2305
  pdf_input: Path to the PDF file or a loaded fitz.Document object.
2306
  page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
2307
  report_progress: Optional callback function for progress updates.
2308
-
2309
  Yields:
2310
  tuple[int, list[MDRStructuredBlock], Image]:
2311
  - page_index (0-based)
@@ -2617,4 +2637,4 @@ if __name__ == '__main__':
2617
  print(f"\nFATAL ERROR during processing: {e}")
2618
  import traceback
2619
  traceback.print_exc()
2620
- exit(1)
 
47
  from munch import Munch
48
  from transformers import LayoutLMv3ForTokenClassification
49
  import onnxruntime
50
+ from enum import auto, Enum
51
  # --- HUGGING FACE HUB IMPORT: ONLY NEEDED BECAUSE THIS RUNS IN SPACES; NOT NECESSARY IN PROD ---
52
  from huggingface_hub import hf_hub_download
53
  from huggingface_hub.errors import HfHubHTTPError
 
92
  if file_path.exists(): os.remove(file_path)
93
  raise e
94
 
 
95
  def mdr_ensure_directory(path: str) -> str:
96
  """Ensures a directory exists, creating it if necessary."""
97
  path = os.path.abspath(path)
 
113
  else: fill = (255, 255, 255)
114
  return pil_expand(image=image, border=(bw, bh), fill=fill)
115
 
116
+ # --- MDR Geometry ---
117
  MDRPoint: TypeAlias = tuple[float, float]
118
  @dataclass
119
  class MDRRectangle:
 
181
  @dataclass
182
  class MDRPlainLayoutElement(MDRBaseLayoutElement):
183
  """Layout element for plain text, titles, captions, figures, etc."""
184
+ # MODIFIED: Replaced Literal[...] with the Enum class name
185
+ cls: MDRLayoutClass # The type hint is now the Enum class itself
186
 
187
  @dataclass
188
  class MDRTableLayoutElement(MDRBaseLayoutElement):
189
  """Layout element specifically for tables."""
190
+ parsed: tuple[str, MDRTableLayoutParsedFormat] | None
191
+ # MODIFIED: Replaced Literal[EnumMember] with the Enum class name
192
+ cls: MDRLayoutClass = MDRLayoutClass.TABLE # Hint with Enum, assign default member
193
 
194
  @dataclass
195
  class MDRFormulaLayoutElement(MDRBaseLayoutElement):
196
  """Layout element specifically for formulas."""
197
+ latex: str | None
198
+ # MODIFIED: Replaced Literal[EnumMember] with the Enum class name
199
+ cls: MDRLayoutClass = MDRLayoutClass.ISOLATE_FORMULA # Hint with Enum, assign default member
200
 
201
  MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
202
 
 
223
  @dataclass
224
  class MDRBasicBlock:
225
  """Base class for structured blocks extracted from the document."""
226
+ rect: MDRRectangle
227
+ texts: list[MDRTextSpan]
228
+ font_size: float # Relative font size (0-1)
229
 
230
  @dataclass
231
  class MDRTextBlock(MDRBasicBlock):
232
  """A structured block containing text content."""
233
+ kind: MDRTextKind
234
+ has_paragraph_indentation: bool = False
235
+ last_line_touch_end: bool = False
236
 
237
+ class MDRTableFormat(Enum):
238
+ LATEX=auto()
239
+ MARKDOWN=auto()
240
+ HTML=auto()
241
+ UNRECOGNIZABLE=auto()
242
 
243
  @dataclass
244
  class MDRTableBlock(MDRBasicBlock):
245
  """A structured block representing a table."""
246
+ content: str
247
+ format: MDRTableFormat
248
+ image: Image # Image clip of the table
249
 
250
  @dataclass
251
  class MDRFormulaBlock(MDRBasicBlock):
252
  """A structured block representing a formula."""
253
+ content: str | None
254
+ image: Image # Image clip of the formula
255
 
256
  @dataclass
257
  class MDRFigureBlock(MDRBasicBlock):
 
294
  return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
295
 
296
  # --- MDR Text Processing ---
297
+ class _MDR_TokenPhase(Enum):
298
+ Init=0
299
+ Letter=1
300
+ Character=2
301
+ Number=3
302
+ Space=4
303
 
304
  _mdr_alphabet_detector = AlphabetDetector()
305
 
306
  def _mdr_is_letter(char: str):
307
+ if not category(char).startswith("L"):
308
+ return False
309
+ try:
310
+ return _mdr_alphabet_detector.is_latin(char) or _mdr_alphabet_detector.is_cyrillic(char) or _mdr_alphabet_detector.is_greek(char) or _mdr_alphabet_detector.is_hebrew(char)
311
  except: return False
312
 
313
  def mdr_split_into_words(text: str):
 
396
  return x + self._n_off[0], y + self._n_off[1]
397
 
398
  def mdr_normalize_vertical_rotation(rot: float) -> float:
399
+ while rot >= pi:
400
+ rot -= pi
401
+ while rot < 0:
402
+ rot += pi
403
  return rot
404
 
405
  def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
 
477
  print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
478
  raise e
479
 
480
+ def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
481
+ return [n.name for n in sess.get_outputs()]
482
 
483
+ def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
484
+ return [n.name for n in sess.get_inputs()]
485
 
486
+ def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
487
+ return {name: img_np for name in names}
488
 
489
  # --- MDR ONNX OCR Internals ---
490
  class _MDR_NormalizeImage:
 
618
  ) -> Optional[Any]:
619
  """
620
  Applies a sequence of transformation operations to the input data.
 
621
  This function iterates through a list of operations (callables) and
622
  applies each one sequentially to the data. If any operation
623
  returns None, the processing stops immediately, and None is returned.
 
624
  Args:
625
  data: The initial data to be transformed. Can be of any type
626
  compatible with the operations.
 
629
  the transformed data or None to signal an early exit.
630
  If None or an empty list is provided, the original data
631
  is returned unchanged.
 
632
  Returns:
633
  The transformed data after applying all operations successfully,
634
  or None if any operation in the sequence returned None.
 
2286
  def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
2287
  """
2288
  Initializes the MagicPDFProcessor.
 
2289
  Args:
2290
  device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Falls back to 'cpu' if CUDA is not available.
2291
  model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
 
2307
  def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
2308
  """
2309
  Processes the entire PDF document and yields all extracted structured blocks.
 
2310
  Args:
2311
  pdf_input: Path to the PDF file or a loaded fitz.Document object.
2312
  report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
 
2313
  Yields:
2314
  MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
2315
  """
 
2322
  """
2323
  Processes specific pages (or all if page_indexes is None) of the PDF document.
2324
  Yields results page by page, including the page index, extracted blocks, and the original page image.
 
2325
  Args:
2326
  pdf_input: Path to the PDF file or a loaded fitz.Document object.
2327
  page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
2328
  report_progress: Optional callback function for progress updates.
 
2329
  Yields:
2330
  tuple[int, list[MDRStructuredBlock], Image]:
2331
  - page_index (0-based)
 
2637
  print(f"\nFATAL ERROR during processing: {e}")
2638
  import traceback
2639
  traceback.print_exc()
2640
+ exit(1)