Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed on Apr 29

Commit

743c731

verified ·

1 Parent(s): 37fe183

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +190 -47

mdr_pdf_parser.py CHANGED Viewed

@@ -1,3 +1,43 @@
 # --- External Library Imports ---
 import os
 import re
@@ -12,7 +52,7 @@ import requests # For downloading models
 from pathlib import Path
 from enum import auto, Enum
 from dataclasses import dataclass
-from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any
 from collections import defaultdict
 from math import pi, ceil, sin, cos, sqrt, atan2
 from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
@@ -562,10 +602,50 @@ class _MDR_KeepKeys:
   def __call__(self, data): return [data[key] for key in self.keep_keys]
-def mdr_ocr_transform(data, ops=None):
-  ops = ops if ops is not None else [];
-  for op in ops: data = op(data); if data is None: return None;
-  return data
 def mdr_ocr_create_operators(op_param_list, global_config=None):
   ops = []
@@ -850,42 +930,73 @@ class _MDR_TextClassifier(_MDR_PredictBase):
 class _MDR_BaseRecLabelDecode:
-  def __init__(self, char_path=None, use_space=False):
-    self.beg, self.end, self.rev = "sos", "eos", False; self.chars = []
-    if char_path is None: self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
-    else:
-      try:
-        with open(char_path,"rb") as f: self.chars=[l.decode("utf-8").strip("\n\r") for l in f]
-        if use_space: self.chars.append(" ")
-        if any("\u0600"<=c<="\u06FF" for c in self.chars): self.rev=True
-      except FileNotFoundError: print(f"Warn: Dict not found {char_path}"); self.chars=list("0123456789abcdefghijklmnopqrstuvwxyz"); if use_space: self.chars.append(" ")
-    d_char = self.add_special_char(list(self.chars)); self.dict={c:i for i,c in enumerate(d_char)}; self.character=d_char
-  def add_special_char(self, chars): return chars
-  def get_ignored_tokens(self): return []
-  def _reverse(self, pred): res=[]; cur=""; for c in pred: if not re.search("[a-zA-Z0-9 :*./%+-]",c): res.extend([cur,c] if cur!="" else [c]); cur="" else: cur+=c; if cur!="": res.append(cur); return "".join(res[::-1])
-  def decode(self, idxs, probs=None, remove_dup=False):
-    res = []
-    ignored = self.get_ignored_tokens()
-    bs = len(idxs)
-    for b_idx in range(bs):
-      sel = np.ones(len(idxs[b_idx]), dtype=bool)
-      if remove_dup:
-          sel[1:] = idxs[b_idx][1:] != idxs[b_idx][:-1]
-      for ig_tok in ignored:
-          sel &= idxs[b_idx] != ig_tok
-      char_l = [self.character[tid] for tid in idxs[b_idx][sel] if 0 <= tid < len(self.character)]
-      conf_l = probs[b_idx][sel] if probs is not None else [1] * len(char_l)
-      if len(conf_l) == 0:
-          conf_l = [0]
-      txt = "".join(char_l)
-      if self.rev:
-          txt = self._reverse(txt)
-      res.append((txt, float(np.mean(conf_l))))
-    return res
 class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
   def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
@@ -1270,12 +1381,44 @@ _MDR_OCR_MODELS = {"det": ("ppocrv4","det","det.onnx"), "cls": ("ppocrv4","cls",
 _MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
 @dataclass
-class _MDR_ONNXParams: # Simplified container
-    use_gpu: bool; det_model_dir: str; cls_model_dir: str; rec_model_dir: str; rec_char_dict_path: str
-    use_angle_cls: bool=True; rec_image_shape: str="3,48,320"; cls_image_shape: str="3,48,192"; cls_batch_num: int=6; cls_thresh: float=0.9; label_list: list=['0','180']
-    det_algorithm: str="DB"; det_limit_side_len: int=960; det_limit_type: str='max'; det_db_thresh: float=0.3; det_db_box_thresh: float=0.6; det_db_unclip_ratio: float=1.5
-    use_dilation: bool=False; det_db_score_mode: str='fast'; det_box_type: str='quad'; rec_batch_num: int=6; drop_score: float=0.5; rec_algorithm: str="SVTR_LCNet"; use_space_char: bool=True
-    save_crop_res: bool=False; crop_res_save_dir: str="./output/mdr_crop_res"; show_log: bool=False; use_onnx: bool=True
 class MDROcrEngine:
   """Handles OCR detection and recognition using ONNX models."""

+# -*- coding: utf-8 -*-
+# /=====================================================================\ #
+# |              MagicDataReadiness - MAGIC PDF Parser                  | #
+# |---------------------------------------------------------------------| #
+# | Description:                                                        | #
+# |   Extracts structured content (text, tables, figures, formulas)     | #
+# |   from PDF documents using layout analysis and OCR.                 | #
+# |   Combines logic from various internal components.                  | #
+# |---------------------------------------------------------------------| #
+# | Dependencies:                                                       | #
+# |   - Python 3.11+                                                    | #
+# |   - External Libraries (See imports below and installation notes)   | #
+# |   - Pre-trained CV Models (Downloaded automatically to model dir)   | #
+# |---------------------------------------------------------------------| #
+# | Usage:                                                              | #
+# |   See the __main__ block at the end of the script for an example.   | #
+# \=====================================================================/ #
+# -*- coding: utf-8 -*-
+# /=====================================================================\ #
+# |              MagicDataReadiness - MAGIC PDF Parser                  | #
+# |---------------------------------------------------------------------| #
+# | Description:                                                        | #
+# |   Extracts structured content (text, tables, figures, formulas)     | #
+# |   from PDF documents using layout analysis and OCR.                 | #
+# |   Combines logic from various internal components.                  | #
+# |---------------------------------------------------------------------| #
+# | Dependencies:                                                       | #
+# |   - Python 3.11+                                                    | #
+# |   - External Libraries (See imports below and installation notes)   | #
+# |   - Pre-trained CV Models (Downloaded automatically to model dir)   | #
+# |---------------------------------------------------------------------| #
+# | Usage:                                                              | #
+# |   See the __main__ block at the end of the script for an example.   | #
+# \=====================================================================/ #
 # --- External Library Imports ---
 import os
 import re
 from pathlib import Path
 from enum import auto, Enum
 from dataclasses import dataclass, field
+from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any, Optional
 from collections import defaultdict
 from math import pi, ceil, sin, cos, sqrt, atan2
 from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
   def __call__(self, data):
     """Return ``data[key]`` for every key in ``self.keep_keys``, in that order."""
     selected = []
     for name in self.keep_keys:
       selected.append(data[name])
     return selected
def mdr_ocr_transform(
    data: Any,
    ops: Optional[List[Callable[[Any], Optional[Any]]]] = None
) -> Optional[Any]:
    """
    Feed ``data`` through a chain of callables, one after another.

    Each callable receives the result of the previous one. The chain is
    abandoned as soon as any callable yields None.

    Args:
        data: Starting value handed to the first operation.
        ops: Operations to run, in order. None is treated like an empty
             list, in which case ``data`` is returned untouched.

    Returns:
        The value produced by the last operation, or None when any
        operation in the chain returned None.
    """
    pipeline = ops if ops is not None else []
    result = data
    for step in pipeline:
        result = step(result)
        if result is None:
            # A None result aborts the remainder of the chain.
            return None
    return result
 def mdr_ocr_create_operators(op_param_list, global_config=None):
   ops = []
 class _MDR_BaseRecLabelDecode:
     """Turns batches of predicted character indices into (text, confidence) pairs.

     The character table comes from a dictionary file (one entry per line) or,
     when no file is given or the file is missing, from a built-in alphabet of
     digits and lowercase ASCII letters. Subclasses may adjust the table via
     ``add_special_char`` and exclude indices via ``get_ignored_tokens``.
     """

     def __init__(self, char_path=None, use_space=False):
         """Build the character table.

         Args:
             char_path: Path of a UTF-8 dictionary file, or None for the
                 built-in alphabet.
             use_space: Append a space entry to the table. Only consulted
                 when ``char_path`` is given (including the missing-file
                 fallback).
         """
         default_alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
         self.beg = "sos"
         self.end = "eos"
         self.rev = False
         self.chars = []
         if char_path is None:
             self.chars = list(default_alphabet)
         else:
             try:
                 with open(char_path, "rb") as handle:
                     self.chars = [raw.decode("utf-8").strip("\n\r") for raw in handle]
                 if use_space:
                     self.chars.append(" ")
                 # Any entry inside the Arabic Unicode block (U+0600-U+06FF)
                 # switches on reversal of decoded text.
                 if any("\u0600" <= entry <= "\u06FF" for entry in self.chars):
                     self.rev = True
             except FileNotFoundError:
                 # Missing dictionary: warn and fall back to the built-in alphabet.
                 print(f"Warn: Dict not found {char_path}")
                 self.chars = list(default_alphabet)
                 if use_space:
                     self.chars.append(" ")
         table = self.add_special_char(list(self.chars))
         self.dict = {symbol: pos for pos, symbol in enumerate(table)}
         self.character = table

     def add_special_char(self, chars):
         """Hook for subclasses; the base class returns ``chars`` unchanged."""
         return chars

     def get_ignored_tokens(self):
         """Hook for subclasses; the base class ignores no indices."""
         return []

     def _reverse(self, pred):
         """Reverse ``pred`` while keeping runs of ASCII letters, digits and
         the characters `` :*./%+-`` in their original internal order."""
         pieces = []
         run = ""
         for ch in pred:
             if re.search("[a-zA-Z0-9 :*./%+-]", ch):
                 run += ch
                 continue
             # A non-matching character closes the current run (if any) and
             # is stored as its own piece.
             if run != "":
                 pieces.append(run)
             pieces.append(ch)
             run = ""
         if run != "":
             pieces.append(run)
         return "".join(reversed(pieces))

     def decode(self, idxs, probs=None, remove_dup=False):
         """Decode each index sequence in ``idxs`` into ``(text, mean_confidence)``.

         Args:
             idxs: Batch of index sequences; each sequence is compared and
                 masked element-wise, so NumPy arrays are expected.
             probs: Optional per-position scores aligned with ``idxs``. When
                 omitted, every kept character scores 1.
             remove_dup: Drop an index that equals its immediate predecessor.

         Returns:
             A list with one ``(str, float)`` tuple per sequence. The float
             is 0.0 when no score remains after filtering.
         """
         skip_tokens = self.get_ignored_tokens()
         table_size = len(self.character)
         decoded = []
         for row, seq in enumerate(idxs):
             keep = np.ones(len(seq), dtype=bool)
             if remove_dup:
                 keep[1:] = seq[1:] != seq[:-1]
             for tok in skip_tokens:
                 keep &= seq != tok
             # Indices outside the table are silently dropped from the text.
             symbols = [self.character[t] for t in seq[keep] if 0 <= t < table_size]
             if probs is not None:
                 scores = probs[row][keep]
             else:
                 scores = [1] * len(symbols)
             if len(scores) == 0:
                 scores = [0]
             text = "".join(symbols)
             if self.rev:
                 text = self._reverse(text)
             decoded.append((text, float(np.mean(scores))))
         return decoded
 class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
   def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
 _MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
 @dataclass
class _MDR_ONNXParams:
    """Flat parameter container for the ONNX OCR models.

    Relies on the ``@dataclass`` decorator applied directly above this class
    to generate ``__init__``. The first five fields are required; every other
    field has a default.

    NOTE(review): the grouping comments below follow the field-name prefixes
    (``det_``, ``cls_``, ``rec_``); the exact meaning of each setting is
    defined by the code that consumes this object, which is outside this
    excerpt.
    """
    # Required attributes (no defaults)
    use_gpu: bool
    det_model_dir: str
    cls_model_dir: str
    rec_model_dir: str
    rec_char_dict_path: str
    # Attributes with default values (Group 1)
    use_angle_cls: bool = True
    rec_image_shape: str = "3,48,320"
    cls_image_shape: str = "3,48,192"
    cls_batch_num: int = 6
    cls_thresh: float = 0.9
    # A bare list default is rejected by @dataclass (ValueError at class
    # creation), so build a fresh list per instance through default_factory.
    label_list: list = field(default_factory=lambda: ['0', '180'])
    # Attributes with default values (Group 2 - Detection)
    det_algorithm: str = "DB"
    det_limit_side_len: int = 960
    det_limit_type: str = 'max'
    det_db_thresh: float = 0.3
    det_db_box_thresh: float = 0.6
    det_db_unclip_ratio: float = 1.5
    use_dilation: bool = False
    det_db_score_mode: str = 'fast'
    det_box_type: str = 'quad'
    # Attributes with default values (Group 3 - Recognition & General)
    rec_batch_num: int = 6
    drop_score: float = 0.5
    rec_algorithm: str = "SVTR_LCNet"
    use_space_char: bool = True
    # Attributes with default values (Group 4 - Output & Logging)
    save_crop_res: bool = False
    crop_res_save_dir: str = "./output/mdr_crop_res"
    show_log: bool = False
    use_onnx: bool = True
 class MDROcrEngine:
   """Handles OCR detection and recognition using ONNX models."""