rodrigomasini committed on
Commit
743c731
·
verified ·
1 Parent(s): 37fe183

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +190 -47
mdr_pdf_parser.py CHANGED
@@ -1,3 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # --- External Library Imports ---
2
  import os
3
  import re
@@ -12,7 +52,7 @@ import requests # For downloading models
12
  from pathlib import Path
13
  from enum import auto, Enum
14
  from dataclasses import dataclass
15
- from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any
16
  from collections import defaultdict
17
  from math import pi, ceil, sin, cos, sqrt, atan2
18
  from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
@@ -562,10 +602,50 @@ class _MDR_KeepKeys:
562
 
563
  def __call__(self, data): return [data[key] for key in self.keep_keys]
564
 
565
- def mdr_ocr_transform(data, ops=None):
566
- ops = ops if ops is not None else [];
567
- for op in ops: data = op(data); if data is None: return None;
568
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
 
570
  def mdr_ocr_create_operators(op_param_list, global_config=None):
571
  ops = []
@@ -850,42 +930,73 @@ class _MDR_TextClassifier(_MDR_PredictBase):
850
 
851
  class _MDR_BaseRecLabelDecode:
852
 
853
- def __init__(self, char_path=None, use_space=False):
854
- self.beg, self.end, self.rev = "sos", "eos", False; self.chars = []
855
- if char_path is None: self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
856
- else:
857
- try:
858
- with open(char_path,"rb") as f: self.chars=[l.decode("utf-8").strip("\n\r") for l in f]
859
- if use_space: self.chars.append(" ")
860
- if any("\u0600"<=c<="\u06FF" for c in self.chars): self.rev=True
861
- except FileNotFoundError: print(f"Warn: Dict not found {char_path}"); self.chars=list("0123456789abcdefghijklmnopqrstuvwxyz"); if use_space: self.chars.append(" ")
862
- d_char = self.add_special_char(list(self.chars)); self.dict={c:i for i,c in enumerate(d_char)}; self.character=d_char
863
-
864
- def add_special_char(self, chars): return chars
865
-
866
- def get_ignored_tokens(self): return []
867
-
868
- def _reverse(self, pred): res=[]; cur=""; for c in pred: if not re.search("[a-zA-Z0-9 :*./%+-]",c): res.extend([cur,c] if cur!="" else [c]); cur="" else: cur+=c; if cur!="": res.append(cur); return "".join(res[::-1])
869
-
870
- def decode(self, idxs, probs=None, remove_dup=False):
871
- res = []
872
- ignored = self.get_ignored_tokens()
873
- bs = len(idxs)
874
- for b_idx in range(bs):
875
- sel = np.ones(len(idxs[b_idx]), dtype=bool)
876
- if remove_dup:
877
- sel[1:] = idxs[b_idx][1:] != idxs[b_idx][:-1]
878
- for ig_tok in ignored:
879
- sel &= idxs[b_idx] != ig_tok
880
- char_l = [self.character[tid] for tid in idxs[b_idx][sel] if 0 <= tid < len(self.character)]
881
- conf_l = probs[b_idx][sel] if probs is not None else [1] * len(char_l)
882
- if len(conf_l) == 0:
883
- conf_l = [0]
884
- txt = "".join(char_l)
885
- if self.rev:
886
- txt = self._reverse(txt)
887
- res.append((txt, float(np.mean(conf_l))))
888
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889
 
890
  class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
891
  def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
@@ -1270,12 +1381,44 @@ _MDR_OCR_MODELS = {"det": ("ppocrv4","det","det.onnx"), "cls": ("ppocrv4","cls",
1270
  _MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
1271
 
1272
  @dataclass
1273
- class _MDR_ONNXParams: # Simplified container
1274
- use_gpu: bool; det_model_dir: str; cls_model_dir: str; rec_model_dir: str; rec_char_dict_path: str
1275
- use_angle_cls: bool=True; rec_image_shape: str="3,48,320"; cls_image_shape: str="3,48,192"; cls_batch_num: int=6; cls_thresh: float=0.9; label_list: list=['0','180']
1276
- det_algorithm: str="DB"; det_limit_side_len: int=960; det_limit_type: str='max'; det_db_thresh: float=0.3; det_db_box_thresh: float=0.6; det_db_unclip_ratio: float=1.5
1277
- use_dilation: bool=False; det_db_score_mode: str='fast'; det_box_type: str='quad'; rec_batch_num: int=6; drop_score: float=0.5; rec_algorithm: str="SVTR_LCNet"; use_space_char: bool=True
1278
- save_crop_res: bool=False; crop_res_save_dir: str="./output/mdr_crop_res"; show_log: bool=False; use_onnx: bool=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1279
 
1280
  class MDROcrEngine:
1281
  """Handles OCR detection and recognition using ONNX models."""
 
1
+ # -*- coding: utf-8 -*-
2
+ # /=====================================================================\ #
3
+ # | MagicDataReadiness - MAGIC PDF Parser | #
4
+ # |---------------------------------------------------------------------| #
5
+ # | Description: | #
6
+ # | Extracts structured content (text, tables, figures, formulas) | #
7
+ # | from PDF documents using layout analysis and OCR. | #
8
+ # | Combines logic from various internal components. | #
9
+ # |---------------------------------------------------------------------| #
10
+ # | Dependencies: | #
11
+ # | - Python 3.11+ | #
12
+ # | - External Libraries (See imports below and installation notes) | #
13
+ # | - Pre-trained CV Models (Downloaded automatically to model dir) | #
14
+ # |---------------------------------------------------------------------| #
15
+ # | Usage: | #
16
+ # | See the __main__ block at the end of the script for an example. | #
17
+ # \=====================================================================/ #
18
+
19
+
20
+
21
+ # -*- coding: utf-8 -*-
22
+ # /=====================================================================\ #
23
+ # | MagicDataReadiness - MAGIC PDF Parser | #
24
+ # |---------------------------------------------------------------------| #
25
+ # | Description: | #
26
+ # | Extracts structured content (text, tables, figures, formulas) | #
27
+ # | from PDF documents using layout analysis and OCR. | #
28
+ # | Combines logic from various internal components. | #
29
+ # |---------------------------------------------------------------------| #
30
+ # | Dependencies: | #
31
+ # | - Python 3.11+ | #
32
+ # | - External Libraries (See imports below and installation notes) | #
33
+ # | - Pre-trained CV Models (Downloaded automatically to model dir) | #
34
+ # |---------------------------------------------------------------------| #
35
+ # | Usage: | #
36
+ # | See the __main__ block at the end of the script for an example. | #
37
+ # \=====================================================================/ #
38
+
39
+
40
+
41
  # --- External Library Imports ---
42
  import os
43
  import re
 
52
  from pathlib import Path
53
  from enum import auto, Enum
54
  from dataclasses import dataclass, field
55
+ from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any, Optional
56
  from collections import defaultdict
57
  from math import pi, ceil, sin, cos, sqrt, atan2
58
  from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
 
602
 
603
  def __call__(self, data): return [data[key] for key in self.keep_keys]
604
 
605
def mdr_ocr_transform(
    data: Any,
    ops: Optional[List[Callable[[Any], Optional[Any]]]] = None
) -> Optional[Any]:
    """
    Feed ``data`` through a chain of operators, one after another.

    Each operator receives the output of the previous one. As soon as an
    operator returns None the chain is abandoned and None is returned;
    later operators are never called.

    Args:
        data: Starting value handed to the first operator.
        ops: Operators to run, in order. ``None`` means "no operators",
            in which case ``data`` is handed back untouched.

    Returns:
        The output of the last operator, or None if any operator
        returned None.
    """
    pipeline = [] if ops is None else ops

    result = data
    for step in pipeline:
        result = step(result)
        if result is None:
            # An operator rejected the sample: stop here.
            return None
    return result
649
 
650
  def mdr_ocr_create_operators(op_param_list, global_config=None):
651
  ops = []
 
930
 
931
  class _MDR_BaseRecLabelDecode:
932
 
933
+ def __init__(self, char_path=None, use_space=False):
934
+ self.beg, self.end, self.rev = "sos", "eos", False
935
+ self.chars = []
936
+ if char_path is None:
937
+ self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
938
+ else:
939
+ try:
940
+ with open(char_path, "rb") as f:
941
+ self.chars = [l.decode("utf-8").strip("\n\r") for l in f]
942
+ if use_space:
943
+ self.chars.append(" ")
944
+ if any("\u0600" <= c <= "\u06FF" for c in self.chars):
945
+ self.rev = True
946
+ except FileNotFoundError:
947
+ print(f"Warn: Dict not found {char_path}")
948
+ self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
949
+ if use_space:
950
+ self.chars.append(" ")
951
+ d_char = self.add_special_char(list(self.chars))
952
+ self.dict = {c: i for i, c in enumerate(d_char)}
953
+ self.character = d_char
954
+
955
+ def add_special_char(self, chars):
956
+ return chars
957
+
958
+ def get_ignored_tokens(self):
959
+ return []
960
+
961
+ def _reverse(self, pred):
962
+ res = []
963
+ cur = ""
964
+ for c in pred:
965
+ if not re.search("[a-zA-Z0-9 :*./%+-]", c):
966
+ if cur != "":
967
+ res.extend([cur, c])
968
+ else:
969
+ res.extend([c])
970
+ cur = ""
971
+ else:
972
+ cur += c
973
+ if cur != "":
974
+ res.append(cur)
975
+ return "".join(res[::-1])
976
+
977
+ def decode(self, idxs, probs=None, remove_dup=False):
978
+ res = []
979
+ ignored = self.get_ignored_tokens()
980
+ bs = len(idxs)
981
+ for b_idx in range(bs):
982
+ sel = np.ones(len(idxs[b_idx]), dtype=bool)
983
+ if remove_dup:
984
+ sel[1:] = idxs[b_idx][1:] != idxs[b_idx][:-1]
985
+ for ig_tok in ignored:
986
+ sel &= idxs[b_idx] != ig_tok
987
+ char_l = [
988
+ self.character[tid]
989
+ for tid in idxs[b_idx][sel]
990
+ if 0 <= tid < len(self.character)
991
+ ]
992
+ conf_l = probs[b_idx][sel] if probs is not None else [1] * len(char_l)
993
+ if len(conf_l) == 0:
994
+ conf_l = [0]
995
+ txt = "".join(char_l)
996
+ if self.rev:
997
+ txt = self._reverse(txt)
998
+ res.append((txt, float(np.mean(conf_l))))
999
+ return res
1000
 
1001
  class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
1002
  def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
 
1381
  _MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
1382
 
1383
  @dataclass
1384
+ class _MDR_ONNXParams:
1385
+ # Attributes without default values
1386
+ use_gpu: bool
1387
+ det_model_dir: str
1388
+ cls_model_dir: str
1389
+ rec_model_dir: str
1390
+ rec_char_dict_path: str
1391
+
1392
+ # Attributes with default values (Group 1)
1393
+ use_angle_cls: bool = True
1394
+ rec_image_shape: str = "3,48,320"
1395
+ cls_image_shape: str = "3,48,192"
1396
+ cls_batch_num: int = 6
1397
+ cls_thresh: float = 0.9
1398
+ label_list: list = ['0', '180']
1399
+
1400
+ # Attributes with default values (Group 2 - Detection)
1401
+ det_algorithm: str = "DB"
1402
+ det_limit_side_len: int = 960
1403
+ det_limit_type: str = 'max'
1404
+ det_db_thresh: float = 0.3
1405
+ det_db_box_thresh: float = 0.6
1406
+ det_db_unclip_ratio: float = 1.5
1407
+ use_dilation: bool = False
1408
+ det_db_score_mode: str = 'fast'
1409
+ det_box_type: str = 'quad'
1410
+
1411
+ # Attributes with default values (Group 3 - Recognition & General)
1412
+ rec_batch_num: int = 6
1413
+ drop_score: float = 0.5
1414
+ rec_algorithm: str = "SVTR_LCNet"
1415
+ use_space_char: bool = True
1416
+
1417
+ # Attributes with default values (Group 4 - Output & Logging)
1418
+ save_crop_res: bool = False
1419
+ crop_res_save_dir: str = "./output/mdr_crop_res"
1420
+ show_log: bool = False
1421
+ use_onnx: bool = True
1422
 
1423
  class MDROcrEngine:
1424
  """Handles OCR detection and recognition using ONNX models."""