Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +190 -47
mdr_pdf_parser.py
CHANGED
@@ -1,3 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# --- External Library Imports ---
|
2 |
import os
|
3 |
import re
|
@@ -12,7 +52,7 @@ import requests # For downloading models
|
|
12 |
from pathlib import Path
|
13 |
from enum import auto, Enum
|
14 |
from dataclasses import dataclass
|
15 |
-
from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any
|
16 |
from collections import defaultdict
|
17 |
from math import pi, ceil, sin, cos, sqrt, atan2
|
18 |
from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
|
@@ -562,10 +602,50 @@ class _MDR_KeepKeys:
|
|
562 |
|
563 |
# Return the values of `data` looked up for each key in self.keep_keys, in that order.
def __call__(self, data): return [data[key] for key in self.keep_keys]
|
564 |
|
565 |
-
def mdr_ocr_transform(
|
566 |
-
|
567 |
-
|
568 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
569 |
|
570 |
def mdr_ocr_create_operators(op_param_list, global_config=None):
|
571 |
ops = []
|
@@ -850,42 +930,73 @@ class _MDR_TextClassifier(_MDR_PredictBase):
|
|
850 |
|
851 |
class _MDR_BaseRecLabelDecode:
|
852 |
|
853 |
-
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
|
860 |
-
|
861 |
-
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
-
|
868 |
-
|
869 |
-
|
870 |
-
|
871 |
-
|
872 |
-
|
873 |
-
|
874 |
-
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
-
|
880 |
-
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
-
|
886 |
-
|
887 |
-
|
888 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
889 |
|
890 |
class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
|
891 |
def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
|
@@ -1270,12 +1381,44 @@ _MDR_OCR_MODELS = {"det": ("ppocrv4","det","det.onnx"), "cls": ("ppocrv4","cls",
|
|
1270 |
_MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
|
1271 |
|
1272 |
@dataclass
|
1273 |
-
class _MDR_ONNXParams:
|
1274 |
-
|
1275 |
-
|
1276 |
-
|
1277 |
-
|
1278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1279 |
|
1280 |
class MDROcrEngine:
|
1281 |
"""Handles OCR detection and recognition using ONNX models."""
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# /=====================================================================\ #
|
3 |
+
# | MagicDataReadiness - MAGIC PDF Parser | #
|
4 |
+
# |---------------------------------------------------------------------| #
|
5 |
+
# | Description: | #
|
6 |
+
# | Extracts structured content (text, tables, figures, formulas) | #
|
7 |
+
# | from PDF documents using layout analysis and OCR. | #
|
8 |
+
# | Combines logic from various internal components. | #
|
9 |
+
# |---------------------------------------------------------------------| #
|
10 |
+
# | Dependencies: | #
|
11 |
+
# | - Python 3.11+ | #
|
12 |
+
# | - External Libraries (See imports below and installation notes) | #
|
13 |
+
# | - Pre-trained CV Models (Downloaded automatically to model dir) | #
|
14 |
+
# |---------------------------------------------------------------------| #
|
15 |
+
# | Usage: | #
|
16 |
+
# | See the __main__ block at the end of the script for an example. | #
|
17 |
+
# \=====================================================================/ #
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
# -*- coding: utf-8 -*-
|
22 |
+
# /=====================================================================\ #
|
23 |
+
# | MagicDataReadiness - MAGIC PDF Parser | #
|
24 |
+
# |---------------------------------------------------------------------| #
|
25 |
+
# | Description: | #
|
26 |
+
# | Extracts structured content (text, tables, figures, formulas) | #
|
27 |
+
# | from PDF documents using layout analysis and OCR. | #
|
28 |
+
# | Combines logic from various internal components. | #
|
29 |
+
# |---------------------------------------------------------------------| #
|
30 |
+
# | Dependencies: | #
|
31 |
+
# | - Python 3.11+ | #
|
32 |
+
# | - External Libraries (See imports below and installation notes) | #
|
33 |
+
# | - Pre-trained CV Models (Downloaded automatically to model dir) | #
|
34 |
+
# |---------------------------------------------------------------------| #
|
35 |
+
# | Usage: | #
|
36 |
+
# | See the __main__ block at the end of the script for an example. | #
|
37 |
+
# \=====================================================================/ #
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
# --- External Library Imports ---
|
42 |
import os
|
43 |
import re
|
|
|
52 |
from pathlib import Path
|
53 |
from enum import auto, Enum
|
54 |
from dataclasses import dataclass, field
|
55 |
+
from typing import Literal, Iterable, Generator, Sequence, Callable, TypeAlias, List, Dict, Any, Optional
|
56 |
from collections import defaultdict
|
57 |
from math import pi, ceil, sin, cos, sqrt, atan2
|
58 |
from PIL.Image import Image, frombytes, new as new_image, Resampling as PILResampling, Transform as PILTransform
|
|
|
602 |
|
603 |
# Return the values of `data` looked up for each key in self.keep_keys, in that order.
def __call__(self, data): return [data[key] for key in self.keep_keys]
|
604 |
|
605 |
+
def mdr_ocr_transform(
    data: Any,
    ops: Optional[List[Callable[[Any], Optional[Any]]]] = None
) -> Optional[Any]:
    """
    Run ``data`` through a chain of callables, stopping at the first ``None``.

    Each callable receives the output of the previous one (the first receives
    ``data`` itself). As soon as a callable returns ``None`` the chain is
    abandoned and ``None`` is returned; remaining callables are not invoked.

    Args:
        data: The value fed into the first operation.
        ops: Operations to apply in order. ``None`` is treated as "no
            operations", in which case ``data`` is returned unchanged
            (the same happens for an empty list).

    Returns:
        The output of the last operation, or ``None`` if any operation
        returned ``None``.
    """
    # Only an explicit None is replaced; any other value is iterated as given.
    pipeline = [] if ops is None else ops

    result = data
    for step in pipeline:
        result = step(result)
        if result is None:
            # A None result is the early-exit signal for the whole chain.
            return None

    return result
|
649 |
|
650 |
def mdr_ocr_create_operators(op_param_list, global_config=None):
|
651 |
ops = []
|
|
|
930 |
|
931 |
class _MDR_BaseRecLabelDecode:
|
932 |
|
933 |
+
def __init__(self, char_path=None, use_space=False):
|
934 |
+
self.beg, self.end, self.rev = "sos", "eos", False
|
935 |
+
self.chars = []
|
936 |
+
if char_path is None:
|
937 |
+
self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
|
938 |
+
else:
|
939 |
+
try:
|
940 |
+
with open(char_path, "rb") as f:
|
941 |
+
self.chars = [l.decode("utf-8").strip("\n\r") for l in f]
|
942 |
+
if use_space:
|
943 |
+
self.chars.append(" ")
|
944 |
+
if any("\u0600" <= c <= "\u06FF" for c in self.chars):
|
945 |
+
self.rev = True
|
946 |
+
except FileNotFoundError:
|
947 |
+
print(f"Warn: Dict not found {char_path}")
|
948 |
+
self.chars = list("0123456789abcdefghijklmnopqrstuvwxyz")
|
949 |
+
if use_space:
|
950 |
+
self.chars.append(" ")
|
951 |
+
d_char = self.add_special_char(list(self.chars))
|
952 |
+
self.dict = {c: i for i, c in enumerate(d_char)}
|
953 |
+
self.character = d_char
|
954 |
+
|
955 |
+
def add_special_char(self, chars):
|
956 |
+
return chars
|
957 |
+
|
958 |
+
def get_ignored_tokens(self):
|
959 |
+
return []
|
960 |
+
|
961 |
+
def _reverse(self, pred):
|
962 |
+
res = []
|
963 |
+
cur = ""
|
964 |
+
for c in pred:
|
965 |
+
if not re.search("[a-zA-Z0-9 :*./%+-]", c):
|
966 |
+
if cur != "":
|
967 |
+
res.extend([cur, c])
|
968 |
+
else:
|
969 |
+
res.extend([c])
|
970 |
+
cur = ""
|
971 |
+
else:
|
972 |
+
cur += c
|
973 |
+
if cur != "":
|
974 |
+
res.append(cur)
|
975 |
+
return "".join(res[::-1])
|
976 |
+
|
977 |
+
def decode(self, idxs, probs=None, remove_dup=False):
|
978 |
+
res = []
|
979 |
+
ignored = self.get_ignored_tokens()
|
980 |
+
bs = len(idxs)
|
981 |
+
for b_idx in range(bs):
|
982 |
+
sel = np.ones(len(idxs[b_idx]), dtype=bool)
|
983 |
+
if remove_dup:
|
984 |
+
sel[1:] = idxs[b_idx][1:] != idxs[b_idx][:-1]
|
985 |
+
for ig_tok in ignored:
|
986 |
+
sel &= idxs[b_idx] != ig_tok
|
987 |
+
char_l = [
|
988 |
+
self.character[tid]
|
989 |
+
for tid in idxs[b_idx][sel]
|
990 |
+
if 0 <= tid < len(self.character)
|
991 |
+
]
|
992 |
+
conf_l = probs[b_idx][sel] if probs is not None else [1] * len(char_l)
|
993 |
+
if len(conf_l) == 0:
|
994 |
+
conf_l = [0]
|
995 |
+
txt = "".join(char_l)
|
996 |
+
if self.rev:
|
997 |
+
txt = self._reverse(txt)
|
998 |
+
res.append((txt, float(np.mean(conf_l))))
|
999 |
+
return res
|
1000 |
|
1001 |
class _MDR_CTCLabelDecode(_MDR_BaseRecLabelDecode):
|
1002 |
def __init__(self, char_path=None, use_space=False, **kwargs): super().__init__(char_path, use_space)
|
|
|
1381 |
_MDR_OCR_URL_BASE = "https://huggingface.co/moskize/OnnxOCR/resolve/main/"
|
1382 |
|
1383 |
@dataclass
|
1384 |
+
class _MDR_ONNXParams:
|
1385 |
+
# Attributes without default values
|
1386 |
+
use_gpu: bool
|
1387 |
+
det_model_dir: str
|
1388 |
+
cls_model_dir: str
|
1389 |
+
rec_model_dir: str
|
1390 |
+
rec_char_dict_path: str
|
1391 |
+
|
1392 |
+
# Attributes with default values (Group 1)
|
1393 |
+
use_angle_cls: bool = True
|
1394 |
+
rec_image_shape: str = "3,48,320"
|
1395 |
+
cls_image_shape: str = "3,48,192"
|
1396 |
+
cls_batch_num: int = 6
|
1397 |
+
cls_thresh: float = 0.9
|
1398 |
+
label_list: list = ['0', '180']
|
1399 |
+
|
1400 |
+
# Attributes with default values (Group 2 - Detection)
|
1401 |
+
det_algorithm: str = "DB"
|
1402 |
+
det_limit_side_len: int = 960
|
1403 |
+
det_limit_type: str = 'max'
|
1404 |
+
det_db_thresh: float = 0.3
|
1405 |
+
det_db_box_thresh: float = 0.6
|
1406 |
+
det_db_unclip_ratio: float = 1.5
|
1407 |
+
use_dilation: bool = False
|
1408 |
+
det_db_score_mode: str = 'fast'
|
1409 |
+
det_box_type: str = 'quad'
|
1410 |
+
|
1411 |
+
# Attributes with default values (Group 3 - Recognition & General)
|
1412 |
+
rec_batch_num: int = 6
|
1413 |
+
drop_score: float = 0.5
|
1414 |
+
rec_algorithm: str = "SVTR_LCNet"
|
1415 |
+
use_space_char: bool = True
|
1416 |
+
|
1417 |
+
# Attributes with default values (Group 4 - Output & Logging)
|
1418 |
+
save_crop_res: bool = False
|
1419 |
+
crop_res_save_dir: str = "./output/mdr_crop_res"
|
1420 |
+
show_log: bool = False
|
1421 |
+
use_onnx: bool = True
|
1422 |
|
1423 |
class MDROcrEngine:
|
1424 |
"""Handles OCR detection and recognition using ONNX models."""
|