Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +56 -177
mdr_pdf_parser.py
CHANGED
@@ -2656,19 +2656,19 @@ class MDRExtractionEngine:
|
|
2656 |
self._layout_reader = MDRLayoutReader(model_path=os.path.join(self._model_dir, "layoutreader"))
|
2657 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
2658 |
|
2659 |
-
# --- MODIFIED _get_yolo_model METHOD for HF ---
|
2660 |
# In class MDRExtractionEngine:
|
2661 |
-
|
2662 |
-
|
2663 |
-
"""Loads the YOLOv10b-DocLayNet layout detection model."""
|
2664 |
if self._yolo is None:
|
|
|
|
|
2665 |
repo_id = "hantian/yolo-doclaynet"
|
2666 |
-
filename = "yolov10b-doclaynet.pt"
|
2667 |
|
2668 |
yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
|
2669 |
mdr_ensure_directory(str(yolo_cache_dir))
|
2670 |
|
2671 |
-
print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
|
2672 |
print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
|
2673 |
|
2674 |
try:
|
@@ -2677,50 +2677,26 @@ class MDRExtractionEngine:
|
|
2677 |
filename=filename,
|
2678 |
cache_dir=yolo_cache_dir,
|
2679 |
local_files_only=False,
|
2680 |
-
force_download=False,
|
2681 |
)
|
2682 |
print(f"YOLO model file path: {yolo_model_filepath}")
|
2683 |
|
2684 |
-
|
2685 |
-
#
|
2686 |
-
|
2687 |
-
from ultralytics import YOLO as UltralyticsYOLO
|
2688 |
-
self._yolo = UltralyticsYOLO(yolo_model_filepath)
|
2689 |
-
print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
|
2690 |
-
except ImportError:
|
2691 |
-
print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
|
2692 |
-
print("Please install it: pip install ultralytics")
|
2693 |
-
self._yolo = None
|
2694 |
-
return None # Critical failure
|
2695 |
-
except Exception as e_ultra: # Catch other ultralytics loading errors
|
2696 |
-
print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {e_ultra}")
|
2697 |
-
# If direct ultralytics fails, and your YOLOv10 wrapper exists, you could try it as a fallback,
|
2698 |
-
# but it's likely to fail if ultralytics.YOLO failed due to model structure.
|
2699 |
-
if YOLOv10 is not None:
|
2700 |
-
print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
|
2701 |
-
try:
|
2702 |
-
self._yolo = YOLOv10(yolo_model_filepath)
|
2703 |
-
print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
|
2704 |
-
except Exception as e_wrapper:
|
2705 |
-
print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {e_wrapper}")
|
2706 |
-
self._yolo = None
|
2707 |
-
return None
|
2708 |
-
else:
|
2709 |
-
self._yolo = None
|
2710 |
-
return None
|
2711 |
-
# --- END: MODIFIED LOADING ---
|
2712 |
|
2713 |
-
except
|
2714 |
-
print(
|
|
|
2715 |
self._yolo = None
|
2716 |
-
except
|
2717 |
-
print(f"ERROR: YOLO model
|
2718 |
self._yolo = None
|
2719 |
-
except Exception as e: #
|
2720 |
-
print(f"ERROR:
|
2721 |
import traceback
|
2722 |
traceback.print_exc()
|
2723 |
-
self._yolo = None
|
2724 |
|
2725 |
return self._yolo
|
2726 |
|
@@ -2779,153 +2755,60 @@ class MDRExtractionEngine:
|
|
2779 |
|
2780 |
# In class MDRExtractionEngine:
|
2781 |
|
2782 |
-
|
|
|
2783 |
img_rgb = img.convert("RGB")
|
2784 |
|
2785 |
-
|
2786 |
-
# The conf threshold might need adjustment based on the new model's performance
|
2787 |
-
# For DocLayNet, 'Text' is often a high-confidence class.
|
2788 |
-
res_list = yolo.predict(source=img_rgb, imgsz=1024, conf=0.25,
|
2789 |
-
# Slightly higher conf for potentially better precision
|
2790 |
device=self._device, verbose=False)
|
2791 |
|
2792 |
if not res_list or not hasattr(res_list[0], 'boxes') or res_list[0].boxes is None:
|
2793 |
-
print(" Engine: YOLO detection returned no results or no boxes.")
|
2794 |
return
|
2795 |
|
2796 |
-
results = res_list[0]
|
2797 |
-
|
2798 |
-
# --- Determine Class Mapping ---
|
2799 |
-
# This mapping needs to be verified against the actual model's output.
|
2800 |
-
# The hantian/yolo-doclaynet model card or its files might specify this.
|
2801 |
-
# Common DocLayNet class order (example, VERIFY THIS):
|
2802 |
-
# 0: Caption, 1: Footnote, 2: Formula, 3: List-item, 4: Page-footer,
|
2803 |
-
# 5: Page-header, 6: Picture, 7: Section-header, 8: Table, 9: Text, 10: Title
|
2804 |
|
2805 |
-
# Let's try to get names from the model directly if possible
|
2806 |
model_class_names = {}
|
2807 |
if hasattr(results, 'names') and isinstance(results.names, dict):
|
2808 |
-
model_class_names = results.names
|
2809 |
-
print(f" Engine: YOLO model class names: {model_class_names}")
|
2810 |
else:
|
|
|
|
|
2811 |
print(
|
2812 |
-
" Engine:
|
2813 |
-
#
|
2814 |
-
|
2815 |
-
|
2816 |
-
_doclaynet_names_fallback = ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header',
|
2817 |
-
'Picture', 'Section-header', 'Table', 'Text', 'Title']
|
2818 |
model_class_names = {i: name for i, name in enumerate(_doclaynet_names_fallback)}
|
|
|
2819 |
|
2820 |
-
# Define your mapping from DocLayNet names (or indices if names are not available) to MDRLayoutClass
|
2821 |
-
# This is crucial and specific to the new model's output classes.
|
2822 |
-
doclaynet_to_mdr_map = {
|
2823 |
-
model_class_names.get(k): v for k, v in {
|
2824 |
-
# Map by string name if model_class_names is populated correctly
|
2825 |
-
'Text': MDRLayoutClass.PLAIN_TEXT,
|
2826 |
-
'Title': MDRLayoutClass.TITLE,
|
2827 |
-
'Section-header': MDRLayoutClass.TITLE, # Or a new MDRLayoutClass if needed
|
2828 |
-
'List-item': MDRLayoutClass.PLAIN_TEXT, # Treat list items as plain text
|
2829 |
-
'Table': MDRLayoutClass.TABLE,
|
2830 |
-
'Picture': MDRLayoutClass.FIGURE,
|
2831 |
-
'Formula': MDRLayoutClass.ISOLATE_FORMULA,
|
2832 |
-
'Caption': MDRLayoutClass.FIGURE_CAPTION, # Or TABLE_CAPTION, needs context
|
2833 |
-
'Footnote': MDRLayoutClass.TABLE_FOOTNOTE, # Or a general footnote class
|
2834 |
-
'Page-header': MDRLayoutClass.ABANDON, # Often headers/footers are ignored
|
2835 |
-
'Page-footer': MDRLayoutClass.ABANDON,
|
2836 |
-
}.items() if k in model_class_names.values() # Ensure key exists
|
2837 |
-
}
|
2838 |
-
|
2839 |
-
# If mapping by string name failed (e.g. model_class_names was not populated as expected),
|
2840 |
-
# try mapping by assumed index if you know the class ID for 'Text'.
|
2841 |
-
# The hantian/yolo-doclaynet example uses `classes=[1]` for Text. This implies ID 1 is Text.
|
2842 |
-
# This is risky if the order changes.
|
2843 |
-
if 'Text' not in [name for name in model_class_names.values() if name in doclaynet_to_mdr_map]:
|
2844 |
-
print(
|
2845 |
-
" Engine: Warning - 'Text' class not found in model_class_names via string mapping. Attempting index-based mapping for critical classes.")
|
2846 |
-
# Example: If you know from model card that class ID 9 is 'Text' and 10 is 'Title' for hantian/yolo-doclaynet
|
2847 |
-
# This is a COMMON order for DocLayNet, but VERIFY for hantian's model.
|
2848 |
-
# From some sources, for DocLayNet, 'Text' is often ID 9, 'Title' is ID 10.
|
2849 |
-
# The example `classes=[1]` from the HF page for hantian/yolo-doclaynet is confusing if 'Text' is ID 9.
|
2850 |
-
# Let's assume the example `classes=[1]` meant "the class at index 1 in some list", not necessarily ID 1.
|
2851 |
-
# We MUST get the correct ID for 'Text'.
|
2852 |
-
# For now, let's try to find 'Text' and 'Title' by string in model_class_names and get their IDs.
|
2853 |
-
|
2854 |
-
text_id = None
|
2855 |
-
title_id = None
|
2856 |
-
table_id = None
|
2857 |
-
figure_id = None
|
2858 |
-
formula_id = None
|
2859 |
-
caption_id = None # Generic caption
|
2860 |
-
|
2861 |
-
for id_val, name_val in model_class_names.items():
|
2862 |
-
if name_val == 'Text':
|
2863 |
-
text_id = id_val
|
2864 |
-
elif name_val == 'Title':
|
2865 |
-
title_id = id_val
|
2866 |
-
elif name_val == 'Table':
|
2867 |
-
table_id = id_val
|
2868 |
-
elif name_val == 'Picture':
|
2869 |
-
figure_id = id_val
|
2870 |
-
elif name_val == 'Formula':
|
2871 |
-
formula_id = id_val
|
2872 |
-
elif name_val == 'Caption':
|
2873 |
-
caption_id = id_val
|
2874 |
-
# Add other mappings as needed
|
2875 |
-
|
2876 |
-
temp_map_by_id = {}
|
2877 |
-
if text_id is not None: temp_map_by_id[text_id] = MDRLayoutClass.PLAIN_TEXT
|
2878 |
-
if title_id is not None: temp_map_by_id[title_id] = MDRLayoutClass.TITLE
|
2879 |
-
if table_id is not None: temp_map_by_id[table_id] = MDRLayoutClass.TABLE
|
2880 |
-
if figure_id is not None: temp_map_by_id[figure_id] = MDRLayoutClass.FIGURE
|
2881 |
-
if formula_id is not None: temp_map_by_id[formula_id] = MDRLayoutClass.ISOLATE_FORMULA
|
2882 |
-
if caption_id is not None: temp_map_by_id[
|
2883 |
-
caption_id] = MDRLayoutClass.FIGURE_CAPTION # Default, refine later
|
2884 |
-
|
2885 |
-
# Override doclaynet_to_mdr_map if direct ID mapping is more reliable
|
2886 |
-
if temp_map_by_id:
|
2887 |
-
print(f" Engine: Using direct ID mapping for some classes: {temp_map_by_id}")
|
2888 |
-
# This isn't quite right, the map should be from YOLO ID to MDR Class
|
2889 |
-
# The previous doclaynet_to_mdr_map was from string name to MDR Class.
|
2890 |
-
# We need a single, consistent map from YOLO's predicted class ID to MDRLayoutClass.
|
2891 |
-
|
2892 |
-
# Let's rebuild the map: yolo_class_id -> MDRLayoutClass
|
2893 |
-
final_yolo_id_to_mdr_class_map = {}
|
2894 |
-
if text_id is not None: final_yolo_id_to_mdr_class_map[text_id] = MDRLayoutClass.PLAIN_TEXT
|
2895 |
-
if title_id is not None: final_yolo_id_to_mdr_class_map[title_id] = MDRLayoutClass.TITLE
|
2896 |
-
# ... map others based on their found IDs ...
|
2897 |
-
# For simplicity, let's assume the string-based map from above is preferred if names are available.
|
2898 |
-
# The most important thing is to get the ID for 'Text'.
|
2899 |
-
# If `model_class_names` is `{0: 'Caption', 1: 'Footnote', ..., 9: 'Text', 10: 'Title'}`
|
2900 |
-
# then `doclaynet_to_mdr_map` should correctly map 'Text' to `MDRLayoutClass.PLAIN_TEXT`.
|
2901 |
-
|
2902 |
-
# Define which MDRLayoutClasses are considered "plain" for fragment merging later (if needed)
|
2903 |
-
# This set should use your MDRLayoutClass enum members.
|
2904 |
plain_mdr_classes: set[MDRLayoutClass] = {
|
2905 |
-
MDRLayoutClass.TITLE,
|
2906 |
-
MDRLayoutClass.
|
2907 |
-
|
2908 |
-
MDRLayoutClass.FIGURE_CAPTION, # Captions are text
|
2909 |
-
MDRLayoutClass.TABLE_CAPTION, # Captions are text
|
2910 |
-
MDRLayoutClass.TABLE_FOOTNOTE, # Footnotes are text
|
2911 |
-
MDRLayoutClass.FORMULA_CAPTION, # Captions are text
|
2912 |
}
|
2913 |
-
print(f" Engine: Mapping YOLO classes to MDR classes. Effective map used for generation:")
|
2914 |
|
2915 |
-
|
2916 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
2917 |
|
2918 |
-
# Get the string name for logging/mapping
|
2919 |
yolo_cls_name = model_class_names.get(yolo_cls_id, f"UnknownID-{yolo_cls_id}")
|
2920 |
|
2921 |
-
# Map YOLO class name to your MDRLayoutClass
|
2922 |
mdr_cls = None
|
|
|
|
|
2923 |
if yolo_cls_name == 'Text':
|
2924 |
mdr_cls = MDRLayoutClass.PLAIN_TEXT
|
2925 |
elif yolo_cls_name == 'Title':
|
2926 |
mdr_cls = MDRLayoutClass.TITLE
|
2927 |
elif yolo_cls_name == 'Section-header':
|
2928 |
-
mdr_cls = MDRLayoutClass.TITLE
|
2929 |
elif yolo_cls_name == 'List-item':
|
2930 |
mdr_cls = MDRLayoutClass.PLAIN_TEXT
|
2931 |
elif yolo_cls_name == 'Table':
|
@@ -2935,36 +2818,32 @@ class MDRExtractionEngine:
|
|
2935 |
elif yolo_cls_name == 'Formula':
|
2936 |
mdr_cls = MDRLayoutClass.ISOLATE_FORMULA
|
2937 |
elif yolo_cls_name == 'Caption':
|
2938 |
-
mdr_cls = MDRLayoutClass.FIGURE_CAPTION #
|
2939 |
elif yolo_cls_name == 'Footnote':
|
2940 |
-
mdr_cls = MDRLayoutClass.TABLE_FOOTNOTE #
|
2941 |
elif yolo_cls_name in ['Page-header', 'Page-footer']:
|
2942 |
mdr_cls = MDRLayoutClass.ABANDON
|
2943 |
|
2944 |
if mdr_cls is None:
|
2945 |
-
# print(f"
|
2946 |
continue
|
2947 |
|
2948 |
-
# print(f"
|
2949 |
|
2950 |
x1, y1, x2, y2 = map(float, xyxy_tensor)
|
2951 |
rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
|
2952 |
-
if rect.area < 10:
|
2953 |
-
continue
|
2954 |
|
2955 |
if mdr_cls == MDRLayoutClass.TABLE:
|
2956 |
-
yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None, cls=mdr_cls)
|
2957 |
elif mdr_cls == MDRLayoutClass.ISOLATE_FORMULA:
|
2958 |
-
yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None, cls=mdr_cls)
|
2959 |
-
elif mdr_cls == MDRLayoutClass.FIGURE:
|
2960 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
2961 |
-
elif mdr_cls in plain_mdr_classes:
|
2962 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
2963 |
-
elif mdr_cls == MDRLayoutClass.ABANDON:
|
2964 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
2965 |
-
# else:
|
2966 |
-
# print(f" Engine: MDR class '{mdr_cls.name}' not explicitly handled for yielding, but was mapped.")
|
2967 |
-
|
2968 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
|
2969 |
MDRLayoutElement]:
|
2970 |
if not frags or not layouts:
|
|
|
2656 |
self._layout_reader = MDRLayoutReader(model_path=os.path.join(self._model_dir, "layoutreader"))
|
2657 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
2658 |
|
|
|
2659 |
# In class MDRExtractionEngine:
|
2660 |
+
def _get_yolo_model(self) -> Any | None: # Return type will be ultralytics.YOLO
|
2661 |
+
"""Loads the YOLOv10b-DocLayNet layout detection model using ultralytics.YOLO."""
|
|
|
2662 |
if self._yolo is None:
|
2663 |
+
# Using hantian/yolo-doclaynet (or ppaanngggg if that's the one you have the .pt for)
|
2664 |
+
# Ensure these match the model you intend to use
|
2665 |
repo_id = "hantian/yolo-doclaynet"
|
2666 |
+
filename = "yolov10b-doclaynet.pt" # Or the exact .pt filename from the repo
|
2667 |
|
2668 |
yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
|
2669 |
mdr_ensure_directory(str(yolo_cache_dir))
|
2670 |
|
2671 |
+
print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}' using ultralytics.YOLO...")
|
2672 |
print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
|
2673 |
|
2674 |
try:
|
|
|
2677 |
filename=filename,
|
2678 |
cache_dir=yolo_cache_dir,
|
2679 |
local_files_only=False,
|
2680 |
+
force_download=False, # Set to True if you suspect a corrupted download
|
2681 |
)
|
2682 |
print(f"YOLO model file path: {yolo_model_filepath}")
|
2683 |
|
2684 |
+
from ultralytics import YOLO as UltralyticsYOLO # Import here
|
2685 |
+
self._yolo = UltralyticsYOLO(yolo_model_filepath) # This is the line that fails with SCDown
|
2686 |
+
print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2687 |
|
2688 |
+
except ImportError:
|
2689 |
+
print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
|
2690 |
+
print("Please ensure it's installed: pip install ultralytics (matching version if possible)")
|
2691 |
self._yolo = None
|
2692 |
+
except HfHubHTTPError as e:
|
2693 |
+
print(f"ERROR: Failed to download YOLO model '{filename}' via Hugging Face Hub: {e}")
|
2694 |
self._yolo = None
|
2695 |
+
except Exception as e: # Catch other model loading errors (like the SCDown error)
|
2696 |
+
print(f"ERROR: Failed to load YOLO model '{yolo_model_filepath}' with ultralytics.YOLO: {e}")
|
2697 |
import traceback
|
2698 |
traceback.print_exc()
|
2699 |
+
self._yolo = None # Ensure self._yolo is None on failure
|
2700 |
|
2701 |
return self._yolo
|
2702 |
|
|
|
2755 |
|
2756 |
# In class MDRExtractionEngine:
|
2757 |
|
2758 |
+
# In class MDRExtractionEngine
|
2759 |
+
def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
|
2760 |
img_rgb = img.convert("RGB")
|
2761 |
|
2762 |
+
res_list = yolo.predict(source=img_rgb, imgsz=1024, conf=0.25, # Adjust conf as needed
|
|
|
|
|
|
|
|
|
2763 |
device=self._device, verbose=False)
|
2764 |
|
2765 |
if not res_list or not hasattr(res_list[0], 'boxes') or res_list[0].boxes is None:
|
2766 |
+
print(" Engine: YOLO detection (ultralytics) returned no results or no boxes.")
|
2767 |
return
|
2768 |
|
2769 |
+
results = res_list[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2770 |
|
|
|
2771 |
model_class_names = {}
|
2772 |
if hasattr(results, 'names') and isinstance(results.names, dict):
|
2773 |
+
model_class_names = results.names
|
2774 |
+
print(f" Engine: YOLO model class names from ultralytics: {model_class_names}")
|
2775 |
else:
|
2776 |
+
# This fallback is a major source of potential error if results.names isn't populated.
|
2777 |
+
# It's better to fail or have a very explicit warning if names aren't found.
|
2778 |
print(
|
2779 |
+
" Engine: CRITICAL WARNING - Could not get class names from YOLO model. Layout mapping will likely be incorrect.")
|
2780 |
+
# Forcing a known DocLayNet order as a last resort (HIGHLY UNRELIABLE without verification)
|
2781 |
+
_doclaynet_names_fallback = ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer',
|
2782 |
+
'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
|
|
|
|
|
2783 |
model_class_names = {i: name for i, name in enumerate(_doclaynet_names_fallback)}
|
2784 |
+
print(f" Engine: Using FALLBACK class names (VERIFY!): {model_class_names}")
|
2785 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2786 |
plain_mdr_classes: set[MDRLayoutClass] = {
|
2787 |
+
MDRLayoutClass.TITLE, MDRLayoutClass.PLAIN_TEXT,
|
2788 |
+
MDRLayoutClass.FIGURE_CAPTION, MDRLayoutClass.TABLE_CAPTION,
|
2789 |
+
MDRLayoutClass.TABLE_FOOTNOTE, MDRLayoutClass.FORMULA_CAPTION,
|
|
|
|
|
|
|
|
|
2790 |
}
|
|
|
2791 |
|
2792 |
+
if results.boxes.cls is None or results.boxes.xyxy is None:
|
2793 |
+
print(" Engine: YOLO results.boxes.cls or .xyxy is None.")
|
2794 |
+
return
|
2795 |
+
|
2796 |
+
print(f" Engine: Processing {len(results.boxes.cls)} detected YOLO boxes...")
|
2797 |
+
for i in range(len(results.boxes.cls)):
|
2798 |
+
yolo_cls_id = int(results.boxes.cls[i].item())
|
2799 |
+
xyxy_tensor = results.boxes.xyxy[i]
|
2800 |
|
|
|
2801 |
yolo_cls_name = model_class_names.get(yolo_cls_id, f"UnknownID-{yolo_cls_id}")
|
2802 |
|
|
|
2803 |
mdr_cls = None
|
2804 |
+
# --- THIS MAPPING IS BASED ON STANDARD DOCLAYNET ---
|
2805 |
+
# --- VERIFY IT AGAINST `model_class_names` PRINTED ABOVE ---
|
2806 |
if yolo_cls_name == 'Text':
|
2807 |
mdr_cls = MDRLayoutClass.PLAIN_TEXT
|
2808 |
elif yolo_cls_name == 'Title':
|
2809 |
mdr_cls = MDRLayoutClass.TITLE
|
2810 |
elif yolo_cls_name == 'Section-header':
|
2811 |
+
mdr_cls = MDRLayoutClass.TITLE
|
2812 |
elif yolo_cls_name == 'List-item':
|
2813 |
mdr_cls = MDRLayoutClass.PLAIN_TEXT
|
2814 |
elif yolo_cls_name == 'Table':
|
|
|
2818 |
elif yolo_cls_name == 'Formula':
|
2819 |
mdr_cls = MDRLayoutClass.ISOLATE_FORMULA
|
2820 |
elif yolo_cls_name == 'Caption':
|
2821 |
+
mdr_cls = MDRLayoutClass.FIGURE_CAPTION # Needs context to be TABLE_CAPTION
|
2822 |
elif yolo_cls_name == 'Footnote':
|
2823 |
+
mdr_cls = MDRLayoutClass.TABLE_FOOTNOTE # Needs context
|
2824 |
elif yolo_cls_name in ['Page-header', 'Page-footer']:
|
2825 |
mdr_cls = MDRLayoutClass.ABANDON
|
2826 |
|
2827 |
if mdr_cls is None:
|
2828 |
+
# print(f" Skipping YOLO box: class '{yolo_cls_name}' (ID {yolo_cls_id}) - not mapped.")
|
2829 |
continue
|
2830 |
|
2831 |
+
# print(f" Detected: {yolo_cls_name} (ID {yolo_cls_id}) -> {mdr_cls.name}")
|
2832 |
|
2833 |
x1, y1, x2, y2 = map(float, xyxy_tensor)
|
2834 |
rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
|
2835 |
+
if rect.area < 10: continue
|
|
|
2836 |
|
2837 |
if mdr_cls == MDRLayoutClass.TABLE:
|
2838 |
+
yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None, cls=mdr_cls)
|
2839 |
elif mdr_cls == MDRLayoutClass.ISOLATE_FORMULA:
|
2840 |
+
yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None, cls=mdr_cls)
|
2841 |
+
elif mdr_cls == MDRLayoutClass.FIGURE:
|
2842 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
2843 |
+
elif mdr_cls in plain_mdr_classes:
|
2844 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
2845 |
+
elif mdr_cls == MDRLayoutClass.ABANDON:
|
2846 |
yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
|
|
|
|
|
|
2847 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
|
2848 |
MDRLayoutElement]:
|
2849 |
if not frags or not layouts:
|