rodrigomasini committed on
Commit
21f45c5
·
verified ·
1 Parent(s): cf01558

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +56 -177
mdr_pdf_parser.py CHANGED
@@ -2656,19 +2656,19 @@ class MDRExtractionEngine:
2656
  self._layout_reader = MDRLayoutReader(model_path=os.path.join(self._model_dir, "layoutreader"))
2657
  print(f"MDR Extraction Engine initialized on device: {self._device}")
2658
 
2659
- # --- MODIFIED _get_yolo_model METHOD for HF ---
2660
  # In class MDRExtractionEngine:
2661
-
2662
- def _get_yolo_model(self) -> Any | None: # Return type can be ultralytics.YOLO
2663
- """Loads the YOLOv10b-DocLayNet layout detection model."""
2664
  if self._yolo is None:
 
 
2665
  repo_id = "hantian/yolo-doclaynet"
2666
- filename = "yolov10b-doclaynet.pt"
2667
 
2668
  yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
2669
  mdr_ensure_directory(str(yolo_cache_dir))
2670
 
2671
- print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
2672
  print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
2673
 
2674
  try:
@@ -2677,50 +2677,26 @@ class MDRExtractionEngine:
2677
  filename=filename,
2678
  cache_dir=yolo_cache_dir,
2679
  local_files_only=False,
2680
- force_download=False,
2681
  )
2682
  print(f"YOLO model file path: {yolo_model_filepath}")
2683
 
2684
- # --- START: MODIFIED LOADING ---
2685
- # Attempt to load directly using ultralytics.YOLO
2686
- try:
2687
- from ultralytics import YOLO as UltralyticsYOLO
2688
- self._yolo = UltralyticsYOLO(yolo_model_filepath)
2689
- print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
2690
- except ImportError:
2691
- print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
2692
- print("Please install it: pip install ultralytics")
2693
- self._yolo = None
2694
- return None # Critical failure
2695
- except Exception as e_ultra: # Catch other ultralytics loading errors
2696
- print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {e_ultra}")
2697
- # If direct ultralytics fails, and your YOLOv10 wrapper exists, you could try it as a fallback,
2698
- # but it's likely to fail if ultralytics.YOLO failed due to model structure.
2699
- if YOLOv10 is not None:
2700
- print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
2701
- try:
2702
- self._yolo = YOLOv10(yolo_model_filepath)
2703
- print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
2704
- except Exception as e_wrapper:
2705
- print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {e_wrapper}")
2706
- self._yolo = None
2707
- return None
2708
- else:
2709
- self._yolo = None
2710
- return None
2711
- # --- END: MODIFIED LOADING ---
2712
 
2713
- except HfHubHTTPError as e:
2714
- print(f"ERROR: Failed to download/access YOLO model '{filename}' via Hugging Face Hub: {e}")
 
2715
  self._yolo = None
2716
- except FileNotFoundError as e: # This might be redundant if hf_hub_download raises its own error
2717
- print(f"ERROR: YOLO model file '{filename}' not found after download attempt: {e}")
2718
  self._yolo = None
2719
- except Exception as e: # General catch-all for unexpected issues during hf_hub_download or path ops
2720
- print(f"ERROR: An unexpected issue occurred related to YOLO model file handling for {filename}: {e}")
2721
  import traceback
2722
  traceback.print_exc()
2723
- self._yolo = None
2724
 
2725
  return self._yolo
2726
 
@@ -2779,153 +2755,60 @@ class MDRExtractionEngine:
2779
 
2780
  # In class MDRExtractionEngine:
2781
 
2782
- def _run_yolo_detection(self, img: Image, yolo: Any): # yolo can be doclayout_yolo.YOLOv10 or ultralytics.YOLO
 
2783
  img_rgb = img.convert("RGB")
2784
 
2785
- # Standard predict call
2786
- # The conf threshold might need adjustment based on the new model's performance
2787
- # For DocLayNet, 'Text' is often a high-confidence class.
2788
- res_list = yolo.predict(source=img_rgb, imgsz=1024, conf=0.25,
2789
- # Slightly higher conf for potentially better precision
2790
  device=self._device, verbose=False)
2791
 
2792
  if not res_list or not hasattr(res_list[0], 'boxes') or res_list[0].boxes is None:
2793
- print(" Engine: YOLO detection returned no results or no boxes.")
2794
  return
2795
 
2796
- results = res_list[0] # Get the first (and usually only) result object
2797
-
2798
- # --- Determine Class Mapping ---
2799
- # This mapping needs to be verified against the actual model's output.
2800
- # The hantian/yolo-doclaynet model card or its files might specify this.
2801
- # Common DocLayNet class order (example, VERIFY THIS):
2802
- # 0: Caption, 1: Footnote, 2: Formula, 3: List-item, 4: Page-footer,
2803
- # 5: Page-header, 6: Picture, 7: Section-header, 8: Table, 9: Text, 10: Title
2804
 
2805
- # Let's try to get names from the model directly if possible
2806
  model_class_names = {}
2807
  if hasattr(results, 'names') and isinstance(results.names, dict):
2808
- model_class_names = results.names # results.names is usually {id: name}
2809
- print(f" Engine: YOLO model class names: {model_class_names}")
2810
  else:
 
 
2811
  print(
2812
- " Engine: Warning - Could not automatically get class names from YOLO model. Using predefined fallback mapping.")
2813
- # Fallback predefined mapping (MUST BE VERIFIED FOR hantian/yolo-doclaynet)
2814
- # This is a GUESS based on common DocLayNet order.
2815
- # You MUST verify this by inspecting the model's config or output.
2816
- _doclaynet_names_fallback = ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header',
2817
- 'Picture', 'Section-header', 'Table', 'Text', 'Title']
2818
  model_class_names = {i: name for i, name in enumerate(_doclaynet_names_fallback)}
 
2819
 
2820
- # Define your mapping from DocLayNet names (or indices if names are not available) to MDRLayoutClass
2821
- # This is crucial and specific to the new model's output classes.
2822
- doclaynet_to_mdr_map = {
2823
- model_class_names.get(k): v for k, v in {
2824
- # Map by string name if model_class_names is populated correctly
2825
- 'Text': MDRLayoutClass.PLAIN_TEXT,
2826
- 'Title': MDRLayoutClass.TITLE,
2827
- 'Section-header': MDRLayoutClass.TITLE, # Or a new MDRLayoutClass if needed
2828
- 'List-item': MDRLayoutClass.PLAIN_TEXT, # Treat list items as plain text
2829
- 'Table': MDRLayoutClass.TABLE,
2830
- 'Picture': MDRLayoutClass.FIGURE,
2831
- 'Formula': MDRLayoutClass.ISOLATE_FORMULA,
2832
- 'Caption': MDRLayoutClass.FIGURE_CAPTION, # Or TABLE_CAPTION, needs context
2833
- 'Footnote': MDRLayoutClass.TABLE_FOOTNOTE, # Or a general footnote class
2834
- 'Page-header': MDRLayoutClass.ABANDON, # Often headers/footers are ignored
2835
- 'Page-footer': MDRLayoutClass.ABANDON,
2836
- }.items() if k in model_class_names.values() # Ensure key exists
2837
- }
2838
-
2839
- # If mapping by string name failed (e.g. model_class_names was not populated as expected),
2840
- # try mapping by assumed index if you know the class ID for 'Text'.
2841
- # The hantian/yolo-doclaynet example uses `classes=[1]` for Text. This implies ID 1 is Text.
2842
- # This is risky if the order changes.
2843
- if 'Text' not in [name for name in model_class_names.values() if name in doclaynet_to_mdr_map]:
2844
- print(
2845
- " Engine: Warning - 'Text' class not found in model_class_names via string mapping. Attempting index-based mapping for critical classes.")
2846
- # Example: If you know from model card that class ID 9 is 'Text' and 10 is 'Title' for hantian/yolo-doclaynet
2847
- # This is a COMMON order for DocLayNet, but VERIFY for hantian's model.
2848
- # From some sources, for DocLayNet, 'Text' is often ID 9, 'Title' is ID 10.
2849
- # The example `classes=[1]` from the HF page for hantian/yolo-doclaynet is confusing if 'Text' is ID 9.
2850
- # Let's assume the example `classes=[1]` meant "the class at index 1 in some list", not necessarily ID 1.
2851
- # We MUST get the correct ID for 'Text'.
2852
- # For now, let's try to find 'Text' and 'Title' by string in model_class_names and get their IDs.
2853
-
2854
- text_id = None
2855
- title_id = None
2856
- table_id = None
2857
- figure_id = None
2858
- formula_id = None
2859
- caption_id = None # Generic caption
2860
-
2861
- for id_val, name_val in model_class_names.items():
2862
- if name_val == 'Text':
2863
- text_id = id_val
2864
- elif name_val == 'Title':
2865
- title_id = id_val
2866
- elif name_val == 'Table':
2867
- table_id = id_val
2868
- elif name_val == 'Picture':
2869
- figure_id = id_val
2870
- elif name_val == 'Formula':
2871
- formula_id = id_val
2872
- elif name_val == 'Caption':
2873
- caption_id = id_val
2874
- # Add other mappings as needed
2875
-
2876
- temp_map_by_id = {}
2877
- if text_id is not None: temp_map_by_id[text_id] = MDRLayoutClass.PLAIN_TEXT
2878
- if title_id is not None: temp_map_by_id[title_id] = MDRLayoutClass.TITLE
2879
- if table_id is not None: temp_map_by_id[table_id] = MDRLayoutClass.TABLE
2880
- if figure_id is not None: temp_map_by_id[figure_id] = MDRLayoutClass.FIGURE
2881
- if formula_id is not None: temp_map_by_id[formula_id] = MDRLayoutClass.ISOLATE_FORMULA
2882
- if caption_id is not None: temp_map_by_id[
2883
- caption_id] = MDRLayoutClass.FIGURE_CAPTION # Default, refine later
2884
-
2885
- # Override doclaynet_to_mdr_map if direct ID mapping is more reliable
2886
- if temp_map_by_id:
2887
- print(f" Engine: Using direct ID mapping for some classes: {temp_map_by_id}")
2888
- # This isn't quite right, the map should be from YOLO ID to MDR Class
2889
- # The previous doclaynet_to_mdr_map was from string name to MDR Class.
2890
- # We need a single, consistent map from YOLO's predicted class ID to MDRLayoutClass.
2891
-
2892
- # Let's rebuild the map: yolo_class_id -> MDRLayoutClass
2893
- final_yolo_id_to_mdr_class_map = {}
2894
- if text_id is not None: final_yolo_id_to_mdr_class_map[text_id] = MDRLayoutClass.PLAIN_TEXT
2895
- if title_id is not None: final_yolo_id_to_mdr_class_map[title_id] = MDRLayoutClass.TITLE
2896
- # ... map others based on their found IDs ...
2897
- # For simplicity, let's assume the string-based map from above is preferred if names are available.
2898
- # The most important thing is to get the ID for 'Text'.
2899
- # If `model_class_names` is `{0: 'Caption', 1: 'Footnote', ..., 9: 'Text', 10: 'Title'}`
2900
- # then `doclaynet_to_mdr_map` should correctly map 'Text' to `MDRLayoutClass.PLAIN_TEXT`.
2901
-
2902
- # Define which MDRLayoutClasses are considered "plain" for fragment merging later (if needed)
2903
- # This set should use your MDRLayoutClass enum members.
2904
  plain_mdr_classes: set[MDRLayoutClass] = {
2905
- MDRLayoutClass.TITLE,
2906
- MDRLayoutClass.PLAIN_TEXT,
2907
- # MDRLayoutClass.ABANDON, # ABANDON layouts usually shouldn't get general text fragments
2908
- MDRLayoutClass.FIGURE_CAPTION, # Captions are text
2909
- MDRLayoutClass.TABLE_CAPTION, # Captions are text
2910
- MDRLayoutClass.TABLE_FOOTNOTE, # Footnotes are text
2911
- MDRLayoutClass.FORMULA_CAPTION, # Captions are text
2912
  }
2913
- print(f" Engine: Mapping YOLO classes to MDR classes. Effective map used for generation:")
2914
 
2915
- for cls_id_tensor, xyxy_tensor in zip(results.boxes.cls, results.boxes.xyxy):
2916
- yolo_cls_id = int(cls_id_tensor.item()) # Get integer class ID from tensor
 
 
 
 
 
 
2917
 
2918
- # Get the string name for logging/mapping
2919
  yolo_cls_name = model_class_names.get(yolo_cls_id, f"UnknownID-{yolo_cls_id}")
2920
 
2921
- # Map YOLO class name to your MDRLayoutClass
2922
  mdr_cls = None
 
 
2923
  if yolo_cls_name == 'Text':
2924
  mdr_cls = MDRLayoutClass.PLAIN_TEXT
2925
  elif yolo_cls_name == 'Title':
2926
  mdr_cls = MDRLayoutClass.TITLE
2927
  elif yolo_cls_name == 'Section-header':
2928
- mdr_cls = MDRLayoutClass.TITLE # Or a specific header class
2929
  elif yolo_cls_name == 'List-item':
2930
  mdr_cls = MDRLayoutClass.PLAIN_TEXT
2931
  elif yolo_cls_name == 'Table':
@@ -2935,36 +2818,32 @@ class MDRExtractionEngine:
2935
  elif yolo_cls_name == 'Formula':
2936
  mdr_cls = MDRLayoutClass.ISOLATE_FORMULA
2937
  elif yolo_cls_name == 'Caption':
2938
- mdr_cls = MDRLayoutClass.FIGURE_CAPTION # Default, could be table too
2939
  elif yolo_cls_name == 'Footnote':
2940
- mdr_cls = MDRLayoutClass.TABLE_FOOTNOTE # Or general footnote
2941
  elif yolo_cls_name in ['Page-header', 'Page-footer']:
2942
  mdr_cls = MDRLayoutClass.ABANDON
2943
 
2944
  if mdr_cls is None:
2945
- # print(f" Engine: Skipping YOLO box with class '{yolo_cls_name}' (ID {yolo_cls_id}) as it's not mapped to an MDRLayoutClass.")
2946
  continue
2947
 
2948
- # print(f" Engine: Detected YOLO class '{yolo_cls_name}' (ID {yolo_cls_id}), mapped to MDR class '{mdr_cls.name}'")
2949
 
2950
  x1, y1, x2, y2 = map(float, xyxy_tensor)
2951
  rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
2952
- if rect.area < 10: # Filter tiny boxes
2953
- continue
2954
 
2955
  if mdr_cls == MDRLayoutClass.TABLE:
2956
- yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None, cls=mdr_cls) # Explicitly pass cls
2957
  elif mdr_cls == MDRLayoutClass.ISOLATE_FORMULA:
2958
- yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None, cls=mdr_cls) # Explicitly pass cls
2959
- elif mdr_cls == MDRLayoutClass.FIGURE: # Figure is not in plain_mdr_classes for default fragment assignment
2960
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
2961
- elif mdr_cls in plain_mdr_classes: # For TITLE, PLAIN_TEXT, CAPTION, etc.
2962
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
2963
- elif mdr_cls == MDRLayoutClass.ABANDON: # ABANDON class if you want to track but not assign frags by default
2964
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
2965
- # else:
2966
- # print(f" Engine: MDR class '{mdr_cls.name}' not explicitly handled for yielding, but was mapped.")
2967
-
2968
  def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
2969
  MDRLayoutElement]:
2970
  if not frags or not layouts:
 
2656
  self._layout_reader = MDRLayoutReader(model_path=os.path.join(self._model_dir, "layoutreader"))
2657
  print(f"MDR Extraction Engine initialized on device: {self._device}")
2658
 
 
2659
  # In class MDRExtractionEngine:
2660
+ def _get_yolo_model(self) -> Any | None: # Return type will be ultralytics.YOLO
2661
+ """Loads the YOLOv10b-DocLayNet layout detection model using ultralytics.YOLO."""
 
2662
  if self._yolo is None:
2663
+ # Using hantian/yolo-doclaynet (or ppaanngggg if that's the one you have the .pt for)
2664
+ # Ensure these match the model you intend to use
2665
  repo_id = "hantian/yolo-doclaynet"
2666
+ filename = "yolov10b-doclaynet.pt" # Or the exact .pt filename from the repo
2667
 
2668
  yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
2669
  mdr_ensure_directory(str(yolo_cache_dir))
2670
 
2671
+ print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}' using ultralytics.YOLO...")
2672
  print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
2673
 
2674
  try:
 
2677
  filename=filename,
2678
  cache_dir=yolo_cache_dir,
2679
  local_files_only=False,
2680
+ force_download=False, # Set to True if you suspect a corrupted download
2681
  )
2682
  print(f"YOLO model file path: {yolo_model_filepath}")
2683
 
2684
+ from ultralytics import YOLO as UltralyticsYOLO # Import here
2685
+ self._yolo = UltralyticsYOLO(yolo_model_filepath) # This is the line that fails with SCDown
2686
+ print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2687
 
2688
+ except ImportError:
2689
+ print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
2690
+ print("Please ensure it's installed: pip install ultralytics (matching version if possible)")
2691
  self._yolo = None
2692
+ except HfHubHTTPError as e:
2693
+ print(f"ERROR: Failed to download YOLO model '{filename}' via Hugging Face Hub: {e}")
2694
  self._yolo = None
2695
+ except Exception as e: # Catch other model loading errors (like the SCDown error)
2696
+ print(f"ERROR: Failed to load YOLO model '{yolo_model_filepath}' with ultralytics.YOLO: {e}")
2697
  import traceback
2698
  traceback.print_exc()
2699
+ self._yolo = None # Ensure self._yolo is None on failure
2700
 
2701
  return self._yolo
2702
 
 
2755
 
2756
  # In class MDRExtractionEngine:
2757
 
2758
+ # In class MDRExtractionEngine
2759
+ def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
2760
  img_rgb = img.convert("RGB")
2761
 
2762
+ res_list = yolo.predict(source=img_rgb, imgsz=1024, conf=0.25, # Adjust conf as needed
 
 
 
 
2763
  device=self._device, verbose=False)
2764
 
2765
  if not res_list or not hasattr(res_list[0], 'boxes') or res_list[0].boxes is None:
2766
+ print(" Engine: YOLO detection (ultralytics) returned no results or no boxes.")
2767
  return
2768
 
2769
+ results = res_list[0]
 
 
 
 
 
 
 
2770
 
 
2771
  model_class_names = {}
2772
  if hasattr(results, 'names') and isinstance(results.names, dict):
2773
+ model_class_names = results.names
2774
+ print(f" Engine: YOLO model class names from ultralytics: {model_class_names}")
2775
  else:
2776
+ # This fallback is a major source of potential error if results.names isn't populated.
2777
+ # It's better to fail or have a very explicit warning if names aren't found.
2778
  print(
2779
+ " Engine: CRITICAL WARNING - Could not get class names from YOLO model. Layout mapping will likely be incorrect.")
2780
+ # Forcing a known DocLayNet order as a last resort (HIGHLY UNRELIABLE without verification)
2781
+ _doclaynet_names_fallback = ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer',
2782
+ 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']
 
 
2783
  model_class_names = {i: name for i, name in enumerate(_doclaynet_names_fallback)}
2784
+ print(f" Engine: Using FALLBACK class names (VERIFY!): {model_class_names}")
2785
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2786
  plain_mdr_classes: set[MDRLayoutClass] = {
2787
+ MDRLayoutClass.TITLE, MDRLayoutClass.PLAIN_TEXT,
2788
+ MDRLayoutClass.FIGURE_CAPTION, MDRLayoutClass.TABLE_CAPTION,
2789
+ MDRLayoutClass.TABLE_FOOTNOTE, MDRLayoutClass.FORMULA_CAPTION,
 
 
 
 
2790
  }
 
2791
 
2792
+ if results.boxes.cls is None or results.boxes.xyxy is None:
2793
+ print(" Engine: YOLO results.boxes.cls or .xyxy is None.")
2794
+ return
2795
+
2796
+ print(f" Engine: Processing {len(results.boxes.cls)} detected YOLO boxes...")
2797
+ for i in range(len(results.boxes.cls)):
2798
+ yolo_cls_id = int(results.boxes.cls[i].item())
2799
+ xyxy_tensor = results.boxes.xyxy[i]
2800
 
 
2801
  yolo_cls_name = model_class_names.get(yolo_cls_id, f"UnknownID-{yolo_cls_id}")
2802
 
 
2803
  mdr_cls = None
2804
+ # --- THIS MAPPING IS BASED ON STANDARD DOCLAYNET ---
2805
+ # --- VERIFY IT AGAINST `model_class_names` PRINTED ABOVE ---
2806
  if yolo_cls_name == 'Text':
2807
  mdr_cls = MDRLayoutClass.PLAIN_TEXT
2808
  elif yolo_cls_name == 'Title':
2809
  mdr_cls = MDRLayoutClass.TITLE
2810
  elif yolo_cls_name == 'Section-header':
2811
+ mdr_cls = MDRLayoutClass.TITLE
2812
  elif yolo_cls_name == 'List-item':
2813
  mdr_cls = MDRLayoutClass.PLAIN_TEXT
2814
  elif yolo_cls_name == 'Table':
 
2818
  elif yolo_cls_name == 'Formula':
2819
  mdr_cls = MDRLayoutClass.ISOLATE_FORMULA
2820
  elif yolo_cls_name == 'Caption':
2821
+ mdr_cls = MDRLayoutClass.FIGURE_CAPTION # Needs context to be TABLE_CAPTION
2822
  elif yolo_cls_name == 'Footnote':
2823
+ mdr_cls = MDRLayoutClass.TABLE_FOOTNOTE # Needs context
2824
  elif yolo_cls_name in ['Page-header', 'Page-footer']:
2825
  mdr_cls = MDRLayoutClass.ABANDON
2826
 
2827
  if mdr_cls is None:
2828
+ # print(f" Skipping YOLO box: class '{yolo_cls_name}' (ID {yolo_cls_id}) - not mapped.")
2829
  continue
2830
 
2831
+ # print(f" Detected: {yolo_cls_name} (ID {yolo_cls_id}) -> {mdr_cls.name}")
2832
 
2833
  x1, y1, x2, y2 = map(float, xyxy_tensor)
2834
  rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
2835
+ if rect.area < 10: continue
 
2836
 
2837
  if mdr_cls == MDRLayoutClass.TABLE:
2838
+ yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None, cls=mdr_cls)
2839
  elif mdr_cls == MDRLayoutClass.ISOLATE_FORMULA:
2840
+ yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None, cls=mdr_cls)
2841
+ elif mdr_cls == MDRLayoutClass.FIGURE:
2842
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
2843
+ elif mdr_cls in plain_mdr_classes:
2844
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
2845
+ elif mdr_cls == MDRLayoutClass.ABANDON:
2846
  yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
 
 
 
2847
  def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
2848
  MDRLayoutElement]:
2849
  if not frags or not layouts: