rodrigomasini committed on
Commit
c33bb8f
·
verified ·
1 Parent(s): 8fccd9c

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +135 -42
mdr_pdf_parser.py CHANGED
@@ -2221,27 +2221,98 @@ class MDRLayoutReader:
2221
  bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
2222
  return bbox_list
2223
 
2224
def _apply_order(self, layouts: list[MDRLayoutElement], bbox_list: list[_MDR_ReaderBBox]) -> list[MDRLayoutElement]:
    """Reorder layouts and their fragments according to the reader bbox orders.

    Args:
        layouts: Layout elements, addressed by the ``layout_index`` stored on
            each bbox.
        bbox_list: Reader bboxes exposing ``layout_index``, ``fragment_index``,
            ``order`` and ``virtual``.

    Returns:
        The layouts that own at least one bbox, sorted by the median ``order``
        of their bboxes. Each layout's ``fragments`` list is reordered in place
        and every fragment receives a running ``order`` counter.
    """
    # Group bboxes by the layout they belong to.
    layout_map = defaultdict(list)
    for bbox in bbox_list:
        layout_map[bbox.layout_index].append(bbox)

    # One (layout index, median bbox order) pair per layout that has bboxes.
    layout_orders = [
        (idx, self._median([b.order for b in bboxes]))
        for idx, bboxes in layout_map.items()
        if bboxes
    ]
    layout_orders.sort(key=lambda pair: pair[1])

    sorted_layouts = []
    nfo = 0  # next fragment order, shared across all layouts
    for layout_idx, _ in layout_orders:
        # Keep the index we already know instead of recovering it with
        # layouts.index(), which compares by equality and may hit another
        # layout that merely compares equal.
        layout = layouts[layout_idx]
        sorted_layouts.append(layout)

        frags = layout.fragments
        if not frags:
            continue

        frag_bboxes = [b for b in layout_map[layout_idx] if not b.virtual]
        if frag_bboxes:
            idx_to_order = {b.fragment_index: b.order for b in frag_bboxes}
            # Pair each fragment with its position *before* sorting. Calling
            # frags.index(f) from inside frags.sort() fails because CPython
            # exposes the list as empty while it is being sorted.
            ranked = sorted(
                enumerate(frags),
                key=lambda pair: idx_to_order.get(pair[0], float('inf')),
            )
            # Slice-assign so the caller's list object is still the one mutated.
            frags[:] = [frag for _, frag in ranked]
        else:
            # No non-virtual bboxes: fall back to top-to-bottom, left-to-right.
            frags.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))

        for frag in frags:
            frag.order = nfo
            nfo += 1

    return sorted_layouts
 
2245
 
2246
  def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
2247
  heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
@@ -2586,52 +2657,74 @@ class MDRExtractionEngine:
2586
  print(f"MDR Extraction Engine initialized on device: {self._device}")
2587
 
2588
  # --- MODIFIED _get_yolo_model METHOD for HF ---
2589
def _get_yolo_model(self) -> YOLOv10 | None:
    """Return the cached YOLOv10 layout model, fetching it from the Hub on first use.

    Returns:
        The value of ``self._yolo``: the loaded model, or ``None`` when the
        ``YOLOv10`` class is unavailable or downloading/loading failed.
    """
    # Without the wrapper class nothing can be loaded; just report and bail out.
    if YOLOv10 is None:
        print("MDR YOLOv10 class not available. Layout detection skipped.")
        return self._yolo

    # Already loaded on an earlier call.
    if self._yolo is not None:
        return self._yolo

    repo_id = "hantian/yolo-doclaynet"
    filename = "yolov10b-doclaynet.pt"
    # Dedicated Hub cache directory under the engine's model directory.
    yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache"
    mdr_ensure_directory(str(yolo_cache_dir))

    print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
    print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")

    try:
        weights_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=yolo_cache_dir,
            local_files_only=False,
            force_download=False,
        )
        print(f"YOLO model file path: {weights_path}")

        self._yolo = YOLOv10(weights_path)
        print("MDR YOLOv10 model loaded successfully.")
    except HfHubHTTPError as err:
        print(
            f"ERROR: Failed to download/access YOLO model via Hugging Face Hub: {err}")
        self._yolo = None
    except FileNotFoundError as err:
        print(f"ERROR: YOLO model file not found or failed to load locally: {err}")
        self._yolo = None
    except Exception as err:
        print(
            f"ERROR: An unexpected issue occurred loading YOLOv10 model from {yolo_cache_dir}/{filename}: {err}")
        self._yolo = None

    return self._yolo
2634
 
 
2635
  def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
2636
  """Analyzes a single page image to extract layout and content."""
2637
  print(" Engine: Analyzing image...")
 
2221
  bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
2222
  return bbox_list
2223
 
2224
+ # In class MDRLayoutReader
2225
+
2226
def _apply_order(self, original_layouts_list: list[MDRLayoutElement],
                 ordered_bbox_list_with_final_orders: list[_MDR_ReaderBBox]) -> list[MDRLayoutElement]:
    """Reorder layouts and their fragments according to the reader bbox orders.

    Args:
        original_layouts_list: Layout elements, addressed by the
            ``layout_index`` stored on each bbox. The list itself is not
            reordered.
        ordered_bbox_list_with_final_orders: Reader bboxes exposing
            ``layout_index``, ``fragment_index``, ``order`` and ``virtual``.

    Returns:
        The layouts that own at least one bbox, sorted by the median ``order``
        of their bboxes. Each returned layout has its ``fragments`` reordered
        and every fragment is given a running ``order`` counter that continues
        across layouts.
    """
    # Group bboxes by the layout they belong to.
    layout_map = defaultdict(list)
    for bbox_item in ordered_bbox_list_with_final_orders:
        layout_map[bbox_item.layout_index].append(bbox_item)

    # One (original layout index, median bbox order) pair per layout with bboxes.
    layout_median_orders = [
        (original_layout_idx, self._median([b.order for b in bboxes_for_this_layout]))
        for original_layout_idx, bboxes_for_this_layout in layout_map.items()
        if bboxes_for_this_layout
    ]
    layout_median_orders.sort(key=lambda pair: pair[1])

    final_sorted_layouts = []
    nfo = 0  # next fragment order, shared across all layouts
    for original_idx, _ in layout_median_orders:
        # The original index is already known here. Recovering it with
        # list.index() compares by equality, so two layouts that compare
        # equal would both resolve to the first one's bboxes (and the lookup
        # is linear per layout).
        layout_obj = original_layouts_list[original_idx]
        final_sorted_layouts.append(layout_obj)

        if not layout_obj.fragments:
            continue

        # Virtual bboxes do not correspond to a real fragment.
        reader_bboxes_for_this_layout = [
            b for b in layout_map[original_idx] if not b.virtual
        ]

        if reader_bboxes_for_this_layout:
            frag_idx_to_new_order_map = {
                b.fragment_index: b.order for b in reader_bboxes_for_this_layout
            }
            # Sort (original position, fragment) pairs so the lookup key is
            # the position the fragment had before reordering; fragments
            # without a bbox sort last, keeping their relative order.
            fragments_with_original_indices = sorted(
                enumerate(layout_obj.fragments),
                key=lambda item: frag_idx_to_new_order_map.get(item[0], float('inf')),
            )
            layout_obj.fragments = [frag for _, frag in fragments_with_original_indices]
        else:
            print(
                f" LayoutReader ApplyOrder: No reader_bboxes for layout (orig_idx {original_idx}). Sorting frags geometrically.")
            # Fallback: top-to-bottom, then left-to-right.
            layout_obj.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))

        for frag in layout_obj.fragments:
            frag.order = nfo
            nfo += 1

    return final_sorted_layouts
2316
 
2317
  def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
2318
  heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
 
2657
  print(f"MDR Extraction Engine initialized on device: {self._device}")
2658
 
2659
  # --- MODIFIED _get_yolo_model METHOD for HF ---
2660
+ # In class MDRExtractionEngine:
2661
+
2662
def _get_yolo_model(self) -> Any | None:  # Return type can be ultralytics.YOLO
    """Return the cached YOLOv10b-DocLayNet model, loading it on first use.

    The weights file is fetched with ``hf_hub_download`` and opened with
    ``ultralytics.YOLO``; if that raises a non-import error and the
    ``YOLOv10`` wrapper is available, the wrapper is tried as a fallback.

    Returns:
        The loaded model, or ``None`` when downloading or loading failed.
    """
    # Already loaded on an earlier call.
    if self._yolo is not None:
        return self._yolo

    repo_id = "hantian/yolo-doclaynet"
    filename = "yolov10b-doclaynet.pt"

    # Dedicated Hub cache directory under the engine's model directory.
    cache_root = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
    mdr_ensure_directory(str(cache_root))

    print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
    print(f"Hugging Face Hub cache directory for YOLO: {cache_root}")

    try:
        weights_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=cache_root,
            local_files_only=False,
            force_download=False,
        )
        print(f"YOLO model file path: {weights_path}")

        try:
            # Preferred loader: plain ultralytics.
            from ultralytics import YOLO as UltralyticsYOLO
            self._yolo = UltralyticsYOLO(weights_path)
            print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
        except ImportError:
            print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
            print("Please install it: pip install ultralytics")
            self._yolo = None
            return None
        except Exception as ultra_err:
            print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {ultra_err}")
            # No wrapper class to fall back on: give up.
            if YOLOv10 is None:
                self._yolo = None
                return None
            print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
            try:
                self._yolo = YOLOv10(weights_path)
                print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
            except Exception as wrapper_err:
                print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {wrapper_err}")
                self._yolo = None
                return None

    except HfHubHTTPError as err:
        print(f"ERROR: Failed to download/access YOLO model '{filename}' via Hugging Face Hub: {err}")
        self._yolo = None
    except FileNotFoundError as err:
        print(f"ERROR: YOLO model file '{filename}' not found after download attempt: {err}")
        self._yolo = None
    except Exception as err:
        # Anything else from the download, path handling, or a failing handler above.
        print(f"ERROR: An unexpected issue occurred related to YOLO model file handling for {filename}: {err}")
        import traceback
        traceback.print_exc()
        self._yolo = None

    return self._yolo
2726
 
2727
+
2728
  def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
2729
  """Analyzes a single page image to extract layout and content."""
2730
  print(" Engine: Analyzing image...")