Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Community

rodrigomasini committed on 12 days ago

Commit

c33bb8f

verified ·

1 Parent(s): 8fccd9c

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +135 -42

mdr_pdf_parser.py CHANGED Viewed

@@ -2221,27 +2221,98 @@ class MDRLayoutReader:
         bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
         return bbox_list
-    def _apply_order(self, layouts: list[MDRLayoutElement], bbox_list: list[_MDR_ReaderBBox]) -> list[MDRLayoutElement]:
         layout_map = defaultdict(list)
-        [layout_map[b.layout_index].append(b) for b in bbox_list]
-        layout_orders = [(idx, self._median([b.order for b in bboxes])) for idx, bboxes in layout_map.items() if bboxes]
-        layout_orders.sort(key=lambda x: x[1])
-        sorted_layouts = [layouts[idx] for idx, _ in layout_orders]
         nfo = 0
-        for l in sorted_layouts:
-            frags = l.fragments
-            if not frags:
                 continue
-            frag_bboxes = [b for b in layout_map[layouts.index(l)] if not b.virtual]
-            if frag_bboxes:
-                idx_to_order = {b.fragment_index: b.order for b in frag_bboxes}
-                frags.sort(key=lambda f: idx_to_order.get(frags.index(f), float('inf')))
-            else:
-                frags.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
-            for frag in frags:
                 frag.order = nfo
                 nfo += 1
-        return sorted_layouts
     def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
         heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
@@ -2586,52 +2657,74 @@ class MDRExtractionEngine:
         print(f"MDR Extraction Engine initialized on device: {self._device}")
     # --- MODIFIED _get_yolo_model METHOD for HF ---
-    def _get_yolo_model(self) -> YOLOv10 | None:
-        """Loads the YOLOv10 layout detection model using hf_hub_download."""
-        if self._yolo is None and YOLOv10 is not None:
             repo_id = "hantian/yolo-doclaynet"
             filename = "yolov10b-doclaynet.pt"
-            # Use a subdirectory within the main model dir for YOLO cache via HF Hub
-            yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache"
-            mdr_ensure_directory(str(yolo_cache_dir))  # Ensure cache dir exists
             print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
             print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
             try:
-                # Download the model file using huggingface_hub, caching it
                 yolo_model_filepath = hf_hub_download(
                     repo_id=repo_id,
                     filename=filename,
-                    cache_dir=yolo_cache_dir,  # Cache within our designated structure
-                    local_files_only=False,  # Allow download
-                    force_download=False,  # Use cache if available
                 )
                 print(f"YOLO model file path: {yolo_model_filepath}")
-                # Load the model using the downloaded file path
-                self._yolo = YOLOv10(yolo_model_filepath)
-                print("MDR YOLOv10 model loaded successfully.")
-            # --- MODIFIED EXCEPTION HANDLING ---
-            except HfHubHTTPError as e:  # <-- CHANGED THIS LINE
-                print(
-                    f"ERROR: Failed to download/access YOLO model via Hugging Face Hub: {e}")  # Slightly updated message
                 self._yolo = None
-            except FileNotFoundError as e:  # Catch if hf_hub_download fails finding file OR YOLOv10 constructor fails
-                print(f"ERROR: YOLO model file not found or failed to load locally: {e}")  # Slightly updated message
                 self._yolo = None
-            except Exception as e:
-                # Keep the general exception catch, but make the message more specific
-                print(
-                    f"ERROR: An unexpected issue occurred loading YOLOv10 model from {yolo_cache_dir}/{filename}: {e}")
                 self._yolo = None
-        elif YOLOv10 is None:
-            print("MDR YOLOv10 class not available. Layout detection skipped.")
         return self._yolo
     def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
         """Analyzes a single page image to extract layout and content."""
         print("  Engine: Analyzing image...")

         bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
         return bbox_list
+        # In class MDRLayoutReader
+    def _apply_order(self, original_layouts_list: list[MDRLayoutElement],
+                     ordered_bbox_list_with_final_orders: list[_MDR_ReaderBBox]) -> list[MDRLayoutElement]:
+        # layout_map: maps original layout index to a list of its _MDR_ReaderBBox objects (which now have final .order)
         layout_map = defaultdict(list)
+        for bbox_item in ordered_bbox_list_with_final_orders:
+            layout_map[bbox_item.layout_index].append(bbox_item)
+        # Determine the new order of layouts themselves
+        # The .order in bbox_item here is the *within-layout* order for fragments/virtual boxes.
+        # We need the median of these *final reading orders* to sort the layouts.
+        # The .order attribute of _MDR_ReaderBBox should have been updated by mdr_parse_reader_logits.
+        layout_median_orders = []
+        for original_layout_idx, bboxes_for_this_layout in layout_map.items():
+            if bboxes_for_this_layout:  # Ensure there are bboxes
+                # Each bbox_item.order here is its final reading order determined by LayoutLM
+                median_order_for_layout = self._median([b.order for b in bboxes_for_this_layout])
+                layout_median_orders.append((original_layout_idx, median_order_for_layout))
+        layout_median_orders.sort(key=lambda x: x[1])  # Sort layouts by their median reading order
+        # Create the new list of sorted layouts
+        # Important: We are reordering the original_layouts_list.
+        # The fragment objects within these layouts are the ones we need to sort.
+        final_sorted_layouts = [original_layouts_list[idx] for idx, _ in layout_median_orders]
+        # Now, sort fragments within each layout
+        # nfo (next fragment order) is a global counter for the absolute order of fragments across all layouts
         nfo = 0
+        for layout_obj in final_sorted_layouts:
+            if not layout_obj.fragments:  # Skip layouts with no fragments
                 continue
+            # Get the _MDR_ReaderBBox items that correspond to this specific layout_obj
+            # We need the original index of layout_obj from the input `original_layouts_list`
+            # This assumes original_layouts_list has not been reordered yet by this function.
+            try:
+                # Find the original index of the current layout_obj
+                # This is safe if original_layouts_list is the list passed into this function
+                original_idx_of_current_layout = original_layouts_list.index(layout_obj)
+            except ValueError:
+                # This should not happen if layout_obj came from original_layouts_list via layout_median_orders
+                print(
+                    f"  ERROR: Could not find layout in original list during fragment sort. Skipping fragment sort for this layout.")
+                # Assign sequential order as a fallback for fragments in this layout
+                for i_frag, frag_in_layout in enumerate(layout_obj.fragments):
+                    frag_in_layout.order = nfo + i_frag
+                nfo += len(layout_obj.fragments)
+                continue
+            # Get the _MDR_ReaderBBox items for this layout, which contain the final .order for each fragment_index
+            reader_bboxes_for_this_layout = [
+                b for b in layout_map[original_idx_of_current_layout] if not b.virtual
+            ]
+            if reader_bboxes_for_this_layout:
+                # Create a map from original_fragment_index to its new_reading_order
+                frag_idx_to_new_order_map = {
+                    b.fragment_index: b.order for b in reader_bboxes_for_this_layout
+                }
+                # Sort the actual MDROcrFragment objects in layout_obj.fragments
+                # The key for sorting should use the original index of the fragment
+                # to look up its new_reading_order from the map.
+                # We assume layout_obj.fragments has not been reordered yet by this function for this layout.
+                # We need to sort a list of (fragment_object, original_index) tuples first.
+                fragments_with_original_indices = list(enumerate(layout_obj.fragments))
+                fragments_with_original_indices.sort(
+                    key=lambda item: frag_idx_to_new_order_map.get(item[0], float('inf'))  # item[0] is original index
+                )
+                # Reconstruct the sorted list of fragment objects
+                layout_obj.fragments = [item[1] for item in
+                                        fragments_with_original_indices]  # item[1] is fragment object
+            else:  # No corresponding reader_bboxes (e.g. layout was all virtual or had no frags initially)
+                # or if the layout was created as a fallback and has no reader_bboxes.
+                print(
+                    f"  LayoutReader ApplyOrder: No reader_bboxes for layout (orig_idx {original_idx_of_current_layout}). Sorting frags geometrically.")
+                layout_obj.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))  # Fallback geometric sort
+            # Assign the final absolute order (nfo)
+            for frag in layout_obj.fragments:
                 frag.order = nfo
                 nfo += 1
+        return final_sorted_layouts
     def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
         heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
         print(f"MDR Extraction Engine initialized on device: {self._device}")
     # --- MODIFIED _get_yolo_model METHOD for HF ---
+    # In class MDRExtractionEngine:
+    def _get_yolo_model(self) -> Any | None:  # Return type can be ultralytics.YOLO
+        """Loads the YOLOv10b-DocLayNet layout detection model."""
+        if self._yolo is None:
             repo_id = "hantian/yolo-doclaynet"
             filename = "yolov10b-doclaynet.pt"
+            yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
+            mdr_ensure_directory(str(yolo_cache_dir))
             print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
             print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
             try:
                 yolo_model_filepath = hf_hub_download(
                     repo_id=repo_id,
                     filename=filename,
+                    cache_dir=yolo_cache_dir,
+                    local_files_only=False,
+                    force_download=False,
                 )
                 print(f"YOLO model file path: {yolo_model_filepath}")
+                # --- START: MODIFIED LOADING ---
+                # Attempt to load directly using ultralytics.YOLO
+                try:
+                    from ultralytics import YOLO as UltralyticsYOLO
+                    self._yolo = UltralyticsYOLO(yolo_model_filepath)
+                    print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
+                except ImportError:
+                    print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
+                    print("Please install it: pip install ultralytics")
+                    self._yolo = None
+                    return None  # Critical failure
+                except Exception as e_ultra:  # Catch other ultralytics loading errors
+                    print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {e_ultra}")
+                    # If direct ultralytics fails, and your YOLOv10 wrapper exists, you could try it as a fallback,
+                    # but it's likely to fail if ultralytics.YOLO failed due to model structure.
+                    if YOLOv10 is not None:
+                        print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
+                        try:
+                            self._yolo = YOLOv10(yolo_model_filepath)
+                            print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
+                        except Exception as e_wrapper:
+                            print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {e_wrapper}")
+                            self._yolo = None
+                            return None
+                    else:
+                        self._yolo = None
+                        return None
+                # --- END: MODIFIED LOADING ---
+            except HfHubHTTPError as e:
+                print(f"ERROR: Failed to download/access YOLO model '{filename}' via Hugging Face Hub: {e}")
                 self._yolo = None
+            except FileNotFoundError as e:  # This might be redundant if hf_hub_download raises its own error
+                print(f"ERROR: YOLO model file '{filename}' not found after download attempt: {e}")
                 self._yolo = None
+            except Exception as e:  # General catch-all for unexpected issues during hf_hub_download or path ops
+                print(f"ERROR: An unexpected issue occurred related to YOLO model file handling for {filename}: {e}")
+                import traceback
+                traceback.print_exc()
                 self._yolo = None
         return self._yolo
     def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
         """Analyzes a single page image to extract layout and content."""
         print("  Engine: Analyzing image...")