Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Community

rodrigomasini committed on 30 days ago

Commit

763903d

verified ·

1 Parent(s): 0029256

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +84 -59

mdr_pdf_parser.py CHANGED Viewed

@@ -26,7 +26,6 @@ import fitz  # PyMuPDF
 from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
 import numpy as np
 import cv2  # OpenCV
-import torch  # PyTorch
 import requests  # For downloading models
 from pathlib import Path
 from enum import auto, Enum
@@ -46,7 +45,7 @@ from alphabet_detector import AlphabetDetector
 from munch import Munch
 from transformers import LayoutLMv3ForTokenClassification
 import onnxruntime
-from enum import auto, Enum
 # --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
 from huggingface_hub import hf_hub_download
 from huggingface_hub.errors import HfHubHTTPError
@@ -1396,91 +1395,109 @@ class _MDR_TextSystem:
         self.save_crop = getattr(args, 'save_crop_res', False)
         self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
-    # --- START: CORRECTED/ADDED __call__ METHOD ---
     def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
-        """
-        Processes an image to detect and recognize text.
-        Args:
-            img: A NumPy array representing the image (BGR format).
-        Returns:
-            A tuple containing:
-            - A list of detected text bounding boxes (each box is a NumPy array of 4 points).
-            - A list of recognition results (each result is a tuple of [text, confidence_score]).
-        """
-        ori_im = img.copy()  # Keep original for cropping
-        # 1. Detect text boxes using self.detector
-        # The detector's __call__ method handles its own preprocessing.
-        # dt_boxes are expected to be in original image coordinates.
-        dt_boxes: np.ndarray = self.detector(img)  # This is an np.ndarray of shape (N, 4, 2) or empty
         print(
             f"  DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
-        if dt_boxes is None or dt_boxes.size == 0:  # Check if array is empty
             return [], []
-        # 2. Sort boxes (typically top-to-bottom, left-to-right)
         dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
         print(f"  DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
-        if not dt_boxes_sorted:  # If sorting resulted in empty list (e.g. due to unexpected format)
             return [], []
-        # 3. Get cropped images from detected boxes
         img_crop_list: list[np.ndarray] = []
         for i in range(len(dt_boxes_sorted)):
-            # dt_boxes_sorted[i] is a single box (e.g., 4x2 array of points)
             crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
-            img_crop_list.append(crop_im)
-        print(f"  DEBUG TextSystem: Created {len(img_crop_list)} crops for further processing.")
-        # 4. (Optional) Classify text orientation and rotate crops if necessary
-        # The classifier's __call__ method handles its own preprocessing and modifies img_crop_list in place.
-        if self.use_cls and self.classifier is not None and img_crop_list:
             print(f"  DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
-            img_crop_list, cls_results = self.classifier(img_crop_list)  # classifier updates img_crop_list
             print(f"  DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
-        # 5. Recognize text in the (potentially rotated) cropped images
-        # The recognizer's __call__ method handles its own preprocessing.
         rec_results: list[tuple[str, float]] = []
-        if img_crop_list:
-            print(f"  DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
-            rec_results = self.recognizer(img_crop_list)
-        else:
-            print(f"  DEBUG TextSystem: No crops to recognize.")
-        # 6. Filter results
         final_boxes_to_return: list[np.ndarray] = []
         final_recs_to_return: list[tuple[str, float]] = []
         final_crops_for_saving: list[np.ndarray] = []
-        if rec_results and len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
             for i in range(len(rec_results)):
                 text, confidence = rec_results[i]
-                if confidence >= self.drop_score and text and not mdr_is_whitespace(text):
-                    final_boxes_to_return.append(dt_boxes_sorted[i])
-                    final_recs_to_return.append(rec_results[i])
-                    if self.save_crop:
-                        final_crops_for_saving.append(img_crop_list[i])
         else:
-            print(f"  DEBUG TextSystem: Warning - Mismatch or empty rec_results. "
-                  f"len(rec_results)={len(rec_results) if rec_results else 'None'}, "
                   f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
-                  f"len(img_crop_list)={len(img_crop_list)}. No results will be returned from this stage.")
-            # Do not return here, allow empty lists to propagate if that's the case
         print(f"  DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
-        # 7. (Optional) Save cropped images that passed all filters
         if self.save_crop and final_crops_for_saving:
             print(f"  DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
             self._save_crops(final_crops_for_saving, final_recs_to_return)
         return final_boxes_to_return, final_recs_to_return
-    # --- END: CORRECTED/ADDED __call__ METHOD ---
     def _sort_boxes(self, boxes):
         if boxes is None or len(boxes) == 0: return []
@@ -2437,14 +2454,22 @@ class MDRImageOptimizer:
          in fragments if (r := f.rect)]
     def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
-        if self._rot_ctx is None or self._adjust_points: return
-        if len(self._fragments) == len(self._rot_ctx.fragment_origin_rectangles): [setattr(f, 'rect', orig_r) for
-                                                                                   f, orig_r in zip(self._fragments,
-                                                                                                    self._rot_ctx.fragment_origin_rectangles)]
-        adj = self._rot_ctx.to_origin;
-        [setattr(l, 'rect',
-                 MDRRectangle(lt=adj.adjust(r.lt), rt=adj.adjust(r.rt), lb=adj.adjust(r.lb), rb=adj.adjust(r.rb))) for l
-         in layouts if (r := l.rect)]
 # --- MDR Image Clipping ---

 from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
 import numpy as np
 import cv2  # OpenCV
 import requests  # For downloading models
 from pathlib import Path
 from enum import auto, Enum
 from munch import Munch
 from transformers import LayoutLMv3ForTokenClassification
 import onnxruntime
 # --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
 from huggingface_hub import hf_hub_download
 from huggingface_hub.errors import HfHubHTTPError
         self.save_crop = getattr(args, 'save_crop_res', False)
         self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
+        # In class _MDR_TextSystem:
     def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
+        ori_im = img.copy()
+        dt_boxes: np.ndarray = self.detector(img)
         print(
             f"  DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
+        if dt_boxes is None or dt_boxes.size == 0:
             return [], []
         dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
         print(f"  DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
+        if not dt_boxes_sorted:
             return [], []
         img_crop_list: list[np.ndarray] = []
         for i in range(len(dt_boxes_sorted)):
             crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
+            # Ensure crop_im is not empty or too small before adding
+            if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
+                img_crop_list.append(crop_im)
+            else:
+                print(
+                    f"    DEBUG TextSystem: Crop {i} was None or too small, skipping. Original box: {dt_boxes_sorted[i]}")
+                # To maintain correspondence, we might need to handle this more carefully
+                # For now, this might lead to length mismatches if not all crops are valid.
+                # A better approach might be to filter dt_boxes_sorted alongside img_crop_list creation.
+        # Let's refine the filtering of boxes and creation of crops to ensure they always match
+        valid_boxes_for_cropping: list[np.ndarray] = []
+        img_crop_list_refined: list[np.ndarray] = []
+        for i, box_pts in enumerate(dt_boxes_sorted):
+            crop_im = mdr_get_rotated_crop(ori_im, box_pts)
+            if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:  # Min height/width for a crop
+                valid_boxes_for_cropping.append(box_pts)
+                img_crop_list_refined.append(crop_im)
+            else:
+                print(
+                    f"    DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
+        dt_boxes_sorted = valid_boxes_for_cropping  # Update dt_boxes_sorted to only include those that yielded valid crops
+        img_crop_list = img_crop_list_refined  # Use the refined list of crops
+        print(f"  DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
+        if not img_crop_list:  # If no valid crops were made
+            print("  DEBUG TextSystem: No valid crops generated. Returning empty.")
+            return [], []
+        if self.use_cls and self.classifier is not None:  # No need to check img_crop_list again, it's guaranteed non-empty here
             print(f"  DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
+            img_crop_list, cls_results = self.classifier(img_crop_list)
             print(f"  DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
         rec_results: list[tuple[str, float]] = []
+        # No need to check img_crop_list again
+        print(f"  DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
+        rec_results = self.recognizer(img_crop_list)
+        print(f"  DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
         final_boxes_to_return: list[np.ndarray] = []
         final_recs_to_return: list[tuple[str, float]] = []
         final_crops_for_saving: list[np.ndarray] = []
+        # Ensure lengths match before iterating. This is crucial.
+        if len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
+            print(
+                f"  DEBUG TextSystem: Filtering {len(rec_results)} recognition results with drop_score: {self.drop_score}")
             for i in range(len(rec_results)):
                 text, confidence = rec_results[i]
+                # Log each result before filtering
+                print(f"    DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
+                if confidence >= self.drop_score:
+                    if text and not mdr_is_whitespace(text):
+                        final_boxes_to_return.append(dt_boxes_sorted[i])
+                        final_recs_to_return.append(rec_results[i])
+                        if self.save_crop:
+                            final_crops_for_saving.append(img_crop_list[i])
+                    else:
+                        print(f"      DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
+                else:
+                    print(
+                        f"      DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
         else:
+            print(f"  DEBUG TextSystem: CRITICAL MISMATCH in lengths after recognition! "
+                  f"len(rec_results)={len(rec_results)}, "
                   f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
+                  f"len(img_crop_list)={len(img_crop_list)}. "
+                  f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
+            # Return empty if critical mismatch, as indexing will fail or be incorrect.
+            return [], []
         print(f"  DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
         if self.save_crop and final_crops_for_saving:
             print(f"  DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
             self._save_crops(final_crops_for_saving, final_recs_to_return)
         return final_boxes_to_return, final_recs_to_return
     def _sort_boxes(self, boxes):
         if boxes is None or len(boxes) == 0: return []
          in fragments if (r := f.rect)]
     def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
+        if self._rot_ctx is None:  # If no rotation context, nothing to do
+            return
+        if not self._adjust_points:  # If we are NOT adjusting points back to original,
+            # then restore original fragment rectangles
+            if len(self._fragments) == len(self._rot_ctx.fragment_origin_rectangles):
+                for f, orig_r in zip(self._fragments, self._rot_ctx.fragment_origin_rectangles):
+                    f.rect = orig_r
+            # And adjust layout rectangles to origin coordinates
+            adj = self._rot_ctx.to_origin
+            for l in layouts:
+                if (r := l.rect):  # Check if rect exists
+                    l.rect = MDRRectangle(lt=adj.adjust(r.lt), rt=adj.adjust(r.rt), lb=adj.adjust(r.lb),
+                                          rb=adj.adjust(r.rb))
+        # If self._adjust_points is True, the coordinates (already adjusted to the rotated image) are kept as is.
+        # No further action is needed for the True case here, as the adjustments happened in receive_fragments.
 # --- MDR Image Clipping ---