Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed on May 30

Commit

0029256

verified ·

1 Parent(s): f6abc87

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +40 -10

mdr_pdf_parser.py CHANGED Viewed

@@ -1013,7 +1013,8 @@ class _MDR_TextDetector(_MDR_PredictBase):
             new_boxes.append(box)
         return np.array(new_boxes)
-    # In class _MDR_TextDetector:
     def __call__(self, img):
         ori_im = img.copy()
         data = {"image": img}
@@ -1026,14 +1027,14 @@ class _MDR_TextDetector(_MDR_PredictBase):
             print(f"  DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
             import traceback
             traceback.print_exc()
-            return np.array([])  # Return empty array on failure
         if data is None:
             print(
                 "  DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
             return np.array([])
-        processed_img, shape_list = data  # shape_list is [src_h, src_w, ratio_h, ratio_w]
         if processed_img is None:
             print("  DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
             return np.array([])
@@ -1052,7 +1053,7 @@ class _MDR_TextDetector(_MDR_PredictBase):
             print(f"  DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
             import traceback
             traceback.print_exc()
-            return np.array([])  # Return empty array on failure
         print(f"  DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
         preds = {"maps": outputs[0]}
@@ -1064,17 +1065,46 @@ class _MDR_TextDetector(_MDR_PredictBase):
             traceback.print_exc()
             return np.array([])
-        if not post_res or not post_res[0].get('points'):
-            print("  DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no points.")
             return np.array([])
-        boxes_from_post = post_res[0]['points']
         print(
             f"  DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
-        if not isinstance(boxes_from_post, (list, np.ndarray)) or len(
-                boxes_from_post) == 0:  # Check if it's empty or not list-like
-            print("  DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter.")
             return np.array([])
         if self.args.det_box_type == 'poly':

             new_boxes.append(box)
         return np.array(new_boxes)
+        # In class _MDR_TextDetector:
     def __call__(self, img):
         ori_im = img.copy()
         data = {"image": img}
             print(f"  DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
             import traceback
             traceback.print_exc()
+            return np.array([])
         if data is None:
             print(
                 "  DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
             return np.array([])
+        processed_img, shape_list = data
         if processed_img is None:
             print("  DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
             return np.array([])
             print(f"  DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
             import traceback
             traceback.print_exc()
+            return np.array([])
         print(f"  DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
         preds = {"maps": outputs[0]}
             traceback.print_exc()
             return np.array([])
+        # --- START: REFINED CHECK ---
+        # 1. Check if post_res itself is valid and contains the expected structure.
+        if not post_res or not isinstance(post_res, list) or len(post_res) == 0 or \
+                not isinstance(post_res[0], dict) or 'points' not in post_res[0]:
+            print("  DEBUG OCR: _MDR_TextDetector: DBPostProcess returned invalid or empty structure for points.")
+            return np.array([])
+        boxes_from_post = post_res[0]['points']  # This is expected to be a np.ndarray or a list of boxes
+        # 2. Check if boxes_from_post is actually empty.
+        #    For a NumPy array, check its size. For a list, check if it's empty.
+        no_boxes_found = False
+        if isinstance(boxes_from_post, np.ndarray):
+            if boxes_from_post.size == 0:
+                no_boxes_found = True
+        elif isinstance(boxes_from_post, list):
+            if not boxes_from_post:  # Empty list
+                no_boxes_found = True
+        elif boxes_from_post is None:  # Explicitly check for None
+            no_boxes_found = True
+        else:
+            # Should not happen if _MDR_DBPostProcess behaves as expected, but good to log
+            print(
+                f"  DEBUG OCR: _MDR_TextDetector: 'points' from DBPostProcess is of unexpected type: {type(boxes_from_post)}")
+            return np.array([])
+        if no_boxes_found:
+            print("  DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no actual point data.")
             return np.array([])
+        # --- END: REFINED CHECK ---
         print(
             f"  DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
+        # The following check might be redundant now but can be kept for extra safety
+        # or if boxes_from_post could be other types not handled above.
+        if not isinstance(boxes_from_post, (list, np.ndarray)) or \
+                (isinstance(boxes_from_post, np.ndarray) and boxes_from_post.size == 0) or \
+                (isinstance(boxes_from_post, list) and not boxes_from_post):
+            print("  DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter (secondary check).")
             return np.array([])
         if self.args.det_box_type == 'poly':