Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed on May 29

Commit

8d56101

verified ·

1 Parent(s): d8adf3f

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +133 -37

mdr_pdf_parser.py CHANGED Viewed

@@ -1159,42 +1159,98 @@ class _MDR_TextSystem:
     try: return list(sorted(boxes, key=key))
     except: return list(boxes) # Fallback
-  def __call__(self, img, classify=True):
     ori_im = img.copy()
-    boxes = self.detector(img)
     if boxes is None or len(boxes) == 0:
-        return [], []
-    boxes = self._sort_boxes(boxes)
     crops = []
-    for b in boxes:
         try:
-            crops.append(mdr_get_rotated_crop(ori_im, b)) # Use renamed util
-        except:
             crops.append(None)
-    valid_idxs = [i for i, c in enumerate(crops) if c is not None]
     if not valid_idxs:
         return [], []
-    crops = [crops[i] for i in valid_idxs]
-    boxes = [boxes[i] for i in valid_idxs]
     if self.use_cls and self.classifier and classify:
         try:
-            crops, _ = self.classifier(crops) # Ignore cls results, just use rotated crops
-        except Exception as e:
-            print(f"Classifier error: {e}")
     try:
-        rec_res = self.recognizer(crops)
-    except Exception as e:
-        print(f"Recognizer error: {e}")
-        return boxes, [["", 0.0]] * len(boxes)
-    final_boxes, final_rec = [], []
-    for box, res in zip(boxes, rec_res):
-        txt, score = res
-        if score >= self.drop_score:
             final_boxes.append(box)
-            final_rec.append(res)
-    if self.save_crop:
-        self._save_crops(crops, rec_res)
-    return final_boxes, final_rec
   def _save_crops(self, crops, recs):
       mdr_ensure_directory(self.crop_dir)
@@ -1511,18 +1567,58 @@ class MDROcrEngine:
   def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
     """Finds and recognizes text fragments in a NumPy image (BGR)."""
     system = self._get_system()
-    if system is None: print("MDR OCR System unavailable."); return
-    img = self._preprocess(image_np)
-    try: boxes, recs = system(img)
-    except Exception as e: print(f"MDR OCR prediction error: {e}"); return
-    if not boxes or not recs: return
-    for box_pts, (txt, conf) in zip(boxes, recs):
-      if not txt or mdr_is_whitespace(txt) or conf < 0.1: continue
-      pts = [(float(p[0]), float(p[1])) for p in box_pts]
-      if len(pts) == 4:
-          r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
-          if r.is_valid and r.area > 1:
-              yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
   def _preprocess(self, img: np.ndarray) -> np.ndarray:
     if len(img.shape) == 3 and img.shape[2] == 4:

     try: return list(sorted(boxes, key=key))
     except: return list(boxes) # Fallback
+  def __call__(self, img, classify=True): # classify is True by default
     ori_im = img.copy()
+    print(f"  DEBUG OCR SYS: _MDR_TextSystem called. Original image shape: {ori_im.shape}") # DEBUG
+    boxes = self.detector(img) # This is _MDR_TextDetector
     if boxes is None or len(boxes) == 0:
+        print("  DEBUG OCR SYS: Detector returned no boxes. Returning empty fragments.") # DEBUG
+        return [], [] # This is what currently leads to "0 fragments found" if detector fails
+    print(f"  DEBUG OCR SYS: Detector returned {len(boxes)} boxes. Proceeding to crop and recognize.") # DEBUG
+    boxes = self._sort_boxes(boxes) # Sorting happens here
     crops = []
+    for i, b in enumerate(boxes):
         try:
+            crop_img = mdr_get_rotated_crop(ori_im, b)
+            if crop_img is None:
+                print(f"    DEBUG OCR SYS: Crop {i+1}/{len(boxes)} is None.") # DEBUG
+                crops.append(None)
+            elif crop_img.shape[0] == 0 or crop_img.shape[1] == 0:
+                print(f"    DEBUG OCR SYS: Crop {i+1}/{len(boxes)} has zero dimension: {crop_img.shape}") # DEBUG
+                crops.append(None)
+            else:
+                crops.append(crop_img)
+                # Optionally save these crops for manual inspection:
+                # if self.save_crop: cv2.imwrite(os.path.join(self.crop_dir, f"debug_crop_before_cls_{self.crop_idx + i}.png"), crop_img)
+        except Exception as e_crop:
+            print(f"    DEBUG OCR SYS: Error cropping box {i+1}/{len(boxes)}: {e_crop}") # DEBUG
             crops.append(None)
+    valid_idxs = [i for i, c in enumerate(crops) if c is not None and c.shape[0] > 0 and c.shape[1] > 0]
     if not valid_idxs:
+        print("  DEBUG OCR SYS: No valid crops obtained after attempting to crop detected boxes. Returning empty fragments.") # DEBUG
         return [], []
+    # Filter crops and corresponding boxes
+    valid_crops = [crops[i] for i in valid_idxs]
+    boxes_for_valid_crops = [boxes[i] for i in valid_idxs]
+    print(f"  DEBUG OCR SYS: Number of valid crops to process: {len(valid_crops)}") # DEBUG
     if self.use_cls and self.classifier and classify:
+        print(f"  DEBUG OCR SYS: Applying classifier to {len(valid_crops)} crops...") # DEBUG
         try:
+            # The classifier might modify valid_crops in-place (e.g., rotate them)
+            classified_crops, cls_results = self.classifier(valid_crops) # classifier returns list, results
+            print(f"    DEBUG OCR SYS: Classifier results count: {len(cls_results)}. First few: {cls_results[:3]}") # DEBUG
+            valid_crops = classified_crops # Update with potentially rotated crops
+        except Exception as e_cls:
+            print(f"    DEBUG OCR SYS: Classifier error: {e_cls}. Using unclassified crops.") # DEBUG
+            # Continue with unclassified (but valid) crops
+    print(f"  DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
     try:
+        rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
+        print(f"    DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
+    except Exception as e_rec:
+        print(f"    DEBUG OCR SYS: Recognizer error: {e_rec}. Returning empty fragments.") # DEBUG
+        return [], [] # If recognizer fails, we can't proceed
+    final_boxes, final_rec_tuples = [], [] # Changed final_rec to final_rec_tuples
+    if len(boxes_for_valid_crops) != len(rec_res):
+        print(f"  DEBUG OCR SYS: Mismatch! Boxes count {len(boxes_for_valid_crops)} != Recognizer results count {len(rec_res)}. This should not happen.")
+        # Handle this gracefully, perhaps by taking the minimum length
+        min_len = min(len(boxes_for_valid_crops), len(rec_res))
+        boxes_to_iterate = boxes_for_valid_crops[:min_len]
+        rec_res_to_iterate = rec_res[:min_len]
+    else:
+        boxes_to_iterate = boxes_for_valid_crops
+        rec_res_to_iterate = rec_res
+    print(f"  DEBUG OCR SYS: Filtering {len(rec_res_to_iterate)} recognition results with drop_score: {self.drop_score}") # DEBUG
+    for i, (box, res_tuple) in enumerate(zip(boxes_to_iterate, rec_res_to_iterate)):
+        txt, score = res_tuple
+        print(f"    DEBUG OCR SYS: Box {i+1} - Recognized: '{txt}', Score: {score:.4f}") # DEBUG
+        if score >= self.drop_score and txt and not mdr_is_whitespace(txt): # Added check for non-empty/whitespace
             final_boxes.append(box)
+            final_rec_tuples.append(res_tuple)
+        else:
+            reason = []
+            if score < self.drop_score: reason.append(f"score {score:.2f} < {self.drop_score}")
+            if not txt: reason.append("empty text")
+            if txt and mdr_is_whitespace(txt): reason.append("whitespace text")
+            print(f"      DEBUG OCR SYS: Box {i+1} DROPPED. Reason(s): {', '.join(reason)}") # DEBUG
+    if self.save_crop: # This is false by default in _MDR_ONNXParams
+        # Ensure crop_dir exists if you enable this
+        # self._save_crops(valid_crops, rec_res) # Pass original rec_res to save all attempts if needed
+        pass
+    print(f"  DEBUG OCR SYS: Returning {len(final_boxes)} final boxes and {len(final_rec_tuples)} final recognition results.") # DEBUG
+    return final_boxes, final_rec_tuples # Ensure this returns tuples of (text, score)
   def _save_crops(self, crops, recs):
       mdr_ensure_directory(self.crop_dir)
   def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
     """Finds and recognizes text fragments in a NumPy image (BGR)."""
     system = self._get_system()
+    if system is None:
+        print("  DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.") # DEBUG
+        return # Empty generator
+    img_for_system = self._preprocess(image_np) # _preprocess handles BGR/BGRA/GRAY to BGR
+    print(f"  DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}") # DEBUG
+    try:
+        # system.__call__ should return (list_of_boxes, list_of_tuples_text_score)
+        boxes, recs = system(img_for_system) # recs should be list of (text, score)
+    except Exception as e:
+        print(f"  DEBUG OCR Engine: Error during TextSystem prediction: {e}") # DEBUG
+        import traceback
+        traceback.print_exc()
+        return # Empty generator
+    if not boxes or not recs:
+        print(f"  DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes)}) or no recs ({len(recs)}). No fragments generated.") # DEBUG
+        return # Empty generator
+    if len(boxes) != len(recs):
+        print(f"  DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic.")
+        # Potentially try to recover by taking the minimum length, or just return
+        return
+    print(f"  DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.") # DEBUG
+    fragments_generated_count = 0
+    for i, (box_pts, rec_tuple) in enumerate(zip(boxes, recs)):
+        if not isinstance(rec_tuple, (list, tuple)) or len(rec_tuple) != 2:
+            print(f"    DEBUG OCR Engine: Rec item {i} is not a valid (text, score) tuple: {rec_tuple}. Skipping.")
+            continue
+        txt, conf = rec_tuple
+        # The filtering by drop_score and whitespace should have happened in _MDR_TextSystem
+        # But we can add a redundant check or rely on it.
+        # For MDROcrFragment, we just need valid text and geometry.
+        if not txt or mdr_is_whitespace(txt): # Basic check, though system should filter
+            # print(f"    DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.")
+            continue
+        pts = [(float(p[0]), float(p[1])) for p in box_pts]
+        if len(pts) == 4:
+            r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
+            if r.is_valid and r.area > 1: # Ensure valid geometry
+                yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
+                fragments_generated_count += 1
+            # else:
+                # print(f"    DEBUG OCR Engine: Fragment {i} has invalid/small rectangle. Area: {r.area}. Valid: {r.is_valid}. Skipping.")
+        # else:
+            # print(f"    DEBUG OCR Engine: Fragment {i} box_pts not length 4: {len(pts)}. Skipping.")
+    print(f"  DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.") # DEBUG
   def _preprocess(self, img: np.ndarray) -> np.ndarray:
     if len(img.shape) == 3 and img.shape[2] == 4: