Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Community

rodrigomasini committed on May 30

Commit

1725894

verified ·

1 Parent(s): 31b4b7f

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +205 -99

mdr_pdf_parser.py CHANGED Viewed

@@ -946,10 +946,9 @@ class _MDR_DBPostProcess:
                 f"  DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
             if self.box_t == 'poly':
-                boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dw_orig, dh_orig)
             elif self.box_t == 'quad':
-                boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dw_orig,
-                                                        dh_orig)  # Pass original dimensions
             else:
                 raise ValueError("box_type must be 'quad' or 'poly'")
             print(
@@ -1299,17 +1298,37 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
         print(
             f"    DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
-        if h_orig == 0 or w_orig == 0:
             print(
-                f"    DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
             return np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        r_current = w_orig / float(h_orig)
         tw = min(imgW, int(ceil(imgH * r_current)))
-        tw = max(1, tw)
         print(f"    DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
         try:
             resized = cv2.resize(img, (tw, imgH))
         except cv2.error as e_resize:  # Catch specific cv2 error
             print(
@@ -1322,25 +1341,36 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
             traceback.print_exc()
             return np.zeros((imgC, imgH, imgW), dtype=np.float32)
         resized = resized.astype("float32")
-        if imgC == 1 and len(resized.shape) == 3:
-            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
-        if len(resized.shape) == 2:
-            resized = resized[:, :, np.newaxis]  # Add channel dim if grayscale
-        # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
         if imgC == 3 and resized.shape[2] == 1:
             resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
         resized = resized.transpose((2, 0, 1)) / 255.0
         resized -= 0.5
         resized /= 0.5
         padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding[:, :, 0:tw] = resized
-        print(f"    DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
-        # ADDED: Log normalized crop properties
         min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
         print(f"    DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
               f"dtype: {padding.dtype}, "
@@ -1349,7 +1379,7 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
               f"MeanPx: {mean_px:.4f}")
         if np.all(padding == 0):
             print("    DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
-        elif np.abs(max_px - min_px) < 1e-6:  # Check if all elements are (close to) the same
             print(f"    DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
         return padding
@@ -1400,8 +1430,6 @@ class _MDR_TextSystem:
         self.save_crop = getattr(args, 'save_crop_res', False)
         self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
-        # In class _MDR_TextSystem:
     def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
         ori_im = img.copy()
@@ -1416,34 +1444,22 @@ class _MDR_TextSystem:
         if not dt_boxes_sorted:
             return [], []
-        img_crop_list: list[np.ndarray] = []
-        for i in range(len(dt_boxes_sorted)):
-            crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
-            # Ensure crop_im is not empty or too small before adding
-            if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
-                img_crop_list.append(crop_im)
-            else:
-                print(
-                    f"    DEBUG TextSystem: Crop {i} was None or too small, skipping. Original box: {dt_boxes_sorted[i]}")
-                # To maintain correspondence, we might need to handle this more carefully
-                # For now, this might lead to length mismatches if not all crops are valid.
-                # A better approach might be to filter dt_boxes_sorted alongside img_crop_list creation.
-        # Let's refine the filtering of boxes and creation of crops to ensure they always match
         valid_boxes_for_cropping: list[np.ndarray] = []
-        img_crop_list_refined: list[np.ndarray] = []
         for i, box_pts in enumerate(dt_boxes_sorted):
             crop_im = mdr_get_rotated_crop(ori_im, box_pts)
             if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:  # Min height/width for a crop
                 valid_boxes_for_cropping.append(box_pts)
-                img_crop_list_refined.append(crop_im)
             else:
                 print(
                     f"    DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
         dt_boxes_sorted = valid_boxes_for_cropping  # Update dt_boxes_sorted to only include those that yielded valid crops
-        img_crop_list = img_crop_list_refined  # Use the refined list of crops
         print(f"  DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
@@ -1451,50 +1467,76 @@ class _MDR_TextSystem:
             print("  DEBUG TextSystem: No valid crops generated. Returning empty.")
             return [], []
-        if self.use_cls and self.classifier is not None:  # No need to check img_crop_list again, it's guaranteed non-empty here
             print(f"  DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
-            img_crop_list, cls_results = self.classifier(img_crop_list)
             print(f"  DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
         rec_results: list[tuple[str, float]] = []
-        # No need to check img_crop_list again
         print(f"  DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
         rec_results = self.recognizer(img_crop_list)
         print(f"  DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
         final_boxes_to_return: list[np.ndarray] = []
         final_recs_to_return: list[tuple[str, float]] = []
         final_crops_for_saving: list[np.ndarray] = []
-        # Ensure lengths match before iterating. This is crucial.
-        if len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
-            print(
-                f"  DEBUG TextSystem: Filtering {len(rec_results)} recognition results with drop_score: {self.drop_score}")
-            for i in range(len(rec_results)):
-                text, confidence = rec_results[i]
-                # Log each result before filtering
-                print(f"    DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
-                if confidence >= self.drop_score:
-                    if text and not mdr_is_whitespace(text):
-                        final_boxes_to_return.append(dt_boxes_sorted[i])
-                        final_recs_to_return.append(rec_results[i])
-                        if self.save_crop:
-                            final_crops_for_saving.append(img_crop_list[i])
-                    else:
-                        print(f"      DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
                 else:
-                    print(
-                        f"      DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
-        else:
-            print(f"  DEBUG TextSystem: CRITICAL MISMATCH in lengths after recognition! "
-                  f"len(rec_results)={len(rec_results)}, "
-                  f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
-                  f"len(img_crop_list)={len(img_crop_list)}. "
-                  f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
-            # Return empty if critical mismatch, as indexing will fail or be incorrect.
-            return [], []
         print(f"  DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
@@ -1710,36 +1752,67 @@ _MDR_CORRECTION_MIN_OVERLAP = 0.5
 def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
     if not layout.fragments:
         return
     try:
         x1, y1, x2, y2 = layout.rect.wrapper
         margin = 5
-        crop_box = (max(0, round(x1) - margin), max(0, round(y1) - margin), min(source_img.width, round(x2) + margin),
-                    min(source_img.height, round(y2) + margin))
-        if crop_box[0] >= crop_box[2] or crop_box[1] >= crop_box[3]:
             return
-        cropped = source_img.crop(crop_box)
-        off_x, off_y = crop_box[0], crop_box[1]
     except Exception as e:
-        print(f"Correct: Crop error: {e}")
         return
     try:
-        cropped_np = np.array(cropped.convert("RGB"))[:, :, ::-1]
         new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
     except Exception as e:
-        print(f"Correct: OCR error: {e}")
-        return
     new_frags_global = []
     for f in new_frags_local:
         r = f.rect
         lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
         f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
                               lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
         new_frags_global.append(f)
-    orig_frags = layout.fragments
     matched, unmatched_orig = [], []
     used_new = set()
     for i, orig_f in enumerate(orig_frags):
         best_j, best_rate = -1, -1.0
         try:
@@ -1771,13 +1844,16 @@ def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image,
             used_new.add(best_j)
         else:
             unmatched_orig.append(orig_f)
     unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
     final = [n if n.rank >= o.rank else o for o, n in matched]
     final.extend(unmatched_orig)
     final.extend(unmatched_new)
-    layout.fragments = final
-    layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
 # --- MDR OCR Engine ---
@@ -1860,6 +1936,7 @@ class MDROcrEngine:
                 # much lower thresholds so we actually get some candidate masks:
                 det_db_thresh=0.15,
                 det_db_box_thresh=0.15,
                 drop_score=0.01,
                 use_angle_cls=False,
             )
@@ -2142,16 +2219,11 @@ class MDRLayoutReader:
             return layouts
         print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
-        # --- START: ADDED SCALING LOGIC ---
         scaled_bboxes: list[list[int]] = []
-        if w > 0 and h > 0:  # Ensure valid width and height for division
             for bbox_item in bbox_list:
-                # bbox_item.value is (x0, y0, x1, y1) in original image coordinates
                 x0, y0, x1, y1 = bbox_item.value
-                # Scale to 0-1000 range based on image width (w) and height (h)
-                # Ensure coordinates are within [0, 1000] and x1>=x0, y1>=y0
-                # Clamp values to image boundaries before scaling to prevent negative scaled values if original box is outside
                 x0_c = max(0.0, min(x0, float(w)))
                 y0_c = max(0.0, min(y0, float(h)))
                 x1_c = max(0.0, min(x1, float(w)))
@@ -2159,56 +2231,90 @@ class MDRLayoutReader:
                 scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
                 scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
-                scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w)))  # Ensure x1 >= x0
-                scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h)))  # Ensure y1 >= y0
                 scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
         else:
             print(
                 "MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
             layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
             return layouts
-        # --- END: ADDED SCALING LOGIC ---
-        if not scaled_bboxes:  # If scaling resulted in no bboxes (e.g. w/h was 0)
             print(
                 "MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
             layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
             return layouts
         orders: list[int] = []
         try:
             with torch.no_grad():
                 print("MDRLayoutReader: Creating reader inputs...")
-                inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)  # Use the newly created scaled_bboxes
                 print("MDRLayoutReader: Preparing inputs for model device...")
                 inputs = mdr_prepare_reader_inputs(inputs, model)
                 print("MDRLayoutReader: Running model inference...")
                 logits = model(**inputs).logits.cpu().squeeze(0)
                 print("MDRLayoutReader: Model inference complete. Parsing logits...")
-                # length is based on original bbox_list (which should match scaled_bboxes length)
-                orders = mdr_parse_reader_logits(logits, len(bbox_list))
                 print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
-                # Assign the determined orders back to the bbox_list items
                 if len(orders) == len(bbox_list):
                     for i, order_val in enumerate(orders):
                         bbox_list[i].order = order_val
                 else:
                     print(
-                        f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}). Order assignment might be incorrect. Using sequential order.")
-                    for i in range(len(bbox_list)):  # Fallback to sequential order
                         bbox_list[i].order = i
         except Exception as e:
             print(f"MDR LayoutReader prediction error: {e}")
             import traceback
             traceback.print_exc()
-            # Fallback: assign sequential order to bbox_list items before geometric sort of layouts
             for i in range(len(bbox_list)):
                 bbox_list[i].order = i
-            # Then apply this sequential order (which effectively becomes a geometric sort)
             print("MDRLayoutReader: Applying fallback sequential order due to error...")
             result_layouts = self._apply_order(layouts, bbox_list)
-            return result_layouts  # Return here after applying fallback order
         print("MDRLayoutReader: Applying order...")
         result_layouts = self._apply_order(layouts, bbox_list)

                 f"  DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
             if self.box_t == 'poly':
+                boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
             elif self.box_t == 'quad':
+                boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
             else:
                 raise ValueError("box_type must be 'quad' or 'poly'")
             print(
         print(
             f"    DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
+        # --- START OF FIX ---
+        MIN_DIM_FOR_RESIZE = 2  # Minimum original height or width to attempt resize
+        if h_orig < MIN_DIM_FOR_RESIZE or w_orig < MIN_DIM_FOR_RESIZE:
             print(
+                f"    DEBUG RECOGNIZER: _resize_norm received degenerate crop ({h_orig}x{w_orig}) with dimension < {MIN_DIM_FOR_RESIZE}. Returning zeros before resize attempt.")
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        # --- END OF FIX ---
+        # Original check for h_orig == 0 or w_orig == 0 is now covered by the above,
+        # but can be kept for explicitness or if MIN_DIM_FOR_RESIZE is set to 1.
+        # If MIN_DIM_FOR_RESIZE is 1, the original check is still useful.
+        # If MIN_DIM_FOR_RESIZE is > 1, this specific check becomes redundant.
+        # Let's keep it for safety if MIN_DIM_FOR_RESIZE is changed.
+        if h_orig == 0 or w_orig == 0:  # This check is technically redundant if MIN_DIM_FOR_RESIZE >= 1
+            print(
+                f"    DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}) (secondary check). Returning zeros.")
             return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        r_current = w_orig / float(h_orig)  # h_orig is guaranteed > 0 here if MIN_DIM_FOR_RESIZE >=1
         tw = min(imgW, int(ceil(imgH * r_current)))
+        tw = max(1, tw)  # Ensure target width is at least 1
+        # Ensure target height (imgH) is also valid (it comes from self.shape, so should be)
         print(f"    DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
         try:
+            # Ensure target dimensions for resize are valid
+            if tw <= 0 or imgH <= 0:
+                print(
+                    f"    DEBUG RECOGNIZER: _resize_norm calculated invalid target resize dimensions (tw: {tw}, imgH: {imgH}). Returning zeros.")
+                return np.zeros((imgC, imgH, imgW), dtype=np.float32)
             resized = cv2.resize(img, (tw, imgH))
         except cv2.error as e_resize:  # Catch specific cv2 error
             print(
             traceback.print_exc()
             return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        # ... rest of the normalization code ...
         resized = resized.astype("float32")
+        if imgC == 1 and len(resized.shape) == 3:  # If target is 1 channel and resized is 3
+            if resized.shape[2] == 3:  # Check if it actually has 3 channels
+                resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+        if len(resized.shape) == 2:  # If grayscale after potential conversion
+            resized = resized[:, :, np.newaxis]  # Add channel dim
+        # Ensure resized has 3 channels if imgC is 3, even if input was grayscale or became grayscale
         if imgC == 3 and resized.shape[2] == 1:
             resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
+        # Final check on channel consistency
+        if resized.shape[2] != imgC:
+            print(
+                f"    DEBUG RECOGNIZER: Channel mismatch after processing. Expected {imgC}, got {resized.shape[2]}. Crop shape ({h_orig},{w_orig}). Returning zeros.")
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
         resized = resized.transpose((2, 0, 1)) / 255.0
         resized -= 0.5
         resized /= 0.5
         padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        # Ensure tw is not out of bounds for padding
+        actual_padded_width = min(tw, imgW)
+        padding[:, :, 0:actual_padded_width] = resized[:, :, 0:actual_padded_width]
+        print(f"    DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
+        # ... rest of the logging ...
         min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
         print(f"    DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
               f"dtype: {padding.dtype}, "
               f"MeanPx: {mean_px:.4f}")
         if np.all(padding == 0):
             print("    DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
+        elif np.abs(max_px - min_px) < 1e-6:
             print(f"    DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
         return padding
         self.save_crop = getattr(args, 'save_crop_res', False)
         self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
     def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
         ori_im = img.copy()
         if not dt_boxes_sorted:
             return [], []
+        # --- Stage 1 Fix: Refined filtering of boxes and creation of crops ---
+        # Ensure dt_boxes_sorted and img_crop_list are synchronized.
         valid_boxes_for_cropping: list[np.ndarray] = []
+        img_crop_list: list[np.ndarray] = []  # Initialize img_crop_list here
         for i, box_pts in enumerate(dt_boxes_sorted):
             crop_im = mdr_get_rotated_crop(ori_im, box_pts)
             if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:  # Min height/width for a crop
                 valid_boxes_for_cropping.append(box_pts)
+                img_crop_list.append(crop_im)  # Directly populate the final img_crop_list
             else:
                 print(
                     f"    DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
         dt_boxes_sorted = valid_boxes_for_cropping  # Update dt_boxes_sorted to only include those that yielded valid crops
+        # img_crop_list is now the correctly filtered list of crops, synchronized with dt_boxes_sorted.
+        # --- End of Stage 1 Fix ---
         print(f"  DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
             print("  DEBUG TextSystem: No valid crops generated. Returning empty.")
             return [], []
+        if self.use_cls and self.classifier is not None:
             print(f"  DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
+            img_crop_list, cls_results = self.classifier(
+                img_crop_list)  # classifier might modify img_crop_list (e.g., rotate)
             print(f"  DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
         rec_results: list[tuple[str, float]] = []
         print(f"  DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
         rec_results = self.recognizer(img_crop_list)
         print(f"  DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
+        # --- Start of Stage 2 Fix: Robust handling of rec_results length ---
+        expected_count = len(dt_boxes_sorted)  # This is synchronized with len(img_crop_list) before recognizer
+        # and should still match len(img_crop_list) after classifier
+        # if classifier preserves length.
+        actual_rec_count = len(rec_results)
+        num_to_process = 0
+        if actual_rec_count == expected_count:
+            num_to_process = actual_rec_count
+        else:
+            print(f"  DEBUG TextSystem: WARNING - Mismatch in lengths after recognition! "
+                  f"Expected (from boxes/crops): {expected_count}, "
+                  f"Recognizer returned: {actual_rec_count} results. ")
+            num_to_process = min(actual_rec_count, expected_count)
+            if num_to_process < expected_count:
+                print(
+                    f"  DEBUG TextSystem: Will process {num_to_process} items due to mismatch. Some data might be lost if recognizer dropped results or if there was an issue in earlier stages not caught.")
+            elif num_to_process < actual_rec_count:  # Recognizer returned more than expected
+                print(
+                    f"  DEBUG TextSystem: Will process {num_to_process} items. Recognizer returned more results ({actual_rec_count}) than expected crops ({expected_count}). Extra recognition results will be ignored.")
+        if num_to_process == 0:
+            if expected_count > 0:  # If there were boxes/crops but no rec results to process
+                print(
+                    "  DEBUG TextSystem: No recognition results to process (num_to_process is 0) despite having input boxes/crops. Returning empty.")
+            else:  # If there were no boxes/crops to begin with
+                print(
+                    "  DEBUG TextSystem: No items to process (no initial boxes or num_to_process is 0). Returning empty.")
+            return [], []
+        # --- End of Stage 2 Fix preamble ---
+        print(
+            f"  DEBUG TextSystem: Filtering {num_to_process} recognition results with drop_score: {self.drop_score}")
         final_boxes_to_return: list[np.ndarray] = []
         final_recs_to_return: list[tuple[str, float]] = []
         final_crops_for_saving: list[np.ndarray] = []
+        # --- Stage 2 Fix: Modified Loop (No outer strict if/else) ---
+        for i in range(num_to_process):  # Iterate up to the safe number
+            # It's crucial that dt_boxes_sorted[i], rec_results[i], and img_crop_list[i] correspond
+            # for the items being processed.
+            text, confidence = rec_results[i]
+            print(f"    DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
+            if confidence >= self.drop_score:
+                if text and not mdr_is_whitespace(text):
+                    final_boxes_to_return.append(dt_boxes_sorted[i])
+                    final_recs_to_return.append(rec_results[i])
+                    if self.save_crop:
+                        # Ensure img_crop_list[i] is valid if classifier could have changed its length
+                        # However, self.classifier is expected to return img_list of same length as input.
+                        final_crops_for_saving.append(img_crop_list[i])
                 else:
+                    print(f"      DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
+            else:
+                print(
+                    f"      DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
+        # --- End of Stage 2 Fix: Modified Loop ---
         print(f"  DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
 def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
+    # --- START OF FIX ---
     if not layout.fragments:
+        # If the layout has no fragments to begin with, there's nothing to correct.
+        # Attempting to crop and OCR an empty layout region is unnecessary and can lead to errors.
+        # print(f"Correct: Layout {type(layout.cls).__name__} has no initial fragments. Skipping OCR correction.") # Optional: for debugging
         return
+    # --- END OF FIX ---
     try:
         x1, y1, x2, y2 = layout.rect.wrapper
         margin = 5
+        # Ensure crop_box dimensions are valid before cropping
+        crop_x1 = max(0, round(x1) - margin)
+        crop_y1 = max(0, round(y1) - margin)
+        crop_x2 = min(source_img.width, round(x2) + margin)
+        crop_y2 = min(source_img.height, round(y2) + margin)
+        if crop_x1 >= crop_x2 or crop_y1 >= crop_y2:  # If crop dimensions are invalid/empty
+            print(
+                f"Correct: Crop box for layout {type(layout.cls).__name__} is invalid/empty ({crop_x1},{crop_y1},{crop_x2},{crop_y2}). Skipping OCR correction.")
             return
+        cropped = source_img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
+        off_x, off_y = crop_x1, crop_y1
     except Exception as e:
+        print(f"Correct: Crop error for layout {type(layout.cls).__name__}: {e}")
         return
+    # Additional check: if cropped image is too small for OCR
+    if cropped.width < 5 or cropped.height < 5:  # Arbitrary small threshold
+        print(
+            f"Correct: Cropped image for layout {type(layout.cls).__name__} is too small ({cropped.width}x{cropped.height}). Skipping OCR correction.")
+        return
     try:
+        # Ensure conversion to RGB before converting to NumPy array
+        cropped_np = np.array(cropped.convert("RGB"))[:, :, ::-1]  # BGR for OpenCV-based OCR
         new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
     except Exception as e:
+        print(f"Correct: OCR error during correction for layout {type(layout.cls).__name__}: {e}")
+        # If OCR fails, we should probably keep the original fragments, if any.
+        # The current logic below will do this if new_frags_local is empty.
+        return  # Exit if OCR itself fails catastrophically
     new_frags_global = []
+    # ... (rest of the function remains the same) ...
     for f in new_frags_local:
         r = f.rect
         lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
         f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
                               lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
         new_frags_global.append(f)
+    orig_frags = layout.fragments  # These are the fragments that existed before this function call
     matched, unmatched_orig = [], []
     used_new = set()
+    # If new_frags_global is empty (e.g. OCR found nothing in the cropped region),
+    # then all orig_frags will go into unmatched_orig, and layout.fragments will be restored to orig_frags.
+    # This is generally fine.
     for i, orig_f in enumerate(orig_frags):
         best_j, best_rate = -1, -1.0
         try:
             used_new.add(best_j)
         else:
             unmatched_orig.append(orig_f)
     unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
     final = [n if n.rank >= o.rank else o for o, n in matched]
     final.extend(unmatched_orig)
     final.extend(unmatched_new)
+    layout.fragments = final
+    if layout.fragments:  # Only sort if there are fragments
+        layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
 # --- MDR OCR Engine ---
                 # much lower thresholds so we actually get some candidate masks:
                 det_db_thresh=0.15,
                 det_db_box_thresh=0.15,
+                unclip_ratio=2.0,
                 drop_score=0.01,
                 use_angle_cls=False,
             )
             return layouts
         print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
+        # --- START: SCALING LOGIC as in the prompt ---
         scaled_bboxes: list[list[int]] = []
+        if w > 0 and h > 0:
             for bbox_item in bbox_list:
                 x0, y0, x1, y1 = bbox_item.value
                 x0_c = max(0.0, min(x0, float(w)))
                 y0_c = max(0.0, min(y0, float(h)))
                 x1_c = max(0.0, min(x1, float(w)))
                 scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
                 scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
+                scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w)))
+                scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h)))
                 scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
         else:
+            # This branch should ideally not be reached due to the initial w,h check
             print(
                 "MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
             layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
             return layouts
+        # --- END: SCALING LOGIC ---
+        if not scaled_bboxes:  # Handles if bbox_list was empty
             print(
                 "MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
             layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
             return layouts
+        # --- START OF FIX ---
+        # Check if scaled_bboxes are problematic (e.g., all identical and degenerate)
+        bypass_model_inference = False
+        if len(scaled_bboxes) > 0:
+            num_s_bboxes = len(scaled_bboxes)
+            # Check if all scaled_bboxes are identical to the first one
+            first_s_bbox_str = str(scaled_bboxes[0])
+            all_identical = all(str(s_b) == first_s_bbox_str for s_b in scaled_bboxes)
+            if all_identical:
+                # Check if this identical box is degenerate (zero width or height)
+                s_x0, s_y0, s_x1, s_y1 = scaled_bboxes[0]
+                if (s_x1 - s_x0 == 0) or (s_y1 - s_y0 == 0):
+                    bypass_model_inference = True
+                    print("MDRLayoutReader: All scaled bboxes are identical and degenerate. Bypassing LayoutLMv3.")
+            if not bypass_model_inference and num_s_bboxes > 1:  # Check for high proportion of degenerate if not all identical
+                degenerate_count = 0
+                for s_b in scaled_bboxes:
+                    if (s_b[2] - s_b[0] == 0) or (s_b[3] - s_b[1] == 0):  # x1-x0 or y1-y0
+                        degenerate_count += 1
+                # If, for example, more than 90% of bboxes are degenerate
+                if degenerate_count / num_s_bboxes > 0.9:
+                    bypass_model_inference = True
+                    print(
+                        f"MDRLayoutReader: High percentage ({degenerate_count / num_s_bboxes * 100:.1f}%) of scaled bboxes are degenerate. Bypassing LayoutLMv3.")
+        if bypass_model_inference:
+            print("MDRLayoutReader: Applying fallback sequential order due to problematic scaled_bboxes.")
+            # Assign sequential order based on _prepare_bboxes's sort (y, then x)
+            for i in range(len(bbox_list)):
+                bbox_list[i].order = i
+            # Use _apply_order to apply this simple sequential ordering
+            result_layouts = self._apply_order(layouts, bbox_list)
+            return result_layouts
+        # --- END OF FIX ---
         orders: list[int] = []
         try:
             with torch.no_grad():
                 print("MDRLayoutReader: Creating reader inputs...")
+                inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
                 print("MDRLayoutReader: Preparing inputs for model device...")
                 inputs = mdr_prepare_reader_inputs(inputs, model)
                 print("MDRLayoutReader: Running model inference...")
                 logits = model(**inputs).logits.cpu().squeeze(0)
                 print("MDRLayoutReader: Model inference complete. Parsing logits...")
+                orders = mdr_parse_reader_logits(logits, len(bbox_list))  # len(bbox_list) is correct here
                 print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
                 if len(orders) == len(bbox_list):
                     for i, order_val in enumerate(orders):
                         bbox_list[i].order = order_val
                 else:
                     print(
+                        f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}). Using sequential order.")
+                    for i in range(len(bbox_list)):
                         bbox_list[i].order = i
         except Exception as e:
             print(f"MDR LayoutReader prediction error: {e}")
             import traceback
             traceback.print_exc()
             for i in range(len(bbox_list)):
                 bbox_list[i].order = i
             print("MDRLayoutReader: Applying fallback sequential order due to error...")
             result_layouts = self._apply_order(layouts, bbox_list)
+            return result_layouts
         print("MDRLayoutReader: Applying order...")
         result_layouts = self._apply_order(layouts, bbox_list)