rodrigomasini committed on
Commit
1725894
·
verified ·
1 Parent(s): 31b4b7f

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +205 -99
mdr_pdf_parser.py CHANGED
@@ -946,10 +946,9 @@ class _MDR_DBPostProcess:
946
  f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
947
 
948
  if self.box_t == 'poly':
949
- boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dw_orig, dh_orig)
950
  elif self.box_t == 'quad':
951
- boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dw_orig,
952
- dh_orig) # Pass original dimensions
953
  else:
954
  raise ValueError("box_type must be 'quad' or 'poly'")
955
  print(
@@ -1299,17 +1298,37 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
1299
  print(
1300
  f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
1301
 
1302
- if h_orig == 0 or w_orig == 0:
 
 
1303
  print(
1304
- f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
 
 
 
 
 
 
 
 
 
 
 
1305
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1306
 
1307
- r_current = w_orig / float(h_orig)
1308
  tw = min(imgW, int(ceil(imgH * r_current)))
1309
- tw = max(1, tw)
 
 
1310
  print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
1311
 
1312
  try:
 
 
 
 
 
1313
  resized = cv2.resize(img, (tw, imgH))
1314
  except cv2.error as e_resize: # Catch specific cv2 error
1315
  print(
@@ -1322,25 +1341,36 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
1322
  traceback.print_exc()
1323
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1324
 
 
1325
  resized = resized.astype("float32")
1326
- if imgC == 1 and len(resized.shape) == 3:
1327
- resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
1328
- if len(resized.shape) == 2:
1329
- resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
1330
 
1331
- # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
 
 
 
1332
  if imgC == 3 and resized.shape[2] == 1:
1333
  resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
1334
 
 
 
 
 
 
 
1335
  resized = resized.transpose((2, 0, 1)) / 255.0
1336
  resized -= 0.5
1337
  resized /= 0.5
1338
 
1339
  padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1340
- padding[:, :, 0:tw] = resized
1341
- print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
 
1342
 
1343
- # ADDED: Log normalized crop properties
 
1344
  min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
1345
  print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
1346
  f"dtype: {padding.dtype}, "
@@ -1349,7 +1379,7 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
1349
  f"MeanPx: {mean_px:.4f}")
1350
  if np.all(padding == 0):
1351
  print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
1352
- elif np.abs(max_px - min_px) < 1e-6: # Check if all elements are (close to) the same
1353
  print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
1354
  return padding
1355
 
@@ -1400,8 +1430,6 @@ class _MDR_TextSystem:
1400
  self.save_crop = getattr(args, 'save_crop_res', False)
1401
  self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
1402
 
1403
- # In class _MDR_TextSystem:
1404
-
1405
  def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
1406
  ori_im = img.copy()
1407
 
@@ -1416,34 +1444,22 @@ class _MDR_TextSystem:
1416
  if not dt_boxes_sorted:
1417
  return [], []
1418
 
1419
- img_crop_list: list[np.ndarray] = []
1420
-
1421
- for i in range(len(dt_boxes_sorted)):
1422
- crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
1423
- # Ensure crop_im is not empty or too small before adding
1424
- if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
1425
- img_crop_list.append(crop_im)
1426
- else:
1427
- print(
1428
- f" DEBUG TextSystem: Crop {i} was None or too small, skipping. Original box: {dt_boxes_sorted[i]}")
1429
- # To maintain correspondence, we might need to handle this more carefully
1430
- # For now, this might lead to length mismatches if not all crops are valid.
1431
- # A better approach might be to filter dt_boxes_sorted alongside img_crop_list creation.
1432
-
1433
- # Let's refine the filtering of boxes and creation of crops to ensure they always match
1434
  valid_boxes_for_cropping: list[np.ndarray] = []
1435
- img_crop_list_refined: list[np.ndarray] = []
1436
  for i, box_pts in enumerate(dt_boxes_sorted):
1437
  crop_im = mdr_get_rotated_crop(ori_im, box_pts)
1438
  if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1: # Min height/width for a crop
1439
  valid_boxes_for_cropping.append(box_pts)
1440
- img_crop_list_refined.append(crop_im)
1441
  else:
1442
  print(
1443
  f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
1444
 
1445
  dt_boxes_sorted = valid_boxes_for_cropping # Update dt_boxes_sorted to only include those that yielded valid crops
1446
- img_crop_list = img_crop_list_refined # Use the refined list of crops
 
1447
 
1448
  print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
1449
 
@@ -1451,50 +1467,76 @@ class _MDR_TextSystem:
1451
  print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
1452
  return [], []
1453
 
1454
- if self.use_cls and self.classifier is not None: # No need to check img_crop_list again, it's guaranteed non-empty here
1455
  print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
1456
- img_crop_list, cls_results = self.classifier(img_crop_list)
 
1457
  print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
1458
 
1459
  rec_results: list[tuple[str, float]] = []
1460
- # No need to check img_crop_list again
1461
  print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
1462
  rec_results = self.recognizer(img_crop_list)
1463
  print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
1464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1465
  final_boxes_to_return: list[np.ndarray] = []
1466
  final_recs_to_return: list[tuple[str, float]] = []
1467
  final_crops_for_saving: list[np.ndarray] = []
1468
 
1469
- # Ensure lengths match before iterating. This is crucial.
1470
- if len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
1471
- print(
1472
- f" DEBUG TextSystem: Filtering {len(rec_results)} recognition results with drop_score: {self.drop_score}")
1473
- for i in range(len(rec_results)):
1474
- text, confidence = rec_results[i]
1475
-
1476
- # Log each result before filtering
1477
- print(f" DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
1478
-
1479
- if confidence >= self.drop_score:
1480
- if text and not mdr_is_whitespace(text):
1481
- final_boxes_to_return.append(dt_boxes_sorted[i])
1482
- final_recs_to_return.append(rec_results[i])
1483
- if self.save_crop:
1484
- final_crops_for_saving.append(img_crop_list[i])
1485
- else:
1486
- print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
1487
  else:
1488
- print(
1489
- f" DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
1490
- else:
1491
- print(f" DEBUG TextSystem: CRITICAL MISMATCH in lengths after recognition! "
1492
- f"len(rec_results)={len(rec_results)}, "
1493
- f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
1494
- f"len(img_crop_list)={len(img_crop_list)}. "
1495
- f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
1496
- # Return empty if critical mismatch, as indexing will fail or be incorrect.
1497
- return [], []
1498
 
1499
  print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
1500
 
@@ -1710,36 +1752,67 @@ _MDR_CORRECTION_MIN_OVERLAP = 0.5
1710
 
1711
 
1712
  def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
 
1713
  if not layout.fragments:
 
 
 
1714
  return
 
 
1715
  try:
1716
  x1, y1, x2, y2 = layout.rect.wrapper
1717
  margin = 5
1718
- crop_box = (max(0, round(x1) - margin), max(0, round(y1) - margin), min(source_img.width, round(x2) + margin),
1719
- min(source_img.height, round(y2) + margin))
1720
- if crop_box[0] >= crop_box[2] or crop_box[1] >= crop_box[3]:
 
 
 
 
 
 
1721
  return
1722
- cropped = source_img.crop(crop_box)
1723
- off_x, off_y = crop_box[0], crop_box[1]
 
1724
  except Exception as e:
1725
- print(f"Correct: Crop error: {e}")
1726
  return
 
 
 
 
 
 
 
1727
  try:
1728
- cropped_np = np.array(cropped.convert("RGB"))[:, :, ::-1]
 
1729
  new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
1730
  except Exception as e:
1731
- print(f"Correct: OCR error: {e}")
1732
- return
 
 
 
1733
  new_frags_global = []
 
1734
  for f in new_frags_local:
1735
  r = f.rect
1736
  lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
1737
  f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
1738
  lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
1739
  new_frags_global.append(f)
1740
- orig_frags = layout.fragments
 
1741
  matched, unmatched_orig = [], []
1742
  used_new = set()
 
 
 
 
 
1743
  for i, orig_f in enumerate(orig_frags):
1744
  best_j, best_rate = -1, -1.0
1745
  try:
@@ -1771,13 +1844,16 @@ def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image,
1771
  used_new.add(best_j)
1772
  else:
1773
  unmatched_orig.append(orig_f)
 
1774
  unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
 
1775
  final = [n if n.rank >= o.rank else o for o, n in matched]
1776
  final.extend(unmatched_orig)
1777
  final.extend(unmatched_new)
1778
- layout.fragments = final
1779
- layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
1780
 
 
 
 
1781
 
1782
  # --- MDR OCR Engine ---
1783
 
@@ -1860,6 +1936,7 @@ class MDROcrEngine:
1860
  # much lower thresholds so we actually get some candidate masks:
1861
  det_db_thresh=0.15,
1862
  det_db_box_thresh=0.15,
 
1863
  drop_score=0.01,
1864
  use_angle_cls=False,
1865
  )
@@ -2142,16 +2219,11 @@ class MDRLayoutReader:
2142
  return layouts
2143
  print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
2144
 
2145
- # --- START: ADDED SCALING LOGIC ---
2146
  scaled_bboxes: list[list[int]] = []
2147
- if w > 0 and h > 0: # Ensure valid width and height for division
2148
  for bbox_item in bbox_list:
2149
- # bbox_item.value is (x0, y0, x1, y1) in original image coordinates
2150
  x0, y0, x1, y1 = bbox_item.value
2151
-
2152
- # Scale to 0-1000 range based on image width (w) and height (h)
2153
- # Ensure coordinates are within [0, 1000] and x1>=x0, y1>=y0
2154
- # Clamp values to image boundaries before scaling to prevent negative scaled values if original box is outside
2155
  x0_c = max(0.0, min(x0, float(w)))
2156
  y0_c = max(0.0, min(y0, float(h)))
2157
  x1_c = max(0.0, min(x1, float(w)))
@@ -2159,56 +2231,90 @@ class MDRLayoutReader:
2159
 
2160
  scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
2161
  scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
2162
- scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w))) # Ensure x1 >= x0
2163
- scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h))) # Ensure y1 >= y0
2164
  scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
2165
  else:
 
2166
  print(
2167
  "MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
2168
  layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
2169
  return layouts
2170
- # --- END: ADDED SCALING LOGIC ---
2171
 
2172
- if not scaled_bboxes: # If scaling resulted in no bboxes (e.g. w/h was 0)
2173
  print(
2174
  "MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
2175
  layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
2176
  return layouts
2177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2178
  orders: list[int] = []
2179
  try:
2180
  with torch.no_grad():
2181
  print("MDRLayoutReader: Creating reader inputs...")
2182
- inputs = mdr_boxes_to_reader_inputs(scaled_bboxes) # Use the newly created scaled_bboxes
2183
  print("MDRLayoutReader: Preparing inputs for model device...")
2184
  inputs = mdr_prepare_reader_inputs(inputs, model)
2185
  print("MDRLayoutReader: Running model inference...")
2186
  logits = model(**inputs).logits.cpu().squeeze(0)
2187
  print("MDRLayoutReader: Model inference complete. Parsing logits...")
2188
- # length is based on original bbox_list (which should match scaled_bboxes length)
2189
- orders = mdr_parse_reader_logits(logits, len(bbox_list))
2190
  print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
2191
 
2192
- # Assign the determined orders back to the bbox_list items
2193
  if len(orders) == len(bbox_list):
2194
  for i, order_val in enumerate(orders):
2195
  bbox_list[i].order = order_val
2196
  else:
2197
  print(
2198
- f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}). Order assignment might be incorrect. Using sequential order.")
2199
- for i in range(len(bbox_list)): # Fallback to sequential order
2200
  bbox_list[i].order = i
2201
  except Exception as e:
2202
  print(f"MDR LayoutReader prediction error: {e}")
2203
  import traceback
2204
  traceback.print_exc()
2205
- # Fallback: assign sequential order to bbox_list items before geometric sort of layouts
2206
  for i in range(len(bbox_list)):
2207
  bbox_list[i].order = i
2208
- # Then apply this sequential order (which effectively becomes a geometric sort)
2209
  print("MDRLayoutReader: Applying fallback sequential order due to error...")
2210
  result_layouts = self._apply_order(layouts, bbox_list)
2211
- return result_layouts # Return here after applying fallback order
2212
 
2213
  print("MDRLayoutReader: Applying order...")
2214
  result_layouts = self._apply_order(layouts, bbox_list)
 
946
  f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
947
 
948
  if self.box_t == 'poly':
949
+ boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
950
  elif self.box_t == 'quad':
951
+ boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
 
952
  else:
953
  raise ValueError("box_type must be 'quad' or 'poly'")
954
  print(
 
1298
  print(
1299
  f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
1300
 
1301
+ # --- START OF FIX ---
1302
+ MIN_DIM_FOR_RESIZE = 2 # Minimum original height or width to attempt resize
1303
+ if h_orig < MIN_DIM_FOR_RESIZE or w_orig < MIN_DIM_FOR_RESIZE:
1304
  print(
1305
+ f" DEBUG RECOGNIZER: _resize_norm received degenerate crop ({h_orig}x{w_orig}) with dimension < {MIN_DIM_FOR_RESIZE}. Returning zeros before resize attempt.")
1306
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1307
+ # --- END OF FIX ---
1308
+
1309
+ # Original check for h_orig == 0 or w_orig == 0 is now covered by the above,
1310
+ # but can be kept for explicitness or if MIN_DIM_FOR_RESIZE is set to 1.
1311
+ # If MIN_DIM_FOR_RESIZE is 1, the original check is still useful.
1312
+ # If MIN_DIM_FOR_RESIZE is > 1, this specific check becomes redundant.
1313
+ # Let's keep it for safety if MIN_DIM_FOR_RESIZE is changed.
1314
+ if h_orig == 0 or w_orig == 0: # This check is technically redundant if MIN_DIM_FOR_RESIZE >= 1
1315
+ print(
1316
+ f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}) (secondary check). Returning zeros.")
1317
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1318
 
1319
+ r_current = w_orig / float(h_orig) # h_orig is guaranteed > 0 here if MIN_DIM_FOR_RESIZE >=1
1320
  tw = min(imgW, int(ceil(imgH * r_current)))
1321
+ tw = max(1, tw) # Ensure target width is at least 1
1322
+ # Ensure target height (imgH) is also valid (it comes from self.shape, so should be)
1323
+
1324
  print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
1325
 
1326
  try:
1327
+ # Ensure target dimensions for resize are valid
1328
+ if tw <= 0 or imgH <= 0:
1329
+ print(
1330
+ f" DEBUG RECOGNIZER: _resize_norm calculated invalid target resize dimensions (tw: {tw}, imgH: {imgH}). Returning zeros.")
1331
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1332
  resized = cv2.resize(img, (tw, imgH))
1333
  except cv2.error as e_resize: # Catch specific cv2 error
1334
  print(
 
1341
  traceback.print_exc()
1342
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1343
 
1344
+ # Convert to float32, reconcile the channel count with imgC, then normalize pixel values to [-1, 1].
1345
  resized = resized.astype("float32")
1346
+ if imgC == 1 and len(resized.shape) == 3: # If target is 1 channel and resized is 3
1347
+ if resized.shape[2] == 3: # Check if it actually has 3 channels
1348
+ resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
 
1349
 
1350
+ if len(resized.shape) == 2: # If grayscale after potential conversion
1351
+ resized = resized[:, :, np.newaxis] # Add channel dim
1352
+
1353
+ # Ensure resized has 3 channels if imgC is 3, even if input was grayscale or became grayscale
1354
  if imgC == 3 and resized.shape[2] == 1:
1355
  resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
1356
 
1357
+ # Final check on channel consistency
1358
+ if resized.shape[2] != imgC:
1359
+ print(
1360
+ f" DEBUG RECOGNIZER: Channel mismatch after processing. Expected {imgC}, got {resized.shape[2]}. Crop shape ({h_orig},{w_orig}). Returning zeros.")
1361
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1362
+
1363
  resized = resized.transpose((2, 0, 1)) / 255.0
1364
  resized -= 0.5
1365
  resized /= 0.5
1366
 
1367
  padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1368
+ # Ensure tw is not out of bounds for padding
1369
+ actual_padded_width = min(tw, imgW)
1370
+ padding[:, :, 0:actual_padded_width] = resized[:, :, 0:actual_padded_width]
1371
 
1372
+ print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
1373
+ # Log min/max/mean of the padded tensor and warn if it is all zeros or a constant value.
1374
  min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
1375
  print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
1376
  f"dtype: {padding.dtype}, "
 
1379
  f"MeanPx: {mean_px:.4f}")
1380
  if np.all(padding == 0):
1381
  print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
1382
+ elif np.abs(max_px - min_px) < 1e-6:
1383
  print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
1384
  return padding
1385
 
 
1430
  self.save_crop = getattr(args, 'save_crop_res', False)
1431
  self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
1432
 
 
 
1433
  def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
1434
  ori_im = img.copy()
1435
 
 
1444
  if not dt_boxes_sorted:
1445
  return [], []
1446
 
1447
+ # --- Stage 1 Fix: Refined filtering of boxes and creation of crops ---
1448
+ # Ensure dt_boxes_sorted and img_crop_list are synchronized.
 
 
 
 
 
 
 
 
 
 
 
 
 
1449
  valid_boxes_for_cropping: list[np.ndarray] = []
1450
+ img_crop_list: list[np.ndarray] = [] # Initialize img_crop_list here
1451
  for i, box_pts in enumerate(dt_boxes_sorted):
1452
  crop_im = mdr_get_rotated_crop(ori_im, box_pts)
1453
  if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1: # Min height/width for a crop
1454
  valid_boxes_for_cropping.append(box_pts)
1455
+ img_crop_list.append(crop_im) # Directly populate the final img_crop_list
1456
  else:
1457
  print(
1458
  f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
1459
 
1460
  dt_boxes_sorted = valid_boxes_for_cropping # Update dt_boxes_sorted to only include those that yielded valid crops
1461
+ # img_crop_list is now the correctly filtered list of crops, synchronized with dt_boxes_sorted.
1462
+ # --- End of Stage 1 Fix ---
1463
 
1464
  print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
1465
 
 
1467
  print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
1468
  return [], []
1469
 
1470
+ if self.use_cls and self.classifier is not None:
1471
  print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
1472
+ img_crop_list, cls_results = self.classifier(
1473
+ img_crop_list) # classifier might modify img_crop_list (e.g., rotate)
1474
  print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
1475
 
1476
  rec_results: list[tuple[str, float]] = []
 
1477
  print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
1478
  rec_results = self.recognizer(img_crop_list)
1479
  print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
1480
 
1481
+ # --- Start of Stage 2 Fix: Robust handling of rec_results length ---
1482
+ expected_count = len(dt_boxes_sorted) # This is synchronized with len(img_crop_list) before recognizer
1483
+ # and should still match len(img_crop_list) after classifier
1484
+ # if classifier preserves length.
1485
+ actual_rec_count = len(rec_results)
1486
+ num_to_process = 0
1487
+
1488
+ if actual_rec_count == expected_count:
1489
+ num_to_process = actual_rec_count
1490
+ else:
1491
+ print(f" DEBUG TextSystem: WARNING - Mismatch in lengths after recognition! "
1492
+ f"Expected (from boxes/crops): {expected_count}, "
1493
+ f"Recognizer returned: {actual_rec_count} results. ")
1494
+ num_to_process = min(actual_rec_count, expected_count)
1495
+ if num_to_process < expected_count:
1496
+ print(
1497
+ f" DEBUG TextSystem: Will process {num_to_process} items due to mismatch. Some data might be lost if recognizer dropped results or if there was an issue in earlier stages not caught.")
1498
+ elif num_to_process < actual_rec_count: # Recognizer returned more than expected
1499
+ print(
1500
+ f" DEBUG TextSystem: Will process {num_to_process} items. Recognizer returned more results ({actual_rec_count}) than expected crops ({expected_count}). Extra recognition results will be ignored.")
1501
+
1502
+ if num_to_process == 0:
1503
+ if expected_count > 0: # If there were boxes/crops but no rec results to process
1504
+ print(
1505
+ " DEBUG TextSystem: No recognition results to process (num_to_process is 0) despite having input boxes/crops. Returning empty.")
1506
+ else: # If there were no boxes/crops to begin with
1507
+ print(
1508
+ " DEBUG TextSystem: No items to process (no initial boxes or num_to_process is 0). Returning empty.")
1509
+ return [], []
1510
+ # --- End of Stage 2 Fix preamble ---
1511
+
1512
+ print(
1513
+ f" DEBUG TextSystem: Filtering {num_to_process} recognition results with drop_score: {self.drop_score}")
1514
  final_boxes_to_return: list[np.ndarray] = []
1515
  final_recs_to_return: list[tuple[str, float]] = []
1516
  final_crops_for_saving: list[np.ndarray] = []
1517
 
1518
+ # --- Stage 2 Fix: Modified Loop (No outer strict if/else) ---
1519
+ for i in range(num_to_process): # Iterate up to the safe number
1520
+ # It's crucial that dt_boxes_sorted[i], rec_results[i], and img_crop_list[i] correspond
1521
+ # for the items being processed.
1522
+ text, confidence = rec_results[i]
1523
+
1524
+ print(f" DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
1525
+
1526
+ if confidence >= self.drop_score:
1527
+ if text and not mdr_is_whitespace(text):
1528
+ final_boxes_to_return.append(dt_boxes_sorted[i])
1529
+ final_recs_to_return.append(rec_results[i])
1530
+ if self.save_crop:
1531
+ # Ensure img_crop_list[i] is valid if classifier could have changed its length
1532
+ # However, self.classifier is expected to return img_list of same length as input.
1533
+ final_crops_for_saving.append(img_crop_list[i])
 
 
1534
  else:
1535
+ print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
1536
+ else:
1537
+ print(
1538
+ f" DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
1539
+ # --- End of Stage 2 Fix: Modified Loop ---
 
 
 
 
 
1540
 
1541
  print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
1542
 
 
1752
 
1753
 
1754
  def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
1755
+ # --- START OF FIX ---
1756
  if not layout.fragments:
1757
+ # If the layout has no fragments to begin with, there's nothing to correct.
1758
+ # Attempting to crop and OCR an empty layout region is unnecessary and can lead to errors.
1759
+ # print(f"Correct: Layout {type(layout.cls).__name__} has no initial fragments. Skipping OCR correction.") # Optional: for debugging
1760
  return
1761
+ # --- END OF FIX ---
1762
+
1763
  try:
1764
  x1, y1, x2, y2 = layout.rect.wrapper
1765
  margin = 5
1766
+ # Ensure crop_box dimensions are valid before cropping
1767
+ crop_x1 = max(0, round(x1) - margin)
1768
+ crop_y1 = max(0, round(y1) - margin)
1769
+ crop_x2 = min(source_img.width, round(x2) + margin)
1770
+ crop_y2 = min(source_img.height, round(y2) + margin)
1771
+
1772
+ if crop_x1 >= crop_x2 or crop_y1 >= crop_y2: # If crop dimensions are invalid/empty
1773
+ print(
1774
+ f"Correct: Crop box for layout {type(layout.cls).__name__} is invalid/empty ({crop_x1},{crop_y1},{crop_x2},{crop_y2}). Skipping OCR correction.")
1775
  return
1776
+
1777
+ cropped = source_img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
1778
+ off_x, off_y = crop_x1, crop_y1
1779
  except Exception as e:
1780
+ print(f"Correct: Crop error for layout {type(layout.cls).__name__}: {e}")
1781
  return
1782
+
1783
+ # Additional check: if cropped image is too small for OCR
1784
+ if cropped.width < 5 or cropped.height < 5: # Arbitrary small threshold
1785
+ print(
1786
+ f"Correct: Cropped image for layout {type(layout.cls).__name__} is too small ({cropped.width}x{cropped.height}). Skipping OCR correction.")
1787
+ return
1788
+
1789
  try:
1790
+ # Ensure conversion to RGB before converting to NumPy array
1791
+ cropped_np = np.array(cropped.convert("RGB"))[:, :, ::-1] # BGR for OpenCV-based OCR
1792
  new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
1793
  except Exception as e:
1794
+ print(f"Correct: OCR error during correction for layout {type(layout.cls).__name__}: {e}")
1795
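+ # If OCR raises, return early below; layout.fragments is left unmodified, so the original fragments are kept.
1796
+ # (Separately: if OCR succeeds but finds nothing, the merge further down is expected to keep every original fragment as unmatched.)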
1797
+ return # Exit if OCR itself fails catastrophically
1798
+
1799
  new_frags_global = []
1800
+ # Shift each re-OCR'd fragment from crop-local coordinates back to source-image coordinates using (off_x, off_y).
1801
  for f in new_frags_local:
1802
  r = f.rect
1803
  lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
1804
  f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
1805
  lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
1806
  new_frags_global.append(f)
1807
+
1808
+ orig_frags = layout.fragments # These are the fragments that existed before this function call
1809
  matched, unmatched_orig = [], []
1810
  used_new = set()
1811
+
1812
+ # If new_frags_global is empty (e.g. OCR found nothing in the cropped region),
1813
+ # then all orig_frags will go into unmatched_orig, and layout.fragments will be restored to orig_frags.
1814
+ # This is generally fine.
1815
+
1816
  for i, orig_f in enumerate(orig_frags):
1817
  best_j, best_rate = -1, -1.0
1818
  try:
 
1844
  used_new.add(best_j)
1845
  else:
1846
  unmatched_orig.append(orig_f)
1847
+
1848
  unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
1849
+
1850
  final = [n if n.rank >= o.rank else o for o, n in matched]
1851
  final.extend(unmatched_orig)
1852
  final.extend(unmatched_new)
 
 
1853
 
1854
+ layout.fragments = final
1855
+ if layout.fragments: # Only sort if there are fragments
1856
+ layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
1857
 
1858
  # --- MDR OCR Engine ---
1859
 
 
1936
  # much lower thresholds so we actually get some candidate masks:
1937
  det_db_thresh=0.15,
1938
  det_db_box_thresh=0.15,
1939
+ unclip_ratio=2.0,
1940
  drop_score=0.01,
1941
  use_angle_cls=False,
1942
  )
 
2219
  return layouts
2220
  print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
2221
 
2222
+ # --- START: SCALING LOGIC (clamp bboxes to the image, then map them to the 0-1000 range) ---
2223
  scaled_bboxes: list[list[int]] = []
2224
+ if w > 0 and h > 0:
2225
  for bbox_item in bbox_list:
 
2226
  x0, y0, x1, y1 = bbox_item.value
 
 
 
 
2227
  x0_c = max(0.0, min(x0, float(w)))
2228
  y0_c = max(0.0, min(y0, float(h)))
2229
  x1_c = max(0.0, min(x1, float(w)))
 
2231
 
2232
  scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
2233
  scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
2234
+ scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w)))
2235
+ scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h)))
2236
  scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
2237
  else:
2238
+ # This branch should ideally not be reached due to the initial w,h check
2239
  print(
2240
  "MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
2241
  layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
2242
  return layouts
2243
+ # --- END: SCALING LOGIC ---
2244
 
2245
+ if not scaled_bboxes: # Handles if bbox_list was empty
2246
  print(
2247
  "MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
2248
  layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
2249
  return layouts
2250
 
2251
+ # --- START OF FIX ---
2252
+ # Check if scaled_bboxes are problematic (e.g., all identical and degenerate)
2253
+ bypass_model_inference = False
2254
+ if len(scaled_bboxes) > 0:
2255
+ num_s_bboxes = len(scaled_bboxes)
2256
+ # Check if all scaled_bboxes are identical to the first one
2257
+ first_s_bbox_str = str(scaled_bboxes[0])
2258
+ all_identical = all(str(s_b) == first_s_bbox_str for s_b in scaled_bboxes)
2259
+
2260
+ if all_identical:
2261
+ # Check if this identical box is degenerate (zero width or height)
2262
+ s_x0, s_y0, s_x1, s_y1 = scaled_bboxes[0]
2263
+ if (s_x1 - s_x0 == 0) or (s_y1 - s_y0 == 0):
2264
+ bypass_model_inference = True
2265
+ print("MDRLayoutReader: All scaled bboxes are identical and degenerate. Bypassing LayoutLMv3.")
2266
+
2267
+ if not bypass_model_inference and num_s_bboxes > 1: # Check for high proportion of degenerate if not all identical
2268
+ degenerate_count = 0
2269
+ for s_b in scaled_bboxes:
2270
+ if (s_b[2] - s_b[0] == 0) or (s_b[3] - s_b[1] == 0): # x1-x0 or y1-y0
2271
+ degenerate_count += 1
2272
+ # If, for example, more than 90% of bboxes are degenerate
2273
+ if degenerate_count / num_s_bboxes > 0.9:
2274
+ bypass_model_inference = True
2275
+ print(
2276
+ f"MDRLayoutReader: High percentage ({degenerate_count / num_s_bboxes * 100:.1f}%) of scaled bboxes are degenerate. Bypassing LayoutLMv3.")
2277
+
2278
+ if bypass_model_inference:
2279
+ print("MDRLayoutReader: Applying fallback sequential order due to problematic scaled_bboxes.")
2280
+ # Assign sequential order based on _prepare_bboxes's sort (y, then x)
2281
+ for i in range(len(bbox_list)):
2282
+ bbox_list[i].order = i
2283
+ # Use _apply_order to apply this simple sequential ordering
2284
+ result_layouts = self._apply_order(layouts, bbox_list)
2285
+ return result_layouts
2286
+ # --- END OF FIX ---
2287
+
2288
  orders: list[int] = []
2289
  try:
2290
  with torch.no_grad():
2291
  print("MDRLayoutReader: Creating reader inputs...")
2292
+ inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
2293
  print("MDRLayoutReader: Preparing inputs for model device...")
2294
  inputs = mdr_prepare_reader_inputs(inputs, model)
2295
  print("MDRLayoutReader: Running model inference...")
2296
  logits = model(**inputs).logits.cpu().squeeze(0)
2297
  print("MDRLayoutReader: Model inference complete. Parsing logits...")
2298
+ orders = mdr_parse_reader_logits(logits, len(bbox_list)) # len(bbox_list) is correct here
 
2299
  print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
2300
 
 
2301
  if len(orders) == len(bbox_list):
2302
  for i, order_val in enumerate(orders):
2303
  bbox_list[i].order = order_val
2304
  else:
2305
  print(
2306
+ f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}). Using sequential order.")
2307
+ for i in range(len(bbox_list)):
2308
  bbox_list[i].order = i
2309
  except Exception as e:
2310
  print(f"MDR LayoutReader prediction error: {e}")
2311
  import traceback
2312
  traceback.print_exc()
 
2313
  for i in range(len(bbox_list)):
2314
  bbox_list[i].order = i
 
2315
  print("MDRLayoutReader: Applying fallback sequential order due to error...")
2316
  result_layouts = self._apply_order(layouts, bbox_list)
2317
+ return result_layouts
2318
 
2319
  print("MDRLayoutReader: Applying order...")
2320
  result_layouts = self._apply_order(layouts, bbox_list)