rodrigomasini committed on
Commit
2699f7d
·
verified ·
1 Parent(s): 75717e3

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +295 -311
mdr_pdf_parser.py CHANGED
@@ -712,17 +712,18 @@ class _MDR_DBPostProcess:
712
  scores.append(score)
713
  return boxes, scores
714
 
715
- # In class _MDR_DBPostProcess:
716
  def _boxes_from_bitmap(self, pred, bmp, dw, dh): # pred is the probability map, bmp is the binarized map
717
  h, w = bmp.shape
718
- print(f" DEBUG OCR: _boxes_from_bitmap: Processing bitmap of shape {h}x{w} for original dimensions {dw}x{dh}.") # DEBUG
 
719
  contours, _ = cv2.findContours((bmp * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
720
  num_contours_found = len(contours)
721
- print(f" DEBUG OCR: _boxes_from_bitmap: Found {num_contours_found} raw contours.") # DEBUG
722
 
723
  num_contours_to_process = min(num_contours_found, self.max_cand)
724
  if num_contours_found > self.max_cand:
725
- print(f" DEBUG OCR: _boxes_from_bitmap: Processing limited to {self.max_cand} contours.") # DEBUG
726
 
727
  boxes, scores = [], []
728
  kept_boxes_count = 0
@@ -730,11 +731,10 @@ class _MDR_DBPostProcess:
730
  contour = contours[i]
731
  pts_mini_box, sside = self._get_mini_boxes(contour)
732
  if sside < self.min_sz:
733
- # print(f" DEBUG OCR: Contour {i} too small (sside {sside} < min_sz {self.min_sz}). Skipping.") # Can be too verbose
734
  continue
735
 
736
  pts_arr = np.array(pts_mini_box)
737
- # score_mode is 'fast' by default
738
  current_score = self._box_score_fast(pred, pts_arr.reshape(-1, 2)) if self.score_m == "fast" else self._box_score_slow(pred, contour)
739
 
740
  if self.box_thresh > current_score:
@@ -742,7 +742,6 @@ class _MDR_DBPostProcess:
742
  continue
743
 
744
  try:
745
- # unclip_ratio is self.unclip_r (default 1.5)
746
  box_unclipped = self._unclip(pts_arr, self.unclip_r).reshape(-1, 1, 2)
747
  except Exception as e_unclip:
748
  # print(f" DEBUG OCR: Contour {i} unclip failed: {e_unclip}. Skipping.") # Can be too verbose
@@ -750,18 +749,17 @@ class _MDR_DBPostProcess:
750
 
751
  box_final, sside_final = self._get_mini_boxes(box_unclipped)
752
  if sside_final < self.min_sz + 2: # min_sz is 3
753
- # print(f" DEBUG OCR: Contour {i} final size after unclip too small (sside_final {sside_final} < {self.min_sz + 2}). Skipping.") # Can be too verbose
754
  continue
755
 
756
  box_final_arr = np.array(box_final)
757
- # Rescale to original image dimensions
758
  box_final_arr[:, 0] = np.clip(np.round(box_final_arr[:, 0] / w * dw), 0, dw)
759
  box_final_arr[:, 1] = np.clip(np.round(box_final_arr[:, 1] / h * dh), 0, dh)
760
 
761
  boxes.append(box_final_arr.astype("int32"))
762
  scores.append(current_score)
763
  kept_boxes_count +=1
764
- print(f" DEBUG OCR: _boxes_from_bitmap: Kept {kept_boxes_count} boxes after all filtering (size, score, unclip). Configured box_thresh: {self.box_thresh}, min_sz: {self.min_sz}.") # DEBUG
765
  return np.array(boxes, dtype="int32"), scores
766
 
767
  def _unclip(self, box, ratio):
@@ -807,29 +805,35 @@ class _MDR_DBPostProcess:
807
  cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
808
  return cv2.mean(bmp[ymin : ymax + 1, xmin : xmax + 1], mask)[0] if np.sum(mask) > 0 else 0.0
809
 
810
- # In class _MDR_DBPostProcess:
811
  def __call__(self, outs_dict, shape_list):
812
  pred = outs_dict['maps'][:, 0, :, :]
813
  seg = pred > self.thresh
814
- print(f" DEBUG OCR: _MDR_DBPostProcess: pred map shape: {pred.shape}, seg map shape: {seg.shape}, configured thresh: {self.thresh}") # DEBUG
815
- print(f" DEBUG OCR: _MDR_DBPostProcess: Number of pixels in seg map above threshold (sum of all batches): {np.sum(seg)}") # DEBUG
 
816
 
817
  boxes_batch = []
818
  for batch_idx in range(pred.shape[0]):
819
- sh, sw, _, _ = shape_list[batch_idx]
 
 
 
 
 
 
820
  current_pred_map = pred[batch_idx]
821
  current_seg_map = seg[batch_idx]
822
 
823
  mask = cv2.dilate(np.array(current_seg_map).astype(np.uint8), self.dila_k) if self.dila_k is not None else current_seg_map
824
- print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc {sh}x{sw}. Sum of mask pixels: {np.sum(mask)}") # DEBUG
825
 
826
  if self.box_t == 'poly':
827
- boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, sw, sh)
828
  elif self.box_t == 'quad':
829
- boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, sw, sh)
830
  else:
831
  raise ValueError("box_type must be 'quad' or 'poly'")
832
- print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Found {len(boxes)} boxes from bitmap processing (after score filtering within _boxes_from_bitmap).") # DEBUG
833
  boxes_batch.append({'points': boxes})
834
  return boxes_batch
835
 
@@ -887,49 +891,71 @@ class _MDR_TextDetector(_MDR_PredictBase):
887
  new_boxes.append(box)
888
  return np.array(new_boxes)
889
 
890
- # In class _MDR_TextDetector:
891
  def __call__(self, img):
892
  ori_im = img.copy()
893
  data = {"image": img}
894
- print(f" DEBUG OCR: _MDR_TextDetector: Original image shape: {ori_im.shape}") # DEBUG
895
- data = mdr_ocr_transform(data, self.pre_op)
 
 
 
 
 
 
 
 
 
896
  if data is None:
897
- print(" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.") # DEBUG
898
- return None
899
 
900
- processed_img, shape_list = data
901
  if processed_img is None:
902
- print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.") # DEBUG
903
- return None
904
- print(f" DEBUG OCR: _MDR_TextDetector: Processed image shape for ONNX: {processed_img.shape}, shape_list: {shape_list}") # DEBUG
905
 
906
  img_for_onnx = np.expand_dims(processed_img, axis=0)
907
  shape_list_for_onnx = np.expand_dims(shape_list, axis=0)
908
- img_for_onnx = img_for_onnx.copy() # Ensure it's a contiguous array if ONNX runtime is sensitive
909
 
910
  inputs = self.get_input_feed(self.input_name, img_for_onnx)
911
- print(f" DEBUG OCR: _MDR_TextDetector: Running ONNX inference for text detection...") # DEBUG
912
  try:
913
  outputs = self.sess.run(self.output_name, input_feed=inputs)
914
- except Exception as e:
915
- print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e}") # DEBUG
916
  import traceback
917
  traceback.print_exc()
918
- return None # Stop if inference fails
919
- print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}") # DEBUG
920
 
921
  preds = {"maps": outputs[0]}
922
- # post_op is _MDR_DBPostProcess
923
- post_res = self.post_op(preds, shape_list_for_onnx)
924
-
 
 
 
 
 
 
 
 
 
925
  boxes_from_post = post_res[0]['points']
926
- print(f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}") # DEBUG
 
 
 
 
927
 
928
  if self.args.det_box_type == 'poly':
929
  final_boxes = self._filter_poly(boxes_from_post, ori_im.shape)
930
  else: # 'quad'
931
  final_boxes = self._filter_quad(boxes_from_post, ori_im.shape)
932
- print(f" DEBUG OCR: _MDR_TextDetector: Boxes after final poly/quad filtering: {len(final_boxes)}") # DEBUG
933
  return final_boxes
934
 
935
  class _MDR_ClsPostProcess:
@@ -1090,56 +1116,64 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
1090
  self.input_name = self.get_input_name(self.sess)
1091
  self.output_name = self.get_output_name(self.sess)
1092
 
1093
- # In class _MDR_TextRecognizer
1094
  def _resize_norm(self, img, max_r): # img is a single crop
1095
  imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
1096
  h_orig, w_orig = img.shape[:2]
 
1097
  print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
1098
 
1099
  if h_orig == 0 or w_orig == 0:
1100
- print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
1101
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1102
 
1103
  r_current = w_orig / float(h_orig)
1104
- # tw is target width, calculated to maintain aspect ratio up to imgW, using max of current ratio and batch max ratio
1105
  tw = min(imgW, int(ceil(imgH * r_current)))
1106
- tw = max(1, tw) # Ensure target width is at least 1
1107
  print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
1108
 
1109
  try:
1110
- resized = cv2.resize(img, (tw, imgH)) # Resize to (target_width, fixed_height)
1111
- except Exception as e_resize:
1112
- print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH})")
1113
- # Fallback: return zeros or try to pad original without resize if resize fails
1114
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
 
 
 
 
 
 
1115
 
1116
  resized = resized.astype("float32")
1117
- # ... rest of the normalization ...
1118
- # (This part seems standard, but worth checking if the image becomes all black/white after this)
1119
- if imgC == 1 and len(resized.shape) == 3: # if model expects grayscale but crop is color
1120
  resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
1121
- resized = resized[:, :, np.newaxis] # Add channel dim
1122
- if len(resized.shape) == 2: # if grayscale and no channel dim
1123
- resized = resized[:, :, np.newaxis]
1124
 
1125
- resized = resized.transpose((2, 0, 1)) / 255.0 # HWC to CHW and scale to 0-1
1126
- resized -= 0.5 # Normalize to -0.5 to 0.5
1127
- resized /= 0.5 # Normalize to -1 to 1
 
 
 
 
 
1128
 
1129
  padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1130
- padding[:, :, 0:tw] = resized # Place resized image into padded canvas
1131
  print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
1132
- # ---- START LOGGING NORMALIZED CROP PROPERTIES ----
 
 
1133
  print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
1134
- f"dtype: {padding.dtype}, " # Should be float32
1135
- f"MinPx: {np.min(padding):.4f}, "
1136
- f"MaxPx: {np.max(padding):.4f}, "
1137
- f"MeanPx: {np.mean(padding):.4f}")
1138
  if np.all(padding == 0):
1139
  print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
1140
- elif np.all(padding == padding[0,0,0]): # Check if all elements are the same
1141
- print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {padding[0,0,0]}")
1142
- # ---- END LOGGING NORMALIZED CROP PROPERTIES ----
1143
  return padding
1144
 
1145
  def __call__(self, img_list):
@@ -1193,133 +1227,65 @@ class _MDR_TextSystem:
1193
  try: return list(sorted(boxes, key=key))
1194
  except: return list(boxes) # Fallback
1195
 
1196
- def __call__(self, img, classify=True): # classify is True by default
1197
- ori_im = img.copy()
1198
- print(f" DEBUG OCR SYS: _MDR_TextSystem called. Original image shape: {ori_im.shape}") # DEBUG
1199
- boxes = self.detector(img) # This is _MDR_TextDetector
1200
-
1201
- if boxes is None or len(boxes) == 0:
1202
- print(" DEBUG OCR SYS: Detector returned no boxes. Returning empty fragments.") # DEBUG
1203
- return [], [] # This is what currently leads to "0 fragments found" if detector fails
1204
 
1205
- print(f" DEBUG OCR SYS: Detector returned {len(boxes)} boxes. Proceeding to crop and recognize.") # DEBUG
1206
- boxes = self._sort_boxes(boxes) # Sorting happens here
 
1207
 
1208
- crops = []
1209
- for i, b in enumerate(boxes):
1210
- try:
1211
- crop_img = mdr_get_rotated_crop(ori_im, b)
1212
- if crop_img is None:
1213
- print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} is None.") # DEBUG
1214
- crops.append(None)
1215
- elif crop_img.shape[0] == 0 or crop_img.shape[1] == 0:
1216
- print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} has zero dimension: {crop_img.shape}") # DEBUG
1217
- crops.append(None)
1218
- else:
1219
- crops.append(crop_img)
1220
- # Optionally save these crops for manual inspection:
1221
- # if self.save_crop: cv2.imwrite(os.path.join(self.crop_dir, f"debug_crop_before_cls_{self.crop_idx + i}.png"), crop_img)
1222
- except Exception as e_crop:
1223
- print(f" DEBUG OCR SYS: Error cropping box {i+1}/{len(boxes)}: {e_crop}") # DEBUG
1224
- crops.append(None)
1225
-
1226
- valid_idxs = [i for i, c in enumerate(crops) if c is not None and c.shape[0] > 0 and c.shape[1] > 0]
1227
- if not valid_idxs:
1228
- print(" DEBUG OCR SYS: No valid crops obtained after attempting to crop detected boxes. Returning empty fragments.") # DEBUG
1229
- return [], []
1230
-
1231
- # Filter crops and corresponding boxes
1232
- valid_crops = [crops[i] for i in valid_idxs]
1233
- boxes_for_valid_crops = [boxes[i] for i in valid_idxs]
1234
- print(f" DEBUG OCR SYS: Number of valid crops to process: {len(valid_crops)}") # DEBUG
1235
-
1236
- # ---- START LOGGING CROP PROPERTIES ----
1237
- if valid_crops:
1238
- print(" DEBUG OCR SYS: Logging properties of first few valid crops (and Box 21 if present):")
1239
- indices_to_log = list(range(min(3, len(valid_crops)))) # Log first 3
1240
- # Try to find original index of Box 21 if we can map it back, this is a bit tricky here
1241
- # For simplicity, let's just log the first few. If Box 21 was among them, we'd see it.
1242
-
1243
- for i_log_idx, crop_idx in enumerate(indices_to_log):
1244
- crop_image_np = valid_crops[crop_idx]
1245
- if crop_image_np is not None and crop_image_np.size > 0:
1246
- print(f" Crop for Recognizer (Index {crop_idx}): "
1247
- f"Shape: {crop_image_np.shape}, "
1248
- f"dtype: {crop_image_np.dtype}, "
1249
- f"MinPx: {np.min(crop_image_np)}, "
1250
- f"MaxPx: {np.max(crop_image_np)}, "
1251
- f"MeanPx: {np.mean(crop_image_np):.2f}")
1252
- else:
1253
- print(f" Crop for Recognizer (Index {crop_idx}): Is None or empty.")
1254
- # ---- END LOGGING CROP PROPERTIES ----
1255
 
1256
- if self.use_cls and self.classifier and classify:
1257
- print(f" DEBUG OCR SYS: Applying classifier to {len(valid_crops)} crops...") # DEBUG
1258
- try:
1259
- # The classifier might modify valid_crops in-place (e.g., rotate them)
1260
- classified_crops, cls_results = self.classifier(valid_crops) # classifier returns list, results
1261
- print(f" DEBUG OCR SYS: Classifier results count: {len(cls_results)}. First few: {cls_results[:3]}") # DEBUG
1262
- valid_crops = classified_crops # Update with potentially rotated crops
1263
- except Exception as e_cls:
1264
- print(f" DEBUG OCR SYS: Classifier error: {e_cls}. Using unclassified crops.") # DEBUG
1265
- # Continue with unclassified (but valid) crops
1266
-
1267
- print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
1268
- # ---- START TEMP CODE TO SAVE CROPS ----
1269
- save_crop_path_dir = Path("/tmp/temp_recognizer_crops") # Use /tmp
1270
- save_crop_path_dir.mkdir(parents=True, exist_ok=True)
1271
- for i_crop, crop_image_np in enumerate(valid_crops):
1272
- try:
1273
- # Ensure crop_image_np is a valid image array (e.g., uint8)
1274
- if crop_image_np is not None and crop_image_np.size > 0:
1275
- # OpenCV expects BGR if color, or grayscale
1276
- cv2.imwrite(str(save_crop_path_dir / f"crop_to_recognize_{self.crop_idx + i_crop}.png"), crop_image_np)
1277
- else:
1278
- print(f" DEBUG OCR SYS: Crop {i_crop} is None or empty, not saving.")
1279
- except Exception as e_save:
1280
- print(f" DEBUG OCR SYS: Failed to save crop {i_crop}: {e_save}")
1281
- print(f" DEBUG OCR SYS: Saved {len(valid_crops)} crops for recognizer to {save_crop_path_dir}")
1282
- # ---- END TEMP CODE TO SAVE CROPS ----
1283
  try:
1284
- rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
1285
- print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
1286
- except Exception as e_rec:
1287
- print(f" DEBUG OCR SYS: Recognizer error: {e_rec}. Returning empty fragments.") # DEBUG
1288
- return [], [] # If recognizer fails, we can't proceed
1289
-
1290
- final_boxes, final_rec_tuples = [], [] # Changed final_rec to final_rec_tuples
1291
- if len(boxes_for_valid_crops) != len(rec_res):
1292
- print(f" DEBUG OCR SYS: Mismatch! Boxes count {len(boxes_for_valid_crops)} != Recognizer results count {len(rec_res)}. This should not happen.")
1293
- # Handle this gracefully, perhaps by taking the minimum length
1294
- min_len = min(len(boxes_for_valid_crops), len(rec_res))
1295
- boxes_to_iterate = boxes_for_valid_crops[:min_len]
1296
- rec_res_to_iterate = rec_res[:min_len]
1297
- else:
1298
- boxes_to_iterate = boxes_for_valid_crops
1299
- rec_res_to_iterate = rec_res
1300
-
1301
- print(f" DEBUG OCR SYS: Filtering {len(rec_res_to_iterate)} recognition results with drop_score: {self.drop_score}") # DEBUG
1302
- for i, (box, res_tuple) in enumerate(zip(boxes_to_iterate, rec_res_to_iterate)):
1303
- txt, score = res_tuple
1304
- print(f" DEBUG OCR SYS: Box {i+1} - Recognized: '{txt}', Score: {score:.4f}") # DEBUG
1305
- if score >= self.drop_score and txt and not mdr_is_whitespace(txt): # Added check for non-empty/whitespace
1306
- final_boxes.append(box)
1307
- final_rec_tuples.append(res_tuple)
1308
- else:
1309
- reason = []
1310
- if score < self.drop_score: reason.append(f"score {score:.2f} < {self.drop_score}")
1311
- if not txt: reason.append("empty text")
1312
- if txt and mdr_is_whitespace(txt): reason.append("whitespace text")
1313
- print(f" DEBUG OCR SYS: Box {i+1} DROPPED. Reason(s): {', '.join(reason)}") # DEBUG
1314
 
1315
 
1316
- if self.save_crop: # This is false by default in _MDR_ONNXParams
1317
- # Ensure crop_dir exists if you enable this
1318
- # self._save_crops(valid_crops, rec_res) # Pass original rec_res to save all attempts if needed
1319
- pass
 
1320
 
1321
- print(f" DEBUG OCR SYS: Returning {len(final_boxes)} final boxes and {len(final_rec_tuples)} final recognition results.") # DEBUG
1322
- return final_boxes, final_rec_tuples # Ensure this returns tuples of (text, score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1323
 
1324
  def _save_crops(self, crops, recs):
1325
  mdr_ensure_directory(self.crop_dir)
@@ -1645,35 +1611,34 @@ class MDROcrEngine:
1645
  except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
1646
  return self._text_system
1647
 
 
1648
  def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
1649
  """Finds and recognizes text fragments in a NumPy image (BGR)."""
1650
  system = self._get_system()
1651
  if system is None:
1652
- print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.") # DEBUG
1653
- return # Empty generator
1654
 
1655
- img_for_system = self._preprocess(image_np) # _preprocess handles BGR/BGRA/GRAY to BGR
1656
- print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}") # DEBUG
1657
 
1658
  try:
1659
- # system.__call__ should return (list_of_boxes, list_of_tuples_text_score)
1660
- boxes, recs = system(img_for_system) # recs should be list of (text, score)
1661
  except Exception as e:
1662
- print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}") # DEBUG
1663
  import traceback
1664
  traceback.print_exc()
1665
- return # Empty generator
1666
 
1667
  if not boxes or not recs:
1668
- print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes)}) or no recs ({len(recs)}). No fragments generated.") # DEBUG
1669
- return # Empty generator
1670
 
1671
  if len(boxes) != len(recs):
1672
- print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic.")
1673
- # Potentially try to recover by taking the minimum length, or just return
1674
  return
1675
 
1676
- print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.") # DEBUG
1677
  fragments_generated_count = 0
1678
  for i, (box_pts, rec_tuple) in enumerate(zip(boxes, recs)):
1679
  if not isinstance(rec_tuple, (list, tuple)) or len(rec_tuple) != 2:
@@ -1681,25 +1646,26 @@ class MDROcrEngine:
1681
  continue
1682
 
1683
  txt, conf = rec_tuple
1684
- # The filtering by drop_score and whitespace should have happened in _MDR_TextSystem
1685
- # But we can add a redundant check or rely on it.
1686
- # For MDROcrFragment, we just need valid text and geometry.
1687
- if not txt or mdr_is_whitespace(txt): # Basic check, though system should filter
1688
- # print(f" DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.")
1689
  continue
1690
 
1691
- pts = [(float(p[0]), float(p[1])) for p in box_pts]
1692
- if len(pts) == 4:
1693
- r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
1694
- if r.is_valid and r.area > 1: # Ensure valid geometry
1695
- yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
1696
- fragments_generated_count += 1
 
 
 
1697
  # else:
1698
- # print(f" DEBUG OCR Engine: Fragment {i} has invalid/small rectangle. Area: {r.area}. Valid: {r.is_valid}. Skipping.")
1699
- # else:
1700
- # print(f" DEBUG OCR Engine: Fragment {i} box_pts not length 4: {len(pts)}. Skipping.")
 
1701
 
1702
- print(f" DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.") # DEBUG
1703
 
1704
  def _preprocess(self, img: np.ndarray) -> np.ndarray:
1705
  if len(img.shape) == 3 and img.shape[2] == 4:
@@ -1729,47 +1695,101 @@ def mdr_prepare_reader_inputs(inputs: Dict[str, torch.Tensor], model: LayoutLMv3
1729
  return {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
1730
 
1731
  def mdr_parse_reader_logits(logits: torch.Tensor, length: int) -> List[int]:
1732
- print(f"mdr_parse_reader_logits: Called with logits shape: {logits.shape}, length: {length}") # ADDED
1733
  if length == 0:
1734
- print("mdr_parse_reader_logits: length is 0, returning empty list.") # ADDED
1735
  return []
1736
 
1737
- # --- Debugging the slice ---
1738
- print(f"mdr_parse_reader_logits: Attempting to slice logits with [1 : {length + 1}, :{length}]") # ADDED
1739
  try:
1740
  rel_logits = logits[1 : length + 1, :length]
1741
- print(f"mdr_parse_reader_logits: rel_logits shape: {rel_logits.shape}") # ADDED
1742
  except IndexError as e:
1743
- print(f"mdr_parse_reader_logits: IndexError during rel_logits slicing! Error: {e}") # ADDED
1744
  import traceback
1745
  traceback.print_exc()
1746
- raise # Re-raise to see it in FastAPI error handling if possible
 
1747
 
1748
  orders = rel_logits.argmax(dim=1).tolist()
1749
- print(f"mdr_parse_reader_logits: Initial orders calculated. Count: {len(orders)}") # ADDED
 
 
 
 
 
 
 
 
1750
 
1751
- loop_count = 0 # ADDED to detect potential infinite loops
1752
- max_loops = length * length # A generous upper bound for loop iterations; adjust if needed
1753
  while True:
1754
  loop_count += 1
1755
  if loop_count > max_loops:
1756
- print(f"mdr_parse_reader_logits: Exceeded max_loops ({max_loops}), breaking while loop to prevent infinite loop.") # ADDED
1757
- break # Safety break
1758
 
1759
- print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}") # ADDED
1760
  conflicts = defaultdict(list)
1761
  [conflicts[order].append(idx) for idx, order in enumerate(orders)]
1762
- conflicting_orders = {o: idxs for o, idxs in conflicts.items() if len(idxs) > 1}
 
 
1763
 
1764
- if not conflicting_orders:
1765
- print("mdr_parse_reader_logits: No conflicting orders, breaking while loop.") # ADDED
1766
  break
1767
-
1768
- print(f"mdr_parse_reader_logits: Found {len(conflicting_orders)} conflicting orders.") # ADDED
1769
- # ... (rest of the conflict resolution logic) ...
1770
- # Consider adding prints inside the inner loops too if it still hangs here.
1771
-
1772
- print(f"mdr_parse_reader_logits: While loop finished after {loop_count} iterations. Returning {len(orders)} orders.") # ADDED
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1773
  return orders
1774
 
1775
  # --- MDR Layout Reading Engine ---
@@ -1790,45 +1810,30 @@ class MDRLayoutReader:
1790
  self._device = "cpu"
1791
  print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
1792
 
 
1793
  def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
1794
  if self._model is None:
1795
- cache = mdr_ensure_directory(self._model_path)
 
 
 
 
1796
  name = "microsoft/layoutlmv3-base"
1797
- # The h_path was for a specific fine-tuned model 'hantian/layoutreader'
1798
- # If you intend to use a specific fine-tuned head, ensure it's correctly downloaded
1799
- # and compatible. For now, let's assume microsoft/layoutlmv3-base is the target
1800
- # if a more specific one isn't found or intended.
1801
- # The original code had a slightly confusing h_path logic.
1802
- # Let's simplify to prioritize a local cache of "microsoft/layoutlmv3-base"
1803
- # or a specific model if `self._model_path` points to a complete model directory.
1804
-
1805
- model_load_path = name # Default to Hugging Face model name
1806
- local_files_only_flag = False
1807
-
1808
- # Check if self._model_path is a directory containing a full model
1809
- # (e.g., config.json, pytorch_model.bin)
1810
- # This part of the original logic for 'h_path' was a bit specific.
1811
- # For LayoutLMv3, usually, you'd just use "microsoft/layoutlmv3-base"
1812
- # and let transformers handle caching, or provide a path to a fully saved model.
1813
-
1814
- # Let's assume the primary goal is to load "microsoft/layoutlmv3-base"
1815
- # and allow it to be cached in `self._model_path/layoutreader`
1816
- # The `cache_dir` argument to `from_pretrained` handles this.
1817
-
1818
- print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{model_load_path}'. Cache dir: {cache}")
1819
  try:
1820
  self._model = LayoutLMv3ForTokenClassification.from_pretrained(
1821
- model_load_path,
1822
- cache_dir=cache, # Transformers will cache here
1823
- local_files_only=local_files_only_flag, # Set to True if you want to force local only after first download
1824
- num_labels=_MDR_MAX_LEN+1 # This is for the classification head
1825
  )
1826
  # Explicitly move model to the determined device
1827
- self._model.to(torch.device(self._device)) # MODIFIED LINE
1828
  self._model.eval()
1829
- print(f"MDR LayoutReader model '{model_load_path}' loaded successfully on device: {self._model.device}.") # Use model.device
1830
  except Exception as e:
1831
- print(f"ERROR loading MDR LayoutReader model '{model_load_path}': {e}")
1832
  import traceback
1833
  traceback.print_exc()
1834
  self._model = None
@@ -1836,68 +1841,47 @@ class MDRLayoutReader:
1836
 
1837
  def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
1838
  w, h = size
1839
- if w <= 0 or h <= 0 or not layouts:
1840
- print("MDRLayoutReader: Invalid size or no layouts, returning early.")
1841
  return layouts
 
 
 
1842
 
1843
  model = self._get_model()
1844
-
1845
- if model is None: # Fallback geometric sort
1846
- print("MDRLayoutReader: Model is None, using fallback geometric sort.")
1847
- layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
1848
- nfo = 0
1849
- for l in layouts:
1850
- l.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
1851
- [setattr(f, 'order', i + nfo) for i, f in enumerate(l.fragments)]
1852
- nfo += len(l.fragments)
1853
- return layouts
1854
-
1855
- print("MDRLayoutReader: Preparing bboxes...") # ADDED
1856
  bbox_list = self._prepare_bboxes(layouts, w, h)
1857
- print(f"MDRLayoutReader: Prepared {len(bbox_list) if bbox_list else 'None or 0'} bboxes.")
1858
 
1859
- if bbox_list is None or len(bbox_list) == 0:
1860
- print("MDRLayoutReader: No bboxes to process, returning layouts.")
 
 
1861
  return layouts
1862
-
1863
- l_size = 1000.0
1864
- xs = l_size / float(w)
1865
- ys = l_size / float(h)
1866
- scaled_bboxes = []
1867
- for bbox in bbox_list:
1868
- x0, y0, x1, y1 = bbox.value
1869
- sx0 = max(0, min(l_size - 1, round(x0 * xs)))
1870
- sy0 = max(0, min(l_size - 1, round(y0 * ys)))
1871
- sx1 = max(0, min(l_size - 1, round(x1 * xs)))
1872
- sy1 = max(0, min(l_size - 1, round(y1 * ys)))
1873
- scaled_bboxes.append([min(sx0, sx1), min(sy0, sy1), max(sx0, sx1), max(sy0, sy1)])
1874
- print("MDRLayoutReader: Scaled bboxes prepared. Count: ", len(scaled_bboxes))
1875
- orders = []
1876
  try:
1877
  with torch.no_grad():
1878
- print("MDRLayoutReader: Creating reader inputs...") # ADDED
1879
- inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
1880
- print("MDRLayoutReader: Preparing inputs for model device...") # ADDED
1881
  inputs = mdr_prepare_reader_inputs(inputs, model)
1882
- print("MDRLayoutReader: Running model inference...") # ADDED
1883
  logits = model(**inputs).logits.cpu().squeeze(0)
1884
- print("MDRLayoutReader: Model inference complete. Parsing logits...") # ADDED
1885
  orders = mdr_parse_reader_logits(logits, len(bbox_list))
1886
- print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}") # ADDED
1887
  except Exception as e:
1888
  print(f"MDR LayoutReader prediction error: {e}")
1889
  import traceback
1890
- traceback.print_exc() # ADDED for full traceback
1891
- return layouts # Fallback
1892
-
1893
- if len(orders) != len(bbox_list):
1894
- print(f"MDR LayoutReader order mismatch. Orders: {len(orders)}, BBoxes: {len(bbox_list)}")
1895
- return layouts # Fallback
1896
- for i, order_idx in enumerate(orders):
1897
- bbox_list[i].order = order_idx
1898
  print("MDRLayoutReader: Applying order...")
1899
- result_layouts = self._apply_order(layouts, bbox_list)
1900
- print("MDRLayoutReader: Order applied. Returning layouts.") # ADDED
1901
  return result_layouts
1902
 
1903
  def _prepare_bboxes(self, layouts: list[MDRLayoutElement], w: int, h: int) -> list[_MDR_ReaderBBox] | None:
 
712
  scores.append(score)
713
  return boxes, scores
714
 
715
+ # In class _MDR_DBPostProcess:
716
  def _boxes_from_bitmap(self, pred, bmp, dw, dh): # pred is the probability map, bmp is the binarized map
717
  h, w = bmp.shape
718
+ # ADDED: More detailed logging
719
+ print(f" DEBUG OCR: _boxes_from_bitmap: Processing bitmap of shape {h}x{w} for original dimensions {dw:.1f}x{dh:.1f}.")
720
  contours, _ = cv2.findContours((bmp * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
721
  num_contours_found = len(contours)
722
+ print(f" DEBUG OCR: _boxes_from_bitmap: Found {num_contours_found} raw contours.")
723
 
724
  num_contours_to_process = min(num_contours_found, self.max_cand)
725
  if num_contours_found > self.max_cand:
726
+ print(f" DEBUG OCR: _boxes_from_bitmap: Processing limited to {self.max_cand} contours (max_candidates).")
727
 
728
  boxes, scores = [], []
729
  kept_boxes_count = 0
 
731
  contour = contours[i]
732
  pts_mini_box, sside = self._get_mini_boxes(contour)
733
  if sside < self.min_sz:
734
+ # print(f" DEBUG OCR: Contour {i} too small (sside {sside:.2f} < min_sz {self.min_sz}). Skipping.") # Can be too verbose
735
  continue
736
 
737
  pts_arr = np.array(pts_mini_box)
 
738
  current_score = self._box_score_fast(pred, pts_arr.reshape(-1, 2)) if self.score_m == "fast" else self._box_score_slow(pred, contour)
739
 
740
  if self.box_thresh > current_score:
 
742
  continue
743
 
744
  try:
 
745
  box_unclipped = self._unclip(pts_arr, self.unclip_r).reshape(-1, 1, 2)
746
  except Exception as e_unclip:
747
  # print(f" DEBUG OCR: Contour {i} unclip failed: {e_unclip}. Skipping.") # Can be too verbose
 
749
 
750
  box_final, sside_final = self._get_mini_boxes(box_unclipped)
751
  if sside_final < self.min_sz + 2: # min_sz is 3
752
+ # print(f" DEBUG OCR: Contour {i} final size after unclip too small (sside_final {sside_final:.2f} < {self.min_sz + 2}). Skipping.") # Can be too verbose
753
  continue
754
 
755
  box_final_arr = np.array(box_final)
 
756
  box_final_arr[:, 0] = np.clip(np.round(box_final_arr[:, 0] / w * dw), 0, dw)
757
  box_final_arr[:, 1] = np.clip(np.round(box_final_arr[:, 1] / h * dh), 0, dh)
758
 
759
  boxes.append(box_final_arr.astype("int32"))
760
  scores.append(current_score)
761
  kept_boxes_count +=1
762
+ print(f" DEBUG OCR: _boxes_from_bitmap: Kept {kept_boxes_count} boxes after all filtering (size, score, unclip). Configured box_thresh: {self.box_thresh}, min_sz: {self.min_sz}.")
763
  return np.array(boxes, dtype="int32"), scores
764
 
765
  def _unclip(self, box, ratio):
 
805
  cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
806
  return cv2.mean(bmp[ymin : ymax + 1, xmin : xmax + 1], mask)[0] if np.sum(mask) > 0 else 0.0
807
 
 
808
  def __call__(self, outs_dict, shape_list):
809
  pred = outs_dict['maps'][:, 0, :, :]
810
  seg = pred > self.thresh
811
+ # ADDED: More detailed logging
812
+ print(f" DEBUG OCR: _MDR_DBPostProcess: pred map shape: {pred.shape}, seg map shape: {seg.shape}, configured thresh: {self.thresh}")
813
+ print(f" DEBUG OCR: _MDR_DBPostProcess: Number of pixels in seg map above threshold (sum of all batches): {np.sum(seg)}")
814
 
815
  boxes_batch = []
816
  for batch_idx in range(pred.shape[0]):
817
+ # MODIFIED: Ensure sh, sw are floats for division if they come from shape_list
818
+ sh_orig, sw_orig, rh_ratio, rw_ratio = shape_list[batch_idx]
819
+ # The dw, dh for _boxes_from_bitmap should be the original image dimensions before DetResizeForTest
820
+ # shape_list contains [src_h, src_w, ratio_h, ratio_w]
821
+ # So dw = src_w, dh = src_h
822
+ dw_orig, dh_orig = sw_orig, sh_orig
823
+
824
  current_pred_map = pred[batch_idx]
825
  current_seg_map = seg[batch_idx]
826
 
827
  mask = cv2.dilate(np.array(current_seg_map).astype(np.uint8), self.dila_k) if self.dila_k is not None else current_seg_map
828
+ print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
829
 
830
  if self.box_t == 'poly':
831
+ boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dw_orig, dh_orig)
832
  elif self.box_t == 'quad':
833
+ boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dw_orig, dh_orig) # Pass original dimensions
834
  else:
835
  raise ValueError("box_type must be 'quad' or 'poly'")
836
+ print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Found {len(boxes)} boxes from bitmap processing.")
837
  boxes_batch.append({'points': boxes})
838
  return boxes_batch
839
 
 
891
  new_boxes.append(box)
892
  return np.array(new_boxes)
893
 
894
+ # In class _MDR_TextDetector:
895
  def __call__(self, img):
896
  ori_im = img.copy()
897
  data = {"image": img}
898
+ print(f" DEBUG OCR: _MDR_TextDetector: Original image shape: {ori_im.shape}")
899
+
900
+ # Preprocessing
901
+ try:
902
+ data = mdr_ocr_transform(data, self.pre_op)
903
+ except Exception as e_preproc:
904
+ print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
905
+ import traceback
906
+ traceback.print_exc()
907
+ return np.array([]) # Return empty array on failure
908
+
909
  if data is None:
910
+ print(" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
911
+ return np.array([])
912
 
913
+ processed_img, shape_list = data # shape_list is [src_h, src_w, ratio_h, ratio_w]
914
  if processed_img is None:
915
+ print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
916
+ return np.array([])
917
+ print(f" DEBUG OCR: _MDR_TextDetector: Processed image shape for ONNX: {processed_img.shape}, shape_list: {shape_list}")
918
 
919
  img_for_onnx = np.expand_dims(processed_img, axis=0)
920
  shape_list_for_onnx = np.expand_dims(shape_list, axis=0)
921
+ img_for_onnx = img_for_onnx.copy()
922
 
923
  inputs = self.get_input_feed(self.input_name, img_for_onnx)
924
+ print(f" DEBUG OCR: _MDR_TextDetector: Running ONNX inference for text detection...")
925
  try:
926
  outputs = self.sess.run(self.output_name, input_feed=inputs)
927
+ except Exception as e_infer:
928
+ print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
929
  import traceback
930
  traceback.print_exc()
931
+ return np.array([]) # Return empty array on failure
932
+ print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
933
 
934
  preds = {"maps": outputs[0]}
935
+ try:
936
+ post_res = self.post_op(preds, shape_list_for_onnx)
937
+ except Exception as e_postproc:
938
+ print(f" DEBUG OCR: _MDR_TextDetector: Error during DBPostProcess: {e_postproc}")
939
+ import traceback
940
+ traceback.print_exc()
941
+ return np.array([])
942
+
943
+ if not post_res or not post_res[0].get('points'):
944
+ print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no points.")
945
+ return np.array([])
946
+
947
  boxes_from_post = post_res[0]['points']
948
+ print(f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
949
+
950
+ if not isinstance(boxes_from_post, (list, np.ndarray)) or len(boxes_from_post) == 0: # Check if it's empty or not list-like
951
+ print(" DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter.")
952
+ return np.array([])
953
 
954
  if self.args.det_box_type == 'poly':
955
  final_boxes = self._filter_poly(boxes_from_post, ori_im.shape)
956
  else: # 'quad'
957
  final_boxes = self._filter_quad(boxes_from_post, ori_im.shape)
958
+ print(f" DEBUG OCR: _MDR_TextDetector: Boxes after final poly/quad filtering: {len(final_boxes)}")
959
  return final_boxes
960
 
961
  class _MDR_ClsPostProcess:
 
1116
  self.input_name = self.get_input_name(self.sess)
1117
  self.output_name = self.get_output_name(self.sess)
1118
 
1119
+ # In class _MDR_TextRecognizer
1120
  def _resize_norm(self, img, max_r): # img is a single crop
1121
  imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
1122
  h_orig, w_orig = img.shape[:2]
1123
+ # ADDED: Log input crop shape
1124
  print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
1125
 
1126
  if h_orig == 0 or w_orig == 0:
1127
+ print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
1128
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1129
 
1130
  r_current = w_orig / float(h_orig)
 
1131
  tw = min(imgW, int(ceil(imgH * r_current)))
1132
+ tw = max(1, tw)
1133
  print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
1134
 
1135
  try:
1136
+ resized = cv2.resize(img, (tw, imgH))
1137
+ except cv2.error as e_resize: # Catch specific cv2 error
1138
+ print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
 
1139
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1140
+ except Exception as e_resize_general: # Catch any other unexpected error
1141
+ print(f" DEBUG RECOGNIZER: _resize_norm general error during resize: {e_resize_general}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
1142
+ import traceback
1143
+ traceback.print_exc()
1144
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1145
+
1146
 
1147
  resized = resized.astype("float32")
1148
+ if imgC == 1 and len(resized.shape) == 3:
 
 
1149
  resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
1150
+ if len(resized.shape) == 2:
1151
+ resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
 
1152
 
1153
+ # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
1154
+ if imgC == 3 and resized.shape[2] == 1:
1155
+ resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
1156
+
1157
+
1158
+ resized = resized.transpose((2, 0, 1)) / 255.0
1159
+ resized -= 0.5
1160
+ resized /= 0.5
1161
 
1162
  padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1163
+ padding[:, :, 0:tw] = resized
1164
  print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
1165
+
1166
+ # ADDED: Log normalized crop properties
1167
+ min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
1168
  print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
1169
+ f"dtype: {padding.dtype}, "
1170
+ f"MinPx: {min_px:.4f}, "
1171
+ f"MaxPx: {max_px:.4f}, "
1172
+ f"MeanPx: {mean_px:.4f}")
1173
  if np.all(padding == 0):
1174
  print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
1175
+ elif np.abs(max_px - min_px) < 1e-6 : # Check if all elements are (close to) the same
1176
+ print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
 
1177
  return padding
1178
 
1179
  def __call__(self, img_list):
 
1227
  try: return list(sorted(boxes, key=key))
1228
  except: return list(boxes) # Fallback
1229
 
1230
+ # In class _MDR_TextRecognizer
1231
+ def _resize_norm(self, img, max_r): # img is a single crop
1232
+ imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
1233
+ h_orig, w_orig = img.shape[:2]
1234
+ # ADDED: Log input crop shape
1235
+ print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
 
 
1236
 
1237
+ if h_orig == 0 or w_orig == 0:
1238
+ print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
1239
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1240
 
1241
+ r_current = w_orig / float(h_orig)
1242
+ tw = min(imgW, int(ceil(imgH * r_current)))
1243
+ tw = max(1, tw)
1244
+ print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1246
  try:
1247
+ resized = cv2.resize(img, (tw, imgH))
1248
+ except cv2.error as e_resize: # Catch specific cv2 error
1249
+ print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
1250
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1251
+ except Exception as e_resize_general: # Catch any other unexpected error
1252
+ print(f" DEBUG RECOGNIZER: _resize_norm general error during resize: {e_resize_general}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
1253
+ import traceback
1254
+ traceback.print_exc()
1255
+ return np.zeros((imgC, imgH, imgW), dtype=np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1256
 
1257
 
1258
+ resized = resized.astype("float32")
1259
+ if imgC == 1 and len(resized.shape) == 3:
1260
+ resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
1261
+ if len(resized.shape) == 2:
1262
+ resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
1263
 
1264
+ # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
1265
+ if imgC == 3 and resized.shape[2] == 1:
1266
+ resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
1267
+
1268
+
1269
+ resized = resized.transpose((2, 0, 1)) / 255.0
1270
+ resized -= 0.5
1271
+ resized /= 0.5
1272
+
1273
+ padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1274
+ padding[:, :, 0:tw] = resized
1275
+ print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
1276
+
1277
+ # ADDED: Log normalized crop properties
1278
+ min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
1279
+ print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
1280
+ f"dtype: {padding.dtype}, "
1281
+ f"MinPx: {min_px:.4f}, "
1282
+ f"MaxPx: {max_px:.4f}, "
1283
+ f"MeanPx: {mean_px:.4f}")
1284
+ if np.all(padding == 0):
1285
+ print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
1286
+ elif np.abs(max_px - min_px) < 1e-6 : # Check if all elements are (close to) the same
1287
+ print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
1288
+ return padding
1289
 
1290
  def _save_crops(self, crops, recs):
1291
  mdr_ensure_directory(self.crop_dir)
 
1611
  except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
1612
  return self._text_system
1613
 
1614
+ # In class MDROcrEngine:
1615
  def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
1616
  """Finds and recognizes text fragments in a NumPy image (BGR)."""
1617
  system = self._get_system()
1618
  if system is None:
1619
+ print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.")
1620
+ return
1621
 
1622
+ img_for_system = self._preprocess(image_np)
1623
+ print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}")
1624
 
1625
  try:
1626
+ boxes, recs = system(img_for_system)
 
1627
  except Exception as e:
1628
+ print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}")
1629
  import traceback
1630
  traceback.print_exc()
1631
+ return
1632
 
1633
  if not boxes or not recs:
1634
+ print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes) if boxes is not None else 'None'}) or no recs ({len(recs) if recs is not None else 'None'}). No fragments generated.")
1635
+ return
1636
 
1637
  if len(boxes) != len(recs):
1638
+ print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic. No fragments generated.")
 
1639
  return
1640
 
1641
+ print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.")
1642
  fragments_generated_count = 0
1643
  for i, (box_pts, rec_tuple) in enumerate(zip(boxes, recs)):
1644
  if not isinstance(rec_tuple, (list, tuple)) or len(rec_tuple) != 2:
 
1646
  continue
1647
 
1648
  txt, conf = rec_tuple
1649
+ if not txt or mdr_is_whitespace(txt):
1650
+ # print(f" DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.") # Already logged in TextSystem
 
 
 
1651
  continue
1652
 
1653
+ try:
1654
+ pts = [(float(p[0]), float(p[1])) for p in box_pts]
1655
+ if len(pts) == 4:
1656
+ r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
1657
+ if r.is_valid and r.area > 1:
1658
+ yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
1659
+ fragments_generated_count += 1
1660
+ # else:
1661
+ # print(f" DEBUG OCR Engine: Fragment {i} has invalid/small rectangle. Area: {r.area:.2f}. Valid: {r.is_valid}. Skipping.")
1662
  # else:
1663
+ # print(f" DEBUG OCR Engine: Fragment {i} box_pts not length 4: {len(pts)}. Skipping.")
1664
+ except Exception as e_frag:
1665
+ print(f" DEBUG OCR Engine: Error creating MDROcrFragment for item {i}: {e_frag}")
1666
+ continue
1667
 
1668
+ print(f" DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.")
1669
 
1670
  def _preprocess(self, img: np.ndarray) -> np.ndarray:
1671
  if len(img.shape) == 3 and img.shape[2] == 4:
 
1695
  return {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
1696
 
1697
  def mdr_parse_reader_logits(logits: torch.Tensor, length: int) -> List[int]:
1698
+ print(f"mdr_parse_reader_logits: Called with logits shape: {logits.shape}, length: {length}")
1699
  if length == 0:
1700
+ print("mdr_parse_reader_logits: length is 0, returning empty list.")
1701
  return []
1702
 
1703
+ print(f"mdr_parse_reader_logits: Attempting to slice logits with [1 : {length + 1}, :{length}]")
 
1704
  try:
1705
  rel_logits = logits[1 : length + 1, :length]
1706
+ print(f"mdr_parse_reader_logits: rel_logits shape: {rel_logits.shape}")
1707
  except IndexError as e:
1708
+ print(f"mdr_parse_reader_logits: IndexError during rel_logits slicing! Error: {e}")
1709
  import traceback
1710
  traceback.print_exc()
1711
+ # Depending on desired behavior, either raise or return empty/fallback
1712
+ return list(range(length)) # Fallback to sequential order if slicing fails
1713
 
1714
  orders = rel_logits.argmax(dim=1).tolist()
1715
+ print(f"mdr_parse_reader_logits: Initial orders calculated. Count: {len(orders)}")
1716
+
1717
+ # ADDED: Loop safeguard
1718
+ loop_count = 0
1719
+ # Max loops: if N items, N^2 is a generous limit for pairwise comparisons/adjustments.
1720
+ # For N=33, N^2 = 1089. For N=21, N^2 = 441. This matches the logs.
1721
+ # A tighter bound might be N * (N-1) / 2 or N * some_factor.
1722
+ # Let's use N * N as seen in logs, or a fixed large number if N is small.
1723
+ max_loops = max(50, length * length) # Ensure at least 50 loops for small N
1724
 
 
 
1725
  while True:
1726
  loop_count += 1
1727
  if loop_count > max_loops:
1728
+ print(f"mdr_parse_reader_logits: Exceeded max_loops ({max_loops}), breaking while loop to prevent infinite loop.")
1729
+ break
1730
 
1731
+ # print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}") # Can be too verbose
1732
  conflicts = defaultdict(list)
1733
  [conflicts[order].append(idx) for idx, order in enumerate(orders)]
1734
+
1735
+ # Filter to find actual conflicting orders (where multiple original indices map to the same target order)
1736
+ conflicting_orders_map = {o: idxs for o, idxs in conflicts.items() if len(idxs) > 1}
1737
 
1738
+ if not conflicting_orders_map:
1739
+ # print("mdr_parse_reader_logits: No conflicting orders, breaking while loop.") # Verbose
1740
  break
1741
+
1742
+ # Log only if there are actual conflicts to resolve
1743
+ if loop_count == 1 or loop_count % 10 == 0 : # Log first and every 10th iteration with conflicts
1744
+ print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}. Found {len(conflicting_orders_map)} conflicting orders.")
1745
+
1746
+
1747
+ for order_val, c_idxs in conflicting_orders_map.items():
1748
+ # This logic seems to pick the one with the highest score for that conflicting order.
1749
+ # It might need more sophisticated tie-breaking if scores are identical or very close.
1750
+ # The original logic was:
1751
+ # best_idx = -1; max_score = -float('inf')
1752
+ # for c_idx in c_idxs:
1753
+ # score = rel_logits[c_idx, order_val].item()
1754
+ # if score > max_score: max_score = score; best_idx = c_idx
1755
+ # for c_idx in c_idxs:
1756
+ # if c_idx != best_idx: orders[c_idx] = -1 # Mark for re-evaluation or different assignment
1757
+
1758
+ # Simpler approach: keep the first one, mark others to be reassigned.
1759
+ # This might not be optimal but could break cycles.
1760
+ # A more robust solution might involve graph-based cycle detection or a different assignment strategy.
1761
+ # For now, let's stick to a slight modification of the implied original logic:
1762
+ # The one with the highest confidence for *that specific order_val* keeps it.
1763
+ # Others get their order reset to their own index (diagonal) or -1 to be re-evaluated.
1764
+
1765
+ if not c_idxs: continue
1766
+
1767
+ best_c_idx_for_this_order = -1
1768
+ max_confidence_for_this_order = -float('inf')
1769
+
1770
+ for current_c_idx in c_idxs:
1771
+ confidence = rel_logits[current_c_idx, order_val].item()
1772
+ if confidence > max_confidence_for_this_order:
1773
+ max_confidence_for_this_order = confidence
1774
+ best_c_idx_for_this_order = current_c_idx
1775
+
1776
+ # Now, for all conflicting indices for this 'order_val',
1777
+ # if they are not the 'best_c_idx_for_this_order',
1778
+ # they need a new order. A simple strategy is to make them point to themselves initially.
1779
+ # Or, find their next best alternative.
1780
+ for current_c_idx in c_idxs:
1781
+ if current_c_idx != best_c_idx_for_this_order:
1782
+ # Option 1: Reset to self (might not resolve complex cycles)
1783
+ # orders[current_c_idx] = current_c_idx
1784
+
1785
+ # Option 2: Find next best order for this current_c_idx, excluding the conflicting 'order_val'
1786
+ # Create a temporary copy of its logits row, set the conflicting order's logit to -inf
1787
+ temp_logits_row = rel_logits[current_c_idx, :].clone()
1788
+ temp_logits_row[order_val] = -float('inf')
1789
+ orders[current_c_idx] = temp_logits_row.argmax().item()
1790
+
1791
+
1792
+ print(f"mdr_parse_reader_logits: While loop finished after {loop_count} iterations. Returning {len(orders)} orders.")
1793
  return orders
1794
 
1795
  # --- MDR Layout Reading Engine ---
 
1810
  self._device = "cpu"
1811
  print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
1812
 
1813
+ # In class MDRLayoutReader:
1814
  def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
1815
  if self._model is None:
1816
+ cache = mdr_ensure_directory(self._model_path) # This should be self._model_path / "layoutreader"
1817
+ # Correct cache path for transformers
1818
+ layoutreader_cache_dir = Path(self._model_dir) / "layoutreader" # Assuming _model_dir is the main one
1819
+ mdr_ensure_directory(str(layoutreader_cache_dir))
1820
+
1821
  name = "microsoft/layoutlmv3-base"
1822
+
1823
+ print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1824
  try:
1825
  self._model = LayoutLMv3ForTokenClassification.from_pretrained(
1826
+ name, # Use the HF model name
1827
+ cache_dir=str(layoutreader_cache_dir),
1828
+ local_files_only=False, # Allow download on first run
1829
+ num_labels=_MDR_MAX_LEN+1
1830
  )
1831
  # Explicitly move model to the determined device
1832
+ self._model.to(torch.device(self._device)) # ENSURE THIS LINE IS PRESENT AND CORRECT
1833
  self._model.eval()
1834
+ print(f"MDR LayoutReader model '{name}' loaded successfully on device: {self._model.device}.")
1835
  except Exception as e:
1836
+ print(f"ERROR loading MDR LayoutReader model '{name}': {e}")
1837
  import traceback
1838
  traceback.print_exc()
1839
  self._model = None
 
1841
 
1842
  def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
1843
  w, h = size
1844
+ if w <= 0 or h <= 0: # ADDED check for invalid size
1845
+ print("MDRLayoutReader: Invalid image size (w or h <= 0), returning layouts as is.")
1846
  return layouts
1847
+ if not layouts:
1848
+ print("MDRLayoutReader: No layouts to process, returning empty list.")
1849
+ return [] # Return empty list if no layouts
1850
 
1851
  model = self._get_model()
1852
+ # ... (rest of the method, add logging as needed) ...
1853
+ print("MDRLayoutReader: Preparing bboxes...")
 
 
 
 
 
 
 
 
 
 
1854
  bbox_list = self._prepare_bboxes(layouts, w, h)
 
1855
 
1856
+ if bbox_list is None or len(bbox_list) == 0: # Check if bbox_list is None or empty
1857
+ print("MDRLayoutReader: No bboxes prepared from layouts, returning layouts as is (possibly sorted geometrically).")
1858
+ # Fallback geometric sort if no bboxes could be prepared
1859
+ layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
1860
  return layouts
1861
+ print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
1862
+ # ... (rest of the scaling and inference logic) ...
 
 
 
 
 
 
 
 
 
 
 
 
1863
  try:
1864
  with torch.no_grad():
1865
+ print("MDRLayoutReader: Creating reader inputs...")
1866
+ inputs = mdr_boxes_to_reader_inputs(scaled_bboxes) # scaled_bboxes comes from the loop above
1867
+ print("MDRLayoutReader: Preparing inputs for model device...")
1868
  inputs = mdr_prepare_reader_inputs(inputs, model)
1869
+ print("MDRLayoutReader: Running model inference...")
1870
  logits = model(**inputs).logits.cpu().squeeze(0)
1871
+ print("MDRLayoutReader: Model inference complete. Parsing logits...")
1872
  orders = mdr_parse_reader_logits(logits, len(bbox_list))
1873
+ print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
1874
  except Exception as e:
1875
  print(f"MDR LayoutReader prediction error: {e}")
1876
  import traceback
1877
+ traceback.print_exc()
1878
+ # Fallback geometric sort on error
1879
+ layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
1880
+ return layouts
1881
+ # ... (rest of applying order) ...
 
 
 
1882
  print("MDRLayoutReader: Applying order...")
1883
+ result_layouts = self._apply_order(layouts, bbox_list) # Ensure bbox_list has 'order' attribute set
1884
+ print("MDRLayoutReader: Order applied. Returning layouts.")
1885
  return result_layouts
1886
 
1887
  def _prepare_bboxes(self, layouts: list[MDRLayoutElement], w: int, h: int) -> list[_MDR_ReaderBBox] | None: