Update mdr_pdf_parser.py

mdr_pdf_parser.py  (+295 −311)  CHANGED
@@ -712,17 +712,18 @@ class _MDR_DBPostProcess:
         scores.append(score)
         return boxes, scores
 
-
+    # In class _MDR_DBPostProcess:
     def _boxes_from_bitmap(self, pred, bmp, dw, dh): # pred is the probability map, bmp is the binarized map
         h, w = bmp.shape
-
+        # ADDED: More detailed logging
+        print(f" DEBUG OCR: _boxes_from_bitmap: Processing bitmap of shape {h}x{w} for original dimensions {dw:.1f}x{dh:.1f}.")
         contours, _ = cv2.findContours((bmp * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
         num_contours_found = len(contours)
-        print(f" DEBUG OCR: _boxes_from_bitmap: Found {num_contours_found} raw contours.")
+        print(f" DEBUG OCR: _boxes_from_bitmap: Found {num_contours_found} raw contours.")
 
         num_contours_to_process = min(num_contours_found, self.max_cand)
         if num_contours_found > self.max_cand:
-            print(f" DEBUG OCR: _boxes_from_bitmap: Processing limited to {self.max_cand} contours.")
+            print(f" DEBUG OCR: _boxes_from_bitmap: Processing limited to {self.max_cand} contours (max_candidates).")
 
         boxes, scores = [], []
         kept_boxes_count = 0
@@ -730,11 +731,10 @@ class _MDR_DBPostProcess:
             contour = contours[i]
             pts_mini_box, sside = self._get_mini_boxes(contour)
             if sside < self.min_sz:
-                # print(f" DEBUG OCR: Contour {i} too small (sside {sside} < min_sz {self.min_sz}). Skipping.") # Can be too verbose
+                # print(f" DEBUG OCR: Contour {i} too small (sside {sside:.2f} < min_sz {self.min_sz}). Skipping.") # Can be too verbose
                 continue
 
             pts_arr = np.array(pts_mini_box)
-            # score_mode is 'fast' by default
             current_score = self._box_score_fast(pred, pts_arr.reshape(-1, 2)) if self.score_m == "fast" else self._box_score_slow(pred, contour)
 
             if self.box_thresh > current_score:
@@ -742,7 +742,6 @@ class _MDR_DBPostProcess:
                 continue
 
             try:
-                # unclip_ratio is self.unclip_r (default 1.5)
                 box_unclipped = self._unclip(pts_arr, self.unclip_r).reshape(-1, 1, 2)
             except Exception as e_unclip:
                 # print(f" DEBUG OCR: Contour {i} unclip failed: {e_unclip}. Skipping.") # Can be too verbose
@@ -750,18 +749,17 @@ class _MDR_DBPostProcess:
 
             box_final, sside_final = self._get_mini_boxes(box_unclipped)
             if sside_final < self.min_sz + 2: # min_sz is 3
-                # print(f" DEBUG OCR: Contour {i} final size after unclip too small (sside_final {sside_final} < {self.min_sz + 2}). Skipping.") # Can be too verbose
+                # print(f" DEBUG OCR: Contour {i} final size after unclip too small (sside_final {sside_final:.2f} < {self.min_sz + 2}). Skipping.") # Can be too verbose
                 continue
 
             box_final_arr = np.array(box_final)
-            # Rescale to original image dimensions
             box_final_arr[:, 0] = np.clip(np.round(box_final_arr[:, 0] / w * dw), 0, dw)
             box_final_arr[:, 1] = np.clip(np.round(box_final_arr[:, 1] / h * dh), 0, dh)
 
             boxes.append(box_final_arr.astype("int32"))
             scores.append(current_score)
             kept_boxes_count += 1
-        print(f" DEBUG OCR: _boxes_from_bitmap: Kept {kept_boxes_count} boxes after all filtering (size, score, unclip). Configured box_thresh: {self.box_thresh}, min_sz: {self.min_sz}.")
+        print(f" DEBUG OCR: _boxes_from_bitmap: Kept {kept_boxes_count} boxes after all filtering (size, score, unclip). Configured box_thresh: {self.box_thresh}, min_sz: {self.min_sz}.")
         return np.array(boxes, dtype="int32"), scores
 
     def _unclip(self, box, ratio):
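Note on the filtering chain above (contour → mini-box → size gate → score gate → unclip → rescale): it can be exercised outside the class. A minimal sketch, assuming the `pyclipper` dependency common in DB-style post-processing; `box_thresh`, `min_size`, and `unclip` are plain stand-ins for `self.box_thresh`, `self.min_sz`, and `self._unclip`, not the project's actual methods:

```python
import cv2
import numpy as np
import pyclipper  # assumed dependency, as in most DB post-processing code

def unclip(quad: np.ndarray, ratio: float = 1.5) -> np.ndarray:
    """Expand a quad outward; offset distance = area * ratio / perimeter (DB paper)."""
    area = cv2.contourArea(quad.astype(np.float32))
    perimeter = cv2.arcLength(quad.reshape(-1, 1, 2).astype(np.float32), True)
    if perimeter < 1e-6:
        return quad
    offset = pyclipper.PyclipperOffset()
    offset.AddPath(quad.astype(np.int64).tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    expanded = offset.Execute(area * ratio / perimeter)
    return np.array(expanded[0]) if expanded else quad

def boxes_from_bitmap(prob_map: np.ndarray, bitmap: np.ndarray,
                      box_thresh: float = 0.6, min_size: int = 3) -> list:
    contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
                                   cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        rect = cv2.minAreaRect(contour)      # rotated "mini box" around the contour
        if min(rect[1]) < min_size:          # shortest-side gate, like min_sz
            continue
        quad = cv2.boxPoints(rect)
        mask = np.zeros(prob_map.shape, dtype=np.uint8)
        cv2.fillPoly(mask, [quad.astype(np.int32)], 1)
        if cv2.mean(prob_map, mask)[0] < box_thresh:   # "fast" box-score gate
            continue
        boxes.append(unclip(quad))
    return boxes
```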
@@ -807,29 +805,35 @@ class _MDR_DBPostProcess:
         cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
         return cv2.mean(bmp[ymin : ymax + 1, xmin : xmax + 1], mask)[0] if np.sum(mask) > 0 else 0.0
 
-    # In class _MDR_DBPostProcess:
     def __call__(self, outs_dict, shape_list):
         pred = outs_dict['maps'][:, 0, :, :]
         seg = pred > self.thresh
-
-        print(f" DEBUG OCR: _MDR_DBPostProcess: …
+        # ADDED: More detailed logging
+        print(f" DEBUG OCR: _MDR_DBPostProcess: pred map shape: {pred.shape}, seg map shape: {seg.shape}, configured thresh: {self.thresh}")
+        print(f" DEBUG OCR: _MDR_DBPostProcess: Number of pixels in seg map above threshold (sum of all batches): {np.sum(seg)}")
 
         boxes_batch = []
         for batch_idx in range(pred.shape[0]):
-            sh, sw …
+            # MODIFIED: Ensure sh, sw are floats for division if they come from shape_list
+            sh_orig, sw_orig, rh_ratio, rw_ratio = shape_list[batch_idx]
+            # The dw, dh for _boxes_from_bitmap should be the original image dimensions before DetResizeForTest
+            # shape_list contains [src_h, src_w, ratio_h, ratio_w]
+            # So dw = src_w, dh = src_h
+            dw_orig, dh_orig = sw_orig, sh_orig
+
             current_pred_map = pred[batch_idx]
             current_seg_map = seg[batch_idx]
 
             mask = cv2.dilate(np.array(current_seg_map).astype(np.uint8), self.dila_k) if self.dila_k is not None else current_seg_map
-            print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc {…
+            print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
 
             if self.box_t == 'poly':
-                boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, …
+                boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dw_orig, dh_orig)
             elif self.box_t == 'quad':
-                boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, …
+                boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dw_orig, dh_orig) # Pass original dimensions
             else:
                 raise ValueError("box_type must be 'quad' or 'poly'")
-            print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Found {len(boxes)} boxes from bitmap processing …
+            print(f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Found {len(boxes)} boxes from bitmap processing.")
             boxes_batch.append({'points': boxes})
         return boxes_batch
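The `shape_list` handed to `__call__` follows the convention stated in the comments above: each row is `[src_h, src_w, ratio_h, ratio_w]`, and boxes are mapped from bitmap coordinates back to source-page pixels by dividing by the bitmap size and multiplying by the source size. A small illustrative sketch with synthetic numbers (no model involved):

```python
import numpy as np

# One batch entry: the source page was 1000x800 (h x w) and was resized to a
# 960x736 probability map for the detector.
shape_list = np.array([[1000.0, 800.0, 960 / 1000, 736 / 800]])  # [src_h, src_w, ratio_h, ratio_w]

src_h, src_w, ratio_h, ratio_w = shape_list[0]
dw, dh = src_w, src_h            # original dims, as _boxes_from_bitmap expects

map_h, map_w = 960, 736          # h, w = bmp.shape inside the post-processor

# A quad found on the probability map, rescaled exactly the way the method does it:
box_on_map = np.array([[100, 200], [300, 200], [300, 240], [100, 240]], dtype=np.float64)
box_on_page = box_on_map.copy()
box_on_page[:, 0] = np.clip(np.round(box_on_map[:, 0] / map_w * dw), 0, dw)
box_on_page[:, 1] = np.clip(np.round(box_on_map[:, 1] / map_h * dh), 0, dh)
print(box_on_page.astype("int32"))   # x scaled by 800/736, y scaled by 1000/960
```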
@@ -887,49 +891,71 @@ class _MDR_TextDetector(_MDR_PredictBase):
             new_boxes.append(box)
         return np.array(new_boxes)
 
-
+    # In class _MDR_TextDetector:
     def __call__(self, img):
         ori_im = img.copy()
         data = {"image": img}
-        print(f" DEBUG OCR: _MDR_TextDetector: Original image shape: {ori_im.shape}")
-        …
+        print(f" DEBUG OCR: _MDR_TextDetector: Original image shape: {ori_im.shape}")
+
+        # Preprocessing
+        try:
+            data = mdr_ocr_transform(data, self.pre_op)
+        except Exception as e_preproc:
+            print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
+            import traceback
+            traceback.print_exc()
+            return np.array([]) # Return empty array on failure
+
         if data is None:
-            print(" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
-            return
+            print(" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
+            return np.array([])
 
-        processed_img, shape_list = data
+        processed_img, shape_list = data # shape_list is [src_h, src_w, ratio_h, ratio_w]
         if processed_img is None:
-            print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
-            return
-        print(f" DEBUG OCR: _MDR_TextDetector: Processed image shape for ONNX: {processed_img.shape}, shape_list: {shape_list}")
+            print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
+            return np.array([])
+        print(f" DEBUG OCR: _MDR_TextDetector: Processed image shape for ONNX: {processed_img.shape}, shape_list: {shape_list}")
 
         img_for_onnx = np.expand_dims(processed_img, axis=0)
         shape_list_for_onnx = np.expand_dims(shape_list, axis=0)
-        img_for_onnx = img_for_onnx.copy()
+        img_for_onnx = img_for_onnx.copy()
 
         inputs = self.get_input_feed(self.input_name, img_for_onnx)
-        print(f" DEBUG OCR: _MDR_TextDetector: Running ONNX inference for text detection...")
+        print(f" DEBUG OCR: _MDR_TextDetector: Running ONNX inference for text detection...")
         try:
            outputs = self.sess.run(self.output_name, input_feed=inputs)
-        except Exception as …
-            print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {…
+        except Exception as e_infer:
+            print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
            import traceback
            traceback.print_exc()
-            return
-        print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
+            return np.array([]) # Return empty array on failure
+        print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
 
         preds = {"maps": outputs[0]}
-        …
+        try:
+            post_res = self.post_op(preds, shape_list_for_onnx)
+        except Exception as e_postproc:
+            print(f" DEBUG OCR: _MDR_TextDetector: Error during DBPostProcess: {e_postproc}")
+            import traceback
+            traceback.print_exc()
+            return np.array([])
+
+        if not post_res or not post_res[0].get('points'):
+            print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no points.")
+            return np.array([])
+
         boxes_from_post = post_res[0]['points']
-        print(f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
+        print(f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
+
+        if not isinstance(boxes_from_post, (list, np.ndarray)) or len(boxes_from_post) == 0: # Check if it's empty or not list-like
+            print(" DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter.")
+            return np.array([])
 
         if self.args.det_box_type == 'poly':
             final_boxes = self._filter_poly(boxes_from_post, ori_im.shape)
         else: # 'quad'
             final_boxes = self._filter_quad(boxes_from_post, ori_im.shape)
-        print(f" DEBUG OCR: _MDR_TextDetector: Boxes after final poly/quad filtering: {len(final_boxes)}")
+        print(f" DEBUG OCR: _MDR_TextDetector: Boxes after final poly/quad filtering: {len(final_boxes)}")
         return final_boxes
 
 class _MDR_ClsPostProcess:
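A caveat on the `np.array([])` sentinel used in the failure paths above: a NumPy array with more than one element cannot be truth-tested, so downstream callers should check emptiness with `len(...)` or `.size` rather than a bare `if boxes:`. A quick illustration:

```python
import numpy as np

boxes = np.array([])            # detector found nothing
print(len(boxes) == 0)          # True  -> safe emptiness check
print(boxes.size == 0)          # True  -> also safe

quads = np.zeros((2, 4, 2))     # two quads
try:
    if quads:                   # ambiguous: more than one element -> ValueError
        pass
except ValueError as e:
    print(f"bare truth test fails: {e}")
```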
@@ -1090,56 +1116,64 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
         self.input_name = self.get_input_name(self.sess)
         self.output_name = self.get_output_name(self.sess)
 
-
+    # In class _MDR_TextRecognizer
     def _resize_norm(self, img, max_r): # img is a single crop
         imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
         h_orig, w_orig = img.shape[:2]
+        # ADDED: Log input crop shape
         print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
 
         if h_orig == 0 or w_orig == 0:
-            print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
+            print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
             return np.zeros((imgC, imgH, imgW), dtype=np.float32)
 
         r_current = w_orig / float(h_orig)
-        # tw is target width, calculated to maintain aspect ratio up to imgW, using max of current ratio and batch max ratio
         tw = min(imgW, int(ceil(imgH * r_current)))
-        tw = max(1, tw)
+        tw = max(1, tw)
         print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
 
         try:
-            resized = cv2.resize(img, (tw, imgH))
-        except …
-            print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH})")
-            # Fallback: return zeros or try to pad original without resize if resize fails
+            resized = cv2.resize(img, (tw, imgH))
+        except cv2.error as e_resize: # Catch specific cv2 error
+            print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        except Exception as e_resize_general: # Catch any other unexpected error
+            print(f" DEBUG RECOGNIZER: _resize_norm general error during resize: {e_resize_general}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
+            import traceback
+            traceback.print_exc()
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
 
         resized = resized.astype("float32")
-
-        # (This part seems standard, but worth checking if the image becomes all black/white after this)
-        if imgC == 1 and len(resized.shape) == 3: # if model expects grayscale but crop is color
+        if imgC == 1 and len(resized.shape) == 3:
             resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
-
-            resized = resized[:, :, np.newaxis]
+        if len(resized.shape) == 2:
+            resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
 
-
-
+        # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
+        if imgC == 3 and resized.shape[2] == 1:
+            resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
+
+        resized = resized.transpose((2, 0, 1)) / 255.0
+        resized -= 0.5
+        resized /= 0.5
 
         padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-        padding[:, :, 0:tw] = resized
+        padding[:, :, 0:tw] = resized
         print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
-
+
+        # ADDED: Log normalized crop properties
+        min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
         print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
-              f"dtype: {padding.dtype}, "
-              f"MinPx: {…
-              f"MaxPx: {…
-              f"MeanPx: {…
+              f"dtype: {padding.dtype}, "
+              f"MinPx: {min_px:.4f}, "
+              f"MaxPx: {max_px:.4f}, "
+              f"MeanPx: {mean_px:.4f}")
         if np.all(padding == 0):
             print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
-        elif np.…
-            print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {…
-        # ---- END LOGGING NORMALIZED CROP PROPERTIES ----
+        elif np.abs(max_px - min_px) < 1e-6 : # Check if all elements are (close to) the same
+            print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
         return padding
 
     def __call__(self, img_list):
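The normalize-and-pad contract in `_resize_norm` (resize to the model height preserving aspect ratio, scale to [0, 1], shift to [-1, 1], zero-pad to full width) is easy to sanity-check in isolation. A minimal sketch assuming a CRNN-style input shape of (3, 48, 320); the helper name is illustrative:

```python
from math import ceil

import cv2
import numpy as np

def resize_norm_for_rec(crop: np.ndarray, shape=(3, 48, 320)) -> np.ndarray:
    img_c, img_h, img_w = shape
    h, w = crop.shape[:2]
    tw = max(1, min(img_w, int(ceil(img_h * w / float(h)))))  # keep aspect ratio
    resized = cv2.resize(crop, (tw, img_h)).astype("float32")
    chw = resized.transpose((2, 0, 1)) / 255.0   # HWC -> CHW, [0, 1]
    chw = (chw - 0.5) / 0.5                      # now in [-1, 1]
    padded = np.zeros((img_c, img_h, img_w), dtype=np.float32)
    padded[:, :, :tw] = chw                      # zero-pad to full model width
    return padded

crop = (np.random.rand(32, 200, 3) * 255).astype(np.uint8)
out = resize_norm_for_rec(crop)
assert out.shape == (3, 48, 320)
assert -1.0 <= out.min() and out.max() <= 1.0
print(out.shape, out.min(), out.max())
```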
@@ -1193,133 +1227,65 @@ class _MDR_TextSystem:
         try: return list(sorted(boxes, key=key))
         except: return list(boxes) # Fallback
 
-        …
-            print(" DEBUG OCR SYS: Detector returned no boxes. Returning empty fragments.") # DEBUG
-            return [], [] # This is what currently leads to "0 fragments found" if detector fails
-
-        …
-                if crop_img is None:
-                    print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} is None.") # DEBUG
-                    crops.append(None)
-                elif crop_img.shape[0] == 0 or crop_img.shape[1] == 0:
-                    print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} has zero dimension: {crop_img.shape}") # DEBUG
-                    crops.append(None)
-                else:
-                    crops.append(crop_img)
-                    # Optionally save these crops for manual inspection:
-                    # if self.save_crop: cv2.imwrite(os.path.join(self.crop_dir, f"debug_crop_before_cls_{self.crop_idx + i}.png"), crop_img)
-            except Exception as e_crop:
-                print(f" DEBUG OCR SYS: Error cropping box {i+1}/{len(boxes)}: {e_crop}") # DEBUG
-                crops.append(None)
-
-        valid_idxs = [i for i, c in enumerate(crops) if c is not None and c.shape[0] > 0 and c.shape[1] > 0]
-        if not valid_idxs:
-            print(" DEBUG OCR SYS: No valid crops obtained after attempting to crop detected boxes. Returning empty fragments.") # DEBUG
-            return [], []
-
-        # Filter crops and corresponding boxes
-        valid_crops = [crops[i] for i in valid_idxs]
-        boxes_for_valid_crops = [boxes[i] for i in valid_idxs]
-        print(f" DEBUG OCR SYS: Number of valid crops to process: {len(valid_crops)}") # DEBUG
-
-        # ---- START LOGGING CROP PROPERTIES ----
-        if valid_crops:
-            print(" DEBUG OCR SYS: Logging properties of first few valid crops (and Box 21 if present):")
-            indices_to_log = list(range(min(3, len(valid_crops)))) # Log first 3
-            # Try to find original index of Box 21 if we can map it back, this is a bit tricky here
-            # For simplicity, let's just log the first few. If Box 21 was among them, we'd see it.
-
-            for i_log_idx, crop_idx in enumerate(indices_to_log):
-                crop_image_np = valid_crops[crop_idx]
-                if crop_image_np is not None and crop_image_np.size > 0:
-                    print(f" Crop for Recognizer (Index {crop_idx}): "
-                          f"Shape: {crop_image_np.shape}, "
-                          f"dtype: {crop_image_np.dtype}, "
-                          f"MinPx: {np.min(crop_image_np)}, "
-                          f"MaxPx: {np.max(crop_image_np)}, "
-                          f"MeanPx: {np.mean(crop_image_np):.2f}")
-                else:
-                    print(f" Crop for Recognizer (Index {crop_idx}): Is None or empty.")
-        # ---- END LOGGING CROP PROPERTIES ----
-
-        if self.use_cls and self.classifier and classify:
-            print(f" DEBUG OCR SYS: Applying classifier to {len(valid_crops)} crops...") # DEBUG
-            try:
-                # The classifier might modify valid_crops in-place (e.g., rotate them)
-                classified_crops, cls_results = self.classifier(valid_crops) # classifier returns list, results
-                print(f" DEBUG OCR SYS: Classifier results count: {len(cls_results)}. First few: {cls_results[:3]}") # DEBUG
-                valid_crops = classified_crops # Update with potentially rotated crops
-            except Exception as e_cls:
-                print(f" DEBUG OCR SYS: Classifier error: {e_cls}. Using unclassified crops.") # DEBUG
-                # Continue with unclassified (but valid) crops
-
-        print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
-        # ---- START TEMP CODE TO SAVE CROPS ----
-        save_crop_path_dir = Path("/tmp/temp_recognizer_crops") # Use /tmp
-        save_crop_path_dir.mkdir(parents=True, exist_ok=True)
-        for i_crop, crop_image_np in enumerate(valid_crops):
-            try:
-                # Ensure crop_image_np is a valid image array (e.g., uint8)
-                if crop_image_np is not None and crop_image_np.size > 0:
-                    # OpenCV expects BGR if color, or grayscale
-                    cv2.imwrite(str(save_crop_path_dir / f"crop_to_recognize_{self.crop_idx + i_crop}.png"), crop_image_np)
-                else:
-                    print(f" DEBUG OCR SYS: Crop {i_crop} is None or empty, not saving.")
-            except Exception as e_save:
-                print(f" DEBUG OCR SYS: Failed to save crop {i_crop}: {e_save}")
-        print(f" DEBUG OCR SYS: Saved {len(valid_crops)} crops for recognizer to {save_crop_path_dir}")
-        # ---- END TEMP CODE TO SAVE CROPS ----
-        try:
-            …
-            # Handle this gracefully, perhaps by taking the minimum length
-            min_len = min(len(boxes_for_valid_crops), len(rec_res))
-            boxes_to_iterate = boxes_for_valid_crops[:min_len]
-            rec_res_to_iterate = rec_res[:min_len]
-        else:
-            boxes_to_iterate = boxes_for_valid_crops
-            rec_res_to_iterate = rec_res
-
-        print(f" DEBUG OCR SYS: Filtering {len(rec_res_to_iterate)} recognition results with drop_score: {self.drop_score}") # DEBUG
-        for i, (box, res_tuple) in enumerate(zip(boxes_to_iterate, rec_res_to_iterate)):
-            txt, score = res_tuple
-            print(f" DEBUG OCR SYS: Box {i+1} - Recognized: '{txt}', Score: {score:.4f}") # DEBUG
-            if score >= self.drop_score and txt and not mdr_is_whitespace(txt): # Added check for non-empty/whitespace
-                final_boxes.append(box)
-                final_rec_tuples.append(res_tuple)
-            else:
-                reason = []
-                if score < self.drop_score: reason.append(f"score {score:.2f} < {self.drop_score}")
-                if not txt: reason.append("empty text")
-                if txt and mdr_is_whitespace(txt): reason.append("whitespace text")
-                print(f" DEBUG OCR SYS: Box {i+1} DROPPED. Reason(s): {', '.join(reason)}") # DEBUG
-
-        …
+    # In class _MDR_TextRecognizer
+    def _resize_norm(self, img, max_r): # img is a single crop
+        imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
+        h_orig, w_orig = img.shape[:2]
+        # ADDED: Log input crop shape
+        print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
+
+        if h_orig == 0 or w_orig == 0:
+            print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}). Returning zeros.")
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+
+        r_current = w_orig / float(h_orig)
+        tw = min(imgW, int(ceil(imgH * r_current)))
+        tw = max(1, tw)
+        print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
+
+        try:
+            resized = cv2.resize(img, (tw, imgH))
+        except cv2.error as e_resize: # Catch specific cv2 error
+            print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        except Exception as e_resize_general: # Catch any other unexpected error
+            print(f" DEBUG RECOGNIZER: _resize_norm general error during resize: {e_resize_general}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH}). Returning zeros.")
+            import traceback
+            traceback.print_exc()
+            return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+
+        resized = resized.astype("float32")
+        if imgC == 1 and len(resized.shape) == 3:
+            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+        if len(resized.shape) == 2:
+            resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
+
+        # Ensure resized has 3 channels if imgC is 3, even if input was grayscale
+        if imgC == 3 and resized.shape[2] == 1:
+            resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
+
+        resized = resized.transpose((2, 0, 1)) / 255.0
+        resized -= 0.5
+        resized /= 0.5
+
+        padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
+        padding[:, :, 0:tw] = resized
+        print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
+
+        # ADDED: Log normalized crop properties
+        min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
+        print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
+              f"dtype: {padding.dtype}, "
+              f"MinPx: {min_px:.4f}, "
+              f"MaxPx: {max_px:.4f}, "
+              f"MeanPx: {mean_px:.4f}")
+        if np.all(padding == 0):
+            print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
+        elif np.abs(max_px - min_px) < 1e-6 : # Check if all elements are (close to) the same
+            print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
+        return padding
 
     def _save_crops(self, crops, recs):
         mdr_ensure_directory(self.crop_dir)
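The recognition-filtering logic removed above reduces to: keep a (box, (text, score)) pair only when the score clears `drop_score` and the text is neither empty nor pure whitespace. A compact sketch of that policy; the `drop_score` default and the `str.isspace` check stand in for `self.drop_score` and `mdr_is_whitespace`:

```python
def filter_recognitions(boxes, rec_results, drop_score=0.5):
    """Keep (box, (text, score)) pairs that clear the score threshold and
    carry non-whitespace text; zip() also guards against length mismatch."""
    kept_boxes, kept_recs = [], []
    for box, (text, score) in zip(boxes, rec_results):
        if score >= drop_score and text and not text.isspace():
            kept_boxes.append(box)
            kept_recs.append((text, score))
    return kept_boxes, kept_recs

boxes = ["b1", "b2", "b3"]
recs = [("Hello", 0.93), ("   ", 0.88), ("x", 0.21)]
print(filter_recognitions(boxes, recs))   # (['b1'], [('Hello', 0.93)])
```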
@@ -1645,35 +1611,34 @@ class MDROcrEngine:
         except Exception as e: print(f"ERROR initializing MDR OCR System: {e}"); self._text_system = None
         return self._text_system
 
+    # In class MDROcrEngine:
     def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
         """Finds and recognizes text fragments in a NumPy image (BGR)."""
         system = self._get_system()
         if system is None:
-            print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.")
-            return
+            print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.")
+            return
 
-        img_for_system = self._preprocess(image_np)
-        print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}")
+        img_for_system = self._preprocess(image_np)
+        print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}")
 
         try:
-
-            boxes, recs = system(img_for_system) # recs should be list of (text, score)
+            boxes, recs = system(img_for_system)
         except Exception as e:
-            print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}")
+            print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}")
             import traceback
             traceback.print_exc()
-            return
+            return
 
         if not boxes or not recs:
-            print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes)}) or no recs ({len(recs)}). No fragments generated.")
-            return
+            print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes) if boxes is not None else 'None'}) or no recs ({len(recs) if recs is not None else 'None'}). No fragments generated.")
+            return
 
         if len(boxes) != len(recs):
-            print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic.")
-            # Potentially try to recover by taking the minimum length, or just return
+            print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic. No fragments generated.")
             return
 
-        print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.")
+        print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.")
         fragments_generated_count = 0
         for i, (box_pts, rec_tuple) in enumerate(zip(boxes, recs)):
             if not isinstance(rec_tuple, (list, tuple)) or len(rec_tuple) != 2:
@@ -1681,25 +1646,26 @@ class MDROcrEngine:
                 continue
 
             txt, conf = rec_tuple
-            …
-            # For MDROcrFragment, we just need valid text and geometry.
-            if not txt or mdr_is_whitespace(txt): # Basic check, though system should filter
-                # print(f" DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.")
+            if not txt or mdr_is_whitespace(txt):
+                # print(f" DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.") # Already logged in TextSystem
                 continue
 
-            …
+            try:
+                pts = [(float(p[0]), float(p[1])) for p in box_pts]
+                if len(pts) == 4:
+                    r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
+                    if r.is_valid and r.area > 1:
+                        yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
+                        fragments_generated_count += 1
+                    # else:
+                    #     print(f" DEBUG OCR Engine: Fragment {i} has invalid/small rectangle. Area: {r.area:.2f}. Valid: {r.is_valid}. Skipping.")
             # else:
-            #     print(f" DEBUG OCR Engine: Fragment {i} …
-            …
+            #     print(f" DEBUG OCR Engine: Fragment {i} box_pts not length 4: {len(pts)}. Skipping.")
+            except Exception as e_frag:
+                print(f" DEBUG OCR Engine: Error creating MDROcrFragment for item {i}: {e_frag}")
+                continue
 
-        print(f" DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.")
+        print(f" DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.")
 
     def _preprocess(self, img: np.ndarray) -> np.ndarray:
         if len(img.shape) == 3 and img.shape[2] == 4:
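`MDRRectangle.is_valid` and `.area` are project-internal, but the geometric gate they implement above — a four-point quad with meaningful, non-degenerate area — can be approximated with the shoelace formula. A hedged sketch, not the project's actual implementation:

```python
def quad_area(pts):
    """Shoelace formula for a polygon given as [(x, y), ...]."""
    n = len(pts)
    s = 0.0
    for i in range(n):
        x0, y0 = pts[i]
        x1, y1 = pts[(i + 1) % n]
        s += x0 * y1 - x1 * y0
    return abs(s) / 2.0

pts = [(0, 0), (10, 0), (10, 4), (0, 4)]
assert quad_area(pts) == 40.0
degenerate = [(0, 0), (5, 0), (10, 0), (2, 0)]   # collinear points
assert quad_area(degenerate) == 0.0              # would fail an `area > 1` gate
```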
@@ -1729,47 +1695,101 @@ def mdr_prepare_reader_inputs(inputs: Dict[str, torch.Tensor], model: LayoutLMv3
     return {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
 
 def mdr_parse_reader_logits(logits: torch.Tensor, length: int) -> List[int]:
-    print(f"mdr_parse_reader_logits: Called with logits shape: {logits.shape}, length: {length}")
+    print(f"mdr_parse_reader_logits: Called with logits shape: {logits.shape}, length: {length}")
     if length == 0:
-        print("mdr_parse_reader_logits: length is 0, returning empty list.")
+        print("mdr_parse_reader_logits: length is 0, returning empty list.")
         return []
 
-
-    print(f"mdr_parse_reader_logits: Attempting to slice logits with [1 : {length + 1}, :{length}]") # ADDED
+    print(f"mdr_parse_reader_logits: Attempting to slice logits with [1 : {length + 1}, :{length}]")
     try:
         rel_logits = logits[1 : length + 1, :length]
-        print(f"mdr_parse_reader_logits: rel_logits shape: {rel_logits.shape}")
+        print(f"mdr_parse_reader_logits: rel_logits shape: {rel_logits.shape}")
     except IndexError as e:
-        print(f"mdr_parse_reader_logits: IndexError during rel_logits slicing! Error: {e}")
+        print(f"mdr_parse_reader_logits: IndexError during rel_logits slicing! Error: {e}")
         import traceback
         traceback.print_exc()
-        …
+        # Depending on desired behavior, either raise or return empty/fallback
+        return list(range(length)) # Fallback to sequential order if slicing fails
 
     orders = rel_logits.argmax(dim=1).tolist()
-    print(f"mdr_parse_reader_logits: Initial orders calculated. Count: {len(orders)}")
+    print(f"mdr_parse_reader_logits: Initial orders calculated. Count: {len(orders)}")
 
-    loop_count = 0 # ADDED to detect potential infinite loops
-    max_loops = length * length # A generous upper bound for loop iterations; adjust if needed
+    # ADDED: Loop safeguard
+    loop_count = 0
+    # Max loops: if N items, N^2 is a generous limit for pairwise comparisons/adjustments.
+    # For N=33, N^2 = 1089. For N=21, N^2 = 441. This matches the logs.
+    # A tighter bound might be N * (N-1) / 2 or N * some_factor.
+    # Let's use N * N as seen in logs, or a fixed large number if N is small.
+    max_loops = max(50, length * length) # Ensure at least 50 loops for small N
     while True:
         loop_count += 1
         if loop_count > max_loops:
-            print(f"mdr_parse_reader_logits: Exceeded max_loops ({max_loops}), breaking while loop to prevent infinite loop.")
-            break
+            print(f"mdr_parse_reader_logits: Exceeded max_loops ({max_loops}), breaking while loop to prevent infinite loop.")
+            break
 
-        print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}")
+        # print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}") # Can be too verbose
         conflicts = defaultdict(list)
         [conflicts[order].append(idx) for idx, order in enumerate(orders)]
-        …
+
+        # Filter to find actual conflicting orders (where multiple original indices map to the same target order)
+        conflicting_orders_map = {o: idxs for o, idxs in conflicts.items() if len(idxs) > 1}
 
-        if not …
-            print("mdr_parse_reader_logits: No conflicting orders, breaking while loop.")
+        if not conflicting_orders_map:
+            # print("mdr_parse_reader_logits: No conflicting orders, breaking while loop.") # Verbose
             break
-
-        …
+
+        # Log only if there are actual conflicts to resolve
+        if loop_count == 1 or loop_count % 10 == 0 : # Log first and every 10th iteration with conflicts
+            print(f"mdr_parse_reader_logits: While loop iteration: {loop_count}. Found {len(conflicting_orders_map)} conflicting orders.")
+
+        for order_val, c_idxs in conflicting_orders_map.items():
+            # This logic seems to pick the one with the highest score for that conflicting order.
+            # It might need more sophisticated tie-breaking if scores are identical or very close.
+            # The original logic was:
+            # best_idx = -1; max_score = -float('inf')
+            # for c_idx in c_idxs:
+            #     score = rel_logits[c_idx, order_val].item()
+            #     if score > max_score: max_score = score; best_idx = c_idx
+            # for c_idx in c_idxs:
+            #     if c_idx != best_idx: orders[c_idx] = -1 # Mark for re-evaluation or different assignment
+
+            # Simpler approach: keep the first one, mark others to be reassigned.
+            # This might not be optimal but could break cycles.
+            # A more robust solution might involve graph-based cycle detection or a different assignment strategy.
+            # For now, let's stick to a slight modification of the implied original logic:
+            # The one with the highest confidence for *that specific order_val* keeps it.
+            # Others get their order reset to their own index (diagonal) or -1 to be re-evaluated.
+
+            if not c_idxs: continue
+
+            best_c_idx_for_this_order = -1
+            max_confidence_for_this_order = -float('inf')
+
+            for current_c_idx in c_idxs:
+                confidence = rel_logits[current_c_idx, order_val].item()
+                if confidence > max_confidence_for_this_order:
+                    max_confidence_for_this_order = confidence
+                    best_c_idx_for_this_order = current_c_idx
+
+            # Now, for all conflicting indices for this 'order_val',
+            # if they are not the 'best_c_idx_for_this_order',
+            # they need a new order. A simple strategy is to make them point to themselves initially.
+            # Or, find their next best alternative.
+            for current_c_idx in c_idxs:
+                if current_c_idx != best_c_idx_for_this_order:
+                    # Option 1: Reset to self (might not resolve complex cycles)
+                    # orders[current_c_idx] = current_c_idx
+
+                    # Option 2: Find next best order for this current_c_idx, excluding the conflicting 'order_val'
+                    # Create a temporary copy of its logits row, set the conflicting order's logit to -inf
+                    temp_logits_row = rel_logits[current_c_idx, :].clone()
+                    temp_logits_row[order_val] = -float('inf')
+                    orders[current_c_idx] = temp_logits_row.argmax().item()
+
+    print(f"mdr_parse_reader_logits: While loop finished after {loop_count} iterations. Returning {len(orders)} orders.")
     return orders
 
 # --- MDR Layout Reading Engine ---
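The conflict-resolution loop added above can be traced on a toy tensor: two tokens both argmax to the same reading-order slot, the higher logit keeps it, and the loser is reassigned to its next-best slot. A standalone sketch of that mechanism:

```python
from collections import defaultdict

import torch

rel_logits = torch.tensor([[0.9, 0.1, 0.0],
                           [0.8, 0.7, 0.2],
                           [0.1, 0.2, 0.6]])
orders = rel_logits.argmax(dim=1).tolist()       # [0, 0, 2] -> slot 0 is contested

while True:
    conflicts = defaultdict(list)
    for idx, order in enumerate(orders):
        conflicts[order].append(idx)
    contested = {o: idxs for o, idxs in conflicts.items() if len(idxs) > 1}
    if not contested:
        break
    for order_val, idxs in contested.items():
        winner = max(idxs, key=lambda i: rel_logits[i, order_val].item())
        for i in idxs:
            if i != winner:
                row = rel_logits[i, :].clone()
                row[order_val] = -float("inf")   # forbid the contested slot
                orders[i] = row.argmax().item()  # take the next-best slot

print(orders)   # [0, 1, 2]
```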
@@ -1790,45 +1810,30 @@ class MDRLayoutReader:
             self._device = "cpu"
             print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
 
+    # In class MDRLayoutReader:
     def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
         if self._model is None:
-            cache = mdr_ensure_directory(self._model_path)
+            cache = mdr_ensure_directory(self._model_path) # This should be self._model_path / "layoutreader"
+            # Correct cache path for transformers
+            layoutreader_cache_dir = Path(self._model_dir) / "layoutreader" # Assuming _model_dir is the main one
+            mdr_ensure_directory(str(layoutreader_cache_dir))
+
             name = "microsoft/layoutlmv3-base"
-            …
-            # and compatible. For now, let's assume microsoft/layoutlmv3-base is the target
-            # if a more specific one isn't found or intended.
-            # The original code had a slightly confusing h_path logic.
-            # Let's simplify to prioritize a local cache of "microsoft/layoutlmv3-base"
-            # or a specific model if `self._model_path` points to a complete model directory.
-
-            model_load_path = name # Default to Hugging Face model name
-            local_files_only_flag = False
-
-            # Check if self._model_path is a directory containing a full model
-            # (e.g., config.json, pytorch_model.bin)
-            # This part of the original logic for 'h_path' was a bit specific.
-            # For LayoutLMv3, usually, you'd just use "microsoft/layoutlmv3-base"
-            # and let transformers handle caching, or provide a path to a fully saved model.
-
-            # Let's assume the primary goal is to load "microsoft/layoutlmv3-base"
-            # and allow it to be cached in `self._model_path/layoutreader`
-            # The `cache_dir` argument to `from_pretrained` handles this.
-
-            print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{model_load_path}'. Cache dir: {cache}")
+
+            print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
             try:
                 self._model = LayoutLMv3ForTokenClassification.from_pretrained(
-                    …
-                    cache_dir=…
-                    local_files_only=…
-                    num_labels=_MDR_MAX_LEN+1
+                    name, # Use the HF model name
+                    cache_dir=str(layoutreader_cache_dir),
+                    local_files_only=False, # Allow download on first run
+                    num_labels=_MDR_MAX_LEN+1
                 )
                 # Explicitly move model to the determined device
-                self._model.to(torch.device(self._device)) #
+                self._model.to(torch.device(self._device)) # ENSURE THIS LINE IS PRESENT AND CORRECT
                 self._model.eval()
-                print(f"MDR LayoutReader model '{…
+                print(f"MDR LayoutReader model '{name}' loaded successfully on device: {self._model.device}.")
             except Exception as e:
-                print(f"ERROR loading MDR LayoutReader model '{…
+                print(f"ERROR loading MDR LayoutReader model '{name}': {e}")
                 import traceback
                 traceback.print_exc()
                 self._model = None
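The loading pattern above is the standard `transformers` one: pass the hub id plus a `cache_dir`, allow downloads on first run, then move the model to the chosen device. A minimal sketch with a hypothetical cache path; the `num_labels` value is illustrative since `_MDR_MAX_LEN` is defined elsewhere in the module, and `transformers` will warn that the new classification head is randomly initialized:

```python
from pathlib import Path

import torch
from transformers import LayoutLMv3ForTokenClassification

cache_dir = Path("/tmp/models/layoutreader")     # hypothetical cache location
cache_dir.mkdir(parents=True, exist_ok=True)

model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    cache_dir=str(cache_dir),
    local_files_only=False,   # allow download on first run
    num_labels=510 + 1,       # illustrative; the diff uses _MDR_MAX_LEN + 1
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(model.device)
```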
@@ -1836,68 +1841,47 @@ class MDRLayoutReader:
 
     def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
         w, h = size
-        if w <= 0 or h <= 0 …
-            print("MDRLayoutReader: Invalid size or …
+        if w <= 0 or h <= 0: # ADDED check for invalid size
+            print("MDRLayoutReader: Invalid image size (w or h <= 0), returning layouts as is.")
             return layouts
+        if not layouts:
+            print("MDRLayoutReader: No layouts to process, returning empty list.")
+            return [] # Return empty list if no layouts
 
         model = self._get_model()
-        …
-            print("MDRLayoutReader: Model is None, using fallback geometric sort.")
-            layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
-            nfo = 0
-            for l in layouts:
-                l.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
-                [setattr(f, 'order', i + nfo) for i, f in enumerate(l.fragments)]
-                nfo += len(l.fragments)
-            return layouts
-
-        print("MDRLayoutReader: Preparing bboxes...") # ADDED
+        # ... (rest of the method, add logging as needed) ...
+        print("MDRLayoutReader: Preparing bboxes...")
         bbox_list = self._prepare_bboxes(layouts, w, h)
-        print(f"MDRLayoutReader: Prepared {len(bbox_list) if bbox_list else 'None or 0'} bboxes.")
 
-        if bbox_list is None or len(bbox_list) == 0:
-            print("MDRLayoutReader: No bboxes …
+        if bbox_list is None or len(bbox_list) == 0: # Check if bbox_list is None or empty
+            print("MDRLayoutReader: No bboxes prepared from layouts, returning layouts as is (possibly sorted geometrically).")
+            # Fallback geometric sort if no bboxes could be prepared
+            layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
             return layouts
-
-        xs = l_size / float(w)
-        ys = l_size / float(h)
-        scaled_bboxes = []
-        for bbox in bbox_list:
-            x0, y0, x1, y1 = bbox.value
-            sx0 = max(0, min(l_size - 1, round(x0 * xs)))
-            sy0 = max(0, min(l_size - 1, round(y0 * ys)))
-            sx1 = max(0, min(l_size - 1, round(x1 * xs)))
-            sy1 = max(0, min(l_size - 1, round(y1 * ys)))
-            scaled_bboxes.append([min(sx0, sx1), min(sy0, sy1), max(sx0, sx1), max(sy0, sy1)])
-        print("MDRLayoutReader: Scaled bboxes prepared. Count: ", len(scaled_bboxes))
-        orders = []
+        print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
+        # ... (rest of the scaling and inference logic) ...
         try:
             with torch.no_grad():
-                print("MDRLayoutReader: Creating reader inputs...")
-                inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
-                print("MDRLayoutReader: Preparing inputs for model device...")
+                print("MDRLayoutReader: Creating reader inputs...")
+                inputs = mdr_boxes_to_reader_inputs(scaled_bboxes) # scaled_bboxes comes from the loop above
+                print("MDRLayoutReader: Preparing inputs for model device...")
                 inputs = mdr_prepare_reader_inputs(inputs, model)
-                print("MDRLayoutReader: Running model inference...")
+                print("MDRLayoutReader: Running model inference...")
                 logits = model(**inputs).logits.cpu().squeeze(0)
-                print("MDRLayoutReader: Model inference complete. Parsing logits...")
+                print("MDRLayoutReader: Model inference complete. Parsing logits...")
                 orders = mdr_parse_reader_logits(logits, len(bbox_list))
-                print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
+                print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
         except Exception as e:
             print(f"MDR LayoutReader prediction error: {e}")
             import traceback
-            traceback.print_exc()
-            …
-            return layouts # Fallback
-        for i, order_idx in enumerate(orders):
-            bbox_list[i].order = order_idx
+            traceback.print_exc()
+            # Fallback geometric sort on error
+            layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
+            return layouts
+        # ... (rest of applying order) ...
         print("MDRLayoutReader: Applying order...")
-        result_layouts = self._apply_order(layouts, bbox_list)
-        print("MDRLayoutReader: Order applied. Returning layouts.")
+        result_layouts = self._apply_order(layouts, bbox_list) # Ensure bbox_list has 'order' attribute set
+        print("MDRLayoutReader: Order applied. Returning layouts.")
         return result_layouts
 
     def _prepare_bboxes(self, layouts: list[MDRLayoutElement], w: int, h: int) -> list[_MDR_ReaderBBox] | None: