Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +205 -99
mdr_pdf_parser.py
CHANGED
@@ -946,10 +946,9 @@ class _MDR_DBPostProcess:
|
|
946 |
f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
|
947 |
|
948 |
if self.box_t == 'poly':
|
949 |
-
boxes, scores = self._polygons_from_bitmap(current_pred_map, mask,
|
950 |
elif self.box_t == 'quad':
|
951 |
-
boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dw_orig
|
952 |
-
dh_orig) # Pass original dimensions
|
953 |
else:
|
954 |
raise ValueError("box_type must be 'quad' or 'poly'")
|
955 |
print(
|
@@ -1299,17 +1298,37 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
|
|
1299 |
print(
|
1300 |
f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
|
1301 |
|
1302 |
-
|
|
|
|
|
1303 |
print(
|
1304 |
-
f" DEBUG RECOGNIZER: _resize_norm received
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1305 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1306 |
|
1307 |
-
r_current = w_orig / float(h_orig)
|
1308 |
tw = min(imgW, int(ceil(imgH * r_current)))
|
1309 |
-
tw = max(1, tw)
|
|
|
|
|
1310 |
print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
|
1311 |
|
1312 |
try:
|
|
|
|
|
|
|
|
|
|
|
1313 |
resized = cv2.resize(img, (tw, imgH))
|
1314 |
except cv2.error as e_resize: # Catch specific cv2 error
|
1315 |
print(
|
@@ -1322,25 +1341,36 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
|
|
1322 |
traceback.print_exc()
|
1323 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1324 |
|
|
|
1325 |
resized = resized.astype("float32")
|
1326 |
-
if imgC == 1 and len(resized.shape) == 3:
|
1327 |
-
resized
|
1328 |
-
|
1329 |
-
resized = resized[:, :, np.newaxis] # Add channel dim if grayscale
|
1330 |
|
1331 |
-
|
|
|
|
|
|
|
1332 |
if imgC == 3 and resized.shape[2] == 1:
|
1333 |
resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
|
1334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1335 |
resized = resized.transpose((2, 0, 1)) / 255.0
|
1336 |
resized -= 0.5
|
1337 |
resized /= 0.5
|
1338 |
|
1339 |
padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1340 |
-
|
1341 |
-
|
|
|
1342 |
|
1343 |
-
|
|
|
1344 |
min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
|
1345 |
print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
|
1346 |
f"dtype: {padding.dtype}, "
|
@@ -1349,7 +1379,7 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
|
|
1349 |
f"MeanPx: {mean_px:.4f}")
|
1350 |
if np.all(padding == 0):
|
1351 |
print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
|
1352 |
-
elif np.abs(max_px - min_px) < 1e-6:
|
1353 |
print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
|
1354 |
return padding
|
1355 |
|
@@ -1400,8 +1430,6 @@ class _MDR_TextSystem:
|
|
1400 |
self.save_crop = getattr(args, 'save_crop_res', False)
|
1401 |
self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
|
1402 |
|
1403 |
-
# In class _MDR_TextSystem:
|
1404 |
-
|
1405 |
def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
|
1406 |
ori_im = img.copy()
|
1407 |
|
@@ -1416,34 +1444,22 @@ class _MDR_TextSystem:
|
|
1416 |
if not dt_boxes_sorted:
|
1417 |
return [], []
|
1418 |
|
1419 |
-
|
1420 |
-
|
1421 |
-
for i in range(len(dt_boxes_sorted)):
|
1422 |
-
crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
|
1423 |
-
# Ensure crop_im is not empty or too small before adding
|
1424 |
-
if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
|
1425 |
-
img_crop_list.append(crop_im)
|
1426 |
-
else:
|
1427 |
-
print(
|
1428 |
-
f" DEBUG TextSystem: Crop {i} was None or too small, skipping. Original box: {dt_boxes_sorted[i]}")
|
1429 |
-
# To maintain correspondence, we might need to handle this more carefully
|
1430 |
-
# For now, this might lead to length mismatches if not all crops are valid.
|
1431 |
-
# A better approach might be to filter dt_boxes_sorted alongside img_crop_list creation.
|
1432 |
-
|
1433 |
-
# Let's refine the filtering of boxes and creation of crops to ensure they always match
|
1434 |
valid_boxes_for_cropping: list[np.ndarray] = []
|
1435 |
-
|
1436 |
for i, box_pts in enumerate(dt_boxes_sorted):
|
1437 |
crop_im = mdr_get_rotated_crop(ori_im, box_pts)
|
1438 |
if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1: # Min height/width for a crop
|
1439 |
valid_boxes_for_cropping.append(box_pts)
|
1440 |
-
|
1441 |
else:
|
1442 |
print(
|
1443 |
f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
|
1444 |
|
1445 |
dt_boxes_sorted = valid_boxes_for_cropping # Update dt_boxes_sorted to only include those that yielded valid crops
|
1446 |
-
img_crop_list
|
|
|
1447 |
|
1448 |
print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
|
1449 |
|
@@ -1451,50 +1467,76 @@ class _MDR_TextSystem:
|
|
1451 |
print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
|
1452 |
return [], []
|
1453 |
|
1454 |
-
if self.use_cls and self.classifier is not None:
|
1455 |
print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
|
1456 |
-
img_crop_list, cls_results = self.classifier(
|
|
|
1457 |
print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
|
1458 |
|
1459 |
rec_results: list[tuple[str, float]] = []
|
1460 |
-
# No need to check img_crop_list again
|
1461 |
print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
|
1462 |
rec_results = self.recognizer(img_crop_list)
|
1463 |
print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
|
1464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1465 |
final_boxes_to_return: list[np.ndarray] = []
|
1466 |
final_recs_to_return: list[tuple[str, float]] = []
|
1467 |
final_crops_for_saving: list[np.ndarray] = []
|
1468 |
|
1469 |
-
#
|
1470 |
-
|
1471 |
-
|
1472 |
-
|
1473 |
-
|
1474 |
-
|
1475 |
-
|
1476 |
-
|
1477 |
-
|
1478 |
-
|
1479 |
-
|
1480 |
-
|
1481 |
-
|
1482 |
-
|
1483 |
-
|
1484 |
-
|
1485 |
-
else:
|
1486 |
-
print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
|
1487 |
else:
|
1488 |
-
print(
|
1489 |
-
|
1490 |
-
|
1491 |
-
|
1492 |
-
|
1493 |
-
f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
|
1494 |
-
f"len(img_crop_list)={len(img_crop_list)}. "
|
1495 |
-
f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
|
1496 |
-
# Return empty if critical mismatch, as indexing will fail or be incorrect.
|
1497 |
-
return [], []
|
1498 |
|
1499 |
print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
|
1500 |
|
@@ -1710,36 +1752,67 @@ _MDR_CORRECTION_MIN_OVERLAP = 0.5
|
|
1710 |
|
1711 |
|
1712 |
def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
|
|
|
1713 |
if not layout.fragments:
|
|
|
|
|
|
|
1714 |
return
|
|
|
|
|
1715 |
try:
|
1716 |
x1, y1, x2, y2 = layout.rect.wrapper
|
1717 |
margin = 5
|
1718 |
-
|
1719 |
-
|
1720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1721 |
return
|
1722 |
-
|
1723 |
-
|
|
|
1724 |
except Exception as e:
|
1725 |
-
print(f"Correct: Crop error: {e}")
|
1726 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1727 |
try:
|
1728 |
-
|
|
|
1729 |
new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
|
1730 |
except Exception as e:
|
1731 |
-
print(f"Correct: OCR error: {e}")
|
1732 |
-
|
|
|
|
|
|
|
1733 |
new_frags_global = []
|
|
|
1734 |
for f in new_frags_local:
|
1735 |
r = f.rect
|
1736 |
lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
|
1737 |
f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
|
1738 |
lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
|
1739 |
new_frags_global.append(f)
|
1740 |
-
|
|
|
1741 |
matched, unmatched_orig = [], []
|
1742 |
used_new = set()
|
|
|
|
|
|
|
|
|
|
|
1743 |
for i, orig_f in enumerate(orig_frags):
|
1744 |
best_j, best_rate = -1, -1.0
|
1745 |
try:
|
@@ -1771,13 +1844,16 @@ def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image,
|
|
1771 |
used_new.add(best_j)
|
1772 |
else:
|
1773 |
unmatched_orig.append(orig_f)
|
|
|
1774 |
unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
|
|
|
1775 |
final = [n if n.rank >= o.rank else o for o, n in matched]
|
1776 |
final.extend(unmatched_orig)
|
1777 |
final.extend(unmatched_new)
|
1778 |
-
layout.fragments = final
|
1779 |
-
layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
|
1780 |
|
|
|
|
|
|
|
1781 |
|
1782 |
# --- MDR OCR Engine ---
|
1783 |
|
@@ -1860,6 +1936,7 @@ class MDROcrEngine:
|
|
1860 |
# much lower thresholds so we actually get some candidate masks:
|
1861 |
det_db_thresh=0.15,
|
1862 |
det_db_box_thresh=0.15,
|
|
|
1863 |
drop_score=0.01,
|
1864 |
use_angle_cls=False,
|
1865 |
)
|
@@ -2142,16 +2219,11 @@ class MDRLayoutReader:
|
|
2142 |
return layouts
|
2143 |
print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
|
2144 |
|
2145 |
-
# --- START:
|
2146 |
scaled_bboxes: list[list[int]] = []
|
2147 |
-
if w > 0 and h > 0:
|
2148 |
for bbox_item in bbox_list:
|
2149 |
-
# bbox_item.value is (x0, y0, x1, y1) in original image coordinates
|
2150 |
x0, y0, x1, y1 = bbox_item.value
|
2151 |
-
|
2152 |
-
# Scale to 0-1000 range based on image width (w) and height (h)
|
2153 |
-
# Ensure coordinates are within [0, 1000] and x1>=x0, y1>=y0
|
2154 |
-
# Clamp values to image boundaries before scaling to prevent negative scaled values if original box is outside
|
2155 |
x0_c = max(0.0, min(x0, float(w)))
|
2156 |
y0_c = max(0.0, min(y0, float(h)))
|
2157 |
x1_c = max(0.0, min(x1, float(w)))
|
@@ -2159,56 +2231,90 @@ class MDRLayoutReader:
|
|
2159 |
|
2160 |
scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
|
2161 |
scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
|
2162 |
-
scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w)))
|
2163 |
-
scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h)))
|
2164 |
scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
|
2165 |
else:
|
|
|
2166 |
print(
|
2167 |
"MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
|
2168 |
layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
|
2169 |
return layouts
|
2170 |
-
# --- END:
|
2171 |
|
2172 |
-
if not scaled_bboxes: #
|
2173 |
print(
|
2174 |
"MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
|
2175 |
layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
|
2176 |
return layouts
|
2177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2178 |
orders: list[int] = []
|
2179 |
try:
|
2180 |
with torch.no_grad():
|
2181 |
print("MDRLayoutReader: Creating reader inputs...")
|
2182 |
-
inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
|
2183 |
print("MDRLayoutReader: Preparing inputs for model device...")
|
2184 |
inputs = mdr_prepare_reader_inputs(inputs, model)
|
2185 |
print("MDRLayoutReader: Running model inference...")
|
2186 |
logits = model(**inputs).logits.cpu().squeeze(0)
|
2187 |
print("MDRLayoutReader: Model inference complete. Parsing logits...")
|
2188 |
-
|
2189 |
-
orders = mdr_parse_reader_logits(logits, len(bbox_list))
|
2190 |
print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
|
2191 |
|
2192 |
-
# Assign the determined orders back to the bbox_list items
|
2193 |
if len(orders) == len(bbox_list):
|
2194 |
for i, order_val in enumerate(orders):
|
2195 |
bbox_list[i].order = order_val
|
2196 |
else:
|
2197 |
print(
|
2198 |
-
f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}).
|
2199 |
-
for i in range(len(bbox_list)):
|
2200 |
bbox_list[i].order = i
|
2201 |
except Exception as e:
|
2202 |
print(f"MDR LayoutReader prediction error: {e}")
|
2203 |
import traceback
|
2204 |
traceback.print_exc()
|
2205 |
-
# Fallback: assign sequential order to bbox_list items before geometric sort of layouts
|
2206 |
for i in range(len(bbox_list)):
|
2207 |
bbox_list[i].order = i
|
2208 |
-
# Then apply this sequential order (which effectively becomes a geometric sort)
|
2209 |
print("MDRLayoutReader: Applying fallback sequential order due to error...")
|
2210 |
result_layouts = self._apply_order(layouts, bbox_list)
|
2211 |
-
return result_layouts
|
2212 |
|
2213 |
print("MDRLayoutReader: Applying order...")
|
2214 |
result_layouts = self._apply_order(layouts, bbox_list)
|
|
|
946 |
f" DEBUG OCR: _MDR_DBPostProcess (batch {batch_idx}): Input shape to postproc (orig) {dh_orig:.1f}x{dw_orig:.1f}. Sum of mask pixels: {np.sum(mask)}")
|
947 |
|
948 |
if self.box_t == 'poly':
|
949 |
+
boxes, scores = self._polygons_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
|
950 |
elif self.box_t == 'quad':
|
951 |
+
boxes, scores = self._boxes_from_bitmap(current_pred_map, mask, dh_orig, dw_orig)
|
|
|
952 |
else:
|
953 |
raise ValueError("box_type must be 'quad' or 'poly'")
|
954 |
print(
|
|
|
1298 |
print(
|
1299 |
f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
|
1300 |
|
1301 |
+
# --- START OF FIX ---
|
1302 |
+
MIN_DIM_FOR_RESIZE = 2 # Minimum original height or width to attempt resize
|
1303 |
+
if h_orig < MIN_DIM_FOR_RESIZE or w_orig < MIN_DIM_FOR_RESIZE:
|
1304 |
print(
|
1305 |
+
f" DEBUG RECOGNIZER: _resize_norm received degenerate crop ({h_orig}x{w_orig}) with dimension < {MIN_DIM_FOR_RESIZE}. Returning zeros before resize attempt.")
|
1306 |
+
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1307 |
+
# --- END OF FIX ---
|
1308 |
+
|
1309 |
+
# Original check for h_orig == 0 or w_orig == 0 is now covered by the above,
|
1310 |
+
# but is kept for explicitness and in case MIN_DIM_FOR_RESIZE is ever lowered to 0.
|
1311 |
+
# With MIN_DIM_FOR_RESIZE >= 1 the guard above already rejects zero-sized crops.
|
1312 |
+
# If MIN_DIM_FOR_RESIZE is > 1, this specific check becomes redundant.
|
1313 |
+
# Let's keep it for safety if MIN_DIM_FOR_RESIZE is changed.
|
1314 |
+
if h_orig == 0 or w_orig == 0: # This check is technically redundant if MIN_DIM_FOR_RESIZE >= 1
|
1315 |
+
print(
|
1316 |
+
f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop ({h_orig}x{w_orig}) (secondary check). Returning zeros.")
|
1317 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1318 |
|
1319 |
+
r_current = w_orig / float(h_orig) # h_orig is guaranteed > 0 here if MIN_DIM_FOR_RESIZE >=1
|
1320 |
tw = min(imgW, int(ceil(imgH * r_current)))
|
1321 |
+
tw = max(1, tw) # Ensure target width is at least 1
|
1322 |
+
# Ensure target height (imgH) is also valid (it comes from self.shape, so should be)
|
1323 |
+
|
1324 |
print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
|
1325 |
|
1326 |
try:
|
1327 |
+
# Ensure target dimensions for resize are valid
|
1328 |
+
if tw <= 0 or imgH <= 0:
|
1329 |
+
print(
|
1330 |
+
f" DEBUG RECOGNIZER: _resize_norm calculated invalid target resize dimensions (tw: {tw}, imgH: {imgH}). Returning zeros.")
|
1331 |
+
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1332 |
resized = cv2.resize(img, (tw, imgH))
|
1333 |
except cv2.error as e_resize: # Catch specific cv2 error
|
1334 |
print(
|
|
|
1341 |
traceback.print_exc()
|
1342 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1343 |
|
1344 |
+
# Convert to float32, reconcile the channel count with imgC, then normalize.
|
1345 |
resized = resized.astype("float32")
|
1346 |
+
if imgC == 1 and len(resized.shape) == 3: # If target is 1 channel and resized is 3
|
1347 |
+
if resized.shape[2] == 3: # Check if it actually has 3 channels
|
1348 |
+
resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
|
|
|
1349 |
|
1350 |
+
if len(resized.shape) == 2: # If grayscale after potential conversion
|
1351 |
+
resized = resized[:, :, np.newaxis] # Add channel dim
|
1352 |
+
|
1353 |
+
# Ensure resized has 3 channels if imgC is 3, even if input was grayscale or became grayscale
|
1354 |
if imgC == 3 and resized.shape[2] == 1:
|
1355 |
resized = cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)
|
1356 |
|
1357 |
+
# Final check on channel consistency
|
1358 |
+
if resized.shape[2] != imgC:
|
1359 |
+
print(
|
1360 |
+
f" DEBUG RECOGNIZER: Channel mismatch after processing. Expected {imgC}, got {resized.shape[2]}. Crop shape ({h_orig},{w_orig}). Returning zeros.")
|
1361 |
+
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1362 |
+
|
1363 |
resized = resized.transpose((2, 0, 1)) / 255.0
|
1364 |
resized -= 0.5
|
1365 |
resized /= 0.5
|
1366 |
|
1367 |
padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1368 |
+
# Ensure tw is not out of bounds for padding
|
1369 |
+
actual_padded_width = min(tw, imgW)
|
1370 |
+
padding[:, :, 0:actual_padded_width] = resized[:, :, 0:actual_padded_width]
|
1371 |
|
1372 |
+
print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
|
1373 |
+
# Log min/max/mean statistics of the normalized, padded crop.
|
1374 |
min_px, max_px, mean_px = np.min(padding), np.max(padding), np.mean(padding)
|
1375 |
print(f" DEBUG RECOGNIZER: Normalized Crop Properties (before ONNX): "
|
1376 |
f"dtype: {padding.dtype}, "
|
|
|
1379 |
f"MeanPx: {mean_px:.4f}")
|
1380 |
if np.all(padding == 0):
|
1381 |
print(" DEBUG RECOGNIZER: WARNING - Normalized image is all zeros!")
|
1382 |
+
elif np.abs(max_px - min_px) < 1e-6:
|
1383 |
print(f" DEBUG RECOGNIZER: WARNING - Normalized image is a constant value: {mean_px:.4f}")
|
1384 |
return padding
|
1385 |
|
|
|
1430 |
self.save_crop = getattr(args, 'save_crop_res', False)
|
1431 |
self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
|
1432 |
|
|
|
|
|
1433 |
def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
|
1434 |
ori_im = img.copy()
|
1435 |
|
|
|
1444 |
if not dt_boxes_sorted:
|
1445 |
return [], []
|
1446 |
|
1447 |
+
# --- Stage 1 Fix: Refined filtering of boxes and creation of crops ---
|
1448 |
+
# Ensure dt_boxes_sorted and img_crop_list are synchronized.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1449 |
valid_boxes_for_cropping: list[np.ndarray] = []
|
1450 |
+
img_crop_list: list[np.ndarray] = [] # Initialize img_crop_list here
|
1451 |
for i, box_pts in enumerate(dt_boxes_sorted):
|
1452 |
crop_im = mdr_get_rotated_crop(ori_im, box_pts)
|
1453 |
if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1: # Min height/width for a crop
|
1454 |
valid_boxes_for_cropping.append(box_pts)
|
1455 |
+
img_crop_list.append(crop_im) # Directly populate the final img_crop_list
|
1456 |
else:
|
1457 |
print(
|
1458 |
f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
|
1459 |
|
1460 |
dt_boxes_sorted = valid_boxes_for_cropping # Update dt_boxes_sorted to only include those that yielded valid crops
|
1461 |
+
# img_crop_list is now the correctly filtered list of crops, synchronized with dt_boxes_sorted.
|
1462 |
+
# --- End of Stage 1 Fix ---
|
1463 |
|
1464 |
print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
|
1465 |
|
|
|
1467 |
print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
|
1468 |
return [], []
|
1469 |
|
1470 |
+
if self.use_cls and self.classifier is not None:
|
1471 |
print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
|
1472 |
+
img_crop_list, cls_results = self.classifier(
|
1473 |
+
img_crop_list) # classifier might modify img_crop_list (e.g., rotate)
|
1474 |
print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
|
1475 |
|
1476 |
rec_results: list[tuple[str, float]] = []
|
|
|
1477 |
print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
|
1478 |
rec_results = self.recognizer(img_crop_list)
|
1479 |
print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
|
1480 |
|
1481 |
+
# --- Start of Stage 2 Fix: Robust handling of rec_results length ---
|
1482 |
+
expected_count = len(dt_boxes_sorted) # This is synchronized with len(img_crop_list) before recognizer
|
1483 |
+
# and should still match len(img_crop_list) after classifier
|
1484 |
+
# if classifier preserves length.
|
1485 |
+
actual_rec_count = len(rec_results)
|
1486 |
+
num_to_process = 0
|
1487 |
+
|
1488 |
+
if actual_rec_count == expected_count:
|
1489 |
+
num_to_process = actual_rec_count
|
1490 |
+
else:
|
1491 |
+
print(f" DEBUG TextSystem: WARNING - Mismatch in lengths after recognition! "
|
1492 |
+
f"Expected (from boxes/crops): {expected_count}, "
|
1493 |
+
f"Recognizer returned: {actual_rec_count} results. ")
|
1494 |
+
num_to_process = min(actual_rec_count, expected_count)
|
1495 |
+
if num_to_process < expected_count:
|
1496 |
+
print(
|
1497 |
+
f" DEBUG TextSystem: Will process {num_to_process} items due to mismatch. Some data might be lost if recognizer dropped results or if there was an issue in earlier stages not caught.")
|
1498 |
+
elif num_to_process < actual_rec_count: # Recognizer returned more than expected
|
1499 |
+
print(
|
1500 |
+
f" DEBUG TextSystem: Will process {num_to_process} items. Recognizer returned more results ({actual_rec_count}) than expected crops ({expected_count}). Extra recognition results will be ignored.")
|
1501 |
+
|
1502 |
+
if num_to_process == 0:
|
1503 |
+
if expected_count > 0: # If there were boxes/crops but no rec results to process
|
1504 |
+
print(
|
1505 |
+
" DEBUG TextSystem: No recognition results to process (num_to_process is 0) despite having input boxes/crops. Returning empty.")
|
1506 |
+
else: # If there were no boxes/crops to begin with
|
1507 |
+
print(
|
1508 |
+
" DEBUG TextSystem: No items to process (no initial boxes or num_to_process is 0). Returning empty.")
|
1509 |
+
return [], []
|
1510 |
+
# --- End of Stage 2 Fix preamble ---
|
1511 |
+
|
1512 |
+
print(
|
1513 |
+
f" DEBUG TextSystem: Filtering {num_to_process} recognition results with drop_score: {self.drop_score}")
|
1514 |
final_boxes_to_return: list[np.ndarray] = []
|
1515 |
final_recs_to_return: list[tuple[str, float]] = []
|
1516 |
final_crops_for_saving: list[np.ndarray] = []
|
1517 |
|
1518 |
+
# --- Stage 2 Fix: Modified Loop (No outer strict if/else) ---
|
1519 |
+
for i in range(num_to_process): # Iterate up to the safe number
|
1520 |
+
# It's crucial that dt_boxes_sorted[i], rec_results[i], and img_crop_list[i] correspond
|
1521 |
+
# for the items being processed.
|
1522 |
+
text, confidence = rec_results[i]
|
1523 |
+
|
1524 |
+
print(f" DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
|
1525 |
+
|
1526 |
+
if confidence >= self.drop_score:
|
1527 |
+
if text and not mdr_is_whitespace(text):
|
1528 |
+
final_boxes_to_return.append(dt_boxes_sorted[i])
|
1529 |
+
final_recs_to_return.append(rec_results[i])
|
1530 |
+
if self.save_crop:
|
1531 |
+
# Ensure img_crop_list[i] is valid if classifier could have changed its length
|
1532 |
+
# However, self.classifier is expected to return img_list of same length as input.
|
1533 |
+
final_crops_for_saving.append(img_crop_list[i])
|
|
|
|
|
1534 |
else:
|
1535 |
+
print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
|
1536 |
+
else:
|
1537 |
+
print(
|
1538 |
+
f" DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
|
1539 |
+
# --- End of Stage 2 Fix: Modified Loop ---
|
|
|
|
|
|
|
|
|
|
|
1540 |
|
1541 |
print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
|
1542 |
|
|
|
1752 |
|
1753 |
|
1754 |
def mdr_correct_layout_fragments(ocr_engine: 'MDROcrEngine', source_img: Image, layout: MDRLayoutElement):
|
1755 |
+
# --- START OF FIX ---
|
1756 |
if not layout.fragments:
|
1757 |
+
# If the layout has no fragments to begin with, there's nothing to correct.
|
1758 |
+
# Attempting to crop and OCR an empty layout region is unnecessary and can lead to errors.
|
1759 |
+
# print(f"Correct: Layout {type(layout.cls).__name__} has no initial fragments. Skipping OCR correction.") # Optional: for debugging
|
1760 |
return
|
1761 |
+
# --- END OF FIX ---
|
1762 |
+
|
1763 |
try:
|
1764 |
x1, y1, x2, y2 = layout.rect.wrapper
|
1765 |
margin = 5
|
1766 |
+
# Ensure crop_box dimensions are valid before cropping
|
1767 |
+
crop_x1 = max(0, round(x1) - margin)
|
1768 |
+
crop_y1 = max(0, round(y1) - margin)
|
1769 |
+
crop_x2 = min(source_img.width, round(x2) + margin)
|
1770 |
+
crop_y2 = min(source_img.height, round(y2) + margin)
|
1771 |
+
|
1772 |
+
if crop_x1 >= crop_x2 or crop_y1 >= crop_y2: # If crop dimensions are invalid/empty
|
1773 |
+
print(
|
1774 |
+
f"Correct: Crop box for layout {type(layout.cls).__name__} is invalid/empty ({crop_x1},{crop_y1},{crop_x2},{crop_y2}). Skipping OCR correction.")
|
1775 |
return
|
1776 |
+
|
1777 |
+
cropped = source_img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
|
1778 |
+
off_x, off_y = crop_x1, crop_y1
|
1779 |
except Exception as e:
|
1780 |
+
print(f"Correct: Crop error for layout {type(layout.cls).__name__}: {e}")
|
1781 |
return
|
1782 |
+
|
1783 |
+
# Additional check: if cropped image is too small for OCR
|
1784 |
+
if cropped.width < 5 or cropped.height < 5: # Arbitrary small threshold
|
1785 |
+
print(
|
1786 |
+
f"Correct: Cropped image for layout {type(layout.cls).__name__} is too small ({cropped.width}x{cropped.height}). Skipping OCR correction.")
|
1787 |
+
return
|
1788 |
+
|
1789 |
try:
|
1790 |
+
# Ensure conversion to RGB before converting to NumPy array
|
1791 |
+
cropped_np = np.array(cropped.convert("RGB"))[:, :, ::-1] # BGR for OpenCV-based OCR
|
1792 |
new_frags_local = list(ocr_engine.find_text_fragments(cropped_np))
|
1793 |
except Exception as e:
|
1794 |
+
print(f"Correct: OCR error during correction for layout {type(layout.cls).__name__}: {e}")
|
1795 |
+
# If OCR fails, we should probably keep the original fragments, if any.
|
1796 |
+
# The current logic below will do this if new_frags_local is empty.
|
1797 |
+
return # Exit if OCR itself fails catastrophically
|
1798 |
+
|
1799 |
new_frags_global = []
|
1800 |
+
# Shift each new fragment's rect from crop-local to source-image coordinates.
|
1801 |
for f in new_frags_local:
|
1802 |
r = f.rect
|
1803 |
lt, rt, lb, rb = r.lt, r.rt, r.lb, r.rb
|
1804 |
f.rect = MDRRectangle(lt=(lt[0] + off_x, lt[1] + off_y), rt=(rt[0] + off_x, rt[1] + off_y),
|
1805 |
lb=(lb[0] + off_x, lb[1] + off_y), rb=(rb[0] + off_x, rb[1] + off_y))
|
1806 |
new_frags_global.append(f)
|
1807 |
+
|
1808 |
+
orig_frags = layout.fragments # These are the fragments that existed before this function call
|
1809 |
matched, unmatched_orig = [], []
|
1810 |
used_new = set()
|
1811 |
+
|
1812 |
+
# If new_frags_global is empty (e.g. OCR found nothing in the cropped region),
|
1813 |
+
# then all orig_frags will go into unmatched_orig, and layout.fragments will be restored to orig_frags.
|
1814 |
+
# This is generally fine.
|
1815 |
+
|
1816 |
for i, orig_f in enumerate(orig_frags):
|
1817 |
best_j, best_rate = -1, -1.0
|
1818 |
try:
|
|
|
1844 |
used_new.add(best_j)
|
1845 |
else:
|
1846 |
unmatched_orig.append(orig_f)
|
1847 |
+
|
1848 |
unmatched_new = [f for j, f in enumerate(new_frags_global) if j not in used_new]
|
1849 |
+
|
1850 |
final = [n if n.rank >= o.rank else o for o, n in matched]
|
1851 |
final.extend(unmatched_orig)
|
1852 |
final.extend(unmatched_new)
|
|
|
|
|
1853 |
|
1854 |
+
layout.fragments = final
|
1855 |
+
if layout.fragments: # Only sort if there are fragments
|
1856 |
+
layout.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))
|
1857 |
|
1858 |
# --- MDR OCR Engine ---
|
1859 |
|
|
|
1936 |
# much lower thresholds so we actually get some candidate masks:
|
1937 |
det_db_thresh=0.15,
|
1938 |
det_db_box_thresh=0.15,
|
1939 |
+
unclip_ratio=2.0,
|
1940 |
drop_score=0.01,
|
1941 |
use_angle_cls=False,
|
1942 |
)
|
|
|
2219 |
return layouts
|
2220 |
print(f"MDRLayoutReader: Prepared {len(bbox_list)} bboxes.")
|
2221 |
|
2222 |
+
# --- START: SCALING LOGIC (clamp bboxes to the image, then scale to 0-1000) ---
|
2223 |
scaled_bboxes: list[list[int]] = []
|
2224 |
+
if w > 0 and h > 0:
|
2225 |
for bbox_item in bbox_list:
|
|
|
2226 |
x0, y0, x1, y1 = bbox_item.value
|
|
|
|
|
|
|
|
|
2227 |
x0_c = max(0.0, min(x0, float(w)))
|
2228 |
y0_c = max(0.0, min(y0, float(h)))
|
2229 |
x1_c = max(0.0, min(x1, float(w)))
|
|
|
2231 |
|
2232 |
scaled_x0 = max(0, min(1000, int(1000 * x0_c / w)))
|
2233 |
scaled_y0 = max(0, min(1000, int(1000 * y0_c / h)))
|
2234 |
+
scaled_x1 = max(scaled_x0, min(1000, int(1000 * x1_c / w)))
|
2235 |
+
scaled_y1 = max(scaled_y0, min(1000, int(1000 * y1_c / h)))
|
2236 |
scaled_bboxes.append([scaled_x0, scaled_y0, scaled_x1, scaled_y1])
|
2237 |
else:
|
2238 |
+
# This branch should ideally not be reached due to the initial w,h check
|
2239 |
print(
|
2240 |
"MDRLayoutReader: Warning - Invalid image dimensions (w or h is zero) for scaling bboxes. Cannot determine reading order.")
|
2241 |
layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
|
2242 |
return layouts
|
2243 |
+
# --- END: SCALING LOGIC ---
|
2244 |
|
2245 |
+
if not scaled_bboxes: # Handles if bbox_list was empty
|
2246 |
print(
|
2247 |
"MDRLayoutReader: No scaled bboxes available after scaling step. Returning geometrically sorted layouts.")
|
2248 |
layouts.sort(key=lambda l: (l.rect.lt[1], l.rect.lt[0]))
|
2249 |
return layouts
|
2250 |
|
2251 |
+
# --- START OF FIX ---
|
2252 |
+
# Check if scaled_bboxes are problematic (e.g., all identical and degenerate)
|
2253 |
+
bypass_model_inference = False
|
2254 |
+
if len(scaled_bboxes) > 0:
|
2255 |
+
num_s_bboxes = len(scaled_bboxes)
|
2256 |
+
# Check if all scaled_bboxes are identical to the first one
|
2257 |
+
first_s_bbox_str = str(scaled_bboxes[0])
|
2258 |
+
all_identical = all(str(s_b) == first_s_bbox_str for s_b in scaled_bboxes)
|
2259 |
+
|
2260 |
+
if all_identical:
|
2261 |
+
# Check if this identical box is degenerate (zero width or height)
|
2262 |
+
s_x0, s_y0, s_x1, s_y1 = scaled_bboxes[0]
|
2263 |
+
if (s_x1 - s_x0 == 0) or (s_y1 - s_y0 == 0):
|
2264 |
+
bypass_model_inference = True
|
2265 |
+
print("MDRLayoutReader: All scaled bboxes are identical and degenerate. Bypassing LayoutLMv3.")
|
2266 |
+
|
2267 |
+
if not bypass_model_inference and num_s_bboxes > 1: # Check for high proportion of degenerate if not all identical
|
2268 |
+
degenerate_count = 0
|
2269 |
+
for s_b in scaled_bboxes:
|
2270 |
+
if (s_b[2] - s_b[0] == 0) or (s_b[3] - s_b[1] == 0): # x1-x0 or y1-y0
|
2271 |
+
degenerate_count += 1
|
2272 |
+
# If, for example, more than 90% of bboxes are degenerate
|
2273 |
+
if degenerate_count / num_s_bboxes > 0.9:
|
2274 |
+
bypass_model_inference = True
|
2275 |
+
print(
|
2276 |
+
f"MDRLayoutReader: High percentage ({degenerate_count / num_s_bboxes * 100:.1f}%) of scaled bboxes are degenerate. Bypassing LayoutLMv3.")
|
2277 |
+
|
2278 |
+
if bypass_model_inference:
|
2279 |
+
print("MDRLayoutReader: Applying fallback sequential order due to problematic scaled_bboxes.")
|
2280 |
+
# Assign sequential order based on _prepare_bboxes's sort (y, then x)
|
2281 |
+
for i in range(len(bbox_list)):
|
2282 |
+
bbox_list[i].order = i
|
2283 |
+
# Use _apply_order to apply this simple sequential ordering
|
2284 |
+
result_layouts = self._apply_order(layouts, bbox_list)
|
2285 |
+
return result_layouts
|
2286 |
+
# --- END OF FIX ---
|
2287 |
+
|
2288 |
orders: list[int] = []
|
2289 |
try:
|
2290 |
with torch.no_grad():
|
2291 |
print("MDRLayoutReader: Creating reader inputs...")
|
2292 |
+
inputs = mdr_boxes_to_reader_inputs(scaled_bboxes)
|
2293 |
print("MDRLayoutReader: Preparing inputs for model device...")
|
2294 |
inputs = mdr_prepare_reader_inputs(inputs, model)
|
2295 |
print("MDRLayoutReader: Running model inference...")
|
2296 |
logits = model(**inputs).logits.cpu().squeeze(0)
|
2297 |
print("MDRLayoutReader: Model inference complete. Parsing logits...")
|
2298 |
+
orders = mdr_parse_reader_logits(logits, len(bbox_list)) # len(bbox_list) is correct here
|
|
|
2299 |
print(f"MDRLayoutReader: Logits parsed. Orders count: {len(orders)}")
|
2300 |
|
|
|
2301 |
if len(orders) == len(bbox_list):
|
2302 |
for i, order_val in enumerate(orders):
|
2303 |
bbox_list[i].order = order_val
|
2304 |
else:
|
2305 |
print(
|
2306 |
+
f"MDRLayoutReader: Warning - Mismatch between orders ({len(orders)}) and bbox_list ({len(bbox_list)}). Using sequential order.")
|
2307 |
+
for i in range(len(bbox_list)):
|
2308 |
bbox_list[i].order = i
|
2309 |
except Exception as e:
|
2310 |
print(f"MDR LayoutReader prediction error: {e}")
|
2311 |
import traceback
|
2312 |
traceback.print_exc()
|
|
|
2313 |
for i in range(len(bbox_list)):
|
2314 |
bbox_list[i].order = i
|
|
|
2315 |
print("MDRLayoutReader: Applying fallback sequential order due to error...")
|
2316 |
result_layouts = self._apply_order(layouts, bbox_list)
|
2317 |
+
return result_layouts
|
2318 |
|
2319 |
print("MDRLayoutReader: Applying order...")
|
2320 |
result_layouts = self._apply_order(layouts, bbox_list)
|