Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +133 -37
mdr_pdf_parser.py
CHANGED
@@ -1159,42 +1159,98 @@ class _MDR_TextSystem:
|
|
1159 |
try: return list(sorted(boxes, key=key))
|
1160 |
except: return list(boxes) # Fallback
|
1161 |
|
1162 |
-
def __call__(self, img, classify=True):
|
1163 |
ori_im = img.copy()
|
1164 |
-
|
|
|
|
|
1165 |
if boxes is None or len(boxes) == 0:
|
1166 |
-
|
1167 |
-
|
|
|
|
|
|
|
|
|
1168 |
crops = []
|
1169 |
-
for b in boxes:
|
1170 |
try:
|
1171 |
-
|
1172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1173 |
crops.append(None)
|
1174 |
-
|
|
|
1175 |
if not valid_idxs:
|
|
|
1176 |
return [], []
|
1177 |
-
|
1178 |
-
|
|
|
|
|
|
|
|
|
1179 |
if self.use_cls and self.classifier and classify:
|
|
|
1180 |
try:
|
1181 |
-
|
1182 |
-
|
1183 |
-
print(f"Classifier
|
|
|
|
|
|
|
|
|
|
|
|
|
1184 |
try:
|
1185 |
-
rec_res = self.recognizer(
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
1189 |
-
|
1190 |
-
|
1191 |
-
|
1192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1193 |
final_boxes.append(box)
|
1194 |
-
|
1195 |
-
|
1196 |
-
|
1197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1198 |
|
1199 |
def _save_crops(self, crops, recs):
|
1200 |
mdr_ensure_directory(self.crop_dir)
|
@@ -1511,18 +1567,58 @@ class MDROcrEngine:
|
|
1511 |
def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
|
1512 |
"""Finds and recognizes text fragments in a NumPy image (BGR)."""
|
1513 |
system = self._get_system()
|
1514 |
-
if system is None:
|
1515 |
-
|
1516 |
-
|
1517 |
-
|
1518 |
-
|
1519 |
-
for
|
1520 |
-
|
1521 |
-
|
1522 |
-
|
1523 |
-
|
1524 |
-
|
1525 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1526 |
|
1527 |
def _preprocess(self, img: np.ndarray) -> np.ndarray:
|
1528 |
if len(img.shape) == 3 and img.shape[2] == 4:
|
|
|
1159 |
try: return list(sorted(boxes, key=key))
|
1160 |
except: return list(boxes) # Fallback
|
1161 |
|
1162 |
+
def __call__(self, img, classify=True):
    """Detect, crop, optionally classify, and recognize text in ``img``.

    Steps, in order: run ``self.detector`` on the image, sort the returned
    boxes with ``self._sort_boxes``, cut one crop per box with
    ``mdr_get_rotated_crop``, optionally pass the usable crops through
    ``self.classifier``, run ``self.recognizer`` on them, and keep only the
    results whose score reaches ``self.drop_score`` and whose text is
    neither empty nor whitespace.

    Args:
        img: Image to process. Must provide ``.copy()`` and ``.shape``
            (presumably a NumPy BGR array -- confirm against callers).
        classify: When true (the default) and ``self.use_cls`` is set and
            a classifier is configured, crops go through the classifier
            before recognition.

    Returns:
        A ``(boxes, recs)`` pair of parallel lists: the kept boxes and the
        recognizer result (unpacked here as ``text, score``) for each.
        Both lists are empty when the detector finds nothing, no crop is
        usable, or the recognizer raises.
    """
    ori_im = img.copy()
    print(f" DEBUG OCR SYS: _MDR_TextSystem called. Original image shape: {ori_im.shape}")  # DEBUG
    boxes = self.detector(img)

    if boxes is None or len(boxes) == 0:
        print(" DEBUG OCR SYS: Detector returned no boxes. Returning empty fragments.")  # DEBUG
        return [], []

    print(f" DEBUG OCR SYS: Detector returned {len(boxes)} boxes. Proceeding to crop and recognize.")  # DEBUG
    boxes = self._sort_boxes(boxes)

    # One entry per box; None marks a box whose crop failed or is empty so
    # that indices in `crops` keep lining up with indices in `boxes`.
    crops = []
    for i, b in enumerate(boxes):
        try:
            crop_img = mdr_get_rotated_crop(ori_im, b)
            if crop_img is None:
                print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} is None.")  # DEBUG
            elif crop_img.shape[0] == 0 or crop_img.shape[1] == 0:
                print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} has zero dimension: {crop_img.shape}")  # DEBUG
                crop_img = None
        except Exception as e_crop:
            print(f" DEBUG OCR SYS: Error cropping box {i+1}/{len(boxes)}: {e_crop}")  # DEBUG
            crop_img = None
        crops.append(crop_img)

    # Zero-sized crops were already replaced by None above, so a plain
    # None test is enough here.
    valid_idxs = [i for i, c in enumerate(crops) if c is not None]
    if not valid_idxs:
        print(" DEBUG OCR SYS: No valid crops obtained after attempting to crop detected boxes. Returning empty fragments.")  # DEBUG
        return [], []

    # Keep crops and their boxes in two parallel lists.
    valid_crops = [crops[i] for i in valid_idxs]
    boxes_for_valid_crops = [boxes[i] for i in valid_idxs]
    print(f" DEBUG OCR SYS: Number of valid crops to process: {len(valid_crops)}")  # DEBUG

    if self.use_cls and self.classifier and classify:
        print(f" DEBUG OCR SYS: Applying classifier to {len(valid_crops)} crops...")  # DEBUG
        try:
            # The classifier returns (crops, results); the returned crops
            # replace ours only when the whole step succeeds.
            classified_crops, cls_results = self.classifier(valid_crops)
            print(f" DEBUG OCR SYS: Classifier results count: {len(cls_results)}. First few: {cls_results[:3]}")  # DEBUG
            valid_crops = classified_crops
        except Exception as e_cls:
            # Best effort: recognition continues with the unclassified crops.
            print(f" DEBUG OCR SYS: Classifier error: {e_cls}. Using unclassified crops.")  # DEBUG

    print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...")  # DEBUG
    try:
        rec_res = self.recognizer(valid_crops)
        print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}")  # DEBUG
    except Exception as e_rec:
        print(f" DEBUG OCR SYS: Recognizer error: {e_rec}. Returning empty fragments.")  # DEBUG
        return [], []

    if len(boxes_for_valid_crops) != len(rec_res):
        print(f" DEBUG OCR SYS: Mismatch! Boxes count {len(boxes_for_valid_crops)} != Recognizer results count {len(rec_res)}. This should not happen.")
    # Pair boxes with results up to the shorter length; when the counts
    # agree (the expected case) the slices cover everything.
    pair_count = min(len(boxes_for_valid_crops), len(rec_res))
    boxes_to_iterate = boxes_for_valid_crops[:pair_count]
    rec_res_to_iterate = rec_res[:pair_count]

    final_boxes, final_rec_tuples = [], []
    print(f" DEBUG OCR SYS: Filtering {len(rec_res_to_iterate)} recognition results with drop_score: {self.drop_score}")  # DEBUG
    for i, (box, res_tuple) in enumerate(zip(boxes_to_iterate, rec_res_to_iterate)):
        txt, score = res_tuple
        print(f" DEBUG OCR SYS: Box {i+1} - Recognized: '{txt}', Score: {score:.4f}")  # DEBUG
        if score >= self.drop_score and txt and not mdr_is_whitespace(txt):
            final_boxes.append(box)
            final_rec_tuples.append(res_tuple)
        else:
            reason = []
            if score < self.drop_score:
                reason.append(f"score {score:.2f} < {self.drop_score}")
            if not txt:
                reason.append("empty text")
            if txt and mdr_is_whitespace(txt):
                reason.append("whitespace text")
            print(f" DEBUG OCR SYS: Box {i+1} DROPPED. Reason(s): {', '.join(reason)}")  # DEBUG

    if self.save_crop:
        # Honour the save_crop option instead of ignoring it. Every
        # recognized crop is passed (not only the kept ones) so dropped
        # attempts can be inspected too.
        # NOTE(review): _save_crops is defined elsewhere in this class;
        # presumably it writes the crops under self.crop_dir -- confirm.
        try:
            self._save_crops(valid_crops[:pair_count], rec_res_to_iterate)
        except Exception as e_save:
            # Saving crops is a debugging aid; a failure here must not
            # throw away the OCR results computed above.
            print(f" DEBUG OCR SYS: Failed to save crops: {e_save}")  # DEBUG

    print(f" DEBUG OCR SYS: Returning {len(final_boxes)} final boxes and {len(final_rec_tuples)} final recognition results.")  # DEBUG
    return final_boxes, final_rec_tuples
|
1254 |
|
1255 |
def _save_crops(self, crops, recs):
|
1256 |
mdr_ensure_directory(self.crop_dir)
|
|
|
1567 |
def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
    """Yield one ``MDROcrFragment`` per usable text region found in ``image_np``.

    The image (a NumPy array, BGR) is passed through ``self._preprocess``
    and then to the text system obtained from ``self._get_system()``,
    which is expected to return a list of boxes and a parallel list of
    ``(text, score)`` results.

    Nothing is yielded when the system is unavailable, when it raises,
    when either list is empty, or when the two lists differ in length.
    Individual results are skipped when they are not a two-item
    list/tuple, when the text is empty or whitespace, when the box does
    not have exactly four points, or when the resulting rectangle is
    invalid or has an area of 1 or less.
    """
    ocr_system = self._get_system()
    if ocr_system is None:
        print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.")  # DEBUG
        return

    prepared = self._preprocess(image_np)
    print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {prepared.shape}")  # DEBUG

    try:
        boxes, recs = ocr_system(prepared)
    except Exception as e:
        print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}")  # DEBUG
        import traceback
        traceback.print_exc()
        return

    if not (boxes and recs):
        print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes)}) or no recs ({len(recs)}). No fragments generated.")  # DEBUG
        return

    if len(boxes) != len(recs):
        print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic.")
        return

    print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.")  # DEBUG

    emitted = 0
    for idx, (quad, rec) in enumerate(zip(boxes, recs)):
        well_formed = isinstance(rec, (list, tuple)) and len(rec) == 2
        if not well_formed:
            print(f" DEBUG OCR Engine: Rec item {idx} is not a valid (text, score) tuple: {rec}. Skipping.")
            continue

        text, score = rec
        # The text system is expected to filter these already; this is a
        # cheap second guard against empty or whitespace-only text.
        if not text or mdr_is_whitespace(text):
            continue

        corners = [(float(pt[0]), float(pt[1])) for pt in quad]
        if len(corners) != 4:
            continue

        # Corner order follows the box's point order: lt, rt, rb, lb.
        rect = MDRRectangle(lt=corners[0], rt=corners[1], rb=corners[2], lb=corners[3])
        if not (rect.is_valid and rect.area > 1):
            continue

        yield MDROcrFragment(order=-1, text=text, rank=float(score), rect=rect)
        emitted += 1

    print(f" DEBUG OCR Engine: Generated {emitted} MDROcrFragment objects.")  # DEBUG
|
1622 |
|
1623 |
def _preprocess(self, img: np.ndarray) -> np.ndarray:
|
1624 |
if len(img.shape) == 3 and img.shape[2] == 4:
|