rodrigomasini committed on
Commit
8d56101
·
verified ·
1 Parent(s): d8adf3f

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +133 -37
mdr_pdf_parser.py CHANGED
@@ -1159,42 +1159,98 @@ class _MDR_TextSystem:
1159
  try: return list(sorted(boxes, key=key))
1160
  except: return list(boxes) # Fallback
1161
 
1162
- def __call__(self, img, classify=True):
1163
  ori_im = img.copy()
1164
- boxes = self.detector(img)
 
 
1165
  if boxes is None or len(boxes) == 0:
1166
- return [], []
1167
- boxes = self._sort_boxes(boxes)
 
 
 
 
1168
  crops = []
1169
- for b in boxes:
1170
  try:
1171
- crops.append(mdr_get_rotated_crop(ori_im, b)) # Use renamed util
1172
- except:
 
 
 
 
 
 
 
 
 
 
 
1173
  crops.append(None)
1174
- valid_idxs = [i for i, c in enumerate(crops) if c is not None]
 
1175
  if not valid_idxs:
 
1176
  return [], []
1177
- crops = [crops[i] for i in valid_idxs]
1178
- boxes = [boxes[i] for i in valid_idxs]
 
 
 
 
1179
  if self.use_cls and self.classifier and classify:
 
1180
  try:
1181
- crops, _ = self.classifier(crops) # Ignore cls results, just use rotated crops
1182
- except Exception as e:
1183
- print(f"Classifier error: {e}")
 
 
 
 
 
 
1184
  try:
1185
- rec_res = self.recognizer(crops)
1186
- except Exception as e:
1187
- print(f"Recognizer error: {e}")
1188
- return boxes, [["", 0.0]] * len(boxes)
1189
- final_boxes, final_rec = [], []
1190
- for box, res in zip(boxes, rec_res):
1191
- txt, score = res
1192
- if score >= self.drop_score:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1193
  final_boxes.append(box)
1194
- final_rec.append(res)
1195
- if self.save_crop:
1196
- self._save_crops(crops, rec_res)
1197
- return final_boxes, final_rec
 
 
 
 
 
 
 
 
 
 
 
 
1198
 
1199
  def _save_crops(self, crops, recs):
1200
  mdr_ensure_directory(self.crop_dir)
@@ -1511,18 +1567,58 @@ class MDROcrEngine:
1511
  def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
1512
  """Finds and recognizes text fragments in a NumPy image (BGR)."""
1513
  system = self._get_system()
1514
- if system is None: print("MDR OCR System unavailable."); return
1515
- img = self._preprocess(image_np)
1516
- try: boxes, recs = system(img)
1517
- except Exception as e: print(f"MDR OCR prediction error: {e}"); return
1518
- if not boxes or not recs: return
1519
- for box_pts, (txt, conf) in zip(boxes, recs):
1520
- if not txt or mdr_is_whitespace(txt) or conf < 0.1: continue
1521
- pts = [(float(p[0]), float(p[1])) for p in box_pts]
1522
- if len(pts) == 4:
1523
- r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
1524
- if r.is_valid and r.area > 1:
1525
- yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1526
 
1527
  def _preprocess(self, img: np.ndarray) -> np.ndarray:
1528
  if len(img.shape) == 3 and img.shape[2] == 4:
 
1159
  try: return list(sorted(boxes, key=key))
1160
  except: return list(boxes) # Fallback
1161
 
1162
+ def __call__(self, img, classify=True): # classify is True by default
1163
  ori_im = img.copy()
1164
+ print(f" DEBUG OCR SYS: _MDR_TextSystem called. Original image shape: {ori_im.shape}") # DEBUG
1165
+ boxes = self.detector(img) # This is _MDR_TextDetector
1166
+
1167
  if boxes is None or len(boxes) == 0:
1168
+ print(" DEBUG OCR SYS: Detector returned no boxes. Returning empty fragments.") # DEBUG
1169
+ return [], [] # This is what currently leads to "0 fragments found" if detector fails
1170
+
1171
+ print(f" DEBUG OCR SYS: Detector returned {len(boxes)} boxes. Proceeding to crop and recognize.") # DEBUG
1172
+ boxes = self._sort_boxes(boxes) # Sorting happens here
1173
+
1174
  crops = []
1175
+ for i, b in enumerate(boxes):
1176
  try:
1177
+ crop_img = mdr_get_rotated_crop(ori_im, b)
1178
+ if crop_img is None:
1179
+ print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} is None.") # DEBUG
1180
+ crops.append(None)
1181
+ elif crop_img.shape[0] == 0 or crop_img.shape[1] == 0:
1182
+ print(f" DEBUG OCR SYS: Crop {i+1}/{len(boxes)} has zero dimension: {crop_img.shape}") # DEBUG
1183
+ crops.append(None)
1184
+ else:
1185
+ crops.append(crop_img)
1186
+ # Optionally save these crops for manual inspection:
1187
+ # if self.save_crop: cv2.imwrite(os.path.join(self.crop_dir, f"debug_crop_before_cls_{self.crop_idx + i}.png"), crop_img)
1188
+ except Exception as e_crop:
1189
+ print(f" DEBUG OCR SYS: Error cropping box {i+1}/{len(boxes)}: {e_crop}") # DEBUG
1190
  crops.append(None)
1191
+
1192
+ valid_idxs = [i for i, c in enumerate(crops) if c is not None and c.shape[0] > 0 and c.shape[1] > 0]
1193
  if not valid_idxs:
1194
+ print(" DEBUG OCR SYS: No valid crops obtained after attempting to crop detected boxes. Returning empty fragments.") # DEBUG
1195
  return [], []
1196
+
1197
+ # Filter crops and corresponding boxes
1198
+ valid_crops = [crops[i] for i in valid_idxs]
1199
+ boxes_for_valid_crops = [boxes[i] for i in valid_idxs]
1200
+ print(f" DEBUG OCR SYS: Number of valid crops to process: {len(valid_crops)}") # DEBUG
1201
+
1202
  if self.use_cls and self.classifier and classify:
1203
+ print(f" DEBUG OCR SYS: Applying classifier to {len(valid_crops)} crops...") # DEBUG
1204
  try:
1205
+ # The classifier might modify valid_crops in-place (e.g., rotate them)
1206
+ classified_crops, cls_results = self.classifier(valid_crops) # classifier returns list, results
1207
+ print(f" DEBUG OCR SYS: Classifier results count: {len(cls_results)}. First few: {cls_results[:3]}") # DEBUG
1208
+ valid_crops = classified_crops # Update with potentially rotated crops
1209
+ except Exception as e_cls:
1210
+ print(f" DEBUG OCR SYS: Classifier error: {e_cls}. Using unclassified crops.") # DEBUG
1211
+ # Continue with unclassified (but valid) crops
1212
+
1213
+ print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
1214
  try:
1215
+ rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
1216
+ print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
1217
+ except Exception as e_rec:
1218
+ print(f" DEBUG OCR SYS: Recognizer error: {e_rec}. Returning empty fragments.") # DEBUG
1219
+ return [], [] # If recognizer fails, we can't proceed
1220
+
1221
+ final_boxes, final_rec_tuples = [], [] # Changed final_rec to final_rec_tuples
1222
+ if len(boxes_for_valid_crops) != len(rec_res):
1223
+ print(f" DEBUG OCR SYS: Mismatch! Boxes count {len(boxes_for_valid_crops)} != Recognizer results count {len(rec_res)}. This should not happen.")
1224
+ # Handle this gracefully, perhaps by taking the minimum length
1225
+ min_len = min(len(boxes_for_valid_crops), len(rec_res))
1226
+ boxes_to_iterate = boxes_for_valid_crops[:min_len]
1227
+ rec_res_to_iterate = rec_res[:min_len]
1228
+ else:
1229
+ boxes_to_iterate = boxes_for_valid_crops
1230
+ rec_res_to_iterate = rec_res
1231
+
1232
+ print(f" DEBUG OCR SYS: Filtering {len(rec_res_to_iterate)} recognition results with drop_score: {self.drop_score}") # DEBUG
1233
+ for i, (box, res_tuple) in enumerate(zip(boxes_to_iterate, rec_res_to_iterate)):
1234
+ txt, score = res_tuple
1235
+ print(f" DEBUG OCR SYS: Box {i+1} - Recognized: '{txt}', Score: {score:.4f}") # DEBUG
1236
+ if score >= self.drop_score and txt and not mdr_is_whitespace(txt): # Added check for non-empty/whitespace
1237
  final_boxes.append(box)
1238
+ final_rec_tuples.append(res_tuple)
1239
+ else:
1240
+ reason = []
1241
+ if score < self.drop_score: reason.append(f"score {score:.2f} < {self.drop_score}")
1242
+ if not txt: reason.append("empty text")
1243
+ if txt and mdr_is_whitespace(txt): reason.append("whitespace text")
1244
+ print(f" DEBUG OCR SYS: Box {i+1} DROPPED. Reason(s): {', '.join(reason)}") # DEBUG
1245
+
1246
+
1247
+ if self.save_crop: # This is false by default in _MDR_ONNXParams
1248
+ # Ensure crop_dir exists if you enable this
1249
+ # self._save_crops(valid_crops, rec_res) # Pass original rec_res to save all attempts if needed
1250
+ pass
1251
+
1252
+ print(f" DEBUG OCR SYS: Returning {len(final_boxes)} final boxes and {len(final_rec_tuples)} final recognition results.") # DEBUG
1253
+ return final_boxes, final_rec_tuples # Ensure this returns tuples of (text, score)
1254
 
1255
  def _save_crops(self, crops, recs):
1256
  mdr_ensure_directory(self.crop_dir)
 
1567
  def find_text_fragments(self, image_np: np.ndarray) -> Generator[MDROcrFragment, None, None]:
1568
  """Finds and recognizes text fragments in a NumPy image (BGR)."""
1569
  system = self._get_system()
1570
+ if system is None:
1571
+ print(" DEBUG OCR Engine: MDR OCR System unavailable. No fragments will be found.") # DEBUG
1572
+ return # Empty generator
1573
+
1574
+ img_for_system = self._preprocess(image_np) # _preprocess handles BGR/BGRA/GRAY to BGR
1575
+ print(f" DEBUG OCR Engine: Image preprocessed for TextSystem. Shape: {img_for_system.shape}") # DEBUG
1576
+
1577
+ try:
1578
+ # system.__call__ should return (list_of_boxes, list_of_tuples_text_score)
1579
+ boxes, recs = system(img_for_system) # recs should be list of (text, score)
1580
+ except Exception as e:
1581
+ print(f" DEBUG OCR Engine: Error during TextSystem prediction: {e}") # DEBUG
1582
+ import traceback
1583
+ traceback.print_exc()
1584
+ return # Empty generator
1585
+
1586
+ if not boxes or not recs:
1587
+ print(f" DEBUG OCR Engine: TextSystem returned no boxes ({len(boxes)}) or no recs ({len(recs)}). No fragments generated.") # DEBUG
1588
+ return # Empty generator
1589
+
1590
+ if len(boxes) != len(recs):
1591
+ print(f" DEBUG OCR Engine: Mismatch between boxes ({len(boxes)}) and recs ({len(recs)}) from TextSystem. This is problematic.")
1592
+ # Potentially try to recover by taking the minimum length, or just return
1593
+ return
1594
+
1595
+ print(f" DEBUG OCR Engine: TextSystem returned {len(boxes)} boxes and {len(recs)} recognition results. Converting to MDROcrFragment.") # DEBUG
1596
+ fragments_generated_count = 0
1597
+ for i, (box_pts, rec_tuple) in enumerate(zip(boxes, recs)):
1598
+ if not isinstance(rec_tuple, (list, tuple)) or len(rec_tuple) != 2:
1599
+ print(f" DEBUG OCR Engine: Rec item {i} is not a valid (text, score) tuple: {rec_tuple}. Skipping.")
1600
+ continue
1601
+
1602
+ txt, conf = rec_tuple
1603
+ # The filtering by drop_score and whitespace should have happened in _MDR_TextSystem
1604
+ # But we can add a redundant check or rely on it.
1605
+ # For MDROcrFragment, we just need valid text and geometry.
1606
+ if not txt or mdr_is_whitespace(txt): # Basic check, though system should filter
1607
+ # print(f" DEBUG OCR Engine: Fragment {i} has empty/whitespace text after system call. Text: '{txt}'. Skipping.")
1608
+ continue
1609
+
1610
+ pts = [(float(p[0]), float(p[1])) for p in box_pts]
1611
+ if len(pts) == 4:
1612
+ r = MDRRectangle(lt=pts[0], rt=pts[1], rb=pts[2], lb=pts[3])
1613
+ if r.is_valid and r.area > 1: # Ensure valid geometry
1614
+ yield MDROcrFragment(order=-1, text=txt, rank=float(conf), rect=r)
1615
+ fragments_generated_count += 1
1616
+ # else:
1617
+ # print(f" DEBUG OCR Engine: Fragment {i} has invalid/small rectangle. Area: {r.area}. Valid: {r.is_valid}. Skipping.")
1618
+ # else:
1619
+ # print(f" DEBUG OCR Engine: Fragment {i} box_pts not length 4: {len(pts)}. Skipping.")
1620
+
1621
+ print(f" DEBUG OCR Engine: Generated {fragments_generated_count} MDROcrFragment objects.") # DEBUG
1622
 
1623
  def _preprocess(self, img: np.ndarray) -> np.ndarray:
1624
  if len(img.shape) == 3 and img.shape[2] == 4: