rodrigomasini committed on
Commit
763903d
·
verified ·
1 Parent(s): 0029256

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +84 -59
mdr_pdf_parser.py CHANGED
@@ -26,7 +26,6 @@ import fitz # PyMuPDF
26
  from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
27
  import numpy as np
28
  import cv2 # OpenCV
29
- import torch # PyTorch
30
  import requests # For downloading models
31
  from pathlib import Path
32
  from enum import auto, Enum
@@ -46,7 +45,7 @@ from alphabet_detector import AlphabetDetector
46
  from munch import Munch
47
  from transformers import LayoutLMv3ForTokenClassification
48
  import onnxruntime
49
- from enum import auto, Enum
50
  # --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
51
  from huggingface_hub import hf_hub_download
52
  from huggingface_hub.errors import HfHubHTTPError
@@ -1396,91 +1395,109 @@ class _MDR_TextSystem:
1396
  self.save_crop = getattr(args, 'save_crop_res', False)
1397
  self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
1398
 
1399
- # --- START: CORRECTED/ADDED __call__ METHOD ---
 
1400
  def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
1401
- """
1402
- Processes an image to detect and recognize text.
1403
- Args:
1404
- img: A NumPy array representing the image (BGR format).
1405
- Returns:
1406
- A tuple containing:
1407
- - A list of detected text bounding boxes (each box is a NumPy array of 4 points).
1408
- - A list of recognition results (each result is a tuple of [text, confidence_score]).
1409
- """
1410
- ori_im = img.copy() # Keep original for cropping
1411
 
1412
- # 1. Detect text boxes using self.detector
1413
- # The detector's __call__ method handles its own preprocessing.
1414
- # dt_boxes are expected to be in original image coordinates.
1415
- dt_boxes: np.ndarray = self.detector(img) # This is an np.ndarray of shape (N, 4, 2) or empty
1416
  print(
1417
  f" DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
1418
-
1419
- if dt_boxes is None or dt_boxes.size == 0: # Check if array is empty
1420
  return [], []
1421
 
1422
- # 2. Sort boxes (typically top-to-bottom, left-to-right)
1423
  dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
1424
  print(f" DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
1425
-
1426
- if not dt_boxes_sorted: # If sorting resulted in empty list (e.g. due to unexpected format)
1427
  return [], []
1428
 
1429
- # 3. Get cropped images from detected boxes
1430
  img_crop_list: list[np.ndarray] = []
1431
  for i in range(len(dt_boxes_sorted)):
1432
- # dt_boxes_sorted[i] is a single box (e.g., 4x2 array of points)
1433
  crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
1434
- img_crop_list.append(crop_im)
1435
- print(f" DEBUG TextSystem: Created {len(img_crop_list)} crops for further processing.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1436
 
1437
- # 4. (Optional) Classify text orientation and rotate crops if necessary
1438
- # The classifier's __call__ method handles its own preprocessing and modifies img_crop_list in place.
1439
- if self.use_cls and self.classifier is not None and img_crop_list:
 
 
 
 
 
 
 
1440
  print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
1441
- img_crop_list, cls_results = self.classifier(img_crop_list) # classifier updates img_crop_list
1442
  print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
1443
 
1444
- # 5. Recognize text in the (potentially rotated) cropped images
1445
- # The recognizer's __call__ method handles its own preprocessing.
1446
  rec_results: list[tuple[str, float]] = []
1447
- if img_crop_list:
1448
- print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
1449
- rec_results = self.recognizer(img_crop_list)
1450
- else:
1451
- print(f" DEBUG TextSystem: No crops to recognize.")
1452
 
1453
- # 6. Filter results
1454
  final_boxes_to_return: list[np.ndarray] = []
1455
  final_recs_to_return: list[tuple[str, float]] = []
1456
  final_crops_for_saving: list[np.ndarray] = []
1457
 
1458
- if rec_results and len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
 
 
 
1459
  for i in range(len(rec_results)):
1460
  text, confidence = rec_results[i]
1461
- if confidence >= self.drop_score and text and not mdr_is_whitespace(text):
1462
- final_boxes_to_return.append(dt_boxes_sorted[i])
1463
- final_recs_to_return.append(rec_results[i])
1464
- if self.save_crop:
1465
- final_crops_for_saving.append(img_crop_list[i])
 
 
 
 
 
 
 
 
 
 
1466
  else:
1467
- print(f" DEBUG TextSystem: Warning - Mismatch or empty rec_results. "
1468
- f"len(rec_results)={len(rec_results) if rec_results else 'None'}, "
1469
  f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
1470
- f"len(img_crop_list)={len(img_crop_list)}. No results will be returned from this stage.")
1471
- # Do not return here, allow empty lists to propagate if that's the case
 
 
1472
 
1473
  print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
1474
 
1475
- # 7. (Optional) Save cropped images that passed all filters
1476
  if self.save_crop and final_crops_for_saving:
1477
  print(f" DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
1478
  self._save_crops(final_crops_for_saving, final_recs_to_return)
1479
 
1480
  return final_boxes_to_return, final_recs_to_return
1481
 
1482
- # --- END: CORRECTED/ADDED __call__ METHOD ---
1483
-
1484
  def _sort_boxes(self, boxes):
1485
  if boxes is None or len(boxes) == 0: return []
1486
 
@@ -2437,14 +2454,22 @@ class MDRImageOptimizer:
2437
  in fragments if (r := f.rect)]
2438
 
2439
  def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
2440
- if self._rot_ctx is None or self._adjust_points: return
2441
- if len(self._fragments) == len(self._rot_ctx.fragment_origin_rectangles): [setattr(f, 'rect', orig_r) for
2442
- f, orig_r in zip(self._fragments,
2443
- self._rot_ctx.fragment_origin_rectangles)]
2444
- adj = self._rot_ctx.to_origin;
2445
- [setattr(l, 'rect',
2446
- MDRRectangle(lt=adj.adjust(r.lt), rt=adj.adjust(r.rt), lb=adj.adjust(r.lb), rb=adj.adjust(r.rb))) for l
2447
- in layouts if (r := l.rect)]
 
 
 
 
 
 
 
 
2448
 
2449
 
2450
  # --- MDR Image Clipping ---
 
26
  from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
27
  import numpy as np
28
  import cv2 # OpenCV
 
29
  import requests # For downloading models
30
  from pathlib import Path
31
  from enum import auto, Enum
 
45
  from munch import Munch
46
  from transformers import LayoutLMv3ForTokenClassification
47
  import onnxruntime
48
+
49
  # --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
50
  from huggingface_hub import hf_hub_download
51
  from huggingface_hub.errors import HfHubHTTPError
 
1395
  self.save_crop = getattr(args, 'save_crop_res', False)
1396
  self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
1397
 
1398
+ # In class _MDR_TextSystem:
1399
+
1400
  def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
1401
+ ori_im = img.copy()
 
 
 
 
 
 
 
 
 
1402
 
1403
+ dt_boxes: np.ndarray = self.detector(img)
 
 
 
1404
  print(
1405
  f" DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
1406
+ if dt_boxes is None or dt_boxes.size == 0:
 
1407
  return [], []
1408
 
 
1409
  dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
1410
  print(f" DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
1411
+ if not dt_boxes_sorted:
 
1412
  return [], []
1413
 
 
1414
  img_crop_list: list[np.ndarray] = []
1415
  for i in range(len(dt_boxes_sorted)):
 
1416
  crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
1417
+ # Ensure crop_im is not empty or too small before adding
1418
+ if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
1419
+ img_crop_list.append(crop_im)
1420
+ else:
1421
+ print(
1422
+ f" DEBUG TextSystem: Crop {i} was None or too small, skipping. Original box: {dt_boxes_sorted[i]}")
1423
+ # To maintain correspondence, we might need to handle this more carefully
1424
+ # For now, this might lead to length mismatches if not all crops are valid.
1425
+ # A better approach might be to filter dt_boxes_sorted alongside img_crop_list creation.
1426
+
1427
+ # Let's refine the filtering of boxes and creation of crops to ensure they always match
1428
+ valid_boxes_for_cropping: list[np.ndarray] = []
1429
+ img_crop_list_refined: list[np.ndarray] = []
1430
+ for i, box_pts in enumerate(dt_boxes_sorted):
1431
+ crop_im = mdr_get_rotated_crop(ori_im, box_pts)
1432
+ if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1: # Min height/width for a crop
1433
+ valid_boxes_for_cropping.append(box_pts)
1434
+ img_crop_list_refined.append(crop_im)
1435
+ else:
1436
+ print(
1437
+ f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
1438
 
1439
+ dt_boxes_sorted = valid_boxes_for_cropping # Update dt_boxes_sorted to only include those that yielded valid crops
1440
+ img_crop_list = img_crop_list_refined # Use the refined list of crops
1441
+
1442
+ print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")
1443
+
1444
+ if not img_crop_list: # If no valid crops were made
1445
+ print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
1446
+ return [], []
1447
+
1448
+ if self.use_cls and self.classifier is not None: # No need to check img_crop_list again, it's guaranteed non-empty here
1449
  print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
1450
+ img_crop_list, cls_results = self.classifier(img_crop_list)
1451
  print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
1452
 
 
 
1453
  rec_results: list[tuple[str, float]] = []
1454
+ # No need to check img_crop_list again
1455
+ print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
1456
+ rec_results = self.recognizer(img_crop_list)
1457
+ print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")
 
1458
 
 
1459
  final_boxes_to_return: list[np.ndarray] = []
1460
  final_recs_to_return: list[tuple[str, float]] = []
1461
  final_crops_for_saving: list[np.ndarray] = []
1462
 
1463
+ # Ensure lengths match before iterating. This is crucial.
1464
+ if len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list):
1465
+ print(
1466
+ f" DEBUG TextSystem: Filtering {len(rec_results)} recognition results with drop_score: {self.drop_score}")
1467
  for i in range(len(rec_results)):
1468
  text, confidence = rec_results[i]
1469
+
1470
+ # Log each result before filtering
1471
+ print(f" DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")
1472
+
1473
+ if confidence >= self.drop_score:
1474
+ if text and not mdr_is_whitespace(text):
1475
+ final_boxes_to_return.append(dt_boxes_sorted[i])
1476
+ final_recs_to_return.append(rec_results[i])
1477
+ if self.save_crop:
1478
+ final_crops_for_saving.append(img_crop_list[i])
1479
+ else:
1480
+ print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
1481
+ else:
1482
+ print(
1483
+ f" DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
1484
  else:
1485
+ print(f" DEBUG TextSystem: CRITICAL MISMATCH in lengths after recognition! "
1486
+ f"len(rec_results)={len(rec_results)}, "
1487
  f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
1488
+ f"len(img_crop_list)={len(img_crop_list)}. "
1489
+ f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
1490
+ # Return empty if critical mismatch, as indexing will fail or be incorrect.
1491
+ return [], []
1492
 
1493
  print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
1494
 
 
1495
  if self.save_crop and final_crops_for_saving:
1496
  print(f" DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
1497
  self._save_crops(final_crops_for_saving, final_recs_to_return)
1498
 
1499
  return final_boxes_to_return, final_recs_to_return
1500
 
 
 
1501
  def _sort_boxes(self, boxes):
1502
  if boxes is None or len(boxes) == 0: return []
1503
 
 
2454
  in fragments if (r := f.rect)]
2455
 
2456
  def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
2457
+ if self._rot_ctx is None: # If no rotation context, nothing to do
2458
+ return
2459
+
2460
+ if not self._adjust_points: # If we are NOT adjusting points back to original,
2461
+ # then restore original fragment rectangles
2462
+ if len(self._fragments) == len(self._rot_ctx.fragment_origin_rectangles):
2463
+ for f, orig_r in zip(self._fragments, self._rot_ctx.fragment_origin_rectangles):
2464
+ f.rect = orig_r
2465
+ # And adjust layout rectangles to origin coordinates
2466
+ adj = self._rot_ctx.to_origin
2467
+ for l in layouts:
2468
+ if (r := l.rect): # Check if rect exists
2469
+ l.rect = MDRRectangle(lt=adj.adjust(r.lt), rt=adj.adjust(r.rt), lb=adj.adjust(r.lb),
2470
+ rb=adj.adjust(r.rb))
2471
+ # If self._adjust_points is True, the coordinates (already adjusted to the rotated image) are kept as is.
2472
+ # No further action is needed for the True case here, as the adjustments happened in receive_fragments.
2473
 
2474
 
2475
  # --- MDR Image Clipping ---