Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +84 -59
mdr_pdf_parser.py
CHANGED
@@ -26,7 +26,6 @@ import fitz # PyMuPDF
|
|
26 |
from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
|
27 |
import numpy as np
|
28 |
import cv2 # OpenCV
|
29 |
-
import torch # PyTorch
|
30 |
import requests # For downloading models
|
31 |
from pathlib import Path
|
32 |
from enum import auto, Enum
|
@@ -46,7 +45,7 @@ from alphabet_detector import AlphabetDetector
|
|
46 |
from munch import Munch
|
47 |
from transformers import LayoutLMv3ForTokenClassification
|
48 |
import onnxruntime
|
49 |
-
|
50 |
# --- Hugging Face Hub imports: needed only when running in Spaces; not required in production ---
|
51 |
from huggingface_hub import hf_hub_download
|
52 |
from huggingface_hub.errors import HfHubHTTPError
|
@@ -1396,91 +1395,109 @@ class _MDR_TextSystem:
|
|
1396 |
self.save_crop = getattr(args, 'save_crop_res', False)
|
1397 |
self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
|
1398 |
|
1399 |
-
|
|
|
1400 |
def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
|
1401 |
-
|
1402 |
-
Processes an image to detect and recognize text.
|
1403 |
-
Args:
|
1404 |
-
img: A NumPy array representing the image (BGR format).
|
1405 |
-
Returns:
|
1406 |
-
A tuple containing:
|
1407 |
-
- A list of detected text bounding boxes (each box is a NumPy array of 4 points).
|
1408 |
-
- A list of recognition results (each result is a tuple of [text, confidence_score]).
|
1409 |
-
"""
|
1410 |
-
ori_im = img.copy() # Keep original for cropping
|
1411 |
|
1412 |
-
|
1413 |
-
# The detector's __call__ method handles its own preprocessing.
|
1414 |
-
# dt_boxes are expected to be in original image coordinates.
|
1415 |
-
dt_boxes: np.ndarray = self.detector(img) # This is an np.ndarray of shape (N, 4, 2) or empty
|
1416 |
print(
|
1417 |
f" DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
|
1418 |
-
|
1419 |
-
if dt_boxes is None or dt_boxes.size == 0: # Check if array is empty
|
1420 |
return [], []
|
1421 |
|
1422 |
-
# 2. Sort boxes (typically top-to-bottom, left-to-right)
|
1423 |
dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
|
1424 |
print(f" DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
|
1425 |
-
|
1426 |
-
if not dt_boxes_sorted: # If sorting resulted in empty list (e.g. due to unexpected format)
|
1427 |
return [], []
|
1428 |
|
1429 |
-
# 3. Get cropped images from detected boxes
|
1430 |
img_crop_list: list[np.ndarray] = []
|
1431 |
for i in range(len(dt_boxes_sorted)):
|
1432 |
-
# dt_boxes_sorted[i] is a single box (e.g., 4x2 array of points)
|
1433 |
crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
|
1434 |
-
|
1435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1436 |
|
1437 |
-
#
|
1438 |
-
|
1439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1440 |
print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
|
1441 |
-
img_crop_list, cls_results = self.classifier(img_crop_list)
|
1442 |
print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")
|
1443 |
|
1444 |
-
# 5. Recognize text in the (potentially rotated) cropped images
|
1445 |
-
# The recognizer's __call__ method handles its own preprocessing.
|
1446 |
rec_results: list[tuple[str, float]] = []
|
1447 |
-
|
1448 |
-
|
1449 |
-
|
1450 |
-
|
1451 |
-
print(f" DEBUG TextSystem: No crops to recognize.")
|
1452 |
|
1453 |
-
# 6. Filter results
|
1454 |
final_boxes_to_return: list[np.ndarray] = []
|
1455 |
final_recs_to_return: list[tuple[str, float]] = []
|
1456 |
final_crops_for_saving: list[np.ndarray] = []
|
1457 |
|
1458 |
-
|
|
|
|
|
|
|
1459 |
for i in range(len(rec_results)):
|
1460 |
text, confidence = rec_results[i]
|
1461 |
-
|
1462 |
-
|
1463 |
-
|
1464 |
-
|
1465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1466 |
else:
|
1467 |
-
print(f" DEBUG TextSystem:
|
1468 |
-
f"len(rec_results)={len(rec_results)
|
1469 |
f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
|
1470 |
-
f"len(img_crop_list)={len(img_crop_list)}.
|
1471 |
-
|
|
|
|
|
1472 |
|
1473 |
print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")
|
1474 |
|
1475 |
-
# 7. (Optional) Save cropped images that passed all filters
|
1476 |
if self.save_crop and final_crops_for_saving:
|
1477 |
print(f" DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
|
1478 |
self._save_crops(final_crops_for_saving, final_recs_to_return)
|
1479 |
|
1480 |
return final_boxes_to_return, final_recs_to_return
|
1481 |
|
1482 |
-
# --- END: CORRECTED/ADDED __call__ METHOD ---
|
1483 |
-
|
1484 |
def _sort_boxes(self, boxes):
|
1485 |
if boxes is None or len(boxes) == 0: return []
|
1486 |
|
@@ -2437,14 +2454,22 @@ class MDRImageOptimizer:
|
|
2437 |
in fragments if (r := f.rect)]
|
2438 |
|
2439 |
def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
|
2440 |
-
if self._rot_ctx is None
|
2441 |
-
|
2442 |
-
|
2443 |
-
|
2444 |
-
|
2445 |
-
|
2446 |
-
|
2447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2448 |
|
2449 |
|
2450 |
# --- MDR Image Clipping ---
|
|
|
26 |
from fitz import Document as FitzDocument, Page as FitzPage, Matrix as FitzMatrix
|
27 |
import numpy as np
|
28 |
import cv2 # OpenCV
|
|
|
29 |
import requests # For downloading models
|
30 |
from pathlib import Path
|
31 |
from enum import auto, Enum
|
|
|
45 |
from munch import Munch
|
46 |
from transformers import LayoutLMv3ForTokenClassification
|
47 |
import onnxruntime
|
48 |
+
|
49 |
# --- Hugging Face Hub imports: needed only when running in Spaces; not required in production ---
|
50 |
from huggingface_hub import hf_hub_download
|
51 |
from huggingface_hub.errors import HfHubHTTPError
|
|
|
1395 |
self.save_crop = getattr(args, 'save_crop_res', False)
|
1396 |
self.crop_dir = getattr(args, 'crop_res_save_dir', "./output/mdr_crop_res")
|
1397 |
|
1398 |
+
# In class _MDR_TextSystem:
|
1399 |
+
|
1400 |
def __call__(self, img: np.ndarray) -> tuple[list[np.ndarray], list[tuple[str, float]]]:
    """Detect text boxes in ``img``, recognize their text, and filter the results.

    Steps: run ``self.detector``, order the boxes with ``self._sort_boxes``,
    cut one crop per box with ``mdr_get_rotated_crop``, optionally pass the
    crops through ``self.classifier`` (when ``self.use_cls`` is set), run
    ``self.recognizer`` on the crops, then keep only results whose confidence
    is at least ``self.drop_score`` and whose text is non-empty and not
    whitespace-only.

    Args:
        img: Image array handed to the detector; crops are taken from a copy
            of it. (An earlier docstring described it as BGR -- confirm with
            the detector's expectations.)

    Returns:
        A ``(boxes, recs)`` pair of parallel lists: the kept boxes and their
        ``(text, confidence)`` recognition results. Both lists are empty when
        nothing is detected, no box yields a usable crop, or the recognizer
        returns a number of results that does not match the number of crops.
    """
    ori_im = img.copy()  # crop from a private copy, not from the array given to the detector

    dt_boxes: np.ndarray = self.detector(img)
    print(
        f" DEBUG TextSystem: Detector found {len(dt_boxes) if dt_boxes is not None and dt_boxes.size > 0 else 0} initial boxes.")
    if dt_boxes is None or dt_boxes.size == 0:
        return [], []

    dt_boxes_sorted: list[np.ndarray] = self._sort_boxes(dt_boxes)
    print(f" DEBUG TextSystem: Sorted {len(dt_boxes_sorted)} boxes.")
    if not dt_boxes_sorted:
        return [], []

    # Build the box list and the crop list in lockstep, in a single pass, so
    # index i always refers to the same detection in both lists. Boxes whose
    # crop is missing or degenerate (height or width <= 1) are dropped.
    valid_boxes: list[np.ndarray] = []
    img_crop_list: list[np.ndarray] = []
    for i, box_pts in enumerate(dt_boxes_sorted):
        crop_im = mdr_get_rotated_crop(ori_im, box_pts)
        if crop_im is not None and crop_im.shape[0] > 1 and crop_im.shape[1] > 1:
            valid_boxes.append(box_pts)
            img_crop_list.append(crop_im)
        else:
            print(
                f" DEBUG TextSystem: Crop for box {i} (pts: {box_pts}) was None or too small. Skipping this box.")
    dt_boxes_sorted = valid_boxes

    print(f" DEBUG TextSystem: Created {len(img_crop_list)} valid crops for further processing.")

    if not img_crop_list:
        print(" DEBUG TextSystem: No valid crops generated. Returning empty.")
        return [], []

    if self.use_cls and self.classifier is not None:
        print(f" DEBUG TextSystem: Applying text classification for {len(img_crop_list)} crops.")
        # The classifier returns the (possibly modified) crops plus its own results.
        img_crop_list, cls_results = self.classifier(img_crop_list)
        print(f" DEBUG TextSystem: Classification complete. {len(cls_results if cls_results else [])} results.")

    print(f" DEBUG TextSystem: Recognizing text for {len(img_crop_list)} crops.")
    rec_results: list[tuple[str, float]] = self.recognizer(img_crop_list)
    print(f" DEBUG TextSystem: Recognizer returned {len(rec_results)} results.")

    # Boxes, crops and recognition results are matched by position; if the
    # lengths disagree, any pairing would be wrong, so return nothing.
    if not (len(rec_results) == len(dt_boxes_sorted) and len(rec_results) == len(img_crop_list)):
        print(f" DEBUG TextSystem: CRITICAL MISMATCH in lengths after recognition! "
              f"len(rec_results)={len(rec_results)}, "
              f"len(dt_boxes_sorted)={len(dt_boxes_sorted)}, "
              f"len(img_crop_list)={len(img_crop_list)}. "
              f"This indicates an issue in crop generation or recognizer batching. No results will be returned.")
        return [], []

    final_boxes_to_return: list[np.ndarray] = []
    final_recs_to_return: list[tuple[str, float]] = []
    final_crops_for_saving: list[np.ndarray] = []

    print(
        f" DEBUG TextSystem: Filtering {len(rec_results)} recognition results with drop_score: {self.drop_score}")
    for i, (box_pts, rec, crop_im) in enumerate(zip(dt_boxes_sorted, rec_results, img_crop_list)):
        text, confidence = rec
        print(f" DEBUG TextSystem: Rec item {i} - Text: '{text}', Confidence: {confidence:.4f}")

        # Written as "not >=" (rather than "<") so the comparison behaves
        # exactly like the original acceptance test for every value.
        if not confidence >= self.drop_score:
            print(
                f" DEBUG TextSystem: Item {i} REJECTED (confidence {confidence:.4f} < drop_score {self.drop_score}).")
            continue
        if not text or mdr_is_whitespace(text):
            print(f" DEBUG TextSystem: Item {i} REJECTED (empty/whitespace text).")
            continue

        final_boxes_to_return.append(box_pts)
        final_recs_to_return.append(rec)
        if self.save_crop:
            final_crops_for_saving.append(crop_im)

    print(f" DEBUG TextSystem: Kept {len(final_boxes_to_return)} boxes after recognition and filtering.")

    if self.save_crop and final_crops_for_saving:
        print(f" DEBUG TextSystem: Saving {len(final_crops_for_saving)} filtered crops.")
        self._save_crops(final_crops_for_saving, final_recs_to_return)

    return final_boxes_to_return, final_recs_to_return
|
1500 |
|
|
|
|
|
1501 |
def _sort_boxes(self, boxes):
|
1502 |
if boxes is None or len(boxes) == 0: return []
|
1503 |
|
|
|
2454 |
in fragments if (r := f.rect)]
|
2455 |
|
2456 |
def finalize_layout_coords(self, layouts: list[MDRLayoutElement]):
    """Map fragment and layout rectangles back to origin coordinates if required.

    Does nothing when there is no rotation context (``self._rot_ctx`` is
    ``None``) or when ``self._adjust_points`` is true. Otherwise the stored
    original fragment rectangles are restored and every layout rectangle is
    rebuilt from its corners passed through ``self._rot_ctx.to_origin``.

    Args:
        layouts: Layout elements whose ``rect`` is rewritten in place.
    """
    rot_ctx = self._rot_ctx
    if rot_ctx is None:
        # No rotation context, nothing to do.
        return

    if self._adjust_points:
        # Coordinates are kept as they are in this mode; per the original
        # note, the adjustments already happened in receive_fragments.
        return

    # Restore the original fragment rectangles, but only when the saved list
    # lines up one-to-one with the current fragments.
    saved_rects = rot_ctx.fragment_origin_rectangles
    if len(self._fragments) == len(saved_rects):
        for fragment, saved_rect in zip(self._fragments, saved_rects):
            fragment.rect = saved_rect

    # Adjust layout rectangles to origin coordinates, corner by corner.
    to_origin = rot_ctx.to_origin
    for layout in layouts:
        rect = layout.rect
        if not rect:
            # Skip layouts without a usable rectangle.
            continue
        layout.rect = MDRRectangle(
            lt=to_origin.adjust(rect.lt),
            rt=to_origin.adjust(rect.rt),
            lb=to_origin.adjust(rect.lb),
            rb=to_origin.adjust(rect.rb),
        )
|
2473 |
|
2474 |
|
2475 |
# --- MDR Image Clipping ---
|