Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +135 -42
mdr_pdf_parser.py
CHANGED
@@ -2221,27 +2221,98 @@ class MDRLayoutReader:
|
|
2221 |
bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
|
2222 |
return bbox_list
|
2223 |
|
2224 |
-
|
|
|
|
|
|
|
|
|
|
|
2225 |
layout_map = defaultdict(list)
|
2226 |
-
|
2227 |
-
|
2228 |
-
|
2229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2230 |
nfo = 0
|
2231 |
-
for
|
2232 |
-
|
2233 |
-
if not frags:
|
2234 |
continue
|
2235 |
-
|
2236 |
-
|
2237 |
-
|
2238 |
-
|
2239 |
-
|
2240 |
-
|
2241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2242 |
frag.order = nfo
|
2243 |
nfo += 1
|
2244 |
-
|
|
|
2245 |
|
2246 |
def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
|
2247 |
heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
|
@@ -2586,52 +2657,74 @@ class MDRExtractionEngine:
|
|
2586 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
2587 |
|
2588 |
# --- MODIFIED _get_yolo_model METHOD for HF ---
|
2589 |
-
|
2590 |
-
|
2591 |
-
|
|
|
|
|
2592 |
repo_id = "hantian/yolo-doclaynet"
|
2593 |
filename = "yolov10b-doclaynet.pt"
|
2594 |
-
|
2595 |
-
yolo_cache_dir = Path(self._model_dir) / "
|
2596 |
-
mdr_ensure_directory(str(yolo_cache_dir))
|
2597 |
|
2598 |
print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
|
2599 |
print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
|
2600 |
|
2601 |
try:
|
2602 |
-
# Download the model file using huggingface_hub, caching it
|
2603 |
yolo_model_filepath = hf_hub_download(
|
2604 |
repo_id=repo_id,
|
2605 |
filename=filename,
|
2606 |
-
cache_dir=yolo_cache_dir,
|
2607 |
-
local_files_only=False,
|
2608 |
-
force_download=False,
|
2609 |
)
|
2610 |
print(f"YOLO model file path: {yolo_model_filepath}")
|
2611 |
|
2612 |
-
#
|
2613 |
-
|
2614 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2615 |
|
2616 |
-
|
2617 |
-
|
2618 |
-
print(
|
2619 |
-
f"ERROR: Failed to download/access YOLO model via Hugging Face Hub: {e}") # Slightly updated message
|
2620 |
self._yolo = None
|
2621 |
-
except FileNotFoundError as e: #
|
2622 |
-
print(f"ERROR: YOLO model file not found
|
2623 |
self._yolo = None
|
2624 |
-
except Exception as e:
|
2625 |
-
|
2626 |
-
|
2627 |
-
|
2628 |
self._yolo = None
|
2629 |
|
2630 |
-
elif YOLOv10 is None:
|
2631 |
-
print("MDR YOLOv10 class not available. Layout detection skipped.")
|
2632 |
-
|
2633 |
return self._yolo
|
2634 |
|
|
|
2635 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
2636 |
"""Analyzes a single page image to extract layout and content."""
|
2637 |
print(" Engine: Analyzing image...")
|
|
|
2221 |
bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
|
2222 |
return bbox_list
|
2223 |
|
2224 |
+
# In class MDRLayoutReader
|
2225 |
+
|
2226 |
+
def _apply_order(self, original_layouts_list: list[MDRLayoutElement],
                 ordered_bbox_list_with_final_orders: list[_MDR_ReaderBBox]) -> list[MDRLayoutElement]:
    """Reorder layouts, and the fragments inside them, by reader bbox order.

    Args:
        original_layouts_list: Layouts in their original order. Each bbox's
            ``layout_index`` is used as an index into this list.
        ordered_bbox_list_with_final_orders: Reader bboxes. The attributes
            read here are ``layout_index``, ``fragment_index``, ``order``
            and ``virtual``.

    Returns:
        A new list of layouts sorted by the median ``order`` of each
        layout's bboxes. Each layout's ``fragments`` list is re-sorted, and
        every fragment's ``order`` is set to a running counter that spans
        all returned layouts.

    NOTE(review): a layout that has no bbox at all in
    ``ordered_bbox_list_with_final_orders`` does not appear in the returned
    list -- confirm that callers expect this.
    """
    # Group the reader bboxes by the index of the layout they belong to.
    layout_map = defaultdict(list)
    for bbox_item in ordered_bbox_list_with_final_orders:
        layout_map[bbox_item.layout_index].append(bbox_item)

    # Rank layouts by the median order of their bboxes (virtual ones included).
    layout_median_orders = [
        (layout_idx, self._median([b.order for b in layout_bboxes]))
        for layout_idx, layout_bboxes in layout_map.items()
        if layout_bboxes
    ]
    layout_median_orders.sort(key=lambda pair: pair[1])

    # Resolve every index up front so that a bad layout_index fails before
    # any layout has been mutated.
    final_sorted_layouts = [original_layouts_list[idx] for idx, _ in layout_median_orders]

    # nfo (next fragment order) is a running counter across all layouts.
    nfo = 0
    # The original index travels with each layout. Recovering it with
    # list.index() would be O(n) per layout and, being equality-based, could
    # return a different layout that merely compares equal.
    for (layout_idx, _), layout_obj in zip(layout_median_orders, final_sorted_layouts):
        if not layout_obj.fragments:  # Nothing to order inside this layout.
            continue

        # Only non-virtual bboxes correspond to actual fragments.
        frag_idx_to_new_order = {
            b.fragment_index: b.order
            for b in layout_map[layout_idx]
            if not b.virtual
        }

        if frag_idx_to_new_order:
            # Sort by the order recorded for each fragment's original index.
            # Fragments without a bbox go last, keeping their relative order
            # (sorted() is stable).
            ranked = sorted(
                enumerate(layout_obj.fragments),
                key=lambda item: frag_idx_to_new_order.get(item[0], float('inf')),
            )
            layout_obj.fragments = [frag for _, frag in ranked]
        else:
            # Only virtual bboxes were recorded for this layout.
            print(
                f" LayoutReader ApplyOrder: No reader_bboxes for layout (orig_idx {layout_idx}). Sorting frags geometrically.")
            layout_obj.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))  # Fallback geometric sort

        # Assign the final absolute order.
        for frag in layout_obj.fragments:
            frag.order = nfo
            nfo += 1

    return final_sorted_layouts
|
2316 |
|
2317 |
def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
|
2318 |
heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
|
|
|
2657 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
2658 |
|
2659 |
# --- MODIFIED _get_yolo_model METHOD for HF ---
|
2660 |
+
# In class MDRExtractionEngine:
|
2661 |
+
|
2662 |
+
def _get_yolo_model(self) -> Any | None:  # Return type can be ultralytics.YOLO
    """Return the YOLOv10b-DocLayNet layout detection model, loading it on first use.

    The weights file is fetched with ``hf_hub_download`` into a cache
    directory under ``self._model_dir``. It is loaded with
    ``ultralytics.YOLO``; if that load fails and the module-level
    ``YOLOv10`` wrapper is available, the wrapper is tried as a fallback.

    Returns:
        The loaded model, which is also stored on ``self._yolo``, or
        ``None`` when the download or every load attempt failed. Failures
        are printed, not raised, and ``self._yolo`` stays ``None`` so a
        later call retries.
    """
    if self._yolo is not None:
        return self._yolo

    repo_id = "hantian/yolo-doclaynet"
    filename = "yolov10b-doclaynet.pt"

    yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
    mdr_ensure_directory(str(yolo_cache_dir))

    print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
    print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")

    # --- Step 1: obtain the weights file (the only step that needs the Hub) ---
    try:
        yolo_model_filepath = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=yolo_cache_dir,
            local_files_only=False,
            force_download=False,
        )
    except HfHubHTTPError as e:
        print(f"ERROR: Failed to download/access YOLO model '{filename}' via Hugging Face Hub: {e}")
        self._yolo = None
        return None
    except FileNotFoundError as e:  # This might be redundant if hf_hub_download raises its own error
        print(f"ERROR: YOLO model file '{filename}' not found after download attempt: {e}")
        self._yolo = None
        return None
    except Exception as e:  # Boundary catch-all: layout detection is skipped rather than crashing
        print(f"ERROR: An unexpected issue occurred related to YOLO model file handling for {filename}: {e}")
        import traceback
        traceback.print_exc()
        self._yolo = None
        return None
    print(f"YOLO model file path: {yolo_model_filepath}")

    # --- Step 2: import ultralytics ---
    # The import has its own try so that an ImportError raised later, while
    # the checkpoint is being loaded, is not misreported as "library not
    # found".
    try:
        from ultralytics import YOLO as UltralyticsYOLO
    except ImportError:
        print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
        print("Please install it: pip install ultralytics")
        self._yolo = None
        return None  # Critical failure

    # --- Step 3: load with ultralytics.YOLO ---
    try:
        self._yolo = UltralyticsYOLO(yolo_model_filepath)
    except Exception as e_ultra:  # Any loading error falls through to the wrapper fallback
        print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {e_ultra}")
        self._yolo = None
    else:
        print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
        return self._yolo

    # --- Step 4: fall back to the doclayout_yolo wrapper, if it is available ---
    # This is likely to fail too if ultralytics.YOLO failed because of the
    # model structure.
    if YOLOv10 is None:
        return None

    print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
    try:
        self._yolo = YOLOv10(yolo_model_filepath)
    except Exception as e_wrapper:
        print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {e_wrapper}")
        self._yolo = None
        return None
    print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
    return self._yolo
|
2726 |
|
2727 |
+
|
2728 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
2729 |
"""Analyzes a single page image to extract layout and content."""
|
2730 |
print(" Engine: Analyzing image...")
|