Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +97 -22
mdr_pdf_parser.py
CHANGED
@@ -1086,26 +1086,45 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
|
|
1086 |
self.input_name = self.get_input_name(self.sess)
|
1087 |
self.output_name = self.get_output_name(self.sess)
|
1088 |
|
1089 |
-
|
1090 |
-
|
1091 |
-
|
1092 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1093 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1094 |
-
|
1095 |
-
tw = min(imgW, int(ceil(imgH * max(r, max_r))))
|
1096 |
-
tw = max(1, tw)
|
1097 |
-
resized = cv2.resize(img, (tw, imgH))
|
1098 |
resized = resized.astype("float32")
|
1099 |
-
|
|
|
|
|
1100 |
resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
|
|
|
|
|
1101 |
resized = resized[:, :, np.newaxis]
|
1102 |
-
|
1103 |
-
|
1104 |
-
resized
|
1105 |
-
resized
|
1106 |
-
|
1107 |
padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1108 |
-
padding[:, :, 0:tw] = resized
|
|
|
1109 |
return padding
|
1110 |
|
1111 |
def __call__(self, img_list):
|
@@ -1211,6 +1230,21 @@ class _MDR_TextSystem:
|
|
1211 |
# Continue with unclassified (but valid) crops
|
1212 |
|
1213 |
print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1214 |
try:
|
1215 |
rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
|
1216 |
print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
|
@@ -1699,17 +1733,58 @@ class MDRLayoutReader:
|
|
1699 |
"""Determines reading order of layout elements using LayoutLMv3."""
|
1700 |
|
1701 |
def __init__(self, model_path: str):
|
1702 |
-
self._model_path = model_path
|
1703 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1704 |
|
1705 |
def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
|
1706 |
if self._model is None:
|
1707 |
-
cache = mdr_ensure_directory(self._model_path)
|
1708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1709 |
try:
|
1710 |
-
self._model = LayoutLMv3ForTokenClassification.from_pretrained(
|
1711 |
-
|
1712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1713 |
return self._model
|
1714 |
|
1715 |
def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
|
|
|
1086 |
self.input_name = self.get_input_name(self.sess)
|
1087 |
self.output_name = self.get_output_name(self.sess)
|
1088 |
|
1089 |
+
# In class _MDR_TextRecognizer
|
1090 |
+
def _resize_norm(self, img, max_r): # img is a single crop
|
1091 |
+
imgC, imgH, imgW = self.shape # e.g., (3, 48, 320)
|
1092 |
+
h_orig, w_orig = img.shape[:2]
|
1093 |
+
print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
|
1094 |
+
|
1095 |
+
if h_orig == 0 or w_orig == 0:
|
1096 |
+
print(f" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
|
1097 |
+
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1098 |
+
|
1099 |
+
r_current = w_orig / float(h_orig)
|
1100 |
+
# tw is target width, calculated to maintain aspect ratio up to imgW, using max of current ratio and batch max ratio
|
1101 |
+
tw = min(imgW, int(ceil(imgH * max(r_current, max_r))))
|
1102 |
+
tw = max(1, tw) # Ensure target width is at least 1
|
1103 |
+
print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
|
1104 |
+
|
1105 |
+
try:
|
1106 |
+
resized = cv2.resize(img, (tw, imgH)) # Resize to (target_width, fixed_height)
|
1107 |
+
except Exception as e_resize:
|
1108 |
+
print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH})")
|
1109 |
+
# Fallback: return zeros or try to pad original without resize if resize fails
|
1110 |
return np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1111 |
+
|
|
|
|
|
|
|
1112 |
resized = resized.astype("float32")
|
1113 |
+
# ... rest of the normalization ...
|
1114 |
+
# (This part seems standard, but worth checking if the image becomes all black/white after this)
|
1115 |
+
if imgC == 1 and len(resized.shape) == 3: # if model expects grayscale but crop is color
|
1116 |
resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
|
1117 |
+
resized = resized[:, :, np.newaxis] # Add channel dim
|
1118 |
+
if len(resized.shape) == 2: # if grayscale and no channel dim
|
1119 |
resized = resized[:, :, np.newaxis]
|
1120 |
+
|
1121 |
+
resized = resized.transpose((2, 0, 1)) / 255.0 # HWC to CHW and scale to 0-1
|
1122 |
+
resized -= 0.5 # Normalize to -0.5 to 0.5
|
1123 |
+
resized /= 0.5 # Normalize to -1 to 1
|
1124 |
+
|
1125 |
padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
1126 |
+
padding[:, :, 0:tw] = resized # Place resized image into padded canvas
|
1127 |
+
print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
|
1128 |
return padding
|
1129 |
|
1130 |
def __call__(self, img_list):
|
|
|
1230 |
# Continue with unclassified (but valid) crops
|
1231 |
|
1232 |
print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
|
1233 |
+
# ---- START TEMP CODE TO SAVE CROPS ----
|
1234 |
+
save_crop_path_dir = Path("./temp_recognizer_crops")
|
1235 |
+
save_crop_path_dir.mkdir(parents=True, exist_ok=True)
|
1236 |
+
for i_crop, crop_image_np in enumerate(valid_crops):
|
1237 |
+
try:
|
1238 |
+
# Ensure crop_image_np is a valid image array (e.g., uint8)
|
1239 |
+
if crop_image_np is not None and crop_image_np.size > 0:
|
1240 |
+
# OpenCV expects BGR if color, or grayscale
|
1241 |
+
cv2.imwrite(str(save_crop_path_dir / f"crop_to_recognize_{self.crop_idx + i_crop}.png"), crop_image_np)
|
1242 |
+
else:
|
1243 |
+
print(f" DEBUG OCR SYS: Crop {i_crop} is None or empty, not saving.")
|
1244 |
+
except Exception as e_save:
|
1245 |
+
print(f" DEBUG OCR SYS: Failed to save crop {i_crop}: {e_save}")
|
1246 |
+
print(f" DEBUG OCR SYS: Saved {len(valid_crops)} crops for recognizer to {save_crop_path_dir}")
|
1247 |
+
# ---- END TEMP CODE TO SAVE CROPS ----
|
1248 |
try:
|
1249 |
rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
|
1250 |
print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
|
|
|
1733 |
"""Determines reading order of layout elements using LayoutLMv3."""
|
1734 |
|
1735 |
def __init__(self, model_path: str):
|
1736 |
+
self._model_path = model_path
|
1737 |
+
self._model: LayoutLMv3ForTokenClassification | None = None
|
1738 |
+
# Determine device more robustly, self._device will be 'cuda' or 'cpu'
|
1739 |
+
if torch.cuda.is_available(): # Check if CUDA is actually available at runtime
|
1740 |
+
self._device = "cuda"
|
1741 |
+
print("MDRLayoutReader: CUDA is available. Setting device to cuda.")
|
1742 |
+
else:
|
1743 |
+
self._device = "cpu"
|
1744 |
+
print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
|
1745 |
|
1746 |
def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
    """Return the LayoutLMv3 model, loading it on first use.

    On the first call (or after a failed attempt, since failure leaves the
    attribute as ``None``) the "microsoft/layoutlmv3-base" checkpoint is
    loaded via ``from_pretrained`` with ``self._model_path`` as the cache
    directory, moved to ``self._device`` and put in eval mode.

    Returns:
        The loaded model, or ``None`` if loading raised; the error and a
        traceback are printed in that case.
    """
    if self._model is not None:
        return self._model

    cache = mdr_ensure_directory(self._model_path)
    # NOTE(review): an earlier revision reportedly used the fine-tuned
    # 'hantian/layoutreader' checkpoint. Loading the base checkpoint with a
    # custom num_labels presumably yields an untrained classification head -
    # confirm this is the intended model.
    checkpoint = "microsoft/layoutlmv3-base"

    print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{checkpoint}'. Cache dir: {cache}")
    try:
        self._model = LayoutLMv3ForTokenClassification.from_pretrained(
            checkpoint,
            cache_dir=cache,  # downloads are cached under this directory
            local_files_only=False,  # allow fetching when not cached yet
            num_labels=_MDR_MAX_LEN + 1,  # size of the classification head
        )
        # Move to the device chosen at construction, then switch to eval mode.
        target_device = torch.device(self._device)
        self._model.to(target_device)
        self._model.eval()
        print(f"MDR LayoutReader model '{checkpoint}' loaded successfully on device: {self._model.device}.")
    except Exception as e:
        print(f"ERROR loading MDR LayoutReader model '{checkpoint}': {e}")
        import traceback

        traceback.print_exc()
        # Leave the attribute empty so callers see the failure as None.
        self._model = None
    return self._model
|
1789 |
|
1790 |
def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
|