rodrigomasini committed on
Commit
a1b1436
·
verified ·
1 Parent(s): 8d56101

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +97 -22
mdr_pdf_parser.py CHANGED
@@ -1086,26 +1086,45 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
1086
  self.input_name = self.get_input_name(self.sess)
1087
  self.output_name = self.get_output_name(self.sess)
1088
 
1089
- def _resize_norm(self, img, max_r):
1090
- imgC, imgH, imgW = self.shape
1091
- h, w = img.shape[:2]
1092
- if h == 0 or w == 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
  return np.zeros((imgC, imgH, imgW), dtype=np.float32)
1094
- r = w / float(h)
1095
- tw = min(imgW, int(ceil(imgH * max(r, max_r))))
1096
- tw = max(1, tw)
1097
- resized = cv2.resize(img, (tw, imgH))
1098
  resized = resized.astype("float32")
1099
- if imgC == 1 and len(resized.shape) == 3:
 
 
1100
  resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
 
 
1101
  resized = resized[:, :, np.newaxis]
1102
- if len(resized.shape) == 2:
1103
- resized = resized[:, :, np.newaxis]
1104
- resized = resized.transpose((2, 0, 1)) / 255.0
1105
- resized -= 0.5
1106
- resized /= 0.5
1107
  padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
1108
- padding[:, :, 0:tw] = resized
 
1109
  return padding
1110
 
1111
  def __call__(self, img_list):
@@ -1211,6 +1230,21 @@ class _MDR_TextSystem:
1211
  # Continue with unclassified (but valid) crops
1212
 
1213
  print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1214
  try:
1215
  rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
1216
  print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
@@ -1699,17 +1733,58 @@ class MDRLayoutReader:
1699
  """Determines reading order of layout elements using LayoutLMv3."""
1700
 
1701
  def __init__(self, model_path: str):
1702
- self._model_path = model_path; self._model: LayoutLMv3ForTokenClassification | None = None
1703
- self._device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
1704
 
1705
  def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
1706
  if self._model is None:
1707
- cache = mdr_ensure_directory(self._model_path); name = "microsoft/layoutlmv3-base"; h_path = os.path.join(cache, "models--hantian--layoutreader")
1708
- local = os.path.exists(h_path); load_p = h_path if local else name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709
  try:
1710
- self._model = LayoutLMv3ForTokenClassification.from_pretrained(load_p, cache_dir=cache, local_files_only=local, num_labels=_MDR_MAX_LEN+1)
1711
- self._model.to(self._device); self._model.eval(); print(f"MDR LayoutReader loaded on {self._device}.")
1712
- except Exception as e: print(f"ERROR loading MDR LayoutReader: {e}"); self._model = None
 
 
 
 
 
 
 
 
 
 
 
 
1713
  return self._model
1714
 
1715
  def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:
 
1086
  self.input_name = self.get_input_name(self.sess)
1087
  self.output_name = self.get_output_name(self.sess)
1088
 
1089
+ # In class _MDR_TextRecognizer
1090
def _resize_norm(self, img, max_r):
    """Resize one crop to the recognizer input height, normalize it, and pad it to full width.

    Args:
        img: A single crop as a NumPy image array. Only its first two
            dimensions (height, width) are inspected before resizing.
        max_r: Width/height ratio supplied by the caller; the comments in this
            file describe it as the batch maximum. It must be numeric because
            it is formatted with ``:.2f`` in the debug output.

    Returns:
        A float32 array of shape ``self.shape`` (channels, height, width).
        Columns ``0:tw`` hold the resized crop scaled by ``(x / 255 - 0.5) / 0.5``
        and the remaining columns are zero. An all-zero array is returned when
        the crop is missing, has fewer than two dimensions, has a zero-sized
        height or width, or cannot be resized.
    """
    imgC, imgH, imgW = self.shape  # e.g., (3, 48, 320)
    # Doubles as the "unusable crop" result and as the padded output canvas.
    canvas = np.zeros((imgC, imgH, imgW), dtype=np.float32)

    # A crop without a (height, width) pair cannot be resized; treat it like an
    # empty crop instead of failing on the shape unpacking below.
    if img is None or getattr(img, "ndim", 0) < 2:
        print(" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
        return canvas

    h_orig, w_orig = img.shape[:2]
    print(f" DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")

    if h_orig == 0 or w_orig == 0:
        print(" DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
        return canvas

    r_current = w_orig / float(h_orig)
    # Target width is clamped to [1, imgW].
    # NOTE(review): if max_r really is the batch-wide maximum ratio, then
    # max(r_current, max_r) resizes every crop to the widest crop's width and
    # narrow crops are stretched horizontally. Confirm this is intended; sizing
    # by r_current alone would keep each crop's own aspect ratio.
    tw = min(imgW, int(ceil(imgH * max(r_current, max_r))))
    tw = max(1, tw)
    print(f" DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")

    try:
        resized = cv2.resize(img, (tw, imgH))  # dsize is (width, height)
    except Exception as e_resize:
        # Best effort: one bad crop must not abort the whole batch.
        print(f" DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH})")
        return canvas

    resized = resized.astype("float32")
    if imgC == 1 and resized.ndim == 3:
        # Model expects grayscale but the crop is color.
        resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        resized = resized[:, :, np.newaxis]
    elif resized.ndim == 3 and resized.shape[2] > imgC:
        # More channels than the model takes (presumably a BGRA crop): keep the
        # leading channels so the assignment into the canvas cannot fail.
        resized = resized[:, :, :imgC]
    if resized.ndim == 2:
        # Grayscale without a channel axis; add one so the transpose works.
        resized = resized[:, :, np.newaxis]

    # HWC -> CHW, scale to 0..1, then shift/scale to -1..1 (for 0..255 input).
    chw = resized.transpose((2, 0, 1)) / 255.0
    chw = (chw - 0.5) / 0.5

    # A single-channel crop is broadcast across all imgC channels here.
    canvas[:, :, 0:tw] = chw
    print(f" DEBUG RECOGNIZER: _resize_norm output padded shape: {canvas.shape}")
    return canvas
1129
 
1130
  def __call__(self, img_list):
 
1230
  # Continue with unclassified (but valid) crops
1231
 
1232
  print(f" DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
1233
+ # ---- START TEMP CODE TO SAVE CROPS ----
1234
+ save_crop_path_dir = Path("./temp_recognizer_crops")
1235
+ save_crop_path_dir.mkdir(parents=True, exist_ok=True)
1236
+ for i_crop, crop_image_np in enumerate(valid_crops):
1237
+ try:
1238
+ # Ensure crop_image_np is a valid image array (e.g., uint8)
1239
+ if crop_image_np is not None and crop_image_np.size > 0:
1240
+ # OpenCV expects BGR if color, or grayscale
1241
+ cv2.imwrite(str(save_crop_path_dir / f"crop_to_recognize_{self.crop_idx + i_crop}.png"), crop_image_np)
1242
+ else:
1243
+ print(f" DEBUG OCR SYS: Crop {i_crop} is None or empty, not saving.")
1244
+ except Exception as e_save:
1245
+ print(f" DEBUG OCR SYS: Failed to save crop {i_crop}: {e_save}")
1246
+ print(f" DEBUG OCR SYS: Saved {len(valid_crops)} crops for recognizer to {save_crop_path_dir}")
1247
+ # ---- END TEMP CODE TO SAVE CROPS ----
1248
  try:
1249
  rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
1250
  print(f" DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
 
1733
  """Determines reading order of layout elements using LayoutLMv3."""
1734
 
1735
  def __init__(self, model_path: str):
1736
+ self._model_path = model_path
1737
+ self._model: LayoutLMv3ForTokenClassification | None = None
1738
+ # Determine device more robustly, self._device will be 'cuda' or 'cpu'
1739
+ if torch.cuda.is_available(): # Check if CUDA is actually available at runtime
1740
+ self._device = "cuda"
1741
+ print("MDRLayoutReader: CUDA is available. Setting device to cuda.")
1742
+ else:
1743
+ self._device = "cpu"
1744
+ print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
1745
 
1746
  def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
1747
  if self._model is None:
1748
+ cache = mdr_ensure_directory(self._model_path)
1749
+ name = "microsoft/layoutlmv3-base"
1750
+ # The h_path was for a specific fine-tuned model 'hantian/layoutreader'
1751
+ # If you intend to use a specific fine-tuned head, ensure it's correctly downloaded
1752
+ # and compatible. For now, let's assume microsoft/layoutlmv3-base is the target
1753
+ # if a more specific one isn't found or intended.
1754
+ # The original code had a slightly confusing h_path logic.
1755
+ # Let's simplify to prioritize a local cache of "microsoft/layoutlmv3-base"
1756
+ # or a specific model if `self._model_path` points to a complete model directory.
1757
+
1758
+ model_load_path = name # Default to Hugging Face model name
1759
+ local_files_only_flag = False
1760
+
1761
+ # Check if self._model_path is a directory containing a full model
1762
+ # (e.g., config.json, pytorch_model.bin)
1763
+ # This part of the original logic for 'h_path' was a bit specific.
1764
+ # For LayoutLMv3, usually, you'd just use "microsoft/layoutlmv3-base"
1765
+ # and let transformers handle caching, or provide a path to a fully saved model.
1766
+
1767
+ # Let's assume the primary goal is to load "microsoft/layoutlmv3-base"
1768
+ # and allow it to be cached in `self._model_path/layoutreader`
1769
+ # The `cache_dir` argument to `from_pretrained` handles this.
1770
+
1771
+ print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{model_load_path}'. Cache dir: {cache}")
1772
  try:
1773
+ self._model = LayoutLMv3ForTokenClassification.from_pretrained(
1774
+ model_load_path,
1775
+ cache_dir=cache, # Transformers will cache here
1776
+ local_files_only=local_files_only_flag, # Set to True if you want to force local only after first download
1777
+ num_labels=_MDR_MAX_LEN+1 # This is for the classification head
1778
+ )
1779
+ # Explicitly move model to the determined device
1780
+ self._model.to(torch.device(self._device)) # MODIFIED LINE
1781
+ self._model.eval()
1782
+ print(f"MDR LayoutReader model '{model_load_path}' loaded successfully on device: {self._model.device}.") # Use model.device
1783
+ except Exception as e:
1784
+ print(f"ERROR loading MDR LayoutReader model '{model_load_path}': {e}")
1785
+ import traceback
1786
+ traceback.print_exc()
1787
+ self._model = None
1788
  return self._model
1789
 
1790
  def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]: