Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed 29 days ago

Commit

a1b1436

verified ·

1 Parent(s): 8d56101

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +97 -22

mdr_pdf_parser.py CHANGED Viewed

@@ -1086,26 +1086,45 @@ class _MDR_TextRecognizer(_MDR_PredictBase):
     self.input_name = self.get_input_name(self.sess)
     self.output_name = self.get_output_name(self.sess)
-  def _resize_norm(self, img, max_r):
-    imgC, imgH, imgW = self.shape
-    h, w = img.shape[:2]
-    if h == 0 or w == 0:
         return np.zeros((imgC, imgH, imgW), dtype=np.float32)
-    r = w / float(h)
-    tw = min(imgW, int(ceil(imgH * max(r, max_r))))
-    tw = max(1, tw)
-    resized = cv2.resize(img, (tw, imgH))
     resized = resized.astype("float32")
-    if imgC == 1 and len(resized.shape) == 3:
         resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
         resized = resized[:, :, np.newaxis]
-    if len(resized.shape) == 2:
-        resized = resized[:, :, np.newaxis]
-    resized = resized.transpose((2, 0, 1)) / 255.0
-    resized -= 0.5
-    resized /= 0.5
     padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-    padding[:, :, 0:tw] = resized
     return padding
   def __call__(self, img_list):
@@ -1211,6 +1230,21 @@ class _MDR_TextSystem:
             # Continue with unclassified (but valid) crops
     print(f"  DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
     try:
         rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
         print(f"    DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
@@ -1699,17 +1733,58 @@ class MDRLayoutReader:
   """Determines reading order of layout elements using LayoutLMv3."""
   def __init__(self, model_path: str):
-    self._model_path = model_path; self._model: LayoutLMv3ForTokenClassification | None = None
-    self._device = "cuda" if torch.cuda.is_available() else "cpu"
   def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
     if self._model is None:
-      cache = mdr_ensure_directory(self._model_path); name = "microsoft/layoutlmv3-base"; h_path = os.path.join(cache, "models--hantian--layoutreader")
-      local = os.path.exists(h_path); load_p = h_path if local else name
       try:
-        self._model = LayoutLMv3ForTokenClassification.from_pretrained(load_p, cache_dir=cache, local_files_only=local, num_labels=_MDR_MAX_LEN+1)
-        self._model.to(self._device); self._model.eval(); print(f"MDR LayoutReader loaded on {self._device}.")
-      except Exception as e: print(f"ERROR loading MDR LayoutReader: {e}"); self._model = None
     return self._model
   def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]:

     self.input_name = self.get_input_name(self.sess)
     self.output_name = self.get_output_name(self.sess)
+  # _MDR_TextRecognizer: resize one crop to the model input height and right-pad it with zeros to the full (imgC, imgH, imgW) canvas.
+  def _resize_norm(self, img, max_r): # img: a single crop; max_r: lower bound applied to this crop's width/height ratio
+    imgC, imgH, imgW = self.shape # unpacked as (channels, height, width), e.g. (3, 48, 320)
+    h_orig, w_orig = img.shape[:2]
+    print(f"    DEBUG RECOGNIZER: _resize_norm input crop shape: ({h_orig}, {w_orig}), target shape: {self.shape}, max_r_batch: {max_r:.2f}")
+    if h_orig == 0 or w_orig == 0:
+        print(f"    DEBUG RECOGNIZER: _resize_norm received zero-dimension crop. Returning zeros.")
+        return np.zeros((imgC, imgH, imgW), dtype=np.float32)
+    r_current = w_orig / float(h_orig)
+    # Target width: imgH * max(crop ratio, max_r), rounded up and capped at imgW
+    tw = min(imgW, int(ceil(imgH * max(r_current, max_r))))
+    tw = max(1, tw) # Never let the target width fall below 1
+    print(f"    DEBUG RECOGNIZER: _resize_norm calculated target width (tw): {tw} for target height (imgH): {imgH}")
+    try:
+        resized = cv2.resize(img, (tw, imgH)) # cv2.resize takes the size as (width, height)
+    except Exception as e_resize:
+        print(f"    DEBUG RECOGNIZER: _resize_norm cv2.resize failed: {e_resize}. Original shape ({h_orig},{w_orig}), target ({tw},{imgH})")
+        # On resize failure, fall back to an all-zero canvas (no attempt is made to pad the unresized crop)
         return np.zeros((imgC, imgH, imgW), dtype=np.float32)
     resized = resized.astype("float32")
+    # Channel handling and value normalization follow.
+    # NOTE(review): when debugging blank results, check whether the crop is already all black/white after this step.
+    if imgC == 1 and len(resized.shape) == 3: # single-channel model but the resized crop still has a channel axis
         resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+        resized = resized[:, :, np.newaxis] # restore the trailing channel axis after the grayscale conversion
+    if len(resized.shape) == 2: # 2-D (H, W) array: add a trailing channel axis
         resized = resized[:, :, np.newaxis]
+    resized = resized.transpose((2, 0, 1)) / 255.0 # HWC -> CHW, then divide by 255 (gives [0, 1] assuming 0-255 pixel values; TODO confirm)
+    resized -= 0.5 # shift by 0.5 ...
+    resized /= 0.5 # ... then divide by 0.5; together these map [0, 1] onto [-1, 1]
     padding = np.zeros((imgC, imgH, imgW), dtype=np.float32)
+    padding[:, :, 0:tw] = resized # Copy into the left tw columns; columns to the right stay zero
+    print(f"    DEBUG RECOGNIZER: _resize_norm output padded shape: {padding.shape}")
     return padding
   def __call__(self, img_list):
             # Continue with unclassified (but valid) crops
     print(f"  DEBUG OCR SYS: Applying recognizer to {len(valid_crops)} crops...") # DEBUG
+      # ---- START TEMP CODE TO SAVE CROPS ----
+    save_crop_path_dir = Path("./temp_recognizer_crops")
+    save_crop_path_dir.mkdir(parents=True, exist_ok=True)
+    for i_crop, crop_image_np in enumerate(valid_crops):
+        try:
+            # Ensure crop_image_np is a valid image array (e.g., uint8)
+            if crop_image_np is not None and crop_image_np.size > 0:
+                 # OpenCV expects BGR if color, or grayscale
+                cv2.imwrite(str(save_crop_path_dir / f"crop_to_recognize_{self.crop_idx + i_crop}.png"), crop_image_np)
+            else:
+                print(f"    DEBUG OCR SYS: Crop {i_crop} is None or empty, not saving.")
+        except Exception as e_save:
+            print(f"    DEBUG OCR SYS: Failed to save crop {i_crop}: {e_save}")
+    print(f"  DEBUG OCR SYS: Saved {len(valid_crops)} crops for recognizer to {save_crop_path_dir}")
+    # ---- END TEMP CODE TO SAVE CROPS ----
     try:
         rec_res = self.recognizer(valid_crops) # rec_res is a list of [text, score]
         print(f"    DEBUG OCR SYS: Recognizer results count: {len(rec_res)}. First few results: {rec_res[:3]}") # DEBUG
   """Determines reading order of layout elements using LayoutLMv3."""
   def __init__(self, model_path: str):
+    self._model_path = model_path # later passed to mdr_ensure_directory() in _get_model to obtain the cache dir
+    self._model: LayoutLMv3ForTokenClassification | None = None # loaded lazily by _get_model(); None until then or if loading fails
+    # Pick the device string once at construction time: "cuda" when torch reports CUDA, otherwise "cpu"
+    if torch.cuda.is_available(): # runtime check, not a build-time flag
+        self._device = "cuda"
+        print("MDRLayoutReader: CUDA is available. Setting device to cuda.")
+    else:
+        self._device = "cpu"
+        print("MDRLayoutReader: CUDA not available. Setting device to cpu.")
   def _get_model(self) -> LayoutLMv3ForTokenClassification | None:
     if self._model is None:
+      cache = mdr_ensure_directory(self._model_path)
+      name = "microsoft/layoutlmv3-base"
+      # Lazily load the model on first call and keep it on the instance.
+      # An earlier revision preferred a local "models--hantian--layoutreader"
+      # directory under the cache dir when one existed; this version always
+      # loads the Hub id in `name` and relies on `cache_dir` for caching.
+      # NOTE(review): the base checkpoint presumably ships no head trained for
+      # num_labels=_MDR_MAX_LEN+1, so predictions may be untrained; confirm
+      # whether the fine-tuned layoutreader weights should still be used.
+      model_load_path = name # load by Hub model id
+      local_files_only_flag = False
+      # local_files_only_flag is hard-coded to False, so from_pretrained is
+      # not restricted to files already present in the cache directory.
+      # self._model_path is used only as the cache location here; it is not
+      # probed for a complete saved model (config/weights).
+      # Returns the loaded model, or None when loading raised: the error and
+      # traceback are printed and self._model stays None, so the next call
+      # retries the load.
+      # The print below records which id and cache dir are about to be used.
+      print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{model_load_path}'. Cache dir: {cache}")
       try:
+        self._model = LayoutLMv3ForTokenClassification.from_pretrained(
+            model_load_path,
+            cache_dir=cache, # downloaded files are cached under this directory
+            local_files_only=local_files_only_flag, # False: loading is not limited to local files
+            num_labels=_MDR_MAX_LEN+1 # size of the token-classification head
+        )
+        # Move the model to the device chosen in __init__ and switch to eval mode
+        self._model.to(torch.device(self._device)) # self._device is "cuda" or "cpu"
+        self._model.eval()
+        print(f"MDR LayoutReader model '{model_load_path}' loaded successfully on device: {self._model.device}.") # reports the device the model actually ended up on
+      except Exception as e:
+        print(f"ERROR loading MDR LayoutReader model '{model_load_path}': {e}")
+        import traceback
+        traceback.print_exc()
+        self._model = None
     return self._model
   def determine_reading_order(self, layouts: list[MDRLayoutElement], size: tuple[int, int]) -> list[MDRLayoutElement]: