rodrigomasini committed on
Commit
0029256
·
verified ·
1 Parent(s): f6abc87

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +40 -10
mdr_pdf_parser.py CHANGED
@@ -1013,7 +1013,8 @@ class _MDR_TextDetector(_MDR_PredictBase):
1013
  new_boxes.append(box)
1014
  return np.array(new_boxes)
1015
 
1016
- # In class _MDR_TextDetector:
 
1017
  def __call__(self, img):
1018
  ori_im = img.copy()
1019
  data = {"image": img}
@@ -1026,14 +1027,14 @@ class _MDR_TextDetector(_MDR_PredictBase):
1026
  print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
1027
  import traceback
1028
  traceback.print_exc()
1029
- return np.array([]) # Return empty array on failure
1030
 
1031
  if data is None:
1032
  print(
1033
  " DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
1034
  return np.array([])
1035
 
1036
- processed_img, shape_list = data # shape_list is [src_h, src_w, ratio_h, ratio_w]
1037
  if processed_img is None:
1038
  print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
1039
  return np.array([])
@@ -1052,7 +1053,7 @@ class _MDR_TextDetector(_MDR_PredictBase):
1052
  print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
1053
  import traceback
1054
  traceback.print_exc()
1055
- return np.array([]) # Return empty array on failure
1056
  print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
1057
 
1058
  preds = {"maps": outputs[0]}
@@ -1064,17 +1065,46 @@ class _MDR_TextDetector(_MDR_PredictBase):
1064
  traceback.print_exc()
1065
  return np.array([])
1066
 
1067
- if not post_res or not post_res[0].get('points'):
1068
- print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no points.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1069
  return np.array([])
 
1070
 
1071
- boxes_from_post = post_res[0]['points']
1072
  print(
1073
  f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
1074
 
1075
- if not isinstance(boxes_from_post, (list, np.ndarray)) or len(
1076
- boxes_from_post) == 0: # Check if it's empty or not list-like
1077
- print(" DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter.")
 
 
 
1078
  return np.array([])
1079
 
1080
  if self.args.det_box_type == 'poly':
 
1013
  new_boxes.append(box)
1014
  return np.array(new_boxes)
1015
 
1016
+ # In class _MDR_TextDetector:
1017
+
1018
  def __call__(self, img):
1019
  ori_im = img.copy()
1020
  data = {"image": img}
 
1027
  print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
1028
  import traceback
1029
  traceback.print_exc()
1030
+ return np.array([])
1031
 
1032
  if data is None:
1033
  print(
1034
  " DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
1035
  return np.array([])
1036
 
1037
+ processed_img, shape_list = data
1038
  if processed_img is None:
1039
  print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
1040
  return np.array([])
 
1053
  print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
1054
  import traceback
1055
  traceback.print_exc()
1056
+ return np.array([])
1057
  print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
1058
 
1059
  preds = {"maps": outputs[0]}
 
1065
  traceback.print_exc()
1066
  return np.array([])
1067
 
1068
+ # --- START: REFINED CHECK ---
1069
+ # 1. Check if post_res itself is valid and contains the expected structure.
1070
+ if not post_res or not isinstance(post_res, list) or len(post_res) == 0 or \
1071
+ not isinstance(post_res[0], dict) or 'points' not in post_res[0]:
1072
+ print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned invalid or empty structure for points.")
1073
+ return np.array([])
1074
+
1075
+ boxes_from_post = post_res[0]['points'] # This is expected to be a np.ndarray or a list of boxes
1076
+
1077
+ # 2. Check if boxes_from_post is actually empty.
1078
+ # For a NumPy array, check its size. For a list, check if it's empty.
1079
+ no_boxes_found = False
1080
+ if isinstance(boxes_from_post, np.ndarray):
1081
+ if boxes_from_post.size == 0:
1082
+ no_boxes_found = True
1083
+ elif isinstance(boxes_from_post, list):
1084
+ if not boxes_from_post: # Empty list
1085
+ no_boxes_found = True
1086
+ elif boxes_from_post is None: # Explicitly check for None
1087
+ no_boxes_found = True
1088
+ else:
1089
+ # Should not happen if _MDR_DBPostProcess behaves as expected, but good to log
1090
+ print(
1091
+ f" DEBUG OCR: _MDR_TextDetector: 'points' from DBPostProcess is of unexpected type: {type(boxes_from_post)}")
1092
+ return np.array([])
1093
+
1094
+ if no_boxes_found:
1095
+ print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no actual point data.")
1096
  return np.array([])
1097
+ # --- END: REFINED CHECK ---
1098
 
 
1099
  print(
1100
  f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
1101
 
1102
+ # The following check might be redundant now but can be kept for extra safety
1103
+ # or if boxes_from_post could be other types not handled above.
1104
+ if not isinstance(boxes_from_post, (list, np.ndarray)) or \
1105
+ (isinstance(boxes_from_post, np.ndarray) and boxes_from_post.size == 0) or \
1106
+ (isinstance(boxes_from_post, list) and not boxes_from_post):
1107
+ print(" DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter (secondary check).")
1108
  return np.array([])
1109
 
1110
  if self.args.det_box_type == 'poly':