Update utils.py
utils.py CHANGED
@@ -91,32 +91,19 @@ def get_yolo_model(model_path):
 
 @torch.inference_mode()
 def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
-    """
-    Generates parsed textual content for detected icons from the image.
-
-    Args:
-        filtered_boxes: Tensor of bounding boxes.
-        starting_idx: Starting index for non-OCR boxes.
-        image_source: Original image as a NumPy array.
-        caption_model_processor: Dictionary with keys 'model' and 'processor'.
-        prompt: Optional prompt text.
-        batch_size: Batch size for processing.
-
-    Returns:
-        List of generated texts.
-    """
+    # Now batch_size defaults to 32 if not provided
     to_pil = ToPILImage()
     if starting_idx:
         non_ocr_boxes = filtered_boxes[starting_idx:]
     else:
         non_ocr_boxes = filtered_boxes
     cropped_pil_images = []
-    for coord in non_ocr_boxes:
+    for i, coord in enumerate(non_ocr_boxes):
         xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
         ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
         cropped_image = image_source[ymin:ymax, xmin:xmax, :]
         cropped_pil_images.append(to_pil(cropped_image))
-
+
     model, processor = caption_model_processor['model'], caption_model_processor['processor']
     if not prompt:
         if 'florence' in model.config.name_or_path:
@@ -127,7 +114,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
     generated_texts = []
     device = model.device
     for i in range(0, len(cropped_pil_images), batch_size):
-        batch = cropped_pil_images[i:i+batch_size]
+        batch = cropped_pil_images[i:i + batch_size]
         if model.device.type == 'cuda':
             inputs = processor(images=batch, text=[prompt] * len(batch), return_tensors="pt").to(device=device, dtype=torch.float16)
         else:
@@ -156,6 +143,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
     return generated_texts
 
 
+
 def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
     """
     Generates parsed textual content for detected icons using the phi3_v model variant.
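For reference, a minimal usage sketch of the changed function follows. It is hedged: the Florence-2 checkpoint name, the dummy image, and the example boxes are illustrative assumptions, while the {'model': ..., 'processor': ...} dictionary shape, the normalized-xyxy crop math, and the call signature come from the diff above.

# Illustrative sketch only; the checkpoint name and dummy inputs are assumptions.
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

from utils import get_parsed_content_icon

# Florence-2 matches the 'florence' branch of the default-prompt logic above.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True)
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True)
caption_model_processor = {'model': model, 'processor': processor}

# Boxes are normalized xyxy: on a 640x480 image, [0.25, 0.50, 0.75, 1.00]
# crops pixels x in [160, 480) and y in [240, 480).
filtered_boxes = torch.tensor([[0.25, 0.50, 0.75, 1.00],
                               [0.10, 0.10, 0.30, 0.30]])
image_source = np.zeros((480, 640, 3), dtype=np.uint8)  # H x W x C uint8 array

# starting_idx is truthiness-checked, so 0 means every box gets captioned;
# a positive index skips the OCR-derived boxes before it.
texts = get_parsed_content_icon(filtered_boxes, 0, image_source,
                                caption_model_processor)
print(texts)

Because the function slices cropped_pil_images in steps of batch_size, the default of 32 keeps per-batch memory bounded no matter how many icons were detected.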