ginipick committed on
Commit
c2f47fd
·
verified ·
1 Parent(s): 23a3bee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -78
app.py CHANGED
@@ -7,6 +7,7 @@ from PIL import Image
7
  import io
8
  import base64, os
9
  from huggingface_hub import snapshot_download
 
10
 
11
  # Import 유틸리티 함수들
12
  from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
@@ -15,28 +16,50 @@ from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processo
15
  repo_id = "microsoft/OmniParser-v2.0" # HF repository ID
16
  local_dir = "weights" # Local directory for weights
17
 
18
- snapshot_download(repo_id=repo_id, local_dir=local_dir)
19
- print(f"Repository downloaded to: {local_dir}")
 
 
 
 
20
 
21
- # Load models
22
- yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
23
- caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption")
24
- # Alternative caption model (BLIP2) can be used as below:
25
- # caption_model_processor = get_caption_model_processor(model_name="blip2", model_name_or_path="weights/icon_caption_blip2")
 
 
 
 
 
 
26
 
27
  # Markdown header text
28
  MARKDOWN = """
29
  # OmniParser V2 Pro🔥
 
 
 
 
 
30
  """
31
 
32
- DEVICE = torch.device('cuda')
 
33
 
34
  # Custom CSS for UI enhancement
35
  custom_css = """
36
  body { background-color: #f0f2f5; }
37
- .gradio-container { font-family: 'Segoe UI', sans-serif; }
38
  h1, h2, h3, h4 { color: #283E51; }
39
- button { border-radius: 6px; }
 
 
 
 
 
 
40
  """
41
 
42
  @spaces.GPU
@@ -47,14 +70,22 @@ def process(
47
  iou_threshold,
48
  use_paddleocr,
49
  imgsz
50
- ) -> Optional[tuple]:
51
- # 입력값 검증
 
 
52
  if image_input is None:
53
- return None, "Please upload an image for processing."
54
 
55
  try:
 
 
 
 
56
  # Calculate overlay ratio based on input image width
57
- box_overlay_ratio = image_input.size[0] / 3200
 
 
58
  draw_bbox_config = {
59
  'text_scale': 0.8 * box_overlay_ratio,
60
  'text_thickness': max(int(2 * box_overlay_ratio), 1),
@@ -62,94 +93,170 @@ def process(
62
  'thickness': max(int(3 * box_overlay_ratio), 1),
63
  }
64
 
65
- # Run OCR bounding box detection
66
- ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
67
- image_input,
68
- display_img=False,
69
- output_bb_format='xyxy',
70
- goal_filtering=None,
71
- easyocr_args={'paragraph': False, 'text_threshold': 0.9},
72
- use_paddleocr=use_paddleocr
73
- )
74
- text, ocr_bbox = ocr_bbox_rslt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # Get labeled image and parsed content via SOM (YOLO + caption model)
77
- dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
78
- image_input,
79
- yolo_model,
80
- BOX_TRESHOLD=box_threshold,
81
- output_coord_in_ratio=True,
82
- ocr_bbox=ocr_bbox,
83
- draw_bbox_config=draw_bbox_config,
84
- caption_model_processor=caption_model_processor,
85
- ocr_text=text,
86
- iou_threshold=iou_threshold,
87
- imgsz=imgsz
88
- )
 
 
 
 
 
 
 
 
 
89
 
90
  # Decode processed image from base64
91
- image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
92
- print('Finish processing image.')
 
 
 
 
93
 
94
  # Format parsed content list into a multi-line string
95
- parsed_text = "\n".join([f"icon {i}: {v}" for i, v in enumerate(parsed_content_list)])
 
 
 
 
 
 
 
 
96
  return image, parsed_text
 
97
  except Exception as e:
98
- print(f"Error during processing: {str(e)}")
99
- return None, f"Error: {str(e)}"
 
 
100
 
101
  # Build Gradio UI with enhanced layout and functionality
102
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
103
  gr.Markdown(MARKDOWN)
104
 
105
  with gr.Row():
106
- # 쒌츑 μ‚¬μ΄λ“œλ°” (μ•„μ½”λ””μ–Έ ν˜•νƒœ) : μ—…λ‘œλ“œ 및 μ„€μ •
107
  with gr.Column(scale=1):
108
- with gr.Accordion("Upload Image & Settings", open=True):
109
  image_input_component = gr.Image(
110
  type='pil',
111
- label='Upload Image',
112
  elem_id="input_image"
113
  )
114
- gr.Markdown("### Detection Settings")
115
- box_threshold_component = gr.Slider(
116
- label='Box Threshold',
117
- minimum=0.01, maximum=1.0, step=0.01, value=0.05,
118
- info="Minimum confidence for bounding boxes."
119
- )
120
- iou_threshold_component = gr.Slider(
121
- label='IOU Threshold',
122
- minimum=0.01, maximum=1.0, step=0.01, value=0.1,
123
- info="Threshold for non-maximum suppression overlap."
124
- )
125
- use_paddleocr_component = gr.Checkbox(
126
- label='Use PaddleOCR', value=True,
127
- info="Toggle between PaddleOCR and EasyOCR."
128
- )
129
- imgsz_component = gr.Slider(
130
- label='Icon Detect Image Size',
131
- minimum=640, maximum=1920, step=32, value=640,
132
- info="Resize input image for icon detection."
133
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  submit_button_component = gr.Button(
135
- value='Process Image', variant='primary'
 
 
136
  )
 
 
 
 
 
 
 
 
 
137
 
138
- # 우츑 메인 μ˜μ—­ : κ²°κ³Ό νƒ­
139
  with gr.Column(scale=2):
140
  with gr.Tabs():
141
- with gr.Tab("Output Image"):
142
  image_output_component = gr.Image(
143
- type='pil', label='Processed Image'
 
 
144
  )
145
- with gr.Tab("Parsed Text"):
146
- text_output_component = gr.Textbox(
147
- label='Parsed Screen Elements',
148
- placeholder='The structured elements will appear here.',
149
- lines=10
150
  )
 
 
 
151
 
152
- # 버튼 클릭 시 프로세스 실행 (로딩 스피너 적용)
153
  submit_button_component.click(
154
  fn=process,
155
  inputs=[
@@ -159,8 +266,39 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
159
  use_paddleocr_component,
160
  imgsz_component
161
  ],
162
- outputs=[image_output_component, text_output_component]
 
163
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Launch with queue support
166
- demo.queue().launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
7
  import io
8
  import base64, os
9
  from huggingface_hub import snapshot_download
10
+ import traceback
11
 
12
  # Import 유틸리티 함수들
13
  from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
 
16
  repo_id = "microsoft/OmniParser-v2.0" # HF repository ID
17
  local_dir = "weights" # Local directory for weights
18
 
19
+ # Check if weights already exist to avoid re-downloading
20
+ if not os.path.exists(local_dir):
21
+ snapshot_download(repo_id=repo_id, local_dir=local_dir)
22
+ print(f"Repository downloaded to: {local_dir}")
23
+ else:
24
+ print(f"Weights already exist at: {local_dir}")
25
 
26
+ # Load models with error handling
27
+ try:
28
+ yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
29
+ caption_model_processor = get_caption_model_processor(
30
+ model_name="florence2",
31
+ model_name_or_path="weights/icon_caption"
32
+ )
33
+ print("Models loaded successfully")
34
+ except Exception as e:
35
+ print(f"Error loading models: {e}")
36
+ raise
37
 
38
  # Markdown header text
39
  MARKDOWN = """
40
  # OmniParser V2 Pro🔥
41
+
42
+ <div style="background-color: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
43
+ <p style="margin: 0;">🎯 <strong>AI-powered screen understanding tool</strong> that detects UI elements and extracts text with high accuracy.</p>
44
+ <p style="margin: 5px 0 0 0;">πŸ“ Supports both PaddleOCR and EasyOCR for flexible text extraction.</p>
45
+ </div>
46
  """
47
 
48
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
49
+ print(f"Using device: {DEVICE}")
50
 
51
  # Custom CSS for UI enhancement
52
  custom_css = """
53
  body { background-color: #f0f2f5; }
54
+ .gradio-container { font-family: 'Segoe UI', sans-serif; max-width: 1400px; margin: auto; }
55
  h1, h2, h3, h4 { color: #283E51; }
56
+ button { border-radius: 6px; transition: all 0.3s ease; }
57
+ button:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(0,0,0,0.15); }
58
+ .output-image { border: 2px solid #e1e4e8; border-radius: 8px; }
59
+ #input_image { border: 2px dashed #4a90e2; border-radius: 8px; }
60
+ #input_image:hover { border-color: #2c5aa0; }
61
+ .gr-box { border-radius: 8px; }
62
+ .gr-padded { padding: 16px; }
63
  """
64
 
65
  @spaces.GPU
 
70
  iou_threshold,
71
  use_paddleocr,
72
  imgsz
73
+ ) -> tuple:
74
+ """Process image with error handling and validation"""
75
+
76
+ # Input validation
77
  if image_input is None:
78
+ return None, "⚠️ Please upload an image for processing."
79
 
80
  try:
81
+ # Log processing parameters
82
+ print(f"Processing with parameters: box_threshold={box_threshold}, "
83
+ f"iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
84
+
85
  # Calculate overlay ratio based on input image width
86
+ image_width = image_input.size[0]
87
+ box_overlay_ratio = max(0.5, min(2.0, image_width / 3200)) # Clamp ratio between 0.5 and 2.0
88
+
89
  draw_bbox_config = {
90
  'text_scale': 0.8 * box_overlay_ratio,
91
  'text_thickness': max(int(2 * box_overlay_ratio), 1),
 
93
  'thickness': max(int(3 * box_overlay_ratio), 1),
94
  }
95
 
96
+ # Run OCR bounding box detection with error handling
97
+ try:
98
+ ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
99
+ image_input,
100
+ display_img=False,
101
+ output_bb_format='xyxy',
102
+ goal_filtering=None,
103
+ easyocr_args={'paragraph': False, 'text_threshold': 0.9},
104
+ use_paddleocr=use_paddleocr
105
+ )
106
+
107
+ # Handle None result from OCR
108
+ if ocr_bbox_rslt is None:
109
+ print("OCR returned None, using empty results")
110
+ text, ocr_bbox = [], []
111
+ else:
112
+ text, ocr_bbox = ocr_bbox_rslt
113
+
114
+ # Validate OCR results
115
+ if text is None:
116
+ text = []
117
+ if ocr_bbox is None:
118
+ ocr_bbox = []
119
+
120
+ print(f"OCR found {len(text)} text regions")
121
+
122
+ except Exception as e:
123
+ print(f"OCR error: {e}, continuing with empty OCR results")
124
+ text, ocr_bbox = [], []
125
 
126
  # Get labeled image and parsed content via SOM (YOLO + caption model)
127
+ try:
128
+ dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
129
+ image_input,
130
+ yolo_model,
131
+ BOX_TRESHOLD=box_threshold,
132
+ output_coord_in_ratio=True,
133
+ ocr_bbox=ocr_bbox if ocr_bbox else [], # Ensure it's never None
134
+ draw_bbox_config=draw_bbox_config,
135
+ caption_model_processor=caption_model_processor,
136
+ ocr_text=text if text else [], # Ensure it's never None
137
+ iou_threshold=iou_threshold,
138
+ imgsz=imgsz
139
+ )
140
+
141
+ if dino_labled_img is None:
142
+ raise ValueError("Failed to generate labeled image")
143
+
144
+ except Exception as e:
145
+ print(f"Error in SOM processing: {e}")
146
+ # Return original image with error message if SOM fails
147
+ return image_input, f"⚠️ Error during element detection: {str(e)}"
148
 
149
  # Decode processed image from base64
150
+ try:
151
+ image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
152
+ print('Successfully decoded processed image')
153
+ except Exception as e:
154
+ print(f"Error decoding image: {e}")
155
+ return image_input, f"⚠️ Error decoding processed image: {str(e)}"
156
 
157
  # Format parsed content list into a multi-line string
158
+ if parsed_content_list and len(parsed_content_list) > 0:
159
+ parsed_text = "🎯 **Detected Elements:**\n\n"
160
+ for i, v in enumerate(parsed_content_list):
161
+ if v: # Only add non-empty content
162
+ parsed_text += f"**Icon {i}:** {v}\n"
163
+ else:
164
+ parsed_text = "ℹ️ No UI elements detected. Try adjusting the detection thresholds."
165
+
166
+ print(f'Finished processing image. Found {len(parsed_content_list)} elements.')
167
  return image, parsed_text
168
+
169
  except Exception as e:
170
+ error_msg = f"⚠️ Unexpected error: {str(e)}"
171
+ print(f"Error during processing: {e}")
172
+ print(traceback.format_exc())
173
+ return None, error_msg
174
 
175
  # Build Gradio UI with enhanced layout and functionality
176
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro") as demo:
177
  gr.Markdown(MARKDOWN)
178
 
179
  with gr.Row():
180
+ # Left sidebar: Upload and settings
181
  with gr.Column(scale=1):
182
+ with gr.Accordion("📤 Upload Image & Settings", open=True):
183
  image_input_component = gr.Image(
184
  type='pil',
185
+ label='Upload Screenshot/UI Image',
186
  elem_id="input_image"
187
  )
188
+
189
+ gr.Markdown("### 🎛️ Detection Settings")
190
+
191
+ with gr.Group():
192
+ box_threshold_component = gr.Slider(
193
+ label='📊 Box Threshold',
194
+ minimum=0.01,
195
+ maximum=1.0,
196
+ step=0.01,
197
+ value=0.05,
198
+ info="Lower values detect more elements (may include false positives)"
199
+ )
200
+
201
+ iou_threshold_component = gr.Slider(
202
+ label='🔲 IOU Threshold',
203
+ minimum=0.01,
204
+ maximum=1.0,
205
+ step=0.01,
206
+ value=0.1,
207
+ info="Controls overlap filtering (lower = less filtering)"
208
+ )
209
+
210
+ use_paddleocr_component = gr.Checkbox(
211
+ label='🔤 Use PaddleOCR',
212
+ value=True,
213
+ info="✓ PaddleOCR (faster) | ✗ EasyOCR (more languages)"
214
+ )
215
+
216
+ imgsz_component = gr.Slider(
217
+ label='πŸ“ Detection Image Size',
218
+ minimum=640,
219
+ maximum=1920,
220
+ step=32,
221
+ value=640,
222
+ info="Higher = better accuracy but slower (640 recommended)"
223
+ )
224
+
225
  submit_button_component = gr.Button(
226
+ value='🚀 Process Image',
227
+ variant='primary',
228
+ size='lg'
229
  )
230
+
231
+ # Add examples section
232
+ gr.Markdown("### 💡 Quick Tips")
233
+ gr.Markdown("""
234
+ - **For mobile apps:** Use default settings
235
+ - **For desktop apps:** Try image size 1280
236
+ - **For complex UIs:** Lower box threshold to 0.03
237
+ - **Too many boxes?** Increase IOU threshold
238
+ """)
239
 
240
+ # Right main area: Results tabs
241
  with gr.Column(scale=2):
242
  with gr.Tabs():
243
+ with gr.Tab("🖼️ Annotated Image"):
244
  image_output_component = gr.Image(
245
+ type='pil',
246
+ label='Processed Image with Annotations',
247
+ elem_classes=["output-image"]
248
  )
249
+
250
+ with gr.Tab("πŸ“ Extracted Elements"):
251
+ text_output_component = gr.Markdown(
252
+ value="*Parsed elements will appear here after processing...*",
253
+ elem_classes=["parsed-text"]
254
  )
255
+
256
+ # Add status indicator
257
+ status_text = gr.Markdown("", visible=True)
258
 
259
+ # Button click event with loading spinner
260
  submit_button_component.click(
261
  fn=process,
262
  inputs=[
 
266
  use_paddleocr_component,
267
  imgsz_component
268
  ],
269
+ outputs=[image_output_component, text_output_component],
270
+ show_progress=True
271
  )
272
+
273
+ # Add sample images if available
274
+ if os.path.exists("samples"):
275
+ gr.Examples(
276
+ examples=[
277
+ ["samples/mobile_app.png", 0.05, 0.1, True, 640],
278
+ ["samples/desktop_app.png", 0.05, 0.1, True, 1280],
279
+ ],
280
+ inputs=[
281
+ image_input_component,
282
+ box_threshold_component,
283
+ iou_threshold_component,
284
+ use_paddleocr_component,
285
+ imgsz_component
286
+ ],
287
+ outputs=[image_output_component, text_output_component],
288
+ fn=process,
289
+ cache_examples=False
290
+ )
291
 
292
+ # Launch with queue support and error handling
293
+ if __name__ == "__main__":
294
+ try:
295
+ demo.queue(max_size=10)
296
+ demo.launch(
297
+ share=False,
298
+ show_error=True,
299
+ server_name="0.0.0.0",
300
+ server_port=7860
301
+ )
302
+ except Exception as e:
303
+ print(f"Failed to launch app: {e}")
304
+ raise