Spaces:

ginigen
/

OmniParser-v2-pro

Running on Zero

App Files Files Community

ginipick committed on Aug 19

Commit

c2f47fd

verified ·

1 Parent(s): 23a3bee

Update app.py

Browse files

Files changed (1) hide show

app.py +216 -78

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from PIL import Image
 import io
 import base64, os
 from huggingface_hub import snapshot_download
 # Import utility functions
 from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
@@ -15,28 +16,50 @@ from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processo
 repo_id = "microsoft/OmniParser-v2.0"  # HF repository ID
 local_dir = "weights"  # Local directory for weights
-snapshot_download(repo_id=repo_id, local_dir=local_dir)
-print(f"Repository downloaded to: {local_dir}")
-# Load models
-yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
-caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption")
-# Alternative caption model (BLIP2) can be used as below:
-# caption_model_processor = get_caption_model_processor(model_name="blip2", model_name_or_path="weights/icon_caption_blip2")
 # Markdown header text
 MARKDOWN = """
 # OmniParser V2 Pro🔥
 """
-DEVICE = torch.device('cuda')
 # Custom CSS for UI enhancement
 custom_css = """
 body { background-color: #f0f2f5; }
-.gradio-container { font-family: 'Segoe UI', sans-serif; }
 h1, h2, h3, h4 { color: #283E51; }
-button { border-radius: 6px; }
 """
 @spaces.GPU
@@ -47,14 +70,22 @@ def process(
     iou_threshold,
     use_paddleocr,
     imgsz
-) -> Optional[tuple]:
-    # 입력값 검증
     if image_input is None:
-        return None, "Please upload an image for processing."
     try:
         # Calculate overlay ratio based on input image width
-        box_overlay_ratio = image_input.size[0] / 3200
         draw_bbox_config = {
             'text_scale': 0.8 * box_overlay_ratio,
             'text_thickness': max(int(2 * box_overlay_ratio), 1),
@@ -62,94 +93,170 @@ def process(
             'thickness': max(int(3 * box_overlay_ratio), 1),
         }
-        # Run OCR bounding box detection
-        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
-            image_input,
-            display_img=False,
-            output_bb_format='xyxy',
-            goal_filtering=None,
-            easyocr_args={'paragraph': False, 'text_threshold': 0.9},
-            use_paddleocr=use_paddleocr
-        )
-        text, ocr_bbox = ocr_bbox_rslt
         # Get labeled image and parsed content via SOM (YOLO + caption model)
-        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
-            image_input,
-            yolo_model,
-            BOX_TRESHOLD=box_threshold,
-            output_coord_in_ratio=True,
-            ocr_bbox=ocr_bbox,
-            draw_bbox_config=draw_bbox_config,
-            caption_model_processor=caption_model_processor,
-            ocr_text=text,
-            iou_threshold=iou_threshold,
-            imgsz=imgsz
-        )
         # Decode processed image from base64
-        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
-        print('Finish processing image.')
         # Format parsed content list into a multi-line string
-        parsed_text = "\n".join([f"icon {i}: {v}" for i, v in enumerate(parsed_content_list)])
         return image, parsed_text
     except Exception as e:
-        print(f"Error during processing: {str(e)}")
-        return None, f"Error: {str(e)}"
 # Build Gradio UI with enhanced layout and functionality
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     gr.Markdown(MARKDOWN)
     with gr.Row():
-        # 좌측 사이드바 (아코디언 형태) : 업로드 및 설정
         with gr.Column(scale=1):
-            with gr.Accordion("Upload Image & Settings", open=True):
                 image_input_component = gr.Image(
                     type='pil',
-                    label='Upload Image',
                     elem_id="input_image"
                 )
-                gr.Markdown("### Detection Settings")
-                box_threshold_component = gr.Slider(
-                    label='Box Threshold',
-                    minimum=0.01, maximum=1.0, step=0.01, value=0.05,
-                    info="Minimum confidence for bounding boxes."
-                )
-                iou_threshold_component = gr.Slider(
-                    label='IOU Threshold',
-                    minimum=0.01, maximum=1.0, step=0.01, value=0.1,
-                    info="Threshold for non-maximum suppression overlap."
-                )
-                use_paddleocr_component = gr.Checkbox(
-                    label='Use PaddleOCR', value=True,
-                    info="Toggle between PaddleOCR and EasyOCR."
-                )
-                imgsz_component = gr.Slider(
-                    label='Icon Detect Image Size',
-                    minimum=640, maximum=1920, step=32, value=640,
-                    info="Resize input image for icon detection."
-                )
                 submit_button_component = gr.Button(
-                    value='Process Image', variant='primary'
                 )
-        # 우측 메인 영역 : 결과 탭
         with gr.Column(scale=2):
             with gr.Tabs():
-                with gr.Tab("Output Image"):
                     image_output_component = gr.Image(
-                        type='pil', label='Processed Image'
                     )
-                with gr.Tab("Parsed Text"):
-                    text_output_component = gr.Textbox(
-                        label='Parsed Screen Elements',
-                        placeholder='The structured elements will appear here.',
-                        lines=10
                     )
-    # 버튼 클릭 시 프로세스 실행 (로딩 스피너 적용)
     submit_button_component.click(
         fn=process,
         inputs=[
@@ -159,8 +266,39 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             use_paddleocr_component,
             imgsz_component
         ],
-        outputs=[image_output_component, text_output_component]
     )
-# Launch with queue support
-demo.queue().launch(share=False)

 import io
 import base64, os
 from huggingface_hub import snapshot_download
+import traceback
 # Import utility functions
 from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
 repo_id = "microsoft/OmniParser-v2.0"  # HF repository ID
 local_dir = "weights"  # Local directory for weights
+# Check if weights already exist to avoid re-downloading
+if not os.path.exists(local_dir):
+    snapshot_download(repo_id=repo_id, local_dir=local_dir)
+    print(f"Repository downloaded to: {local_dir}")
+else:
+    print(f"Weights already exist at: {local_dir}")
+# Load models with error handling
+try:
+    yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
+    caption_model_processor = get_caption_model_processor(
+        model_name="florence2",
+        model_name_or_path="weights/icon_caption"
+    )
+    print("Models loaded successfully")
+except Exception as e:
+    print(f"Error loading models: {e}")
+    raise
 # Markdown header text
 MARKDOWN = """
 # OmniParser V2 Pro🔥
+<div style="background-color: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
+    <p style="margin: 0;">🎯 <strong>AI-powered screen understanding tool</strong> that detects UI elements and extracts text with high accuracy.</p>
+    <p style="margin: 5px 0 0 0;">📝 Supports both PaddleOCR and EasyOCR for flexible text extraction.</p>
+</div>
 """
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {DEVICE}")
 # Custom CSS for UI enhancement
 custom_css = """
 body { background-color: #f0f2f5; }
+.gradio-container { font-family: 'Segoe UI', sans-serif; max-width: 1400px; margin: auto; }
 h1, h2, h3, h4 { color: #283E51; }
+button { border-radius: 6px; transition: all 0.3s ease; }
+button:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(0,0,0,0.15); }
+.output-image { border: 2px solid #e1e4e8; border-radius: 8px; }
+#input_image { border: 2px dashed #4a90e2; border-radius: 8px; }
+#input_image:hover { border-color: #2c5aa0; }
+.gr-box { border-radius: 8px; }
+.gr-padded { padding: 16px; }
 """
 @spaces.GPU
     iou_threshold,
     use_paddleocr,
     imgsz
+) -> tuple:
+    """Process image with error handling and validation"""
+    # Input validation
     if image_input is None:
+        return None, "⚠️ Please upload an image for processing."
     try:
+        # Log processing parameters
+        print(f"Processing with parameters: box_threshold={box_threshold}, "
+              f"iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
         # Calculate overlay ratio based on input image width
+        image_width = image_input.size[0]
+        box_overlay_ratio = max(0.5, min(2.0, image_width / 3200))  # Clamp ratio between 0.5 and 2.0
         draw_bbox_config = {
             'text_scale': 0.8 * box_overlay_ratio,
             'text_thickness': max(int(2 * box_overlay_ratio), 1),
             'thickness': max(int(3 * box_overlay_ratio), 1),
         }
+        # Run OCR bounding box detection with error handling
+        try:
+            ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
+                image_input,
+                display_img=False,
+                output_bb_format='xyxy',
+                goal_filtering=None,
+                easyocr_args={'paragraph': False, 'text_threshold': 0.9},
+                use_paddleocr=use_paddleocr
+            )
+            # Handle None result from OCR
+            if ocr_bbox_rslt is None:
+                print("OCR returned None, using empty results")
+                text, ocr_bbox = [], []
+            else:
+                text, ocr_bbox = ocr_bbox_rslt
+            # Validate OCR results
+            if text is None:
+                text = []
+            if ocr_bbox is None:
+                ocr_bbox = []
+            print(f"OCR found {len(text)} text regions")
+        except Exception as e:
+            print(f"OCR error: {e}, continuing with empty OCR results")
+            text, ocr_bbox = [], []
         # Get labeled image and parsed content via SOM (YOLO + caption model)
+        try:
+            dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
+                image_input,
+                yolo_model,
+                BOX_TRESHOLD=box_threshold,
+                output_coord_in_ratio=True,
+                ocr_bbox=ocr_bbox if ocr_bbox else [],  # Ensure it's never None
+                draw_bbox_config=draw_bbox_config,
+                caption_model_processor=caption_model_processor,
+                ocr_text=text if text else [],  # Ensure it's never None
+                iou_threshold=iou_threshold,
+                imgsz=imgsz
+            )
+            if dino_labled_img is None:
+                raise ValueError("Failed to generate labeled image")
+        except Exception as e:
+            print(f"Error in SOM processing: {e}")
+            # Return original image with error message if SOM fails
+            return image_input, f"⚠️ Error during element detection: {str(e)}"
         # Decode processed image from base64
+        try:
+            image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
+            print('Successfully decoded processed image')
+        except Exception as e:
+            print(f"Error decoding image: {e}")
+            return image_input, f"⚠️ Error decoding processed image: {str(e)}"
         # Format parsed content list into a multi-line string
+        if parsed_content_list and len(parsed_content_list) > 0:
+            parsed_text = "🎯 **Detected Elements:**\n\n"
+            for i, v in enumerate(parsed_content_list):
+                if v:  # Only add non-empty content
+                    parsed_text += f"**Icon {i}:** {v}\n"
+        else:
+            parsed_text = "ℹ️ No UI elements detected. Try adjusting the detection thresholds."
+        print(f'Finished processing image. Found {len(parsed_content_list)} elements.')
         return image, parsed_text
     except Exception as e:
+        error_msg = f"⚠️ Unexpected error: {str(e)}"
+        print(f"Error during processing: {e}")
+        print(traceback.format_exc())
+        return None, error_msg
 # Build Gradio UI with enhanced layout and functionality
+with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro") as demo:
     gr.Markdown(MARKDOWN)
     with gr.Row():
+        # Left sidebar: Upload and settings
         with gr.Column(scale=1):
+            with gr.Accordion("📤 Upload Image & Settings", open=True):
                 image_input_component = gr.Image(
                     type='pil',
+                    label='Upload Screenshot/UI Image',
                     elem_id="input_image"
                 )
+                gr.Markdown("### 🎛️ Detection Settings")
+                with gr.Group():
+                    box_threshold_component = gr.Slider(
+                        label='📊 Box Threshold',
+                        minimum=0.01,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.05,
+                        info="Lower values detect more elements (may include false positives)"
+                    )
+                    iou_threshold_component = gr.Slider(
+                        label='🔲 IOU Threshold',
+                        minimum=0.01,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.1,
+                        info="Controls overlap filtering (lower = less filtering)"
+                    )
+                    use_paddleocr_component = gr.Checkbox(
+                        label='🔤 Use PaddleOCR',
+                        value=True,
+                        info="✓ PaddleOCR (faster) | ✗ EasyOCR (more languages)"
+                    )
+                    imgsz_component = gr.Slider(
+                        label='📐 Detection Image Size',
+                        minimum=640,
+                        maximum=1920,
+                        step=32,
+                        value=640,
+                        info="Higher = better accuracy but slower (640 recommended)"
+                    )
                 submit_button_component = gr.Button(
+                    value='🚀 Process Image',
+                    variant='primary',
+                    size='lg'
                 )
+                # Add quick tips section
+                gr.Markdown("### 💡 Quick Tips")
+                gr.Markdown("""
+                - **For mobile apps:** Use default settings
+                - **For desktop apps:** Try image size 1280
+                - **For complex UIs:** Lower box threshold to 0.03
+                - **Too many boxes?** Increase IOU threshold
+                """)
+        # Right main area: Results tabs
         with gr.Column(scale=2):
             with gr.Tabs():
+                with gr.Tab("🖼️ Annotated Image"):
                     image_output_component = gr.Image(
+                        type='pil',
+                        label='Processed Image with Annotations',
+                        elem_classes=["output-image"]
                     )
+                with gr.Tab("📝 Extracted Elements"):
+                    text_output_component = gr.Markdown(
+                        value="*Parsed elements will appear here after processing...*",
+                        elem_classes=["parsed-text"]
                     )
+            # Add status indicator
+            status_text = gr.Markdown("", visible=True)
+    # Button click event with loading spinner
     submit_button_component.click(
         fn=process,
         inputs=[
             use_paddleocr_component,
             imgsz_component
         ],
+        outputs=[image_output_component, text_output_component],
+        show_progress=True
     )
+    # Add sample images if available
+    if os.path.exists("samples"):
+        gr.Examples(
+            examples=[
+                ["samples/mobile_app.png", 0.05, 0.1, True, 640],
+                ["samples/desktop_app.png", 0.05, 0.1, True, 1280],
+            ],
+            inputs=[
+                image_input_component,
+                box_threshold_component,
+                iou_threshold_component,
+                use_paddleocr_component,
+                imgsz_component
+            ],
+            outputs=[image_output_component, text_output_component],
+            fn=process,
+            cache_examples=False
+        )
+# Launch with queue support and error handling
+if __name__ == "__main__":
+    try:
+        demo.queue(max_size=10)
+        demo.launch(
+            share=False,
+            show_error=True,
+            server_name="0.0.0.0",
+            server_port=7860
+        )
+    except Exception as e:
+        print(f"Failed to launch app: {e}")
+        raise