"""Gradio front end for specification-driven object detection via OpenRouter.

The app sends an image plus a generated prompt to an OpenRouter chat-completions
model, parses the JSON list of detections the model returns, filters it by the
confidence threshold, and draws the surviving boxes on a copy of the image.
"""

import base64
import io
import json
import math

import gradio as gr
import requests
from PIL import Image, ImageDraw, ImageFont

OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
REQUEST_TIMEOUT_SECONDS = 60

# Image modes Pillow can write as PNG; anything else is converted to RGB first.
_PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

# Fonts tried in order; the first entry is the path the app originally used.
_FONT_CANDIDATES = (
    "/System/Library/Fonts/Arial.ttf",
    "/System/Library/Fonts/Supplemental/Arial.ttf",
    "arial.ttf",
    "DejaVuSans.ttf",
)

BOX_COLORS = (
    "#FF0000", "#00FF00", "#0000FF", "#FFFF00",
    "#FF00FF", "#00FFFF", "#FFA500", "#800080",
)

_MODE_INSTRUCTIONS = {
    "specific": (
        "ONLY detect objects that match these specific detailed criteria. "
        "Ignore all other objects:"
    ),
    "include": (
        "Detect objects matching these detailed criteria AND any other "
        "objects you can identify:"
    ),
    "exclude": (
        "Detect all objects EXCEPT those matching these detailed criteria. "
        "Avoid detecting:"
    ),
}

_MODE_DESCRIPTIONS = {
    "specific": "Detecting only objects matching detailed specifications",
    "include": "Including specified detailed criteria + other objects",
    "exclude": "Excluding objects matching detailed specifications",
}

DEFAULT_SPECIFICATIONS = (
    "crack width more than 2mm\n"
    "rust spots larger than 5cm in diameter\n"
    "concrete spalling deeper than 1cm"
)

SPECIFICATIONS_PLACEHOLDER = (
    "Enter each specification on a new line, e.g.:\n"
    "crack width more than 2mm\n"
    "rust spots larger than 5cm in diameter\n"
    "concrete spalling deeper than 1cm\n"
    "structural damage with visible deformation\n"
    "paint peeling areas greater than 10cm²"
)

USAGE_TIPS_MARKDOWN = """
## 💡 Usage Tips

- **Specific Mode**: Only detect objects matching your detailed specifications
- **Include Mode**: Detect your specified criteria plus any other objects found
- **Exclude Mode**: Detect everything except objects matching your specifications

### Example Detailed Specifications:
```
crack width more than 2mm
rust spots larger than 5cm in diameter
concrete spalling deeper than 1cm
structural damage with visible deformation
paint peeling areas greater than 10cm²
corrosion affecting more than 20% of surface area
missing bolts or fasteners
water damage stains larger than 15cm
```

- Enter one detailed specification per line
- Be specific about measurements, sizes, conditions
- Adjust confidence threshold to filter weak detections
- Use lower temperature values for consistent results
- Get your API key from [openrouter.ai](https://openrouter.ai/)
"""


def _as_float(value, default):
    """Return ``value`` as a finite float, or ``default`` if that is impossible.

    Model output is untrusted: numbers may arrive as strings, ``None`` or NaN.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return number if math.isfinite(number) else default


def _strip_code_fence(content):
    """Return the payload of the first Markdown code fence in ``content``.

    A ``json``-tagged fence is preferred over an untagged one. Text without
    any fence is returned stripped of surrounding whitespace.
    """
    for fence in ('```json', '```'):
        if fence in content:
            return content.split(fence, 1)[1].split('```', 1)[0].strip()
    return content.strip()


def process_with_openrouter(image, prompt, api_key, model="google/gemini-2.5-pro", temperature=0.5):
    """Send ``image`` and ``prompt`` to OpenRouter and return the model's reply.

    Args:
        image: PIL image to analyse.
        prompt: Text instruction sent alongside the image.
        api_key: OpenRouter API key used as a bearer token.
        model: OpenRouter model identifier.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        ``(content, None)`` on success, where ``content`` is the reply text
        with any Markdown code fence removed; ``(message, "error")`` on any
        failure. The function never raises.
    """
    if not api_key:
        return "Please enter your OpenRouter API key", "error"
    if image is None:
        return "Please upload an image", "error"

    try:
        # PNG cannot store every Pillow mode (e.g. CMYK); normalise those.
        encodable = image if image.mode in _PNG_MODES else image.convert("RGB")
        buffered = io.BytesIO()
        encodable.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        data = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                        },
                    ],
                }
            ],
            "temperature": temperature,
        }

        response = requests.post(
            OPENROUTER_URL,
            headers=headers,
            json=data,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}", "error"

        result = response.json()
        try:
            content = result['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # A 200 response can still carry an error payload instead of choices.
            return f"Error: unexpected API response - {response.text}", "error"
        if not isinstance(content, str) or not content.strip():
            return "Error: the model returned an empty response", "error"

        return _strip_code_fence(content), None
    except Exception as e:  # boundary: callers rely on an error tuple, not a raise
        return f"Error processing request: {str(e)}", "error"


def _load_label_font(size):
    """Return the first available TrueType font at ``size``, else Pillow's default."""
    for candidate in _FONT_CANDIDATES:
        try:
            return ImageFont.truetype(candidate, size)
        except OSError:
            continue
    return ImageFont.load_default()


def _pixel_box(detection, img_width, img_height):
    """Convert a detection's normalised box to clamped pixel corners.

    Returns ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``, or ``None``
    when the detection is not a dict or lacks a numeric x/y/width/height.
    """
    if not isinstance(detection, dict):
        return None
    values = [_as_float(detection.get(key), None) for key in ('x', 'y', 'width', 'height')]
    if any(value is None for value in values):
        return None
    x, y, width, height = values

    left, top = x * img_width, y * img_height
    right, bottom = left + width * img_width, top + height * img_height

    max_x, max_y = max(img_width - 1, 0), max(img_height - 1, 0)
    # Sorting guards against negative width/height, which Pillow rejects.
    x1, x2 = sorted(max(0, min(int(v), max_x)) for v in (left, right))
    y1, y2 = sorted(max(0, min(int(v), max_y)) for v in (top, bottom))
    return x1, y1, x2, y2


def _label_lines(detection):
    """Build the text lines shown next to a detection's bounding box."""
    class_name = detection.get('class', 'unknown')
    confidence = _as_float(detection.get('confidence', 1.0), 1.0)
    lines = [f"{class_name} ({confidence:.2f})"]

    details = detection.get('details', '')
    if details:
        details = str(details)  # models sometimes return a non-string here
        lines.append(details[:40] + "..." if len(details) > 40 else details)

    criteria_match = detection.get('criteria_match', '')
    if criteria_match:
        lines.append(f"Criteria: {criteria_match}")
    return lines


def _draw_label(draw, lines, box, color, font, img_width, img_height):
    """Draw a filled multi-line label for ``box``, kept inside the image."""
    x1, y1, _, y2 = box

    line_heights = []
    max_width = 0
    for line in lines:
        left, top, right, bottom = draw.textbbox((0, 0), line, font=font)
        max_width = max(max_width, right - left)
        line_heights.append(bottom - top)
    total_height = sum(height + 2 for height in line_heights)

    # Prefer above the box, then below it, then inside its top edge.
    if y1 - total_height - 4 >= 0:
        label_y = y1 - total_height - 4
    elif y2 + 2 + total_height + 2 <= img_height:
        label_y = y2 + 2
    else:
        label_y = max(0, min(y1 + 2, img_height - total_height - 2))

    label_x = x1
    if label_x + max_width > img_width:
        label_x = img_width - max_width - 4
    label_x = max(label_x, 2)  # the background extends 2px to the left

    draw.rectangle(
        [label_x - 2, label_y, label_x + max_width + 4, label_y + total_height + 2],
        fill=color,
        outline=color,
    )

    current_y = label_y + 2
    for line, height in zip(lines, line_heights):
        draw.text((label_x + 2, current_y), line, fill="white", font=font)
        current_y += height + 2


def draw_bounding_boxes(image, detections):
    """Draw bounding boxes with detailed labels on a copy of ``image``.

    Args:
        image: PIL image to annotate; it is never modified.
        detections: Iterable of dicts with normalised (0-1) ``x``, ``y``,
            ``width`` and ``height`` keys, plus optional ``class``,
            ``confidence``, ``details`` and ``criteria_match``. Entries that
            are malformed are skipped.

    Returns:
        The original ``image`` when ``detections`` is empty, otherwise an
        annotated copy (converted to RGB if the source mode cannot hold the
        box colours).
    """
    if not detections:
        return image

    if image.mode in ("RGB", "RGBA"):
        annotated_image = image.copy()
    else:
        # Greyscale/palette images would render every box colour as grey.
        annotated_image = image.convert("RGB")
    draw = ImageDraw.Draw(annotated_image)
    label_font = _load_label_font(12)
    img_width, img_height = annotated_image.size

    for index, detection in enumerate(detections):
        box = _pixel_box(detection, img_width, img_height)
        if box is None:
            continue
        color = BOX_COLORS[index % len(BOX_COLORS)]
        # Thick outline for visibility on high-resolution photos.
        draw.rectangle(list(box), outline=color, width=4)
        _draw_label(draw, _label_lines(detection), box, color, label_font, img_width, img_height)

    return annotated_image


def create_detection_prompt(detailed_classes, confidence_threshold=0.5, detection_mode="specific"):
    """Create the detection prompt for a set of detailed specifications.

    Args:
        detailed_classes: Newline-separated string or list of specifications.
            ``None`` or an empty value yields a prompt without criteria.
        confidence_threshold: Minimum confidence the model is told to report.
        detection_mode: ``"specific"``, ``"include"`` or ``"exclude"``; any
            other value is treated as ``"exclude"``.

    Returns:
        The full prompt text, ending with the expected JSON schema and an
        example entry.
    """
    if isinstance(detailed_classes, str):
        detailed_classes = [cls.strip() for cls in detailed_classes.split('\n') if cls.strip()]
    elif detailed_classes is None:
        detailed_classes = []

    condition_text = _MODE_INSTRUCTIONS.get(detection_mode, _MODE_INSTRUCTIONS["exclude"])
    detailed_specs = [f"{i}. {spec}" for i, spec in enumerate(detailed_classes, 1)]
    classes_text = "\n".join(detailed_specs) if detailed_specs else "No specific criteria provided"

    example = (
        '[{"x": 0.1, "y": 0.2, "width": 0.3, "height": 0.4, '
        '"label": "crack width ~3mm, length ~15cm (0.92)", "confidence": 0.92, '
        '"class": "crack", "details": "width: 3mm, length: 15cm, surface: concrete", '
        '"criteria_match": 1}]'
    )
    return "\n".join([
        condition_text,
        "",
        classes_text,
        "",
        "Detection Instructions:",
        "- Carefully analyze each object against the detailed specifications above",
        f"- Only include detections with confidence above {confidence_threshold}",
        "- For each detection, provide specific measurements, characteristics, or details when possible",
        "- Be precise about the criteria matching (e.g., actual crack width, size measurements, specific conditions)",
        "",
        "Output a JSON list where each entry contains:",
        '- "x": normalized x coordinate (0-1) of top-left corner',
        '- "y": normalized y coordinate (0-1) of top-left corner',
        '- "width": normalized width (0-1) of the bounding box',
        '- "height": normalized height (0-1) of the bounding box',
        '- "label": detailed description with measurements/characteristics and confidence score',
        '- "confidence": confidence score (0-1)',
        '- "class": the general category name',
        '- "details": specific measurements, characteristics, or conditions observed',
        '- "criteria_match": which detailed criteria this detection matches (reference number from list above)',
        "",
        "Example format for crack detection:",
        example,
    ])


def _summarize_detections(detections, mode_val):
    """Build the human-readable summary shown under the detection JSON."""
    summary_text = f"✅ {_MODE_DESCRIPTIONS.get(mode_val, 'Detection')} - Found {len(detections)} objects"

    # Group by class, preserving first-seen order.
    class_details = {}
    for det in detections:
        class_details.setdefault(det.get('class', 'unknown'), []).append({
            'details': det.get('details', ''),
            'criteria': det.get('criteria_match', ''),
            'confidence': _as_float(det.get('confidence', 1.0), 1.0),
        })

    summary_text += "\n\nDetailed Results:"
    for class_name, items in class_details.items():
        summary_text += f"\n• {class_name} ({len(items)} found):"
        for item in items[:3]:  # keep the summary short: first 3 per class
            summary_text += f"\n - {item['details']} (conf: {item['confidence']:.2f})"
            if item['criteria']:
                summary_text += f" [criteria: {item['criteria']}]"
        if len(items) > 3:
            summary_text += f"\n ... and {len(items)-3} more"
    return summary_text


def process_detection(image, detailed_specs, conf_threshold, api_key_val, model_val, temp_val, mode_val):
    """Run one detection request and format the three UI outputs.

    Returns:
        ``(image_or_None, details_text, summary_text)``. Only detections at or
        above ``conf_threshold`` are drawn, listed and summarised, so the
        image always agrees with the JSON shown next to it.
    """
    if not api_key_val:
        return None, "❌ Please enter your OpenRouter API key", "No API key provided"
    if image is None:
        return None, "❌ Please upload an image", "No image uploaded"
    if not detailed_specs or not detailed_specs.strip():
        return None, "❌ Please enter at least one detailed specification", "No specifications provided"

    result = None
    try:
        prompt = create_detection_prompt(detailed_specs, conf_threshold, mode_val)
        result, error = process_with_openrouter(image, prompt, api_key_val, model_val, temp_val)
        if error:
            # ``result`` already starts with "Error"; do not prefix it again.
            return None, f"❌ {result}", "Detection failed"

        detections = json.loads(result)
        filtered_detections = []
        if isinstance(detections, list):
            filtered_detections = [
                d for d in detections
                if isinstance(d, dict)
                and _as_float(d.get('confidence', 1.0), 1.0) >= conf_threshold
            ]

        if not filtered_detections:
            return (
                image,
                "No objects detected matching detailed specifications",
                "No detections matching criteria above confidence threshold",
            )

        # Filter before drawing so below-threshold boxes never reach the image.
        annotated_img = draw_bounding_boxes(image, filtered_detections)
        summary_text = _summarize_detections(filtered_detections, mode_val)
        return annotated_img, json.dumps(filtered_detections, indent=2), summary_text
    except json.JSONDecodeError:
        return None, f"❌ Invalid JSON response: {result}", "JSON parsing failed"
    except Exception as e:  # UI boundary: report instead of crashing the handler
        return None, f"❌ Error: {str(e)}", "Processing error"


def create_interface():
    """Create the Gradio interface for object detection.

    Returns:
        The ``gr.Blocks`` app, with the detect button wired to
        ``process_detection``. The caller is responsible for launching it.
    """
    with gr.Blocks(title="Detailed Object Detection", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🔍 Detailed Object Detection with Custom Specifications")
        gr.Markdown("Detect objects with detailed specifications (e.g., 'crack width more than 2mm', 'rust spots larger than 5cm')")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## ⚙️ Configuration")

                api_key = gr.Textbox(
                    label="OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key...",
                    type="password",
                )

                model = gr.Dropdown(
                    choices=[
                        "google/gemini-2.5-pro",
                        "google/gemini-1.5-pro",
                        "google/gemini-1.5-flash",
                        "anthropic/claude-3.5-sonnet",
                        "openai/gpt-4o",
                        "openai/gpt-4o-mini",
                    ],
                    value="google/gemini-2.5-pro",
                    label="Detection Model",
                )

                detection_mode = gr.Radio(
                    choices=[
                        ("Detect Only These Specifications", "specific"),
                        ("Include These + Others", "include"),
                        ("Exclude These Specifications", "exclude"),
                    ],
                    value="specific",
                    label="Detection Mode",
                    info="How to handle the specified detailed criteria",
                )

                detailed_specifications = gr.Textbox(
                    label="Detailed Detection Specifications",
                    placeholder=SPECIFICATIONS_PLACEHOLDER,
                    value=DEFAULT_SPECIFICATIONS,
                    lines=8,
                    info="Enter detailed specifications, one per line",
                )

                confidence_threshold = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold",
                    info="Minimum confidence for detection",
                )

                temperature = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.3,
                    step=0.05,
                    label="Temperature",
                    info="Lower values for more consistent results",
                )

                image_input = gr.Image(
                    type="pil",
                    label="Upload Image for Detection",
                )

                detect_btn = gr.Button("🚀 Detect Objects", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("## 📊 Detection Results")

                annotated_image = gr.Image(
                    label="Detected Objects",
                    type="pil",
                )

                detection_results = gr.Textbox(
                    label="Detection Details (JSON)",
                    lines=10,
                    show_copy_button=True,
                )

                detection_summary = gr.Textbox(
                    label="Detection Summary",
                    lines=3,
                )

        detect_btn.click(
            process_detection,
            inputs=[
                image_input,
                detailed_specifications,
                confidence_threshold,
                api_key,
                model,
                temperature,
                detection_mode,
            ],
            outputs=[annotated_image, detection_results, detection_summary],
        )

        gr.Markdown(USAGE_TIPS_MARKDOWN)

    return demo


if __name__ == "__main__":
    print("🚀 Starting Object Detection App...")
    demo = create_interface()
    demo.launch(share=False, inbrowser=True)