import base64

import cv2
import gradio as gr
import numpy as np
import requests
from ultralytics import YOLO

# Load the Latex2Layout detection model once at startup.
MODEL_PATH = "latex2layout_object_detection_yolov8.pt"
latex2layout_model = YOLO(MODEL_PATH)


def detect_layout(image):
    """Run Latex2Layout detection on an uploaded image.

    Args:
        image: Uploaded image as an HxWx3 numpy array, or None.

    Returns:
        tuple: ``(annotated_image, layout_info)`` — a copy of the input with
        detection boxes drawn, and a text summary of the detected layout
        elements. Returns ``(None, error_message)`` when no image was given.
    """
    if image is None:
        return None, "Error: No image uploaded."

    # Run detection; ultralytics returns one Results object per input image.
    result = latex2layout_model(image)[0]

    annotated_image = image.copy()
    layout_annotations = []

    for box in result.boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
        conf = float(box.conf[0])
        cls_name = result.names[int(box.cls[0])]

        # Random colour per box so neighbouring elements stay distinguishable.
        color = tuple(np.random.randint(0, 255, 3).tolist())
        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)

        label = f"{cls_name} {conf:.2f}"
        (label_w, label_h), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
        )
        # Clamp the label background so it is never drawn above the image
        # when a detection touches the top edge (original used y1-label_h-5,
        # which can be negative).
        label_top = max(y1 - label_h - 5, 0)
        cv2.rectangle(
            annotated_image,
            (x1, label_top),
            (x1 + label_w, label_top + label_h + 5),
            color,
            -1,
        )
        cv2.putText(
            annotated_image,
            label,
            (x1, label_top + label_h),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 255, 255),
            1,
        )

        # Plain-text annotation to feed to Qwen2.5-VL alongside the image.
        layout_annotations.append(
            f"{cls_name} at position ({x1},{y1},{x2},{y2}) with confidence {conf:.2f}"
        )

    if layout_annotations:
        layout_info = "Detected layout elements: " + "; ".join(layout_annotations)
    else:
        layout_info = "No layout elements detected."
    return annotated_image, layout_info


def call_qwen_vl_api(api_url, image, layout_info, question):
    """Call the Qwen2.5-VL API with the image, layout info, and user question.

    Args:
        api_url: The URL of the Qwen2.5-VL API endpoint.
        image: The uploaded image (numpy array).
        layout_info: Text description of detected layout elements.
        question: User's question about the image and layout.

    Returns:
        str: The answer from the API, or an error message.
    """
    if not api_url:
        return "Error: Please provide a valid Qwen2.5-VL API URL."
    if not question:
        return "Error: Please enter a question."

    try:
        # Encode the image as PNG + base64 rather than sending the raw pixel
        # array: image.tolist() balloons into tens of MB of JSON for an
        # ordinary image. Adjust field names to match the actual API spec.
        ok, png_buffer = cv2.imencode(".png", image)
        if not ok:
            return "Error: Failed to encode the image."
        payload = {
            "image": base64.b64encode(png_buffer.tobytes()).decode("ascii"),
            "prompt": f"{layout_info}\n\nQuestion: {question}",
        }
        response = requests.post(api_url, json=payload, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx status codes.
        return response.json().get("answer", "Error: No answer received from API.")
    except requests.exceptions.RequestException as e:
        return f"Error: API call failed - {str(e)}"


def process_image_and_question(api_url, image, question):
    """Detect layout with Latex2Layout, then query the Qwen2.5-VL API.

    Args:
        api_url: Qwen2.5-VL API URL.
        image: Uploaded image (numpy array or None).
        question: User's question.

    Returns:
        tuple: ``(annotated_image, layout_info, answer)``.
    """
    annotated_image, layout_info = detect_layout(image)
    if annotated_image is None:
        return None, layout_info, "Error: Detection failed."
    answer = call_qwen_vl_api(api_url, image, layout_info, question)
    return annotated_image, layout_info, answer


# Custom CSS for styling the interface.
custom_css = """
.container { max-width: 1200px; margin: auto; }
.button-primary { background-color: #4CAF50; color: white; }
.gr-image { border: 2px solid #ddd; border-radius: 5px; }
.gr-textbox { font-family: monospace; }
"""

# Build the Gradio interface.
with gr.Blocks(
    title="Latex2Layout Detection & QA",
    theme=gr.themes.Default(),
    css=custom_css,
) as demo:
    gr.Markdown(
        """
        # Latex2Layout Layout Detection & Q&A
        Upload an image to detect layout elements using the **Latex2Layout** model,
        then ask questions about the layout and image content using the Qwen2.5-VL API.
        """
    )

    # API URL input
    api_url_input = gr.Textbox(
        label="Qwen2.5-VL API URL",
        placeholder="Enter the Qwen2.5-VL API URL here",
        value="",
    )

    # Main layout
    with gr.Row():
        # Input column
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Upload Image",
                type="numpy",
                height=400,
                elem_classes="gr-image",
            )
            question_input = gr.Textbox(
                label="Ask a Question",
                placeholder="e.g., What is the layout structure of this image?",
                lines=2,
            )
            submit_btn = gr.Button(
                "Detect & Ask",
                variant="primary",
                elem_classes="button-primary",
            )
            gr.Markdown(
                "**Tip**: Provide a clear image and specific question for best results."
            )

        # Output column
        with gr.Column(scale=1):
            output_image = gr.Image(
                label="Detected Layout",
                height=400,
                elem_classes="gr-image",
            )
            layout_output = gr.Textbox(
                label="Layout Information",
                lines=5,
                max_lines=10,
                elem_classes="gr-textbox",
            )
            answer_output = gr.Textbox(
                label="Answer",
                lines=5,
                max_lines=10,
                elem_classes="gr-textbox",
            )

    # Event handler. NOTE: the original passed `_js=` to .click()/.then() to
    # swap the button label — `_js` was removed in Gradio 4 (renamed `js`),
    # and mixing it with `fn` was unreliable. The built-in progress indicator
    # (show_progress) provides the "processing" feedback instead.
    submit_btn.click(
        fn=process_image_and_question,
        inputs=[api_url_input, input_image, question_input],
        outputs=[output_image, layout_output, answer_output],
        show_progress="full",
    )

# Launch the application.
if __name__ == "__main__":
    demo.launch()