Spaces:

ChaseHan
/

Latex2Layout_PDF_Layout_Parsing

Running

App Files Community

ChaseHan committed on Apr 18

Commit

15de4c7

verified ·

1 Parent(s): 49fbaa3

add qwen

Browse files

Files changed (1) hide show

app.py +99 -75

app.py CHANGED Viewed

@@ -1,126 +1,142 @@
 import gradio as gr
 import cv2
 import numpy as np
-import tempfile
 from ultralytics import YOLO
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from PIL import Image
-import torch
-# Load the Latex2Layout model for layout detection
-latex2layout_model_path = "latex2layout_object_detection_yolov8.pt"
-latex2layout_model = YOLO(latex2layout_model_path)
-# Download and load the Qwen2.5-VL-3B model
-qwen_model_path = "Qwen/Qwen2.5-VL-3B"
-qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_path, device_map="auto", trust_remote_code=True)
-qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_path)
 def detect_layout(image):
     """
-    Detect layout elements in the image using the Latex2Layout model.
     Args:
         image: The uploaded image (numpy array)
     Returns:
-        layout_description: Textual description of detected layout elements
     """
     if image is None:
-        return "Error: No image provided."
-    # Run layout detection
     results = latex2layout_model(image)
     result = results[0]
-    layout_description = []
     for box in result.boxes:
         x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
         cls_id = int(box.cls[0])
         cls_name = result.names[cls_id]
-        layout_description.append(f"{cls_name} at position ({x1}, {y1}, {x2}, {y2})")
-    return ", ".join(layout_description) if layout_description else "No elements detected."
-def process_image_and_question(image, question):
     """
-    Process the image with Latex2Layout and answer the question using Qwen2.5-VL.
     Args:
         image: The uploaded image (numpy array)
-        question: The user's question (string)
     Returns:
-        annotated_image: Image with detection boxes
-        response: Answer from Qwen2.5-VL
     """
-    if image is None or not question:
-        return None, "Error: Please upload an image and provide a question."
-    # Convert numpy image to PIL for Qwen2.5-VL
-    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-    # Detect layout using Latex2Layout
-    layout_description = detect_layout(image)
-    # Prepare annotated image
-    annotated_image = image.copy()
-    results = latex2layout_model(image)[0]
-    for box in results.boxes:
-        x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
-        conf = float(box.conf[0])
-        cls_id = int(box.cls[0])
-        cls_name = results.names[cls_id]
-        color = tuple(np.random.randint(0, 255, 3).tolist())
-        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
-        label = f'{cls_name} {conf:.2f}'
-        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
-        cv2.rectangle(annotated_image, (x1, y1-label_height-5), (x1+label_width, y1), color, -1)
-        cv2.putText(annotated_image, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
-    # Prepare input for Qwen2.5-VL
-    input_text = f"Layout: {layout_description}\nQuestion: {question}"
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image_pil},
-                {"type": "text", "text": input_text}
-            ]
         }
-    ]
-    # Tokenize and generate response
-    inputs = qwen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    model_inputs = qwen_tokenizer([inputs], return_tensors="pt").to(qwen_model.device)
-    with torch.no_grad():
-        output_ids = qwen_model.generate(**model_inputs, max_new_tokens=100)
-    response = qwen_tokenizer.decode(output_ids[0][len(model_inputs["input_ids"][0]):], skip_special_tokens=True)
-    return annotated_image, response
 # Custom CSS for styling
 custom_css = """
     .container { max-width: 1200px; margin: auto; }
     .button-primary { background-color: #4CAF50; color: white; }
     .gr-image { border: 2px solid #ddd; border-radius: 5px; }
-    .gr-textbox { font-family: Arial; }
 """
 # Create Gradio interface
 with gr.Blocks(
-    title="Latex2Layout Visual Q&A",
     theme=gr.themes.Default(),
     css=custom_css
 ) as demo:
     gr.Markdown(
         """
-        # Latex2Layout Visual Q&A
-        Upload an image and ask a question about its layout. The **Latex2Layout** model detects elements, and **Qwen2.5-VL** provides answers based on the image and layout information.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             input_image = gr.Image(
                 label="Upload Image",
@@ -130,22 +146,30 @@ with gr.Blocks(
             )
             question_input = gr.Textbox(
                 label="Ask a Question",
-                placeholder="e.g., What elements are in the image?",
                 lines=2
             )
             submit_btn = gr.Button(
-                "Get Answer",
                 variant="primary",
                 elem_classes="button-primary"
             )
         with gr.Column(scale=1):
             output_image = gr.Image(
                 label="Detected Layout",
                 height=400,
                 elem_classes="gr-image"
             )
-            output_text = gr.Textbox(
                 label="Answer",
                 lines=5,
                 max_lines=10,
@@ -155,14 +179,14 @@ with gr.Blocks(
     # Event handler
     submit_btn.click(
         fn=process_image_and_question,
-        inputs=[input_image, question_input],
-        outputs=[output_image, output_text],
         _js="() => { document.querySelector('.button-primary').innerText = 'Processing...'; }",
         show_progress=True
     ).then(
-        fn=lambda: gr.update(value="Get Answer"),
         outputs=submit_btn,
-        _js="() => { document.querySelector('.button-primary').innerText = 'Get Answer'; }"
     )
 # Launch the application

 import gradio as gr
 import cv2
 import numpy as np
+import requests
 from ultralytics import YOLO
+# Load the Latex2Layout model
+model_path = "latex2layout_object_detection_yolov8.pt"
+latex2layout_model = YOLO(model_path)
 def detect_layout(image):
     """
+    Perform layout detection on the uploaded image using the Latex2Layout model.
     Args:
         image: The uploaded image (numpy array)
     Returns:
+        annotated_image: Image with detection boxes drawn
+        layout_info: Text description of detected layout elements
     """
     if image is None:
+        return None, "Error: No image uploaded."
+    # Run detection
     results = latex2layout_model(image)
     result = results[0]
+    # Create a copy of the image for visualization
+    annotated_image = image.copy()
+    layout_annotations = []
+    # Get image dimensions
+    img_height, img_width = image.shape[:2]
+    # Process detection results
     for box in result.boxes:
         x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
+        conf = float(box.conf[0])
         cls_id = int(box.cls[0])
         cls_name = result.names[cls_id]
+        # Draw bounding box and label on the image
+        color = tuple(np.random.randint(0, 255, 3).tolist())
+        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
+        label = f'{cls_name} {conf:.2f}'
+        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+        cv2.rectangle(annotated_image, (x1, y1-label_height-5), (x1+label_width, y1), color, -1)
+        cv2.putText(annotated_image, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+        # Format layout info for Qwen2.5-VL
+        layout_annotations.append(f"{cls_name} at position ({x1},{y1},{x2},{y2}) with confidence {conf:.2f}")
+    layout_info = "Detected layout elements: " + "; ".join(layout_annotations) if layout_annotations else "No layout elements detected."
+    return annotated_image, layout_info
+def call_qwen_vl_api(api_url, image, layout_info, question):
     """
+    Call the Qwen2.5-VL API with the image, layout info, and user question.
     Args:
+        api_url: The URL of the Qwen2.5-VL API
         image: The uploaded image (numpy array)
+        layout_info: Text description of detected layout elements
+        question: User's question about the image and layout
     Returns:
+        answer: Response from the Qwen2.5-VL API
     """
+    if not api_url:
+        return "Error: Please provide a valid Qwen2.5-VL API URL."
+    if not question:
+        return "Error: Please enter a question."
+    try:
+        # Convert image to a format suitable for API (e.g., base64 or raw bytes might be needed; adjust per API spec)
+        # Here, we assume the API accepts a URL or raw data; for simplicity, we use a placeholder
+        payload = {
+            "image": image.tolist(),  # Adjust this based on API requirements (e.g., base64 encoding)
+            "prompt": f"{layout_info}\n\nQuestion: {question}",
         }
+        response = requests.post(api_url, json=payload, timeout=30)
+        response.raise_for_status()  # Raise an error for bad status codes
+        return response.json().get("answer", "Error: No answer received from API.")
+    except requests.exceptions.RequestException as e:
+        return f"Error: API call failed - {str(e)}"
+def process_image_and_question(api_url, image, question):
+    """
+    Process the image with Latex2Layout and query Qwen2.5-VL API.
+    Args:
+        api_url: Qwen2.5-VL API URL
+        image: Uploaded image
+        question: User's question
+    Returns:
+        annotated_image: Image with detection boxes
+        layout_info: Detected layout description
+        answer: API response to the question
+    """
+    annotated_image, layout_info = detect_layout(image)
+    if annotated_image is None:
+        return None, layout_info, "Error: Detection failed."
+    answer = call_qwen_vl_api(api_url, image, layout_info, question)
+    return annotated_image, layout_info, answer
 # Custom CSS for styling
 custom_css = """
     .container { max-width: 1200px; margin: auto; }
     .button-primary { background-color: #4CAF50; color: white; }
     .gr-image { border: 2px solid #ddd; border-radius: 5px; }
+    .gr-textbox { font-family: monospace; }
 """
 # Create Gradio interface
 with gr.Blocks(
+    title="Latex2Layout Detection & QA",
     theme=gr.themes.Default(),
     css=custom_css
 ) as demo:
     gr.Markdown(
         """
+        # Latex2Layout Layout Detection & Q&A
+        Upload an image to detect layout elements using the **Latex2Layout** model, then ask questions about the layout and image content using the Qwen2.5-VL API.
         """
     )
+    # API URL input
+    api_url_input = gr.Textbox(
+        label="Qwen2.5-VL API URL",
+        placeholder="Enter the Qwen2.5-VL API URL here",
+        value=""
+    )
+    # Main layout
     with gr.Row():
+        # Input column
         with gr.Column(scale=1):
             input_image = gr.Image(
                 label="Upload Image",
             )
             question_input = gr.Textbox(
                 label="Ask a Question",
+                placeholder="e.g., What is the layout structure of this image?",
                 lines=2
             )
             submit_btn = gr.Button(
+                "Detect & Ask",
                 variant="primary",
                 elem_classes="button-primary"
             )
+            gr.Markdown("**Tip**: Provide a clear image and specific question for best results.")
+        # Output column
         with gr.Column(scale=1):
             output_image = gr.Image(
                 label="Detected Layout",
                 height=400,
                 elem_classes="gr-image"
             )
+            layout_output = gr.Textbox(
+                label="Layout Information",
+                lines=5,
+                max_lines=10,
+                elem_classes="gr-textbox"
+            )
+            answer_output = gr.Textbox(
                 label="Answer",
                 lines=5,
                 max_lines=10,
     # Event handler
     submit_btn.click(
         fn=process_image_and_question,
+        inputs=[api_url_input, input_image, question_input],
+        outputs=[output_image, layout_output, answer_output],
         _js="() => { document.querySelector('.button-primary').innerText = 'Processing...'; }",
         show_progress=True
     ).then(
+        fn=lambda: gr.update(value="Detect & Ask"),
         outputs=submit_btn,
+        _js="() => { document.querySelector('.button-primary').innerText = 'Detect & Ask'; }"
     )
 # Launch the application