Spaces:

ChaseHan
/

Latex2Layout_PDF_Layout_Parsing

Running

App Files Files Community

ChaseHan committed on Apr 18

Commit

49fbaa3

verified ·

1 Parent(s): 83d84a0

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -97

app.py CHANGED Viewed

@@ -1,109 +1,126 @@
 import gradio as gr
 import cv2
 import numpy as np
-import os
 import tempfile
 from ultralytics import YOLO
-# Load the Latex2Layout model
-model_path = "latex2layout_object_detection_yolov8.pt"
-model = YOLO(model_path)
-def detect_and_visualize(image):
     """
-    Perform layout detection on the uploaded image using the Latex2Layout model and visualize the results.
     Args:
-        image: The uploaded image
     Returns:
-        annotated_image: Image with detection boxes
-        layout_annotations: Annotations in YOLO format
     """
     if image is None:
-        return None, "Error: No image uploaded."
-    # Run detection using the Latex2Layout model
-    results = model(image)
     result = results[0]
-    # Create a copy of the image for visualization
-    annotated_image = image.copy()
-    layout_annotations = []
-    # Get image dimensions
-    img_height, img_width = image.shape[:2]
-    # Draw detection results
     for box in result.boxes:
         x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
-        conf = float(box.conf[0])
         cls_id = int(box.cls[0])
         cls_name = result.names[cls_id]
-        # Generate a color for each class
         color = tuple(np.random.randint(0, 255, 3).tolist())
-        # Draw bounding box and label
         cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
         label = f'{cls_name} {conf:.2f}'
         (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
         cv2.rectangle(annotated_image, (x1, y1-label_height-5), (x1+label_width, y1), color, -1)
         cv2.putText(annotated_image, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
-        # Convert to YOLO format (normalized)
-        x_center = (x1 + x2) / (2 * img_width)
-        y_center = (y1 + y2) / (2 * img_height)
-        width = (x2 - x1) / img_width
-        height = (y2 - y1) / img_height
-        layout_annotations.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")
-    return annotated_image, "\n".join(layout_annotations)
-def save_layout_annotations(layout_annotations_str):
-    """
-    Save layout annotations to a temporary file and return the file path.
-    Args:
-        layout_annotations_str: Annotations string in YOLO format
-    Returns:
-        file_path: Path to the saved annotation file
-    """
-    if not layout_annotations_str:
-        return None
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
-    with open(temp_file.name, "w") as f:
-        f.write(layout_annotations_str)
-    return temp_file.name
 # Custom CSS for styling
 custom_css = """
     .container { max-width: 1200px; margin: auto; }
     .button-primary { background-color: #4CAF50; color: white; }
-    .button-secondary { background-color: #008CBA; color: white; }
     .gr-image { border: 2px solid #ddd; border-radius: 5px; }
-    .gr-textbox { font-family: monospace; }
 """
-# Create Gradio interface with enhanced styling
 with gr.Blocks(
-    title="Latex2Layout Detection",
     theme=gr.themes.Default(),
     css=custom_css
 ) as demo:
-    # Header with instructions
     gr.Markdown(
         """
-        # Latex2Layout Layout Detection
-        Upload an image to detect layout elements using the **Latex2Layout** model. View the annotated image and download the results in YOLO format.
         """
     )
-    # Main layout with two columns
     with gr.Row():
-        # Input column
         with gr.Column(scale=1):
             input_image = gr.Image(
                 label="Upload Image",
@@ -111,63 +128,43 @@ with gr.Blocks(
                 height=400,
                 elem_classes="gr-image"
             )
-            detect_btn = gr.Button(
-                "Start Detection",
                 variant="primary",
                 elem_classes="button-primary"
             )
-            gr.Markdown("**Tip**: Upload a clear image for optimal detection results.")
-        # Output column
         with gr.Column(scale=1):
             output_image = gr.Image(
-                label="Detection Results",
                 height=400,
                 elem_classes="gr-image"
             )
-            layout_annotations = gr.Textbox(
-                label="Layout Annotations (YOLO Format)",
-                lines=10,
-                max_lines=15,
                 elem_classes="gr-textbox"
             )
-            download_btn = gr.Button(
-                "Download Annotations",
-                variant="secondary",
-                elem_classes="button-secondary"
-            )
-            download_file = gr.File(
-                label="Download File",
-                interactive=False
-            )
-    # Example image button (optional)
-    with gr.Row():
-        gr.Button("Load Example Image").click(
-            fn=lambda: cv2.imread("example_image.jpg"),
-            outputs=input_image
-        )
-    # Event handlers
-    detect_btn.click(
-        fn=detect_and_visualize,
-        inputs=input_image,
-        outputs=[output_image, layout_annotations],
         _js="() => { document.querySelector('.button-primary').innerText = 'Processing...'; }",
         show_progress=True
     ).then(
-        fn=lambda: gr.update(value="Start Detection"),
-        outputs=detect_btn,
-        _js="() => { document.querySelector('.button-primary').innerText = 'Start Detection'; }"
-    )
-    download_btn.click(
-        fn=save_layout_annotations,
-        inputs=layout_annotations,
-        outputs=download_file
     )
 # Launch the application
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import cv2
 import numpy as np
 import tempfile
 from ultralytics import YOLO
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from PIL import Image
+import torch
+# Load the Latex2Layout model for layout detection
+latex2layout_model_path = "latex2layout_object_detection_yolov8.pt"
+latex2layout_model = YOLO(latex2layout_model_path)
+# Load Qwen2.5-VL together with its processor.
+# The 3B checkpoint is published under the "-Instruct" repo id, and it is a
+# vision-language architecture, so it is loaded with its dedicated class.
+# The processor (not the bare tokenizer) is what converts the image into
+# model inputs; without it the model only ever sees text.
+qwen_model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(qwen_model_path, device_map="auto")
+qwen_processor = AutoProcessor.from_pretrained(qwen_model_path)
+# Kept so that any code still referring to the old name keeps working.
+qwen_tokenizer = qwen_processor.tokenizer
+def _to_bgr(image):
+    """
+    Return a BGR copy of a 3-channel RGB array; other shapes are returned unchanged.
+    Ultralytics documents numpy inputs as BGR, while the rest of this app
+    treats the uploaded array as RGB (assumes the Gradio image component
+    delivers RGB numpy arrays - confirm against the component's `type`).
+    """
+    if image.ndim == 3 and image.shape[2] == 3:
+        return cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+    return image
+def _run_layout_detection(image):
+    """Run Latex2Layout on an RGB image and return the first (only) result."""
+    return latex2layout_model(_to_bgr(image))[0]
+def _iter_detections(result):
+    """Yield (cls_id, cls_name, conf, (x1, y1, x2, y2)) for each detected box."""
+    for box in result.boxes:
+        x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
+        cls_id = int(box.cls[0])
+        yield cls_id, result.names[cls_id], float(box.conf[0]), (x1, y1, x2, y2)
+def _describe_layout(result):
+    """Build the comma-separated textual description of a detection result."""
+    parts = [
+        f"{cls_name} at position ({x1}, {y1}, {x2}, {y2})"
+        for _, cls_name, _, (x1, y1, x2, y2) in _iter_detections(result)
+    ]
+    return ", ".join(parts) if parts else "No elements detected."
+def _class_color(cls_id):
+    """Return a colour that is stable per class id, so equal classes look equal."""
+    rng = np.random.RandomState(cls_id)
+    return tuple(rng.randint(0, 255, 3).tolist())
+def _draw_detections(image, result):
+    """Return a copy of `image` with a box and a '<class> <conf>' label per detection."""
+    annotated_image = image.copy()
+    for cls_id, cls_name, conf, (x1, y1, x2, y2) in _iter_detections(result):
+        color = _class_color(cls_id)
+        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
+        label = f'{cls_name} {conf:.2f}'
+        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+        # Clamp so that labels of boxes touching the top edge stay on the canvas.
+        label_top = max(y1 - label_height - 5, 0)
+        cv2.rectangle(annotated_image, (x1, label_top), (x1 + label_width, label_top + label_height + 5), color, -1)
+        cv2.putText(annotated_image, label, (x1, label_top + label_height), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+    return annotated_image
+def detect_layout(image):
+    """
+    Detect layout elements in the image using the Latex2Layout model.
+    Args:
+        image: The uploaded image (numpy array)
+    Returns:
+        layout_description: Textual description of detected layout elements,
+            "No elements detected." when there are none, or an error string
+            when `image` is None
+    """
+    if image is None:
+        return "Error: No image provided."
+    return _describe_layout(_run_layout_detection(image))
+def process_image_and_question(image, question):
+    """
+    Process the image with Latex2Layout and answer the question using Qwen2.5-VL.
+    Args:
+        image: The uploaded image (numpy array)
+        question: The user's question (string)
+    Returns:
+        annotated_image: Image with detection boxes (None on invalid input)
+        response: Answer from Qwen2.5-VL, or an error message on invalid input
+    """
+    if image is None or not question or not question.strip():
+        return None, "Error: Please upload an image and provide a question."
+    # Run detection once and reuse the result for both the text description
+    # and the drawing, instead of running the detector twice per request.
+    result = _run_layout_detection(image)
+    layout_description = _describe_layout(result)
+    annotated_image = _draw_detections(image, result)
+    # The uploaded array is used as-is for PIL (no channel swap).
+    image_pil = Image.fromarray(image)
+    # Prepare input for Qwen2.5-VL
+    input_text = f"Layout: {layout_description}\nQuestion: {question}"
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": input_text}
+            ]
+        }
+    ]
+    # The chat template only inserts the image placeholder; the pixels are
+    # supplied through the processor call below.
+    prompt = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    model_inputs = qwen_processor(text=[prompt], images=[image_pil], return_tensors="pt").to(qwen_model.device)
+    with torch.no_grad():
+        output_ids = qwen_model.generate(**model_inputs, max_new_tokens=100)
+    # Drop the prompt tokens so only the newly generated answer is decoded.
+    prompt_length = model_inputs["input_ids"].shape[1]
+    response = qwen_processor.batch_decode(output_ids[:, prompt_length:], skip_special_tokens=True)[0]
+    return annotated_image, response
 # Custom CSS for styling
 custom_css = """
     .container { max-width: 1200px; margin: auto; }
     .button-primary { background-color: #4CAF50; color: white; }
     .gr-image { border: 2px solid #ddd; border-radius: 5px; }
+    .gr-textbox { font-family: Arial; }
 """
+# Create Gradio interface
 with gr.Blocks(
+    title="Latex2Layout Visual Q&A",
     theme=gr.themes.Default(),
     css=custom_css
 ) as demo:
     gr.Markdown(
         """
+        # Latex2Layout Visual Q&A
+        Upload an image and ask a question about its layout. The **Latex2Layout** model detects elements, and **Qwen2.5-VL** provides answers based on the image and layout information.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             input_image = gr.Image(
                 label="Upload Image",
                 height=400,
                 elem_classes="gr-image"
             )
+            question_input = gr.Textbox(
+                label="Ask a Question",
+                placeholder="e.g., What elements are in the image?",
+                lines=2
+            )
+            submit_btn = gr.Button(
+                "Get Answer",
                 variant="primary",
                 elem_classes="button-primary"
             )
         with gr.Column(scale=1):
             output_image = gr.Image(
+                label="Detected Layout",
                 height=400,
                 elem_classes="gr-image"
             )
+            output_text = gr.Textbox(
+                label="Answer",
+                lines=5,
+                max_lines=10,
                 elem_classes="gr-textbox"
             )
+    # Event handler
+    # The button label is switched with ordinary Gradio updates rather than
+    # custom JavaScript hooks, so the chain does not depend on the `_js`/`js`
+    # keyword, whose name differs between Gradio major versions.
+    # Step 1 marks the button busy, step 2 runs the pipeline, and step 3
+    # restores the button; `.then` is used for step 3 so the label is reset
+    # after step 2 finishes.
+    submit_btn.click(
+        fn=lambda: gr.update(value="Processing...", interactive=False),
+        outputs=submit_btn
+    ).then(
+        fn=process_image_and_question,
+        inputs=[input_image, question_input],
+        outputs=[output_image, output_text]
+    ).then(
+        fn=lambda: gr.update(value="Get Answer", interactive=True),
+        outputs=submit_btn
+    )
 # Launch the application
 if __name__ == "__main__":
     demo.launch()