ChaseHan committed on
Commit
6a41fcf
·
verified ·
1 Parent(s): f56f5ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -113
app.py CHANGED
@@ -9,11 +9,14 @@ import base64
9
  from openai import OpenAI
10
  from ultralytics import YOLO
11
 
12
- # Load the Latex2Layout model
13
  model_path = "latex2layout_object_detection_yolov8.pt"
 
 
14
  if not os.path.exists(model_path):
15
  raise FileNotFoundError(f"Model file not found at {model_path}")
16
 
 
17
  try:
18
  model = YOLO(model_path)
19
  except Exception as e:
@@ -21,191 +24,192 @@ except Exception as e:
21
 
22
  # Qwen API configuration
23
  QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
24
- QWEN_MODEL_ID = "qwen2.5-vl-3b-instruct"
 
 
 
 
25
 
26
  def encode_image(image_array):
27
  """
28
- Encode numpy array image to base64 string.
29
-
30
  Args:
31
- image_array: numpy array of the image
32
-
33
  Returns:
34
- base64 encoded string of the image
35
  """
36
- # Convert numpy array to PIL Image
37
- pil_image = Image.fromarray(image_array)
38
-
39
- # Convert PIL Image to bytes
40
- img_byte_arr = io.BytesIO()
41
- pil_image.save(img_byte_arr, format='PNG')
42
- img_byte_arr = img_byte_arr.getvalue()
43
-
44
- # Encode to base64
45
- return base64.b64encode(img_byte_arr).decode("utf-8")
46
-
47
- def detect_layout(image):
48
  """
49
- Perform layout detection on the uploaded image using local YOLO model.
50
-
51
  Args:
52
- image: The uploaded image as a numpy array
53
-
 
54
  Returns:
55
- annotated_image: Image with detection boxes
56
- layout_info: Layout detection results
 
57
  """
58
- if image is None:
59
- return None, "Error: No image uploaded."
60
-
61
  try:
62
- # Run detection using local YOLO model
63
  results = model(image)
64
  result = results[0]
65
-
66
- # Create a copy of the image for visualization
67
  annotated_image = image.copy()
68
  layout_info = []
69
-
70
- # Draw detection results
71
  for box in result.boxes:
72
- # Get bounding box coordinates
73
- x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
74
  conf = float(box.conf[0])
 
 
 
 
 
 
75
  cls_id = int(box.cls[0])
76
  cls_name = result.names[cls_id]
77
-
78
- # Generate a color for each class
79
  color = tuple(np.random.randint(0, 255, 3).tolist())
80
-
81
  # Draw bounding box and label
82
  cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
83
- label = f'{cls_name} {conf:.2f}'
84
  (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
85
- cv2.rectangle(annotated_image, (x1, y1-label_height-5), (x1+label_width, y1), color, -1)
86
- cv2.putText(annotated_image, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
87
-
88
- # Add detection to layout info
89
  layout_info.append({
90
- 'bbox': [x1, y1, x2, y2],
91
- 'class': cls_name,
92
- 'confidence': conf
93
  })
94
-
95
- # Format layout information for Qwen
96
- layout_info_str = json.dumps(layout_info, indent=2)
97
-
98
  return annotated_image, layout_info_str
99
-
100
  except Exception as e:
101
  return None, f"Error during layout detection: {str(e)}"
102
 
103
- def qa_about_layout(image, question, layout_info, api_key):
104
  """
105
- Answer questions about the layout using Qwen2.5-VL API.
106
-
107
  Args:
108
- image: The uploaded image
109
- question: User's question about the layout
110
- layout_info: Layout detection results from YOLO
111
- api_key: User's Qwen API key
112
-
 
113
  Returns:
114
- answer: Qwen's answer to the question
115
  """
116
- if image is None or not question:
117
- return "Please upload an image and ask a question."
118
-
119
- if not layout_info:
120
- return "No layout information available. Please detect layout first."
121
-
122
  if not api_key:
123
- return "Please enter your Qwen API key."
124
-
 
 
125
  try:
126
  # Encode image to base64
127
  base64_image = encode_image(image)
128
-
 
 
 
 
 
129
  # Initialize OpenAI client for Qwen API
130
- client = OpenAI(
131
- api_key=api_key,
132
- base_url=QWEN_BASE_URL,
133
- )
134
-
135
- # Prepare system prompt with layout information
136
- system_prompt = f"""You are a helpful assistant specialized in analyzing document layouts.
137
- The following layout information has been detected in the image:
138
- {layout_info}
139
-
140
- Please answer questions about the layout based on this information and the image."""
141
-
142
- # Prepare messages for API call
143
  messages = [
144
- {
145
- "role": "system",
146
- "content": [{"type": "text", "text": system_prompt}]
147
- },
148
  {
149
  "role": "user",
150
  "content": [
151
- {
152
- "type": "image_url",
153
- "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
154
- },
155
  {"type": "text", "text": question},
156
  ],
157
- }
158
  ]
159
-
160
  # Call Qwen API
161
- completion = client.chat.completions.create(
162
- model=QWEN_MODEL_ID,
163
- messages=messages,
164
- )
165
-
166
  return completion.choices[0].message.content
167
-
168
  except Exception as e:
169
  return f"Error during QA: {str(e)}"
170
 
171
- # Create Gradio interface
172
  with gr.Blocks(title="Latex2Layout QA System") as demo:
173
  gr.Markdown("# Latex2Layout QA System")
174
- gr.Markdown("Upload an image, detect layout elements, and ask questions about the layout.")
175
-
176
  with gr.Row():
177
  with gr.Column(scale=1):
178
  input_image = gr.Image(label="Upload Image", type="numpy")
179
  detect_btn = gr.Button("Detect Layout")
180
- gr.Markdown("**Tip**: Upload a clear image for optimal detection results.")
181
-
182
  with gr.Column(scale=1):
183
- output_image = gr.Image(label="Detection Results")
184
- layout_info = gr.Textbox(label="Layout Information", lines=10)
185
-
186
  with gr.Row():
187
  with gr.Column(scale=1):
188
  api_key_input = gr.Textbox(
189
  label="Qwen API Key",
190
- placeholder="Enter your Qwen API key here",
191
  type="password"
192
  )
193
- question_input = gr.Textbox(label="Ask a question about the layout")
 
 
 
 
 
194
  qa_btn = gr.Button("Ask Question")
195
-
196
  with gr.Column(scale=1):
197
- answer_output = gr.Textbox(label="Answer", lines=5)
198
-
199
  # Event handlers
200
  detect_btn.click(
201
  fn=detect_layout,
202
  inputs=[input_image],
203
  outputs=[output_image, layout_info]
204
  )
205
-
206
  qa_btn.click(
207
  fn=qa_about_layout,
208
- inputs=[input_image, question_input, layout_info, api_key_input],
209
  outputs=[answer_output]
210
  )
211
 
 
9
  from openai import OpenAI
10
  from ultralytics import YOLO
11
 
12
+ # Define the Latex2Layout model path
13
  model_path = "latex2layout_object_detection_yolov8.pt"
14
+
15
+ # Verify model file existence
16
  if not os.path.exists(model_path):
17
  raise FileNotFoundError(f"Model file not found at {model_path}")
18
 
19
+ # Load the Latex2Layout model with error handling
20
  try:
21
  model = YOLO(model_path)
22
  except Exception as e:
 
24
 
25
# Qwen API configuration
QWEN_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# Maps the human-readable model names shown in the UI dropdown to the
# DashScope model identifiers expected by the API.
QWEN_MODELS = {
    "Qwen2.5-VL-3B-Instruct": "qwen2.5-vl-3b-instruct",
    "Qwen2.5-VL-7B-Instruct": "qwen2.5-vl-7b-instruct",
    "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
}
32
 
33
def encode_image(image_array):
    """
    Convert a numpy array image to a base64-encoded PNG string.

    Args:
        image_array: Numpy array representing the image (as produced by the
            Gradio ``numpy``-typed image component).

    Returns:
        str: Base64-encoded string of the PNG-serialized image.

    Raises:
        ValueError: If the array cannot be converted to an image or
            serialized to PNG.
    """
    try:
        pil_image = Image.fromarray(image_array)
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='PNG')
        return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause is
        # preserved in tracebacks instead of being swallowed.
        raise ValueError(f"Failed to encode image: {e}") from e
50
+
51
def detect_layout(image, confidence_threshold=0.5):
    """
    Detect layout elements in the uploaded image using the Latex2Layout model.

    Args:
        image: Uploaded image as a numpy array.
        confidence_threshold: Minimum confidence score to retain detections
            (default: 0.5).

    Returns:
        tuple: (annotated_image, layout_info_str)
            - annotated_image: Copy of the input with bounding boxes drawn for
              detections at or above the threshold, or None on error.
            - layout_info_str: JSON string of the kept detections, or an
              error / "nothing detected" message.
    """
    if image is None or not isinstance(image, np.ndarray):
        return None, "Error: No image uploaded or invalid image format."

    try:
        # Run the YOLO model; the first result corresponds to the single image.
        results = model(image)
        result = results[0]

        annotated_image = image.copy()
        layout_info = []

        # Process detections
        for box in result.boxes:
            conf = float(box.conf[0])
            # Filter out detections below the confidence threshold
            if conf < confidence_threshold:
                continue

            # Extract and convert bounding box coordinates
            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
            cls_id = int(box.cls[0])
            cls_name = result.names[cls_id]

            # Assign a random color per detection for visualization
            color = tuple(np.random.randint(0, 255, 3).tolist())

            # Draw bounding box plus a filled background behind the label text
            cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
            label = f"{cls_name} {conf:.2f}"
            (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
            cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            # Store layout information
            layout_info.append({
                "bbox": [x1, y1, x2, y2],
                "class": cls_name,
                "confidence": conf
            })

        # Report the actual threshold in use (the previous message
        # hard-coded "0.5" even when a different threshold was passed).
        layout_info_str = (
            json.dumps(layout_info, indent=2)
            if layout_info
            else f"No layout elements detected with confidence >= {confidence_threshold}."
        )
        return annotated_image, layout_info_str

    except Exception as e:
        return None, f"Error during layout detection: {str(e)}"
109
 
110
def qa_about_layout(image, question, layout_info, api_key, model_name):
    """
    Answer layout-related questions using the Qwen API.

    Args:
        image: Uploaded image as a numpy array.
        question: User's question about the layout.
        layout_info: JSON string of layout detection results.
        api_key: User's Qwen API key.
        model_name: Selected Qwen model name from the dropdown.

    Returns:
        str: Qwen's response to the question, or an error message.
    """
    # Validate inputs up front so no work is wasted on a doomed request.
    if image is None or not isinstance(image, np.ndarray):
        return "Error: Please upload a valid image."
    if not question:
        return "Error: Please enter a question."
    if not api_key:
        return "Error: Please provide a Qwen API key."
    if not layout_info:
        return "Error: No layout information available. Detect layout first."

    # Resolve the model ID before the comparatively expensive image
    # encoding, so an invalid selection fails fast.
    model_id = QWEN_MODELS.get(model_name)
    if not model_id:
        return "Error: Invalid Qwen model selected."

    try:
        # Encode image to base64
        base64_image = encode_image(image)

        # Initialize OpenAI client for Qwen API
        client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)

        # Construct system prompt with layout info
        system_prompt = f"""You are an assistant specialized in document layout analysis.
The following layout elements were detected in the image (confidence >= 0.5):
{layout_info}

Use this information and the image to answer layout-related questions."""

        # Prepare API request messages
        messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                    {"type": "text", "text": question},
                ],
            },
        ]

        # Call Qwen API
        completion = client.chat.completions.create(model=model_id, messages=messages)
        return completion.choices[0].message.content

    except Exception as e:
        return f"Error during QA: {str(e)}"
170
 
171
# Build Gradio interface
with gr.Blocks(title="Latex2Layout QA System") as demo:
    gr.Markdown("# Latex2Layout QA System")
    gr.Markdown("Upload an image to detect layout elements and ask questions about the layout using Qwen models.")

    # Row 1: image upload on the left, detection output on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="numpy")
            detect_btn = gr.Button("Detect Layout")
            gr.Markdown("**Tip**: Use clear images for best results.")

        with gr.Column(scale=1):
            output_image = gr.Image(label="Detected Layout")
            layout_info = gr.Textbox(label="Layout Information", lines=10, interactive=False)

    # Row 2: API credentials / question on the left, answer on the right.
    with gr.Row():
        with gr.Column(scale=1):
            api_key_input = gr.Textbox(
                label="Qwen API Key",
                placeholder="Enter your Qwen API key",
                type="password"
            )
            model_select = gr.Dropdown(
                label="Select Qwen Model",
                choices=list(QWEN_MODELS.keys()),
                value="Qwen2.5-VL-3B-Instruct"
            )
            question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
            qa_btn = gr.Button("Ask Question")

        with gr.Column(scale=1):
            answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Event handlers: wire the buttons to the detection and QA functions.
    detect_btn.click(
        fn=detect_layout,
        inputs=[input_image],
        outputs=[output_image, layout_info]
    )
    qa_btn.click(
        fn=qa_about_layout,
        inputs=[input_image, question_input, layout_info, api_key_input, model_select],
        outputs=[answer_output]
    )
215