Update app.py
app.py CHANGED
@@ -30,6 +30,12 @@ QWEN_MODELS = {
     "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
 }
 
+# Default system prompt template
+default_system_prompt = """You are an assistant specialized in document layout analysis.
+The following layout elements were detected in the image (confidence >= 0.5):
+{layout_info}
+Use this information and the image to answer layout-related questions."""
+
 def encode_image(image_array):
     """
     Convert a numpy array image to a base64-encoded string.
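For reference, the `{layout_info}` slot in this new template receives the JSON string that `detect_layout` produces. A minimal sketch of what the rendered prompt looks like, using an illustrative detection record (the values are made up, but the structure matches what `detect_layout` appends):

```python
import json

# Illustrative detection in the same shape detect_layout stores.
sample_layout = [{"bbox": [12, 18, 430, 64], "class": "title", "confidence": 0.93}]
layout_info_str = json.dumps(sample_layout, indent=2)

default_system_prompt = """You are an assistant specialized in document layout analysis.
The following layout elements were detected in the image (confidence >= 0.5):
{layout_info}
Use this information and the image to answer layout-related questions."""

# Same substitution the updated qa_about_layout performs.
print(default_system_prompt.replace("{layout_info}", layout_info_str))
```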
@@ -74,49 +80,70 @@ def detect_layout(image, confidence_threshold=0.5):
         # Process detections
         for box in result.boxes:
             conf = float(box.conf[0])
-            # Filter out detections below the confidence threshold
             if conf < confidence_threshold:
                 continue
 
-            # Extract and convert bounding box coordinates
             x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
             cls_id = int(box.cls[0])
             cls_name = result.names[cls_id]
 
-            # Assign a random color for visualization
             color = tuple(np.random.randint(0, 255, 3).tolist())
-
-            # Draw bounding box and label
             cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
             label = f"{cls_name} {conf:.2f}"
             (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
             cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
             cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
 
-            # Store layout information
             layout_info.append({
                 "bbox": [x1, y1, x2, y2],
                 "class": cls_name,
                 "confidence": conf
             })
 
-        # Format layout info as JSON string
         layout_info_str = json.dumps(layout_info, indent=2) if layout_info else "No layout elements detected with confidence >= 0.5."
         return annotated_image, layout_info_str
 
     except Exception as e:
         return None, f"Error during layout detection: {str(e)}"
 
-def qa_about_layout(image, question, layout_info, api_key, model_name):
+def detect_example_image():
     """
-    Answer layout-related questions using the Qwen API.
+    Load and detect layout elements in the example image (./image1.png).
+
+    Returns:
+        tuple: (example_image, annotated_image, layout_info_str)
+            - example_image: Original example image.
+            - annotated_image: Annotated example image.
+            - layout_info_str: JSON string of layout detections.
+    """
+    example_image_path = "./image1.png"
+    if not os.path.exists(example_image_path):
+        return None, None, "Error: Example image not found."
+
+    try:
+        # Load image in BGR and convert to RGB
+        bgr_image = cv2.imread(example_image_path)
+        if bgr_image is None:
+            return None, None, "Error: Failed to load example image."
+        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+
+        # Run detection
+        annotated_image, layout_info_str = detect_layout(rgb_image)
+        return rgb_image, annotated_image, layout_info_str
+    except Exception as e:
+        return None, None, f"Error processing example image: {str(e)}"
+
+def qa_about_layout(image, question, layout_info, api_key, model_name, system_prompt_template):
+    """
+    Answer layout-related questions using the Qwen API with an editable system prompt.
 
     Args:
         image: Uploaded image as a numpy array.
         question: User's question about the layout.
         layout_info: JSON string of layout detection results.
         api_key: User's Qwen API key.
-        model_name: Selected Qwen model name
+        model_name: Selected Qwen model name.
+        system_prompt_template: Editable system prompt template.
 
     Returns:
         str: Qwen's response to the question.
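A note on the conversion step in `detect_example_image`: `cv2.imread` decodes images in BGR channel order, while Gradio's `type="numpy"` image components (and therefore `detect_layout`, which normally receives uploads from one) work in RGB, so the example image has to be converted before detection. A standalone sketch of the pattern, with a placeholder path:

```python
import cv2

path = "example.png"  # placeholder path
bgr = cv2.imread(path)  # cv2.imread returns None if the file cannot be read
if bgr is None:
    raise FileNotFoundError(f"could not read {path}")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)  # BGR -> RGB for downstream code
```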
@@ -139,16 +166,12 @@ def qa_about_layout(image, question, layout_info, api_key, model_name):
     if not model_id:
         return "Error: Invalid Qwen model selected."
 
+    # Replace placeholder in system prompt with layout info
+    system_prompt = system_prompt_template.replace("{layout_info}", layout_info)
+
     # Initialize OpenAI client for Qwen API
     client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
 
-    # Construct system prompt with layout info
-    system_prompt = f"""You are an assistant specialized in document layout analysis.
-The following layout elements were detected in the image (confidence >= 0.5):
-{layout_info}
-
-Use this information and the image to answer layout-related questions."""
-
     # Prepare API request messages
     messages = [
         {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
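The substitution above uses `str.replace` rather than `str.format`, presumably so that any other literal braces in a user-edited template don't break formatting. A quick sketch of the difference:

```python
# A user-edited template that happens to contain unrelated braces.
template = 'Answer as JSON like {"answer": ...}. Detections: {layout_info}'

# str.replace only touches the exact placeholder text.
print(template.replace("{layout_info}", "[]"))

# str.format would treat the other braces as fields and fail.
try:
    template.format(layout_info="[]")
except (KeyError, IndexError) as e:
    print("format() raised:", repr(e))
```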
@@ -171,12 +194,13 @@ Use this information and the image to answer layout-related questions."""
 # Build Gradio interface
 with gr.Blocks(title="Latex2Layout QA System") as demo:
     gr.Markdown("# Latex2Layout QA System")
-    gr.Markdown("Upload an image to detect layout elements and ask questions using Qwen models.")
+    gr.Markdown("Upload an image or use the example to detect layout elements and ask questions using Qwen models.")
 
     with gr.Row():
         with gr.Column(scale=1):
             input_image = gr.Image(label="Upload Image", type="numpy")
             detect_btn = gr.Button("Detect Layout")
+            example_btn = gr.Button("Detect Example Image")
             gr.Markdown("**Tip**: Use clear images for best results.")
 
         with gr.Column(scale=1):
@@ -195,6 +219,13 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
                 choices=list(QWEN_MODELS.keys()),
                 value="Qwen2.5-VL-3B-Instruct"
             )
+            gr.Markdown("**System Prompt Template**: Edit the prompt sent to Qwen. Include `{layout_info}` to insert detection results.")
+            system_prompt_input = gr.Textbox(
+                label="System Prompt Template",
+                value=default_system_prompt,
+                lines=5,
+                placeholder="Edit the system prompt here. Keep {layout_info} to include detection results."
+            )
             question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
             qa_btn = gr.Button("Ask Question")
 
@@ -207,9 +238,14 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
         inputs=[input_image],
         outputs=[output_image, layout_info]
     )
+    example_btn.click(
+        fn=detect_example_image,
+        inputs=[],
+        outputs=[input_image, output_image, layout_info]
+    )
     qa_btn.click(
         fn=qa_about_layout,
-        inputs=[input_image, question_input, layout_info, api_key_input, model_select],
+        inputs=[input_image, question_input, layout_info, api_key_input, model_select, system_prompt_input],
         outputs=[answer_output]
     )
 
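The new `example_btn.click` wiring relies on Gradio's positional mapping: with `inputs=[]` the handler takes no arguments, and its 3-tuple return value is assigned to the three `outputs` components in order, which is how the example image lands back in `input_image`. A minimal self-contained sketch with stand-in components (not the app's real ones):

```python
import gradio as gr

def load_example():
    # One return value per output component, in order.
    return "original image", "annotated image", "{}"

with gr.Blocks() as demo:
    btn = gr.Button("Load Example")
    out_original = gr.Textbox(label="Original")
    out_annotated = gr.Textbox(label="Annotated")
    out_layout = gr.Textbox(label="Layout JSON")
    btn.click(fn=load_example, inputs=[], outputs=[out_original, out_annotated, out_layout])

# demo.launch()  # uncomment to serve the demo
```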