ChaseHan commited on
Commit
55866d0
·
verified ·
1 Parent(s): 6a41fcf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -19
app.py CHANGED
@@ -30,6 +30,12 @@ QWEN_MODELS = {
30
  "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
31
  }
32
 
 
 
 
 
 
 
33
  def encode_image(image_array):
34
  """
35
  Convert a numpy array image to a base64-encoded string.
@@ -74,49 +80,70 @@ def detect_layout(image, confidence_threshold=0.5):
74
  # Process detections
75
  for box in result.boxes:
76
  conf = float(box.conf[0])
77
- # Filter out detections below the confidence threshold
78
  if conf < confidence_threshold:
79
  continue
80
 
81
- # Extract and convert bounding box coordinates
82
  x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
83
  cls_id = int(box.cls[0])
84
  cls_name = result.names[cls_id]
85
 
86
- # Assign a random color for visualization
87
  color = tuple(np.random.randint(0, 255, 3).tolist())
88
-
89
- # Draw bounding box and label
90
  cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
91
  label = f"{cls_name} {conf:.2f}"
92
  (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
93
  cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
94
  cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
95
 
96
- # Store layout information
97
  layout_info.append({
98
  "bbox": [x1, y1, x2, y2],
99
  "class": cls_name,
100
  "confidence": conf
101
  })
102
 
103
- # Format layout info as JSON string
104
  layout_info_str = json.dumps(layout_info, indent=2) if layout_info else "No layout elements detected with confidence >= 0.5."
105
  return annotated_image, layout_info_str
106
 
107
  except Exception as e:
108
  return None, f"Error during layout detection: {str(e)}"
109
 
110
- def qa_about_layout(image, question, layout_info, api_key, model_name):
111
  """
112
- Answer layout-related questions using the Qwen API.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  Args:
115
  image: Uploaded image as a numpy array.
116
  question: User's question about the layout.
117
  layout_info: JSON string of layout detection results.
118
  api_key: User's Qwen API key.
119
- model_name: Selected Qwen model name from dropdown.
 
120
 
121
  Returns:
122
  str: Qwen's response to the question.
@@ -139,16 +166,12 @@ def qa_about_layout(image, question, layout_info, api_key, model_name):
139
  if not model_id:
140
  return "Error: Invalid Qwen model selected."
141
 
 
 
 
142
  # Initialize OpenAI client for Qwen API
143
  client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
144
 
145
- # Construct system prompt with layout info
146
- system_prompt = f"""You are an assistant specialized in document layout analysis.
147
- The following layout elements were detected in the image (confidence >= 0.5):
148
- {layout_info}
149
-
150
- Use this information and the image to answer layout-related questions."""
151
-
152
  # Prepare API request messages
153
  messages = [
154
  {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
@@ -171,12 +194,13 @@ Use this information and the image to answer layout-related questions."""
171
  # Build Gradio interface
172
  with gr.Blocks(title="Latex2Layout QA System") as demo:
173
  gr.Markdown("# Latex2Layout QA System")
174
- gr.Markdown("Upload an image to detect layout elements and ask questions about the layout using Qwen models.")
175
 
176
  with gr.Row():
177
  with gr.Column(scale=1):
178
  input_image = gr.Image(label="Upload Image", type="numpy")
179
  detect_btn = gr.Button("Detect Layout")
 
180
  gr.Markdown("**Tip**: Use clear images for best results.")
181
 
182
  with gr.Column(scale=1):
@@ -195,6 +219,13 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
195
  choices=list(QWEN_MODELS.keys()),
196
  value="Qwen2.5-VL-3B-Instruct"
197
  )
 
 
 
 
 
 
 
198
  question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
199
  qa_btn = gr.Button("Ask Question")
200
 
@@ -207,9 +238,14 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
207
  inputs=[input_image],
208
  outputs=[output_image, layout_info]
209
  )
 
 
 
 
 
210
  qa_btn.click(
211
  fn=qa_about_layout,
212
- inputs=[input_image, question_input, layout_info, api_key_input, model_select],
213
  outputs=[answer_output]
214
  )
215
 
 
30
  "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
31
  }
32
 
33
+ # Default system prompt template
34
+ default_system_prompt = """You are an assistant specialized in document layout analysis.
35
+ The following layout elements were detected in the image (confidence >= 0.5):
36
+ {layout_info}
37
+ Use this information and the image to answer layout-related questions."""
38
+
39
  def encode_image(image_array):
40
  """
41
  Convert a numpy array image to a base64-encoded string.
 
80
  # Process detections
81
  for box in result.boxes:
82
  conf = float(box.conf[0])
 
83
  if conf < confidence_threshold:
84
  continue
85
 
 
86
  x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
87
  cls_id = int(box.cls[0])
88
  cls_name = result.names[cls_id]
89
 
 
90
  color = tuple(np.random.randint(0, 255, 3).tolist())
 
 
91
  cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
92
  label = f"{cls_name} {conf:.2f}"
93
  (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
94
  cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
95
  cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
96
 
 
97
  layout_info.append({
98
  "bbox": [x1, y1, x2, y2],
99
  "class": cls_name,
100
  "confidence": conf
101
  })
102
 
 
103
  layout_info_str = json.dumps(layout_info, indent=2) if layout_info else "No layout elements detected with confidence >= 0.5."
104
  return annotated_image, layout_info_str
105
 
106
  except Exception as e:
107
  return None, f"Error during layout detection: {str(e)}"
108
 
109
def detect_example_image():
    """
    Run layout detection on the bundled example image (./image1.png).

    Returns:
        tuple: (original_rgb, annotated, layout_json)
            - original_rgb: the example image converted to RGB, or None on error.
            - annotated: the detection-annotated image, or None on error.
            - layout_json: JSON string of detections, or an error message.
    """
    path = "./image1.png"

    # Guard: bail out early when the example asset is not shipped alongside the app.
    if not os.path.exists(path):
        return None, None, "Error: Example image not found."

    try:
        raw = cv2.imread(path)  # OpenCV decodes to BGR channel order
        if raw is None:
            return None, None, "Error: Failed to load example image."
        image_rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)

        # Delegate to the shared detector used by the upload path.
        annotated, layout_json = detect_layout(image_rgb)
        return image_rgb, annotated, layout_json
    except Exception as e:
        return None, None, f"Error processing example image: {str(e)}"
135
+
136
+ def qa_about_layout(image, question, layout_info, api_key, model_name, system_prompt_template):
137
+ """
138
+ Answer layout-related questions using the Qwen API with an editable system prompt.
139
 
140
  Args:
141
  image: Uploaded image as a numpy array.
142
  question: User's question about the layout.
143
  layout_info: JSON string of layout detection results.
144
  api_key: User's Qwen API key.
145
+ model_name: Selected Qwen model name.
146
+ system_prompt_template: Editable system prompt template.
147
 
148
  Returns:
149
  str: Qwen's response to the question.
 
166
  if not model_id:
167
  return "Error: Invalid Qwen model selected."
168
 
169
+ # Replace placeholder in system prompt with layout info
170
+ system_prompt = system_prompt_template.replace("{layout_info}", layout_info)
171
+
172
  # Initialize OpenAI client for Qwen API
173
  client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
174
 
 
 
 
 
 
 
 
175
  # Prepare API request messages
176
  messages = [
177
  {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
 
194
  # Build Gradio interface
195
  with gr.Blocks(title="Latex2Layout QA System") as demo:
196
  gr.Markdown("# Latex2Layout QA System")
197
+ gr.Markdown("Upload an image or use the example to detect layout elements and ask questions using Qwen models.")
198
 
199
  with gr.Row():
200
  with gr.Column(scale=1):
201
  input_image = gr.Image(label="Upload Image", type="numpy")
202
  detect_btn = gr.Button("Detect Layout")
203
+ example_btn = gr.Button("Detect Example Image")
204
  gr.Markdown("**Tip**: Use clear images for best results.")
205
 
206
  with gr.Column(scale=1):
 
219
  choices=list(QWEN_MODELS.keys()),
220
  value="Qwen2.5-VL-3B-Instruct"
221
  )
222
+ gr.Markdown("**System Prompt Template**: Edit the prompt sent to Qwen. Include `{layout_info}` to insert detection results.")
223
+ system_prompt_input = gr.Textbox(
224
+ label="System Prompt Template",
225
+ value=default_system_prompt,
226
+ lines=5,
227
+ placeholder="Edit the system prompt here. Keep {layout_info} to include detection results."
228
+ )
229
  question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
230
  qa_btn = gr.Button("Ask Question")
231
 
 
238
  inputs=[input_image],
239
  outputs=[output_image, layout_info]
240
  )
241
+ example_btn.click(
242
+ fn=detect_example_image,
243
+ inputs=[],
244
+ outputs=[input_image, output_image, layout_info]
245
+ )
246
  qa_btn.click(
247
  fn=qa_about_layout,
248
+ inputs=[input_image, question_input, layout_info, api_key_input, model_select, system_prompt_input],
249
  outputs=[answer_output]
250
  )
251