Update app.py
app.py CHANGED
@@ -30,6 +30,12 @@ QWEN_MODELS = {
     "Qwen2.5-VL-14B-Instruct": "qwen2.5-vl-14b-instruct",
 }
 
+# Default system prompt template
+default_system_prompt = """You are an assistant specialized in document layout analysis.
+The following layout elements were detected in the image (confidence >= 0.5):
+{layout_info}
+Use this information and the image to answer layout-related questions."""
+
 def encode_image(image_array):
     """
     Convert a numpy array image to a base64-encoded string.
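For reference, the `{layout_info}` slot in this new template receives the JSON string that `detect_layout` produces. A minimal sketch of what the rendered prompt looks like, using an illustrative detection record (the values are made up, but the structure matches what `detect_layout` appends):

```python
import json

# Illustrative detection in the same shape detect_layout stores.
sample_layout = [{"bbox": [12, 18, 430, 64], "class": "title", "confidence": 0.93}]
layout_info_str = json.dumps(sample_layout, indent=2)

default_system_prompt = """You are an assistant specialized in document layout analysis.
The following layout elements were detected in the image (confidence >= 0.5):
{layout_info}
Use this information and the image to answer layout-related questions."""

# Same substitution the updated qa_about_layout performs.
print(default_system_prompt.replace("{layout_info}", layout_info_str))
```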
@@ -74,49 +80,70 @@ def detect_layout(image, confidence_threshold=0.5):
         # Process detections
         for box in result.boxes:
             conf = float(box.conf[0])
-            # Filter out detections below the confidence threshold
             if conf < confidence_threshold:
                 continue
 
-            # Extract and convert bounding box coordinates
             x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
             cls_id = int(box.cls[0])
             cls_name = result.names[cls_id]
 
-            # Assign a random color for visualization
             color = tuple(np.random.randint(0, 255, 3).tolist())
-
-            # Draw bounding box and label
             cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2)
             label = f"{cls_name} {conf:.2f}"
             (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
             cv2.rectangle(annotated_image, (x1, y1 - label_height - 5), (x1 + label_width, y1), color, -1)
             cv2.putText(annotated_image, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
 
-            # Store layout information
             layout_info.append({
                 "bbox": [x1, y1, x2, y2],
                 "class": cls_name,
                 "confidence": conf
             })
 
-        # Format layout info as JSON string
         layout_info_str = json.dumps(layout_info, indent=2) if layout_info else "No layout elements detected with confidence >= 0.5."
         return annotated_image, layout_info_str
 
     except Exception as e:
         return None, f"Error during layout detection: {str(e)}"
 
-def qa_about_layout(image, question, layout_info, api_key, model_name):
+def detect_example_image():
     """
-    Answer layout-related questions using the Qwen API.
+    Load and detect layout elements in the example image (./image1.png).
+
+    Returns:
+        tuple: (example_image, annotated_image, layout_info_str)
+            - example_image: Original example image.
+            - annotated_image: Annotated example image.
+            - layout_info_str: JSON string of layout detections.
+    """
+    example_image_path = "./image1.png"
+    if not os.path.exists(example_image_path):
+        return None, None, "Error: Example image not found."
+
+    try:
+        # Load image in BGR and convert to RGB
+        bgr_image = cv2.imread(example_image_path)
+        if bgr_image is None:
+            return None, None, "Error: Failed to load example image."
+        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+
+        # Run detection
+        annotated_image, layout_info_str = detect_layout(rgb_image)
+        return rgb_image, annotated_image, layout_info_str
+    except Exception as e:
+        return None, None, f"Error processing example image: {str(e)}"
+
+def qa_about_layout(image, question, layout_info, api_key, model_name, system_prompt_template):
+    """
+    Answer layout-related questions using the Qwen API with an editable system prompt.
 
     Args:
         image: Uploaded image as a numpy array.
         question: User's question about the layout.
         layout_info: JSON string of layout detection results.
         api_key: User's Qwen API key.
-        model_name: Selected Qwen model name
+        model_name: Selected Qwen model name.
+        system_prompt_template: Editable system prompt template.
 
     Returns:
         str: Qwen's response to the question.
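A note on the conversion step in `detect_example_image`: `cv2.imread` decodes images in BGR channel order, while Gradio's `type="numpy"` image components (and therefore `detect_layout`, which normally receives uploads from one) work in RGB, so the example image has to be converted before detection. A standalone sketch of the pattern, with a placeholder path:

```python
import cv2

path = "example.png"  # placeholder path
bgr = cv2.imread(path)  # cv2.imread returns None if the file cannot be read
if bgr is None:
    raise FileNotFoundError(f"could not read {path}")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)  # BGR -> RGB for downstream code
```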
@@ -139,16 +166,12 @@ def qa_about_layout(image, question, layout_info, api_key, model_name):
     if not model_id:
         return "Error: Invalid Qwen model selected."
 
+    # Replace placeholder in system prompt with layout info
+    system_prompt = system_prompt_template.replace("{layout_info}", layout_info)
+
     # Initialize OpenAI client for Qwen API
     client = OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
 
-    # Construct system prompt with layout info
-    system_prompt = f"""You are an assistant specialized in document layout analysis.
-The following layout elements were detected in the image (confidence >= 0.5):
-{layout_info}
-
-Use this information and the image to answer layout-related questions."""
-
     # Prepare API request messages
     messages = [
         {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
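The substitution above uses `str.replace` rather than `str.format`, presumably so that any other literal braces in a user-edited template don't break formatting. A quick sketch of the difference:

```python
# A user-edited template that happens to contain unrelated braces.
template = 'Answer as JSON like {"answer": ...}. Detections: {layout_info}'

# str.replace only touches the exact placeholder text.
print(template.replace("{layout_info}", "[]"))

# str.format would treat the other braces as fields and fail.
try:
    template.format(layout_info="[]")
except (KeyError, IndexError) as e:
    print("format() raised:", repr(e))
```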
@@ -171,12 +194,13 @@ Use this information and the image to answer layout-related questions."""
 # Build Gradio interface
 with gr.Blocks(title="Latex2Layout QA System") as demo:
     gr.Markdown("# Latex2Layout QA System")
-    gr.Markdown("Upload an image to detect layout elements and ask questions using Qwen models.")
+    gr.Markdown("Upload an image or use the example to detect layout elements and ask questions using Qwen models.")
 
     with gr.Row():
         with gr.Column(scale=1):
             input_image = gr.Image(label="Upload Image", type="numpy")
             detect_btn = gr.Button("Detect Layout")
+            example_btn = gr.Button("Detect Example Image")
             gr.Markdown("**Tip**: Use clear images for best results.")
 
         with gr.Column(scale=1):
@@ -195,6 +219,13 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
                 choices=list(QWEN_MODELS.keys()),
                 value="Qwen2.5-VL-3B-Instruct"
             )
+            gr.Markdown("**System Prompt Template**: Edit the prompt sent to Qwen. Include `{layout_info}` to insert detection results.")
+            system_prompt_input = gr.Textbox(
+                label="System Prompt Template",
+                value=default_system_prompt,
+                lines=5,
+                placeholder="Edit the system prompt here. Keep {layout_info} to include detection results."
+            )
             question_input = gr.Textbox(label="Ask About the Layout", placeholder="e.g., 'Where is the heading?'")
             qa_btn = gr.Button("Ask Question")
 
@@ -207,9 +238,14 @@ with gr.Blocks(title="Latex2Layout QA System") as demo:
         inputs=[input_image],
         outputs=[output_image, layout_info]
     )
+    example_btn.click(
+        fn=detect_example_image,
+        inputs=[],
+        outputs=[input_image, output_image, layout_info]
+    )
     qa_btn.click(
         fn=qa_about_layout,
-        inputs=[input_image, question_input, layout_info, api_key_input, model_select],
+        inputs=[input_image, question_input, layout_info, api_key_input, model_select, system_prompt_input],
         outputs=[answer_output]
     )
 
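The new `example_btn.click` wiring relies on Gradio's positional mapping: with `inputs=[]` the handler takes no arguments, and its 3-tuple return value is assigned to the three `outputs` components in order, which is how the example image lands back in `input_image`. A minimal self-contained sketch with stand-in components (not the app's real ones):

```python
import gradio as gr

def load_example():
    # One return value per output component, in order.
    return "original image", "annotated image", "{}"

with gr.Blocks() as demo:
    btn = gr.Button("Load Example")
    out_original = gr.Textbox(label="Original")
    out_annotated = gr.Textbox(label="Annotated")
    out_layout = gr.Textbox(label="Layout JSON")
    btn.click(fn=load_example, inputs=[], outputs=[out_original, out_annotated, out_layout])

# demo.launch()  # uncomment to serve the demo
```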