Update app.py
app.py CHANGED
@@ -127,6 +127,47 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path


+def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
+    """Calls OpenAI's GPT model with the query and image data."""
+    if api_key and api_key.startswith("sk"):
+        try:
+            from openai import OpenAI
+
+            base64_images = [encode_image_to_base64(im_caption[0]) for im_caption in retrieved_images]
+            client = OpenAI(api_key=api_key.strip())
+            PROMPT = """
+You are a smart assistant designed to answer questions about a PDF document.
+You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
+If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
+Give detailed and extensive answers, only containing info in the pages you are given.
+You can answer using information contained in plots and figures if necessary.
+Answer in the same language as the query.
+Query: {query}
+PDF pages:
+""".strip()
+
+            response = client.responses.create(
+                model="gpt-5",
+                input=[
+                    {
+                        "role": "user",
+                        "content": (
+                            [{"type": "input_text", "text": PROMPT.format(query=query)}] +
+                            [{"type": "input_image",
+                              "image_url": f"data:image/jpeg;base64,{im}"}
+                             for im in base64_images]
+                        )
+                    }
+                ],
+                # max_tokens=500,
+            )
+            return response.output_text
+        except Exception as e:
+            print(e)
+            return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
+    return "Set OPENAI_API_KEY in your environment to get a custom response."
+
+
 # =============================
 # Local Search (ColPali)
 # =============================
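Note: query_gpt depends on an encode_image_to_base64 helper defined earlier in app.py, outside this diff. A minimal sketch of what such a helper typically looks like, assuming PIL page images and the data:image/jpeg;base64 URLs used above (illustrative, not the Space's actual code):

    import base64
    import io

    from PIL import Image

    def encode_image_to_base64(image: Image.Image) -> str:
        # Serialize the page image to an in-memory JPEG, then base64-encode it
        # so it can be inlined as a data:image/jpeg;base64,... URL.
        buffer = io.BytesIO()
        image.convert("RGB").save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")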
@@ -169,6 +210,45 @@ def search(query: str, k: int = 5) -> List[int]:
     return top_k_indices


+def search_synthetize(query: str, k: int = 5) -> str:
+    """
+    Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
+    MCP tool description:
+    - name: mcp_test_search_synthetize
+    - description: Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
+    - input_schema:
+        type: object
+        properties:
+            query: {type: string, description: "User query in natural language."}
+            k: {type: integer, minimum: 1, maximum: 20, default: 5, description: "Number of top pages to retrieve."}
+        required: ["query"]
+    Args:
+        query (str): Natural-language question to search for.
+        k (int): Number of top results to return (1–20).
+    Returns:
+        ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
+    """
+    top_k_indices = search(query, k)
+
+    expanded = set(top_k_indices)
+    for i in top_k_indices:
+        expanded.add(i - 1)
+        expanded.add(i + 1)
+    expanded = {i for i in expanded if 0 <= i < len(images)}
+    expanded = sorted(expanded)
+
+    # Build gallery results with 1-based page numbering
+    results = []
+    for idx in expanded:
+        page_num = idx + 1
+        results.append((images[idx], f"Page {page_num}"))
+
+    # Generate grounded response
+    ai_response = query_gpt(query, results)
+    print("[search_synthetize]", ai_response)
+    return ai_response
+
+
 def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
     """Turn page indices into OpenAI vision content parts."""
     parts: List[Dict[str, Any]] = []
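Note: the docstring doubles as the MCP tool description that the Gradio MCP server publishes, so the schema block above is what remote agents see. A quick local sanity check, assuming a PDF has already been indexed so the module-level images list and ColPali index are populated (the query text is illustrative):

    # Top-3 hits plus their ±1 neighbors are attached, then query_gpt
    # synthesizes a grounded answer with page-number citations.
    answer = search_synthetize("What datasets are used for evaluation?", k=3)
    print(answer)

Because every hit is expanded with its immediate neighbors, k=3 can attach up to 9 pages.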
@@ -186,7 +266,9 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 # Agent System Prompt
 # =============================

-SYSTEM = (
+
+
+SYSTEM1 = (
 """
 You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
 Act iteratively:
@@ -205,12 +287,31 @@ Deliverable:
 ).strip()


+SYSTEM2 = """
+You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
+Act iteratively:
+1) Split the user question into 1–4 focused sub-queries. Sub-queries should be phrased as natural-language questions in English, not just keywords.
+2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
+3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
+
+Grounding & citations:
+• Use ONLY information from retrieved pages.
+• After any claim, cite the page as (p.<page>).
+• If an answer is not present, say “Not found in the provided pages.”
+
+Final deliverable (must be clear and standalone):
+• Write a detailed answer in Markdown that directly addresses the user request, in the language of the request.
+• If dates or items are requested, include a concise table with the requested fields.
+• Do not refer to “the above” or “previous messages”.
+"""
+
+
 # =============================
 # MCP config (search-only)
 # =============================
+VISUAL_REASONING = True
 DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
 DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
-DEFAULT_ALLOWED_TOOLS = "mcp_test_search"  # search-only; no get_pages


 # =============================
@@ -222,8 +323,7 @@ def stream_agent(question: str,
                  model_name: str,
                  server_url: str,
                  server_label: str,
-                 require_approval: str,
-                 allowed_tools: str):
+                 visual_reasoning: str):
     """
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
@@ -231,6 +331,10 @@ def stream_agent(question: str,
       • If the model calls mcp_test_search and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
+    visual_reasoning = visual_reasoning == "Visual Reasoning"
+    allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
+    SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2
+
     if not api_key:
         yield "⚠️ **Please provide your OpenAI API key.**", "", ""
         return
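Note: these three added lines are the core of the commit: the dropdown string now selects both the allowed MCP tool and the system prompt. (The module-level VISUAL_REASONING flag added above appears unused in the hunks shown.) A table-driven equivalent of the same branching, offered as an illustrative refactor rather than part of the commit:

    # Hypothetical rewrite: one mapping instead of three conditionals.
    MODES = {
        "Visual Reasoning": ("mcp_test_search", SYSTEM1),
        "Vision Summary": ("mcp_test_search_synthetize", SYSTEM2),
    }
    allowed_tools, SYSTEM = MODES.get(visual_reasoning, MODES["Vision Summary"])
    visual_reasoning = visual_reasoning == "Visual Reasoning"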
@@ -243,7 +347,7 @@ def stream_agent(question: str,

     # Optional seeding: attach some likely pages on round 1
     try:
-        seed_indices = search(question, k=5)
+        seed_indices = search(question, k=5) if visual_reasoning else []
     except Exception as e:
         yield f"❌ Search failed: {e}", "", ""
         return
@@ -256,8 +360,8 @@ def stream_agent(question: str,
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
         "server_url": server_url or DEFAULT_MCP_SERVER_URL,
-        "allowed_tools": [
-        "require_approval": require_approval,
+        "allowed_tools": [allowed_tools],
+        "require_approval": "never",
     }]

     # Shared mutable state for each round
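Note: with the defaults above, each Responses API call now carries a hosted-MCP tool block shaped like this (values inlined for clarity; the allowed_tools entry depends on the selected mode):

    tools = [{
        "type": "mcp",
        "server_label": "colpali_rag",
        "server_url": "https://manu-mcp-test.hf.space/gradio_api/mcp/",
        "allowed_tools": ["mcp_test_search"],  # or ["mcp_test_search_synthetize"]
        "require_approval": "never",           # now hard-coded; the UI dropdown is removed below
    }]

Hard-coding require_approval to "never" lets tool calls execute without surfacing approval requests mid-stream.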
@@ -282,7 +386,7 @@ def stream_agent(question: str,
             if round_idx == 1:
                 parts.append({"type": "input_text", "text": question})
             else:
-                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you
+                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})

             parts += _build_image_parts_from_indices(attached_indices)
             if attached_indices:
@@ -392,7 +496,7 @@ def stream_agent(question: str,
                     expanded.add(i - 1)
                     expanded.add(i + 1)
                 expanded = {i for i in expanded if 0 <= i < len(images)}
-                pending_indices = sorted(expanded)
+                pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
                 round_idx += 1
                 continue
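Note: the new guard bounds how many pages are re-attached in the next round: if the ±1 neighbor expansion reaches 15 or more pages, it falls back to the unexpanded indices (base, the pages the model actually requested). Illustrative values, assuming requested pages whose neighborhoods do not overlap:

    base = [2, 9, 16, 23, 30, 37]
    expanded = sorted({j for i in base for j in (i - 1, i, i + 1)})
    # len(expanded) == 18, so the guard falls back to sorted(base)
    pending = expanded if len(expanded) < 15 else sorted(base)

This keeps the vision payload (one image per attached page) from growing unboundedly across rounds.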
@@ -565,14 +669,10 @@ def build_ui():
                     value=DEFAULT_MCP_SERVER_LABEL,
                 )
                 with gr.Row():
-                    allowed_tools_box = gr.Textbox(
-                        label="Allowed Tools",
-                        value=DEFAULT_ALLOWED_TOOLS,
-                    )
-                    require_approval_box = gr.Dropdown(
-                        label="Require Approval",
-                        choices=["never", "auto", "always"],
-                        value="never",
+                    visual_reasoning_box = gr.Dropdown(
+                        label="Visual Reasoning",
+                        choices=["Visual Reasoning", "Vision Summary"],
+                        value="Visual Reasoning",
                     )

             with gr.Column(scale=3):
@@ -593,6 +693,7 @@ def build_ui():
                 server_label_box,
                 require_approval_box,
                 allowed_tools_box,
+                visual_reasoning_box
             ],
             outputs=[final_md, summary_md, log_md],
         )
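Note: as rendered, the click() inputs still pass require_approval_box and allowed_tools_box even though the hunk above removes those widgets and the new stream_agent signature no longer accepts them; unless both boxes are still defined elsewhere, this wiring would raise a NameError when the UI builds. A consistent inputs list would look like this (the first three box names are assumptions, since they do not appear in the diff):

    inputs=[
        question_box,      # assumed name
        api_key_box,       # assumed name
        model_box,         # assumed name
        server_url_box,
        server_label_box,
        visual_reasoning_box,
    ],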