manu committed on
Commit
6e4c2c5
·
verified ·
1 Parent(s): 06ea901

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -17
app.py CHANGED
@@ -127,6 +127,47 @@ def index_from_url(url: str) -> Tuple[str, str]:
127
  return status, local_path
128
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  # =============================
131
  # Local Search (ColPali)
132
  # =============================
@@ -169,6 +210,45 @@ def search(query: str, k: int = 5) -> List[int]:
169
  return top_k_indices
170
 
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
173
  """Turn page indices into OpenAI vision content parts."""
174
  parts: List[Dict[str, Any]] = []
@@ -186,7 +266,9 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
186
  # Agent System Prompt
187
  # =============================
188
 
189
- SYSTEM = (
 
 
190
  """
191
  You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
192
  Act iteratively:
@@ -205,12 +287,31 @@ Deliverable:
205
  ).strip()
206
 
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  # =============================
209
  # MCP config (search-only)
210
  # =============================
 
211
  DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
212
  DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
213
- DEFAULT_ALLOWED_TOOLS = "mcp_test_search" # search-only; no get_pages
214
 
215
 
216
  # =============================
@@ -222,8 +323,7 @@ def stream_agent(question: str,
222
  model_name: str,
223
  server_url: str,
224
  server_label: str,
225
- require_approval: str,
226
- allowed_tools: str):
227
  """
228
  Multi-round streaming:
229
  • Seed: optional local ColPali search on the user question to attach initial pages.
@@ -231,6 +331,10 @@ def stream_agent(question: str,
231
  • If the model calls mcp_test_search and returns indices, we end the stream and
232
  start a NEW API call with previous_response_id + the requested pages attached.
233
  """
 
 
 
 
234
  if not api_key:
235
  yield "⚠️ **Please provide your OpenAI API key.**", "", ""
236
  return
@@ -243,7 +347,7 @@ def stream_agent(question: str,
243
 
244
  # Optional seeding: attach some likely pages on round 1
245
  try:
246
- seed_indices = search(question, k=5) or []
247
  except Exception as e:
248
  yield f"❌ Search failed: {e}", "", ""
249
  return
@@ -256,8 +360,8 @@ def stream_agent(question: str,
256
  "type": "mcp",
257
  "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
258
  "server_url": server_url or DEFAULT_MCP_SERVER_URL,
259
- "allowed_tools": [t.strip() for t in (allowed_tools or DEFAULT_ALLOWED_TOOLS).split(",") if t.strip()],
260
- "require_approval": require_approval or "never",
261
  }]
262
 
263
  # Shared mutable state for each round
@@ -282,7 +386,7 @@ def stream_agent(question: str,
282
  if round_idx == 1:
283
  parts.append({"type": "input_text", "text": question})
284
  else:
285
- parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you can ask further questions to the search tool."})
286
 
287
  parts += _build_image_parts_from_indices(attached_indices)
288
  if attached_indices:
@@ -392,7 +496,7 @@ def stream_agent(question: str,
392
  expanded.add(i - 1)
393
  expanded.add(i + 1)
394
  expanded = {i for i in expanded if 0 <= i < len(images)}
395
- pending_indices = sorted(expanded)
396
  round_idx += 1
397
  continue
398
 
@@ -565,14 +669,10 @@ def build_ui():
565
  value=DEFAULT_MCP_SERVER_LABEL,
566
  )
567
  with gr.Row():
568
- allowed_tools_box = gr.Textbox(
569
- label="Allowed Tools (comma-separated)",
570
- value=DEFAULT_ALLOWED_TOOLS,
571
- )
572
- require_approval_box = gr.Dropdown(
573
- label="Require Approval",
574
- choices=["never", "auto", "always"],
575
- value="never",
576
  )
577
 
578
  with gr.Column(scale=3):
@@ -593,6 +693,7 @@ def build_ui():
593
  server_label_box,
594
  require_approval_box,
595
  allowed_tools_box,
 
596
  ],
597
  outputs=[final_md, summary_md, log_md],
598
  )
 
127
  return status, local_path
128
 
129
 
130
def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
    """Answer *query* with GPT, grounded in the retrieved PDF page images.

    Args:
        query: Natural-language question from the user.
        retrieved_images: (image, caption) pairs for the pages to ground the
            answer on; only the image part of each pair is sent to the model.

    Returns:
        The model's text answer; otherwise a human-readable fallback message
        when the module-level ``api_key`` is missing/invalid, or an error
        message when the API call fails.
    """
    # `api_key` is a module-level global; the "sk" prefix test is only a
    # cheap sanity check, not real validation.
    if not (api_key and api_key.startswith("sk")):
        return "Set OPENAI_API_KEY in your environment to get a custom response."
    try:
        from openai import OpenAI

        # Only the image of each (image, caption) pair is encoded and sent.
        base64_images = [encode_image_to_base64(image) for image, _caption in retrieved_images]
        client = OpenAI(api_key=api_key.strip())
        prompt = """
You are a smart assistant designed to answer questions about a PDF document.
You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
Give detailed and extensive answers, only containing info in the pages you are given.
You can answer using information contained in plots and figures if necessary.
Answer in the same language as the query.
Query: {query}
PDF pages:
""".strip()

        response = client.responses.create(
            model="gpt-5",
            input=[
                {
                    "role": "user",
                    # One text part with the formatted prompt, followed by one
                    # input_image part per retrieved page (as a data URL).
                    "content": (
                        [{"type": "input_text", "text": prompt.format(query=query)}]
                        + [
                            {
                                "type": "input_image",
                                "image_url": f"data:image/jpeg;base64,{im}",
                            }
                            for im in base64_images
                        ]
                    ),
                }
            ],
        )
        return response.output_text
    except Exception as e:
        # Best-effort: surface a friendly message instead of crashing the UI.
        print(e)
        return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
169
+
170
+
171
  # =============================
172
  # Local Search (ColPali)
173
  # =============================
 
210
  return top_k_indices
211
 
212
 
213
def search_synthetize(query: str, k: int = 5) -> str:
    """
    Search within a PDF document for the most relevant pages to answer a query and synthesizes a short grounded answer using only those pages.
    MCP tool description:
    - name: mcp_test_search_synthetize
    - description: Search within a PDF document for the most relevant pages to answer a query and synthesizes a short grounded answer using only those pages.
    - input_schema:
        type: object
        properties:
            query: {type: string, description: "User query in natural language."}
            k: {type: integer, minimum: 1, maximum: 20, default: 5, description: "Number of top pages to retrieve."}
        required: ["query"]
    Args:
        query (str): Natural-language question to search for.
        k (int): Number of top results to return (1–20).
    Returns:
        ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
    """
    top_k_indices = search(query, k)

    # BUG FIX: the expansion loop previously iterated over an undefined name
    # `base`, raising NameError on every call; it must iterate the hit list.
    # Each hit is padded with its neighbouring pages for extra context, then
    # clamped to the valid page range.
    expanded = set(top_k_indices)
    for i in top_k_indices:
        expanded.add(i - 1)
        expanded.add(i + 1)
    expanded = sorted(i for i in expanded if 0 <= i < len(images))

    # Build gallery-style (image, caption) results with 1-based page numbers.
    results = [(images[idx], f"Page {idx + 1}") for idx in expanded]

    # Synthesize the grounded response from the selected pages.
    ai_response = query_gpt(query, results)
    print("[search_synthetize]", ai_response)
    return ai_response
250
+
251
+
252
  def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
253
  """Turn page indices into OpenAI vision content parts."""
254
  parts: List[Dict[str, Any]] = []
 
266
  # Agent System Prompt
267
  # =============================
268
 
269
+
270
+
271
+ SYSTEM1 = (
272
  """
273
  You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
274
  Act iteratively:
 
287
  ).strip()
288
 
289
 
290
# System prompt for the "Vision Summary" mode: the agent delegates page
# reading to the MCP tool `mcp_test_search_synthetize`, which returns a text
# answer (the agent itself never sees page images in this mode).
# NOTE: this string is sent verbatim to the model — do not reword casually.
SYSTEM2 = """
You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
Act iteratively:
1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions in the english language, not just keywords.
2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.

Grounding & citations:
• Use ONLY information from retrieved pages.
• After any claim, cite the page as (p.<page>).
• If an answer is not present, say “Not found in the provided pages.”

Final deliverable (must be clear and standalone):
• Write a detailed answer in Markdown that directly addresses the user request in the request language.
• If dates or items are requested, include a concise table with the requested fields.
• Do not refer to “the above” or “previous messages”.
"""
307
+
308
+
309
  # =============================
310
  # MCP config (search-only)
311
  # =============================
312
+ VISUAL_REASONING = True
313
  DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
314
  DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
 
315
 
316
 
317
  # =============================
 
323
  model_name: str,
324
  server_url: str,
325
  server_label: str,
326
+ visual_reasoning: str):
 
327
  """
328
  Multi-round streaming:
329
  • Seed: optional local ColPali search on the user question to attach initial pages.
 
331
  • If the model calls mcp_test_search and returns indices, we end the stream and
332
  start a NEW API call with previous_response_id + the requested pages attached.
333
  """
334
+ visual_reasoning = True if visual_reasoning=="Visual Reasoning" else False
335
+ allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
336
+ SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
337
+
338
  if not api_key:
339
  yield "⚠️ **Please provide your OpenAI API key.**", "", ""
340
  return
 
347
 
348
  # Optional seeding: attach some likely pages on round 1
349
  try:
350
+ seed_indices = [] if visual_reasoning is False else search(question, k=5)
351
  except Exception as e:
352
  yield f"❌ Search failed: {e}", "", ""
353
  return
 
360
  "type": "mcp",
361
  "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
362
  "server_url": server_url or DEFAULT_MCP_SERVER_URL,
363
+ "allowed_tools": [allowed_tools],
364
+ "require_approval": "never",
365
  }]
366
 
367
  # Shared mutable state for each round
 
386
  if round_idx == 1:
387
  parts.append({"type": "input_text", "text": question})
388
  else:
389
+ parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
390
 
391
  parts += _build_image_parts_from_indices(attached_indices)
392
  if attached_indices:
 
496
  expanded.add(i - 1)
497
  expanded.add(i + 1)
498
  expanded = {i for i in expanded if 0 <= i < len(images)}
499
+ pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
500
  round_idx += 1
501
  continue
502
 
 
669
  value=DEFAULT_MCP_SERVER_LABEL,
670
  )
671
  with gr.Row():
672
+ visual_reasoning_box = gr.Dropdown(
673
+ label="Visual Reasoning",
674
+ choices=["Visual Reasoning", "Vision Summary"],
675
+ value="Visual Reasoning",
 
 
 
 
676
  )
677
 
678
  with gr.Column(scale=3):
 
693
  server_label_box,
694
  require_approval_box,
695
  allowed_tools_box,
696
+ visual_reasoning_box
697
  ],
698
  outputs=[final_md, summary_md, log_md],
699
  )