manu committed (verified)
Commit 82b43ed · 1 parent: bf520fb

Update app.py

Files changed (1)
  1. app.py +136 -81
app.py CHANGED
@@ -125,6 +125,18 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path
 
 
+def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
+    """Turn page indices into OpenAI vision content parts."""
+    parts: List[Dict[str, Any]] = []
+    seen = sorted({i for i in indices if 0 <= i < len(images)})
+    for idx in seen:
+        b64 = encode_image_to_base64(images[idx])
+        parts.append({
+            "type": "input_image",
+            "image_url": f"data:image/jpeg;base64,{b64}",
+        })
+    return parts
+
 # =============================
 # MCP Tools
 # =============================
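For orientation, a minimal sketch (not part of the commit) of how the parts built by _build_image_parts_from_indices can be attached to a Responses API call. The question text and page indices are hypothetical; it assumes an OpenAI client and the populated images list that app.py already defines:

    # Hypothetical usage sketch, separate from the diff above.
    parts = [{"type": "input_text", "text": "What does the document say about pricing?"}]
    parts += _build_image_parts_from_indices([0, 3])  # 0-based page indices
    resp = client.responses.create(
        model="gpt-5",
        input=[{"role": "user", "content": parts}],
    )
    print(resp.output_text)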
@@ -176,46 +188,6 @@ def search(query: str, k: int = 5) -> List[int]:
     return sorted(expanded)
 
 
-def get_pages(indices: List[int]) -> Dict[str, Any]:
-    """
-    Return page images (as data URLs) for the given 0-based indices.
-
-    MCP tool description:
-    - name: mcp_test_get_pages
-    - description: Given 0-based indices from mcp_test_search, return the corresponding page images as data URLs for vision reasoning.
-    - input_schema:
-        type: object
-        properties:
-          indices: {
-            type: array,
-            items: { type: integer, minimum: 0 },
-            description: "0-based page indices to fetch",
-          }
-        required: ["indices"]
-
-    Returns:
-        {"images": [{"index": int, "page": int, "image_url": str}], "count": int}
-    """
-    global images
-    indices = eval(indices)
-    print("indices to get", indices)
-
-    if not images:
-        return {"images": [], "count": 0}
-
-    uniq = sorted({i for i in indices if 0 <= i < len(images)})
-    payload = []
-    for idx in uniq:
-        im = images[idx]
-        b64 = encode_image_to_base64(im)
-        payload.append({
-            "index": idx,
-            "page": idx + 1,
-            "image_url": f"data:image/jpeg;base64,{b64}",
-        })
-    return {"images": payload, "count": len(payload)}
-
-
 # =============================
 # Gradio UI — Unified App
 # =============================
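Worth noting about the removed code: get_pages parsed its indices argument with eval, which executes arbitrary input. The replacement logic added in this commit (_maybe_parse_indices below) switches to JSON parsing instead; a minimal sketch of that safer approach:

    import json

    def parse_indices(raw: str) -> list:
        # json.loads accepts "[0, 1, 2]" without executing code, unlike eval
        try:
            val = json.loads(raw)
        except json.JSONDecodeError:
            return []
        return [x for x in val if isinstance(x, int)] if isinstance(val, list) else []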
@@ -228,7 +200,7 @@ You are a PDF research agent with two tools:
 
 Policy & procedure:
 1) Break the user task into 1–4 targeted sub-queries (in English).
-2) For each sub-query, call mcp_test_search to get indices; THEN immediately call mcp_get_pages with those indices to obtain the page images.
+2) For each sub-query, call mcp_test_search to get indices; once you receive the indices, print "Received" and stop generating. The page images will be injected into your stream.
 3) Continue reasoning using ONLY the provided images. If info is insufficient, iterate: refine sub-queries and call the tools again. You may make further tool calls later in the conversation as needed.
 
 Grounding & citations:
@@ -269,6 +241,7 @@ def stream_agent(question: str,
 
     client = OpenAI(api_key=api_key)
 
+    prev_response_id: Optional[str] = None
     tools = [{
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
@@ -277,43 +250,128 @@ def stream_agent(question: str,
         "require_approval": require_approval or "never",
     }]
 
-    req_kwargs = dict(
-        model=model,
-        input=[
-            {"role": "system", "content": SYSTEM},
-            {"role": "user", "content": question},
-        ],
-        reasoning={"effort": "medium", "summary": "auto"},
-        tools=tools,
-    )
-
-    try:
-        with client.responses.stream(**req_kwargs) as stream:
-            for event in stream:
-                etype = getattr(event, "type", "")
-
-                if etype == "response.output_text.delta":
-                    final_text += event.delta
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-
-                elif etype == "response.reasoning_summary_text.delta":
-                    summary_text += event.delta
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-
-                elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
-                    # Show tool call argument deltas in the log for transparency
-                    log_lines.append(str(event.delta))
-
-                elif etype == "response.error":
-                    log_lines.append(f"[error] {getattr(event, 'error', '')}")
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-
-            # finalize
-            _final = stream.get_final_response()
-            yield final_text, summary_text, "\n".join(log_lines[-400:])
-
-    except Exception as e:
-        yield f"❌ {e}", summary_text, "\n".join(log_lines[-400:])
+    # seed pages once (optional)
+    seed_indices = search(question, k=5) or []
+    pending_indices = list(seed_indices)
+
+    def run_round(round_idx: int, attached_indices: List[int]):
+        nonlocal prev_response_id
+        assembled_text = ""
+        assembled_summary = ""
+        # Will hold the most recent indices returned by mcp_test_search in THIS stream
+        last_search_indices: List[int] = []
+
+        # Build user parts (attach any seed pages we already have)
+        parts: List[Dict[str, Any]] = [{"type": "input_text", "text": question if round_idx == 1 else "Continue with new pages."}]
+        parts += _build_image_parts_from_indices(attached_indices)
+
+        # First call includes system; follow-ups use previous_response_id
+        if prev_response_id:
+            req_input = [{"role": "user", "content": parts}]
+        else:
+            req_input = [
+                {"role": "system", "content": SYSTEM},
+                {"role": "user", "content": parts},
+            ]
+
+        req_kwargs = dict(
+            model=model_name,
+            input=req_input,
+            reasoning={"effort": "medium", "summary": "auto"},
+            tools=tools,
+            store=True,
+        )
+        if prev_response_id:
+            req_kwargs["previous_response_id"] = prev_response_id
+
+        # Helper to try extracting a JSON int array from tool result text
+        def _maybe_parse_indices(chunk: str) -> List[int]:
+            import json, re
+            # Find the last bracketed JSON array in the chunk
+            arrs = re.findall(r'\[[^\]]*\]', chunk)
+            for s in reversed(arrs):
+                try:
+                    val = json.loads(s)
+                    if isinstance(val, list) and all(isinstance(x, int) for x in val):
+                        return sorted({x for x in val if isinstance(x, int)})
+                except Exception:
+                    pass
+            return []
+
+        tool_result_buffer = ""  # accumulate tool result deltas
+
+        try:
+            with client.responses.stream(**req_kwargs) as stream:
+                for event in stream:
+                    etype = getattr(event, "type", "")
+
+                    if etype == "response.output_text.delta":
+                        assembled_text += event.delta
+                        yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+
+                    elif etype == "response.reasoning_summary_text.delta":
+                        assembled_summary += event.delta
+                        yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+
+                    # Capture tool *arguments* in the log for transparency (optional)
+                    elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
+                        log_lines.append(str(event.delta))
+
+                    # ⬇️ NEW: capture tool *results* (indices JSON) from MCP
+                    elif etype.startswith("response.tool_result"):
+                        # Different SDKs expose .delta or .output_text; handle both
+                        delta = getattr(event, "delta", "") or getattr(event, "output_text", "")
+                        if delta:
+                            tool_result_buffer += str(delta)
+                            # opportunistic parse so UI can progress early
+                            parsed_now = _maybe_parse_indices(tool_result_buffer)
+                            if parsed_now:
+                                last_search_indices = parsed_now
+                                log_lines.append(f"[tool-result] indices={last_search_indices}")
+                                yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+
+                # Finalize, remember response id for follow-ups
+                _final = stream.get_final_response()
+                try:
+                    prev_response_id = getattr(_final, "id", None)
+                except Exception:
+                    prev_response_id = None
+
+            # If the model produced search results this round, hand them back to the controller
+            if last_search_indices:
+                return sorted(set(last_search_indices))
+
+            # Otherwise, just render whatever text we have
+            yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+            return None
+
+        except Exception as e:
+            log_lines.append(f"[round {round_idx}] stream error: {e}")
+            yield f"❌ {e}", assembled_summary or "", "\n".join(log_lines[-400:])
+            return None
+
+    # Controller: iterate rounds until model stops searching
+    max_rounds = 3
+    round_idx = 1
+    while round_idx <= max_rounds:
+        # Start a round with any pending images we already have.
+        # yield from forwards the streamed updates to the UI and also
+        # captures run_round's return value (indices from mcp_test_search).
+        next_indices = yield from run_round(round_idx, pending_indices)
+
+        # If the model called mcp_test_search, we got indices back; fetch those pages next.
+        # (We ignore pending_indices now and move to the model-chosen ones.)
+        if isinstance(next_indices, list) and next_indices:
+            pending_indices = next_indices
+            # Attach those pages in a **new** GPT-5 call using previous_response_id
+            round_idx += 1
+            continue
+
+        # No tool search results this round → we're done
+        break
+
+    return
 
 
 CUSTOM_CSS = """
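The controller above relies on a generator returning a value: in Python 3, return x inside a generator attaches x to the resulting StopIteration, and yield from both forwards the yielded items and evaluates to that return value. A standalone sketch of the pattern used by run_round and the round loop:

    def producer():
        yield "update 1"
        yield "update 2"
        return [4, 7]  # carried on StopIteration.value

    def consumer():
        result = yield from producer()  # re-yields the updates, captures [4, 7]
        yield f"got {result}"

    print(list(consumer()))  # ['update 1', 'update 2', 'got [4, 7]']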
@@ -434,11 +492,8 @@ def build_ui():
 
         with gr.Column(scale=2):
             output_text = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
-            output_payload = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
-
 
         search_button.click(search, inputs=[query_box, k_slider], outputs=[output_text])
-        get_pages_button.click(get_pages, inputs=[output_text], outputs=[output_payload])
 
     with gr.Tab("3) Agent (Streaming)"):
         with gr.Row(equal_height=True):
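For context on the wiring (a sketch, not part of the commit): Gradio treats a generator callback as a stream and re-renders its outputs on every yield, which is how stream_agent's incremental yields reach the Agent tab's components. Component names below are hypothetical:

    # Minimal streaming-callback sketch with hypothetical component names.
    def demo_stream(msg):
        acc = ""
        for ch in msg:
            acc += ch
            yield acc  # each yield pushes an update to the bound output

    demo_box = gr.Textbox(label="Stream demo")
    demo_btn = gr.Button("Demo")
    demo_btn.click(demo_stream, inputs=[query_box], outputs=[demo_box])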
 