Update app.py
app.py CHANGED
@@ -192,8 +192,8 @@ You are a PDF research agent with a single tool: mcp_test_search(query: string,
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions in the english language, not just keywords.
 2) For each sub-query, call mcp_test_search (k=5 by default; increase to up to 10 if you need to go deep).
-3) You will receive the output of mcp_test_search as a list of indices corresponding to page numbers. Stop generating once all the tool calls end. You will
-4) Stop early when confident; otherwise run new search calls using the tool
+3) You will receive the output of mcp_test_search as a list of indices corresponding to page numbers. Stop generating once all the tool calls end. You will be fed the corresponding pages as images in a follow-up message.
+4) Stop early when confident; otherwise run new search calls using the tool to find additional missing information. Use up to 5 rounds of iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 
 Workflow:
 • Use ONLY the provided images for grounding and cite as (p.<page>).
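For reference, the call pattern the updated prompt asks the model to produce looks roughly like this (the queries and k values below are illustrative only, not taken from app.py):

    # Illustrative tool-call arguments for mcp_test_search: one natural-language
    # sub-query per call, k=5 by default, raised toward 10 for deeper digging.
    example_calls = [
        {"name": "mcp_test_search", "arguments": {"query": "What dataset does the paper evaluate on?", "k": 5}},
        {"name": "mcp_test_search", "arguments": {"query": "How is the page index built?", "k": 8}},
    ]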
@@ -282,12 +282,12 @@ def stream_agent(question: str,
         if round_idx == 1:
             parts.append({"type": "input_text", "text": question})
         else:
-            parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages."})
+            parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you can ask further questions to the search tool."})
 
         parts += _build_image_parts_from_indices(attached_indices)
         if attached_indices:
             pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
-            parts.append({"type": "input_text", "text": f"(Attached pages: {pages_str}).
+            parts.append({"type": "input_text", "text": f"(Attached pages from round {round_idx}: {pages_str}). Ground your answer in these images, or query for new pages."})
 
         # First call includes system; follow-ups use previous_response_id
         if prev_response_id:
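_build_image_parts_from_indices is referenced here but is not part of this diff; a minimal sketch of what such a helper could look like, assuming the indexed PDF pages are held as PIL images in a module-level list (PAGE_IMAGES, the PNG encoding, and the data-URL form are all assumptions, not code from app.py):

    import base64
    import io

    def _build_image_parts_from_indices(indices):
        # Assumed helper: map 0-based page indices to Responses API image parts.
        # PAGE_IMAGES is a hypothetical list of PIL.Image pages built at index time.
        parts = []
        for i in sorted(set(indices)):
            buf = io.BytesIO()
            PAGE_IMAGES[i].save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode("ascii")
            parts.append({"type": "input_image", "image_url": f"data:image/png;base64,{b64}"})
        return parts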
@@ -340,14 +340,12 @@ def stream_agent(question: str,
             elif etype in ("response.mcp_call_arguments.delta", "response.tool_call_arguments.delta"):
                 delta = getattr(event, "delta", None)
                 if delta:
-                    log_lines.append(str(delta))
+                    log_lines.append("[call] " + str(delta))
                     round_state["summary_text"] += "\nQuery call: " + event.delta + "\n"
                     yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
 
             # Capture tool RESULT text and try to parse indices
             elif etype.startswith("response.output_item.done"):
-                print("A tool output was detected")
-                print(event)
                 delta_text = getattr(event.item, "output", "")
                 if delta_text:
                     tool_result_buffer += str(delta_text)
@@ -356,6 +354,8 @@ def stream_agent(question: str,
                         round_state["last_search_indices"] += parsed_now
                         log_lines.append(f"[tool-result] indices={parsed_now}")
                         yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
+            else:
+                print(etype)
 
         # Finalize this response; remember ID for follow-ups
         _final = stream.get_final_response()
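The step that turns tool_result_buffer into parsed_now sits outside this hunk; one plausible reading, assuming the MCP tool returns its page indices as a JSON list or as bare integers in text (the helper name below is hypothetical):

    import json
    import re

    def _parse_indices(tool_output: str) -> list[int]:
        # Assumed helper: recover page indices from raw tool-result text.
        # Prefer a JSON list like "[3, 7, 12]"; fall back to any integers found.
        try:
            data = json.loads(tool_output)
            if isinstance(data, list):
                return [int(x) for x in data]
        except (ValueError, TypeError):
            pass
        return [int(m) for m in re.findall(r"\d+", tool_output)]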
@@ -517,7 +517,7 @@ def build_ui():
         index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
 
         # ---- Tab 2: Ask (Direct — returns indices)
-        with gr.Tab("2)
+        with gr.Tab("2) Direct Search"):
             with gr.Row():
                 with gr.Column(scale=1):
                     query_box = gr.Textbox(placeholder="Enter your question…", label="Query", lines=4)
@@ -530,7 +530,7 @@ def build_ui():
         search_button.click(search, inputs=[query_box, k_slider], outputs=[output_text])
 
         # ---- Tab 3: Agent (Streaming)
-        with gr.Tab("3)
+        with gr.Tab("3) Deep Search"):
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
                     with gr.Group():
@@ -576,9 +576,9 @@ def build_ui():
                     )
 
                 with gr.Column(scale=3):
-                    with gr.Tab("Answer
+                    with gr.Tab("Answer"):
                         final_md = gr.Markdown(value="", elem_classes=["card", "markdown-wrap"])
-                    with gr.Tab("Live
+                    with gr.Tab("Live Reasoning"):
                         summary_md = gr.Markdown(value="", elem_classes=["card", "summary-wrap"])
                     with gr.Tab("Event Log"):
                         log_md = gr.Markdown(value="", elem_classes=["card", "log-box"])
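The wiring that feeds these three panes is not in the diff, but since stream_agent yields (final_text, summary_text, log_text) triples, it is presumably bound as a Gradio generator along these lines (the button and textbox names are assumptions):

    # Assumed wiring: Gradio streams each yielded triple into the three panes.
    run_btn.click(
        stream_agent,
        inputs=[agent_query_box],   # plus whatever other controls stream_agent takes
        outputs=[final_md, summary_md, log_md],
    )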