Update app.py
app.py CHANGED
@@ -127,6 +127,47 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path


+def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
+    """Calls OpenAI's GPT model with the query and image data."""
+    if api_key and api_key.startswith("sk"):
+        try:
+            from openai import OpenAI
+
+            base64_images = [encode_image_to_base64(im_caption[0]) for im_caption in retrieved_images]
+            client = OpenAI(api_key=api_key.strip())
+            PROMPT = """
+You are a smart assistant designed to answer questions about a PDF document.
+You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
+If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
+Give detailed and extensive answers, only containing info in the pages you are given.
+You can answer using information contained in plots and figures if necessary.
+Answer in the same language as the query.
+Query: {query}
+PDF pages:
+""".strip()
+
+            response = client.responses.create(
+                model="gpt-5",
+                input=[
+                    {
+                        "role": "user",
+                        "content": (
+                            [{"type": "input_text", "text": PROMPT.format(query=query)}] +
+                            [{"type": "input_image",
+                              "image_url": f"data:image/jpeg;base64,{im}"}
+                             for im in base64_images]
+                        )
+                    }
+                ],
+                # max_tokens=500,
+            )
+            return response.output_text
+        except Exception as e:
+            print(e)
+            return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
+    return "Set OPENAI_API_KEY in your environment to get a custom response."
+
+
 # =============================
 # Local Search (ColPali)
 # =============================
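Note: query_gpt depends on an encode_image_to_base64 helper defined earlier in app.py, outside this diff. A minimal sketch of what such a helper typically looks like, assuming PIL page images and the data:image/jpeg;base64 URLs used above (illustrative, not the Space's actual code):

    import base64
    import io

    from PIL import Image

    def encode_image_to_base64(image: Image.Image) -> str:
        # Serialize the page image to an in-memory JPEG, then base64-encode it
        # so it can be inlined as a data:image/jpeg;base64,... URL.
        buffer = io.BytesIO()
        image.convert("RGB").save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")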
@@ -169,6 +210,45 @@ def search(query: str, k: int = 5) -> List[int]:
     return top_k_indices


+def search_synthetize(query: str, k: int = 5) -> str:
+    """
+    Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
+    MCP tool description:
+    - name: mcp_test_search_synthetize
+    - description: Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
+    - input_schema:
+        type: object
+        properties:
+            query: {type: string, description: "User query in natural language."}
+            k: {type: integer, minimum: 1, maximum: 20, default: 5, description: "Number of top pages to retrieve."}
+        required: ["query"]
+    Args:
+        query (str): Natural-language question to search for.
+        k (int): Number of top results to return (1–20).
+    Returns:
+        ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
+    """
+    top_k_indices = search(query, k)
+
+    expanded = set(top_k_indices)
+    for i in top_k_indices:
+        expanded.add(i - 1)
+        expanded.add(i + 1)
+    expanded = {i for i in expanded if 0 <= i < len(images)}
+    expanded = sorted(expanded)
+
+    # Build gallery results with 1-based page numbering
+    results = []
+    for idx in expanded:
+        page_num = idx + 1
+        results.append((images[idx], f"Page {page_num}"))
+
+    # Generate grounded response
+    ai_response = query_gpt(query, results)
+    print("[search_synthetize]", ai_response)
+    return ai_response
+
+
 def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
     """Turn page indices into OpenAI vision content parts."""
     parts: List[Dict[str, Any]] = []
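Note: the docstring doubles as the MCP tool description that the Gradio MCP server publishes, so the schema block above is what remote agents see. A quick local sanity check, assuming a PDF has already been indexed so the module-level images list and ColPali index are populated (the query text is illustrative):

    # Top-3 hits plus their ±1 neighbors are attached, then query_gpt
    # synthesizes a grounded answer with page-number citations.
    answer = search_synthetize("What datasets are used for evaluation?", k=3)
    print(answer)

Because every hit is expanded with its immediate neighbors, k=3 can attach up to 9 pages.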
@@ -186,7 +266,9 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 # Agent System Prompt
 # =============================

-SYSTEM = (
+
+
+SYSTEM1 = (
 """
 You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
 Act iteratively:
@@ -205,12 +287,31 @@ Deliverable:
 ).strip()


+SYSTEM2 = """
+You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
+Act iteratively:
+1) Split the user question into 1–4 focused sub-queries. Sub-queries should be phrased as natural-language questions in English, not just keywords.
+2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
+3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
+
+Grounding & citations:
+• Use ONLY information from retrieved pages.
+• After any claim, cite the page as (p.<page>).
+• If an answer is not present, say “Not found in the provided pages.”
+
+Final deliverable (must be clear and standalone):
+• Write a detailed answer in Markdown that directly addresses the user request, in the language of the request.
+• If dates or items are requested, include a concise table with the requested fields.
+• Do not refer to “the above” or “previous messages”.
+"""
+
+
 # =============================
 # MCP config (search-only)
 # =============================
+VISUAL_REASONING = True
 DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
 DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
-DEFAULT_ALLOWED_TOOLS = "mcp_test_search"  # search-only; no get_pages


 # =============================
@@ -222,8 +323,7 @@ def stream_agent(question: str,
                  model_name: str,
                  server_url: str,
                  server_label: str,
-                 require_approval: str,
-                 allowed_tools: str):
+                 visual_reasoning: str):
     """
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
@@ -231,6 +331,10 @@ def stream_agent(question: str,
       • If the model calls mcp_test_search and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
+    visual_reasoning = visual_reasoning == "Visual Reasoning"
+    allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
+    SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2
+
     if not api_key:
         yield "⚠️ **Please provide your OpenAI API key.**", "", ""
         return
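Note: these three added lines are the core of the commit: the dropdown string now selects both the allowed MCP tool and the system prompt. (The module-level VISUAL_REASONING flag added above appears unused in the hunks shown.) A table-driven equivalent of the same branching, offered as an illustrative refactor rather than part of the commit:

    # Hypothetical rewrite: one mapping instead of three conditionals.
    MODES = {
        "Visual Reasoning": ("mcp_test_search", SYSTEM1),
        "Vision Summary": ("mcp_test_search_synthetize", SYSTEM2),
    }
    allowed_tools, SYSTEM = MODES.get(visual_reasoning, MODES["Vision Summary"])
    visual_reasoning = visual_reasoning == "Visual Reasoning"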
@@ -243,7 +347,7 @@ def stream_agent(question: str,

     # Optional seeding: attach some likely pages on round 1
     try:
-        seed_indices = search(question, k=5)
+        seed_indices = search(question, k=5) if visual_reasoning else []
     except Exception as e:
         yield f"❌ Search failed: {e}", "", ""
         return
@@ -256,8 +360,8 @@ def stream_agent(question: str,
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
         "server_url": server_url or DEFAULT_MCP_SERVER_URL,
-        "allowed_tools": [
-        "require_approval": require_approval,
+        "allowed_tools": [allowed_tools],
+        "require_approval": "never",
     }]

     # Shared mutable state for each round
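Note: with the defaults above, each Responses API call now carries a hosted-MCP tool block shaped like this (values inlined for clarity; the allowed_tools entry depends on the selected mode):

    tools = [{
        "type": "mcp",
        "server_label": "colpali_rag",
        "server_url": "https://manu-mcp-test.hf.space/gradio_api/mcp/",
        "allowed_tools": ["mcp_test_search"],  # or ["mcp_test_search_synthetize"]
        "require_approval": "never",           # now hard-coded; the UI dropdown is removed below
    }]

Hard-coding require_approval to "never" lets tool calls execute without surfacing approval requests mid-stream.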
@@ -282,7 +386,7 @@ def stream_agent(question: str,
             if round_idx == 1:
                 parts.append({"type": "input_text", "text": question})
             else:
-                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you
+                parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})

             parts += _build_image_parts_from_indices(attached_indices)
             if attached_indices:
@@ -392,7 +496,7 @@ def stream_agent(question: str,
                     expanded.add(i - 1)
                     expanded.add(i + 1)
                 expanded = {i for i in expanded if 0 <= i < len(images)}
-                pending_indices = sorted(expanded)
+                pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
                 round_idx += 1
                 continue
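Note: the new guard bounds how many pages are re-attached in the next round: if the ±1 neighbor expansion reaches 15 or more pages, it falls back to the unexpanded indices (base, the pages the model actually requested). Illustrative values, assuming requested pages whose neighborhoods do not overlap:

    base = [2, 9, 16, 23, 30, 37]
    expanded = sorted({j for i in base for j in (i - 1, i, i + 1)})
    # len(expanded) == 18, so the guard falls back to sorted(base)
    pending = expanded if len(expanded) < 15 else sorted(base)

This keeps the vision payload (one image per attached page) from growing unboundedly across rounds.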
@@ -565,14 +669,10 @@ def build_ui():
                     value=DEFAULT_MCP_SERVER_LABEL,
                 )
                 with gr.Row():
-                    allowed_tools_box = gr.Textbox(
-                        label="Allowed Tools",
-                        value=DEFAULT_ALLOWED_TOOLS,
-                    )
-                    require_approval_box = gr.Dropdown(
-                        label="Require Approval",
-                        choices=["never", "auto", "always"],
-                        value="never",
+                    visual_reasoning_box = gr.Dropdown(
+                        label="Visual Reasoning",
+                        choices=["Visual Reasoning", "Vision Summary"],
+                        value="Visual Reasoning",
                     )

             with gr.Column(scale=3):
@@ -593,6 +693,7 @@ def build_ui():
                 server_label_box,
                 require_approval_box,
                 allowed_tools_box,
+                visual_reasoning_box
             ],
             outputs=[final_md, summary_md, log_md],
         )
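Note: as rendered, the click() inputs still pass require_approval_box and allowed_tools_box even though the hunk above removes those widgets and the new stream_agent signature no longer accepts them; unless both boxes are still defined elsewhere, this wiring would raise a NameError when the UI builds. A consistent inputs list would look like this (the first three box names are assumptions, since they do not appear in the diff):

    inputs=[
        question_box,      # assumed name
        api_key_box,       # assumed name
        model_box,         # assumed name
        server_url_box,
        server_label_box,
        visual_reasoning_box,
    ],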