import gradio as gr
import re
import os # Used for environment variables if you switch to a real LLM
import ast # For safely parsing the dict-literal string produced by the mock LLM
import datetime # For timestamping entries in the knowledge base
import arxiv # Python library for interacting with the arXiv API
import requests # For making HTTP requests to download PDF files
import fitz # PyMuPDF library for extracting text from PDF documents

# --- Agent Core Logic ---

# CURRENT_PAPER_CONTEXT: A global dictionary to help the mock_llm maintain state
# about the paper currently being processed within a single agent run.
# In a real agent with a proper LLM, state management would be more sophisticated,
# possibly integrated into the agent's memory or passed explicitly.
# This is reset for each new user query processed by the agent.
CURRENT_PAPER_CONTEXT = {}

# KNOWLEDGE_BASE: An in-memory list to store dictionaries of scraped paper information.
# This acts as a simple knowledge base for the duration of the Gradio session.
# For persistence, you would use a database or file storage.
KNOWLEDGE_BASE = []
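
# The comment above notes that persistence would require a database or file storage.
# Below is a minimal, optional sketch of the file-based option; it is not wired into the
# app, and the "knowledge_base.json" path is only an illustrative default.
def save_knowledge_base(path: str = "knowledge_base.json") -> None:
    """Write the in-memory KNOWLEDGE_BASE to a JSON file (optional sketch, not called by the app)."""
    import json
    with open(path, "w", encoding="utf-8") as f:
        json.dump(KNOWLEDGE_BASE, f, indent=2)

def load_knowledge_base(path: str = "knowledge_base.json") -> None:
    """Reload previously saved entries into KNOWLEDGE_BASE, if the file exists (optional sketch)."""
    import json
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            KNOWLEDGE_BASE.extend(json.load(f))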


def mock_llm(prompt: str, tools_description: str) -> str:
    """
    A mock Large Language Model (LLM) for the arXiv scraping agent.
    This function simulates LLM behavior using simplistic keyword-based logic
    and the global CURRENT_PAPER_CONTEXT to make decisions.
    A real LLM would use its trained knowledge and reasoning capabilities.
    """
    global CURRENT_PAPER_CONTEXT
    # Print the prompt for debugging (visible in Hugging Face Space logs or local console)
    print(f"\n----- Mock LLM Input -----\nPrompt:\n{prompt}\nTools Available:\n{tools_description}\n--------------------------\n")
    lower_prompt = prompt.lower() # Normalize for case-insensitive matching

    # Identify the most recent observation in the accumulated prompt history. Later
    # iterations contain every earlier observation as well, so a plain substring check
    # would keep matching the first tool used; branching on the *last* observation keeps
    # the search -> scrape -> store pipeline moving forward.
    observed_tools = re.findall(r"observation from last action \((\w+)\):", lower_prompt)
    last_observed_tool = observed_tools[-1] if observed_tools else None

    # Scenario 1: Observation received from ArxivSearchTool
    if last_observed_tool == "arxivsearchtool":
        # Attempt to parse paper details from the search tool's observation.
        # Searching the original prompt (IGNORECASE handles casing) keeps the title's
        # capitalisation; the optional v\d+ tolerates versioned IDs such as 1703.03400v5.
        match = re.search(r"top result:\s*'(.*?)'\s*\(id:\s*([\d\.]+(?:v\d+)?),\s*url:\s*(https?://[^\s]+)\)", prompt, re.IGNORECASE)
        if match:
            title, paper_id, url = match.groups()
            # Update context: we've found a paper to process
            CURRENT_PAPER_CONTEXT = {'id': paper_id, 'title': title, 'url': url, 'status': 'found_paper'}
            # LLM decides the next action is to scrape this paper
            return f"""Thought: I have found a paper titled '{title}' with ID {paper_id}. I should now scrape its content to extract information using the PaperScraperTool.
Action: PaperScraperTool
Action Input: {url}""" # Use the arXiv page URL as input for the scraper
        else:
            # If parsing fails, update context and conclude
            CURRENT_PAPER_CONTEXT = {'status': 'search_failed_to_parse'}
            return f"""Thought: I received search results from ArxivSearchTool, but I couldn't parse the top paper details from the observation. I cannot proceed with scraping.
Final Answer: I found some papers but had trouble extracting specific details for scraping. Please check the raw search results if they were logged, or try a different query."""

    # Scenario 2: Observation received from PaperScraperTool
    elif "observation from last action (paperscrapertool):" in lower_prompt:
        if CURRENT_PAPER_CONTEXT.get('status') == 'found_paper': # Check if we were expecting scraped content
            # Simulate extracting abstract and snippet from the observation
            # A real LLM would parse this more intelligently from the tool's output string.
            abstract_match = re.search(r"abstract:\s*(.*?)(full text snippet:|$)", lower_prompt, re.IGNORECASE | re.DOTALL)
            text_snippet_match = re.search(r"full text snippet:\s*(.*)", lower_prompt, re.IGNORECASE | re.DOTALL)
            abstract = abstract_match.group(1).strip() if abstract_match else "Could not extract abstract from observation."
            text_snippet = text_snippet_match.group(1).strip() if text_snippet_match else "Could not extract text snippet from observation."

            # Prepare data for the knowledge base
            paper_data_for_kb = {
                "id": CURRENT_PAPER_CONTEXT.get('id', 'unknown_id'),
                "title": CURRENT_PAPER_CONTEXT.get('title', 'Unknown Title'),
                "url": CURRENT_PAPER_CONTEXT.get('url', 'unknown_url'),
                "abstract": abstract,
                "text_snippet": text_snippet, # In a real case, this might be more structured or the full text
                "scraped_at": datetime.datetime.now().isoformat()
            }
            CURRENT_PAPER_CONTEXT['status'] = 'scraped_paper' # Update context
            # LLM decides the next action is to store this data
            return f"""Thought: I have the scraped content for '{CURRENT_PAPER_CONTEXT.get('title')}'. I should now store this information in the knowledge base using the KnowledgeBaseStorageTool.
Action: KnowledgeBaseStorageTool
Action Input: {str(paper_data_for_kb)}""" # Pass data as a string (mock LLM limitation)
        else:
            return f"""Thought: I received scraped content, but I don't have the correct prior context (e.g., which paper was being scraped). This is unexpected.
Final Answer: Error processing scraped content due to missing or incorrect context. The scraping might have occurred without a preceding successful search and paper identification."""

    # Scenario 3: Observation received from KnowledgeBaseStorageTool
    elif "observation from last action (knowledgebasestoragetool):" in lower_prompt:
        if CURRENT_PAPER_CONTEXT.get('status') == 'scraped_paper': # Check if we were expecting storage confirmation
            paper_title = CURRENT_PAPER_CONTEXT.get('title', 'the paper')
            CURRENT_PAPER_CONTEXT = {} # Reset context as this task is complete
            # LLM concludes the process
            return f"""Thought: The paper '{paper_title}' has been successfully processed (found, scraped, and stored) in the knowledge base. The task is complete.
Final Answer: Successfully found, scraped, and stored information for '{paper_title}'."""
        else:
            CURRENT_PAPER_CONTEXT = {} # Reset context
            return f"""Thought: I received a storage confirmation, but the context was unclear or didn't match the expected 'scraped_paper' status.
Final Answer: A storage action was observed, but there might have been issues in the preceding steps. The overall process integrity is uncertain."""

    # Scenario 4: Initial query processing (likely a search request)
    if "find papers on" in lower_prompt or "search arxiv for" in lower_prompt:
        query_match = re.search(r"(?:find papers on|search arxiv for)\s*(.+)", lower_prompt)
        search_query = query_match.group(1).strip() if query_match else "default search: quantum computing"
        CURRENT_PAPER_CONTEXT = {'query': search_query, 'status': 'searching'} # Set initial context
        # LLM decides to use the search tool
        return f"""Thought: The user wants to find papers about '{search_query}'. I should use the ArxivSearchTool to find relevant papers.
Action: ArxivSearchTool
Action Input: {search_query}"""
    
    # Fallback Scenario: Query not understood by the mock LLM's simple logic
    else:
        CURRENT_PAPER_CONTEXT = {} # Reset context
        original_query = prompt.split("User query:", 1)[-1].split("\n", 1)[0].strip() if "User query:" in prompt else "the user's query"
        return f"""Thought: I'm not sure how to handle this query: '{original_query}'. My current mocked abilities are limited to searching arXiv based on keywords like 'find papers on' or 'search arxiv for', then scraping and storing the first result.
Final Answer: I can only search arXiv for papers and process them if the query starts with 'find papers on' or 'search arxiv for'. Please rephrase your query (e.g., 'find papers on artificial intelligence')."""
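
# The footer of this app notes that `mock_llm` should be replaced with a real LLM integration
# for production use. Below is a rough, optional sketch of such a drop-in replacement. It
# assumes the `openai` package (not listed in requirements.txt) and an OPENAI_API_KEY
# environment variable; the model name is only an example. To try it, pass
# llm_function=real_llm when constructing ReActAgent instead of mock_llm.
def real_llm(prompt: str, tools_description: str) -> str:
    """Sketch of a real LLM backend with the same signature as mock_llm (not used by default)."""
    from openai import OpenAI  # imported lazily so the demo still runs without the package
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # example model name; substitute whichever model you have access to
        temperature=0,
        messages=[
            {"role": "system",
             "content": ("You are a ReAct agent for processing arXiv papers. Respond with "
                         "'Thought:', then 'Action:' and 'Action Input:', or with 'Final Answer:'. "
                         "Available tools:\n" + tools_description)},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content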

class Tool:
    """A simple class to represent a tool that the agent can use."""
    def __init__(self, name: str, description: str, func):
        self.name = name
        self.description = description # Crucial for the LLM to understand the tool's purpose
        self.func = func # The actual Python function to execute

    def run(self, action_input: str) -> str:
        """Executes the tool's function with the given input."""
        print(f"TOOL EXECUTING: {self.name} with input: '{action_input}'")
        try:
            result = self.func(action_input)
            print(f"TOOL RESULT ({self.name}): {result}")
            return result
        except Exception as e:
            error_message = f"Error executing tool {self.name}: {str(e)}"
            print(error_message)
            return error_message # Return error message as observation

def arxiv_search_func(query: str, max_results=1) -> str:
    """Tool function: Searches arXiv for papers matching the query."""
    try:
        search = arxiv.Search(
            query=query,
            max_results=max_results, # Limiting to 1 for faster demo and simpler mock LLM logic
            sort_by=arxiv.SortCriterion.Relevance
        )
        results_data = []
        client = arxiv.Client()  # Search.results() is deprecated; Client.results() is the supported API
        for r in client.results(search):
            results_data.append({
                "id": r.entry_id.split('/')[-1], # Get the versionless ID (e.g., "1703.03400")
                "title": r.title,
                "authors": [author.name for author in r.authors],
                "summary": r.summary,
                "published": r.published.isoformat(),
                "pdf_url": r.pdf_url, # Direct PDF link
                "arxiv_url": r.entry_id # Link to the abstract page (e.g., "http://arxiv.org/abs/1703.03400v5")
            })
        
        if not results_data:
            return f"No papers found on arXiv for query: '{query}'."
        
        # For the mock LLM, provide a clear summary of the top result
        top_result = results_data[0]
        return (f"Found {len(results_data)} papers. "
                f"Top result: '{top_result['title']}' (ID: {top_result['id']}, URL: {top_result['arxiv_url']})")

    except Exception as e:
        return f"Error searching arXiv: {str(e)}"

def paper_scraper_func(pdf_url_or_id: str) -> str:
    """Tool function: Downloads an arXiv PDF and extracts its text content."""
    try:
        # Determine the direct PDF URL from various input formats
        if "arxiv.org/abs/" in pdf_url_or_id: # e.g., http://arxiv.org/abs/1703.03400
            paper_id_match = re.search(r'abs/([\d\.]+)', pdf_url_or_id)
            if not paper_id_match: raise ValueError("Could not extract paper ID from abs URL.")
            paper_id = paper_id_match.group(1)
            pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        elif "arxiv.org/pdf/" in pdf_url_or_id: # e.g., http://arxiv.org/pdf/1703.03400.pdf
             pdf_url = pdf_url_or_id
        elif re.match(r'^[\d\.]+(v\d+)?$', pdf_url_or_id): # e.g., 1703.03400 or 1703.03400v5
            pdf_url = f"https://arxiv.org/pdf/{pdf_url_or_id}.pdf"
        else:
            raise ValueError(f"Invalid input format for PaperScraperTool: '{pdf_url_or_id}'. Expected arXiv URL or ID.")
        
        print(f"Attempting to download PDF from: {pdf_url}")
        response = requests.get(pdf_url, timeout=30) # Added timeout for network robustness
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        global CURRENT_PAPER_CONTEXT # Use context set by LLM/previous steps
        paper_title = CURRENT_PAPER_CONTEXT.get('title', f"paper from {pdf_url}")
        paper_id_context = CURRENT_PAPER_CONTEXT.get('id', 'unknown_id_from_context')

        full_text = ""
        abstract = "Could not reliably extract abstract from PDF text."
        
        # Use fitz (PyMuPDF) to open PDF from downloaded bytes
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            for page_num, page in enumerate(doc):
                full_text += page.get_text("text") # "text" preserves some layout
                if page_num == 0: # Attempt to extract abstract from the first page
                    first_page_text = page.get_text("text")
                    # Heuristic for abstract extraction (can be improved)
                    abstract_match = re.search(r"Abstract\s*([\s\S]*?)(?:1\.|Introduction|Keywords|I\.|\n\s*\n\s*\n)", first_page_text, re.IGNORECASE | re.DOTALL)
                    if abstract_match:
                        abstract = abstract_match.group(1).strip().replace('\n', ' ')
        
        # Provide a snippet for the observation (full text can be very long)
        text_snippet = (full_text[:500] + "...") if len(full_text) > 500 else full_text
        
        return (f"Scraped content for '{paper_title}' (ID: {paper_id_context}). "
                f"Abstract: {abstract} Full text snippet: {text_snippet}")

    except requests.exceptions.RequestException as e:
        return f"Error downloading PDF from '{pdf_url_or_id}': {str(e)}"
    except Exception as e:
        return f"Error scraping paper '{pdf_url_or_id}': {str(e)}"

def knowledge_base_storage_func(paper_data_str: str) -> str:
    """Tool function: Stores the extracted paper information into the KNOWLEDGE_BASE."""
    global KNOWLEDGE_BASE, CURRENT_PAPER_CONTEXT
    try:
        # The mock LLM provides paper_data_str as a string representation of a dictionary.
        # ast.literal_eval only accepts Python literals, so it parses that string into a dict
        # without the arbitrary-code-execution risk of a bare eval().
        # A real LLM should be prompted to return JSON, which would then be parsed with json.loads().
        if isinstance(paper_data_str, str):
            try:
                paper_data = ast.literal_eval(paper_data_str) # Safely convert the literal string to a dict
                if not isinstance(paper_data, dict):
                    raise ValueError("Parsed data from string is not a dictionary.")
            except Exception as e:
                return f"Error parsing paper data string for storage: {str(e)}. Input data string was: '{paper_data_str}'"
        elif isinstance(paper_data_str, dict): # If a dict is somehow passed directly
             paper_data = paper_data_str
        else:
            return f"Invalid data type received for storage: {type(paper_data_str)}. Expected string (evaluable to dict) or dict."

        # Validate essential keys
        required_keys = ["id", "title", "url", "abstract"]
        if not all(key in paper_data for key in required_keys):
            missing_keys = [key for key in required_keys if key not in paper_data]
            return f"Error: Missing required keys for storage: {missing_keys}. Received data: {paper_data}"

        # Avoid adding duplicate papers by ID
        if any(p["id"] == paper_data["id"] for p in KNOWLEDGE_BASE):
            return f"Paper with ID '{paper_data['id']}' is already in the knowledge base. Not adding again."
        
        KNOWLEDGE_BASE.append(paper_data)
        return (f"Successfully stored paper '{paper_data.get('id', 'N/A')}' (Title: '{paper_data.get('title', 'N/A')}') in the knowledge base. "
                f"Knowledge base now contains {len(KNOWLEDGE_BASE)} papers.")
    except Exception as e:
        # If storage fails critically, reset context to prevent loops with bad data
        CURRENT_PAPER_CONTEXT = {}
        return f"Critical error storing paper in knowledge base: {str(e)}. Input was: '{paper_data_str}'"

# Define the list of tools available to the agent
tools_list = [
    Tool(
        name="ArxivSearchTool",
        description="Searches the arXiv repository for research papers based on a query. Input should be the search query (e.g., 'machine learning for climate change'). Returns a summary of search results, highlighting the top paper found.",
        func=arxiv_search_func
    ),
    Tool(
        name="PaperScraperTool",
        description="Downloads an arXiv paper PDF given its arXiv abstract page URL (e.g., 'http://arxiv.org/abs/1234.5678') or just its ID (e.g., '1234.5678') and extracts its text content, including the abstract and a snippet of the full text.",
        func=paper_scraper_func
    ),
    Tool(
        name="KnowledgeBaseStorageTool",
        description="Stores extracted information about a paper (such as its ID, title, URL, abstract, and text snippet) into the system's knowledge base. Input should be a string representation of a Python dictionary containing these paper details.",
        func=knowledge_base_storage_func
    )
]

def get_tools_description_for_prompt(tool_list_arg):
    """Formats tool descriptions for the LLM prompt to help it choose tools."""
    return "\n".join([f"- {tool.name}: {tool.description}" for tool in tool_list_arg])

def parse_llm_react_output(llm_response: str):
    """
    Parses the LLM's ReAct-formatted response to extract Thought, Action, Action Input, or Final Answer.
    """
    # Stop the Thought capture at the next "Action:"/"Final Answer:" line; a greedy match
    # would swallow the rest of the response and duplicate it in the prompt history.
    thought_match = re.search(r"Thought:\s*(.*?)(?=\n\s*(?:Action:|Final Answer:)|\Z)", llm_response, re.IGNORECASE | re.DOTALL)
    action_match = re.search(r"Action:\s*([\w_]+)", llm_response, re.IGNORECASE | re.DOTALL) # Tool names are usually alphanumeric with underscores
    action_input_match = re.search(r"Action Input:\s*(.*)", llm_response, re.IGNORECASE | re.DOTALL)
    final_answer_match = re.search(r"Final Answer:\s*(.*)", llm_response, re.IGNORECASE | re.DOTALL)

    thought = thought_match.group(1).strip() if thought_match else None
    action = action_match.group(1).strip() if action_match else None
    
    action_input_str = "" # Default to empty string if no input
    if action_input_match:
        action_input_str = action_input_match.group(1).strip()
    elif action: # If there's an action but "Action Input:" line is missing, assume empty input
        action_input_str = ""
        
    final_answer = final_answer_match.group(1).strip() if final_answer_match else None
    return thought, action, action_input_str, final_answer
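
# Illustrative example (comment only): given the string
#   "Thought: search first\nAction: ArxivSearchTool\nAction Input: diffusion models"
# parse_llm_react_output should return ("search first", "ArxivSearchTool", "diffusion models", None).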

class ReActAgent:
    """A simple ReAct agent that uses an LLM to reason and act."""
    def __init__(self, llm_function, tool_list_arg, max_iterations=7): # Max iterations for the ReAct loop
        self.llm_function = llm_function
        self.tools = {tool.name: tool for tool in tool_list_arg} # Store tools in a dict for easy lookup
        self.tools_description = get_tools_description_for_prompt(tool_list_arg)
        self.max_iterations = max_iterations
        self.agent_log = [] # Stores the step-by-step log for display in Gradio

    def run(self, user_query: str):
        """Runs the ReAct loop for a given user query."""
        global CURRENT_PAPER_CONTEXT
        CURRENT_PAPER_CONTEXT = {} # Ensure context is fresh for each new query
        self.agent_log = [f"User Query: {user_query}\n"] # Start log with the user query

        # Construct the initial part of the prompt for the LLM
        prompt_history = f"User query: {user_query}\n"
        prompt_history += "You are an AI assistant that processes arXiv papers. You must use the ReAct format: Thought, Action, Action Input, Observation, and finally Final Answer.\n"
        prompt_history += "Based on the user query, decide on a thought, then an action to take using one of the available tools.\n"
        prompt_history += "After an action, you will receive an observation. Reason about the observation to decide the next step.\n"
        prompt_history += "If you have enough information from an observation to answer the user query, or if a multi-step task (like search, scrape, store) is complete, respond with 'Final Answer:'.\n"
        prompt_history += "When using PaperScraperTool, the input is the arXiv URL or ID. When using KnowledgeBaseStorageTool, the input is a string representation of a dictionary with paper details.\n"


        for i in range(self.max_iterations):
            log_entry = f"\n--- Iteration {i + 1} ---\n"
            
            # Call the LLM with the current prompt history and tool descriptions
            llm_response_str = self.llm_function(prompt_history, self.tools_description)
            log_entry += f"LLM Raw Response (Mocked):\n{llm_response_str}\n" # Clearly label as mocked
            
            thought, action_name, action_input, final_answer = parse_llm_react_output(llm_response_str)

            if thought:
                log_entry += f"Thought: {thought}\n"
                prompt_history += f"Thought: {thought}\n" # Add thought to history for next LLM call
            else:
                log_entry += "Warning: No thought found in LLM response for this iteration.\n"

            if final_answer:
                log_entry += f"\nFinal Answer from Agent: {final_answer}\n"
                self.agent_log.append(log_entry)
                CURRENT_PAPER_CONTEXT = {} # Clear context as task is finished
                return final_answer, "\n".join(self.agent_log)

            if action_name:
                log_entry += f"Action: {action_name}\nAction Input: '{action_input}'\n"
                prompt_history += f"Action: {action_name}\nAction Input: {action_input}\n"
                
                if action_name in self.tools:
                    tool_to_use = self.tools[action_name]
                    observation = tool_to_use.run(action_input) # Execute the tool
                    log_entry += f"Observation: {observation}\n"
                    prompt_history += f"Observation: {observation}\n" # Add observation to history
                else:
                    observation = f"Error: Tool '{action_name}' not found. Please choose from the available tools."
                    log_entry += f"{observation}\n"
                    prompt_history += f"Observation: {observation}\n" # Feed error back to LLM
            else:
                # If LLM provides no action and no final answer, it might be stuck
                log_entry += "LLM did not specify an action or a final answer. The agent might be stuck or the task is implicitly complete based on LLM's internal state (which is hard for a mock to determine).\n"
                self.agent_log.append(log_entry)
                CURRENT_PAPER_CONTEXT = {} # Clear context
                # Attempt to give a more informative "stuck" message
                last_thought_or_obs = thought if thought else "No clear thought before stopping."
                return f"Agent concluded: No further action or final answer provided by LLM. Last thought: {last_thought_or_obs}", "\n".join(self.agent_log)
            
            self.agent_log.append(log_entry) # Append current iteration's log

        # If max_iterations is reached without a final answer
        self.agent_log.append("\nMax iterations reached. Stopping.\n")
        CURRENT_PAPER_CONTEXT = {} # Clear context
        return "Agent stopped: Maximum iterations reached without a final answer.", "\n".join(self.agent_log)

# --- Gradio App Definition ---

# Instantiate the agent globally. This ensures that the agent (and its KNOWLEDGE_BASE)
# persists across multiple interactions within the same Gradio session.
# For a deployed app with multiple users, this global KNOWLEDGE_BASE would be shared,
# which might not be desired. Consider session state or a proper database for such scenarios.
arxiv_agent_instance = ReActAgent(llm_function=mock_llm, tool_list_arg=tools_list)

def process_gradio_query(user_query_text: str):
    """
    This function is called by the Gradio interface when the user submits a query.
    It runs the ReAct agent and formats the outputs for display in the UI.
    """
    if not user_query_text or not user_query_text.strip():
        # Handle empty input gracefully
        empty_kb_message = "Knowledge Base is currently empty." if not KNOWLEDGE_BASE else KNOWLEDGE_BASE
        return "Please enter a query.", empty_kb_message, "No agent activity to log for an empty query."

    # Run the agent with the user's query
    final_answer, agent_log_str = arxiv_agent_instance.run(user_query_text)
    
    # Prepare the Knowledge Base for display in Gradio (as JSON)
    # If KNOWLEDGE_BASE is empty, gr.JSON will handle it gracefully.
    # If it has content, it will be a list of dictionaries.
    kb_display_data = KNOWLEDGE_BASE if KNOWLEDGE_BASE else "Knowledge Base is currently empty."

    return final_answer, kb_display_data, agent_log_str

# Define the Gradio interface using gr.Blocks for more layout control
with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
    gr.Markdown(
        """
        # 📄🤖 arXiv Research Paper Agent (Demo)
        This agent uses a **mocked LLM** to simulate searching arXiv, scraping paper content, 
        and adding it to an in-memory knowledge base.
        Enter a query like: `Find papers on 'topic X' and add the first one to the knowledge base.`
        The agent will show its thought process (as if from an LLM) and tool interactions.
        """
    )
    
    with gr.Row():
        query_input_textbox = gr.Textbox(
            label="Your Query for the arXiv Agent",
            placeholder="e.g., Find papers on 'transformer models' and add the first one to the knowledge base.",
            lines=2
        )
    
    submit_query_button = gr.Button("Run Agent", variant="primary")
    
    with gr.Accordion("Agent's Final Answer & Step-by-Step Log", open=True):
        agent_final_answer_output_textbox = gr.Textbox(
            label="Agent's Final Answer", 
            lines=3, 
            interactive=False,
            placeholder="Agent's final conclusion will appear here..."
        )
        agent_log_output_textbox = gr.Textbox(
            label="Agent's Step-by-Step Log (Simulated LLM Thoughts & Tool Use)", 
            lines=15, 
            interactive=False,
            placeholder="Detailed agent activity log..."
        )

    with gr.Accordion("In-Memory Knowledge Base Contents", open=True):
        knowledge_base_output_json = gr.JSON(
            label="Current Knowledge Base (Papers stored in this session)"
        )
        # For a more tabular view, if KNOWLEDGE_BASE items are consistent dictionaries:
        # knowledge_base_output_df = gr.DataFrame(
        #     label="Current Knowledge Base (Table View)",
        #     headers=["ID", "Title", "URL", "Abstract Snippet", "Scraped At"], # Adjust headers as needed
        #     # You'd need to transform KNOWLEDGE_BASE into a list of lists for gr.DataFrame
        # )
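        # A sketch of that transform (hypothetical helper, not used by the app):
        # def kb_as_rows():
        #     return [[p.get("id"), p.get("title"), p.get("url"),
        #              (p.get("abstract") or "")[:100], p.get("scraped_at")]
        #             for p in KNOWLEDGE_BASE]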

    # Connect the button click to the processing function
    submit_query_button.click(
        fn=process_gradio_query,
        inputs=[query_input_textbox],
        outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox]
    )
    
    gr.Examples(
        examples=[
            ["Find papers on 'reinforcement learning for robotics' and add the first one to the knowledge base."],
            ["Search arxiv for 'quantum machine learning' and process the top result."],
            ["Find papers on 'explainable AI in healthcare' and add the first one to the knowledge base."],
            ["Find papers on 'graph neural networks for drug discovery' and add the top one to the knowledge base."],
        ],
        inputs=[query_input_textbox],
        # Optional: Define outputs and function for examples if they should pre-fill or behave differently
        # outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox],
        # fn=process_gradio_query 
    )
    
    gr.Markdown(
        """
        ---
        *Powered by a Mock LLM & Gradio. For a real application, replace `mock_llm` with an actual LLM integration.*
        *PDF scraping uses PyMuPDF. arXiv interaction uses the `arxiv` library.*
        *Knowledge Base is in-memory and resets if the Gradio app restarts.*
        """
    )

if __name__ == "__main__":
    # Instructions to run this Gradio app locally:
    # 1. Ensure all dependencies are installed:
    #    pip install gradio arxiv PyMuPDF requests
    # 2. Save this code as a Python file (e.g., app.py).
    # 3. Run the file from your terminal:
    #    python app.py
    # This will launch a local web server, and Gradio will provide a URL (usually http://127.0.0.1:7860)
    # that you can open in your web browser to interact with the app.
    #
    # For deployment on Hugging Face Spaces:
    # - Name this file `app.py`.
    # - Create a `requirements.txt` file in the same directory with the content:
    #   gradio
    #   arxiv
    #   PyMuPDF
    #   requests
    # - Create a new Space on Hugging Face, select "Gradio" as the SDK, and upload these files.
    demo.launch() # debug=True can be helpful for local development