APrmn8 committed on
Commit
18e9f44
·
verified ·
1 Parent(s): 08a44c9

rombak total

Browse files
Files changed (2) hide show
  1. app.py +454 -294
  2. requirements.txt +2 -7
app.py CHANGED
@@ -1,329 +1,489 @@
1
- # app.py
2
  import gradio as gr
3
- import os
4
  import re
5
- import shutil
6
- import torch
7
- import pickle # For saving/loading Python objects
8
-
9
- # LangChain imports
10
- from langchain_community.document_loaders import PyPDFLoader
11
- from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain_community.embeddings import HuggingFaceEmbeddings
13
- from langchain_community.vectorstores import FAISS
14
- from langchain.chains import RetrievalQA
15
- from langchain_community.llms import HuggingFacePipeline
16
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
17
-
18
- # --- Configuration ---
19
- ARXIV_DIR = "./arxiv_papers" # Directory to save downloaded papers
20
- KB_STORAGE_DIR = "./knowledge_base_storage" # Directory to save/load KB
21
- FAISS_INDEX_PATH = os.path.join(KB_STORAGE_DIR, "faiss_index.bin")
22
- CHUNKS_PATH = os.path.join(KB_STORAGE_DIR, "knowledge_base_chunks.pkl")
23
-
24
- CHUNK_SIZE = 500 # Characters per chunk
25
- CHUNK_OVERLAP = 50 # Overlap between chunks
26
- EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
27
- LLM_MODEL_NAME = "google/flan-t5-small"
28
-
29
- # Ensure KB storage directory exists
30
- os.makedirs(KB_STORAGE_DIR, exist_ok=True)
31
-
32
- # --- Helper Functions for arXiv and PDF Processing ---
33
-
34
- def clean_text(text: str) -> str:
35
- """Basic text cleaning: replaces multiple spaces/newlines with single space and strips whitespace."""
36
- text = re.sub(r'\s+', ' ', text)
37
- text = text.strip()
38
- return text
39
-
40
- def get_arxiv_papers(query: str, max_papers: int = 5) -> list[str]:
41
  """
42
- Searches arXiv for papers, downloads their PDFs, and returns a list of file paths.
43
- Clears the ARXIV_DIR before downloading new papers.
 
 
44
  """
45
- # Clear existing papers before downloading new ones
46
- if os.path.exists(ARXIV_DIR):
47
- shutil.rmtree(ARXIV_DIR)
48
- os.makedirs(ARXIV_DIR, exist_ok=True)
49
-
50
- print(f"Searching arXiv for '{query}' and downloading up to {max_papers} papers...")
51
- import arxiv # Import here to ensure it's available when this function is called
52
- search_results = arxiv.Search(
53
- query=query,
54
- max_results=max_papers,
55
- sort_by=arxiv.SortCriterion.Relevance,
56
- sort_order=arxiv.SortOrder.Descending
57
- )
58
- downloaded_files = []
59
- for i, result in enumerate(search_results.results()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  try:
61
- # Create a safe filename
62
- safe_title = re.sub(r'[\\/:*?"<>|]', '', result.title) # Remove invalid characters
63
- filename = f"{ARXIV_DIR}/{safe_title[:100]}_{result.arxiv_id}.pdf" # Limit title length
64
- print(f"Downloading paper {i+1}/{max_papers}: {result.title}")
65
- result.download_pdf(filename=filename)
66
- downloaded_files.append(filename)
67
  except Exception as e:
68
- print(f"Could not download {result.title}: {e}")
69
- return downloaded_files
70
-
71
- # --- RAGAgent Class ---
72
-
73
- class RAGAgent:
74
- def __init__(self):
75
- self.embedding_model = None
76
- self.llm = None
77
- self.vectorstore = None
78
- self.qa_chain = None
79
- self.is_initialized = False
80
-
81
- def _load_models(self):
82
- """Loads the embedding and generation models if not already loaded."""
83
- if self.embedding_model is None:
84
- print(f"Loading Embedding Model: {EMBEDDING_MODEL_NAME}...")
85
- self.embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
 
 
 
 
 
86
 
87
- if self.llm is None:
88
- print(f"Loading LLM Model: {LLM_MODEL_NAME}...")
89
- tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
90
- model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
91
-
92
- # Determine device for pipeline
93
- device = 0 if torch.cuda.is_available() else -1
94
-
95
- # Create a Hugging Face pipeline for text generation
96
- text_generation_pipeline = pipeline(
97
- "text2text-generation",
98
- model=model,
99
- tokenizer=tokenizer,
100
- max_new_tokens=150, # Set a default max_new_tokens for the pipeline
101
- min_length=20,
102
- num_beams=5,
103
- early_stopping=True,
104
- device=device
105
- )
106
- self.llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
107
 
108
- self.is_initialized = True
109
-
110
- def initialize_knowledge_base(self, arxiv_query: str, max_papers: int = 5) -> str:
111
- """
112
- Initializes the knowledge base by downloading, extracting, and chunking
113
- arXiv papers using LangChain components, then building a FAISS vectorstore.
114
- """
115
- self._load_models() # Ensure models are loaded first
116
-
117
- # Clear existing papers before downloading new ones
118
- if os.path.exists(ARXIV_DIR):
119
- shutil.rmtree(ARXIV_DIR)
120
- os.makedirs(ARXIV_DIR, exist_ok=True)
121
-
122
- self.vectorstore = None
123
- self.qa_chain = None
124
- self.knowledge_base_chunks = [] # Reset chunks
125
-
126
- print(f"Searching arXiv for '{arxiv_query}' and downloading up to {max_papers} papers...")
127
- try:
128
- # Manual download using arxiv library (as it offers more control over filenames)
129
- pdf_paths = get_arxiv_papers(arxiv_query, max_papers) # Call the helper function
130
-
131
- if not pdf_paths:
132
- return "No papers found or downloaded for the given query. Please try a different query."
133
-
134
- # Load documents from downloaded PDFs using PyPDFLoader
135
- all_documents = []
136
- for pdf_path in pdf_paths:
137
- try:
138
- loader = PyPDFLoader(pdf_path)
139
- all_documents.extend(loader.load())
140
- except Exception as e:
141
- print(f"Error loading PDF {pdf_path}: {e}")
142
-
143
- if not all_documents:
144
- return "Could not load any documents from downloaded PDFs. Please try a different query or fewer papers."
145
-
146
- print(f"Loaded {len(all_documents)} raw documents from PDFs.")
147
-
148
- # Split documents into chunks using RecursiveCharacterTextSplitter
149
- text_splitter = RecursiveCharacterTextSplitter(
150
- chunk_size=CHUNK_SIZE,
151
- chunk_overlap=CHUNK_OVERLAP,
152
- length_function=len,
153
- is_separator_regex=False,
154
- )
155
- self.knowledge_base_chunks = text_splitter.split_documents(all_documents)
156
-
157
- if not self.knowledge_base_chunks:
158
- return "No meaningful text chunks could be created from the papers after splitting."
159
-
160
- print(f"Total chunks created: {len(self.knowledge_base_chunks)}")
161
-
162
- # Create FAISS vectorstore from chunks and embeddings
163
- print("Creating FAISS vectorstore from chunks...")
164
- self.vectorstore = FAISS.from_documents(self.knowledge_base_chunks, self.embedding_model)
165
- print(f"FAISS vectorstore created with {len(self.knowledge_base_chunks)} documents.")
166
-
167
- # Create RetrievalQA chain
168
- self.qa_chain = RetrievalQA.from_chain_type(
169
- llm=self.llm,
170
- chain_type="stuff", # "stuff" puts all retrieved docs into one prompt
171
- retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}), # Retrieve top 3 docs
172
- return_source_documents=False # Set to True if you want to return source docs
173
- )
174
-
175
- return f"Knowledge base loaded with {len(self.knowledge_base_chunks)} chunks from {len(pdf_paths)} arXiv papers on '{arxiv_query}'."
176
 
177
- except Exception as e:
178
- print(f"Error during knowledge base initialization: {e}")
179
- return f"An error occurred during knowledge base initialization: {e}"
180
 
181
- def save_knowledge_base(self) -> str:
182
- """Saves the current FAISS vectorstore and knowledge base chunks to disk."""
183
- if not self.vectorstore or not self.knowledge_base_chunks:
184
- return "No knowledge base to save. Please load one first."
185
 
186
- try:
187
- # Save FAISS index
188
- self.vectorstore.save_local(KB_STORAGE_DIR, index_name="faiss_index")
189
- # Save chunks (metadata for FAISS, or for re-building if needed)
190
- with open(CHUNKS_PATH, 'wb') as f:
191
- pickle.dump(self.knowledge_base_chunks, f)
192
- print(f"Knowledge base saved to {KB_STORAGE_DIR}")
193
- return f"Knowledge base saved successfully to {KB_STORAGE_DIR}."
194
- except Exception as e:
195
- print(f"Error saving knowledge base: {e}")
196
- return f"Error saving knowledge base: {e}"
197
-
198
- def load_knowledge_base(self) -> str:
199
- """Loads the FAISS vectorstore and knowledge base chunks from disk."""
200
- self._load_models() # Ensure models are loaded before loading KB
201
 
202
- if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(CHUNKS_PATH):
203
- return "Saved knowledge base not found. Please load or create one first."
204
 
205
- try:
206
- # Load FAISS index
207
- self.vectorstore = FAISS.load_local(KB_STORAGE_DIR, self.embedding_model, index_name="faiss_index", allow_dangerous_deserialization=True)
208
- # Load chunks
209
- with open(CHUNKS_PATH, 'rb') as f:
210
- self.knowledge_base_chunks = pickle.load(f)
211
-
212
- # Re-create RetrievalQA chain after loading vectorstore
213
- self.qa_chain = RetrievalQA.from_chain_type(
214
- llm=self.llm,
215
- chain_type="stuff",
216
- retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
217
- return_source_documents=False
218
- )
219
-
220
- print(f"Knowledge base loaded from {KB_STORAGE_DIR}")
221
- return f"Knowledge base loaded successfully from {KB_STORAGE_DIR} with {len(self.knowledge_base_chunks)} chunks."
222
- except Exception as e:
223
- print(f"Error loading knowledge base: {e}")
224
- self.vectorstore = None
225
- self.qa_chain = None
226
- self.knowledge_base_chunks = []
227
- return f"Error loading knowledge base: {e}"
228
-
229
- def query_agent(self, query: str) -> str:
230
- """
231
- Retrieves relevant information from the knowledge base and generates an answer
232
- using the LangChain RetrievalQA chain.
233
- """
234
- if not query.strip():
235
- return "Please enter a question."
236
- if not self.is_initialized or self.qa_chain is None:
237
- return "Knowledge base not loaded. Please initialize it by providing an arXiv query or loading from disk."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
- print(f"\n--- Querying LLM with LangChain QA Chain ---\nQuestion: {query}\n----------------------")
 
 
240
 
241
- try:
242
- # Use the RetrievalQA chain to get the answer
243
- result = self.qa_chain.invoke({"query": query})
244
- answer = result["result"].strip()
245
- except Exception as e:
246
- print(f"Error during generation: {e}")
247
- answer = "I apologize, but I encountered an error while generating the answer. Please try again or rephrase your question."
 
248
 
249
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- # --- Gradio Interface ---
 
 
 
252
 
253
- # Instantiate the RAGAgent
254
- rag_agent_instance = RAGAgent()
255
 
256
- print("Setting up Gradio interface...")
 
 
 
 
257
 
258
- with gr.Blocks() as demo:
259
- gr.Markdown("# 📚 Educational RAG Agent with Persistent Knowledge Base")
260
- gr.Markdown("First, load a knowledge base from arXiv, then you can save it or load a previously saved one. Finally, ask questions!")
 
 
 
 
 
 
261
 
262
- with gr.Row():
263
- arxiv_input = gr.Textbox(
264
- label="arXiv Search Query (e.g., 'Large Language Models', 'Reinforcement Learning')",
265
- placeholder="Enter a topic to search for papers on arXiv...",
266
- lines=1
267
- )
268
- max_papers_slider = gr.Slider(
269
- minimum=1,
270
- maximum=10,
271
- step=1,
272
- value=3,
273
- label="Max Papers to Download"
274
- )
275
- load_kb_from_arxiv_button = gr.Button("Load KB from arXiv")
276
 
277
- kb_status_output = gr.Textbox(label="Knowledge Base Status", interactive=False)
 
 
 
278
 
279
- with gr.Row():
280
- save_kb_button = gr.Button("Save Knowledge Base to Disk")
281
- load_kb_from_disk_button = gr.Button("Load Knowledge Base from Disk")
282
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  with gr.Row():
284
- question_input = gr.Textbox(
285
- lines=3,
286
- placeholder="Ask a question based on the loaded knowledge base...",
287
- label="Your Question"
288
  )
289
- answer_output = gr.Textbox(label="Answer", lines=7, interactive=False)
290
 
291
- submit_button = gr.Button("Get Answer")
292
-
293
- load_kb_from_arxiv_button.click(
294
- fn=rag_agent_instance.initialize_knowledge_base,
295
- inputs=[arxiv_input, max_papers_slider],
296
- outputs=kb_status_output
297
- )
298
-
299
- save_kb_button.click(
300
- fn=rag_agent_instance.save_knowledge_base,
301
- inputs=[],
302
- outputs=kb_status_output
303
- )
304
-
305
- load_kb_from_disk_button.click(
306
- fn=rag_agent_instance.load_knowledge_base,
307
- inputs=[],
308
- outputs=kb_status_output
309
- )
310
 
311
- submit_button.click(
312
- fn=rag_agent_instance.query_agent,
313
- inputs=question_input,
314
- outputs=answer_output
 
 
 
 
 
 
 
 
 
 
 
 
315
  )
316
-
317
  gr.Examples(
318
  examples=[
319
- ["What is the transformer architecture?"],
320
- ["Explain attention mechanisms in deep learning."],
321
- ["What are the challenges in reinforcement learning?"],
 
322
  ],
323
- inputs=question_input
 
 
 
 
 
 
 
 
 
 
 
 
324
  )
325
 
326
- # Launch the Gradio app
327
  if __name__ == "__main__":
328
- print("Launching Gradio app...")
329
- demo.launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import re
3
+ import os # Used for environment variables if you switch to a real LLM
4
+ import datetime # For timestamping entries in the knowledge base
5
+ import arxiv # Python library for interacting with the arXiv API
6
+ import requests # For making HTTP requests to download PDF files
7
+ import fitz # PyMuPDF library for extracting text from PDF documents
8
+
9
+ # --- Agent Core Logic ---
10
+
11
# CURRENT_PAPER_CONTEXT: module-level scratch state that lets mock_llm remember
# which paper is being processed within a single agent run. mock_llm writes
# keys such as 'id', 'title', 'url', 'query' and 'status' into it, and the
# scraper tool reads 'title' and 'id' back out of it.
# In a real agent with a proper LLM, state management would be more
# sophisticated, possibly integrated into the agent's memory or passed
# explicitly. The agent resets this dictionary for each new user query.
CURRENT_PAPER_CONTEXT: dict = {}

# KNOWLEDGE_BASE: an in-memory list of dictionaries, one per stored paper.
# It acts as a simple knowledge base that lives only as long as this module
# stays loaded; for persistence you would use a database or file storage.
KNOWLEDGE_BASE: list = []
22
+
23
+
24
def mock_llm(prompt: str, tools_description: str) -> str:
    """Simulate an LLM for the arXiv agent with keyword rules.

    The function inspects the prompt, decides which step of the
    search -> scrape -> store pipeline comes next, and answers in ReAct
    format ("Thought / Action / Action Input" or "Thought / Final Answer").
    It keeps track of the paper being processed in the module-level
    CURRENT_PAPER_CONTEXT dictionary.

    Args:
        prompt: The accumulated prompt history (user query, thoughts and
            "Observation from last action (<Tool>): ..." entries).
        tools_description: Human-readable tool list; only echoed to the log.

    Returns:
        A ReAct-formatted response string.
    """
    global CURRENT_PAPER_CONTEXT
    # Print the prompt for debugging (visible in the console / Space logs).
    print(f"\n----- Mock LLM Input -----\nPrompt:\n{prompt}\nTools Available:\n{tools_description}\n--------------------------\n")
    lower_prompt = prompt.lower()  # Only used for case-insensitive trigger checks

    # The prompt is a growing history, so several observations may be present.
    # Only the most recent one describes the current state; reacting to the
    # first marker found would make the agent repeat an earlier step forever.
    observation_markers = list(re.finditer(
        r"observation from last action \((arxivsearchtool|paperscrapertool|knowledgebasestoragetool)\):",
        prompt,
        re.IGNORECASE,
    ))
    last_tool = None
    observation = ""
    if observation_markers:
        last_marker = observation_markers[-1]
        last_tool = last_marker.group(1).lower()
        # Parse the original-case text so titles, URLs and abstracts are
        # stored as written instead of lower-cased.
        observation = prompt[last_marker.end():]

    # Scenario 1: observation received from ArxivSearchTool
    if last_tool == "arxivsearchtool":
        # IDs may carry a version suffix ("1706.03762v7") or an old-style
        # category prefix, so accept anything up to the comma. DOTALL lets
        # titles that the arXiv API wraps over several lines still match.
        match = re.search(
            r"top result:\s*'(.*?)'\s*\(id:\s*([^\s,]+),\s*url:\s*(https?://[^\s]+)\)",
            observation,
            re.IGNORECASE | re.DOTALL,
        )
        if match:
            title, paper_id, url = match.groups()
            title = " ".join(title.split())  # collapse wrapped-title whitespace
            CURRENT_PAPER_CONTEXT = {'id': paper_id, 'title': title, 'url': url, 'status': 'found_paper'}
            return (
                f"Thought: I have found a paper titled '{title}' with ID {paper_id}. I should now scrape its content to extract information using the PaperScraperTool.\n"
                "Action: PaperScraperTool\n"
                f"Action Input: {url}"  # The arXiv page URL is the scraper's input
            )
        CURRENT_PAPER_CONTEXT = {'status': 'search_failed_to_parse'}
        return (
            "Thought: I received search results from ArxivSearchTool, but I couldn't parse the top paper details from the observation. I cannot proceed with scraping.\n"
            "Final Answer: I found some papers but had trouble extracting specific details for scraping. Please check the raw search results if they were logged, or try a different query."
        )

    # Scenario 2: observation received from PaperScraperTool
    if last_tool == "paperscrapertool":
        if CURRENT_PAPER_CONTEXT.get('status') != 'found_paper':
            return (
                "Thought: I received scraped content, but I don't have the correct prior context (e.g., which paper was being scraped). This is unexpected.\n"
                "Final Answer: Error processing scraped content due to missing or incorrect context. The scraping might have occurred without a preceding successful search and paper identification."
            )
        abstract_match = re.search(r"abstract:\s*(.*?)(full text snippet:|$)", observation, re.IGNORECASE | re.DOTALL)
        text_snippet_match = re.search(r"full text snippet:\s*(.*)", observation, re.IGNORECASE | re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else "Could not extract abstract from observation."
        text_snippet = text_snippet_match.group(1).strip() if text_snippet_match else "Could not extract text snippet from observation."

        paper_data_for_kb = {
            "id": CURRENT_PAPER_CONTEXT.get('id', 'unknown_id'),
            "title": CURRENT_PAPER_CONTEXT.get('title', 'Unknown Title'),
            "url": CURRENT_PAPER_CONTEXT.get('url', 'unknown_url'),
            "abstract": abstract,
            "text_snippet": text_snippet,
            "scraped_at": datetime.datetime.now().isoformat(),
        }
        CURRENT_PAPER_CONTEXT['status'] = 'scraped_paper'
        # The dictionary travels to the storage tool as its Python literal form.
        return (
            f"Thought: I have the scraped content for '{CURRENT_PAPER_CONTEXT.get('title')}'. I should now store this information in the knowledge base using the KnowledgeBaseStorageTool.\n"
            "Action: KnowledgeBaseStorageTool\n"
            f"Action Input: {str(paper_data_for_kb)}"
        )

    # Scenario 3: observation received from KnowledgeBaseStorageTool
    if last_tool == "knowledgebasestoragetool":
        if CURRENT_PAPER_CONTEXT.get('status') == 'scraped_paper':
            paper_title = CURRENT_PAPER_CONTEXT.get('title', 'the paper')
            CURRENT_PAPER_CONTEXT = {}  # Task complete, drop the context
            return (
                f"Thought: The paper '{paper_title}' has been successfully processed (found, scraped, and stored) in the knowledge base. The task is complete.\n"
                f"Final Answer: Successfully found, scraped, and stored information for '{paper_title}'."
            )
        CURRENT_PAPER_CONTEXT = {}
        return (
            "Thought: I received a storage confirmation, but the context was unclear or didn't match the expected 'scraped_paper' status.\n"
            "Final Answer: A storage action was observed, but there might have been issues in the preceding steps. The overall process integrity is uncertain."
        )

    # Scenario 4: initial query processing (a search request)
    if "find papers on" in lower_prompt or "search arxiv for" in lower_prompt:
        query_match = re.search(r"(?:find papers on|search arxiv for)\s*(.+)", prompt, re.IGNORECASE)
        search_query = query_match.group(1).strip() if query_match else "default search: quantum computing"
        CURRENT_PAPER_CONTEXT = {'query': search_query, 'status': 'searching'}
        return (
            f"Thought: The user wants to find papers about '{search_query}'. I should use the ArxivSearchTool to find relevant papers.\n"
            "Action: ArxivSearchTool\n"
            f"Action Input: {search_query}"
        )

    # Fallback: the query is outside the mock's keyword rules
    CURRENT_PAPER_CONTEXT = {}
    original_query = prompt.split("User query:", 1)[-1].split("\n", 1)[0].strip() if "User query:" in prompt else "the user's query"
    return (
        f"Thought: I'm not sure how to handle this query: '{original_query}'. My current mocked abilities are limited to searching arXiv based on keywords like 'find papers on' or 'search arxiv for', then scraping and storing the first result.\n"
        "Final Answer: I can only search arXiv for papers and process them if the query starts with 'find papers on' or 'search arxiv for'. Please rephrase your query (e.g., 'find papers on artificial intelligence')."
    )
111
+
112
class Tool:
    """A named, described callable that the agent can invoke.

    Attributes:
        name: Identifier the LLM uses in its "Action:" line.
        description: Text shown to the LLM so it can choose the right tool.
        func: The Python callable executed by run(); it receives the action
            input string as its only argument.
    """

    def __init__(self, name: str, description: str, func):
        self.name = name
        self.description = description  # Crucial for the LLM to understand the tool's purpose
        self.func = func  # The actual Python function to execute

    def __repr__(self) -> str:
        return f"{type(self).__name__}(name={self.name!r})"

    def run(self, action_input: str) -> str:
        """Execute the tool and return its observation as a string.

        Any exception raised by the wrapped callable is converted into an
        error message, so the agent loop always receives an observation
        instead of crashing.
        """
        print(f"TOOL EXECUTING: {self.name} with input: '{action_input}'")
        try:
            result = self.func(action_input)
        except Exception as e:
            error_message = f"Error executing tool {self.name}: {str(e)}"
            print(error_message)
            return error_message  # Return error message as observation
        # Observations are spliced into the prompt, so honour the declared
        # ``str`` return type even if the callable returned something else.
        if not isinstance(result, str):
            result = str(result)
        print(f"TOOL RESULT ({self.name}): {result}")
        return result
130
+
131
def arxiv_search_func(query: str, max_results: int = 1) -> str:
    """Tool function: search arXiv and summarise the top hit.

    Args:
        query: Free-text arXiv search query.
        max_results: Maximum number of results to request. Kept at 1 by
            default for a faster demo and simpler mock-LLM logic.

    Returns:
        A one-line summary naming the top result (title, versionless ID and
        abstract-page URL), a "no papers found" message, or an error message.
        The function never raises; failures are reported in the returned text.
    """
    if not query or not query.strip():
        # Avoid a pointless API round-trip for an empty query.
        return "Error searching arXiv: empty search query."

    try:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        results_data = []
        # Search.results() is deprecated in the arxiv package; the client
        # API is the supported way to iterate over results.
        for r in arxiv.Client().results(search):
            # entry_id looks like "http://arxiv.org/abs/1703.03400v5". Keep
            # everything after "/abs/" (old-style IDs contain a slash) and
            # drop the trailing version to get the versionless ID.
            short_id = r.entry_id.split("/abs/")[-1]
            results_data.append({
                "id": re.sub(r"v\d+$", "", short_id),
                "title": r.title,
                "authors": [author.name for author in r.authors],
                "summary": r.summary,
                "published": r.published.isoformat(),
                "pdf_url": r.pdf_url,  # Direct PDF link
                "arxiv_url": r.entry_id,  # Link to the abstract page
            })

        if not results_data:
            return f"No papers found on arXiv for query: '{query}'."

        # For the mock LLM, provide a clear summary of the top result
        top_result = results_data[0]
        return (f"Found {len(results_data)} papers. "
                f"Top result: '{top_result['title']}' (ID: {top_result['id']}, URL: {top_result['arxiv_url']})")

    except Exception as e:
        return f"Error searching arXiv: {str(e)}"
161
+
162
_ABSTRACT_FALLBACK = "Could not reliably extract abstract from PDF text."

# The abstract ends at the first section heading that starts a line, or at a
# run of blank lines. Anchoring the terminators to line starts keeps in-sentence
# text such as "i.e.", "0.1." or the word "introduction" from truncating it.
_ABSTRACT_PATTERN = re.compile(
    r"Abstract\s*([\s\S]*?)"
    r"(?:\n[ \t]*(?:(?:1|I)\.?[ \t]+)?Introduction\b"
    r"|\n[ \t]*(?:Keywords|Index Terms)\b"
    r"|\n[ \t]*(?-i:(?:1\.?|I\.)[ \t]+[A-Z])"
    r"|\n\s*\n\s*\n)",
    re.IGNORECASE,
)


def _resolve_arxiv_pdf_url(reference: str) -> str:
    """Turn an arXiv abs URL, pdf URL or bare ID into a direct PDF URL."""
    if "arxiv.org/abs/" in reference:  # e.g., http://arxiv.org/abs/1703.03400
        # Accept new-style numeric IDs and old-style "category/1234567" IDs;
        # the version suffix is left out so the latest version is fetched.
        id_match = re.search(
            r"abs/([a-z\-]+(?:\.[a-z\-]+)?/\d{7}|[\d\.]+)", reference, re.IGNORECASE
        )
        if not id_match:
            raise ValueError("Could not extract paper ID from abs URL.")
        return f"https://arxiv.org/pdf/{id_match.group(1)}.pdf"
    if "arxiv.org/pdf/" in reference:  # e.g., http://arxiv.org/pdf/1703.03400.pdf
        return reference
    if re.match(r'^[\d\.]+(v\d+)?$', reference):  # e.g., 1703.03400 or 1703.03400v5
        return f"https://arxiv.org/pdf/{reference}.pdf"
    raise ValueError(f"Invalid input format for PaperScraperTool: '{reference}'. Expected arXiv URL or ID.")


def _extract_abstract(first_page_text: str) -> str:
    """Heuristically pull the abstract out of the first page's text."""
    found = _ABSTRACT_PATTERN.search(first_page_text)
    if not found:
        return _ABSTRACT_FALLBACK
    abstract = found.group(1).strip().replace('\n', ' ')
    return abstract or _ABSTRACT_FALLBACK


def paper_scraper_func(pdf_url_or_id: str) -> str:
    """Tool function: download an arXiv PDF and extract its text content.

    Args:
        pdf_url_or_id: arXiv abstract-page URL, direct PDF URL, or bare ID.

    Returns:
        An observation string of the form "Scraped content for '<title>'
        (ID: <id>). Abstract: <abstract> Full text snippet: <first 500 chars>",
        or an error message. The function never raises.
    """
    # Resolve the URL first so malformed input is rejected before any
    # network access is attempted.
    try:
        pdf_url = _resolve_arxiv_pdf_url(str(pdf_url_or_id).strip())
    except ValueError as e:
        return f"Error scraping paper '{pdf_url_or_id}': {str(e)}"

    try:
        print(f"Attempting to download PDF from: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)  # Timeout for network robustness
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        # Title/ID come from the context set by the LLM in the search step;
        # the context is only read here, so no ``global`` statement is needed.
        paper_title = CURRENT_PAPER_CONTEXT.get('title', f"paper from {pdf_url}")
        paper_id_context = CURRENT_PAPER_CONTEXT.get('id', 'unknown_id_from_context')

        # Use fitz (PyMuPDF) to open the PDF from the downloaded bytes and
        # collect page texts in a list (joined once, instead of quadratic +=).
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            page_texts = [page.get_text("text") for page in doc]

        full_text = "".join(page_texts)
        abstract = _extract_abstract(page_texts[0]) if page_texts else _ABSTRACT_FALLBACK

        # Provide a snippet for the observation (full text can be very long)
        text_snippet = (full_text[:500] + "...") if len(full_text) > 500 else full_text

        return (f"Scraped content for '{paper_title}' (ID: {paper_id_context}). "
                f"Abstract: {abstract} Full text snippet: {text_snippet}")

    except requests.exceptions.RequestException as e:
        return f"Error downloading PDF from '{pdf_url_or_id}': {str(e)}"
    except Exception as e:
        return f"Error scraping paper '{pdf_url_or_id}': {str(e)}"
210
+
211
def knowledge_base_storage_func(paper_data_str: str) -> str:
    """Tool function: store extracted paper information in KNOWLEDGE_BASE.

    Args:
        paper_data_str: String holding a Python dictionary literal with at
            least the keys "id", "title", "url" and "abstract". A dict
            passed directly is accepted as well.

    Returns:
        A confirmation message, a "already in the knowledge base" notice, or
        an error message describing why the data was rejected. The function
        never raises.
    """
    import ast  # local import: only needed here for safe literal parsing

    global KNOWLEDGE_BASE, CURRENT_PAPER_CONTEXT
    try:
        if isinstance(paper_data_str, str):
            try:
                # literal_eval accepts only Python literals, so text coming
                # from an LLM can never execute code the way eval() would.
                paper_data = ast.literal_eval(paper_data_str)
                if not isinstance(paper_data, dict):
                    raise ValueError("Parsed data from string is not a dictionary.")
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                return f"Error parsing paper data string for storage: {str(e)}. Input data string was: '{paper_data_str}'"
        elif isinstance(paper_data_str, dict):  # If a dict is somehow passed directly
            paper_data = paper_data_str
        else:
            return f"Invalid data type received for storage: {type(paper_data_str)}. Expected string (evaluable to dict) or dict."

        # Validate essential keys
        required_keys = ["id", "title", "url", "abstract"]
        missing_keys = [key for key in required_keys if key not in paper_data]
        if missing_keys:
            return f"Error: Missing required keys for storage: {missing_keys}. Received data: {paper_data}"

        # Avoid adding duplicate papers by ID (.get tolerates odd entries)
        if any(p.get("id") == paper_data["id"] for p in KNOWLEDGE_BASE):
            return f"Paper with ID '{paper_data['id']}' is already in the knowledge base. Not adding again."

        KNOWLEDGE_BASE.append(paper_data)
        return (f"Successfully stored paper '{paper_data.get('id', 'N/A')}' (Title: '{paper_data.get('title', 'N/A')}') in the knowledge base. "
                f"Knowledge base now contains {len(KNOWLEDGE_BASE)} papers.")
    except Exception as e:
        # If storage fails critically, reset context to prevent loops with bad data
        CURRENT_PAPER_CONTEXT = {}
        return f"Critical error storing paper in knowledge base: {str(e)}. Input was: '{paper_data_str}'"
248
+
249
+ # Define the list of tools available to the agent
250
# (name, description, callable) triples describing every tool the agent may
# call; the descriptions are what the LLM reads when choosing a tool.
_TOOL_SPECS = (
    (
        "ArxivSearchTool",
        "Searches the arXiv repository for research papers based on a query. Input should be the search query (e.g., 'machine learning for climate change'). Returns a summary of search results, highlighting the top paper found.",
        arxiv_search_func,
    ),
    (
        "PaperScraperTool",
        "Downloads an arXiv paper PDF given its arXiv abstract page URL (e.g., 'http://arxiv.org/abs/1234.5678') or just its ID (e.g., '1234.5678') and extracts its text content, including the abstract and a snippet of the full text.",
        paper_scraper_func,
    ),
    (
        "KnowledgeBaseStorageTool",
        "Stores extracted information about a paper (such as its ID, title, URL, abstract, and text snippet) into the system's knowledge base. Input should be a string representation of a Python dictionary containing these paper details.",
        knowledge_base_storage_func,
    ),
)

# Instantiate the tools in declaration order.
tools_list = [
    Tool(name=tool_name, description=tool_description, func=tool_func)
    for tool_name, tool_description, tool_func in _TOOL_SPECS
]
267
 
268
def get_tools_description_for_prompt(tool_list_arg):
    """Render the tools as a bulleted "- name: description" list, one per line.

    The resulting text is placed in the LLM prompt so the model can pick a tool.
    """
    bullet_lines = []
    for entry in tool_list_arg:
        bullet_lines.append(f"- {entry.name}: {entry.description}")
    return "\n".join(bullet_lines)
271
 
272
def parse_llm_react_output(llm_response: str):
    """Split a ReAct-formatted LLM response into its components.

    Args:
        llm_response: Raw text containing "Thought:", and either
            "Action:" / "Action Input:" or "Final Answer:".

    Returns:
        A tuple ``(thought, action, action_input, final_answer)``. ``thought``,
        ``action`` and ``final_answer`` are ``None`` when absent;
        ``action_input`` defaults to the empty string.
    """
    flags = re.IGNORECASE | re.DOTALL

    # The thought ends where the next ReAct marker starts a line; without the
    # lookahead it would swallow the Action / Final Answer lines as well.
    thought_match = re.search(
        r"Thought:[ \t]*(.*?)(?=\n\s*(?:Action(?:\s+Input)?|Final\s+Answer)\s*:|\Z)",
        llm_response,
        flags,
    )
    # Tool names are alphanumeric with underscores
    action_match = re.search(r"Action:\s*([\w_]+)", llm_response, re.IGNORECASE)
    action_input_match = re.search(r"Action Input:\s*(.*)", llm_response, flags)
    final_answer_match = re.search(r"Final Answer:\s*(.*)", llm_response, flags)

    # A "Final Answer:" that appears inside the action input (e.g. in scraped
    # paper text) is payload, not a real final answer.
    if (
        action_match
        and action_input_match
        and final_answer_match
        and final_answer_match.start() > action_input_match.start()
    ):
        final_answer_match = None

    thought = (thought_match.group(1).strip() or None) if thought_match else None
    action = action_match.group(1).strip() if action_match else None
    # Missing "Action Input:" line means the tool is called with empty input.
    action_input_str = action_input_match.group(1).strip() if action_input_match else ""
    final_answer = final_answer_match.group(1).strip() if final_answer_match else None
    return thought, action, action_input_str, final_answer
292
+
293
class ReActAgent:
    """A simple ReAct agent that uses an LLM to reason and act.

    The agent repeatedly asks ``llm_function`` for the next step, parses the
    reply with ``parse_llm_react_output`` and, when an action is requested,
    runs the matching tool and feeds the observation back into the prompt.

    Args:
        llm_function: Callable invoked as ``llm_function(prompt_history,
            tools_description)`` that returns the LLM reply as a string.
        tool_list_arg: Iterable of tool objects exposing ``name`` and
            ``run(action_input)``.
        max_iterations: Upper bound on Thought/Action cycles per query.
    """

    def __init__(self, llm_function, tool_list_arg, max_iterations=7):
        self.llm_function = llm_function
        # Index tools by name so an "Action: <name>" line maps straight to a tool.
        self.tools = {tool.name: tool for tool in tool_list_arg}
        self.tools_description = get_tools_description_for_prompt(tool_list_arg)
        self.max_iterations = max_iterations
        # Step-by-step log of the most recent run, shown in the Gradio UI.
        self.agent_log = []

    def run(self, user_query: str):
        """Run the ReAct loop for ``user_query``.

        Returns:
            A ``(final_answer, log_text)`` tuple where ``log_text`` is the
            newline-joined step-by-step log of this run.
        """
        global CURRENT_PAPER_CONTEXT
        CURRENT_PAPER_CONTEXT = {}  # Ensure context is fresh for each new query
        self.agent_log = [f"User Query: {user_query}\n"]
        try:
            answer = self._react_loop(user_query)
        finally:
            # Reset the shared context on every exit path, including errors.
            CURRENT_PAPER_CONTEXT = {}
        return answer, "\n".join(self.agent_log)

    @staticmethod
    def _initial_prompt(user_query: str) -> str:
        """Build the instruction preamble sent to the LLM on the first turn."""
        return (
            f"User query: {user_query}\n"
            "You are an AI assistant that processes arXiv papers. You must use the ReAct format: Thought, Action, Action Input, Observation, and finally Final Answer.\n"
            "Based on the user query, decide on a thought, then an action to take using one of the available tools.\n"
            "After an action, you will receive an observation. Reason about the observation to decide the next step.\n"
            "If you have enough information from an observation to answer the user query, or if a multi-step task (like search, scrape, store) is complete, respond with 'Final Answer:'.\n"
            "When using PaperScraperTool, the input is the arXiv URL or ID. When using KnowledgeBaseStorageTool, the input is a string representation of a dictionary with paper details.\n"
        )

    def _execute_tool(self, action_name: str, action_input: str) -> str:
        """Run the named tool, returning its observation or an error message."""
        try:
            return self.tools[action_name].run(action_input)
        except Exception as exc:  # noqa: BLE001 - tool boundary: report, don't crash
            # Feed the failure back to the LLM as an observation instead of
            # aborting the whole run.
            return f"Error: Tool '{action_name}' raised {type(exc).__name__}: {exc}"

    def _react_loop(self, user_query: str) -> str:
        """Iterate Thought/Action/Observation steps and return the answer text."""
        prompt_history = self._initial_prompt(user_query)

        for i in range(self.max_iterations):
            log_entry = f"\n--- Iteration {i + 1} ---\n"

            llm_response_str = self.llm_function(prompt_history, self.tools_description)
            log_entry += f"LLM Raw Response (Mocked):\n{llm_response_str}\n"

            thought, action_name, action_input, final_answer = parse_llm_react_output(llm_response_str)

            if thought:
                log_entry += f"Thought: {thought}\n"
                prompt_history += f"Thought: {thought}\n"  # Carried into the next LLM call
            else:
                log_entry += "Warning: No thought found in LLM response for this iteration.\n"

            if final_answer:
                log_entry += f"\nFinal Answer from Agent: {final_answer}\n"
                self.agent_log.append(log_entry)
                return final_answer

            if not action_name:
                # Neither an action nor a final answer: the agent cannot progress.
                log_entry += "LLM did not specify an action or a final answer. The agent might be stuck or the task is implicitly complete based on LLM's internal state (which is hard for a mock to determine).\n"
                self.agent_log.append(log_entry)
                last_thought_or_obs = thought if thought else "No clear thought before stopping."
                return f"Agent concluded: No further action or final answer provided by LLM. Last thought: {last_thought_or_obs}"

            log_entry += f"Action: {action_name}\nAction Input: '{action_input}'\n"
            prompt_history += f"Action: {action_name}\nAction Input: {action_input}\n"

            if action_name in self.tools:
                observation = self._execute_tool(action_name, action_input)
                log_entry += f"Observation: {observation}\n"
            else:
                observation = f"Error: Tool '{action_name}' not found. Please choose from the available tools."
                log_entry += f"{observation}\n"
            # Both results and errors are fed back so the LLM can react to them.
            prompt_history += f"Observation: {observation}\n"

            self.agent_log.append(log_entry)

        # max_iterations reached without a final answer
        self.agent_log.append("\nMax iterations reached. Stopping.\n")
        return "Agent stopped: Maximum iterations reached without a final answer."
366
 
367
+ # --- Gradio App Definition ---
 
368
 
369
# Create a single module-level agent that every Gradio callback reuses.
# KNOWLEDGE_BASE is read elsewhere in this file as a module-level name rather
# than an attribute of the agent, so its contents outlive individual queries.
# NOTE(review): in a multi-user deployment this module-level state would
# presumably be shared between users -- consider per-session state or a real
# database if that is not desired.
arxiv_agent_instance = ReActAgent(llm_function=mock_llm, tool_list_arg=tools_list)
374
 
375
def _knowledge_base_for_display():
    """Return the knowledge base in a form the ``gr.JSON`` output can render.

    ``gr.JSON`` decodes any *string* value as JSON text, so a plain sentence
    would fail to decode. The empty case is therefore reported as a small dict
    instead of a bare message string.
    """
    if KNOWLEDGE_BASE:
        return KNOWLEDGE_BASE
    return {"status": "Knowledge Base is currently empty."}


def process_gradio_query(user_query_text: str):
    """Handle a query submitted from the Gradio interface.

    Args:
        user_query_text: Text typed by the user; may be empty or ``None``.

    Returns:
        A ``(final_answer, knowledge_base_view, agent_log)`` tuple matching the
        three output components wired to the submit button.
    """
    if not user_query_text or not user_query_text.strip():
        # Nothing to run: report the current knowledge base and a hint.
        return (
            "Please enter a query.",
            _knowledge_base_for_display(),
            "No agent activity to log for an empty query.",
        )

    final_answer, agent_log_str = arxiv_agent_instance.run(user_query_text)

    # Read the knowledge base only after the run so newly stored papers show up.
    return final_answer, _knowledge_base_for_display(), agent_log_str
 
 
394
 
395
# Define the Gradio interface using gr.Blocks for more layout control.
with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
    gr.Markdown(
        """
        # 📄🤖 arXiv Research Paper Agent (Demo)
        This agent uses a **mocked LLM** to simulate searching arXiv, scraping paper content,
        and adding it to an in-memory knowledge base.
        Enter a query like: `Find papers on 'topic X' and add the first one to the knowledge base.`
        The agent will show its thought process (as if from an LLM) and tool interactions.
        """
    )

    with gr.Row():
        query_input_textbox = gr.Textbox(
            label="Your Query for the arXiv Agent",
            placeholder="e.g., Find papers on 'transformer models' and add the first one to the knowledge base.",
            lines=2
        )

    # NOTE(review): the original trailing emoji was lost to an encoding error
    # (it survived only as U+FFFD); the rocket is a best-guess replacement.
    # NOTE(review): the button is assumed to sit below the row rather than
    # inside it -- confirm against the intended layout.
    submit_query_button = gr.Button("Run Agent 🚀", variant="primary")

    with gr.Accordion("Agent's Final Answer & Step-by-Step Log", open=True):
        agent_final_answer_output_textbox = gr.Textbox(
            label="Agent's Final Answer",
            lines=3,
            interactive=False,
            placeholder="Agent's final conclusion will appear here..."
        )
        agent_log_output_textbox = gr.Textbox(
            label="Agent's Step-by-Step Log (Simulated LLM Thoughts & Tool Use)",
            lines=15,
            interactive=False,
            placeholder="Detailed agent activity log..."
        )

    with gr.Accordion("In-Memory Knowledge Base Contents", open=True):
        knowledge_base_output_json = gr.JSON(
            label="Current Knowledge Base (Papers stored in this session)"
        )
        # For a more tabular view, if KNOWLEDGE_BASE items are consistent dictionaries:
        # knowledge_base_output_df = gr.DataFrame(
        #     label="Current Knowledge Base (Table View)",
        #     headers=["ID", "Title", "URL", "Abstract Snippet", "Scraped At"],  # Adjust headers as needed
        #     # You'd need to transform KNOWLEDGE_BASE into a list of lists for gr.DataFrame
        # )

    # Connect the button click to the processing function; the outputs are
    # listed in the same order as the tuple returned by process_gradio_query.
    submit_query_button.click(
        fn=process_gradio_query,
        inputs=[query_input_textbox],
        outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox]
    )

    gr.Examples(
        examples=[
            ["Find papers on 'reinforcement learning for robotics' and add the first one to the knowledge base."],
            ["Search arxiv for 'quantum machine learning' and process the top result."],
            ["Find papers on 'explainable AI in healthcare' and add the first one to the knowledge base."],
            ["Find papers on 'graph neural networks for drug discovery' and add the top one to the knowledge base."],
        ],
        inputs=[query_input_textbox],
        # Optional: Define outputs and function for examples if they should pre-fill or behave differently
        # outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox],
        # fn=process_gradio_query
    )

    gr.Markdown(
        """
        ---
        *Powered by a Mock LLM & Gradio. For a real application, replace `mock_llm` with an actual LLM integration.*
        *PDF scraping uses PyMuPDF. arXiv interaction uses the `arxiv` library.*
        *Knowledge Base is in-memory and resets if the Gradio app restarts.*
        """
    )
469
 
 
470
if __name__ == "__main__":
    # Running this Gradio app locally:
    #   1. Install the dependencies:
    #        pip install gradio arxiv PyMuPDF requests
    #   2. Save this code as a Python file (e.g., app.py).
    #   3. Run it from a terminal:
    #        python app.py
    #      Gradio starts a local web server and prints a URL (usually
    #      http://127.0.0.1:7860) to open in a web browser.
    #
    # Deploying on Hugging Face Spaces:
    #   - Name this file `app.py`.
    #   - Put a `requirements.txt` next to it that lists:
    #        gradio
    #        arxiv
    #        PyMuPDF
    #        requests
    #   - Create a new Space, select "Gradio" as the SDK, and upload both files.
    demo.launch()  # debug=True can be helpful for local development
489
+
requirements.txt CHANGED
@@ -1,10 +1,5 @@
1
  huggingface_hub==0.25.2
2
  gradio
3
- transformers
4
- sentence-transformers
5
- faiss-cpu
6
- torch
7
  arxiv
8
- pypdf2
9
- langchain
10
- langchain-community
 
1
  huggingface_hub==0.25.2
2
  gradio
 
 
 
 
3
  arxiv
4
+ PyMuPDF
5
+ requests