Spaces:
Sleeping
Sleeping
rombak total
Browse files- app.py +454 -294
- requirements.txt +2 -7
app.py
CHANGED
@@ -1,329 +1,489 @@
|
|
1 |
-
# app.py
|
2 |
import gradio as gr
|
3 |
-
import os
|
4 |
import re
|
5 |
-
import
|
6 |
-
import
|
7 |
-
import
|
8 |
-
|
9 |
-
#
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
LLM_MODEL_NAME = "google/flan-t5-small"
|
28 |
-
|
29 |
-
# Ensure KB storage directory exists
|
30 |
-
os.makedirs(KB_STORAGE_DIR, exist_ok=True)
|
31 |
-
|
32 |
-
# --- Helper Functions for arXiv and PDF Processing ---
|
33 |
-
|
34 |
-
def clean_text(text: str) -> str:
|
35 |
-
"""Basic text cleaning: replaces multiple spaces/newlines with single space and strips whitespace."""
|
36 |
-
text = re.sub(r'\s+', ' ', text)
|
37 |
-
text = text.strip()
|
38 |
-
return text
|
39 |
-
|
40 |
-
def get_arxiv_papers(query: str, max_papers: int = 5) -> list[str]:
|
41 |
"""
|
42 |
-
|
43 |
-
|
|
|
|
|
44 |
"""
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
try:
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
print(f"Downloading paper {i+1}/{max_papers}: {result.title}")
|
65 |
-
result.download_pdf(filename=filename)
|
66 |
-
downloaded_files.append(filename)
|
67 |
except Exception as e:
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
if
|
88 |
-
|
89 |
-
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
|
90 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
|
91 |
-
|
92 |
-
# Determine device for pipeline
|
93 |
-
device = 0 if torch.cuda.is_available() else -1
|
94 |
-
|
95 |
-
# Create a Hugging Face pipeline for text generation
|
96 |
-
text_generation_pipeline = pipeline(
|
97 |
-
"text2text-generation",
|
98 |
-
model=model,
|
99 |
-
tokenizer=tokenizer,
|
100 |
-
max_new_tokens=150, # Set a default max_new_tokens for the pipeline
|
101 |
-
min_length=20,
|
102 |
-
num_beams=5,
|
103 |
-
early_stopping=True,
|
104 |
-
device=device
|
105 |
-
)
|
106 |
-
self.llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
""
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
all_documents = []
|
136 |
-
for pdf_path in pdf_paths:
|
137 |
-
try:
|
138 |
-
loader = PyPDFLoader(pdf_path)
|
139 |
-
all_documents.extend(loader.load())
|
140 |
-
except Exception as e:
|
141 |
-
print(f"Error loading PDF {pdf_path}: {e}")
|
142 |
-
|
143 |
-
if not all_documents:
|
144 |
-
return "Could not load any documents from downloaded PDFs. Please try a different query or fewer papers."
|
145 |
-
|
146 |
-
print(f"Loaded {len(all_documents)} raw documents from PDFs.")
|
147 |
-
|
148 |
-
# Split documents into chunks using RecursiveCharacterTextSplitter
|
149 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
150 |
-
chunk_size=CHUNK_SIZE,
|
151 |
-
chunk_overlap=CHUNK_OVERLAP,
|
152 |
-
length_function=len,
|
153 |
-
is_separator_regex=False,
|
154 |
-
)
|
155 |
-
self.knowledge_base_chunks = text_splitter.split_documents(all_documents)
|
156 |
-
|
157 |
-
if not self.knowledge_base_chunks:
|
158 |
-
return "No meaningful text chunks could be created from the papers after splitting."
|
159 |
-
|
160 |
-
print(f"Total chunks created: {len(self.knowledge_base_chunks)}")
|
161 |
-
|
162 |
-
# Create FAISS vectorstore from chunks and embeddings
|
163 |
-
print("Creating FAISS vectorstore from chunks...")
|
164 |
-
self.vectorstore = FAISS.from_documents(self.knowledge_base_chunks, self.embedding_model)
|
165 |
-
print(f"FAISS vectorstore created with {len(self.knowledge_base_chunks)} documents.")
|
166 |
-
|
167 |
-
# Create RetrievalQA chain
|
168 |
-
self.qa_chain = RetrievalQA.from_chain_type(
|
169 |
-
llm=self.llm,
|
170 |
-
chain_type="stuff", # "stuff" puts all retrieved docs into one prompt
|
171 |
-
retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}), # Retrieve top 3 docs
|
172 |
-
return_source_documents=False # Set to True if you want to return source docs
|
173 |
-
)
|
174 |
-
|
175 |
-
return f"Knowledge base loaded with {len(self.knowledge_base_chunks)} chunks from {len(pdf_paths)} arXiv papers on '{arxiv_query}'."
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
if not self.vectorstore or not self.knowledge_base_chunks:
|
184 |
-
return "No knowledge base to save. Please load one first."
|
185 |
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
return f"Error saving knowledge base: {e}"
|
197 |
-
|
198 |
-
def load_knowledge_base(self) -> str:
|
199 |
-
"""Loads the FAISS vectorstore and knowledge base chunks from disk."""
|
200 |
-
self._load_models() # Ensure models are loaded before loading KB
|
201 |
|
202 |
-
|
203 |
-
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
-
|
|
|
|
|
240 |
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
|
|
248 |
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
|
251 |
-
#
|
|
|
|
|
|
|
252 |
|
253 |
-
#
|
254 |
-
rag_agent_instance = RAGAgent()
|
255 |
|
256 |
-
|
|
|
|
|
|
|
|
|
257 |
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
-
with
|
263 |
-
|
264 |
-
label="arXiv Search Query (e.g., 'Large Language Models', 'Reinforcement Learning')",
|
265 |
-
placeholder="Enter a topic to search for papers on arXiv...",
|
266 |
-
lines=1
|
267 |
-
)
|
268 |
-
max_papers_slider = gr.Slider(
|
269 |
-
minimum=1,
|
270 |
-
maximum=10,
|
271 |
-
step=1,
|
272 |
-
value=3,
|
273 |
-
label="Max Papers to Download"
|
274 |
-
)
|
275 |
-
load_kb_from_arxiv_button = gr.Button("Load KB from arXiv")
|
276 |
|
277 |
-
|
|
|
|
|
|
|
278 |
|
279 |
-
|
280 |
-
save_kb_button = gr.Button("Save Knowledge Base to Disk")
|
281 |
-
load_kb_from_disk_button = gr.Button("Load Knowledge Base from Disk")
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
with gr.Row():
|
284 |
-
|
285 |
-
|
286 |
-
placeholder="
|
287 |
-
|
288 |
)
|
289 |
-
answer_output = gr.Textbox(label="Answer", lines=7, interactive=False)
|
290 |
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
fn=rag_agent_instance.load_knowledge_base,
|
307 |
-
inputs=[],
|
308 |
-
outputs=kb_status_output
|
309 |
-
)
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
)
|
316 |
-
|
317 |
gr.Examples(
|
318 |
examples=[
|
319 |
-
["
|
320 |
-
["
|
321 |
-
["
|
|
|
322 |
],
|
323 |
-
inputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
)
|
325 |
|
326 |
-
# Launch the Gradio app
|
327 |
if __name__ == "__main__":
|
328 |
-
|
329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import re
|
3 |
+
import os # Used for environment variables if you switch to a real LLM
|
4 |
+
import datetime # For timestamping entries in the knowledge base
|
5 |
+
import arxiv # Python library for interacting with the arXiv API
|
6 |
+
import requests # For making HTTP requests to download PDF files
|
7 |
+
import fitz # PyMuPDF library for extracting text from PDF documents
|
8 |
+
|
9 |
+
# --- Agent Core Logic ---
|
10 |
+
|
11 |
+
# CURRENT_PAPER_CONTEXT: module-level dict that mock_llm uses to remember which
# paper it is working on between consecutive calls within one agent run
# (mock_llm stores keys such as 'query', 'id', 'title', 'url' and 'status' in it).
# In a real agent with a proper LLM, state management would be more sophisticated,
# possibly integrated into the agent's memory or passed explicitly.
# ReActAgent.run() rebinds it to an empty dict at the start of every user query.
CURRENT_PAPER_CONTEXT = {}

# KNOWLEDGE_BASE: in-memory list of paper dicts appended by
# knowledge_base_storage_func. Nothing here writes it to disk, so its contents
# last only as long as the Python process; use a database or file storage if
# persistence is needed.
KNOWLEDGE_BASE = []
|
22 |
+
|
23 |
+
|
24 |
+
def mock_llm(prompt: str, tools_description: str) -> str:
    """
    A mock Large Language Model (LLM) for the arXiv scraping agent.

    The function simulates a ReAct-style LLM with keyword rules plus the global
    CURRENT_PAPER_CONTEXT. It inspects the *most recent* observation in the
    prompt history and the tool that produced it, then emits the next step:

    * no observation yet      -> search (if the prompt asks to "find papers on"
                                 / "search arxiv for" something), else a fallback answer
    * ArxivSearchTool         -> scrape the top result
    * PaperScraperTool        -> store the scraped data
    * KnowledgeBaseStorageTool-> final answer

    Args:
        prompt: The full prompt history accumulated by the agent.
        tools_description: Human-readable tool list (only echoed to the log).

    Returns:
        A ReAct-formatted string containing "Thought:" followed by either
        "Action:" / "Action Input:" or "Final Answer:".
    """
    global CURRENT_PAPER_CONTEXT
    # Print the prompt for debugging (visible in Hugging Face Space logs or local console)
    print(f"\n----- Mock LLM Input -----\nPrompt:\n{prompt}\nTools Available:\n{tools_description}\n--------------------------\n")

    # Decide on the latest observation only: the prompt is a growing history, so
    # matching anywhere in it would keep re-triggering the earliest scenario.
    last_tool, observation = _latest_observation(prompt)

    # Scenario 1: Observation received from ArxivSearchTool
    if last_tool == "arxivsearchtool":
        # IDs produced by the search tool carry a version suffix (e.g. 1703.03400v3),
        # so the pattern must accept an optional "vN". Matching is done on the
        # original text so the stored title keeps its capitalisation.
        match = re.search(
            r"top result:\s*'(.*?)'\s*\(id:\s*([\d.]+(?:v\d+)?),\s*url:\s*(https?://[^\s)]+)\)",
            observation,
            re.IGNORECASE | re.DOTALL,
        )
        if match:
            title, paper_id, url = match.groups()
            title = " ".join(title.split())  # collapse line breaks inside long titles
            # Update context: we've found a paper to process
            CURRENT_PAPER_CONTEXT = {'id': paper_id, 'title': title, 'url': url, 'status': 'found_paper'}
            # LLM decides the next action is to scrape this paper
            return (
                f"Thought: I have found a paper titled '{title}' with ID {paper_id}. I should now scrape its content to extract information using the PaperScraperTool.\n"
                "Action: PaperScraperTool\n"
                f"Action Input: {url}"  # Use the arXiv page URL as input for the scraper
            )
        # If parsing fails, update context and conclude
        CURRENT_PAPER_CONTEXT = {'status': 'search_failed_to_parse'}
        return (
            "Thought: I received search results from ArxivSearchTool, but I couldn't parse the top paper details from the observation. I cannot proceed with scraping.\n"
            "Final Answer: I found some papers but had trouble extracting specific details for scraping. Please check the raw search results if they were logged, or try a different query."
        )

    # Scenario 2: Observation received from PaperScraperTool
    if last_tool == "paperscrapertool":
        if CURRENT_PAPER_CONTEXT.get('status') != 'found_paper':  # We were not expecting scraped content
            return (
                "Thought: I received scraped content, but I don't have the correct prior context (e.g., which paper was being scraped). This is unexpected.\n"
                "Final Answer: Error processing scraped content due to missing or incorrect context. The scraping might have occurred without a preceding successful search and paper identification."
            )
        # Simulate extracting abstract and snippet from the observation.
        # A real LLM would parse this more intelligently from the tool's output string.
        abstract_match = re.search(r"abstract:\s*(.*?)(full text snippet:|$)", observation, re.IGNORECASE | re.DOTALL)
        text_snippet_match = re.search(r"full text snippet:\s*(.*)", observation, re.IGNORECASE | re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else "Could not extract abstract from observation."
        text_snippet = text_snippet_match.group(1).strip() if text_snippet_match else "Could not extract text snippet from observation."

        # Prepare data for the knowledge base
        paper_data_for_kb = {
            "id": CURRENT_PAPER_CONTEXT.get('id', 'unknown_id'),
            "title": CURRENT_PAPER_CONTEXT.get('title', 'Unknown Title'),
            "url": CURRENT_PAPER_CONTEXT.get('url', 'unknown_url'),
            "abstract": abstract,
            "text_snippet": text_snippet,  # In a real case, this might be more structured or the full text
            "scraped_at": datetime.datetime.now().isoformat(),
        }
        CURRENT_PAPER_CONTEXT['status'] = 'scraped_paper'  # Update context
        # LLM decides the next action is to store this data.
        # The dict is passed as its repr (mock LLM limitation); repr keeps it on one line.
        return (
            f"Thought: I have the scraped content for '{CURRENT_PAPER_CONTEXT.get('title')}'. I should now store this information in the knowledge base using the KnowledgeBaseStorageTool.\n"
            "Action: KnowledgeBaseStorageTool\n"
            f"Action Input: {str(paper_data_for_kb)}"
        )

    # Scenario 3: Observation received from KnowledgeBaseStorageTool
    if last_tool == "knowledgebasestoragetool":
        expected = CURRENT_PAPER_CONTEXT.get('status') == 'scraped_paper'
        paper_title = CURRENT_PAPER_CONTEXT.get('title', 'the paper')
        CURRENT_PAPER_CONTEXT = {}  # Reset context: the task ends here either way
        if not expected:
            return (
                "Thought: I received a storage confirmation, but the context was unclear or didn't match the expected 'scraped_paper' status.\n"
                "Final Answer: A storage action was observed, but there might have been issues in the preceding steps. The overall process integrity is uncertain."
            )
        if not observation.lower().startswith("successfully stored"):
            # The storage tool refused the entry (duplicate, parse error, ...):
            # report that instead of claiming success.
            return (
                f"Thought: The KnowledgeBaseStorageTool did not confirm that '{paper_title}' was stored.\n"
                f"Final Answer: The paper '{paper_title}' was not added to the knowledge base. Storage tool reported: {observation}"
            )
        # LLM concludes the process
        return (
            f"Thought: The paper '{paper_title}' has been successfully processed (found, scraped, and stored) in the knowledge base. The task is complete.\n"
            f"Final Answer: Successfully found, scraped, and stored information for '{paper_title}'."
        )

    # Scenario 4: Initial query processing (likely a search request).
    # Only spaces/tabs may separate the trigger phrase from the query, so an
    # empty query cannot swallow the next line of the prompt.
    trigger = re.search(r"(?:find papers on|search arxiv for)[ \t]*(.*)", prompt, re.IGNORECASE)
    if trigger:
        search_query = trigger.group(1).strip() or "default search: quantum computing"
        CURRENT_PAPER_CONTEXT = {'query': search_query, 'status': 'searching'}  # Set initial context
        # LLM decides to use the search tool
        return (
            f"Thought: The user wants to find papers about '{search_query}'. I should use the ArxivSearchTool to find relevant papers.\n"
            "Action: ArxivSearchTool\n"
            f"Action Input: {search_query}"
        )

    # Fallback Scenario: Query not understood by the mock LLM's simple logic
    CURRENT_PAPER_CONTEXT = {}  # Reset context
    original_query = prompt.split("User query:", 1)[-1].split("\n", 1)[0].strip() if "User query:" in prompt else "the user's query"
    return (
        f"Thought: I'm not sure how to handle this query: '{original_query}'. My current mocked abilities are limited to searching arXiv based on keywords like 'find papers on' or 'search arxiv for', then scraping and storing the first result.\n"
        "Final Answer: I can only search arXiv for papers and process them if the query starts with 'find papers on' or 'search arxiv for'. Please rephrase your query (e.g., 'find papers on artificial intelligence')."
    )


def _latest_observation(prompt: str) -> tuple:
    """Return (lower-cased tool name or None, text) for the last observation in *prompt*.

    Two label formats are understood: "Observation from last action (Tool): ..."
    and the plain "Observation: ..." written by ReActAgent.run, in which case the
    tool is taken from the nearest preceding "Action: Tool" line.
    """
    labels = list(re.finditer(
        r"^[ \t]*Observation(?: from last action \(([^)\n]+)\))?:",
        prompt,
        re.IGNORECASE | re.MULTILINE,
    ))
    if not labels:
        return None, ""
    last = labels[-1]
    tool = last.group(1)
    if tool is None:
        actions = re.findall(
            r"^[ \t]*Action:[ \t]*(\w+)",
            prompt[:last.start()],
            re.IGNORECASE | re.MULTILINE,
        )
        tool = actions[-1] if actions else None
    return (tool.strip().lower() if tool else None), prompt[last.end():].strip()
|
111 |
+
|
112 |
+
class Tool:
    """Pairs a callable with the name and description the agent uses to pick it."""

    def __init__(self, name: str, description: str, func):
        # name: key the agent looks the tool up by.
        # description: text shown to the LLM so it can choose a tool.
        # func: callable taking the action input and returning the observation.
        self.name, self.description, self.func = name, description, func

    def run(self, action_input: str) -> str:
        """Call the wrapped function; on any exception return the error text instead of raising."""
        print(f"TOOL EXECUTING: {self.name} with input: '{action_input}'")
        try:
            outcome = self.func(action_input)
            print(f"TOOL RESULT ({self.name}): {outcome}")
            return outcome
        except Exception as exc:
            # The failure is handed back as the observation so the agent loop keeps running.
            failure_text = f"Error executing tool {self.name}: {str(exc)}"
            print(failure_text)
            return failure_text
|
130 |
+
|
131 |
+
def arxiv_search_func(query: str, max_results=1) -> str:
    """Tool function: query arXiv and describe the top hit in one line.

    Returns a sentence of the form "Found N papers. Top result: '<title>'
    (ID: <id>, URL: <entry url>)", a "No papers found" message, or an
    "Error searching arXiv" message if anything raises.
    """
    try:
        lookup = arxiv.Search(
            query=query,
            max_results=max_results,  # Limiting to 1 for faster demo and simpler mock LLM logic
            sort_by=arxiv.SortCriterion.Relevance,
        )
        # arxiv.Client().results(lookup) is an alternative way to iterate.
        papers = [
            {
                # Last path segment of the entry URL; taken as-is, so any
                # version suffix present in entry_id (e.g. "v5") is kept.
                "id": hit.entry_id.split('/')[-1],
                "title": hit.title,
                "authors": [person.name for person in hit.authors],
                "summary": hit.summary,
                "published": hit.published.isoformat(),
                "pdf_url": hit.pdf_url,  # Direct PDF link
                "arxiv_url": hit.entry_id,  # Link to the abstract page
            }
            for hit in lookup.results()
        ]

        if not papers:
            return f"No papers found on arXiv for query: '{query}'."

        # For the mock LLM, provide a clear summary of the top result
        best = papers[0]
        headline = f"Found {len(papers)} papers. "
        detail = f"Top result: '{best['title']}' (ID: {best['id']}, URL: {best['arxiv_url']})"
        return headline + detail

    except Exception as e:
        return f"Error searching arXiv: {str(e)}"
|
161 |
+
|
162 |
+
def paper_scraper_func(pdf_url_or_id: str) -> str:
    """Tool function: Downloads an arXiv PDF and extracts its text content.

    Args:
        pdf_url_or_id: An arXiv abstract URL (".../abs/<id>"), a PDF URL
            (".../pdf/<id>.pdf"), or a bare new-style ID such as "1703.03400"
            or "1703.03400v5". Surrounding whitespace is ignored.

    Returns:
        "Scraped content for '<title>' (ID: <id>). Abstract: ... Full text
        snippet: ..." on success, otherwise an "Error downloading PDF ..." or
        "Error scraping paper ..." message. Title and ID come from
        CURRENT_PAPER_CONTEXT when it holds them.
    """
    global CURRENT_PAPER_CONTEXT  # Read-only here: context set by LLM/previous steps
    try:
        source = pdf_url_or_id.strip()

        # Determine the direct PDF URL from various input formats
        if "arxiv.org/abs/" in source:  # e.g., http://arxiv.org/abs/1703.03400
            paper_id_match = re.search(r'abs/([\d\.]+)', source)
            if not paper_id_match:
                raise ValueError("Could not extract paper ID from abs URL.")
            pdf_url = f"https://arxiv.org/pdf/{paper_id_match.group(1)}.pdf"
        elif "arxiv.org/pdf/" in source:  # e.g., http://arxiv.org/pdf/1703.03400.pdf
            pdf_url = source
        elif re.match(r'^[\d\.]+(v\d+)?$', source):  # e.g., 1703.03400 or 1703.03400v5
            pdf_url = f"https://arxiv.org/pdf/{source}.pdf"
        else:
            raise ValueError(f"Invalid input format for PaperScraperTool: '{pdf_url_or_id}'. Expected arXiv URL or ID.")

        print(f"Attempting to download PDF from: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)  # Timeout for network robustness
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        paper_title = CURRENT_PAPER_CONTEXT.get('title', f"paper from {pdf_url}")
        paper_id_context = CURRENT_PAPER_CONTEXT.get('id', 'unknown_id_from_context')

        # Use fitz (PyMuPDF) to open PDF from downloaded bytes; extract each
        # page's text exactly once and join at the end.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            page_texts = [page.get_text("text") for page in doc]
        full_text = "".join(page_texts)

        abstract = "Could not reliably extract abstract from PDF text."
        if page_texts:
            # Heuristic: the abstract runs from the word "Abstract" to the next
            # section heading or a run of blank lines. Terminators must start a
            # line; otherwise any decimal ("0.1.") or any word ending in "i."
            # would cut the abstract short.
            abstract_match = re.search(
                r"Abstract\s*([\s\S]*?)"
                r"(?:\n\s*(?:(?:1|I)\.?\s*)?Introduction\b"
                r"|\n\s*Keywords\b"
                r"|\n\s*Index Terms\b"
                r"|\n\s*\n\s*\n)",
                page_texts[0],
                re.IGNORECASE,
            )
            if abstract_match:
                candidate = abstract_match.group(1).strip().replace('\n', ' ')
                if candidate:
                    abstract = candidate

        # Provide a snippet for the observation (full text can be very long)
        text_snippet = (full_text[:500] + "...") if len(full_text) > 500 else full_text

        return (f"Scraped content for '{paper_title}' (ID: {paper_id_context}). "
                f"Abstract: {abstract} Full text snippet: {text_snippet}")

    except requests.exceptions.RequestException as e:
        return f"Error downloading PDF from '{pdf_url_or_id}': {str(e)}"
    except Exception as e:
        return f"Error scraping paper '{pdf_url_or_id}': {str(e)}"
|
210 |
+
|
211 |
+
def knowledge_base_storage_func(paper_data_str: str) -> str:
    """Tool function: Stores the extracted paper information into the KNOWLEDGE_BASE.

    Args:
        paper_data_str: String form of a Python dict literal (as produced by
            ``str(dict)``) with at least the keys "id", "title", "url" and
            "abstract". A dict passed directly is accepted too.

    Returns:
        A confirmation message on success, or a message describing why the
        entry was not stored (unparsable input, missing keys, duplicate ID,
        unexpected error). The function never raises.
    """
    global KNOWLEDGE_BASE, CURRENT_PAPER_CONTEXT
    import ast  # local import: only this function needs it

    try:
        if isinstance(paper_data_str, str):
            # The input is LLM/tool-generated text, so it is parsed with
            # ast.literal_eval, which only accepts literals and cannot execute
            # code (unlike eval). A real LLM should be prompted to return JSON
            # and json.loads() used instead.
            try:
                paper_data = ast.literal_eval(paper_data_str)  # Convert string to dict
                if not isinstance(paper_data, dict):
                    raise ValueError("Parsed data from string is not a dictionary.")
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                return f"Error parsing paper data string for storage: {str(e)}. Input data string was: '{paper_data_str}'"
        elif isinstance(paper_data_str, dict):  # If a dict is somehow passed directly
            paper_data = paper_data_str
        else:
            return f"Invalid data type received for storage: {type(paper_data_str)}. Expected string (evaluable to dict) or dict."

        # Validate essential keys
        required_keys = ["id", "title", "url", "abstract"]
        missing_keys = [key for key in required_keys if key not in paper_data]
        if missing_keys:
            return f"Error: Missing required keys for storage: {missing_keys}. Received data: {paper_data}"

        # Avoid adding duplicate papers by ID
        if any(p["id"] == paper_data["id"] for p in KNOWLEDGE_BASE):
            return f"Paper with ID '{paper_data['id']}' is already in the knowledge base. Not adding again."

        KNOWLEDGE_BASE.append(paper_data)
        return (f"Successfully stored paper '{paper_data.get('id', 'N/A')}' (Title: '{paper_data.get('title', 'N/A')}') in the knowledge base. "
                f"Knowledge base now contains {len(KNOWLEDGE_BASE)} papers.")
    except Exception as e:
        # If storage fails critically, reset context to prevent loops with bad data
        CURRENT_PAPER_CONTEXT = {}
        return f"Critical error storing paper in knowledge base: {str(e)}. Input was: '{paper_data_str}'"
|
248 |
+
|
249 |
+
# Tools available to the agent. ReActAgent indexes them by `name`, so each name
# must match the "Action:" value the LLM emits exactly; the descriptions are
# joined into the text passed to the LLM as the tool list.
tools_list = [
    # Step 1: find a paper for the user's topic.
    Tool(
        name="ArxivSearchTool",
        description="Searches the arXiv repository for research papers based on a query. Input should be the search query (e.g., 'machine learning for climate change'). Returns a summary of search results, highlighting the top paper found.",
        func=arxiv_search_func
    ),
    # Step 2: download the paper's PDF and pull out abstract + text snippet.
    Tool(
        name="PaperScraperTool",
        description="Downloads an arXiv paper PDF given its arXiv abstract page URL (e.g., 'http://arxiv.org/abs/1234.5678') or just its ID (e.g., '1234.5678') and extracts its text content, including the abstract and a snippet of the full text.",
        func=paper_scraper_func
    ),
    # Step 3: append the extracted details to KNOWLEDGE_BASE.
    Tool(
        name="KnowledgeBaseStorageTool",
        description="Stores extracted information about a paper (such as its ID, title, URL, abstract, and text snippet) into the system's knowledge base. Input should be a string representation of a Python dictionary containing these paper details.",
        func=knowledge_base_storage_func
    )
]
|
267 |
|
268 |
+
def get_tools_description_for_prompt(tool_list_arg):
    """Build the tool list shown to the LLM: one "- name: description" line per tool."""
    bullet_lines = []
    for entry in tool_list_arg:
        bullet_lines.append(f"- {entry.name}: {entry.description}")
    return "\n".join(bullet_lines)
|
271 |
|
272 |
+
def parse_llm_react_output(llm_response: str):
    """
    Parses the LLM's ReAct-formatted response to extract Thought, Action, Action Input, or Final Answer.

    Each field stops where the next ReAct label starts on a new line, so the
    thought no longer contains the "Action:" / "Final Answer:" lines that follow
    it, and the action input no longer contains a trailing "Observation:".

    Args:
        llm_response: Raw text returned by the LLM.

    Returns:
        A tuple ``(thought, action, action_input, final_answer)``. ``thought``,
        ``action`` and ``final_answer`` are ``None`` when their label is absent;
        ``action_input`` is ``""`` when absent.
    """
    flags = re.IGNORECASE | re.DOTALL
    # Lazy capture + lookahead: take text up to the next label that begins a line
    # (or to the end of the response if there is none).
    thought_match = re.search(
        r"Thought:(.*?)(?=\n[ \t]*(?:Action Input|Action|Final Answer|Observation):|\Z)",
        llm_response,
        flags,
    )
    action_match = re.search(r"Action:\s*([\w_]+)", llm_response, flags)  # Tool names are usually alphanumeric with underscores
    action_input_match = re.search(
        r"Action Input:(.*?)(?=\n[ \t]*(?:Observation|Thought|Final Answer):|\Z)",
        llm_response,
        flags,
    )
    final_answer_match = re.search(r"Final Answer:\s*(.*)", llm_response, flags)

    thought = thought_match.group(1).strip() if thought_match else None
    action = action_match.group(1).strip() if action_match else None
    # Default to empty string if there is no "Action Input:" line
    action_input_str = action_input_match.group(1).strip() if action_input_match else ""
    final_answer = final_answer_match.group(1).strip() if final_answer_match else None
    return thought, action, action_input_str, final_answer
|
292 |
+
|
293 |
+
class ReActAgent:
    """Small ReAct loop: ask the LLM for a step, run the tool it names, feed the result back."""

    def __init__(self, llm_function, tool_list_arg, max_iterations=7):
        """Store the LLM callable, index the tools by name and set the loop limit."""
        self.llm_function = llm_function
        # Name -> tool lookup used when the LLM names an action.
        self.tools = {}
        for registered in tool_list_arg:
            self.tools[registered.name] = registered
        self.tools_description = get_tools_description_for_prompt(tool_list_arg)
        self.max_iterations = max_iterations
        self.agent_log = []  # Step-by-step log shown in the Gradio UI

    def run(self, user_query: str):
        """Run the loop for one query and return ``(answer_text, full_log_text)``.

        The loop ends when the LLM gives a final answer, gives neither an action
        nor a final answer, or ``max_iterations`` is reached.
        """
        global CURRENT_PAPER_CONTEXT
        CURRENT_PAPER_CONTEXT = {}  # Every query starts from a clean context
        self.agent_log = [f"User Query: {user_query}\n"]

        # Opening prompt: the query followed by the fixed ReAct instructions.
        prompt_history = "".join([
            f"User query: {user_query}\n",
            "You are an AI assistant that processes arXiv papers. You must use the ReAct format: Thought, Action, Action Input, Observation, and finally Final Answer.\n",
            "Based on the user query, decide on a thought, then an action to take using one of the available tools.\n",
            "After an action, you will receive an observation. Reason about the observation to decide the next step.\n",
            "If you have enough information from an observation to answer the user query, or if a multi-step task (like search, scrape, store) is complete, respond with 'Final Answer:'.\n",
            "When using PaperScraperTool, the input is the arXiv URL or ID. When using KnowledgeBaseStorageTool, the input is a string representation of a dictionary with paper details.\n",
        ])

        for step in range(1, self.max_iterations + 1):
            entry = [f"\n--- Iteration {step} ---\n"]

            # Ask the LLM for the next step given everything so far.
            raw_reply = self.llm_function(prompt_history, self.tools_description)
            entry.append(f"LLM Raw Response (Mocked):\n{raw_reply}\n")  # Clearly label as mocked

            thought, action_name, action_input, final_answer = parse_llm_react_output(raw_reply)

            if not thought:
                entry.append("Warning: No thought found in LLM response for this iteration.\n")
            else:
                entry.append(f"Thought: {thought}\n")
                prompt_history += f"Thought: {thought}\n"  # Carried into the next LLM call

            # A final answer ends the run immediately.
            if final_answer:
                entry.append(f"\nFinal Answer from Agent: {final_answer}\n")
                self.agent_log.append("".join(entry))
                CURRENT_PAPER_CONTEXT = {}
                return final_answer, "\n".join(self.agent_log)

            # Neither an answer nor an action: nothing further can be done.
            if not action_name:
                entry.append("LLM did not specify an action or a final answer. The agent might be stuck or the task is implicitly complete based on LLM's internal state (which is hard for a mock to determine).\n")
                self.agent_log.append("".join(entry))
                CURRENT_PAPER_CONTEXT = {}
                last_thought_or_obs = thought if thought else "No clear thought before stopping."
                return f"Agent concluded: No further action or final answer provided by LLM. Last thought: {last_thought_or_obs}", "\n".join(self.agent_log)

            entry.append(f"Action: {action_name}\nAction Input: '{action_input}'\n")
            prompt_history += f"Action: {action_name}\nAction Input: {action_input}\n"

            if action_name in self.tools:
                observation = self.tools[action_name].run(action_input)
                entry.append(f"Observation: {observation}\n")
            else:
                # Unknown tool: the error is fed back to the LLM as the observation.
                observation = f"Error: Tool '{action_name}' not found. Please choose from the available tools."
                entry.append(f"{observation}\n")
            prompt_history += f"Observation: {observation}\n"

            self.agent_log.append("".join(entry))

        # Loop exhausted without a final answer.
        self.agent_log.append("\nMax iterations reached. Stopping.\n")
        CURRENT_PAPER_CONTEXT = {}
        return "Agent stopped: Maximum iterations reached without a final answer.", "\n".join(self.agent_log)
|
366 |
|
367 |
+
# --- Gradio App Definition ---
|
|
|
368 |
|
369 |
+
# Module-level agent instance, created once at import time and reused by
# process_gradio_query for every request.
# The agent's tools write to the module-level KNOWLEDGE_BASE list, so stored
# papers accumulate across queries for as long as the process runs.
# NOTE: with several users on one deployed app that list is shared between them,
# which might not be desired. Consider session state or a proper database there.
arxiv_agent_instance = ReActAgent(llm_function=mock_llm, tool_list_arg=tools_list)
|
374 |
|
375 |
+
def process_gradio_query(user_query_text: str):
    """Run the ReAct agent for one Gradio submission and shape the UI outputs.

    Args:
        user_query_text: Raw text taken from the query textbox. ``None``,
            empty, and whitespace-only values are treated as "no query".

    Returns:
        A 3-tuple ``(final_answer, kb_display_data, agent_log)`` in the order
        expected by the click handler's outputs: the answer text, the
        knowledge-base payload for the JSON component, and the agent log text.
    """

    def _kb_display_data():
        # gr.JSON decodes ``str`` values as JSON text, so a plain English
        # sentence would fail to parse. Return the knowledge base itself when
        # it has content, otherwise a small JSON-serializable placeholder.
        if KNOWLEDGE_BASE:
            return KNOWLEDGE_BASE
        return {"message": "Knowledge Base is currently empty."}

    # Guard clause: nothing to run for an empty / whitespace-only query.
    if not user_query_text or not user_query_text.strip():
        return (
            "Please enter a query.",
            _kb_display_data(),
            "No agent activity to log for an empty query.",
        )

    # Run the agent with the user's query.
    final_answer, agent_log_str = arxiv_agent_instance.run(user_query_text)

    # Build the knowledge-base payload only after the run, so it reflects
    # whatever KNOWLEDGE_BASE holds once the agent has finished.
    return final_answer, _kb_display_data(), agent_log_str
|
|
|
|
|
394 |
|
395 |
+
# Define the Gradio interface using gr.Blocks for more layout control
|
396 |
+
with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
|
397 |
+
gr.Markdown(
|
398 |
+
"""
|
399 |
+
# 📄🤖 arXiv Research Paper Agent (Demo)
|
400 |
+
This agent uses a **mocked LLM** to simulate searching arXiv, scraping paper content,
|
401 |
+
and adding it to an in-memory knowledge base.
|
402 |
+
Enter a query like: `Find papers on 'topic X' and add the first one to the knowledge base.`
|
403 |
+
The agent will show its thought process (as if from an LLM) and tool interactions.
|
404 |
+
"""
|
405 |
+
)
|
406 |
+
|
407 |
with gr.Row():
|
408 |
+
query_input_textbox = gr.Textbox(
|
409 |
+
label="Your Query for the arXiv Agent",
|
410 |
+
placeholder="e.g., Find papers on 'transformer models' and add the first one to the knowledge base.",
|
411 |
+
lines=2
|
412 |
)
|
|
|
413 |
|
414 |
+
# Primary action button; the label previously ended in a mis-encoded character.
submit_query_button = gr.Button("Run Agent", variant="primary")
|
415 |
+
|
416 |
+
with gr.Accordion("Agent's Final Answer & Step-by-Step Log", open=True):
|
417 |
+
agent_final_answer_output_textbox = gr.Textbox(
|
418 |
+
label="Agent's Final Answer",
|
419 |
+
lines=3,
|
420 |
+
interactive=False,
|
421 |
+
placeholder="Agent's final conclusion will appear here..."
|
422 |
+
)
|
423 |
+
agent_log_output_textbox = gr.Textbox(
|
424 |
+
label="Agent's Step-by-Step Log (Simulated LLM Thoughts & Tool Use)",
|
425 |
+
lines=15,
|
426 |
+
interactive=False,
|
427 |
+
placeholder="Detailed agent activity log..."
|
428 |
+
)
|
|
|
|
|
|
|
|
|
429 |
|
430 |
+
with gr.Accordion("In-Memory Knowledge Base Contents", open=True):
|
431 |
+
knowledge_base_output_json = gr.JSON(
|
432 |
+
label="Current Knowledge Base (Papers stored in this session)"
|
433 |
+
)
|
434 |
+
# For a more tabular view, if KNOWLEDGE_BASE items are consistent dictionaries:
|
435 |
+
# knowledge_base_output_df = gr.DataFrame(
|
436 |
+
# label="Current Knowledge Base (Table View)",
|
437 |
+
# headers=["ID", "Title", "URL", "Abstract Snippet", "Scraped At"], # Adjust headers as needed
|
438 |
+
# # You'd need to transform KNOWLEDGE_BASE into a list of lists for gr.DataFrame
|
439 |
+
# )
|
440 |
+
|
441 |
+
# Connect the button click to the processing function
|
442 |
+
submit_query_button.click(
|
443 |
+
fn=process_gradio_query,
|
444 |
+
inputs=[query_input_textbox],
|
445 |
+
outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox]
|
446 |
)
|
447 |
+
|
448 |
gr.Examples(
|
449 |
examples=[
|
450 |
+
["Find papers on 'reinforcement learning for robotics' and add the first one to the knowledge base."],
|
451 |
+
["Search arxiv for 'quantum machine learning' and process the top result."],
|
452 |
+
["Find papers on 'explainable AI in healthcare' and add the first one to the knowledge base."],
|
453 |
+
["Find papers on 'graph neural networks for drug discovery' and add the top one to the knowledge base."],
|
454 |
],
|
455 |
+
inputs=[query_input_textbox],
|
456 |
+
# Optional: Define outputs and function for examples if they should pre-fill or behave differently
|
457 |
+
# outputs=[agent_final_answer_output_textbox, knowledge_base_output_json, agent_log_output_textbox],
|
458 |
+
# fn=process_gradio_query
|
459 |
+
)
|
460 |
+
|
461 |
+
gr.Markdown(
|
462 |
+
"""
|
463 |
+
---
|
464 |
+
*Powered by a Mock LLM & Gradio. For a real application, replace `mock_llm` with an actual LLM integration.*
|
465 |
+
*PDF scraping uses PyMuPDF. arXiv interaction uses the `arxiv` library.*
|
466 |
+
*Knowledge Base is in-memory and resets if the Gradio app restarts.*
|
467 |
+
"""
|
468 |
)
|
469 |
|
|
|
470 |
if __name__ == "__main__":
    # Running locally:
    #   1. Install the dependencies:
    #        pip install gradio arxiv PyMuPDF requests
    #   2. Save this code as a Python file (e.g., app.py).
    #   3. Run it from a terminal:
    #        python app.py
    #      Gradio prints a local URL (usually http://127.0.0.1:7860) to open
    #      in a web browser.
    #
    # Deploying on Hugging Face Spaces:
    #   - Name this file `app.py`.
    #   - Add a `requirements.txt` in the same directory containing:
    #        gradio
    #        arxiv
    #        PyMuPDF
    #        requests
    #   - Create a new Space with "Gradio" as the SDK and upload both files.
    #
    # NOTE: a stray non-ASCII character that followed this call was removed;
    # it is not a valid Python token and prevented the module from parsing.
    demo.launch()  # debug=True can be helpful for local development
|
requirements.txt
CHANGED
@@ -1,10 +1,5 @@
|
|
1 |
huggingface_hub==0.25.2
|
2 |
gradio
|
3 |
-
transformers
|
4 |
-
sentence-transformers
|
5 |
-
faiss-cpu
|
6 |
-
torch
|
7 |
arxiv
|
8 |
-
|
9 |
-
|
10 |
-
langchain-community
|
|
|
1 |
huggingface_hub==0.25.2
|
2 |
gradio
|
|
|
|
|
|
|
|
|
3 |
arxiv
|
4 |
+
PyMuPDF
|
5 |
+
requests
|
|