Update app.py
#226 by lyndalynda · opened

app.py CHANGED
@@ -1,23 +1,382 @@
 import os
 import gradio as gr
 import requests
-import inspect
 import pandas as pd
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, tool
+import re
+import json
+import math
+import tempfile
+from pathlib import Path
+from urllib.parse import urlparse, parse_qs
+import yt_dlp
+from PIL import Image
+import pytesseract
 
-
-
+hf_token = os.getenv("HF_TOKEN")
+SPACE_ID = os.getenv("SPACE_ID")
+SPACE_HOST = os.getenv("SPACE_HOST")
+# --- Critical tools for GAIA ---
+@tool
+def web_browser(url: str) -> str:
+    """
+    Fetches content from a web URL.
+
+    Args:
+        url: The URL to fetch content from.
+
+    Returns:
+        Text content from the webpage.
+    """
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        # Simple text extraction (you might want to use BeautifulSoup for better parsing)
+        content = response.text
+        # Basic cleaning
+        content = re.sub(r'<[^>]+>', ' ', content)  # Remove HTML tags
+        content = re.sub(r'\s+', ' ', content).strip()  # Clean whitespace
+
+        return content[:2000] + "..." if len(content) > 2000 else content
+
+    except Exception as e:
+        return f"Error accessing URL: {str(e)}"
+
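Review note: as the inline comment concedes, regex tag-stripping is crude. A minimal sketch of the BeautifulSoup alternative it alludes to (assumes beautifulsoup4 is added to requirements.txt; the helper name is mine):

    from bs4 import BeautifulSoup

    def extract_page_text(html: str, limit: int = 2000) -> str:
        # Drop script/style blocks, then collapse remaining text to single spaces
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = " ".join(soup.get_text(separator=" ").split())
        return text[:limit] + "..." if len(text) > limit else text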
+@tool
+def youtube_transcript_extractor(url: str) -> str:
+    """
+    Extracts transcript or information from YouTube videos.
+
+    Args:
+        url: YouTube URL.
+
+    Returns:
+        Video information and transcript if available.
+    """
+    try:
+        # Extract the video ID from the URL
+        if "youtube.com/watch" in url:
+            video_id = parse_qs(urlparse(url).query).get('v', [None])[0]
+        elif "youtu.be/" in url:
+            video_id = urlparse(url).path[1:]
+        else:
+            return "Invalid YouTube URL format"
+
+        if not video_id:
+            return "Could not extract video ID from URL"
+
+        # Use yt-dlp to get video info
+        ydl_opts = {
+            'quiet': True,
+            'no_warnings': True,
+            'writesubtitles': True,
+            'writeautomaticsub': True,
+        }
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
+
+        result = f"Title: {info.get('title', 'N/A')}\n"
+        result += f"Description: {info.get('description', 'N/A')[:500]}...\n"
+        result += f"Duration: {info.get('duration', 'N/A')} seconds\n"
+        result += f"View count: {info.get('view_count', 'N/A')}\n"
+
+        # Try to get subtitles/transcript
+        if 'subtitles' in info and info['subtitles']:
+            result += "\n--- Transcript Available ---\n"
+            # Simplified approach - fetching the full transcript needs more logic
+
+        return result
+
+    except Exception as e:
+        return f"Error extracting YouTube content: {str(e)}"
+
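Review note: the tool only reports that subtitles exist. A rough sketch of actually pulling caption text from the tracks yt-dlp returns; the info-dict layout assumed here ('subtitles'/'automatic_captions' mapping language codes to lists of {'url', 'ext'} dicts) should be verified against the pinned yt-dlp version:

    import requests

    def fetch_first_subtitle(info: dict) -> str:
        subs = info.get('subtitles') or info.get('automatic_captions') or {}
        for lang, tracks in subs.items():
            for track in tracks:
                if track.get('url') and track.get('ext') in ('vtt', 'json3'):
                    resp = requests.get(track['url'], timeout=10)
                    if resp.ok:
                        return f"[{lang}/{track['ext']}] " + resp.text[:2000]
        return "No downloadable subtitle track found"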
+@tool
+def image_ocr_analyzer(image_path: str) -> str:
+    """
+    Performs OCR on images to extract text.
+
+    Args:
+        image_path: Path to the image file.
+
+    Returns:
+        Extracted text from the image.
+    """
+    try:
+        # Open image with PIL
+        image = Image.open(image_path)
+
+        # Perform OCR
+        extracted_text = pytesseract.image_to_string(image)
+
+        if not extracted_text.strip():
+            return "No text found in the image"
+
+        return f"Extracted text:\n{extracted_text.strip()}"
+
+    except Exception as e:
+        return f"Error performing OCR: {str(e)}"
+
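Review note: pytesseract is only a wrapper; the tesseract binary must exist on the Space, which (assuming the usual Gradio SDK setup) means an apt entry in packages.txt plus, optionally, a startup self-check:

    # packages.txt (apt dependencies installed by the Space):
    #   tesseract-ocr

    import shutil
    import pytesseract
    assert shutil.which("tesseract"), "tesseract binary not on PATH"
    print(pytesseract.get_tesseract_version())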
+@tool
+def pdf_text_extractor(file_path: str) -> str:
+    """
+    Extracts text from PDF files.
+
+    Args:
+        file_path: Path to the PDF file.
+
+    Returns:
+        Extracted text from PDF.
+    """
+    try:
+        import PyPDF2
+
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            text = ""
+
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
+                text += page.extract_text() + "\n"
+
+        return text[:3000] + "..." if len(text) > 3000 else text
+
+    except Exception as e:
+        return f"Error extracting PDF text: {str(e)}"
+
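Review note: PyPDF2 is unmaintained; its successor pypdf keeps an almost identical API. A drop-in sketch (assumes pypdf in requirements.txt):

    from pypdf import PdfReader

    def pdf_text(file_path: str, limit: int = 3000) -> str:
        reader = PdfReader(file_path)
        # extract_text() can return None for image-only pages
        text = "\n".join((page.extract_text() or "") for page in reader.pages)
        return text[:limit] + "..." if len(text) > limit else text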
+@tool
+def veterinary_document_analyzer(text: str) -> str:
+    """
+    Analyzes veterinary documents to extract specific information like names.
+
+    Args:
+        text: Document text to analyze.
+
+    Returns:
+        Extracted veterinary information.
+    """
+    try:
+        # Look for veterinarian names and surnames
+        vet_patterns = [
+            r"Dr\.?\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)",          # Dr. First Last
+            r"Doctor\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)",         # Doctor First Last
+            r"veterinarian\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)",   # veterinarian First Last
+            r"DVM\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)",            # DVM First Last
+        ]
+
+        found_vets = []
+        for pattern in vet_patterns:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            for match in matches:
+                full_name = f"{match[0]} {match[1]}"
+                if full_name not in found_vets:
+                    found_vets.append(full_name)
+
+        if found_vets:
+            return f"Found veterinarian(s): {', '.join(found_vets)}"
+        else:
+            return "No veterinarian names found in the document"
+
+    except Exception as e:
+        return f"Error analyzing veterinary document: {str(e)}"
+
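A quick behaviour check, assuming smolagents @tool objects remain directly callable (the module itself calls this one from smart_text_analyzer):

    sample = "Signed by Dr. Jane Smith, DVM, on 12 March 2021."
    print(veterinary_document_analyzer(sample))
    # -> Found veterinarian(s): Jane Smith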
+# --- Existing tools, improved ---
+@tool
+def analyze_excel_file(file_path: str, analysis_type: str = "general") -> str:
+    """
+    Analyzes Excel files with multiple analysis types.
+    """
+    try:
+        df = pd.read_excel(file_path)
+
+        if analysis_type == "general":
+            return f"Excel file contains {len(df)} rows and {len(df.columns)} columns. Columns: {list(df.columns)}"
+
+        elif analysis_type == "food_sales":
+            if 'category' in df.columns and 'price' in df.columns and 'quantity' in df.columns:
+                food_df = df[df['category'].str.lower() == 'food']
+                total_sales = (food_df['price'] * food_df['quantity']).sum()
+                return f"Total food sales: ${total_sales:.2f}"
+            else:
+                return "Required columns (category, price, quantity) not found"
+
+        elif analysis_type == "summary":
+            summary = df.describe(include='all').to_string()
+            return f"Data summary:\n{summary}"
+
+        elif analysis_type == "categories":
+            if 'category' in df.columns:
+                categories = df['category'].value_counts()
+                return f"Categories breakdown:\n{categories.to_string()}"
+            else:
+                return "No category column found"
+
+        return "Unknown analysis type"
+
+    except Exception as e:
+        return f"Error analyzing Excel file: {str(e)}"
+
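The food_sales branch hard-codes lowercase column names. A tiny local check (hypothetical sales.xlsx; writing it needs openpyxl):

    import pandas as pd

    pd.DataFrame({
        "category": ["Food", "Drinks", "food"],
        "price": [2.50, 1.00, 4.00],
        "quantity": [3, 10, 2],
    }).to_excel("sales.xlsx", index=False)

    print(analyze_excel_file("sales.xlsx", "food_sales"))
    # str.lower() == 'food' matches "Food" and "food": 2.50*3 + 4.00*2 -> Total food sales: $15.50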
+@tool
+def advanced_calculator(expression: str) -> str:
+    """
+    Evaluates mathematical expressions safely, including advanced functions.
+    """
+    try:
+        expression = expression.replace('^', '**')
+        allowed_functions = {
+            'abs': abs, 'round': round, 'min': min, 'max': max,
+            'sum': sum, 'len': len,
+            'sqrt': math.sqrt, 'pow': math.pow, 'log': math.log,
+            'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
+            'pi': math.pi, 'e': math.e,
+            'floor': math.floor, 'ceil': math.ceil
+        }
+        result = eval(expression, {"__builtins__": {}}, allowed_functions)
+        return str(result)
+
+    except Exception as e:
+        return f"Error in calculation: {str(e)}"
+
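"Safely" deserves a caveat: emptying __builtins__ blocks builtin names but not attribute access, so a string such as "().__class__" still reaches object internals. Usage, plus one possible pre-filter (a sketch, not a security guarantee):

    print(advanced_calculator("sqrt(3^2 + 4^2)"))  # -> 5.0

    import re
    def looks_like_math(expr: str) -> bool:
        # Reject underscores and quotes outright; allow digits, lowercase names, operators
        return re.fullmatch(r"[0-9a-z+\-*/%(),.^ ]+", expr) is not None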
+@tool
+def smart_text_analyzer(text: str, task_type: str = "general") -> str:
+    """
+    Analyzes text with focus on GAIA-specific tasks.
+
+    Args:
+        text: Text to analyze.
+        task_type: 'general', 'names', 'dates', 'numbers', 'veterinary'.
+
+    Returns:
+        Analysis results.
+    """
+    try:
+        if task_type == "names":
+            # Extract proper names
+            name_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
+            names = re.findall(name_pattern, text)
+            return f"Found names: {list(set(names))}"
+
+        elif task_type == "veterinary":
+            return veterinary_document_analyzer(text)
+
+        elif task_type == "dates":
+            date_patterns = [
+                r'\d{1,2}/\d{1,2}/\d{4}',      # MM/DD/YYYY
+                r'\d{4}-\d{2}-\d{2}',          # YYYY-MM-DD
+                r'\b\w+\s+\d{1,2},\s+\d{4}\b'  # Month DD, YYYY
+            ]
+            dates = []
+            for pattern in date_patterns:
+                dates.extend(re.findall(pattern, text))
+            return f"Found dates: {dates}"
+
+        elif task_type == "numbers":
+            numbers = re.findall(r'-?\d+\.?\d*', text)
+            return f"Found numbers: {[float(n) for n in numbers if n]}"
+
+        else:
+            return f"Characters: {len(text)}, Words: {len(text.split())}, Lines: {len(text.splitlines())}"
+
+    except Exception as e:
+        return f"Error in text analysis: {str(e)}"
+
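Behaviour worth knowing when reading results: the 'numbers' regex treats the hyphens in ISO dates as minus signs:

    print(smart_text_analyzer("Order 66 cost 12.5 on 2024-01-31", "numbers"))
    # -> Found numbers: [66.0, 12.5, 2024.0, -1.0, -31.0]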
+# --- Model configuration, tuned ---
+# Switch to a lighter model that stays within your quota
+model = HfApiModel(
+    max_tokens=2048,  # Reduced to conserve quota
+    temperature=0.1,
+    model_id='microsoft/DialoGPT-medium',  # Lighter model
+    # Or try 'HuggingFaceH4/zephyr-7b-beta' if available
+)
+
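Review note: microsoft/DialoGPT-medium is a GPT-2-era chit-chat model and is unlikely to follow CodeAgent's structured prompts; whether the Inference API still serves it is worth checking. One low-risk arrangement is to make the ID configurable instead of hard-coded (MODEL_ID is a hypothetical Space variable):

    model = HfApiModel(
        model_id=os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta"),
        max_tokens=2048,
        temperature=0.1,
    )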
+# --- Tool initialisation ---
+search_tool = DuckDuckGoSearchTool()
+
+# IMPORTANT: add ALL the tools to the list
+tools = [
+    search_tool,  # ⚠️ this one had been forgotten!
+    web_browser,
+    youtube_transcript_extractor,
+    image_ocr_analyzer,
+    pdf_text_extractor,
+    veterinary_document_analyzer,
+    smart_text_analyzer,
+    advanced_calculator,
+    analyze_excel_file,
+]
+
+# Agent with more steps for complex tasks
+agent_code = CodeAgent(
+    tools=tools,
+    model=model,
+    max_steps=15,  # Raised for complex GAIA tasks
+    additional_authorized_imports=[
+        "os", "tempfile", "pathlib", "re", "json", "math", "pandas",
+        "requests", "PIL", "pytesseract", "PyPDF2", "yt_dlp"
+    ]
+)
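A minimal smoke test for the wiring before a full GAIA run (run locally with HF_TOKEN set; the question is arbitrary):

    if __name__ == "__main__":
        print(agent_code.run("What is floor(pi * 100)?"))  # expect 314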
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
-        print("
+        print("Enhanced GAIA Agent initialized with web browsing capabilities.")
+        self.agent = agent_code
+
     def __call__(self, question: str) -> str:
-
-
-
-
+        try:
+            # Prompt tuned specifically for GAIA
+            enhanced_question = self._create_gaia_prompt(question)
+
+            result = self.agent.run(enhanced_question)
+
+            # Post-processing for GAIA
+            cleaned_result = self._clean_gaia_result(result)
+
+            return cleaned_result if cleaned_result else "No response generated."
+
+        except Exception as e:
+            print(f"Agent error: {e}")
+            # Fallback strategy
+            try:
+                fallback_prompt = f"""
+                CRITICAL GAIA TASK: {question}
+
+                Use available tools to find the answer. If it's a YouTube video, use youtube_transcript_extractor.
+                If it's about documents, use appropriate analyzers.
+                Be precise and direct in your final answer.
+                """
+                simple_result = self.agent.run(fallback_prompt)
+                return simple_result if simple_result else f"Error: {e}"
+            except:
+                return f"Error: {e}"
+
+    def _create_gaia_prompt(self, question: str) -> str:
+        """Builds a prompt tuned for GAIA."""
+        return f"""
+        GAIA EVALUATION TASK - ANSWER PRECISELY
+
+        Question: {question}
+
+        INSTRUCTIONS:
+        1. If this involves a YouTube video, use youtube_transcript_extractor tool
+        2. If this involves web content, use web_browser tool
+        3. If this involves documents/PDFs, use appropriate analyzers
+        4. If this involves images, use image_ocr_analyzer
+        5. If this needs search, use the search tool
+        6. For calculations, use advanced_calculator
+        7. Be EXACT and SPECIFIC in your final answer
+        8. Don't provide explanations unless asked - just the answer
+
+        Work step by step and use the right tools for this task.
+        """
+
 
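Caution: _clean_gaia_result is called in __call__ but never defined, so every question raises AttributeError and lands in the fallback path. A minimal sketch of what it might do, since GAIA scoring favours bare answers (the prefix list is a guess):

    def _clean_gaia_result(self, result) -> str:
        text = str(result).strip()
        # Strip a leading "FINAL ANSWER:"-style prefix if the model adds one
        for prefix in ("FINAL ANSWER:", "ANSWER:", "Answer:"):
            if text.upper().startswith(prefix.upper()):
                text = text[len(prefix):].strip()
        return text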
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """