Spaces:
Running
Running
""" | |
GAIA Agent - Simplified Working Version

Complete AGNO Tools with Basic Multimodal Integration

This agent provides comprehensive GAIA evaluation capabilities using:
- All AGNO tools (calculator, python, wikipedia, arxiv, firecrawl, exa, file, shell)
- Basic multimodal tools (Mistral Vision when available)
- Simple, reliable answer formatting
- No complex dependencies that cause import failures

Advantages:
- Single agent for all GAIA tasks (text, math, multimodal)
- AGNO's native orchestration handles tool selection
- Simple, reliable architecture that works in a HuggingFace Space
- Consistent error handling and response formatting
- No complex import dependencies
""" | |
import os | |
import logging | |
from typing import Dict, Any, List, Optional | |
from pathlib import Path | |
from agno.agent import Agent | |
from agno.models.mistral import MistralChat | |
# Import European open-source multimodal tools | |
try: | |
from .mistral_multimodal_agent import OpenSourceMultimodalTools | |
MULTIMODAL_AVAILABLE = True | |
except ImportError: | |
try: | |
from mistral_multimodal_agent import OpenSourceMultimodalTools | |
MULTIMODAL_AVAILABLE = True | |
except ImportError: | |
OpenSourceMultimodalTools = None | |
MULTIMODAL_AVAILABLE = False | |
# Simple answer formatting without complex dependencies | |
class SimpleAnswerFormatter:
    """Reduce a free-form model response to a single-line GAIA answer.

    The formatter removes markdown emphasis markers, strips well-known
    lead-in phrases ("Answer:", "The final answer is:", ...) and then returns
    the first line that is neither a markdown heading nor a bullet item.
    """

    # Lead-in phrases removed from the start of a response (case-insensitive).
    _PREFIXES = (
        "The answer is:",
        "Answer:",
        "Final answer:",
        "The final answer is:",
        "Based on my analysis,",
        "According to my research,",
    )

    def format_answer(self, response: str, question: Optional[str] = None) -> str:
        """Format response for GAIA evaluation.

        Args:
            response: Raw text produced by the agent. Falsy values yield "".
            question: The original question. Accepted for interface
                compatibility; it is not used by the formatter.

        Returns:
            The first line of the cleaned response that is not a heading or a
            bullet item, or the whole cleaned response when no line qualifies.
        """
        if not response:
            return ""

        # Drop markdown emphasis *before* looking for prefixes, otherwise
        # "**Answer:** 42" never matches the "Answer:" prefix.
        answer = response.replace("**", "").replace("*", "").strip()

        # Lead-ins can be stacked ("Based on my analysis, the answer is: ..."),
        # so keep stripping until no known prefix remains. Every match shortens
        # the string, which guarantees termination.
        matched = True
        while matched:
            matched = False
            for prefix in self._PREFIXES:
                if answer[:len(prefix)].lower() == prefix.lower():
                    answer = answer[len(prefix):].strip()
                    matched = True
                    break

        for line in answer.split('\n'):
            candidate = line.strip()
            if not candidate or candidate.startswith('#'):
                continue
            # Only "- item" bullets and dash-only rules are skipped; a leading
            # minus sign as in "-5" is a legitimate answer and must be kept.
            if candidate.startswith('- ') or not candidate.strip('-'):
                continue
            return candidate

        return answer
# Load environment variables from .env file | |
def load_env_file(env_path: "str | Path" = '.env', override: bool = True) -> None:
    """Load KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. A leading ``export `` on the key and one pair of matching quotes
    around the value are removed. A missing file is silently ignored.

    Args:
        env_path: Path of the file to read. Defaults to ``.env`` in the
            current working directory.
        override: When true (the default), values from the file replace
            variables that are already set in the environment; when false,
            existing variables win.
    """
    env_file = Path(env_path)
    if not env_file.is_file():
        return

    # utf-8-sig tolerates a BOM written by some editors and avoids depending
    # on the platform's default encoding.
    with env_file.open('r', encoding='utf-8-sig') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, value = line.split('=', 1)
            key = key.strip()
            if key.startswith('export '):
                key = key[len('export '):].strip()
            if not key:
                # "=value" has no usable name; assigning os.environ[""] can raise.
                continue

            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            if override or key not in os.environ:
                os.environ[key] = value
# Load environment variables at module level, i.e. at import time, so that the
# os.getenv() lookups performed further down in this file can see keys that
# are only defined in the .env file.
load_env_file()
# Module-level logger used by all classes and functions in this file.
logger = logging.getLogger(__name__)
class GAIAAgent:
    """
    GAIA Agent with comprehensive AGNO tools and basic multimodal capabilities.

    This agent combines all AGNO tools with basic multimodal processing,
    providing a single interface for all GAIA evaluation tasks including:
    - Text and mathematical reasoning
    - Basic image analysis using Mistral Vision
    - Web research and content extraction
    - Simple, reliable answer formatting

    Attributes set by ``__init__``:
        response_formatter: ``SimpleAnswerFormatter`` applied to every answer.
        tools: List of instantiated tool objects handed to the AGNO agent.
        multimodal_tools: ``OpenSourceMultimodalTools`` instance or ``None``.
        mistral_api_key: Value of ``MISTRAL_API_KEY`` (may be ``None``).
        agent: The AGNO ``Agent`` or ``None`` when it could not be created.
        available: ``True`` only when ``agent`` was created successfully.
    """

    # Declarative description of every AGNO tool the agent tries to load.
    # 'required_env' names the environment variable holding the tool's API key
    # (None when no key is needed); 'optional_deps' lists package names whose
    # absence is reported as a missing optional dependency.
    # NOTE(review): PythonTools and ShellTools presumably let the model run
    # code and shell commands on the host - confirm this is acceptable for the
    # deployment environment.
    _TOOLS_CONFIG = (
        # Core computational tools
        {
            'name': 'calculator',
            'module': 'agno.tools.calculator',
            'class': 'CalculatorTools',
            'required_env': None,
            'description': 'Mathematical calculations and operations',
        },
        {
            'name': 'python',
            'module': 'agno.tools.python',
            'class': 'PythonTools',
            'required_env': None,
            'description': 'Python code execution and analysis',
        },
        # Knowledge and research tools
        {
            'name': 'wikipedia',
            'module': 'agno.tools.wikipedia',
            'class': 'WikipediaTools',
            'required_env': None,
            'description': 'Wikipedia knowledge retrieval',
        },
        {
            'name': 'arxiv',
            'module': 'agno.tools.arxiv',
            'class': 'ArxivTools',
            'required_env': None,
            'description': 'Academic research via ArXiv',
        },
        # Web tools
        {
            'name': 'firecrawl',
            'module': 'agno.tools.firecrawl',
            'class': 'FirecrawlTools',
            'required_env': 'FIRECRAWL_API_KEY',
            'description': 'Web content extraction',
        },
        {
            'name': 'exa',
            'module': 'agno.tools.exa',
            'class': 'ExaTools',
            'required_env': 'EXA_API_KEY',
            'description': 'Advanced web search',
        },
        # System tools
        {
            'name': 'file',
            'module': 'agno.tools.file',
            'class': 'FileTools',
            'required_env': None,
            'description': 'File operations and management',
        },
        {
            'name': 'shell',
            'module': 'agno.tools.shell',
            'class': 'ShellTools',
            'required_env': None,
            'description': 'System shell operations',
        },
        # Optional multimodal tools
        {
            'name': 'youtube',
            'module': 'agno.tools.youtube',
            'class': 'YouTubeTools',
            'required_env': None,
            'description': 'YouTube video transcription and analysis',
            'optional_deps': ['youtube_transcript_api'],
        },
    )

    def __init__(self):
        """Initialize the unified AGNO agent.

        Loads every tool that can be imported, adds the multimodal tools when
        present and, if ``MISTRAL_API_KEY`` is set, creates the AGNO agent.
        The constructor does not raise when the key is missing; it leaves
        ``agent`` as ``None`` and ``available`` as ``False`` instead.
        """
        logger.info("🚀 Initializing Unified AGNO Agent...")

        # Initialize simple answer formatter
        self.response_formatter = SimpleAnswerFormatter()

        # Initialize all AGNO tools
        self.tools = self._init_all_agno_tools()

        # Initialize European open-source multimodal tools
        self.multimodal_tools = self._init_multimodal_tools()
        if self.multimodal_tools:
            # This constructor runs at import time, so a multimodal object
            # without a usable 'tools' list must not abort the whole module.
            self.tools.extend(getattr(self.multimodal_tools, 'tools', None) or [])

        # Check for required API key
        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
        if not self.mistral_api_key:
            logger.error("❌ MISTRAL_API_KEY not found - AGNO agent requires this for orchestration")
            self.agent = None
            self.available = False
            return

        # Create the unified AGNO agent
        self.agent = self._create_agno_agent()

        # Set availability flag
        self.available = self.agent is not None

        if self.available:
            logger.info("✅ Unified AGNO Agent initialized successfully")
            logger.info("📊 Available tools: %s", len(self.tools))
        else:
            logger.error("❌ Unified AGNO Agent initialization failed")

    def _init_all_agno_tools(self) -> List[Any]:
        """Initialize all available AGNO tools.

        Each entry of ``_TOOLS_CONFIG`` is imported and instantiated
        independently; a tool that is missing its API key, cannot be imported
        or fails to construct is logged and skipped.

        Returns:
            The list of successfully created tool instances (possibly empty).
        """
        # Local import keeps this block independent of the file-level imports.
        import importlib

        tools: List[Any] = []
        tool_status: Dict[str, str] = {}

        for tool_config in self._TOOLS_CONFIG:
            tool_name = tool_config['name']
            class_name = tool_config['class']
            required_env = tool_config['required_env']
            optional_deps = tool_config.get('optional_deps', [])

            # Check if required environment variable is available
            api_key = os.getenv(required_env) if required_env else None
            if required_env and not api_key:
                logger.warning("⚠️ %s not found, %s tool unavailable", required_env, tool_name)
                tool_status[tool_name] = f"Missing {required_env}"
                continue

            try:
                # Import and instantiate the tool
                module = importlib.import_module(tool_config['module'])
                tool_class = getattr(module, class_name)
                # Tools that declare 'required_env' receive that key as api_key.
                tool_instance = tool_class(api_key=api_key) if required_env else tool_class()
            except ImportError as e:
                if optional_deps and any(dep in str(e) for dep in optional_deps):
                    logger.warning("⚠️ %s not available: missing optional dependency", class_name)
                    tool_status[tool_name] = "Missing optional dependency"
                else:
                    logger.warning("⚠️ %s not available: %s", class_name, e)
                    tool_status[tool_name] = f"Import error: {str(e)[:50]}"
            except Exception as e:
                # Best effort: one broken tool must not prevent the others.
                logger.warning("⚠️ %s not available: %s", class_name, e)
                tool_status[tool_name] = f"Error: {str(e)[:50]}"
            else:
                tools.append(tool_instance)
                tool_status[tool_name] = "✅ Available"
                logger.info("✅ %s initialized: %s", class_name, tool_config['description'])

        # Log tool availability summary
        logger.info("📊 AGNO Tools Status:")
        for tool_name, status in tool_status.items():
            logger.info("  %s: %s", tool_name, status)

        return tools

    def _init_multimodal_tools(self) -> Optional[Any]:
        """Initialize European open-source multimodal tools.

        Returns:
            An ``OpenSourceMultimodalTools`` instance, or ``None`` when the
            module could not be imported or its constructor raised.
        """
        if not MULTIMODAL_AVAILABLE:
            logger.warning("⚠️ European open-source multimodal tools not available")
            return None

        try:
            multimodal_tools = OpenSourceMultimodalTools()
        except Exception as e:
            logger.warning("⚠️ Failed to initialize multimodal tools: %s", e)
            return None

        logger.info("✅ European open-source multimodal tools initialized")
        logger.info("🇪🇺 Features: Image analysis (BLIP-2/Mistral Vision), Audio transcription (Faster-Whisper), Document analysis")
        return multimodal_tools

    def _create_agno_agent(self) -> Optional[Agent]:
        """Create the unified AGNO agent with all available tools.

        Returns:
            The configured ``Agent``, or ``None`` when model or agent
            construction raised.
        """
        if not self.tools:
            logger.warning("⚠️ No AGNO tools available, creating agent without tools")

        try:
            # Create Mistral model for the agent
            model = MistralChat(
                api_key=self.mistral_api_key,
                id="mistral-large-latest",  # Use latest large model for better function calling
                temperature=0.1,  # Low temperature for factual accuracy
                max_tokens=2000,
            )

            # Create the unified agent with all available tools
            # NOTE(review): markdown=True and show_tool_calls=True may add
            # markup or tool-call text to the response, which would work
            # against the "final answer only" instructions - confirm against
            # the AGNO Agent documentation.
            agent = Agent(
                model=model,
                tools=self.tools,
                instructions=self._get_agent_instructions(),
                show_tool_calls=True,  # Enable tool call visibility for debugging
                markdown=True,
                debug_mode=True,  # Enable debug mode to see tool usage
            )
        except Exception as e:
            logger.error("❌ Failed to create AGNO agent: %s", e)
            return None

        logger.info("✅ Unified AGNO Agent created with %s tools", len(self.tools))
        return agent

    def _get_agent_instructions(self) -> str:
        """Get comprehensive instructions for the unified AGNO agent.

        Returns:
            The fixed instruction text passed to the agent as ``instructions``.
        """
        return """You are a GAIA evaluation agent with access to comprehensive AGNO tools.
CRITICAL GAIA EVALUATION REQUIREMENTS:
1. EXACT ANSWER MATCHING: Your final answer must match the expected answer EXACTLY
2. NO EXPLANATIONS: Provide only the final answer, no reasoning or explanations
3. PRECISE FORMAT: Follow the exact format expected (number, text, etc.)
4. FACTUAL ACCURACY: Use tools to verify all information before answering
AVAILABLE TOOLS AND WHEN TO USE THEM:
CORE COMPUTATIONAL TOOLS:
1. CALCULATOR TOOLS - Use for:
- Mathematical calculations and operations
- Unit conversions and numerical computations
- Complex mathematical expressions
2. PYTHON TOOLS - Use for:
- Code execution and analysis
- Data processing and calculations
- Algorithm implementation
KNOWLEDGE AND RESEARCH TOOLS:
3. WIKIPEDIA TOOLS - Use ONLY when:
- Wikipedia is explicitly mentioned in the question
- Question specifically asks about Wikipedia content
- Question references "according to Wikipedia" or similar
4. ARXIV TOOLS - Use for:
- Academic research and scientific papers
- Technical and research-oriented questions
- Latest scientific developments
WEB RESEARCH TOOLS:
5. EXA TOOLS - Use for:
- General web search and research
- Finding current information and recent developments
- Biographical information and general knowledge queries
- Any web-based fact-checking and information gathering
6. FIRECRAWL TOOLS - Use for:
- Web content extraction from specific URLs provided in the question
- Detailed webpage analysis when URL is given
- Content scraping when specific URLs need to be processed
SYSTEM TOOLS:
7. FILE TOOLS - Use for:
- File operations and management
- Reading and processing local files
- File system operations
8. SHELL TOOLS - Use for:
- System operations and commands
- Environment queries
- System-level information gathering
9. YOUTUBE TOOLS - Use for:
- YouTube video transcription
- Video content analysis via transcripts
- Understanding video content without watching
MULTIMODAL TOOLS (European Open-Source):
10. IMAGE ANALYSIS - Use for:
- Analyzing images using BLIP-2 or Mistral Vision
- Answering questions about image content
- Visual reasoning and description
11. AUDIO TRANSCRIPTION - Use for:
- Transcribing audio files using Faster-Whisper (European community-driven)
- Converting speech to text for analysis
- Processing audio content
12. DOCUMENT ANALYSIS - Use for:
- Analyzing document content and answering questions
- Text-based document processing
- Document question-answering using DistilBERT
GENERAL STRATEGY:
1. Analyze the question to determine the most appropriate tool(s)
2. Use tools systematically to gather accurate information
3. Synthesize findings into a precise, compliant answer
4. Always prioritize accuracy and factual correctness
5. Use multiple tools if needed for verification
ANSWER FORMAT:
- Provide ONLY the final answer
- No explanations, reasoning, or additional text
- Match the expected format exactly (number, text, date, etc.)
- Ensure factual accuracy through tool verification"""

    def __call__(self, question: str) -> str:
        """Process a question using the unified AGNO agent.

        Args:
            question: The GAIA question text.

        Returns:
            The formatted answer; ``"Agent not available"`` when the agent was
            not created; or ``"Error: <message>"`` when processing raised.
        """
        if not self.available:
            logger.error("❌ Unified AGNO Agent not available - check MISTRAL_API_KEY")
            return "Agent not available"

        try:
            logger.info("🤔 Processing question with Unified AGNO Agent: %s...", question[:100])

            # Use AGNO agent to process the question with full orchestration
            response = self.agent.run(question)

            # Extract the response content. The content may be None or a
            # non-string object; normalise it to text so that formatting and
            # the log slicing below cannot fail on it.
            content = response.content if hasattr(response, 'content') else response
            if content is None:
                logger.warning("⚠️ Agent returned no content")
                raw_answer = ""
            elif isinstance(content, str):
                raw_answer = content
            else:
                raw_answer = str(content)

            # Format the response for GAIA evaluation
            formatted_answer = self.response_formatter.format_answer(raw_answer, question)

            logger.info("✅ Question processed successfully")
            logger.info("📝 Raw answer: %s...", raw_answer[:200])
            logger.info("🎯 Formatted answer: %s", formatted_answer)
            return formatted_answer
        except Exception as e:
            # Top-level boundary: callers expect a string, never an exception.
            logger.error("❌ Error processing question: %s", e)
            return f"Error: {str(e)}"

    def get_tool_status(self) -> Dict[str, Any]:
        """Get the current status of all tools.

        Returns:
            A dict with the keys ``available``, ``tools_count``,
            ``mistral_api_key_present``, ``agent_created``,
            ``multimodal_tools_available`` and ``multimodal_status`` (the
            result of ``multimodal_tools.get_capabilities_status()``, or an
            empty dict when no multimodal tools are loaded).
        """
        multimodal_status = {}
        multimodal_tools = getattr(self, 'multimodal_tools', None)
        if multimodal_tools:
            multimodal_status = multimodal_tools.get_capabilities_status()

        return {
            'available': self.available,
            'tools_count': len(self.tools) if self.tools else 0,
            'mistral_api_key_present': bool(self.mistral_api_key),
            'agent_created': self.agent is not None,
            'multimodal_tools_available': MULTIMODAL_AVAILABLE,
            'multimodal_status': multimodal_status,
        }
# Create global agent instance. This runs at import time, so importing the
# module initializes all tools; process_question() and get_agent_status()
# delegate to this shared instance.
gaia_agent = GAIAAgent()
def process_question(question: str) -> str:
    """Answer *question* by delegating to the module-level ``gaia_agent``.

    Args:
        question: The question text to hand to the agent.

    Returns:
        Whatever string the shared agent instance returns for the question.
    """
    answer = gaia_agent(question)
    return answer
def get_agent_status() -> Dict[str, Any]:
    """Report the status of the module-level ``gaia_agent``.

    Returns:
        The dictionary produced by ``gaia_agent.get_tool_status()``.
    """
    status = gaia_agent.get_tool_status()
    return status