# gaia-enhanced-agent/agents/enhanced_unified_agno_agent.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# 9a6a4dc
"""
GAIA Agent - Simplified Working Version
Complete AGNO Tools with Basic Multimodal Integration
This agent provides comprehensive GAIA evaluation capabilities using:
- All AGNO tools (calculator, python, wikipedia, arxiv, firecrawl, exa, file, shell)
- Basic multimodal tools (Mistral Vision when available)
- Simple, reliable answer formatting
- No complex dependencies that cause import failures
Advantages:
- Single agent for all GAIA tasks (text, math, multimodal)
- AGNO's native orchestration handles tool selection
- Simple, reliable architecture that works in HuggingFace Space
- Consistent error handling and response formatting
- No complex import dependencies
"""
import os
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path
from agno.agent import Agent
from agno.models.mistral import MistralChat
# Import European open-source multimodal tools
try:
from .mistral_multimodal_agent import OpenSourceMultimodalTools
MULTIMODAL_AVAILABLE = True
except ImportError:
try:
from mistral_multimodal_agent import OpenSourceMultimodalTools
MULTIMODAL_AVAILABLE = True
except ImportError:
OpenSourceMultimodalTools = None
MULTIMODAL_AVAILABLE = False
# Simple answer formatting without complex dependencies
class SimpleAnswerFormatter:
    """Reduce a free-form model response to a single-line GAIA answer.

    The formatter is dependency-free. It removes markdown emphasis markers,
    strips well-known lead-in phrases ("Answer:", "The final answer is:", ...)
    and returns the first line that is neither a markdown heading nor a
    markdown bullet.
    """

    # Lead-in phrases removed (case-insensitively) from the start of a text.
    PREFIXES_TO_REMOVE = (
        "The answer is:",
        "Answer:",
        "Final answer:",
        "The final answer is:",
        "Based on my analysis,",
        "According to my research,",
    )

    def format_answer(self, response: Optional[str], question: Optional[str] = None) -> str:
        """Format a response for GAIA evaluation.

        Args:
            response: Raw text produced by the agent. ``None`` yields an empty
                answer; any other non-string value is converted with ``str``.
            question: The original question. Accepted for interface
                compatibility; it is not used by the formatting rules.

        Returns:
            The first non-empty line that is not a heading (``#``) or a bullet
            (``-``), with lead-in phrases removed. If every line is a heading
            or a bullet, the whole cleaned text is returned.
        """
        if response is None:
            return ""

        # Emphasis markers are dropped *before* prefix stripping so that a
        # bolded lead-in such as "**Answer:** 42" is still recognised.
        answer = self._strip_prefixes(str(response).replace("*", ""))

        for raw_line in answer.split('\n'):
            line = raw_line.strip()
            if not line or self._is_markup_line(line):
                continue
            # The chosen line may itself start with a lead-in phrase.
            candidate = self._strip_prefixes(line)
            if candidate:
                return candidate
        return answer

    @classmethod
    def _strip_prefixes(cls, text: str) -> str:
        """Remove every leading lead-in phrase from *text*, in any order."""
        text = text.strip()
        removed_one = True
        # Repeat until no prefix matches so stacked lead-ins such as
        # "Final answer: The answer is: 7" are fully removed.
        while removed_one:
            removed_one = False
            lowered = text.lower()
            for prefix in cls.PREFIXES_TO_REMOVE:
                if lowered.startswith(prefix.lower()):
                    text = text[len(prefix):].strip()
                    removed_one = True
                    break
        return text

    @staticmethod
    def _is_markup_line(line: str) -> bool:
        """Return True for markdown headings and bullets, False otherwise.

        A line such as "-5" or "-.5" is a negative number, not a bullet, and
        must remain eligible as an answer.
        """
        if line.startswith('#'):
            return True
        if line.startswith('-'):
            rest = line[1:]
            looks_numeric = rest[:1].isdigit() or (rest[:1] == '.' and rest[1:2].isdigit())
            return not looks_numeric
        return False
# Load environment variables from .env file
def load_env_file(env_path: str = '.env') -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. A leading ``export `` is accepted, and one pair of matching
    surrounding quotes is removed from the value so that ``KEY="secret"``
    stores ``secret`` rather than ``"secret"``. Values already present in the
    environment are overwritten.

    Args:
        env_path: Path of the file to read. Defaults to ``.env`` in the
            current working directory. A missing file is silently skipped.
    """
    env_file = Path(env_path)
    # is_file() (rather than exists()) also skips a directory named ".env".
    if not env_file.is_file():
        return

    with env_file.open('r', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            if line.startswith('export '):
                line = line[len('export '):].lstrip()

            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            if not key:
                # "=value" has no usable name; an empty key cannot be set.
                continue
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value
# Load environment variables at module level so that keys defined in .env are
# in os.environ before GAIAAgent reads them with os.getenv().
load_env_file()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class GAIAAgent:
    """
    GAIA Agent with comprehensive AGNO tools and basic multimodal capabilities.

    This agent combines all AGNO tools with basic multimodal processing,
    providing a single interface for all GAIA evaluation tasks including:
    - Text and mathematical reasoning
    - Basic image analysis using Mistral Vision
    - Web research and content extraction
    - Simple, reliable answer formatting

    Attributes set by ``__init__``:
        response_formatter: ``SimpleAnswerFormatter`` used to post-process answers.
        tools: List of instantiated tool objects handed to the agent.
        multimodal_tools: ``OpenSourceMultimodalTools`` instance, or ``None``.
        mistral_api_key: Value of ``MISTRAL_API_KEY``, or ``None``.
        agent: The created ``Agent``, or ``None`` when creation was skipped or failed.
        available: ``True`` only when ``agent`` is not ``None``.
    """

    # Every AGNO tool the agent tries to load. ``required_env`` names the
    # environment variable holding the tool's API key (passed to the tool as
    # ``api_key``); ``optional_deps`` lists packages whose absence is reported
    # as a missing optional dependency rather than a generic import error.
    # NOTE(review): 'python' and 'shell' let the model execute code/commands on
    # the host; confirm this is acceptable for the questions being processed.
    _TOOLS_CONFIG = (
        # Core computational tools
        {
            'name': 'calculator',
            'module': 'agno.tools.calculator',
            'class': 'CalculatorTools',
            'required_env': None,
            'description': 'Mathematical calculations and operations',
        },
        {
            'name': 'python',
            'module': 'agno.tools.python',
            'class': 'PythonTools',
            'required_env': None,
            'description': 'Python code execution and analysis',
        },
        # Knowledge and research tools
        {
            'name': 'wikipedia',
            'module': 'agno.tools.wikipedia',
            'class': 'WikipediaTools',
            'required_env': None,
            'description': 'Wikipedia knowledge retrieval',
        },
        {
            'name': 'arxiv',
            'module': 'agno.tools.arxiv',
            'class': 'ArxivTools',
            'required_env': None,
            'description': 'Academic research via ArXiv',
        },
        # Web tools
        {
            'name': 'firecrawl',
            'module': 'agno.tools.firecrawl',
            'class': 'FirecrawlTools',
            'required_env': 'FIRECRAWL_API_KEY',
            'description': 'Web content extraction',
        },
        {
            'name': 'exa',
            'module': 'agno.tools.exa',
            'class': 'ExaTools',
            'required_env': 'EXA_API_KEY',
            'description': 'Advanced web search',
        },
        # System tools
        {
            'name': 'file',
            'module': 'agno.tools.file',
            'class': 'FileTools',
            'required_env': None,
            'description': 'File operations and management',
        },
        {
            'name': 'shell',
            'module': 'agno.tools.shell',
            'class': 'ShellTools',
            'required_env': None,
            'description': 'System shell operations',
        },
        # Optional multimodal tools
        {
            'name': 'youtube',
            'module': 'agno.tools.youtube',
            'class': 'YouTubeTools',
            'required_env': None,
            'description': 'YouTube video transcription and analysis',
            'optional_deps': ['youtube_transcript_api'],
        },
    )

    def __init__(self):
        """Initialize the unified AGNO agent.

        Tools are loaded first; the agent itself is only created when
        ``MISTRAL_API_KEY`` is set. When the key is missing, ``agent`` is
        ``None`` and ``available`` is ``False``.
        """
        logger.info("🚀 Initializing Unified AGNO Agent...")

        # Initialize simple answer formatter
        self.response_formatter = SimpleAnswerFormatter()

        # Initialize all AGNO tools
        self.tools = self._init_all_agno_tools()

        # Initialize European open-source multimodal tools
        self.multimodal_tools = self._init_multimodal_tools()
        if self.multimodal_tools:
            # Tolerate a wrapper whose ``tools`` attribute is missing or None
            # instead of failing the whole initialization.
            self.tools.extend(getattr(self.multimodal_tools, 'tools', None) or [])

        # Check for required API key
        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")
        if not self.mistral_api_key:
            logger.error("❌ MISTRAL_API_KEY not found - AGNO agent requires this for orchestration")
            self.agent = None
            self.available = False
            return

        # Create the unified AGNO agent
        self.agent = self._create_agno_agent()

        # Set availability flag
        self.available = self.agent is not None
        if self.available:
            logger.info("✅ Unified AGNO Agent initialized successfully")
            logger.info(f"📊 Available tools: {len(self.tools)}")
        else:
            logger.error("❌ Unified AGNO Agent initialization failed")

    def _init_all_agno_tools(self) -> List[Any]:
        """Import and instantiate every tool listed in ``_TOOLS_CONFIG``.

        A tool is skipped (with a warning) when its required environment
        variable is unset, when its module cannot be imported, or when its
        constructor raises. A per-tool status summary is logged at the end.

        Returns:
            The list of successfully instantiated tool objects.
        """
        import importlib  # local import keeps this block self-contained

        tools: List[Any] = []
        tool_status: Dict[str, str] = {}

        for tool_config in self._TOOLS_CONFIG:
            tool_name = tool_config['name']
            class_name = tool_config['class']
            required_env = tool_config['required_env']
            description = tool_config['description']
            optional_deps = tool_config.get('optional_deps', [])

            # Check if required environment variable is available
            if required_env and not os.getenv(required_env):
                logger.warning(f"⚠️ {required_env} not found, {tool_name} tool unavailable")
                tool_status[tool_name] = f"Missing {required_env}"
                continue

            try:
                # Import and instantiate the tool
                module = importlib.import_module(tool_config['module'])
                tool_class = getattr(module, class_name)
                if required_env:
                    tool_instance = tool_class(api_key=os.getenv(required_env))
                else:
                    tool_instance = tool_class()
            except ImportError as e:
                if optional_deps and any(dep in str(e) for dep in optional_deps):
                    logger.warning(f"⚠️ {class_name} not available: missing optional dependency")
                    tool_status[tool_name] = "Missing optional dependency"
                else:
                    logger.warning(f"⚠️ {class_name} not available: {e}")
                    tool_status[tool_name] = f"Import error: {str(e)[:50]}"
            except Exception as e:
                # Best effort: one broken tool must not prevent the others
                # from loading.
                logger.warning(f"⚠️ {class_name} not available: {e}")
                tool_status[tool_name] = f"Error: {str(e)[:50]}"
            else:
                tools.append(tool_instance)
                tool_status[tool_name] = "✅ Available"
                logger.info(f"✅ {class_name} initialized: {description}")

        # Log tool availability summary
        logger.info("📊 AGNO Tools Status:")
        for tool_name, status in tool_status.items():
            logger.info(f" {tool_name}: {status}")
        return tools

    def _init_multimodal_tools(self) -> Optional[Any]:
        """Instantiate ``OpenSourceMultimodalTools`` when it could be imported.

        Returns:
            The tools wrapper, or ``None`` when the import failed at module
            load or the constructor raised.
        """
        if not MULTIMODAL_AVAILABLE:
            logger.warning("⚠️ European open-source multimodal tools not available")
            return None
        try:
            multimodal_tools = OpenSourceMultimodalTools()
            logger.info("✅ European open-source multimodal tools initialized")
            logger.info("🇪🇺 Features: Image analysis (BLIP-2/Mistral Vision), Audio transcription (Faster-Whisper), Document analysis")
            return multimodal_tools
        except Exception as e:
            logger.warning(f"⚠️ Failed to initialize multimodal tools: {e}")
            return None

    def _create_agno_agent(self) -> Optional["Agent"]:
        """Create the unified AGNO agent with all available tools.

        Returns:
            The configured ``Agent``, or ``None`` when model or agent
            construction raised.
        """
        if not self.tools:
            logger.warning("⚠️ No AGNO tools available, creating agent without tools")
        try:
            # Create Mistral model for the agent
            model = MistralChat(
                api_key=self.mistral_api_key,
                id="mistral-large-latest",  # Use latest large model for better function calling
                temperature=0.1,  # Low temperature for factual accuracy
                max_tokens=2000,
            )
            # Create the unified agent with all available tools
            agent = Agent(
                model=model,
                tools=self.tools,
                instructions=self._get_agent_instructions(),
                show_tool_calls=True,  # Enable tool call visibility for debugging
                markdown=True,
                debug_mode=True,  # Enable debug mode to see tool usage
            )
            logger.info(f"✅ Unified AGNO Agent created with {len(self.tools)} tools")
            return agent
        except Exception as e:
            logger.error(f"❌ Failed to create AGNO agent: {e}")
            return None

    def _get_agent_instructions(self) -> str:
        """Return the fixed instruction prompt given to the AGNO agent."""
        return """You are a GAIA evaluation agent with access to comprehensive AGNO tools.
CRITICAL GAIA EVALUATION REQUIREMENTS:
1. EXACT ANSWER MATCHING: Your final answer must match the expected answer EXACTLY
2. NO EXPLANATIONS: Provide only the final answer, no reasoning or explanations
3. PRECISE FORMAT: Follow the exact format expected (number, text, etc.)
4. FACTUAL ACCURACY: Use tools to verify all information before answering
AVAILABLE TOOLS AND WHEN TO USE THEM:
CORE COMPUTATIONAL TOOLS:
1. CALCULATOR TOOLS - Use for:
- Mathematical calculations and operations
- Unit conversions and numerical computations
- Complex mathematical expressions
2. PYTHON TOOLS - Use for:
- Code execution and analysis
- Data processing and calculations
- Algorithm implementation
KNOWLEDGE AND RESEARCH TOOLS:
3. WIKIPEDIA TOOLS - Use ONLY when:
- Wikipedia is explicitly mentioned in the question
- Question specifically asks about Wikipedia content
- Question references "according to Wikipedia" or similar
4. ARXIV TOOLS - Use for:
- Academic research and scientific papers
- Technical and research-oriented questions
- Latest scientific developments
WEB RESEARCH TOOLS:
5. EXA TOOLS - Use for:
- General web search and research
- Finding current information and recent developments
- Biographical information and general knowledge queries
- Any web-based fact-checking and information gathering
6. FIRECRAWL TOOLS - Use for:
- Web content extraction from specific URLs provided in the question
- Detailed webpage analysis when URL is given
- Content scraping when specific URLs need to be processed
SYSTEM TOOLS:
7. FILE TOOLS - Use for:
- File operations and management
- Reading and processing local files
- File system operations
8. SHELL TOOLS - Use for:
- System operations and commands
- Environment queries
- System-level information gathering
9. YOUTUBE TOOLS - Use for:
- YouTube video transcription
- Video content analysis via transcripts
- Understanding video content without watching
MULTIMODAL TOOLS (European Open-Source):
10. IMAGE ANALYSIS - Use for:
- Analyzing images using BLIP-2 or Mistral Vision
- Answering questions about image content
- Visual reasoning and description
11. AUDIO TRANSCRIPTION - Use for:
- Transcribing audio files using Faster-Whisper (European community-driven)
- Converting speech to text for analysis
- Processing audio content
12. DOCUMENT ANALYSIS - Use for:
- Analyzing document content and answering questions
- Text-based document processing
- Document question-answering using DistilBERT
GENERAL STRATEGY:
1. Analyze the question to determine the most appropriate tool(s)
2. Use tools systematically to gather accurate information
3. Synthesize findings into a precise, compliant answer
4. Always prioritize accuracy and factual correctness
5. Use multiple tools if needed for verification
ANSWER FORMAT:
- Provide ONLY the final answer
- No explanations, reasoning, or additional text
- Match the expected format exactly (number, text, date, etc.)
- Ensure factual accuracy through tool verification"""

    @staticmethod
    def _response_text(response: Any) -> str:
        """Return the textual content of an agent response as a ``str``.

        Uses ``response.content`` when that attribute exists, otherwise the
        response itself. ``None`` becomes an empty string and any other
        non-string value is converted with ``str`` so callers can slice and
        format the result safely.
        """
        content = getattr(response, 'content', response)
        if content is None:
            return ""
        return content if isinstance(content, str) else str(content)

    def __call__(self, question: str) -> str:
        """Process a question using the unified AGNO agent.

        Args:
            question: The question text passed to ``agent.run``.

        Returns:
            The formatted answer; ``"Agent not available"`` when the agent was
            not created; or ``"Error: <message>"`` when processing raised.
        """
        if not self.available:
            logger.error("❌ Unified AGNO Agent not available - check MISTRAL_API_KEY")
            return "Agent not available"
        try:
            logger.info(f"🤔 Processing question with Unified AGNO Agent: {str(question)[:100]}...")

            # Use AGNO agent to process the question with full orchestration
            response = self.agent.run(question)

            # Extract the response content as text; a None or non-string
            # content must not break the slicing and formatting below.
            raw_answer = self._response_text(response)

            # Format the response for GAIA evaluation
            formatted_answer = self.response_formatter.format_answer(raw_answer, question)

            logger.info("✅ Question processed successfully")
            logger.info(f"📝 Raw answer: {raw_answer[:200]}...")
            logger.info(f"🎯 Formatted answer: {formatted_answer}")
            return formatted_answer
        except Exception as e:
            # Top-level boundary: callers expect a string, so report the
            # failure in-band and keep the traceback in the log.
            logger.exception(f"❌ Error processing question: {e}")
            return f"Error: {str(e)}"

    def get_tool_status(self) -> Dict[str, Any]:
        """Get the current status of all tools.

        Returns:
            A dictionary with the keys ``available``, ``tools_count``,
            ``mistral_api_key_present``, ``agent_created``,
            ``multimodal_tools_available`` and ``multimodal_status`` (the
            result of the multimodal wrapper's ``get_capabilities_status()``,
            or an empty dict when no wrapper is loaded).
        """
        multimodal_status = {}
        multimodal_tools = getattr(self, 'multimodal_tools', None)
        if multimodal_tools:
            multimodal_status = multimodal_tools.get_capabilities_status()
        return {
            'available': self.available,
            'tools_count': len(self.tools) if self.tools else 0,
            'mistral_api_key_present': bool(self.mistral_api_key),
            'agent_created': self.agent is not None,
            'multimodal_tools_available': MULTIMODAL_AVAILABLE,
            'multimodal_status': multimodal_status,
        }
# Create the global agent instance at import time; process_question() and
# get_agent_status() below delegate to it.
gaia_agent = GAIAAgent()
def process_question(question: str) -> str:
    """Answer *question* by calling the module-level ``gaia_agent``."""
    answer = gaia_agent(question)
    return answer
def get_agent_status() -> Dict[str, Any]:
    """Return the status dictionary reported by ``gaia_agent.get_tool_status()``."""
    status = gaia_agent.get_tool_status()
    return status