#!/usr/bin/env python3
"""
Test script to validate the fixed GAIA agent improvements.

This script tests the key fixes that should improve the 5/20 evaluation score.
Each ``test_*`` function prints a human-readable report and returns ``True``
on success / ``False`` on failure so that ``main()`` can aggregate the results
and turn them into a process exit code.
"""

import os
import sys
import traceback
from pathlib import Path

# Add the deployment-ready directory to the path
sys.path.insert(0, str(Path(__file__).parent))

# Star-imports expose only the script entry points. The test_* functions
# return booleans for main() and are not meant to be picked up (and judged on
# their return value) by a test collector in some other module.
__all__ = ["load_env_file", "main"]

# Horizontal rule used to frame every section header in the report.
_RULE = "=" * 50


def _strip_matching_quotes(value: str) -> str:
    """Return *value* without one pair of matching surrounding quotes."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
        return value[1:-1]
    return value


def _print_header(title: str) -> None:
    """Print a section title framed by two horizontal rules."""
    print("\n" + _RULE)
    print(title)
    print(_RULE)


def load_env_file(env_path=".env") -> None:
    """Load environment variables from a .env file if it exists.

    Blank lines, ``#`` comments and lines without ``=`` are ignored. Each
    remaining line is split on the first ``=``; key and value are stripped
    of surrounding whitespace and the value additionally loses one pair of
    matching single or double quotes. Values found in the file overwrite
    variables that are already set in ``os.environ``.

    Args:
        env_path: Path of the file to read. Defaults to ``.env`` in the
            current working directory.
    """
    env_file = Path(env_path)
    # is_file() (rather than exists()) keeps a directory named ".env" from
    # blowing up in open().
    if not env_file.is_file():
        return

    # utf-8-sig reads plain UTF-8 as well and drops a BOM, which would
    # otherwise become part of the first key.
    with env_file.open("r", encoding="utf-8-sig") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key = key.strip()
            if not key:
                # "=value" has no usable name; os.environ rejects empty keys.
                continue
            os.environ[key] = _strip_matching_quotes(value.strip())


# Load environment variables
load_env_file()


def test_answer_formatter() -> bool:
    """Test the fixed answer formatter.

    Runs a fixed table of raw responses through
    ``FixedGAIAAnswerFormatter.format_answer`` and compares each result with
    the expected string.

    Returns:
        True when every case matches; False when any case differs or when
        importing/running the formatter raises.
    """
    _print_header("🧪 Testing Fixed Answer Formatter")

    try:
        from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter

        formatter = FixedGAIAAnswerFormatter()

        # Test cases that should work
        test_cases = [
            {
                'input': 'Let me calculate this. The answer is 42. FINAL ANSWER: 42',
                'expected': '42',
                'description': 'Basic FINAL ANSWER format',
            },
            {
                'input': 'After analysis, I found the result. FINAL ANSWER: Paris',
                'expected': 'Paris',
                'description': 'Text answer with FINAL ANSWER',
            },
            {
                'input': 'FINAL ANSWER: blue, green, red',
                'expected': 'blue, green, red',
                'description': 'List format',
            },
            {
                'input': 'The calculation shows 1234 FINAL ANSWER: 1234',
                'expected': '1234',
                'description': 'Number without commas',
            },
            {
                'input': 'No final answer format here, just 25',
                'expected': '25',
                'description': 'Fallback extraction',
            },
        ]

        all_passed = True
        for i, test_case in enumerate(test_cases, 1):
            result = formatter.format_answer(test_case['input'], "test question")
            expected = test_case['expected']
            passed = result == expected
            all_passed = all_passed and passed

            status = "✅ PASS" if passed else "❌ FAIL"
            print(f"Test {i}: {status} - {test_case['description']}")
            print(f" Input: {test_case['input'][:50]}...")
            print(f" Expected: '{expected}'")
            print(f" Got: '{result}'")
            print()

        if all_passed:
            print("✅ All answer formatter tests passed!")
        else:
            print("❌ Some answer formatter tests failed!")
        return all_passed

    except Exception as e:
        # Report-and-continue: main() must still run the remaining tests.
        print(f"❌ Error testing answer formatter: {e}")
        traceback.print_exc()
        return False


def test_fixed_agent_import() -> bool:
    """Test importing the fixed agent.

    Returns:
        True when the agent module imports and ``get_agent_status()`` runs;
        False when either step raises.
    """
    _print_header("🧪 Testing Fixed Agent Import")

    try:
        # FixedGAIAAgent is imported only to prove that the name resolves.
        from agents.fixed_enhanced_unified_agno_agent import (  # noqa: F401
            FixedGAIAAgent,
            get_agent_status,
        )
        print("✅ Successfully imported FixedGAIAAgent")

        # Test agent status function
        status = get_agent_status()
        print(f"📊 Agent Status: {status}")
        return True

    except Exception as e:
        print(f"❌ Error importing fixed agent: {e}")
        traceback.print_exc()
        return False


def test_fixed_agent_initialization() -> bool:
    """Test initializing the fixed agent.

    Returns:
        True when ``MISTRAL_API_KEY`` is set and the constructed agent
        reports ``available``; False otherwise (including a missing key or
        any exception).
    """
    _print_header("🧪 Testing Fixed Agent Initialization")

    try:
        from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

        # Check for required API key
        if not os.getenv("MISTRAL_API_KEY"):
            print("⚠️ MISTRAL_API_KEY not found - agent will not be fully functional")
            print("💡 Set MISTRAL_API_KEY in .env file for full testing")
            return False
        print("✅ MISTRAL_API_KEY found")

        # Initialize agent
        agent = FixedGAIAAgent()
        if not agent.available:
            print("❌ Fixed agent initialization failed")
            return False

        print("✅ Fixed agent initialized successfully")
        status = agent.get_tool_status()
        print(f"📊 Tool Status: {status}")
        return True

    except Exception as e:
        print(f"❌ Error initializing fixed agent: {e}")
        traceback.print_exc()
        return False


def test_fixed_agent_simple_question() -> bool:
    """Test the fixed agent with a simple question.

    Asks the agent "What is 25 * 17?" and accepts any non-empty answer other
    than ``"unknown"`` whose text contains ``425``.

    Returns:
        True when the answer passes that check; False when the API key is
        missing, the agent is unavailable, the answer is rejected, or an
        exception is raised.
    """
    _print_header("🧪 Testing Fixed Agent with Simple Question")

    try:
        from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

        # Check for required API key
        if not os.getenv("MISTRAL_API_KEY"):
            print("⚠️ MISTRAL_API_KEY not found - skipping agent test")
            return False

        # Initialize agent
        agent = FixedGAIAAgent()
        if not agent.available:
            print("❌ Agent not available - skipping test")
            return False

        # Test with a simple math question
        test_question = "What is 25 * 17?"
        print(f"🤔 Testing question: {test_question}")
        answer = agent(test_question)
        print(f"🎯 Agent answer: '{answer}'")

        # Check if answer looks reasonable. str() keeps a numeric answer
        # (e.g. the int 425) from raising TypeError in the membership test.
        if answer and answer != "unknown" and "425" in str(answer):
            print("✅ Agent provided reasonable answer")
            return True

        print("❌ Agent answer doesn't look correct")
        return False

    except Exception as e:
        print(f"❌ Error testing fixed agent: {e}")
        traceback.print_exc()
        return False


def test_app_integration() -> bool:
    """Test the app integration with fixed agent.

    Returns:
        True when the ``app`` module imports (the ``FIXED_AGNO_AVAILABLE``
        flag is only reported, not required); False when the import raises.
    """
    _print_header("🧪 Testing App Integration")

    try:
        # Import the app module
        import app
        print("✅ Successfully imported app module")

        # Check if fixed agent is available
        if getattr(app, 'FIXED_AGNO_AVAILABLE', False):
            print("✅ Fixed AGNO agent available in app")
        else:
            print("⚠️ Fixed AGNO agent not available in app")
        return True

    except Exception as e:
        print(f"❌ Error testing app integration: {e}")
        traceback.print_exc()
        return False


def main() -> bool:
    """Run all tests.

    Executes every test in a fixed order, prints a per-test and overall
    summary, and treats a test that raises as a failure.

    Returns:
        True when every test returned a truthy result, False otherwise.
    """
    print("🚀 Starting Fixed GAIA Agent Test Suite")
    print("This validates the fixes for the 5/20 evaluation score issue")

    tests = [
        ("Answer Formatter", test_answer_formatter),
        ("Fixed Agent Import", test_fixed_agent_import),
        ("Fixed Agent Initialization", test_fixed_agent_initialization),
        ("Simple Question Test", test_fixed_agent_simple_question),
        ("App Integration", test_app_integration),
    ]

    results = []
    for test_name, test_func in tests:
        try:
            results.append((test_name, test_func()))
        except Exception as e:
            # A crashing test must not stop the rest of the suite.
            print(f"❌ Test '{test_name}' crashed: {e}")
            results.append((test_name, False))

    # Summary
    _print_header("📊 Test Results Summary")

    for test_name, result in results:
        status = "✅ PASS" if result else "❌ FAIL"
        print(f"{status} {test_name}")

    total = len(results)
    passed = sum(1 for _, result in results if result)
    print(f"\n🎯 Overall: {passed}/{total} tests passed")

    if passed == total:
        print("🎉 All tests passed! The fixes should improve evaluation performance.")
    elif passed >= total * 0.8:
        print("⚠️ Most tests passed. Some issues may remain.")
    else:
        print("❌ Many tests failed. Significant issues remain.")

    return passed == total


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)