Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

File size: 10,408 Bytes

9a6a4dc

#!/usr/bin/env python3
"""
Test Phase 1 Improvements - Tool Execution and Answer Formatting

This script tests the critical fixes implemented in Phase 1:
1. Tool execution debugging and validation
2. Enhanced answer formatting with multiple patterns
3. GAIA format compliance validation
4. Comprehensive error handling and fallback systems

Usage:
    python test_phase1_improvements.py
"""

import os
import sys
import logging
from pathlib import Path

# Put the directory containing this script at the front of sys.path so that
# modules located next to it can be imported by the tests below.
_SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(_SCRIPT_DIR))

# Root logging setup: INFO level, records carry timestamp, logger name and level.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by every test function in this script.
logger = logging.getLogger(__name__)

def test_tool_execution_debugger():
    """Smoke-test the ToolExecutionDebugger helper.

    Runs a handful of sample responses through
    ``detect_json_syntax_in_response``, validates the registration of a mock
    tool, and logs the debugger's statistics.  No expected values are
    asserted; the check only verifies that the calls complete.

    Returns:
        bool: True if every debugger call completed without raising,
        False if the import or any call raised an exception.
    """
    logger.info("🔧 Testing ToolExecutionDebugger...")

    try:
        from utils.tool_execution_debugger import ToolExecutionDebugger

        debugger = ToolExecutionDebugger()

        # Test JSON syntax detection
        test_responses = [
            "The answer is 42",  # Normal response
            '{"function": "calculator", "parameters": {"expression": "2+2"}}',  # JSON syntax issue
            "FINAL ANSWER: 42",  # Proper format
            "I need to use the calculator tool: {\"tool\": \"calc\"}",  # Mixed content
        ]

        for number, response in enumerate(test_responses, start=1):
            issues = debugger.detect_json_syntax_in_response(response)
            logger.info(f"  Test {number}: {'❌ Issues detected' if issues else '✅ Clean'} - {issues}")

        # Test tool validation with a stand-in tool whose class is genuinely
        # named after the tool.  Defining a ``__class__`` *method* on the mock
        # (as was done previously) shadows the real ``__class__`` attribute, so
        # ``tool.__class__`` becomes a bound method and
        # ``tool.__class__.__name__`` reads "__class__" instead of the tool name.
        tool_name = "TestTool"

        def _set_name(self, name):
            self.name = name

        mock_tool_cls = type(tool_name, (), {"__init__": _set_name})
        mock_tool = mock_tool_cls(tool_name)
        validation = debugger.validate_tool_registration(tool_name, mock_tool)
        logger.info(f"  Tool validation: {validation}")

        # Get debug stats
        stats = debugger.get_debug_stats()
        logger.info(f"  Debug stats: {stats}")

        logger.info("✅ ToolExecutionDebugger tests passed")
        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure and signal it
        # through the return value so the remaining checks can still run.
        logger.error(f"❌ ToolExecutionDebugger test failed: {e}")
        return False

def test_enhanced_answer_formatter():
    """Run sample responses through EnhancedGAIAAnswerFormatter.

    Each sample is passed to ``format_answer`` together with its question; a
    sample counts as successful when the call returns without raising.  The
    ``expected_type`` field is descriptive only and is not checked here.

    Returns:
        bool: True when every sample was formatted without an exception,
        False otherwise (including when the formatter cannot be imported).
    """
    logger.info("🎯 Testing EnhancedGAIAAnswerFormatter...")

    try:
        from utils.enhanced_gaia_answer_formatter import EnhancedGAIAAnswerFormatter

        formatter = EnhancedGAIAAnswerFormatter()

        # One row per sample: (input, question, expected_type, description).
        field_names = ('input', 'question', 'expected_type', 'description')
        sample_rows = [
            # Number formatting
            ("The calculation gives us 1,234.50 as the result.",
             "What is 1000 + 234.5?",
             'number',
             'Number with comma removal'),
            ("FINAL ANSWER: 42",
             "How many items are there?",
             'number',
             'Simple FINAL ANSWER format'),
            # String formatting
            ("The capital of France is Paris.",
             "What is the capital of France?",
             'string',
             'String extraction from sentence'),
            ('FINAL ANSWER: "The Eiffel Tower"',
             "What is the famous tower in Paris?",
             'string',
             'String with quotes removal'),
            # List formatting
            ("The colors are red, blue, and green.",
             "List three primary colors",
             'list',
             'List with "and" removal'),
            ("FINAL ANSWER: apple; banana; orange",
             "Name three fruits",
             'list',
             'List with semicolon separation'),
            # Boolean formatting
            ("Yes, Paris is in France.",
             "Is Paris in France?",
             'boolean',
             'Boolean yes answer'),
            ("No, that is incorrect.",
             "Is London in Germany?",
             'boolean',
             'Boolean no answer'),
            # Complex cases
            ("After analyzing the data, I can conclude that the answer is 3.14159.",
             "What is the value of pi to 5 decimal places?",
             'number',
             'Number extraction from complex text'),
            ("Let me search for this information... The result shows that Einstein was born in 1879.",
             "When was Einstein born?",
             'number',
             'Year extraction from narrative'),
        ]
        test_cases = [dict(zip(field_names, row)) for row in sample_rows]

        results = []
        for number, case in enumerate(test_cases, start=1):
            raw_text = case['input']
            label = case['description']
            # Keep the recorded input short: first 50 characters plus an ellipsis.
            preview = raw_text[:50] + "..." if len(raw_text) > 50 else raw_text
            try:
                formatted = formatter.format_answer(raw_text, case['question'])
                results.append({
                    'test': number,
                    'description': label,
                    'input': preview,
                    'output': formatted,
                    'status': '✅ Success',
                })
                logger.info(f"  Test {number}: ✅ {label} → '{formatted}'")
            except Exception as e:
                # A failing sample is recorded and reported; the loop continues.
                results.append({
                    'test': number,
                    'description': label,
                    'input': preview,
                    'output': f"Error: {e}",
                    'status': '❌ Failed',
                })
                logger.error(f"  Test {number}: ❌ {label} failed: {e}")

        # Get formatting statistics
        stats = formatter.get_formatting_stats()
        logger.info(f"  Formatting stats: {stats}")

        # Summary
        total = len(test_cases)
        passed = sum(entry['status'] == '✅ Success' for entry in results)
        logger.info(f"✅ Enhanced formatter tests: {passed}/{total} passed")

        return passed == total

    except Exception as e:
        logger.error(f"❌ EnhancedGAIAAnswerFormatter test failed: {e}")
        return False

def test_agent_integration():
    """Exercise GAIAAgent end to end with one simple question.

    The check is skipped (and reported as passing) when the
    ``MISTRAL_API_KEY`` environment variable is unset or empty.

    Returns:
        bool: True when the check is skipped or the agent returns a
        non-empty response that is not one of its known error strings;
        False when the agent is unavailable, returns an error response,
        or anything raises.
    """
    logger.info("🤖 Testing agent integration...")

    try:
        # Without an API key there is nothing to exercise; treat as a pass.
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            logger.warning("⚠️ MISTRAL_API_KEY not found - skipping agent integration test")
            return True

        from agents.enhanced_unified_agno_agent import GAIAAgent

        agent = GAIAAgent()
        if not agent.available:
            logger.warning("⚠️ Agent not available - check API key and dependencies")
            return False

        # Report which tools the agent exposes.
        logger.info(f"  Tool status: {agent.get_tool_status()}")

        test_question = "What is 2 + 2?"
        logger.info(f"  Testing question: {test_question}")

        # Responses the agent is known to return when it could not answer.
        error_replies = ("Agent not available", "Unable to process this question")

        try:
            response = agent(test_question)
            logger.info(f"  Response: {response}")

            usable = response and all(response != reply for reply in error_replies)
            if not usable:
                logger.warning("⚠️ Agent returned error response")
                return False

            logger.info("✅ Agent integration test passed")
            return True

        except Exception as e:
            logger.error(f"❌ Agent execution failed: {e}")
            return False

    except Exception as e:
        logger.error(f"❌ Agent integration test failed: {e}")
        return False

def run_phase1_tests():
    """Run every Phase 1 check in order and log a summary.

    Returns:
        bool: True only if all checks reported success.
    """
    separator = "=" * 60
    logger.info("🚀 Starting Phase 1 Improvement Tests")
    logger.info(separator)

    # (result key, check function) pairs, executed in this order.
    checks = (
        ('tool_debugger', test_tool_execution_debugger),
        ('answer_formatter', test_enhanced_answer_formatter),
        ('agent_integration', test_agent_integration),
    )
    test_results = {}
    for key, check in checks:
        test_results[key] = check()

    # Summary
    logger.info(separator)
    logger.info("📊 Phase 1 Test Results Summary:")

    for test_name, outcome in test_results.items():
        status = "✅ PASSED" if outcome else "❌ FAILED"
        logger.info(f"  {test_name}: {status}")

    total_tests = len(test_results)
    passed_tests = len([outcome for outcome in test_results.values() if outcome])
    logger.info(f"\nOverall: {passed_tests}/{total_tests} tests passed")

    all_passed = passed_tests == total_tests
    if not all_passed:
        logger.warning("⚠️ Some tests failed - review logs and fix issues before proceeding")
    else:
        logger.info("🎉 All Phase 1 improvements are working correctly!")
        logger.info("📈 Ready to proceed with Phase 2 (Answer Formatting Enhancement)")

    return all_passed

if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when every check passed, else 1.
    exit_code = 0 if run_phase1_tests() else 1
    sys.exit(exit_code)