# Spaces: JoachimVC / gaia-enhanced-agent (Running)
# File size: 11,655 Bytes
# Revision: 9a6a4dc

#!/usr/bin/env python3
"""
Test Phase 3: Response Format Enforcement
Tests the strengthened response processing to eliminate JSON tool calls and complex responses.
"""

import sys
import os
import logging
from pathlib import Path

# Put this file's own directory at the front of sys.path so the `utils`
# imports below are looked up there first (presumably this is the
# deployment-ready directory — confirm).
sys.path.insert(0, str(Path(__file__).parent))

from utils.response_processor import EnhancedResponseProcessor
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter

# Configure logging: INFO level and above, "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

def test_json_filtering():
    """Check that tool-call JSON does not leak into extracted answers.

    Every sample response is passed through both ``EnhancedResponseProcessor``
    and ``FixedGAIAAnswerFormatter``.  A case is marked as failed when:

    * a ``should_not_contain`` fragment shows up in either output,
    * an ``expected_answer`` is given but appears in neither output, or
    * an ``expected_answer`` is given and both outputs equal ``"unknown"``.

    Returns:
        list[dict]: one record per case with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing JSON Tool Call Filtering...")

    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    # Sample responses taken from the evaluation issues.
    # NOTE: the 'expected_type' key on the first case is not read by any of
    # the checks below; that case is validated through 'should_not_contain' only.
    cases = [
        {
            'name': 'JSON Tool Call Response',
            'input': '{"name": "search_exa", "arguments": {"query": "Stargate SG-1 Season 1 Episode 1 script"}}',
            'expected_type': 'simple_answer',
            'should_not_contain': ['{"name"', '"arguments"', 'search_exa'],
        },
        {
            'name': 'Math Table Question with JSON',
            'input': 'I need to search for this information. {"name": "search_exa", "arguments": {"query": "math table"}} The answer is a, b, c, d, e',
            'expected_answer': 'a, b, c, d, e',
            'should_not_contain': ['{"name"', '"arguments"'],
        },
        {
            'name': 'YouTube Video Question with Tool Call',
            'input': 'Let me search for this video. {"name": "firecrawl", "arguments": {"url": "youtube.com"}} The video is about cats.',
            'expected_answer': 'cats',
            'should_not_contain': ['{"name"', '"arguments"', 'firecrawl'],
        },
        {
            'name': 'Simple Math with FINAL ANSWER',
            'input': 'Let me calculate this. The result is 425. FINAL ANSWER: 425',
            'expected_answer': '425',
            'should_not_contain': [],
        },
        {
            'name': 'Complex Response with Tool Output',
            'input': (
                "I'll help you find this information.\n\n"
                '{"name": "wikipedia", "arguments": {"query": "Paris France capital"}}\n\n'
                "Based on the search results, Paris is the capital of France.\n\n"
                "FINAL ANSWER: Paris"
            ),
            'expected_answer': 'Paris',
            'should_not_contain': ['{"name"', '"arguments"', 'wikipedia'],
        },
    ]

    outcomes = []

    for case in cases:
        label = case['name']
        raw = case['input']
        print(f"\n📝 Testing: {label}")
        print(f"Input: {raw[:100]}...")

        # Run the same raw response through both extraction paths.
        processed = processor.process_response(raw).answer
        formatted = formatter.format_answer(raw)

        print(f"Processor result: '{processed}'")
        print(f"Formatter result: '{formatted}'")

        # A forbidden fragment in either output counts as leakage.
        problems = [
            f"Contains unwanted content: {fragment}"
            for fragment in case['should_not_contain']
            if fragment in processed or fragment in formatted
        ]

        if 'expected_answer' in case:
            wanted = case['expected_answer']
            # Finding the expected answer in one of the two outputs is enough.
            if not (wanted in processed or wanted in formatted):
                problems.append(f"Missing expected answer: {wanted}")
            # Valid inputs must not collapse to "unknown" on both paths.
            if processed == "unknown" and formatted == "unknown":
                problems.append("Both processor and formatter returned 'unknown'")

        outcomes.append({
            'name': label,
            'processor_answer': processed,
            'formatter_answer': formatted,
            'passed': not problems,
            'issues': problems,
        })

        if problems:
            print(f"❌ FAILED: {', '.join(problems)}")
        else:
            print("✅ PASSED")

    return outcomes

def test_final_answer_format_enforcement():
    """Check extraction and normalisation of ``FINAL ANSWER:`` responses.

    Cases carry either an ``expected`` value (one of the two outputs must
    equal it exactly) or an ``expected_contains`` value (one of the two
    outputs must contain it as a substring).

    Returns:
        list[dict]: one record per case with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing FINAL ANSWER Format Enforcement...")

    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    cases = [
        dict(
            name='Proper FINAL ANSWER Format',
            input='After calculation, the result is clear. FINAL ANSWER: 42',
            expected='42',
        ),
        dict(
            name='FINAL ANSWER with Commas in Numbers',
            input='The total count is significant. FINAL ANSWER: 1,234',
            expected='1234',  # thousands separators are expected to be stripped
        ),
        dict(
            name='FINAL ANSWER with Quotes',
            input='The city name is found. FINAL ANSWER: "Paris"',
            expected='Paris',  # surrounding quotes are expected to be stripped
        ),
        dict(
            name='Missing FINAL ANSWER but Clear Result',
            input='The calculation shows that the answer is 256.',
            expected_contains='256',
        ),
        dict(
            name='Multiple Numbers - Should Pick Last',
            input='First we have 10, then 20, and finally the answer is 30.',
            expected_contains='30',
        ),
    ]

    outcomes = []

    for case in cases:
        label = case['name']
        raw = case['input']
        print(f"\n📝 Testing: {label}")
        print(f"Input: {raw}")

        # Run the same raw response through both extraction paths.
        processed = processor.process_response(raw).answer
        formatted = formatter.format_answer(raw)

        print(f"Processor result: '{processed}'")
        print(f"Formatter result: '{formatted}'")

        problems = []

        # Exact match: satisfied when either output equals the expected value.
        if 'expected' in case:
            exact = case['expected']
            if exact not in (processed, formatted):
                problems.append(
                    f"Expected '{exact}', got processor: '{processed}', formatter: '{formatted}'"
                )

        # Substring match: satisfied when either output contains the value.
        if 'expected_contains' in case:
            piece = case['expected_contains']
            if not (piece in processed or piece in formatted):
                problems.append(f"Expected to contain '{piece}'")

        outcomes.append({
            'name': label,
            'processor_answer': processed,
            'formatter_answer': formatted,
            'passed': not problems,
            'issues': problems,
        })

        if problems:
            print(f"❌ FAILED: {', '.join(problems)}")
        else:
            print("✅ PASSED")

    return outcomes

def test_response_validation():
    """Check the processor's answer for degenerate and well-formed responses.

    Only ``EnhancedResponseProcessor`` is exercised here.  Each case passes
    when the extracted answer equals the case's ``expected`` value exactly.

    Returns:
        list[dict]: one record per case with the keys ``name``, ``answer``,
        ``confidence``, ``strategy``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing Response Validation...")

    processor = EnhancedResponseProcessor()

    cases = [
        dict(name='Empty Response', input='', expected='unknown'),
        dict(name='Pure JSON Response', input='{"result": "test"}', expected='unknown'),
        dict(
            name='Tool Call Only',
            input='{"name": "calculator", "arguments": {"expression": "2+2"}}',
            expected='unknown',
        ),
        dict(name='Valid Simple Answer', input='FINAL ANSWER: blue', expected='blue'),
        dict(
            name='Long Response with Simple Answer',
            input='This is a very long explanation about the topic that goes on and on with lots of details and background information. FINAL ANSWER: red',
            expected='red',
        ),
    ]

    outcomes = []

    for case in cases:
        label = case['name']
        print(f"\n📝 Testing: {label}")

        extraction = processor.process_response(case['input'])
        answer = extraction.answer

        print(f"Result: '{answer}'")
        print(f"Confidence: {extraction.confidence:.2f}")
        print(f"Strategy: {extraction.strategy.value}")

        wanted = case['expected']
        matched = answer == wanted
        problems = []
        if not matched:
            problems.append(f"Expected '{wanted}', got '{answer}'")

        outcomes.append({
            'name': label,
            'answer': answer,
            'confidence': extraction.confidence,
            'strategy': extraction.strategy.value,
            'passed': matched,
            'issues': problems,
        })

        if matched:
            print("✅ PASSED")
        else:
            print(f"❌ FAILED: {', '.join(problems)}")

    return outcomes

def main(success_threshold=0.8):
    """Run all Phase 3 test suites, print a summary, and report overall success.

    Args:
        success_threshold: Minimum fraction of passing cases (between 0.0 and
            1.0) required for the run to count as successful.  Defaults to
            0.8, i.e. an 80% success rate.

    Returns:
        bool: True when at least one case ran and the share of passing cases
        meets ``success_threshold``; False otherwise.
    """
    print("🚀 Starting Phase 3: Response Format Enforcement Tests")
    print("=" * 60)

    # Run the suites in a fixed order and pool their per-case records.
    all_results = [
        *test_json_filtering(),
        *test_final_answer_format_enforcement(),
        *test_response_validation(),
    ]

    # Summary
    print("\n" + "=" * 60)
    print("📊 PHASE 3 TEST SUMMARY")
    print("=" * 60)

    total_tests = len(all_results)

    # An empty run must not divide by zero below, and must not be reported
    # as a success just because 0 >= 0 * threshold.
    if total_tests == 0:
        print("Total Tests: 0")
        print("\n⚠️ PHASE 3 NEEDS IMPROVEMENT")
        print("No tests were executed - nothing to evaluate.")
        return False

    failures = [result for result in all_results if not result['passed']]
    failed_tests = len(failures)
    passed_tests = total_tests - failed_tests

    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests} ✅")
    print(f"Failed: {failed_tests} ❌")
    print(f"Success Rate: {(passed_tests/total_tests)*100:.1f}%")

    if failures:
        print("\n❌ FAILED TESTS:")
        for result in failures:
            print(f"  - {result['name']}: {', '.join(result['issues'])}")

    # NOTE: these lines are a static checklist; they are printed regardless
    # of the results collected above.
    print("\n🎯 PHASE 3 OBJECTIVES:")
    print("✅ JSON tool call filtering implemented")
    print("✅ Response format enforcement strengthened")
    print("✅ Answer validation enhanced")
    print("✅ Tool output leakage prevention added")

    if passed_tests >= total_tests * success_threshold:
        print("\n🎉 PHASE 3 IMPLEMENTATION SUCCESSFUL!")
        print("Ready for deployment and evaluation testing.")
        return True

    print("\n⚠️ PHASE 3 NEEDS IMPROVEMENT")
    print("Some tests failed - review and fix issues before deployment.")
    return False

if __name__ == "__main__":
    # Map the overall result to a process exit status: 0 on success, 1 on failure.
    sys.exit(0 if main() else 1)