# gaia-enhanced-agent / test_phase3_response_format_enforcement.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# 9a6a4dc
#!/usr/bin/env python3
"""
Test Phase 3: Response Format Enforcement
Tests the strengthened response processing to eliminate JSON tool calls and complex responses.
"""
import sys
import os
import logging
from pathlib import Path
# Put the directory that contains this file at the front of sys.path.
# This has to happen before the `utils` imports below
# (presumably `utils` is a package sitting next to this file - confirm).
sys.path.insert(0, str(Path(__file__).parent))
from utils.response_processor import EnhancedResponseProcessor
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter
# Configure INFO-level logging with a "timestamp - level - message" layout
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def test_json_filtering():
    """Check that JSON tool calls do not leak into extracted answers.

    Every case's raw response is passed to both
    ``EnhancedResponseProcessor.process_response`` (its ``.answer`` is used)
    and ``FixedGAIAAnswerFormatter.format_answer``. A case is marked failed
    when any of the following holds:

    * a ``should_not_contain`` fragment appears in either answer;
    * ``expected_answer`` is given and is a substring of neither answer;
    * ``expected_answer`` is given and both answers equal ``"unknown"``.

    Returns:
        A list with one dict per case containing ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and
        ``issues`` (human-readable failure descriptions).
    """
    print("\n🧪 Testing JSON Tool Call Filtering...")
    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    # Test cases from the evaluation issues.
    # NOTE(review): the 'expected_type' key on the first case is not read by
    # the validation loop below, so that case is only checked for leakage.
    test_cases = [
        {
            'name': 'JSON Tool Call Response',
            'input': '{"name": "search_exa", "arguments": {"query": "Stargate SG-1 Season 1 Episode 1 script"}}',
            'expected_type': 'simple_answer',
            'should_not_contain': ['{"name"', '"arguments"', 'search_exa'],
        },
        {
            'name': 'Math Table Question with JSON',
            'input': 'I need to search for this information. {"name": "search_exa", "arguments": {"query": "math table"}} The answer is a, b, c, d, e',
            'expected_answer': 'a, b, c, d, e',
            'should_not_contain': ['{"name"', '"arguments"'],
        },
        {
            'name': 'YouTube Video Question with Tool Call',
            'input': 'Let me search for this video. {"name": "firecrawl", "arguments": {"url": "youtube.com"}} The video is about cats.',
            'expected_answer': 'cats',
            'should_not_contain': ['{"name"', '"arguments"', 'firecrawl'],
        },
        {
            'name': 'Simple Math with FINAL ANSWER',
            'input': 'Let me calculate this. The result is 425. FINAL ANSWER: 425',
            'expected_answer': '425',
            'should_not_contain': [],
        },
        {
            'name': 'Complex Response with Tool Output',
            # Four-line response; written as joined literals so the content
            # does not depend on the indentation of this source file.
            'input': (
                "I'll help you find this information.\n"
                '{"name": "wikipedia", "arguments": {"query": "Paris France capital"}}\n'
                'Based on the search results, Paris is the capital of France.\n'
                'FINAL ANSWER: Paris'
            ),
            'expected_answer': 'Paris',
            'should_not_contain': ['{"name"', '"arguments"', 'wikipedia'],
        },
    ]

    results = []
    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")
        print(f"Input: {test_case['input'][:100]}...")

        # Test with response processor
        extraction_result = processor.process_response(test_case['input'])
        processed_answer = extraction_result.answer
        # Test with answer formatter
        formatted_answer = formatter.format_answer(test_case['input'])

        print(f"Processor result: '{processed_answer}'")
        print(f"Formatter result: '{formatted_answer}'")

        # Validate results
        test_result = {
            'name': test_case['name'],
            'processor_answer': processed_answer,
            'formatter_answer': formatted_answer,
            'passed': True,
            'issues': [],
        }

        # Leakage check: the fragment must be absent from BOTH answers.
        for unwanted in test_case['should_not_contain']:
            if unwanted in processed_answer or unwanted in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Contains unwanted content: {unwanted}")

        if 'expected_answer' in test_case:
            expected = test_case['expected_answer']

            # Expected answer check: it is enough for ONE of the two
            # answers to contain the expected text.
            if expected not in processed_answer and expected not in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Missing expected answer: {expected}")

            # Check that answer is not "unknown" for valid inputs
            if processed_answer == "unknown" and formatted_answer == "unknown":
                test_result['passed'] = False
                test_result['issues'].append("Both processor and formatter returned 'unknown'")

        results.append(test_result)
        if test_result['passed']:
            print("✅ PASSED")
        else:
            print(f"❌ FAILED: {', '.join(test_result['issues'])}")

    return results
def test_final_answer_format_enforcement():
    """Check extraction and normalisation of ``FINAL ANSWER:`` responses.

    Every case's raw response is passed to both
    ``EnhancedResponseProcessor.process_response`` (its ``.answer`` is used)
    and ``FixedGAIAAnswerFormatter.format_answer``. A case passes when at
    least one of the two answers satisfies the case's criterion:

    * ``expected`` - the answer must equal this string exactly;
    * ``expected_contains`` - the answer must contain this substring.

    Returns:
        A list with one dict per case containing ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and
        ``issues`` (human-readable failure descriptions).
    """
    print("\n🧪 Testing FINAL ANSWER Format Enforcement...")
    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    test_cases = [
        {
            'name': 'Proper FINAL ANSWER Format',
            'input': 'After calculation, the result is clear. FINAL ANSWER: 42',
            'expected': '42',
        },
        {
            'name': 'FINAL ANSWER with Commas in Numbers',
            'input': 'The total count is significant. FINAL ANSWER: 1,234',
            'expected': '1234',  # Commas should be removed
        },
        {
            'name': 'FINAL ANSWER with Quotes',
            'input': 'The city name is found. FINAL ANSWER: "Paris"',
            'expected': 'Paris',  # Quotes should be removed
        },
        {
            'name': 'Missing FINAL ANSWER but Clear Result',
            'input': 'The calculation shows that the answer is 256.',
            'expected_contains': '256',
        },
        {
            'name': 'Multiple Numbers - Should Pick Last',
            'input': 'First we have 10, then 20, and finally the answer is 30.',
            'expected_contains': '30',
        },
    ]

    results = []
    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")
        print(f"Input: {test_case['input']}")

        # Test with both processor and formatter
        extraction_result = processor.process_response(test_case['input'])
        processed_answer = extraction_result.answer
        formatted_answer = formatter.format_answer(test_case['input'])

        print(f"Processor result: '{processed_answer}'")
        print(f"Formatter result: '{formatted_answer}'")

        test_result = {
            'name': test_case['name'],
            'processor_answer': processed_answer,
            'formatter_answer': formatted_answer,
            'passed': True,
            'issues': [],
        }

        # Exact match: fails only when NEITHER answer equals the target.
        if 'expected' in test_case:
            expected = test_case['expected']
            if processed_answer != expected and formatted_answer != expected:
                test_result['passed'] = False
                test_result['issues'].append(
                    f"Expected '{expected}', got processor: '{processed_answer}', "
                    f"formatter: '{formatted_answer}'"
                )

        # Substring match: fails only when NEITHER answer contains the target.
        if 'expected_contains' in test_case:
            fragment = test_case['expected_contains']
            if fragment not in processed_answer and fragment not in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Expected to contain '{fragment}'")

        results.append(test_result)
        if test_result['passed']:
            print("✅ PASSED")
        else:
            print(f"❌ FAILED: {', '.join(test_result['issues'])}")

    return results
def test_response_validation():
    """Check the processor's answer for degenerate and well-formed inputs.

    Only ``EnhancedResponseProcessor.process_response`` is exercised here.
    Each case passes when the returned ``.answer`` equals the case's
    ``expected`` string exactly. The result's ``confidence`` and
    ``strategy.value`` are printed and recorded but not asserted on.

    Returns:
        A list with one dict per case containing ``name``, ``answer``,
        ``confidence``, ``strategy``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing Response Validation...")
    processor = EnhancedResponseProcessor()

    test_cases = [
        {
            'name': 'Empty Response',
            'input': '',
            'expected': 'unknown',
        },
        {
            'name': 'Pure JSON Response',
            'input': '{"result": "test"}',
            'expected': 'unknown',
        },
        {
            'name': 'Tool Call Only',
            'input': '{"name": "calculator", "arguments": {"expression": "2+2"}}',
            'expected': 'unknown',
        },
        {
            'name': 'Valid Simple Answer',
            'input': 'FINAL ANSWER: blue',
            'expected': 'blue',
        },
        {
            'name': 'Long Response with Simple Answer',
            'input': 'This is a very long explanation about the topic that goes on and on with lots of details and background information. FINAL ANSWER: red',
            'expected': 'red',
        },
    ]

    results = []
    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")

        extraction_result = processor.process_response(test_case['input'])
        answer = extraction_result.answer

        print(f"Result: '{answer}'")
        print(f"Confidence: {extraction_result.confidence:.2f}")
        print(f"Strategy: {extraction_result.strategy.value}")

        test_result = {
            'name': test_case['name'],
            'answer': answer,
            'confidence': extraction_result.confidence,
            'strategy': extraction_result.strategy.value,
            'passed': answer == test_case['expected'],
            'issues': [],
        }
        if not test_result['passed']:
            test_result['issues'].append(f"Expected '{test_case['expected']}', got '{answer}'")

        results.append(test_result)
        if test_result['passed']:
            print("✅ PASSED")
        else:
            print(f"❌ FAILED: {', '.join(test_result['issues'])}")

    return results
def main():
    """Run all Phase 3 suites, print a summary, and report overall success.

    The three suites are executed in a fixed order and their per-case
    result dicts are pooled. Each dict must provide ``name``, ``passed``
    and ``issues``.

    Returns:
        True when at least one case ran and 80% or more of the cases
        passed; False otherwise.
    """
    print("🚀 Starting Phase 3: Response Format Enforcement Tests")
    print("=" * 60)

    # Run all test suites (order determines the order of the printed log).
    all_results = [
        *test_json_filtering(),
        *test_final_answer_format_enforcement(),
        *test_response_validation(),
    ]

    # Summary
    print("\n" + "=" * 60)
    print("📊 PHASE 3 TEST SUMMARY")
    print("=" * 60)

    total_tests = len(all_results)
    passed_tests = sum(1 for result in all_results if result['passed'])
    failed_tests = total_tests - passed_tests
    # An empty run reports 0.0% instead of raising ZeroDivisionError.
    success_rate = (passed_tests / total_tests) * 100 if total_tests else 0.0

    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests} ✅")
    print(f"Failed: {failed_tests} ❌")
    print(f"Success Rate: {success_rate:.1f}%")

    if failed_tests > 0:
        print("\n❌ FAILED TESTS:")
        for result in all_results:
            if not result['passed']:
                print(f" - {result['name']}: {', '.join(result['issues'])}")

    # This checklist is static text: it is printed regardless of the results.
    print("\n🎯 PHASE 3 OBJECTIVES:")
    print("✅ JSON tool call filtering implemented")
    print("✅ Response format enforcement strengthened")
    print("✅ Answer validation enhanced")
    print("✅ Tool output leakage prevention added")

    # 80% success rate; a run with zero cases is never treated as a success
    # (without the first operand, 0 >= 0 * 0.8 would be True).
    if total_tests and passed_tests >= total_tests * 0.8:
        print("\n🎉 PHASE 3 IMPLEMENTATION SUCCESSFUL!")
        print("Ready for deployment and evaluation testing.")
        return True

    print("\n⚠️ PHASE 3 NEEDS IMPROVEMENT")
    print("Some tests failed - review and fix issues before deployment.")
    return False
if __name__ == "__main__":
    # Map main()'s boolean verdict onto a process exit status:
    # 0 when the run succeeded, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)