Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Test Phase 3: Response Format Enforcement | |
Tests the strengthened response processing to eliminate JSON tool calls and complex responses. | |
""" | |
import sys | |
import os | |
import logging | |
from pathlib import Path | |
# Add the deployment-ready directory to the path | |
sys.path.insert(0, str(Path(__file__).parent)) | |
from utils.response_processor import EnhancedResponseProcessor | |
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter | |
# Set up root logging once for the script: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def test_json_filtering():
    """Check that JSON tool calls are stripped from extracted answers.

    Every sample response is passed to both ``EnhancedResponseProcessor`` and
    ``FixedGAIAAnswerFormatter``. A case fails when either output contains a
    forbidden fragment, when neither output contains the expected answer, or
    when both outputs equal ``"unknown"`` although an answer was expected.

    Returns:
        list[dict]: One record per case with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    # NOTE(review): the leading glyphs in the printed messages look like
    # mis-decoded emoji; they are kept verbatim here.
    print("\nπ§ͺ Testing JSON Tool Call Filtering...")
    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    # Sample responses taken from the evaluation issues.
    # The 'expected_type' key on the first case is not read by this function.
    cases = [
        {
            'name': 'JSON Tool Call Response',
            'input': '{"name": "search_exa", "arguments": {"query": "Stargate SG-1 Season 1 Episode 1 script"}}',
            'expected_type': 'simple_answer',
            'should_not_contain': ['{"name"', '"arguments"', 'search_exa'],
        },
        {
            'name': 'Math Table Question with JSON',
            'input': 'I need to search for this information. {"name": "search_exa", "arguments": {"query": "math table"}} The answer is a, b, c, d, e',
            'expected_answer': 'a, b, c, d, e',
            'should_not_contain': ['{"name"', '"arguments"'],
        },
        {
            'name': 'YouTube Video Question with Tool Call',
            'input': 'Let me search for this video. {"name": "firecrawl", "arguments": {"url": "youtube.com"}} The video is about cats.',
            'expected_answer': 'cats',
            'should_not_contain': ['{"name"', '"arguments"', 'firecrawl'],
        },
        {
            'name': 'Simple Math with FINAL ANSWER',
            'input': 'Let me calculate this. The result is 425. FINAL ANSWER: 425',
            'expected_answer': '425',
            'should_not_contain': [],
        },
        {
            'name': 'Complex Response with Tool Output',
            'input': (
                "I'll help you find this information.\n"
                '{"name": "wikipedia", "arguments": {"query": "Paris France capital"}}\n'
                "Based on the search results, Paris is the capital of France.\n"
                "FINAL ANSWER: Paris"
            ),
            'expected_answer': 'Paris',
            'should_not_contain': ['{"name"', '"arguments"', 'wikipedia'],
        },
    ]

    outcomes = []
    for case in cases:
        print(f"\nπ Testing: {case['name']}")
        raw = case['input']
        print(f"Input: {raw[:100]}...")

        # Run the same raw text through both extraction paths.
        from_processor = processor.process_response(raw).answer
        from_formatter = formatter.format_answer(raw)
        print(f"Processor result: '{from_processor}'")
        print(f"Formatter result: '{from_formatter}'")

        # A forbidden fragment in either output counts as leakage.
        problems = [
            f"Contains unwanted content: {fragment}"
            for fragment in case['should_not_contain']
            if fragment in from_processor or fragment in from_formatter
        ]

        if 'expected_answer' in case:
            wanted = case['expected_answer']
            # It is enough for one of the two outputs to contain the answer.
            if not (wanted in from_processor or wanted in from_formatter):
                problems.append(f"Missing expected answer: {wanted}")
            # Both paths giving up on a case that has an answer is reported separately.
            if from_processor == "unknown" and from_formatter == "unknown":
                problems.append("Both processor and formatter returned 'unknown'")

        outcome = {
            'name': case['name'],
            'processor_answer': from_processor,
            'formatter_answer': from_formatter,
            'passed': not problems,
            'issues': problems,
        }
        outcomes.append(outcome)

        if outcome['passed']:
            print("β PASSED")
        else:
            print(f"β FAILED: {', '.join(problems)}")

    return outcomes
def test_final_answer_format_enforcement():
    """Check extraction and normalisation of ``FINAL ANSWER:`` responses.

    Every sample is passed to both ``EnhancedResponseProcessor`` and
    ``FixedGAIAAnswerFormatter``. Cases with an ``expected`` key pass when at
    least one output equals it exactly; cases with ``expected_contains`` pass
    when at least one output contains it as a substring.

    Returns:
        list[dict]: One record per case with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    # NOTE(review): the leading glyphs in the printed messages look like
    # mis-decoded emoji; they are kept verbatim here.
    print("\nπ§ͺ Testing FINAL ANSWER Format Enforcement...")
    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    cases = [
        {
            'name': 'Proper FINAL ANSWER Format',
            'input': 'After calculation, the result is clear. FINAL ANSWER: 42',
            'expected': '42',
        },
        {
            'name': 'FINAL ANSWER with Commas in Numbers',
            'input': 'The total count is significant. FINAL ANSWER: 1,234',
            'expected': '1234',  # Commas should be removed
        },
        {
            'name': 'FINAL ANSWER with Quotes',
            'input': 'The city name is found. FINAL ANSWER: "Paris"',
            'expected': 'Paris',  # Quotes should be removed
        },
        {
            'name': 'Missing FINAL ANSWER but Clear Result',
            'input': 'The calculation shows that the answer is 256.',
            'expected_contains': '256',
        },
        {
            'name': 'Multiple Numbers - Should Pick Last',
            'input': 'First we have 10, then 20, and finally the answer is 30.',
            'expected_contains': '30',
        },
    ]

    outcomes = []
    for case in cases:
        raw = case['input']
        print(f"\nπ Testing: {case['name']}")
        print(f"Input: {raw}")

        # Run the same raw text through both extraction paths.
        from_processor = processor.process_response(raw).answer
        from_formatter = formatter.format_answer(raw)
        print(f"Processor result: '{from_processor}'")
        print(f"Formatter result: '{from_formatter}'")

        problems = []

        # Exact-match expectation: fails only if neither output equals it.
        if 'expected' in case:
            exact = case['expected']
            matches_neither = from_processor != exact and from_formatter != exact
            if matches_neither:
                problems.append(
                    f"Expected '{exact}', got processor: '{from_processor}', formatter: '{from_formatter}'"
                )

        # Substring expectation: fails only if neither output contains it.
        if 'expected_contains' in case:
            needle = case['expected_contains']
            if not (needle in from_processor or needle in from_formatter):
                problems.append(f"Expected to contain '{needle}'")

        outcome = {
            'name': case['name'],
            'processor_answer': from_processor,
            'formatter_answer': from_formatter,
            'passed': not problems,
            'issues': problems,
        }
        outcomes.append(outcome)

        if outcome['passed']:
            print("β PASSED")
        else:
            print(f"β FAILED: {', '.join(problems)}")

    return outcomes
def test_response_validation():
    """Check the processor's answer for empty, JSON-only and simple inputs.

    Only ``EnhancedResponseProcessor`` is exercised here. A case passes when
    the extracted answer equals the case's ``expected`` value exactly. The
    confidence and strategy reported by the processor are printed and stored
    in the record but are not part of the pass criterion.

    Returns:
        list[dict]: One record per case with the keys ``name``, ``answer``,
        ``confidence``, ``strategy``, ``passed`` and ``issues``.
    """
    # NOTE(review): the leading glyphs in the printed messages look like
    # mis-decoded emoji; they are kept verbatim here.
    print("\nπ§ͺ Testing Response Validation...")
    processor = EnhancedResponseProcessor()

    cases = [
        {
            'name': 'Empty Response',
            'input': '',
            'expected': 'unknown',
        },
        {
            'name': 'Pure JSON Response',
            'input': '{"result": "test"}',
            'expected': 'unknown',
        },
        {
            'name': 'Tool Call Only',
            'input': '{"name": "calculator", "arguments": {"expression": "2+2"}}',
            'expected': 'unknown',
        },
        {
            'name': 'Valid Simple Answer',
            'input': 'FINAL ANSWER: blue',
            'expected': 'blue',
        },
        {
            'name': 'Long Response with Simple Answer',
            'input': 'This is a very long explanation about the topic that goes on and on with lots of details and background information. FINAL ANSWER: red',
            'expected': 'red',
        },
    ]

    outcomes = []
    for case in cases:
        print(f"\nπ Testing: {case['name']}")
        extraction = processor.process_response(case['input'])

        answer = extraction.answer
        print(f"Result: '{answer}'")
        confidence = extraction.confidence
        print(f"Confidence: {confidence:.2f}")
        strategy_name = extraction.strategy.value
        print(f"Strategy: {strategy_name}")

        ok = answer == case['expected']
        problems = [] if ok else [f"Expected '{case['expected']}', got '{answer}'"]

        outcomes.append({
            'name': case['name'],
            'answer': answer,
            'confidence': confidence,
            'strategy': strategy_name,
            'passed': ok,
            'issues': problems,
        })

        if ok:
            print("β PASSED")
        else:
            print(f"β FAILED: {', '.join(problems)}")

    return outcomes
def main():
    """Run every Phase 3 suite, print a summary and report overall success.

    The three suites run in order (JSON filtering, FINAL ANSWER format,
    response validation) and their per-case records are pooled.

    Returns:
        bool: ``True`` when at least 80% of the pooled cases passed,
        ``False`` otherwise.
    """
    # NOTE(review): the leading glyphs in the printed messages look like
    # mis-decoded emoji; they are kept verbatim here.
    divider = "=" * 60

    print("π Starting Phase 3: Response Format Enforcement Tests")
    print(divider)

    # Run all suites in order and pool their per-case records.
    all_results = [
        *test_json_filtering(),
        *test_final_answer_format_enforcement(),
        *test_response_validation(),
    ]

    # Summary
    print("\n" + divider)
    print("π PHASE 3 TEST SUMMARY")
    print(divider)

    total_tests = len(all_results)
    failures = [record for record in all_results if not record['passed']]
    passed_tests = total_tests - len(failures)
    failed_tests = total_tests - passed_tests

    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests} β ")
    print(f"Failed: {failed_tests} β")
    print(f"Success Rate: {(passed_tests/total_tests)*100:.1f}%")

    if failed_tests > 0:
        print("\nβ FAILED TESTS:")
        for record in failures:
            print(f" - {record['name']}: {', '.join(record['issues'])}")

    # This list is printed unconditionally, whatever the test results were.
    print("\nπ― PHASE 3 OBJECTIVES:")
    for objective in (
        "β JSON tool call filtering implemented",
        "β Response format enforcement strengthened",
        "β Answer validation enhanced",
        "β Tool output leakage prevention added",
    ):
        print(objective)

    # Overall verdict: an 80% pass rate is the success threshold.
    if passed_tests < total_tests * 0.8:
        print("\nβ οΈ PHASE 3 NEEDS IMPROVEMENT")
        print("Some tests failed - review and fix issues before deployment.")
        return False

    print("\nπ PHASE 3 IMPLEMENTATION SUCCESSFUL!")
    print("Ready for deployment and evaluation testing.")
    return True
if __name__ == "__main__":
    # Exit status 0 when main() reports success, 1 otherwise.
    sys.exit(int(not main()))