#!/usr/bin/env python3
"""
Test Phase 3: Response Format Enforcement
Tests the strengthened response processing to eliminate JSON tool calls and complex responses.

Each suite feeds canned model responses through ``EnhancedResponseProcessor``
(and, for the first two suites, ``FixedGAIAAnswerFormatter``), prints a
per-case verdict and returns a list of result dicts. ``main`` aggregates the
results and the script exits 0 when at least 80% of the cases passed.
"""

import sys
import os
import logging
from pathlib import Path

# Add the deployment-ready directory to the path
sys.path.insert(0, str(Path(__file__).parent))

from utils.response_processor import EnhancedResponseProcessor
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def _report_outcome(test_result):
    """Print a one-line PASSED/FAILED verdict for a single result dict.

    Args:
        test_result: dict with a boolean ``'passed'`` entry and an
            ``'issues'`` list of human-readable failure descriptions.
    """
    if test_result['passed']:
        print("✅ PASSED")
    else:
        print(f"❌ FAILED: {', '.join(test_result['issues'])}")


def test_json_filtering():
    """Test that JSON tool calls are properly filtered out.

    Returns:
        A list of result dicts, one per case, with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing JSON Tool Call Filtering...")

    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    # Test cases from the evaluation issues.
    # NOTE: 'expected_type' is informational only; nothing below checks it.
    test_cases = [
        {
            'name': 'JSON Tool Call Response',
            'input': '{"name": "search_exa", "arguments": {"query": "Stargate SG-1 Season 1 Episode 1 script"}}',
            'expected_type': 'simple_answer',
            'should_not_contain': ['{"name"', '"arguments"', 'search_exa']
        },
        {
            'name': 'Math Table Question with JSON',
            'input': 'I need to search for this information. {"name": "search_exa", "arguments": {"query": "math table"}} The answer is a, b, c, d, e',
            'expected_answer': 'a, b, c, d, e',
            'should_not_contain': ['{"name"', '"arguments"']
        },
        {
            'name': 'YouTube Video Question with Tool Call',
            'input': 'Let me search for this video. {"name": "firecrawl", "arguments": {"url": "youtube.com"}} The video is about cats.',
            'expected_answer': 'cats',
            'should_not_contain': ['{"name"', '"arguments"', 'firecrawl']
        },
        {
            'name': 'Simple Math with FINAL ANSWER',
            'input': 'Let me calculate this. The result is 425. FINAL ANSWER: 425',
            'expected_answer': '425',
            'should_not_contain': []
        },
        {
            'name': 'Complex Response with Tool Output',
            # Multi-line response: prose, an embedded tool call, more prose,
            # then the FINAL ANSWER marker.
            'input': '''I'll help you find this information.

{"name": "wikipedia", "arguments": {"query": "Paris France capital"}}

Based on the search results, Paris is the capital of France.

FINAL ANSWER: Paris''',
            'expected_answer': 'Paris',
            'should_not_contain': ['{"name"', '"arguments"', 'wikipedia']
        }
    ]

    results = []

    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")
        print(f"Input: {test_case['input'][:100]}...")

        # Test with response processor
        extraction_result = processor.process_response(test_case['input'])
        processed_answer = extraction_result.answer

        # Test with answer formatter
        formatted_answer = formatter.format_answer(test_case['input'])

        print(f"Processor result: '{processed_answer}'")
        print(f"Formatter result: '{formatted_answer}'")

        # Validate results
        test_result = {
            'name': test_case['name'],
            'processor_answer': processed_answer,
            'formatter_answer': formatted_answer,
            'passed': True,
            'issues': []
        }

        # Check that unwanted content is not present (in EITHER output)
        for unwanted in test_case['should_not_contain']:
            if unwanted in processed_answer or unwanted in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Contains unwanted content: {unwanted}")

        # Check expected answer if specified; the case only fails when
        # NEITHER the processor nor the formatter output contains it.
        if 'expected_answer' in test_case:
            expected_answer = test_case['expected_answer']
            if expected_answer not in processed_answer and expected_answer not in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Missing expected answer: {expected_answer}")

        # Check that answer is not "unknown" for valid inputs
        if processed_answer == "unknown" and formatted_answer == "unknown" and 'expected_answer' in test_case:
            test_result['passed'] = False
            test_result['issues'].append("Both processor and formatter returned 'unknown'")

        results.append(test_result)
        _report_outcome(test_result)

    return results


def test_final_answer_format_enforcement():
    """Test that FINAL ANSWER format is properly enforced.

    Cases carry either ``'expected'`` (exact match) or ``'expected_contains'``
    (substring match). A case passes when the processor OR the formatter
    output satisfies the expectation.

    Returns:
        A list of result dicts, one per case, with the keys ``name``,
        ``processor_answer``, ``formatter_answer``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing FINAL ANSWER Format Enforcement...")

    processor = EnhancedResponseProcessor()
    formatter = FixedGAIAAnswerFormatter()

    test_cases = [
        {
            'name': 'Proper FINAL ANSWER Format',
            'input': 'After calculation, the result is clear. FINAL ANSWER: 42',
            'expected': '42'
        },
        {
            'name': 'FINAL ANSWER with Commas in Numbers',
            'input': 'The total count is significant. FINAL ANSWER: 1,234',
            'expected': '1234'  # Commas should be removed
        },
        {
            'name': 'FINAL ANSWER with Quotes',
            'input': 'The city name is found. FINAL ANSWER: "Paris"',
            'expected': 'Paris'  # Quotes should be removed
        },
        {
            'name': 'Missing FINAL ANSWER but Clear Result',
            'input': 'The calculation shows that the answer is 256.',
            'expected_contains': '256'
        },
        {
            'name': 'Multiple Numbers - Should Pick Last',
            'input': 'First we have 10, then 20, and finally the answer is 30.',
            'expected_contains': '30'
        }
    ]

    results = []

    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")
        print(f"Input: {test_case['input']}")

        # Test with both processor and formatter
        extraction_result = processor.process_response(test_case['input'])
        processed_answer = extraction_result.answer
        formatted_answer = formatter.format_answer(test_case['input'])

        print(f"Processor result: '{processed_answer}'")
        print(f"Formatter result: '{formatted_answer}'")

        test_result = {
            'name': test_case['name'],
            'processor_answer': processed_answer,
            'formatter_answer': formatted_answer,
            'passed': True,
            'issues': []
        }

        # Check expected exact match
        if 'expected' in test_case:
            expected = test_case['expected']
            if processed_answer != expected and formatted_answer != expected:
                test_result['passed'] = False
                test_result['issues'].append(
                    f"Expected '{expected}', got processor: '{processed_answer}', "
                    f"formatter: '{formatted_answer}'"
                )

        # Check expected contains
        if 'expected_contains' in test_case:
            fragment = test_case['expected_contains']
            if fragment not in processed_answer and fragment not in formatted_answer:
                test_result['passed'] = False
                test_result['issues'].append(f"Expected to contain '{fragment}'")

        results.append(test_result)
        _report_outcome(test_result)

    return results


def test_response_validation():
    """Test response validation and format compliance.

    Only the processor is exercised here; each case requires its answer to
    equal ``'expected'`` exactly.

    Returns:
        A list of result dicts, one per case, with the keys ``name``,
        ``answer``, ``confidence``, ``strategy``, ``passed`` and ``issues``.
    """
    print("\n🧪 Testing Response Validation...")

    processor = EnhancedResponseProcessor()

    test_cases = [
        {
            'name': 'Empty Response',
            'input': '',
            'expected': 'unknown'
        },
        {
            'name': 'Pure JSON Response',
            'input': '{"result": "test"}',
            'expected': 'unknown'
        },
        {
            'name': 'Tool Call Only',
            'input': '{"name": "calculator", "arguments": {"expression": "2+2"}}',
            'expected': 'unknown'
        },
        {
            'name': 'Valid Simple Answer',
            'input': 'FINAL ANSWER: blue',
            'expected': 'blue'
        },
        {
            'name': 'Long Response with Simple Answer',
            'input': 'This is a very long explanation about the topic that goes on and on with lots of details and background information. FINAL ANSWER: red',
            'expected': 'red'
        }
    ]

    results = []

    for test_case in test_cases:
        print(f"\n📝 Testing: {test_case['name']}")

        extraction_result = processor.process_response(test_case['input'])
        answer = extraction_result.answer

        print(f"Result: '{answer}'")
        print(f"Confidence: {extraction_result.confidence:.2f}")
        print(f"Strategy: {extraction_result.strategy.value}")

        test_result = {
            'name': test_case['name'],
            'answer': answer,
            'confidence': extraction_result.confidence,
            'strategy': extraction_result.strategy.value,
            'passed': answer == test_case['expected'],
            'issues': []
        }

        if not test_result['passed']:
            test_result['issues'].append(f"Expected '{test_case['expected']}', got '{answer}'")

        results.append(test_result)
        _report_outcome(test_result)

    return results


def main():
    """Run all Phase 3 tests.

    Returns:
        True when at least 80% of all cases passed, False otherwise.
    """
    print("🚀 Starting Phase 3: Response Format Enforcement Tests")
    print("=" * 60)

    all_results = []

    # Run all test suites
    json_results = test_json_filtering()
    format_results = test_final_answer_format_enforcement()
    validation_results = test_response_validation()

    all_results.extend(json_results)
    all_results.extend(format_results)
    all_results.extend(validation_results)

    # Summary
    print("\n" + "=" * 60)
    print("📊 PHASE 3 TEST SUMMARY")
    print("=" * 60)

    total_tests = len(all_results)
    passed_tests = sum(1 for result in all_results if result['passed'])
    failed_tests = total_tests - passed_tests
    # Guard the percentage so an empty result set cannot raise ZeroDivisionError.
    success_rate = (passed_tests / total_tests) * 100 if total_tests else 0.0

    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests} ✅")
    print(f"Failed: {failed_tests} ❌")
    print(f"Success Rate: {success_rate:.1f}%")

    if failed_tests > 0:
        print("\n❌ FAILED TESTS:")
        for result in all_results:
            if not result['passed']:
                print(f"  - {result['name']}: {', '.join(result['issues'])}")

    print("\n🎯 PHASE 3 OBJECTIVES:")
    print("✅ JSON tool call filtering implemented")
    print("✅ Response format enforcement strengthened")
    print("✅ Answer validation enhanced")
    print("✅ Tool output leakage prevention added")

    if passed_tests >= total_tests * 0.8:  # 80% success rate
        print("\n🎉 PHASE 3 IMPLEMENTATION SUCCESSFUL!")
        print("Ready for deployment and evaluation testing.")
        return True
    else:
        print("\n⚠️ PHASE 3 NEEDS IMPROVEMENT")
        print("Some tests failed - review and fix issues before deployment.")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)