File size: 10,408 Bytes
9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/env python3
"""
Test Phase 1 Improvements - Tool Execution and Answer Formatting

This script tests the critical fixes implemented in Phase 1:
1. Tool execution debugging and validation
2. Enhanced answer formatting with multiple patterns
3. GAIA format compliance validation
4. Comprehensive error handling and fallback systems

Usage:
    python test_phase1_improvements.py
"""

import os
import sys
import logging
from pathlib import Path

# Add the deployment-ready directory to the path
# The directory containing this script is placed first on sys.path so the
# `utils.*` and `agents.*` imports performed inside the tests below can be
# looked up there (presumably those packages sit next to this file - verify).
sys.path.insert(0, str(Path(__file__).parent))

# Configure logging
# Root logger: INFO and above, each record rendered as
# "<timestamp> - <logger name> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)

def test_tool_execution_debugger():
    """Test the ToolExecutionDebugger functionality.

    Exercises three debugger entry points: JSON-syntax detection on a set of
    sample responses, tool-registration validation against a stand-in tool,
    and retrieval of the debug statistics. The values returned by the
    debugger are only logged; they are not compared against expectations.

    The debugger is imported inside the ``try`` so that a missing or broken
    module is reported as a failed test rather than aborting the script.

    Returns:
        bool: True when every call completed without raising, False when the
        import or any debugger call raised an exception.
    """
    logger.info("πŸ”§ Testing ToolExecutionDebugger...")

    try:
        from utils.tool_execution_debugger import ToolExecutionDebugger

        debugger = ToolExecutionDebugger()

        # Test JSON syntax detection
        test_responses = [
            "The answer is 42",  # Normal response
            '{"function": "calculator", "parameters": {"expression": "2+2"}}',  # JSON syntax issue
            "FINAL ANSWER: 42",  # Proper format
            "I need to use the calculator tool: {\"tool\": \"calc\"}",  # Mixed content
        ]

        for i, response in enumerate(test_responses):
            issues = debugger.detect_json_syntax_in_response(response)
            logger.info(f"  Test {i+1}: {'❌ Issues detected' if issues else 'βœ… Clean'} - {issues}")

        # Test tool validation
        # Build the stand-in from a dynamically created class named after the
        # tool. Defining `__class__` as a plain method (as done previously)
        # makes `obj.__class__` a bound method rather than a class, so any
        # `tool.__class__.__name__` lookup would report "__class__". With a
        # real class both `type(tool).__name__` and `tool.__class__.__name__`
        # yield "TestTool".
        tool_name = "TestTool"
        mock_tool_cls = type(tool_name, (), {})
        mock_tool = mock_tool_cls()
        mock_tool.name = tool_name

        validation = debugger.validate_tool_registration(tool_name, mock_tool)
        logger.info(f"  Tool validation: {validation}")

        # Get debug stats
        stats = debugger.get_debug_stats()
        logger.info(f"  Debug stats: {stats}")

        logger.info("βœ… ToolExecutionDebugger tests passed")
        return True

    except Exception as e:
        # Deliberately broad: any failure (including ImportError) marks this
        # test as failed while letting the remaining tests run.
        logger.error(f"❌ ToolExecutionDebugger test failed: {e}")
        return False

def test_enhanced_answer_formatter():
    """Run sample answers through EnhancedGAIAAnswerFormatter and tally the outcome.

    Every case is handed to ``format_answer`` together with its question. A
    case counts as successful when that call (and the logging of its result)
    completes without raising; the formatted text is logged but not compared
    with an expected value. Formatter statistics are logged afterwards.

    Returns:
        bool: True only when every case succeeded. False when at least one
        case raised, or when importing/constructing the formatter or fetching
        its statistics raised.
    """
    logger.info("🎯 Testing EnhancedGAIAAnswerFormatter...")

    def _preview(text):
        # Long inputs are cut to their first 50 characters plus an ellipsis.
        if len(text) > 50:
            return text[:50] + "..."
        return text

    try:
        from utils.enhanced_gaia_answer_formatter import EnhancedGAIAAnswerFormatter

        formatter = EnhancedGAIAAnswerFormatter()

        # One row per case, in the order given by `field_names`.
        field_names = ('input', 'question', 'expected_type', 'description')
        case_rows = [
            # Number formatting
            ("The calculation gives us 1,234.50 as the result.",
             "What is 1000 + 234.5?",
             'number',
             'Number with comma removal'),
            ("FINAL ANSWER: 42",
             "How many items are there?",
             'number',
             'Simple FINAL ANSWER format'),
            # String formatting
            ("The capital of France is Paris.",
             "What is the capital of France?",
             'string',
             'String extraction from sentence'),
            ('FINAL ANSWER: "The Eiffel Tower"',
             "What is the famous tower in Paris?",
             'string',
             'String with quotes removal'),
            # List formatting
            ("The colors are red, blue, and green.",
             "List three primary colors",
             'list',
             'List with "and" removal'),
            ("FINAL ANSWER: apple; banana; orange",
             "Name three fruits",
             'list',
             'List with semicolon separation'),
            # Boolean formatting
            ("Yes, Paris is in France.",
             "Is Paris in France?",
             'boolean',
             'Boolean yes answer'),
            ("No, that is incorrect.",
             "Is London in Germany?",
             'boolean',
             'Boolean no answer'),
            # Complex cases
            ("After analyzing the data, I can conclude that the answer is 3.14159.",
             "What is the value of pi to 5 decimal places?",
             'number',
             'Number extraction from complex text'),
            ("Let me search for this information... The result shows that Einstein was born in 1879.",
             "When was Einstein born?",
             'number',
             'Year extraction from narrative'),
        ]
        test_cases = [dict(zip(field_names, row)) for row in case_rows]

        outcomes = []
        for number, case in enumerate(test_cases, start=1):
            label = case['description']
            # Fields shared by the success and the failure record.
            base_record = {
                'test': number,
                'description': label,
                'input': _preview(case['input']),
            }
            try:
                formatted = formatter.format_answer(case['input'], case['question'])
                outcomes.append({**base_record, 'output': formatted, 'status': 'βœ… Success'})
                logger.info(f"  Test {number}: βœ… {label} β†’ '{formatted}'")
            except Exception as e:
                outcomes.append({**base_record, 'output': f"Error: {e}", 'status': '❌ Failed'})
                logger.error(f"  Test {number}: ❌ {label} failed: {e}")

        # Get formatting statistics
        stats = formatter.get_formatting_stats()
        logger.info(f"  Formatting stats: {stats}")

        # Summary
        total_cases = len(test_cases)
        succeeded = len([entry for entry in outcomes if entry['status'] == 'βœ… Success'])
        logger.info(f"βœ… Enhanced formatter tests: {succeeded}/{total_cases} passed")

        return succeeded == total_cases

    except Exception as e:
        logger.error(f"❌ EnhancedGAIAAnswerFormatter test failed: {e}")
        return False

def test_agent_integration():
    """Check that GAIAAgent can be built and answers a trivial question.

    When the ``MISTRAL_API_KEY`` environment variable is unset or empty the
    check is skipped and reported as passed. Otherwise the agent is created,
    its tool status is logged, and it is asked "What is 2 + 2?".

    Returns:
        bool: True when the check was skipped, or when the agent returned a
        truthy response that is neither "Agent not available" nor
        "Unable to process this question". False when the agent reports
        itself unavailable, returns an empty/error response, or anything
        raises along the way.
    """
    logger.info("πŸ€– Testing agent integration...")

    try:
        # No API key: treat as a skip rather than a failure.
        api_key = os.getenv("MISTRAL_API_KEY")
        if not api_key:
            logger.warning("⚠️ MISTRAL_API_KEY not found - skipping agent integration test")
            return True

        from agents.enhanced_unified_agno_agent import GAIAAgent

        # Initialize agent
        agent = GAIAAgent()
        if not agent.available:
            logger.warning("⚠️ Agent not available - check API key and dependencies")
            return False

        # Report which tools the agent exposes.
        logger.info(f"  Tool status: {agent.get_tool_status()}")

        # Ask one simple question.
        test_question = "What is 2 + 2?"
        logger.info(f"  Testing question: {test_question}")

        try:
            response = agent(test_question)
            logger.info(f"  Response: {response}")

            # A usable response is truthy and differs from both error strings.
            error_markers = ("Agent not available", "Unable to process this question")
            usable = response and all(response != marker for marker in error_markers)
            if not usable:
                logger.warning("⚠️ Agent returned error response")
                return False

            logger.info("βœ… Agent integration test passed")
            return True

        except Exception as e:
            logger.error(f"❌ Agent execution failed: {e}")
            return False

    except Exception as e:
        logger.error(f"❌ Agent integration test failed: {e}")
        return False

def run_phase1_tests():
    """Execute the three Phase 1 checks in order and log a summary.

    Runs the tool-debugger, answer-formatter and agent-integration checks,
    logs a PASSED/FAILED line for each plus the overall count, and logs a
    closing message depending on whether everything passed.

    Returns:
        bool: True when every check returned a truthy result, else False.
    """
    separator = "=" * 60

    logger.info("πŸš€ Starting Phase 1 Improvement Tests")
    logger.info(separator)

    # Result key -> check function, in execution order.
    suite = (
        ('tool_debugger', test_tool_execution_debugger),
        ('answer_formatter', test_enhanced_answer_formatter),
        ('agent_integration', test_agent_integration),
    )
    test_results = {}
    for key, check in suite:
        test_results[key] = check()

    # Summary
    logger.info(separator)
    logger.info("πŸ“Š Phase 1 Test Results Summary:")

    for test_name, result in test_results.items():
        if result:
            status = "βœ… PASSED"
        else:
            status = "❌ FAILED"
        logger.info(f"  {test_name}: {status}")

    total_tests = len(test_results)
    passed_tests = len([outcome for outcome in test_results.values() if outcome])
    logger.info(f"\nOverall: {passed_tests}/{total_tests} tests passed")

    all_passed = passed_tests == total_tests
    if not all_passed:
        logger.warning("⚠️ Some tests failed - review logs and fix issues before proceeding")
    else:
        logger.info("πŸŽ‰ All Phase 1 improvements are working correctly!")
        logger.info("πŸ“ˆ Ready to proceed with Phase 2 (Answer Formatting Enhancement)")

    return all_passed

if __name__ == "__main__":
    # Script entry point: the process exit status is 0 when all checks
    # passed and 1 otherwise.
    success = run_phase1_tests()
    raise SystemExit(0 if success else 1)