File size: 17,789 Bytes
9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
"""
Comprehensive Testing for Phase 4 Tool Selection Optimization

This test suite validates the tool selection optimization implementation
to ensure it addresses the critical evaluation issues identified:
1. Inappropriate tool selection for specific question types
2. Tool usage pattern optimization
3. Dynamic tool selection based on question analysis
4. Tool execution strategy optimization
"""

import pytest
import logging
from typing import List, Dict, Any
from unittest.mock import Mock, patch

# Import the modules to test
from utils.enhanced_question_classifier import (
    EnhancedQuestionClassifier,
    ClassificationResult,
    QuestionType,
    ToolType
)
from utils.tool_selector import (
    ToolSelector,
    ToolSelectionResult,
    ToolExecutionPlan,
    ToolExecutionStrategy,
    ToolPriority
)
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

logger = logging.getLogger(__name__)


class TestEnhancedQuestionClassifier:
    """Exercise EnhancedQuestionClassifier on representative question types."""

    def setup_method(self):
        """Create a fresh classifier before every test."""
        self.classifier = EnhancedQuestionClassifier()

    def test_bird_species_classification(self):
        """Bird-species counting is a knowledge fact served by Wikipedia and Exa."""
        verdict = self.classifier.classify_question(
            "How many bird species are there in the world?"
        )

        assert verdict.question_type == QuestionType.KNOWLEDGE_FACTS
        assert verdict.sub_category == "counting_facts"
        assert ToolType.WIKIPEDIA in verdict.recommended_tools
        assert ToolType.EXA in verdict.recommended_tools
        assert verdict.confidence > 0.8
        assert "bird species" in verdict.reasoning.lower()

    def test_exponentiation_classification(self):
        """A caret power expression is mathematical and recommends Python."""
        verdict = self.classifier.classify_question("What is 2^8?")

        assert verdict.question_type == QuestionType.MATHEMATICAL
        assert verdict.sub_category == "exponentiation"
        assert ToolType.PYTHON in verdict.recommended_tools
        assert verdict.confidence > 0.8
        assert "exponentiation" in verdict.reasoning.lower()

    def test_artist_discography_classification(self):
        """An album-release question is web research and recommends Exa."""
        verdict = self.classifier.classify_question(
            "What albums did Mercedes Sosa release between 2000 and 2009?"
        )

        assert verdict.question_type == QuestionType.WEB_RESEARCH
        assert verdict.sub_category == "artist_discography"
        assert ToolType.EXA in verdict.recommended_tools
        assert verdict.confidence > 0.7
        assert "discography" in verdict.reasoning.lower()

    def test_basic_arithmetic_classification(self):
        """A plain multiplication is basic arithmetic and recommends the calculator."""
        verdict = self.classifier.classify_question("What is 25 * 17?")

        assert verdict.question_type == QuestionType.MATHEMATICAL
        assert verdict.sub_category == "basic_arithmetic"
        assert ToolType.CALCULATOR in verdict.recommended_tools
        assert verdict.confidence > 0.9

    def test_youtube_content_classification(self):
        """A question carrying a YouTube URL is video analysis."""
        verdict = self.classifier.classify_question(
            "What is discussed in this YouTube video? https://youtube.com/watch?v=example"
        )

        assert verdict.question_type == QuestionType.VIDEO_ANALYSIS
        assert ToolType.YOUTUBE in verdict.recommended_tools
        assert verdict.confidence > 0.8

    def test_multimodal_image_classification(self):
        """An attached image file makes the question multimodal image analysis."""
        attachments = [{"type": "image", "path": "test.jpg"}]
        verdict = self.classifier.classify_question(
            "What do you see in this image?", attachments
        )

        assert verdict.question_type == QuestionType.MULTIMODAL
        assert verdict.sub_category == "image_analysis"
        assert ToolType.IMAGE_ANALYSIS in verdict.recommended_tools
        assert verdict.confidence > 0.8


class TestToolSelector:
    """Verify the ToolSelector optimization rules, fallback and statistics."""

    def setup_method(self):
        """Create a fresh selector before every test."""
        self.selector = ToolSelector()

    def test_bird_species_optimization_rule(self):
        """Bird-species counting selects Wikipedia first with Exa as first fallback."""
        selection = self.selector.select_optimal_tools(
            "How many bird species are there in the world?"
        )

        assert selection.primary_plan.tool_type == ToolType.WIKIPEDIA
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert selection.confidence > 0.9
        assert "bird species counting" in selection.optimization_reasoning.lower()
        assert len(selection.fallback_plans) > 0
        assert selection.fallback_plans[0].tool_type == ToolType.EXA

    def test_exponentiation_optimization_rule(self):
        """Exponentiation selects Python and asks for a variable to return."""
        selection = self.selector.select_optimal_tools("What is 2^8?")

        assert selection.primary_plan.tool_type == ToolType.PYTHON
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert selection.confidence > 0.8
        assert "exponentiation" in selection.optimization_reasoning.lower()
        assert "variable_to_return" in selection.primary_plan.parameters

    def test_artist_discography_optimization_rule(self):
        """Artist discography questions select Exa."""
        selection = self.selector.select_optimal_tools(
            "What albums did Mercedes Sosa release between 2000 and 2009?"
        )

        assert selection.primary_plan.tool_type == ToolType.EXA
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert selection.confidence > 0.8
        assert "discography" in selection.optimization_reasoning.lower()

    def test_basic_arithmetic_optimization_rule(self):
        """Basic arithmetic selects the calculator."""
        selection = self.selector.select_optimal_tools("What is 25 * 17?")

        assert selection.primary_plan.tool_type == ToolType.CALCULATOR
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert selection.confidence > 0.9
        assert "arithmetic" in selection.optimization_reasoning.lower()

    def test_youtube_optimization_rule(self):
        """A YouTube URL in the question selects the YouTube tool."""
        selection = self.selector.select_optimal_tools(
            "What is discussed in https://youtube.com/watch?v=example?"
        )

        assert selection.primary_plan.tool_type == ToolType.YOUTUBE
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert selection.confidence > 0.9
        assert "youtube" in selection.optimization_reasoning.lower()

    def test_general_classification_fallback(self):
        """With no specific rule matching, selection is classification-based."""
        selection = self.selector.select_optimal_tools(
            "What is the weather like today?"
        )

        # No dedicated rule applies, so the generic classification path is expected.
        assert selection.primary_plan.tool_type in [ToolType.EXA, ToolType.WIKIPEDIA]
        assert selection.execution_strategy == ToolExecutionStrategy.SEQUENTIAL
        assert "Classification-based selection" in selection.optimization_reasoning

    def test_tool_performance_tracking(self):
        """A single successful update is reflected in the per-tool statistics."""
        # Record one successful Wikipedia call.
        self.selector.update_tool_performance(ToolType.WIKIPEDIA, True, 5.0, 0.9)

        # The stored statistics must show exactly that one call.
        wiki_stats = self.selector.performance_stats[ToolType.WIKIPEDIA]
        assert wiki_stats['usage_count'] == 1
        assert wiki_stats['failure_count'] == 0
        assert wiki_stats['success_rate'] > 0.8
        assert wiki_stats['avg_response_time'] < 10.0

    def test_performance_report_generation(self):
        """The performance report exposes its three sections and a summary rate."""
        summary_report = self.selector.get_tool_performance_report()

        assert 'tool_performance' in summary_report
        assert 'optimization_rules' in summary_report
        assert 'performance_summary' in summary_report
        assert len(summary_report['optimization_rules']) > 0
        assert 'avg_success_rate' in summary_report['performance_summary']


class TestFixedGAIAAgentIntegration:
    """Check that FixedGAIAAgent wires in the tool selection optimization."""

    def setup_method(self):
        """Build an agent with its model and Agent dependencies patched out."""
        # Patching MistralChat and Agent keeps construction free of API keys.
        with patch('agents.fixed_enhanced_unified_agno_agent.MistralChat'):
            with patch('agents.fixed_enhanced_unified_agno_agent.Agent'):
                self.agent = FixedGAIAAgent()
                self.agent.available = True
                self.agent.agent = Mock()

    def test_tool_optimization_integration(self):
        """The agent exposes a classifier and a selector of the expected types."""
        agent = self.agent

        assert hasattr(agent, 'question_classifier')
        assert hasattr(agent, 'tool_selector')
        assert isinstance(agent.question_classifier, EnhancedQuestionClassifier)
        assert isinstance(agent.tool_selector, ToolSelector)

    def test_apply_tool_optimizations_method(self):
        """_apply_tool_optimizations embeds guidance and the original question."""
        question = "What is 2^8?"

        # Hand-built selection result: Python as the critical primary tool,
        # no fallbacks.
        python_plan = ToolExecutionPlan(
            tool_type=ToolType.PYTHON,
            priority=ToolPriority.CRITICAL,
            parameters={"variable_to_return": "result"},
            expected_output="Numeric result",
            success_criteria="Output contains: result",
            fallback_tools=[],
            timeout_seconds=30,
            retry_count=1
        )
        mock_selection = ToolSelectionResult(
            primary_plan=python_plan,
            fallback_plans=[],
            execution_strategy=ToolExecutionStrategy.SEQUENTIAL,
            optimization_reasoning="Exponentiation requires Python",
            confidence=0.9,
            estimated_success_rate=0.85
        )

        # Apply the optimization and inspect the rewritten question text.
        guided_prompt = self.agent._apply_tool_optimizations(question, mock_selection)

        assert "TOOL OPTIMIZATION GUIDANCE" in guided_prompt
        assert "python" in guided_prompt.lower()
        assert "confidence: 0.9" in guided_prompt.lower()
        assert question in guided_prompt


class TestCriticalEvaluationScenarios:
    """Regression scenarios for the specific evaluation issues.

    Every assertion carries a message naming the offending question so that
    a failure inside a loop can be attributed without re-running the test.
    The methods take no arguments because the ``__main__`` runner of this
    module calls them directly.
    """

    def setup_method(self):
        """Create a fresh selector before every test."""
        self.selector = ToolSelector()

    def test_bird_species_not_calculator(self):
        """Bird species questions must not use the calculator ('468' issue)."""
        question = "How many bird species are there in the world?"
        chosen = self.selector.select_optimal_tools(question).primary_plan.tool_type

        # The regression itself: the calculator must never be picked here.
        assert chosen != ToolType.CALCULATOR, \
            f"Calculator selected for: {question}"
        # A knowledge source is expected instead.
        assert chosen in [ToolType.WIKIPEDIA, ToolType.EXA], \
            f"Expected Wikipedia or Exa, got {chosen} for: {question}"

    def test_exponentiation_uses_python(self):
        """Exponentiation in any of its spellings uses Python, not the calculator."""
        questions = [
            "What is 2^8?",
            "Calculate 3 to the power of 4",
            "What is 5**3?"
        ]

        for question in questions:
            plan = self.selector.select_optimal_tools(question).primary_plan
            assert plan.tool_type == ToolType.PYTHON, \
                f"Expected Python, got {plan.tool_type} for: {question}"
            assert "variable_to_return" in plan.parameters, \
                f"Missing 'variable_to_return' parameter for: {question}"

    def test_artist_discography_specific_search(self):
        """Artist discography uses Exa with the artist named in the parameters."""
        question = "What albums did Mercedes Sosa release between 2000 and 2009?"
        plan = self.selector.select_optimal_tools(question).primary_plan

        assert plan.tool_type == ToolType.EXA, \
            f"Expected Exa, got {plan.tool_type} for: {question}"
        # Quotes are stripped from the parameter repr before the name is
        # searched for.
        flattened = str(plan.parameters).replace("'", "").replace('"', '')
        assert "Mercedes Sosa" in flattened, \
            f"Artist name missing from search parameters: {plan.parameters}"

    def test_factual_counting_authoritative_sources(self):
        """Factual counting questions use Wikipedia or Exa, never the calculator."""
        questions = [
            "How many countries are in the world?",
            "How many continents are there?",
            "How many oceans exist?"
        ]

        for question in questions:
            chosen = self.selector.select_optimal_tools(question).primary_plan.tool_type
            # Checked first so a calculator regression is reported as such
            # rather than as a generic membership failure.
            assert chosen != ToolType.CALCULATOR, \
                f"Calculator selected for: {question}"
            assert chosen in [ToolType.WIKIPEDIA, ToolType.EXA], \
                f"Expected Wikipedia or Exa, got {chosen} for: {question}"


class TestToolSelectionConfidence:
    """Check confidence scores, success-rate estimates and fallback quality."""

    def setup_method(self):
        """Create a fresh selector before every test."""
        self.selector = ToolSelector()

    def test_high_confidence_specific_rules(self):
        """Questions covered by a dedicated rule score above 0.8 confidence."""
        high_confidence_questions = (
            "How many bird species are there in the world?",
            "What is 2^8?",
            "What is 25 * 17?",
            "https://youtube.com/watch?v=example",
        )

        for question in high_confidence_questions:
            selection = self.selector.select_optimal_tools(question)
            assert selection.confidence > 0.8, f"Low confidence for: {question}"

    def test_success_rate_estimation(self):
        """The estimated success rate lies in the interval (0.7, 1.0]."""
        selection = self.selector.select_optimal_tools(
            "How many bird species are there in the world?"
        )

        # Lower bound first, then the upper bound.
        assert selection.estimated_success_rate > 0.7
        assert selection.estimated_success_rate <= 1.0

    def test_fallback_strategy_quality(self):
        """At least one fallback exists and none repeats the primary tool."""
        selection = self.selector.select_optimal_tools(
            "How many bird species are there in the world?"
        )

        # A fallback must be offered at all.
        assert len(selection.fallback_plans) > 0

        # Every fallback has to differ from the primary choice.
        main_tool = selection.primary_plan.tool_type
        backup_tools = [backup.tool_type for backup in selection.fallback_plans]
        assert main_tool not in backup_tools


# Integration test scenarios
@pytest.mark.integration
class TestEndToEndOptimization:
    """Run the selector end to end on the questions that used to misbehave."""

    def test_complete_optimization_pipeline(self):
        """Each scenario picks its expected tool, avoids the bad one, and is confident."""
        tool_selector = ToolSelector()

        # Questions that previously caused issues, each with the tool that
        # must be chosen and the tool that must not be.
        scenarios = [
            {
                'question': "How many bird species are there in the world?",
                'expected_tool': ToolType.WIKIPEDIA,
                'should_not_use': ToolType.CALCULATOR
            },
            {
                'question': "What is 2^8?",
                'expected_tool': ToolType.PYTHON,
                'should_not_use': ToolType.CALCULATOR
            },
            {
                'question': "What albums did Mercedes Sosa release between 2000 and 2009?",
                'expected_tool': ToolType.EXA,
                'should_not_use': ToolType.CALCULATOR
            }
        ]

        for scenario in scenarios:
            outcome = tool_selector.select_optimal_tools(scenario['question'])

            # The expected tool is the primary choice.
            assert outcome.primary_plan.tool_type == scenario['expected_tool'], \
                f"Wrong tool for: {scenario['question']}"

            # The problematic tool is not the primary choice.
            assert outcome.primary_plan.tool_type != scenario['should_not_use'], \
                f"Should not use {scenario['should_not_use'].value} for: {scenario['question']}"

            # Confidence is reasonable.
            assert outcome.confidence > 0.7, \
                f"Low confidence for: {scenario['question']}"


if __name__ == "__main__":
    # Script mode: run only the critical regression scenarios directly,
    # without going through the pytest runner. Any failing assertion
    # propagates and aborts the run before the success banner is printed.
    logging.basicConfig(level=logging.INFO)

    print("🧪 Running Phase 4 Tool Selection Optimization Tests")
    print("=" * 60)

    # One shared instance; setup_method is invoked once, by hand.
    test_selector = TestCriticalEvaluationScenarios()
    test_selector.setup_method()

    # (label, test method) pairs, executed in this order.
    critical_checks = [
        ("bird species", test_selector.test_bird_species_not_calculator),
        ("exponentiation", test_selector.test_exponentiation_uses_python),
        ("artist discography", test_selector.test_artist_discography_specific_search),
        ("factual counting", test_selector.test_factual_counting_authoritative_sources),
    ]
    for label, run_check in critical_checks:
        print(f"Testing {label} optimization...")
        run_check()
        print(f"✅ {label.capitalize()} test passed")

    print("\n🎯 All critical optimization tests passed!")
    print("Phase 4 tool selection optimization is working correctly.")