File size: 11,352 Bytes
9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
"""
Calculator 100% Accuracy Fix - TDD Implementation
Comprehensive test suite to achieve 100% calculator accuracy.
"""

import pytest
import sys
import os
import logging
import re
from pathlib import Path

# Add the deployment-ready directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

logger = logging.getLogger(__name__)


class TestCalculator100Accuracy:
    """Test suite to achieve 100% calculator accuracy.

    The agent-backed tests send natural-language arithmetic questions to the
    agent stored on ``self.agent``, pull the first number out of each reply
    with :meth:`extract_numeric_answer`, and compare it with the expected
    value. ``test_answer_extraction_patterns`` checks the extraction helper
    on canned replies and never calls the agent.
    """

    @pytest.fixture(autouse=True)
    def setup_method(self):
        """Create a fresh ``FixedGAIAAgent`` on ``self.agent`` before each test."""
        self.agent = FixedGAIAAgent()

    def extract_numeric_answer(self, response: str) -> str:
        """Extract the numeric answer from an agent response.

        Markdown emphasis characters (``*``, ``_``, backtick) are removed and
        known leading answer phrases are stripped. The first number in the
        remaining text is returned; integers, decimals, negatives, scientific
        notation and thousands-separated values are recognised.

        Args:
            response: Raw text returned by the agent.

        Returns:
            The first number found, with any thousands separators removed
            (``"1,234"`` becomes ``"1234"``). When the text contains no
            number, the cleaned text itself is returned.
        """
        # Strip again after dropping markdown: "** Answer: 5" would otherwise
        # start with a space and defeat the anchored prefix removal below.
        cleaned = re.sub(r'[*_`]', '', response.strip()).strip()

        # Leading phrases only matter for the non-numeric fallback; the number
        # search below finds the value whether or not they are present.
        prefixes_to_remove = [
            'the answer is', 'the result is', 'the calculation gives',
            'this equals', 'equals', 'is equal to', 'the value is',
            'answer:', 'result:', 'solution:', '='
        ]
        for prefix in prefixes_to_remove:
            cleaned = re.sub(rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE)

        # One pattern covers every supported form. The first alternative
        # accepts comma-grouped digits so "1,234" is not cut short at "1";
        # the trailing \b keeps sentence punctuation out of the match.
        number_pattern = r'-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\b'
        match = re.search(number_pattern, cleaned)
        if match:
            # Drop separators so the value compares equal to plain digits and
            # can be parsed by float().
            return match.group(0).replace(',', '')

        # If no number found, return the cleaned response
        return cleaned.strip()

    @staticmethod
    def _answers_match(extracted: str, expected: str, tolerance: float) -> bool:
        """Return True if the strings are equal or numerically within ``tolerance``."""
        if extracted == expected:
            return True
        try:
            return abs(float(extracted) - float(expected)) <= tolerance
        except ValueError:
            # At least one side is not a number and the strings differ.
            return False

    def _run_cases(self, test_cases: list, default_tolerance: float = 0.001) -> list:
        """Ask the agent every question and return one record per failed case.

        Skips the calling test when ``self.agent.available`` is falsy. Each
        case is a dict with ``question``, ``expected`` and ``operation`` keys
        and an optional ``tolerance``. An exception raised while querying the
        agent or extracting the answer is recorded as a failure rather than
        aborting the remaining cases.
        """
        # Checked once up front: the condition does not depend on the case.
        if not self.agent.available:
            pytest.skip("Agent not available for testing")

        failed_operations = []
        for case in test_cases:
            question = case['question']
            expected = case['expected']
            operation = case['operation']

            try:
                result = self.agent(question)
                extracted_answer = self.extract_numeric_answer(result)
            except Exception as e:
                # Deliberately broad: any agent failure counts against accuracy.
                failed_operations.append({
                    'question': question,
                    'expected': expected,
                    'actual': f"ERROR: {e}",
                    'full_response': str(e),
                    'operation': operation
                })
                logger.error(f"❌ {operation} error: {question} → {e}")
                continue

            tolerance = case.get('tolerance', default_tolerance)
            if self._answers_match(extracted_answer, expected, tolerance):
                logger.info(f"✅ {operation} passed: {question} → {extracted_answer}")
                continue

            failed_operations.append({
                'question': question,
                'expected': expected,
                'actual': extracted_answer,
                'full_response': result,
                'operation': operation
            })
            logger.error(f"❌ {operation} failed: {question}")
            logger.error(f"   Expected: {expected}")
            logger.error(f"   Extracted: {extracted_answer}")
            logger.error(f"   Full response: {result}")

        return failed_operations

    @staticmethod
    def _log_accuracy(label: str, total: int, failed: int) -> None:
        """Log the pass percentage for a group of ``total`` cases."""
        passed = total - failed
        accuracy = passed / total * 100
        logger.info(f"📊 {label} accuracy: {accuracy:.1f}% ({passed}/{total})")

    @staticmethod
    def _log_failures(failed_operations: list, log) -> None:
        """Write each failure record through ``log`` (a logger method)."""
        for failure in failed_operations:
            log(f"   {failure['operation']}: {failure['question']}")
            log(f"      Expected: {failure['expected']}")
            log(f"      Got: {failure['actual']}")

    def test_basic_arithmetic_100_percent(self):
        """Test basic arithmetic with 100% accuracy requirement."""
        test_cases = [
            {
                'question': 'Calculate 25 * 17',
                'expected': '425',
                'operation': 'multiplication'
            },
            {
                'question': 'What is 144 divided by 12?',
                'expected': '12',
                'operation': 'division'
            },
            {
                'question': 'Add 100 and 50',
                'expected': '150',
                'operation': 'addition'
            },
            {
                'question': 'Subtract 75 from 200',
                'expected': '125',
                'operation': 'subtraction'
            },
            {
                'question': 'What is 2 to the power of 8?',
                'expected': '256',
                'operation': 'exponentiation'
            }
        ]

        failed_operations = self._run_cases(test_cases)

        # Calculate accuracy
        self._log_accuracy("Calculator", len(test_cases), len(failed_operations))

        # Report failures
        if failed_operations:
            logger.error("❌ Failed operations:")
            self._log_failures(failed_operations, logger.error)

        # Assert 100% accuracy
        assert not failed_operations, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests"

    def test_complex_mathematical_operations(self):
        """Test complex mathematical operations for 100% accuracy.

        This test is report-only: failures are logged as warnings and do not
        fail the test.
        """
        test_cases = [
            {
                'question': 'Calculate the square root of 144',
                'expected': '12',
                'operation': 'square_root'
            },
            {
                'question': 'What is 5 factorial?',
                'expected': '120',
                'operation': 'factorial'
            },
            {
                'question': 'Calculate sin(30 degrees)',
                'expected': '0.5',
                'operation': 'trigonometry',
                'tolerance': 0.01
            },
            {
                'question': 'What is the natural logarithm of e?',
                'expected': '1',
                'operation': 'logarithm',
                'tolerance': 0.01
            }
        ]

        failed_operations = self._run_cases(test_cases)

        # Calculate accuracy
        self._log_accuracy("Complex math", len(test_cases), len(failed_operations))

        # Report results (don't assert for complex operations, just report)
        if failed_operations:
            logger.warning("⚠️ Complex operations that need improvement:")
            self._log_failures(failed_operations, logger.warning)

    def test_answer_extraction_patterns(self):
        """Test various answer extraction patterns to improve accuracy."""
        test_responses = [
            ("The answer is 425", "425"),
            ("This calculation gives us 425.", "425"),
            ("425", "425"),
            ("The result is: 425", "425"),
            ("**Answer: 425**", "425"),
            ("Solution: 425", "425"),
            ("= 425", "425"),
            ("425.0", "425.0"),
            ("-123", "-123"),
            ("1.23e+5", "1.23e+5"),
            # Thousands separators are stripped from the extracted value.
            ("The answer is 1,234", "1234"),
            ("1,234,567.89", "1234567.89"),
        ]

        failed_extractions = []

        for response, expected in test_responses:
            extracted = self.extract_numeric_answer(response)
            if extracted != expected:
                failed_extractions.append({
                    'response': response,
                    'expected': expected,
                    'extracted': extracted
                })
                logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'")
            else:
                logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'")

        # Assert perfect extraction
        assert not failed_extractions, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions"


if __name__ == "__main__":
    # Run the calculator accuracy tests. pytest.main() only *returns* the exit
    # code, so raise SystemExit with it; otherwise running this file directly
    # would exit 0 even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))