# Page residue from the hosting site's file viewer, kept as a comment:
# Spaces: Running | File size: 11,352 Bytes | revision 9a6a4dc
"""
Calculator 100% Accuracy Fix - TDD Implementation
Comprehensive test suite to achieve 100% calculator accuracy.
"""
import pytest
import sys
import os
import logging
import re
from pathlib import Path
# Put the parent of this file's directory (the "deployment-ready" directory,
# per the original note) at the front of sys.path; presumably this is what
# lets the project-local `agents` package imported below resolve.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class TestCalculator100Accuracy:
    """Pytest suite whose goal is 100% calculator accuracy."""

    @pytest.fixture(autouse=True)
    def setup_method(self):
        """Autouse fixture: store a new ``FixedGAIAAgent`` on ``self.agent``."""
        self.agent = FixedGAIAAgent()
def extract_numeric_answer(self, response: str) -> str:
    """Extract the numeric answer from an agent response.

    Markdown markers (``*``, ``_`` and backticks) are removed and a leading
    answer phrase such as "The answer is" or "=" is stripped. The number is
    then chosen as follows:

    * if the response led with an answer phrase, the first number wins;
    * otherwise, if the text contains ``=``, the first number after the
      last ``=`` wins, so an echoed equation like ``25 * 17 = 425`` yields
      the result rather than an operand;
    * otherwise the first number in the text wins.

    Integers, decimals, negatives and scientific notation are recognised.
    Thousands separators are accepted and removed (``1,234`` -> ``1234``) so
    the result can be passed to ``float()``.

    Args:
        response: Raw text returned by the agent. ``None`` is treated as an
            empty response; other non-string values are converted with
            ``str()``.

    Returns:
        The selected number as a string, or the cleaned response text when
        it contains no number.
    """
    text = "" if response is None else str(response)
    # Remove markdown formatting so "**425**" reads as "425".
    cleaned = re.sub(r'[*_`]', '', text.strip())

    answer_prefixes = (
        'the answer is', 'the result is', 'the calculation gives',
        'this equals', 'equals', 'is equal to', 'the value is',
        'answer:', 'result:', 'solution:', '=',
    )
    # Strip a leading answer phrase and remember whether one was present:
    # a response that opens with "The answer is ..." states its answer first.
    led_by_answer_phrase = False
    for prefix in answer_prefixes:
        cleaned, hits = re.subn(
            rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE
        )
        if hits:
            led_by_answer_phrase = True

    # One pattern covers integers, decimals and scientific notation. The
    # first alternative accepts comma-grouped thousands; the trailing \b
    # keeps sentence punctuation out of the match.
    number_pattern = re.compile(
        r'-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\b'
    )

    match = None
    if not led_by_answer_phrase and '=' in cleaned:
        # "25 * 17 = 425": the result is on the right of the last '='.
        match = number_pattern.search(cleaned.rpartition('=')[2])
    if match is None:
        match = number_pattern.search(cleaned)
    if match is None:
        # If no number found, return the cleaned response.
        return cleaned.strip()
    return match.group(0).replace(',', '')
def test_basic_arithmetic_100_percent(self):
    """Test basic arithmetic with a 100% accuracy requirement.

    Five questions (multiplication, division, addition, subtraction and
    exponentiation) are sent to ``self.agent``. Each reply is reduced to a
    number with ``self.extract_numeric_answer`` and compared with the
    expected value, first as text and then numerically within 0.001. An
    exception raised while asking or extracting counts as a failed case.

    The test is skipped when ``self.agent.available`` is falsy and fails
    unless every case passes.
    """
    test_cases = [
        {
            'question': 'Calculate 25 * 17',
            'expected': '425',
            'operation': 'multiplication'
        },
        {
            'question': 'What is 144 divided by 12?',
            'expected': '12',
            'operation': 'division'
        },
        {
            'question': 'Add 100 and 50',
            'expected': '150',
            'operation': 'addition'
        },
        {
            'question': 'Subtract 75 from 200',
            'expected': '125',
            'operation': 'subtraction'
        },
        {
            'question': 'What is 2 to the power of 8?',
            'expected': '256',
            'operation': 'exponentiation'
        }
    ]
    # Availability does not depend on the case, so decide once up front.
    if not self.agent.available:
        pytest.skip("Agent not available for testing")

    failed_operations = []
    for case in test_cases:
        expected = case['expected']
        try:
            result = self.agent(case['question'])
            # Extract numeric answer
            extracted_answer = self.extract_numeric_answer(result)
        except Exception as e:
            # Deliberately broad: any agent error is recorded as a failed
            # case so the remaining questions still run.
            failed_operations.append({
                'question': case['question'],
                'expected': expected,
                'actual': f"ERROR: {e}",
                'full_response': str(e),
                'operation': case['operation']
            })
            logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")
            continue

        if extracted_answer == expected:
            logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
            continue

        # Text differs: accept numerically equal answers such as "425.0".
        try:
            close_enough = abs(float(extracted_answer) - float(expected)) < 0.001
        except ValueError:
            close_enough = False
        if close_enough:
            logger.info(f"✅ {case['operation']} passed (float): {case['question']} → {extracted_answer}")
            continue

        failed_operations.append({
            'question': case['question'],
            'expected': expected,
            'actual': extracted_answer,
            'full_response': result,
            'operation': case['operation']
        })
        logger.error(f"❌ {case['operation']} failed: {case['question']}")
        logger.error(f" Expected: {expected}")
        logger.error(f" Extracted: {extracted_answer}")
        logger.error(f" Full response: {result}")

    # Calculate accuracy
    passed = len(test_cases) - len(failed_operations)
    accuracy = passed / len(test_cases) * 100
    logger.info(f"📊 Calculator accuracy: {accuracy:.1f}% ({passed}/{len(test_cases)})")

    # Report failures
    if failed_operations:
        logger.error("❌ Failed operations:")
        for failure in failed_operations:
            logger.error(f" {failure['operation']}: {failure['question']}")
            logger.error(f" Expected: {failure['expected']}")
            logger.error(f" Got: {failure['actual']}")

    # Assert 100% accuracy
    assert len(failed_operations) == 0, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests"
def test_complex_mathematical_operations(self):
    """Report accuracy on more complex mathematical operations.

    Four questions (square root, factorial, trigonometry, logarithm) are
    sent to ``self.agent``. Each reply is reduced to a number with
    ``self.extract_numeric_answer`` and compared numerically with the
    expected value, using the case's ``tolerance`` (default 0.001). When the
    extracted text is not a number, an exact text comparison is used instead.

    The test is skipped when ``self.agent.available`` is falsy. Failures
    are only logged as warnings; this test never asserts on them.
    """
    test_cases = [
        {
            'question': 'Calculate the square root of 144',
            'expected': '12',
            'operation': 'square_root'
        },
        {
            'question': 'What is 5 factorial?',
            'expected': '120',
            'operation': 'factorial'
        },
        {
            'question': 'Calculate sin(30 degrees)',
            'expected': '0.5',
            'operation': 'trigonometry',
            'tolerance': 0.01
        },
        {
            'question': 'What is the natural logarithm of e?',
            'expected': '1',
            'operation': 'logarithm',
            'tolerance': 0.01
        }
    ]
    # Availability does not depend on the case, so decide once up front.
    if not self.agent.available:
        pytest.skip("Agent not available for testing")

    failed_operations = []
    for case in test_cases:
        expected = case['expected']
        tolerance = case.get('tolerance', 0.001)
        try:
            result = self.agent(case['question'])
            # Extract numeric answer
            extracted_answer = self.extract_numeric_answer(result)
        except Exception as e:
            # Deliberately broad: any agent error is recorded as a failed
            # case so the remaining questions still run.
            failed_operations.append({
                'question': case['question'],
                'expected': expected,
                'actual': f"ERROR: {e}",
                'full_response': str(e),
                'operation': case['operation']
            })
            logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")
            continue

        try:
            matched = abs(float(extracted_answer) - float(expected)) <= tolerance
        except ValueError:
            # Not parseable as a number: fall back to an exact string match.
            matched = extracted_answer == expected
        if matched:
            logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
            continue

        failed_operations.append({
            'question': case['question'],
            'expected': expected,
            'actual': extracted_answer,
            'full_response': result,
            'operation': case['operation']
        })
        logger.error(f"❌ {case['operation']} failed: {case['question']}")
        logger.error(f" Expected: {expected}")
        logger.error(f" Extracted: {extracted_answer}")

    # Calculate accuracy
    passed = len(test_cases) - len(failed_operations)
    accuracy = passed / len(test_cases) * 100
    logger.info(f"📊 Complex math accuracy: {accuracy:.1f}% ({passed}/{len(test_cases)})")

    # Report results (don't assert for complex operations, just report)
    if failed_operations:
        logger.warning("⚠️ Complex operations that need improvement:")
        for failure in failed_operations:
            logger.warning(f" {failure['operation']}: {failure['question']}")
            logger.warning(f" Expected: {failure['expected']}")
            logger.warning(f" Got: {failure['actual']}")
def test_answer_extraction_patterns(self):
    """Check ``extract_numeric_answer`` against a table of reply formats.

    Each entry pairs a sample response with the exact string the extractor
    must return. Every mismatch is logged and collected, and the test fails
    if any entry mismatches. No agent call is made here.
    """
    test_responses = [
        ("The answer is 425", "425"),
        ("This calculation gives us 425.", "425"),
        ("425", "425"),
        ("The result is: 425", "425"),
        ("**Answer: 425**", "425"),
        ("Solution: 425", "425"),
        ("= 425", "425"),
        ("425.0", "425.0"),
        ("-123", "-123"),
        ("1.23e+5", "1.23e+5"),
    ]
    failed_extractions = []
    for response, expected in test_responses:
        extracted = self.extract_numeric_answer(response)
        if extracted == expected:
            logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'")
            continue
        failed_extractions.append({
            'response': response,
            'expected': expected,
            'extracted': extracted
        })
        logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'")

    # Assert perfect extraction
    assert len(failed_extractions) == 0, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions"
if __name__ == "__main__":
    # Run the calculator accuracy tests in this file verbosely (-v) with
    # output capture disabled (-s). Exit with pytest's return code so a
    # failing run is visible to the calling shell instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))