Spaces:
Running
Running
""" | |
Calculator 100% Accuracy Fix - TDD Implementation
Comprehensive test suite that verifies the calculator reaches 100% accuracy.
""" | |
import pytest | |
import sys | |
import os | |
import logging | |
import re | |
from pathlib import Path | |
# Put the parent of this file's directory (the deployment-ready directory)
# at the front of the import path; the `agents` import below relies on it.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), '..'),
)

from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class TestCalculator100Accuracy: | |
"""Test suite to achieve 100% calculator accuracy.""" | |
def setup_method(self): | |
"""Set up test fixtures.""" | |
self.agent = FixedGAIAAgent() | |
def extract_numeric_answer(self, response: str) -> str: | |
"""Extract numeric answer from agent response.""" | |
# Remove common prefixes and suffixes | |
cleaned = response.strip() | |
# Remove markdown formatting | |
cleaned = re.sub(r'[*_`]', '', cleaned) | |
# Remove common phrases | |
prefixes_to_remove = [ | |
'the answer is', 'the result is', 'the calculation gives', | |
'this equals', 'equals', 'is equal to', 'the value is', | |
'answer:', 'result:', 'solution:', '=' | |
] | |
for prefix in prefixes_to_remove: | |
cleaned = re.sub(rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE) | |
# Extract number patterns (including decimals, negatives, scientific notation) | |
# Use word boundaries to avoid matching trailing punctuation | |
number_patterns = [ | |
r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b', # Scientific notation with word boundary | |
r'-?\d+\.\d+\b', # Decimal numbers with word boundary | |
r'-?\d+\b', # Integers with word boundary | |
] | |
for pattern in number_patterns: | |
matches = re.findall(pattern, cleaned) | |
if matches: | |
# Return the first number found | |
return matches[0].strip() | |
# If no number found, return the cleaned response | |
return cleaned.strip() | |
def test_basic_arithmetic_100_percent(self): | |
"""Test basic arithmetic with 100% accuracy requirement.""" | |
test_cases = [ | |
{ | |
'question': 'Calculate 25 * 17', | |
'expected': '425', | |
'operation': 'multiplication' | |
}, | |
{ | |
'question': 'What is 144 divided by 12?', | |
'expected': '12', | |
'operation': 'division' | |
}, | |
{ | |
'question': 'Add 100 and 50', | |
'expected': '150', | |
'operation': 'addition' | |
}, | |
{ | |
'question': 'Subtract 75 from 200', | |
'expected': '125', | |
'operation': 'subtraction' | |
}, | |
{ | |
'question': 'What is 2 to the power of 8?', | |
'expected': '256', | |
'operation': 'exponentiation' | |
} | |
] | |
failed_operations = [] | |
for case in test_cases: | |
if not self.agent.available: | |
pytest.skip("Agent not available for testing") | |
try: | |
result = self.agent(case['question']) | |
# Extract numeric answer | |
extracted_answer = self.extract_numeric_answer(result) | |
expected = case['expected'] | |
# Check if the result matches | |
if extracted_answer != expected: | |
# Try float comparison for close matches | |
try: | |
result_num = float(extracted_answer) | |
expected_num = float(expected) | |
if abs(result_num - expected_num) < 0.001: | |
logger.info(f"β {case['operation']} passed (float): {case['question']} β {extracted_answer}") | |
continue | |
except ValueError: | |
pass | |
failed_operations.append({ | |
'question': case['question'], | |
'expected': expected, | |
'actual': extracted_answer, | |
'full_response': result, | |
'operation': case['operation'] | |
}) | |
logger.error(f"β {case['operation']} failed: {case['question']}") | |
logger.error(f" Expected: {expected}") | |
logger.error(f" Extracted: {extracted_answer}") | |
logger.error(f" Full response: {result}") | |
else: | |
logger.info(f"β {case['operation']} passed: {case['question']} β {extracted_answer}") | |
except Exception as e: | |
failed_operations.append({ | |
'question': case['question'], | |
'expected': case['expected'], | |
'actual': f"ERROR: {e}", | |
'full_response': str(e), | |
'operation': case['operation'] | |
}) | |
logger.error(f"β {case['operation']} error: {case['question']} β {e}") | |
# Calculate accuracy | |
accuracy = (len(test_cases) - len(failed_operations)) / len(test_cases) * 100 | |
logger.info(f"π Calculator accuracy: {accuracy:.1f}% ({len(test_cases) - len(failed_operations)}/{len(test_cases)})") | |
# Report failures | |
if failed_operations: | |
logger.error("β Failed operations:") | |
for failure in failed_operations: | |
logger.error(f" {failure['operation']}: {failure['question']}") | |
logger.error(f" Expected: {failure['expected']}") | |
logger.error(f" Got: {failure['actual']}") | |
# Assert 100% accuracy | |
assert len(failed_operations) == 0, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests" | |
def test_complex_mathematical_operations(self): | |
"""Test complex mathematical operations for 100% accuracy.""" | |
test_cases = [ | |
{ | |
'question': 'Calculate the square root of 144', | |
'expected': '12', | |
'operation': 'square_root' | |
}, | |
{ | |
'question': 'What is 5 factorial?', | |
'expected': '120', | |
'operation': 'factorial' | |
}, | |
{ | |
'question': 'Calculate sin(30 degrees)', | |
'expected': '0.5', | |
'operation': 'trigonometry', | |
'tolerance': 0.01 | |
}, | |
{ | |
'question': 'What is the natural logarithm of e?', | |
'expected': '1', | |
'operation': 'logarithm', | |
'tolerance': 0.01 | |
} | |
] | |
failed_operations = [] | |
for case in test_cases: | |
if not self.agent.available: | |
pytest.skip("Agent not available for testing") | |
try: | |
result = self.agent(case['question']) | |
# Extract numeric answer | |
extracted_answer = self.extract_numeric_answer(result) | |
expected = case['expected'] | |
tolerance = case.get('tolerance', 0.001) | |
# Check if the result matches | |
try: | |
result_num = float(extracted_answer) | |
expected_num = float(expected) | |
if abs(result_num - expected_num) <= tolerance: | |
logger.info(f"β {case['operation']} passed: {case['question']} β {extracted_answer}") | |
continue | |
except ValueError: | |
# Try exact string match | |
if extracted_answer == expected: | |
logger.info(f"β {case['operation']} passed: {case['question']} β {extracted_answer}") | |
continue | |
failed_operations.append({ | |
'question': case['question'], | |
'expected': expected, | |
'actual': extracted_answer, | |
'full_response': result, | |
'operation': case['operation'] | |
}) | |
logger.error(f"β {case['operation']} failed: {case['question']}") | |
logger.error(f" Expected: {expected}") | |
logger.error(f" Extracted: {extracted_answer}") | |
except Exception as e: | |
failed_operations.append({ | |
'question': case['question'], | |
'expected': case['expected'], | |
'actual': f"ERROR: {e}", | |
'full_response': str(e), | |
'operation': case['operation'] | |
}) | |
logger.error(f"β {case['operation']} error: {case['question']} β {e}") | |
# Calculate accuracy | |
accuracy = (len(test_cases) - len(failed_operations)) / len(test_cases) * 100 | |
logger.info(f"π Complex math accuracy: {accuracy:.1f}% ({len(test_cases) - len(failed_operations)}/{len(test_cases)})") | |
# Report results (don't assert for complex operations, just report) | |
if failed_operations: | |
logger.warning("β οΈ Complex operations that need improvement:") | |
for failure in failed_operations: | |
logger.warning(f" {failure['operation']}: {failure['question']}") | |
logger.warning(f" Expected: {failure['expected']}") | |
logger.warning(f" Got: {failure['actual']}") | |
def test_answer_extraction_patterns(self): | |
"""Test various answer extraction patterns to improve accuracy.""" | |
test_responses = [ | |
("The answer is 425", "425"), | |
("This calculation gives us 425.", "425"), | |
("425", "425"), | |
("The result is: 425", "425"), | |
("**Answer: 425**", "425"), | |
("Solution: 425", "425"), | |
("= 425", "425"), | |
("425.0", "425.0"), | |
("-123", "-123"), | |
("1.23e+5", "1.23e+5"), | |
] | |
failed_extractions = [] | |
for response, expected in test_responses: | |
extracted = self.extract_numeric_answer(response) | |
if extracted != expected: | |
failed_extractions.append({ | |
'response': response, | |
'expected': expected, | |
'extracted': extracted | |
}) | |
logger.error(f"β Extraction failed: '{response}' β Expected: '{expected}', Got: '{extracted}'") | |
else: | |
logger.info(f"β Extraction passed: '{response}' β '{extracted}'") | |
# Assert perfect extraction | |
assert len(failed_extractions) == 0, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions" | |
if __name__ == "__main__":
    # Run the calculator accuracy tests and propagate pytest's exit status,
    # so running this file as a script exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))