# gaia-enhanced-agent/tests/test_calculator_accuracy_100.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# Commit: 9a6a4dc
"""
Calculator 100% Accuracy Fix - TDD Implementation
Comprehensive test suite to achieve 100% calculator accuracy.
"""
import pytest
import sys
import os
import logging
import re
from pathlib import Path
# Put the parent of this file's directory at the front of sys.path so that the
# import below is resolved against it first (presumably the project root that
# contains the `agents` package -- confirm against the repository layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent
# Module-level logger, named after this module, used by every test below.
logger = logging.getLogger(__name__)
class TestCalculator100Accuracy:
    """Accuracy tests for the calculator behaviour of ``FixedGAIAAgent``.

    Each agent-backed test sends natural-language arithmetic questions to the
    agent, pulls the numeric answer out of the free-text response with
    :meth:`extract_numeric_answer`, and compares it with the expected value
    (textually first, then numerically within a tolerance).
    """

    # A signed integer, decimal or scientific-notation literal.  The trailing
    # word boundary keeps sentence punctuation ("425.") out of the match.
    _NUMBER = r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b'

    # Optional hedge word allowed between an answer marker and the number.
    _FILLER = r'(?:\s*(?:approximately|about|roughly|exactly))?'

    # Phrases that explicitly announce the final answer ("the answer is 5",
    # "Result: 5", "the calculation gives 5").  Group 1 is the number.
    _EXPLICIT_ANSWER_RE = re.compile(
        r'(?:(?:answer|result|solution|value)\s*(?:is|:)|calculation gives)'
        r'\s*:?' + _FILLER + r'\s*(' + _NUMBER + r')',
        re.IGNORECASE,
    )

    # Equation-style markers ("25 x 17 = 425", "... equals 425").  Checked
    # only when no explicit answer phrase is present.  Group 1 is the number.
    _EQUATION_RE = re.compile(
        r'(?:is equal to|equals|=|≈)\s*:?' + _FILLER + r'\s*(' + _NUMBER + r')',
        re.IGNORECASE,
    )

    # Any number at all; used as the last numeric fallback.
    _NUMBER_RE = re.compile(_NUMBER)

    # Lead-in phrases removed from the start of a response that contains no
    # number, so that the textual fallback is as short as possible.
    _LEAD_INS = (
        'the answer is', 'the result is', 'the calculation gives',
        'this equals', 'equals', 'is equal to', 'the value is',
        'answer:', 'result:', 'solution:', '=',
    )

    # NOTE(review): this fixture shares its name with pytest's xunit-style
    # ``setup_method`` hook.  It is registered as an autouse fixture, which is
    # what makes it run before every test; consider renaming it to avoid
    # confusion -- confirm against the pytest version in use.
    @pytest.fixture(autouse=True)
    def setup_method(self):
        """Create a fresh agent instance before every test in this class."""
        self.agent = FixedGAIAAgent()

    def extract_numeric_answer(self, response: str) -> str:
        """Extract the numeric answer from a free-text agent response.

        Selection order:

        1. the number following the *last* explicit answer phrase
           ("the answer is", "result:", "solution:", ...);
        2. otherwise the number following the *last* equation marker
           ("=", "equals", "is equal to", "≈"), so that a response which
           restates the operands ("25 * 17 = 425") yields the result rather
           than the first operand;
        3. otherwise the first number found anywhere in the response;
        4. otherwise the response text with markdown markers and a leading
           answer phrase removed.

        Args:
            response: Raw agent output.  ``None`` is treated as an empty
                string and non-string values are converted with ``str``.

        Returns:
            The extracted number exactly as written in the response (for
            example ``"425.0"`` or ``"1.23e+5"``), or the cleaned text when
            the response contains no number.
        """
        text = '' if response is None else str(response)

        # Drop markdown emphasis / inline-code markers before matching.
        cleaned = re.sub(r'[*_`]', '', text).strip()

        for marker_re in (self._EXPLICIT_ANSWER_RE, self._EQUATION_RE):
            candidates = marker_re.findall(cleaned)
            if candidates:
                # The final statement of a chain ("= 5 * 4 ... = 120") is the
                # one that carries the result.
                return candidates[-1]

        first_number = self._NUMBER_RE.search(cleaned)
        if first_number:
            return first_number.group(0)

        # No number at all: hand back the text without a leading answer phrase.
        for lead_in in self._LEAD_INS:
            cleaned = re.sub(
                rf'^{re.escape(lead_in)}\s*', '', cleaned, flags=re.IGNORECASE
            )
        return cleaned.strip()

    @staticmethod
    def _within_tolerance(actual: str, expected: str, tolerance: float) -> bool:
        """Return True when both strings parse as floats at most ``tolerance`` apart."""
        try:
            return abs(float(actual) - float(expected)) <= tolerance
        except ValueError:
            # At least one side is not numeric, so there is nothing to compare.
            return False

    def _run_cases(self, test_cases: list, default_tolerance: float = 0.001) -> list:
        """Ask the agent every question and collect a record for each failure.

        Args:
            test_cases: Dicts with ``question``, ``expected`` and ``operation``
                keys and an optional numeric ``tolerance`` key.
            default_tolerance: Tolerance used for cases without their own.

        Returns:
            A list of failure dicts with ``question``, ``expected``,
            ``actual``, ``full_response`` and ``operation`` keys; empty when
            every case passed.
        """
        # Availability does not depend on the case, so check it once up front
        # instead of on every loop iteration.
        if not self.agent.available:
            pytest.skip("Agent not available for testing")

        failures = []
        for case in test_cases:
            question = case['question']
            expected = case['expected']
            operation = case['operation']
            tolerance = case.get('tolerance', default_tolerance)

            try:
                result = self.agent(question)
                extracted = self.extract_numeric_answer(result)
            except Exception as e:
                # Deliberately broad: an agent error counts as one failed case
                # and must not abort the remaining cases.
                failures.append({
                    'question': question,
                    'expected': expected,
                    'actual': f"ERROR: {e}",
                    'full_response': str(e),
                    'operation': operation,
                })
                logger.error(f"❌ {operation} error: {question} → {e}")
                continue

            if extracted == expected:
                logger.info(f"✅ {operation} passed: {question} → {extracted}")
            elif self._within_tolerance(extracted, expected, tolerance):
                logger.info(f"✅ {operation} passed (float): {question} → {extracted}")
            else:
                failures.append({
                    'question': question,
                    'expected': expected,
                    'actual': extracted,
                    'full_response': result,
                    'operation': operation,
                })
                logger.error(f"❌ {operation} failed: {question}")
                logger.error(f" Expected: {expected}")
                logger.error(f" Extracted: {extracted}")
                logger.error(f" Full response: {result}")
        return failures

    @staticmethod
    def _report(label: str, total: int, failures: list, log_failure, header: str) -> None:
        """Log the accuracy figure and, if any, the details of each failure.

        Args:
            label: Text placed before the percentage in the summary line.
            total: Number of cases that were run (must be non-zero).
            failures: Failure records as returned by ``_run_cases``.
            log_failure: Logger method used for the failure lines.
            header: First line of the failure section.
        """
        passed = total - len(failures)
        accuracy = passed / total * 100
        logger.info(f"📊 {label}: {accuracy:.1f}% ({passed}/{total})")
        if failures:
            log_failure(header)
            for failure in failures:
                log_failure(f" {failure['operation']}: {failure['question']}")
                log_failure(f" Expected: {failure['expected']}")
                log_failure(f" Got: {failure['actual']}")

    def test_basic_arithmetic_100_percent(self):
        """Basic arithmetic questions must all be answered correctly."""
        test_cases = [
            {
                'question': 'Calculate 25 * 17',
                'expected': '425',
                'operation': 'multiplication'
            },
            {
                'question': 'What is 144 divided by 12?',
                'expected': '12',
                'operation': 'division'
            },
            {
                'question': 'Add 100 and 50',
                'expected': '150',
                'operation': 'addition'
            },
            {
                'question': 'Subtract 75 from 200',
                'expected': '125',
                'operation': 'subtraction'
            },
            {
                'question': 'What is 2 to the power of 8?',
                'expected': '256',
                'operation': 'exponentiation'
            }
        ]

        failed_operations = self._run_cases(test_cases)
        self._report(
            "Calculator accuracy",
            len(test_cases),
            failed_operations,
            logger.error,
            "❌ Failed operations:",
        )

        # Assert 100% accuracy
        assert not failed_operations, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests"

    def test_complex_mathematical_operations(self):
        """Report accuracy on harder operations without failing the run.

        Failures are logged as warnings only; this test intentionally makes
        no assertion about the agent's answers.
        """
        test_cases = [
            {
                'question': 'Calculate the square root of 144',
                'expected': '12',
                'operation': 'square_root'
            },
            {
                'question': 'What is 5 factorial?',
                'expected': '120',
                'operation': 'factorial'
            },
            {
                'question': 'Calculate sin(30 degrees)',
                'expected': '0.5',
                'operation': 'trigonometry',
                'tolerance': 0.01
            },
            {
                'question': 'What is the natural logarithm of e?',
                'expected': '1',
                'operation': 'logarithm',
                'tolerance': 0.01
            }
        ]

        failed_operations = self._run_cases(test_cases)

        # Report results (don't assert for complex operations, just report)
        self._report(
            "Complex math accuracy",
            len(test_cases),
            failed_operations,
            logger.warning,
            "⚠️ Complex operations that need improvement:",
        )

    def test_answer_extraction_patterns(self):
        """Check ``extract_numeric_answer`` against representative responses."""
        test_responses = [
            ("The answer is 425", "425"),
            ("This calculation gives us 425.", "425"),
            ("425", "425"),
            ("The result is: 425", "425"),
            ("**Answer: 425**", "425"),
            ("Solution: 425", "425"),
            ("= 425", "425"),
            ("425.0", "425.0"),
            ("-123", "-123"),
            ("1.23e+5", "1.23e+5"),
            # Responses that restate the operands before giving the result.
            ("25 * 17 = 425", "425"),
            ("2 to the power of 8 equals 256", "256"),
            ("5! = 5 * 4 * 3 * 2 * 1 = 120", "120"),
            ("144 / 12 = 12, so the answer is 12.", "12"),
            ("The answer is approximately 0.5", "0.5"),
        ]

        failed_extractions = []
        for response, expected in test_responses:
            extracted = self.extract_numeric_answer(response)
            if extracted != expected:
                failed_extractions.append({
                    'response': response,
                    'expected': expected,
                    'extracted': extracted
                })
                logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'")
            else:
                logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'")

        # Assert perfect extraction
        assert not failed_extractions, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions"
if __name__ == "__main__":
    # Run the calculator accuracy tests when this file is executed directly.
    # pytest.main() returns the run's exit code; pass it to sys.exit so that a
    # failing run produces a non-zero process status instead of always 0.
    sys.exit(pytest.main([__file__, "-v", "-s"]))