""" Calculator 100% Accuracy Fix - TDD Implementation Comprehensive test suite to achieve 100% calculator accuracy. """ import pytest import sys import os import logging import re from pathlib import Path # Add the deployment-ready directory to the path sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent logger = logging.getLogger(__name__) class TestCalculator100Accuracy: """Test suite to achieve 100% calculator accuracy.""" @pytest.fixture(autouse=True) def setup_method(self): """Set up test fixtures.""" self.agent = FixedGAIAAgent() def extract_numeric_answer(self, response: str) -> str: """Extract numeric answer from agent response.""" # Remove common prefixes and suffixes cleaned = response.strip() # Remove markdown formatting cleaned = re.sub(r'[*_`]', '', cleaned) # Remove common phrases prefixes_to_remove = [ 'the answer is', 'the result is', 'the calculation gives', 'this equals', 'equals', 'is equal to', 'the value is', 'answer:', 'result:', 'solution:', '=' ] for prefix in prefixes_to_remove: cleaned = re.sub(rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE) # Extract number patterns (including decimals, negatives, scientific notation) # Use word boundaries to avoid matching trailing punctuation number_patterns = [ r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b', # Scientific notation with word boundary r'-?\d+\.\d+\b', # Decimal numbers with word boundary r'-?\d+\b', # Integers with word boundary ] for pattern in number_patterns: matches = re.findall(pattern, cleaned) if matches: # Return the first number found return matches[0].strip() # If no number found, return the cleaned response return cleaned.strip() def test_basic_arithmetic_100_percent(self): """Test basic arithmetic with 100% accuracy requirement.""" test_cases = [ { 'question': 'Calculate 25 * 17', 'expected': '425', 'operation': 'multiplication' }, { 'question': 'What is 144 divided by 12?', 'expected': '12', 'operation': 'division' }, { 'question': 'Add 100 and 50', 'expected': '150', 'operation': 'addition' }, { 'question': 'Subtract 75 from 200', 'expected': '125', 'operation': 'subtraction' }, { 'question': 'What is 2 to the power of 8?', 'expected': '256', 'operation': 'exponentiation' } ] failed_operations = [] for case in test_cases: if not self.agent.available: pytest.skip("Agent not available for testing") try: result = self.agent(case['question']) # Extract numeric answer extracted_answer = self.extract_numeric_answer(result) expected = case['expected'] # Check if the result matches if extracted_answer != expected: # Try float comparison for close matches try: result_num = float(extracted_answer) expected_num = float(expected) if abs(result_num - expected_num) < 0.001: logger.info(f"✅ {case['operation']} passed (float): {case['question']} → {extracted_answer}") continue except ValueError: pass failed_operations.append({ 'question': case['question'], 'expected': expected, 'actual': extracted_answer, 'full_response': result, 'operation': case['operation'] }) logger.error(f"❌ {case['operation']} failed: {case['question']}") logger.error(f" Expected: {expected}") logger.error(f" Extracted: {extracted_answer}") logger.error(f" Full response: {result}") else: logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}") except Exception as e: failed_operations.append({ 'question': case['question'], 'expected': case['expected'], 'actual': f"ERROR: {e}", 'full_response': str(e), 'operation': case['operation'] }) logger.error(f"❌ {case['operation']} error: {case['question']} → {e}") # Calculate accuracy accuracy = (len(test_cases) - len(failed_operations)) / len(test_cases) * 100 logger.info(f"📊 Calculator accuracy: {accuracy:.1f}% ({len(test_cases) - len(failed_operations)}/{len(test_cases)})") # Report failures if failed_operations: logger.error("❌ Failed operations:") for failure in failed_operations: logger.error(f" {failure['operation']}: {failure['question']}") logger.error(f" Expected: {failure['expected']}") logger.error(f" Got: {failure['actual']}") # Assert 100% accuracy assert len(failed_operations) == 0, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests" def test_complex_mathematical_operations(self): """Test complex mathematical operations for 100% accuracy.""" test_cases = [ { 'question': 'Calculate the square root of 144', 'expected': '12', 'operation': 'square_root' }, { 'question': 'What is 5 factorial?', 'expected': '120', 'operation': 'factorial' }, { 'question': 'Calculate sin(30 degrees)', 'expected': '0.5', 'operation': 'trigonometry', 'tolerance': 0.01 }, { 'question': 'What is the natural logarithm of e?', 'expected': '1', 'operation': 'logarithm', 'tolerance': 0.01 } ] failed_operations = [] for case in test_cases: if not self.agent.available: pytest.skip("Agent not available for testing") try: result = self.agent(case['question']) # Extract numeric answer extracted_answer = self.extract_numeric_answer(result) expected = case['expected'] tolerance = case.get('tolerance', 0.001) # Check if the result matches try: result_num = float(extracted_answer) expected_num = float(expected) if abs(result_num - expected_num) <= tolerance: logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}") continue except ValueError: # Try exact string match if extracted_answer == expected: logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}") continue failed_operations.append({ 'question': case['question'], 'expected': expected, 'actual': extracted_answer, 'full_response': result, 'operation': case['operation'] }) logger.error(f"❌ {case['operation']} failed: {case['question']}") logger.error(f" Expected: {expected}") logger.error(f" Extracted: {extracted_answer}") except Exception as e: failed_operations.append({ 'question': case['question'], 'expected': case['expected'], 'actual': f"ERROR: {e}", 'full_response': str(e), 'operation': case['operation'] }) logger.error(f"❌ {case['operation']} error: {case['question']} → {e}") # Calculate accuracy accuracy = (len(test_cases) - len(failed_operations)) / len(test_cases) * 100 logger.info(f"📊 Complex math accuracy: {accuracy:.1f}% ({len(test_cases) - len(failed_operations)}/{len(test_cases)})") # Report results (don't assert for complex operations, just report) if failed_operations: logger.warning("⚠️ Complex operations that need improvement:") for failure in failed_operations: logger.warning(f" {failure['operation']}: {failure['question']}") logger.warning(f" Expected: {failure['expected']}") logger.warning(f" Got: {failure['actual']}") def test_answer_extraction_patterns(self): """Test various answer extraction patterns to improve accuracy.""" test_responses = [ ("The answer is 425", "425"), ("This calculation gives us 425.", "425"), ("425", "425"), ("The result is: 425", "425"), ("**Answer: 425**", "425"), ("Solution: 425", "425"), ("= 425", "425"), ("425.0", "425.0"), ("-123", "-123"), ("1.23e+5", "1.23e+5"), ] failed_extractions = [] for response, expected in test_responses: extracted = self.extract_numeric_answer(response) if extracted != expected: failed_extractions.append({ 'response': response, 'expected': expected, 'extracted': extracted }) logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'") else: logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'") # Assert perfect extraction assert len(failed_extractions) == 0, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions" if __name__ == "__main__": # Run the calculator accuracy tests pytest.main([__file__, "-v", "-s"])