Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

File size: 11,352 Bytes

9a6a4dc

"""
Calculator 100% Accuracy Fix - TDD Implementation
Comprehensive test suite to achieve 100% calculator accuracy.
"""

import pytest
import sys
import os
import logging
import re
from pathlib import Path

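# Put this file's parent directory (the deployment-ready directory) at the
# front of sys.path so the `agents` package imported below can be resolved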
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

logger = logging.getLogger(__name__)


class TestCalculator100Accuracy:
    """Test suite to achieve 100% calculator accuracy.

    The agent-backed tests send natural-language maths questions to a
    ``FixedGAIAAgent``, pull the numeric answer out of the free-text reply
    with :meth:`extract_numeric_answer`, and compare it with the expected
    value. ``test_answer_extraction_patterns`` checks the extractor itself
    and does not call the agent.
    """

    # Phrases (matched case-insensitively) that introduce the final answer.
    _ANSWER_MARKERS = (
        'the answer is', 'the result is', 'the calculation gives',
        'this equals', 'equals', 'is equal to', 'the value is',
        'answer:', 'result:', 'solution:', '=',
    )
    _MARKER_RE = re.compile(
        '|'.join(map(re.escape, _ANSWER_MARKERS)), re.IGNORECASE
    )
    # Optional sign, integer part (plain digits or comma-grouped thousands),
    # optional fraction, optional exponent. The lookbehind stops a hyphen that
    # is glued to a preceding word character ("200-75") from being read as a
    # negative sign; the trailing \b keeps sentence punctuation out of the match.
    _NUMBER_RE = re.compile(
        r'(?<!\w)-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\b'
    )

    @pytest.fixture(autouse=True)
    def setup_method(self):
        """Set up test fixtures: create the agent used by the test."""
        self.agent = FixedGAIAAgent()

    def extract_numeric_answer(self, response: str) -> str:
        """Extract the numeric answer from an agent response.

        Selection order:

        1. The first number after the *last* answer marker (``=``,
           ``the answer is``, ``result:`` ...), so ``"25 * 17 = 425"`` gives
           ``"425"`` rather than the operand ``"25"``.
        2. Otherwise the last number in the text, because replies usually
           restate the operands before the result
           (``"The square root of 144 is 12"``).
        3. Otherwise the text itself, with markdown and leading answer
           phrases removed.

        This is a heuristic: a reply that puts further numbers after the
        answer without any marker can still be misread.

        Args:
            response: Raw text returned by the agent.

        Returns:
            The number as it appears in the text, with thousands separators
            removed (``"1,234"`` -> ``"1234"``), or the cleaned text when no
            number is present.
        """
        cleaned = response.strip()

        # Remove markdown formatting
        cleaned = re.sub(r'[*_`]', '', cleaned)

        # Prefer the number introduced by the last answer marker.
        marker_hits = list(self._MARKER_RE.finditer(cleaned))
        if marker_hits:
            after_marker = self._NUMBER_RE.search(cleaned, marker_hits[-1].end())
            if after_marker:
                return after_marker.group().replace(',', '')

        # No usable marker: take the last number in the response.
        numbers = self._NUMBER_RE.findall(cleaned)
        if numbers:
            return numbers[-1].replace(',', '')

        # No number at all: return the text minus any leading answer phrase.
        for prefix in self._ANSWER_MARKERS:
            cleaned = re.sub(
                rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE
            )
        return cleaned.strip()

    @staticmethod
    def _failure_record(case, actual, full_response):
        """Build the dict that describes one failed test case."""
        return {
            'question': case['question'],
            'expected': case['expected'],
            'actual': actual,
            'full_response': full_response,
            'operation': case['operation'],
        }

    @staticmethod
    def _log_accuracy(label, total, failed):
        """Log the pass rate for a group of test cases."""
        passed = total - failed
        accuracy = passed / total * 100
        logger.info(f"📊 {label} accuracy: {accuracy:.1f}% ({passed}/{total})")

    @staticmethod
    def _log_failures(log, heading, failures):
        """Write a per-failure summary through ``log`` (a logger method)."""
        if not failures:
            return
        log(heading)
        for failure in failures:
            log(f"   {failure['operation']}: {failure['question']}")
            log(f"      Expected: {failure['expected']}")
            log(f"      Got: {failure['actual']}")

    def test_basic_arithmetic_100_percent(self):
        """Test basic arithmetic with 100% accuracy requirement.

        A case passes when the extracted answer equals the expected string
        or is within 0.001 of it as a float. Any failure fails the test.
        """
        test_cases = [
            {
                'question': 'Calculate 25 * 17',
                'expected': '425',
                'operation': 'multiplication'
            },
            {
                'question': 'What is 144 divided by 12?',
                'expected': '12',
                'operation': 'division'
            },
            {
                'question': 'Add 100 and 50',
                'expected': '150',
                'operation': 'addition'
            },
            {
                'question': 'Subtract 75 from 200',
                'expected': '125',
                'operation': 'subtraction'
            },
            {
                'question': 'What is 2 to the power of 8?',
                'expected': '256',
                'operation': 'exponentiation'
            }
        ]

        failed_operations = []

        for case in test_cases:
            # Checked per case, outside the try block below, so the skip
            # is never recorded as a failure.
            if not self.agent.available:
                pytest.skip("Agent not available for testing")

            try:
                result = self.agent(case['question'])

                # Extract numeric answer
                extracted_answer = self.extract_numeric_answer(result)
                expected = case['expected']

                if extracted_answer == expected:
                    logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
                    continue

                # Try float comparison for close matches (e.g. "425.0" vs "425")
                try:
                    is_close = abs(float(extracted_answer) - float(expected)) < 0.001
                except ValueError:
                    is_close = False
                if is_close:
                    logger.info(f"✅ {case['operation']} passed (float): {case['question']} → {extracted_answer}")
                    continue

                failed_operations.append(
                    self._failure_record(case, extracted_answer, result)
                )
                logger.error(f"❌ {case['operation']} failed: {case['question']}")
                logger.error(f"   Expected: {expected}")
                logger.error(f"   Extracted: {extracted_answer}")
                logger.error(f"   Full response: {result}")

            except Exception as e:
                # An agent or extraction error counts as a failed case
                # instead of aborting the remaining cases.
                failed_operations.append(
                    self._failure_record(case, f"ERROR: {e}", str(e))
                )
                logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")

        # Calculate accuracy
        self._log_accuracy("Calculator", len(test_cases), len(failed_operations))

        # Report failures
        self._log_failures(logger.error, "❌ Failed operations:", failed_operations)

        # Assert 100% accuracy
        assert len(failed_operations) == 0, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests"

    def test_complex_mathematical_operations(self):
        """Test complex mathematical operations for 100% accuracy.

        A case passes when the extracted answer is within the case's
        ``tolerance`` (default 0.001) of the expected value, or equals the
        expected string when it cannot be parsed as a float. Failures are
        only logged; this test never asserts on them.
        """
        test_cases = [
            {
                'question': 'Calculate the square root of 144',
                'expected': '12',
                'operation': 'square_root'
            },
            {
                'question': 'What is 5 factorial?',
                'expected': '120',
                'operation': 'factorial'
            },
            {
                'question': 'Calculate sin(30 degrees)',
                'expected': '0.5',
                'operation': 'trigonometry',
                'tolerance': 0.01
            },
            {
                'question': 'What is the natural logarithm of e?',
                'expected': '1',
                'operation': 'logarithm',
                'tolerance': 0.01
            }
        ]

        failed_operations = []

        for case in test_cases:
            # Checked per case, outside the try block below, so the skip
            # is never recorded as a failure.
            if not self.agent.available:
                pytest.skip("Agent not available for testing")

            try:
                result = self.agent(case['question'])

                # Extract numeric answer
                extracted_answer = self.extract_numeric_answer(result)
                expected = case['expected']
                tolerance = case.get('tolerance', 0.001)

                # Check if the result matches
                try:
                    result_num = float(extracted_answer)
                    expected_num = float(expected)
                    if abs(result_num - expected_num) <= tolerance:
                        logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
                        continue
                except ValueError:
                    # Try exact string match
                    if extracted_answer == expected:
                        logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
                        continue

                failed_operations.append(
                    self._failure_record(case, extracted_answer, result)
                )
                logger.error(f"❌ {case['operation']} failed: {case['question']}")
                logger.error(f"   Expected: {expected}")
                logger.error(f"   Extracted: {extracted_answer}")

            except Exception as e:
                # An agent or extraction error counts as a failed case
                # instead of aborting the remaining cases.
                failed_operations.append(
                    self._failure_record(case, f"ERROR: {e}", str(e))
                )
                logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")

        # Calculate accuracy
        self._log_accuracy("Complex math", len(test_cases), len(failed_operations))

        # Report results (don't assert for complex operations, just report)
        self._log_failures(
            logger.warning,
            "⚠️ Complex operations that need improvement:",
            failed_operations,
        )

    def test_answer_extraction_patterns(self):
        """Test various answer extraction patterns to improve accuracy.

        Runs :meth:`extract_numeric_answer` on canned responses; no agent
        call is made.
        """
        test_responses = [
            ("The answer is 425", "425"),
            ("This calculation gives us 425.", "425"),
            ("425", "425"),
            ("The result is: 425", "425"),
            ("**Answer: 425**", "425"),
            ("Solution: 425", "425"),
            ("= 425", "425"),
            ("425.0", "425.0"),
            ("-123", "-123"),
            ("1.23e+5", "1.23e+5"),
            # Responses that echo the operands before the result
            ("25 * 17 = 425", "425"),
            ("200-75 = 125", "125"),
            ("The square root of 144 is 12", "12"),
            ("The answer is 425 (that is 25 times 17)", "425"),
            # Thousands separators
            ("The answer is 1,234", "1234"),
        ]

        failed_extractions = []

        for response, expected in test_responses:
            extracted = self.extract_numeric_answer(response)
            if extracted != expected:
                failed_extractions.append({
                    'response': response,
                    'expected': expected,
                    'extracted': extracted
                })
                logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'")
            else:
                logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'")

        # Assert perfect extraction
        assert len(failed_extractions) == 0, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions"


if __name__ == "__main__":
    # Run the calculator accuracy tests verbosely (-v) with output capture
    # disabled (-s). pytest.main() returns the run's exit code; pass it on so
    # a failing run does not leave the script exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))