Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

gaia-enhanced-agent / tests /test_calculator_accuracy_100.py

GAIA Agent Deployment

Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements

9a6a4dc 9 days ago

11.4 kB

	"""
	Calculator 100% Accuracy Fix - TDD Implementation
	Comprehensive test suite to achieve 100% calculator accuracy.
	"""

	import pytest
	import sys
	import os
	import logging
	import re
	from pathlib import Path

	# Put the parent of this file's directory at the front of sys.path so the
	# `agents` package imported below can be resolved from there.
	# NOTE(review): presumably that parent is the deployment root - confirm.
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

	# Agent class under test. Imported after the sys.path change above so the
	# module lookup can see the directory that was just inserted.
	from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)


	class TestCalculator100Accuracy:
		"""Accuracy checks for the agent's calculator answers.

		Each agent test sends natural-language math questions to a
		``FixedGAIAAgent`` instance (called like a function with the question
		string), pulls the first number out of the reply with
		``extract_numeric_answer`` and compares it with the expected value.
		"""

		@pytest.fixture(autouse=True)
		def setup_method(self):
			"""Create the agent used by the tests (autouse, so it runs for each test)."""
			self.agent = FixedGAIAAgent()

		def extract_numeric_answer(self, response: str) -> str:
			"""Return the first number found in ``response`` as a string.

			The text is normalised before searching:

			* surrounding whitespace and the markdown markers ``*``, ``_`` and
			  backtick are removed;
			* the Unicode minus sign (U+2212) is replaced by ASCII ``-`` so the
			  sign of a negative answer is kept;
			* common lead-in phrases ("the answer is", "result:", "=", ...) are
			  stripped from the start, case-insensitively;
			* commas used as thousands separators are removed, so "1,024" is
			  read as "1024" instead of being cut off at "1".

			Args:
				response: Raw text returned by the agent.

			Returns:
				The first integer, decimal or scientific-notation number in the
				normalised text, exactly as written there. If the text contains
				no number, the normalised text itself is returned.

			Note:
				Only the *first* number is returned, so a reply that repeats the
				operands before the result (e.g. "25 * 17 = 425") yields the
				first operand, not the result.
			"""
			# Lead-in phrases, tried once each in this order against the current
			# start of the text, so stacked lead-ins such as "answer: = 5" are
			# peeled off one after another.
			prefixes_to_remove = (
				'the answer is', 'the result is', 'the calculation gives',
				'this equals', 'equals', 'is equal to', 'the value is',
				'answer:', 'result:', 'solution:', '=',
			)

			# Remove markdown formatting, then re-strip: deleting a marker can
			# expose leading whitespace that would defeat the anchored prefix
			# match below.
			cleaned = re.sub(r'[*_`]', '', response.strip())
			cleaned = cleaned.replace('\u2212', '-').strip()

			for prefix in prefixes_to_remove:
				cleaned = re.sub(rf'^{re.escape(prefix)}\s*', '', cleaned, flags=re.IGNORECASE)

			# A comma between a digit and a group of exactly three digits is a
			# thousands separator; drop it so the number is matched as a whole.
			cleaned = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', cleaned)

			# One pattern covers integers, decimals and scientific notation (the
			# fraction and exponent parts are optional). The trailing \b keeps a
			# sentence-ending period out of the match.
			number_match = re.search(r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b', cleaned)
			if number_match:
				return number_match.group(0)

			# If no number found, return the cleaned response
			return cleaned.strip()

		@staticmethod
		def _failure_record(case, actual, full_response):
			"""Build the dict that describes one failed test case."""
			return {
				'question': case['question'],
				'expected': case['expected'],
				'actual': actual,
				'full_response': full_response,
				'operation': case['operation'],
			}

		@staticmethod
		def _log_accuracy(label, total, failed):
			"""Log the pass rate for ``total`` cases of which ``failed`` failed."""
			passed = total - failed
			accuracy = passed / total * 100
			logger.info(f"📊 {label}: {accuracy:.1f}% ({passed}/{total})")

		def test_basic_arithmetic_100_percent(self):
			"""Require every basic arithmetic question to be answered correctly.

			A case passes when the extracted answer equals the expected string,
			or when both parse as floats that differ by less than 0.001. Any
			exception raised while querying the agent or extracting the answer
			is recorded as a failure for that case. The test is skipped when the
			agent reports itself unavailable.
			"""
			test_cases = [
				{
					'question': 'Calculate 25 * 17',
					'expected': '425',
					'operation': 'multiplication'
				},
				{
					'question': 'What is 144 divided by 12?',
					'expected': '12',
					'operation': 'division'
				},
				{
					'question': 'Add 100 and 50',
					'expected': '150',
					'operation': 'addition'
				},
				{
					'question': 'Subtract 75 from 200',
					'expected': '125',
					'operation': 'subtraction'
				},
				{
					'question': 'What is 2 to the power of 8?',
					'expected': '256',
					'operation': 'exponentiation'
				}
			]

			failed_operations = []

			for case in test_cases:
				# Checked before every case (not only once), so the test is
				# skipped as soon as the agent reports itself unavailable.
				if not self.agent.available:
					pytest.skip("Agent not available for testing")

				# Only the agent call and the extraction can fail unexpectedly;
				# keep them alone in the try so other errors are not masked.
				try:
					result = self.agent(case['question'])
					extracted_answer = self.extract_numeric_answer(result)
				except Exception as e:
					failed_operations.append(self._failure_record(case, f"ERROR: {e}", str(e)))
					logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")
					continue

				expected = case['expected']

				if extracted_answer == expected:
					logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
					continue

				# Accept numerically equal spellings such as "425.0" for "425".
				try:
					is_close = abs(float(extracted_answer) - float(expected)) < 0.001
				except ValueError:
					# The extracted text is not a number at all.
					is_close = False

				if is_close:
					logger.info(f"✅ {case['operation']} passed (float): {case['question']} → {extracted_answer}")
					continue

				failed_operations.append(self._failure_record(case, extracted_answer, result))
				logger.error(f"❌ {case['operation']} failed: {case['question']}")
				logger.error(f" Expected: {expected}")
				logger.error(f" Extracted: {extracted_answer}")
				logger.error(f" Full response: {result}")

			# Calculate accuracy
			self._log_accuracy("Calculator accuracy", len(test_cases), len(failed_operations))

			# Report failures
			if failed_operations:
				logger.error("❌ Failed operations:")
				for failure in failed_operations:
					logger.error(f" {failure['operation']}: {failure['question']}")
					logger.error(f" Expected: {failure['expected']}")
					logger.error(f" Got: {failure['actual']}")

			# Assert 100% accuracy
			assert not failed_operations, f"Calculator must achieve 100% accuracy. Failed {len(failed_operations)} out of {len(test_cases)} tests"

		def test_complex_mathematical_operations(self):
			"""Report accuracy on harder operations without failing the test.

			A case passes when the extracted answer and the expected value both
			parse as floats and differ by at most the case's ``tolerance``
			(default 0.001); if either does not parse, the strings must be
			equal. Failures are only logged as warnings: this test has no final
			assertion, so it is report-only. It is skipped when the agent
			reports itself unavailable.
			"""
			test_cases = [
				{
					'question': 'Calculate the square root of 144',
					'expected': '12',
					'operation': 'square_root'
				},
				{
					'question': 'What is 5 factorial?',
					'expected': '120',
					'operation': 'factorial'
				},
				{
					'question': 'Calculate sin(30 degrees)',
					'expected': '0.5',
					'operation': 'trigonometry',
					'tolerance': 0.01
				},
				{
					'question': 'What is the natural logarithm of e?',
					'expected': '1',
					'operation': 'logarithm',
					'tolerance': 0.01
				}
			]

			failed_operations = []

			for case in test_cases:
				# Checked before every case (not only once), so the test is
				# skipped as soon as the agent reports itself unavailable.
				if not self.agent.available:
					pytest.skip("Agent not available for testing")

				try:
					result = self.agent(case['question'])
					extracted_answer = self.extract_numeric_answer(result)
				except Exception as e:
					failed_operations.append(self._failure_record(case, f"ERROR: {e}", str(e)))
					logger.error(f"❌ {case['operation']} error: {case['question']} → {e}")
					continue

				expected = case['expected']
				tolerance = case.get('tolerance', 0.001)

				try:
					matched = abs(float(extracted_answer) - float(expected)) <= tolerance
				except ValueError:
					# Not numeric: fall back to an exact string match.
					matched = extracted_answer == expected

				if matched:
					logger.info(f"✅ {case['operation']} passed: {case['question']} → {extracted_answer}")
					continue

				failed_operations.append(self._failure_record(case, extracted_answer, result))
				logger.error(f"❌ {case['operation']} failed: {case['question']}")
				logger.error(f" Expected: {expected}")
				logger.error(f" Extracted: {extracted_answer}")

			# Calculate accuracy
			self._log_accuracy("Complex math accuracy", len(test_cases), len(failed_operations))

			# Report results (don't assert for complex operations, just report)
			if failed_operations:
				logger.warning("⚠️ Complex operations that need improvement:")
				for failure in failed_operations:
					logger.warning(f" {failure['operation']}: {failure['question']}")
					logger.warning(f" Expected: {failure['expected']}")
					logger.warning(f" Got: {failure['actual']}")

		def test_answer_extraction_patterns(self):
			"""Check ``extract_numeric_answer`` against representative reply shapes.

			Covers lead-in phrases, trailing punctuation, decimals, negatives,
			scientific notation, thousands separators and the Unicode minus
			sign. Does not query the agent.
			"""
			test_responses = [
				("The answer is 425", "425"),
				("This calculation gives us 425.", "425"),
				("425", "425"),
				("The result is: 425", "425"),
				("Answer: 425", "425"),
				("Solution: 425", "425"),
				("= 425", "425"),
				("425.0", "425.0"),
				("-123", "-123"),
				("1.23e+5", "1.23e+5"),
				# Thousands separators must not truncate the number.
				("The answer is 1,024", "1024"),
				("1,234,567.5", "1234567.5"),
				# Unicode minus sign (U+2212) must keep the sign.
				("\u2212123", "-123"),
			]

			failed_extractions = []

			for response, expected in test_responses:
				extracted = self.extract_numeric_answer(response)
				if extracted == expected:
					logger.info(f"✅ Extraction passed: '{response}' → '{extracted}'")
					continue

				failed_extractions.append({
					'response': response,
					'expected': expected,
					'extracted': extracted
				})
				logger.error(f"❌ Extraction failed: '{response}' → Expected: '{expected}', Got: '{extracted}'")

			# Assert perfect extraction
			assert not failed_extractions, f"Answer extraction must be 100% accurate. Failed {len(failed_extractions)} extractions"


	if __name__ == "__main__":
		# Run this file's tests verbosely (-v) with output capture disabled (-s),
		# and pass pytest's exit code on to the caller so that a failing run does
		# not exit with status 0.
		sys.exit(pytest.main([__file__, "-v", "-s"]))