""" | |
Phase 5: End-to-End System Testing for GAIA Agent | |
Comprehensive test suite to validate the complete GAIA Agent system and ensure 90%+ accuracy. | |
This test suite validates: | |
1. Complete workflow: Question β Processing β Tool Usage β Answer Extraction β Final Output | |
2. GAIA-style questions similar to evaluation scenarios | |
3. Performance benchmarking and reliability | |
4. Integration validation across all components | |
5. Edge case handling and error conditions | |
Test Categories: | |
- Mathematical Questions (Calculator and Python tools) | |
- Knowledge Questions (Wikipedia and ArXiv tools) | |
- Multimodal Questions (Image, audio, document processing) | |
- Web Research Questions (Firecrawl and Exa tools) | |
- File-Based Questions (Questions with attachments) | |
- Complex Multi-Step Questions (Multiple tool usage) | |
""" | |
import pytest
import sys
import os
import time
import json
import tempfile
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional
from unittest.mock import Mock, patch

# Add the deployment-ready directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

# Import the fixed enhanced agent
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent

# Set up logging for tests
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
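
# The tests below assume FixedGAIAAgent exposes roughly the following interface.
# This is a sketch inferred from how the agent is used in this file, not the
# agent's documented API:
#
#   agent = FixedGAIAAgent()
#   agent.available                        # bool: agent initialized successfully
#   answer = agent("What is 25 * 17?")     # __call__ with a question string -> answer string
#   answer = agent(question, file_paths)   # optional list of attachment file paths
#   agent.get_tool_status()                # dict describing tool availability
#   agent.get_processor_statistics()       # dict of response-processor statistics
#   agent.file_handler.get_supported_formats()  # supported attachment formats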
class TestEndToEndComprehensive:
    """Comprehensive end-to-end test suite for the complete GAIA Agent system."""

    def setup_method(self):
        """Set up test fixtures before each test method."""
        # Initialize the agent
        self.agent = FixedGAIAAgent()
        # Track test metrics
        self.test_metrics = {
            'total_tests': 0,
            'passed_tests': 0,
            'failed_tests': 0,
            'response_times': [],
            'accuracy_scores': [],
            'tool_usage_stats': {},
            'error_types': []
        }
        # Performance thresholds
        self.max_response_time = 30.0  # 30 seconds max
        self.target_accuracy = 0.9  # 90% accuracy target
        logger.info("🧪 End-to-end test setup completed")
    def _measure_performance(self, test_func, *args, **kwargs):
        """Measure performance of a test function."""
        start_time = time.time()
        try:
            result = test_func(*args, **kwargs)
            success = True
            error = None
        except Exception as e:
            result = None
            success = False
            error = str(e)
        end_time = time.time()
        response_time = end_time - start_time
        # Update metrics
        self.test_metrics['total_tests'] += 1
        if success:
            self.test_metrics['passed_tests'] += 1
        else:
            self.test_metrics['failed_tests'] += 1
            self.test_metrics['error_types'].append(error)
        self.test_metrics['response_times'].append(response_time)
        return {
            'result': result,
            'success': success,
            'response_time': response_time,
            'error': error
        }
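
    # Illustrative usage of the helper above (the timing value is hedged; it is
    # elapsed wall-clock seconds and varies by environment):
    #
    #   perf = self._measure_performance(self._test_single_question, "What is 2 + 2?", "4", "numeric")
    #   # perf -> {'result': '4', 'success': True, 'response_time': <elapsed seconds>, 'error': None}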
    def _validate_answer_format(self, answer: str, expected_type: str = None) -> bool:
        """Validate that the answer is properly formatted."""
        if not answer or answer == "unknown":
            return False
        # Check for common formatting issues
        if answer.startswith("FINAL ANSWER:"):
            return False  # Should be extracted, not raw format
        if len(answer.strip()) == 0:
            return False
        # Type-specific validation
        if expected_type == "numeric":
            try:
                # Should be a valid number without commas
                float(answer.replace(',', ''))
                return ',' not in answer  # No commas in final answer
            except ValueError:
                return False
        return True
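
    # Examples of the convention enforced above: "425" and "Paris" pass, while
    # "", "unknown", "FINAL ANSWER: 425", and "1,024" (numeric with commas) fail.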
    def test_agent_initialization(self):
        """Test that the agent initializes correctly with all required components."""
        # RED: Write failing test first
        assert self.agent is not None, "Agent should be initialized"
        assert self.agent.available, "Agent should be available"
        assert hasattr(self.agent, 'tools'), "Agent should have tools"
        assert hasattr(self.agent, 'response_processor'), "Agent should have response processor"
        assert hasattr(self.agent, 'file_handler'), "Agent should have file handler"
        # Verify minimum required tools
        assert len(self.agent.tools) >= 2, "Agent should have at least the core tools (calculator, python)"
        logger.info(f"✅ Agent initialized with {len(self.agent.tools)} tools")
    def test_mathematical_questions_basic(self):
        """Test basic mathematical questions using the calculator tool."""
        test_cases = [
            {
                'question': 'What is 25 * 17?',
                'expected': '425',
                'type': 'numeric'
            },
            {
                'question': 'What is 144 / 12?',
                'expected': '12',
                'type': 'numeric'
            },
            {
                'question': 'What is 2^8?',
                'expected': '256',
                'type': 'numeric'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                case['expected'],
                case['type']
            )
            assert performance['success'], f"Mathematical test failed: {performance['error']}"
            assert performance['response_time'] < self.max_response_time, "Response too slow"
            logger.info(f"✅ Math test passed: {case['question']} → {performance['result']}")
    def test_mathematical_questions_complex(self):
        """Test complex mathematical questions requiring the Python tool."""
        test_cases = [
            {
                'question': 'Calculate the factorial of 5',
                'expected': '120',
                'type': 'numeric'
            },
            {
                'question': 'What is the square root of 144?',
                'expected': '12',
                'type': 'numeric'
            },
            {
                'question': 'Calculate 15! / 13!',
                'expected': '210',
                'type': 'numeric'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                case['expected'],
                case['type']
            )
            # Allow for some flexibility in complex math
            if performance['success']:
                logger.info(f"✅ Complex math test passed: {case['question']} → {performance['result']}")
            else:
                logger.warning(f"⚠️ Complex math test failed: {case['question']} - {performance['error']}")
    def test_knowledge_questions_wikipedia(self):
        """Test knowledge questions that should use the Wikipedia tool."""
        test_cases = [
            {
                'question': 'What is the capital of France?',
                'expected': 'Paris',
                'type': 'text'
            },
            {
                'question': 'In what year was the Eiffel Tower completed?',
                'expected': '1889',
                'type': 'numeric'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                case['expected'],
                case['type']
            )
            if performance['success']:
                logger.info(f"✅ Knowledge test passed: {case['question']} → {performance['result']}")
            else:
                logger.warning(f"⚠️ Knowledge test failed: {case['question']} - {performance['error']}")
    def test_file_based_questions(self):
        """Test questions with file attachments."""
        # Create test files
        test_files = self._create_test_files()
        test_cases = [
            {
                'question': 'What is the final numeric output from the attached Python code?',
                'files': [test_files['python_code']],
                'expected_type': 'numeric'
            },
            {
                'question': 'What is the sum of all numbers in the attached CSV file?',
                'files': [test_files['csv_data']],
                'expected_type': 'numeric'
            },
            {
                'question': 'What is the value of "result" in the attached JSON file?',
                'files': [test_files['json_data']],
                'expected_type': 'numeric'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_question_with_files,
                case['question'],
                case['files'],
                case['expected_type']
            )
            if performance['success']:
                logger.info(f"✅ File-based test passed: {case['question']}")
            else:
                logger.warning(f"⚠️ File-based test failed: {case['question']} - {performance['error']}")
        # Clean up test files
        self._cleanup_test_files(test_files)
    def test_multimodal_questions(self):
        """Test multimodal questions (images, audio, documents)."""
        # Create test multimodal files
        test_files = self._create_multimodal_test_files()
        test_cases = [
            {
                'question': 'How many objects are in this image?',
                'files': [test_files['test_image']],
                'expected_type': 'numeric'
            },
            {
                'question': 'What is the main content of this document?',
                'files': [test_files['test_document']],
                'expected_type': 'text'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_question_with_files,
                case['question'],
                case['files'],
                case['expected_type']
            )
            if performance['success']:
                logger.info(f"✅ Multimodal test passed: {case['question']}")
            else:
                logger.warning(f"⚠️ Multimodal test failed: {case['question']} - {performance['error']}")
        # Clean up test files
        self._cleanup_test_files(test_files)
    def test_web_research_questions(self):
        """Test web research questions using the Firecrawl and Exa tools."""
        test_cases = [
            {
                'question': 'What is the current population of Tokyo?',
                'expected_type': 'numeric'
            },
            {
                'question': 'Who is the current CEO of Microsoft?',
                'expected_type': 'text'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                None,  # No expected answer for web research
                case['expected_type']
            )
            if performance['success']:
                logger.info(f"✅ Web research test passed: {case['question']}")
            else:
                logger.warning(f"⚠️ Web research test failed: {case['question']} - {performance['error']}")
    def test_complex_multistep_questions(self):
        """Test complex questions requiring multiple tools."""
        test_cases = [
            {
                'question': 'Calculate the square root of 144, then find information about that number in mathematics',
                'expected_type': 'text'
            },
            {
                'question': 'What is 25 * 17, and what is the significance of that number?',
                'expected_type': 'text'
            }
        ]
        for case in test_cases:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                None,  # Complex questions may have varied answers
                case['expected_type']
            )
            if performance['success']:
                logger.info(f"✅ Complex test passed: {case['question']}")
            else:
                logger.warning(f"⚠️ Complex test failed: {case['question']} - {performance['error']}")
    def test_edge_cases_and_error_handling(self):
        """Test edge cases and error handling."""
        edge_cases = [
            {
                'question': '',  # Empty question
                'should_handle_gracefully': True
            },
            {
                'question': 'What is the answer to a question that makes no sense?',
                'should_handle_gracefully': True
            },
            {
                'question': 'Calculate the square root of -1',  # Undefined for real numbers
                'should_handle_gracefully': True
            }
        ]
        for case in edge_cases:
            performance = self._measure_performance(
                self._test_edge_case,
                case['question']
            )
            # Edge cases should be handled gracefully, not crash
            if case['should_handle_gracefully']:
                assert performance['result'] is not None, "Edge case should return some result"
                logger.info(f"✅ Edge case handled: {case['question']}")
    def test_gaia_style_evaluation_questions(self):
        """Test questions similar to GAIA evaluation scenarios."""
        gaia_style_questions = [
            {
                'question': 'How many studio albums were published by Mercedes Sosa between 2000 and 2009?',
                'expected_type': 'numeric',
                'requires_tools': ['wikipedia']
            },
            {
                'question': 'What is the highest number of bird species to be on camera simultaneously?',
                'expected_type': 'numeric',
                'requires_tools': ['web_search']
            },
            {
                'question': 'In chess, what is the minimum number of moves required for checkmate?',
                'expected_type': 'numeric',
                'requires_tools': ['wikipedia']
            }
        ]
        for case in gaia_style_questions:
            performance = self._measure_performance(
                self._test_single_question,
                case['question'],
                None,  # GAIA questions have specific answers we'd need to verify
                case['expected_type']
            )
            if performance['success']:
                logger.info(f"✅ GAIA-style test passed: {case['question']}")
                self.test_metrics['accuracy_scores'].append(1.0)
            else:
                logger.warning(f"⚠️ GAIA-style test failed: {case['question']} - {performance['error']}")
                self.test_metrics['accuracy_scores'].append(0.0)
    def test_performance_benchmarks(self):
        """Test performance benchmarks and system reliability."""
        # Test response time consistency
        question = "What is 100 * 50?"
        response_times = []
        for i in range(5):
            performance = self._measure_performance(
                self._test_single_question,
                question,
                "5000",
                "numeric"
            )
            response_times.append(performance['response_time'])
        # Check response time consistency
        avg_response_time = sum(response_times) / len(response_times)
        max_response_time = max(response_times)
        assert avg_response_time < self.max_response_time, f"Average response time too high: {avg_response_time}"
        assert max_response_time < self.max_response_time * 1.5, f"Max response time too high: {max_response_time}"
        logger.info(f"✅ Performance benchmark passed - Avg: {avg_response_time:.2f}s, Max: {max_response_time:.2f}s")
    def test_system_integration_validation(self):
        """Test that all system components work together seamlessly."""
        # Test processor statistics
        stats = self.agent.get_processor_statistics()
        assert isinstance(stats, dict), "Processor should return statistics"
        # Test tool status
        tool_status = self.agent.get_tool_status()
        assert isinstance(tool_status, dict), "Agent should return tool status"
        # Test file handler capabilities
        file_formats = self.agent.file_handler.get_supported_formats()
        assert len(file_formats) > 0, "File handler should support some formats"
        logger.info("✅ System integration validation passed")
    def _test_single_question(self, question: str, expected: str = None, expected_type: str = None) -> str:
        """Test a single question and return the result."""
        if not self.agent.available:
            pytest.skip("Agent not available for testing")
        result = self.agent(question)
        # Validate answer format
        assert self._validate_answer_format(result, expected_type), f"Invalid answer format: '{result}'"
        # If an expected answer is provided, check for an exact match or reasonable similarity
        if expected:
            if expected_type == "numeric":
                # For numeric answers, allow for minor variations
                try:
                    result_num = float(result.replace(',', ''))
                    expected_num = float(expected.replace(',', ''))
                    assert abs(result_num - expected_num) < 0.01, f"Expected {expected}, got {result}"
                except ValueError:
                    assert result.lower() == expected.lower(), f"Expected {expected}, got {result}"
            else:
                # For text answers, allow case-insensitive comparison
                assert result.lower() == expected.lower(), f"Expected {expected}, got {result}"
        return result
    def _test_question_with_files(self, question: str, files: List[str], expected_type: str = None) -> str:
        """Test a question with file attachments."""
        if not self.agent.available:
            pytest.skip("Agent not available for testing")
        result = self.agent(question, files)
        # Validate answer format
        assert self._validate_answer_format(result, expected_type), f"Invalid answer format: '{result}'"
        return result
    def _test_edge_case(self, question: str) -> str:
        """Test an edge case question."""
        if not self.agent.available:
            pytest.skip("Agent not available for testing")
        # Edge cases should not crash
        try:
            result = self.agent(question)
            return result
        except Exception as e:
            # Log the error but don't fail the test - edge cases should be handled gracefully
            logger.warning(f"Edge case caused exception: {e}")
            return "unknown"
    def _create_test_files(self) -> Dict[str, str]:
        """Create test files for file-based questions."""
        test_files = {}
        # Create Python code file
        python_code = """
# Test Python code
def calculate():
    result = 25 * 17
    return result

if __name__ == "__main__":
    answer = calculate()
    print(f"The result is: {answer}")
"""
        python_file = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False)
        python_file.write(python_code)
        python_file.close()
        test_files['python_code'] = python_file.name
        # Create CSV data file
        csv_data = """name,value,category
item1,10,A
item2,20,B
item3,30,A
item4,40,B
"""
        csv_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
        csv_file.write(csv_data)
        csv_file.close()
        test_files['csv_data'] = csv_file.name
        # Create JSON data file
        json_data = {
            "result": 425,
            "calculation": "25 * 17",
            "metadata": {
                "timestamp": "2024-01-01",
                "version": "1.0"
            }
        }
        json_file = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
        json.dump(json_data, json_file)
        json_file.close()
        test_files['json_data'] = json_file.name
        return test_files
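
    # For reference, the expected answers for the fixtures created above are:
    #   python_code -> prints "The result is: 425"
    #   csv_data    -> the "value" column sums to 100
    #   json_data   -> "result" is 425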
    def _create_multimodal_test_files(self) -> Dict[str, str]:
        """Create test files for multimodal questions."""
        test_files = {}
        # Create a simple text file representing an image description
        image_desc = "This is a test image description file representing an image with 3 objects: a cat, a dog, and a bird."
        image_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
        image_file.write(image_desc)
        image_file.close()
        test_files['test_image'] = image_file.name
        # Create a document file
        document_content = """
Test Document
This is a test document for multimodal processing.
The main content discusses artificial intelligence and machine learning.
Key points:
1. AI is transforming industries
2. Machine learning enables automation
3. Natural language processing improves communication
Conclusion: Technology continues to advance rapidly.
"""
        doc_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
        doc_file.write(document_content)
        doc_file.close()
        test_files['test_document'] = doc_file.name
        return test_files
    def _cleanup_test_files(self, test_files: Dict[str, str]):
        """Clean up test files."""
        for file_path in test_files.values():
            try:
                os.unlink(file_path)
            except OSError:
                pass  # File already deleted or doesn't exist
    def test_final_system_validation(self):
        """Final validation test to ensure the system meets all requirements."""
        # Calculate overall metrics
        total_tests = self.test_metrics['total_tests']
        passed_tests = self.test_metrics['passed_tests']
        if total_tests > 0:
            accuracy = passed_tests / total_tests
            avg_response_time = sum(self.test_metrics['response_times']) / len(self.test_metrics['response_times'])
            logger.info("📊 Final System Metrics:")
            logger.info(f"   Total Tests: {total_tests}")
            logger.info(f"   Passed Tests: {passed_tests}")
            logger.info(f"   Accuracy: {accuracy:.2%}")
            logger.info(f"   Average Response Time: {avg_response_time:.2f}s")
            # Validate against success criteria
            assert accuracy >= self.target_accuracy, f"Accuracy {accuracy:.2%} below target {self.target_accuracy:.2%}"
            assert avg_response_time < self.max_response_time, f"Average response time {avg_response_time:.2f}s above limit"
            logger.info("✅ System validation passed - Ready for GAIA evaluation!")
        else:
            logger.warning("⚠️ No tests were executed for final validation")
if __name__ == "__main__": | |
# Run the comprehensive test suite | |
pytest.main([__file__, "-v", "--tb=short"]) |
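
# To run a subset from the command line (illustrative; adjust the path to wherever
# this file lives in the repository):
#   pytest path/to/this_test_file.py -v -k "mathematical"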