""" | |
Performance Benchmark Test Suite for GAIA Agent | |
Measures response time, accuracy, and reliability metrics to ensure 90%+ accuracy target. | |
This module provides comprehensive performance testing including: | |
1. Response time benchmarking | |
2. Accuracy measurement across question types | |
3. Reliability and consistency testing | |
4. Tool usage efficiency analysis | |
5. Memory and resource usage monitoring | |
""" | |
import pytest
import sys
import os
import time
import statistics
import psutil
import threading
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed

# Add the deployment-ready directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent


@dataclass
class PerformanceMetrics:
    """Data class for storing performance metrics."""
    response_time: float
    accuracy: float
    memory_usage_mb: float
    cpu_usage_percent: float
    tool_calls: int
    success: bool
    error_message: Optional[str] = None


@dataclass
class BenchmarkResults:
    """Data class for storing benchmark results."""
    total_tests: int
    successful_tests: int
    failed_tests: int
    average_response_time: float
    median_response_time: float
    min_response_time: float
    max_response_time: float
    overall_accuracy: float
    memory_usage_stats: Dict[str, float]
    cpu_usage_stats: Dict[str, float]
    tool_usage_stats: Dict[str, int]
    category_performance: Dict[str, Dict[str, float]]


class PerformanceBenchmark:
    """Performance benchmark suite for GAIA Agent."""

    def __init__(self):
        """Initialize the performance benchmark."""
        self.agent = FixedGAIAAgent()
        self.metrics: List[PerformanceMetrics] = []

        # Performance thresholds
        self.max_response_time = 30.0  # 30 seconds
        self.target_accuracy = 0.9     # 90% accuracy
        self.max_memory_usage = 1000   # 1 GB, in MB
        self.max_cpu_usage = 80        # 80% CPU

        # Test questions for benchmarking
        self.benchmark_questions = self._get_benchmark_questions()

    def _get_benchmark_questions(self) -> List[Dict[str, Any]]:
        """Get standardized benchmark questions."""
        return [
            # Fast mathematical questions
            {
                'question': 'What is 25 * 17?',
                'expected': '425',
                'category': 'math_basic',
                'expected_time': 5.0
            },
            {
                'question': 'What is 144 / 12?',
                'expected': '12',
                'category': 'math_basic',
                'expected_time': 5.0
            },
            {
                'question': 'Calculate 2^8',
                'expected': '256',
                'category': 'math_basic',
                'expected_time': 5.0
            },
            # Medium complexity questions
            {
                'question': 'What is the factorial of 5?',
                'expected': '120',
                'category': 'math_medium',
                'expected_time': 10.0
            },
            {
                'question': 'What is the square root of 144?',
                'expected': '12',
                'category': 'math_medium',
                'expected_time': 10.0
            },
            # Knowledge questions
            {
                'question': 'What is the capital of France?',
                'expected': 'Paris',
                'category': 'knowledge',
                'expected_time': 15.0
            },
            {
                'question': 'In what year was the Eiffel Tower completed?',
                'expected': '1889',
                'category': 'knowledge',
                'expected_time': 15.0
            },
            # Complex questions
            {
                'question': 'Calculate the square root of 144, then multiply by 5',
                'expected': '60',
                'category': 'complex',
                'expected_time': 20.0
            }
        ]

    def measure_single_question_performance(self, question_data: Dict[str, Any]) -> PerformanceMetrics:
        """Measure performance for a single question."""
        question = question_data['question']
        expected = question_data['expected']
        category = question_data['category']

        # Get initial system metrics
        process = psutil.Process()
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB
        initial_cpu = process.cpu_percent()

        # Measure response time
        start_time = time.time()
        try:
            # Execute question
            answer = self.agent(question)
            success = True
            error_message = None

            # Validate accuracy
            accuracy = self._calculate_accuracy(answer, expected, category)
        except Exception as e:
            answer = None
            success = False
            error_message = str(e)
            accuracy = 0.0

        end_time = time.time()
        response_time = end_time - start_time

        # Get final system metrics
        final_memory = process.memory_info().rss / 1024 / 1024  # MB
        final_cpu = process.cpu_percent()

        memory_usage = final_memory - initial_memory
        cpu_usage = max(final_cpu - initial_cpu, 0)

        # Count tool calls (approximate)
        tool_calls = self._estimate_tool_calls(question, category)

        return PerformanceMetrics(
            response_time=response_time,
            accuracy=accuracy,
            memory_usage_mb=memory_usage,
            cpu_usage_percent=cpu_usage,
            tool_calls=tool_calls,
            success=success,
            error_message=error_message
        )

    def run_response_time_benchmark(self) -> Dict[str, float]:
        """Run response time benchmark across all question types."""
        print("🚀 Running Response Time Benchmark...")

        response_times = []
        category_times = {}

        for question_data in self.benchmark_questions:
            category = question_data['category']
            expected_time = question_data['expected_time']

            print(f"⏱️ Testing: {question_data['question'][:50]}...")
            metrics = self.measure_single_question_performance(question_data)

            response_times.append(metrics.response_time)
            if category not in category_times:
                category_times[category] = []
            category_times[category].append(metrics.response_time)

            # Check against expected time
            if metrics.response_time > expected_time:
                print(f"⚠️ Slower than expected: {metrics.response_time:.2f}s > {expected_time}s")
            else:
                print(f"✅ Within expected time: {metrics.response_time:.2f}s <= {expected_time}s")

        # Calculate statistics
        avg_time = statistics.mean(response_times)
        median_time = statistics.median(response_times)
        min_time = min(response_times)
        max_time = max(response_times)

        print(f"\n📊 Response Time Statistics:")
        print(f"Average: {avg_time:.2f}s")
        print(f"Median: {median_time:.2f}s")
        print(f"Min: {min_time:.2f}s")
        print(f"Max: {max_time:.2f}s")

        # Category breakdown
        print(f"\n📊 Category Breakdown:")
        for category, times in category_times.items():
            cat_avg = statistics.mean(times)
            print(f"{category}: {cat_avg:.2f}s avg")

        return {
            'average': avg_time,
            'median': median_time,
            'min': min_time,
            'max': max_time,
            'category_averages': {cat: statistics.mean(times) for cat, times in category_times.items()}
        }

    def run_accuracy_benchmark(self) -> Dict[str, float]:
        """Run accuracy benchmark across all question types."""
        print("🎯 Running Accuracy Benchmark...")

        total_questions = 0
        correct_answers = 0
        category_accuracy = {}

        for question_data in self.benchmark_questions:
            category = question_data['category']

            print(f"🔍 Testing: {question_data['question'][:50]}...")
            metrics = self.measure_single_question_performance(question_data)

            total_questions += 1
            if metrics.accuracy > 0.8:  # Consider >80% accuracy as correct
                correct_answers += 1
                print(f"✅ Correct answer (accuracy: {metrics.accuracy:.2f})")
            else:
                print(f"❌ Incorrect answer (accuracy: {metrics.accuracy:.2f})")

            # Track category accuracy
            if category not in category_accuracy:
                category_accuracy[category] = {'correct': 0, 'total': 0}
            category_accuracy[category]['total'] += 1
            if metrics.accuracy > 0.8:
                category_accuracy[category]['correct'] += 1

        # Calculate overall accuracy
        overall_accuracy = correct_answers / total_questions if total_questions > 0 else 0

        print(f"\n📊 Accuracy Statistics:")
        print(f"Overall Accuracy: {overall_accuracy:.2%}")
        print(f"Correct Answers: {correct_answers}/{total_questions}")

        # Category breakdown
        print(f"\n📊 Category Accuracy:")
        category_percentages = {}
        for category, stats in category_accuracy.items():
            cat_accuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            category_percentages[category] = cat_accuracy
            print(f"{category}: {cat_accuracy:.2%} ({stats['correct']}/{stats['total']})")

        return {
            'overall': overall_accuracy,
            'correct_count': correct_answers,
            'total_count': total_questions,
            'category_accuracy': category_percentages
        }

    def run_reliability_benchmark(self, iterations: int = 5) -> Dict[str, Any]:
        """Run reliability benchmark with multiple iterations."""
        print(f"🔄 Running Reliability Benchmark ({iterations} iterations)...")

        # Test the same question multiple times
        test_question = {
            'question': 'What is 25 * 17?',
            'expected': '425',
            'category': 'math_basic'
        }

        results = []
        response_times = []
        accuracies = []

        for i in range(iterations):
            print(f"🔄 Iteration {i+1}/{iterations}")
            metrics = self.measure_single_question_performance(test_question)
            results.append(metrics)
            response_times.append(metrics.response_time)
            accuracies.append(metrics.accuracy)

        # Calculate consistency metrics
        time_std = statistics.stdev(response_times) if len(response_times) > 1 else 0
        time_cv = time_std / statistics.mean(response_times) if statistics.mean(response_times) > 0 else 0
        accuracy_std = statistics.stdev(accuracies) if len(accuracies) > 1 else 0
        success_rate = sum(1 for r in results if r.success) / len(results)

        print(f"\n📊 Reliability Statistics:")
        print(f"Success Rate: {success_rate:.2%}")
        print(f"Response Time CV: {time_cv:.2%}")
        print(f"Accuracy Std Dev: {accuracy_std:.3f}")

        return {
            'success_rate': success_rate,
            'response_time_consistency': time_cv,
            'accuracy_consistency': accuracy_std,
            'iterations': iterations,
            'all_results': results
        }

    def run_concurrent_load_test(self, concurrent_requests: int = 3) -> Dict[str, Any]:
        """Run concurrent load test to measure performance under load."""
        print(f"⚡ Running Concurrent Load Test ({concurrent_requests} concurrent requests)...")

        test_question = {
            'question': 'What is 144 / 12?',
            'expected': '12',
            'category': 'math_basic'
        }

        def run_single_test():
            return self.measure_single_question_performance(test_question)

        start_time = time.time()

        # Run concurrent requests
        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(run_single_test) for _ in range(concurrent_requests)]
            results = [future.result() for future in as_completed(futures)]

        end_time = time.time()
        total_time = end_time - start_time

        # Analyze results
        success_count = sum(1 for r in results if r.success)
        avg_response_time = statistics.mean([r.response_time for r in results])
        max_response_time = max([r.response_time for r in results])
        throughput = concurrent_requests / total_time  # requests per second

        print(f"\n📊 Load Test Results:")
        print(f"Total Time: {total_time:.2f}s")
        print(f"Success Rate: {success_count}/{concurrent_requests} ({success_count/concurrent_requests:.2%})")
        print(f"Average Response Time: {avg_response_time:.2f}s")
        print(f"Max Response Time: {max_response_time:.2f}s")
        print(f"Throughput: {throughput:.2f} requests/second")

        return {
            'total_time': total_time,
            'success_rate': success_count / concurrent_requests,
            'average_response_time': avg_response_time,
            'max_response_time': max_response_time,
            'throughput': throughput,
            'concurrent_requests': concurrent_requests
        }

    def run_memory_usage_benchmark(self) -> Dict[str, float]:
        """Run memory usage benchmark."""
        print("💾 Running Memory Usage Benchmark...")

        process = psutil.Process()
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB
        memory_measurements = [initial_memory]

        # Run several questions and monitor memory
        for question_data in self.benchmark_questions[:5]:  # Test first 5 questions
            print(f"💾 Testing memory usage: {question_data['question'][:30]}...")

            before_memory = process.memory_info().rss / 1024 / 1024
            metrics = self.measure_single_question_performance(question_data)
            after_memory = process.memory_info().rss / 1024 / 1024

            memory_measurements.append(after_memory)
            print(f"Memory: {before_memory:.1f}MB → {after_memory:.1f}MB (Δ{after_memory-before_memory:+.1f}MB)")

        final_memory = process.memory_info().rss / 1024 / 1024
        total_memory_increase = final_memory - initial_memory
        max_memory = max(memory_measurements)
        avg_memory = statistics.mean(memory_measurements)

        print(f"\n📊 Memory Usage Statistics:")
        print(f"Initial Memory: {initial_memory:.1f}MB")
        print(f"Final Memory: {final_memory:.1f}MB")
        print(f"Total Increase: {total_memory_increase:+.1f}MB")
        print(f"Peak Memory: {max_memory:.1f}MB")
        print(f"Average Memory: {avg_memory:.1f}MB")

        return {
            'initial_memory_mb': initial_memory,
            'final_memory_mb': final_memory,
            'total_increase_mb': total_memory_increase,
            'peak_memory_mb': max_memory,
            'average_memory_mb': avg_memory
        }

    def run_comprehensive_benchmark(self) -> BenchmarkResults:
        """Run comprehensive benchmark covering all aspects."""
        print("🚀 Running Comprehensive Performance Benchmark")
        print("=" * 60)

        # Run all benchmark components
        response_time_results = self.run_response_time_benchmark()
        accuracy_results = self.run_accuracy_benchmark()
        reliability_results = self.run_reliability_benchmark()
        load_test_results = self.run_concurrent_load_test()
        memory_results = self.run_memory_usage_benchmark()

        # Compile comprehensive results
        results = BenchmarkResults(
            total_tests=len(self.benchmark_questions),
            successful_tests=accuracy_results['correct_count'],
            failed_tests=accuracy_results['total_count'] - accuracy_results['correct_count'],
            average_response_time=response_time_results['average'],
            median_response_time=response_time_results['median'],
            min_response_time=response_time_results['min'],
            max_response_time=response_time_results['max'],
            overall_accuracy=accuracy_results['overall'],
            memory_usage_stats=memory_results,
            cpu_usage_stats={'average': 0, 'peak': 0},  # Would need more detailed CPU monitoring
            tool_usage_stats={},  # Would need tool call tracking
            category_performance={
                cat: {'accuracy': acc, 'avg_time': response_time_results['category_averages'].get(cat, 0)}
                for cat, acc in accuracy_results['category_accuracy'].items()
            }
        )

        # Print comprehensive summary
        print("\n📊 COMPREHENSIVE BENCHMARK RESULTS")
        print("=" * 60)
        print(f"📊 Overall Performance:")
        print(f"   Accuracy: {results.overall_accuracy:.2%} (Target: {self.target_accuracy:.2%})")
        print(f"   Average Response Time: {results.average_response_time:.2f}s (Limit: {self.max_response_time}s)")
        print(f"   Success Rate: {results.successful_tests}/{results.total_tests}")

        print(f"\n⏱️ Response Time Analysis:")
        print(f"   Average: {results.average_response_time:.2f}s")
        print(f"   Median: {results.median_response_time:.2f}s")
        print(f"   Range: {results.min_response_time:.2f}s - {results.max_response_time:.2f}s")

        print(f"\n💾 Memory Usage:")
        print(f"   Peak: {memory_results['peak_memory_mb']:.1f}MB")
        print(f"   Average: {memory_results['average_memory_mb']:.1f}MB")
        print(f"   Total Increase: {memory_results['total_increase_mb']:+.1f}MB")

        print(f"\n🔄 Reliability:")
        print(f"   Success Rate: {reliability_results['success_rate']:.2%}")
        print(f"   Response Time Consistency: {reliability_results['response_time_consistency']:.2%}")

        print(f"\n⚡ Load Performance:")
        print(f"   Concurrent Success Rate: {load_test_results['success_rate']:.2%}")
        print(f"   Throughput: {load_test_results['throughput']:.2f} req/s")

        # Validate against targets
        meets_accuracy_target = results.overall_accuracy >= self.target_accuracy
        meets_response_time_target = results.average_response_time <= self.max_response_time
        meets_memory_target = memory_results['peak_memory_mb'] <= self.max_memory_usage

        print(f"\n✅ Target Validation:")
        print(f"   Accuracy Target: {'✅ PASS' if meets_accuracy_target else '❌ FAIL'}")
        print(f"   Response Time Target: {'✅ PASS' if meets_response_time_target else '❌ FAIL'}")
        print(f"   Memory Usage Target: {'✅ PASS' if meets_memory_target else '❌ FAIL'}")

        overall_pass = meets_accuracy_target and meets_response_time_target and meets_memory_target
        print(f"\n🎯 OVERALL RESULT: {'✅ PASS - READY FOR GAIA EVALUATION' if overall_pass else '❌ FAIL - NEEDS OPTIMIZATION'}")

        return results

    def _calculate_accuracy(self, actual: Optional[str], expected: str, category: str) -> float:
        """Calculate accuracy score for an answer."""
        if not actual or actual == "unknown":
            return 0.0

        actual_clean = actual.strip().lower()
        expected_clean = expected.strip().lower()

        # Exact match
        if actual_clean == expected_clean:
            return 1.0

        # Numeric comparison for math questions
        if category.startswith('math'):
            try:
                actual_num = float(actual.replace(',', ''))
                expected_num = float(expected.replace(',', ''))
                if abs(actual_num - expected_num) < 0.01:
                    return 1.0
                else:
                    return 0.0
            except ValueError:
                pass

        # Partial match for text answers
        if expected_clean in actual_clean or actual_clean in expected_clean:
            return 0.8

        return 0.0

    def _estimate_tool_calls(self, question: str, category: str) -> int:
        """Estimate number of tool calls based on question type."""
        if category.startswith('math'):
            return 1  # Usually calculator or python
        elif category == 'knowledge':
            return 2  # Usually wikipedia + processing
        elif category == 'complex':
            return 3  # Multiple tools
        else:
            return 1
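

# The comprehensive benchmark currently reports placeholder CPU statistics
# (cpu_usage_stats={'average': 0, 'peak': 0}). A background sampler along the
# lines of the sketch below could fill that gap. This is an illustrative,
# untested sketch: the class name and sampling interval are assumptions, not
# part of the original suite. It only reuses the already-imported psutil,
# threading, and statistics modules.
class CPUSampler:
    """Sample process CPU usage in a background thread (illustrative sketch)."""

    def __init__(self, interval: float = 0.5):
        self._interval = interval
        self._process = psutil.Process()
        self._samples: List[float] = []
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None

    def __enter__(self) -> "CPUSampler":
        # Prime cpu_percent() so later calls report usage since the previous call.
        self._process.cpu_percent()
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self._stop_event.set()
        if self._thread is not None:
            self._thread.join()

    def _run(self) -> None:
        # Collect a CPU sample every `interval` seconds until stopped.
        while not self._stop_event.wait(self._interval):
            self._samples.append(self._process.cpu_percent())

    def stats(self) -> Dict[str, float]:
        """Return average and peak CPU usage over the sampled window."""
        if not self._samples:
            return {'average': 0.0, 'peak': 0.0}
        return {'average': statistics.mean(self._samples), 'peak': max(self._samples)}


# Possible usage inside run_comprehensive_benchmark (not wired in here):
#
#     with CPUSampler() as sampler:
#         accuracy_results = self.run_accuracy_benchmark()
#     cpu_usage_stats = sampler.stats()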


class TestPerformanceBenchmark:
    """Test suite for performance benchmarking."""

    def setup_method(self):
        """Set up test fixtures."""
        self.benchmark = PerformanceBenchmark()

    def test_agent_availability(self):
        """Test that the agent is available for benchmarking."""
        assert self.benchmark.agent is not None, "Agent should be initialized"
        assert self.benchmark.agent.available, "Agent should be available"

    def test_response_time_benchmark(self):
        """Test response time benchmark."""
        if not self.benchmark.agent.available:
            pytest.skip("Agent not available for benchmarking")

        results = self.benchmark.run_response_time_benchmark()

        # Validate results structure
        assert 'average' in results
        assert 'median' in results
        assert 'min' in results
        assert 'max' in results

        # Validate performance thresholds
        assert results['average'] <= self.benchmark.max_response_time, f"Average response time {results['average']:.2f}s exceeds limit"
        assert results['max'] <= self.benchmark.max_response_time * 2, f"Max response time {results['max']:.2f}s too high"

        print(f"✅ Response time benchmark passed - Average: {results['average']:.2f}s")

    def test_accuracy_benchmark(self):
        """Test accuracy benchmark."""
        if not self.benchmark.agent.available:
            pytest.skip("Agent not available for benchmarking")

        results = self.benchmark.run_accuracy_benchmark()

        # Validate results structure
        assert 'overall' in results
        assert 'correct_count' in results
        assert 'total_count' in results

        # Validate accuracy threshold
        assert results['overall'] >= 0.5, f"Accuracy {results['overall']:.2%} too low for basic functionality"

        print(f"✅ Accuracy benchmark completed - Overall: {results['overall']:.2%}")

    def test_reliability_benchmark(self):
        """Test reliability benchmark."""
        if not self.benchmark.agent.available:
            pytest.skip("Agent not available for benchmarking")

        results = self.benchmark.run_reliability_benchmark(iterations=3)

        # Validate results structure
        assert 'success_rate' in results
        assert 'response_time_consistency' in results

        # Validate reliability thresholds
        assert results['success_rate'] >= 0.8, f"Success rate {results['success_rate']:.2%} too low"
        assert results['response_time_consistency'] <= 0.5, f"Response time too inconsistent: {results['response_time_consistency']:.2%}"

        print(f"✅ Reliability benchmark passed - Success rate: {results['success_rate']:.2%}")

    def test_memory_usage_benchmark(self):
        """Test memory usage benchmark."""
        if not self.benchmark.agent.available:
            pytest.skip("Agent not available for benchmarking")

        results = self.benchmark.run_memory_usage_benchmark()

        # Validate results structure
        assert 'peak_memory_mb' in results
        assert 'total_increase_mb' in results

        # Validate memory usage
        assert results['peak_memory_mb'] <= self.benchmark.max_memory_usage, f"Peak memory {results['peak_memory_mb']:.1f}MB exceeds limit"

        print(f"✅ Memory usage benchmark passed - Peak: {results['peak_memory_mb']:.1f}MB")

    def test_comprehensive_benchmark(self):
        """Test comprehensive benchmark suite."""
        if not self.benchmark.agent.available:
            pytest.skip("Agent not available for benchmarking")

        results = self.benchmark.run_comprehensive_benchmark()

        # Validate comprehensive results
        assert isinstance(results, BenchmarkResults)
        assert results.total_tests > 0
        assert results.overall_accuracy >= 0.0
        assert results.average_response_time > 0.0

        # Log final results
        print(f"✅ Comprehensive benchmark completed")
        print(f"   Accuracy: {results.overall_accuracy:.2%}")
        print(f"   Avg Response Time: {results.average_response_time:.2f}s")
        print(f"   Success Rate: {results.successful_tests}/{results.total_tests}")
if __name__ == "__main__": | |
# Run performance benchmarks | |
benchmark = PerformanceBenchmark() | |
if benchmark.agent.available: | |
print("π Starting Performance Benchmark Suite") | |
results = benchmark.run_comprehensive_benchmark() | |
# Save results to file | |
import json | |
results_dict = { | |
'total_tests': results.total_tests, | |
'successful_tests': results.successful_tests, | |
'failed_tests': results.failed_tests, | |
'overall_accuracy': results.overall_accuracy, | |
'average_response_time': results.average_response_time, | |
'median_response_time': results.median_response_time, | |
'min_response_time': results.min_response_time, | |
'max_response_time': results.max_response_time, | |
'memory_usage_stats': results.memory_usage_stats, | |
'category_performance': results.category_performance | |
} | |
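        # Note: because BenchmarkResults is a dataclass, dataclasses.asdict(results)
        # could build this mapping automatically; the explicit dict is kept here so
        # the persisted fields stay easy to audit.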
        with open('benchmark_results.json', 'w') as f:
            json.dump(results_dict, f, indent=2)

        print(f"\n💾 Results saved to benchmark_results.json")
    else:
        print("❌ Agent not available - cannot run benchmarks")

    # Also run pytest tests
    pytest.main([__file__, "-v"])
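
# Example usage (the file path below is an assumption and may differ in this repository):
#
#     # Run a single benchmark component from a Python shell:
#     from test_performance_benchmark import PerformanceBenchmark
#     bench = PerformanceBenchmark()
#     bench.run_response_time_benchmark()
#
#     # Or run only the reliability test via pytest's keyword filter:
#     pytest test_performance_benchmark.py -k reliability -v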