# gaia-enhanced-agent/tests/test_answer_formatter_comprehensive.py
# GAIA Agent Deployment
# Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements
# Commit 9a6a4dc
"""
Comprehensive Test Suite for GAIA Answer Formatter
Phase 1: Answer Format Validation and Testing
Tests all response patterns identified in the evaluation results:
- Verbose explanations that need answer extraction
- Responses with "FINAL ANSWER:" format
- Edge cases like "just 25" patterns
- Numeric answers with unnecessary formatting
- Text answers with extra explanations
- Error responses and graceful handling
"""
import pytest
import sys
import os
# Add the deployment-ready directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter
class TestAnswerFormatterComprehensive:
    """Comprehensive test suite for the fixed GAIA answer formatter.

    Each test passes raw response text to ``self.formatter.format_answer``
    and checks the string it returns.
    """

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def _assert_cases(self, test_cases):
        """Format every input and assert it matches its expected answer.

        Args:
            test_cases: Iterable of ``(input_text, expected)`` pairs.

        Raises:
            AssertionError: At the first pair whose formatted result differs
                from ``expected``. The message names the input, the actual
                result and the expected value.
        """
        for input_text, expected in test_cases:
            result = self.formatter.format_answer(input_text)
            assert result == expected, f"Failed for input: '{input_text}' - got '{result}', expected '{expected}'"

    def test_verbose_explanation_extraction(self):
        """Test extraction from verbose explanations - the main failure pattern."""
        self._assert_cases([
            # Primary failure pattern from evaluation
            ("The final numeric output from the attached Python code is 16", "16"),
            ("Based on my analysis, the answer is clearly 42.", "42"),
            ("After processing the image, I found 3 objects.", "3"),
            ("The calculation shows that the result is 256.", "256"),
            ("Upon examination of the data, the value is 1024.", "1024"),
            # More complex verbose patterns
            ("After careful analysis of the provided data and considering all factors, the final answer is 789.", "789"),
            ("The image processing algorithm detected exactly 15 distinct objects in the scene.", "15"),
            ("Following the mathematical computation steps outlined above, we arrive at 2048.", "2048"),
        ])

    def test_final_answer_format_extraction(self):
        """Test extraction from proper FINAL ANSWER format."""
        self._assert_cases([
            # Standard FINAL ANSWER format
            ("FINAL ANSWER: 6", "6"),
            ("FINAL ANSWER: 42", "42"),
            ("FINAL ANSWER: The answer is 25", "25"),  # Should extract just the number
            # FINAL ANSWER with extra content
            ("Multiple lines of explanation\nFINAL ANSWER: 100\nExtra text after", "100"),
            ("Some reasoning here.\nFINAL ANSWER: 777", "777"),
            ("Complex analysis...\nFINAL ANSWER: Paris\nAdditional notes", "Paris"),
            # FINAL ANSWER with variations
            ("Final Answer: 123", "123"),
            ("FINAL ANSWER:456", "456"),
            ("FINAL ANSWER: The result is 999", "999"),  # Should extract just the number
        ])

    def test_simple_pattern_extraction(self):
        """Test extraction from simple patterns like 'just 25'."""
        self._assert_cases([
            # "just X" patterns
            ("The answer is just 25", "25"),
            ("It's just 42", "42"),
            ("just 100", "100"),
            ("Just Paris", "Paris"),
            # "answer is X" patterns
            ("The answer is 50", "50"),
            ("Answer is 75", "75"),
            ("The result is 200", "200"),
            ("Result is 300", "300"),
            # Numbers at end of text
            ("After all calculations: 999", "999"),
            ("The final value: 1234", "1234"),
            ("Conclusion 567", "567"),
        ])

    def test_numeric_formatting_cleanup(self):
        """Test cleanup of numeric answers with unnecessary formatting."""
        self._assert_cases([
            # Remove commas from numbers
            ("The answer is 1,234", "1234"),
            ("Result: 10,000", "10000"),
            ("FINAL ANSWER: 1,234,567", "1234567"),
            # Remove trailing periods from short answers
            ("42.", "42"),
            ("Paris.", "Paris"),
            ("100.", "100"),
            # Remove quotes
            ('"42"', "42"),
            ("'Paris'", "Paris"),
            ('"The answer is 25"', "25"),  # Should extract just the number from quoted text
            # Clean up prefixes
            ("Answer: 42", "42"),
            ("The answer is: 100", "100"),
            ("Result: Paris", "Paris"),
            ("The result is: 200", "200"),
        ])

    def test_error_response_handling(self):
        """Test graceful handling of error responses."""
        error_responses = [
            "I'm sorry, I am unable to process the image at the moment. Please try again later.",
            "Error: Unable to access the file.",
            "I cannot process this request.",
            "Sorry, there was an error processing your request.",
            "Unable to complete the analysis.",
        ]
        for error_response in error_responses:
            result = self.formatter.format_answer(error_response)
            # Should return something reasonable, not crash
            assert result is not None
            assert len(result) > 0
            # Should not return "unknown" for these specific error patterns
            # Instead should return a meaningful fallback
            assert result != "unknown" or len(error_response.strip()) == 0

    def test_complex_multiline_responses(self):
        """Test extraction from complex multiline responses."""
        # The fixtures are spelled out with explicit "\n" so their runtime
        # value does not depend on how this file is indented.
        code_execution = (
            "\n"
            "Here's the Python code execution:\n"
            "```python\n"
            "result = 2 + 2\n"
            "print(result)\n"
            "```\n"
            "Output: 4\n"
            "The final numeric output from the attached Python code is 4\n"
        )
        data_analysis = (
            "\n"
            "Data Analysis Results:\n"
            "- Mean: 45.6\n"
            "- Median: 42\n"
            "- Mode: 38\n"
            "Based on the statistical analysis, the answer is 42.\n"
        )
        step_by_step = (
            "\n"
            "Step 1: Calculate the base value\n"
            "Step 2: Apply the multiplier\n"
            "Step 3: Add the offset\n"
            "Final calculation: 150\n"
        )
        self._assert_cases([
            # Code execution with output
            (code_execution, "4"),
            # Data analysis response
            (data_analysis, "42"),
            # Step-by-step reasoning
            (step_by_step, "150"),
        ])

    def test_edge_cases_and_malformed_input(self):
        """Test edge cases and malformed input handling."""
        self._assert_cases([
            # Empty or whitespace
            ("", "unknown"),
            (" ", "unknown"),
            ("\n\n\n", "unknown"),
            # Only punctuation or symbols
            ("...", "..."),
            ("???", "???"),
            ("!!!", "!!!"),
            # Very long responses
            ("A" * 1000 + " The answer is 42", "42"),
            # Multiple numbers - should pick the most relevant
            ("There are 5 cats, 10 dogs, and the answer is 15", "15"),
            ("Values: 1, 2, 3, 4, 5. Final: 5", "5"),
        ])

    def test_text_answers_with_explanations(self):
        """Test extraction of text answers with extra explanations."""
        self._assert_cases([
            # City/location answers
            ("After analyzing the geographical data, the city is Paris", "Paris"),
            ("The location mentioned in the document is London", "London"),
            ("Based on the coordinates, this is New York", "New York"),
            # Name answers
            ("The author of this work is Shakespeare", "Shakespeare"),
            ("According to the records, the name is Einstein", "Einstein"),
            # Yes/No answers
            ("After careful consideration, the answer is yes", "yes"),
            ("Based on the evidence, the answer is no", "no"),
            # Color answers
            ("The dominant color in the image is blue", "blue"),
            ("Analysis shows the color is red", "red"),
        ])

    def test_fallback_mechanisms(self):
        """Test fallback mechanisms when FINAL ANSWER format is not present."""
        self._assert_cases([
            # Should extract from last meaningful line
            ("Line 1\nLine 2\nThe answer is 42", "42"),
            # Should extract from first substantial content
            ("42\nSome explanation after", "42"),
            # Should handle mixed content
            ("# Header\n- Bullet point\nThe result is 100", "100"),
            # Should extract numbers when no clear pattern
            ("Some text with numbers 5, 10, 15", "15"),
        ])

    def test_performance_requirements(self):
        """Test that formatting operations complete within performance requirements."""
        import time

        # Test with a reasonably complex response
        complex_response = (
            "\n"
            "This is a complex response with multiple paragraphs and various content.\n"
            "First, let me analyze the data:\n"
            "- Point 1: Some analysis\n"
            "- Point 2: More analysis\n"
            "- Point 3: Even more analysis\n"
            "Then, I'll perform calculations:\n"
            "Step 1: 10 + 5 = 15\n"
            "Step 2: 15 * 2 = 30\n"
            "Step 3: 30 - 5 = 25\n"
            "Finally, based on all this analysis, the answer is 25.\n"
        )
        # perf_counter() is the clock intended for measuring short durations;
        # time.time() follows the wall clock, which can be adjusted while the
        # call is running and may have coarse resolution.
        start_time = time.perf_counter()
        result = self.formatter.format_answer(complex_response)
        elapsed = time.perf_counter() - start_time
        # Should complete in under 100ms as per requirements
        assert elapsed < 0.1, "Formatting took too long"
        assert result == "25", f"Expected '25', got '{result}'"

    def test_consistency_and_determinism(self):
        """Test that the formatter produces consistent results."""
        test_input = "The final numeric output from the attached Python code is 16"
        expected = "16"
        # Run the same input multiple times
        results = [self.formatter.format_answer(test_input) for _ in range(10)]
        # All results should match the expected answer
        assert all(r == expected for r in results), f"Inconsistent results: {results}"
        # All results should be the same
        assert len(set(results)) == 1, f"Non-deterministic results: {results}"
class TestAnswerFormatterIntegration:
    """Integration tests for the answer formatter with real-world scenarios."""

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def test_gaia_evaluation_patterns(self):
        """Test specific patterns from GAIA evaluation results."""
        # These are based on actual evaluation failures
        evaluation_patterns = [
            # Pattern 1: Verbose numeric explanations
            ("The final numeric output from the attached Python code is 16", "16"),
            ("After executing the code, the result is 42", "42"),
            ("The calculation yields 256", "256"),
            # Pattern 2: Image analysis responses
            ("I can see 3 objects in the image", "3"),
            ("The image contains 5 distinct elements", "5"),
            ("Analysis of the image reveals 7 items", "7"),
            # Pattern 3: Document processing responses
            ("The document mentions the year 1995", "1995"),
            ("According to the text, the value is 2024", "2024"),
            # Pattern 4: Mixed content with clear answers
            ("Based on my analysis of the provided data, considering all factors, the answer is clearly 789", "789"),
        ]
        for input_text, expected in evaluation_patterns:
            result = self.formatter.format_answer(input_text)
            assert result == expected, f"GAIA pattern failed - input: '{input_text}' - got '{result}', expected '{expected}'"

    def test_zero_false_positives(self):
        """Test that the formatter doesn't extract incorrect answers.

        Only the negative property is asserted: the formatted result must not
        be any of the digit runs that occur in the inputs. The ``expected``
        column marks which cases are subject to that check; it is not
        compared with the result directly.
        """
        import re

        # These should NOT extract numbers that aren't the actual answer
        non_answer_patterns = [
            ("I processed 5 files but couldn't find the answer", "unknown"),  # Should not return "5"
            ("After 10 attempts, I'm unable to determine the result", "unknown"),  # Should not return "10"
            ("The process took 30 seconds but failed", "unknown"),  # Should not return "30"
        ]
        # Collect the incidental numbers once, straight from the inputs, so
        # this list cannot drift from the cases above (currently 5, 10, 30).
        numbers_in_text = [
            number
            for input_text, _ in non_answer_patterns
            for number in re.findall(r"\d+", input_text)
        ]
        for input_text, expected in non_answer_patterns:
            result = self.formatter.format_answer(input_text)
            # The result should not be a number that appears in the text but isn't the answer
            if expected == "unknown":
                assert result not in numbers_in_text, f"False positive - extracted '{result}' from '{input_text}'"
if __name__ == "__main__":
    # Run the tests in this file verbosely and hand pytest's exit code to the
    # shell, so running the file as a script reports failures as non-zero.
    raise SystemExit(pytest.main([__file__, "-v"]))