"""
Comprehensive Test Suite for GAIA Answer Formatter

Phase 1: Answer Format Validation and Testing

Tests all response patterns identified in the evaluation results:
- Verbose explanations that need answer extraction
- Responses with "FINAL ANSWER:" format
- Edge cases like "just 25" patterns
- Numeric answers with unnecessary formatting
- Text answers with extra explanations
- Error responses and graceful handling
"""
import pytest | |
import sys | |
import os | |
# Add the deployment-ready directory to the path | |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) | |
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter | |
class TestAnswerFormatterComprehensive:
    """Comprehensive test suite for the fixed GAIA answer formatter.

    Every test feeds raw response text to
    ``FixedGAIAAnswerFormatter.format_answer`` and checks the string it
    returns. A fresh formatter is created for each test by ``setup_method``.
    """

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def _assert_cases(self, test_cases):
        """Format each input in *test_cases* and compare with its expected answer.

        Args:
            test_cases: iterable of ``(input_text, expected)`` pairs.

        The leading underscore keeps pytest from collecting this helper as a
        test of its own.
        """
        for input_text, expected in test_cases:
            result = self.formatter.format_answer(input_text)
            assert result == expected, f"Failed for input: '{input_text}' - got '{result}', expected '{expected}'"

    def test_verbose_explanation_extraction(self):
        """Test extraction from verbose explanations - the main failure pattern."""
        self._assert_cases([
            # Primary failure pattern from evaluation
            ("The final numeric output from the attached Python code is 16", "16"),
            ("Based on my analysis, the answer is clearly 42.", "42"),
            ("After processing the image, I found 3 objects.", "3"),
            ("The calculation shows that the result is 256.", "256"),
            ("Upon examination of the data, the value is 1024.", "1024"),
            # More complex verbose patterns
            ("After careful analysis of the provided data and considering all factors, the final answer is 789.", "789"),
            ("The image processing algorithm detected exactly 15 distinct objects in the scene.", "15"),
            ("Following the mathematical computation steps outlined above, we arrive at 2048.", "2048"),
        ])

    def test_final_answer_format_extraction(self):
        """Test extraction from proper FINAL ANSWER format."""
        self._assert_cases([
            # Standard FINAL ANSWER format
            ("FINAL ANSWER: 6", "6"),
            ("FINAL ANSWER: 42", "42"),
            ("FINAL ANSWER: The answer is 25", "25"),  # Should extract just the number
            # FINAL ANSWER with extra content
            ("Multiple lines of explanation\nFINAL ANSWER: 100\nExtra text after", "100"),
            ("Some reasoning here.\nFINAL ANSWER: 777", "777"),
            ("Complex analysis...\nFINAL ANSWER: Paris\nAdditional notes", "Paris"),
            # FINAL ANSWER with variations
            ("Final Answer: 123", "123"),
            ("FINAL ANSWER:456", "456"),
            ("FINAL ANSWER: The result is 999", "999"),  # Should extract just the number
        ])

    def test_simple_pattern_extraction(self):
        """Test extraction from simple patterns like 'just 25'."""
        self._assert_cases([
            # "just X" patterns
            ("The answer is just 25", "25"),
            ("It's just 42", "42"),
            ("just 100", "100"),
            ("Just Paris", "Paris"),
            # "answer is X" patterns
            ("The answer is 50", "50"),
            ("Answer is 75", "75"),
            ("The result is 200", "200"),
            ("Result is 300", "300"),
            # Numbers at end of text
            ("After all calculations: 999", "999"),
            ("The final value: 1234", "1234"),
            ("Conclusion 567", "567"),
        ])

    def test_numeric_formatting_cleanup(self):
        """Test cleanup of numeric answers with unnecessary formatting."""
        self._assert_cases([
            # Remove commas from numbers
            ("The answer is 1,234", "1234"),
            ("Result: 10,000", "10000"),
            ("FINAL ANSWER: 1,234,567", "1234567"),
            # Remove trailing periods from short answers
            ("42.", "42"),
            ("Paris.", "Paris"),
            ("100.", "100"),
            # Remove quotes
            ('"42"', "42"),
            ("'Paris'", "Paris"),
            ('"The answer is 25"', "25"),  # Should extract just the number from quoted text
            # Clean up prefixes
            ("Answer: 42", "42"),
            ("The answer is: 100", "100"),
            ("Result: Paris", "Paris"),
            ("The result is: 200", "200"),
        ])

    def test_error_response_handling(self):
        """Test graceful handling of error responses."""
        error_responses = [
            "I'm sorry, I am unable to process the image at the moment. Please try again later.",
            "Error: Unable to access the file.",
            "I cannot process this request.",
            "Sorry, there was an error processing your request.",
            "Unable to complete the analysis.",
        ]
        for error_response in error_responses:
            result = self.formatter.format_answer(error_response)
            # Should return something reasonable, not crash
            assert result is not None
            assert len(result) > 0
            # Should not return "unknown" for these specific error patterns
            # Instead should return a meaningful fallback
            assert result != "unknown" or len(error_response.strip()) == 0

    def test_complex_multiline_responses(self):
        """Test extraction from complex multiline responses."""
        # The fixtures are spelled as adjacent literals so that every newline
        # is explicit and no source indentation leaks into the input text.
        code_execution = (
            "\n"
            "Here's the Python code execution:\n"
            "```python\n"
            "result = 2 + 2\n"
            "print(result)\n"
            "```\n"
            "Output: 4\n"
            "The final numeric output from the attached Python code is 4\n"
        )
        data_analysis = (
            "\n"
            "Data Analysis Results:\n"
            "- Mean: 45.6\n"
            "- Median: 42\n"
            "- Mode: 38\n"
            "Based on the statistical analysis, the answer is 42.\n"
        )
        step_by_step = (
            "\n"
            "Step 1: Calculate the base value\n"
            "Step 2: Apply the multiplier\n"
            "Step 3: Add the offset\n"
            "Final calculation: 150\n"
        )
        self._assert_cases([
            # Code execution with output
            (code_execution, "4"),
            # Data analysis response
            (data_analysis, "42"),
            # Step-by-step reasoning
            (step_by_step, "150"),
        ])

    def test_edge_cases_and_malformed_input(self):
        """Test edge cases and malformed input handling."""
        self._assert_cases([
            # Empty or whitespace
            ("", "unknown"),
            (" ", "unknown"),
            ("\n\n\n", "unknown"),
            # Only punctuation or symbols
            ("...", "..."),
            ("???", "???"),
            ("!!!", "!!!"),
            # Very long responses
            ("A" * 1000 + " The answer is 42", "42"),
            # Multiple numbers - should pick the most relevant
            ("There are 5 cats, 10 dogs, and the answer is 15", "15"),
            ("Values: 1, 2, 3, 4, 5. Final: 5", "5"),
        ])

    def test_text_answers_with_explanations(self):
        """Test extraction of text answers with extra explanations."""
        self._assert_cases([
            # City/location answers
            ("After analyzing the geographical data, the city is Paris", "Paris"),
            ("The location mentioned in the document is London", "London"),
            ("Based on the coordinates, this is New York", "New York"),
            # Name answers
            ("The author of this work is Shakespeare", "Shakespeare"),
            ("According to the records, the name is Einstein", "Einstein"),
            # Yes/No answers
            ("After careful consideration, the answer is yes", "yes"),
            ("Based on the evidence, the answer is no", "no"),
            # Color answers
            ("The dominant color in the image is blue", "blue"),
            ("Analysis shows the color is red", "red"),
        ])

    def test_fallback_mechanisms(self):
        """Test fallback mechanisms when FINAL ANSWER format is not present."""
        self._assert_cases([
            # Should extract from last meaningful line
            ("Line 1\nLine 2\nThe answer is 42", "42"),
            # Should extract from first substantial content
            ("42\nSome explanation after", "42"),
            # Should handle mixed content
            ("# Header\n- Bullet point\nThe result is 100", "100"),
            # Should extract numbers when no clear pattern
            ("Some text with numbers 5, 10, 15", "15"),
        ])

    def test_performance_requirements(self):
        """Test that formatting operations complete within performance requirements."""
        import time

        # Test with a reasonably complex response
        complex_response = (
            "\n"
            "This is a complex response with multiple paragraphs and various content.\n"
            "First, let me analyze the data:\n"
            "- Point 1: Some analysis\n"
            "- Point 2: More analysis\n"
            "- Point 3: Even more analysis\n"
            "Then, I'll perform calculations:\n"
            "Step 1: 10 + 5 = 15\n"
            "Step 2: 15 * 2 = 30\n"
            "Step 3: 30 - 5 = 25\n"
            "Finally, based on all this analysis, the answer is 25.\n"
        )
        # perf_counter is monotonic and high-resolution; time.time() is a wall
        # clock that can be adjusted mid-measurement and may tick too coarsely
        # to time a sub-100ms call reliably.
        start_time = time.perf_counter()
        result = self.formatter.format_answer(complex_response)
        elapsed = time.perf_counter() - start_time
        # Should complete in under 100ms as per requirements
        assert elapsed < 0.1, "Formatting took too long"
        assert result == "25", f"Expected '25', got '{result}'"

    def test_consistency_and_determinism(self):
        """Test that the formatter produces consistent results."""
        test_input = "The final numeric output from the attached Python code is 16"
        expected = "16"
        # Run the same input multiple times
        results = [self.formatter.format_answer(test_input) for _ in range(10)]
        # All results should be identical
        assert all(r == expected for r in results), f"Inconsistent results: {results}"
        # All results should be the same
        assert len(set(results)) == 1, f"Non-deterministic results: {results}"
class TestAnswerFormatterIntegration:
    """Integration tests for the answer formatter with real-world scenarios.

    Each test calls ``FixedGAIAAnswerFormatter.format_answer`` on a fresh
    formatter created by ``setup_method``.
    """

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def test_gaia_evaluation_patterns(self):
        """Test specific patterns from GAIA evaluation results."""
        # These are based on actual evaluation failures
        evaluation_patterns = [
            # Pattern 1: Verbose numeric explanations
            ("The final numeric output from the attached Python code is 16", "16"),
            ("After executing the code, the result is 42", "42"),
            ("The calculation yields 256", "256"),
            # Pattern 2: Image analysis responses
            ("I can see 3 objects in the image", "3"),
            ("The image contains 5 distinct elements", "5"),
            ("Analysis of the image reveals 7 items", "7"),
            # Pattern 3: Document processing responses
            ("The document mentions the year 1995", "1995"),
            ("According to the text, the value is 2024", "2024"),
            # Pattern 4: Mixed content with clear answers
            ("Based on my analysis of the provided data, considering all factors, the answer is clearly 789", "789"),
        ]
        for input_text, expected in evaluation_patterns:
            result = self.formatter.format_answer(input_text)
            assert result == expected, f"GAIA pattern failed - input: '{input_text}' - got '{result}', expected '{expected}'"

    def test_zero_false_positives(self):
        """Test that the formatter doesn't extract incorrect answers."""
        # These should NOT extract numbers that aren't the actual answer
        non_answer_patterns = [
            ("I processed 5 files but couldn't find the answer", "unknown"),  # Should not return "5"
            ("After 10 attempts, I'm unable to determine the result", "unknown"),  # Should not return "10"
            ("The process took 30 seconds but failed", "unknown"),  # Should not return "30"
        ]
        # Loop-invariant, so built once: the incidental numbers that appear in
        # the inputs above and must never be reported as the answer.
        numbers_in_text = ("5", "10", "30")
        for input_text, expected in non_answer_patterns:
            result = self.formatter.format_answer(input_text)
            # The result should not be a number that appears in the text but isn't the answer.
            # Note this only rejects those numbers; it does not require the
            # result to equal "unknown".
            if expected == "unknown":
                assert result not in numbers_in_text, f"False positive - extracted '{result}' from '{input_text}'"
if __name__ == "__main__":
    # Run the tests in this file verbosely. pytest.main() returns the run's
    # exit code; raising SystemExit with it makes a direct
    # `python <this file>` invocation exit non-zero when tests fail instead
    # of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))