Spaces:
Running
Running
File size: 14,936 Bytes
9a6a4dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 |
"""
Comprehensive Test Suite for GAIA Answer Formatter
Phase 1: Answer Format Validation and Testing
Tests all response patterns identified in the evaluation results:
- Verbose explanations that need answer extraction
- Responses with "FINAL ANSWER:" format
- Edge cases like "just 25" patterns
- Numeric answers with unnecessary formatting
- Text answers with extra explanations
- Error responses and graceful handling
"""
import pytest
import sys
import os
# Add the deployment-ready directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.fixed_answer_formatter import FixedGAIAAnswerFormatter
class TestAnswerFormatterComprehensive:
    """Comprehensive test suite for the fixed GAIA answer formatter.

    Each test passes raw response text to ``self.formatter.format_answer``
    and compares the returned value with the expected bare answer.
    """

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def _assert_cases(self, test_cases):
        """Run the formatter over every case and report all mismatches.

        Args:
            test_cases: Iterable of ``(input_text, expected)`` pairs.

        Raises:
            AssertionError: If at least one input is not formatted to its
                expected value. The message holds one line per failing case,
                so an early failure does not hide the later ones.
        """
        failures = []
        for input_text, expected in test_cases:
            result = self.formatter.format_answer(input_text)
            if result != expected:
                failures.append(
                    f"Failed for input: '{input_text}' - got '{result}', expected '{expected}'"
                )
        assert not failures, "\n".join(failures)

    def test_verbose_explanation_extraction(self):
        """Test extraction from verbose explanations - the main failure pattern."""
        test_cases = [
            # Primary failure pattern from evaluation
            ("The final numeric output from the attached Python code is 16", "16"),
            ("Based on my analysis, the answer is clearly 42.", "42"),
            ("After processing the image, I found 3 objects.", "3"),
            ("The calculation shows that the result is 256.", "256"),
            ("Upon examination of the data, the value is 1024.", "1024"),
            # More complex verbose patterns
            ("After careful analysis of the provided data and considering all factors, the final answer is 789.", "789"),
            ("The image processing algorithm detected exactly 15 distinct objects in the scene.", "15"),
            ("Following the mathematical computation steps outlined above, we arrive at 2048.", "2048"),
        ]
        self._assert_cases(test_cases)

    def test_final_answer_format_extraction(self):
        """Test extraction from proper FINAL ANSWER format."""
        test_cases = [
            # Standard FINAL ANSWER format
            ("FINAL ANSWER: 6", "6"),
            ("FINAL ANSWER: 42", "42"),
            ("FINAL ANSWER: The answer is 25", "25"),  # Should extract just the number
            # FINAL ANSWER with extra content
            ("Multiple lines of explanation\nFINAL ANSWER: 100\nExtra text after", "100"),
            ("Some reasoning here.\nFINAL ANSWER: 777", "777"),
            ("Complex analysis...\nFINAL ANSWER: Paris\nAdditional notes", "Paris"),
            # FINAL ANSWER with variations
            ("Final Answer: 123", "123"),
            ("FINAL ANSWER:456", "456"),
            ("FINAL ANSWER: The result is 999", "999"),  # Should extract just the number
        ]
        self._assert_cases(test_cases)

    def test_simple_pattern_extraction(self):
        """Test extraction from simple patterns like 'just 25'."""
        test_cases = [
            # "just X" patterns
            ("The answer is just 25", "25"),
            ("It's just 42", "42"),
            ("just 100", "100"),
            ("Just Paris", "Paris"),
            # "answer is X" patterns
            ("The answer is 50", "50"),
            ("Answer is 75", "75"),
            ("The result is 200", "200"),
            ("Result is 300", "300"),
            # Numbers at end of text
            ("After all calculations: 999", "999"),
            ("The final value: 1234", "1234"),
            ("Conclusion 567", "567"),
        ]
        self._assert_cases(test_cases)

    def test_numeric_formatting_cleanup(self):
        """Test cleanup of numeric answers with unnecessary formatting."""
        test_cases = [
            # Remove commas from numbers
            ("The answer is 1,234", "1234"),
            ("Result: 10,000", "10000"),
            ("FINAL ANSWER: 1,234,567", "1234567"),
            # Remove trailing periods from short answers
            ("42.", "42"),
            ("Paris.", "Paris"),
            ("100.", "100"),
            # Remove quotes
            ('"42"', "42"),
            ("'Paris'", "Paris"),
            ('"The answer is 25"', "25"),  # Should extract just the number from quoted text
            # Clean up prefixes
            ("Answer: 42", "42"),
            ("The answer is: 100", "100"),
            ("Result: Paris", "Paris"),
            ("The result is: 200", "200"),
        ]
        self._assert_cases(test_cases)

    def test_error_response_handling(self):
        """Test graceful handling of error responses."""
        error_responses = [
            "I'm sorry, I am unable to process the image at the moment. Please try again later.",
            "Error: Unable to access the file.",
            "I cannot process this request.",
            "Sorry, there was an error processing your request.",
            "Unable to complete the analysis.",
        ]
        for error_response in error_responses:
            result = self.formatter.format_answer(error_response)
            # Should return something reasonable, not crash
            assert result is not None, f"Got None for error response: '{error_response}'"
            assert len(result) > 0, f"Got empty result for error response: '{error_response}'"
            # Should not return "unknown" for these specific error patterns
            # Instead should return a meaningful fallback
            assert result != "unknown" or not error_response.strip(), (
                f"Got 'unknown' for non-empty error response: '{error_response}'"
            )

    def test_complex_multiline_responses(self):
        """Test extraction from complex multiline responses."""
        test_cases = [
            # Code execution with output
            (
                "\n"
                "Here's the Python code execution:\n"
                "```python\n"
                "result = 2 + 2\n"
                "print(result)\n"
                "```\n"
                "Output: 4\n"
                "The final numeric output from the attached Python code is 4\n",
                "4",
            ),
            # Data analysis response
            (
                "\n"
                "Data Analysis Results:\n"
                "- Mean: 45.6\n"
                "- Median: 42\n"
                "- Mode: 38\n"
                "Based on the statistical analysis, the answer is 42.\n",
                "42",
            ),
            # Step-by-step reasoning
            (
                "\n"
                "Step 1: Calculate the base value\n"
                "Step 2: Apply the multiplier\n"
                "Step 3: Add the offset\n"
                "Final calculation: 150\n",
                "150",
            ),
        ]
        self._assert_cases(test_cases)

    def test_edge_cases_and_malformed_input(self):
        """Test edge cases and malformed input handling."""
        test_cases = [
            # Empty or whitespace
            ("", "unknown"),
            (" ", "unknown"),
            ("\n\n\n", "unknown"),
            # Only punctuation or symbols
            ("...", "..."),
            ("???", "???"),
            ("!!!", "!!!"),
            # Very long responses
            ("A" * 1000 + " The answer is 42", "42"),
            # Multiple numbers - should pick the most relevant
            ("There are 5 cats, 10 dogs, and the answer is 15", "15"),
            ("Values: 1, 2, 3, 4, 5. Final: 5", "5"),
        ]
        self._assert_cases(test_cases)

    def test_text_answers_with_explanations(self):
        """Test extraction of text answers with extra explanations."""
        test_cases = [
            # City/location answers
            ("After analyzing the geographical data, the city is Paris", "Paris"),
            ("The location mentioned in the document is London", "London"),
            ("Based on the coordinates, this is New York", "New York"),
            # Name answers
            ("The author of this work is Shakespeare", "Shakespeare"),
            ("According to the records, the name is Einstein", "Einstein"),
            # Yes/No answers
            ("After careful consideration, the answer is yes", "yes"),
            ("Based on the evidence, the answer is no", "no"),
            # Color answers
            ("The dominant color in the image is blue", "blue"),
            ("Analysis shows the color is red", "red"),
        ]
        self._assert_cases(test_cases)

    def test_fallback_mechanisms(self):
        """Test fallback mechanisms when FINAL ANSWER format is not present."""
        test_cases = [
            # Should extract from last meaningful line
            ("Line 1\nLine 2\nThe answer is 42", "42"),
            # Should extract from first substantial content
            ("42\nSome explanation after", "42"),
            # Should handle mixed content
            ("# Header\n- Bullet point\nThe result is 100", "100"),
            # Should extract numbers when no clear pattern
            ("Some text with numbers 5, 10, 15", "15"),
        ]
        self._assert_cases(test_cases)

    def test_performance_requirements(self):
        """Test that formatting operations complete within performance requirements."""
        import time

        # Test with a reasonably complex response
        complex_response = (
            "\n"
            "This is a complex response with multiple paragraphs and various content.\n"
            "First, let me analyze the data:\n"
            "- Point 1: Some analysis\n"
            "- Point 2: More analysis\n"
            "- Point 3: Even more analysis\n"
            "Then, I'll perform calculations:\n"
            "Step 1: 10 + 5 = 15\n"
            "Step 2: 15 * 2 = 30\n"
            "Step 3: 30 - 5 = 25\n"
            "Finally, based on all this analysis, the answer is 25.\n"
        )
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        result = self.formatter.format_answer(complex_response)
        elapsed = time.perf_counter() - start_time
        # Should complete in under 100ms as per requirements
        assert elapsed < 0.1, "Formatting took too long"
        assert result == "25", f"Expected '25', got '{result}'"

    def test_consistency_and_determinism(self):
        """Test that the formatter produces consistent results."""
        test_input = "The final numeric output from the attached Python code is 16"
        expected = "16"
        # Run the same input multiple times
        results = [self.formatter.format_answer(test_input) for _ in range(10)]
        # All results should match the expected answer
        assert all(r == expected for r in results), f"Inconsistent results: {results}"
        # All results should be the same
        assert len(set(results)) == 1, f"Non-deterministic results: {results}"
class TestAnswerFormatterIntegration:
    """Integration tests for the answer formatter with real-world scenarios."""

    def setup_method(self):
        """Set up test fixtures before each test method."""
        self.formatter = FixedGAIAAnswerFormatter()

    def test_gaia_evaluation_patterns(self):
        """Test specific patterns from GAIA evaluation results.

        All patterns are evaluated before asserting, so the failure message
        lists every mismatching pattern rather than only the first one.
        """
        # These are based on actual evaluation failures
        evaluation_patterns = [
            # Pattern 1: Verbose numeric explanations
            ("The final numeric output from the attached Python code is 16", "16"),
            ("After executing the code, the result is 42", "42"),
            ("The calculation yields 256", "256"),
            # Pattern 2: Image analysis responses
            ("I can see 3 objects in the image", "3"),
            ("The image contains 5 distinct elements", "5"),
            ("Analysis of the image reveals 7 items", "7"),
            # Pattern 3: Document processing responses
            ("The document mentions the year 1995", "1995"),
            ("According to the text, the value is 2024", "2024"),
            # Pattern 4: Mixed content with clear answers
            ("Based on my analysis of the provided data, considering all factors, the answer is clearly 789", "789"),
        ]
        failures = []
        for input_text, expected in evaluation_patterns:
            result = self.formatter.format_answer(input_text)
            if result != expected:
                failures.append(
                    f"GAIA pattern failed - input: '{input_text}' - got '{result}', expected '{expected}'"
                )
        assert not failures, "\n".join(failures)

    def test_zero_false_positives(self):
        """Test that the formatter doesn't extract incorrect answers.

        The set of forbidden results is derived from the digits that occur in
        the input texts, so it stays in sync when patterns are added.
        """
        import re

        # These should NOT extract numbers that aren't the actual answer
        non_answer_patterns = [
            ("I processed 5 files but couldn't find the answer", "unknown"),  # Should not return "5"
            ("After 10 attempts, I'm unable to determine the result", "unknown"),  # Should not return "10"
            ("The process took 30 seconds but failed", "unknown"),  # Should not return "30"
        ]
        # Built once, outside the loop: every number mentioned in any input.
        numbers_in_text = [
            number
            for input_text, _ in non_answer_patterns
            for number in re.findall(r"\d+", input_text)
        ]
        false_positives = []
        for input_text, expected in non_answer_patterns:
            result = self.formatter.format_answer(input_text)
            # The result should not be a number that appears in the text but isn't the answer
            if expected == "unknown" and result in numbers_in_text:
                false_positives.append(
                    f"False positive - extracted '{result}' from '{input_text}'"
                )
        assert not false_positives, "\n".join(false_positives)
if __name__ == "__main__":
    # Run the tests in this file verbosely and hand pytest's return code to
    # the shell, so a failing run gives a non-zero exit status when the file
    # is executed directly as a script.
    sys.exit(pytest.main([__file__, "-v"]))