|
import logging
import re

import pytest

from api.runner import AgentRunner
|
|
|
|
|
# Logger shared by the fixture and tests in this module. The level is set to
# INFO so the info-level progress messages they emit are not dropped by the
# logger's own level filter.
test_logger = logging.getLogger("test_agent")
test_logger.setLevel(logging.INFO)
|
|
|
|
|
# Module-wide pytest mark: ignore DeprecationWarning attributed to the
# httpx._models module (filter spec is action:message:category:module).
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning:httpx._models")
|
|
|
|
|
# Base URL of the scoring service and the questions endpoint derived from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
QUESTIONS_URL = f"{DEFAULT_API_URL}/questions"
|
|
|
|
|
@pytest.fixture(scope="session")
def agent():
    """Create and return an AgentRunner instance.

    The fixture is session-scoped, so pytest builds the runner once and hands
    the same object to every test that requests ``agent``.

    Returns:
        A newly constructed ``AgentRunner``.
    """
    test_logger.info("Creating AgentRunner instance")
    return AgentRunner()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestBasicCodeAgentCapabilities:
    """Test basic capabilities of the code agent."""

    # Every logged step must contain at least one of these keys.
    _STEP_CONTENT_KEYS = ("thought", "code", "observation")

    def setup_method(self):
        """Create a fresh AgentRunner on ``self.agent`` before each test."""
        test_logger.info("Creating AgentRunner instance")
        self.agent = AgentRunner()

    def _checked_step_logs(self, min_steps, too_few_message):
        """Assert the agent stored a usable state and return its step logs.

        Args:
            min_steps: Minimum number of entries ``step_logs`` must hold.
            too_few_message: Assertion message used when fewer are present.

        Returns:
            The ``step_logs`` value taken from ``self.agent.last_state``.
        """
        state = self.agent.last_state
        assert state is not None, "Agent should store last state"
        assert "step_logs" in state, "State should contain step_logs"
        steps = state["step_logs"]
        assert len(steps) >= min_steps, too_few_message
        return steps

    @classmethod
    def _assert_steps_well_formed(cls, steps):
        """Assert each step has a step_number and at least one content key."""
        for step in steps:
            assert "step_number" in step, "Each step should have a step_number"
            assert any(
                key in step for key in cls._STEP_CONTENT_KEYS
            ), "Each step should have at least one of: thought, code, or observation"

    def test_simple_math_calculation_with_steps(self):
        """Test that the agent can perform basic math calculations and log steps."""
        question = "What is the result of the following operation: 5 + 3 + 1294.678?"
        test_logger.info("Testing math calculation with question: %s", question)

        # The response is used as text below (substring search, .lower()).
        response = self.agent(question)

        # The expected text is derived from the same arithmetic the question asks for.
        expected_result = str(5 + 3 + 1294.678)
        assert (
            expected_result in response
        ), f"Response should contain the result {expected_result}"

        steps = self._checked_step_logs(1, "Should have at least one step logged")
        self._assert_steps_well_formed(steps)

        assert (
            "final_answer" in response.lower()
        ), "Response should indicate it's providing an answer"

    def test_document_qa_and_image_generation_with_steps(self):
        """Test that the agent can search for information and generate images, with step logging."""
        question = (
            "Search for information about the Mona Lisa and generate an image of it."
        )
        test_logger.info(
            "Testing document QA and image generation with question: %s", question
        )

        response = self.agent(question)

        response_text = response.lower()
        assert "mona lisa" in response_text, "Response should mention Mona Lisa"
        assert "image" in response_text, "Response should mention image generation"

        steps = self._checked_step_logs(2, "Should have multiple steps logged")

        # Search the full string form of each step, so a keyword counts
        # wherever it appears in that step (keys or values).
        step_texts = [str(step).lower() for step in steps]
        assert any("search" in text for text in step_texts), "Should have search steps"
        assert any(
            "image" in text for text in step_texts
        ), "Should have image generation steps"

        self._assert_steps_well_formed(steps)
|
|
|
|
|
def _extract_numbers(text):
    """Return the numeric tokens found in ``text`` as strings ``float`` accepts.

    Plain digit runs with an optional decimal part are collected first. Values
    written with thousands separators (e.g. ``1,302.678``) are collected as
    well, with the commas removed, so a comma-formatted answer is still
    recognised. The plain matches are always included unchanged.
    """
    plain = re.findall(r"\d+\.?\d*", text)
    grouped = re.findall(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?", text)
    return plain + [token.replace(",", "") for token in grouped]


def test_simple_math_calculation_with_steps():
    """Test that the agent can perform a simple math calculation and verify intermediate steps."""
    agent = AgentRunner()
    question = "What is the result of the following operation: 5 + 3 + 1294.678?"

    response = agent(question)

    # The run must leave behind a state holding at least one logged step.
    assert agent.last_state is not None, "Last state should be stored"
    step_logs = agent.last_state.get("step_logs", [])
    assert len(step_logs) > 0, "Should have recorded step logs"

    for step in step_logs:
        assert "step_number" in step, "Each step should have a step number"
        assert any(
            key in step for key in ["thought", "code", "observation"]
        ), "Each step should have at least one of thought/code/observation"

    expected_result = 1302.678

    # If the answer is wrapped in a LaTeX \boxed{...}, only the boxed content
    # is inspected; otherwise every number in the response is a candidate.
    latex_match = re.search(r"\\boxed{([^}]+)}", response)
    if latex_match:
        numbers = _extract_numbers(latex_match.group(1))
    else:
        numbers = _extract_numbers(response)

    assert numbers, "Response should contain at least one number"

    # Compare with a small tolerance rather than exact float equality.
    has_correct_result = any(abs(float(n) - expected_result) < 0.001 for n in numbers)
    assert (
        has_correct_result
    ), f"Response should contain the result {expected_result}, got {response}"

    assert (
        "final_answer" in response.lower()
    ), "Response should indicate it's using final_answer"
|
|
|
|
|
def test_document_qa_and_image_generation_with_steps():
    """Test document QA and image generation with step verification."""
    agent = AgentRunner()
    question = (
        "Can you search for information about the Mona Lisa "
        "and generate an image inspired by it?"
    )

    response = agent(question)

    # The run must leave behind a state holding at least one logged step.
    assert agent.last_state is not None, "Last state should be stored"
    step_logs = agent.last_state.get("step_logs", [])
    assert len(step_logs) > 0, "Should have recorded step logs"

    has_search_step = False
    has_image_step = False

    for step in step_logs:
        assert "step_number" in step, "Each step should have a step number"
        assert any(
            key in step for key in ["thought", "code", "observation"]
        ), "Each step should have at least one of thought/code/observation"

        # Only the step's thought and code are searched for the keywords;
        # observations are deliberately not part of this check.
        step_content = (
            str(step.get("thought", "")) + str(step.get("code", ""))
        ).lower()
        if "search" in step_content:
            has_search_step = True
        if "image" in step_content or "dalle" in step_content:
            has_image_step = True

    assert has_search_step, "Should include a search step"
    assert has_image_step, "Should include an image generation step"
    assert (
        "final_answer" in response.lower()
    ), "Response should indicate it's using final_answer"
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests when executed as a script: -s disables output
    # capture, -v is verbose, -x stops at the first failure. pytest.main()
    # returns the exit code, so raise SystemExit with it to make the process
    # exit status reflect test failures instead of always being 0.
    raise SystemExit(pytest.main([__file__, "-s", "-v", "-x"]))
|
|