|
|
|
|
|
from dataclasses import dataclass |
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
from enum import Enum |
|
import datetime |
|
import uuid |
|
import json |
|
import copy |
|
|
|
class TaskStatus(Enum):
    """Lifecycle states of a recursive task.

    The string values are what gets written out when a task is serialized,
    so they must stay stable.
    """

    # Assigned when a task object is first constructed.
    INITIALIZED = "initialized"

    # Assigned after a solution was recorded and the problem state evolved.
    IN_PROGRESS = "in_progress"

    # Convergence detected without hitting the score threshold or the
    # iteration cap.
    CONVERGED = "converged"

    # The iteration cap was reached.
    MAX_ITERATIONS = "max_iterations"

    # The latest score met or exceeded the configured score threshold.
    PERFECT_SOLUTION = "perfect_solution"

    # Not assigned by any code visible in this module.
    ABANDONED = "abandoned"
|
|
|
|
|
@dataclass
class ProblemState:
    """Snapshot of the problem as presented at one stage of a recursive task.

    This module only stores and serializes these fields; their detailed
    semantics are left to the concrete task implementation. Field order is
    part of the generated positional constructor and must not change.
    """

    # Identifier of the problem.
    problem_id: str

    # Text describing the problem.
    description: str

    # String-keyed context for the problem's code (schema not fixed here).
    code_context: Dict[str, Any]

    # Requirement entries, each a string-keyed dict.
    requirements: List[Dict[str, Any]]

    # Difficulty value; the scale is not defined in this module.
    difficulty: float

    # Integer stage marker for the problem's evolution.
    evolution_stage: int

    # List of floats; interpretation is up to the task implementation.
    adaptation_vector: List[float]
|
|
|
|
|
@dataclass
class EvaluationResult:
    """Outcome of evaluating one submitted solution.

    The first three fields are required; the remaining ones default to
    ``None`` when the evaluator has nothing to report for them.
    """

    # Whether the evaluation counted as a success.
    success: bool

    # Numeric score; compared against the convergence score threshold.
    score: float

    # String-keyed execution output (schema not fixed in this module).
    execution_results: Dict[str, Any]

    # Optional string-keyed error information.
    error_details: Optional[Dict[str, Any]] = None

    # Optional string-keyed test outcome data.
    test_results: Optional[Dict[str, Any]] = None

    # Optional mapping of metric name to float value.
    metrics: Optional[Dict[str, float]] = None
|
|
|
|
|
@dataclass
class Feedback:
    """Structured feedback produced for one evaluated solution.

    All fields are required. This module stores and serializes them without
    interpreting their contents.
    """

    # Short textual summary of the feedback.
    summary: str

    # Issue entries, each a string-keyed dict.
    issues: List[Dict[str, Any]]

    # Suggestion entries, each a string-keyed dict.
    suggestions: List[Dict[str, Any]]

    # Names of areas to focus on.
    focus_areas: List[str]

    # Hint entries, each a string-keyed dict.
    adaptation_hints: List[Dict[str, Any]]
|
|
|
|
|
class ConvergenceCriteria:
    """Criteria for determining when a recursive task has converged.

    Thresholds are read from an optional ``config`` mapping; any missing key
    falls back to its default.

    Config keys:
        score_threshold: The latest score at or above this value counts as
            converged (default ``0.95``).
        min_iterations: Number of recorded steps required before any
            convergence check can succeed (default ``1``).
        max_iterations: Number of recorded steps at which the task is
            reported as converged regardless of score (default ``10``).
        score_delta_threshold: Score changes strictly smaller than this are
            treated as "no progress" (default ``0.01``).
        consecutive_plateau_limit: Number of consecutive no-progress
            transitions that counts as a plateau (default ``3``).
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read thresholds from ``config``; ``None`` means "all defaults"."""
        self.config = config or {}
        self.score_threshold = self.config.get("score_threshold", 0.95)
        self.min_iterations = self.config.get("min_iterations", 1)
        self.max_iterations = self.config.get("max_iterations", 10)
        self.score_delta_threshold = self.config.get("score_delta_threshold", 0.01)
        self.consecutive_plateau_limit = self.config.get("consecutive_plateau_limit", 3)

    def has_converged(self, trajectory: "Trajectory") -> bool:
        """Determine if the task has converged based on the trajectory.

        Checks are applied in this order:

        1. Fewer than ``min_iterations`` steps -> not converged.
        2. At least ``max_iterations`` steps -> converged.
        3. Latest score ``>= score_threshold`` -> converged.
        4. The last ``consecutive_plateau_limit`` score transitions all
           changed by less than ``score_delta_threshold`` -> converged.

        Args:
            trajectory: Object exposing ``steps``, a sequence whose items
                provide ``result.score``.

        Returns:
            ``True`` if any convergence condition holds, otherwise ``False``.
        """
        step_count = len(trajectory.steps)

        if step_count < self.min_iterations:
            return False

        if step_count >= self.max_iterations:
            return True

        if step_count == 0:
            # Only reachable when min_iterations <= 0. There is no score to
            # inspect yet, so report "not converged" instead of raising
            # IndexError on steps[-1].
            return False

        if trajectory.steps[-1].result.score >= self.score_threshold:
            return True

        # A plateau over N transitions needs N + 1 consecutive scores.
        window = self.consecutive_plateau_limit + 1
        if step_count >= window:
            recent_scores = [step.result.score for step in trajectory.steps[-window:]]
            deltas = [
                abs(later - earlier)
                for earlier, later in zip(recent_scores, recent_scores[1:])
            ]
            if all(delta < self.score_delta_threshold for delta in deltas):
                return True

        return False
|
|
|
|
|
@dataclass
class TrajectoryStep:
    """One recorded iteration of a solution trajectory.

    Bundles the problem state a solution was written against with the
    solution text, its evaluation result and the feedback generated for it.
    Field order is part of the generated positional constructor.
    """

    # Unique identifier of this step (a UUID string when created by
    # Trajectory.add_step).
    step_id: str

    # When the step was recorded.
    timestamp: datetime.datetime

    # Problem state associated with this step.
    problem_state: ProblemState

    # The submitted solution text.
    solution: str

    # Evaluation outcome for the solution.
    result: EvaluationResult

    # Feedback produced for the solution.
    feedback: Feedback
|
|
|
|
|
class Trajectory:
    """Tracks the evolution of solutions over multiple iterations.

    Attributes:
        task_id: Identifier of the task this trajectory belongs to.
        steps: Recorded steps, oldest first.
        metadata: Bookkeeping dict; initialised with ``start_time`` (the
            ``datetime.datetime.now()`` value at construction) and
            ``task_id``.
    """

    def __init__(self, task_id: str):
        """Create an empty trajectory for ``task_id``."""
        self.task_id = task_id
        self.steps: List["TrajectoryStep"] = []
        self.metadata: Dict[str, Any] = {
            "start_time": datetime.datetime.now(),
            "task_id": task_id,
        }

    def add_step(self, problem_state: "ProblemState", solution: str,
                 result: "EvaluationResult", feedback: "Feedback") -> None:
        """Append a new step, stamping it with a fresh UUID and the current time."""
        self.steps.append(
            TrajectoryStep(
                step_id=str(uuid.uuid4()),
                timestamp=datetime.datetime.now(),
                problem_state=problem_state,
                solution=solution,
                result=result,
                feedback=feedback,
            )
        )

    def get_solution_series(self) -> List[str]:
        """Return the series of solutions, oldest first."""
        return [step.solution for step in self.steps]

    def get_score_series(self) -> List[float]:
        """Return the series of scores, oldest first."""
        return [step.result.score for step in self.steps]

    def get_latest_step(self) -> Optional["TrajectoryStep"]:
        """Return the most recent step, or ``None`` if nothing was recorded."""
        return self.steps[-1] if self.steps else None

    def calculate_improvement_rate(self) -> float:
        """Return ``(last score - first score) / number of scores``.

        Returns ``0.0`` when fewer than two scores exist. Note that the
        divisor is the number of steps, not the number of transitions
        between them.
        """
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0
        return (scores[-1] - scores[0]) / len(scores)

    def calculate_volatility(self) -> float:
        """Return the mean absolute change between consecutive scores.

        Returns ``0.0`` when fewer than two scores exist.
        """
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0
        deltas = [abs(later - earlier) for earlier, later in zip(scores, scores[1:])]
        return sum(deltas) / len(deltas)

    @staticmethod
    def _step_to_dict(step: "TrajectoryStep") -> Dict[str, Any]:
        """Flatten one step into plain dicts/lists/strings."""
        state = step.problem_state
        result = step.result
        feedback = step.feedback
        return {
            "step_id": step.step_id,
            "timestamp": step.timestamp.isoformat(),
            "problem_state": {
                "problem_id": state.problem_id,
                "description": state.description,
                "code_context": state.code_context,
                "requirements": state.requirements,
                "difficulty": state.difficulty,
                "evolution_stage": state.evolution_stage,
                "adaptation_vector": state.adaptation_vector,
            },
            "solution": step.solution,
            "result": {
                "success": result.success,
                "score": result.score,
                "execution_results": result.execution_results,
                "error_details": result.error_details,
                "test_results": result.test_results,
                "metrics": result.metrics,
            },
            "feedback": {
                "summary": feedback.summary,
                "issues": feedback.issues,
                "suggestions": feedback.suggestions,
                "focus_areas": feedback.focus_areas,
                "adaptation_hints": feedback.adaptation_hints,
            },
        }

    def to_dict(self) -> Dict[str, Any]:
        """Convert the trajectory to a dictionary for serialization.

        ``datetime`` values held directly in ``metadata`` (notably
        ``start_time``) are emitted as ISO-8601 strings so the result can be
        passed to ``json.dump``; step timestamps are emitted the same way.
        ``self.metadata`` itself is left unmodified.
        """
        metadata = {
            key: value.isoformat() if isinstance(value, datetime.datetime) else value
            for key, value in self.metadata.items()
        }
        return {
            "task_id": self.task_id,
            "metadata": metadata,
            "steps": [self._step_to_dict(step) for step in self.steps],
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Trajectory":
        """Create a trajectory from a dictionary produced by ``to_dict``.

        Persisted ``step_id`` and ``timestamp`` values are restored rather
        than regenerated, so a save/load round trip keeps step identity. For
        input that lacks them, a fresh UUID / the current time is used.
        ``metadata["start_time"]`` is parsed back into a ``datetime`` when it
        is an ISO-8601 string; a ``datetime`` instance is accepted as-is.

        Args:
            data: Mapping with ``task_id``, ``metadata`` and ``steps`` keys.

        Returns:
            The reconstructed trajectory.

        Raises:
            KeyError: If a required key is missing from ``data``.
        """
        trajectory = cls(data["task_id"])

        # Copy so that normalising start_time does not mutate the caller's dict.
        metadata = dict(data["metadata"])
        start_time = metadata.get("start_time")
        if isinstance(start_time, str):
            try:
                metadata["start_time"] = datetime.datetime.fromisoformat(start_time)
            except ValueError:
                # Not an ISO timestamp; keep the stored string untouched.
                pass
        trajectory.metadata = metadata

        for step_data in data["steps"]:
            state_data = step_data["problem_state"]
            result_data = step_data["result"]
            feedback_data = step_data["feedback"]

            raw_timestamp = step_data.get("timestamp")
            if isinstance(raw_timestamp, str):
                timestamp = datetime.datetime.fromisoformat(raw_timestamp)
            elif isinstance(raw_timestamp, datetime.datetime):
                timestamp = raw_timestamp
            else:
                timestamp = datetime.datetime.now()

            trajectory.steps.append(
                TrajectoryStep(
                    step_id=step_data.get("step_id") or str(uuid.uuid4()),
                    timestamp=timestamp,
                    problem_state=ProblemState(
                        problem_id=state_data["problem_id"],
                        description=state_data["description"],
                        code_context=state_data["code_context"],
                        requirements=state_data["requirements"],
                        difficulty=state_data["difficulty"],
                        evolution_stage=state_data["evolution_stage"],
                        adaptation_vector=state_data["adaptation_vector"],
                    ),
                    solution=step_data["solution"],
                    result=EvaluationResult(
                        success=result_data["success"],
                        score=result_data["score"],
                        execution_results=result_data["execution_results"],
                        error_details=result_data["error_details"],
                        test_results=result_data["test_results"],
                        metrics=result_data["metrics"],
                    ),
                    feedback=Feedback(
                        summary=feedback_data["summary"],
                        issues=feedback_data["issues"],
                        suggestions=feedback_data["suggestions"],
                        focus_areas=feedback_data["focus_areas"],
                        adaptation_hints=feedback_data["adaptation_hints"],
                    ),
                )
            )

        return trajectory

    def save(self, filepath: str) -> None:
        """Write the trajectory to ``filepath`` as indented JSON."""
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "Trajectory":
        """Read a trajectory previously written by ``save``."""
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)
|
|
|
|
|
class RecursiveTask:
    """
    Base class for recursive tasks that evolve based on model solutions.

    A recursive task provides a dynamic problem that adapts based on the
    model's attempted solutions, creating a feedback loop that more accurately
    reflects real-world software engineering challenges.

    Subclasses must implement ``_run_evaluation``, ``_generate_feedback`` and
    ``_evolve_state``; the base versions raise ``NotImplementedError``. The
    same applies to ``load``.
    """

    def __init__(self,
                 initial_state: ProblemState,
                 config: Optional[Dict[str, Any]] = None):
        """
        Initialize the recursive task with an initial problem state.

        Args:
            initial_state: The initial state of the problem
            config: Configuration options for the task. ``None`` is treated
                as an empty configuration. The optional
                ``"convergence_criteria"`` entry is passed to
                ``ConvergenceCriteria``.
        """
        self.task_id = str(uuid.uuid4())
        self.state = initial_state
        self.config = config or {}
        self.trajectory = Trajectory(self.task_id)
        self.status = TaskStatus.INITIALIZED
        # Read from self.config, not the raw argument: `config` may be None,
        # in which case `config.get(...)` would raise AttributeError.
        self.convergence_criteria = ConvergenceCriteria(
            self.config.get("convergence_criteria", {}))

    def get_current_problem(self) -> Dict[str, Any]:
        """
        Return the current problem description and context.

        Returns:
            A dictionary with the ``description``, ``code_context``,
            ``requirements`` and ``evolution_stage`` of the current state
        """
        current = self.state
        return {
            "description": current.description,
            "code_context": current.code_context,
            "requirements": current.requirements,
            "evolution_stage": current.evolution_stage,
        }

    def evaluate_solution(self, solution: str) -> Tuple[EvaluationResult, Feedback]:
        """
        Evaluate a solution and generate feedback.

        Does not record anything in the trajectory; call ``update_state``
        with the returned values to do that.

        Args:
            solution: The solution to evaluate

        Returns:
            A tuple containing the evaluation result and feedback
        """
        result = self._run_evaluation(solution)
        feedback = self._generate_feedback(solution, result)
        return result, feedback

    def update_state(self,
                     solution: str,
                     result: EvaluationResult,
                     feedback: Feedback) -> ProblemState:
        """
        Update the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        evolving the problem based on the model's solution attempt.

        The attempt is always appended to the trajectory first. If the
        convergence criteria then report convergence, ``status`` is set to a
        terminal value and the state is left unchanged; otherwise the state
        is replaced by ``_evolve_state(...)`` and ``status`` becomes
        ``IN_PROGRESS``.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The updated problem state
        """
        # NOTE(review): the trajectory keeps a reference to self.state, not a
        # copy. If a subclass's _evolve_state mutates the state in place,
        # recorded history would change with it; confirm subclasses return
        # new objects.
        self.trajectory.add_step(
            problem_state=self.state,
            solution=solution,
            result=result,
            feedback=feedback,
        )

        criteria = self.convergence_criteria
        if criteria.has_converged(self.trajectory):
            latest_score = self.trajectory.steps[-1].result.score
            # Order matters: a threshold-meeting score wins over the
            # iteration cap when both hold on the same step.
            if latest_score >= criteria.score_threshold:
                self.status = TaskStatus.PERFECT_SOLUTION
            elif len(self.trajectory.steps) >= criteria.max_iterations:
                self.status = TaskStatus.MAX_ITERATIONS
            else:
                self.status = TaskStatus.CONVERGED
            return self.state

        self.state = self._evolve_state(solution, result, feedback)
        self.status = TaskStatus.IN_PROGRESS
        return self.state

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run evaluation logic specific to this task.

        Args:
            solution: The solution to evaluate

        Returns:
            The evaluation result

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _generate_feedback(self,
                           solution: str,
                           result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.

        Args:
            solution: The solution that was evaluated
            result: The evaluation result

        Returns:
            Structured feedback

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _evolve_state(self,
                      solution: str,
                      result: EvaluationResult,
                      feedback: Feedback) -> ProblemState:
        """
        Evolve the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        defining how the problem changes in response to solution attempts.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The evolved problem state

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_trajectory(self) -> Trajectory:
        """
        Get the complete solution trajectory for this task.

        Returns:
            The solution trajectory (the live object, not a copy)
        """
        return self.trajectory

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the task to a dictionary for serialization.

        Returns:
            A dictionary with ``task_id``, ``status`` (the enum's string
            value), the flattened current ``state``, ``config`` and the
            ``trajectory`` as produced by ``Trajectory.to_dict``
        """
        current = self.state
        return {
            "task_id": self.task_id,
            "status": self.status.value,
            "state": {
                "problem_id": current.problem_id,
                "description": current.description,
                "code_context": current.code_context,
                "requirements": current.requirements,
                "difficulty": current.difficulty,
                "evolution_stage": current.evolution_stage,
                "adaptation_vector": current.adaptation_vector,
            },
            "config": self.config,
            "trajectory": self.trajectory.to_dict(),
        }

    def save(self, filepath: str) -> None:
        """
        Save the task to a file as indented JSON.

        Args:
            filepath: Path to save the task
        """
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "RecursiveTask":
        """
        Load a task from a file.

        The base class cannot rebuild a task, so this always ends in
        ``NotImplementedError``. The file is still opened and parsed first,
        so a missing or malformed file is reported before that.

        Args:
            filepath: Path to load the task from

        Returns:
            The loaded task (subclass implementations only)

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            NotImplementedError: Always, once the file has been parsed.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            # Parsed only to surface I/O and format errors; the base class
            # has no way to turn the data into a task.
            json.load(f)

        raise NotImplementedError("Subclasses must implement this method")
|
|