from typing import Any, Dict, List, Optional, Tuple

import datetime
import json
import logging
import os

from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, Trajectory, TaskStatus
)
|
|
|
class RecursiveEvaluator:
    """
    The core evaluation harness for recursive benchmark tasks.

    This class orchestrates the recursive evaluation process, managing the
    interactions between models and tasks, tracking trajectories, and
    calculating metrics.
    """

    def __init__(
        self,
        model: Any,
        metrics: Dict[str, Any],
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the recursive evaluator.

        Args:
            model: The model to evaluate
            metrics: Dictionary of metric calculators
            config: Configuration options
        """
        self.model = model
        self.metrics = metrics
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the evaluator."""
        logger = logging.getLogger("RecursiveEvaluator")
        # Only attach a handler once, so constructing several evaluators in
        # the same process does not produce duplicate log lines.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    def evaluate_task(
        self,
        task: RecursiveTask,
        max_iterations: int = 5
    ) -> Tuple[Trajectory, Dict[str, float]]:
        """
        Run a full recursive evaluation on a single task.

        Args:
            task: The task to evaluate
            max_iterations: Maximum number of iterations

        Returns:
            The trajectory and calculated metrics
        """
        self.logger.info(f"Starting evaluation of task {task.task_id}")

        for i in range(max_iterations):
            self.logger.info(f"Starting iteration {i+1}/{max_iterations}")

            # Fetch the current (possibly evolved) problem state.
            problem = task.get_current_problem()
            self.logger.debug(
                f"Problem state: evolution_stage={problem['evolution_stage']}")

            # Package the problem together with the trajectory so far.
            formatted_problem = self._format_problem_for_model(
                problem, task.trajectory)

            # Ask the model for a solution attempt.
            self.logger.debug("Requesting solution from model")
            solution = self.model.solve(formatted_problem)

            # Score the attempt and generate feedback for the next round.
            self.logger.debug("Evaluating solution")
            result, feedback = task.evaluate_solution(solution)

            self.logger.info(
                f"Solution score: {result.score:.4f}, Success: {result.success}")

            # Evolve the task state in response to the attempt.
            task.update_state(solution, result, feedback)

            # Stop early once the task resolves (success or failure).
            if task.status != TaskStatus.IN_PROGRESS:
                self.logger.info(f"Task complete with status: {task.status.value}")
                break

        self.logger.info("Calculating metrics")
        metrics_result = self._calculate_metrics(task.trajectory)

        return task.trajectory, metrics_result

    def evaluate_task_set(
        self,
        tasks: List[RecursiveTask],
        max_iterations: int = 5,
        output_dir: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a set of tasks and aggregate the results.

        Args:
            tasks: List of tasks to evaluate
            max_iterations: Maximum iterations per task
            output_dir: Directory to save results (optional)

        Returns:
            Dictionary of aggregated results
        """
        self.logger.info(f"Evaluating {len(tasks)} tasks")

        all_metrics = {}

        for i, task in enumerate(tasks):
            self.logger.info(f"Evaluating task {i+1}/{len(tasks)}: {task.task_id}")

            _, metrics = self.evaluate_task(task, max_iterations)
            all_metrics[task.task_id] = metrics

            # Persist the full task (including its trajectory) if requested.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json")
                task.save(task_output_path)
                self.logger.info(f"Saved task to {task_output_path}")

        aggregated_metrics = self._aggregate_metrics(all_metrics)

        results = {
            "aggregated_metrics": aggregated_metrics,
            "task_metrics": all_metrics,
            "timestamp": datetime.datetime.now().isoformat(),
            "model_info": self.model.get_meta_information(),
            "total_tasks": len(tasks),
            "config": self.config
        }

        if output_dir:
            results_path = os.path.join(output_dir, "aggregated_results.json")
            with open(results_path, "w") as f:
                json.dump(results, f, indent=2)
            self.logger.info(f"Saved aggregated results to {results_path}")

        return results

    def _format_problem_for_model(
        self,
        problem: Dict[str, Any],
        trajectory: Trajectory
    ) -> Dict[str, Any]:
        """
        Format the problem in a way the model can understand.

        Args:
            problem: The problem state
            trajectory: The trajectory so far

        Returns:
            Formatted problem for the model
        """
        # Summarize each previous attempt so the model can learn from its
        # earlier solutions and the feedback they received.
        previous_steps = []
        for step in trajectory.steps:
            previous_steps.append({
                "problem": {
                    "description": step.problem_state.description,
                    "requirements": step.problem_state.requirements,
                    "evolution_stage": step.problem_state.evolution_stage
                },
                "solution": step.solution,
                "feedback": {
                    "summary": step.feedback.summary,
                    "issues": step.feedback.issues,
                    "suggestions": step.feedback.suggestions,
                    "focus_areas": step.feedback.focus_areas
                }
            })

        # Combine the current problem with the attempt history.
        formatted_problem = {
            "description": problem["description"],
            "code_context": problem["code_context"],
            "requirements": problem["requirements"],
            "iteration": problem["evolution_stage"] + 1,
            "previous_attempts": previous_steps
        }

        return formatted_problem
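
    # For illustration, the payload handed to the model looks roughly like
    # this (field values here are hypothetical):
    #
    #     {
    #         "description": "Fix the failing date parser",
    #         "code_context": "...",
    #         "requirements": ["..."],
    #         "iteration": 2,
    #         "previous_attempts": [
    #             {"problem": {...}, "solution": "...", "feedback": {...}}
    #         ],
    #     }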
|
|
|
    def _calculate_metrics(self, trajectory: Trajectory) -> Dict[str, float]:
        """
        Calculate metrics across the trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric values
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def _aggregate_metrics(
        self,
        all_metrics: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """
        Aggregate metrics across multiple tasks.

        Args:
            all_metrics: Dictionary of metrics per task

        Returns:
            Dictionary of aggregated metrics
        """
        if not all_metrics:
            return {}

        # Assume every task reports the same metric names.
        sample_metrics = next(iter(all_metrics.values()))
        aggregated = {name: 0.0 for name in sample_metrics.keys()}

        # Sum each metric across tasks...
        for task_metrics in all_metrics.values():
            for name, value in task_metrics.items():
                aggregated[name] += value

        # ...then divide by the task count to get the mean.
        for name in aggregated:
            aggregated[name] /= len(all_metrics)

        return aggregated
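
# Example usage (a minimal sketch): `my_model` stands in for any object that
# exposes solve() and get_meta_information(), and `tasks` for a list of
# RecursiveTask instances; the metric classes are defined further below.
#
#     evaluator = RecursiveEvaluator(
#         model=my_model,
#         metrics={
#             "convergence_rate": ConvergenceRate(),
#             "learning_curve_area": LearningCurveArea(),
#         },
#         config={"log_level": logging.DEBUG},
#     )
#     results = evaluator.evaluate_task_set(
#         tasks, max_iterations=5, output_dir="results/")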


# ---------------------------------------------------------------------------
# Recursive metrics
# ---------------------------------------------------------------------------

from typing import Any, Dict, List, Optional

from recursive_swe_bench.core.recursive_task import Trajectory

|
class RecursiveMetric:
    """Base class for recursive metrics."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the metric value for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            The metric value
        """
        raise NotImplementedError("Subclasses must implement this method")
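
# A minimal sketch of a custom metric: subclass RecursiveMetric and implement
# calculate(). `FinalScore` is an illustrative example, not part of the
# benchmark itself.
class FinalScore(RecursiveMetric):
    """Score of the last attempt in the trajectory (illustrative)."""

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        return scores[-1] if scores else 0.0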
|
|
|
|
|
class ConvergenceRate(RecursiveMetric):
    """
    Measures how quickly the model reaches a stable solution.

    A lower value indicates faster convergence.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Absolute change between consecutive scores.
        deltas = [abs(scores[i+1] - scores[i])
                  for i in range(len(scores)-1)]

        # Mean delta: small values mean the solution has stabilized.
        return sum(deltas) / len(deltas)
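
# Worked example (illustrative): for scores [0.2, 0.7, 0.8] the deltas are
# [0.5, 0.1], so ConvergenceRate is (0.5 + 0.1) / 2 = 0.3; a flatter tail
# drives the value toward 0.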
|
|
|
|
|
class AdaptationEfficiency(RecursiveMetric):
    """
    Measures improvement per feedback iteration.

    A higher value indicates more efficient adaptation.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Net improvement from first to last attempt, floored at zero.
        total_improvement = max(0.0, scores[-1] - scores[0])

        # Average improvement per feedback iteration.
        return total_improvement / (len(scores) - 1)
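
# Worked example (illustrative): scores [0.3, 0.5, 0.9] improve by 0.6 over
# two feedback iterations, so AdaptationEfficiency is 0.6 / 2 = 0.3.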
|
|
|
|
|
class LearningCurveArea(RecursiveMetric):
    """
    Measures the area under the learning curve.

    A higher value indicates better overall performance across iterations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Normalize the summed scores by the maximum achievable area so the
        # metric lies in [0, 1] when scores are bounded by max_score.
        max_score = self.config.get("max_score", 1.0)
        max_area = max_score * len(scores)

        return sum(scores) / max_area
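
# Worked example (illustrative): scores [0.5, 0.75, 1.0] with the default
# max_score of 1.0 give LearningCurveArea = (0.5 + 0.75 + 1.0) / 3 = 0.75.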
|
|
|
|
|
class ProbabilisticSolutionQuality(RecursiveMetric):
    """
    Measures solution quality using non-deterministic assessment.

    This metric captures the robustness of solutions by summarizing the
    quality distributions produced by multiple probabilistic evaluations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        steps = trajectory.steps
        if not steps:
            return 0.0

        # Collect the probabilistic quality distribution recorded at each
        # step, where one is available.
        distributions = []
        for step in steps:
            if (step.result.metrics and
                    "probabilistic_quality_distribution" in step.result.metrics):
                distributions.append(
                    step.result.metrics["probabilistic_quality_distribution"])

        if not distributions:
            # No distributions recorded; fall back to the final deterministic score.
            return trajectory.get_score_series()[-1]

        # Expected value of the final distribution (maps value -> probability).
        final_distribution = distributions[-1]
        return sum(prob * val for val, prob in final_distribution.items())
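
# Worked example (illustrative): a final distribution {0.6: 0.25, 0.9: 0.75}
# (value -> probability) yields an expected quality of
# 0.6 * 0.25 + 0.9 * 0.75 = 0.825.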
|
|
|
|
|
class TransferLearningFactor(RecursiveMetric):
    """
    Measures how well learning transfers across related problems.

    This requires multiple trajectories from related tasks.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None,
                 related_trajectories: Optional[List[Trajectory]] = None):
        super().__init__(config)
        self.related_trajectories = related_trajectories or []

    def calculate(self, trajectory: Trajectory) -> float:
        if not self.related_trajectories:
            return 0.0

        current_learning_rate = self._calculate_learning_rate(trajectory)
        if current_learning_rate is None:
            # The current trajectory is too short to estimate a learning rate.
            return 0.0

        related_learning_rates = [
            self._calculate_learning_rate(rel_traj)
            for rel_traj in self.related_trajectories
        ]

        # Ignore related trajectories that are too short to have a rate.
        valid_related_rates = [rate for rate in related_learning_rates if rate is not None]

        if not valid_related_rates:
            return 0.0

        # Ratio of the current learning rate to the average related rate;
        # values above 1.0 suggest positive transfer.
        avg_related_rate = sum(valid_related_rates) / len(valid_related_rates)

        if avg_related_rate == 0:
            return 0.0

        return current_learning_rate / avg_related_rate

    def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]:
        """Calculate the learning rate for a trajectory."""
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return None

        # Average per-iteration change from first to last score.
        return (scores[-1] - scores[0]) / (len(scores) - 1)
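
# Worked example (illustrative): a current trajectory with scores
# [0.3, 0.6, 0.9] has a learning rate of (0.9 - 0.3) / 2 = 0.3; if two
# related trajectories have rates 0.1 and 0.2 (average 0.15), the transfer
# factor is 0.3 / 0.15 = 2.0, i.e. learning transfers well.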
|
|
|
|
|
class DynamicComplexityHandling(RecursiveMetric):
    """
    Measures how well the model handles varying problem complexity.

    This metric evaluates performance while accounting for changes in
    problem difficulty.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        if not trajectory.steps:
            return 0.0

        scores = trajectory.get_score_series()
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]

        if len(scores) < 2:
            return scores[0]

        # Weight each score by (1 + difficulty) so strong performance on
        # harder problems counts for more.
        normalized_scores = [scores[i] * (1 + difficulties[i])
                             for i in range(len(scores))]

        return sum(normalized_scores) / len(normalized_scores)
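
# Worked example (illustrative): scores [0.5, 0.6] on problems with
# difficulties [0.0, 1.0] give weighted scores [0.5, 1.2], so the metric is
# (0.5 + 1.2) / 2 = 0.85, rewarding strong performance on harder problems.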
|
|