# recursive_swe_bench/evaluation/harness.py
from typing import Any, Dict, List, Optional, Tuple
import datetime
import json
import os
import logging
from recursive_swe_bench.core.recursive_task import (
RecursiveTask, Trajectory, TrajectoryStep, ProblemState,
EvaluationResult, Feedback, TaskStatus
)
class RecursiveEvaluator:
"""
The core evaluation harness for recursive benchmark tasks.
This class orchestrates the recursive evaluation process, managing the interactions
between models and tasks, tracking trajectories, and calculating metrics.
"""
    def __init__(
        self,
        model: Any,  # Model interface
        metrics: Dict[str, Any],  # Metric calculators
        config: Optional[Dict[str, Any]] = None
    ):
"""
Initialize the recursive evaluator.
Args:
model: The model to evaluate
metrics: Dictionary of metric calculators
config: Configuration options
"""
self.model = model
self.metrics = metrics
self.config = config or {}
self.logger = self._setup_logger()
    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the evaluator."""
        logger = logging.getLogger("RecursiveEvaluator")
        # Guard against attaching duplicate handlers when several evaluators
        # are created in the same process.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger
def evaluate_task(
self,
task: RecursiveTask,
max_iterations: int = 5
) -> Tuple[Trajectory, Dict[str, float]]:
"""
Run a full recursive evaluation on a single task.
Args:
task: The task to evaluate
max_iterations: Maximum number of iterations
Returns:
The trajectory and calculated metrics
"""
self.logger.info(f"Starting evaluation of task {task.task_id}")
for i in range(max_iterations):
self.logger.info(f"Starting iteration {i+1}/{max_iterations}")
# Get the current problem
problem = task.get_current_problem()
self.logger.debug(f"Problem state: evolution_stage={problem['evolution_stage']}")
# Format the problem for the model
formatted_problem = self._format_problem_for_model(problem, task.trajectory)
# Get model solution
self.logger.debug("Requesting solution from model")
solution = self.model.solve(formatted_problem)
# Evaluate the solution
self.logger.debug("Evaluating solution")
result, feedback = task.evaluate_solution(solution)
# Log the results
self.logger.info(f"Solution score: {result.score:.4f}, Success: {result.success}")
            # Update the task state based on the solution (the task mutates
            # itself; the returned state is not needed here)
            task.update_state(solution, result, feedback)
# Check if we've reached a terminal state
if task.status != TaskStatus.IN_PROGRESS:
self.logger.info(f"Task complete with status: {task.status.value}")
break
# Calculate metrics across the trajectory
self.logger.info("Calculating metrics")
metrics_result = self._calculate_metrics(task.trajectory)
return task.trajectory, metrics_result
def evaluate_task_set(
self,
tasks: List[RecursiveTask],
max_iterations: int = 5,
output_dir: Optional[str] = None
) -> Dict[str, Any]:
"""
Evaluate a set of tasks and aggregate the results.
Args:
tasks: List of tasks to evaluate
max_iterations: Maximum iterations per task
output_dir: Directory to save results (optional)
Returns:
Dictionary of aggregated results
"""
self.logger.info(f"Evaluating {len(tasks)} tasks")
trajectories = {}
all_metrics = {}
for i, task in enumerate(tasks):
self.logger.info(f"Evaluating task {i+1}/{len(tasks)}: {task.task_id}")
# Evaluate the task
trajectory, metrics = self.evaluate_task(task, max_iterations)
# Store the results
trajectories[task.task_id] = trajectory
all_metrics[task.task_id] = metrics
# Save the trajectory if output_dir is provided
if output_dir:
os.makedirs(output_dir, exist_ok=True)
task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json")
task.save(task_output_path)
self.logger.info(f"Saved task to {task_output_path}")
# Aggregate metrics across all tasks
aggregated_metrics = self._aggregate_metrics(all_metrics)
# Compile results
results = {
"aggregated_metrics": aggregated_metrics,
"task_metrics": all_metrics,
"timestamp": datetime.datetime.now().isoformat(),
"model_info": self.model.get_meta_information(),
"total_tasks": len(tasks),
"config": self.config
}
# Save aggregated results if output_dir is provided
if output_dir:
results_path = os.path.join(output_dir, "aggregated_results.json")
with open(results_path, "w") as f:
json.dump(results, f, indent=2)
self.logger.info(f"Saved aggregated results to {results_path}")
return results
def _format_problem_for_model(
self,
problem: Dict[str, Any],
trajectory: Trajectory
) -> Dict[str, Any]:
"""
Format the problem in a way the model can understand.
Args:
problem: The problem state
trajectory: The trajectory so far
Returns:
Formatted problem for the model
"""
# Extract the previous steps if they exist
previous_steps = []
for step in trajectory.steps:
previous_steps.append({
"problem": {
"description": step.problem_state.description,
"requirements": step.problem_state.requirements,
"evolution_stage": step.problem_state.evolution_stage
},
"solution": step.solution,
"feedback": {
"summary": step.feedback.summary,
"issues": step.feedback.issues,
"suggestions": step.feedback.suggestions,
"focus_areas": step.feedback.focus_areas
}
})
# Format the problem with the trajectory context
formatted_problem = {
"description": problem["description"],
"code_context": problem["code_context"],
"requirements": problem["requirements"],
"iteration": problem["evolution_stage"] + 1,
"previous_attempts": previous_steps
}
return formatted_problem
def _calculate_metrics(self, trajectory: Trajectory) -> Dict[str, float]:
"""
Calculate metrics across the trajectory.
Args:
trajectory: The solution trajectory
Returns:
Dictionary of metric values
"""
return {name: metric.calculate(trajectory)
for name, metric in self.metrics.items()}
def _aggregate_metrics(
self,
all_metrics: Dict[str, Dict[str, float]]
) -> Dict[str, float]:
"""
Aggregate metrics across multiple tasks.
Args:
all_metrics: Dictionary of metrics per task
Returns:
Dictionary of aggregated metrics
"""
# Initialize aggregated metrics
if not all_metrics:
return {}
sample_metrics = next(iter(all_metrics.values()))
aggregated = {name: 0.0 for name in sample_metrics.keys()}
        # Sum up metrics, tolerating tasks that report extra metric keys
        for task_metrics in all_metrics.values():
            for name, value in task_metrics.items():
                aggregated[name] = aggregated.get(name, 0.0) + value
# Calculate averages
for name in aggregated:
aggregated[name] /= len(all_metrics)
return aggregated
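
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The harness calls exactly two methods on
# the model object: `solve(formatted_problem)` and `get_meta_information()`.
# `StubModel` below is a hypothetical stand-in showing that contract; task
# construction is omitted because it depends on the RecursiveTask loader.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from recursive_swe_bench.evaluation.metrics.recursive import (
        ConvergenceRate, AdaptationEfficiency,
    )

    class StubModel:
        def solve(self, formatted_problem: Dict[str, Any]) -> str:
            # A real model would return a candidate solution here.
            return "# candidate solution"

        def get_meta_information(self) -> Dict[str, Any]:
            return {"name": "stub-model", "version": "0.0"}

    evaluator = RecursiveEvaluator(
        model=StubModel(),
        metrics={
            "convergence_rate": ConvergenceRate(),
            "adaptation_efficiency": AdaptationEfficiency(),
        },
    )
    # With a list of RecursiveTask instances (construction omitted):
    # results = evaluator.evaluate_task_set(tasks, max_iterations=5,
    #                                       output_dir="results/")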
# recursive_swe_bench/evaluation/metrics/recursive.py
from typing import Any, Dict, List, Optional
from recursive_swe_bench.core.recursive_task import Trajectory
class RecursiveMetric:
"""Base class for recursive metrics."""
    def __init__(self, config: Optional[Dict[str, Any]] = None):
self.config = config or {}
def calculate(self, trajectory: Trajectory) -> float:
"""
Calculate the metric value for a trajectory.
Args:
trajectory: The solution trajectory
Returns:
The metric value
"""
raise NotImplementedError("Subclasses must implement this method")
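
# Extension sketch (illustrative only): a new metric subclasses RecursiveMetric
# and implements `calculate`. `FinalScore` below is a hypothetical example,
# not part of the benchmark's metric suite.
class FinalScore(RecursiveMetric):
    """Reports the score of the final iteration (0.0 for an empty trajectory)."""
    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        return scores[-1] if scores else 0.0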
class ConvergenceRate(RecursiveMetric):
"""
Measures how quickly the model reaches a stable solution.
A lower value indicates faster convergence.
"""
def calculate(self, trajectory: Trajectory) -> float:
scores = trajectory.get_score_series()
if len(scores) < 2:
return 0.0
# Calculate changes between consecutive scores
deltas = [abs(scores[i+1] - scores[i])
for i in range(len(scores)-1)]
# A lower sum indicates faster convergence
# Normalize by the number of iterations
return sum(deltas) / len(deltas)
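
# Worked example (hypothetical numbers): scores [0.2, 0.5, 0.6] give deltas
# [0.3, 0.1], so ConvergenceRate = (0.3 + 0.1) / 2 = 0.2; a flat series such
# as [0.6, 0.6, 0.6] gives 0.0, i.e. full convergence.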
class AdaptationEfficiency(RecursiveMetric):
"""
Measures improvement per feedback iteration.
A higher value indicates more efficient adaptation.
"""
def calculate(self, trajectory: Trajectory) -> float:
scores = trajectory.get_score_series()
if len(scores) < 2:
return 0.0
# Calculate the improvement from first to last iteration
total_improvement = max(0.0, scores[-1] - scores[0])
# Normalize by the number of iterations
return total_improvement / (len(scores) - 1)
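
# Worked example (hypothetical numbers): scores [0.2, 0.5, 0.8] improve by
# 0.6 over 2 feedback iterations, so AdaptationEfficiency = 0.6 / 2 = 0.3.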
class LearningCurveArea(RecursiveMetric):
"""
Measures the area under the learning curve.
A higher value indicates better overall performance across iterations.
"""
def calculate(self, trajectory: Trajectory) -> float:
scores = trajectory.get_score_series()
if not scores:
return 0.0
# Calculate the area under the curve
# Normalize by the maximum possible area (perfect score from the start)
max_score = self.config.get("max_score", 1.0)
max_area = max_score * len(scores)
return sum(scores) / max_area
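
# Worked example (hypothetical numbers, max_score = 1.0): scores
# [0.5, 0.75, 1.0] sum to 2.25 against a maximum area of 3.0, so
# LearningCurveArea = 2.25 / 3.0 = 0.75.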
class ProbabilisticSolutionQuality(RecursiveMetric):
    """
    Measures solution quality under non-deterministic assessment.

    When steps carry probabilistic quality distributions, the metric reports
    the expected quality of the final solution; otherwise it falls back to
    the final deterministic score.
    """
def calculate(self, trajectory: Trajectory) -> float:
# For each step, we expect the result.metrics to contain probabilistic assessments
steps = trajectory.steps
if not steps:
return 0.0
# Extract probabilistic quality distributions if available
distributions = []
for step in steps:
if (step.result.metrics and
"probabilistic_quality_distribution" in step.result.metrics):
distributions.append(
step.result.metrics["probabilistic_quality_distribution"])
        if not distributions:
            # Fall back to the final deterministic score if no distributions
            # are available
            scores = trajectory.get_score_series()
            return scores[-1] if scores else 0.0
# Calculate the expected value of the final distribution
final_distribution = distributions[-1]
return sum(prob * val for val, prob in final_distribution.items())
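
# Assumed distribution format (inferred from the calculation above): a mapping
# from quality value to probability mass, e.g.
#
#     {0.6: 0.2, 0.8: 0.5, 1.0: 0.3}
#
# whose expected value is 0.6*0.2 + 0.8*0.5 + 1.0*0.3 = 0.82.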
class TransferLearningFactor(RecursiveMetric):
"""
Measures how well learning transfers across related problems.
This requires multiple trajectories from related tasks.
"""
    def __init__(
        self,
        config: Optional[Dict[str, Any]] = None,
        related_trajectories: Optional[List[Trajectory]] = None,
    ):
super().__init__(config)
self.related_trajectories = related_trajectories or []
def calculate(self, trajectory: Trajectory) -> float:
# This metric requires related trajectories
if not self.related_trajectories:
return 0.0
# Get learning rates for the current trajectory and related ones
current_learning_rate = self._calculate_learning_rate(trajectory)
related_learning_rates = [
self._calculate_learning_rate(rel_traj)
for rel_traj in self.related_trajectories
]
# Filter out invalid learning rates
valid_related_rates = [rate for rate in related_learning_rates if rate is not None]
if not valid_related_rates:
return 0.0
# Calculate the transfer factor as the ratio of the current learning rate
# to the average of related learning rates
avg_related_rate = sum(valid_related_rates) / len(valid_related_rates)
if avg_related_rate == 0:
return 0.0
return current_learning_rate / avg_related_rate
def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]:
"""Calculate the learning rate for a trajectory."""
scores = trajectory.get_score_series()
if len(scores) < 2:
return None
# Calculate improvement per iteration
return (scores[-1] - scores[0]) / (len(scores) - 1)
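
# Usage sketch (illustrative only): unlike the other metrics, this one needs
# trajectories from related tasks at construction time. `related` is assumed
# to be a list of Trajectory objects collected from earlier runs:
#
#     transfer = TransferLearningFactor(related_trajectories=related)
#     factor = transfer.calculate(current_trajectory)
#
# Assuming positive learning rates, a factor above 1.0 means the model
# learned faster on this task than on the related ones.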
class DynamicComplexityHandling(RecursiveMetric):
"""
Measures how well the model handles varying problem complexity.
This metric evaluates performance while accounting for changes in problem difficulty.
"""
def calculate(self, trajectory: Trajectory) -> float:
if not trajectory.steps:
return 0.0
# Extract scores and difficulties
scores = trajectory.get_score_series()
difficulties = [step.problem_state.difficulty for step in trajectory.steps]
if len(scores) < 2:
return scores[0] # Return the single score if only one step
# Calculate normalized scores (adjusted by difficulty)
normalized_scores = [scores[i] * (1 + difficulties[i])
for i in range(len(scores))]
# Return the average normalized score
return sum(normalized_scores) / len(normalized_scores)
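
# Worked example (hypothetical numbers): scores [0.5, 0.6] with difficulties
# [0.2, 0.5] give normalized scores [0.5 * 1.2, 0.6 * 1.5] = [0.6, 0.9], so
# DynamicComplexityHandling = (0.6 + 0.9) / 2 = 0.75.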