# recursive_swe_bench/evaluation/harness.py from typing import Any, Dict, List, Optional, Tuple, Union, Callable import datetime import uuid import json import os import logging from dataclasses import dataclass, field from recursive_swe_bench.core.recursive_task import ( RecursiveTask, Trajectory, TrajectoryStep, ProblemState, EvaluationResult, Feedback, TaskStatus ) class RecursiveEvaluator: """ The core evaluation harness for recursive benchmark tasks. This class orchestrates the recursive evaluation process, managing the interactions between models and tasks, tracking trajectories, and calculating metrics. """ def __init__( self, model: Any, # Model interface metrics: Dict[str, Any], # Metric calculators config: Dict[str, Any] = None ): """ Initialize the recursive evaluator. Args: model: The model to evaluate metrics: Dictionary of metric calculators config: Configuration options """ self.model = model self.metrics = metrics self.config = config or {} self.logger = self._setup_logger() def _setup_logger(self) -> logging.Logger: """Set up logging for the evaluator.""" logger = logging.getLogger("RecursiveEvaluator") handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(self.config.get("log_level", logging.INFO)) return logger def evaluate_task( self, task: RecursiveTask, max_iterations: int = 5 ) -> Tuple[Trajectory, Dict[str, float]]: """ Run a full recursive evaluation on a single task. Args: task: The task to evaluate max_iterations: Maximum number of iterations Returns: The trajectory and calculated metrics """ self.logger.info(f"Starting evaluation of task {task.task_id}") for i in range(max_iterations): self.logger.info(f"Starting iteration {i+1}/{max_iterations}") # Get the current problem problem = task.get_current_problem() self.logger.debug(f"Problem state: evolution_stage={problem['evolution_stage']}") # Format the problem for the model formatted_problem = self._format_problem_for_model(problem, task.trajectory) # Get model solution self.logger.debug("Requesting solution from model") solution = self.model.solve(formatted_problem) # Evaluate the solution self.logger.debug("Evaluating solution") result, feedback = task.evaluate_solution(solution) # Log the results self.logger.info(f"Solution score: {result.score:.4f}, Success: {result.success}") # Update the task state based on the solution new_state = task.update_state(solution, result, feedback) # Check if we've reached a terminal state if task.status != TaskStatus.IN_PROGRESS: self.logger.info(f"Task complete with status: {task.status.value}") break # Calculate metrics across the trajectory self.logger.info("Calculating metrics") metrics_result = self._calculate_metrics(task.trajectory) return task.trajectory, metrics_result def evaluate_task_set( self, tasks: List[RecursiveTask], max_iterations: int = 5, output_dir: Optional[str] = None ) -> Dict[str, Any]: """ Evaluate a set of tasks and aggregate the results. Args: tasks: List of tasks to evaluate max_iterations: Maximum iterations per task output_dir: Directory to save results (optional) Returns: Dictionary of aggregated results """ self.logger.info(f"Evaluating {len(tasks)} tasks") results = {} trajectories = {} all_metrics = {} for i, task in enumerate(tasks): self.logger.info(f"Evaluating task {i+1}/{len(tasks)}: {task.task_id}") # Evaluate the task trajectory, metrics = self.evaluate_task(task, max_iterations) # Store the results trajectories[task.task_id] = trajectory all_metrics[task.task_id] = metrics # Save the trajectory if output_dir is provided if output_dir: os.makedirs(output_dir, exist_ok=True) task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json") task.save(task_output_path) self.logger.info(f"Saved task to {task_output_path}") # Aggregate metrics across all tasks aggregated_metrics = self._aggregate_metrics(all_metrics) # Compile results results = { "aggregated_metrics": aggregated_metrics, "task_metrics": all_metrics, "timestamp": datetime.datetime.now().isoformat(), "model_info": self.model.get_meta_information(), "total_tasks": len(tasks), "config": self.config } # Save aggregated results if output_dir is provided if output_dir: results_path = os.path.join(output_dir, "aggregated_results.json") with open(results_path, "w") as f: json.dump(results, f, indent=2) self.logger.info(f"Saved aggregated results to {results_path}") return results def _format_problem_for_model( self, problem: Dict[str, Any], trajectory: Trajectory ) -> Dict[str, Any]: """ Format the problem in a way the model can understand. Args: problem: The problem state trajectory: The trajectory so far Returns: Formatted problem for the model """ # Extract the previous steps if they exist previous_steps = [] for step in trajectory.steps: previous_steps.append({ "problem": { "description": step.problem_state.description, "requirements": step.problem_state.requirements, "evolution_stage": step.problem_state.evolution_stage }, "solution": step.solution, "feedback": { "summary": step.feedback.summary, "issues": step.feedback.issues, "suggestions": step.feedback.suggestions, "focus_areas": step.feedback.focus_areas } }) # Format the problem with the trajectory context formatted_problem = { "description": problem["description"], "code_context": problem["code_context"], "requirements": problem["requirements"], "iteration": problem["evolution_stage"] + 1, "previous_attempts": previous_steps } return formatted_problem def _calculate_metrics(self, trajectory: Trajectory) -> Dict[str, float]: """ Calculate metrics across the trajectory. Args: trajectory: The solution trajectory Returns: Dictionary of metric values """ return {name: metric.calculate(trajectory) for name, metric in self.metrics.items()} def _aggregate_metrics( self, all_metrics: Dict[str, Dict[str, float]] ) -> Dict[str, float]: """ Aggregate metrics across multiple tasks. Args: all_metrics: Dictionary of metrics per task Returns: Dictionary of aggregated metrics """ # Initialize aggregated metrics if not all_metrics: return {} sample_metrics = next(iter(all_metrics.values())) aggregated = {name: 0.0 for name in sample_metrics.keys()} # Sum up metrics for task_metrics in all_metrics.values(): for name, value in task_metrics.items(): aggregated[name] += value # Calculate averages for name in aggregated: aggregated[name] /= len(all_metrics) return aggregated # recursive_swe_bench/evaluation/metrics/recursive.py from typing import Any, Dict, List, Optional import numpy as np from recursive_swe_bench.core.recursive_task import Trajectory class RecursiveMetric: """Base class for recursive metrics.""" def __init__(self, config: Dict[str, Any] = None): self.config = config or {} def calculate(self, trajectory: Trajectory) -> float: """ Calculate the metric value for a trajectory. Args: trajectory: The solution trajectory Returns: The metric value """ raise NotImplementedError("Subclasses must implement this method") class ConvergenceRate(RecursiveMetric): """ Measures how quickly the model reaches a stable solution. A lower value indicates faster convergence. """ def calculate(self, trajectory: Trajectory) -> float: scores = trajectory.get_score_series() if len(scores) < 2: return 0.0 # Calculate changes between consecutive scores deltas = [abs(scores[i+1] - scores[i]) for i in range(len(scores)-1)] # A lower sum indicates faster convergence # Normalize by the number of iterations return sum(deltas) / len(deltas) class AdaptationEfficiency(RecursiveMetric): """ Measures improvement per feedback iteration. A higher value indicates more efficient adaptation. """ def calculate(self, trajectory: Trajectory) -> float: scores = trajectory.get_score_series() if len(scores) < 2: return 0.0 # Calculate the improvement from first to last iteration total_improvement = max(0.0, scores[-1] - scores[0]) # Normalize by the number of iterations return total_improvement / (len(scores) - 1) class LearningCurveArea(RecursiveMetric): """ Measures the area under the learning curve. A higher value indicates better overall performance across iterations. """ def calculate(self, trajectory: Trajectory) -> float: scores = trajectory.get_score_series() if not scores: return 0.0 # Calculate the area under the curve # Normalize by the maximum possible area (perfect score from the start) max_score = self.config.get("max_score", 1.0) max_area = max_score * len(scores) return sum(scores) / max_area class ProbabilisticSolutionQuality(RecursiveMetric): """ Measures the distribution of solution quality using non-deterministic assessment. This metric captures the robustness of solutions by measuring the variability in quality across multiple probabilistic evaluations. """ def calculate(self, trajectory: Trajectory) -> float: # For each step, we expect the result.metrics to contain probabilistic assessments steps = trajectory.steps if not steps: return 0.0 # Extract probabilistic quality distributions if available distributions = [] for step in steps: if (step.result.metrics and "probabilistic_quality_distribution" in step.result.metrics): distributions.append( step.result.metrics["probabilistic_quality_distribution"]) if not distributions: # Fall back to deterministic scores if no distributions are available return trajectory.get_score_series()[-1] # Calculate the expected value of the final distribution final_distribution = distributions[-1] return sum(prob * val for val, prob in final_distribution.items()) class TransferLearningFactor(RecursiveMetric): """ Measures how well learning transfers across related problems. This requires multiple trajectories from related tasks. """ def __init__(self, config: Dict[str, Any] = None, related_trajectories: List[Trajectory] = None): super().__init__(config) self.related_trajectories = related_trajectories or [] def calculate(self, trajectory: Trajectory) -> float: # This metric requires related trajectories if not self.related_trajectories: return 0.0 # Get learning rates for the current trajectory and related ones current_learning_rate = self._calculate_learning_rate(trajectory) related_learning_rates = [ self._calculate_learning_rate(rel_traj) for rel_traj in self.related_trajectories ] # Filter out invalid learning rates valid_related_rates = [rate for rate in related_learning_rates if rate is not None] if not valid_related_rates: return 0.0 # Calculate the transfer factor as the ratio of the current learning rate # to the average of related learning rates avg_related_rate = sum(valid_related_rates) / len(valid_related_rates) if avg_related_rate == 0: return 0.0 return current_learning_rate / avg_related_rate def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]: """Calculate the learning rate for a trajectory.""" scores = trajectory.get_score_series() if len(scores) < 2: return None # Calculate improvement per iteration return (scores[-1] - scores[0]) / (len(scores) - 1) class DynamicComplexityHandling(RecursiveMetric): """ Measures how well the model handles varying problem complexity. This metric evaluates performance while accounting for changes in problem difficulty. """ def calculate(self, trajectory: Trajectory) -> float: if not trajectory.steps: return 0.0 # Extract scores and difficulties scores = trajectory.get_score_series() difficulties = [step.problem_state.difficulty for step in trajectory.steps] if len(scores) < 2: return scores[0] # Return the single score if only one step # Calculate normalized scores (adjusted by difficulty) normalized_scores = [scores[i] * (1 + difficulties[i]) for i in range(len(scores))] # Return the average normalized score return sum(normalized_scores) / len(normalized_scores)