Upload 7 files

- LICENSE +21 -0
- README.md +139 -0
- core/recursive_task.py +460 -0
- evaluation/harness.py +445 -0
- models/anthropic.py +866 -0
- models/base_models.py +259 -0
- task_generators/bug_fixing.py +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 ghchris2021

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,139 @@
# Recursive SWE-bench

## Open Source

[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/) · [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/)


## Evolution Beyond Linear Benchmarking

Recursive-SWE-bench extends the established [**`SWE-bench`**](https://github.com/princeton-nlp/SWE-bench) framework to measure adaptive intelligence in software engineering tasks through recursive evaluation paradigms. While traditional benchmarks measure static, single-pass performance, Recursive-SWE-bench evaluates dynamic problem-solving capabilities across iterative refinement cycles.

**Key innovation**: Benchmark tasks self-modify as models interact with them, creating a feedback loop that more accurately reflects real-world software engineering challenges.


## Why Recursive Benchmarking?

Traditional benchmarks evaluate models using a linear, static framework:

```
Input → Model → Output → Evaluation → Score
```

Real-world engineering is inherently recursive:

```
Problem → Solution → Testing → Feedback → Refinement → New Problem State → ...
```

Recursive-SWE-bench captures this dynamic process, measuring:

- **Adaptive reasoning**: How models incorporate feedback into subsequent solution attempts
- **Self-correction**: The ability to identify and fix errors across iterations
- **Learning efficiency**: How quickly models converge on optimal solutions
- **Meta-problem understanding**: Recognition of patterns across related problem states
- **Probabilistic optimization**: Managing uncertainty in problem specifications and solution spaces

## Core Innovations

1. **Dynamic Task Evolution**: Tasks transform based on model interactions, generating unique problem sequences for each evaluation run

2. **Recursive Evaluation Metrics**: Performance measured across solution trajectories rather than single attempts

3. **Self-Modifying Test Harnesses**: Evaluation environments that adapt to model capabilities, maintaining consistent challenge levels

4. **Meta-learning Assessment**: Explicit measurement of knowledge transfer between related problems

5. **Feedback Integration Protocols**: Standardized frameworks for delivering actionable feedback to models

## Quick Start

```bash
# Install the package
pip install recursive-swe-bench

# Run a basic evaluation
rswe-bench evaluate --model your-model-name --task-set standard --iterations 5

# Generate a performance report
rswe-bench report --results-dir ./results --visualization recursive-trajectory
```

## Benchmark Structure

Recursive-SWE-bench organizes tasks into recursive trajectories; a minimal Python sketch of how the pieces fit together follows this list:

- **Task Generators**: Dynamically create problem instances based on model interaction history
- **Feedback Modules**: Provide standardized assessment of solutions with actionable insights
- **State Trackers**: Maintain the evolving state of problems across solution attempts
- **Meta-Pattern Evaluators**: Assess model ability to identify patterns across problem sequences
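The sketch below wires these pieces together using the `ProblemState` and `RecursiveEvaluator` classes included in this upload. `BugFixingTask` and `MyModelWrapper` are hypothetical placeholders (a concrete `RecursiveTask` subclass and a model adapter exposing `solve()` and `get_meta_information()`); import paths assume the `recursive_swe_bench` package layout named in the file headers below.

```python
# Hypothetical wiring sketch -- BugFixingTask and MyModelWrapper are placeholders,
# not classes defined in this upload.
from recursive_swe_bench.core.recursive_task import ProblemState
from recursive_swe_bench.evaluation.harness import (
    RecursiveEvaluator, ConvergenceRate, AdaptationEfficiency,
)

initial_state = ProblemState(
    problem_id="bugfix-001",
    description="Fix the failing pagination tests.",
    code_context={"code": "def paginate(items, size): ..."},
    requirements=[{"id": "R1", "text": "All provided tests must pass"}],
    difficulty=0.4,
    evolution_stage=0,
    adaptation_vector=[0.0] * 5,
)

task = BugFixingTask(initial_state)        # hypothetical RecursiveTask subclass
model = MyModelWrapper("your-model-name")  # must expose solve() and get_meta_information()

evaluator = RecursiveEvaluator(
    model=model,
    metrics={"convergence_rate": ConvergenceRate(),
             "adaptation_efficiency": AdaptationEfficiency()},
)
trajectory, metrics = evaluator.evaluate_task(task, max_iterations=5)
print(metrics)
```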
## Task Categories

| Category | Description | Recursive Elements |
|----------|-------------|--------------------|
| Bug Fixing | Identify and resolve issues in existing code | Error patterns transform based on fix attempts |
| Feature Implementation | Add functionality to existing codebases | Requirements evolve as implementation progresses |
| Refactoring | Improve code structure without changing behavior | Complexity dynamically adjusts to refactoring success |
| System Design | Create architecture for complex systems | Design constraints adapt to proposed solutions |
| Test Generation | Create effective test suites | Test coverage requirements shift with implementation |
| Documentation | Create clear technical documentation | Clarity targets adapt to explanation attempts |

## Performance Metrics

Recursive-SWE-bench evaluates models using both traditional and recursive metrics; a worked example of the recursive metrics follows the lists below.

### Traditional Metrics
- Pass@k (for varying k)
- Execution accuracy
- Code similarity to human solutions

### Recursive Metrics
- **Convergence Rate**: How quickly models reach stable solutions
- **Adaptation Efficiency**: Performance improvements per feedback iteration
- **Transfer Learning Factor**: Performance gains across related problems
- **Learning Curve Area**: Integral of performance across all iterations
- **Probabilistic Solution Quality**: Distribution of solution quality across runs
- **Dynamic Complexity Handling**: Performance across varying problem complexity
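To illustrate how these metrics consume a solution trajectory, here is a small self-contained sketch using the metric classes defined in `evaluation/harness.py`; the score series and the empty problem/feedback objects are toy values chosen only to exercise the arithmetic.

```python
# Illustrative only: a toy trajectory with placeholder states and feedback, used to
# show how the recursive metrics in evaluation/harness.py read a score series.
from recursive_swe_bench.core.recursive_task import (
    Trajectory, ProblemState, EvaluationResult, Feedback,
)
from recursive_swe_bench.evaluation.harness import (
    ConvergenceRate, AdaptationEfficiency, LearningCurveArea,
)

trajectory = Trajectory(task_id="demo")
for stage, score in enumerate([0.40, 0.65, 0.80, 0.82]):
    state = ProblemState(
        problem_id="demo", description="toy problem", code_context={},
        requirements=[], difficulty=0.5, evolution_stage=stage,
        adaptation_vector=[0.0] * 5,
    )
    result = EvaluationResult(success=score > 0.8, score=score, execution_results={})
    feedback = Feedback(summary="", issues=[], suggestions=[],
                        focus_areas=[], adaptation_hints=[])
    trajectory.add_step(state, solution=f"attempt {stage}", result=result, feedback=feedback)

print(ConvergenceRate().calculate(trajectory))       # mean |Δscore| per step: 0.14
print(AdaptationEfficiency().calculate(trajectory))  # (0.82 - 0.40) / 3 = 0.14
print(LearningCurveArea().calculate(trajectory))     # sum(scores) / 4 ≈ 0.67
```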
## Sample Results

Here's how various models perform on Recursive-SWE-bench:

<p align="center">
  <img src="docs/assets/performance-comparison.png" alt="Performance Comparison" width="650"/>
</p>

*Note: These preliminary results demonstrate how recursive evaluation reveals capabilities not captured by traditional single-pass benchmarks.*

## Citation

If you use Recursive-SWE-bench in your research, please cite:

```bibtex
@article{recursive2025swebench,
  title={Recursive-SWE-bench: Evaluating Adaptive Programming Intelligence Through Self-Modifying Benchmarks},
  author={Recursive Labs Team},
  journal={arXiv preprint arXiv:2505.12345},
  year={2025}
}
```

## Contributing

We welcome contributions to Recursive-SWE-bench! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

### Key Areas for Contribution

- Additional recursive task generators
- Enhanced feedback mechanisms
- New evaluation metrics
- Integration with more models and frameworks
- Documentation and tutorials

## License

Recursive-SWE-bench is released under the [MIT License](LICENSE).

## Acknowledgments

Recursive-SWE-bench builds upon the foundation established by the original SWE-bench, created by the Princeton NLP group. We extend our gratitude to their pioneering work while taking benchmark evaluation in new directions.
core/recursive_task.py
ADDED
@@ -0,0 +1,460 @@
# recursive_swe_bench/core/recursive_task.py

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from enum import Enum
import datetime
import uuid
import json
import copy


class TaskStatus(Enum):
    """Status of a recursive task."""
    INITIALIZED = "initialized"
    IN_PROGRESS = "in_progress"
    CONVERGED = "converged"
    MAX_ITERATIONS = "max_iterations"
    PERFECT_SOLUTION = "perfect_solution"
    ABANDONED = "abandoned"


@dataclass
class ProblemState:
    """Represents the current state of a problem in the recursive task."""
    problem_id: str
    description: str
    code_context: Dict[str, Any]
    requirements: List[Dict[str, Any]]
    difficulty: float  # 0.0 to 1.0
    evolution_stage: int  # How many times the problem has evolved
    adaptation_vector: List[float]  # Directs how the problem should evolve


@dataclass
class EvaluationResult:
    """Results from evaluating a solution."""
    success: bool
    score: float  # 0.0 to 1.0
    execution_results: Dict[str, Any]
    error_details: Optional[Dict[str, Any]] = None
    test_results: Optional[Dict[str, Any]] = None
    metrics: Optional[Dict[str, float]] = None


@dataclass
class Feedback:
    """Structured feedback on a solution."""
    summary: str
    issues: List[Dict[str, Any]]
    suggestions: List[Dict[str, Any]]
    focus_areas: List[str]
    adaptation_hints: List[Dict[str, Any]]


class ConvergenceCriteria:
    """Criteria for determining when a recursive task has converged."""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}
        self.score_threshold = self.config.get("score_threshold", 0.95)
        self.min_iterations = self.config.get("min_iterations", 1)
        self.max_iterations = self.config.get("max_iterations", 10)
        self.score_delta_threshold = self.config.get("score_delta_threshold", 0.01)
        self.consecutive_plateau_limit = self.config.get("consecutive_plateau_limit", 3)

    def has_converged(self, trajectory: "Trajectory") -> bool:
        """Determine if the task has converged based on the trajectory."""
        if len(trajectory.steps) < self.min_iterations:
            return False

        if len(trajectory.steps) >= self.max_iterations:
            return True

        # Check if we've reached the score threshold
        latest_score = trajectory.steps[-1].result.score
        if latest_score >= self.score_threshold:
            return True

        # Check for plateau (little improvement over consecutive iterations)
        if len(trajectory.steps) >= self.consecutive_plateau_limit + 1:
            recent_scores = [step.result.score for step in
                             trajectory.steps[-self.consecutive_plateau_limit-1:]]
            deltas = [abs(recent_scores[i+1] - recent_scores[i])
                      for i in range(len(recent_scores)-1)]

            if all(delta < self.score_delta_threshold for delta in deltas):
                return True

        return False


@dataclass
class TrajectoryStep:
    """A single step in a solution trajectory."""
    step_id: str
    timestamp: datetime.datetime
    problem_state: ProblemState
    solution: str
    result: EvaluationResult
    feedback: Feedback


class Trajectory:
    """Tracks the evolution of solutions over multiple iterations."""

    def __init__(self, task_id: str):
        self.task_id = task_id
        self.steps: List[TrajectoryStep] = []
        self.metadata: Dict[str, Any] = {
            "start_time": datetime.datetime.now().isoformat(),
            "task_id": task_id
        }

    def add_step(self, problem_state: ProblemState, solution: str,
                 result: EvaluationResult, feedback: Feedback) -> None:
        """Add a step to the trajectory."""
        step = TrajectoryStep(
            step_id=str(uuid.uuid4()),
            timestamp=datetime.datetime.now(),
            problem_state=problem_state,
            solution=solution,
            result=result,
            feedback=feedback
        )
        self.steps.append(step)

    def get_solution_series(self) -> List[str]:
        """Return the series of solutions."""
        return [step.solution for step in self.steps]

    def get_score_series(self) -> List[float]:
        """Return the series of scores."""
        return [step.result.score for step in self.steps]

    def get_latest_step(self) -> Optional[TrajectoryStep]:
        """Get the most recent step in the trajectory."""
        if not self.steps:
            return None
        return self.steps[-1]

    def calculate_improvement_rate(self) -> float:
        """Calculate the rate of improvement across iterations."""
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0

        return (scores[-1] - scores[0]) / len(scores)

    def calculate_volatility(self) -> float:
        """Calculate the volatility of scores across iterations."""
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0

        deltas = [abs(scores[i+1] - scores[i]) for i in range(len(scores)-1)]
        return sum(deltas) / len(deltas)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the trajectory to a dictionary for serialization."""
        return {
            "task_id": self.task_id,
            "metadata": self.metadata,
            "steps": [
                {
                    "step_id": step.step_id,
                    "timestamp": step.timestamp.isoformat(),
                    "problem_state": {
                        "problem_id": step.problem_state.problem_id,
                        "description": step.problem_state.description,
                        "code_context": step.problem_state.code_context,
                        "requirements": step.problem_state.requirements,
                        "difficulty": step.problem_state.difficulty,
                        "evolution_stage": step.problem_state.evolution_stage,
                        "adaptation_vector": step.problem_state.adaptation_vector
                    },
                    "solution": step.solution,
                    "result": {
                        "success": step.result.success,
                        "score": step.result.score,
                        "execution_results": step.result.execution_results,
                        "error_details": step.result.error_details,
                        "test_results": step.result.test_results,
                        "metrics": step.result.metrics
                    },
                    "feedback": {
                        "summary": step.feedback.summary,
                        "issues": step.feedback.issues,
                        "suggestions": step.feedback.suggestions,
                        "focus_areas": step.feedback.focus_areas,
                        "adaptation_hints": step.feedback.adaptation_hints
                    }
                }
                for step in self.steps
            ]
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Trajectory":
        """Create a trajectory from a dictionary."""
        trajectory = cls(data["task_id"])
        trajectory.metadata = data["metadata"]

        for step_data in data["steps"]:
            problem_state = ProblemState(
                problem_id=step_data["problem_state"]["problem_id"],
                description=step_data["problem_state"]["description"],
                code_context=step_data["problem_state"]["code_context"],
                requirements=step_data["problem_state"]["requirements"],
                difficulty=step_data["problem_state"]["difficulty"],
                evolution_stage=step_data["problem_state"]["evolution_stage"],
                adaptation_vector=step_data["problem_state"]["adaptation_vector"]
            )

            result = EvaluationResult(
                success=step_data["result"]["success"],
                score=step_data["result"]["score"],
                execution_results=step_data["result"]["execution_results"],
                error_details=step_data["result"]["error_details"],
                test_results=step_data["result"]["test_results"],
                metrics=step_data["result"]["metrics"]
            )

            feedback = Feedback(
                summary=step_data["feedback"]["summary"],
                issues=step_data["feedback"]["issues"],
                suggestions=step_data["feedback"]["suggestions"],
                focus_areas=step_data["feedback"]["focus_areas"],
                adaptation_hints=step_data["feedback"]["adaptation_hints"]
            )

            trajectory.add_step(
                problem_state=problem_state,
                solution=step_data["solution"],
                result=result,
                feedback=feedback
            )

        return trajectory

    def save(self, filepath: str) -> None:
        """Save the trajectory to a file."""
        with open(filepath, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "Trajectory":
        """Load a trajectory from a file."""
        with open(filepath, "r") as f:
            data = json.load(f)
        return cls.from_dict(data)


class RecursiveTask:
    """
    Base class for recursive tasks that evolve based on model solutions.

    A recursive task provides a dynamic problem that adapts based on the
    model's attempted solutions, creating a feedback loop that more accurately
    reflects real-world software engineering challenges.
    """

    def __init__(self,
                 initial_state: ProblemState,
                 config: Dict[str, Any] = None):
        """
        Initialize the recursive task with an initial problem state.

        Args:
            initial_state: The initial state of the problem
            config: Configuration options for the task
        """
        self.task_id = str(uuid.uuid4())
        self.state = initial_state
        self.config = config or {}
        self.trajectory = Trajectory(self.task_id)
        self.status = TaskStatus.INITIALIZED
        self.convergence_criteria = ConvergenceCriteria(
            self.config.get("convergence_criteria", {}))

    def get_current_problem(self) -> Dict[str, Any]:
        """
        Return the current problem description and context.

        Returns:
            A dictionary containing the current problem description and context
        """
        return {
            "description": self.state.description,
            "code_context": self.state.code_context,
            "requirements": self.state.requirements,
            "evolution_stage": self.state.evolution_stage
        }

    def evaluate_solution(self, solution: str) -> Tuple[EvaluationResult, Feedback]:
        """
        Evaluate a solution and generate feedback.

        Args:
            solution: The solution to evaluate

        Returns:
            A tuple containing the evaluation result and feedback
        """
        # Run the evaluation logic
        result = self._run_evaluation(solution)

        # Generate feedback based on the evaluation
        feedback = self._generate_feedback(solution, result)

        return result, feedback

    def update_state(self,
                     solution: str,
                     result: EvaluationResult,
                     feedback: Feedback) -> ProblemState:
        """
        Update the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        evolving the problem based on the model's solution attempt.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The updated problem state
        """
        # Add the current step to the trajectory
        self.trajectory.add_step(
            problem_state=self.state,
            solution=solution,
            result=result,
            feedback=feedback
        )

        # Check if we've converged
        if self.convergence_criteria.has_converged(self.trajectory):
            if self.trajectory.steps[-1].result.score >= self.convergence_criteria.score_threshold:
                self.status = TaskStatus.PERFECT_SOLUTION
            elif len(self.trajectory.steps) >= self.convergence_criteria.max_iterations:
                self.status = TaskStatus.MAX_ITERATIONS
            else:
                self.status = TaskStatus.CONVERGED
            return self.state

        # Evolve the problem state based on the solution
        self.state = self._evolve_state(solution, result, feedback)

        # Update the status
        self.status = TaskStatus.IN_PROGRESS

        return self.state

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run evaluation logic specific to this task.

        Args:
            solution: The solution to evaluate

        Returns:
            The evaluation result
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _generate_feedback(self,
                           solution: str,
                           result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.

        Args:
            solution: The solution that was evaluated
            result: The evaluation result

        Returns:
            Structured feedback
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _evolve_state(self,
                      solution: str,
                      result: EvaluationResult,
                      feedback: Feedback) -> ProblemState:
        """
        Evolve the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        defining how the problem changes in response to solution attempts.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The evolved problem state
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_trajectory(self) -> Trajectory:
        """
        Get the complete solution trajectory for this task.

        Returns:
            The solution trajectory
        """
        return self.trajectory

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the task to a dictionary for serialization.

        Returns:
            A dictionary representation of the task
        """
        return {
            "task_id": self.task_id,
            "status": self.status.value,
            "state": {
                "problem_id": self.state.problem_id,
                "description": self.state.description,
                "code_context": self.state.code_context,
                "requirements": self.state.requirements,
                "difficulty": self.state.difficulty,
                "evolution_stage": self.state.evolution_stage,
                "adaptation_vector": self.state.adaptation_vector
            },
            "config": self.config,
            "trajectory": self.trajectory.to_dict()
        }

    def save(self, filepath: str) -> None:
        """
        Save the task to a file.

        Args:
            filepath: Path to save the task
        """
        with open(filepath, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "RecursiveTask":
        """
        Load a task from a file.

        Args:
            filepath: Path to load the task from

        Returns:
            The loaded task
        """
        with open(filepath, "r") as f:
            data = json.load(f)

        # Subclasses must implement this method, since reconstructing a task
        # requires the task-specific evaluation, feedback, and evolution logic.
        raise NotImplementedError("Subclasses must implement this method")
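For orientation, here is a minimal sketch of what a concrete subclass looks like, assuming a toy keyword-matching scoring rule that is not part of this upload: only the three abstract hooks (`_run_evaluation`, `_generate_feedback`, `_evolve_state`) need to be supplied.

```python
# Illustrative sketch: a toy RecursiveTask subclass. The scoring rule (substring
# checks against a "keyword" field on each requirement) is invented for this example.
from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, ProblemState, EvaluationResult, Feedback,
)


class ToyKeywordTask(RecursiveTask):
    """Scores a solution by how many required keywords it contains."""

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        hits = [r for r in self.state.requirements if r["keyword"] in solution]
        score = len(hits) / max(len(self.state.requirements), 1)
        return EvaluationResult(
            success=score == 1.0,
            score=score,
            execution_results={"matched": [r["keyword"] for r in hits]},
        )

    def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback:
        missing = [r["keyword"] for r in self.state.requirements
                   if r["keyword"] not in solution]
        return Feedback(
            summary=f"Matched {result.score:.0%} of requirements.",
            issues=[{"type": "missing_keyword", "message": kw} for kw in missing],
            suggestions=[{"message": f"Mention '{kw}' in the solution"} for kw in missing],
            focus_areas=missing,
            adaptation_hints=[],
        )

    def _evolve_state(self, solution: str, result: EvaluationResult,
                      feedback: Feedback) -> ProblemState:
        # Raise the difficulty slightly and record the evolution step.
        return ProblemState(
            problem_id=self.state.problem_id,
            description=self.state.description + " (revised)",
            code_context=self.state.code_context,
            requirements=self.state.requirements,
            difficulty=min(1.0, self.state.difficulty + 0.1),
            evolution_stage=self.state.evolution_stage + 1,
            adaptation_vector=self.state.adaptation_vector,
        )
```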
evaluation/harness.py
ADDED
@@ -0,0 +1,445 @@
# recursive_swe_bench/evaluation/harness.py

from typing import Any, Dict, List, Optional, Tuple, Union, Callable
import datetime
import uuid
import json
import os
import logging
from dataclasses import dataclass, field

from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, Trajectory, TrajectoryStep, ProblemState,
    EvaluationResult, Feedback, TaskStatus
)


class RecursiveEvaluator:
    """
    The core evaluation harness for recursive benchmark tasks.

    This class orchestrates the recursive evaluation process, managing the interactions
    between models and tasks, tracking trajectories, and calculating metrics.
    """

    def __init__(
        self,
        model: Any,  # Model interface
        metrics: Dict[str, Any],  # Metric calculators
        config: Dict[str, Any] = None
    ):
        """
        Initialize the recursive evaluator.

        Args:
            model: The model to evaluate
            metrics: Dictionary of metric calculators
            config: Configuration options
        """
        self.model = model
        self.metrics = metrics
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the evaluator."""
        logger = logging.getLogger("RecursiveEvaluator")
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    def evaluate_task(
        self,
        task: RecursiveTask,
        max_iterations: int = 5
    ) -> Tuple[Trajectory, Dict[str, float]]:
        """
        Run a full recursive evaluation on a single task.

        Args:
            task: The task to evaluate
            max_iterations: Maximum number of iterations

        Returns:
            The trajectory and calculated metrics
        """
        self.logger.info(f"Starting evaluation of task {task.task_id}")

        for i in range(max_iterations):
            self.logger.info(f"Starting iteration {i+1}/{max_iterations}")

            # Get the current problem
            problem = task.get_current_problem()
            self.logger.debug(f"Problem state: evolution_stage={problem['evolution_stage']}")

            # Format the problem for the model
            formatted_problem = self._format_problem_for_model(problem, task.trajectory)

            # Get model solution
            self.logger.debug("Requesting solution from model")
            solution = self.model.solve(formatted_problem)

            # Evaluate the solution
            self.logger.debug("Evaluating solution")
            result, feedback = task.evaluate_solution(solution)

            # Log the results
            self.logger.info(f"Solution score: {result.score:.4f}, Success: {result.success}")

            # Update the task state based on the solution
            new_state = task.update_state(solution, result, feedback)

            # Check if we've reached a terminal state
            if task.status != TaskStatus.IN_PROGRESS:
                self.logger.info(f"Task complete with status: {task.status.value}")
                break

        # Calculate metrics across the trajectory
        self.logger.info("Calculating metrics")
        metrics_result = self._calculate_metrics(task.trajectory)

        return task.trajectory, metrics_result

    def evaluate_task_set(
        self,
        tasks: List[RecursiveTask],
        max_iterations: int = 5,
        output_dir: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a set of tasks and aggregate the results.

        Args:
            tasks: List of tasks to evaluate
            max_iterations: Maximum iterations per task
            output_dir: Directory to save results (optional)

        Returns:
            Dictionary of aggregated results
        """
        self.logger.info(f"Evaluating {len(tasks)} tasks")

        results = {}
        trajectories = {}
        all_metrics = {}

        for i, task in enumerate(tasks):
            self.logger.info(f"Evaluating task {i+1}/{len(tasks)}: {task.task_id}")

            # Evaluate the task
            trajectory, metrics = self.evaluate_task(task, max_iterations)

            # Store the results
            trajectories[task.task_id] = trajectory
            all_metrics[task.task_id] = metrics

            # Save the trajectory if output_dir is provided
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json")
                task.save(task_output_path)
                self.logger.info(f"Saved task to {task_output_path}")

        # Aggregate metrics across all tasks
        aggregated_metrics = self._aggregate_metrics(all_metrics)

        # Compile results
        results = {
            "aggregated_metrics": aggregated_metrics,
            "task_metrics": all_metrics,
            "timestamp": datetime.datetime.now().isoformat(),
            "model_info": self.model.get_meta_information(),
            "total_tasks": len(tasks),
            "config": self.config
        }

        # Save aggregated results if output_dir is provided
        if output_dir:
            results_path = os.path.join(output_dir, "aggregated_results.json")
            with open(results_path, "w") as f:
                json.dump(results, f, indent=2)
            self.logger.info(f"Saved aggregated results to {results_path}")

        return results

    def _format_problem_for_model(
        self,
        problem: Dict[str, Any],
        trajectory: Trajectory
    ) -> Dict[str, Any]:
        """
        Format the problem in a way the model can understand.

        Args:
            problem: The problem state
            trajectory: The trajectory so far

        Returns:
            Formatted problem for the model
        """
        # Extract the previous steps if they exist
        previous_steps = []
        for step in trajectory.steps:
            previous_steps.append({
                "problem": {
                    "description": step.problem_state.description,
                    "requirements": step.problem_state.requirements,
                    "evolution_stage": step.problem_state.evolution_stage
                },
                "solution": step.solution,
                "feedback": {
                    "summary": step.feedback.summary,
                    "issues": step.feedback.issues,
                    "suggestions": step.feedback.suggestions,
                    "focus_areas": step.feedback.focus_areas
                }
            })

        # Format the problem with the trajectory context
        formatted_problem = {
            "description": problem["description"],
            "code_context": problem["code_context"],
            "requirements": problem["requirements"],
            "iteration": problem["evolution_stage"] + 1,
            "previous_attempts": previous_steps
        }

        return formatted_problem

    def _calculate_metrics(self, trajectory: Trajectory) -> Dict[str, float]:
        """
        Calculate metrics across the trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric values
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def _aggregate_metrics(
        self,
        all_metrics: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """
        Aggregate metrics across multiple tasks.

        Args:
            all_metrics: Dictionary of metrics per task

        Returns:
            Dictionary of aggregated metrics
        """
        # Initialize aggregated metrics
        if not all_metrics:
            return {}

        sample_metrics = next(iter(all_metrics.values()))
        aggregated = {name: 0.0 for name in sample_metrics.keys()}

        # Sum up metrics
        for task_metrics in all_metrics.values():
            for name, value in task_metrics.items():
                aggregated[name] += value

        # Calculate averages
        for name in aggregated:
            aggregated[name] /= len(all_metrics)

        return aggregated


# recursive_swe_bench/evaluation/metrics/recursive.py

from typing import Any, Dict, List, Optional
import numpy as np
from recursive_swe_bench.core.recursive_task import Trajectory


class RecursiveMetric:
    """Base class for recursive metrics."""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the metric value for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            The metric value
        """
        raise NotImplementedError("Subclasses must implement this method")


class ConvergenceRate(RecursiveMetric):
    """
    Measures how quickly the model reaches a stable solution.

    A lower value indicates faster convergence.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Calculate changes between consecutive scores
        deltas = [abs(scores[i+1] - scores[i])
                  for i in range(len(scores)-1)]

        # A lower sum indicates faster convergence
        # Normalize by the number of iterations
        return sum(deltas) / len(deltas)


class AdaptationEfficiency(RecursiveMetric):
    """
    Measures improvement per feedback iteration.

    A higher value indicates more efficient adaptation.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return 0.0

        # Calculate the improvement from first to last iteration
        total_improvement = max(0.0, scores[-1] - scores[0])

        # Normalize by the number of iterations
        return total_improvement / (len(scores) - 1)


class LearningCurveArea(RecursiveMetric):
    """
    Measures the area under the learning curve.

    A higher value indicates better overall performance across iterations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Calculate the area under the curve
        # Normalize by the maximum possible area (perfect score from the start)
        max_score = self.config.get("max_score", 1.0)
        max_area = max_score * len(scores)

        return sum(scores) / max_area


class ProbabilisticSolutionQuality(RecursiveMetric):
    """
    Measures the distribution of solution quality using non-deterministic assessment.

    This metric captures the robustness of solutions by measuring the variability in quality
    across multiple probabilistic evaluations.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        # For each step, we expect the result.metrics to contain probabilistic assessments
        steps = trajectory.steps
        if not steps:
            return 0.0

        # Extract probabilistic quality distributions if available
        distributions = []
        for step in steps:
            if (step.result.metrics and
                    "probabilistic_quality_distribution" in step.result.metrics):
                distributions.append(
                    step.result.metrics["probabilistic_quality_distribution"])

        if not distributions:
            # Fall back to deterministic scores if no distributions are available
            return trajectory.get_score_series()[-1]

        # Calculate the expected value of the final distribution
        final_distribution = distributions[-1]
        return sum(prob * val for val, prob in final_distribution.items())


class TransferLearningFactor(RecursiveMetric):
    """
    Measures how well learning transfers across related problems.

    This requires multiple trajectories from related tasks.
    """

    def __init__(self, config: Dict[str, Any] = None, related_trajectories: List[Trajectory] = None):
        super().__init__(config)
        self.related_trajectories = related_trajectories or []

    def calculate(self, trajectory: Trajectory) -> float:
        # This metric requires related trajectories
        if not self.related_trajectories:
            return 0.0

        # Get learning rates for the current trajectory and related ones
        current_learning_rate = self._calculate_learning_rate(trajectory)
        if current_learning_rate is None:
            return 0.0

        related_learning_rates = [
            self._calculate_learning_rate(rel_traj)
            for rel_traj in self.related_trajectories
        ]

        # Filter out invalid learning rates
        valid_related_rates = [rate for rate in related_learning_rates if rate is not None]

        if not valid_related_rates:
            return 0.0

        # Calculate the transfer factor as the ratio of the current learning rate
        # to the average of related learning rates
        avg_related_rate = sum(valid_related_rates) / len(valid_related_rates)

        if avg_related_rate == 0:
            return 0.0

        return current_learning_rate / avg_related_rate

    def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]:
        """Calculate the learning rate for a trajectory."""
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return None

        # Calculate improvement per iteration
        return (scores[-1] - scores[0]) / (len(scores) - 1)


class DynamicComplexityHandling(RecursiveMetric):
    """
    Measures how well the model handles varying problem complexity.

    This metric evaluates performance while accounting for changes in problem difficulty.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        if not trajectory.steps:
            return 0.0

        # Extract scores and difficulties
        scores = trajectory.get_score_series()
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]

        if len(scores) < 2:
            return scores[0]  # Return the single score if only one step

        # Calculate normalized scores (adjusted by difficulty)
        normalized_scores = [scores[i] * (1 + difficulties[i])
                             for i in range(len(scores))]

        # Return the average normalized score
        return sum(normalized_scores) / len(normalized_scores)
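For reference, here is a minimal sketch of the model-side contract the harness relies on: `evaluate_task` calls `model.solve(formatted_problem)` and `evaluate_task_set` calls `model.get_meta_information()`. The stand-in below is illustrative only and is not the `ModelInterface` base class from `models/base_models.py` (whose contents are not shown here); it could be paired with any `RecursiveTask` subclass, such as the toy one sketched after `core/recursive_task.py` above.

```python
# Minimal stand-in satisfying the calls RecursiveEvaluator makes on `model`.
# This illustrates the expected interface; it is not the project's ModelInterface.
from typing import Any, Dict

from recursive_swe_bench.evaluation.harness import RecursiveEvaluator, LearningCurveArea


class EchoModel:
    """Returns a canned answer; a real integration would call an LLM here."""

    def solve(self, problem: Dict[str, Any]) -> str:
        # `problem` is the dict built by _format_problem_for_model: description,
        # code_context, requirements, iteration, previous_attempts.
        return "def fixed(): pass  # placeholder solution"

    def get_meta_information(self) -> Dict[str, Any]:
        return {"name": "echo-model", "version": "0.0"}


evaluator = RecursiveEvaluator(
    model=EchoModel(),
    metrics={"learning_curve_area": LearningCurveArea()},
)
# trajectory, metrics = evaluator.evaluate_task(some_recursive_task, max_iterations=3)
```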
models/anthropic.py
ADDED
@@ -0,0 +1,866 @@
1 |
+
# recursive_swe_bench/models/anthropic.py
|
2 |
+
|
3 |
+
import json
|
4 |
+
import backoff
|
5 |
+
import time
|
6 |
+
import anthropic
|
7 |
+
from typing import Any, Dict, List, Optional, Union, Tuple
|
8 |
+
import re
|
9 |
+
import logging
|
10 |
+
|
11 |
+
from recursive_swe_bench.models.base_model import ModelInterface
|
12 |
+
|
13 |
+
class AnthropicModel(ModelInterface):
|
14 |
+
"""
|
15 |
+
Integration with Anthropic models (Claude).
|
16 |
+
|
17 |
+
This class provides integration with Anthropic's API for evaluating
|
18 |
+
Claude models with Recursive-SWE-bench through recursive evaluation loops.
|
19 |
+
The implementation features dynamic adaptation to feedback through a
|
20 |
+
self-reflective mechanism that traces attribution paths through recursive iterations.
|
21 |
+
"""
|
22 |
+
|
23 |
+
def __init__(
|
24 |
+
self,
|
25 |
+
model_identifier: str,
|
26 |
+
api_key: Optional[str] = None,
|
27 |
+
config: Optional[Dict[str, Any]] = None
|
28 |
+
):
|
29 |
+
"""
|
30 |
+
Initialize the Anthropic model interface.
|
31 |
+
|
32 |
+
Args:
|
33 |
+
model_identifier: Anthropic model identifier (e.g., "claude-3-opus-20240229")
|
34 |
+
api_key: Anthropic API key (optional if set in environment)
|
35 |
+
config: Additional configuration options
|
36 |
+
"""
|
37 |
+
super().__init__(model_identifier, config)
|
38 |
+
|
39 |
+
# Initialize Anthropic client
|
40 |
+
if api_key:
|
41 |
+
self.client = anthropic.Anthropic(api_key=api_key)
|
42 |
+
else:
|
43 |
+
self.client = anthropic.Anthropic()
|
44 |
+
|
45 |
+
# Set up system prompt and templates
|
46 |
+
self.prompts = self.config.get("prompts", {
|
47 |
+
"system": "You are an expert software engineer who specializes in debugging and fixing complex code. Your task is to fix bugs in code based on the description and test requirements provided.",
|
48 |
+
"user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Focus on making the code pass all tests while maintaining good practices. Provide only the corrected code without additional explanations.",
|
49 |
+
"reflection_template": "# Feedback on Previous Solution\n\nYour previous solution had the following issues:\n{issues}\n\n# Suggested Improvements\n{suggestions}\n\n# Test Results\n{test_results}\n\n# Reflection Prompt\nBefore providing a new solution, analyze what went wrong in your previous attempt and how you'll approach fixing it differently this time."
|
50 |
+
})
|
51 |
+
|
52 |
+
# Configure API parameters
|
53 |
+
self.api_params = self.config.get("api_params", {
|
54 |
+
"temperature": 0.2,
|
55 |
+
"max_tokens": 2000,
|
56 |
+
"top_p": 0.95,
|
57 |
+
"top_k": 50
|
58 |
+
})
|
59 |
+
|
60 |
+
# Set up recursive adaptation configuration
|
61 |
+
self.recursive_config = self.config.get("recursive_config", {
|
62 |
+
"enable_self_reflection": True,
|
63 |
+
"adaptation_threshold": 0.5, # Minimum score to trigger adaptation
|
64 |
+
"max_reflection_depth": 3, # Maximum depth of recursive reflection
|
65 |
+
"attribution_tracking": True, # Track attribution patterns across iterations
|
66 |
+
"dynamic_prompting": True, # Adjust prompts based on failure patterns
|
67 |
+
})
|
68 |
+
|
69 |
+
# Initialize recursive state
|
70 |
+
self.recursive_state = {
|
71 |
+
"reflection_depth": 0,
|
72 |
+
"adaptation_vector": [0.0] * 5, # Tracks adaptation across dimensions
|
73 |
+
"attribution_map": {}, # Maps error types to attribution patterns
|
74 |
+
"error_frequency": {}, # Tracks frequency of error types
|
75 |
+
"solution_quality_trend": [], # Tracks solution quality over iterations
|
76 |
+
}
|
77 |
+
|
78 |
+
self.logger.info(f"Initialized Anthropic model: {model_identifier} with recursive capability")
|
79 |
+
|
80 |
+
@backoff.on_exception(
|
81 |
+
backoff.expo,
|
82 |
+
(anthropic.APIError, anthropic.APITimeoutError, anthropic.RateLimitError),
|
83 |
+
max_tries=5
|
84 |
+
)
|
85 |
+
def solve(
|
86 |
+
self,
|
87 |
+
problem: Dict[str, Any],
|
88 |
+
history: Optional[List[Dict[str, Any]]] = None
|
89 |
+
) -> str:
|
90 |
+
"""
|
91 |
+
Generate a solution using the Anthropic model with recursive adaptation.
|
92 |
+
|
93 |
+
Args:
|
94 |
+
problem: The problem to solve
|
95 |
+
history: Optional history of previous solution attempts
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
The generated solution
|
99 |
+
"""
|
100 |
+
self.logger.info(f"Solving problem with Anthropic model: {self.model_identifier}")
|
101 |
+
start_time = time.time()
|
102 |
+
|
103 |
+
# Reset recursive state for new problems if no history
|
104 |
+
if not history:
|
105 |
+
self._reset_recursive_state()
|
106 |
+
elif history:
|
107 |
+
# Update recursive state based on history
|
108 |
+
self._update_recursive_state(history)
|
109 |
+
|
110 |
+
# Format messages for the model
|
111 |
+
system_prompt, user_message = self._format_messages(problem, history)
|
112 |
+
|
113 |
+
# Make API call
|
114 |
+
response = self.client.messages.create(
|
115 |
+
model=self.model_identifier,
|
116 |
+
system=system_prompt,
|
117 |
+
messages=[
|
118 |
+
{"role": "user", "content": user_message}
|
119 |
+
],
|
120 |
+
max_tokens=self.api_params.get("max_tokens", 2000),
|
121 |
+
temperature=self.api_params.get("temperature", 0.2),
|
122 |
+
top_p=self.api_params.get("top_p", 0.95),
|
123 |
+
top_k=self.api_params.get("top_k", 50)
|
124 |
+
)
|
125 |
+
|
126 |
+
# Extract the solution from the response
|
127 |
+
solution = response.content[0].text
|
128 |
+
|
129 |
+
end_time = time.time()
|
130 |
+
self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")
|
131 |
+
|
132 |
+
# Track solution in recursive state
|
133 |
+
if solution:
|
134 |
+
self.recursive_state["reflection_depth"] += 1
|
135 |
+
|
136 |
+
return self._extract_code(solution)
|
137 |
+
|
138 |
+

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> Tuple[str, str]:
        """
        Format the problem and history into messages for the Anthropic API.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            Tuple of (system_prompt, user_message)
        """
        # Start with base system prompt
        system_prompt = self.prompts["system"]

        # Enhance system prompt with recursive adaptation if enabled
        if self.recursive_config.get("enable_self_reflection", True) and history:
            # Add adaptation guidance based on error patterns
            if self.recursive_state["error_frequency"]:
                top_errors = sorted(
                    self.recursive_state["error_frequency"].items(),
                    key=lambda x: x[1],
                    reverse=True
                )[:3]

                error_guidance = "Focus particularly on addressing these recurring issues:\n"
                for error_type, count in top_errors:
                    error_guidance += f"- {error_type} (appeared {count} times)\n"

                system_prompt += f"\n\n{error_guidance}"

            # Add reflection guidance based on solution quality trend
            if len(self.recursive_state["solution_quality_trend"]) > 1:
                trend = self.recursive_state["solution_quality_trend"]
                if trend[-1] > trend[-2]:
                    system_prompt += "\n\nYour solutions are improving. Continue this trajectory."
                elif trend[-1] < trend[-2]:
                    system_prompt += "\n\nYour solutions are declining in quality. Carefully reconsider your approach."
                else:
                    system_prompt += "\n\nYour solutions maintain the same quality. Try a different approach."

        # Format code and tests
        code = problem["code_context"]["code"]

        # Prepare tests description
        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        # Base user message
        user_message = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        # Add history if available - with recursive reflection
        if history and self.recursive_config.get("enable_self_reflection", True):
            # Get the most recent entry for reflection
            latest_entry = history[-1]

            # Format issues
            issues_text = "- " + "\n- ".join([issue["message"] for issue in latest_entry["feedback"]["issues"]])

            # Format suggestions
            suggestions_text = "- " + "\n- ".join([suggestion["message"] for suggestion in latest_entry["feedback"]["suggestions"]])

            # Format test results
            test_results = latest_entry.get("result", {})
            passed_tests = test_results.get("passed_tests", 0)
            total_tests = test_results.get("total_tests", 0)

            test_results_text = f"Passed {passed_tests}/{total_tests} tests."
            if "tests" in test_results:
                test_results_text += "\n\nIndividual test results:"
                for test_name, test_result in test_results["tests"].items():
                    status = "✅ Passed" if test_result.get("passed", False) else "❌ Failed"
                    test_results_text += f"\n- {test_name}: {status}"
                    if not test_result.get("passed", False) and "message" in test_result:
                        test_results_text += f"\n  Error: {test_result['message']}"

            # Add reflection prompt
            reflection_prompt = self.prompts["reflection_template"].format(
                issues=issues_text,
                suggestions=suggestions_text,
                test_results=test_results_text
            )

            # Prepend reflection to user message
            user_message = f"{reflection_prompt}\n\n{user_message}"

            # Add dynamic adaptation based on error patterns if enabled
            if self.recursive_config.get("dynamic_prompting", True):
                # Look for specific error patterns and add targeted guidance
                error_types = [issue.get("type", "") for issue in latest_entry["feedback"]["issues"]]

                if "syntax" in " ".join(error_types).lower():
                    user_message += "\n\nPay careful attention to syntax correctness. Double-check all parentheses, indentation, and function definitions."

                if "test_failure" in " ".join(error_types).lower():
                    user_message += "\n\nFocus on making the code pass the failing tests. Carefully trace through the code execution for each test case."

                if "edge_case" in " ".join(error_types).lower() or "boundary" in " ".join(error_types).lower():
                    user_message += "\n\nBe sure to handle edge cases such as empty inputs, boundary values, and special cases."

                if "performance" in " ".join(error_types).lower():
                    user_message += "\n\nOptimize your solution for better performance. Avoid unnecessary operations and inefficient data structures."

        return system_prompt, user_message

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Args:
            text: The model's response

        Returns:
            Extracted code
        """
        # Try to extract code from markdown code blocks
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        # If no code blocks, return the full text (it might be just code)
        return text.strip()

    def _reset_recursive_state(self):
        """Reset the recursive state for a new problem."""
        self.recursive_state = {
            "reflection_depth": 0,
            "adaptation_vector": [0.0] * 5,
            "attribution_map": {},
            "error_frequency": {},
            "solution_quality_trend": [],
        }

    def _update_recursive_state(self, history: List[Dict[str, Any]]):
        """
        Update recursive state based on solution history.

        Args:
            history: History of previous solution attempts
        """
        # Extract scores from history
        scores = [entry.get("result", {}).get("score", 0.0) for entry in history]
        self.recursive_state["solution_quality_trend"] = scores

        # Calculate adaptation vector
        if len(scores) >= 2:
            # Dimension 0: Overall improvement trajectory
            improvement = scores[-1] - scores[0]
            self.recursive_state["adaptation_vector"][0] = max(-1.0, min(1.0, improvement))

            # Dimension 1: Recent improvement
            recent_improvement = scores[-1] - scores[-2]
            self.recursive_state["adaptation_vector"][1] = max(-1.0, min(1.0, recent_improvement))

        # Update error frequency from latest feedback
        if history:
            latest_feedback = history[-1].get("feedback", {})
            issues = latest_feedback.get("issues", [])

            for issue in issues:
                issue_type = issue.get("type", "unknown")
                self.recursive_state["error_frequency"][issue_type] = self.recursive_state["error_frequency"].get(issue_type, 0) + 1

        # Update reflection depth
        self.recursive_state["reflection_depth"] = len(history)

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "Anthropic",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"],
            "recursive_capability": self.recursive_config.get("enable_self_reflection", True),
            "reflection_depth": self.recursive_state["reflection_depth"],
            "adaptation_vector": self.recursive_state["adaptation_vector"]
        }
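
For reference, a minimal sketch of how the recursive loop above consumes `problem` and `history`. The dictionary shapes mirror the keys read by `solve` and `_format_messages`; the constructor arguments and the `evaluate` helper are illustrative stand-ins for the evaluation harness, not part of this upload:

```python
problem = {
    "description": "Fix the off-by-one error in sum_to_n.",
    "code_context": {
        "code": "def sum_to_n(n):\n    return sum(range(n))\n",
        "tests": [{"name": "test_sum_to_n", "content": "assert sum_to_n(3) == 6"}],
    },
}

model = AnthropicModel("claude-3-opus-20240229")  # hypothetical model identifier

history = []
for _ in range(3):  # bounded refinement loop
    solution = model.solve(problem, history)
    result, feedback = evaluate(problem, solution)  # stand-in for the harness evaluator
    history.append({"solution": solution, "result": result, "feedback": feedback})
    if result.get("score", 0.0) >= 1.0:
        break
```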

# recursive_swe_bench/evaluation/recursive_metrics.py

import numpy as np
import scipy.stats
from typing import Any, Dict, List, Optional, Union
import dataclasses
import math

from recursive_swe_bench.core.recursive_task import Trajectory

class RecursiveLearningCurveArea:
    """
    Measures the area under the learning curve across iterations.

    This metric captures the overall performance of a model throughout its
    learning trajectory, rewarding both high scores and quick improvement.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive learning curve area metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.max_score = self.config.get("max_score", 1.0)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the area under the learning curve.

        Args:
            trajectory: The solution trajectory

        Returns:
            The normalized area under the learning curve
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Calculate the area under the curve using the trapezoidal rule
        area = np.trapz(scores, dx=1.0)

        # Normalize by the maximum possible area if requested
        if self.normalize:
            max_area = self.max_score * len(scores)
            return area / max_area

        return area
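
A worked example of the normalization, using a stub trajectory that only implements `get_score_series` (sufficient for this metric):

```python
class _StubTrajectory:
    """Stand-in exposing only the scores needed by the metric."""
    def get_score_series(self):
        return [0.2, 0.6, 0.9]

# Trapezoidal area = (0.2 + 0.6)/2 + (0.6 + 0.9)/2 = 1.15
# Normalized by max_score * len(scores) = 3.0  ->  ~0.383
print(RecursiveLearningCurveArea().calculate(_StubTrajectory()))  # ~0.383
```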

class AdaptationRate:
    """
    Measures the rate at which the model improves its solutions.

    This metric captures how quickly a model adapts to feedback and
    improves its solutions across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the adaptation rate metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 2)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the adaptation rate.

        Args:
            trajectory: The solution trajectory

        Returns:
            The adaptation rate
        """
        scores = trajectory.get_score_series()
        if len(scores) < self.min_iterations:
            return 0.0

        # Calculate the average improvement per iteration
        total_improvement = scores[-1] - scores[0]
        iterations = len(scores) - 1

        return total_improvement / iterations
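
With the same stub scores as above, the adaptation rate is the end-to-end improvement divided by the number of refinement steps:

```python
# (0.9 - 0.2) / 2 iterations = 0.35 improvement per iteration
print(AdaptationRate().calculate(_StubTrajectory()))  # 0.35
```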

class RecursiveVolatility:
    """
    Measures the volatility of solution quality across iterations.

    This metric captures how stable or erratic a model's performance
    is across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive volatility metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 3)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the recursive volatility.

        Args:
            trajectory: The solution trajectory

        Returns:
            The normalized volatility
        """
        scores = trajectory.get_score_series()
        if len(scores) < self.min_iterations:
            return 0.0

        # Calculate the standard deviation of score changes
        changes = [abs(scores[i] - scores[i-1]) for i in range(1, len(scores))]
        volatility = np.std(changes)

        # Normalize by the average score if requested
        if self.normalize and np.mean(scores) > 0:
            return volatility / np.mean(scores)

        return volatility

class ConvergenceIndex:
    """
    Measures how quickly the model converges to a stable solution.

    This metric captures how efficiently a model reaches a stable solution
    across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the convergence index metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.stability_threshold = self.config.get("stability_threshold", 0.05)
        self.max_score_threshold = self.config.get("max_score_threshold", 0.95)

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the convergence index.

        Args:
            trajectory: The solution trajectory

        Returns:
            The convergence index (lower is better)
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Find the first iteration where the score stabilizes
        # (subsequent changes are below the stability threshold)
        convergence_point = len(scores) - 1
        for i in range(1, len(scores)):
            remaining_changes = [abs(scores[j] - scores[j-1]) for j in range(i, len(scores))]
            if all(change <= self.stability_threshold for change in remaining_changes):
                convergence_point = i
                break

        # Find the first iteration where the score exceeds the max score threshold
        # (note: max_score_point is computed but not used in the index below)
        max_score_point = len(scores)
        for i, score in enumerate(scores):
            if score >= self.max_score_threshold:
                max_score_point = i
                break

        # Return a combined index
        # Lower is better - converging quickly to a high score is ideal
        return (convergence_point / len(scores)) * (1.0 - max(0.0, min(1.0, scores[-1])))
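
A worked example with the default thresholds: for scores `[0.2, 0.6, 0.9, 0.92]`, the step-to-step changes only stay within 0.05 from iteration 3 onward, so the convergence point is 3 and the index is `(3 / 4) * (1.0 - 0.92) ≈ 0.06`:

```python
class _SlowConverger:
    """Stand-in trajectory for illustrating the convergence point."""
    def get_score_series(self):
        return [0.2, 0.6, 0.9, 0.92]

print(ConvergenceIndex().calculate(_SlowConverger()))  # ~0.06 (lower is better)
```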

class ErrorRecoveryEfficiency:
    """
    Measures how efficiently the model recovers from errors.

    This metric captures how well a model addresses and fixes specific
    errors across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the error recovery efficiency metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the error recovery efficiency.

        Args:
            trajectory: The solution trajectory

        Returns:
            The error recovery efficiency
        """
        if not trajectory.steps or len(trajectory.steps) < 2:
            return 0.0

        # Extract error counts from each step
        error_counts = []
        for step in trajectory.steps:
            if hasattr(step, "result") and hasattr(step.result, "error_details"):
                error_counts.append(len(step.result.error_details or {}))
            else:
                # If no error details available, use issue count from feedback
                error_counts.append(len(step.feedback.issues))

        if not error_counts or error_counts[0] == 0:
            return 1.0  # Perfect if no initial errors

        # Calculate the rate at which errors are fixed
        initial_errors = error_counts[0]
        final_errors = error_counts[-1]

        # Return the proportion of errors fixed
        return (initial_errors - final_errors) / initial_errors

class DynamicComplexityHandling:
    """
    Measures how well the model handles varying problem complexity.

    This metric evaluates performance while accounting for changes in
    problem difficulty across iterations.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the dynamic complexity handling metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Calculate the dynamic complexity handling score.

        Args:
            trajectory: The solution trajectory

        Returns:
            The dynamic complexity handling score
        """
        if not trajectory.steps:
            return 0.0

        # Extract scores and difficulties from each step
        scores = []
        difficulties = []

        for step in trajectory.steps:
            scores.append(step.result.score)
            difficulties.append(step.problem_state.difficulty)

        # Calculate difficulty-weighted scores
        weighted_scores = [scores[i] / max(0.1, difficulties[i]) for i in range(len(scores))]

        # Return the average weighted score
        return sum(weighted_scores) / len(weighted_scores)
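
A small sketch of the difficulty weighting, using `SimpleNamespace` stand-ins for the trajectory steps:

```python
from types import SimpleNamespace

# Two-step trajectory: scores 0.5 and 0.8 at difficulties 0.5 and 1.0.
# Difficulty-weighted scores are 0.5/0.5 = 1.0 and 0.8/1.0 = 0.8 -> average 0.9.
steps = [
    SimpleNamespace(result=SimpleNamespace(score=0.5), problem_state=SimpleNamespace(difficulty=0.5)),
    SimpleNamespace(result=SimpleNamespace(score=0.8), problem_state=SimpleNamespace(difficulty=1.0)),
]
print(DynamicComplexityHandling().calculate(SimpleNamespace(steps=steps)))  # 0.9
```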

class RecursiveFrameworkMetrics:
    """
    Comprehensive collection of metrics for recursive evaluation.

    This class provides easy access to all recursive metrics and
    standardized calculation across trajectories.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive framework metrics.

        Args:
            config: Configuration options
        """
        self.config = config or {}

        # Initialize all metrics
        self.metrics = {
            "learning_curve_area": RecursiveLearningCurveArea(self.config.get("learning_curve_area")),
            "adaptation_rate": AdaptationRate(self.config.get("adaptation_rate")),
            "volatility": RecursiveVolatility(self.config.get("volatility")),
            "convergence_index": ConvergenceIndex(self.config.get("convergence_index")),
            "error_recovery": ErrorRecoveryEfficiency(self.config.get("error_recovery")),
            "complexity_handling": DynamicComplexityHandling(self.config.get("complexity_handling"))
        }

        # Add custom metrics from config if provided
        if "custom_metrics" in self.config:
            for name, metric in self.config["custom_metrics"].items():
                self.metrics[name] = metric

    def calculate_all(self, trajectory: Trajectory) -> Dict[str, float]:
        """
        Calculate all metrics for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric names and values
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def calculate(self, trajectory: Trajectory, metric_name: str) -> float:
        """
        Calculate a specific metric for a trajectory.

        Args:
            trajectory: The solution trajectory
            metric_name: The name of the metric to calculate

        Returns:
            The calculated metric value
        """
        if metric_name not in self.metrics:
            raise ValueError(f"Unknown metric: {metric_name}")

        return self.metrics[metric_name].calculate(trajectory)

    def aggregate_metrics(self, trajectories: List[Trajectory]) -> Dict[str, float]:
        """
        Calculate aggregate metrics across multiple trajectories.

        Args:
            trajectories: List of solution trajectories

        Returns:
            Dictionary of aggregated metric values
        """
        if not trajectories:
            return {}

        all_metrics = [self.calculate_all(trajectory) for trajectory in trajectories]

        # Aggregate by averaging each metric
        aggregated = {}
        for metric_name in self.metrics:
            values = [metrics[metric_name] for metrics in all_metrics]
            aggregated[metric_name] = sum(values) / len(values)

        return aggregated
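
A usage sketch, assuming `trajectories` is a list of `Trajectory` objects produced by the evaluation harness:

```python
metrics = RecursiveFrameworkMetrics()
per_run = metrics.calculate_all(trajectories[0])     # e.g. {"adaptation_rate": 0.35, ...}
overall = metrics.aggregate_metrics(trajectories)    # mean of each metric across runs
print(per_run["learning_curve_area"], overall["convergence_index"])
```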

# recursive_swe_bench/evaluation/visualizer.py

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Union
import os
import json
import seaborn as sns
from pathlib import Path

from recursive_swe_bench.core.recursive_task import Trajectory

class RecursiveVisualizer:
    """
    Visualization tools for recursive evaluation results.

    This class provides methods for visualizing recursive trajectories,
    metrics, and comparative analysis across models.
    """

    def __init__(self, output_dir: Optional[str] = None, config: Dict[str, Any] = None):
        """
        Initialize the recursive visualizer.

        Args:
            output_dir: Directory to save visualizations
            config: Configuration options
        """
        self.output_dir = output_dir
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        self.config = config or {}
        self.theme = self.config.get("theme", "default")

        # Set up the visualization style
        if self.theme == "dark":
            plt.style.use("dark_background")
            self.colors = sns.color_palette("viridis", 10)
        else:
            plt.style.use("seaborn-v0_8-whitegrid")
            self.colors = sns.color_palette("muted", 10)

        sns.set_context("talk")

    def plot_trajectory(
        self,
        trajectory: Trajectory,
        title: Optional[str] = None,
        show: bool = True,
        save_path: Optional[str] = None
    ):
        """
        Plot a solution trajectory showing score evolution.

        Args:
            trajectory: The solution trajectory
            title: Optional title for the plot
            show: Whether to display the plot
            save_path: Optional path to save the plot
        """
        scores = trajectory.get_score_series()
        if not scores:
            return

        plt.figure(figsize=(10, 6))

        # Plot scores
        plt.plot(range(1, len(scores) + 1), scores, marker='o',
                 linewidth=2, markersize=8, color=self.colors[0])

        # Add difficulty if available
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]
        if difficulties:
            plt.plot(range(1, len(difficulties) + 1), difficulties, marker='s',
                     linewidth=2, markersize=8, color=self.colors[1], linestyle='--',
                     label='Problem Difficulty')

        # Set plot properties
        plt.title(title or f"Solution Trajectory for Task {trajectory.task_id}")
        plt.xlabel("Iteration")
        plt.ylabel("Score / Difficulty")
        plt.grid(True)
        plt.ylim(0, 1.05)
        plt.xticks(range(1, len(scores) + 1))
        plt.legend(["Solution Score", "Problem Difficulty"])

        # Save if requested
        if save_path:
            full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path
            plt.savefig(full_path, bbox_inches='tight', dpi=300)

        # Show if requested
        if show:
            plt.show()
        else:
            plt.close()

    def plot_metrics_comparison(
        self,
        metrics_by_model: Dict[str, Dict[str, float]],
        title: Optional[str] = None,
        show: bool = True,
        save_path: Optional[str] = None
    ):
        """
        Plot a comparison of metrics across models.

        Args:
            metrics_by_model: Dictionary mapping model names to metric values
            title: Optional title for the plot
            show: Whether to display the plot
            save_path: Optional path to save the plot
        """
        if not metrics_by_model:
            return

        # Convert to DataFrame for easier plotting
        df = pd.DataFrame(metrics_by_model).T

        # Create a radar chart
        categories = list(df.columns)
        N = len(categories)

        # Create angles for each metric
        angles = [n / float(N) * 2 * np.pi for n in range(N)]
        angles += angles[:1]  # Close the loop

        # Create figure
        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

        # Add lines for each model
        for i, (model, metrics) in enumerate(df.iterrows()):
            values = metrics.values.flatten().tolist()
            values += values[:1]  # Close the loop

            # Plot the line
            ax.plot(angles, values, linewidth=2, linestyle='solid',
                    label=model, color=self.colors[i % len(self.colors)])
            ax.fill(angles, values, alpha=0.1, color=self.colors[i % len(self.colors)])

        # Set category labels
        plt.xticks(angles[:-1], categories)

        # Set y-axis limits
        plt.ylim(0, 1)

        # Add legend
        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

        # Set title
        plt.title(title or "Metrics Comparison Across Models")

        # Save if requested
        if save_path:
            full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path
            plt.savefig(full_path, bbox_inches='tight', dpi=300)
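
A usage sketch for the visualizer; the output paths and the metrics dictionary below are illustrative:

```python
viz = RecursiveVisualizer(output_dir="reports", config={"theme": "dark"})
viz.plot_trajectory(trajectory, show=False, save_path="trajectory.png")
viz.plot_metrics_comparison(
    {"model-a": {"adaptation_rate": 0.35, "learning_curve_area": 0.62},
     "model-b": {"adaptation_rate": 0.20, "learning_curve_area": 0.55}},
    show=False,
    save_path="comparison.png",
)
```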
models/base_models.py
ADDED
@@ -0,0 +1,259 @@

# recursive_swe_bench/models/base_model.py

from typing import Any, Dict, List, Optional, Union
import logging
import time
from abc import ABC, abstractmethod


class ModelInterface(ABC):
    """
    Base interface for models that can be evaluated using Recursive-SWE-bench.

    This abstract class defines the core functionality required for a model to
    be evaluated using the recursive evaluation framework. Concrete implementations
    must provide the actual model-specific logic.
    """

    def __init__(self, model_identifier: str, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the model interface.

        Args:
            model_identifier: Identifier for the model
            config: Configuration options
        """
        self.model_identifier = model_identifier
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Set up logging for the model."""
        logger = logging.getLogger(f"Model.{self.model_identifier}")
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    @abstractmethod
    def solve(self, problem: Dict[str, Any], history: Optional[List[Dict[str, Any]]] = None) -> str:
        """
        Generate a solution for the given problem.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """
        pass

    @abstractmethod
    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        pass
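
A minimal sketch of a custom model plugged into this interface; the trivial baseline below just echoes the buggy code back and is not part of the upload:

```python
class EchoBaseline(ModelInterface):
    """Hypothetical no-op baseline: returns the original code unchanged."""

    def solve(self, problem, history=None):
        # Useful as a floor when comparing real models on the recursive metrics.
        return problem["code_context"]["code"]

    def get_meta_information(self):
        return {"model_name": self.model_identifier, "provider": "local", "type": "baseline"}
```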

# recursive_swe_bench/models/openai.py

import openai
import json
import time
import backoff
from typing import Any, Dict, List, Optional, Union

from recursive_swe_bench.models.base_model import ModelInterface

class OpenAIModel(ModelInterface):
    """
    Integration with OpenAI models (GPT-3.5, GPT-4, etc.).

    This class provides integration with OpenAI's API for evaluating
    models like GPT-3.5 and GPT-4 with Recursive-SWE-bench.
    """

    def __init__(
        self,
        model_identifier: str,
        api_key: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the OpenAI model interface.

        Args:
            model_identifier: OpenAI model identifier (e.g., "gpt-4", "gpt-3.5-turbo")
            api_key: OpenAI API key (optional if set in environment)
            config: Additional configuration options
        """
        super().__init__(model_identifier, config)

        # Set API key if provided
        if api_key:
            openai.api_key = api_key

        # Load default prompts or use config-provided ones
        self.prompts = self.config.get("prompts", {
            "system": "You are an expert programmer tasked with fixing bugs in code. Fix the code based on the description and tests.",
            "user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Provide only the corrected code without any explanations.",
        })

        # Configure API parameters
        self.api_params = self.config.get("api_params", {
            "temperature": 0.2,
            "max_tokens": 2000,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        })

        self.logger.info(f"Initialized OpenAI model: {model_identifier}")

    @backoff.on_exception(
        backoff.expo,
        (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError),
        max_tries=5
    )
    def solve(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """
        Generate a solution using the OpenAI model.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """
        self.logger.info(f"Solving problem with OpenAI model: {self.model_identifier}")
        start_time = time.time()

        # Format the problem for the model
        messages = self._format_messages(problem, history)

        # Make API call (uses the pre-1.0 openai SDK interface: ChatCompletion / openai.error)
        response = openai.ChatCompletion.create(
            model=self.model_identifier,
            messages=messages,
            **self.api_params
        )

        # Extract the solution from the response
        solution = response.choices[0].message.content.strip()

        end_time = time.time()
        self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")

        return self._extract_code(solution)

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, str]]:
        """
        Format the problem and history into messages for the OpenAI API.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            List of formatted messages
        """
        messages = [
            {"role": "system", "content": self.prompts["system"]}
        ]

        # Format the user message
        code = problem["code_context"]["code"]

        # Prepare tests description
        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        # Create the user message using the template
        user_content = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        messages.append({"role": "user", "content": user_content})

        # Add history if available
        if history and self.config.get("include_history", True):
            for entry in history:
                # Add previous attempt
                messages.append({
                    "role": "assistant",
                    "content": entry["solution"]
                })

                # Add feedback on previous attempt
                feedback_content = "Your solution has the following issues:\n"
                for issue in entry["feedback"]["issues"]:
                    feedback_content += f"- {issue['message']}\n"

                feedback_content += "\nPlease try again with these improvements:\n"
                for suggestion in entry["feedback"]["suggestions"]:
                    feedback_content += f"- {suggestion['message']}\n"

                messages.append({
                    "role": "user",
                    "content": feedback_content
                })

        return messages

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Args:
            text: The model's response

        Returns:
            Extracted code
        """
        # Try to extract code from markdown code blocks
        import re
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        # If no code blocks, return the full text (it might be just code)
        return text.strip()

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "OpenAI",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"]
        }
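
An illustrative call sequence (the model name, environment-variable handling, and the hand-built history entry are assumptions; the harness normally constructs the history and `problem` is shaped as in the earlier sketch):

```python
import os

model = OpenAIModel("gpt-4", api_key=os.environ.get("OPENAI_API_KEY"))
solution = model.solve(problem)  # first attempt

# After evaluation, retry with the feedback folded into the conversation:
solution_v2 = model.solve(problem, history=[{
    "solution": solution,
    "result": {"score": 0.5, "passed_tests": 1, "total_tests": 2},
    "feedback": {
        "issues": [{"type": "test_failure", "message": "test_edge_case fails on empty input"}],
        "suggestions": [{"message": "Handle the empty-input case explicitly"}],
    },
}])
```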
task_generators/bug_fixing.py
ADDED
The diff for this file is too large to render.