|
|
|
|
|
import logging
import re
import time
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union
|
|
|
class ModelInterface(ABC):
    """
    Base interface for models that can be evaluated using Recursive-SWE-bench.

    This abstract class defines the core functionality required for a model to
    be evaluated using the recursive evaluation framework. Concrete implementations
    must provide the actual model-specific logic.
    """

    def __init__(self, model_identifier: str, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the model interface.

        Args:
            model_identifier: Identifier for the model
            config: Configuration options. Recognized key: "log_level"
                (a ``logging`` level; defaults to ``logging.INFO``).
        """
        self.model_identifier = model_identifier
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """
        Set up logging for the model.

        Returns:
            A logger named ``Model.<model_identifier>``, with a stream handler
            attached at most once.
        """
        logger = logging.getLogger(f"Model.{self.model_identifier}")
        # logging.getLogger returns the SAME logger object for the same name,
        # so adding a handler unconditionally would duplicate every log record
        # when several instances share an identifier. Attach only once.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    @abstractmethod
    def solve(self, problem: Dict[str, Any], history: Optional[List[Dict[str, Any]]] = None) -> str:
        """
        Generate a solution for the given problem.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution
        """

    @abstractmethod
    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
|
|
|
|
|
|
|
|
|
import openai |
|
import json |
|
import backoff |
|
from typing import Any, Dict, List, Optional, Union |
|
|
|
from recursive_swe_bench.models.base_model import ModelInterface |
|
|
|
class OpenAIModel(ModelInterface):
    """
    Integration with OpenAI models (GPT-3.5, GPT-4, etc.).

    This class provides integration with OpenAI's API for evaluating
    models like GPT-3.5 and GPT-4 with Recursive-SWE-bench.
    """

    def __init__(
        self,
        model_identifier: str,
        api_key: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the OpenAI model interface.

        Args:
            model_identifier: OpenAI model identifier (e.g., "gpt-4", "gpt-3.5-turbo")
            api_key: OpenAI API key (optional if set in environment)
            config: Additional configuration options. Recognized keys:
                "prompts" (system prompt and user template),
                "api_params" (forwarded verbatim to the chat completion call),
                "include_history" (include prior attempts/feedback; default True).
        """
        super().__init__(model_identifier, config)

        if api_key:
            openai.api_key = api_key

        # Prompt templates; overridable wholesale via config["prompts"].
        self.prompts = self.config.get("prompts", {
            "system": "You are an expert programmer tasked with fixing bugs in code. Fix the code based on the description and tests.",
            "user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Provide only the corrected code without any explanations.",
        })

        # Sampling parameters passed straight through to the OpenAI API.
        self.api_params = self.config.get("api_params", {
            "temperature": 0.2,
            "max_tokens": 2000,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        })

        self.logger.info(f"Initialized OpenAI model: {model_identifier}")

    @backoff.on_exception(
        backoff.expo,
        (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError),
        max_tries=5
    )
    def solve(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """
        Generate a solution using the OpenAI model.

        Retries with exponential backoff (up to 5 tries) on transient API
        errors via the ``backoff`` decorator.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution (code extracted from the model response)
        """
        self.logger.info(f"Solving problem with OpenAI model: {self.model_identifier}")
        start_time = time.time()

        messages = self._format_messages(problem, history)

        response = openai.ChatCompletion.create(
            model=self.model_identifier,
            messages=messages,
            **self.api_params
        )

        solution = response.choices[0].message.content.strip()

        end_time = time.time()
        self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")

        return self._extract_code(solution)

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, str]]:
        """
        Format the problem and history into messages for the OpenAI API.

        The conversation is: system prompt, the problem as a user message,
        then (optionally) each prior attempt as an assistant message followed
        by its feedback as a user message.

        Args:
            problem: The problem to solve. Expected keys: "description" and
                "code_context" (with "code" and optionally "tests").
            history: Optional history of previous solution attempts. Each
                entry should carry "solution" and may carry "feedback" with
                "issues" and "suggestions" lists.

        Returns:
            List of formatted messages
        """
        messages = [
            {"role": "system", "content": self.prompts["system"]}
        ]

        code = problem["code_context"]["code"]

        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        user_content = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        messages.append({"role": "user", "content": user_content})

        if history and self.config.get("include_history", True):
            for entry in history:
                messages.append({
                    "role": "assistant",
                    "content": entry["solution"]
                })

                # Tolerate partially-populated feedback: a missing "issues" or
                # "suggestions" key must not abort prompt construction.
                feedback = entry.get("feedback", {})

                feedback_content = "Your solution has the following issues:\n"
                for issue in feedback.get("issues", []):
                    feedback_content += f"- {issue['message']}\n"

                feedback_content += "\nPlease try again with these improvements:\n"
                for suggestion in feedback.get("suggestions", []):
                    feedback_content += f"- {suggestion['message']}\n"

                messages.append({
                    "role": "user",
                    "content": feedback_content
                })

        return messages

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Args:
            text: The model's response

        Returns:
            The contents of the first fenced code block if one is present,
            otherwise the whole response stripped of surrounding whitespace.
        """
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        return text.strip()

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "OpenAI",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"]
        }
|
|
|
|
|
|