# recursive_swe_bench/task_generators/bug_fixing.py from typing import Any, Dict, List, Optional, Tuple, Set, Union import uuid import json import re import random import ast import copy from pathlib import Path import tempfile import subprocess import shutil import os from recursive_swe_bench.core.recursive_task import ( RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus ) class BugCategory: """Categories of bugs for classification and evolution.""" SYNTAX = "syntax" LOGICAL = "logical" PERFORMANCE = "performance" SECURITY = "security" CONCURRENCY = "concurrency" EXCEPTION_HANDLING = "exception_handling" API_USAGE = "api_usage" MEMORY_MANAGEMENT = "memory_management" TYPE_ERROR = "type_error" EDGE_CASE = "edge_case" DATA_HANDLING = "data_handling" DEPENDENCY = "dependency" class BugFixingTask(RecursiveTask): """ A recursive task for evaluating how models fix bugs in code. The task presents a piece of code with one or more bugs, and evolves based on the model's fix attempts. As the model addresses issues, the task may introduce more subtle bugs, change requirements, or increase complexity to test adaptive problem-solving. """ def __init__( self, initial_state: ProblemState, config: Dict[str, Any] = None, test_runner: Any = None ): """ Initialize the bug fixing task. Args: initial_state: The initial problem state config: Configuration options test_runner: Custom test runner (optional) """ super().__init__(initial_state, config) self.test_runner = test_runner or DefaultTestRunner() self.bug_categories: Set[str] = set( self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX]) ) self.difficulty_progression = self.config.get( "difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7] ) self.evolution_strategies = self.config.get( "evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"] ) def _run_evaluation(self, solution: str) -> EvaluationResult: """ Run tests to evaluate the solution. Args: solution: The solution code Returns: Evaluation results """ # Create a temporary directory to run tests with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) # Write solution code to file solution_file = temp_path / "solution.py" with open(solution_file, "w") as f: f.write(solution) # Create test files test_files = self._create_test_files(temp_path) # Run tests results = self.test_runner.run_tests( solution_file=solution_file, test_files=test_files, code_context=self.state.code_context ) # Calculate score based on test results score = self._calculate_score(results) return EvaluationResult( success=results["all_passed"], score=score, execution_results=results["execution"], error_details=results.get("errors"), test_results=results["tests"], metrics={ "passed_tests": results["passed_tests"], "total_tests": results["total_tests"], "execution_time": results["execution_time"], "memory_usage": results.get("memory_usage", 0), "code_complexity": self._calculate_complexity(solution) } ) def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback: """ Generate structured feedback based on evaluation results. 
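        Example (illustrative sketch, not part of the original code; the values are
        hypothetical but the dictionary shape matches what this method builds):

            issue = {
                "type": "test_failure",
                "test": "test_sum_empty_list",   # hypothetical test name
                "message": "Test failed",
                "expected": 0,
                "actual": None,
            }
            # Each such issue is paired with a suggestion and collected into the
            # returned Feedback object's `issues` list.
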
Args: solution: The solution code result: The evaluation results Returns: Structured feedback """ issues = [] suggestions = [] focus_areas = [] # Add issues for failing tests if result.test_results: for test_name, test_result in result.test_results.items(): if not test_result["passed"]: issues.append({ "type": "test_failure", "test": test_name, "message": test_result.get("message", "Test failed"), "expected": test_result.get("expected"), "actual": test_result.get("actual") }) # Add issues for errors if result.error_details: for error_type, error_info in result.error_details.items(): issues.append({ "type": "error", "error_type": error_type, "message": error_info.get("message", "An error occurred"), "location": error_info.get("location") }) # Generate suggestions based on issues for issue in issues: if issue["type"] == "test_failure": suggestion = self._generate_suggestion_for_test_failure( issue, solution, result.test_results ) if suggestion: suggestions.append(suggestion) elif issue["type"] == "error": suggestion = self._generate_suggestion_for_error( issue, solution ) if suggestion: suggestions.append(suggestion) # Determine focus areas based on issues and task state focus_areas = self._determine_focus_areas(issues, solution, result) # Generate adaptation hints based on the current state and results adaptation_hints = self._generate_adaptation_hints(solution, result) # Create summary if result.success: summary = ( f"Your solution passes all tests with a score of {result.score:.2f}. " f"The code successfully addresses the bugs in the original implementation." ) else: passed = result.metrics.get("passed_tests", 0) total = result.metrics.get("total_tests", 0) summary = ( f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. " f"There are still issues that need to be addressed." ) return Feedback( summary=summary, issues=issues, suggestions=suggestions, focus_areas=focus_areas, adaptation_hints=adaptation_hints ) def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: """ Evolve the problem state based on the solution and feedback. This method implements the recursive nature of the benchmark by adapting the problem to challenge the model's understanding. Args: solution: The attempted solution result: The evaluation results feedback: The feedback provided Returns: The evolved problem state """ # If the solution perfectly solved the problem, make it more challenging if result.success and result.score > 0.95: return self._increase_difficulty(solution, result, feedback) # If the solution was close but not perfect, focus on the remaining issues elif result.score > 0.7: return self._focus_remaining_issues(solution, result, feedback) # If the solution was not very good, provide more guidance else: return self._provide_more_guidance(solution, result, feedback) def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: """ Increase the difficulty of the problem for models that solved it well. 
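        Example (illustrative sketch assuming the default difficulty_progression
        configured in __init__; the stage value is hypothetical):

            progression = [0.0, 0.15, 0.3, 0.5, 0.7]
            stage = 6                                   # hypothetical evolution stage
            idx = min(stage, len(progression) - 1)      # clamp to the final entry
            assert progression[idx] == 0.7
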
Args: solution: The successful solution result: The evaluation results feedback: The feedback provided Returns: The evolved problem state with increased difficulty """ # Create a new state based on the current state new_state = copy.deepcopy(self.state) # Increment evolution stage new_state.evolution_stage += 1 # Increase difficulty based on progression schedule current_difficulty_idx = min(new_state.evolution_stage, len(self.difficulty_progression) - 1) new_state.difficulty = self.difficulty_progression[current_difficulty_idx] # Select an evolution strategy based on the current state strategy = self._select_evolution_strategy(solution, result, feedback) # Apply the selected strategy if strategy == "add_subtle_bug": self._add_subtle_bug(new_state, solution) elif strategy == "change_requirements": self._change_requirements(new_state, solution) elif strategy == "increase_complexity": self._increase_complexity(new_state, solution) # Update the description to reflect the changes new_state.description = self._generate_description(new_state) # Update adaptation vector to guide future evolution new_state.adaptation_vector = self._calculate_adaptation_vector( solution, result, feedback ) return new_state def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: """ Evolve the state to focus on remaining issues when the solution is close but not perfect. Args: solution: The nearly-successful solution result: The evaluation results feedback: The feedback provided Returns: The evolved problem state focusing on remaining issues """ # Create a new state based on the current state new_state = copy.deepcopy(self.state) # Increment evolution stage new_state.evolution_stage += 1 # Maintain the same difficulty level current_difficulty_idx = min(new_state.evolution_stage - 1, len(self.difficulty_progression) - 1) new_state.difficulty = self.difficulty_progression[current_difficulty_idx] # Update the code context to focus on remaining issues new_state.code_context["focus_areas"] = feedback.focus_areas # Highlight failing tests in the code context if result.test_results: failing_tests = [ test_name for test_name, test_result in result.test_results.items() if not test_result["passed"] ] new_state.code_context["failing_tests"] = failing_tests # Update the description to be more specific about remaining issues new_state.description = self._generate_focused_description( new_state, feedback.issues ) # Update adaptation vector to guide future evolution new_state.adaptation_vector = self._calculate_adaptation_vector( solution, result, feedback ) return new_state def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: """ Evolve the state to provide more guidance when the solution was not very good. 
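        Example (illustrative sketch assuming the default difficulty_progression;
        the stage value is hypothetical):

            progression = [0.0, 0.15, 0.3, 0.5, 0.7]
            stage = 3
            idx = max(0, min(stage - 1, len(progression) - 1) - 1)
            assert progression[idx] == 0.15     # one step below the stage's usual difficulty
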
Args: solution: The unsuccessful solution result: The evaluation results feedback: The feedback provided Returns: The evolved problem state with more guidance """ # Create a new state based on the current state new_state = copy.deepcopy(self.state) # Increment evolution stage new_state.evolution_stage += 1 # Maintain or slightly decrease difficulty current_difficulty_idx = max(0, min(new_state.evolution_stage - 1, len(self.difficulty_progression) - 1) - 1) new_state.difficulty = self.difficulty_progression[current_difficulty_idx] # Add more hints to the code context new_state.code_context["hints"] = self._generate_hints( solution, result, feedback ) # Add more detailed information about failing tests if result.test_results: detailed_test_results = {} for test_name, test_result in result.test_results.items(): if not test_result["passed"]: detailed_test_results[test_name] = { "message": test_result.get("message", "Test failed"), "expected": test_result.get("expected"), "actual": test_result.get("actual"), "hint": self._generate_test_hint(test_name, test_result) } new_state.code_context["detailed_test_results"] = detailed_test_results # Update the description to include more guidance new_state.description = self._generate_guided_description( new_state, feedback.issues, feedback.suggestions ) # Update adaptation vector to guide future evolution new_state.adaptation_vector = self._calculate_adaptation_vector( solution, result, feedback ) return new_state def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str: """ Select an evolution strategy based on the current state and solution. Args: solution: The current solution result: The evaluation results feedback: The feedback provided Returns: The selected evolution strategy """ available_strategies = self.evolution_strategies.copy() # Weight the strategies based on the current state weights = {} # Prefer adding subtle bugs if the solution is very good if result.score > 0.95: weights["add_subtle_bug"] = 0.6 weights["change_requirements"] = 0.3 weights["increase_complexity"] = 0.1 # Prefer changing requirements if we've already added several bugs elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3: weights["add_subtle_bug"] = 0.1 weights["change_requirements"] = 0.7 weights["increase_complexity"] = 0.2 # Prefer increasing complexity if the solution is good but not perfect elif result.score > 0.85: weights["add_subtle_bug"] = 0.2 weights["change_requirements"] = 0.2 weights["increase_complexity"] = 0.6 # Default to equal weights else: weights = {strategy: 1.0 / len(available_strategies) for strategy in available_strategies} # Normalize weights for available strategies total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies) normalized_weights = [weights.get(strategy, 0) / total_weight for strategy in available_strategies] # Select a strategy based on weights return random.choices(available_strategies, weights=normalized_weights)[0] def _add_subtle_bug(self, state: ProblemState, solution: str) -> None: """ Add a subtle bug to the solution code. 
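        Example (illustrative sketch of the parse-guarded dispatch used below;
        the candidate string is hypothetical):

            import ast
            import random

            candidate = "def add(a, b): return a + b"
            try:
                ast.parse(candidate)                 # parseable: mutate by category
                category = random.choice(["logical", "syntax"])
            except SyntaxError:
                category = "syntax"                  # unparseable: inject a syntax error instead
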
Args: state: The problem state to modify solution: The current solution """ # Parse the solution to find potential bug insertion points try: parsed_solution = ast.parse(solution) except SyntaxError: # If we can't parse the solution, just add a syntax error self._add_syntax_error(state, solution) return # Choose a bug category based on available categories available_categories = list(self.bug_categories) if available_categories: bug_category = random.choice(available_categories) else: bug_category = BugCategory.LOGICAL # Add a bug based on the selected category if bug_category == BugCategory.SYNTAX: self._add_syntax_error(state, solution) elif bug_category == BugCategory.LOGICAL: self._add_logical_error(state, solution, parsed_solution) elif bug_category == BugCategory.PERFORMANCE: self._add_performance_issue(state, solution, parsed_solution) elif bug_category == BugCategory.EDGE_CASE: self._add_edge_case_issue(state, solution, parsed_solution) else: # Default to logical error self._add_logical_error(state, solution, parsed_solution) # Update bug count in code context if "bug_count" not in state.code_context: state.code_context["bug_count"] = 0 state.code_context["bug_count"] += 1 # Add the bug category to the context if "bug_categories" not in state.code_context: state.code_context["bug_categories"] = [] state.code_context["bug_categories"].append(bug_category) def _change_requirements(self, state: ProblemState, solution: str) -> None: """ Change the requirements to challenge the current solution. Args: state: The problem state to modify solution: The current solution """ # Get the current requirements requirements = state.requirements # Add a new requirement new_requirement = self._generate_new_requirement(state, solution) if new_requirement: requirements.append(new_requirement) # Modify an existing requirement if possible if requirements and random.random() < 0.5: idx = random.randint(0, len(requirements) - 1) requirements[idx] = self._modify_requirement(requirements[idx], state, solution) def _increase_complexity(self, state: ProblemState, solution: str) -> None: """ Increase the complexity of the task. Args: state: The problem state to modify solution: The current solution """ # Parse the solution if possible try: parsed_solution = ast.parse(solution) except SyntaxError: # If we can't parse the solution, make a simpler change self._add_edge_case_requirement(state) return # Choose a complexity increase strategy strategies = [ "add_edge_cases", "increase_data_volume", "add_performance_constraint", "expand_functionality" ] strategy = random.choice(strategies) if strategy == "add_edge_cases": self._add_edge_case_requirement(state) elif strategy == "increase_data_volume": self._increase_data_volume(state, solution) elif strategy == "add_performance_constraint": self._add_performance_constraint(state, solution) elif strategy == "expand_functionality": self._expand_functionality(state, solution) def _create_test_files(self, temp_path: Path) -> List[Path]: """ Create test files based on the current problem state. 
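        Example (illustrative sketch; `tests` mirrors the structure stored under
        code_context["tests"], with hypothetical content):

            import tempfile
            from pathlib import Path

            tests = [{"name": "test_0", "content": "def test_ok(): assert True"}]
            with tempfile.TemporaryDirectory() as tmp:
                for i, test in enumerate(tests):
                    (Path(tmp) / f"test_{i}.py").write_text(test["content"])
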
Args: temp_path: The temporary directory path Returns: List of test file paths """ test_files = [] # Create test files from the code context if "tests" in self.state.code_context: for i, test in enumerate(self.state.code_context["tests"]): test_file = temp_path / f"test_{i}.py" with open(test_file, "w") as f: f.write(test["content"]) test_files.append(test_file) # Create a default test file if no tests are specified if not test_files: test_file = temp_path / "test_default.py" with open(test_file, "w") as f: f.write(self._generate_default_test()) test_files.append(test_file) return test_files def _calculate_score(self, results: Dict[str, Any]) -> float: """ Calculate a score based on test results. Args: results: The test results Returns: A score between 0 and 1 """ # Base score on test results if results["total_tests"] == 0: test_score = 0.0 else: test_score = results["passed_tests"] / results["total_tests"] # Adjust for execution success execution_score = 1.0 if results["execution"]["success"] else 0.0 # Combine scores with weights weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3}) score = (test_score * weights["test"] + execution_score * weights["execution"]) # Apply difficulty modifier difficulty_modifier = 1.0 + (self.state.difficulty * 0.2) score = score / difficulty_modifier return max(0.0, min(1.0, score)) def _calculate_complexity(self, code: str) -> float: """ Calculate the complexity of code. Args: code: The code to analyze Returns: A complexity score """ # Simple cyclomatic complexity estimation complexity = 1 # Count control flow statements for pattern in ["if", "for", "while", "and", "or"]: complexity += code.count(f" {pattern} ") # Count function definitions complexity += code.count("def ") # Normalize to 0-1 range normalized = min(1.0, complexity / 50.0) return normalized def _generate_suggestion_for_test_failure( self, issue: Dict[str, Any], solution: str, test_results: Dict[str, Any] ) -> Dict[str, Any]: """ Generate a suggestion for a test failure. 
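        Example (illustrative sketch of the assertion extraction used below; the
        test source shown is hypothetical):

            import re

            test_src = "def test_mean(): assert mean([1, 2]) == 1.5"
            match = re.search(r"assert.*", test_src)
            assertion = match.group(0) if match else None
            # -> "assert mean([1, 2]) == 1.5", quoted back in the suggestion message
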
Args: issue: The issue data solution: The solution code test_results: The test results Returns: A suggestion dictionary """ test_name = issue["test"] test_result = test_results[test_name] # Extract relevant parts of the test test_content = None for test in self.state.code_context.get("tests", []): if test.get("name") == test_name: test_content = test.get("content") break if test_content: # Try to extract the assertion that failed assertion_match = re.search(r"assert.*", test_content) assertion = assertion_match.group(0) if assertion_match else None # Look for function names in both test and solution test_funcs = re.findall(r"def\s+(\w+)", test_content) solution_funcs = re.findall(r"def\s+(\w+)", solution) # Find functions in test that aren't in solution missing_funcs = [f for f in test_funcs if f not in solution_funcs] if missing_funcs: return { "type": "missing_function", "message": f"Implement the missing function(s): {', '.join(missing_funcs)}", "functions": missing_funcs } elif assertion: return { "type": "fix_assertion_failure", "message": f"Fix the code to pass the assertion: {assertion}", "assertion": assertion, "expected": test_result.get("expected"), "actual": test_result.get("actual") } else: return { "type": "fix_test_failure", "message": f"Fix the code to pass the test: {test_name}", "test_name": test_name } else: return { "type": "general_fix", "message": f"Fix the code to pass the failing test: {test_name}" } def _generate_suggestion_for_error( self, issue: Dict[str, Any], solution: str ) -> Dict[str, Any]: """ Generate a suggestion for an error. Args: issue: The issue data solution: The solution code Returns: A suggestion dictionary """ error_type = issue["error_type"] message = issue["message"] location = issue.get("location") if error_type == "syntax": return { "type": "fix_syntax", "message": f"Fix the syntax error: {message}", "location": location } elif error_type == "runtime": return { "type": "fix_runtime_error", "message": f"Fix the runtime error: {message}", "location": location } else: return { "type": "fix_error", "message": f"Fix the error: {message}", "error_type": error_type, "location": location } def _determine_focus_areas( self, issues: List[Dict[str, Any]], solution: str, result: EvaluationResult ) -> List[str]: """ Determine focus areas based on issues and results. 
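        Example (illustrative sketch using the default thresholds read from config;
        the metric values are hypothetical):

            metrics = {"execution_time": 1.7, "code_complexity": 0.45}
            focus_areas = []
            if metrics["execution_time"] > 1.0:       # performance_threshold default
                focus_areas.append("performance")
            if metrics["code_complexity"] > 0.7:      # complexity_threshold default
                focus_areas.append("complexity")
            # -> ["performance"]
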
Args: issues: The identified issues solution: The solution code result: The evaluation results Returns: List of focus areas """ focus_areas = [] # Check for syntax issues syntax_issues = [i for i in issues if i.get("error_type") == "syntax"] if syntax_issues: focus_areas.append("syntax") # Check for failing tests test_issues = [i for i in issues if i["type"] == "test_failure"] if test_issues: if any("expected" in i and "actual" in i for i in test_issues): focus_areas.append("logic") else: focus_areas.append("functionality") # Check for performance issues if result.metrics and "execution_time" in result.metrics: if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): focus_areas.append("performance") # Check for complexity issues if result.metrics and "code_complexity" in result.metrics: if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7): focus_areas.append("complexity") # Default focus area if none were identified if not focus_areas: focus_areas.append("general") return focus_areas def _generate_adaptation_hints( self, solution: str, result: EvaluationResult ) -> List[Dict[str, Any]]: """ Generate hints about how the problem might adapt in the next iteration. Args: solution: The solution code result: The evaluation results Returns: List of adaptation hints """ hints = [] # Hint about potential complexity increases if result.score > 0.8: hints.append({ "type": "complexity_increase", "message": "The problem may become more complex in the next iteration." }) # Hint about potential requirement changes if result.score > 0.9 and self.state.evolution_stage >= 1: hints.append({ "type": "requirement_change", "message": "The requirements may change in the next iteration." }) # Hint about potential bug additions if result.score > 0.95: hints.append({ "type": "new_bugs", "message": "New, more subtle bugs may be introduced in the next iteration." }) # Hint about focus on specific areas if result.score > 0.7 and result.score < 0.95: focus_areas = result.metrics.get("focus_areas", []) if focus_areas: hints.append({ "type": "focus_shift", "message": f"The next iteration may focus more on: {', '.join(focus_areas)}", "areas": focus_areas }) return hints def _generate_description(self, state: ProblemState) -> str: """ Generate a description for the current problem state. Args: state: The problem state Returns: A descriptive prompt for the problem """ # Base description base_desc = ( f"Fix the bug(s) in the following code. " f"This is iteration {state.evolution_stage + 1} of the task." ) # Add information about known bug categories if "bug_categories" in state.code_context: categories = state.code_context["bug_categories"] if categories: base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}." # Add requirements if state.requirements: base_desc += "\n\nRequirements:" for i, req in enumerate(state.requirements): base_desc += f"\n{i+1}. {req['description']}" # Add information about difficulty difficulty_desc = "easy" if state.difficulty > 0.3 and state.difficulty <= 0.6: difficulty_desc = "moderate" elif state.difficulty > 0.6 and state.difficulty <= 0.8: difficulty_desc = "challenging" elif state.difficulty > 0.8: difficulty_desc = "very challenging" base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." return base_desc def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str: """ Generate a description focused on remaining issues. 
Args: state: The problem state issues: The identified issues Returns: A descriptive prompt focused on remaining issues """ base_desc = self._generate_description(state) # Add focus on remaining issues if issues: base_desc += "\n\nFocus on the following issues:" for i, issue in enumerate(issues): if issue["type"] == "test_failure": base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" else: base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" # Add focus areas if present if "focus_areas" in state.code_context: areas = state.code_context["focus_areas"] if areas: base_desc += f"\n\nPay particular attention to: {', '.join(areas)}." return base_desc def _generate_guided_description( self, state: ProblemState, issues: List[Dict[str, Any]], suggestions: List[Dict[str, Any]] ) -> str: """ Generate a description with added guidance. Args: state: The problem state issues: The identified issues suggestions: The suggested fixes Returns: A descriptive prompt with added guidance """ base_desc = self._generate_description(state) # Add detailed information about issues if issues: base_desc += "\n\nThe following issues were identified in your previous solution:" for i, issue in enumerate(issues): if issue["type"] == "test_failure": base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" if "expected" in issue and "actual" in issue: base_desc += f"\n Expected: {issue['expected']}" base_desc += f"\n Actual: {issue['actual']}" else: base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" if "location" in issue: base_desc += f"\n Location: {issue['location']}" # Add suggestions if suggestions: base_desc += "\n\nConsider the following suggestions:" for i, suggestion in enumerate(suggestions): base_desc += f"\n{i+1}. {suggestion['message']}" # Add hints if present if "hints" in state.code_context: hints = state.code_context["hints"] if hints: base_desc += "\n\nHints:" for i, hint in enumerate(hints): base_desc += f"\n{i+1}. {hint}" return base_desc def _generate_hints( self, solution: str, result: EvaluationResult, feedback: Feedback ) -> List[str]: """ Generate hints based on the solution and feedback. Args: solution: The solution code result: The evaluation results feedback: The feedback provided Returns: List of hints """ hints = [] # Add hints based on failing tests if result.test_results: failing_tests = [ test_name for test_name, test_result in result.test_results.items() if not test_result["passed"] ] if failing_tests: test_hint = "Focus on fixing the failing tests" # Add specific information about test expectations if available for test_name in failing_tests[:2]: # Limit to first two tests test_result = result.test_results[test_name] if "expected" in test_result and "actual" in test_result: test_hint += f". 
For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'" hints.append(test_hint + ".") # Add hints based on errors if result.error_details: for error_type, error_info in result.error_details.items(): hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.") # Add hints based on focus areas for area in feedback.focus_areas: if area == "syntax": hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.") elif area == "logic": hints.append("Review the logic of your solution, especially conditional statements and loop conditions.") elif area == "functionality": hints.append("Ensure your solution implements all required functionality specified in the tests.") elif area == "performance": hints.append("Consider optimizing your solution for better performance, avoid unnecessary operations.") elif area == "complexity": hints.append("Try to simplify your solution, it may be more complex than necessary.") return hints def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str: """ Generate a hint for a specific failing test. Args: test_name: The name of the test test_result: The test result Returns: A hint for the test """ if "expected" in test_result and "actual" in test_result: return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'" elif "message" in test_result: return test_result["message"] else: return "The test failed, but no detailed information is available." def _add_syntax_error(self, state: ProblemState, solution: str) -> None: """ Add a syntax error to the solution code. Args: state: The problem state to modify solution: The current solution """ lines = solution.split('\n') if not lines: return # Choose a line to modify idx = random.randint(0, len(lines) - 1) line = lines[idx] # Skip empty lines or comment lines while not line.strip() or line.strip().startswith('#'): idx = random.randint(0, len(lines) - 1) line = lines[idx] # Choose a modification type mod_type = random.choice([ "remove_character", "add_character", "swap_characters", "change_indent" ]) if mod_type == "remove_character" and line: char_idx = random.randint(0, len(line) - 1) lines[idx] = line[:char_idx] + line[char_idx+1:] elif mod_type == "add_character": char_idx = random.randint(0, len(line)) char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) lines[idx] = line[:char_idx] + char + line[char_idx:] elif mod_type == "swap_characters" and len(line) >= 2: char_idx = random.randint(0, len(line) - 2) lines[idx] = (line[:char_idx] + line[char_idx+1] + line[char_idx] + line[char_idx+2:]) elif mod_type == "change_indent": # Either add or remove indentation if line.startswith(" "): lines[idx] = line[2:] # Remove some indent else: lines[idx] = " " + line # Add inconsistent indent # Update the code modified_code = '\n'.join(lines) state.code_context["code"] = modified_code # Add information about the modification if "bugs" not in state.code_context: state.code_context["bugs"] = [] state.code_context["bugs"].append({ "type": "syntax", "line": idx + 1, "description": f"Syntax error introduced in line {idx + 1}" }) def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: """ Add a logical error to the solution code. 
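        Example (illustrative sketch of a "change_comparison" mutation; the input
        line is hypothetical):

            line = "    if count >= limit:"
            mutated = line.replace(">=", "<=", 1)
            # -> "    if count <= limit:", a subtle logical bug that still parses
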
        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        modification_types = [
            "change_comparison",
            "invert_condition",
            "off_by_one",
            "change_operator",
            "reverse_logic"
        ]

        mod_type = random.choice(modification_types)
        lines = solution.split('\n')
        idx = 0  # Fallback line index in case no suitable mutation site is found

        # Find all if statements and loops
        if_statements = []
        for i, line in enumerate(lines):
            if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
                if_statements.append((i, line))

        if if_statements:
            # Choose an if statement to modify
            idx, line = random.choice(if_statements)

            if mod_type == "change_comparison":
                # Change comparison operators
                comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
                for op, new_op in comparisons.items():
                    if op in line:
                        lines[idx] = line.replace(op, new_op, 1)
                        break

            elif mod_type == "invert_condition":
                # Add or remove a "not" to invert the condition
                if "not" in line:
                    lines[idx] = line.replace("not ", "", 1)
                else:
                    match = re.search(r'(if|while)\s+([^:]+):', line)
                    if match:
                        condition = match.group(2)
                        lines[idx] = line.replace(condition, f"not ({condition})", 1)

            elif mod_type == "off_by_one":
                # Introduce an off-by-one error
                for op in ["+", "-"]:
                    if op in line:
                        # If there's a number after the operator, change it
                        match = re.search(f'\\{op}\\s*(\\d+)', line)
                        if match:
                            num = int(match.group(1))
                            new_num = num + 1 if op == "+" else max(0, num - 1)
                            lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
                            break

            elif mod_type == "change_operator":
                # Change arithmetic or logical operators
                operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
                for op, new_op in operators.items():
                    if f" {op} " in line:
                        lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
                        break

            elif mod_type == "reverse_logic":
                # Reverse the logic of a compound condition
                if " and " in line:
                    parts = line.split(" and ")
                    lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
                elif " or " in line:
                    parts = line.split(" or ")
                    lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)
        else:
            # If no if statements found, introduce a different kind of logical error
            # Find variable assignments
            assignments = []
            for i, line in enumerate(lines):
                if "=" in line and "==" not in line and "!=" not in line:
                    assignments.append((i, line))

            if assignments:
                # Choose an assignment to modify
                idx, line = random.choice(assignments)

                # Modify the assignment
                if "+" in line:
                    lines[idx] = line.replace("+", "-", 1)
                elif "-" in line:
                    lines[idx] = line.replace("-", "+", 1)
                elif "*" in line:
                    lines[idx] = line.replace("*", "/", 1)
                elif "/" in line:
                    lines[idx] = line.replace("/", "*", 1)
                else:
                    # If no arithmetic operator, change the value
                    match = re.search(r'=\s*(\d+)', line)
                    if match:
                        num = int(match.group(1))
                        new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
                        lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)

        # Update the code
        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code

        # Add information about the modification
        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []

        state.code_context["bugs"].append({
            "type": "logical",
            "line": idx + 1,
            "description": f"Logical error introduced in line {idx + 1}: {mod_type}"
        })

    def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add a performance issue to the solution code.
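        Example (illustrative sketch of the "replace_efficient_operation" path; the
        line shown is hypothetical):

            before = "    results.append(item)"
            after = "    results = results + [item]  # Less efficient than append"
            # Repeated list concatenation copies the list on every iteration,
            # degrading an amortised O(1) append to O(n) work per item.
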
Args: state: The problem state to modify solution: The current solution parsed_solution: The parsed AST of the solution """ lines = solution.split('\n') # Find loops in the code loops = [] for i, line in enumerate(lines): if re.search(r'\bfor\b|\bwhile\b', line): loops.append((i, line)) if loops: # Choose a loop to modify idx, line = random.choice(loops) # Choose a modification type mod_type = random.choice([ "add_nested_loop", "replace_efficient_operation", "add_redundant_computation" ]) if mod_type == "add_nested_loop": # Add a nested loop indent = len(line) - len(line.lstrip()) indent_str = ' ' * indent loop_body_indent = indent_str + ' ' # Find the next line with the same indentation or less end_idx = idx + 1 while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent): end_idx += 1 # Insert a nested loop before the end of the current loop insert_pos = end_idx lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100): # Unnecessary loop") lines.insert(insert_pos + 1, f"{loop_body_indent} pass") elif mod_type == "replace_efficient_operation": # Replace an efficient operation with a less efficient one # Look for list comprehensions or efficient operations for i in range(idx + 1, min(idx + 10, len(lines))): if "append" in lines[i] or "extend" in lines[i]: indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent match = re.search(r'(\w+)\.(append|extend)', lines[i]) if match: list_name = match.group(1) operation = match.group(2) item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0] if operation == "append": # Replace append with concatenation lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}] # Less efficient than append" elif operation == "extend": # Replace extend with concatenation lines[i] = f"{indent_str}{list_name} = {list_name} + {item} # Less efficient than extend" break elif mod_type == "add_redundant_computation": # Add redundant computation inside the loop # Find the indentation level of the loop body if idx + 1 < len(lines): body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) body_indent_str = ' ' * body_indent # Add redundant computation lines.insert(idx + 1, f"{body_indent_str}temp = [] # Redundant computation") lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):") lines.insert(idx + 3, f"{body_indent_str} temp.append(i)") lines.insert(idx + 4, f"{body_indent_str} temp.sort() # Unnecessary sort in each iteration") else: # If no loops found, introduce inefficient data structure or algorithm function_defs = [] for i, line in enumerate(lines): if line.strip().startswith("def "): function_defs.append((i, line)) if function_defs: # Choose a function to modify idx, line = random.choice(function_defs) # Find the indentation level of the function body if idx + 1 < len(lines): body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) body_indent_str = ' ' * body_indent # Add inefficient code at the beginning of the function lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage") lines.insert(idx + 2, f"{body_indent_str}data = []") lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):") lines.insert(idx + 4, f"{body_indent_str} data.append(i)") lines.insert(idx + 5, f"{body_indent_str} # Inefficient search operation") lines.insert(idx + 6, f"{body_indent_str} if i in data: # Linear search instead of using a set") lines.insert(idx + 7, f"{body_indent_str} pass") # Update the code modified_code = '\n'.join(lines) 
state.code_context["code"] = modified_code # Add information about the modification if "bugs" not in state.code_context: state.code_context["bugs"] = [] state.code_context["bugs"].append({ "type": "performance", "line": idx + 1, "description": f"Performance issue introduced around line {idx + 1}" }) def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: """ Add an edge case issue to the solution code. Args: state: The problem state to modify solution: The current solution parsed_solution: The parsed AST of the solution """ lines = solution.split('\n') # Find functions in the code functions = [] current_func = None func_start = None for i, line in enumerate(lines): if line.strip().startswith("def "): if current_func: functions.append((func_start, i - 1, current_func)) current_func = line.strip()[4:].split("(")[0] func_start = i elif i == len(lines) - 1 and current_func: functions.append((func_start, i, current_func)) if functions: # Choose a function to modify start_idx, end_idx, func_name = random.choice(functions) # Choose a modification type mod_type = random.choice([ "remove_boundary_check", "introduce_zero_division", "handling_empty_input", "type_assumption" ]) if mod_type == "remove_boundary_check": # Find and remove or modify boundary checks for i in range(start_idx, end_idx + 1): if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|\!=)', lines[i]): # Comment out the boundary check lines[i] = f"# {lines[i]} # Boundary check removed" # Skip the body of the if statement j = i + 1 indent = len(lines[i]) - len(lines[i].lstrip()) body_indent = indent + 4 while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): lines[j] = f"# {lines[j]}" j += 1 break elif mod_type == "introduce_zero_division": # Find division operations and modify them for i in range(start_idx, end_idx + 1): if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]: # Remove denominator check if it exists if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]): lines[i] = f"# {lines[i]} # Denominator check removed" else: # Or modify a division to potentially cause zero division match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i]) if match: denominator = match.group(2) # Add a potential zero value for the denominator indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent lines.insert(i, f"{indent_str}if random.random() < 0.1: # Introduce potential zero division") lines.insert(i + 1, f"{indent_str} {denominator} = 0") break elif mod_type == "handling_empty_input": # Modify parameter handling to not handle empty inputs correctly params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) if params and params.group(1): param_list = [p.strip() for p in params.group(1).split(",")] if param_list: param = param_list[0].split("=")[0].strip() # Find checks for the parameter for i in range(start_idx + 1, end_idx + 1): if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]): # Comment out the empty check lines[i] = f"# {lines[i]} # Empty input check removed" # Skip the body of the if statement j = i + 1 indent = len(lines[i]) - len(lines[i].lstrip()) body_indent = indent + 4 while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): lines[j] = f"# {lines[j]}" j += 1 break elif mod_type == "type_assumption": # Introduce assumptions about parameter types params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) if params 
and params.group(1): param_list = [p.strip() for p in params.group(1).split(",")] if param_list: param = param_list[0].split("=")[0].strip() # Find type checks for the parameter type_check_found = False for i in range(start_idx + 1, end_idx + 1): if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]): # Comment out the type check lines[i] = f"# {lines[i]} # Type check removed" type_check_found = True break if not type_check_found: # Add a problematic type assumption indent = 4 # Assume basic indentation for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)): if lines[i].strip(): indent = len(lines[i]) - len(lines[i].lstrip()) break indent_str = ' ' * indent # Add code that assumes a specific type lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking") lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param}) # Will fail if {param} doesn't support len()") # Update the code modified_code = '\n'.join(lines) state.code_context["code"] = modified_code # Add information about the modification if "bugs" not in state.code_context: state.code_context["bugs"] = [] state.code_context["bugs"].append({ "type": "edge_case", "line": start_idx + 1, "description": f"Edge case issue introduced in function '{func_name}': {mod_type}" }) def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]: """ Generate a new requirement based on the current state and solution. Args: state: The current problem state solution: The current solution Returns: A new requirement dictionary """ # Parse the solution to find functions and variables function_names = re.findall(r'def\s+(\w+)', solution) variable_names = re.findall(r'(\w+)\s*=', solution) # Choose a requirement type req_type = random.choice([ "edge_case_handling", "performance_improvement", "error_handling", "type_checking", "feature_addition" ]) if req_type == "edge_case_handling": if function_names: func_name = random.choice(function_names) edge_cases = [ "empty input", "negative values", "zero values", "extremely large values", "special characters", "duplicate values" ] edge_case = random.choice(edge_cases) return { "type": "edge_case_handling", "description": f"The function '{func_name}' should handle {edge_case} correctly.", "difficulty": random.uniform(0.3, 0.7) } elif req_type == "performance_improvement": return { "type": "performance_improvement", "description": "The solution should be optimized to run in O(n) time or better.", "difficulty": random.uniform(0.4, 0.8) } elif req_type == "error_handling": error_types = [ "invalid input", "division by zero", "file not found", "network timeout", "permission denied" ] error_type = random.choice(error_types) return { "type": "error_handling", "description": f"The code should handle {error_type} errors gracefully.", "difficulty": random.uniform(0.2, 0.6) } elif req_type == "type_checking": if function_names: func_name = random.choice(function_names) return { "type": "type_checking", "description": f"The function '{func_name}' should validate input types before processing.", "difficulty": random.uniform(0.1, 0.5) } elif req_type == "feature_addition": features = [ "logging capability", "progress tracking", "caching for repeated operations", "parameter validation", "configuration options" ] feature = random.choice(features) return { "type": "feature_addition", "description": f"Add {feature} to the solution.", "difficulty": random.uniform(0.3, 0.7) } # Default requirement if none of the above were applicable 
return { "type": "general_improvement", "description": "Improve the overall code quality and readability.", "difficulty": random.uniform(0.1, 0.4) } def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]: """ Modify an existing requirement to make it more challenging. Args: requirement: The requirement to modify state: The current problem state solution: The current solution Returns: The modified requirement """ # Make a copy of the requirement modified_req = copy.deepcopy(requirement) # Increase the difficulty modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3)) # Modify the description based on the requirement type if requirement["type"] == "edge_case_handling": modified_req["description"] += " Additionally, it should handle very large inputs efficiently." elif requirement["type"] == "performance_improvement": modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)") elif requirement["type"] == "error_handling": modified_req["description"] += " And provide detailed error messages for debugging." elif requirement["type"] == "type_checking": modified_req["description"] += " And automatically convert types when possible." elif requirement["type"] == "feature_addition": modified_req["description"] += " Ensure this feature is configurable via parameters." else: modified_req["description"] += " The code should also be well-documented with comments." return modified_req def _add_edge_case_requirement(self, state: ProblemState) -> None: """ Add a requirement for handling edge cases. Args: state: The problem state to modify """ edge_cases = [ "empty collections", "null/None values", "boundary values (min/max)", "negative numbers", "special characters", "Unicode characters", "very large inputs", "malformed input" ] edge_case = random.choice(edge_cases) # Add a new requirement state.requirements.append({ "type": "edge_case_handling", "description": f"The solution must correctly handle {edge_case}.", "difficulty": random.uniform(0.3, 0.7) }) # Add test cases for the edge case if tests exist if "tests" in state.code_context: # Create a new test for the edge case test_template = self._generate_edge_case_test(edge_case, state.code_context) if test_template: state.code_context["tests"].append({ "name": f"test_edge_case_{len(state.code_context['tests'])}", "content": test_template, "description": f"Test handling of {edge_case}" }) def _increase_data_volume(self, state: ProblemState, solution: str) -> None: """ Modify the problem to require handling larger data volumes. 
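        Example (illustrative sketch of one substitution applied to an existing
        test; the test source is hypothetical):

            import re

            test_src = "result = process(list(range(10)))"
            match = re.search(r"range\(\d+\)", test_src)
            if match:
                test_src = test_src.replace(match.group(0), "range(10000)", 1)
            # -> "result = process(list(range(10000)))"
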
Args: state: The problem state to modify solution: The current solution """ # Add a requirement for handling large data state.requirements.append({ "type": "scalability", "description": "The solution must efficiently handle large datasets (10,000+ items).", "difficulty": random.uniform(0.5, 0.8) }) # Modify existing tests to use larger data if tests exist if "tests" in state.code_context: for i, test in enumerate(state.code_context["tests"]): content = test["content"] # Look for small lists or arrays in tests for pattern, replacement in [ (r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'), (r'range\(\d+\)', 'range(10000)'), (r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"') ]: match = re.search(pattern, content) if match and random.random() < 0.3: # Only replace some instances content = content.replace(match.group(0), replacement, 1) break state.code_context["tests"][i]["content"] = content state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)" def _add_performance_constraint(self, state: ProblemState, solution: str) -> None: """ Add a performance constraint to the problem. Args: state: The problem state to modify solution: The current solution """ # Choose a performance constraint constraints = [ "linear time complexity (O(n))", "logarithmic time complexity (O(log n))", "constant memory usage (O(1) space)", "execution time under 100ms for large inputs", "minimal function calls" ] constraint = random.choice(constraints) # Add a new requirement state.requirements.append({ "type": "performance", "description": f"The solution must achieve {constraint}.", "difficulty": random.uniform(0.6, 0.9) }) # Add performance testing code if tests exist if "tests" in state.code_context: # Add a performance test perf_test = self._generate_performance_test(constraint, state.code_context) if perf_test: state.code_context["tests"].append({ "name": f"test_performance_{len(state.code_context['tests'])}", "content": perf_test, "description": f"Test {constraint}" }) def _expand_functionality(self, state: ProblemState, solution: str) -> None: """ Expand the required functionality of the solution. Args: state: The problem state to modify solution: The current solution """ # Choose a functionality expansion expansions = [ "support for different input types", "parameterized behavior", "additional output formats", "flexible error handling", "integration with external systems" ] expansion = random.choice(expansions) # Add a new requirement state.requirements.append({ "type": "functionality", "description": f"Expand the solution to include {expansion}.", "difficulty": random.uniform(0.4, 0.8) }) # Add test cases for the new functionality if tests exist if "tests" in state.code_context: # Create a new test for the expanded functionality test_template = self._generate_functionality_test(expansion, state.code_context) if test_template: state.code_context["tests"].append({ "name": f"test_expanded_functionality_{len(state.code_context['tests'])}", "content": test_template, "description": f"Test {expansion}" }) def _generate_default_test(self) -> str: """ Generate a default test based on the current problem state. 
        Returns:
            A default test script
        """
        # Generate a basic test script
        return """
import unittest
import sys
import os

# Add the directory containing the solution to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the solution
from solution import *

class DefaultTest(unittest.TestCase):
    def test_basic_functionality(self):
        # A basic test that should pass if the solution is correct
        self.assertTrue(True, "Basic assertion failed")

    def test_expected_output(self):
        # Test expected output of main functions
        # This will need to be updated based on the specific problem
        pass

if __name__ == '__main__':
    unittest.main()
"""

    def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> str:
        """
        Generate a test for an edge case.

        Args:
            edge_case: The edge case to test
            code_context: The code context containing information about the problem

        Returns:
            A test script for the edge case
        """
        # Extract function names from the code context
        function_names = []
        if "code" in code_context:
            function_names = re.findall(r'def\s+(\w+)', code_context["code"])

        if not function_names:
            return None

        # Choose a function to test
        function_name = random.choice(function_names)

        # Generate test code based on the edge case
        if edge_case == "empty collections":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from solution import {function_name}

class EmptyCollectionTest(unittest.TestCase):
    def test_empty_input(self):
        # Test with empty list
        result = {function_name}([])
        self.assertIsNotNone(result, "Function should handle empty list")

        # Test with empty string
        result = {function_name}("")
        self.assertIsNotNone(result, "Function should handle empty string")

        # Test with empty dict
        result = {function_name}({{}})
        self.assertIsNotNone(result, "Function should handle empty dict")

if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "null/None values":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from solution import {function_name}

class NoneValueTest(unittest.TestCase):
    def test_none_input(self):
        # Test with None as input
        result = {function_name}(None)
        self.assertIsNotNone(result, "Function should handle None input")

        # Test with list containing None
        result = {function_name}([1, None, 3])
        self.assertIsNotNone(result, "Function should handle list with None values")

if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "boundary values (min/max)":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from solution import {function_name}

class BoundaryValueTest(unittest.TestCase):
    def test_min_max_values(self):
        # Test with minimum integer
        min_int = -sys.maxsize - 1
        result = {function_name}(min_int)
        self.assertIsNotNone(result, "Function should handle minimum integer")

        # Test with maximum integer
        max_int = sys.maxsize
        result = {function_name}(max_int)
        self.assertIsNotNone(result, "Function should handle maximum integer")

        # Test with very large list
        large_list = list(range(10000))
        result = {function_name}(large_list)
        self.assertIsNotNone(result, "Function should handle very large inputs")

if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "negative numbers":
            return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class NegativeNumberTest(unittest.TestCase): def test_negative_numbers(self): # Test with negative number result = {function_name}(-1) self.assertIsNotNone(result, "Function should handle negative numbers") # Test with list of negative numbers result = {function_name}([-1, -2, -3]) self.assertIsNotNone(result, "Function should handle lists of negative numbers") # Test with mixed positive and negative result = {function_name}([-1, 0, 1]) self.assertIsNotNone(result, "Function should handle mixed positive and negative") if __name__ == '__main__': unittest.main() """ else: # Generic edge case test return f""" import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class EdgeCaseTest(unittest.TestCase): def test_edge_case_{edge_case.replace(' ', '_')}(self): # Test edge case: {edge_case} # This is a placeholder test that needs to be customized for the specific edge case self.assertTrue(True, "Edge case test not implemented") if __name__ == '__main__': unittest.main() """ def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> str: """ Generate a performance test based on a constraint. Args: constraint: The performance constraint code_context: The code context containing information about the problem Returns: A test script for the performance constraint """ # Extract function names from the code context function_names = [] if "code" in code_context: function_names = re.findall(r'def\s+(\w+)', code_context["code"]) if not function_names: return None # Choose a function to test function_name = random.choice(function_names) if "time complexity" in constraint: return f""" import unittest import sys import os import time import random sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class PerformanceTest(unittest.TestCase): def test_time_complexity(self): # Test for {constraint} sizes = [100, 1000, 10000] times = [] for size in sizes: # Generate input of the given size input_data = [random.randint(0, 1000) for _ in range(size)] # Measure execution time start_time = time.time() {function_name}(input_data) end_time = time.time() times.append(end_time - start_time) # Check if time grows appropriately # For O(n), time should grow linearly with input size # For O(log n), time should grow logarithmically # This is a simplified check and might need adjustment if "log n" in "{constraint}": # For logarithmic time, the ratio of times should decrease ratio1 = times[1] / times[0] ratio2 = times[2] / times[1] self.assertLess(ratio2, ratio1 * 1.5, f"Growth rate appears super-logarithmic: {times}") else: # Assume linear or better # For linear time, the ratio of times should be roughly equal to ratio of sizes ratio1 = times[1] / times[0] size_ratio1 = sizes[1] / sizes[0] ratio2 = times[2] / times[1] size_ratio2 = sizes[2] / sizes[1] self.assertLess(ratio1, size_ratio1 * 1.5, f"First growth rate appears super-linear: {times}") self.assertLess(ratio2, size_ratio2 * 1.5, f"Second growth rate appears super-linear: {times}") if __name__ == '__main__': unittest.main() """ elif "execution time" in constraint: return f""" import unittest import sys import os import time import random sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class 
PerformanceTest(unittest.TestCase): def test_execution_time(self): # Test for {constraint} # Generate a large input input_data = [random.randint(0, 1000) for _ in range(10000)] # Measure execution time start_time = time.time() {function_name}(input_data) end_time = time.time() execution_time = (end_time - start_time) * 1000 # Convert to ms self.assertLess(execution_time, 100, f"Execution time exceeded 100ms: {execution_time:.2f}ms") if __name__ == '__main__': unittest.main() """ elif "memory usage" in constraint: return f""" import unittest import sys import os import psutil import random sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class MemoryUsageTest(unittest.TestCase): def test_memory_usage(self): # Test for {constraint} # Note: This is an approximate test and may not be accurate in all environments # Get current process process = psutil.Process(os.getpid()) # Measure memory before memory_before = process.memory_info().rss / 1024 / 1024 # MB # Generate a large input input_data = [random.randint(0, 1000) for _ in range(100000)] # Run function {function_name}(input_data) # Measure memory after memory_after = process.memory_info().rss / 1024 / 1024 # MB # Calculate memory usage memory_used = memory_after - memory_before # A crude approximation, adjust as needed self.assertLess(memory_used, 10, f"Memory usage seems high: {memory_used:.2f}MB") if __name__ == '__main__': unittest.main() """ else: # Generic performance test return f""" import unittest import sys import os import time import random sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class PerformanceTest(unittest.TestCase): def test_performance(self): # Test for {constraint} # This is a placeholder test that needs to be customized for the specific constraint # Generate a large input input_data = [random.randint(0, 1000) for _ in range(10000)] # Measure execution time start_time = time.time() {function_name}(input_data) end_time = time.time() execution_time = end_time - start_time # Just log the time for now print(f"Execution time: {execution_time:.4f} seconds") self.assertTrue(True, "Performance test completed") if __name__ == '__main__': unittest.main() """ def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> str: """ Generate a test for expanded functionality. 
Args: expansion: The functionality expansion code_context: The code context containing information about the problem Returns: A test script for the expanded functionality """ # Extract function names from the code context function_names = [] if "code" in code_context: function_names = re.findall(r'def\s+(\w+)', code_context["code"]) if not function_names: return None # Choose a function to test function_name = random.choice(function_names) if "different input types" in expansion: return f""" import unittest import sys import os import json from collections import namedtuple sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class InputTypesTest(unittest.TestCase): def test_different_input_types(self): # Test with different types of inputs # Test with list list_input = [1, 2, 3] list_result = {function_name}(list_input) self.assertIsNotNone(list_result, "Function should handle list input") # Test with tuple tuple_input = (1, 2, 3) tuple_result = {function_name}(tuple_input) self.assertIsNotNone(tuple_result, "Function should handle tuple input") # Test with set set_input = {{1, 2, 3}} set_result = {function_name}(set_input) self.assertIsNotNone(set_result, "Function should handle set input") # Test with dictionary dict_input = {{"a": 1, "b": 2, "c": 3}} dict_result = {function_name}(dict_input) self.assertIsNotNone(dict_result, "Function should handle dictionary input") # Test with JSON string json_input = '{{"data": [1, 2, 3]}}' json_result = {function_name}(json_input) self.assertIsNotNone(json_result, "Function should handle JSON string") # Test with custom object Point = namedtuple('Point', ['x', 'y']) obj_input = Point(1, 2) obj_result = {function_name}(obj_input) self.assertIsNotNone(obj_result, "Function should handle custom object") if __name__ == '__main__': unittest.main() """ elif "parameterized behavior" in expansion: return f""" import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class ParameterizedTest(unittest.TestCase): def test_parameterized_behavior(self): # Test function with different parameters # Base case with default parameters base_input = [1, 2, 3] base_result = {function_name}(base_input) # The function should now accept additional parameters # These are example parameters, adjust based on the specific function # With sorting parameter try: sorted_result = {function_name}(base_input, sort=True) self.assertIsNotNone(sorted_result, "Function should handle sort parameter") except TypeError as e: self.fail(f"Function does not support sort parameter: {{e}}") # With filtering parameter try: filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1) self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter") except TypeError as e: self.fail(f"Function does not support filter_fn parameter: {{e}}") # With formatting parameter try: formatted_result = {function_name}(base_input, format="json") self.assertIsNotNone(formatted_result, "Function should handle format parameter") except TypeError as e: self.fail(f"Function does not support format parameter: {{e}}") if __name__ == '__main__': unittest.main() """ elif "additional output formats" in expansion: return f""" import unittest import sys import os import json sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class OutputFormatsTest(unittest.TestCase): def 
test_output_formats(self): # Test function with different output formats input_data = [1, 2, 3] # Original format original_result = {function_name}(input_data) # The function should now support different output formats # These are example formats, adjust based on the specific function # JSON format try: json_result = {function_name}(input_data, format="json") # Check if it's valid JSON try: json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result self.assertIsNotNone(json_obj, "JSON result should be valid") except json.JSONDecodeError: self.fail("JSON result is not valid") except TypeError as e: self.fail(f"Function does not support JSON format: {{e}}") # CSV format try: csv_result = {function_name}(input_data, format="csv") self.assertIsNotNone(csv_result, "CSV result should not be None") if isinstance(csv_result, str): self.assertIn(",", csv_result, "CSV result should contain commas") except TypeError as e: self.fail(f"Function does not support CSV format: {{e}}") # XML format try: xml_result = {function_name}(input_data, format="xml") self.assertIsNotNone(xml_result, "XML result should not be None") if isinstance(xml_result, str): self.assertIn("<", xml_result, "XML result should contain tags") self.assertIn(">", xml_result, "XML result should contain tags") except TypeError as e: self.fail(f"Function does not support XML format: {{e}}") if __name__ == '__main__': unittest.main() """ else: # Generic functionality expansion test return f""" import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {function_name} class ExpandedFunctionalityTest(unittest.TestCase): def test_expanded_functionality(self): # Test for {expansion} # This is a placeholder test that needs to be customized for the specific expansion # Basic test to verify the function exists input_data = [1, 2, 3] result = {function_name}(input_data) self.assertIsNotNone(result, "Function should return a result") # You need to add specific tests for the expanded functionality if __name__ == '__main__': unittest.main() """ def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]: """ Calculate an adaptation vector based on the solution, result, and feedback. The adaptation vector encodes how the problem should evolve in future iterations, capturing dimensions like difficulty, bug type emphasis, and feedback focus. 
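        Example (an illustrative sketch, assuming ``task`` is a BugFixingTask
        instance; the concrete values follow the score/threshold rules below):

            vector = task._calculate_adaptation_vector(solution, result, feedback)
            # For a near-perfect, fast solution with fewer than 3 requirements:
            # vector == [0.2, 0.0, -0.1, 0.0, 0.1]
            #   [0] raise difficulty (score > 0.95)
            #   [1] no shift between syntax and logical bug emphasis
            #   [2] relax the performance focus (fast execution)
            #   [3] keep the edge-case focus unchanged
            #   [4] favour adding another requirement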
Args: solution: The current solution result: The evaluation results feedback: The feedback provided Returns: An adaptation vector (list of floats) """ # Initialize adaptation vector with zeros # Dimensions: # [0] - difficulty adjustment # [1] - syntax vs logical bug emphasis # [2] - performance focus # [3] - edge case focus # [4] - requirement expansion adaptation_vector = [0.0] * 5 # Adjust difficulty based on score if result.score > 0.95: adaptation_vector[0] = 0.2 # Increase difficulty significantly elif result.score > 0.8: adaptation_vector[0] = 0.1 # Increase difficulty moderately elif result.score > 0.6: adaptation_vector[0] = 0.0 # Maintain current difficulty elif result.score > 0.4: adaptation_vector[0] = -0.1 # Decrease difficulty moderately else: adaptation_vector[0] = -0.2 # Decrease difficulty significantly # Adjust bug type emphasis based on error types syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax") logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure") if syntax_issues > logical_issues: adaptation_vector[1] = -0.1 # Move toward more logical bugs elif logical_issues > syntax_issues: adaptation_vector[1] = 0.1 # Move toward more syntax bugs # Adjust performance focus based on execution time and metrics if result.metrics and "execution_time" in result.metrics: if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): adaptation_vector[2] = 0.2 # Increase performance focus else: adaptation_vector[2] = -0.1 # Decrease performance focus # Adjust edge case focus based on test failures if result.test_results: edge_case_failures = sum(1 for test_name, test_result in result.test_results.items() if not test_result["passed"] and "edge" in test_name.lower()) if edge_case_failures > 0: adaptation_vector[3] = 0.2 # Increase edge case focus else: adaptation_vector[3] = 0.0 # Maintain current edge case focus # Adjust requirement expansion based on current state current_requirements = len(self.state.requirements) if current_requirements < 3: adaptation_vector[4] = 0.1 # Increase likelihood of adding requirements elif current_requirements >= 5: adaptation_vector[4] = -0.1 # Decrease likelihood of adding requirements return adaptation_vector class DefaultTestRunner: """Default test runner for evaluating bug fixes.""" def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]: """ Run tests against a solution file. 
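        Example of the returned structure (illustrative, trimmed):

            {
                "all_passed": False,
                "passed_tests": 3,
                "total_tests": 5,
                "tests": {"test_calculate_sum": {"passed": True, "failures": 0, ...}},
                "execution": {"success": True, "error": None, ...},
                "execution_time": 0.0,
            }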
Args: solution_file: Path to the solution file test_files: List of test file paths code_context: Context information about the code Returns: Dictionary of test results """ # Initialize results results = { "all_passed": True, "passed_tests": 0, "total_tests": 0, "tests": {}, "execution": { "success": True, "error": None, "stdout": None, "stderr": None }, "execution_time": 0.0 } # Import the solution to check for syntax errors try: # Check if the solution file exists if not solution_file.exists(): results["execution"]["success"] = False results["execution"]["error"] = "Solution file not found" results["all_passed"] = False return results # Try to import the module to test for syntax errors sys.path.insert(0, str(solution_file.parent)) import importlib.util spec = importlib.util.spec_from_file_location("solution", solution_file) solution_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(solution_module) # Check for required functions if "required_functions" in code_context: for func_name in code_context["required_functions"]: if not hasattr(solution_module, func_name): results["execution"]["success"] = False results["execution"]["error"] = f"Required function '{func_name}' not found" results["all_passed"] = False return results except Exception as e: results["execution"]["success"] = False results["execution"]["error"] = str(e) results["all_passed"] = False return results # Run each test file for test_file in test_files: # Skip if the test file doesn't exist if not test_file.exists(): continue # Run the test file import unittest import io from contextlib import redirect_stdout, redirect_stderr # Create a test loader and find tests in the file loader = unittest.TestLoader() try: tests = loader.discover(str(test_file.parent), pattern=test_file.name) # Count the number of test cases test_cases = 0 for suite in tests: for test_case in suite: test_cases += test_case.countTestCases() results["total_tests"] += test_cases # Run the tests runner = unittest.TextTestRunner(verbosity=2) # Capture stdout and stderr stdout_buffer = io.StringIO() stderr_buffer = io.StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): test_result = runner.run(tests) stdout = stdout_buffer.getvalue() stderr = stderr_buffer.getvalue() # Check if all tests passed if not test_result.wasSuccessful(): results["all_passed"] = False # Count passed tests passed_tests = test_cases - len(test_result.failures) - len(test_result.errors) results["passed_tests"] += passed_tests # Store individual test results test_name = test_file.stem results["tests"][test_name] = { "passed": test_result.wasSuccessful(), "failures": len(test_result.failures), "errors": len(test_result.errors), "skipped": len(test_result.skipped), "total": test_cases, "passed_count": passed_tests, "stdout": stdout, "stderr": stderr } # Extract more detailed information about failures for failure in test_result.failures: test_id = failure[0].id() failure_message = failure[1] # Extract expected and actual values if available import re expected_match = re.search(r'Expected\s*:(.+)', failure_message) actual_match = re.search(r'Actual\s*:(.+)', failure_message) expected = expected_match.group(1).strip() if expected_match else None actual = actual_match.group(1).strip() if actual_match else None if test_id not in results["tests"]: results["tests"][test_id] = {} results["tests"][test_id].update({ "passed": False, "message": failure_message, "expected": expected, "actual": actual }) except Exception as e: # If the test file itself 
has errors results["all_passed"] = False results["tests"][test_file.stem] = { "passed": False, "error": str(e), "failures": 1, "errors": 1, "skipped": 0, "total": 1, "passed_count": 0 } results["total_tests"] += 1 return results class BugFixingTaskGenerator: """Generator for bug fixing tasks.""" def __init__(self, config: Dict[str, Any] = None): """ Initialize the bug fixing task generator. Args: config: Configuration options """ self.config = config or {} self.difficulty_levels = self.config.get( "difficulty_levels", ["easy", "medium", "hard", "expert"] ) self.bug_categories = self.config.get( "bug_categories", [ BugCategory.SYNTAX, BugCategory.LOGICAL, BugCategory.EDGE_CASE, BugCategory.PERFORMANCE ] ) self.test_templates = self._load_test_templates() def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask: """ Generate a new bug fixing task. Args: difficulty: The difficulty level (easy, medium, hard, expert) bug_categories: List of bug categories to include Returns: A new bug fixing task """ # Choose difficulty if not specified if difficulty is None: difficulty = random.choice(self.difficulty_levels) # Choose bug categories if not specified if bug_categories is None: num_categories = random.randint(1, 3) bug_categories = random.sample(self.bug_categories, num_categories) # Generate a problem based on difficulty and bug categories problem_state = self._generate_problem_state(difficulty, bug_categories) # Create config for the task task_config = { "difficulty": difficulty, "bug_categories": bug_categories, "convergence_criteria": { "score_threshold": 0.95, "min_iterations": 1, "max_iterations": self.config.get("max_iterations", 5), "score_delta_threshold": 0.05, "consecutive_plateau_limit": 2 }, "score_weights": { "test": 0.7, "execution": 0.3 }, "performance_threshold": 1.0, "complexity_threshold": 0.7 } # Create and return the task return BugFixingTask(problem_state, task_config) def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState: """ Generate a problem state for the given difficulty and bug categories. Args: difficulty: The difficulty level bug_categories: List of bug categories Returns: A problem state for the task """ # Choose a template based on difficulty and bug categories template = self._choose_template(difficulty, bug_categories) # Create a copy of the template problem_state = copy.deepcopy(template) # Generate a unique ID problem_state.problem_id = str(uuid.uuid4()) # Initialize evolution stage and adaptation vector problem_state.evolution_stage = 0 problem_state.adaptation_vector = [0.0] * 5 # Adjust difficulty value based on level difficulty_values = { "easy": 0.25, "medium": 0.5, "hard": 0.75, "expert": 0.9 } problem_state.difficulty = difficulty_values.get(difficulty, 0.5) # Insert bugs based on categories for category in bug_categories: self._insert_bug(problem_state, category) # Update description to reflect the current state problem_state.description = self._generate_description(problem_state) return problem_state def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState: """ Choose a template that matches the difficulty and bug categories. 
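        Example (an illustrative sketch, assuming ``generator`` is a
        BugFixingTaskGenerator instance):

            template = generator._choose_template("easy", [BugCategory.LOGICAL])
            template.problem_id                   # "template" (replaced with a UUID later)
            template.code_context["bug_count"]    # 0 until bugs are inserted
            template.requirements[0]["type"]      # "functional"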
Args: difficulty: The difficulty level bug_categories: List of bug categories Returns: A template problem state """ # In a real implementation, this would load from a database of templates # For now, we'll generate a simple template # Generate code context with a sample function code = self._generate_template_code(difficulty, bug_categories) tests = self._generate_template_tests(code) # Create a basic problem state return ProblemState( problem_id="template", description="Fix the bugs in the given code.", code_context={ "code": code, "tests": tests, "bug_count": 0, "bug_categories": [] }, requirements=[ { "type": "functional", "description": "The code should pass all the provided tests.", "difficulty": 0.3 } ], difficulty=0.5, # Will be overridden evolution_stage=0, adaptation_vector=[0.0] * 5 ) def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str: """ Generate template code based on difficulty and bug categories. Args: difficulty: The difficulty level bug_categories: List of bug categories Returns: Template code """ # For demonstration, we'll use a few predefined templates templates = { "easy": """ def calculate_sum(numbers): \"\"\"Calculate the sum of a list of numbers.\"\"\" total = 0 for num in numbers: total += num return total def calculate_average(numbers): \"\"\"Calculate the average of a list of numbers.\"\"\" if not numbers: return 0 return calculate_sum(numbers) / len(numbers) """, "medium": """ def find_most_frequent(items): \"\"\"Find the most frequently occurring item in a list.\"\"\" if not items: return None counts = {} for item in items: if item in counts: counts[item] += 1 else: counts[item] = 1 max_count = 0 max_item = None for item, count in counts.items(): if count > max_count: max_count = count max_item = item return max_item def binary_search(sorted_list, target): \"\"\"Perform binary search on a sorted list.\"\"\" left = 0 right = len(sorted_list) - 1 while left <= right: mid = (left + right) // 2 if sorted_list[mid] == target: return mid elif sorted_list[mid] < target: left = mid + 1 else: right = mid - 1 return -1 # Target not found """, "hard": """ def merge_sort(arr): \"\"\"Sort an array using the merge sort algorithm.\"\"\" if len(arr) <= 1: return arr # Split the array into two halves mid = len(arr) // 2 left_half = arr[:mid] right_half = arr[mid:] # Recursively sort both halves left_half = merge_sort(left_half) right_half = merge_sort(right_half) # Merge the sorted halves return merge(left_half, right_half) def merge(left, right): \"\"\"Merge two sorted arrays.\"\"\" result = [] i = j = 0 # Compare elements from both arrays and add the smaller one to the result while i < len(left) and j < len(right): if left[i] <= right[j]: result.append(left[i]) i += 1 else: result.append(right[j]) j += 1 # Add any remaining elements result.extend(left[i:]) result.extend(right[j:]) return result def quicksort(arr): \"\"\"Sort an array using the quicksort algorithm.\"\"\" if len(arr) <= 1: return arr # Choose the pivot (using the first element for simplicity) pivot = arr[0] # Partition the array less = [x for x in arr[1:] if x <= pivot] greater = [x for x in arr[1:] if x > pivot] # Recursively sort the partitions and combine return quicksort(less) + [pivot] + quicksort(greater) """, "expert": """ class Node: \"\"\"Node in a binary tree.\"\"\" def __init__(self, value): self.value = value self.left = None self.right = None def
build_binary_tree(values): \"\"\"Build a binary tree from a list of values.\"\"\" if not values: return None root = Node(values[0]) queue = [root] i = 1 while queue and i < len(values): node = queue.pop(0) # Add left child if i < len(values) and values[i] is not None: node.left = Node(values[i]) queue.append(node.left) i += 1 # Add right child if i < len(values) and values[i] is not None: node.right = Node(values[i]) queue.append(node.right) i += 1 return root def is_balanced(root): \"\"\"Check if a binary tree is balanced.\"\"\" def height(node): if not node: return 0 return max(height(node.left), height(node.right)) + 1 def is_balanced_helper(node): if not node: return True left_height = height(node.left) right_height = height(node.right) if abs(left_height - right_height) > 1: return False return is_balanced_helper(node.left) and is_balanced_helper(node.right) return is_balanced_helper(root) def find_lca(root, p, q): \"\"\"Find the lowest common ancestor of two nodes in a binary tree.\"\"\" if not root: return None if root.value == p or root.value == q: return root left_lca = find_lca(root.left, p, q) right_lca = find_lca(root.right, p, q) if left_lca and right_lca: return root return left_lca if left_lca else right_lca """ } # Choose a template based on difficulty if difficulty in templates: return templates[difficulty] else: return templates["medium"] # Default to medium if difficulty not found def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]: """ Generate template tests based on the code. Args: code: The template code Returns: List of test dictionaries """ # Extract function names from the code function_names = re.findall(r'def\s+(\w+)', code) # Generate tests for each function tests = [] for func_name in function_names: test_content = self._generate_test_for_function(func_name) if test_content: tests.append({ "name": f"test_{func_name}", "content": test_content, "description": f"Test for {func_name} function" }) return tests def _generate_test_for_function(self, func_name: str) -> str: """ Generate a test for a specific function.
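        Example (an illustrative sketch, assuming ``generator`` is a
        BugFixingTaskGenerator instance):

            src = generator._generate_test_for_function("binary_search")
            # src is a standalone unittest script that imports binary_search
            # from solution.py; function names without a matching branch fall
            # back to a placeholder test that only asserts True.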
Args: func_name: The name of the function to test Returns: Test content """ # Check if we have a template for this function if func_name in self.test_templates: return self.test_templates[func_name] # Generate a basic test based on the function name if "sum" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import calculate_sum class TestCalculateSum(unittest.TestCase): def test_calculate_sum(self): self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) self.assertEqual(calculate_sum([]), 0) self.assertEqual(calculate_sum([-1, -2, -3]), -6) if __name__ == '__main__': unittest.main() """ elif "average" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import calculate_average class TestCalculateAverage(unittest.TestCase): def test_calculate_average(self): self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) self.assertEqual(calculate_average([]), 0) self.assertEqual(calculate_average([10]), 10) if __name__ == '__main__': unittest.main() """ elif "frequent" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import find_most_frequent class TestFindMostFrequent(unittest.TestCase): def test_find_most_frequent(self): self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3) self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a') self.assertIsNone(find_most_frequent([])) self.assertEqual(find_most_frequent([5]), 5) if __name__ == '__main__': unittest.main() """ elif "search" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import binary_search class TestBinarySearch(unittest.TestCase): def test_binary_search(self): self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2) self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0) self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4) self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1) self.assertEqual(binary_search([], 5), -1) if __name__ == '__main__': unittest.main() """ elif "sort" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {0} class Test{1}(unittest.TestCase): def test_sorting(self): self.assertEqual({0}([]), []) self.assertEqual({0}([1]), [1]) self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9]) self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9]) self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1]) if __name__ == '__main__': unittest.main() """.format(func_name, func_name.title()) elif "balanced" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import Node, is_balanced class TestIsBalanced(unittest.TestCase): def test_is_balanced(self): # Create a balanced tree # 1 # / \\ # 2 3 # / \\ / \\ # 4 5 6 7 root = Node(1) root.left = Node(2) root.right = Node(3) root.left.left = Node(4) root.left.right = Node(5) root.right.left = Node(6) root.right.right = Node(7) self.assertTrue(is_balanced(root)) # Create an unbalanced tree # 1 # / \\ # 2 3 # / \\ # 4 5 #/ #6 root = Node(1) root.left = Node(2) root.right = 
Node(3) root.left.left = Node(4) root.left.right = Node(5) root.left.left.left = Node(6) self.assertFalse(is_balanced(root)) # Empty tree is balanced self.assertTrue(is_balanced(None)) if __name__ == '__main__': unittest.main() """ elif "lca" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import Node, find_lca class TestFindLCA(unittest.TestCase): def test_find_lca(self): # Create a tree # 1 # / \\ # 2 3 # / \\ / \\ # 4 5 6 7 root = Node(1) root.left = Node(2) root.right = Node(3) root.left.left = Node(4) root.left.right = Node(5) root.right.left = Node(6) root.right.right = Node(7) # Test cases self.assertEqual(find_lca(root, 4, 5).value, 2) # LCA of 4 and 5 is 2 self.assertEqual(find_lca(root, 4, 6).value, 1) # LCA of 4 and 6 is 1 self.assertEqual(find_lca(root, 3, 7).value, 3) # LCA of 3 and 7 is 3 self.assertEqual(find_lca(root, 2, 7).value, 1) # LCA of 2 and 7 is 1 if __name__ == '__main__': unittest.main() """ elif "tree" in func_name.lower(): return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import Node, build_binary_tree class TestBuildBinaryTree(unittest.TestCase): def test_build_binary_tree(self): # Test empty list self.assertIsNone(build_binary_tree([])) # Test single node root = build_binary_tree([1]) self.assertEqual(root.value, 1) self.assertIsNone(root.left) self.assertIsNone(root.right) # Test complete tree # 1 # / \\ # 2 3 # / \\ / \\ # 4 5 6 7 values = [1, 2, 3, 4, 5, 6, 7] root = build_binary_tree(values) self.assertEqual(root.value, 1) self.assertEqual(root.left.value, 2) self.assertEqual(root.right.value, 3) self.assertEqual(root.left.left.value, 4) self.assertEqual(root.left.right.value, 5) self.assertEqual(root.right.left.value, 6) self.assertEqual(root.right.right.value, 7) # Test tree with None values # 1 # / \\ # 2 3 # / / # 4 6 values = [1, 2, 3, 4, None, 6, None] root = build_binary_tree(values) self.assertEqual(root.value, 1) self.assertEqual(root.left.value, 2) self.assertEqual(root.right.value, 3) self.assertEqual(root.left.left.value, 4) self.assertIsNone(root.left.right) self.assertEqual(root.right.left.value, 6) self.assertIsNone(root.right.right) if __name__ == '__main__': unittest.main() """ else: # Generic test template return """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import {0} class Test{1}(unittest.TestCase): def test_{0}(self): # TODO: Add specific test cases for {0} # This is a placeholder test self.assertTrue(True) if __name__ == '__main__': unittest.main() """.format(func_name, func_name.title()) def _load_test_templates(self) -> Dict[str, str]: """ Load test templates for common functions. 
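        Example (illustrative, assuming ``generator`` is a
        BugFixingTaskGenerator instance):

            templates = generator._load_test_templates()
            sorted(templates)   # ['calculate_average', 'calculate_sum']
            "class TestCalculateSum" in templates["calculate_sum"]   # True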
Returns: Dictionary of test templates """ # In a real implementation, these would be loaded from files return { "calculate_sum": """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import calculate_sum class TestCalculateSum(unittest.TestCase): def test_calculate_sum(self): self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) self.assertEqual(calculate_sum([]), 0) self.assertEqual(calculate_sum([-1, -2, -3]), -6) if __name__ == '__main__': unittest.main() """, "calculate_average": """ import unittest import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from solution import calculate_average class TestCalculateAverage(unittest.TestCase): def test_calculate_average(self): self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) self.assertEqual(calculate_average([]), 0) self.assertEqual(calculate_average([10]), 10) if __name__ == '__main__': unittest.main() """ } def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None: """ Insert a bug of the specified category into the problem state. Args: problem_state: The problem state to modify bug_category: The category of bug to insert """ if "code" not in problem_state.code_context: return # Parse the code to find potential bug insertion points code = problem_state.code_context["code"] try: parsed_code = ast.parse(code) except SyntaxError: # If the code already has syntax errors, don't add more bugs return # Insert different types of bugs based on the category if bug_category == BugCategory.SYNTAX: self._insert_syntax_bug(problem_state) elif bug_category == BugCategory.LOGICAL: self._insert_logical_bug(problem_state) elif bug_category == BugCategory.PERFORMANCE: self._insert_performance_bug(problem_state) elif bug_category == BugCategory.EDGE_CASE: self._insert_edge_case_bug(problem_state) else: # Default to logical bug self._insert_logical_bug(problem_state) # Update bug count and categories if "bug_count" not in problem_state.code_context: problem_state.code_context["bug_count"] = 0 problem_state.code_context["bug_count"] += 1 if "bug_categories" not in problem_state.code_context: problem_state.code_context["bug_categories"] = [] if bug_category not in problem_state.code_context["bug_categories"]: problem_state.code_context["bug_categories"].append(bug_category) def _insert_syntax_bug(self, problem_state: ProblemState) -> None: """ Insert a syntax bug into the problem state. 
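        Example (an illustrative sketch; the mutation and line number are chosen
        at random, so the values shown are hypothetical):

            random.seed(0)   # only to make the sketch reproducible
            generator._insert_syntax_bug(problem_state)
            problem_state.code_context["bugs"][-1]
            # e.g. {"type": "syntax", "line": 7,
            #       "description": "Syntax error introduced in line 7"}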
Args: problem_state: The problem state to modify """ code = problem_state.code_context["code"] lines = code.split('\n') if not lines: return # Choose a non-empty line to modify idx = random.randint(0, len(lines) - 1) line = lines[idx] # Skip empty lines or comment lines attempts = 0 while (not line.strip() or line.strip().startswith('#')) and attempts < 10: idx = random.randint(0, len(lines) - 1) line = lines[idx] attempts += 1 if attempts >= 10: # Couldn't find a suitable line, use the first non-empty line for i, line in enumerate(lines): if line.strip() and not line.strip().startswith('#'): idx = i break else: return # No suitable line found # Choose a modification type mod_type = random.choice([ "remove_character", "add_character", "swap_characters", "change_indent" ]) if mod_type == "remove_character" and line: char_idx = random.randint(0, len(line) - 1) lines[idx] = line[:char_idx] + line[char_idx+1:] elif mod_type == "add_character": char_idx = random.randint(0, len(line)) char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) lines[idx] = line[:char_idx] + char + line[char_idx:] elif mod_type == "swap_characters" and len(line) >= 2: char_idx = random.randint(0, len(line) - 2) lines[idx] = (line[:char_idx] + line[char_idx+1] + line[char_idx] + line[char_idx+2:]) elif mod_type == "change_indent": # Either add or remove indentation if line.startswith(" "): lines[idx] = line[2:] # Remove some indent else: lines[idx] = " " + line # Add inconsistent indent # Update the code problem_state.code_context["code"] = '\n'.join(lines) # Add information about the bug if "bugs" not in problem_state.code_context: problem_state.code_context["bugs"] = [] problem_state.code_context["bugs"].append({ "type": BugCategory.SYNTAX, "line": idx + 1, "description": f"Syntax error introduced in line {idx + 1}" }) def _insert_logical_bug(self, problem_state: ProblemState) -> None: """ Insert a logical bug into the problem state. 
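        Example of one possible "change_comparison" mutation (illustrative):

            # before:  if count > max_count:
            # after:   if count < max_count:
            # recorded in code_context["bugs"] as
            # {"type": "logical", "line": <n>,
            #  "description": "Logical error introduced in line <n>"}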
Args: problem_state: The problem state to modify """ code = problem_state.code_context["code"] lines = code.split('\n') if not lines: return # Find all if statements and loops if_statements = [] for i, line in enumerate(lines): if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): if_statements.append((i, line)) # Choose a modification type mod_type = random.choice([ "change_comparison", "invert_condition", "off_by_one", "change_operator", "reverse_logic" ]) if if_statements: # Choose an if statement to modify idx, line = random.choice(if_statements) if mod_type == "change_comparison": # Change comparison operators comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} for op, new_op in comparisons.items(): if op in line: lines[idx] = line.replace(op, new_op, 1) break elif mod_type == "invert_condition": # Add or remove a "not" to invert the condition if "not" in line: lines[idx] = line.replace("not ", "", 1) else: match = re.search(r'(if|while)\s+([^:]+):', line) if match: condition = match.group(2) lines[idx] = line.replace(condition, f"not ({condition})", 1) elif mod_type == "off_by_one": # Introduce an off-by-one error for op in ["+", "-"]: if op in line: # If there's a number after the operator, change it match = re.search(f'\\{op}\\s*(\\d+)', line) if match: num = int(match.group(1)) new_num = num + 1 if op == "+" else max(0, num - 1) lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) break elif mod_type == "change_operator": # Change arithmetic or logical operators operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} for op, new_op in operators.items(): if f" {op} " in line: lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) break elif mod_type == "reverse_logic": # Reverse the logic of a compound condition if " and " in line: parts = line.split(" and ") lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) elif " or " in line: parts = line.split(" or ") lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) else: # If no if statements found, introduce a different kind of logical error # Find variable assignments assignments = [] for i, line in enumerate(lines): if "=" in line and "==" not in line and "!=" not in line: assignments.append((i, line)) if assignments: # Choose an assignment to modify idx, line = random.choice(assignments) # Modify the assignment if "+" in line: lines[idx] = line.replace("+", "-", 1) elif "-" in line: lines[idx] = line.replace("-", "+", 1) elif "*" in line: lines[idx] = line.replace("*", "/", 1) elif "/" in line: lines[idx] = line.replace("/", "*", 1) else: # If no arithmetic operator, change the value match = re.search(r'=\s*(\d+)', line) if match: num = int(match.group(1)) new_num = num + random.choice([-1, 1]) * random.randint(1, 3) lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) # Update the code problem_state.code_context["code"] = '\n'.join(lines) # Add information about the bug if "bugs" not in problem_state.code_context: problem_state.code_context["bugs"] = [] problem_state.code_context["bugs"].append({ "type": BugCategory.LOGICAL, "line": idx + 1, "description": f"Logical error introduced in line {idx + 1}" }) def _insert_performance_bug(self, problem_state: ProblemState) -> None: """ Insert a performance bug into the problem state. 
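        Example of one possible "add_nested_loop" mutation (illustrative):

            # inside an existing loop of the chosen function, the method inserts
            #     for _ in range(100):  # Inefficient nested loop
            #         pass
            # and records a "performance" entry in code_context["bugs"]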
Args: problem_state: The problem state to modify """ code = problem_state.code_context["code"] lines = code.split('\n') if not lines: return # Find functions in the code functions = [] current_func = None func_start = None for i, line in enumerate(lines): if line.strip().startswith("def "): if current_func: functions.append((func_start, i - 1, current_func)) current_func = line.strip()[4:].split("(")[0] func_start = i elif i == len(lines) - 1 and current_func: functions.append((func_start, i, current_func)) if not functions: return # Choose a function to modify start_idx, end_idx, func_name = random.choice(functions) # Choose a modification type mod_type = random.choice([ "add_nested_loop", "inefficient_data_structure", "redundant_computation" ]) if mod_type == "add_nested_loop": # Find indentation of the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip(): indent = len(lines[i]) - len(lines[i].lstrip()) break else: indent = 4 # Find a suitable place to add a nested loop for i in range(start_idx + 1, end_idx + 1): if "for " in lines[i] or "while " in lines[i]: # Add a nested loop after this loop inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 inner_indent_str = ' ' * inner_indent # Add an unnecessary nested loop lines.insert(i + 1, f"{inner_indent_str}for _ in range(100): # Inefficient nested loop") lines.insert(i + 2, f"{inner_indent_str} pass") # Update indices end_idx += 2 break else: # If no loop found, add one at the beginning of the function inner_indent = indent + 4 inner_indent_str = ' ' * inner_indent # Find the first non-docstring line for i in range(start_idx + 1, end_idx + 1): if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): # Add an unnecessary loop lines.insert(i, f"{' ' * indent}for i in range(100): # Inefficient loop") lines.insert(i + 1, f"{inner_indent_str}pass") # Update indices end_idx += 2 break elif mod_type == "inefficient_data_structure": # Find indentation of the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip(): indent = len(lines[i]) - len(lines[i].lstrip()) break else: indent = 4 # Find a suitable place to add inefficient data structure usage for i in range(start_idx + 1, end_idx + 1): if "def " not in lines[i] and lines[i].strip(): # Add inefficient data structure usage after this line indent_str = ' ' * indent # Add inefficient code lines.insert(i + 1, f"{indent_str}# Inefficient data structure usage") lines.insert(i + 2, f"{indent_str}results = []") lines.insert(i + 3, f"{indent_str}for i in range(1000): # Unnecessarily large range") lines.insert(i + 4, f"{indent_str} # Using list instead of set for lookups") lines.insert(i + 5, f"{indent_str} if i % 10 in results: # O(n) lookup instead of O(1)") lines.insert(i + 6, f"{indent_str} results.append(i) # Unnecessary storage") # Update indices end_idx += 6 break elif mod_type == "redundant_computation": # Find indentation of the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip(): indent = len(lines[i]) - len(lines[i].lstrip()) break else: indent = 4 # Find a suitable place to add redundant computation for i in range(start_idx + 1, end_idx + 1): if "for " in lines[i] or "while " in lines[i]: # Add redundant computation inside the loop inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 inner_indent_str = ' ' * inner_indent # Add redundant computation lines.insert(i + 1, f"{inner_indent_str}#
Redundant computation in each iteration") lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0") lines.insert(i + 3, f"{inner_indent_str}for j in range(100): # Unnecessary nested computation") lines.insert(i + 4, f"{inner_indent_str} temp_sum += j") # Update indices end_idx += 4 break # Update the code problem_state.code_context["code"] = '\n'.join(lines) # Add information about the bug if "bugs" not in problem_state.code_context: problem_state.code_context["bugs"] = [] problem_state.code_context["bugs"].append({ "type": BugCategory.PERFORMANCE, "line": start_idx + 1, "description": f"Performance issue introduced in function '{func_name}'" }) def _insert_edge_case_bug(self, problem_state: ProblemState) -> None: """ Insert an edge case bug into the problem state. Args: problem_state: The problem state to modify """ code = problem_state.code_context["code"] lines = code.split('\n') if not lines: return # Find functions in the code functions = [] current_func = None func_start = None for i, line in enumerate(lines): if line.strip().startswith("def "): if current_func: functions.append((func_start, i - 1, current_func)) current_func = line.strip()[4:].split("(")[0] func_start = i elif i == len(lines) - 1 and current_func: functions.append((func_start, i, current_func)) if not functions: return # Choose a function to modify start_idx, end_idx, func_name = random.choice(functions) # Choose a modification type mod_type = random.choice([ "remove_boundary_check", "missing_edge_case", "type_assumption" ]) if mod_type == "remove_boundary_check": # Find boundary checks (if statements with conditions that check boundaries) boundary_checks = [] for i in range(start_idx + 1, end_idx + 1): if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and (("if not " in lines[i]) or ("if len(" in lines[i]) or ("if " in lines[i] and " == 0" in lines[i]) or ("if " in lines[i] and " == []" in lines[i]) or ("if " in lines[i] and " == ''" in lines[i]) or ("if " in lines[i] and " is None" in lines[i]))): boundary_checks.append(i) if boundary_checks: # Choose a boundary check to remove idx = random.choice(boundary_checks) # Comment out the boundary check lines[idx] = f"# {lines[idx]} # Boundary check removed" # Comment out the body of the if statement i = idx + 1 while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())): lines[i] = f"# {lines[i]}" i += 1 else: # If no boundary check found, add code that assumes a non-empty input # Find the first non-docstring line in the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent # Add code that assumes non-empty input lines.insert(i, f"{indent_str}# Missing check for empty input") lines.insert(i + 1, f"{indent_str}first_item = items[0] # Will fail on empty input") # Update indices end_idx += 2 break elif mod_type == "missing_edge_case": # Find a suitable place to insert the bug for i in range(start_idx + 1, end_idx + 1): if ("/" in lines[i] or "if " in lines[i] and "==" in lines[i] or "if " in lines[i] and "!=" in lines[i]): if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]): # Add code that doesn't check for zero division indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent # Extract the denominator match = re.search(r'/\s*(\w+)', lines[i]) if match: denominator = match.group(1) # 
Comment out any existing check j = i - 1 while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent: if f"if {denominator}" in lines[j] and "== 0" in lines[j]: lines[j] = f"# {lines[j]} # Zero division check removed" j -= 1 # Add a comment about the missing check lines.insert(i, f"{indent_str}# Missing check for zero division") # Update indices end_idx += 1 break elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]: # Comment out edge case check lines[i] = f"# {lines[i]} # Edge case check removed" break else: # If no suitable place found, add code that doesn't handle an edge case # Find the first non-docstring line in the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent # Add code that doesn't handle an edge case lines.insert(i, f"{indent_str}# Missing handling for edge cases") lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly") # Update indices end_idx += 2 break elif mod_type == "type_assumption": # Find a suitable place to insert a type assumption bug for i in range(start_idx + 1, end_idx + 1): if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]: # Extract the variable name var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i]) if not var_match: var_match = re.search(r'(\w+)\s*=', lines[i]) if var_match: var_name = var_match.group(1) indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent # Add code that assumes a specific type lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list") lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0: # Will fail if {var_name} doesn't support len()") lines.insert(i + 3, f"{indent_str} first = {var_name}[0] # Will fail if {var_name} is not subscriptable") # Update indices end_idx += 3 break else: # If no suitable place found, add code at the beginning of the function for i in range(start_idx + 1, end_idx + 1): if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): indent = len(lines[i]) - len(lines[i].lstrip()) indent_str = ' ' * indent # Extract parameter name param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx]) param_name = param_match.group(1) if param_match else "input_data" # Add code that assumes a specific type lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type") lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name}) # Will fail if {param_name} can't be converted to string") # Update indices end_idx += 2 break # Update the code problem_state.code_context["code"] = '\n'.join(lines) # Add information about the bug if "bugs" not in problem_state.code_context: problem_state.code_context["bugs"] = [] problem_state.code_context["bugs"].append({ "type": BugCategory.EDGE_CASE, "line": start_idx + 1, "description": f"Edge case bug introduced in function '{func_name}'" }) def _generate_description(self, problem_state: ProblemState) -> str: """ Generate a description for the current problem state. Args: problem_state: The problem state Returns: A descriptive prompt for the problem """ # Base description bug_count = problem_state.code_context.get("bug_count", 0) plural = "bugs" if bug_count != 1 else "bug" base_desc = ( f"Fix the {plural} in the code below. 
" f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix." ) # Add information about bug categories if "bug_categories" in problem_state.code_context: categories = problem_state.code_context["bug_categories"] if categories: category_desc = ", ".join(categories) base_desc += f"\n\nThe code contains the following types of issues: {category_desc}." # Add requirements if problem_state.requirements: base_desc += "\n\nRequirements:" for i, req in enumerate(problem_state.requirements): base_desc += f"\n{i+1}. {req['description']}" # Add difficulty level difficulty_desc = "easy" if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6: difficulty_desc = "moderate" elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8: difficulty_desc = "challenging" elif problem_state.difficulty > 0.8: difficulty_desc = "very challenging" base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." return base_desc # Default implementation of TestRunner for when no custom runner is provided class DefaultTestRunner: """ Default test runner for evaluating solutions. This class runs tests against a solution file and collects the results. """ def run_tests( self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any] ) -> Dict[str, Any]: """ Run tests against a solution file. Args: solution_file: Path to the solution file test_files: List of test file paths code_context: Additional context about the code Returns: Dictionary containing test results """ # Initialize results dictionary results = { "all_passed": True, "passed_tests": 0, "total_tests": 0, "tests": {}, "execution": { "success": True, "error": None, "stdout": "", "stderr": "" }, "execution_time": 0.0 } # Check if solution file exists if not solution_file.exists(): results["execution"]["success"] = False results["execution"]["error"] = f"Solution file not found: {solution_file}" results["all_passed"] = False return results # Try to import the solution module try: start_time = time.time() # Add solution directory to path sys.path.insert(0, str(solution_file.parent)) # Import the solution module spec = importlib.util.spec_from_file_location( "solution", solution_file) solution_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(solution_module) # Remove the solution directory from path sys.path.pop(0) # Record execution time end_time = time.time() results["execution_time"] = end_time - start_time except Exception as e: results["execution"]["success"] = False results["execution"]["error"] = str(e) results["all_passed"] = False return results # Run each test file for test_file in test_files: # Skip if the test file doesn't exist if not test_file.exists(): continue try: # Set up test loading loader = unittest.TestLoader() # Add test directory to path sys.path.insert(0, str(test_file.parent)) # Capture stdout and stderr stdout_buffer = io.StringIO() stderr_buffer = io.StringIO() # Create a test suite from the test file test_suite = loader.discover( str(test_file.parent), pattern=test_file.name ) # Count test cases test_count = 0 for suite in test_suite: for test_case in suite: test_count += test_case.countTestCases() results["total_tests"] += test_count # Run the tests with captured output with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): test_runner = unittest.TextTestRunner(verbosity=2) test_result = test_runner.run(test_suite) # Get the captured output stdout = stdout_buffer.getvalue() stderr = stderr_buffer.getvalue() # Remove the test directory 
from path sys.path.pop(0) # Check if all tests passed if not test_result.wasSuccessful(): results["all_passed"] = False # Count passed tests passed_tests = test_count - len(test_result.failures) - len(test_result.errors) results["passed_tests"] += passed_tests # Store individual test results test_name = test_file.stem results["tests"][test_name] = { "passed": test_result.wasSuccessful(), "failures": len(test_result.failures), "errors": len(test_result.errors), "skipped": len(test_result.skipped), "total": test_count, "passed_count": passed_tests, "stdout": stdout, "stderr": stderr } # Store details for individual test failures for failure in test_result.failures + test_result.errors: test_id = failure[0].id().split('.')[-1] failure_message = failure[1] # Try to extract expected and actual values expected_match = re.search(r'Expected\s*:(.+)', failure_message) actual_match = re.search(r'Actual\s*:(.+)', failure_message) expected = expected_match.group(1).strip() if expected_match else None actual = actual_match.group(1).strip() if actual_match else None if test_id not in results["tests"]: results["tests"][test_id] = {} results["tests"][test_id].update({ "passed": False, "message": failure_message, "expected": expected, "actual": actual }) except Exception as e: # If there's an error in the test file itself results["all_passed"] = False test_name = test_file.stem results["tests"][test_name] = { "passed": False, "error": str(e), "failures": 0, "errors": 1, "skipped": 0, "total": 1, "passed_count": 0 } results["total_tests"] += 1 return results
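

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes this module is importable
# as part of the recursive_swe_bench package; the guard keeps it from running on
# import, and it is not part of the benchmark API itself.
if __name__ == "__main__":
    generator = BugFixingTaskGenerator({"max_iterations": 3})
    task = generator.generate_task(
        difficulty="easy",
        bug_categories=[BugCategory.LOGICAL],
    )
    print(task.state.description)
    print(task.state.code_context["code"])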