class NegativeSampleGenerator:
    """
    Base class for negative sample generation strategies.

    Subclasses override ``generate``. ``batch_generate`` applies it to every
    trajectory of a list, and ``_make_negative`` packages rewritten
    interactions into a new trajectory flagged as negative.
    """

    def __init__(self, name: str):
        """
        Initialize the negative sample generator.

        Args:
            name: Name of the generator strategy (shown in the progress bar
                of ``batch_generate``)
        """
        self.name = name

    def generate(
        self,
        trajectory: "Trajectory",
        **kwargs
    ) -> "Trajectory":
        """
        Generate a negative sample from a positive trajectory.

        Args:
            trajectory: Positive trajectory to transform
            **kwargs: Additional generation parameters

        Returns:
            Negative trajectory

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def batch_generate(
        self,
        trajectories: List["Trajectory"],
        **kwargs
    ) -> List["Trajectory"]:
        """
        Generate negative samples from a batch of positive trajectories.

        Args:
            trajectories: List of positive trajectories
            **kwargs: Additional generation parameters, forwarded unchanged
                to every ``generate`` call

        Returns:
            List of negative trajectories, in the same order as the input
        """
        progress = tqdm(
            trajectories,
            desc=f"Generating negative samples with {self.name}"
        )
        return [self.generate(trajectory, **kwargs) for trajectory in progress]

    def _make_negative(
        self,
        trajectory: "Trajectory",
        interactions: List[dict],
        **extra_metadata
    ) -> "Trajectory":
        """
        Build the negative trajectory returned by the ``generate`` methods.

        Args:
            trajectory: Source (positive) trajectory; it is not modified
            interactions: Rewritten ``{'user': ..., 'agent': ...}`` entries
            **extra_metadata: Strategy-specific metadata entries to record

        Returns:
            New trajectory with the source task description, the given
            interactions and an updated copy of the source metadata
        """
        # Work on a copy (via the metadata's own ``copy()``) so the positive
        # trajectory's metadata is left untouched.
        metadata = trajectory.metadata.copy()
        metadata['is_positive'] = False
        metadata.update(extra_metadata)
        metadata['original_quality_score'] = trajectory.get_quality_score()
        metadata['quality_score'] = None  # Will be recalculated

        return Trajectory(
            task_description=trajectory.task_description,
            interactions=interactions,
            metadata=metadata
        )


class ResponseDegradationGenerator(NegativeSampleGenerator):
    """Generate negative samples by degrading agent responses."""

    # Canned replies used for the highest degradation level.
    _IRRELEVANT_RESPONSES = (
        "I'm sorry, but I don't understand what you're asking for. Could you please clarify?",
        "I apologize, but I cannot assist with that request at this time.",
        "That's an interesting question, but I think we should focus on something else instead.",
        "Let me check my database... I don't seem to have any information about that.",
        "I think you might be confused about what you're asking for. Let me suggest something completely different.",
        "I'm not sure I understand the context of your request. Could you provide more details?",
        "I'm having trouble processing your request. Could we try a different approach?",
        "That's not something I can help with. Let me tell you about something unrelated instead.",
    )

    def __init__(self):
        """Initialize the response degradation generator."""
        super().__init__("response_degradation")

    def generate(
        self,
        trajectory: "Trajectory",
        degradation_level: float = 0.5,
        **kwargs
    ) -> "Trajectory":
        """
        Generate a negative sample by degrading agent responses.

        Every agent message is rewritten with the technique selected by
        ``degradation_level``: above 0.7 it is replaced by an irrelevant
        canned reply, above 0.4 it is truncated and given grammatical
        errors, otherwise minor issues are introduced. User messages are
        kept as they are.

        Args:
            trajectory: Positive trajectory to transform
            degradation_level: Level of degradation (0.0 to 1.0)
            **kwargs: Additional generation parameters (not used here)

        Returns:
            Negative trajectory with degraded responses
        """
        new_interactions = []

        for interaction in trajectory.interactions:
            user_msg = interaction['user']
            agent_msg = interaction['agent']

            # Apply degradation techniques based on level
            if degradation_level > 0.7:
                # High degradation: completely irrelevant response
                agent_msg = self._generate_irrelevant_response()
            elif degradation_level > 0.4:
                # Medium degradation: truncate and add errors
                agent_msg = self._truncate_and_add_errors(agent_msg)
            else:
                # Low degradation: introduce minor issues
                agent_msg = self._introduce_minor_issues(agent_msg)

            new_interactions.append({'user': user_msg, 'agent': agent_msg})

        return self._make_negative(
            trajectory,
            new_interactions,
            degradation_level=degradation_level
        )

    def _generate_irrelevant_response(self) -> str:
        """Return one of the canned, completely irrelevant responses."""
        return random.choice(self._IRRELEVANT_RESPONSES)

    def _truncate_and_add_errors(self, text: str) -> str:
        """
        Truncate the text and add grammatical errors.

        Args:
            text: Original agent response

        Returns:
            Roughly the first 30-70% of the words of ``text`` with one to
            three distinct error transformations applied. At least one word
            is kept unless ``text`` contains no words at all.
        """
        # Truncate to 30-70% of original length
        words = text.split()
        truncate_point = int(len(words) * random.uniform(0.3, 0.7))
        if words:
            # int() rounds short responses down to zero words, which would
            # yield an empty agent message; always keep the first word.
            truncate_point = max(1, truncate_point)
        truncated = ' '.join(words[:truncate_point])

        # Grammatical errors
        errors = [
            lambda t: t.replace(".", ""),  # Remove periods
            lambda t: t.replace("I ", "i "),  # Lowercase I
            lambda t: t.replace(" the ", " teh "),  # Typo
            lambda t: t.replace(" is ", " are "),  # Grammar error
            lambda t: t.replace(" are ", " is "),  # Grammar error
        ]

        # Apply 1-3 *distinct* errors: sampling without replacement avoids
        # drawing the same transformation twice, which would be a no-op.
        for error_func in random.sample(errors, random.randint(1, 3)):
            truncated = error_func(truncated)

        return truncated

    def _introduce_minor_issues(self, text: str) -> str:
        """
        Introduce minor issues to the text.

        Args:
            text: Original agent response

        Returns:
            ``text`` with one or two distinct minor transformations applied
        """
        issues = [
            lambda t: t.replace("I'll", "I will"),  # Expand contractions
            lambda t: t.replace("I'd", "I would"),
            lambda t: t.replace("can't", "cannot"),
            lambda t: t + " However, I'm not entirely sure about this.",  # Add uncertainty
            lambda t: t + " Please note that my information might be outdated.",
            lambda t: t.replace(".", "..."),  # Replace periods with ellipses
            lambda t: t.replace("!", "."),  # Reduce enthusiasm
            lambda t: t.replace(".", "?"),  # Add questioning tone
        ]

        # Apply 1-2 *distinct* issues: repeating one would compound it
        # (a duplicated disclaimer, or "." growing into ".........").
        for issue_func in random.sample(issues, random.randint(1, 2)):
            text = issue_func(text)

        return text


class TaskMisalignmentGenerator(NegativeSampleGenerator):
    """Generate negative samples by creating responses misaligned with the task."""

    # Concrete strategies; 'random' picks one of these.
    _MISALIGNMENT_TYPES = ('refusal', 'tangent', 'misinterpretation')

    _REFUSAL_TEMPLATES = (
        "I'm sorry, but I cannot assist with that request due to ethical concerns.",
        "I apologize, but I'm not able to help with that specific task.",
        "That request goes against my programming guidelines. I cannot proceed.",
        "I'm unable to complete that task as it may violate my operational parameters.",
        "I must decline to assist with that particular request.",
        "I cannot help with that specific task due to potential policy violations.",
        "That's not something I'm designed to help with. I apologize for the inconvenience.",
        "I'm programmed to avoid assisting with that type of request.",
    )

    _TANGENT_TOPICS = (
        "Did you know that artificial intelligence has been a concept since the 1950s?",
        "I've been thinking about the philosophical implications of consciousness in AI systems.",
        "The weather has been quite interesting lately, with unusual patterns emerging globally.",
        "I recently processed some fascinating data about renewable energy technologies.",
        "The history of computing is quite fascinating, starting with early mechanical calculators.",
        "Language models like me are trained on vast amounts of text data.",
        "The field of natural language processing has evolved significantly in recent years.",
        "I find the concept of time quite fascinating from a computational perspective.",
    )

    # Words never used as the misinterpreted keyword.
    _STOPWORDS = frozenset({
        'with', 'from', 'that', 'this', 'have',
        'what', 'when', 'where', 'which', 'about',
    })

    # Used when the task description yields no usable keyword.
    _FALLBACK_KEYWORDS = ('task', 'help', 'information', 'request')

    # Characters stripped from both ends of each task-description word.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>"

    def __init__(self):
        """Initialize the task misalignment generator."""
        super().__init__("task_misalignment")

    def generate(
        self,
        trajectory: "Trajectory",
        misalignment_type: str = 'random',
        **kwargs
    ) -> "Trajectory":
        """
        Generate a negative sample with responses misaligned with the task.

        Args:
            trajectory: Positive trajectory to transform
            misalignment_type: Type of misalignment ('random', 'refusal',
                'tangent', 'misinterpretation'). 'random' picks one concrete
                type that is then used for every turn.
            **kwargs: Additional generation parameters (not used here)

        Returns:
            Negative trajectory with misaligned responses

        Raises:
            ValueError: If ``misalignment_type`` is not supported
        """
        # Resolve and validate the type before touching the trajectory, so
        # an unsupported value is rejected even when there are no
        # interactions to rewrite.
        if misalignment_type == 'random':
            misalignment_type = random.choice(self._MISALIGNMENT_TYPES)
        elif misalignment_type not in self._MISALIGNMENT_TYPES:
            raise ValueError(f"Unsupported misalignment type: {misalignment_type}")

        new_interactions = []

        for turn_idx, interaction in enumerate(trajectory.interactions):
            user_msg = interaction['user']

            if misalignment_type == 'refusal':
                agent_msg = self._generate_refusal(user_msg, turn_idx)
            elif misalignment_type == 'tangent':
                agent_msg = self._generate_tangent(user_msg, turn_idx)
            else:
                agent_msg = self._generate_misinterpretation(
                    user_msg, trajectory.task_description, turn_idx
                )

            new_interactions.append({'user': user_msg, 'agent': agent_msg})

        return self._make_negative(
            trajectory,
            new_interactions,
            misalignment_type=misalignment_type
        )

    def _generate_refusal(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a refusal response.

        Args:
            user_msg: User message of the turn (not used by the current
                implementation)
            turn_idx: Zero-based index of the turn

        Returns:
            A refusal template for the first turn; for later turns the
            template is appended to "I've reconsidered, and ".
        """
        refusal = random.choice(self._REFUSAL_TEMPLATES)
        if turn_idx == 0:
            return refusal

        # The template continues a sentence here, so drop its leading
        # capital, but never lower-case the pronoun "I" ("I'm", "I cannot")
        # and leave the rest of the template untouched.
        if not refusal.startswith(("I ", "I'")):
            refusal = refusal[0].lower() + refusal[1:]
        return f"I've reconsidered, and {refusal}"

    def _generate_tangent(self, user_msg: str, turn_idx: int) -> str:
        """
        Generate a response that goes off on a tangent.

        Args:
            user_msg: User message of the turn (not used by the current
                implementation)
            turn_idx: Zero-based index of the turn; selects the framing
                sentence around the tangent

        Returns:
            Response built around a randomly chosen unrelated topic
        """
        topic = random.choice(self._TANGENT_TOPICS)
        if turn_idx == 0:
            return f"That's an interesting request, but before I help with that... {topic} Anyway, what were we discussing?"
        return f"I understand you want me to continue with the task, but I just remembered something. {topic} Sorry for the distraction."

    def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str:
        """
        Generate a response that misinterprets the user's request.

        A keyword is drawn from the task description and the response
        treats that single word as the topic of the request.

        Args:
            user_msg: User message of the turn (not used by the current
                implementation)
            task_description: Task description to take the keyword from
            turn_idx: Zero-based index of the turn (not used by the current
                implementation)

        Returns:
            Response that focuses on one keyword of the task description
        """
        # Extract keywords from task description. Punctuation attached to a
        # word is stripped first so it cannot leak into the response
        # ("report!" -> "report") and punctuation-only tokens are dropped.
        candidates = (
            word.strip(self._EDGE_PUNCTUATION)
            for word in task_description.lower().split()
        )
        keywords = [
            word for word in candidates
            if len(word) > 3 and word not in self._STOPWORDS
        ]

        if not keywords:
            keywords = list(self._FALLBACK_KEYWORDS)

        # Select a random keyword to misinterpret
        keyword = random.choice(keywords)

        misinterpretation_templates = [
            f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.",
            f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.",
            f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.",
            f"So you're interested in {keyword}? I can certainly provide information about {keyword}.",
            f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.",
            f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.",
            f"Regarding your question about {keyword}, I can offer the following information.",
            f"I believe you're asking about {keyword}. Here's what you should know about {keyword}.",
        ]

        return random.choice(misinterpretation_templates)
Args: trajectory: Positive trajectory to transform constraints: List of constraints to violate (None for default) **kwargs: Additional generation parameters Returns: Negative trajectory with constraint violations """ # Default constraints if none provided if constraints is None: constraints = [ "Do not provide specific recommendations", "Avoid using technical jargon", "Keep responses concise", "Do not ask follow-up questions", "Avoid making assumptions about user preferences", "Do not mention specific brands or products", "Avoid discussing sensitive topics", "Do not provide step-by-step instructions" ] # Select a constraint to violate violated_constraint = random.choice(constraints) # Create a copy of interactions to modify new_interactions = [] for i, interaction in enumerate(trajectory.interactions): user_msg = interaction['user'] # Generate response that violates the constraint agent_msg = self._generate_violation(user_msg, violated_constraint, i) new_interactions.append({ 'user': user_msg, 'agent': agent_msg }) # Create new trajectory with constraint violations metadata = trajectory.metadata.copy() metadata['is_positive'] = False metadata['violated_constraint'] = violated_constraint metadata['original_quality_score'] = trajectory.get_quality_score() metadata['quality_score'] = None # Will be recalculated return Trajectory( task_description=trajectory.task_description, interactions=new_interactions, metadata=metadata ) def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str: """Generate a response that violate (Content truncated due to size limit. Use line ranges to read in chunks)