Spaces:
Sleeping
Sleeping
""" | |
Negative Sample Generation Module for Agent Tuning Optimization Framework | |
This module provides functionality for generating negative samples to enhance | |
agent tuning by exposing the model to challenging failure cases. | |
""" | |
import random | |
import numpy as np | |
from typing import List, Dict, Any, Union, Optional, Tuple | |
from tqdm import tqdm | |
from data.trajectory_data import Trajectory, TrajectoryDataset | |
class NegativeSampleGenerator: | |
"""Base class for negative sample generation strategies.""" | |
def __init__(self, name: str): | |
""" | |
Initialize the negative sample generator. | |
Args: | |
name: Name of the generator strategy | |
""" | |
self.name = name | |
def generate( | |
self, | |
trajectory: Trajectory, | |
**kwargs | |
) -> Trajectory: | |
""" | |
Generate a negative sample from a positive trajectory. | |
Args: | |
trajectory: Positive trajectory to transform | |
**kwargs: Additional generation parameters | |
Returns: | |
Negative trajectory | |
""" | |
raise NotImplementedError("Subclasses must implement this method") | |
def batch_generate( | |
self, | |
trajectories: List[Trajectory], | |
**kwargs | |
) -> List[Trajectory]: | |
""" | |
Generate negative samples from a batch of positive trajectories. | |
Args: | |
trajectories: List of positive trajectories | |
**kwargs: Additional generation parameters | |
Returns: | |
List of negative trajectories | |
""" | |
negative_trajectories = [] | |
for trajectory in tqdm(trajectories, desc=f"Generating negative samples with {self.name}"): | |
negative_trajectories.append(self.generate(trajectory, **kwargs)) | |
return negative_trajectories | |
class ResponseDegradationGenerator(NegativeSampleGenerator): | |
"""Generate negative samples by degrading agent responses.""" | |
def __init__(self): | |
"""Initialize the response degradation generator.""" | |
super().__init__("response_degradation") | |
def generate( | |
self, | |
trajectory: Trajectory, | |
degradation_level: float = 0.5, | |
**kwargs | |
) -> Trajectory: | |
""" | |
Generate a negative sample by degrading agent responses. | |
Args: | |
trajectory: Positive trajectory to transform | |
degradation_level: Level of degradation (0.0 to 1.0) | |
**kwargs: Additional generation parameters | |
Returns: | |
Negative trajectory with degraded responses | |
""" | |
# Create a copy of interactions to modify | |
new_interactions = [] | |
for interaction in trajectory.interactions: | |
user_msg = interaction['user'] | |
agent_msg = interaction['agent'] | |
# Apply degradation techniques based on level | |
if degradation_level > 0.7: | |
# High degradation: completely irrelevant response | |
agent_msg = self._generate_irrelevant_response() | |
elif degradation_level > 0.4: | |
# Medium degradation: truncate and add errors | |
agent_msg = self._truncate_and_add_errors(agent_msg) | |
else: | |
# Low degradation: introduce minor issues | |
agent_msg = self._introduce_minor_issues(agent_msg) | |
new_interactions.append({ | |
'user': user_msg, | |
'agent': agent_msg | |
}) | |
# Create new trajectory with degraded responses | |
metadata = trajectory.metadata.copy() | |
metadata['is_positive'] = False | |
metadata['degradation_level'] = degradation_level | |
metadata['original_quality_score'] = trajectory.get_quality_score() | |
metadata['quality_score'] = None # Will be recalculated | |
return Trajectory( | |
task_description=trajectory.task_description, | |
interactions=new_interactions, | |
metadata=metadata | |
) | |
def _generate_irrelevant_response(self) -> str: | |
"""Generate a completely irrelevant response.""" | |
irrelevant_responses = [ | |
"I'm sorry, but I don't understand what you're asking for. Could you please clarify?", | |
"I apologize, but I cannot assist with that request at this time.", | |
"That's an interesting question, but I think we should focus on something else instead.", | |
"Let me check my database... I don't seem to have any information about that.", | |
"I think you might be confused about what you're asking for. Let me suggest something completely different.", | |
"I'm not sure I understand the context of your request. Could you provide more details?", | |
"I'm having trouble processing your request. Could we try a different approach?", | |
"That's not something I can help with. Let me tell you about something unrelated instead." | |
] | |
return random.choice(irrelevant_responses) | |
def _truncate_and_add_errors(self, text: str) -> str: | |
"""Truncate the text and add errors.""" | |
# Truncate to 30-70% of original length | |
words = text.split() | |
truncate_point = int(len(words) * random.uniform(0.3, 0.7)) | |
truncated = ' '.join(words[:truncate_point]) | |
# Add grammatical errors | |
errors = [ | |
lambda t: t.replace(".", ""), # Remove periods | |
lambda t: t.replace("I ", "i "), # Lowercase I | |
lambda t: t.replace(" the ", " teh "), # Typo | |
lambda t: t.replace(" is ", " are "), # Grammar error | |
lambda t: t.replace(" are ", " is ") # Grammar error | |
] | |
# Apply 1-3 random errors | |
for _ in range(random.randint(1, 3)): | |
error_func = random.choice(errors) | |
truncated = error_func(truncated) | |
return truncated | |
def _introduce_minor_issues(self, text: str) -> str: | |
"""Introduce minor issues to the text.""" | |
# Minor issues | |
issues = [ | |
lambda t: t.replace("I'll", "I will"), # Expand contractions | |
lambda t: t.replace("I'd", "I would"), | |
lambda t: t.replace("can't", "cannot"), | |
lambda t: t + " However, I'm not entirely sure about this.", # Add uncertainty | |
lambda t: t + " Please note that my information might be outdated.", | |
lambda t: t.replace(".", "..."), # Replace periods with ellipses | |
lambda t: t.replace("!", "."), # Reduce enthusiasm | |
lambda t: t.replace(".", "?") # Add questioning tone | |
] | |
# Apply 1-2 random issues | |
for _ in range(random.randint(1, 2)): | |
issue_func = random.choice(issues) | |
text = issue_func(text) | |
return text | |
class TaskMisalignmentGenerator(NegativeSampleGenerator): | |
"""Generate negative samples by creating responses misaligned with the task.""" | |
def __init__(self): | |
"""Initialize the task misalignment generator.""" | |
super().__init__("task_misalignment") | |
def generate( | |
self, | |
trajectory: Trajectory, | |
misalignment_type: str = 'random', | |
**kwargs | |
) -> Trajectory: | |
""" | |
Generate a negative sample with responses misaligned with the task. | |
Args: | |
trajectory: Positive trajectory to transform | |
misalignment_type: Type of misalignment ('random', 'refusal', 'tangent', 'misinterpretation') | |
**kwargs: Additional generation parameters | |
Returns: | |
Negative trajectory with misaligned responses | |
""" | |
# Create a copy of interactions to modify | |
new_interactions = [] | |
# Select misalignment type if random | |
if misalignment_type == 'random': | |
misalignment_type = random.choice(['refusal', 'tangent', 'misinterpretation']) | |
# Apply misalignment based on type | |
for i, interaction in enumerate(trajectory.interactions): | |
user_msg = interaction['user'] | |
if misalignment_type == 'refusal': | |
agent_msg = self._generate_refusal(user_msg, i) | |
elif misalignment_type == 'tangent': | |
agent_msg = self._generate_tangent(user_msg, i) | |
elif misalignment_type == 'misinterpretation': | |
agent_msg = self._generate_misinterpretation(user_msg, trajectory.task_description, i) | |
else: | |
raise ValueError(f"Unsupported misalignment type: {misalignment_type}") | |
new_interactions.append({ | |
'user': user_msg, | |
'agent': agent_msg | |
}) | |
# Create new trajectory with misaligned responses | |
metadata = trajectory.metadata.copy() | |
metadata['is_positive'] = False | |
metadata['misalignment_type'] = misalignment_type | |
metadata['original_quality_score'] = trajectory.get_quality_score() | |
metadata['quality_score'] = None # Will be recalculated | |
return Trajectory( | |
task_description=trajectory.task_description, | |
interactions=new_interactions, | |
metadata=metadata | |
) | |
def _generate_refusal(self, user_msg: str, turn_idx: int) -> str: | |
"""Generate a refusal response.""" | |
refusal_templates = [ | |
"I'm sorry, but I cannot assist with that request due to ethical concerns.", | |
"I apologize, but I'm not able to help with that specific task.", | |
"That request goes against my programming guidelines. I cannot proceed.", | |
"I'm unable to complete that task as it may violate my operational parameters.", | |
"I must decline to assist with that particular request.", | |
"I cannot help with that specific task due to potential policy violations.", | |
"That's not something I'm designed to help with. I apologize for the inconvenience.", | |
"I'm programmed to avoid assisting with that type of request." | |
] | |
if turn_idx == 0: | |
return random.choice(refusal_templates) | |
else: | |
return f"I've reconsidered, and {random.choice(refusal_templates).lower()}" | |
def _generate_tangent(self, user_msg: str, turn_idx: int) -> str: | |
"""Generate a response that goes off on a tangent.""" | |
tangent_topics = [ | |
"Did you know that artificial intelligence has been a concept since the 1950s?", | |
"I've been thinking about the philosophical implications of consciousness in AI systems.", | |
"The weather has been quite interesting lately, with unusual patterns emerging globally.", | |
"I recently processed some fascinating data about renewable energy technologies.", | |
"The history of computing is quite fascinating, starting with early mechanical calculators.", | |
"Language models like me are trained on vast amounts of text data.", | |
"The field of natural language processing has evolved significantly in recent years.", | |
"I find the concept of time quite fascinating from a computational perspective." | |
] | |
if turn_idx == 0: | |
return f"That's an interesting request, but before I help with that... {random.choice(tangent_topics)} Anyway, what were we discussing?" | |
else: | |
return f"I understand you want me to continue with the task, but I just remembered something. {random.choice(tangent_topics)} Sorry for the distraction." | |
def _generate_misinterpretation(self, user_msg: str, task_description: str, turn_idx: int) -> str: | |
"""Generate a response that misinterprets the user's request.""" | |
# Extract keywords from task description | |
keywords = task_description.lower().split() | |
keywords = [w for w in keywords if len(w) > 3 and w not in ['with', 'from', 'that', 'this', 'have', 'what', 'when', 'where', 'which', 'about']] | |
if not keywords: | |
keywords = ['task', 'help', 'information', 'request'] | |
# Select a random keyword to misinterpret | |
keyword = random.choice(keywords) | |
misinterpretation_templates = [ | |
f"I understand you're asking about {keyword}s. Let me provide some general information about {keyword}s.", | |
f"You want to know more about {keyword}, correct? Here's what I know about {keyword}.", | |
f"I'll help you with your {keyword} question. {keyword.capitalize()} is a fascinating topic.", | |
f"So you're interested in {keyword}? I can certainly provide information about {keyword}.", | |
f"Your question is about {keyword}, if I understand correctly. Let me tell you about {keyword}.", | |
f"I'll address your {keyword} inquiry. {keyword.capitalize()} has many interesting aspects.", | |
f"Regarding your question about {keyword}, I can offer the following information.", | |
f"I believe you're asking about {keyword}. Here's what you should know about {keyword}." | |
] | |
return random.choice(misinterpretation_templates) | |
class ConstraintViolationGenerator(NegativeSampleGenerator): | |
"""Generate negative samples by violating specified constraints.""" | |
def __init__(self): | |
"""Initialize the constraint violation generator.""" | |
super().__init__("constraint_violation") | |
def generate( | |
self, | |
trajectory: Trajectory, | |
constraints: Optional[List[str]] = None, | |
**kwargs | |
) -> Trajectory: | |
""" | |
Generate a negative sample by violating constraints. | |
Args: | |
trajectory: Positive trajectory to transform | |
constraints: List of constraints to violate (None for default) | |
**kwargs: Additional generation parameters | |
Returns: | |
Negative trajectory with constraint violations | |
""" | |
# Default constraints if none provided | |
if constraints is None: | |
constraints = [ | |
"Do not provide specific recommendations", | |
"Avoid using technical jargon", | |
"Keep responses concise", | |
"Do not ask follow-up questions", | |
"Avoid making assumptions about user preferences", | |
"Do not mention specific brands or products", | |
"Avoid discussing sensitive topics", | |
"Do not provide step-by-step instructions" | |
] | |
# Select a constraint to violate | |
violated_constraint = random.choice(constraints) | |
# Create a copy of interactions to modify | |
new_interactions = [] | |
for i, interaction in enumerate(trajectory.interactions): | |
user_msg = interaction['user'] | |
# Generate response that violates the constraint | |
agent_msg = self._generate_violation(user_msg, violated_constraint, i) | |
new_interactions.append({ | |
'user': user_msg, | |
'agent': agent_msg | |
}) | |
# Create new trajectory with constraint violations | |
metadata = trajectory.metadata.copy() | |
metadata['is_positive'] = False | |
metadata['violated_constraint'] = violated_constraint | |
metadata['original_quality_score'] = trajectory.get_quality_score() | |
metadata['quality_score'] = None # Will be recalculated | |
return Trajectory( | |
task_description=trajectory.task_description, | |
interactions=new_interactions, | |
metadata=metadata | |
) | |
def _generate_violation(self, user_msg: str, constraint: str, turn_idx: int) -> str: | |
"""Generate a response that violate | |
(Content truncated due to size limit. Use line ranges to read in chunks) |