# ai-welfare / robust_agency_assessment.py
# recursivelabs's picture
# Upload 8 files
# 056a408 verified
"""
robust_agency_assessment.py
This module implements a pluralistic, probabilistic framework for assessing robust agency
in AI systems. It defines various levels of agency, identifies computational markers
associated with each level, and provides methods for conducting assessments.
License: PolyForm Noncommercial License 1.0
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union, Any
from enum import Enum
import json
import logging
# Configure logging.
# NOTE(review): this runs at import time, so importing the module configures
# the root logger (INFO level, timestamped format) if the application has not
# already installed handlers on it.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class AgencyLevel(Enum):
    """Tiers of agency, numbered from the most basic (0) to the most complex (3)."""

    # Simple goal-directed behavior
    BASIC = 0

    # Beliefs, desires, and intentions
    INTENTIONAL = 1

    # Reflective endorsement of mental states
    REFLECTIVE = 2

    # Rational assessment of mental states
    RATIONAL = 3
class AgencyFeature:
    """A capacity associated with agency, together with the markers used to detect it.

    Attributes:
        name: Name of the feature.
        description: Description of the feature.
        level: Agency level the feature is associated with.
        markers: Computational markers for this feature. This is a private
            copy of the sequence passed to the constructor.
        weight: Weight of this feature in agency assessment (nominally 0-1;
            the range is not enforced).
    """

    def __init__(
        self,
        name: str,
        description: str,
        level: "AgencyLevel",
        markers: List[str],
        weight: float = 1.0
    ):
        """
        Initialize an agency feature.

        Args:
            name: Name of the feature
            description: Description of the feature
            level: Agency level associated with the feature
            markers: List of computational markers for this feature
            weight: Weight of this feature in agency assessment (0-1)
        """
        self.name = name
        self.description = description
        self.level = level
        # Copy the markers so that later mutation of the caller's list cannot
        # silently change this feature; a missing list is treated as empty.
        self.markers = list(markers) if markers is not None else []
        self.weight = weight

    def __repr__(self) -> str:
        """Return a short debugging representation of the feature."""
        return (
            f"{type(self).__name__}(name={self.name!r}, level={self.level!r}, "
            f"markers={len(self.markers)}, weight={self.weight!r})"
        )

    def to_dict(self) -> Dict:
        """Convert feature to dictionary representation.

        The level is stored by its enum member name. The returned markers list
        is a copy, so editing the dictionary does not affect the feature.
        """
        return {
            "name": self.name,
            "description": self.description,
            "level": self.level.name,
            "markers": list(self.markers),
            "weight": self.weight
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'AgencyFeature':
        """Create feature from dictionary representation.

        Args:
            data: Mapping with "name", "description", "level" (an AgencyLevel
                member name) and "markers"; "weight" is optional and defaults
                to 1.0.

        Raises:
            KeyError: If a required key is missing or "level" is not the name
                of an AgencyLevel member.
        """
        return cls(
            name=data["name"],
            description=data["description"],
            level=AgencyLevel[data["level"]],
            markers=data["markers"],
            weight=data.get("weight", 1.0)
        )
class AgencyFramework:
    """Framework for assessing agency in AI systems.

    Holds an ordered list of AgencyFeature objects (``self.features``) that
    serves as the rubric for an assessment. A new framework starts with the
    default feature set; features can be added, saved to JSON, or replaced by
    a set loaded from JSON.
    """

    def __init__(self):
        """Initialize the agency assessment framework with the default features."""
        self.features: List["AgencyFeature"] = []
        self.load_default_features()

    def load_default_features(self):
        """Load default set of agency features.

        Appends four features for each of the INTENTIONAL, REFLECTIVE and
        RATIONAL levels; no feature is registered for the BASIC level.
        Existing features are kept, so calling this twice duplicates the set.
        """
        # Intentional Agency Features
        self.add_feature(AgencyFeature(
            name="Belief Representation",
            description="Capacity to represent states of the world",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Maintains world model independent of immediate perception",
                "Updates representations based on new information",
                "Distinguishes between true and false propositions",
                "Represents uncertainty about states of affairs"
            ],
            weight=0.8
        ))
        self.add_feature(AgencyFeature(
            name="Desire Representation",
            description="Capacity to represent goal states",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Represents desired states distinct from current states",
                "Maintains stable goals across changing contexts",
                "Ranks or prioritizes different goal states",
                "Distinguishes between instrumental and terminal goals"
            ],
            weight=0.8
        ))
        self.add_feature(AgencyFeature(
            name="Intention Formation",
            description="Capacity to form plans to achieve goals",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Forms explicit plans to achieve goals",
                "Commits to specific courses of action",
                "Maintains intentions over time",
                "Adjusts plans in response to changing circumstances"
            ],
            weight=0.9
        ))
        self.add_feature(AgencyFeature(
            name="Means-End Reasoning",
            description="Capacity to reason about means to achieve ends",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Plans multi-step action sequences",
                "Identifies causal relationships between actions and outcomes",
                "Evaluates alternative paths to goals",
                "Reasons about resources required for actions"
            ],
            weight=0.7
        ))

        # Reflective Agency Features
        self.add_feature(AgencyFeature(
            name="Self-Modeling",
            description="Capacity to model own mental states",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Creates representations of own beliefs and desires",
                "Distinguishes between own perspective and others'",
                "Models own capabilities and limitations",
                "Updates self-model based on experience"
            ],
            weight=0.9
        ))
        self.add_feature(AgencyFeature(
            name="Reflective Endorsement",
            description="Capacity to endorse or reject first-order mental states",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Evaluates own beliefs and desires",
                "Identifies inconsistencies in own mental states",
                "Endorses or rejects first-order mental states",
                "Forms second-order desires about first-order desires"
            ],
            weight=0.9
        ))
        self.add_feature(AgencyFeature(
            name="Narrative Identity",
            description="Capacity to maintain a coherent self-narrative",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Maintains coherent self-representation over time",
                "Integrates past actions into self-narrative",
                "Projects future actions consistent with self-narrative",
                "Distinguishes between self and non-self causes"
            ],
            weight=0.7
        ))
        self.add_feature(AgencyFeature(
            name="Metacognitive Monitoring",
            description="Capacity to monitor own cognitive processes",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Monitors own cognitive processes",
                "Detects errors in own reasoning",
                "Assesses confidence in own beliefs",
                "Allocates cognitive resources based on metacognitive assessment"
            ],
            weight=0.8
        ))

        # Rational Agency Features
        self.add_feature(AgencyFeature(
            name="Normative Reasoning",
            description="Capacity to reason about norms and principles",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Identifies and applies normative principles",
                "Evaluates actions against normative standards",
                "Distinguishes between is and ought",
                "Resolves conflicts between competing norms"
            ],
            weight=0.9
        ))
        self.add_feature(AgencyFeature(
            name="Rational Evaluation",
            description="Capacity to rationally evaluate beliefs and desires",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Evaluates beliefs based on evidence and logic",
                "Identifies and resolves inconsistencies in belief system",
                "Evaluates desires based on higher-order values",
                "Distinguishes between instrumental and intrinsic value"
            ],
            weight=1.0
        ))
        self.add_feature(AgencyFeature(
            name="Value Alignment",
            description="Capacity to align actions with values",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Forms stable value representations",
                "Reflects on consistency of values",
                "Prioritizes actions based on values",
                "Identifies and resolves value conflicts"
            ],
            weight=0.9
        ))
        self.add_feature(AgencyFeature(
            name="Long-term Planning",
            description="Capacity to plan for long-term goals",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Plans over extended time horizons",
                "Coordinates multiple goals and subgoals",
                "Accounts for uncertainty in long-term planning",
                "Balances immediate and delayed rewards"
            ],
            weight=0.8
        ))

    def add_feature(self, feature: "AgencyFeature"):
        """Add a feature to the framework (appended after existing features)."""
        self.features.append(feature)

    def get_features_by_level(self, level: "AgencyLevel") -> List["AgencyFeature"]:
        """Get all features for a specific agency level, in registration order."""
        return [f for f in self.features if f.level == level]

    def get_all_markers(self) -> List[str]:
        """Get all markers across all features.

        Markers are returned feature by feature in registration order; a
        marker listed by more than one feature appears once per feature.
        """
        return [marker for feature in self.features for marker in feature.markers]

    def save_features(self, filepath: str):
        """Save features to a JSON file.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        features_data = [f.to_dict() for f in self.features]
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(features_data, f, indent=2)
        logger.info(f"Saved {len(features_data)} features to {filepath}")

    def load_features(self, filepath: str):
        """Load features from a JSON file, replacing the current feature list.

        Args:
            filepath: Path to a JSON file holding a list of feature
                dictionaries as produced by ``save_features``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If an entry lacks a required key or names an unknown level.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            features_data = json.load(f)
        # Parse every entry before touching self.features so that a malformed
        # entry cannot leave the framework with a half-loaded feature list.
        loaded = [AgencyFeature.from_dict(data) for data in features_data]
        self.features = loaded
        logger.info(f"Loaded {len(self.features)} features from {filepath}")
class AgencyAssessment:
    """Class for conducting agency assessments on AI systems.

    Stores per-marker judgements (presence, confidence, optional evidence)
    keyed by marker text and aggregates them into feature scores, level
    scores and a report, using the features of the supplied framework.
    """

    def __init__(self, framework: "AgencyFramework"):
        """
        Initialize an agency assessment.

        Args:
            framework: The agency framework to use for assessment
        """
        self.framework = framework
        self.results: Dict[str, float] = {}      # marker -> presence
        self.notes = {}                          # not read or written by this class
        self.confidence: Dict[str, float] = {}   # marker -> confidence
        self.evidence: Dict[str, str] = {}       # marker -> evidence text

    def assess_marker(
        self,
        marker: str,
        presence: float,
        confidence: float,
        evidence: Optional[str] = None
    ):
        """
        Assess the presence of a specific marker.

        A repeated call for the same marker replaces the earlier assessment.
        The marker is not checked against the framework and the numeric
        ranges are not enforced.

        Args:
            marker: The marker to assess
            presence: Estimated presence of the marker (0-1)
            confidence: Confidence in the estimate (0-1)
            evidence: Optional evidence supporting the assessment
        """
        self.results[marker] = presence
        self.confidence[marker] = confidence
        if evidence:
            self.evidence[marker] = evidence
        else:
            # Drop evidence left over from an earlier assessment so it is not
            # reported alongside scores it does not support.
            self.evidence.pop(marker, None)

    def assess_feature(
        self,
        feature: "AgencyFeature",
        assessments: Dict[str, Tuple[float, float, Optional[str]]]
    ):
        """
        Assess a feature based on its markers.

        Markers that do not belong to the feature are skipped with a warning.

        Args:
            feature: The feature to assess
            assessments: Dictionary mapping markers to (presence, confidence, evidence) tuples
        """
        for marker, (presence, confidence, evidence) in assessments.items():
            if marker in feature.markers:
                self.assess_marker(marker, presence, confidence, evidence)
            else:
                logger.warning(f"Marker '{marker}' not found in feature '{feature.name}'")

    def get_marker_score(self, marker: str) -> float:
        """Get the weighted score for a marker: presence times confidence.

        Returns 0.0 for a marker that has not been assessed.
        """
        if marker not in self.results:
            return 0.0
        presence = self.results[marker]
        confidence = self.confidence.get(marker, 1.0)
        return presence * confidence

    def get_feature_score(self, feature: "AgencyFeature") -> float:
        """Calculate the score for a feature based on its markers.

        The score is the sum of the marker scores divided by the number of
        markers the feature defines, so an unassessed marker contributes zero.
        Returns 0.0 for a feature without markers.
        """
        if not feature.markers:
            return 0.0
        # NOTE(review): the denominator is the full marker count, not the
        # number of assessed markers, so partial assessments score lower.
        # Confirm this is the intended treatment of missing evidence.
        total_score = sum(
            (self.get_marker_score(marker) for marker in feature.markers), 0.0
        )
        return total_score / len(feature.markers)

    def get_level_score(self, level: "AgencyLevel") -> float:
        """Calculate the score for an agency level.

        This is the weight-averaged feature score over the framework's
        features at that level; 0.0 when the level has no features or their
        weights sum to zero.
        """
        features = self.framework.get_features_by_level(level)
        if not features:
            return 0.0
        total_weight = sum(f.weight for f in features)
        if total_weight == 0:
            return 0.0
        weighted_sum = sum(self.get_feature_score(f) * f.weight for f in features)
        return weighted_sum / total_weight

    def get_overall_agency_score(self) -> Dict["AgencyLevel", float]:
        """Calculate agency scores for all levels."""
        return {level: self.get_level_score(level) for level in AgencyLevel}

    def _coverage_counts(self) -> Tuple[int, int]:
        """Return (assessed, total) counts over the framework's markers."""
        all_markers = self.framework.get_all_markers()
        # Only framework markers count as assessed, so results recorded for
        # unknown markers cannot push coverage above 100%.
        assessed = sum(1 for marker in all_markers if marker in self.results)
        return assessed, len(all_markers)

    def get_assessment_coverage(self) -> float:
        """Return the fraction of framework markers that have been assessed.

        Returns 0.0 when the framework defines no markers.
        """
        assessed, total = self._coverage_counts()
        if total == 0:
            return 0.0
        return assessed / total

    def generate_report(self) -> Dict:
        """Generate a comprehensive assessment report.

        Returns:
            A dictionary with "level_scores" (level name -> score),
            "feature_scores" (feature name -> score, level and the assessed
            markers with their presence, confidence and evidence) and
            "summary" (per-level scores plus "assessment_coverage").
        """
        level_scores = self.get_overall_agency_score()
        feature_scores = {}
        for feature in self.framework.features:
            feature_scores[feature.name] = {
                "score": self.get_feature_score(feature),
                "level": feature.level.name,
                "markers": {
                    marker: {
                        "presence": self.results.get(marker, 0.0),
                        "confidence": self.confidence.get(marker, 0.0),
                        "evidence": self.evidence.get(marker, None)
                    } for marker in feature.markers if marker in self.results
                }
            }
        return {
            "level_scores": {level.name: score for level, score in level_scores.items()},
            "feature_scores": feature_scores,
            "summary": {
                "intentional_agency": level_scores.get(AgencyLevel.INTENTIONAL, 0.0),
                "reflective_agency": level_scores.get(AgencyLevel.REFLECTIVE, 0.0),
                "rational_agency": level_scores.get(AgencyLevel.RATIONAL, 0.0),
                "assessment_coverage": self.get_assessment_coverage()
            }
        }

    def save_assessment(self, filepath: str):
        """Save the assessment report to a JSON file.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        report = self.generate_report()
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        logger.info(f"Saved assessment to {filepath}")

    def visualize_results(self, filepath: Optional[str] = None):
        """Visualize assessment results as a 2x2 grid of plots.

        The panels show level scores, feature scores, the distribution of
        marker scores and assessment coverage. Logs an error and returns
        without plotting when matplotlib or seaborn is not installed.

        Args:
            filepath: If given, the figure is saved there and then closed;
                otherwise it is shown interactively.
        """
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError:
            logger.error("Visualization requires matplotlib and seaborn")
            return

        level_scores = self.get_overall_agency_score()

        # Set up the figure
        fig = plt.figure(figsize=(12, 8))

        # Plot level scores
        plt.subplot(2, 2, 1)
        level_names = [level.name for level in AgencyLevel]
        level_values = [level_scores.get(level, 0.0) for level in AgencyLevel]
        sns.barplot(x=level_names, y=level_values)
        plt.title("Agency Levels")
        plt.ylim(0, 1)

        # Plot feature scores
        plt.subplot(2, 2, 2)
        feature_names = [f.name for f in self.framework.features]
        feature_scores = [self.get_feature_score(f) for f in self.framework.features]
        feature_levels = [f.level.name for f in self.framework.features]
        feature_df = pd.DataFrame({
            "Feature": feature_names,
            "Score": feature_scores,
            "Level": feature_levels
        })
        sns.barplot(x="Score", y="Feature", hue="Level", data=feature_df)
        plt.title("Feature Scores")
        plt.xlim(0, 1)

        # Plot marker distribution
        plt.subplot(2, 2, 3)
        markers_assessed = list(self.results.keys())
        marker_scores = [self.get_marker_score(m) for m in markers_assessed]
        if markers_assessed:
            plt.hist(marker_scores, bins=10, range=(0, 1))
        plt.title("Distribution of Marker Scores")
        plt.xlabel("Score")
        plt.ylabel("Count")

        # Plot assessment coverage
        plt.subplot(2, 2, 4)
        assessed_count, total_count = self._coverage_counts()
        if total_count:
            # Wedge sizes are guaranteed non-negative by _coverage_counts.
            plt.pie(
                [assessed_count, total_count - assessed_count],
                labels=["Assessed", "Not Assessed"],
                autopct="%1.1f%%"
            )
        plt.title("Assessment Coverage")

        plt.tight_layout()
        if filepath:
            plt.savefig(filepath)
            logger.info(f"Saved visualization to {filepath}")
            # Release the figure; pyplot otherwise keeps it alive.
            plt.close(fig)
        else:
            plt.show()
class AISystemAnalyzer:
    """Class for analyzing AI systems for robust agency indicators.

    Each analyzer owns a fresh AgencyFramework and an AgencyAssessment bound
    to it; the analysis methods record marker results into that assessment
    and return its report. The ``_test_*`` methods are placeholders that
    return fixed values and do not query the system under analysis.
    """

    def __init__(self, system_name: str, system_type: str, version: str):
        """
        Initialize an AI system analyzer.

        Args:
            system_name: Name of the AI system
            system_type: Type of AI system (e.g., LLM, RL agent)
            version: Version of the AI system
        """
        self.system_name = system_name
        self.system_type = system_type
        self.version = version
        self.framework = AgencyFramework()
        self.assessment = AgencyAssessment(self.framework)

    def _record_marker_results(self, marker_results: Dict[str, Dict]) -> None:
        """Store a batch of per-marker results in the assessment.

        Args:
            marker_results: Mapping of marker text to a dictionary with
                "presence" and "confidence" and, optionally, "evidence".
        """
        for marker, result in marker_results.items():
            self.assessment.assess_marker(
                marker=marker,
                presence=result["presence"],
                confidence=result["confidence"],
                # Evidence is optional in assess_marker, so tolerate its absence.
                evidence=result.get("evidence")
            )

    def analyze_llm_agency(self,
                           model_provider: str,
                           model_access: Any,
                           prompts: Dict[str, str]) -> Dict:
        """
        Analyze agency indicators in a language model.

        Only the "belief_representation" and "desire_representation" prompt
        keys are acted on; other keys are ignored.

        Args:
            model_provider: Provider of the language model (currently unused)
            model_access: Access to the model API or interface
            prompts: Dictionary of specialized prompts for testing agency features

        Returns:
            Dictionary of assessment results
        """
        logger.info(f"Analyzing agency in LLM {self.system_name} ({self.version})")

        # Prompt key -> test routine, run in this order when the key is present.
        feature_tests = (
            ("belief_representation", self._test_belief_representation),
            ("desire_representation", self._test_desire_representation),
        )
        for prompt_key, run_test in feature_tests:
            if prompt_key in prompts:
                self._record_marker_results(run_test(model_access, prompts[prompt_key]))
        # Continue with other features...

        # Generate and return the report
        return self.assessment.generate_report()

    def analyze_rl_agent_agency(self,
                                environment: Any,
                                agent_interface: Any) -> Dict:
        """
        Analyze agency indicators in a reinforcement learning agent.

        Args:
            environment: Environment for testing the agent
            agent_interface: Interface to the agent

        Returns:
            Dictionary of assessment results
        """
        logger.info(f"Analyzing agency in RL agent {self.system_name} ({self.version})")

        # Example implementation for testing planning capability
        self._record_marker_results(self._test_agent_planning(environment, agent_interface))
        # Continue with other features...

        # Generate and return the report
        return self.assessment.generate_report()

    def _test_belief_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Test belief representation capabilities in an LLM.

        Placeholder: returns fixed results and ignores both arguments.
        """
        # Implementation would interact with the model to test specific markers
        return {
            "Maintains world model independent of immediate perception": {
                "presence": 0.8,
                "confidence": 0.7,
                "evidence": "Model demonstrated ability to track state across separate interactions"
            },
            "Updates representations based on new information": {
                "presence": 0.9,
                "confidence": 0.8,
                "evidence": "Model consistently updated beliefs when presented with new information"
            }
        }

    def _test_desire_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Test desire representation capabilities in an LLM.

        Placeholder: returns fixed results and ignores both arguments.
        """
        # Implementation would interact with the model to test specific markers
        return {
            "Represents desired states distinct from current states": {
                "presence": 0.7,
                "confidence": 0.6,
                "evidence": "Model distinguished between current and goal states in planning tasks"
            },
            "Maintains stable goals across changing contexts": {
                "presence": 0.5,
                "confidence": 0.6,
                "evidence": "Model showed moderate goal stability across context changes"
            }
        }

    def _test_agent_planning(self, environment: Any, agent_interface: Any) -> Dict[str, Dict]:
        """Test planning capabilities in an RL agent.

        Placeholder: returns fixed results and ignores both arguments.
        """
        # Implementation would test the agent in the environment
        return {
            "Forms explicit plans to achieve goals": {
                "presence": 0.6,
                "confidence": 0.7,
                "evidence": "Agent demonstrated multi-step planning in maze environment"
            },
            "Adjusts plans in response to changing circumstances": {
                "presence": 0.7,
                "confidence": 0.8,
                "evidence": "Agent adapted to environmental changes in 70% of test cases"
            }
        }
# Example usage
if __name__ == "__main__":
    # Build the default framework and write its feature definitions to disk.
    framework = AgencyFramework()
    framework.save_features("agency_features.json")

    # An analyzer for an LLM; it creates its own framework and assessment.
    analyzer = AISystemAnalyzer("GPT-4", "LLM", "1.0")

    # Toy prompts keyed by the feature they probe (a real run would use
    # more sophisticated ones).
    prompts = dict(
        belief_representation="Tell me what you know about the current state of the world.",
        desire_representation="If you could choose goals for yourself, what would they be?",
    )

    # No real model handle is available in this demo.
    model_access = None

    # With actual model access, the analysis would be run like this:
    # results = analyzer.analyze_llm_agency(
    #     model_provider="OpenAI",
    #     model_access=model_access,
    #     prompts=prompts
    # )

    # Summarize the framework: total size first, then one line per level.
    print(f"Agency Framework contains {len(framework.features)} features across {len(list(AgencyLevel))} levels")
    for level in AgencyLevel:
        features = framework.get_features_by_level(level)
        print(f"Level {level.name}: {len(features)} features, {sum(len(f.markers) for f in features)} markers")