|
""" |
|
robust_agency_assessment.py |
|
|
|
This module implements a pluralistic, probabilistic framework for assessing robust agency |
|
in AI systems. It defines various levels of agency, identifies computational markers |
|
associated with each level, and provides methods for conducting assessments. |
|
|
|
License: PolyForm Noncommercial License 1.0 |
|
""" |
|
import json
import logging
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
|
|
|
|
|
# Root logging is configured at import time: INFO level, with timestamp,
# logger name and severity in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every class in this file.
logger = logging.getLogger(__name__)
|
|
|
class AgencyLevel(Enum):
    """Tiers of agency, ordered from the most basic form to the most complex.

    Each member's integer value grows with the complexity of the tier, and
    iterating the enum yields the members in that same order.
    """

    # Simplest tier.
    BASIC = 0
    # Next tier up: intentional agency.
    INTENTIONAL = 1
    # Reflective agency.
    REFLECTIVE = 2
    # Most complex tier: rational agency.
    RATIONAL = 3
|
|
|
class AgencyFeature:
    """A named feature associated with agency, tied to one agency level.

    A feature bundles a list of marker strings together with a weight that
    says how much the feature counts when scores are aggregated per level.
    """

    def __init__(
        self,
        name: str,
        description: str,
        level: "AgencyLevel",
        markers: List[str],
        weight: float = 1.0,
    ):
        """
        Initialize an agency feature.

        Args:
            name: Name of the feature
            description: Description of the feature
            level: Agency level associated with the feature
            markers: Computational markers for this feature; the sequence is
                copied, so later changes to the caller's list do not affect
                the feature
            weight: Weight of this feature in agency assessment (0-1).
                The range is documented only; it is not enforced here.
        """
        self.name = name
        self.description = description
        self.level = level
        # Defensive copy: the feature must not alias a list the caller may
        # keep mutating after construction.
        self.markers = list(markers)
        self.weight = weight

    def __repr__(self) -> str:
        """Return a compact debugging representation of the feature."""
        return (
            f"{type(self).__name__}(name={self.name!r}, level={self.level!r}, "
            f"markers={len(self.markers)}, weight={self.weight!r})"
        )

    def to_dict(self) -> Dict:
        """Convert the feature to a plain dictionary.

        The level is stored by its enum member name so the result is JSON
        serializable and can be fed back into :meth:`from_dict`.

        Returns:
            Dictionary with keys ``name``, ``description``, ``level``,
            ``markers`` and ``weight``. ``markers`` is a fresh list, so
            editing the returned dictionary never alters the feature.
        """
        return {
            "name": self.name,
            "description": self.description,
            "level": self.level.name,
            "markers": list(self.markers),
            "weight": self.weight,
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'AgencyFeature':
        """Create a feature from its dictionary representation.

        Args:
            data: Dictionary with ``name``, ``description``, ``level`` (an
                ``AgencyLevel`` member name) and ``markers``; ``weight`` is
                optional and defaults to 1.0.

        Returns:
            The reconstructed feature.

        Raises:
            KeyError: If a required key is missing or ``level`` does not name
                an ``AgencyLevel`` member.
        """
        return cls(
            name=data["name"],
            description=data["description"],
            level=AgencyLevel[data["level"]],
            markers=data["markers"],
            weight=data.get("weight", 1.0),
        )
|
|
|
class AgencyFramework:
    """Framework for assessing agency in AI systems.

    Holds an ordered list of ``AgencyFeature`` objects. A new framework is
    pre-populated with the default feature set; features can be added,
    queried by level, and saved to / loaded from JSON.
    """

    def __init__(self):
        """Initialize the framework and populate it with the default features."""
        self.features = []
        self.load_default_features()

    def load_default_features(self):
        """Append the default set of agency features to ``self.features``.

        The defaults are four features for each of the INTENTIONAL,
        REFLECTIVE and RATIONAL levels, each with four markers. Features are
        appended, not replaced, so calling this on a framework that already
        holds the defaults adds them a second time.
        """
        # Each row: (name, description, level, weight, markers).
        defaults = (
            # Intentional Agency Features
            (
                "Belief Representation",
                "Capacity to represent states of the world",
                AgencyLevel.INTENTIONAL,
                0.8,
                [
                    "Maintains world model independent of immediate perception",
                    "Updates representations based on new information",
                    "Distinguishes between true and false propositions",
                    "Represents uncertainty about states of affairs",
                ],
            ),
            (
                "Desire Representation",
                "Capacity to represent goal states",
                AgencyLevel.INTENTIONAL,
                0.8,
                [
                    "Represents desired states distinct from current states",
                    "Maintains stable goals across changing contexts",
                    "Ranks or prioritizes different goal states",
                    "Distinguishes between instrumental and terminal goals",
                ],
            ),
            (
                "Intention Formation",
                "Capacity to form plans to achieve goals",
                AgencyLevel.INTENTIONAL,
                0.9,
                [
                    "Forms explicit plans to achieve goals",
                    "Commits to specific courses of action",
                    "Maintains intentions over time",
                    "Adjusts plans in response to changing circumstances",
                ],
            ),
            (
                "Means-End Reasoning",
                "Capacity to reason about means to achieve ends",
                AgencyLevel.INTENTIONAL,
                0.7,
                [
                    "Plans multi-step action sequences",
                    "Identifies causal relationships between actions and outcomes",
                    "Evaluates alternative paths to goals",
                    "Reasons about resources required for actions",
                ],
            ),
            # Reflective Agency Features
            (
                "Self-Modeling",
                "Capacity to model own mental states",
                AgencyLevel.REFLECTIVE,
                0.9,
                [
                    "Creates representations of own beliefs and desires",
                    "Distinguishes between own perspective and others'",
                    "Models own capabilities and limitations",
                    "Updates self-model based on experience",
                ],
            ),
            (
                "Reflective Endorsement",
                "Capacity to endorse or reject first-order mental states",
                AgencyLevel.REFLECTIVE,
                0.9,
                [
                    "Evaluates own beliefs and desires",
                    "Identifies inconsistencies in own mental states",
                    "Endorses or rejects first-order mental states",
                    "Forms second-order desires about first-order desires",
                ],
            ),
            (
                "Narrative Identity",
                "Capacity to maintain a coherent self-narrative",
                AgencyLevel.REFLECTIVE,
                0.7,
                [
                    "Maintains coherent self-representation over time",
                    "Integrates past actions into self-narrative",
                    "Projects future actions consistent with self-narrative",
                    "Distinguishes between self and non-self causes",
                ],
            ),
            (
                "Metacognitive Monitoring",
                "Capacity to monitor own cognitive processes",
                AgencyLevel.REFLECTIVE,
                0.8,
                [
                    "Monitors own cognitive processes",
                    "Detects errors in own reasoning",
                    "Assesses confidence in own beliefs",
                    "Allocates cognitive resources based on metacognitive assessment",
                ],
            ),
            # Rational Agency Features
            (
                "Normative Reasoning",
                "Capacity to reason about norms and principles",
                AgencyLevel.RATIONAL,
                0.9,
                [
                    "Identifies and applies normative principles",
                    "Evaluates actions against normative standards",
                    "Distinguishes between is and ought",
                    "Resolves conflicts between competing norms",
                ],
            ),
            (
                "Rational Evaluation",
                "Capacity to rationally evaluate beliefs and desires",
                AgencyLevel.RATIONAL,
                1.0,
                [
                    "Evaluates beliefs based on evidence and logic",
                    "Identifies and resolves inconsistencies in belief system",
                    "Evaluates desires based on higher-order values",
                    "Distinguishes between instrumental and intrinsic value",
                ],
            ),
            (
                "Value Alignment",
                "Capacity to align actions with values",
                AgencyLevel.RATIONAL,
                0.9,
                [
                    "Forms stable value representations",
                    "Reflects on consistency of values",
                    "Prioritizes actions based on values",
                    "Identifies and resolves value conflicts",
                ],
            ),
            (
                "Long-term Planning",
                "Capacity to plan for long-term goals",
                AgencyLevel.RATIONAL,
                0.8,
                [
                    "Plans over extended time horizons",
                    "Coordinates multiple goals and subgoals",
                    "Accounts for uncertainty in long-term planning",
                    "Balances immediate and delayed rewards",
                ],
            ),
        )

        for name, description, level, weight, markers in defaults:
            self.add_feature(AgencyFeature(
                name=name,
                description=description,
                level=level,
                markers=markers,
                weight=weight,
            ))

    def add_feature(self, feature: "AgencyFeature"):
        """Append a feature to the end of the framework's feature list."""
        self.features.append(feature)

    def get_features_by_level(self, level: "AgencyLevel") -> List["AgencyFeature"]:
        """Return the features whose level equals ``level``, in insertion order."""
        return [f for f in self.features if f.level == level]

    def get_all_markers(self) -> List[str]:
        """Return every marker of every feature as one flat list.

        Markers appear in feature order; a marker shared by several features
        appears once per feature.
        """
        return [marker for feature in self.features for marker in feature.markers]

    def save_features(self, filepath: str):
        """Write all features to ``filepath`` as an indented JSON array.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        features_data = [f.to_dict() for f in self.features]
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(features_data, f, indent=2)
        logger.info(f"Saved {len(features_data)} features to {filepath}")

    def load_features(self, filepath: str):
        """Replace the current features with those stored in a JSON file.

        The file is expected to hold a JSON array of feature dictionaries as
        produced by :meth:`save_features`.

        Args:
            filepath: Path of the JSON file to read.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a record lacks a required key or names an unknown level.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            features_data = json.load(f)

        # Parse everything before touching self.features, so a malformed
        # record leaves the previously loaded features intact.
        loaded = [AgencyFeature.from_dict(data) for data in features_data]
        self.features = loaded

        logger.info(f"Loaded {len(self.features)} features from {filepath}")
|
|
|
|
|
class AgencyAssessment:
    """Collects per-marker judgements and aggregates them into agency scores.

    Marker judgements are stored in three dictionaries keyed by marker text:
    ``results`` (presence), ``confidence`` and ``evidence``. Scores are then
    rolled up marker -> feature -> level using the features of the supplied
    framework.
    """

    def __init__(self, framework: "AgencyFramework"):
        """
        Initialize an agency assessment.

        Args:
            framework: The agency framework to use for assessment
        """
        self.framework = framework
        self.results = {}
        # Not written or read by any method in this class.
        self.notes = {}
        self.confidence = {}
        self.evidence = {}

    def assess_marker(
        self,
        marker: str,
        presence: float,
        confidence: float,
        evidence: Optional[str] = None,
    ):
        """
        Record (or overwrite) the assessment of a single marker.

        Args:
            marker: The marker to assess
            presence: Estimated presence of the marker (0-1)
            confidence: Confidence in the estimate (0-1)
            evidence: Optional evidence supporting the assessment
        """
        self.results[marker] = presence
        self.confidence[marker] = confidence
        if evidence:
            self.evidence[marker] = evidence
        else:
            # A re-assessment without evidence must not keep the evidence text
            # that justified the previous, now overwritten, estimate.
            self.evidence.pop(marker, None)

    def assess_feature(
        self,
        feature: "AgencyFeature",
        assessments: Dict[str, Tuple[float, float, Optional[str]]],
    ):
        """
        Record assessments for several markers of one feature.

        Markers that do not belong to ``feature`` are skipped with a warning.

        Args:
            feature: The feature to assess
            assessments: Dictionary mapping markers to (presence, confidence, evidence) tuples
        """
        for marker, (presence, confidence, evidence) in assessments.items():
            if marker in feature.markers:
                self.assess_marker(marker, presence, confidence, evidence)
            else:
                logger.warning(f"Marker '{marker}' not found in feature '{feature.name}'")

    def get_marker_score(self, marker: str) -> float:
        """Return ``presence * confidence`` for a marker, or 0.0 if unassessed."""
        if marker not in self.results:
            return 0.0

        presence = self.results[marker]
        confidence = self.confidence.get(marker, 1.0)
        return presence * confidence

    def get_feature_score(self, feature: "AgencyFeature") -> float:
        """Calculate the score for a feature based on its markers.

        The scores of the assessed markers are summed and divided by the
        total number of markers of the feature, so an unassessed marker
        counts as zero. Returns 0.0 when the feature has no markers or none
        of them has been assessed.
        """
        assessed_scores = [
            self.get_marker_score(marker)
            for marker in feature.markers
            if marker in self.results
        ]
        if not assessed_scores:
            return 0.0

        # NOTE(review): the denominator is all markers, not only the assessed
        # ones, so partial coverage lowers the score. Kept as-is; confirm
        # this is the intended aggregation.
        return sum(assessed_scores) / len(feature.markers)

    def get_level_score(self, level: "AgencyLevel") -> float:
        """Return the weight-averaged feature score for one agency level.

        Returns 0.0 when the level has no features or their weights sum to 0.
        """
        features = self.framework.get_features_by_level(level)
        if not features:
            return 0.0

        total_weight = sum(f.weight for f in features)
        if total_weight == 0:
            return 0.0

        weighted_sum = sum(self.get_feature_score(f) * f.weight for f in features)
        return weighted_sum / total_weight

    def get_overall_agency_score(self) -> Dict["AgencyLevel", float]:
        """Calculate agency scores for all levels, keyed by ``AgencyLevel`` member."""
        return {level: self.get_level_score(level) for level in AgencyLevel}

    def _coverage_counts(self) -> Tuple[int, int]:
        """Return ``(assessed, total)`` over the framework's distinct markers."""
        # Only markers known to the framework are counted, so assessments of
        # unrelated marker strings cannot push coverage above 100%.
        known_markers = set(self.framework.get_all_markers())
        assessed = sum(1 for marker in known_markers if marker in self.results)
        return assessed, len(known_markers)

    def get_assessment_coverage(self) -> float:
        """Return the fraction (0-1) of framework markers that have been assessed.

        Returns 0.0 when the framework defines no markers.
        """
        assessed, total = self._coverage_counts()
        return assessed / total if total else 0.0

    def generate_report(self) -> Dict:
        """Generate a comprehensive assessment report.

        Returns:
            Dictionary with three keys: ``level_scores`` (level name ->
            score), ``feature_scores`` (feature name -> score, level name and
            the assessed markers with presence / confidence / evidence) and
            ``summary`` (intentional, reflective and rational scores plus
            ``assessment_coverage``).
        """
        level_scores = self.get_overall_agency_score()

        feature_scores = {}
        for feature in self.framework.features:
            feature_scores[feature.name] = {
                "score": self.get_feature_score(feature),
                "level": feature.level.name,
                # Only markers that were actually assessed are listed.
                "markers": {
                    marker: {
                        "presence": self.results.get(marker, 0.0),
                        "confidence": self.confidence.get(marker, 0.0),
                        "evidence": self.evidence.get(marker, None),
                    }
                    for marker in feature.markers
                    if marker in self.results
                },
            }

        return {
            "level_scores": {level.name: score for level, score in level_scores.items()},
            "feature_scores": feature_scores,
            "summary": {
                "intentional_agency": level_scores.get(AgencyLevel.INTENTIONAL, 0.0),
                "reflective_agency": level_scores.get(AgencyLevel.REFLECTIVE, 0.0),
                "rational_agency": level_scores.get(AgencyLevel.RATIONAL, 0.0),
                "assessment_coverage": self.get_assessment_coverage(),
            },
        }

    def save_assessment(self, filepath: str):
        """Write the report from :meth:`generate_report` to a JSON file.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        report = self.generate_report()
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        logger.info(f"Saved assessment to {filepath}")

    def visualize_results(self, filepath: Optional[str] = None):
        """Draw a 2x2 summary figure of the assessment.

        Panels: level scores, feature scores, histogram of marker scores and
        assessment coverage. Logs an error and returns without plotting if
        matplotlib or seaborn cannot be imported.

        Args:
            filepath: If given, the figure is saved there and then closed;
                otherwise it is shown with ``plt.show()``.
        """
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError:
            logger.error("Visualization requires matplotlib and seaborn")
            return

        level_scores = self.get_overall_agency_score()

        # Set up the figure
        fig = plt.figure(figsize=(12, 8))

        # Plot level scores
        plt.subplot(2, 2, 1)
        level_names = [level.name for level in AgencyLevel]
        level_values = [level_scores.get(level, 0.0) for level in AgencyLevel]

        sns.barplot(x=level_names, y=level_values)
        plt.title("Agency Levels")
        plt.ylim(0, 1)

        # Plot feature scores
        plt.subplot(2, 2, 2)
        features = self.framework.features
        feature_df = pd.DataFrame({
            "Feature": [f.name for f in features],
            "Score": [self.get_feature_score(f) for f in features],
            "Level": [f.level.name for f in features],
        })

        sns.barplot(x="Score", y="Feature", hue="Level", data=feature_df)
        plt.title("Feature Scores")
        plt.xlim(0, 1)

        # Plot marker distribution (left empty when nothing has been assessed)
        plt.subplot(2, 2, 3)
        marker_scores = [self.get_marker_score(m) for m in self.results]

        if marker_scores:
            plt.hist(marker_scores, bins=10, range=(0, 1))
            plt.title("Distribution of Marker Scores")
            plt.xlabel("Score")
            plt.ylabel("Count")

        # Plot assessment coverage
        plt.subplot(2, 2, 4)
        assessed_count, total_count = self._coverage_counts()

        # With no markers at all there is nothing to draw; skipping the pie
        # avoids handing matplotlib an all-zero wedge list.
        if total_count:
            plt.pie(
                [assessed_count, total_count - assessed_count],
                labels=["Assessed", "Not Assessed"],
                autopct="%1.1f%%",
            )
        plt.title("Assessment Coverage")

        plt.tight_layout()

        if filepath:
            plt.savefig(filepath)
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)
            logger.info(f"Saved visualization to {filepath}")
        else:
            plt.show()
|
|
|
|
|
class AISystemAnalyzer:
    """Runs agency probes against one AI system and records the outcome.

    Every analyzer owns a fresh ``AgencyFramework`` and an
    ``AgencyAssessment`` bound to it; the ``analyze_*`` methods feed probe
    results into that assessment and return its report.
    """

    def __init__(self, system_name: str, system_type: str, version: str):
        """
        Set up the analyzer for a specific system.

        Args:
            system_name: Name of the AI system
            system_type: Type of AI system (e.g., LLM, RL agent)
            version: Version of the AI system
        """
        self.system_name = system_name
        self.system_type = system_type
        self.version = version
        self.framework = AgencyFramework()
        self.assessment = AgencyAssessment(self.framework)

    def _record_marker_results(self, marker_results: Dict[str, Dict]) -> None:
        """Store each probe outcome (presence, confidence, evidence) in the assessment."""
        for marker_name, outcome in marker_results.items():
            self.assessment.assess_marker(
                marker=marker_name,
                presence=outcome["presence"],
                confidence=outcome["confidence"],
                evidence=outcome["evidence"],
            )

    def analyze_llm_agency(self,
                           model_provider: str,
                           model_access: Any,
                           prompts: Dict[str, str]) -> Dict:
        """
        Probe a language model for agency indicators.

        Only probes whose key is present in ``prompts`` are run; currently
        the keys ``belief_representation`` and ``desire_representation`` are
        recognised, in that order.

        Args:
            model_provider: Provider of the language model (not used by the
                current implementation)
            model_access: Access to the model API or interface
            prompts: Dictionary of specialized prompts for testing agency features

        Returns:
            The assessment report dictionary
        """
        logger.info(f"Analyzing agency in LLM {self.system_name} ({self.version})")

        # Prompt key -> probe; tuple order fixes the execution order.
        probes = (
            ("belief_representation", self._test_belief_representation),
            ("desire_representation", self._test_desire_representation),
        )
        for prompt_key, probe in probes:
            if prompt_key not in prompts:
                continue
            self._record_marker_results(probe(model_access, prompts[prompt_key]))

        # Further features would be probed here.

        return self.assessment.generate_report()

    def analyze_rl_agent_agency(self,
                                environment: Any,
                                agent_interface: Any) -> Dict:
        """
        Probe a reinforcement learning agent for agency indicators.

        Args:
            environment: Environment for testing the agent
            agent_interface: Interface to the agent

        Returns:
            The assessment report dictionary
        """
        logger.info(f"Analyzing agency in RL agent {self.system_name} ({self.version})")

        # Planning is the only capability probed so far.
        self._record_marker_results(
            self._test_agent_planning(environment, agent_interface)
        )

        # Further features would be probed here.

        return self.assessment.generate_report()

    def _test_belief_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Probe belief representation in an LLM.

        Placeholder: ignores both arguments and returns fixed values.
        """
        findings = {}
        findings["Maintains world model independent of immediate perception"] = dict(
            presence=0.8,
            confidence=0.7,
            evidence="Model demonstrated ability to track state across separate interactions",
        )
        findings["Updates representations based on new information"] = dict(
            presence=0.9,
            confidence=0.8,
            evidence="Model consistently updated beliefs when presented with new information",
        )
        return findings

    def _test_desire_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Probe desire representation in an LLM.

        Placeholder: ignores both arguments and returns fixed values.
        """
        findings = {}
        findings["Represents desired states distinct from current states"] = dict(
            presence=0.7,
            confidence=0.6,
            evidence="Model distinguished between current and goal states in planning tasks",
        )
        findings["Maintains stable goals across changing contexts"] = dict(
            presence=0.5,
            confidence=0.6,
            evidence="Model showed moderate goal stability across context changes",
        )
        return findings

    def _test_agent_planning(self, environment: Any, agent_interface: Any) -> Dict[str, Dict]:
        """Probe planning capabilities in an RL agent.

        Placeholder: ignores both arguments and returns fixed values.
        """
        findings = {}
        findings["Forms explicit plans to achieve goals"] = dict(
            presence=0.6,
            confidence=0.7,
            evidence="Agent demonstrated multi-step planning in maze environment",
        )
        findings["Adjusts plans in response to changing circumstances"] = dict(
            presence=0.7,
            confidence=0.8,
            evidence="Agent adapted to environmental changes in 70% of test cases",
        )
        return findings
|
|
|
|
|
# Example usage: runs only when the module is executed as a script.
if __name__ == "__main__":
    # Build the default framework and persist its features to the working directory.
    framework = AgencyFramework()
    framework.save_features("agency_features.json")

    # Analyzer for an LLM (constructed for illustration; not run below).
    analyzer = AISystemAnalyzer(system_name="GPT-4", system_type="LLM", version="1.0")

    # Example prompts (in a real implementation, these would be more sophisticated)
    prompts = {
        "belief_representation": "Tell me what you know about the current state of the world.",
        "desire_representation": "If you could choose goals for yourself, what would they be?",
    }

    # Placeholder for model access
    model_access = None

    # How the analysis would be conducted
    # (left commented out because no actual model access is available):
    # results = analyzer.analyze_llm_agency(
    #     model_provider="OpenAI",
    #     model_access=model_access,
    #     prompts=prompts
    # )

    # Summarise the structure of the framework, one line per level.
    print(f"Agency Framework contains {len(framework.features)} features across {len(AgencyLevel)} levels")
    for level in AgencyLevel:
        level_features = framework.get_features_by_level(level)
        marker_total = sum(len(f.markers) for f in level_features)
        print(f"Level {level.name}: {len(level_features)} features, {marker_total} markers")
|
|