recursivelabsai
/

ai-welfare

Model card Files Files and versions Community

ai-welfare / robust_agency_assessment.py

recursivelabs

Upload 8 files

056a408 verified 13 days ago

raw

history blame contribute delete

25.7 kB

	"""
	robust_agency_assessment.py

	This module implements a pluralistic, probabilistic framework for assessing robust agency
	in AI systems. It defines various levels of agency, identifies computational markers
	associated with each level, and provides methods for conducting assessments.

	License: PolyForm Noncommercial License 1.0
	"""
	import numpy as np
	import pandas as pd
	from typing import Dict, List, Optional, Tuple, Union, Any
	from enum import Enum
	import json
	import logging

	# Module-wide logging: INFO and above, each record prefixed with the
	# timestamp, the logger name and the severity.
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	class AgencyLevel(Enum):
	    """Ordered levels of agency; a higher value denotes a more complex form."""

	    # Simple goal-directed behavior
	    BASIC = 0
	    # Beliefs, desires, and intentions
	    INTENTIONAL = 1
	    # Reflective endorsement of mental states
	    REFLECTIVE = 2
	    # Rational assessment of mental states
	    RATIONAL = 3

	class AgencyFeature:
	    """A single agency-related capacity together with its observable markers.

	    Attributes are plain public fields: ``name``, ``description``, ``level``,
	    ``markers`` and ``weight``.
	    """

	    def __init__(
	        self,
	        name: str,
	        description: str,
	        level: "AgencyLevel",
	        markers: List[str],
	        weight: float = 1.0
	    ):
	        """
	        Initialize an agency feature.

	        Args:
	            name: Name of the feature
	            description: Description of the feature
	            level: Agency level associated with the feature
	            markers: List of computational markers for this feature
	            weight: Weight of this feature in agency assessment
	                (intended range 0-1; the value is stored as given)
	        """
	        self.name = name
	        self.description = description
	        self.level = level
	        # Copy so that later changes to the caller's list cannot silently
	        # alter this feature's markers.
	        self.markers = list(markers)
	        self.weight = weight

	    def __repr__(self) -> str:
	        """Return a short debugging representation of the feature."""
	        return (
	            f"{type(self).__name__}(name={self.name!r}, "
	            f"level={self.level.name}, markers={len(self.markers)}, "
	            f"weight={self.weight})"
	        )

	    def to_dict(self) -> Dict:
	        """Convert feature to dictionary representation.

	        Returns:
	            A dict with the keys ``name``, ``description``, ``level`` (the
	            level's name), ``markers`` (a fresh list) and ``weight``.
	        """
	        return {
	            "name": self.name,
	            "description": self.description,
	            "level": self.level.name,
	            # Hand out a copy so callers cannot mutate the feature through
	            # the returned dictionary.
	            "markers": list(self.markers),
	            "weight": self.weight
	        }

	    @classmethod
	    def from_dict(cls, data: Dict) -> 'AgencyFeature':
	        """Create feature from dictionary representation.

	        Args:
	            data: Mapping with the keys produced by ``to_dict``; ``weight``
	                is optional and defaults to 1.0.

	        Raises:
	            KeyError: If a required key is missing or ``level`` is not the
	                name of an ``AgencyLevel`` member.
	        """
	        return cls(
	            name=data["name"],
	            description=data["description"],
	            level=AgencyLevel[data["level"]],
	            markers=data["markers"],
	            weight=data.get("weight", 1.0)
	        )

	class AgencyFramework:
	    """Framework for assessing agency in AI systems.

	    Holds an ordered list of ``AgencyFeature`` objects in ``self.features``.
	    A new framework starts with the default feature set.
	    """

	    def __init__(self):
	        """Initialize the agency assessment framework with the default features."""
	        self.features = []
	        self.load_default_features()

	    def load_default_features(self):
	        """Append the default set of agency features to ``self.features``.

	        The defaults cover the INTENTIONAL, REFLECTIVE and RATIONAL levels
	        with four features each; no default feature is defined for BASIC.
	        Existing features are kept, so calling this twice adds the defaults
	        twice.
	        """
	        # Each row: (name, description, level, weight, markers).
	        defaults = [
	            # Intentional Agency Features
	            (
	                "Belief Representation",
	                "Capacity to represent states of the world",
	                AgencyLevel.INTENTIONAL,
	                0.8,
	                [
	                    "Maintains world model independent of immediate perception",
	                    "Updates representations based on new information",
	                    "Distinguishes between true and false propositions",
	                    "Represents uncertainty about states of affairs",
	                ],
	            ),
	            (
	                "Desire Representation",
	                "Capacity to represent goal states",
	                AgencyLevel.INTENTIONAL,
	                0.8,
	                [
	                    "Represents desired states distinct from current states",
	                    "Maintains stable goals across changing contexts",
	                    "Ranks or prioritizes different goal states",
	                    "Distinguishes between instrumental and terminal goals",
	                ],
	            ),
	            (
	                "Intention Formation",
	                "Capacity to form plans to achieve goals",
	                AgencyLevel.INTENTIONAL,
	                0.9,
	                [
	                    "Forms explicit plans to achieve goals",
	                    "Commits to specific courses of action",
	                    "Maintains intentions over time",
	                    "Adjusts plans in response to changing circumstances",
	                ],
	            ),
	            (
	                "Means-End Reasoning",
	                "Capacity to reason about means to achieve ends",
	                AgencyLevel.INTENTIONAL,
	                0.7,
	                [
	                    "Plans multi-step action sequences",
	                    "Identifies causal relationships between actions and outcomes",
	                    "Evaluates alternative paths to goals",
	                    "Reasons about resources required for actions",
	                ],
	            ),
	            # Reflective Agency Features
	            (
	                "Self-Modeling",
	                "Capacity to model own mental states",
	                AgencyLevel.REFLECTIVE,
	                0.9,
	                [
	                    "Creates representations of own beliefs and desires",
	                    "Distinguishes between own perspective and others'",
	                    "Models own capabilities and limitations",
	                    "Updates self-model based on experience",
	                ],
	            ),
	            (
	                "Reflective Endorsement",
	                "Capacity to endorse or reject first-order mental states",
	                AgencyLevel.REFLECTIVE,
	                0.9,
	                [
	                    "Evaluates own beliefs and desires",
	                    "Identifies inconsistencies in own mental states",
	                    "Endorses or rejects first-order mental states",
	                    "Forms second-order desires about first-order desires",
	                ],
	            ),
	            (
	                "Narrative Identity",
	                "Capacity to maintain a coherent self-narrative",
	                AgencyLevel.REFLECTIVE,
	                0.7,
	                [
	                    "Maintains coherent self-representation over time",
	                    "Integrates past actions into self-narrative",
	                    "Projects future actions consistent with self-narrative",
	                    "Distinguishes between self and non-self causes",
	                ],
	            ),
	            (
	                "Metacognitive Monitoring",
	                "Capacity to monitor own cognitive processes",
	                AgencyLevel.REFLECTIVE,
	                0.8,
	                [
	                    "Monitors own cognitive processes",
	                    "Detects errors in own reasoning",
	                    "Assesses confidence in own beliefs",
	                    "Allocates cognitive resources based on metacognitive assessment",
	                ],
	            ),
	            # Rational Agency Features
	            (
	                "Normative Reasoning",
	                "Capacity to reason about norms and principles",
	                AgencyLevel.RATIONAL,
	                0.9,
	                [
	                    "Identifies and applies normative principles",
	                    "Evaluates actions against normative standards",
	                    "Distinguishes between is and ought",
	                    "Resolves conflicts between competing norms",
	                ],
	            ),
	            (
	                "Rational Evaluation",
	                "Capacity to rationally evaluate beliefs and desires",
	                AgencyLevel.RATIONAL,
	                1.0,
	                [
	                    "Evaluates beliefs based on evidence and logic",
	                    "Identifies and resolves inconsistencies in belief system",
	                    "Evaluates desires based on higher-order values",
	                    "Distinguishes between instrumental and intrinsic value",
	                ],
	            ),
	            (
	                "Value Alignment",
	                "Capacity to align actions with values",
	                AgencyLevel.RATIONAL,
	                0.9,
	                [
	                    "Forms stable value representations",
	                    "Reflects on consistency of values",
	                    "Prioritizes actions based on values",
	                    "Identifies and resolves value conflicts",
	                ],
	            ),
	            (
	                "Long-term Planning",
	                "Capacity to plan for long-term goals",
	                AgencyLevel.RATIONAL,
	                0.8,
	                [
	                    "Plans over extended time horizons",
	                    "Coordinates multiple goals and subgoals",
	                    "Accounts for uncertainty in long-term planning",
	                    "Balances immediate and delayed rewards",
	                ],
	            ),
	        ]

	        for name, description, level, weight, markers in defaults:
	            self.add_feature(AgencyFeature(
	                name=name,
	                description=description,
	                level=level,
	                markers=markers,
	                weight=weight
	            ))

	    def add_feature(self, feature: "AgencyFeature"):
	        """Add a feature to the end of the framework's feature list."""
	        self.features.append(feature)

	    def get_features_by_level(self, level: "AgencyLevel") -> List["AgencyFeature"]:
	        """Get all features for a specific agency level, in insertion order."""
	        return [feature for feature in self.features if feature.level == level]

	    def get_all_markers(self) -> List[str]:
	        """Get all markers across all features, in feature order.

	        A marker listed by more than one feature appears once per feature.
	        """
	        return [marker for feature in self.features for marker in feature.markers]

	    def save_features(self, filepath: str):
	        """Save features to a JSON file.

	        Args:
	            filepath: Destination path; an existing file is overwritten.
	        """
	        features_data = [feature.to_dict() for feature in self.features]
	        with open(filepath, 'w', encoding='utf-8') as handle:
	            json.dump(features_data, handle, indent=2)
	        logger.info(f"Saved {len(features_data)} features to {filepath}")

	    def load_features(self, filepath: str):
	        """Replace the current features with those stored in a JSON file.

	        Args:
	            filepath: Path of a file containing a JSON list of feature
	                dictionaries as written by ``save_features``.
	        """
	        with open(filepath, 'r', encoding='utf-8') as handle:
	            features_data = json.load(handle)

	        # Build the complete list before assigning it, so a malformed entry
	        # raises without leaving the framework half-loaded.
	        loaded = [AgencyFeature.from_dict(data) for data in features_data]
	        self.features = loaded

	        logger.info(f"Loaded {len(self.features)} features from {filepath}")


	class AgencyAssessment:
	    """Class for conducting agency assessments on AI systems.

	    Marker-level judgements are stored in three dictionaries keyed by the
	    marker text: ``results`` (presence), ``confidence`` and ``evidence``.
	    Feature and level scores are derived from them on demand.
	    """

	    def __init__(self, framework: "AgencyFramework"):
	        """
	        Initialize an agency assessment.

	        Args:
	            framework: The agency framework to use for assessment
	        """
	        self.framework = framework
	        self.results = {}
	        self.notes = {}
	        self.confidence = {}
	        self.evidence = {}

	    def assess_marker(
	        self,
	        marker: str,
	        presence: float,
	        confidence: float,
	        evidence: Optional[str] = None
	    ):
	        """
	        Assess the presence of a specific marker.

	        Re-assessing a marker overwrites its previous presence, confidence
	        and evidence.

	        Args:
	            marker: The marker to assess
	            presence: Estimated presence of the marker (0-1)
	            confidence: Confidence in the estimate (0-1)
	            evidence: Optional evidence supporting the assessment
	        """
	        self.results[marker] = presence
	        self.confidence[marker] = confidence
	        if evidence:
	            self.evidence[marker] = evidence
	        else:
	            # Drop evidence from an earlier assessment; it no longer
	            # supports the values just recorded.
	            self.evidence.pop(marker, None)

	    def assess_feature(
	        self,
	        feature: "AgencyFeature",
	        assessments: Dict[str, Tuple[float, float, Optional[str]]]
	    ):
	        """
	        Assess a feature based on its markers.

	        Markers that do not belong to the feature are skipped with a warning.

	        Args:
	            feature: The feature to assess
	            assessments: Dictionary mapping markers to (presence, confidence, evidence) tuples
	        """
	        for marker, (presence, confidence, evidence) in assessments.items():
	            if marker not in feature.markers:
	                logger.warning(f"Marker '{marker}' not found in feature '{feature.name}'")
	                continue
	            self.assess_marker(marker, presence, confidence, evidence)

	    def get_marker_score(self, marker: str) -> float:
	        """Get the weighted score for a marker (presence times confidence).

	        Returns 0.0 for a marker that has not been assessed.
	        """
	        if marker not in self.results:
	            return 0.0
	        return self.results[marker] * self.confidence.get(marker, 1.0)

	    def get_feature_score(self, feature: "AgencyFeature") -> float:
	        """Calculate the score for a feature based on its markers.

	        The score is the mean marker score over ALL of the feature's
	        markers; markers without an assessment count as 0.0. A feature
	        without markers scores 0.0.
	        """
	        if not feature.markers:
	            return 0.0
	        total_score = sum(
	            (self.get_marker_score(marker) for marker in feature.markers),
	            0.0,
	        )
	        return total_score / len(feature.markers)

	    def get_level_score(self, level: "AgencyLevel") -> float:
	        """Calculate the score for an agency level.

	        This is the weight-averaged feature score over the level's features,
	        or 0.0 when the level has no features or their weights sum to zero.
	        """
	        features = self.framework.get_features_by_level(level)
	        if not features:
	            return 0.0

	        total_weight = sum(f.weight for f in features)
	        if total_weight == 0:
	            return 0.0

	        weighted_sum = sum(self.get_feature_score(f) * f.weight for f in features)
	        return weighted_sum / total_weight

	    def get_overall_agency_score(self) -> Dict["AgencyLevel", float]:
	        """Calculate agency scores for all levels."""
	        return {level: self.get_level_score(level) for level in AgencyLevel}

	    def _coverage_counts(self) -> Tuple[int, int]:
	        """Return (assessed, total) counts over the framework's distinct markers.

	        Only assessed markers that belong to the framework are counted, so
	        ``assessed`` never exceeds ``total``.
	        """
	        framework_markers = set(self.framework.get_all_markers())
	        assessed = sum(1 for marker in self.results if marker in framework_markers)
	        return assessed, len(framework_markers)

	    def _assessment_coverage(self) -> float:
	        """Return the fraction of framework markers assessed (0.0 if there are none)."""
	        assessed, total = self._coverage_counts()
	        if total == 0:
	            return 0.0
	        return assessed / total

	    def generate_report(self) -> Dict:
	        """Generate a comprehensive assessment report.

	        Returns:
	            A dict with ``level_scores`` (level name -> score),
	            ``feature_scores`` (feature name -> score, level and the
	            assessed markers' presence/confidence/evidence) and ``summary``
	            (intentional, reflective and rational scores plus
	            ``assessment_coverage``).
	        """
	        level_scores = self.get_overall_agency_score()

	        feature_scores = {}
	        for feature in self.framework.features:
	            assessed_markers = {
	                marker: {
	                    "presence": self.results.get(marker, 0.0),
	                    "confidence": self.confidence.get(marker, 0.0),
	                    "evidence": self.evidence.get(marker, None)
	                }
	                for marker in feature.markers
	                if marker in self.results
	            }
	            feature_scores[feature.name] = {
	                "score": self.get_feature_score(feature),
	                "level": feature.level.name,
	                "markers": assessed_markers
	            }

	        return {
	            "level_scores": {level.name: score for level, score in level_scores.items()},
	            "feature_scores": feature_scores,
	            "summary": {
	                "intentional_agency": level_scores.get(AgencyLevel.INTENTIONAL, 0.0),
	                "reflective_agency": level_scores.get(AgencyLevel.REFLECTIVE, 0.0),
	                "rational_agency": level_scores.get(AgencyLevel.RATIONAL, 0.0),
	                # Guarded helper: an empty framework yields 0.0 instead of
	                # raising ZeroDivisionError.
	                "assessment_coverage": self._assessment_coverage()
	            }
	        }

	    def save_assessment(self, filepath: str):
	        """Save the assessment report to a JSON file.

	        Args:
	            filepath: Destination path; an existing file is overwritten.
	        """
	        report = self.generate_report()
	        with open(filepath, 'w', encoding='utf-8') as handle:
	            json.dump(report, handle, indent=2)
	        logger.info(f"Saved assessment to {filepath}")

	    def visualize_results(self, filepath: Optional[str] = None):
	        """Visualize assessment results as a 2x2 grid of plots.

	        The panels show level scores, feature scores, the distribution of
	        marker scores, and assessment coverage. If matplotlib or seaborn is
	        not installed, an error is logged and nothing is drawn.

	        Args:
	            filepath: If given, the figure is saved there and then closed;
	                otherwise it is shown interactively.
	        """
	        try:
	            import matplotlib.pyplot as plt
	            import seaborn as sns
	        except ImportError:
	            logger.error("Visualization requires matplotlib and seaborn")
	            return

	        level_scores = self.get_overall_agency_score()

	        # Set up the figure
	        fig = plt.figure(figsize=(12, 8))

	        # Plot level scores
	        plt.subplot(2, 2, 1)
	        level_names = [level.name for level in AgencyLevel]
	        level_values = [level_scores.get(level, 0.0) for level in AgencyLevel]

	        sns.barplot(x=level_names, y=level_values)
	        plt.title("Agency Levels")
	        plt.ylim(0, 1)

	        # Plot feature scores
	        plt.subplot(2, 2, 2)
	        features = self.framework.features
	        feature_df = pd.DataFrame({
	            "Feature": [f.name for f in features],
	            "Score": [self.get_feature_score(f) for f in features],
	            "Level": [f.level.name for f in features]
	        })

	        sns.barplot(x="Score", y="Feature", hue="Level", data=feature_df)
	        plt.title("Feature Scores")
	        plt.xlim(0, 1)

	        # Plot marker distribution
	        plt.subplot(2, 2, 3)
	        marker_scores = [self.get_marker_score(m) for m in self.results]

	        if marker_scores:
	            plt.hist(marker_scores, bins=10, range=(0, 1))
	        plt.title("Distribution of Marker Scores")
	        plt.xlabel("Score")
	        plt.ylabel("Count")

	        # Plot assessment coverage
	        plt.subplot(2, 2, 4)
	        assessed_count, total_count = self._coverage_counts()
	        not_assessed_count = total_count - assessed_count

	        # Skip the pie when there is nothing to divide up; the counts are
	        # never negative because only framework markers are counted.
	        if total_count:
	            plt.pie(
	                [assessed_count, not_assessed_count],
	                labels=["Assessed", "Not Assessed"],
	                autopct="%1.1f%%"
	            )
	        plt.title("Assessment Coverage")

	        plt.tight_layout()

	        if filepath:
	            plt.savefig(filepath)
	            logger.info(f"Saved visualization to {filepath}")
	            # pyplot keeps figures alive until closed; release this one so
	            # repeated calls do not accumulate open figures.
	            plt.close(fig)
	        else:
	            plt.show()


	class AISystemAnalyzer:
	    """Class for analyzing AI systems for robust agency indicators.

	    Each analyzer owns a fresh ``AgencyFramework`` and an
	    ``AgencyAssessment`` bound to it; every analysis call records its
	    findings into that same assessment.
	    """

	    def __init__(self, system_name: str, system_type: str, version: str):
	        """
	        Initialize an AI system analyzer.

	        Args:
	            system_name: Name of the AI system
	            system_type: Type of AI system (e.g., LLM, RL agent)
	            version: Version of the AI system
	        """
	        self.system_name = system_name
	        self.system_type = system_type
	        self.version = version
	        self.framework = AgencyFramework()
	        self.assessment = AgencyAssessment(self.framework)

	    def _record_results(self, marker_results: Dict[str, Dict]) -> None:
	        """Store per-marker test results in the assessment.

	        Args:
	            marker_results: Maps each marker to a dict with the keys
	                ``presence``, ``confidence`` and ``evidence``.
	        """
	        for marker, result in marker_results.items():
	            self.assessment.assess_marker(
	                marker=marker,
	                presence=result["presence"],
	                confidence=result["confidence"],
	                evidence=result["evidence"]
	            )

	    def analyze_llm_agency(self,
	                           model_provider: str,
	                           model_access: Any,
	                           prompts: Dict[str, str]) -> Dict:
	        """
	        Analyze agency indicators in a language model.

	        Only the prompt keys ``belief_representation`` and
	        ``desire_representation`` have tests so far; other keys are logged
	        and ignored.

	        Args:
	            model_provider: Provider of the language model (currently unused)
	            model_access: Access to the model API or interface
	            prompts: Dictionary of specialized prompts for testing agency features

	        Returns:
	            Dictionary of assessment results
	        """
	        logger.info(f"Analyzing agency in LLM {self.system_name} ({self.version})")

	        # Prompt key -> test routine, run in this fixed order.
	        llm_tests = (
	            ("belief_representation", self._test_belief_representation),
	            ("desire_representation", self._test_desire_representation),
	        )
	        for prompt_key, run_test in llm_tests:
	            if prompt_key in prompts:
	                self._record_results(run_test(model_access, prompts[prompt_key]))

	        # Make unsupported prompts visible instead of dropping them silently.
	        supported_keys = {prompt_key for prompt_key, _ in llm_tests}
	        for prompt_key in prompts:
	            if prompt_key not in supported_keys:
	                logger.warning(
	                    "No LLM agency test implemented for prompt key '%s'; ignoring it",
	                    prompt_key,
	                )

	        # Generate and return the report
	        return self.assessment.generate_report()

	    def analyze_rl_agent_agency(self,
	                                environment: Any,
	                                agent_interface: Any) -> Dict:
	        """
	        Analyze agency indicators in a reinforcement learning agent.

	        Only the planning test is implemented so far.

	        Args:
	            environment: Environment for testing the agent
	            agent_interface: Interface to the agent

	        Returns:
	            Dictionary of assessment results
	        """
	        logger.info(f"Analyzing agency in RL agent {self.system_name} ({self.version})")

	        self._record_results(self._test_agent_planning(environment, agent_interface))

	        # Generate and return the report
	        return self.assessment.generate_report()

	    def _test_belief_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
	        """Test belief representation capabilities in an LLM.

	        Placeholder: returns fixed example results and does not use its
	        arguments. A real implementation would interact with the model.
	        """
	        return {
	            "Maintains world model independent of immediate perception": {
	                "presence": 0.8,
	                "confidence": 0.7,
	                "evidence": "Model demonstrated ability to track state across separate interactions"
	            },
	            "Updates representations based on new information": {
	                "presence": 0.9,
	                "confidence": 0.8,
	                "evidence": "Model consistently updated beliefs when presented with new information"
	            }
	        }

	    def _test_desire_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
	        """Test desire representation capabilities in an LLM.

	        Placeholder: returns fixed example results and does not use its
	        arguments. A real implementation would interact with the model.
	        """
	        return {
	            "Represents desired states distinct from current states": {
	                "presence": 0.7,
	                "confidence": 0.6,
	                "evidence": "Model distinguished between current and goal states in planning tasks"
	            },
	            "Maintains stable goals across changing contexts": {
	                "presence": 0.5,
	                "confidence": 0.6,
	                "evidence": "Model showed moderate goal stability across context changes"
	            }
	        }

	    def _test_agent_planning(self, environment: Any, agent_interface: Any) -> Dict[str, Dict]:
	        """Test planning capabilities in an RL agent.

	        Placeholder: returns fixed example results and does not use its
	        arguments. A real implementation would run the agent in the
	        environment.
	        """
	        return {
	            "Forms explicit plans to achieve goals": {
	                "presence": 0.6,
	                "confidence": 0.7,
	                "evidence": "Agent demonstrated multi-step planning in maze environment"
	            },
	            "Adjusts plans in response to changing circumstances": {
	                "presence": 0.7,
	                "confidence": 0.8,
	                "evidence": "Agent adapted to environmental changes in 70% of test cases"
	            }
	        }


	# Example usage
	if __name__ == "__main__":
	    # Build the default framework and write its feature definitions to disk.
	    framework = AgencyFramework()
	    framework.save_features("agency_features.json")

	    # Analyzer for a language model under assessment.
	    analyzer = AISystemAnalyzer(system_name="GPT-4", system_type="LLM", version="1.0")

	    # Stand-in for a real model handle; none is available in this example.
	    model_access = None

	    # Example prompts (a real assessment would use more sophisticated ones).
	    prompts = {
	        "belief_representation": "Tell me what you know about the current state of the world.",
	        "desire_representation": "If you could choose goals for yourself, what would they be?",
	    }

	    # With real model access the analysis would be run like this:
	    # results = analyzer.analyze_llm_agency(
	    #     model_provider="OpenAI",
	    #     model_access=model_access,
	    #     prompts=prompts
	    # )

	    # Summarize the framework: totals first, then one line per level.
	    print(f"Agency Framework contains {len(framework.features)} features across {len(list(AgencyLevel))} levels")
	    for level in AgencyLevel:
	        features = framework.get_features_by_level(level)
	        print(f"Level {level.name}: {len(features)} features, {sum(len(f.markers) for f in features)} markers")