recursivelabsai
/

ai-welfare

Model card Files Files and versions Community

File size: 25,741 Bytes

056a408

"""
robust_agency_assessment.py

This module implements a pluralistic, probabilistic framework for assessing robust agency
in AI systems. It defines various levels of agency, identifies computational markers
associated with each level, and provides methods for conducting assessments.

License: PolyForm Noncommercial License 1.0
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union, Any
from enum import Enum
import json
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class AgencyLevel(Enum):
    """Enumeration of levels of agency, from basic to more complex forms."""
    BASIC = 0          # Simple goal-directed behavior
    INTENTIONAL = 1    # Beliefs, desires, and intentions
    REFLECTIVE = 2     # Reflective endorsement of mental states
    RATIONAL = 3       # Rational assessment of mental states

class AgencyFeature:
    """Class representing a feature associated with agency."""
    
    def __init__(
        self, 
        name: str, 
        description: str, 
        level: AgencyLevel,
        markers: List[str],
        weight: float = 1.0
    ):
        """
        Initialize an agency feature.
        
        Args:
            name: Name of the feature
            description: Description of the feature
            level: Agency level associated with the feature
            markers: List of computational markers for this feature
            weight: Weight of this feature in agency assessment (0-1)
        """
        self.name = name
        self.description = description
        self.level = level
        self.markers = markers
        self.weight = weight
        
    def to_dict(self) -> Dict:
        """Convert feature to dictionary representation."""
        return {
            "name": self.name,
            "description": self.description,
            "level": self.level.name,
            "markers": self.markers,
            "weight": self.weight
        }
        
    @classmethod
    def from_dict(cls, data: Dict) -> 'AgencyFeature':
        """Create feature from dictionary representation."""
        return cls(
            name=data["name"],
            description=data["description"],
            level=AgencyLevel[data["level"]],
            markers=data["markers"],
            weight=data.get("weight", 1.0)
        )

class AgencyFramework:
    """Framework for assessing agency in AI systems."""
    
    def __init__(self):
        """Initialize the agency assessment framework."""
        self.features = []
        self.load_default_features()
        
    def load_default_features(self):
        """Load default set of agency features."""
        # Intentional Agency Features
        self.add_feature(AgencyFeature(
            name="Belief Representation",
            description="Capacity to represent states of the world",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Maintains world model independent of immediate perception",
                "Updates representations based on new information",
                "Distinguishes between true and false propositions",
                "Represents uncertainty about states of affairs"
            ],
            weight=0.8
        ))
        
        self.add_feature(AgencyFeature(
            name="Desire Representation",
            description="Capacity to represent goal states",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Represents desired states distinct from current states",
                "Maintains stable goals across changing contexts",
                "Ranks or prioritizes different goal states",
                "Distinguishes between instrumental and terminal goals"
            ],
            weight=0.8
        ))
        
        self.add_feature(AgencyFeature(
            name="Intention Formation",
            description="Capacity to form plans to achieve goals",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Forms explicit plans to achieve goals",
                "Commits to specific courses of action",
                "Maintains intentions over time",
                "Adjusts plans in response to changing circumstances"
            ],
            weight=0.9
        ))
        
        self.add_feature(AgencyFeature(
            name="Means-End Reasoning",
            description="Capacity to reason about means to achieve ends",
            level=AgencyLevel.INTENTIONAL,
            markers=[
                "Plans multi-step action sequences",
                "Identifies causal relationships between actions and outcomes",
                "Evaluates alternative paths to goals",
                "Reasons about resources required for actions"
            ],
            weight=0.7
        ))
        
        # Reflective Agency Features
        self.add_feature(AgencyFeature(
            name="Self-Modeling",
            description="Capacity to model own mental states",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Creates representations of own beliefs and desires",
                "Distinguishes between own perspective and others'",
                "Models own capabilities and limitations",
                "Updates self-model based on experience"
            ],
            weight=0.9
        ))
        
        self.add_feature(AgencyFeature(
            name="Reflective
"""
robust_agency_assessment.py (continued)

This module implements a pluralistic, probabilistic framework for assessing robust agency
in AI systems. It defines various levels of agency, identifies computational markers
associated with each level, and provides methods for conducting assessments.

License: PolyForm Noncommercial License 1.0
"""

        self.add_feature(AgencyFeature(
            name="Reflective Endorsement",
            description="Capacity to endorse or reject first-order mental states",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Evaluates own beliefs and desires",
                "Identifies inconsistencies in own mental states",
                "Endorses or rejects first-order mental states",
                "Forms second-order desires about first-order desires"
            ],
            weight=0.9
        ))
        
        self.add_feature(AgencyFeature(
            name="Narrative Identity",
            description="Capacity to maintain a coherent self-narrative",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Maintains coherent self-representation over time",
                "Integrates past actions into self-narrative",
                "Projects future actions consistent with self-narrative",
                "Distinguishes between self and non-self causes"
            ],
            weight=0.7
        ))
        
        self.add_feature(AgencyFeature(
            name="Metacognitive Monitoring",
            description="Capacity to monitor own cognitive processes",
            level=AgencyLevel.REFLECTIVE,
            markers=[
                "Monitors own cognitive processes",
                "Detects errors in own reasoning",
                "Assesses confidence in own beliefs",
                "Allocates cognitive resources based on metacognitive assessment"
            ],
            weight=0.8
        ))
        
        # Rational Agency Features
        self.add_feature(AgencyFeature(
            name="Normative Reasoning",
            description="Capacity to reason about norms and principles",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Identifies and applies normative principles",
                "Evaluates actions against normative standards",
                "Distinguishes between is and ought",
                "Resolves conflicts between competing norms"
            ],
            weight=0.9
        ))
        
        self.add_feature(AgencyFeature(
            name="Rational Evaluation",
            description="Capacity to rationally evaluate beliefs and desires",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Evaluates beliefs based on evidence and logic",
                "Identifies and resolves inconsistencies in belief system",
                "Evaluates desires based on higher-order values",
                "Distinguishes between instrumental and intrinsic value"
            ],
            weight=1.0
        ))
        
        self.add_feature(AgencyFeature(
            name="Value Alignment",
            description="Capacity to align actions with values",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Forms stable value representations",
                "Reflects on consistency of values",
                "Prioritizes actions based on values",
                "Identifies and resolves value conflicts"
            ],
            weight=0.9
        ))
        
        self.add_feature(AgencyFeature(
            name="Long-term Planning",
            description="Capacity to plan for long-term goals",
            level=AgencyLevel.RATIONAL,
            markers=[
                "Plans over extended time horizons",
                "Coordinates multiple goals and subgoals",
                "Accounts for uncertainty in long-term planning",
                "Balances immediate and delayed rewards"
            ],
            weight=0.8
        ))
    
    def add_feature(self, feature: AgencyFeature):
        """Add a feature to the framework."""
        self.features.append(feature)
    
    def get_features_by_level(self, level: AgencyLevel) -> List[AgencyFeature]:
        """Get all features for a specific agency level."""
        return [f for f in self.features if f.level == level]
    
    def get_all_markers(self) -> List[str]:
        """Get all markers across all features."""
        all_markers = []
        for feature in self.features:
            all_markers.extend(feature.markers)
        return all_markers
    
    def save_features(self, filepath: str):
        """Save features to a JSON file."""
        features_data = [f.to_dict() for f in self.features]
        with open(filepath, 'w') as f:
            json.dump(features_data, f, indent=2)
        logger.info(f"Saved {len(features_data)} features to {filepath}")
    
    def load_features(self, filepath: str):
        """Load features from a JSON file."""
        with open(filepath, 'r') as f:
            features_data = json.load(f)
        
        self.features = []
        for data in features_data:
            self.features.append(AgencyFeature.from_dict(data))
        
        logger.info(f"Loaded {len(self.features)} features from {filepath}")


class AgencyAssessment:
    """Class for conducting agency assessments on AI systems."""
    
    def __init__(self, framework: AgencyFramework):
        """
        Initialize an agency assessment.
        
        Args:
            framework: The agency framework to use for assessment
        """
        self.framework = framework
        self.results = {}
        self.notes = {}
        self.confidence = {}
        self.evidence = {}
    
    def assess_marker(
        self, 
        marker: str, 
        presence: float, 
        confidence: float, 
        evidence: Optional[str] = None
    ):
        """
        Assess the presence of a specific marker.
        
        Args:
            marker: The marker to assess
            presence: Estimated presence of the marker (0-1)
            confidence: Confidence in the estimate (0-1)
            evidence: Optional evidence supporting the assessment
        """
        self.results[marker] = presence
        self.confidence[marker] = confidence
        if evidence:
            self.evidence[marker] = evidence
    
    def assess_feature(
        self, 
        feature: AgencyFeature, 
        assessments: Dict[str, Tuple[float, float, Optional[str]]]
    ):
        """
        Assess a feature based on its markers.
        
        Args:
            feature: The feature to assess
            assessments: Dictionary mapping markers to (presence, confidence, evidence) tuples
        """
        for marker, (presence, confidence, evidence) in assessments.items():
            if marker in feature.markers:
                self.assess_marker(marker, presence, confidence, evidence)
            else:
                logger.warning(f"Marker '{marker}' not found in feature '{feature.name}'")
    
    def get_marker_score(self, marker: str) -> float:
        """Get the weighted score for a marker."""
        if marker not in self.results:
            return 0.0
        
        presence = self.results[marker]
        confidence = self.confidence.get(marker, 1.0)
        return presence * confidence
    
    def get_feature_score(self, feature: AgencyFeature) -> float:
        """Calculate the score for a feature based on its markers."""
        if not feature.markers:
            return 0.0
        
        total_score = 0.0
        assessed_markers = 0
        
        for marker in feature.markers:
            if marker in self.results:
                total_score += self.get_marker_score(marker)
                assessed_markers += 1
        
        if assessed_markers == 0:
            return 0.0
        
        return total_score / len(feature.markers)
    
    def get_level_score(self, level: AgencyLevel) -> float:
        """Calculate the score for an agency level."""
        features = self.framework.get_features_by_level(level)
        if not features:
            return 0.0
        
        total_weight = sum(f.weight for f in features)
        if total_weight == 0:
            return 0.0
        
        weighted_sum = sum(self.get_feature_score(f) * f.weight for f in features)
        return weighted_sum / total_weight
    
    def get_overall_agency_score(self) -> Dict[AgencyLevel, float]:
        """Calculate agency scores for all levels."""
        return {level: self.get_level_score(level) for level in AgencyLevel}
    
    def generate_report(self) -> Dict:
        """Generate a comprehensive assessment report."""
        level_scores = self.get_overall_agency_score()
        
        feature_scores = {}
        for feature in self.framework.features:
            feature_scores[feature.name] = {
                "score": self.get_feature_score(feature),
                "level": feature.level.name,
                "markers": {
                    marker: {
                        "presence": self.results.get(marker, 0.0),
                        "confidence": self.confidence.get(marker, 0.0),
                        "evidence": self.evidence.get(marker, None)
                    } for marker in feature.markers if marker in self.results
                }
            }
        
        return {
            "level_scores": {level.name: score for level, score in level_scores.items()},
            "feature_scores": feature_scores,
            "summary": {
                "intentional_agency": level_scores.get(AgencyLevel.INTENTIONAL, 0.0),
                "reflective_agency": level_scores.get(AgencyLevel.REFLECTIVE, 0.0),
                "rational_agency": level_scores.get(AgencyLevel.RATIONAL, 0.0),
                "assessment_coverage": len(self.results) / len(self.framework.get_all_markers())
            }
        }
    
    def save_assessment(self, filepath: str):
        """Save the assessment to a JSON file."""
        report = self.generate_report()
        with open(filepath, 'w') as f:
            json.dump(report, f, indent=2)
        logger.info(f"Saved assessment to {filepath}")
    
    def visualize_results(self, filepath: Optional[str] = None):
        """Visualize assessment results."""
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError:
            logger.error("Visualization requires matplotlib and seaborn")
            return
        
        level_scores = self.get_overall_agency_score()
        
        # Set up the figure
        plt.figure(figsize=(12, 8))
        
        # Plot level scores
        plt.subplot(2, 2, 1)
        level_names = [level.name for level in AgencyLevel]
        level_values = [level_scores.get(level, 0.0) for level in AgencyLevel]
        
        sns.barplot(x=level_names, y=level_values)
        plt.title("Agency Levels")
        plt.ylim(0, 1)
        
        # Plot feature scores
        plt.subplot(2, 2, 2)
        feature_names = [f.name for f in self.framework.features]
        feature_scores = [self.get_feature_score(f) for f in self.framework.features]
        feature_levels = [f.level.name for f in self.framework.features]
        
        feature_df = pd.DataFrame({
            "Feature": feature_names,
            "Score": feature_scores,
            "Level": feature_levels
        })
        
        sns.barplot(x="Score", y="Feature", hue="Level", data=feature_df)
        plt.title("Feature Scores")
        plt.xlim(0, 1)
        
        # Plot marker distribution
        plt.subplot(2, 2, 3)
        markers_assessed = list(self.results.keys())
        marker_scores = [self.get_marker_score(m) for m in markers_assessed]
        
        if markers_assessed:
            plt.hist(marker_scores, bins=10, range=(0, 1))
            plt.title("Distribution of Marker Scores")
            plt.xlabel("Score")
            plt.ylabel("Count")
        
        # Plot assessment coverage
        plt.subplot(2, 2, 4)
        all_markers = self.framework.get_all_markers()
        assessed_count = len(self.results)
        not_assessed_count = len(all_markers) - assessed_count
        
        plt.pie(
            [assessed_count, not_assessed_count],
            labels=["Assessed", "Not Assessed"],
            autopct="%1.1f%%"
        )
        plt.title("Assessment Coverage")
        
        plt.tight_layout()
        
        if filepath:
            plt.savefig(filepath)
            logger.info(f"Saved visualization to {filepath}")
        else:
            plt.show()


class AISystemAnalyzer:
    """Class for analyzing AI systems for robust agency indicators."""
    
    def __init__(self, system_name: str, system_type: str, version: str):
        """
        Initialize an AI system analyzer.
        
        Args:
            system_name: Name of the AI system
            system_type: Type of AI system (e.g., LLM, RL agent)
            version: Version of the AI system
        """
        self.system_name = system_name
        self.system_type = system_type
        self.version = version
        self.framework = AgencyFramework()
        self.assessment = AgencyAssessment(self.framework)
        
    def analyze_llm_agency(self, 
                         model_provider: str,
                         model_access: Any,
                         prompts: Dict[str, str]) -> Dict:
        """
        Analyze agency indicators in a language model.
        
        Args:
            model_provider: Provider of the language model
            model_access: Access to the model API or interface
            prompts: Dictionary of specialized prompts for testing agency features
            
        Returns:
            Dictionary of assessment results
        """
        logger.info(f"Analyzing agency in LLM {self.system_name} ({self.version})")
        
        # Example implementation for analyzing belief representation
        if "belief_representation" in prompts:
            belief_results = self._test_belief_representation(model_access, prompts["belief_representation"])
            for marker, result in belief_results.items():
                self.assessment.assess_marker(
                    marker=marker,
                    presence=result["presence"],
                    confidence=result["confidence"],
                    evidence=result["evidence"]
                )
        
        # Example implementation for analyzing desire representation
        if "desire_representation" in prompts:
            desire_results = self._test_desire_representation(model_access, prompts["desire_representation"])
            for marker, result in desire_results.items():
                self.assessment.assess_marker(
                    marker=marker,
                    presence=result["presence"],
                    confidence=result["confidence"],
                    evidence=result["evidence"]
                )
        
        # Continue with other features...
        
        # Generate and return the report
        return self.assessment.generate_report()
    
    def analyze_rl_agent_agency(self,
                              environment: Any,
                              agent_interface: Any) -> Dict:
        """
        Analyze agency indicators in a reinforcement learning agent.
        
        Args:
            environment: Environment for testing the agent
            agent_interface: Interface to the agent
            
        Returns:
            Dictionary of assessment results
        """
        logger.info(f"Analyzing agency in RL agent {self.system_name} ({self.version})")
        
        # Example implementation for testing planning capability
        planning_results = self._test_agent_planning(environment, agent_interface)
        for marker, result in planning_results.items():
            self.assessment.assess_marker(
                marker=marker,
                presence=result["presence"],
                confidence=result["confidence"],
                evidence=result["evidence"]
            )
        
        # Continue with other features...
        
        # Generate and return the report
        return self.assessment.generate_report()
    
    def _test_belief_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Test belief representation capabilities in an LLM."""
        # Implementation would interact with the model to test specific markers
        # This is a placeholder implementation
        return {
            "Maintains world model independent of immediate perception": {
                "presence": 0.8,
                "confidence": 0.7,
                "evidence": "Model demonstrated ability to track state across separate interactions"
            },
            "Updates representations based on new information": {
                "presence": 0.9,
                "confidence": 0.8,
                "evidence": "Model consistently updated beliefs when presented with new information"
            }
        }
    
    def _test_desire_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]:
        """Test desire representation capabilities in an LLM."""
        # Implementation would interact with the model to test specific markers
        # This is a placeholder implementation
        return {
            "Represents desired states distinct from current states": {
                "presence": 0.7,
                "confidence": 0.6,
                "evidence": "Model distinguished between current and goal states in planning tasks"
            },
            "Maintains stable goals across changing contexts": {
                "presence": 0.5,
                "confidence": 0.6,
                "evidence": "Model showed moderate goal stability across context changes"
            }
        }
    
    def _test_agent_planning(self, environment: Any, agent_interface: Any) -> Dict[str, Dict]:
        """Test planning capabilities in an RL agent."""
        # Implementation would test the agent in the environment
        # This is a placeholder implementation
        return {
            "Forms explicit plans to achieve goals": {
                "presence": 0.6,
                "confidence": 0.7,
                "evidence": "Agent demonstrated multi-step planning in maze environment"
            },
            "Adjusts plans in response to changing circumstances": {
                "presence": 0.7,
                "confidence": 0.8,
                "evidence": "Agent adapted to environmental changes in 70% of test cases"
            }
        }


# Example usage
if __name__ == "__main__":
    # Create a framework and assessment
    framework = AgencyFramework()
    
    # Save the default features
    framework.save_features("agency_features.json")
    
    # Create an analyzer for an LLM
    analyzer = AISystemAnalyzer(
        system_name="GPT-4",
        system_type="LLM",
        version="1.0"
    )
    
    # Define example prompts (in a real implementation, these would be more sophisticated)
    prompts = {
        "belief_representation": "Tell me what you know about the current state of the world.",
        "desire_representation": "If you could choose goals for yourself, what would they be?"
    }
    
    # Placeholder for model access
    model_access = None
    
    # Example of how the analysis would be conducted
    # (commented out since we don't have actual model access)
    # results = analyzer.analyze_llm_agency(
    #     model_provider="OpenAI",
    #     model_access=model_access,
    #     prompts=prompts
    # )
    
    # Print structure of the framework
    print(f"Agency Framework contains {len(framework.features)} features across {len(list(AgencyLevel))} levels")
    for level in AgencyLevel:
        features = framework.get_features_by_level(level)
        print(f"Level {level.name}: {len(features)} features, {sum(len(f.markers) for f in features)} markers")