Spaces:

swaleha19
/

agent_tuning_framework

Running

File size: 16,073 Bytes

6c482f9

"""
Evaluation Framework for Cross-Domain Uncertainty Quantification

This module provides functionality for evaluating uncertainty quantification methods
across different domains, including metrics for uncertainty quality and cross-domain performance.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Dict, Any, Union, Optional, Tuple
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

class UncertaintyEvaluator:
    """Evaluator for uncertainty quantification methods."""
    
    def __init__(self, name: str):
        """
        Initialize the uncertainty evaluator.
        
        Args:
            name: Name of the evaluation method
        """
        self.name = name
    
    def evaluate(
        self, 
        uncertainties: List[float], 
        correctness: List[bool]
    ) -> Dict[str, float]:
        """
        Evaluate uncertainty estimates against correctness.
        
        Args:
            uncertainties: List of uncertainty scores (higher means more uncertain)
            correctness: List of boolean correctness indicators
            
        Returns:
            Dictionary of evaluation metrics
        """
        raise NotImplementedError("Subclasses must implement this method")


class CalibrationEvaluator(UncertaintyEvaluator):
    """Evaluator for calibration quality."""
    
    def __init__(self):
        """Initialize the calibration evaluator."""
        super().__init__("calibration_evaluator")
    
    def expected_calibration_error(
        self, 
        confidences: List[float], 
        correctness: List[bool], 
        num_bins: int = 10
    ) -> float:
        """
        Calculate Expected Calibration Error (ECE).
        
        Args:
            confidences: List of confidence scores
            correctness: List of boolean correctness indicators
            num_bins: Number of bins for binning confidences
            
        Returns:
            Expected Calibration Error
        """
        if len(confidences) != len(correctness):
            raise ValueError("Confidences and correctness must have the same length")
        
        if not confidences:
            return 0.0
        
        # Create bins and calculate ECE
        bin_indices = np.digitize(confidences, np.linspace(0, 1, num_bins))
        ece = 0.0
        
        for bin_idx in range(1, num_bins + 1):
            bin_mask = (bin_indices == bin_idx)
            if np.any(bin_mask):
                bin_confidences = np.array(confidences)[bin_mask]
                bin_correctness = np.array(correctness)[bin_mask]
                bin_confidence = np.mean(bin_confidences)
                bin_accuracy = np.mean(bin_correctness)
                bin_size = np.sum(bin_mask)
                
                # Weighted absolute difference between confidence and accuracy
                ece += (bin_size / len(confidences)) * np.abs(bin_confidence - bin_accuracy)
        
        return float(ece)
    
    def maximum_calibration_error(
        self, 
        confidences: List[float], 
        correctness: List[bool], 
        num_bins: int = 10
    ) -> float:
        """
        Calculate Maximum Calibration Error (MCE).
        
        Args:
            confidences: List of confidence scores
            correctness: List of boolean correctness indicators
            num_bins: Number of bins for binning confidences
            
        Returns:
            Maximum Calibration Error
        """
        if len(confidences) != len(correctness):
            raise ValueError("Confidences and correctness must have the same length")
        
        if not confidences:
            return 0.0
        
        # Create bins and calculate MCE
        bin_indices = np.digitize(confidences, np.linspace(0, 1, num_bins))
        max_ce = 0.0
        
        for bin_idx in range(1, num_bins + 1):
            bin_mask = (bin_indices == bin_idx)
            if np.any(bin_mask):
                bin_confidences = np.array(confidences)[bin_mask]
                bin_correctness = np.array(correctness)[bin_mask]
                bin_confidence = np.mean(bin_confidences)
                bin_accuracy = np.mean(bin_correctness)
                
                # Absolute difference between confidence and accuracy
                ce = np.abs(bin_confidence - bin_accuracy)
                max_ce = max(max_ce, ce)
        
        return float(max_ce)
    
    def evaluate(
        self, 
        confidences: List[float], 
        correctness: List[bool]
    ) -> Dict[str, float]:
        """
        Evaluate calibration quality.
        
        Args:
            confidences: List of confidence scores
            correctness: List of boolean correctness indicators
            
        Returns:
            Dictionary of calibration metrics:
                - ece: Expected Calibration Error
                - mce: Maximum Calibration Error
        """
        return {
            "ece": self.expected_calibration_error(confidences, correctness),
            "mce": self.maximum_calibration_error(confidences, correctness)
        }
    
    def plot_reliability_diagram(
        self, 
        confidences: List[float], 
        correctness: List[bool], 
        num_bins: int = 10,
        title: str = "Reliability Diagram",
        save_path: Optional[str] = None
    ) -> None:
        """
        Plot a reliability diagram for calibration visualization.
        
        Args:
            confidences: List of confidence scores
            correctness: List of boolean correctness indicators
            num_bins: Number of bins for binning confidences
            title: Title for the plot
            save_path: Path to save the plot (None to display)
        """
        if len(confidences) != len(correctness):
            raise ValueError("Confidences and correctness must have the same length")
        
        # Create bins
        bin_edges = np.linspace(0, 1, num_bins + 1)
        bin_indices = np.digitize(confidences, bin_edges[:-1])
        
        # Calculate accuracy and confidence for each bin
        bin_accuracies = []
        bin_confidences = []
        bin_sizes = []
        
        for bin_idx in range(1, num_bins + 1):
            bin_mask = (bin_indices == bin_idx)
            if np.any(bin_mask):
                bin_confidences.append(np.mean(np.array(confidences)[bin_mask]))
                bin_accuracies.append(np.mean(np.array(correctness)[bin_mask]))
                bin_sizes.append(np.sum(bin_mask))
            else:
                bin_confidences.append(0)
                bin_accuracies.append(0)
                bin_sizes.append(0)
        
        # Plot reliability diagram
        plt.figure(figsize=(10, 6))
        
        # Plot perfect calibration line
        plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
        
        # Plot bin accuracies vs. confidences
        plt.bar(
            bin_edges[:-1], 
            bin_accuracies, 
            width=1/num_bins, 
            align='edge', 
            alpha=0.7, 
            label='Observed Accuracy'
        )
        
        # Plot confidence histogram
        ax2 = plt.twinx()
        ax2.hist(
            confidences, 
            bins=bin_edges, 
            alpha=0.3, 
            color='gray', 
            label='Confidence Histogram'
        )
        
        # Calculate ECE and MCE
        ece = self.expected_calibration_error(confidences, correctness, num_bins)
        mce = self.maximum_calibration_error(confidences, correctness, num_bins)
        
        # Add ECE and MCE to title
        plt.title(f"{title}\nECE: {ece:.4f}, MCE: {mce:.4f}")
        
        # Add labels and legend
        plt.xlabel('Confidence')
        plt.ylabel('Accuracy')
        ax2.set_ylabel('Count')
        
        # Add legend
        lines, labels = plt.gca().get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax2.legend(lines + lines2, labels + labels2, loc='best')
        
        # Save or display the plot
        if save_path:
            plt.savefig(save_path)
            plt.close()
        else:
            plt.tight_layout()
            plt.show()


class SelectivePredictionEvaluator(UncertaintyEvaluator):
    """Evaluator for selective prediction performance."""
    
    def __init__(self):
        """Initialize the selective prediction evaluator."""
        super().__init__("selective_prediction_evaluator")
    
    def evaluate(
        self, 
        uncertainties: List[float], 
        correctness: List[bool]
    ) -> Dict[str, float]:
        """
        Evaluate selective prediction performance.
        
        Args:
            uncertainties: List of uncertainty scores (higher means more uncertain)
            correctness: List of boolean correctness indicators
            
        Returns:
            Dictionary of selective prediction metrics:
                - auroc: Area Under ROC Curve for predicting errors
                - auprc: Area Under Precision-Recall Curve for predicting errors
                - uncertainty_error_correlation: Correlation between uncertainty and errors
        """
        if len(uncertainties) != len(correctness):
            raise ValueError("Uncertainties and correctness must have the same length")
        
        if not uncertainties:
            return {
                "auroc": 0.5,
                "auprc": 0.5,
                "uncertainty_error_correlation": 0.0
            }
        
        # Convert correctness to errors (1 for error, 0 for correct)
        errors = [1 - int(c) for c in correctness]
        
        # Calculate AUROC for predicting errors
        try:
            auroc = roc_auc_score(errors, uncertainties)
        except:
            # Handle case where all predictions are correct or all are wrong
            auroc = 0.5
        
        # Calculate AUPRC for predicting errors
        try:
            precision, recall, _ = precision_recall_curve(errors, uncertainties)
            auprc = auc(recall, precision)
        except:
            # Handle case where all predictions are correct or all are wrong
            auprc = 0.5
        
        # Calculate correlation between uncertainty and errors
        uncertainty_error_correlation = np.corrcoef(uncertainties, errors)[0, 1]
        
        return {
            "auroc": float(auroc),
            "auprc": float(auprc),
            "uncertainty_error_correlation": float(uncertainty_error_correlation)
        }
    
    def plot_selective_prediction_curve(
        self, 
        uncertainties: List[float], 
        correctness: List[bool],
        title: str = "Selective Prediction Performance",
        save_path: Optional[str] = None
    ) -> None:
        """
        Plot a selective prediction curve.
        
        Args:
            uncertainties: List of uncertainty scores (higher means more uncertain)
            correctness: List of boolean correctness indicators
            title: Title for the plot
            save_path: Path to save the plot (None to display)
        """
        if len(uncertainties) != len(correctness):
            raise ValueError("Uncertainties and correctness must have the same length")
        
        # Sort by uncertainty (ascending)
        sorted_indices = np.argsort(uncertainties)
        sorted_correctness = np.array(correctness)[sorted_indices]
        
        # Calculate cumulative accuracy at different coverage levels
        coverages = np.linspace(0, 1, 100)
        accuracies = []
        
        for coverage in coverages:
            if coverage == 0:
                accuracies.append(1.0)  # Perfect accuracy at 0% coverage
            else:
                n_samples = int(coverage * len(sorted_correctness))
                if n_samples == 0:
                    accuracies.append(1.0)
                else:
                    accuracies.append(np.mean(sorted_correctness[:n_samples]))
        
        # Plot selective prediction curve
        plt.figure(figsize=(10, 6))
        plt.plot(coverages, accuracies, 'b-', linewidth=2)
        
        # Add reference line for random selection
        plt.plot([0, 1], [np.mean(correctness), np.mean(correctness)], 'k--', label='Random Selection')
        
        # Calculate AUROC
        metrics = self.evaluate(uncertainties, correctness)
        
        # Add AUROC to title
        plt.title(f"{title}\nAUROC: {metrics['auroc']:.4f}")
        
        # Add labels and legend
        plt.xlabel('Coverage')
        plt.ylabel('Accuracy')
        plt.legend(loc='best')
        
        # Save or display the plot
        if save_path:
            plt.savefig(save_path)
            plt.close()
        else:
            plt.tight_layout()
            plt.show()


class CrossDomainEvaluator:
    """Evaluator for cross-domain uncertainty performance."""
    
    def __init__(self):
        """Initialize the cross-domain evaluator."""
        self.name = "cross_domain_evaluator"
        self.calibration_evaluator = CalibrationEvaluator()
        self.selective_prediction_evaluator = SelectivePredictionEvaluator()
    
    def evaluate_domain_transfer(
        self, 
        source_uncertainties: List[float],
        source_correctness: List[bool],
        target_uncertainties: List[float],
        target_correctness: List[bool]
    ) -> Dict[str, float]:
        """
        Evaluate domain transfer performance.
        
        Args:
            source_uncertainties: List of uncertainty scores from source domain
            source_correctness: List of boolean correctness indicators from source domain
            target_uncertainties: List of uncertainty scores from target domain
            target_correctness: List of boolean correctness indicators from target domain
            
        Returns:
            Dictionary of domain transfer metrics:
                - source_auroc: AUROC in source domain
                - target_auroc: AUROC in target domain
                - transfer_degradation: Degradation in AUROC from source to target
                - source_ece: ECE in source domain
                - target_ece: ECE in target domain
                - calibration_shift: Shift in calibration from source to target
        """
        # Evaluate source domain
        source_selective = self.selective_prediction_evaluator.evaluate(
            source_uncertainties, source_correctness
        )
        source_calibration = self.calibration_evaluator.evaluate(
            [1 - u for u in source_uncertainties], source_correctness
        )
        
        # Evaluate target domain
        target_selective = self.selective_prediction_evaluator.evaluate(
            target_uncertainties, target_correctness
        )
        target_calibration = self.calibration_evaluator.evaluate(
            [1 - u for u in target_uncertainties], target_correctness
        )
        
        # Calculate transfer metrics
        transfer_degradation = source_selective["auroc"] - target_selective["auroc"]
        calibration_shift = target_calibration["ece"] - source_calibration["ece"]
        
        return {
            "source_auroc": source_selective["auroc"],
            "target_auroc": target_selective["auroc"],
            "transfer_degradation": float(transfer_degradation),
            "source_ece": source_calibration["ece"],
            "target_ece": target_calibration["ece"],
            "calibration_shift": float(calibration_shift)
        }
    
    def evaluate_all_domains(
        self, 
        domain_results: Dict[str, Dict[str, Any]]
    ) -> Dict[str, Dict[str, float]]:
        """
        Evaluate uncertainty performance across all domains.
        
        Args:
            domain_results: Dictionary mapping domain names to results
                Each result should contain:
                - uncertainties: List of uncertai
(Content truncated due to size limit. Use line ranges to read in chunks)