File size: 3,367 Bytes
1721aea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Base functionality for Propensity Score methods 
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from typing import List, Optional, Dict, Any

# Placeholder for LLM interaction to select model type
def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
                            query: Optional[str] = None) -> str:
    '''Choose a propensity-score model family for the given dataset.

    Args:
        df: DataFrame containing treatment and covariates (currently unused)
        treatment: Name of the treatment variable (currently unused)
        covariates: Covariate column names (currently unused)
        query: Optional natural-language hint for model selection (currently unused)

    Returns:
        Model-type identifier; always "logistic" for now.

    Placeholder implementation — a data-driven or LLM-based heuristic is
    planned but not yet wired in.
    '''
    # TODO: inspect data characteristics (size, class balance, nonlinearity)
    # or consult an LLM to choose between model families
    return "logistic"

def estimate_propensity_scores(df: pd.DataFrame, treatment: str, 
                               covariates: List[str], model_type: str = 'logistic',
                               **kwargs) -> np.ndarray:
    '''Estimate propensity scores using a specified model.
    
    Args:
        df: DataFrame containing the data
        treatment: Name of the treatment variable
        covariates: List of covariate variable names
        model_type: Type of model to use ('logistic' supported for now)
        **kwargs: Additional arguments for the model. Recognized keys:
            max_iter (int, default 1000), solver (str, default 'liblinear'),
            C (float, default 1.0), penalty (str, default 'l2'),
            clip_bounds (tuple of two floats, default (0.01, 0.99)) —
            lower/upper truncation applied to the estimated scores.

    Returns:
        Array of propensity scores, clipped to ``clip_bounds``.

    Raises:
        ValueError: If ``model_type`` is not supported.
    '''
    # Validate the model type up front, before doing any work on the data.
    # (Previously the scaler was fitted even for unsupported model types.)
    if model_type.lower() != 'logistic':
        # TODO: Add other model types like Gradient Boosting, etc.
        # elif model_type.lower() == 'gbm':
        #     from sklearn.ensemble import GradientBoostingClassifier
        #     model = GradientBoostingClassifier(...)
        #     model.fit(X, y)
        #     propensity_scores = model.predict_proba(X)[:, 1]
        raise ValueError(f"Unsupported propensity score model type: {model_type}")

    X = df[covariates]
    y = df[treatment]

    # Standardize covariates so the regularization penalty treats them equally
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit logistic regression with configurable hyperparameters
    model = LogisticRegression(max_iter=kwargs.get('max_iter', 1000), 
                               solver=kwargs.get('solver', 'liblinear'), # Use liblinear for L1/L2
                               C=kwargs.get('C', 1.0),
                               penalty=kwargs.get('penalty', 'l2'))
    model.fit(X_scaled, y)

    # P(treatment = 1 | covariates)
    propensity_scores = model.predict_proba(X_scaled)[:, 1]

    # Clip scores to avoid extremes which can cause issues in weighting/matching;
    # bounds are configurable for sensitivity analyses.
    clip_lower, clip_upper = kwargs.get('clip_bounds', (0.01, 0.99))
    propensity_scores = np.clip(propensity_scores, clip_lower, clip_upper)

    return propensity_scores

# Common formatting function (can be expanded)
def format_ps_results(effect_estimate: float, effect_se: float, 
                      diagnostics: Dict[str, Any], method_details: str, 
                      parameters: Dict[str, Any]) -> Dict[str, Any]:
    '''Package a propensity-score effect estimate into the standard result dict.

    Args:
        effect_estimate: Point estimate of the treatment effect
        effect_se: Standard error of the estimate
        diagnostics: Method-specific diagnostic information
        method_details: Human-readable description of the method used
        parameters: Parameters the method was run with

    Returns:
        Dict with the estimate, SE, 95% normal-approximation confidence
        interval, and the pass-through fields.
    '''
    # 95% CI via the normal approximation: estimate +/- 1.96 * SE
    margin = 1.96 * effect_se
    interval = [float(effect_estimate - margin), float(effect_estimate + margin)]
    result = {
        "effect_estimate": float(effect_estimate),
        "effect_se": float(effect_se),
        "confidence_interval": interval,
        "diagnostics": diagnostics,
        "method_details": method_details,
        "parameters": parameters,
        # Add p-value if needed (can be calculated from estimate and SE)
    }
    return result