# Base functionality for Propensity Score methods
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from typing import List, Optional, Dict, Any
# Placeholder for LLM interaction to select model type
def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
                            query: Optional[str] = None) -> str:
    '''Selects the appropriate propensity score model type (e.g., logistic, GBM).

    Placeholder: Currently defaults to Logistic Regression.
    '''
    # TODO: Implement LLM call or heuristic to select model based on data characteristics
    return "logistic"
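
# A minimal sketch of the heuristic the TODO above refers to. The helper name
# `_heuristic_model_choice` and the size thresholds are illustrative assumptions,
# not a committed design: flexible models like GBM generally need more data,
# so logistic regression stays the safer default for small samples.
def _heuristic_model_choice(df: pd.DataFrame, covariates: List[str]) -> str:
    '''Illustrative heuristic: prefer 'gbm' for large, high-dimensional data.'''
    n_rows, n_covs = len(df), len(covariates)
    if n_rows >= 5000 and n_covs > 10:  # thresholds are assumptions, not tuned values
        return "gbm"  # hypothetical: 'gbm' is not yet supported below
    return "logistic"
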
def estimate_propensity_scores(df: pd.DataFrame, treatment: str,
                               covariates: List[str], model_type: str = 'logistic',
                               **kwargs) -> np.ndarray:
    '''Estimate propensity scores using a specified model.

    Args:
        df: DataFrame containing the data
        treatment: Name of the treatment variable
        covariates: List of covariate variable names
        model_type: Type of model to use ('logistic' supported for now)
        **kwargs: Additional arguments for the model

    Returns:
        Array of propensity scores
    '''
    X = df[covariates]
    y = df[treatment]

    # Standardize covariates for logistic regression
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if model_type.lower() == 'logistic':
        # Fit logistic regression
        model = LogisticRegression(max_iter=kwargs.get('max_iter', 1000),
                                   solver=kwargs.get('solver', 'liblinear'),  # Use liblinear for L1/L2
                                   C=kwargs.get('C', 1.0),
                                   penalty=kwargs.get('penalty', 'l2'))
        model.fit(X_scaled, y)
        # Predict probabilities
        propensity_scores = model.predict_proba(X_scaled)[:, 1]
    # TODO: Add other model types like Gradient Boosting, etc.
    # elif model_type.lower() == 'gbm':
    #     from sklearn.ensemble import GradientBoostingClassifier
    #     model = GradientBoostingClassifier(...)
    #     model.fit(X, y)
    #     propensity_scores = model.predict_proba(X)[:, 1]
    else:
        raise ValueError(f"Unsupported propensity score model type: {model_type}")

    # Clip scores to avoid extremes which can cause issues in weighting/matching
    propensity_scores = np.clip(propensity_scores, 0.01, 0.99)
    return propensity_scores
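
# A minimal usage sketch showing how the clipped scores are typically consumed:
# ATE-style inverse probability weights, 1/e(X) for treated units and
# 1/(1 - e(X)) for controls. `compute_ipw_weights` is a hypothetical helper,
# not part of this module's current API.
def compute_ipw_weights(df: pd.DataFrame, treatment: str,
                        propensity_scores: np.ndarray) -> np.ndarray:
    '''Inverse probability weights for ATE estimation (sketch).'''
    t = df[treatment].to_numpy()
    return np.where(t == 1,
                    1.0 / propensity_scores,
                    1.0 / (1.0 - propensity_scores))
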
# Common formatting function (can be expanded)
def format_ps_results(effect_estimate: float, effect_se: float,
                      diagnostics: Dict[str, Any], method_details: str,
                      parameters: Dict[str, Any]) -> Dict[str, Any]:
    '''Standard formatter for PS method results.'''
    ci_lower = effect_estimate - 1.96 * effect_se
    ci_upper = effect_estimate + 1.96 * effect_se
    return {
        "effect_estimate": float(effect_estimate),
        "effect_se": float(effect_se),
        "confidence_interval": [float(ci_lower), float(ci_upper)],
        "diagnostics": diagnostics,
        "method_details": method_details,
        "parameters": parameters
        # Add p-value if needed (can be calculated from estimate and SE)
    }
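
# A sketch of the p-value mentioned above: a two-sided z-test from the estimate
# and SE, using the same normal approximation as the 1.96 * SE interval.
# `ps_p_value` is a hypothetical helper name, and scipy is an assumed dependency.
def ps_p_value(effect_estimate: float, effect_se: float) -> float:
    '''Two-sided p-value for H0: effect = 0 (normal approximation).'''
    from scipy.stats import norm  # assumed available alongside sklearn
    z = effect_estimate / effect_se
    return float(2 * norm.sf(abs(z)))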