# Base functionality for Propensity Score methods

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from typing import List, Optional, Dict, Any


# Placeholder for LLM interaction to select model type
def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
                            query: Optional[str] = None) -> str:
    '''Selects the appropriate propensity score model type (e.g., logistic, GBM).

    Placeholder: Currently defaults to Logistic Regression.
    '''
    # TODO: Implement LLM call or heuristic to select model based on data characteristics
    return "logistic"


def estimate_propensity_scores(df: pd.DataFrame, treatment: str,
                               covariates: List[str], model_type: str = 'logistic',
                               **kwargs) -> np.ndarray:
    '''Estimate propensity scores using a specified model.

    Args:
        df: DataFrame containing the data
        treatment: Name of the treatment variable
        covariates: List of covariate variable names
        model_type: Type of model to use ('logistic' supported for now)
        **kwargs: Additional arguments for the model

    Returns:
        Array of propensity scores, clipped to [0.01, 0.99]
    '''
    X = df[covariates]
    y = df[treatment]

    # Standardize covariates for logistic regression
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if model_type.lower() == 'logistic':
        # Fit logistic regression
        model = LogisticRegression(max_iter=kwargs.get('max_iter', 1000),
                                   solver=kwargs.get('solver', 'liblinear'),  # liblinear supports L1/L2
                                   C=kwargs.get('C', 1.0),
                                   penalty=kwargs.get('penalty', 'l2'))
        model.fit(X_scaled, y)
        # Predict probability of treatment (class 1)
        propensity_scores = model.predict_proba(X_scaled)[:, 1]
    # TODO: Add other model types like Gradient Boosting, etc.
    # elif model_type.lower() == 'gbm':
    #     from sklearn.ensemble import GradientBoostingClassifier
    #     model = GradientBoostingClassifier(...)
    #     model.fit(X, y)
    #     propensity_scores = model.predict_proba(X)[:, 1]
    else:
        raise ValueError(f"Unsupported propensity score model type: {model_type}")

    # Clip scores to avoid extremes which can cause issues in weighting/matching
    propensity_scores = np.clip(propensity_scores, 0.01, 0.99)
    return propensity_scores
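
# Example call (illustrative only; the column names "treated", "age" and "income"
# are hypothetical and not defined by this module):
#
#   ps = estimate_propensity_scores(df, treatment="treated",
#                                   covariates=["age", "income"])
#   ipw = np.where(df["treated"] == 1, 1.0 / ps, 1.0 / (1.0 - ps))  # IPW-style weights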


# Common formatting function (can be expanded)
def format_ps_results(effect_estimate: float, effect_se: float,
                      diagnostics: Dict[str, Any], method_details: str,
                      parameters: Dict[str, Any]) -> Dict[str, Any]:
    '''Standard formatter for PS method results.'''
    # 95% confidence interval assuming an approximately normal estimator
    ci_lower = effect_estimate - 1.96 * effect_se
    ci_upper = effect_estimate + 1.96 * effect_se
    return {
        "effect_estimate": float(effect_estimate),
        "effect_se": float(effect_se),
        "confidence_interval": [float(ci_lower), float(ci_upper)],
        "diagnostics": diagnostics,
        "method_details": method_details,
        "parameters": parameters
        # Add p-value if needed (can be calculated from estimate and SE)
    }
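

# Minimal self-contained demo (a sketch, not part of the module's API surface):
# builds a small synthetic dataset, estimates logistic propensity scores, and
# formats a placeholder effect estimate. All column names and effect numbers
# below are illustrative assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 500
    demo = pd.DataFrame({
        "age": rng.normal(50, 10, n),
        "income": rng.normal(40_000, 8_000, n),
    })
    # Assign a synthetic binary treatment (~40% treated)
    demo["treated"] = (rng.random(n) < 0.4).astype(int)

    scores = estimate_propensity_scores(demo, "treated", ["age", "income"])
    print("First propensity scores:", np.round(scores[:5], 3))

    result = format_ps_results(
        effect_estimate=0.12,   # placeholder value, not a real estimate
        effect_se=0.05,         # placeholder value
        diagnostics={"min_ps": float(scores.min()), "max_ps": float(scores.max())},
        method_details="Demo run of logistic propensity score estimation",
        parameters={"model_type": "logistic"},
    )
    print(result)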