# Base functionality for Propensity Score methods
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from typing import Any, Dict, List, Optional


# Placeholder for LLM interaction to select model type
def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
                            query: Optional[str] = None) -> str:
    """Select the appropriate propensity score model type (e.g., logistic, GBM).

    Placeholder: currently always returns ``"logistic"``.

    Args:
        df: DataFrame containing the data.
        treatment: Name of the treatment variable.
        covariates: List of covariate variable names.
        query: Optional natural-language query describing the analysis
            (reserved for a future LLM-based selector).

    Returns:
        Identifier of the selected model type.
    """
    # TODO: Implement LLM call or heuristic to select model based on data characteristics
    return "logistic"


def estimate_propensity_scores(df: pd.DataFrame, treatment: str, covariates: List[str],
                               model_type: str = 'logistic', **kwargs) -> np.ndarray:
    """Estimate propensity scores using a specified model.

    Args:
        df: DataFrame containing the data.
        treatment: Name of the (binary) treatment variable.
        covariates: List of covariate variable names.
        model_type: Type of model to use ('logistic' supported for now).
        **kwargs: Additional model arguments. For 'logistic':
            ``max_iter`` (default 1000), ``solver`` (default 'liblinear'),
            ``C`` (default 1.0), ``penalty`` (default 'l2').
            ``clip_bounds``: (low, high) tuple used to clip the scores;
            default ``(0.01, 0.99)``.

    Returns:
        Array of propensity scores, clipped to ``clip_bounds``.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Validate the model type up front so we don't standardize/fit for nothing.
    # TODO: Add other model types, e.g. 'gbm' via GradientBoostingClassifier.
    if model_type.lower() != 'logistic':
        raise ValueError(f"Unsupported propensity score model type: {model_type}")

    X = df[covariates]
    y = df[treatment]

    # Standardize covariates for logistic regression
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit logistic regression (liblinear supports both L1 and L2 penalties)
    model = LogisticRegression(
        max_iter=kwargs.get('max_iter', 1000),
        solver=kwargs.get('solver', 'liblinear'),
        C=kwargs.get('C', 1.0),
        penalty=kwargs.get('penalty', 'l2'),
    )
    model.fit(X_scaled, y)

    # Probability of treatment (class 1)
    propensity_scores = model.predict_proba(X_scaled)[:, 1]

    # Clip scores to avoid extremes which can cause issues in weighting/matching.
    # Bounds are configurable but default to the previous hard-coded (0.01, 0.99).
    low, high = kwargs.get('clip_bounds', (0.01, 0.99))
    return np.clip(propensity_scores, low, high)


# Common formatting function (can be expanded)
def format_ps_results(effect_estimate: float, effect_se: float,
                      diagnostics: Dict[str, Any], method_details: str,
                      parameters: Dict[str, Any]) -> Dict[str, Any]:
    """Standard formatter for PS method results.

    Args:
        effect_estimate: Point estimate of the treatment effect.
        effect_se: Standard error of the estimate.
        diagnostics: Method-specific diagnostic information.
        method_details: Human-readable description of the method used.
        parameters: Parameters the method was run with.

    Returns:
        Dict with the estimate, SE, a 95% normal-approximation confidence
        interval, a two-sided p-value (``None`` when ``effect_se`` is not
        positive), plus the supplied diagnostics/details/parameters.
    """
    # 95% CI under a normal approximation
    ci_lower = effect_estimate - 1.96 * effect_se
    ci_upper = effect_estimate + 1.96 * effect_se

    # Two-sided p-value from the z-statistic: p = 2 * (1 - Phi(|z|)),
    # with Phi computed via math.erf. Guard against a zero/negative SE.
    if effect_se > 0:
        z = effect_estimate / effect_se
        p_value = 2.0 * (1.0 - 0.5 * (1.0 + math.erf(abs(z) / math.sqrt(2.0))))
    else:
        p_value = None

    return {
        "effect_estimate": float(effect_estimate),
        "effect_se": float(effect_se),
        "confidence_interval": [float(ci_lower), float(ci_upper)],
        "p_value": float(p_value) if p_value is not None else None,
        "diagnostics": diagnostics,
        "method_details": method_details,
        "parameters": parameters
    }