Spaces:
Running
Running
File size: 4,231 Bytes
1721aea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
"""
Backdoor Adjustment Estimator using Regression.
Estimates the Average Treatment Effect (ATE) by regressing the outcome on the
treatment and a set of covariates assumed to satisfy the backdoor criterion.
"""
import pandas as pd
import numpy as np
import statsmodels.api as sm
from typing import Dict, Any, List, Optional
import logging
from langchain.chat_models.base import BaseChatModel # For type hinting llm
# Import diagnostics and llm assist (placeholders for now)
from .diagnostics import run_backdoor_diagnostics
from .llm_assist import interpret_backdoor_results, identify_backdoor_set
logger = logging.getLogger(__name__)
def estimate_effect(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    covariates: List[str],  # Backdoor set - required for this method
    query: Optional[str] = None,  # Accepted for interface parity; not read here
    llm: Optional["BaseChatModel"] = None,  # Forwarded to the interpretation helper
    **kwargs  # Accepted and ignored
) -> Dict[str, Any]:
    """
    Estimate a causal effect with Backdoor Adjustment (OLS regression).

    Regresses `outcome` on `treatment`, the adjustment-set covariates and an
    intercept, and reports the fitted coefficient of `treatment` as the effect.
    The caller is responsible for `covariates` satisfying the backdoor
    criterion; this function does not check it.

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable column.
        outcome: Name of the outcome variable column.
        covariates: Covariate column names forming the backdoor adjustment
            set. Repeated names are collapsed (first occurrence wins); a single
            bare string is treated as a one-element list.
        query: Optional user query for context. Not used by this function.
        llm: Optional Language Model instance, passed to
            `interpret_backdoor_results`.
        **kwargs: Additional keyword arguments. Not used by this function.

    Returns:
        Dictionary containing estimation results:
        - 'effect_estimate': The estimated coefficient for the treatment variable.
        - 'p_value': The p-value associated with the treatment coefficient.
        - 'confidence_interval': The 95% confidence interval as [lower, upper].
        - 'standard_error': The standard error of the treatment coefficient.
        - 'formula': The regression formula used (for reporting only).
        - 'model_summary': Summary object from statsmodels.
        - 'diagnostics': Whatever `run_backdoor_diagnostics` returns.
        - 'interpretation': Whatever `interpret_backdoor_results` returns.
        - 'method_used': The constant string 'Backdoor Adjustment (OLS)'.

    Raises:
        ValueError: If the adjustment set is empty, if `treatment` equals
            `outcome`, if `treatment` or `outcome` is listed as a covariate,
            if a required column is missing from `df`, or if no rows remain
            after dropping rows with NaNs in the required columns.
        Exception: Any error raised while fitting, running diagnostics or
            interpreting is logged with its traceback and re-raised unchanged.
    """
    # Normalise the adjustment set. A bare string must be wrapped before
    # de-duplication (otherwise it would be split into characters), and
    # repeated names are dropped so that no column is selected twice.
    if covariates is None:
        covariates = []
    elif isinstance(covariates, str):
        covariates = [covariates]
    adjustment_set = list(dict.fromkeys(covariates))  # order-preserving dedupe

    if not adjustment_set:
        raise ValueError("Backdoor Adjustment requires a non-empty list of covariates (adjustment set).")

    # Overlapping roles would select the same column label more than once.
    # The fitted params would then be indexed by duplicate labels, so
    # `results.params[treatment]` would yield a Series instead of a scalar,
    # and the design matrix would be perfectly collinear.
    if treatment == outcome:
        raise ValueError(
            f"Treatment and outcome must be different columns for Backdoor Adjustment: '{treatment}'"
        )
    overlapping = [col for col in (treatment, outcome) if col in adjustment_set]
    if overlapping:
        raise ValueError(
            f"Treatment/outcome cannot also be part of the adjustment set: {overlapping}"
        )

    required_cols = [treatment, outcome] + adjustment_set
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns for Backdoor Adjustment: {missing_cols}")

    # Listwise deletion: keep only rows complete in every required column.
    df_analysis = df[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for required columns.")

    # Design matrix: treatment + covariates + intercept column ("const").
    # NOTE(review): statsmodels' add_constant defaults to has_constant='skip',
    # so no "const" column is added when X already contains a constant column;
    # the reported formula would still mention "const" in that case.
    X = sm.add_constant(df_analysis[[treatment] + adjustment_set])
    y = df_analysis[outcome]

    # Human-readable formula, used for logging/reporting only (the model is
    # fitted from the X/y matrices, not from this string).
    formula = f"{outcome} ~ " + " + ".join([treatment, *adjustment_set, "const"])
    logger.info("Running Backdoor Adjustment regression: %s", formula)

    try:
        results = sm.OLS(y, X).fit()

        effect_estimate = results.params[treatment]
        p_value = results.pvalues[treatment]
        conf_int = results.conf_int(alpha=0.05).loc[treatment].tolist()
        std_err = results.bse[treatment]

        # The full design matrix is handed over so diagnostics can inspect it
        # (e.g. for VIF-style checks).
        diag_results = run_backdoor_diagnostics(results, X)

        interpretation = interpret_backdoor_results(
            results, diag_results, treatment, adjustment_set, llm=llm
        )

        return {
            'effect_estimate': effect_estimate,
            'p_value': p_value,
            'confidence_interval': conf_int,
            'standard_error': std_err,
            'formula': formula,
            'model_summary': results.summary(),
            'diagnostics': diag_results,
            'interpretation': interpretation,
            'method_used': 'Backdoor Adjustment (OLS)'
        }
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Backdoor Adjustment failed: %s", e)
        raise
|