FireShadow's picture
Initial clean commit
1721aea
"""
Regression Discontinuity Design (RDD) Estimator.
Tries the evan-magnusson/rdd package first, falling back to a basic
linear-interaction OLS fit around the cutoff if that fails. A DoWhy-based
estimator is also provided but is not called by estimate_effect.
"""
import pandas as pd
import statsmodels.api as sm
from dowhy import CausalModel
from typing import Dict, Any, List, Optional
import logging
from langchain.chat_models.base import BaseChatModel # For type hinting llm
from .diagnostics import run_rdd_diagnostics
from .llm_assist import interpret_rdd_results
logger = logging.getLogger(__name__)

# Optional dependency: the evan-magnusson/rdd package.
# ``from rdd import rdd`` binds the name ``rdd`` on its own, so a preceding
# plain ``import rdd`` is redundant and has been dropped.
# When the import fails, ``rdd`` is explicitly bound to None so later code can
# test for availability instead of tripping over an unbound name.
_rdd_estimator_func_em = None
_rdd_optimal_bw_func_em = None
_rdd_em_import_error_message = ""
try:
    from rdd import rdd
    # Handles to the two entry points used by this module; left as None if the
    # installed package does not expose them.
    _rdd_estimator_func_em = getattr(rdd, "rdd", None)
    _rdd_optimal_bw_func_em = getattr(rdd, "optimal_bandwidth", None)
    logger.info("Successfully imported 'rdd' and 'optimal_bandwidth' from evan-magnusson/rdd package.")
except ImportError as e:
    rdd = None
    _rdd_em_import_error_message = f"ImportError for evan-magnusson/rdd: {e}. This package is needed for 'effect_estimate_rdd'."
    logger.warning(_rdd_em_import_error_message)
except Exception as e:  # Catch other potential errors during import
    rdd = None
    _rdd_em_import_error_message = f"An unexpected error occurred during import from evan-magnusson/rdd: {e}"
    logger.warning(_rdd_em_import_error_message)
def estimate_effect_dowhy(df: pd.DataFrame, treatment: str, outcome: str, running_variable: str, cutoff_value: float, covariates: Optional[List[str]], **kwargs) -> Dict[str, Any]:
    """Estimate the RDD effect with DoWhy's ``iv.regression_discontinuity`` method.

    Args:
        df: Input data.
        treatment: Name of the treatment column passed to ``CausalModel``.
        outcome: Name of the outcome column.
        running_variable: Name of the running (assignment) variable column.
        cutoff_value: Threshold of the running variable.
        covariates: Optional covariate names; only triggers a warning here,
            they are not forwarded to DoWhy.
        **kwargs: ``bandwidth`` (optional, positive) is read from here. A
            missing or non-positive value is replaced by 10% of the running
            variable's range.

    Returns:
        Dict with ``effect_estimate``, ``p_value``, ``confidence_interval``,
        ``standard_error`` and ``method_details``. The last three statistics
        are None when the DoWhy estimate object does not expose them.

    Raises:
        ValueError: If no positive bandwidth can be determined (for example
            the running variable is constant).
    """
    logger.info("Attempting RDD estimation using DoWhy.")
    if covariates:
        logger.warning("Covariates provided but may not be used by the DoWhy RDD method_name='rdd'. Support varies.")

    # Resolve the bandwidth before building any model so that invalid input
    # fails fast with a clear message instead of deep inside DoWhy.
    bandwidth = kwargs.get('bandwidth')
    if bandwidth is not None and not bandwidth > 0:
        # ``not > 0`` also rejects NaN.
        logger.warning(f"User-specified bandwidth {bandwidth} is not positive. Using basic default instead.")
        bandwidth = None
    if bandwidth is None:
        # Very basic default bandwidth if none provided - consider better methods
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        if not bandwidth > 0:
            raise ValueError("Running variable range is not positive. Cannot determine a default bandwidth for DoWhy RDD.")
        logger.warning(f"No bandwidth specified, using basic default: {bandwidth:.3f}")

    # For DoWhy RDD, we don't typically specify common causes in the model
    # constructor in the same way as backdoor. The running variable is handled
    # via method_params.
    model = CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        # No explicit graph needed for iv.regression_discontinuity method
    )

    # estimate_effect() takes the identified estimand as its first argument.
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

    estimate = model.estimate_effect(
        identified_estimand,
        method_name="iv.regression_discontinuity",
        method_params={
            'rd_variable_name': running_variable,
            'rd_threshold_value': cutoff_value,
            'rd_bandwidth': bandwidth,
        },
        test_significance=True  # Ask DoWhy to calculate p-values if possible
    )

    effect = estimate.value

    # These attributes may be absent on the estimate object; default to None.
    p_value = getattr(estimate, 'test_significance_pvalue', None)
    if isinstance(p_value, (list, tuple)):
        # Unwrap a sequence-wrapped p-value; an empty sequence means "none".
        p_value = p_value[0] if p_value else None
    conf_int = getattr(estimate, 'confidence_interval', None)
    std_err = getattr(estimate, 'standard_error', None)

    return {
        'effect_estimate': effect,
        'p_value': p_value,
        'confidence_interval': conf_int,
        'standard_error': std_err,
        'method_details': f"DoWhy RDD (Bandwidth: {bandwidth:.3f})",
    }
def estimate_effect_fallback(df: pd.DataFrame, treatment: str, outcome: str, running_variable: str, cutoff_value: float, covariates: Optional[List[str]], **kwargs) -> Dict[str, Any]:
    """Estimate the RDD effect with a local linear-interaction OLS regression.

    Rows within ``bandwidth`` of the cutoff are kept and the model
    ``outcome ~ const + above_cutoff + running_centered + running_x_above``
    is fitted with HC1 robust standard errors. The coefficient on
    ``above_cutoff`` is reported as the jump at the cutoff.

    Args:
        df: Input data (not modified; a filtered copy is used).
        treatment: Unused; accepted for a uniform signature.
        outcome: Name of the outcome column.
        running_variable: Name of the running variable column.
        cutoff_value: Threshold of the running variable.
        covariates: Ignored (a warning is logged if provided).
        **kwargs: ``bandwidth`` is read from here; when missing, 10% of the
            running variable's range is used.

    Returns:
        Dict with ``effect_estimate``, ``p_value``, ``confidence_interval``
        (two-element list), ``standard_error``, ``method_details``,
        ``formula`` and ``model_summary`` (statsmodels summary object).

    Raises:
        ValueError: If no rows fall inside the bandwidth, required columns are
            missing, no rows remain after dropping NaNs, or the remaining rows
            all lie on one side of the cutoff.
    """
    logger.warning("DoWhy RDD failed or not used. Falling back to simple linear regression comparison.")
    if covariates:
        logger.warning("Covariates provided but are ignored in the fallback RDD linear regression estimation.")

    bandwidth = kwargs.get('bandwidth')
    if bandwidth is None:
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        logger.warning(f"No bandwidth specified for fallback, using basic default: {bandwidth:.3f}")

    # Keep only observations inside the (inclusive) window around the cutoff.
    df_bw = df[(df[running_variable] >= cutoff_value - bandwidth) & (df[running_variable] <= cutoff_value + bandwidth)].copy()
    if df_bw.empty:
        raise ValueError("No data within the specified bandwidth.")

    df_bw['above_cutoff'] = (df_bw[running_variable] >= cutoff_value).astype(int)
    # Centering puts the intercept at the cutoff; the interaction term allows
    # different slopes below and above it.
    df_bw['running_centered'] = df_bw[running_variable] - cutoff_value
    df_bw['running_x_above'] = df_bw['running_centered'] * df_bw['above_cutoff']
    predictors = ['above_cutoff', 'running_centered', 'running_x_above']
    # Covariates are intentionally NOT included in this basic RDD model.

    required_cols = [outcome] + predictors
    missing_cols = [col for col in required_cols if col not in df_bw.columns]
    if missing_cols:
        raise ValueError(f"Fallback RDD missing columns: {missing_cols}")

    df_analysis = df_bw[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for fallback RDD.")

    # The jump is only identified when both sides of the cutoff are observed.
    # With a one-sided sample ``above_cutoff`` is constant, so its coefficient
    # would not measure a discontinuity at all.
    n_above = int(df_analysis['above_cutoff'].sum())
    n_below = len(df_analysis) - n_above
    if n_above == 0 or n_below == 0:
        raise ValueError(
            "Fallback RDD requires observations on both sides of the cutoff within the bandwidth "
            f"(below: {n_below}, at/above: {n_above})."
        )

    X = sm.add_constant(df_analysis[predictors])
    y = df_analysis[outcome]
    formula = f"{outcome} ~ {' + '.join(predictors)} + const"
    logger.info(f"Running fallback RDD regression: {formula}")

    # Use heteroskedasticity-robust (HC1) standard errors.
    results = sm.OLS(y, X).fit(cov_type='HC1')

    # The coefficient for 'above_cutoff' represents the jump at the cutoff.
    effect = results.params['above_cutoff']
    p_value = results.pvalues['above_cutoff']
    conf_int = results.conf_int().loc['above_cutoff'].tolist()
    std_err = results.bse['above_cutoff']

    return {
        'effect_estimate': effect,
        'p_value': p_value,
        'confidence_interval': conf_int,
        'standard_error': std_err,
        'method_details': f"Fallback Linear Interaction (Bandwidth: {bandwidth:.3f})",
        'formula': formula,
        'model_summary': results.summary()
    }
def effect_estimate_rdd(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff_value: float,
    treatment: Optional[str] = None,  # Kept for API consistency, but unused by evan-magnusson/rdd
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None,
    **kwargs
) -> Dict[str, Any]:
    """Estimate the RDD effect using the 'evan-magnusson/rdd' package.

    Bandwidth is chosen in this order: a positive user-supplied ``bandwidth``;
    otherwise the package's ``optimal_bandwidth``; otherwise 10% of the
    running variable's range.

    Args:
        df: Input data.
        outcome: Name of the outcome column.
        running_variable: Name of the running variable column.
        cutoff_value: Threshold of the running variable.
        treatment: Unused; only logged when provided.
        covariates: Ignored (a warning is logged if provided).
        bandwidth: Optional positive bandwidth around the cutoff.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        Dict with ``effect_estimate``, ``standard_error``, ``p_value``,
        ``confidence_interval``, ``method_details``, ``bandwidth_used``,
        ``bandwidth_selection_method``, ``n_obs_in_bandwidth``, ``formula``
        and ``model_summary`` (text). The statistics are read from the
        ``TREATED`` term of the fitted model and are None if it is absent.

    Raises:
        ImportError: If the evan-magnusson/rdd package could not be imported.
        ValueError: If no positive bandwidth can be determined.
        Exception: Any error raised during estimation is logged and re-raised.
    """
    # The optional import at module level may have failed, leaving ``rdd``
    # unbound (or None). Surface that as ImportError up front rather than as a
    # NameError halfway through bandwidth selection.
    rdd_module = globals().get("rdd")
    if rdd_module is None:
        raise ImportError(
            globals().get("_rdd_em_import_error_message")
            or "The evan-magnusson/rdd package is not available."
        )

    logger.info(f"Attempting RDD estimation using 'evan-magnusson/rdd' for outcome '{outcome}' and running variable '{running_variable}'.")
    if treatment:
        logger.info(f"Treatment variable '{treatment}' provided but is not explicitly used by the evan-magnusson/rdd estimation function.")
    if covariates:
        logger.warning("Covariates provided but are ignored by this 'evan-magnusson/rdd' implementation.")

    # --- Bandwidth Selection ---
    final_bandwidth = None
    bandwidth_selection_method = "unknown"
    if bandwidth is not None and bandwidth > 0:
        logger.info(f"Using user-specified bandwidth: {bandwidth:.4f}")
        final_bandwidth = bandwidth
        bandwidth_selection_method = "user-specified"
    else:
        if bandwidth is not None and bandwidth <= 0:
            logger.warning(f"User-specified bandwidth {bandwidth} is not positive. Attempting IK optimal bandwidth selection.")
        try:
            logger.info(f"Attempting IK optimal bandwidth selection using _rdd_optimal_bw_func_em for {outcome} ~ {running_variable} cut at {cutoff_value}.")
            optimal_bw_val = rdd_module.optimal_bandwidth(df[outcome], df[running_variable], cut=cutoff_value)
            if optimal_bw_val is not None and optimal_bw_val > 0:
                final_bandwidth = optimal_bw_val
                bandwidth_selection_method = "ik_optimal (evan-magnusson/rdd)"
                logger.info(f"IK optimal bandwidth from evan-magnusson/rdd: {final_bandwidth:.4f}")
            else:
                logger.warning(f"IK optimal bandwidth from evan-magnusson/rdd was None or non-positive: {optimal_bw_val}. Falling back to default.")
        except Exception as e:
            # Best effort: any failure here just moves on to the default.
            logger.warning(f"IK optimal bandwidth selection from evan-magnusson/rdd failed: {e}. Falling back to default.")

        if final_bandwidth is None:  # Fallback if user did not specify and IK failed/invalid
            logger.info("Falling back to default bandwidth (10% of running variable range).")
            rv_range = df[running_variable].max() - df[running_variable].min()
            if rv_range > 0:
                final_bandwidth = 0.1 * rv_range
                bandwidth_selection_method = "default_10_percent_range"
                logger.info(f"Using default 10% range bandwidth: {final_bandwidth:.4f}")
            else:
                err_msg = "Running variable range is not positive. Cannot determine a default bandwidth for evan-magnusson/rdd."
                logger.error(err_msg)
                raise ValueError(err_msg)

    # Defensive final check before estimating.
    if final_bandwidth is None or final_bandwidth <= 0:
        raise ValueError(f"Could not determine a valid positive bandwidth for evan-magnusson/rdd. Last method: {bandwidth_selection_method}")

    # --- RDD Estimation ---
    try:
        logger.info(f"Running RDD estimation with evan-magnusson/rdd: y='{outcome}', x='{running_variable}', cut={cutoff_value}, bw={final_bandwidth:.4f}")
        # Restrict the data to the window around the cutoff, then build the model.
        data_rdd = rdd_module.truncated_data(df, running_variable, final_bandwidth, cut=cutoff_value)
        model = rdd_module.rdd(
            data_rdd,
            xname=running_variable,
            yname=outcome,
            cut=cutoff_value
        )
        sm_results = model.fit()
        summary_text = sm_results.summary().as_text()
        # Log rather than print: library code should not write to stdout.
        logger.debug("evan-magnusson/rdd model summary:\n%s", summary_text)

        # The treatment dummy is looked up under the name 'TREATED'.
        effect = sm_results.params.get('TREATED')
        std_err = sm_results.bse.get('TREATED')
        p_value = sm_results.pvalues.get('TREATED')
        conf_int_frame = sm_results.conf_int()
        conf_int = conf_int_frame.loc['TREATED'].tolist() if 'TREATED' in conf_int_frame.index else [None, None]
        n_obs = model.nobs

        # NOTE(review): descriptive text only; confirm it matches the equation
        # the package actually fits.
        formula_desc = f"Local linear RDD: {outcome} ~ TREATED + {running_variable}_centered + TREATED*{running_variable}_centered (implicit, from evan-magnusson/rdd)"

        return {
            'effect_estimate': effect,
            'standard_error': std_err,
            'p_value': p_value,
            'confidence_interval': conf_int,
            'method_details': f"RDD (evan-magnusson/rdd package, Bandwidth: {final_bandwidth:.4f})",
            'bandwidth_used': final_bandwidth,
            'bandwidth_selection_method': bandwidth_selection_method,
            'n_obs_in_bandwidth': int(n_obs) if n_obs is not None else None,
            'formula': formula_desc,
            'model_summary': summary_text
        }
    except Exception as e:
        logger.error(f"RDD estimation using 'evan-magnusson/rdd' failed: {e}", exc_info=True)
        # Bare raise keeps the original traceback intact for the caller.
        raise
def estimate_effect(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    running_variable: str,
    cutoff_value: float,
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None,  # Optional bandwidth param
    query: Optional[str] = None,
    llm: Optional["BaseChatModel"] = None,
    **kwargs  # Forwarded to the underlying estimators
) -> Dict[str, Any]:
    """
    Estimates the causal effect using Regression Discontinuity Design.

    Tries ``effect_estimate_rdd`` (evan-magnusson/rdd package) first and, if
    that raises, falls back to ``estimate_effect_fallback``. Diagnostics and an
    interpretation are then attached on a best-effort basis: a failure in
    either is logged and recorded in the result instead of being raised.

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable (often implicitly defined by
            the cutoff); passed through to the estimators.
        outcome: Name of the outcome variable.
        running_variable: Name of the variable determining treatment assignment.
        cutoff_value: The threshold value for the running variable.
        covariates: Optional list of covariate names (support varies).
        bandwidth: Optional bandwidth around the cutoff. If None, the
            estimators choose one.
        query: Optional user query for context (currently unused here).
        llm: Optional Language Model instance used for interpretation.
        **kwargs: Additional keyword arguments for underlying methods.

    Returns:
        Dictionary containing the estimation results plus ``method_used``,
        ``diagnostics`` and ``interpretation``; ``primary_rdd_em_error_info``
        is added when the fallback was used after the primary method failed.

    Raises:
        ValueError: If ``running_variable`` or ``cutoff_value`` is None, or if
            both estimation methods fail.
    """
    if running_variable is None or cutoff_value is None:
        raise ValueError("Missing required RDD arguments: running_variable and cutoff must be provided.")

    results = {}
    rdd_em_estimation_error = None  # Error from effect_estimate_rdd (evan-magnusson)

    # --- Try effect_estimate_rdd (evan-magnusson/rdd) First ---
    try:
        logger.info("Attempting RDD estimation using 'effect_estimate_rdd' (evan-magnusson/rdd package).")
        results = effect_estimate_rdd(
            df,
            outcome,
            running_variable,
            cutoff_value,
            treatment=treatment,
            covariates=covariates,
            bandwidth=bandwidth,
            **kwargs
        )
        results['method_used'] = 'evan-magnusson/rdd'
        logger.info("Successfully estimated effect using 'effect_estimate_rdd'.")
    except ImportError as ie:
        logger.warning(f"'effect_estimate_rdd' could not run due to ImportError (likely evan-magnusson/rdd package not available/functional): {ie}")
        rdd_em_estimation_error = ie
    except Exception as e:
        logger.warning(f"'effect_estimate_rdd' failed during execution: {e}")
        rdd_em_estimation_error = e

    # --- Fallback to estimate_effect_fallback if effect_estimate_rdd failed ---
    if not results:
        logger.info("'effect_estimate_rdd' did not produce results. Attempting fallback using 'estimate_effect_fallback'.")
        try:
            results = dict(estimate_effect_fallback(df, treatment, outcome, running_variable, cutoff_value, covariates, bandwidth=bandwidth, **kwargs))
            results['method_used'] = 'Fallback RDD (Linear Interaction with Robust Errors)'
            logger.info("Successfully estimated effect using 'estimate_effect_fallback'.")
        except Exception as e:
            logger.error(f"Fallback RDD estimation ('estimate_effect_fallback') also failed: {e}")
            logger.error(f"All RDD estimation attempts failed. Last error (from fallback): {e}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"RDD estimation failed. Last error: {e}") from e

    # --- Diagnostics ---
    try:
        results['diagnostics'] = run_rdd_diagnostics(df, outcome, running_variable, cutoff_value, covariates, bandwidth)
    except Exception as diag_e:
        logger.error(f"RDD Diagnostics failed: {diag_e}")
        results['diagnostics'] = {"status": "Failed", "error": str(diag_e)}

    # --- Interpretation ---
    try:
        results['interpretation'] = interpret_rdd_results(results, results.get('diagnostics'), llm=llm)
    except Exception as interp_e:
        logger.error(f"RDD Interpretation failed: {interp_e}")
        results['interpretation'] = "Interpretation failed."

    # Add info about primary attempt if fallback was used
    if rdd_em_estimation_error and results.get('method_used', '').startswith('Fallback'):
        results['primary_rdd_em_error_info'] = str(rdd_em_estimation_error)

    return results