# FireShadow's picture
# Initial clean commit
# 1721aea
"""
Diagnostic checks for Regression Discontinuity Design (RDD).
"""
from typing import Dict, Any, List, Optional
import pandas as pd
import numpy as np
from scipy import stats
import logging
logger = logging.getLogger(__name__)
def run_rdd_diagnostics(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff: float,
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None
) -> Dict[str, Any]:
    """
    Run diagnostic checks for a Regression Discontinuity Design analysis.

    Currently implemented:
        - Covariate balance check: Welch's t-test comparing each covariate's
          mean just below vs. at/above the cutoff, within the bandwidth.

    Reported as placeholders only (not computed here):
        - Density test (McCrary)
        - Placebo cutoff tests
        - Bandwidth sensitivity
        - Visual inspection

    Args:
        df: Input DataFrame.
        outcome: Name of the outcome variable. Accepted for interface
            symmetry; none of the implemented checks read it.
        running_variable: Name of the running variable column.
        cutoff: The threshold value. Rows with running variable >= cutoff
            are treated as "above", the rest as "below".
        covariates: Optional list of covariate column names to check for
            balance.
        bandwidth: Optional half-width of the window around the cutoff
            (inclusive on both ends). If None, 10% of the observed range of
            the running variable is used and a warning is logged.

    Returns:
        Dictionary with a "status" key and a "details" key. When the window
        is empty or one side of the cutoff has no rows, status is "Skipped"
        and a "reason" key is included. Otherwise
        ``details['covariate_balance']`` maps each covariate to either
        ``{'t_statistic', 'p_value', 'balanced'}`` or a ``{'status': ...}``
        entry when the column is missing or the test raised.
    """
    details: Dict[str, Any] = {}

    if bandwidth is None:
        # Fallback: 10% of the observed range of the running variable.
        rv_range = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * rv_range
        logger.warning(
            "No bandwidth provided for diagnostics, using basic default: %.3f",
            bandwidth,
        )

    # --- Filter data within bandwidth ---
    rv = df[running_variable]
    df_bw = df[(rv >= cutoff - bandwidth) & (rv <= cutoff + bandwidth)]

    if df_bw.empty:
        logger.warning("No data within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "No data in bandwidth", "details": details}

    df_below = df_bw[df_bw[running_variable] < cutoff]
    df_above = df_bw[df_bw[running_variable] >= cutoff]

    if df_below.empty or df_above.empty:
        logger.warning("Insufficient data above or below cutoff within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "Insufficient data near cutoff", "details": details}

    # --- Covariate Balance Check ---
    if covariates:
        balance_results: Dict[str, Any] = {}
        details['covariate_balance'] = balance_results

        for cov in covariates:
            if cov not in df_bw.columns:
                balance_results[cov] = {"status": "Column Not Found"}
                continue

            try:
                below = df_below[cov].dropna()
                above = df_above[cov].dropna()

                if len(below) < 2 or len(above) < 2:
                    # A sample variance needs at least two observations per
                    # group, so the test statistic is undefined here. Report
                    # that explicitly instead of letting a NaN p-value fall
                    # through to the "not balanced" branch.
                    t_stat = p_val = float("nan")
                    verdict = "Undetermined (fewer than 2 observations on one side)"
                else:
                    # Welch's t-test for a difference in means.
                    t_stat, p_val = stats.ttest_ind(below, above, equal_var=False)
                    if np.isnan(p_val):
                        # E.g. zero variance in both groups with equal means:
                        # NaN compares False to everything, so it must not be
                        # classified by the threshold comparison below.
                        verdict = "Undetermined (p-value is NaN)"
                    elif p_val > 0.05:
                        verdict = "Yes"
                    else:
                        verdict = "No (p <= 0.05)"

                balance_results[cov] = {
                    't_statistic': t_stat,
                    'p_value': p_val,
                    'balanced': verdict,
                }
            except Exception as e:
                # Best effort: one bad covariate (e.g. non-numeric data) must
                # not abort the remaining checks.
                logger.warning("Could not perform t-test for covariate '%s': %s", cov, e)
                balance_results[cov] = {"status": "Test Failed", "error": str(e)}
    else:
        details['covariate_balance'] = "No covariates provided to check."

    # --- Placeholders for other common RDD diagnostics ---
    details['continuity_density_test'] = "Not Implemented (Requires specialized libraries like rdd)"
    details['placebo_cutoff_test'] = "Not Implemented (Requires re-running estimation)"
    details['bandwidth_sensitivity'] = "Not Implemented (Requires re-running estimation)"
    details['visual_inspection'] = "Recommended (Plot outcome vs running variable with fits)"

    return {"status": "Success (Partial Implementation)", "details": details}