Spaces:
Running
Running
"""
Diagnostic checks for Regression Discontinuity Design (RDD).
"""
from typing import Dict, Any, List, Optional | |
import pandas as pd | |
import numpy as np | |
from scipy import stats | |
import logging | |
logger = logging.getLogger(__name__) | |
def run_rdd_diagnostics(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff: float,
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None
) -> Dict[str, Any]:
    """
    Run diagnostic checks for an RDD analysis.

    Currently implemented:
    - Covariate balance check (Welch's t-test on each covariate, comparing
      observations below the cutoff with observations at/above it, within
      the bandwidth window).

    Reported as "Not Implemented" placeholders in the result:
    - Density test (McCrary)
    - Placebo cutoff tests
    - Bandwidth sensitivity

    Args:
        df: Input DataFrame.
        outcome: Name of the outcome variable. Not read by any of the checks
            implemented so far; accepted so the signature stays stable.
        running_variable: Name of the running variable column in ``df``.
        cutoff: The threshold value. Rows with running variable strictly
            below it form the "below" group; rows at or above it form the
            "above" group.
        covariates: Optional list of covariate names to check for balance.
        bandwidth: Optional half-width of the window around ``cutoff``
            (inclusive on both ends). If None, 10% of the observed range of
            the running variable is used and a warning is logged.

    Returns:
        Dictionary with a ``"status"`` key and a ``"details"`` key. When no
        rows fall in the window, or one side of the cutoff is empty, the
        status is ``"Skipped"`` and a ``"reason"`` key is included.
        Otherwise ``details["covariate_balance"]`` maps each covariate to
        either its test result (``t_statistic``, ``p_value``, ``balanced``)
        or a ``{"status": ...}`` entry describing why it was not tested.
        ``balanced`` is ``"Yes"``, ``"No (p <= 0.05)"``, or
        ``"Undetermined (p-value is NaN)"`` when the test cannot produce a
        p-value (e.g. a constant covariate or too few observations).
    """
    details: Dict[str, Any] = {}

    if bandwidth is None:
        # Fallback: 10% of the running variable's observed range. (The
        # original comment states this mirrors the estimator's default;
        # that cannot be confirmed from this file alone.)
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        logger.warning(
            "No bandwidth provided for diagnostics, using basic default: %.3f",
            bandwidth,
        )

    # --- Filter data within bandwidth (window is inclusive on both ends) ---
    rv = df[running_variable]
    df_bw = df[(rv >= cutoff - bandwidth) & (rv <= cutoff + bandwidth)].copy()
    if df_bw.empty:
        logger.warning("No data within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "No data in bandwidth", "details": details}

    df_below = df_bw[df_bw[running_variable] < cutoff]
    df_above = df_bw[df_bw[running_variable] >= cutoff]
    if df_below.empty or df_above.empty:
        logger.warning("Insufficient data above or below cutoff within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "Insufficient data near cutoff", "details": details}

    # --- Covariate Balance Check ---
    if covariates:
        balance_results: Dict[str, Any] = {}
        details['covariate_balance'] = balance_results
        for cov in covariates:
            if cov not in df_bw.columns:
                balance_results[cov] = {"status": "Column Not Found"}
                continue
            try:
                balance_results[cov] = _welch_balance_test(df_below[cov], df_above[cov])
            except Exception as e:
                # Deliberately broad: one bad covariate (non-numeric dtype,
                # unexpected shape, ...) must not abort the remaining checks.
                logger.warning("Could not perform t-test for covariate '%s': %s", cov, e)
                balance_results[cov] = {"status": "Test Failed", "error": str(e)}
    else:
        details['covariate_balance'] = "No covariates provided to check."

    # --- Placeholders for other common RDD diagnostics ---
    details['continuity_density_test'] = "Not Implemented (Requires specialized libraries like rdd)"
    details['placebo_cutoff_test'] = "Not Implemented (Requires re-running estimation)"
    details['bandwidth_sensitivity'] = "Not Implemented (Requires re-running estimation)"
    details['visual_inspection'] = "Recommended (Plot outcome vs running variable with fits)"

    return {"status": "Success (Partial Implementation)", "details": details}


def _welch_balance_test(below: pd.Series, above: pd.Series) -> Dict[str, Any]:
    """Compare one covariate's mean across the cutoff with Welch's t-test.

    Missing values are dropped from each side before testing. The statistic
    and p-value are returned as plain Python floats.
    """
    t_stat, p_val = stats.ttest_ind(
        below.dropna(),
        above.dropna(),
        equal_var=False  # Welch's t-test
    )
    t_stat, p_val = float(t_stat), float(p_val)

    if np.isnan(p_val):
        # NaN compares False against 0.05, which would otherwise be reported
        # as an imbalance. The test is simply inconclusive here.
        balanced = "Undetermined (p-value is NaN)"
    elif p_val > 0.05:
        balanced = "Yes"
    else:
        balanced = "No (p <= 0.05)"

    return {
        't_statistic': t_stat,
        'p_value': p_val,
        'balanced': balanced
    }