Spaces:
Running
Running
File size: 3,783 Bytes
1721aea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
"""
Diagnostic checks for Regression Discontinuity Design (RDD).
"""
from typing import Dict, Any, List, Optional
import pandas as pd
import numpy as np
from scipy import stats
import logging
logger = logging.getLogger(__name__)
def _check_covariate_balance(
    df_below: pd.DataFrame,
    df_above: pd.DataFrame,
    covariates: List[str],
) -> Dict[str, Dict[str, Any]]:
    """Compare covariate means on either side of the cutoff with Welch's t-test.

    Args:
        df_below: Rows within the bandwidth whose running variable is below the cutoff.
        df_above: Rows within the bandwidth whose running variable is at or above the cutoff.
        covariates: Names of the covariate columns to test.

    Returns:
        Mapping from covariate name to either
        ``{'t_statistic', 'p_value', 'balanced'}`` or a ``{'status': ...}``
        entry when the column is missing or the test raised.
    """
    results: Dict[str, Dict[str, Any]] = {}
    for cov in covariates:
        if cov not in df_below.columns:
            results[cov] = {"status": "Column Not Found"}
            continue
        try:
            t_stat, p_val = stats.ttest_ind(
                df_below[cov].dropna(),
                df_above[cov].dropna(),
                equal_var=False,  # Welch's t-test
            )
            # A NaN p-value (constant covariate, or fewer than two usable
            # observations on a side) compares False against 0.05, which would
            # otherwise be reported as evidence of imbalance.
            if np.isnan(p_val):
                balanced = "Undetermined (p-value is NaN)"
            elif p_val > 0.05:
                balanced = "Yes"
            else:
                balanced = "No (p <= 0.05)"
            results[cov] = {
                't_statistic': t_stat,
                'p_value': p_val,
                'balanced': balanced,
            }
        except Exception as e:
            # Deliberate best-effort: one failing covariate (e.g. a non-numeric
            # column) must not abort the remaining checks.
            logger.warning("Could not perform t-test for covariate '%s': %s", cov, e)
            results[cov] = {"status": "Test Failed", "error": str(e)}
    return results


def run_rdd_diagnostics(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff: float,
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None
) -> Dict[str, Any]:
    """
    Runs diagnostic checks for RDD analysis.

    Currently includes:
    - Covariate Balance Check (Welch's t-tests within the bandwidth)

    Placeholders for:
    - Density Test (McCrary)
    - Placebo Cutoff Tests
    - Bandwidth Sensitivity

    Args:
        df: Input DataFrame.
        outcome: Name of the outcome variable (accepted for interface
            compatibility; not used by the checks implemented so far).
        running_variable: Name of the running variable.
        cutoff: The threshold value. Rows with running variable >= cutoff are
            treated as "above".
        covariates: Optional list of covariate names to check for balance.
        bandwidth: Optional bandwidth to restrict the analysis. If None,
            10% of the running variable's range is used.

    Returns:
        Dictionary with a "status" string and a "details" dictionary. When no
        rows fall inside the bandwidth, or one side of the cutoff is empty,
        the status is "Skipped" and a "reason" string is included.
    """
    details: Dict[str, Any] = {}

    if bandwidth is None:
        # NOTE(review): presumably mirrors the estimator's default bandwidth;
        # verify against the estimator module.
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        logger.warning(
            "No bandwidth provided for diagnostics, using basic default: %.3f",
            bandwidth,
        )

    # --- Filter data within bandwidth (inclusive on both ends) ---
    rv = df[running_variable]
    df_bw = df[(rv >= cutoff - bandwidth) & (rv <= cutoff + bandwidth)]
    if df_bw.empty:
        logger.warning("No data within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "No data in bandwidth", "details": details}

    df_below = df_bw[df_bw[running_variable] < cutoff]
    df_above = df_bw[df_bw[running_variable] >= cutoff]
    if df_below.empty or df_above.empty:
        logger.warning("Insufficient data above or below cutoff within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "Insufficient data near cutoff", "details": details}

    # --- Covariate Balance Check ---
    if covariates:
        details['covariate_balance'] = _check_covariate_balance(df_below, df_above, covariates)
    else:
        details['covariate_balance'] = "No covariates provided to check."

    # --- Placeholders for other common RDD diagnostics ---
    details['continuity_density_test'] = "Not Implemented (Requires specialized libraries like rdd)"
    details['placebo_cutoff_test'] = "Not Implemented (Requires re-running estimation)"
    details['bandwidth_sensitivity'] = "Not Implemented (Requires re-running estimation)"
    details['visual_inspection'] = "Recommended (Plot outcome vs running variable with fits)"

    return {"status": "Success (Partial Implementation)", "details": details}
|