""" Diagnostic checks for Regression Discontinuity Design (RDD). """ from typing import Dict, Any, List, Optional import pandas as pd import numpy as np from scipy import stats import logging logger = logging.getLogger(__name__) def run_rdd_diagnostics( df: pd.DataFrame, outcome: str, running_variable: str, cutoff: float, covariates: Optional[List[str]] = None, bandwidth: Optional[float] = None ) -> Dict[str, Any]: """ Runs diagnostic checks for RDD analysis. Currently includes: - Covariate Balance Check (t-tests) Placeholders for: - Density Test (McCrary) - Placebo Cutoff Tests - Bandwidth Sensitivity Args: df: Input DataFrame. outcome: Name of the outcome variable. running_variable: Name of the running variable. cutoff: The threshold value. covariates: Optional list of covariate names to check for balance. bandwidth: Optional bandwidth to restrict the analysis. If None, a default is used. Returns: Dictionary containing diagnostic results. """ diagnostics = {} details = {} if bandwidth is None: # Use the same default as estimator for consistency range_rv = df[running_variable].max() - df[running_variable].min() bandwidth = 0.1 * range_rv logger.warning(f"No bandwidth provided for diagnostics, using basic default: {bandwidth:.3f}") # --- Filter data within bandwidth --- df_bw = df[(df[running_variable] >= cutoff - bandwidth) & (df[running_variable] <= cutoff + bandwidth)].copy() if df_bw.empty: logger.warning("No data within bandwidth for diagnostics.") return {"status": "Skipped", "reason": "No data in bandwidth", "details": details} df_below = df_bw[df_bw[running_variable] < cutoff] df_above = df_bw[df_bw[running_variable] >= cutoff] if df_below.empty or df_above.empty: logger.warning("Insufficient data above or below cutoff within bandwidth for diagnostics.") return {"status": "Skipped", "reason": "Insufficient data near cutoff", "details": details} # --- Covariate Balance Check --- if covariates: balance_results = {} details['covariate_balance'] = balance_results for cov in covariates: if cov in df_bw.columns: try: # Perform t-test for difference in means t_stat, p_val = stats.ttest_ind( df_below[cov].dropna(), df_above[cov].dropna(), equal_var=False # Welch's t-test ) balance_results[cov] = { 't_statistic': t_stat, 'p_value': p_val, 'balanced': "Yes" if p_val > 0.05 else "No (p <= 0.05)" } except Exception as e: logger.warning(f"Could not perform t-test for covariate '{cov}': {e}") balance_results[cov] = {"status": "Test Failed", "error": str(e)} else: balance_results[cov] = {"status": "Column Not Found"} else: details['covariate_balance'] = "No covariates provided to check." # --- Placeholders for other common RDD diagnostics --- details['continuity_density_test'] = "Not Implemented (Requires specialized libraries like rdd)" details['placebo_cutoff_test'] = "Not Implemented (Requires re-running estimation)" details['bandwidth_sensitivity'] = "Not Implemented (Requires re-running estimation)" details['visual_inspection'] = "Recommended (Plot outcome vs running variable with fits)" return {"status": "Success (Partial Implementation)", "details": details}