File size: 3,783 Bytes
1721aea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Diagnostic checks for Regression Discontinuity Design (RDD).
"""

from typing import Dict, Any, List, Optional
import pandas as pd
import numpy as np
from scipy import stats
import logging

logger = logging.getLogger(__name__)

def _covariate_balance_test(below: pd.Series, above: pd.Series) -> Dict[str, Any]:
    """Compare one covariate across the cutoff with Welch's t-test.

    Args:
        below: Covariate values for observations below the cutoff.
        above: Covariate values for observations at or above the cutoff.

    Returns:
        ``{'t_statistic', 'p_value', 'balanced'}`` when the test is defined,
        otherwise ``{'status': 'Test Failed', 'error': ...}``.
    """
    below = below.dropna()
    above = above.dropna()

    # Welch's test needs a sample variance on each side, i.e. at least two
    # observations per side; otherwise the statistic is NaN.
    if len(below) < 2 or len(above) < 2:
        return {
            "status": "Test Failed",
            "error": (
                "Insufficient non-missing observations for t-test "
                f"(below={len(below)}, above={len(above)})"
            ),
        }

    t_stat, p_val = stats.ttest_ind(below, above, equal_var=False)

    # A NaN p-value (e.g. zero variance on both sides) must not be read as
    # "p <= 0.05": `nan > 0.05` is False and would flag a spurious imbalance.
    if np.isnan(p_val):
        return {
            "status": "Test Failed",
            "error": "t-test is undefined for this covariate (NaN p-value, e.g. zero variance)",
        }

    return {
        't_statistic': t_stat,
        'p_value': p_val,
        'balanced': "Yes" if p_val > 0.05 else "No (p <= 0.05)",
    }


def run_rdd_diagnostics(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff: float,
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None
) -> Dict[str, Any]:
    """
    Runs diagnostic checks for RDD analysis.

    Currently includes:
    - Covariate Balance Check (Welch t-tests on observations within the bandwidth)
    Placeholders for:
    - Density Test (McCrary)
    - Placebo Cutoff Tests
    - Bandwidth Sensitivity

    Args:
        df: Input DataFrame.
        outcome: Name of the outcome variable (not used by the checks
            implemented so far).
        running_variable: Name of the running variable.
        cutoff: The threshold value. Observations with running variable
            >= cutoff are treated as "above".
        covariates: Optional list of covariate names to check for balance.
        bandwidth: Optional bandwidth to restrict the analysis. If None,
            10% of the running variable's range is used.

    Returns:
        Dictionary with a "status" key and a "details" key. When there is no
        usable data on one or both sides of the cutoff, the status is
        "Skipped" and a "reason" key is included. Under
        details['covariate_balance'], each covariate maps to either the test
        result (t_statistic, p_value, balanced) or a
        {"status": ..., ...} entry when the column is missing or the test
        could not be computed.
    """
    details: Dict[str, Any] = {}

    if bandwidth is None:
        # Use the same default as estimator for consistency
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        logger.warning(
            "No bandwidth provided for diagnostics, using basic default: %.3f",
            bandwidth,
        )

    # --- Filter data within bandwidth ---
    rv = df[running_variable]
    df_bw = df[(rv >= cutoff - bandwidth) & (rv <= cutoff + bandwidth)].copy()
    if df_bw.empty:
        logger.warning("No data within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "No data in bandwidth", "details": details}

    df_below = df_bw[df_bw[running_variable] < cutoff]
    df_above = df_bw[df_bw[running_variable] >= cutoff]

    if df_below.empty or df_above.empty:
        logger.warning("Insufficient data above or below cutoff within bandwidth for diagnostics.")
        return {"status": "Skipped", "reason": "Insufficient data near cutoff", "details": details}

    # --- Covariate Balance Check ---
    if covariates:
        balance_results: Dict[str, Any] = {}
        details['covariate_balance'] = balance_results
        for cov in covariates:
            if cov not in df_bw.columns:
                balance_results[cov] = {"status": "Column Not Found"}
                continue
            try:
                balance_results[cov] = _covariate_balance_test(df_below[cov], df_above[cov])
            except Exception as e:
                # Best-effort diagnostics: a covariate that cannot be tested
                # (e.g. non-numeric data) is reported, not raised.
                logger.warning("Could not perform t-test for covariate '%s': %s", cov, e)
                balance_results[cov] = {"status": "Test Failed", "error": str(e)}
    else:
        details['covariate_balance'] = "No covariates provided to check."

    # --- Placeholders for other common RDD diagnostics ---
    details['continuity_density_test'] = "Not Implemented (Requires specialized libraries like rdd)"
    details['placebo_cutoff_test'] = "Not Implemented (Requires re-running estimation)"
    details['bandwidth_sensitivity'] = "Not Implemented (Requires re-running estimation)"
    details['visual_inspection'] = "Recommended (Plot outcome vs running variable with fits)"

    return {"status": "Success (Partial Implementation)", "details": details}