Spaces:

CausalNLP
/

causal-agent

Running

File size: 19,451 Bytes

1721aea

"""
Regression Discontinuity Design (RDD) Estimator.

Tries to use DoWhy's RDD implementation first, falling back to a basic
comparison of linear fits around the cutoff if DoWhy fails.
"""

import pandas as pd
import statsmodels.api as sm
from dowhy import CausalModel
from typing import Dict, Any, List, Optional
import logging
from langchain.chat_models.base import BaseChatModel # For type hinting llm

from .diagnostics import run_rdd_diagnostics
from .llm_assist import interpret_rdd_results

logger = logging.getLogger(__name__)

# Attempt to import specific functions from the evan-magnusson/rdd package
_rdd_estimator_func_em = None
_rdd_optimal_bw_func_em = None
_rdd_em_import_error_message = ""
try:
    import rdd
    from rdd import rdd

    logger.info("Successfully imported 'rdd' and 'optimal_bandwidth' from evan-magnusson/rdd package.")
except ImportError as e:
    _rdd_em_import_error_message = f"ImportError for evan-magnusson/rdd: {e}. This package is needed for 'effect_estimate_rdd'."
    logger.warning(_rdd_em_import_error_message)
except Exception as e: # Catch other potential errors during import
    _rdd_em_import_error_message = f"An unexpected error occurred during import from evan-magnusson/rdd: {e}"
    logger.warning(_rdd_em_import_error_message)

def estimate_effect_dowhy(df: pd.DataFrame, treatment: str, outcome: str, running_variable: str, cutoff_value: float, covariates: Optional[List[str]], **kwargs) -> Dict[str, Any]:
    """Estimate RDD effect using DoWhy."""
    logger.info("Attempting RDD estimation using DoWhy.")
    if covariates:
        logger.warning("Covariates provided but may not be used by the DoWhy RDD method_name='rdd'. Support varies.")
    # For DoWhy RDD, we don't typically specify common causes in the model
    # constructor in the same way as backdoor. The running variable is handled
    # via method_params. Covariates might be used by specific underlying estimators
    # if supported, but the basic RDD identification doesn't use them directly.
    model = CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        # No explicit graph needed for iv.regression_discontinuity method
    )
    
    # Identify the effect (DoWhy internally identifies RDD as IV)
    # Although potentially redundant if method_name implies identification, 
    # the API requires identified_estimand as the first argument.
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

    # Estimate using RDD method
    # Note: DoWhy's RDD often has limited direct support for covariates.
    # Bandwidth selection is crucial and often done internally or specified.
    bandwidth = kwargs.get('bandwidth') # Get user-specified bandwidth if provided
    if bandwidth is None:
        # Very basic default bandwidth if none provided - consider better methods
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv 
        logger.warning(f"No bandwidth specified, using basic default: {bandwidth:.3f}")

    estimate = model.estimate_effect(
        identified_estimand, # ADD identified_estimand argument
        method_name="iv.regression_discontinuity",
        method_params={
            'rd_variable_name': running_variable,
            'rd_threshold_value': cutoff_value,
            'rd_bandwidth': bandwidth,
            # 'covariates': covariates # Support depends on DoWhy version/estimator
        },
        test_significance=True # Ask DoWhy to calculate p-values if possible
    )
    
    # Extract results - DoWhy's RDD estimate structure might vary
    effect = estimate.value
    # DoWhy's RDD significance testing might be limited/indirect
    # Try to get p-value if estimate object supports it, else None
    p_value = getattr(estimate, 'test_significance_pvalue', None)
    if isinstance(p_value, (list, tuple)):
        p_value = p_value[0] # Handle cases where it might be wrapped
        
    # Confidence intervals might not be directly available from this method easily
    conf_int = getattr(estimate, 'confidence_interval', None)
    std_err = getattr(estimate, 'standard_error', None)

    return {
        'effect_estimate': effect,
        'p_value': p_value,
        'confidence_interval': conf_int,
        'standard_error': std_err,
        'method_details': f"DoWhy RDD (Bandwidth: {bandwidth:.3f})",
    }

def estimate_effect_fallback(df: pd.DataFrame, treatment: str, outcome: str, running_variable: str, cutoff_value: float, covariates: Optional[List[str]], **kwargs) -> Dict[str, Any]:
    """Estimate RDD effect using simple linear regression comparison fallback."""
    logger.warning("DoWhy RDD failed or not used. Falling back to simple linear regression comparison.")
    if covariates:
        logger.warning("Covariates provided but are ignored in the fallback RDD linear regression estimation.")

    bandwidth = kwargs.get('bandwidth')
    if bandwidth is None:
        range_rv = df[running_variable].max() - df[running_variable].min()
        bandwidth = 0.1 * range_rv
        logger.warning(f"No bandwidth specified for fallback, using basic default: {bandwidth:.3f}")

    # Filter data within bandwidth
    df_bw = df[(df[running_variable] >= cutoff_value - bandwidth) & (df[running_variable] <= cutoff_value + bandwidth)].copy()
    if df_bw.empty:
        raise ValueError("No data within the specified bandwidth.")
        
    df_bw['above_cutoff'] = (df_bw[running_variable] >= cutoff_value).astype(int)

    # Define predictors for the regression
    # Interaction term allows different slopes above and below the cutoff
    df_bw['running_centered'] = df_bw[running_variable] - cutoff_value
    df_bw['running_x_above'] = df_bw['running_centered'] * df_bw['above_cutoff']
    predictors = ['above_cutoff', 'running_centered', 'running_x_above']

    # Covariates are NOT included in this basic RDD model
    # if covariates:
        # predictors.extend(covariates) # REMOVED as per user request

    required_cols = [outcome] + predictors
    missing_cols = [col for col in required_cols if col not in df_bw.columns]
    if missing_cols:
         raise ValueError(f"Fallback RDD missing columns: {missing_cols}")
         
    df_analysis = df_bw[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for fallback RDD.")

    X = df_analysis[predictors]
    X = sm.add_constant(X)
    y = df_analysis[outcome]
    
    formula = f"{outcome} ~ {' + '.join(predictors)} + const"
    logger.info(f"Running fallback RDD regression: {formula}")
    
    model = sm.OLS(y, X)
    # Use robust standard errors
    results = model.fit(cov_type='HC1')
    
    # The coefficient for 'above_cutoff' represents the jump at the cutoff
    effect = results.params['above_cutoff']
    p_value = results.pvalues['above_cutoff']
    conf_int = results.conf_int().loc['above_cutoff'].tolist()
    std_err = results.bse['above_cutoff']
    
    return {
        'effect_estimate': effect,
        'p_value': p_value,
        'confidence_interval': conf_int,
        'standard_error': std_err,
        'method_details': f"Fallback Linear Interaction (Bandwidth: {bandwidth:.3f})",
        'formula': formula,
        'model_summary': results.summary()
    }

def effect_estimate_rdd(
    df: pd.DataFrame,
    outcome: str,
    running_variable: str,
    cutoff_value: float,
    treatment: Optional[str] = None, # Kept for API consistency, but unused by evan-magnusson/rdd
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None,
    **kwargs
) -> Dict[str, Any]:
    """
    Estimates RDD effect using the 'evan-magnusson/rdd' package.
    Uses IK optimal bandwidth selection from the same package by default.
    """
    logger.info(f"Attempting RDD estimation using 'evan-magnusson/rdd' for outcome '{outcome}' and running variable '{running_variable}'.")

    

    if treatment:
        logger.info(f"Treatment variable '{treatment}' provided but is not explicitly used by the evan-magnusson/rdd estimation function.")
    if covariates:
        logger.warning("Covariates provided but are ignored by this 'evan-magnusson/rdd' implementation.")

    # --- Bandwidth Selection ---
    final_bandwidth = None
    bandwidth_selection_method = "unknown"

    if bandwidth is not None and bandwidth > 0:
        logger.info(f"Using user-specified bandwidth: {bandwidth:.4f}")
        final_bandwidth = bandwidth
        bandwidth_selection_method = "user-specified"
    else:
        if bandwidth is not None and bandwidth <= 0:
            logger.warning(f"User-specified bandwidth {bandwidth} is not positive. Attempting IK optimal bandwidth selection.")
        try:
            logger.info(f"Attempting IK optimal bandwidth selection using _rdd_optimal_bw_func_em for {outcome} ~ {running_variable} cut at {cutoff_value}.")
            optimal_bw_val = rdd.optimal_bandwidth(df[outcome], df[running_variable], cut=cutoff_value)
            if optimal_bw_val is not None and optimal_bw_val > 0:
                final_bandwidth = optimal_bw_val
                bandwidth_selection_method = "ik_optimal (evan-magnusson/rdd)"
                logger.info(f"IK optimal bandwidth from evan-magnusson/rdd: {final_bandwidth:.4f}")
            else:
                logger.warning(f"IK optimal bandwidth from evan-magnusson/rdd was None or non-positive: {optimal_bw_val}. Falling back to default.")
        except Exception as e:
            logger.warning(f"IK optimal bandwidth selection from evan-magnusson/rdd failed: {e}. Falling back to default.")

        if final_bandwidth is None: # Fallback if user did not specify and IK failed/invalid
            logger.info("Falling back to default bandwidth (10% of running variable range).")
            rv_min = df[running_variable].min()
            rv_max = df[running_variable].max()
            rv_range = rv_max - rv_min
            if rv_range > 0:
                final_bandwidth = 0.1 * rv_range
                bandwidth_selection_method = "default_10_percent_range"
                logger.info(f"Using default 10% range bandwidth: {final_bandwidth:.4f}")
            else:
                err_msg = "Running variable range is not positive. Cannot determine a default bandwidth for evan-magnusson/rdd."
                logger.error(err_msg)
                raise ValueError(err_msg)

    if final_bandwidth is None or final_bandwidth <= 0:
        raise ValueError(f"Could not determine a valid positive bandwidth for evan-magnusson/rdd. Last method: {bandwidth_selection_method}")

    # --- RDD Estimation ---
    try:
        logger.info(f"Running RDD estimation with evan-magnusson/rdd: y='{outcome}', x='{running_variable}', cut={cutoff_value}, bw={final_bandwidth:.4f}")
        # The evan-magnusson/rdd package's rdd function typically handles dataframes directly
        # Ensure correct xname for truncated_data
        data_rdd = rdd.truncated_data(df, running_variable,final_bandwidth, cut=cutoff_value)
        model = rdd.rdd(
            data_rdd, 
            xname=running_variable,  # Correct: Name of the running variable column
            yname=outcome,           # Correct: Name of the outcome variable column
            cut=cutoff_value
        )
        
        # Extract results - this package creates a treatment dummy 'TREATED'
        # The 'model' object has a 'results' attribute which is a statsmodels result instance
        sm_results = model.fit()
        print(sm_results.summary())
        
        # Extract results - using 'TREATED' based on the provided summary output
        effect = sm_results.params.get('TREATED')
        std_err = sm_results.bse.get('TREATED')
        p_value = sm_results.pvalues.get('TREATED')
        
        conf_int_series = sm_results.conf_int()
        conf_int = conf_int_series.loc['TREATED'].tolist() if 'TREATED' in conf_int_series.index else [None, None]

        n_obs = model.nobs # or model.n_ if nobs is not available (check package details)
        
        # The formula is implicit in the local linear regression performed by the package
        # Update to reflect 'TREATED' as the dummy variable name if consistently used by the package
        formula_desc = f"Local linear RDD: {outcome} ~ TREATED + {running_variable}_centered + TREATED*{running_variable}_centered (implicit, from evan-magnusson/rdd)"

        return {
            'effect_estimate': effect,
            'standard_error': std_err,
            'p_value': p_value,
            'confidence_interval': conf_int,
            'method_details': f"RDD (evan-magnusson/rdd package, Bandwidth: {final_bandwidth:.4f})",
            'bandwidth_used': final_bandwidth,
            'bandwidth_selection_method': bandwidth_selection_method,
            'n_obs_in_bandwidth': int(n_obs) if n_obs is not None else None,
            'formula': formula_desc,
            'model_summary': sm_results.summary().as_text() if sm_results else "Summary not available."
        }

    except Exception as e:
        logger.error(f"RDD estimation using 'evan-magnusson/rdd' failed: {e}", exc_info=True)
        # Consider re-raising or returning a more structured error
        raise e # Or return a dict like in the import failure case

def estimate_effect(
    df: pd.DataFrame,
    treatment: str, 
    outcome: str,
    running_variable: str, 
    cutoff_value: float, 
    covariates: Optional[List[str]] = None,
    bandwidth: Optional[float] = None, # Optional bandwidth param
    query: Optional[str] = None,
    llm: Optional[BaseChatModel] = None,
    **kwargs # Capture other args like rd_estimator from DoWhy if needed
) -> Dict[str, Any]:
    """
    Estimates the causal effect using Regression Discontinuity Design.

    Tries DoWhy implementation first if use_dowhy=True, otherwise uses fallback.

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable (often implicitly defined by cutoff).
                   DoWhy might still need it, fallback doesn't use it directly.
        outcome: Name of the outcome variable.
        running_variable: Name of the variable determining treatment assignment.
        cutoff: The threshold value for the running variable.
        covariates: Optional list of covariate names (support varies).
        bandwidth: Optional bandwidth around the cutoff. If None, a default is used.
        use_dowhy: Whether to attempt using the DoWhy library first.
        query: Optional user query for context.
        llm: Optional Language Model instance.
        **kwargs: Additional keyword arguments for underlying methods.

    Returns:
        Dictionary containing estimation results.
    """
    required_args = { 
        "running_variable": running_variable,
        "cutoff_value": cutoff_value
    }
    if any(val is None for val in required_args.values()):
        raise ValueError(f"Missing required RDD arguments: running_variable and cutoff must be provided.")

    results = {}
    rdd_em_estimation_error = None # Error from effect_estimate_rdd (evan-magnusson)
    fallback_estimation_error = None # Error from estimate_effect_fallback
    
    # --- Try effect_estimate_rdd (evan-magnusson/rdd) First --- 
    try:
        logger.info("Attempting RDD estimation using 'effect_estimate_rdd' (evan-magnusson/rdd package).")
        # Note: treatment is passed but might be unused, covariates are also passed but typically ignored by this specific rdd package
        results = effect_estimate_rdd(
            df, 
            outcome, 
            running_variable, 
            cutoff_value, 
            treatment=treatment, # For API consistency, though evan-magnusson/rdd doesn't use it explicitly
            covariates=covariates, 
            bandwidth=bandwidth, 
            **kwargs
        )
        results['method_used'] = 'evan-magnusson/rdd' # Ensure method_used is set
        logger.info("Successfully estimated effect using 'effect_estimate_rdd'.")
    except ImportError as ie: # Specifically catch import errors for the rdd package
        logger.warning(f"'effect_estimate_rdd' could not run due to ImportError (likely evan-magnusson/rdd package not available/functional): {ie}")
        rdd_em_estimation_error = ie
    except Exception as e:
        logger.warning(f"'effect_estimate_rdd' failed during execution: {e}")
        rdd_em_estimation_error = e
            
    # --- Fallback to estimate_effect_fallback if effect_estimate_rdd failed ---
    if not results: # If effect_estimate_rdd wasn't used or failed
        logger.info("'effect_estimate_rdd' did not produce results. Attempting fallback using 'estimate_effect_fallback'.")
        try:
            fallback_results = estimate_effect_fallback(df, treatment, outcome, running_variable, cutoff_value, covariates, bandwidth=bandwidth, **kwargs)
            results.update(fallback_results)
            results['method_used'] = 'Fallback RDD (Linear Interaction with Robust Errors)'
            fallback_estimation_error = None # Clear fallback error if it succeeded
            logger.info("Successfully estimated effect using 'estimate_effect_fallback'.")
        except Exception as e:
            logger.error(f"Fallback RDD estimation ('estimate_effect_fallback') also failed: {e}")
            fallback_estimation_error = e 

    # Determine final error status
    final_estimation_error = None
    if not results: # If still no results, determine which error to report
        if fallback_estimation_error: # Fallback was attempted and failed
            final_estimation_error = fallback_estimation_error
            logger.error(f"All RDD estimation attempts failed. Last error (from fallback): {final_estimation_error}")
        elif rdd_em_estimation_error: # effect_estimate_rdd was attempted and failed, fallback was not (or also failed but error not captured)
            final_estimation_error = rdd_em_estimation_error
            logger.error(f"All RDD estimation attempts failed. Last error (from effect_estimate_rdd): {final_estimation_error}")
        else:
             logger.error("All RDD estimation attempts failed for an unknown reason.")
        
        if final_estimation_error:
            raise ValueError(f"RDD estimation failed. Last error: {final_estimation_error}")
        else:
            raise ValueError("RDD estimation failed using all available methods for an unknown reason.")
        
    # --- Diagnostics --- 
    try:
        diag_results = run_rdd_diagnostics(df, outcome, running_variable, cutoff_value, covariates, bandwidth)
        results['diagnostics'] = diag_results
    except Exception as diag_e:
        logger.error(f"RDD Diagnostics failed: {diag_e}")
        results['diagnostics'] = {"status": "Failed", "error": str(diag_e)}
        
    # --- Interpretation --- 
    try:
        interpretation = interpret_rdd_results(results, results.get('diagnostics'), llm=llm)
        results['interpretation'] = interpretation
    except Exception as interp_e:
        logger.error(f"RDD Interpretation failed: {interp_e}")
        results['interpretation'] = "Interpretation failed."
        
    # Add info about primary attempt if fallback was used
    if rdd_em_estimation_error and results.get('method_used', '').startswith('Fallback'):
         results['primary_rdd_em_error_info'] = str(rdd_em_estimation_error)

    return results