Spaces:

CausalNLP
/

causal-agent

Running

App Files Files Community

causal-agent / auto_causal /methods /instrumental_variable /diagnostics.py

FireShadow

Initial clean commit

1721aea 13 days ago

raw

history blame contribute delete

10.8 kB

	# Placeholder for IV-specific diagnostic functions
	import pandas as pd
	import statsmodels.api as sm
	from statsmodels.regression.linear_model import OLS
	# from statsmodels.sandbox.regression.gmm import IV2SLSResults # Removed problematic import
	from typing import Dict, Any, List, Tuple, Optional
	import logging # Import logging
	import numpy as np # Import numpy for np.zeros

	# Configure logger
	logger = logging.getLogger(__name__)

	def calculate_first_stage_f_statistic(df: pd.DataFrame, treatment: str, instruments: List[str], covariates: List[str]) -> Tuple[Optional[float], Optional[float]]:
	"""
	Calculates the F-statistic for instrument relevance in the first stage regression.

	Regresses treatment ~ instruments + covariates.
	Tests the joint significance of the instrument coefficients.

	Args:
	df: Input DataFrame.
	treatment: Name of the treatment variable.
	instruments: List of instrument variable names.
	covariates: List of covariate names.

	Returns:
	A tuple containing (F-statistic, p-value). Returns (None, None) on error.
	"""
	logger.info("Diagnostics: Calculating First-Stage F-statistic...")
	try:
	df_copy = df.copy()
	df_copy['intercept'] = 1
	exog_vars = ['intercept'] + covariates
	all_first_stage_exog = list(dict.fromkeys(exog_vars + instruments)) # Ensure unique columns

	endog = df_copy[treatment]
	exog = df_copy[all_first_stage_exog]

	# Check for perfect multicollinearity before fitting
	if exog.shape[1] > 1:
	corr_matrix = exog.corr()
	# Check if correlation matrix calculation failed (e.g., constant columns) or high correlation
	if corr_matrix.isnull().values.any() or (corr_matrix.abs() > 0.9999).sum().sum() > exog.shape[1]: # Check off-diagonal elements
	logger.warning("High multicollinearity or constant column detected in first stage exogenous variables.")
	# Note: statsmodels OLS might handle perfect collinearity by dropping columns, but F-test might be unreliable.

	first_stage_model = OLS(endog, exog).fit()

	# Construct the restriction matrix (R) to test H0: instrument coeffs = 0
	num_instruments = len(instruments)
	if num_instruments == 0:
	logger.warning("No instruments provided for F-statistic calculation.")
	return None, None
	num_exog_total = len(all_first_stage_exog)

	# Ensure instruments are actually in the fitted model's exog names (in case statsmodels dropped some)
	fitted_exog_names = first_stage_model.model.exog_names
	valid_instruments = [inst for inst in instruments if inst in fitted_exog_names]
	if not valid_instruments:
	logger.error("None of the provided instruments were included in the first-stage regression model (possibly due to collinearity).")
	return None, None
	if len(valid_instruments) < len(instruments):
	logger.warning(f"Instruments dropped by OLS: {set(instruments) - set(valid_instruments)}")

	instrument_indices = [fitted_exog_names.index(inst) for inst in valid_instruments]

	# Need to adjust R matrix size based on fitted model's exog
	R = np.zeros((len(valid_instruments), len(fitted_exog_names)))
	for i, idx in enumerate(instrument_indices):
	R[i, idx] = 1

	# Perform F-test
	f_test_result = first_stage_model.f_test(R)

	f_statistic = float(f_test_result.fvalue)
	p_value = float(f_test_result.pvalue)

	logger.info(f" F-statistic: {f_statistic:.4f}, p-value: {p_value:.4f}")
	return f_statistic, p_value

	except Exception as e:
	logger.error(f"Error calculating first-stage F-statistic: {e}", exc_info=True)
	return None, None

	def run_overidentification_test(sm_results: Optional[Any], df: pd.DataFrame, treatment: str, outcome: str, instruments: List[str], covariates: List[str]) -> Tuple[Optional[float], Optional[float], Optional[str]]:
	"""
	Runs an overidentification test (Sargan-Hansen) if applicable.

	This test is only valid if the number of instruments exceeds the number
	of endogenous regressors (typically 1, the treatment variable).

	Requires results from a statsmodels IV estimation.

	Args:
	sm_results: The fitted results object from statsmodels IV2SLS.fit().
	df: Input DataFrame.
	treatment: Name of the treatment variable.
	outcome: Name of the outcome variable.
	instruments: List of instrument variable names.
	covariates: List of covariate names.

	Returns:
	Tuple: (test_statistic, p_value, status_message) or (None, None, error_message)
	"""
	logger.info("Diagnostics: Running Overidentification Test...")
	num_instruments = len(instruments)
	num_endog = 1 # Assuming only one treatment variable is endogenous

	if num_instruments <= num_endog:
	logger.info(" Over-ID test not applicable (model is exactly identified or underidentified).")
	return None, None, "Test not applicable (Need more instruments than endogenous regressors)"

	if sm_results is None or not hasattr(sm_results, 'resid'):
	logger.warning(" Over-ID test requires valid statsmodels results object with residuals.")
	return None, None, "Statsmodels results object not available or invalid for test."

	try:
	# Statsmodels IV2SLSResults does not seem to have a direct method for this test (as of common versions).
	# We need to calculate it manually using residuals and instruments.
	# Formula: N * R^2 from regressing residuals (u_hat) on all exogenous variables (instruments + covariates).
	# Degrees of freedom = num_instruments - num_endogenous_vars

	residuals = sm_results.resid
	df_copy = df.copy()
	df_copy['intercept'] = 1
	exog_vars = ['intercept'] + covariates
	all_exog_instruments = list(dict.fromkeys(exog_vars + instruments))

	# Ensure columns exist in the dataframe before selecting
	missing_cols = [col for col in all_exog_instruments if col not in df_copy.columns]
	if missing_cols:
	raise ValueError(f"Missing columns required for Over-ID test: {missing_cols}")

	exog_for_test = df_copy[all_exog_instruments]

	# Check shapes match after potential NA handling in main estimator
	if len(residuals) != exog_for_test.shape[0]:
	# Attempt to align based on index if lengths differ (might happen if NAs were dropped)
	logger.warning(f"Residual length ({len(residuals)}) differs from exog_for_test rows ({exog_for_test.shape[0]}). Trying to align indices.")
	common_index = residuals.index.intersection(exog_for_test.index)
	if len(common_index) == 0:
	raise ValueError("Cannot align residuals and exogenous variables for Over-ID test after NA handling.")
	residuals = residuals.loc[common_index]
	exog_for_test = exog_for_test.loc[common_index]
	logger.warning(f"Aligned to {len(common_index)} common observations.")


	# Regress residuals on all exogenous instruments
	aux_model = OLS(residuals, exog_for_test).fit()
	r_squared = aux_model.rsquared
	n_obs = len(residuals) # Use length of residuals after potential alignment

	test_statistic = n_obs * r_squared

	# Calculate p-value from Chi-squared distribution
	from scipy.stats import chi2
	degrees_of_freedom = num_instruments - num_endog
	if degrees_of_freedom < 0:
	# This shouldn't happen if the initial check passed, but as a safeguard
	raise ValueError("Degrees of freedom for Sargan test are negative.")
	elif degrees_of_freedom == 0:
	# R-squared should be 0 if exactly identified, but handle edge case
	p_value = 1.0 if np.isclose(test_statistic, 0) else 0.0
	else:
	p_value = chi2.sf(test_statistic, degrees_of_freedom)

	logger.info(f" Sargan Test Statistic: {test_statistic:.4f}, p-value: {p_value:.4f}, df: {degrees_of_freedom}")
	return test_statistic, p_value, "Test successful"

	except Exception as e:
	logger.error(f"Error running overidentification test: {e}", exc_info=True)
	return None, None, f"Error during test: {e}"

	def run_iv_diagnostics(df: pd.DataFrame, treatment: str, outcome: str, instruments: List[str], covariates: List[str], sm_results: Optional[Any] = None, dw_results: Optional[Any] = None) -> Dict[str, Any]:
	"""
	Runs standard IV diagnostic checks.

	Args:
	df: Input DataFrame.
	treatment: Name of the treatment variable.
	outcome: Name of the outcome variable.
	instruments: List of instrument variable names.
	covariates: List of covariate names.
	sm_results: Optional fitted results object from statsmodels IV2SLS.fit().
	dw_results: Optional results object from DoWhy (structure may vary).

	Returns:
	Dictionary containing diagnostic results.
	"""
	diagnostics = {}

	# 1. Instrument Relevance / Weak Instrument Test (First-Stage F-statistic)
	f_stat, f_p_val = calculate_first_stage_f_statistic(df, treatment, instruments, covariates)
	diagnostics['first_stage_f_statistic'] = f_stat
	diagnostics['first_stage_p_value'] = f_p_val
	diagnostics['is_instrument_weak'] = (f_stat < 10) if f_stat is not None else None # Common rule of thumb
	if f_stat is None:
	diagnostics['weak_instrument_test_status'] = "Error during calculation"
	elif diagnostics['is_instrument_weak']:
	diagnostics['weak_instrument_test_status'] = "Warning: Instrument(s) may be weak (F < 10)"
	else:
	diagnostics['weak_instrument_test_status'] = "Instrument(s) appear sufficiently strong (F >= 10)"


	# 2. Overidentification Test (e.g., Sargan-Hansen)
	overid_stat, overid_p_val, overid_status = run_overidentification_test(sm_results, df, treatment, outcome, instruments, covariates)
	diagnostics['overid_test_statistic'] = overid_stat
	diagnostics['overid_test_p_value'] = overid_p_val
	diagnostics['overid_test_status'] = overid_status
	diagnostics['overid_test_applicable'] = not ("not applicable" in overid_status.lower() if overid_status else True)

	# 3. Exogeneity/Exclusion Restriction (Conceptual Check)
	diagnostics['exclusion_restriction_assumption'] = "Assumed based on graph/input; cannot be statistically tested directly. Qualitative LLM check recommended."

	# Potential future additions:
	# - Endogeneity tests (e.g., Hausman test - requires comparing OLS and IV estimates)

	return diagnostics