"""
Linear Regression Estimator for Causal Inference.

Uses Ordinary Least Squares (OLS) to estimate the treatment effect, potentially
adjusting for covariates.
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from typing import Dict, Any, List, Optional, Union
import logging
from langchain.chat_models.base import BaseChatModel
import re
import json
from pydantic import BaseModel, ValidationError
from langchain_core.messages import HumanMessage
from langchain_core.exceptions import OutputParserException
from auto_causal.models import LLMIdentifiedRelevantParams
from auto_causal.prompts.regression_prompts import STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE
from auto_causal.config import get_llm_client

# Placeholder for potential future LLM assistance integration
# from .llm_assist import interpret_lr_results, suggest_lr_covariates
# Placeholder for potential future diagnostics integration
# from .diagnostics import run_lr_diagnostics

logger = logging.getLogger(__name__)
def _call_llm_for_var(llm: BaseChatModel, prompt: str, pydantic_model: BaseModel) -> Optional[BaseModel]:
    """Invoke the LLM with structured output, returning None on any failure.

    Args:
        llm: Chat model to query.
        prompt: Prompt text, sent as a single human message.
        pydantic_model: Pydantic class the response must parse into.

    Returns:
        The parsed model instance, or None when parsing/validation fails or
        the call raises unexpectedly (both paths are logged).
    """
    try:
        structured = llm.with_structured_output(pydantic_model)
        return structured.invoke([HumanMessage(content=prompt)])
    except (OutputParserException, ValidationError) as e:
        logger.error(f"LLM call failed parsing/validation for {pydantic_model.__name__}: {e}")
    except Exception as e:
        logger.error(f"LLM call failed unexpectedly for {pydantic_model.__name__}: {e}", exc_info=True)
    return None
# Define module-level helper function | |
def _clean_variable_name_for_patsy_local(name: str) -> str: | |
if not isinstance(name, str): | |
name = str(name) | |
name = re.sub(r'[^a-zA-Z0-9_]', '_', name) | |
if not re.match(r'^[a-zA-Z_]', name): | |
name = 'var_' + name | |
return name | |
def _extract_treatment_effects_from_params(
    results: Any,
    treatment_col_name: str,
    is_multilevel_case: bool,
    reference_level_str: str,
) -> Dict[str, Dict[str, Any]]:
    """Pull treatment-coefficient statistics directly from fitted OLS results.

    Programmatic fallback for when LLM-based identification is unavailable or
    fails. Matches the plain numeric term (``treat``) as well as Patsy
    categorical terms such as ``C(treat)[T.level]`` or
    ``C(treat, Treatment(reference='x'))[T.level]``.

    Returns:
        Mapping of effect key -> stats dict with 'estimate', 'p_value',
        'conf_int' ([low, high]) and 'std_err'. Multi-level treatments are
        keyed by level name; otherwise the single key is 'treatment_effect'.
    """
    effects: Dict[str, Dict[str, Any]] = {}
    conf_int_df = results.conf_int(alpha=0.05)
    # Greedy '.*' lets the optional argument list contain nested parentheses
    # (e.g. Treatment(reference='x')); backtracking finds the ')[T.' boundary.
    categorical_pattern = re.compile(
        r'^C\(' + re.escape(treatment_col_name) + r'(?:,.*)?\)\[T\.([^]]+)]$'
    )
    for idx, param_name in enumerate(results.params.index):
        if param_name == treatment_col_name:
            level = None  # Plain numeric treatment term.
        else:
            match = categorical_pattern.match(param_name)
            if not match:
                continue  # Not a treatment parameter (intercept, covariate, ...)
            level = match.group(1)
        stats = {
            'estimate': results.params.iloc[idx],
            'p_value': results.pvalues.iloc[idx],
            'conf_int': conf_int_df.iloc[idx].tolist(),
            'std_err': results.bse.iloc[idx],
        }
        if is_multilevel_case and level is not None and level != reference_level_str:
            effects[level] = stats
        else:
            effects['treatment_effect'] = stats
    return effects


def estimate_effect(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    covariates: Optional[List[str]] = None,
    query_str: Optional[str] = None,  # For potential LLM use
    llm: Optional[BaseChatModel] = None,  # For potential LLM use
    **kwargs  # To capture any other potential arguments
) -> Dict[str, Any]:
    """
    Estimates the causal effect using Linear Regression (OLS).

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable column.
        outcome: Name of the outcome variable column.
        covariates: Optional list of covariate names.
        query_str: Optional user query for context (used to inform the
            LLM-based parameter identification).
        llm: Optional Language Model instance. If None, a client is obtained
            via get_llm_client().
        **kwargs: Additional keyword arguments. Recognised keys:
            - interaction_term_suggested (bool): add a treatment:moderator term.
            - interaction_variable_candidate (str): original name of the moderator.
            - treatment_reference_level: reference level for a multi-level
              categorical treatment.
            - column_mappings (dict): per-column preprocessing metadata.

    Returns:
        Dictionary containing estimation results:
            - 'effect_estimate': Treatment coefficient (scalar, or dict keyed
              by level for multi-level treatments).
            - 'p_value', 'confidence_interval', 'standard_error': matching
              statistics (same scalar/dict shape as 'effect_estimate').
            - 'estimated_effects_by_level': per-level stats (multi-level only).
            - 'reference_level_used': reference level, when applicable.
            - 'formula': The regression formula used.
            - 'model_summary_text': statsmodels summary rendered as text.
            - 'diagnostics': Placeholder for diagnostic results.
            - 'interpretation_details' / 'interpretation': interpretation info.
            - 'method_used': constant method label.
            - 'warnings' (optional): present when extraction was incomplete.

    Raises:
        ValueError: If required columns are missing or no rows remain after
            dropping NaNs.
    """
    if covariates is None:
        covariates = []

    # Optional behaviour flags forwarded by the calling pipeline.
    interaction_term_suggested = kwargs.get('interaction_term_suggested', False)
    # interaction_variable_candidate is the *original* name from query_interpreter.
    interaction_variable_candidate_orig_name = kwargs.get('interaction_variable_candidate')
    treatment_reference_level = kwargs.get('treatment_reference_level')
    column_mappings = kwargs.get('column_mappings', {})

    required_cols = [treatment, outcome] + covariates
    # Mapping a suggested interaction variable to processed column(s) is
    # handled during formula construction; dropna() below handles the rows.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # The formula API builds its own design matrix from df_analysis, so all we
    # need here is a NaN-free frame of the involved columns. (The previous
    # version also built X/y via sm.add_constant — dead code, removed.)
    df_analysis = df[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for required columns.")

    # --- Formula Construction ---
    outcome_col_name = outcome
    treatment_col_name = treatment
    processed_covariate_col_names = covariates
    rhs_terms = []

    # 1. Treatment term: decide whether Patsy should treat it categorically.
    treatment_patsy_term = treatment_col_name  # Default: numeric
    original_treatment_info = column_mappings.get(treatment_col_name, {})  # From preprocess_data
    is_binary_encoded = original_treatment_info.get('transformed_as') == 'label_encoded_binary'
    is_still_categorical_in_df = df_analysis[treatment_col_name].dtype.name in ['object', 'category']

    if is_still_categorical_in_df and not is_binary_encoded:
        # Multi-level or binary categoricals not yet numeric.
        if treatment_reference_level:
            treatment_patsy_term = f"C({treatment_col_name}, Treatment(reference='{treatment_reference_level}'))"
            logger.info(f"Treating '{treatment_col_name}' as multi-level categorical with reference '{treatment_reference_level}'.")
        else:
            # Default C() wrapping; Patsy picks the reference level.
            treatment_patsy_term = f"C({treatment_col_name})"
            logger.info(f"Treating '{treatment_col_name}' as categorical (Patsy will pick reference).")
    elif is_binary_encoded:
        # Even though now numeric 0/1, C() keeps parameter naming consistent.
        treatment_patsy_term = f"C({treatment_col_name})"
        logger.info(f"Treating label-encoded binary '{treatment_col_name}' as categorical for Patsy.")
    else:
        logger.info(f"Treating '{treatment_col_name}' as numeric for Patsy formula.")
    rhs_terms.append(treatment_patsy_term)

    # True when the treatment enters the model as a multi-level categorical
    # with an explicit reference level; reused by extraction and reporting
    # (previously this condition was recomputed in three places).
    is_multilevel_case = bool(
        treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded
    )
    reference_level_str = str(treatment_reference_level) if is_multilevel_case else "N/A"

    # 2. Covariate terms (assumed numeric/dummy; wrap any stragglers in C()).
    for cov_col_name in processed_covariate_col_names:
        if cov_col_name == treatment_col_name:  # Defensive; list should be clean.
            continue
        if df_analysis[cov_col_name].dtype.name in ['object', 'category']:
            rhs_terms.append(f"C({cov_col_name})")
        else:
            rhs_terms.append(cov_col_name)

    # 3. Interaction term (only when the candidate maps to a single column).
    actual_interaction_term_added_to_formula = None
    if interaction_term_suggested and interaction_variable_candidate_orig_name:
        processed_interaction_col_name = None
        interaction_var_info = column_mappings.get(interaction_variable_candidate_orig_name, {})
        if interaction_var_info.get('transformed_as') == 'one_hot_encoded':
            logger.warning(f"Interaction with one-hot encoded variable '{interaction_variable_candidate_orig_name}' is complex. Currently skipping this interaction for Linear Regression.")
        elif interaction_var_info.get('new_column_name') and interaction_var_info['new_column_name'] in df_analysis.columns:
            processed_interaction_col_name = interaction_var_info['new_column_name']
        elif interaction_variable_candidate_orig_name in df_analysis.columns:
            # Not in mappings, or mapping didn't change the name (e.g. numeric).
            processed_interaction_col_name = interaction_variable_candidate_orig_name

        if processed_interaction_col_name:
            interaction_var_patsy_term = processed_interaction_col_name
            # Wrap in C() if the moderator itself is categorical/boolean.
            if df_analysis[processed_interaction_col_name].dtype.name in ['object', 'category', 'bool'] or \
               interaction_var_info.get('original_dtype') in ['bool', 'category']:
                interaction_var_patsy_term = f"C({processed_interaction_col_name})"
            actual_interaction_term_added_to_formula = f"{treatment_patsy_term}:{interaction_var_patsy_term}"
            rhs_terms.append(actual_interaction_term_added_to_formula)
            logger.info(f"Adding interaction term to formula: {actual_interaction_term_added_to_formula}")
        elif interaction_variable_candidate_orig_name:
            logger.warning(f"Could not resolve interaction variable candidate '{interaction_variable_candidate_orig_name}' to a single usable column in processed data. Skipping interaction term.")

    # Build the formula string for reporting and fitting.
    if not rhs_terms:  # Defensive; treatment is always appended above.
        formula = f"{outcome_col_name} ~ 1"
    else:
        formula = f"{outcome_col_name} ~ {' + '.join(rhs_terms)}"
    logger.info(f"Using formula for Linear Regression: {formula}")

    try:
        model = smf.ols(formula=formula, data=df_analysis)
        results = model.fit()
        logger.info("OLS model fitted successfully.")
        logger.debug(results.summary())  # Verbose summary kept at debug level.

        # --- Result Extraction: LLM attempt first, programmatic fallback after ---
        effect_estimates_by_level = {}
        all_params_extracted = False
        llm_extraction_successful = False

        # Honour an explicitly supplied llm; only fall back to the configured
        # client when none was passed (previously the argument was clobbered).
        if llm is None:
            llm = get_llm_client()

        if llm and query_str:
            logger.info(f"Attempting LLM-based result extraction (informed by query: '{query_str[:50]}...').")
            try:
                param_names_list = results.params.index.tolist()
                conf_int_df = results.conf_int(alpha=0.05)

                # Present parameters as "index: 'name'" lines so the LLM can
                # answer with indices we validate programmatically.
                indexed_param_names_str_for_prompt = "\n".join(
                    f"{idx}: '{name}'" for idx, name in enumerate(param_names_list)
                )
                prompt_text_for_identification = STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE.format(
                    user_query=query_str,
                    treatment_patsy_term=treatment_patsy_term,
                    treatment_col_name=treatment_col_name,
                    is_multilevel_case=is_multilevel_case,
                    reference_level_for_prompt=reference_level_str,
                    indexed_param_names_str=indexed_param_names_str_for_prompt,
                    llm_response_schema_json=json.dumps(LLMIdentifiedRelevantParams.model_json_schema(), indent=2)
                )
                llm_identification_response = _call_llm_for_var(llm, prompt_text_for_identification, LLMIdentifiedRelevantParams)

                if llm_identification_response and llm_identification_response.identified_params:
                    logger.info("LLM identified relevant parameters. Proceeding with programmatic extraction.")
                    for item in llm_identification_response.identified_params:
                        param_idx = item.param_index
                        # Validate index against actual parameter list length.
                        if not (0 <= param_idx < len(results.params.index)):
                            logger.warning(f"LLM returned an invalid parameter index: {param_idx}. Skipping.")
                            continue
                        actual_param_name = results.params.index[param_idx]
                        # Sanity check that the LLM-reported name matches the index.
                        if item.param_name != actual_param_name:
                            logger.warning(f"LLM returned param_name '{item.param_name}' but name at index {param_idx} is '{actual_param_name}'. Using actual name from results.")
                        current_effect_stats = {
                            'estimate': results.params.iloc[param_idx],
                            'p_value': results.pvalues.iloc[param_idx],
                            'conf_int': conf_int_df.iloc[param_idx].tolist(),
                            'std_err': results.bse.iloc[param_idx]
                        }
                        key_for_effect_dict = 'treatment_effect'  # Default for single/binary
                        if is_multilevel_case:
                            # Parse the level out of e.g. "C(treat)[T.level]".
                            match = re.search(r'\[T\.([^]]+)]', actual_param_name)
                            if match:
                                level = match.group(1)
                                if level != reference_level_str:  # Skip ref level itself
                                    key_for_effect_dict = level
                            else:
                                logger.warning(f"Could not parse level from LLM-identified param: {actual_param_name}. Storing with raw name.")
                                key_for_effect_dict = actual_param_name  # Fallback key
                        effect_estimates_by_level[key_for_effect_dict] = current_effect_stats

                    if effect_estimates_by_level:
                        all_params_extracted = llm_identification_response.all_parameters_successfully_identified
                        llm_extraction_successful = True
                        logger.info(f"Successfully processed LLM-identified parameters. all_parameters_successfully_identified={all_params_extracted}")
                        logger.debug(f"effect_estimates_by_level: {effect_estimates_by_level}")
                    else:
                        logger.warning("LLM identified parameters, but none could be processed into effect estimates. Falling back to programmatic extraction.")
                else:
                    logger.warning("LLM parameter identification did not yield usable parameters. Falling back to programmatic extraction.")
            except Exception as e_llm:
                logger.warning(f"LLM-based result extraction failed: {e_llm}. Falling back to programmatic extraction.", exc_info=True)

        if not llm_extraction_successful:
            # Fallback: match treatment coefficients by Patsy parameter name.
            # (The previous version logged a fallback but never performed one,
            # leaving all estimate fields as None.)
            effect_estimates_by_level = _extract_treatment_effects_from_params(
                results, treatment_col_name, is_multilevel_case, reference_level_str
            )
            if effect_estimates_by_level:
                all_params_extracted = True
                logger.info("Programmatic extraction of treatment coefficients succeeded.")

        # Collapse per-level stats into the top-level report fields: a single
        # 'treatment_effect' entry yields scalars; multi-level yields dicts
        # keyed by level (full details remain in effect_estimates_by_level).
        main_effect_estimate = None
        main_p_value = None
        main_conf_int = [None, None]
        main_std_err = None
        if effect_estimates_by_level:
            if 'treatment_effect' in effect_estimates_by_level:
                single_effect_data = effect_estimates_by_level['treatment_effect']
                main_effect_estimate = single_effect_data['estimate']
                main_p_value = single_effect_data['p_value']
                main_conf_int = single_effect_data['conf_int']
                main_std_err = single_effect_data['std_err']
            else:
                logger.info("Multi-level treatment effects extracted. Populating dicts for main estimate fields.")
                main_effect_estimate = {lvl: s.get('estimate') for lvl, s in effect_estimates_by_level.items()}
                main_p_value = {lvl: s.get('p_value') for lvl, s in effect_estimates_by_level.items()}
                # conf_int entries are already [low, high] lists.
                main_conf_int = {lvl: s.get('conf_int') for lvl, s in effect_estimates_by_level.items()}
                main_std_err = {lvl: s.get('std_err') for lvl, s in effect_estimates_by_level.items()}

        interpretation_details = {}
        if actual_interaction_term_added_to_formula and actual_interaction_term_added_to_formula in results.params.index:
            interpretation_details['interaction_term_coefficient'] = results.params[actual_interaction_term_added_to_formula]
            interpretation_details['interaction_term_p_value'] = results.pvalues[actual_interaction_term_added_to_formula]
            logger.info(f"Interaction term '{actual_interaction_term_added_to_formula}' coeff: {interpretation_details['interaction_term_coefficient']}")

        output_dict = {
            'effect_estimate': main_effect_estimate,
            'p_value': main_p_value,
            'confidence_interval': main_conf_int,
            'standard_error': main_std_err,
            'estimated_effects_by_level': effect_estimates_by_level if (is_multilevel_case and effect_estimates_by_level) else None,
            'reference_level_used': treatment_reference_level if is_multilevel_case else None,
            'formula': formula,
            'model_summary_text': results.summary().as_text(),  # Text serialises easily
            'diagnostics': {},  # Placeholder for future diagnostics integration
            'interpretation_details': interpretation_details,
            'interpretation': "Interpretation not available.",  # Placeholder for LLM interpretation
            'method_used': 'Linear Regression (OLS)'
        }
        if not all_params_extracted:
            output_dict['warnings'] = ["Could not reliably extract all requested parameters from model results. Please check model_summary_text."]
        return output_dict
    except Exception as e:
        logger.error(f"Linear Regression failed: {e}")
        raise  # Re-raise after logging so callers can handle/report the failure.