"""
Linear Regression Estimator for Causal Inference.
Uses Ordinary Least Squares (OLS) to estimate the treatment effect, potentially
adjusting for covariates.
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from typing import Dict, Any, List, Optional, Union
import logging
from langchain.chat_models.base import BaseChatModel
import re
import json
from pydantic import BaseModel, ValidationError
from langchain_core.messages import HumanMessage
from langchain_core.exceptions import OutputParserException
from auto_causal.models import LLMIdentifiedRelevantParams
from auto_causal.prompts.regression_prompts import STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE
from auto_causal.config import get_llm_client
# Placeholder for potential future LLM assistance integration
# from .llm_assist import interpret_lr_results, suggest_lr_covariates
# Placeholder for potential future diagnostics integration
# from .diagnostics import run_lr_diagnostics
logger = logging.getLogger(__name__)

def _call_llm_for_var(llm: BaseChatModel, prompt: str, pydantic_model: BaseModel) -> Optional[BaseModel]:
    """Helper to call LLM with structured output and handle errors."""
    try:
        messages = [HumanMessage(content=prompt)]
        structured_llm = llm.with_structured_output(pydantic_model)
        parsed_result = structured_llm.invoke(messages)
        return parsed_result
    except (OutputParserException, ValidationError) as e:
        logger.error(f"LLM call failed parsing/validation for {pydantic_model.__name__}: {e}")
    except Exception as e:
        logger.error(f"LLM call failed unexpectedly for {pydantic_model.__name__}: {e}", exc_info=True)
    return None
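
# Example use (illustrative; mirrors the call made later in this module):
#   _call_llm_for_var(llm, prompt_text, LLMIdentifiedRelevantParams)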

# Define module-level helper function
def _clean_variable_name_for_patsy_local(name: str) -> str:
    """Sanitise a column name so it is safe to use in a Patsy formula."""
    if not isinstance(name, str):
        name = str(name)
    name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
    if not re.match(r'^[a-zA-Z_]', name):
        name = 'var_' + name
    return name
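
# For illustration (hypothetical inputs):
#   _clean_variable_name_for_patsy_local("Income ($)") -> "Income____"
#   _clean_variable_name_for_patsy_local("2021_sales") -> "var_2021_sales"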

def estimate_effect(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    covariates: Optional[List[str]] = None,
    query_str: Optional[str] = None,  # For potential LLM use
    llm: Optional[BaseChatModel] = None,  # For potential LLM use
    **kwargs  # To capture any other potential arguments
) -> Dict[str, Any]:
    """
    Estimates the causal effect using Linear Regression (OLS).

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable column.
        outcome: Name of the outcome variable column.
        covariates: Optional list of covariate names.
        query_str: Optional user query for context (e.g., for LLM).
        llm: Optional Language Model instance.
        **kwargs: Additional keyword arguments.

    Returns:
        Dictionary containing estimation results:
        - 'effect_estimate': The estimated coefficient for the treatment variable.
        - 'p_value': The p-value associated with the treatment coefficient.
        - 'confidence_interval': The 95% confidence interval for the effect.
        - 'standard_error': The standard error of the treatment coefficient.
        - 'formula': The regression formula used.
        - 'model_summary_text': Text rendering of the statsmodels summary.
        - 'diagnostics': Placeholder for diagnostic results.
        - 'interpretation': Placeholder for LLM interpretation.
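
    Example (illustrative sketch; a DataFrame 'df' with columns 'y', 't', and 'age' is assumed):
        results = estimate_effect(df, treatment='t', outcome='y', covariates=['age'])
        print(results['formula'], results['effect_estimate'])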
"""
    if covariates is None:
        covariates = []

    # Retrieve additional args from kwargs
    interaction_term_suggested = kwargs.get('interaction_term_suggested', False)
    # interaction_variable_candidate is the *original* name from query_interpreter
    interaction_variable_candidate_orig_name = kwargs.get('interaction_variable_candidate')
    treatment_reference_level = kwargs.get('treatment_reference_level')
    column_mappings = kwargs.get('column_mappings', {})

    required_cols = [treatment, outcome] + covariates
    # If an interaction variable is suggested, ensuring it (or its processed form) is in df
    # is complex here, since interaction_variable_candidate_orig_name must be mapped to
    # processed column(s). We rely on df_analysis.dropna() and formula construction to
    # handle missing interaction-variable columns later.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Prepare data for statsmodels (drop rows with NaNs in any required column)
    df_analysis = df[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for required columns.")

    # NOTE: X and y are not used below; the model is fitted via the formula API (smf.ols).
    X = df_analysis[[treatment] + covariates]
    X = sm.add_constant(X)  # Add intercept
    y = df_analysis[outcome]

    # --- Formula Construction ---
    outcome_col_name = outcome  # Name in processed df
    treatment_col_name = treatment  # Name in processed df
    processed_covariate_col_names = covariates  # List of names in processed df
    rhs_terms = []

    # 1. Treatment Term
    treatment_patsy_term = treatment_col_name  # Default
    original_treatment_info = column_mappings.get(treatment_col_name, {})  # Info from preprocess_data
    is_binary_encoded = original_treatment_info.get('transformed_as') == 'label_encoded_binary'
    is_still_categorical_in_df = df_analysis[treatment_col_name].dtype.name in ['object', 'category']
    if is_still_categorical_in_df and not is_binary_encoded:  # Covers multi-level and binary categoricals not yet numeric
        if treatment_reference_level:
            treatment_patsy_term = f"C({treatment_col_name}, Treatment(reference='{treatment_reference_level}'))"
            logger.info(f"Treating '{treatment_col_name}' as multi-level categorical with reference '{treatment_reference_level}'.")
        else:
            # Default C() wrapping for categoricals if no specific reference is given.
            # This applies to multi-level or binary categoricals that were not
            # label-encoded to 0/1 by preprocess_data.
            treatment_patsy_term = f"C({treatment_col_name})"
            logger.info(f"Treating '{treatment_col_name}' as categorical (Patsy will pick reference).")
    elif is_binary_encoded:  # Was binary and explicitly label-encoded to 0/1 by preprocess_data
        # Even though it is now numeric 0/1, C() ensures Patsy treats it categorically
        # for parameter-naming consistency.
        treatment_patsy_term = f"C({treatment_col_name})"
        logger.info(f"Treating label-encoded binary '{treatment_col_name}' as categorical for Patsy.")
    else:  # Already numeric (continuous, or discrete numeric not needing C() for the main effect)
        # treatment_patsy_term remains treatment_col_name (default)
        logger.info(f"Treating '{treatment_col_name}' as numeric for Patsy formula.")
    rhs_terms.append(treatment_patsy_term)
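    # For illustration (hypothetical column names): a numeric treatment yields the bare
    # term "dose"; a categorical one yields "C(plan)"; and with a reference level it
    # becomes "C(plan, Treatment(reference='basic'))".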

    # 2. Covariate Terms
    for cov_col_name in processed_covariate_col_names:
        if cov_col_name == treatment_col_name:  # Should not happen if the covariates list is clean
            continue
        # Assume covariates are already numeric/dummy. If one is still object/category
        # in df_analysis (unlikely), wrap it in C().
        if df_analysis[cov_col_name].dtype.name in ['object', 'category']:
            rhs_terms.append(f"C({cov_col_name})")
        else:
            rhs_terms.append(cov_col_name)

    # 3. Interaction Term (simplified: interaction_variable_candidate_orig_name must
    #    map to a single column in df_analysis)
    actual_interaction_term_added_to_formula = None
    if interaction_term_suggested and interaction_variable_candidate_orig_name:
        processed_interaction_col_name = None
        interaction_var_info = column_mappings.get(interaction_variable_candidate_orig_name, {})
        if interaction_var_info.get('transformed_as') == 'one_hot_encoded':
            logger.warning(f"Interaction with one-hot encoded variable '{interaction_variable_candidate_orig_name}' is complex. Currently skipping this interaction for Linear Regression.")
        elif interaction_var_info.get('new_column_name') and interaction_var_info['new_column_name'] in df_analysis.columns:
            processed_interaction_col_name = interaction_var_info['new_column_name']
        elif interaction_variable_candidate_orig_name in df_analysis.columns:  # Not in mappings, or mapping kept the name (e.g. numeric)
            processed_interaction_col_name = interaction_variable_candidate_orig_name

        if processed_interaction_col_name:
            interaction_var_patsy_term = processed_interaction_col_name
            # If the processed interaction column itself is categorical (e.g. label-encoded binary)
            if df_analysis[processed_interaction_col_name].dtype.name in ['object', 'category', 'bool'] or \
               interaction_var_info.get('original_dtype') in ['bool', 'category']:
                interaction_var_patsy_term = f"C({processed_interaction_col_name})"
            actual_interaction_term_added_to_formula = f"{treatment_patsy_term}:{interaction_var_patsy_term}"
            rhs_terms.append(actual_interaction_term_added_to_formula)
            logger.info(f"Adding interaction term to formula: {actual_interaction_term_added_to_formula}")
        elif interaction_variable_candidate_orig_name:  # Suggested, but could not be mapped/found
            logger.warning(f"Could not resolve interaction variable candidate '{interaction_variable_candidate_orig_name}' to a single usable column in processed data. Skipping interaction term.")

    # Build the formula string for reporting and fitting
    if not rhs_terms:  # Should always contain at least the treatment term
        formula = f"{outcome_col_name} ~ 1"
    else:
        formula = f"{outcome_col_name} ~ {' + '.join(rhs_terms)}"
    logger.info(f"Using formula for Linear Regression: {formula}")

    try:
        model = smf.ols(formula=formula, data=df_analysis)
        results = model.fit()
        logger.info("OLS model fitted successfully.")
        logger.debug(results.summary())  # Debug level to keep default logging less verbose

        # --- Result Extraction: LLM attempt first, then regex fallback ---
        effect_estimates_by_level = {}
        all_params_extracted = False  # Default to False
        llm_extraction_successful = False

        # Attempt LLM-based extraction if an LLM client and query are available
        if llm is None:
            llm = get_llm_client()
        if llm and query_str:
            logger.info(f"Attempting LLM-based result extraction (informed by query: '{query_str[:50]}...').")
            try:
                # Gather per-parameter statistics from the fitted model
                param_names_list = results.params.index.tolist()
                param_estimates_list = results.params.tolist()
                param_p_values_list = results.pvalues.tolist()
                param_std_errs_list = results.bse.tolist()
                conf_int_df = results.conf_int(alpha=0.05)
                param_conf_ints_low_list = []
                param_conf_ints_high_list = []
                if not conf_int_df.empty and len(conf_int_df.columns) == 2:
                    aligned_conf_int_df = conf_int_df.reindex(results.params.index)
                    param_conf_ints_low_list = aligned_conf_int_df.iloc[:, 0].fillna(float('nan')).tolist()
                    param_conf_ints_high_list = aligned_conf_int_df.iloc[:, 1].fillna(float('nan')).tolist()
                else:
                    nan_list_ci = [float('nan')] * len(param_names_list)
                    param_conf_ints_low_list = nan_list_ci
                    param_conf_ints_high_list = nan_list_ci

                # Prompt template tailored for this extraction task:
                # moved to causalscientist/auto_causal/prompts/regression_prompts.py
                is_multilevel_case_for_prompt = bool(treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded)
                reference_level_for_prompt_str = str(treatment_reference_level) if is_multilevel_case_for_prompt else "N/A"
                indexed_param_names_for_prompt = [f"{idx}: '{name}'" for idx, name in enumerate(param_names_list)]
                indexed_param_names_str_for_prompt = "\n".join(indexed_param_names_for_prompt)
                prompt_text_for_identification = STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE.format(
                    user_query=query_str,
                    treatment_patsy_term=treatment_patsy_term,
                    treatment_col_name=treatment_col_name,
                    is_multilevel_case=is_multilevel_case_for_prompt,
                    reference_level_for_prompt=reference_level_for_prompt_str,
                    indexed_param_names_str=indexed_param_names_str_for_prompt,  # Pass the indexed list as a string
                    llm_response_schema_json=json.dumps(LLMIdentifiedRelevantParams.model_json_schema(), indent=2)
                )

                llm_identification_response = _call_llm_for_var(llm, prompt_text_for_identification, LLMIdentifiedRelevantParams)

                if llm_identification_response and llm_identification_response.identified_params:
                    logger.info("LLM identified relevant parameters. Proceeding with programmatic extraction.")
                    for item in llm_identification_response.identified_params:
                        param_idx = item.param_index
                        # Validate index against actual list length
                        if 0 <= param_idx < len(results.params.index):
                            actual_param_name = results.params.index[param_idx]
                            # Sanity check: does the LLM-returned name match the actual name at this index?
                            if item.param_name != actual_param_name:
                                logger.warning(f"LLM returned param_name '{item.param_name}' but name at index {param_idx} is '{actual_param_name}'. Using actual name from results.")
                            current_effect_stats = {
                                'estimate': results.params.iloc[param_idx],
                                'p_value': results.pvalues.iloc[param_idx],
                                'conf_int': results.conf_int(alpha=0.05).iloc[param_idx].tolist(),
                                'std_err': results.bse.iloc[param_idx]
                            }
                            key_for_effect_dict = 'treatment_effect'  # Default for single/binary
                            if is_multilevel_case_for_prompt:  # Multi-level case
                                match = re.search(r'\[T\.([^]]+)]', actual_param_name)  # Use actual_param_name
                                if match:
                                    level = match.group(1)
                                    if level != reference_level_for_prompt_str:  # Ensure it's not the reference level itself
                                        key_for_effect_dict = level
                                else:
                                    logger.warning(f"Could not parse level from LLM-identified param: {actual_param_name}. Storing with raw name.")
                                    key_for_effect_dict = actual_param_name  # Fallback key
                            effect_estimates_by_level[key_for_effect_dict] = current_effect_stats
                        else:
                            logger.warning(f"LLM returned an invalid parameter index: {param_idx}. Skipping.")

                    if effect_estimates_by_level:  # At least one effect was successfully processed
                        all_params_extracted = llm_identification_response.all_parameters_successfully_identified
                        llm_extraction_successful = True
                        logger.info(f"Successfully processed LLM-identified parameters. all_parameters_successfully_identified={all_params_extracted}")
                        logger.debug(f"effect_estimates_by_level: {effect_estimates_by_level}")
                    else:
                        logger.warning("LLM identified parameters, but none could be processed into effect_estimates_by_level. Falling back to regex.")
                else:
                    logger.warning("LLM parameter identification did not yield usable parameters. Falling back to regex.")
            except Exception as e_llm:
                logger.warning(f"LLM-based result extraction failed: {e_llm}. Falling back to regex.", exc_info=True)

        # --- End of Existing Regex Logic Block ---

        # Primary effect_estimate for simple reporting (e.g. the first or only level).
        # For multi-level treatments this is ambiguous; the full details live in
        # effect_estimates_by_level.
        main_effect_estimate = None
        main_p_value = None
        main_conf_int = [None, None]  # Default for single effect or no effects
        main_std_err = None

        if effect_estimates_by_level:
            if 'treatment_effect' in effect_estimates_by_level:  # Single effect case
                single_effect_data = effect_estimates_by_level['treatment_effect']
                main_effect_estimate = single_effect_data['estimate']
                main_p_value = single_effect_data['p_value']
                main_conf_int = single_effect_data['conf_int']
                main_std_err = single_effect_data['std_err']
            else:  # Multi-level case
                logger.info("Multi-level treatment effects extracted. Populating dicts for main estimate fields.")
                effect_estimate_dict = {}
                p_value_dict = {}
                conf_int_dict = {}
                std_err_dict = {}
                for level, stats in effect_estimates_by_level.items():
                    effect_estimate_dict[level] = stats.get('estimate')
                    p_value_dict[level] = stats.get('p_value')
                    conf_int_dict[level] = stats.get('conf_int')  # Already a [low, high] list
                    std_err_dict[level] = stats.get('std_err')
                main_effect_estimate = effect_estimate_dict
                main_p_value = p_value_dict
                main_conf_int = conf_int_dict
                main_std_err = std_err_dict

        interpretation_details = {}
        if actual_interaction_term_added_to_formula and actual_interaction_term_added_to_formula in results.params.index:
            interpretation_details['interaction_term_coefficient'] = results.params[actual_interaction_term_added_to_formula]
            interpretation_details['interaction_term_p_value'] = results.pvalues[actual_interaction_term_added_to_formula]
            logger.info(f"Interaction term '{actual_interaction_term_added_to_formula}' coeff: {interpretation_details['interaction_term_coefficient']}")

        diag_results = {}
        interpretation = "Interpretation not available."

        output_dict = {
            'effect_estimate': main_effect_estimate,
            'p_value': main_p_value,
            'confidence_interval': main_conf_int,
            'standard_error': main_std_err,
            'estimated_effects_by_level': effect_estimates_by_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded and effect_estimates_by_level) else None,
            'reference_level_used': treatment_reference_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded) else None,
            'formula': formula,
            'model_summary_text': results.summary().as_text(),  # Stored as text for easier serialization
            'diagnostics': diag_results,
            'interpretation_details': interpretation_details,  # Includes interaction details when present
            'interpretation': interpretation,
            'method_used': 'Linear Regression (OLS)'
        }
        if not all_params_extracted:
            output_dict['warnings'] = ["Could not reliably extract all requested parameters from model results. Please check model_summary_text."]
        return output_dict

    except Exception as e:
        logger.error(f"Linear Regression failed: {e}")
        raise  # Re-raise the exception after logging
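

# Minimal usage sketch (illustrative only): synthetic data with a known effect of 2.0.
# Note: estimate_effect resolves an LLM client via get_llm_client() when none is passed,
# so a configured auto_causal environment is assumed; without one, the LLM-based
# extraction is skipped or fails, and results['model_summary_text'] should be inspected.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    n = 500
    demo_df = pd.DataFrame({
        't': rng.integers(0, 2, n),    # binary numeric treatment
        'age': rng.normal(40, 10, n),  # covariate
    })
    demo_df['y'] = 2.0 * demo_df['t'] + 0.1 * demo_df['age'] + rng.normal(0, 1, n)

    res = estimate_effect(demo_df, treatment='t', outcome='y', covariates=['age'],
                          query_str="What is the effect of t on y?")
    print(res['formula'], res['effect_estimate'])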