"""
Linear Regression Estimator for Causal Inference.
Uses Ordinary Least Squares (OLS) to estimate the treatment effect, potentially
adjusting for covariates.
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from typing import Dict, Any, List, Optional, Union
import logging
from langchain.chat_models.base import BaseChatModel
import re
import json
from pydantic import BaseModel, ValidationError
from langchain_core.messages import HumanMessage
from langchain_core.exceptions import OutputParserException
from auto_causal.models import LLMIdentifiedRelevantParams
from auto_causal.prompts.regression_prompts import STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE
from auto_causal.config import get_llm_client
# Placeholder for potential future LLM assistance integration
# from .llm_assist import interpret_lr_results, suggest_lr_covariates
# Placeholder for potential future diagnostics integration
# from .diagnostics import run_lr_diagnostics
logger = logging.getLogger(__name__)

def _call_llm_for_var(llm: BaseChatModel, prompt: str, pydantic_model: BaseModel) -> Optional[BaseModel]:
    """Helper to call LLM with structured output and handle errors."""
    try:
        messages = [HumanMessage(content=prompt)]
        structured_llm = llm.with_structured_output(pydantic_model)
        parsed_result = structured_llm.invoke(messages)
        return parsed_result
    except (OutputParserException, ValidationError) as e:
        logger.error(f"LLM call failed parsing/validation for {pydantic_model.__name__}: {e}")
    except Exception as e:
        logger.error(f"LLM call failed unexpectedly for {pydantic_model.__name__}: {e}", exc_info=True)
    return None
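
# Example use (illustrative; mirrors the call made later in this module):
#   _call_llm_for_var(llm, prompt_text, LLMIdentifiedRelevantParams)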

# Define module-level helper function
def _clean_variable_name_for_patsy_local(name: str) -> str:
    """Sanitise a column name so it is safe to use in a Patsy formula."""
    if not isinstance(name, str):
        name = str(name)
    name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
    if not re.match(r'^[a-zA-Z_]', name):
        name = 'var_' + name
    return name
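
# For illustration (hypothetical inputs):
#   _clean_variable_name_for_patsy_local("Income ($)") -> "Income____"
#   _clean_variable_name_for_patsy_local("2021_sales") -> "var_2021_sales"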

def estimate_effect(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    covariates: Optional[List[str]] = None,
    query_str: Optional[str] = None,  # For potential LLM use
    llm: Optional[BaseChatModel] = None,  # For potential LLM use
    **kwargs  # To capture any other potential arguments
) -> Dict[str, Any]:
    """
    Estimates the causal effect using Linear Regression (OLS).

    Args:
        df: Input DataFrame.
        treatment: Name of the treatment variable column.
        outcome: Name of the outcome variable column.
        covariates: Optional list of covariate names.
        query_str: Optional user query for context (e.g., for LLM).
        llm: Optional Language Model instance.
        **kwargs: Additional keyword arguments.

    Returns:
        Dictionary containing estimation results:
        - 'effect_estimate': The estimated coefficient for the treatment variable.
        - 'p_value': The p-value associated with the treatment coefficient.
        - 'confidence_interval': The 95% confidence interval for the effect.
        - 'standard_error': The standard error of the treatment coefficient.
        - 'formula': The regression formula used.
        - 'model_summary_text': Text rendering of the statsmodels summary.
        - 'diagnostics': Placeholder for diagnostic results.
        - 'interpretation': Placeholder for LLM interpretation.
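
    Example (illustrative sketch; a DataFrame 'df' with columns 'y', 't', and 'age' is assumed):
        results = estimate_effect(df, treatment='t', outcome='y', covariates=['age'])
        print(results['formula'], results['effect_estimate'])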
"""
    if covariates is None:
        covariates = []

    # Retrieve additional args from kwargs
    interaction_term_suggested = kwargs.get('interaction_term_suggested', False)
    # interaction_variable_candidate is the *original* name from query_interpreter
    interaction_variable_candidate_orig_name = kwargs.get('interaction_variable_candidate')
    treatment_reference_level = kwargs.get('treatment_reference_level')
    column_mappings = kwargs.get('column_mappings', {})

    required_cols = [treatment, outcome] + covariates
    # If an interaction variable is suggested, ensuring it (or its processed form) is in df
    # is complex here, since interaction_variable_candidate_orig_name must be mapped to
    # processed column(s). We rely on df_analysis.dropna() and formula construction to
    # handle missing interaction-variable columns later.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Prepare data for statsmodels (drop rows with NaNs in any required column)
    df_analysis = df[required_cols].dropna()
    if df_analysis.empty:
        raise ValueError("No data remaining after dropping NaNs for required columns.")

    # NOTE: X and y are not used below; the model is fitted via the formula API (smf.ols).
    X = df_analysis[[treatment] + covariates]
    X = sm.add_constant(X)  # Add intercept
    y = df_analysis[outcome]

    # --- Formula Construction ---
    outcome_col_name = outcome  # Name in processed df
    treatment_col_name = treatment  # Name in processed df
    processed_covariate_col_names = covariates  # List of names in processed df
    rhs_terms = []

    # 1. Treatment Term
    treatment_patsy_term = treatment_col_name  # Default
    original_treatment_info = column_mappings.get(treatment_col_name, {})  # Info from preprocess_data
    is_binary_encoded = original_treatment_info.get('transformed_as') == 'label_encoded_binary'
    is_still_categorical_in_df = df_analysis[treatment_col_name].dtype.name in ['object', 'category']
    if is_still_categorical_in_df and not is_binary_encoded:  # Covers multi-level and binary categoricals not yet numeric
        if treatment_reference_level:
            treatment_patsy_term = f"C({treatment_col_name}, Treatment(reference='{treatment_reference_level}'))"
            logger.info(f"Treating '{treatment_col_name}' as multi-level categorical with reference '{treatment_reference_level}'.")
        else:
            # Default C() wrapping for categoricals if no specific reference is given.
            # This applies to multi-level or binary categoricals that were not
            # label-encoded to 0/1 by preprocess_data.
            treatment_patsy_term = f"C({treatment_col_name})"
            logger.info(f"Treating '{treatment_col_name}' as categorical (Patsy will pick reference).")
    elif is_binary_encoded:  # Was binary and explicitly label-encoded to 0/1 by preprocess_data
        # Even though it is now numeric 0/1, C() ensures Patsy treats it categorically
        # for parameter-naming consistency.
        treatment_patsy_term = f"C({treatment_col_name})"
        logger.info(f"Treating label-encoded binary '{treatment_col_name}' as categorical for Patsy.")
    else:  # Already numeric (continuous, or discrete numeric not needing C() for the main effect)
        # treatment_patsy_term remains treatment_col_name (default)
        logger.info(f"Treating '{treatment_col_name}' as numeric for Patsy formula.")
    rhs_terms.append(treatment_patsy_term)
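    # For illustration (hypothetical column names): a numeric treatment yields the bare
    # term "dose"; a categorical one yields "C(plan)"; and with a reference level it
    # becomes "C(plan, Treatment(reference='basic'))".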

    # 2. Covariate Terms
    for cov_col_name in processed_covariate_col_names:
        if cov_col_name == treatment_col_name:  # Should not happen if the covariates list is clean
            continue
        # Assume covariates are already numeric/dummy. If one is still object/category
        # in df_analysis (unlikely), wrap it in C().
        if df_analysis[cov_col_name].dtype.name in ['object', 'category']:
            rhs_terms.append(f"C({cov_col_name})")
        else:
            rhs_terms.append(cov_col_name)

    # 3. Interaction Term (simplified: interaction_variable_candidate_orig_name must
    #    map to a single column in df_analysis)
    actual_interaction_term_added_to_formula = None
    if interaction_term_suggested and interaction_variable_candidate_orig_name:
        processed_interaction_col_name = None
        interaction_var_info = column_mappings.get(interaction_variable_candidate_orig_name, {})
        if interaction_var_info.get('transformed_as') == 'one_hot_encoded':
            logger.warning(f"Interaction with one-hot encoded variable '{interaction_variable_candidate_orig_name}' is complex. Currently skipping this interaction for Linear Regression.")
        elif interaction_var_info.get('new_column_name') and interaction_var_info['new_column_name'] in df_analysis.columns:
            processed_interaction_col_name = interaction_var_info['new_column_name']
        elif interaction_variable_candidate_orig_name in df_analysis.columns:  # Not in mappings, or mapping kept the name (e.g. numeric)
            processed_interaction_col_name = interaction_variable_candidate_orig_name

        if processed_interaction_col_name:
            interaction_var_patsy_term = processed_interaction_col_name
            # If the processed interaction column itself is categorical (e.g. label-encoded binary)
            if df_analysis[processed_interaction_col_name].dtype.name in ['object', 'category', 'bool'] or \
               interaction_var_info.get('original_dtype') in ['bool', 'category']:
                interaction_var_patsy_term = f"C({processed_interaction_col_name})"
            actual_interaction_term_added_to_formula = f"{treatment_patsy_term}:{interaction_var_patsy_term}"
            rhs_terms.append(actual_interaction_term_added_to_formula)
            logger.info(f"Adding interaction term to formula: {actual_interaction_term_added_to_formula}")
        elif interaction_variable_candidate_orig_name:  # Suggested, but could not be mapped/found
            logger.warning(f"Could not resolve interaction variable candidate '{interaction_variable_candidate_orig_name}' to a single usable column in processed data. Skipping interaction term.")

    # Build the formula string for reporting and fitting
    if not rhs_terms:  # Should always contain at least the treatment term
        formula = f"{outcome_col_name} ~ 1"
    else:
        formula = f"{outcome_col_name} ~ {' + '.join(rhs_terms)}"
    logger.info(f"Using formula for Linear Regression: {formula}")

    try:
        model = smf.ols(formula=formula, data=df_analysis)
        results = model.fit()
        logger.info("OLS model fitted successfully.")
        logger.debug(results.summary())  # Debug level to keep default logging less verbose

        # --- Result Extraction: LLM attempt first, then regex fallback ---
        effect_estimates_by_level = {}
        all_params_extracted = False  # Default to False
        llm_extraction_successful = False

        # Attempt LLM-based extraction if an LLM client and query are available
        if llm is None:
            llm = get_llm_client()
        if llm and query_str:
            logger.info(f"Attempting LLM-based result extraction (informed by query: '{query_str[:50]}...').")
            try:
                # Gather per-parameter statistics from the fitted model
                param_names_list = results.params.index.tolist()
                param_estimates_list = results.params.tolist()
                param_p_values_list = results.pvalues.tolist()
                param_std_errs_list = results.bse.tolist()
                conf_int_df = results.conf_int(alpha=0.05)
                param_conf_ints_low_list = []
                param_conf_ints_high_list = []
                if not conf_int_df.empty and len(conf_int_df.columns) == 2:
                    aligned_conf_int_df = conf_int_df.reindex(results.params.index)
                    param_conf_ints_low_list = aligned_conf_int_df.iloc[:, 0].fillna(float('nan')).tolist()
                    param_conf_ints_high_list = aligned_conf_int_df.iloc[:, 1].fillna(float('nan')).tolist()
                else:
                    nan_list_ci = [float('nan')] * len(param_names_list)
                    param_conf_ints_low_list = nan_list_ci
                    param_conf_ints_high_list = nan_list_ci

                # Prompt template tailored for this extraction task:
                # moved to causalscientist/auto_causal/prompts/regression_prompts.py
                is_multilevel_case_for_prompt = bool(treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded)
                reference_level_for_prompt_str = str(treatment_reference_level) if is_multilevel_case_for_prompt else "N/A"
                indexed_param_names_for_prompt = [f"{idx}: '{name}'" for idx, name in enumerate(param_names_list)]
                indexed_param_names_str_for_prompt = "\n".join(indexed_param_names_for_prompt)
                prompt_text_for_identification = STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE.format(
                    user_query=query_str,
                    treatment_patsy_term=treatment_patsy_term,
                    treatment_col_name=treatment_col_name,
                    is_multilevel_case=is_multilevel_case_for_prompt,
                    reference_level_for_prompt=reference_level_for_prompt_str,
                    indexed_param_names_str=indexed_param_names_str_for_prompt,  # Pass the indexed list as a string
                    llm_response_schema_json=json.dumps(LLMIdentifiedRelevantParams.model_json_schema(), indent=2)
                )

                llm_identification_response = _call_llm_for_var(llm, prompt_text_for_identification, LLMIdentifiedRelevantParams)

                if llm_identification_response and llm_identification_response.identified_params:
                    logger.info("LLM identified relevant parameters. Proceeding with programmatic extraction.")
                    for item in llm_identification_response.identified_params:
                        param_idx = item.param_index
                        # Validate index against actual list length
                        if 0 <= param_idx < len(results.params.index):
                            actual_param_name = results.params.index[param_idx]
                            # Sanity check: does the LLM-returned name match the actual name at this index?
                            if item.param_name != actual_param_name:
                                logger.warning(f"LLM returned param_name '{item.param_name}' but name at index {param_idx} is '{actual_param_name}'. Using actual name from results.")
                            current_effect_stats = {
                                'estimate': results.params.iloc[param_idx],
                                'p_value': results.pvalues.iloc[param_idx],
                                'conf_int': results.conf_int(alpha=0.05).iloc[param_idx].tolist(),
                                'std_err': results.bse.iloc[param_idx]
                            }
                            key_for_effect_dict = 'treatment_effect'  # Default for single/binary
                            if is_multilevel_case_for_prompt:  # Multi-level case
                                match = re.search(r'\[T\.([^]]+)]', actual_param_name)  # Use actual_param_name
                                if match:
                                    level = match.group(1)
                                    if level != reference_level_for_prompt_str:  # Ensure it's not the reference level itself
                                        key_for_effect_dict = level
                                else:
                                    logger.warning(f"Could not parse level from LLM-identified param: {actual_param_name}. Storing with raw name.")
                                    key_for_effect_dict = actual_param_name  # Fallback key
                            effect_estimates_by_level[key_for_effect_dict] = current_effect_stats
                        else:
                            logger.warning(f"LLM returned an invalid parameter index: {param_idx}. Skipping.")

                    if effect_estimates_by_level:  # At least one effect was successfully processed
                        all_params_extracted = llm_identification_response.all_parameters_successfully_identified
                        llm_extraction_successful = True
                        logger.info(f"Successfully processed LLM-identified parameters. all_parameters_successfully_identified={all_params_extracted}")
                        logger.debug(f"effect_estimates_by_level: {effect_estimates_by_level}")
                    else:
                        logger.warning("LLM identified parameters, but none could be processed into effect_estimates_by_level. Falling back to regex.")
                else:
                    logger.warning("LLM parameter identification did not yield usable parameters. Falling back to regex.")
            except Exception as e_llm:
                logger.warning(f"LLM-based result extraction failed: {e_llm}. Falling back to regex.", exc_info=True)

        # --- End of Existing Regex Logic Block ---

        # Primary effect_estimate for simple reporting (e.g. the first or only level).
        # For multi-level treatments this is ambiguous; the full details live in
        # effect_estimates_by_level.
        main_effect_estimate = None
        main_p_value = None
        main_conf_int = [None, None]  # Default for single effect or no effects
        main_std_err = None

        if effect_estimates_by_level:
            if 'treatment_effect' in effect_estimates_by_level:  # Single effect case
                single_effect_data = effect_estimates_by_level['treatment_effect']
                main_effect_estimate = single_effect_data['estimate']
                main_p_value = single_effect_data['p_value']
                main_conf_int = single_effect_data['conf_int']
                main_std_err = single_effect_data['std_err']
            else:  # Multi-level case
                logger.info("Multi-level treatment effects extracted. Populating dicts for main estimate fields.")
                effect_estimate_dict = {}
                p_value_dict = {}
                conf_int_dict = {}
                std_err_dict = {}
                for level, stats in effect_estimates_by_level.items():
                    effect_estimate_dict[level] = stats.get('estimate')
                    p_value_dict[level] = stats.get('p_value')
                    conf_int_dict[level] = stats.get('conf_int')  # Already a [low, high] list
                    std_err_dict[level] = stats.get('std_err')
                main_effect_estimate = effect_estimate_dict
                main_p_value = p_value_dict
                main_conf_int = conf_int_dict
                main_std_err = std_err_dict

        interpretation_details = {}
        if actual_interaction_term_added_to_formula and actual_interaction_term_added_to_formula in results.params.index:
            interpretation_details['interaction_term_coefficient'] = results.params[actual_interaction_term_added_to_formula]
            interpretation_details['interaction_term_p_value'] = results.pvalues[actual_interaction_term_added_to_formula]
            logger.info(f"Interaction term '{actual_interaction_term_added_to_formula}' coeff: {interpretation_details['interaction_term_coefficient']}")

        diag_results = {}
        interpretation = "Interpretation not available."

        output_dict = {
            'effect_estimate': main_effect_estimate,
            'p_value': main_p_value,
            'confidence_interval': main_conf_int,
            'standard_error': main_std_err,
            'estimated_effects_by_level': effect_estimates_by_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded and effect_estimates_by_level) else None,
            'reference_level_used': treatment_reference_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded) else None,
            'formula': formula,
            'model_summary_text': results.summary().as_text(),  # Stored as text for easier serialization
            'diagnostics': diag_results,
            'interpretation_details': interpretation_details,  # Includes interaction details when present
            'interpretation': interpretation,
            'method_used': 'Linear Regression (OLS)'
        }
        if not all_params_extracted:
            output_dict['warnings'] = ["Could not reliably extract all requested parameters from model results. Please check model_summary_text."]
        return output_dict

    except Exception as e:
        logger.error(f"Linear Regression failed: {e}")
        raise  # Re-raise the exception after logging
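

# Minimal usage sketch (illustrative only): synthetic data with a known effect of 2.0.
# Note: estimate_effect resolves an LLM client via get_llm_client() when none is passed,
# so a configured auto_causal environment is assumed; without one, the LLM-based
# extraction is skipped or fails, and results['model_summary_text'] should be inspected.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    n = 500
    demo_df = pd.DataFrame({
        't': rng.integers(0, 2, n),    # binary numeric treatment
        'age': rng.normal(40, 10, n),  # covariate
    })
    demo_df['y'] = 2.0 * demo_df['t'] + 0.1 * demo_df['age'] + rng.normal(0, 1, n)

    res = estimate_effect(demo_df, treatment='t', outcome='y', covariates=['age'],
                          query_str="What is the effect of t on y?")
    print(res['formula'], res['effect_estimate'])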