File size: 2,698 Bytes
1721aea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
Basic descriptive statistics for Difference in Means.
"""

from typing import Dict, Any
import pandas as pd
import numpy as np
import logging

logger = logging.getLogger(__name__)


def _group_summary(stats: pd.DataFrame, level: int) -> Dict[str, Any]:
    """Return mean/std/count for one treatment level, or a placeholder if the level is absent."""
    if level not in stats.index:
        return {'mean': np.nan, 'std': np.nan, 'count': 0}
    summary = stats.loc[level].to_dict()
    # Selecting a single row upcasts the integer count to float; cast it back
    # so it has the same type as the placeholder used for an absent group.
    summary['count'] = int(summary['count'])
    return summary


def run_dim_diagnostics(df: pd.DataFrame, treatment: str, outcome: str) -> Dict[str, Any]:
    """
    Calculates basic descriptive statistics for treatment and control groups.

    The treatment column is expected to be coded 0 (control) / 1 (treated).
    Any exception is caught and reported through the returned dictionary
    rather than propagated.

    Args:
        df: Input DataFrame (should already be filtered for NaNs in treatment/outcome).
        treatment: Name of the binary treatment variable column.
        outcome: Name of the outcome variable column.

    Returns:
        Dictionary with:
          - "status": "Success", "Warning - Empty Group(s)" or "Failed".
          - "details": per-group stats under 'control_group_stats' and
            'treated_group_stats' (each with 'mean', 'std', 'count'), plus
            'variance_homogeneity_status' and, when it can be computed,
            'variance_ratio_control_div_treated'.
          - "error": present only when status is "Failed".
    """
    details: Dict[str, Any] = {}
    try:
        # Report a missing column as what it is, instead of letting the
        # groupby KeyError be mistaken for a treatment-coding problem.
        missing = [col for col in (treatment, outcome) if col not in df.columns]
        if missing:
            message = f"Column(s) not found in DataFrame: {missing}"
            logger.error("Error running Difference in Means diagnostics: %s", message)
            return {"status": "Failed", "error": message, "details": details}

        stats = df.groupby(treatment)[outcome].agg(['mean', 'std', 'count'])

        # Ensure both groups (0 and 1) are represented, even if absent from the data.
        control_stats = _group_summary(stats, 0)
        treated_stats = _group_summary(stats, 1)

        details['control_group_stats'] = control_stats
        details['treated_group_stats'] = treated_stats

        if control_stats['count'] == 0 or treated_stats['count'] == 0:
            logger.warning("One or both treatment groups have zero observations.")
            return {"status": "Warning - Empty Group(s)", "details": details}

        # Simple check for variance difference (Levene's test could be added)
        control_std = control_stats['std']
        treated_std = treated_stats['std']
        if control_std > 0 and treated_std > 0:
            ratio = (control_std ** 2) / (treated_std ** 2)
            details['variance_ratio_control_div_treated'] = ratio
            if ratio > 4 or ratio < 0.25:  # Rule of thumb
                details['variance_homogeneity_status'] = "Potentially Unequal (ratio > 4 or < 0.25)"
            else:
                details['variance_homogeneity_status'] = "Likely Similar"
        elif pd.isna(control_std) or pd.isna(treated_std):
            # A NaN std (e.g. a group with a single observation) is "undefined",
            # which is not the same thing as zero variance.
            details['variance_homogeneity_status'] = (
                "Could not calculate (standard deviation undefined in a group, "
                "e.g. fewer than 2 observations)"
            )
        else:
            details['variance_homogeneity_status'] = "Could not calculate (zero variance in a group)"

        return {"status": "Success", "details": details}

    except KeyError as ke:
        logger.error("KeyError during diagnostics: %s. Treatment levels might not be 0/1.", ke)
        return {"status": "Failed", "error": f"Treatment levels might not be 0/1: {ke}", "details": details}
    except Exception as e:
        # Deliberate catch-all: diagnostics report failure via the result dict.
        logger.error("Error running Difference in Means diagnostics: %s", e)
        return {"status": "Failed", "error": str(e), "details": details}