import gradio as gr
import numpy as np
from scipy import stats
from typing import List, Dict, Any, Union, Tuple
import json

def independent_t_test(group1: str, group2: str, equal_var: bool = True, alternative: str = "two-sided") -> Dict[str, Any]:
    """
    Perform an independent samples t-test between two groups.
    
    Args:
        group1 (str): Comma-separated values for group 1 (e.g., "1.2,2.3,3.4,2.1")
        group2 (str): Comma-separated values for group 2 (e.g., "2.1,3.2,4.1,3.5")
        equal_var (bool): If True, perform standard t-test assuming equal variances. If False, perform Welch's t-test
        alternative (str): Alternative hypothesis - 'two-sided', 'less', or 'greater'
    
    Returns:
        dict: Test results including t-statistic, p-value, degrees of freedom, and interpretation
    """
    try:
        # Parse input data
        data1 = [float(x.strip()) for x in group1.split(',') if x.strip()]
        data2 = [float(x.strip()) for x in group2.split(',') if x.strip()]
        
        if len(data1) < 2 or len(data2) < 2:
            return {"error": "Each group must have at least 2 observations"}
        
        # Perform t-test
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=equal_var, alternative=alternative)
        
        # Calculate descriptive statistics
        desc1 = {"mean": np.mean(data1), "std": np.std(data1, ddof=1), "n": len(data1)}
        desc2 = {"mean": np.mean(data2), "std": np.std(data2, ddof=1), "n": len(data2)}
        
        # Degrees of freedom
        if equal_var:
            df = len(data1) + len(data2) - 2
        else:
            # Welch's formula for unequal variances
            s1_sq, s2_sq = desc1["std"]**2, desc2["std"]**2
            n1, n2 = desc1["n"], desc2["n"]
            df = (s1_sq/n1 + s2_sq/n2)**2 / ((s1_sq/n1)**2/(n1-1) + (s2_sq/n2)**2/(n2-1))
        
        # Effect size (Cohen's d)
        pooled_std = np.sqrt(((len(data1)-1)*desc1["std"]**2 + (len(data2)-1)*desc2["std"]**2) / (len(data1)+len(data2)-2))
        cohens_d = (desc1["mean"] - desc2["mean"]) / pooled_std
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        effect_size_interp = "small" if abs(cohens_d) < 0.5 else "medium" if abs(cohens_d) < 0.8 else "large"
        
        return {
            "test_type": f"Independent t-test ({'equal variances' if equal_var else 'unequal variances'})",
            "t_statistic": round(t_stat, 4),
            "p_value": round(p_value, 6),
            "degrees_of_freedom": round(df, 2),
            "cohens_d": round(cohens_d, 4),
            "group1_stats": desc1,
            "group2_stats": desc2,
            "result": f"The difference between groups is {significance} (p = {p_value:.6f})",
            "effect_size": f"Effect size (Cohen's d = {cohens_d:.4f}) is {effect_size_interp}",
            "alternative_hypothesis": alternative
        }
    except Exception as e:
        return {"error": f"Error performing t-test: {str(e)}"}

def paired_t_test(before: str, after: str, alternative: str = "two-sided") -> Dict[str, Any]:
    """
    Perform a paired samples t-test.
    
    Args:
        before (str): Comma-separated values for before condition
        after (str): Comma-separated values for after condition  
        alternative (str): Alternative hypothesis - 'two-sided', 'less', or 'greater'
    
    Returns:
        dict: Test results including t-statistic, p-value, and interpretation
    """
    try:
        # Parse input data
        data_before = [float(x.strip()) for x in before.split(',') if x.strip()]
        data_after = [float(x.strip()) for x in after.split(',') if x.strip()]
        
        if len(data_before) != len(data_after):
            return {"error": "Before and after groups must have the same number of observations"}
        
        if len(data_before) < 2:
            return {"error": "Need at least 2 paired observations"}
        
        # Perform paired t-test
        t_stat, p_value = stats.ttest_rel(data_before, data_after, alternative=alternative)
        
        # Calculate differences and descriptive statistics
        differences = np.array(data_after) - np.array(data_before)
        mean_diff = np.mean(differences)
        std_diff = np.std(differences, ddof=1)
        
        # Effect size (Cohen's d for paired samples)
        cohens_d = mean_diff / std_diff
        
        # Degrees of freedom
        df = len(data_before) - 1
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        effect_size_interp = "small" if abs(cohens_d) < 0.5 else "medium" if abs(cohens_d) < 0.8 else "large"
        
        return {
            "test_type": "Paired t-test",
            "t_statistic": round(t_stat, 4),
            "p_value": round(p_value, 6),
            "degrees_of_freedom": df,
            "mean_difference": round(mean_diff, 4),
            "std_difference": round(std_diff, 4),
            "cohens_d": round(cohens_d, 4),
            "before_mean": round(np.mean(data_before), 4),
            "after_mean": round(np.mean(data_after), 4),
            "result": f"The paired difference is {significance} (p = {p_value:.6f})",
            "effect_size": f"Effect size (Cohen's d = {cohens_d:.4f}) is {effect_size_interp}",
            "alternative_hypothesis": alternative
        }
    except Exception as e:
        return {"error": f"Error performing paired t-test: {str(e)}"}

def one_sample_t_test(sample: str, population_mean: float, alternative: str = "two-sided") -> Dict[str, Any]:
    """
    Perform a one-sample t-test against a population mean.
    
    Args:
        sample (str): Comma-separated sample values
        population_mean (float): Hypothesized population mean
        alternative (str): Alternative hypothesis - 'two-sided', 'less', or 'greater'
    
    Returns:
        dict: Test results including t-statistic, p-value, and interpretation
    """
    try:
        # Parse input data
        data = [float(x.strip()) for x in sample.split(',') if x.strip()]
        
        if len(data) < 2:
            return {"error": "Sample must have at least 2 observations"}
        
        # Perform one-sample t-test
        t_stat, p_value = stats.ttest_1samp(data, population_mean, alternative=alternative)
        
        # Calculate descriptive statistics
        sample_mean = np.mean(data)
        sample_std = np.std(data, ddof=1)
        sample_size = len(data)
        
        # Effect size (Cohen's d)
        cohens_d = (sample_mean - population_mean) / sample_std
        
        # Degrees of freedom
        df = sample_size - 1
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        effect_size_interp = "small" if abs(cohens_d) < 0.5 else "medium" if abs(cohens_d) < 0.8 else "large"
        
        return {
            "test_type": "One-sample t-test",
            "t_statistic": round(t_stat, 4),
            "p_value": round(p_value, 6),
            "degrees_of_freedom": df,
            "sample_mean": round(sample_mean, 4),
            "population_mean": population_mean,
            "sample_std": round(sample_std, 4),
            "sample_size": sample_size,
            "cohens_d": round(cohens_d, 4),
            "result": f"Sample mean differs {significance}ly from population mean (p = {p_value:.6f})",
            "effect_size": f"Effect size (Cohen's d = {cohens_d:.4f}) is {effect_size_interp}",
            "alternative_hypothesis": alternative
        }
    except Exception as e:
        return {"error": f"Error performing one-sample t-test: {str(e)}"}

def one_way_anova(*groups: str) -> Dict[str, Any]:
    """
    Perform a one-way ANOVA test.
    
    Args:
        *groups: Variable number of comma-separated group values (minimum 2 groups)
    
    Returns:
        dict: ANOVA results including F-statistic, p-value, and interpretation
    """
    try:
        # Parse input data
        parsed_groups = []
        for i, group in enumerate(groups):
            if not group.strip():
                continue
            data = [float(x.strip()) for x in group.split(',') if x.strip()]
            if len(data) < 2:
                return {"error": f"Group {i+1} must have at least 2 observations"}
            parsed_groups.append(data)
        
        if len(parsed_groups) < 2:
            return {"error": "Need at least 2 groups for ANOVA"}
        
        # Perform one-way ANOVA
        f_stat, p_value = stats.f_oneway(*parsed_groups)
        
        # Calculate descriptive statistics for each group
        group_stats = []
        overall_data = []
        for i, group in enumerate(parsed_groups):
            group_stats.append({
                "group": i+1,
                "n": len(group),
                "mean": round(np.mean(group), 4),
                "std": round(np.std(group, ddof=1), 4)
            })
            overall_data.extend(group)
        
        # Calculate effect size (eta-squared)
        # SS_between / SS_total
        overall_mean = np.mean(overall_data)
        ss_total = sum((x - overall_mean)**2 for x in overall_data)
        ss_between = sum(len(group) * (np.mean(group) - overall_mean)**2 for group in parsed_groups)
        eta_squared = ss_between / ss_total if ss_total > 0 else 0
        
        # Degrees of freedom
        df_between = len(parsed_groups) - 1
        df_within = len(overall_data) - len(parsed_groups)
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        effect_size_interp = "small" if eta_squared < 0.06 else "medium" if eta_squared < 0.14 else "large"
        
        return {
            "test_type": "One-way ANOVA",
            "f_statistic": round(f_stat, 4),
            "p_value": round(p_value, 6),
            "df_between": df_between,
            "df_within": df_within,
            "eta_squared": round(eta_squared, 4),
            "group_statistics": group_stats,
            "result": f"Group differences are {significance} (p = {p_value:.6f})",
            "effect_size": f"Effect size (η² = {eta_squared:.4f}) is {effect_size_interp}",
            "note": "If significant, consider post-hoc tests to identify specific group differences"
        }
    except Exception as e:
        return {"error": f"Error performing ANOVA: {str(e)}"}

def chi_square_test(observed: str, expected: str = None) -> Dict[str, Any]:
    """
    Perform a chi-square goodness of fit test.
    
    Args:
        observed (str): Comma-separated observed frequencies
        expected (str): Comma-separated expected frequencies (optional, defaults to equal distribution)
    
    Returns:
        dict: Chi-square test results
    """
    try:
        # Parse observed frequencies
        obs_data = [float(x.strip()) for x in observed.split(',') if x.strip()]
        
        # Parse expected frequencies or create equal distribution
        if expected and expected.strip():
            exp_data = [float(x.strip()) for x in expected.split(',') if x.strip()]
            if len(obs_data) != len(exp_data):
                return {"error": "Observed and expected must have the same number of categories"}
        else:
            # Equal distribution
            total = sum(obs_data)
            exp_data = [total / len(obs_data)] * len(obs_data)
        
        # Perform chi-square test
        chi2_stat, p_value = stats.chisquare(obs_data, exp_data)
        
        # Degrees of freedom
        df = len(obs_data) - 1
        
        # Effect size (Cramér's V for goodness of fit)
        n = sum(obs_data)
        cramers_v = np.sqrt(chi2_stat / (n * (len(obs_data) - 1)))
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        effect_size_interp = "small" if cramers_v < 0.3 else "medium" if cramers_v < 0.5 else "large"
        
        return {
            "test_type": "Chi-square goodness of fit test",
            "chi_square_statistic": round(chi2_stat, 4),
            "p_value": round(p_value, 6),
            "degrees_of_freedom": df,
            "cramers_v": round(cramers_v, 4),
            "observed_frequencies": obs_data,
            "expected_frequencies": [round(x, 2) for x in exp_data],
            "result": f"Observed frequencies differ {significance}ly from expected (p = {p_value:.6f})",
            "effect_size": f"Effect size (Cramér's V = {cramers_v:.4f}) is {effect_size_interp}"
        }
    except Exception as e:
        return {"error": f"Error performing chi-square test: {str(e)}"}

def correlation_test(x_values: str, y_values: str, method: str = "pearson") -> Dict[str, Any]:
    """
    Perform correlation analysis between two variables.
    
    Args:
        x_values (str): Comma-separated X variable values
        y_values (str): Comma-separated Y variable values  
        method (str): Correlation method - 'pearson', 'spearman', or 'kendall'
    
    Returns:
        dict: Correlation results including coefficient and p-value
    """
    try:
        # Parse input data
        x_data = [float(x.strip()) for x in x_values.split(',') if x.strip()]
        y_data = [float(y.strip()) for y in y_values.split(',') if y.strip()]
        
        if len(x_data) != len(y_data):
            return {"error": "X and Y variables must have the same number of observations"}
        
        if len(x_data) < 3:
            return {"error": "Need at least 3 observations for correlation"}
        
        # Perform correlation test
        if method.lower() == "pearson":
            corr_coef, p_value = stats.pearsonr(x_data, y_data)
            test_name = "Pearson correlation"
        elif method.lower() == "spearman":
            corr_coef, p_value = stats.spearmanr(x_data, y_data)
            test_name = "Spearman rank correlation"
        elif method.lower() == "kendall":
            corr_coef, p_value = stats.kendalltau(x_data, y_data)
            test_name = "Kendall's tau correlation"
        else:
            return {"error": "Method must be 'pearson', 'spearman', or 'kendall'"}
        
        # Interpretation
        significance = "significant" if p_value < 0.05 else "not significant"
        
        # Correlation strength interpretation
        abs_corr = abs(corr_coef)
        if abs_corr < 0.3:
            strength = "weak"
        elif abs_corr < 0.7:
            strength = "moderate"
        else:
            strength = "strong"
        
        direction = "positive" if corr_coef > 0 else "negative"
        
        return {
            "test_type": test_name,
            "correlation_coefficient": round(corr_coef, 4),
            "p_value": round(p_value, 6),
            "sample_size": len(x_data),
            "result": f"The correlation is {significance} (p = {p_value:.6f})",
            "interpretation": f"{strength.title()} {direction} correlation (r = {corr_coef:.4f})",
            "method": method.lower()
        }
    except Exception as e:
        return {"error": f"Error performing correlation test: {str(e)}"}

# Create Gradio interfaces for each function
demo = gr.TabbedInterface(
    [
        gr.Interface(
            fn=independent_t_test,
            inputs=[
                gr.Textbox(placeholder="1.2,2.3,3.4,2.1", label="Group 1 (comma-separated)"),
                gr.Textbox(placeholder="2.1,3.2,4.1,3.5", label="Group 2 (comma-separated)"),
                gr.Checkbox(value=True, label="Equal variances"),
                gr.Dropdown(["two-sided", "less", "greater"], value="two-sided", label="Alternative hypothesis")
            ],
            outputs=gr.JSON(),
            title="Independent T-Test",
            description="Compare means between two independent groups"
        ),
        gr.Interface(
            fn=paired_t_test,
            inputs=[
                gr.Textbox(placeholder="10,12,11,13", label="Before (comma-separated)"),
                gr.Textbox(placeholder="12,14,13,15", label="After (comma-separated)"),
                gr.Dropdown(["two-sided", "less", "greater"], value="two-sided", label="Alternative hypothesis")
            ],
            outputs=gr.JSON(),
            title="Paired T-Test",
            description="Compare paired/matched samples"
        ),
        gr.Interface(
            fn=one_sample_t_test,
            inputs=[
                gr.Textbox(placeholder="10,12,11,13,9", label="Sample (comma-separated)"),
                gr.Number(value=10, label="Population mean"),
                gr.Dropdown(["two-sided", "less", "greater"], value="two-sided", label="Alternative hypothesis")
            ],
            outputs=gr.JSON(),
            title="One-Sample T-Test",
            description="Test sample mean against population mean"
        ),
        gr.Interface(
            fn=one_way_anova,
            inputs=[
                gr.Textbox(placeholder="1,2,3,2", label="Group 1 (comma-separated)"),
                gr.Textbox(placeholder="4,5,6,5", label="Group 2 (comma-separated)"),
                gr.Textbox(placeholder="7,8,9,8", label="Group 3 (comma-separated)", info="Optional"),
                gr.Textbox(placeholder="", label="Group 4 (comma-separated)", info="Optional"),
                gr.Textbox(placeholder="", label="Group 5 (comma-separated)", info="Optional")
            ],
            outputs=gr.JSON(),
            title="One-Way ANOVA",
            description="Compare means across multiple groups"
        ),
        gr.Interface(
            fn=chi_square_test,
            inputs=[
                gr.Textbox(placeholder="10,20,15,25", label="Observed frequencies (comma-separated)"),
                gr.Textbox(placeholder="", label="Expected frequencies (optional, comma-separated)")
            ],
            outputs=gr.JSON(),
            title="Chi-Square Test",
            description="Test goodness of fit for categorical data"
        ),
        gr.Interface(
            fn=correlation_test,
            inputs=[
                gr.Textbox(placeholder="1,2,3,4,5", label="X values (comma-separated)"),
                gr.Textbox(placeholder="2,4,6,8,10", label="Y values (comma-separated)"),
                gr.Dropdown(["pearson", "spearman", "kendall"], value="pearson", label="Correlation method")
            ],
            outputs=gr.JSON(),
            title="Correlation Analysis",
            description="Test correlation between two variables"
        )
    ],
    tab_names=["Independent T-Test", "Paired T-Test", "One-Sample T-Test", "ANOVA", "Chi-Square", "Correlation"]
)

if __name__ == "__main__":
    print(f"Gradio version: {gr.__version__}")
    demo.launch(mcp_server=True)