Spaces:

Agents-MCP-Hackathon
/

Statistical-Analysis-MCP

Running

App Files Files Community

JG1310 committed 26 days ago

Commit

e09dd64

verified ·

1 Parent(s): 9d4ec19

Update app.py

Browse files

Files changed (1) hide show

app.py +553 -2

app.py CHANGED Viewed

@@ -851,6 +851,287 @@ def one_way_anova(
     except Exception as e:
         return {"error": f"Unexpected error in one-way ANOVA: {str(e)}"}
 def chi_square_test(
     dataframe: Optional[pd.DataFrame] = None,
     observed_str: Optional[str] = None,
@@ -1765,6 +2046,276 @@ def create_anova_tab():
             show_api=False
         )
 def create_chi_square_tab():
     """Create a complete chi-square goodness of fit test tab with all components and handlers."""
@@ -2125,9 +2676,9 @@ def create_t_test_interface():
                 description="**t-test between paired groups**"
             )
-            # Create paired t-test tab
-            anova_components = create_anova_tab()
             one_sample_components = create_one_sample_t_test_tab()
             chi_square_components = create_chi_square_tab()
             corr_components = create_correlation_tab()

     except Exception as e:
         return {"error": f"Unexpected error in one-way ANOVA: {str(e)}"}
+def multi_way_anova(
+    dataframe: Optional[pd.DataFrame] = None,
+    dependent_var: Optional[str] = None,
+    factors: Optional[str] = None,
+    alpha: float = 0.05,
+    effect_thresholds: str = "0.01,0.06,0.14",
+    include_interactions: bool = True,
+    max_interaction_order: Optional[int] = None,
+    sum_squares_type: int = 2
+) -> Dict[str, Any]:
+    """
+    Accepts multiple categorical factors and performs Multi-Way ANOVA to determine whether there are
+    statistically significant differences between group means when multiple factors are involved simultaneously.
+    Multi-way ANOVA extends the one-way ANOVA framework to handle complex experimental designs with multiple
+    categorical independent variables (factors), each with two or more levels. Unlike one-way ANOVA which tests
+    a single factor, multi-way ANOVA can simultaneously test: (1) main effects of each individual factor,
+    (2) interaction effects between factors, and (3) higher-order interactions. The test uses F-statistics to
+    compare variance between groups to variance within groups for each effect. Eta-squared (η²) measures effect
+    size as the proportion of total variance explained by each factor and interaction, with interpretation:
+    η² < 0.01 = negligible, 0.01-0.06 = small, 0.06-0.14 = medium, >0.14 = large (custom thresholds may be used).
+    EXAMPLE USE CASES: 2-way ANOVA for treatment × gender effects on blood pressure, 3-way ANOVA for teaching
+    method × school type × student age on test scores, 4-way ANOVA for drug × dose × gender × age effects on recovery.
+    Args:
+        dataframe (Optional[pd.DataFrame]): DataFrame containing the experimental data with factors as columns
+                                           and the dependent variable. All factors must be categorical.
+                                           If provided, dependent_var and factors parameters are required.
+        dependent_var (Optional[str]): Name of the dependent (outcome) variable column in the DataFrame.
+                                      Must be a continuous numeric variable.
+                                      Example: "test_score", "recovery_time", "blood_pressure"
+        factors (Optional[str]): Comma-separated string of factor column names from the DataFrame.
+                                Format: "factor1,factor2,factor3"
+                                Example: "treatment,gender,age_group" for a 3-way ANOVA
+                                Each factor must be categorical with 2 or more levels.
+        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
+                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
+        effect_thresholds (str): Three comma-separated values defining eta-squared effect size boundaries.
+                               Format: "small_threshold,medium_threshold,large_threshold"
+                               Default "0.01,0.06,0.14" means: <0.01=negligible, 0.01-0.06=small, 0.06-0.14=medium, >0.14=large
+                               These follow Cohen's conventions for eta-squared interpretation.
+        include_interactions (bool): Whether to include interaction terms in the model.
+                                   True (default): Tests main effects AND interactions
+                                   False: Tests only main effects (additive model)
+        max_interaction_order (Optional[int]): Maximum order of interactions to include in the model.
+                                             If None, includes all possible interactions up to the number of factors.
+                                             Example: For 4 factors, setting to 2 includes only 2-way interactions.
+                                             Useful for simplifying complex models with many factors.
+                                             Whole-number floats (e.g. 2.0) are accepted and converted to int;
+                                             values above the number of factors are clamped to it.
+        sum_squares_type (int): Type of sum of squares calculation for the ANOVA table.
+                              Type 1: Sequential (depends on order of factors)
+                              Type 2: Marginal (recommended for balanced designs, default)
+                              Type 3: Partial (recommended for unbalanced designs)
+    Returns:
+        dict: Comprehensive test results with the following keys:
+            - test_type (str): Description of the multi-way ANOVA performed (e.g., "3-way ANOVA with interactions")
+            - anova_table (pd.DataFrame): Complete ANOVA table with sum of squares, F-statistics, p-values, etc.
+            - significant_effects (List[str]): List of statistically significant main effects and interactions
+            - effect_sizes (Dict[str, float]): Eta-squared values for each effect measuring proportion of variance explained
+            - effect_interpretations (Dict[str, str]): Categorical interpretation of each effect size ("negligible", "small", "medium", "large")
+            - factor_summaries (Dict[str, dict]): Descriptive statistics for each factor level
+            - model_summary (dict): Overall model statistics (R², F-statistic, AIC, BIC, etc.)
+            - formula_used (str): The statsmodels formula string used for the analysis
+            - design_summary (dict): Information about the experimental design (balanced/unbalanced, sample sizes)
+            - alpha (float): Echo of significance level used
+            - factors_analyzed (List[str]): Echo of factors included in the analysis
+            - sum_squares_type (int): Echo of sum of squares type used
+            - effect_thresholds (List[float]): Echo of effect size thresholds used
+        On any validation or computation failure the dict instead contains a single "error" key with a message.
+        The 'Residual' row and, when present, the 'Intercept' row of the ANOVA table are never reported as
+        effects: they are excluded from significant_effects, effect_sizes and effect_interpretations, and the
+        'Intercept' row is also left out of the total sum of squares used as the eta-squared denominator.
+    """
+    try:
+        # Parse effect size thresholds; only parsing failures are treated as a format error
+        try:
+            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
+        except (AttributeError, ValueError):
+            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.01,0.06,0.14')"}
+        if len(thresholds) != 3:
+            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}
+        # Validate inputs
+        if dataframe is None or dataframe.empty:
+            return {"error": "DataFrame cannot be None or empty"}
+        if not dependent_var:
+            return {"error": "Dependent variable name is required"}
+        if dependent_var not in dataframe.columns:
+            return {"error": f"Dependent variable '{dependent_var}' not found in DataFrame columns"}
+        if not factors:
+            return {"error": "Factor names are required. Provide as comma-separated string (e.g., 'factor1,factor2,factor3')"}
+        # Parse factors
+        try:
+            factor_list = [f.strip() for f in factors.split(',') if f.strip()]
+        except AttributeError:
+            return {"error": "Invalid factors format. Use comma-separated factor names (e.g., 'treatment,gender,age_group')"}
+        if len(factor_list) < 2:
+            return {"error": "At least 2 factors are required for multi-way ANOVA"}
+        n_factors = len(factor_list)
+        # Check factors exist in DataFrame
+        missing_factors = [f for f in factor_list if f not in dataframe.columns]
+        if missing_factors:
+            return {"error": f"Factors not found in DataFrame: {missing_factors}"}
+        # Validate sum of squares type
+        if sum_squares_type not in [1, 2, 3]:
+            return {"error": "sum_squares_type must be 1, 2, or 3"}
+        # The value may arrive as a float (e.g. 2.0 from a numeric UI field); range() below requires an int
+        if max_interaction_order is not None:
+            try:
+                max_interaction_order = int(max_interaction_order)
+            except (TypeError, ValueError, OverflowError):
+                return {"error": "max_interaction_order must be a whole number"}
+        # Clean and prepare the data
+        analysis_columns = [dependent_var] + factor_list
+        analysis_df = dataframe[analysis_columns].copy()
+        # Remove rows with missing values
+        initial_rows = len(analysis_df)
+        analysis_df = analysis_df.dropna()
+        final_rows = len(analysis_df)
+        if final_rows < initial_rows * 0.5:
+            return {"error": f"Too much missing data: only {final_rows} out of {initial_rows} rows usable"}
+        if final_rows < 20:
+            return {"error": f"Insufficient data after removing missing values: {final_rows} rows remaining (minimum 20 required)"}
+        # Validate dependent variable is numeric
+        try:
+            analysis_df[dependent_var] = pd.to_numeric(analysis_df[dependent_var])
+        except (ValueError, TypeError):
+            return {"error": f"Dependent variable '{dependent_var}' must be numeric"}
+        # Ensure factors are categorical and check levels
+        factor_level_counts = {}
+        for factor in factor_list:
+            analysis_df[factor] = analysis_df[factor].astype('category')
+            unique_levels = len(analysis_df[factor].cat.categories)
+            factor_level_counts[factor] = unique_levels
+            if unique_levels < 2:
+                return {"error": f"Factor '{factor}' must have at least 2 levels. Found {unique_levels} level(s)"}
+            if unique_levels > 20:
+                return {"error": f"Factor '{factor}' has too many levels ({unique_levels}). Consider combining levels or using a different analysis method"}
+        # Check for sufficient observations per factor combination
+        try:
+            # observed=False keeps unobserved level combinations in the result with a count of 0,
+            # independent of the pandas version's default for categorical group keys
+            cell_counts = analysis_df.groupby(factor_list, observed=False).size()
+            min_cell_size = cell_counts.min()
+            empty_cells = int((cell_counts == 0).sum())
+            # Empty cells are reported first; otherwise the generic "< 2" check would always mask them
+            if empty_cells > 0:
+                return {"error": f"Missing data: {empty_cells} factor combinations have no observations"}
+            if min_cell_size < 2:
+                return {"error": f"Some factor combinations have fewer than 2 observations. Minimum cell size: {min_cell_size}"}
+        except Exception as e:
+            return {"error": f"Error checking experimental design: {str(e)}"}
+        # Build formula components; main effects are always included
+        formula_terms = [f"C({factor})" for factor in factor_list]
+        # Highest interaction order that actually enters the model (1 = main effects only)
+        effective_order = 1
+        if include_interactions:
+            requested_order = max_interaction_order if max_interaction_order is not None else n_factors
+            effective_order = min(requested_order, n_factors)  # Don't exceed number of factors
+            # Generate all interaction combinations
+            for order in range(2, effective_order + 1):
+                for combination in itertools.combinations(factor_list, order):
+                    formula_terms.append(":".join(f"C({factor})" for factor in combination))
+        # Build the complete formula
+        # NOTE(review): column names are interpolated verbatim, so names that are not valid identifiers
+        # (spaces, leading digits, ...) are expected to fail at model fitting - confirm and quote if needed
+        formula = f"{dependent_var} ~ " + " + ".join(formula_terms)
+        # Fit the model
+        try:
+            model = ols(formula, data=analysis_df).fit()
+        except Exception as e:
+            return {"error": f"Model fitting failed: {str(e)}. This may indicate perfect multicollinearity or insufficient data variation"}
+        # Generate ANOVA table
+        try:
+            anova_table = sm.stats.anova_lm(model, typ=sum_squares_type)
+        except Exception as e:
+            return {"error": f"ANOVA table generation failed: {str(e)}"}
+        # Rows that are real effects: everything except the residual and (Type 3 only) the intercept
+        effect_rows = anova_table.drop(index=['Residual', 'Intercept'], errors='ignore')
+        # The intercept's sum of squares is not part of the variance being partitioned
+        total_ss = anova_table.drop(index=['Intercept'], errors='ignore')['sum_sq'].sum()
+        # Calculate effect sizes (eta-squared) and interpret them
+        small_threshold, medium_threshold, large_threshold = thresholds
+        effect_sizes = {}
+        effect_interpretations = {}
+        for index, row in effect_rows.iterrows():
+            eta_squared = float(row['sum_sq'] / total_ss)
+            effect_sizes[index] = eta_squared
+            if eta_squared < small_threshold:
+                effect_interpretations[index] = "negligible"
+            elif eta_squared < medium_threshold:
+                effect_interpretations[index] = "small"
+            elif eta_squared < large_threshold:
+                effect_interpretations[index] = "medium"
+            else:
+                effect_interpretations[index] = "large"
+        # Identify significant effects
+        significant_effects = [
+            index for index, row in effect_rows.iterrows() if row['PR(>F)'] < alpha
+        ]
+        # Calculate factor summaries
+        factor_summaries = {}
+        for factor in factor_list:
+            factor_stats = analysis_df.groupby(factor, observed=False)[dependent_var].agg(['mean', 'std', 'count']).round(4)
+            factor_summaries[factor] = factor_stats.to_dict('index')
+        # Model summary statistics
+        model_summary = {
+            "r_squared": model.rsquared,
+            "adj_r_squared": model.rsquared_adj,
+            "f_statistic": model.fvalue,
+            "f_pvalue": model.f_pvalue,
+            "aic": model.aic,
+            "bic": model.bic,
+            "df_model": model.df_model,
+            "df_resid": model.df_resid,
+            "n_observations": int(model.nobs),
+            "mse_resid": model.mse_resid
+        }
+        # Design summary (numpy scalars converted to plain Python numbers)
+        balanced = len(cell_counts.unique()) == 1  # All cells have same count
+        design_summary = {
+            "n_factors": n_factors,
+            "factor_levels": factor_level_counts,
+            "total_possible_combinations": int(np.prod(list(factor_level_counts.values()))),
+            "observed_combinations": len(cell_counts),
+            "is_balanced": balanced,
+            "min_cell_size": int(min_cell_size),
+            "max_cell_size": int(cell_counts.max()),
+            "mean_cell_size": round(float(cell_counts.mean()), 2)
+        }
+        # Describe the model that was actually fitted, using the clamped interaction order
+        test_description = f"{n_factors}-way ANOVA"
+        if include_interactions and effective_order >= 2:
+            test_description += f" with interactions (up to {effective_order}-way)"
+        else:
+            test_description += " (main effects only)"
+        return {
+            "test_type": test_description,
+            "anova_table": anova_table,
+            "significant_effects": significant_effects,
+            "effect_sizes": effect_sizes,
+            "effect_interpretations": effect_interpretations,
+            "factor_summaries": factor_summaries,
+            "model_summary": model_summary,
+            "formula_used": formula,
+            "design_summary": design_summary,
+            "alpha": alpha,
+            "factors_analyzed": factor_list,
+            "sum_squares_type": sum_squares_type,
+            "effect_thresholds": thresholds
+        }
+    except Exception as e:
+        return {"error": f"Unexpected error in multi-way ANOVA: {str(e)}"}
 def chi_square_test(
     dataframe: Optional[pd.DataFrame] = None,
     observed_str: Optional[str] = None,
             show_api=False
         )
+def create_multi_way_anova_tab():
+    """Create a complete multi-way ANOVA tab with all components and handlers.
+    Must be called inside an open Gradio tab container. Builds the file-upload inputs, variable and
+    option widgets, result display, and wires the load / run / clear / example events. The uploaded
+    (or example) DataFrame is held in a gr.State and passed to multi_way_anova when the run button
+    is clicked. Returns None.
+    """
+    with gr.TabItem("Multi-Way ANOVA"):
+        gr.Markdown("**Compare means across multiple categorical factors simultaneously**")
+        # Input method selector
+        input_method = gr.Radio(
+            choices=["File Upload"],
+            value="File Upload",
+            label="Input Method",
+            info="Multi-way ANOVA requires structured data - file upload recommended"
+        )
+        # File upload input section
+        with gr.Group(visible=True) as file_section:
+            gr.Markdown("### File Upload")
+            gr.Markdown("*Upload CSV or Excel file with dependent variable and multiple categorical factors*")
+            with gr.Row():
+                file_upload = gr.File(
+                    label="Upload CSV/Excel File",
+                    file_types=[".csv", ".xlsx", ".xls"],
+                    type="filepath"
+                )
+                has_header = gr.Checkbox(
+                    label="File has header row",
+                    value=True,
+                    info="Check if first row contains column names"
+                )
+            # Display loaded data preview
+            data_preview = gr.Dataframe(
+                label="Data Preview",
+                interactive=False,
+                row_count=10
+            )
+        # Variable specification
+        gr.Markdown("### Variable Specification")
+        with gr.Row():
+            dependent_var = gr.Dropdown(
+                label="Dependent Variable",
+                info="Select the continuous outcome variable",
+                interactive=True
+            )
+            factors = gr.Textbox(
+                label="Factors (comma-separated)",
+                placeholder="treatment,gender,age_group",
+                info="Enter factor column names separated by commas",
+                lines=2
+            )
+        # Advanced options
+        gr.Markdown("### Analysis Options")
+        with gr.Row():
+            include_interactions = gr.Checkbox(
+                label="Include Interactions",
+                value=True,
+                info="Test for interaction effects between factors"
+            )
+            max_interaction_order = gr.Number(
+                label="Max Interaction Order",
+                value=None,
+                minimum=2,
+                step=1,
+                info="Maximum interaction order (leave empty for all interactions)"
+            )
+        with gr.Row():
+            sum_squares_type = gr.Dropdown(
+                choices=[1, 2, 3],
+                value=2,
+                label="Sum of Squares Type",
+                info="Type 2 for balanced, Type 3 for unbalanced designs"
+            )
+            alpha = gr.Number(
+                value=0.05,
+                minimum=0,
+                maximum=1,
+                step=0.01,
+                label="Significance Level (α)",
+                info="Probability threshold for statistical significance"
+            )
+        with gr.Row():
+            effect_thresholds = gr.Textbox(
+                value="0.01,0.06,0.14",
+                label="Effect Size Thresholds",
+                info="Eta-squared boundaries: small,medium,large"
+            )
+        # Action buttons
+        with gr.Row():
+            run_button = gr.Button("Run Multi-Way ANOVA", variant="primary", scale=1)
+            clear_button = gr.Button("Clear All", variant="secondary", scale=1)
+        # Output display
+        output = gr.JSON(label="Multi-Way ANOVA Results")
+        # Information section
+        with gr.Accordion("Multi-Way ANOVA Information", open=False):
+            gr.Markdown("""
+            ### What is Multi-Way ANOVA?
+            Multi-way ANOVA extends one-way ANOVA to handle multiple categorical factors simultaneously:
+            **Main Effects**: How each factor independently affects the outcome
+            **Interaction Effects**: How factors work together (non-additively)
+            ### Example Designs:
+            - **2-way**: Treatment (A,B,C) × Gender (Male,Female) → 6 combinations
+            - **3-way**: Drug (A,B) × Dose (Low,High) × Age (Young,Old) → 8 combinations
+            - **4-way**: Method (A,B) × School (Public,Private) × Gender (M,F) × Grade (1st,2nd) → 16 combinations
+            ### Requirements:
+            - All factors must be categorical (not continuous)
+            - Dependent variable must be continuous
+            - At least 2 observations per factor combination
+            - Independence, normality, and equal variances assumptions
+            """)
+        # Example data section
+        with gr.Row():
+            gr.Markdown("### Quick Examples")
+            example_button = gr.Button("Load Example Data", variant="outline")
+        # State management: holds the currently loaded DataFrame between events
+        loaded_dataframe = gr.State(value=None)
+        # Helper function to load and preview file data
+        def load_multi_way_file(file_path, has_header_flag):
+            """Read the uploaded CSV/Excel file.
+            Returns (dataframe, preview, dropdown update). The third item refreshes the dependent
+            variable dropdown's choices with the file's column names and clears its selection; a
+            plain list there would be interpreted as the dropdown's value, not its choices.
+            On failure the dataframe is None and the preview carries a one-cell error table.
+            """
+            no_columns = gr.update(choices=[], value=None)
+            if file_path is None:
+                return None, None, no_columns
+            try:
+                # Determine header parameter
+                header_param = 0 if has_header_flag else None
+                # Compare extensions case-insensitively so e.g. "DATA.CSV" is accepted
+                lowered_path = file_path.lower()
+                if lowered_path.endswith('.csv'):
+                    df = pd.read_csv(file_path, header=header_param)
+                elif lowered_path.endswith(('.xlsx', '.xls')):
+                    df = pd.read_excel(file_path, header=header_param)
+                else:
+                    return None, pd.DataFrame({'Error': ['Unsupported file format']}), no_columns
+                # Set column names if no header
+                if not has_header_flag:
+                    df.columns = [f'Column_{i+1}' for i in range(len(df.columns))]
+                # Get column options for dropdown
+                column_options = list(df.columns)
+                # Return dataframe, preview, and column options
+                preview = df.head(15)
+                return df, preview, gr.update(choices=column_options, value=None)
+            except Exception as e:
+                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
+                return None, error_df, no_columns
+        # Clear form function
+        def clear_multi_way():
+            """Return reset values for every output of the clear handler, in the handler's output order."""
+            return (
+                None,                                   # loaded_dataframe
+                None,                                   # data_preview
+                gr.update(choices=[], value=None),      # dependent_var choices and value
+                "",                                     # factors
+                True,                                   # include_interactions
+                None,                                   # max_interaction_order
+                2,                                      # sum_squares_type
+                0.05,                                   # alpha
+                "0.01,0.06,0.14",                       # effect_thresholds
+                {}                                      # output
+            )
+        # Example data function
+        def load_multi_way_example():
+            """Build a synthetic balanced 3-way design (3 treatments x 2 genders x 2 ages, 15 rows per cell).
+            Returns (dataframe, preview, dropdown update selecting 'test_score', factors string).
+            Note: seeds NumPy's global random generator so the example is reproducible.
+            """
+            # Create example 3-way ANOVA data
+            np.random.seed(42)
+            treatments = ['Control', 'Treatment_A', 'Treatment_B']
+            genders = ['Male', 'Female']
+            ages = ['Young', 'Old']
+            data = []
+            for treatment in treatments:
+                for gender in genders:
+                    for age in ages:
+                        # Generate scores with different effects
+                        base_score = 50
+                        treatment_effect = {'Control': 0, 'Treatment_A': 8, 'Treatment_B': 12}[treatment]
+                        gender_effect = {'Male': 3, 'Female': -3}[gender]
+                        age_effect = {'Young': 5, 'Old': -5}[age]
+                        # Add interaction: Treatment_B works better for older patients
+                        interaction_effect = 0
+                        if treatment == 'Treatment_B' and age == 'Old':
+                            interaction_effect = 6
+                        n_per_cell = 15
+                        mean_score = base_score + treatment_effect + gender_effect + age_effect + interaction_effect
+                        scores = np.random.normal(mean_score, 6, n_per_cell)
+                        for score in scores:
+                            data.append({
+                                'test_score': round(score, 2),
+                                'treatment': treatment,
+                                'gender': gender,
+                                'age_group': age
+                            })
+            df = pd.DataFrame(data)
+            preview = df.head(15)
+            column_options = list(df.columns)
+            return (
+                df,
+                preview,
+                gr.update(choices=column_options, value='test_score'),
+                'treatment,gender,age_group'
+            )
+        # EVENT HANDLERS
+        # File upload handlers: reload whenever the file or the header flag changes
+        file_upload.change(
+            fn=load_multi_way_file,
+            inputs=[file_upload, has_header],
+            outputs=[loaded_dataframe, data_preview, dependent_var],
+            show_api=False
+        )
+        has_header.change(
+            fn=load_multi_way_file,
+            inputs=[file_upload, has_header],
+            outputs=[loaded_dataframe, data_preview, dependent_var],
+            show_api=False
+        )
+        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
+        run_button.click(
+            fn=multi_way_anova,
+            inputs=[
+                loaded_dataframe,       # dataframe
+                dependent_var,          # dependent_var
+                factors,                # factors
+                alpha,                  # alpha
+                effect_thresholds,      # effect_thresholds
+                include_interactions,   # include_interactions
+                max_interaction_order,  # max_interaction_order
+                sum_squares_type        # sum_squares_type
+            ],
+            outputs=output
+        )
+        # Clear form handler; each component appears once, the dropdown is reset via a single update
+        clear_button.click(
+            fn=clear_multi_way,
+            outputs=[
+                loaded_dataframe, data_preview, dependent_var,
+                factors, include_interactions, max_interaction_order,
+                sum_squares_type, alpha, effect_thresholds, output
+            ],
+            show_api=False
+        )
+        # Example data handler
+        example_button.click(
+            fn=load_multi_way_example,
+            outputs=[loaded_dataframe, data_preview, dependent_var, factors],
+            show_api=False
+        )
 def create_chi_square_tab():
     """Create a complete chi-square goodness of fit test tab with all components and handlers."""
                 description="**t-test between paired groups**"
             )
             one_sample_components = create_one_sample_t_test_tab()
+            anova_components = create_anova_tab()
+            manova_components = create_multi_way_anova_tab()
             chi_square_components = create_chi_square_tab()
             corr_components = create_correlation_tab()