# Hugging Face Space: nikhilsingh/monte-carlo-simulation (status: Sleeping)
# File size: 17,588 Bytes
# Revision: b75c41f

# ----------------------------------------------------------------------------
# Import necessary libraries
# ----------------------------------------------------------------------------
# pip install gradio numpy pandas matplotlib scipy transformers torch sentencepiece
# ----------------------------------------------------------------------------
import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from transformers import pipeline
import warnings
import os

# Silence library warnings so the console output stays readable.
warnings.filterwarnings("ignore")
# Switch Matplotlib to the non-interactive 'Agg' backend to avoid display
# issues in environments without a GUI.
plt.switch_backend('Agg')

# ----------------------------------------------------------------------------
# Global Variables and Initial Setup
# ----------------------------------------------------------------------------

# Hugging Face text-generation pipeline used to explain simulation results.
# A small model is used to keep the app responsive. The name stays bound to
# None when loading fails, which is how the rest of the file detects that
# explanations are disabled.
explanation_generator = None
try:
    explanation_generator = pipeline('text2text-generation', model='google/flan-t5-small')
except Exception as load_error:
    print(f"Could not load Hugging Face model. Explanations will be disabled. Error: {load_error}")

# Sample dataset for demonstration purposes: uncertain costs (in thousands
# of $) for the tasks of a project. It is written to disk at import time so
# the "example" data source can be read back like any uploaded CSV.
SAMPLE_CSV_PATH = 'sample_project_costs.csv'
sample_project_costs = pd.DataFrame(
    {'task_cost_thousands': [12, 15, 10, 13, 18, 9, 22, 14, 16, 11, 17, 20]}
)
sample_project_costs.to_csv(SAMPLE_CSV_PATH, index=False)


# ----------------------------------------------------------------------------
# Core Logic Functions
# ----------------------------------------------------------------------------

def process_input_data(file_obj, example_choice, manual_mean, manual_std):
    """
    Processes the user's input from the UI.
    It prioritizes input in the order: File Upload > Example Dataset > Manual Entry.
    It validates the data to ensure it's a single column of numbers with
    enough spread to simulate from.

    Args:
        file_obj: The uploaded file from gr.File. Either an object exposing
            a ``name`` attribute holding the path, or the path itself.
        example_choice (str): The name of the chosen example dataset.
        manual_mean (float): Manually entered mean.
        manual_std (float): Manually entered standard deviation.

    Returns:
        tuple: A tuple containing:
               - A pandas DataFrame with the processed data. For manual input
                 this is a one-row frame with 'mean' and 'std' columns.
               - A Matplotlib figure showing the data distribution.
               - A string with summary statistics.
               - A string with a validation message.
               On any validation failure the first three items are None and
               the fourth item describes the problem.
    """
    data = None
    source_info = ""

    # 1. Prioritize input source
    if file_obj is not None:
        # Accept either an object carrying the path in `.name` or a plain
        # path string, so the handler does not depend on how the upload is
        # delivered.
        csv_path = getattr(file_obj, "name", file_obj)
        try:
            data = pd.read_csv(csv_path)
            source_info = f"from uploaded file: {os.path.basename(csv_path)}"
        except Exception as e:
            return None, None, None, f"Error reading file: {e}. Please ensure it's a valid CSV."
    elif example_choice == "Project Cost Estimation":
        try:
            data = pd.read_csv(SAMPLE_CSV_PATH)
        except Exception as e:
            # The sample file is generated at import time; report a missing
            # or unreadable file instead of failing the whole request.
            return None, None, None, f"Error loading example dataset: {e}."
        source_info = "from the 'Project Cost Estimation' example"
    elif manual_mean is not None and manual_std is not None:
        # If manual input, we don't have raw data, just parameters.
        # We'll return these params to be used directly in the simulation.
        if not np.isfinite(manual_std) or manual_std <= 0:
            return None, None, None, "Manual Input Error: Standard Deviation must be positive."
        if not np.isfinite(manual_mean):
            return None, None, None, "Manual Input Error: Mean must be a finite number."

        stats_text = (f"Source: Manual Input\n"
                      f"Mean: {manual_mean:.2f}\n"
                      f"Standard Deviation: {manual_std:.2f}")
        # Create a dummy plot for manual input
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, 'Manual input:\nNo data to plot.\nSimulation will use\nthe provided Mean/Std.',
                ha='center', va='center', fontsize=12)
        ax.set_xticks([])
        ax.set_yticks([])
        # Lay out this figure explicitly rather than pyplot's "current" one.
        fig.tight_layout()

        # Use a special DataFrame to signal manual input downstream
        manual_df = pd.DataFrame({'mean': [manual_mean], 'std': [manual_std]})
        return manual_df, fig, stats_text, "Manual parameters accepted. Ready to run simulation."

    if data is None:
        return None, None, None, "No data source provided. Please upload a file, choose an example, or enter parameters."

    # 2. Validate data structure
    if data.shape[1] != 1 or not pd.api.types.is_numeric_dtype(data.iloc[:, 0]):
        error_msg = (f"Data Error: The data {source_info} is not compatible. "
                     "The app requires a CSV with a single column of numerical data. "
                     f"Detected {data.shape[1]} columns.")
        return None, None, None, error_msg

    # 3. Process valid data
    series = data.iloc[:, 0].dropna()

    # A sample standard deviation needs at least two observations; with
    # fewer, pandas returns NaN, which would slip past the `std == 0` check
    # below and poison the simulation.
    if len(series) < 2:
        return None, None, None, ("Data Error: The data must contain at least two numeric values "
                                  "to estimate a standard deviation.")

    mean = series.mean()
    std = series.std()

    if not np.isfinite(mean) or not np.isfinite(std):
        return None, None, None, "Data Error: The data contains non-finite values; cannot compute usable statistics."

    if std == 0:
        return None, None, None, "Data Error: All values are the same. Standard deviation is zero, cannot simulate uncertainty."

    # 4. Generate visualization and stats
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(series, bins='auto', density=True, alpha=0.7, label='Input Data Distribution')

    # Overlay a normal distribution curve across the histogram's x-range.
    # The limits are read from this Axes rather than via pyplot global state.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mean, std)
    ax.plot(x, p, 'k', linewidth=2, label='Fitted Normal Curve')

    ax.set_title("Distribution of Input Data")
    ax.set_xlabel(series.name)
    ax.set_ylabel("Density")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()

    stats_text = (f"Source: {source_info}\n"
                  f"Number of Data Points: {len(series)}\n"
                  f"Mean: {mean:.2f}\n"
                  f"Standard Deviation: {std:.2f}\n"
                  f"Min: {series.min():.2f}\n"
                  f"Max: {series.max():.2f}")

    validation_message = "Data loaded and validated successfully! Ready to run the simulation."

    return data, fig, stats_text, validation_message


def run_monte_carlo_simulation(data, num_simulations, target_value):
    """
    Performs the Monte Carlo simulation based on the processed data.

    Samples are drawn from a Normal distribution whose mean and standard
    deviation come either from a one-row frame with 'mean' and 'std'
    columns (manual input) or from the first column of `data`.

    Args:
        data (pd.DataFrame): The validated input data.
        num_simulations (int): The number of simulation iterations to run.
            Must be at least 2.
        target_value (float): A user-defined target to calculate probability
            against, or None to skip the probability analysis.

    Returns:
        tuple: A tuple containing:
               - A Matplotlib figure of the simulation results histogram.
               - A Matplotlib figure of the cumulative distribution (CDF).
               - A string containing detailed numerical results.
               When the inputs cannot be simulated, both figures are None
               and the string explains why.
    """
    # NOTE: every rejection message below contains the phrase
    # "Please process valid data", which generate_explanation() looks for to
    # recognise that no successful simulation has been run.
    if data is None or data.empty:
        return None, None, "Please process valid data before running the simulation."

    num_simulations = int(num_simulations)
    # The CDF below divides by (n - 1) and the percentile calls need
    # samples, so fewer than two iterations cannot be plotted.
    if num_simulations < 2:
        return None, None, "Please process valid data and choose at least 2 simulations before running the simulation."

    # Check if data is from manual input or from a file/example
    if 'mean' in data.columns and 'std' in data.columns and data.shape[0] == 1:
        mean = data['mean'].iloc[0]
        std = data['std'].iloc[0]
        data_name = "Value"  # Generic name for manual input
    else:
        series = data.iloc[:, 0]
        mean = series.mean()
        std = series.std()
        data_name = series.name

    # A NaN standard deviation (e.g. a single data point) or a non-positive
    # one would yield NaN samples or an exception from the sampler.
    if not np.isfinite(mean) or not np.isfinite(std) or std <= 0:
        return None, None, ("Please process valid data before running the simulation: "
                            "the input mean and standard deviation must be finite, "
                            "and the standard deviation must be positive.")

    # The core of the Monte Carlo simulation: generate random samples
    # We assume the underlying uncertainty follows a Normal Distribution
    # defined by the mean and standard deviation of the input data.
    simulation_results = np.random.normal(mean, std, num_simulations)

    # Key statistics shared by both plots and the text summary
    sim_mean = np.mean(simulation_results)
    p5, p50, p95 = np.percentile(simulation_results, [5, 50, 95])

    # NOTE(review): figures created through plt.subplots stay registered
    # with pyplot until they are closed; consider closing them once the UI
    # has rendered them if memory growth is observed.

    # --- Generate Results Histogram Plot ---
    fig_hist, ax_hist = plt.subplots(figsize=(8, 5))
    ax_hist.hist(simulation_results, bins=50, density=True, alpha=0.8, color='skyblue', edgecolor='black')

    # Add vertical lines for key statistics
    ax_hist.axvline(sim_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {sim_mean:.2f}')
    ax_hist.axvline(p5, color='green', linestyle=':', linewidth=2, label=f'5th Percentile (P5): {p5:.2f}')
    ax_hist.axvline(p95, color='green', linestyle=':', linewidth=2, label=f'95th Percentile (P95): {p95:.2f}')

    ax_hist.set_title(f'Monte Carlo Simulation Results ({num_simulations:,} Iterations)', fontsize=14)
    ax_hist.set_xlabel(f'Simulated {data_name}')
    ax_hist.set_ylabel('Probability Density')
    ax_hist.legend()
    ax_hist.grid(True, linestyle='--', alpha=0.6)
    # Lay out this specific figure instead of pyplot's "current" figure.
    fig_hist.tight_layout()

    # --- Generate Cumulative Distribution (CDF) Plot ---
    fig_cdf, ax_cdf = plt.subplots(figsize=(8, 5))
    sorted_results = np.sort(simulation_results)
    # i / (n - 1) places the smallest sample at 0 and the largest at 1.
    yvals = np.arange(len(sorted_results)) / float(len(sorted_results) - 1)
    ax_cdf.plot(sorted_results, yvals, label='CDF')

    # Add markers for P5, P50, P95
    ax_cdf.plot(p5, 0.05, 'go', ms=8, label=f'P5: {p5:.2f}')
    ax_cdf.plot(p50, 0.50, 'ro', ms=8, label=f'Median (P50): {p50:.2f}')
    ax_cdf.plot(p95, 0.95, 'go', ms=8, label=f'P95: {p95:.2f}')

    ax_cdf.set_title('Cumulative Distribution Function (CDF)', fontsize=14)
    ax_cdf.set_xlabel(f'Simulated {data_name}')
    ax_cdf.set_ylabel('Cumulative Probability')
    ax_cdf.grid(True, linestyle='--', alpha=0.6)
    ax_cdf.legend()
    fig_cdf.tight_layout()

    # --- Calculate Final Numerical Results ---
    results_summary = (
        f"Simulation Summary ({num_simulations:,} iterations):\n"
        f"--------------------------------------------------\n"
        f"Mean (Average Outcome): {sim_mean:.2f}\n"
        f"Standard Deviation: {np.std(simulation_results):.2f}\n\n"
        f"Percentiles (Confidence Range):\n"
        f"  - 5th Percentile (P5): {p5:.2f}\n"
        f"  - 50th Percentile (Median): {p50:.2f}\n"
        f"  - 95th Percentile (P95): {p95:.2f}\n"
        f"This means there is a 90% probability the outcome will be between {p5:.2f} and {p95:.2f}.\n\n"
    )
    if target_value is not None:
        # Share of simulated outcomes at or below the target, as a percent.
        prob_achieved = np.sum(simulation_results <= target_value) / num_simulations * 100
        results_summary += (
            f"Probability Analysis:\n"
            f"  - Probability of outcome being less than or equal to {target_value:.2f}: {prob_achieved:.2f}%\n"
        )

    return fig_hist, fig_cdf, results_summary


def generate_explanation(results_summary):
    """
    Asks the Hugging Face model for a plain-language reading of the results.

    Args:
        results_summary (str): The numerical summary from the simulation.

    Returns:
        str: The generated explanation, or a short message when the model is
             unavailable, no successful simulation has been run yet, or the
             generation call raised.
    """
    # Without a loaded model there is nothing to call.
    if explanation_generator is None:
        return "LLM model not loaded. Cannot generate explanation."

    # An empty summary, or the "no data" placeholder emitted by the
    # simulation step, means there are no results to explain.
    has_usable_summary = bool(results_summary) and "Please process valid data" not in results_summary
    if not has_usable_summary:
        return "Run a successful simulation first to generate an explanation."

    prompt = f"""
    Explain the following Monte Carlo simulation results to a non-technical manager.
    Focus on what the numbers mean in terms of risk and decision-making. Be concise and clear.

    Results:
    {results_summary}

    Explanation:
    """

    generation_options = {"max_length": 200, "num_beams": 3, "no_repeat_ngram_size": 2}
    try:
        outputs = explanation_generator(prompt, **generation_options)
        explanation = outputs[0]['generated_text']
    except Exception as err:
        return f"Error generating explanation: {err}"
    return explanation


# ----------------------------------------------------------------------------
# Gradio UI Layout
# ----------------------------------------------------------------------------

# Build the whole UI inside one Blocks context. Components are laid out in
# the order they are created within the nested Row/Column/Tabs contexts, so
# statement order here defines the page layout.
# NOTE(review): gr.Box appears to be unavailable in Gradio 4+ — confirm the
# Gradio version this app is pinned to before upgrading.
with gr.Blocks(theme=gr.themes.Soft(), title="Monte Carlo Simulation Explorer") as app:
    # Introductory text and step-by-step usage instructions.
    gr.Markdown(
        """
        # Welcome to the Monte Carlo Simulation Explorer!
        This tool helps you understand and perform a Monte Carlo simulation, a powerful technique for modeling uncertainty.
        **How it works:** Instead of guessing a single outcome, you provide a range of possible inputs (or a distribution). The simulation then runs thousands of trials with random values from that input, creating a probability distribution of all possible outcomes.
        **Get started:**
        1.  **Provide Data:** Use one of the methods in the "Data Collection" box below.
        2.  **Prepare Simulation:** Click the "Prepare Simulation" button to validate and visualize your input.
        3.  **Run Simulation:** Adjust the settings and click "Run Simulation".
        4.  **Interpret:** Analyze the resulting plots and get an AI-powered explanation.
        """
    )

    # --- Row 1: Data Input and Preparation ---
    with gr.Row():
        # --- Column 1.1: Data Collection ---
        with gr.Column(scale=1):
            with gr.Box():
                gr.Markdown("### 1. Data Collection")
                gr.Markdown("Choose **one** method below.")

                # One tab per data source. All four input components are
                # passed to process_input_data together, which applies its
                # own priority (file > example > manual) regardless of which
                # tab is currently visible.
                with gr.Tabs():
                    with gr.TabItem("Upload File"):
                        file_input = gr.File(label="Upload a Single-Column CSV File", file_types=[".csv"])
                    with gr.TabItem("Use Example"):
                        example_input = gr.Dropdown(
                            ["Project Cost Estimation"], label="Select an Example Dataset"
                        )
                    with gr.TabItem("Manual Input"):
                         gr.Markdown("Define a normal distribution manually.")
                         manual_mean_input = gr.Number(label="Mean (Average)")
                         manual_std_input = gr.Number(label="Standard Deviation (Spread)")

                prepare_button = gr.Button("Prepare Simulation", variant="secondary")

        # --- Column 1.2: Preparation Plots & Visualization ---
        with gr.Column(scale=2):
            with gr.Box():
                gr.Markdown("### 2. Preparation & Visualization")
                # Read-only outputs filled by the "Prepare Simulation" step.
                validation_output = gr.Textbox(label="Validation Status", interactive=False, lines=3)
                input_stats_output = gr.Textbox(label="Input Data Statistics", interactive=False, lines=6)
                input_plot_output = gr.Plot(label="Input Data Distribution")

    # --- Row 2: Simulation Controls and Results ---
    with gr.Row():
        with gr.Box():
            gr.Markdown("### 3. Simulation Run & Results")
            with gr.Row():
                # Left column: simulation settings and the run button.
                with gr.Column(scale=1, min_width=250):
                    gr.Markdown("**Simulation Settings**")
                    # Iteration count: 1,000 to 50,000 in steps of 1,000.
                    num_simulations_input = gr.Slider(
                        minimum=1000, maximum=50000, value=10000, step=1000,
                        label="Number of Simulations"
                    )
                    target_value_input = gr.Number(
                        label="Target Value (Optional)",
                        info="Calculate the probability of the result being <= this value."
                    )
                    run_button = gr.Button("Run Simulation", variant="primary")

                # Right column: one tab per simulation output.
                with gr.Column(scale=3):
                    with gr.Tabs():
                        with gr.TabItem("Results Histogram"):
                            results_plot_output = gr.Plot(label="Simulation Outcome Distribution")
                        with gr.TabItem("Cumulative Probability (CDF)"):
                            cdf_plot_output = gr.Plot(label="Cumulative Distribution Function")
                        with gr.TabItem("Numerical Summary"):
                            results_summary_output = gr.Textbox(label="Detailed Results", interactive=False, lines=12)

    # --- Row 3: AI-Powered Explanation ---
    with gr.Row():
        with gr.Box():
            gr.Markdown("### 4. AI-Powered Explanation")
            explain_button = gr.Button("Explain the Takeaways", variant="secondary")
            explanation_output = gr.Textbox(
                label="Key Takeaways from the LLM",
                interactive=False,
                lines=5,
                placeholder="Click the button above to generate an explanation of the results..."
            )

    # ----------------------------------------------------------------------------
    # Define UI Component Interactions
    # ----------------------------------------------------------------------------

    # Hidden state to store the processed data between steps
    processed_data_state = gr.State()

    # Step 1: validate the chosen input. The first return value of
    # process_input_data is stored in the hidden state; the other three
    # fill the plot, statistics and validation boxes.
    prepare_button.click(
        fn=process_input_data,
        inputs=[file_input, example_input, manual_mean_input, manual_std_input],
        outputs=[processed_data_state, input_plot_output, input_stats_output, validation_output]
    )

    # Step 2: run the simulation on the stored data with the chosen settings.
    run_button.click(
        fn=run_monte_carlo_simulation,
        inputs=[processed_data_state, num_simulations_input, target_value_input],
        outputs=[results_plot_output, cdf_plot_output, results_summary_output]
    )

    # Step 3: feed the text of the numerical summary to the explanation model.
    explain_button.click(
        fn=generate_explanation,
        inputs=[results_summary_output],
        outputs=[explanation_output]
    )

# ----------------------------------------------------------------------------
# Launch the Gradio App
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    # Launch the app only when this file is executed as a script, not when
    # it is imported. To run it, save the code as a Python file
    # (e.g., main.py) and run `python main.py` from your terminal.
    app.launch(debug=True)