Spaces:

Victarry
/

PP-schedule-visualizer

Running

App Files Files Community

Victarry commited on Apr 17

Commit

5126943

1 Parent(s): e1ba92f

Add interactive server.

Browse files

Files changed (3) hide show

README.md +15 -2
pyproject.toml +1 -1
src/server.py +224 -0

README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# Pipeline Parallelism Emulation
 This project provides tools for emulating and visualizing pipeline parallelism strategies used in large language model training.
@@ -37,7 +37,20 @@ Setup `uv` if not installed on your computer:
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
-## Usage
 ### Running for 1F1B strategy:
 ```bash

+# Pipeline Parallelism Emulation and Visualization
 This project provides tools for emulating and visualizing pipeline parallelism strategies used in large language model training.
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
+## Running the Interactive Server
+To visualize schedules interactively:
+```bash
+uv run src/server.py
+```
+This will start a Dash server (usually on `http://127.0.0.1:8050/`). Open this URL in your web browser.
+You can then adjust parameters like the number of devices, stages, batches, operation times, and select different scheduling strategies to see the resulting pipeline visualization.
+## Running from Command Line
 ### Running for 1F1B strategy:
 ```bash

pyproject.toml CHANGED Viewed

@@ -9,7 +9,7 @@ description = "Pipeline Parallelism Emulation and Visualization"
 readme = "README.md"
 requires-python = ">=3.10"
 authors = [
-    {name = "Project Author"}
 ]
 classifiers = [
     "Programming Language :: Python :: 3",

 readme = "README.md"
 requires-python = ">=3.10"
 authors = [
+    {name = "Zhenhuan Liu"}
 ]
 classifiers = [
     "Programming Language :: Python :: 3",

src/server.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import dash
+from dash import dcc, html, Input, Output, State, callback_context
+import plotly.graph_objects as go
+import webbrowser
+from threading import Timer
+from src.execution_model import ScheduleConfig, Schedule
+from src.strategies import (
+    generate_1f1b_schedule,
+    generate_zero_bubble_1p_schedule,
+    generate_1f1b_overlap_schedule,
+    generate_1f1b_interleave_schedule,
+    generate_1f1b_interleave_overlap_schedule,
+    generate_dualpipe_schedule
+)
+from src.visualizer import convert_schedule_to_visualization_format, create_pipeline_figure
+def open_browser(port):
+    webbrowser.open_new(f"http://127.0.0.1:{port}")
+STRATEGIES = {
+    "1f1b": generate_1f1b_schedule,
+    "zb1p": generate_zero_bubble_1p_schedule,
+    "1f1b_overlap": generate_1f1b_overlap_schedule,
+    "1f1b_interleave": generate_1f1b_interleave_schedule,
+    "1f1b_interleave_overlap": generate_1f1b_interleave_overlap_schedule,
+    "dualpipe": generate_dualpipe_schedule,
+}
+app = dash.Dash(__name__, suppress_callback_exceptions=True)
+app.title = "Pipeline Parallelism Visualizer"
+# Initial default values
+default_values = {
+    "num_devices": 4,
+    "num_stages": 8,
+    "num_batches": 16,
+    "p2p_latency": 0.1,
+    "op_time_forward": 1.0,
+    "op_time_backward_d": 1.0,
+    "op_time_backward_w": 1.0,
+    "op_time_backward": 2.0,
+    "strategy": "1f1b_interleave",
+    "split_backward": False,
+    "placement_strategy": "interleave"
+}
+app.layout = html.Div([
+    html.H1("Pipeline Parallelism Schedule Visualizer", style={'textAlign': 'center'}),
+    html.Div([
+        html.Div([
+            html.Label("Number of Devices (GPUs):"),
+            dcc.Input(id='num_devices', type='number', value=default_values["num_devices"], min=1, step=1, style={'width': '100%'}),
+            html.Label("Number of Stages (Model Chunks):"),
+            dcc.Input(id='num_stages', type='number', value=default_values["num_stages"], min=1, step=1, style={'width': '100%'}),
+            html.Label("Number of Microbatches:"),
+            dcc.Input(id='num_batches', type='number', value=default_values["num_batches"], min=1, step=1, style={'width': '100%'}),
+            html.Label("P2P Latency (ms):"),
+            dcc.Input(id='p2p_latency', type='number', value=default_values["p2p_latency"], min=0, step=0.01, style={'width': '100%'}),
+        ], style={'padding': 10, 'flex': 1}),
+        html.Div([
+            html.Label("Scheduling Strategy:"),
+            dcc.Dropdown(
+                id='strategy',
+                options=[{'label': k, 'value': k} for k in STRATEGIES.keys()],
+                value=default_values["strategy"],
+                clearable=False,
+                style={'width': '100%'}
+            ),
+            html.Label("Placement Strategy:"),
+            dcc.Dropdown(
+                id='placement_strategy',
+                options=[
+                    {'label': 'Standard', 'value': 'standard'},
+                    {'label': 'Interleave', 'value': 'interleave'},
+                    {'label': 'DualPipe', 'value': 'dualpipe'}
+                ],
+                value=default_values["placement_strategy"],
+                clearable=False,
+                style={'width': '100%'}
+            ),
+            html.Div([ # Wrap checkbox and label
+                dcc.Checklist(
+                    id='split_backward',
+                    options=[{'label': ' Split Backward Pass (for ZB-1P, DualPipe)', 'value': 'True'}],
+                    value=['True'] if default_values["split_backward"] else [],
+                    style={'display': 'inline-block'}
+                ),
+            ], style={'marginTop': '20px'}),
+        ], style={'padding': 10, 'flex': 1}),
+        html.Div([
+            html.Label("Operation Time - Forward (ms):"),
+            dcc.Input(id='op_time_forward', type='number', value=default_values["op_time_forward"], min=0.01, step=0.01, style={'width': '100%'}),
+            html.Label("Operation Time - Backward (ms):"),
+            dcc.Input(id='op_time_backward', type='number', value=default_values["op_time_backward"], min=0.01, step=0.01, style={'width': '100%'}),
+            html.Label("Operation Time - Backward D (Data Grad) (ms):"),
+            dcc.Input(id='op_time_backward_d', type='number', value=default_values["op_time_backward_d"], min=0.01, step=0.01, style={'width': '100%'}),
+            html.Label("Operation Time - Backward W (Weight Grad) (ms):"),
+            dcc.Input(id='op_time_backward_w', type='number', value=default_values["op_time_backward_w"], min=0.01, step=0.01, style={'width': '100%'}),
+        ], style={'padding': 10, 'flex': 1}),
+    ], style={'display': 'flex', 'flexDirection': 'row'}),
+    html.Div([
+        html.Button('Generate Schedule', id='generate-button', n_clicks=0, style={'margin': '20px auto', 'display': 'block'}),
+    ]),
+    html.Div(id='error-message', style={'color': 'red', 'textAlign': 'center', 'marginTop': '10px'}),
+    dcc.Loading(
+        id="loading-graph",
+        type="circle",
+        children=dcc.Graph(id='pipeline-graph', figure=go.Figure())
+    )
+])
+@app.callback(
+    Output('pipeline-graph', 'figure'),
+    Output('error-message', 'children'),
+    Input('generate-button', 'n_clicks'),
+    State('num_devices', 'value'),
+    State('num_stages', 'value'),
+    State('num_batches', 'value'),
+    State('p2p_latency', 'value'),
+    State('op_time_forward', 'value'),
+    State('op_time_backward', 'value'),
+    State('op_time_backward_d', 'value'),
+    State('op_time_backward_w', 'value'),
+    State('strategy', 'value'),
+    State('split_backward', 'value'),
+    State('placement_strategy', 'value'),
+    prevent_initial_call=True
+)
+def update_graph(n_clicks, num_devices, num_stages, num_batches, p2p_latency,
+                 op_time_forward, op_time_backward, op_time_backward_d, op_time_backward_w,
+                 strategy, split_backward_list, placement_strategy):
+    error_message = ""
+    fig = go.Figure()
+    split_backward = 'True' in split_backward_list
+    # Basic Validations
+    if not all([num_devices, num_stages, num_batches, op_time_forward]):
+        return fig, "Missing required input values."
+    if split_backward and not all([op_time_backward_d, op_time_backward_w]):
+        return fig, "Backward D and Backward W times are required when 'Split Backward' is checked."
+    if not split_backward and not op_time_backward:
+        return fig, "Backward time is required when 'Split Backward' is unchecked."
+    if num_stages % num_devices != 0 and placement_strategy != 'dualpipe':
+         return fig, "Number of Stages must be divisible by Number of Devices for standard/interleave placement."
+    if placement_strategy == 'dualpipe' and num_stages % 2 != 0:
+        return fig, "DualPipe requires an even number of stages."
+    if placement_strategy == 'dualpipe' and num_stages != num_devices:
+        return fig, "DualPipe requires Number of Stages to be equal to Number of Devices."
+    if strategy == 'dualpipe' and not split_backward:
+        return fig, "DualPipe strategy currently requires 'Split Backward' to be checked."
+    if strategy == 'dualpipe' and placement_strategy != 'dualpipe':
+        return fig, "DualPipe strategy requires 'DualPipe' placement strategy."
+    if strategy == 'zb1p' and not split_backward:
+        return fig, "ZB-1P strategy requires 'Split Backward' to be checked."
+    try:
+        op_times = {
+            "forward": float(op_time_forward),
+        }
+        if split_backward:
+            op_times["backward_D"] = float(op_time_backward_d)
+            op_times["backward_W"] = float(op_time_backward_w)
+            # Add combined backward time for compatibility if needed by some visualization or calculation
+            op_times["backward"] = float(op_time_backward_d) + float(op_time_backward_w)
+        else:
+            op_times["backward"] = float(op_time_backward)
+        config = ScheduleConfig(
+            num_devices=int(num_devices),
+            num_stages=int(num_stages),
+            num_batches=int(num_batches),
+            p2p_latency=float(p2p_latency),
+            placement_strategy=placement_strategy,
+            split_backward=split_backward,
+            op_times=op_times,
+        )
+        schedule_func = STRATEGIES.get(strategy)
+        if not schedule_func:
+            raise ValueError(f"Invalid strategy selected: {strategy}")
+        schedule = schedule_func(config)
+        schedule.execute() # Calculate start/end times
+        vis_data = convert_schedule_to_visualization_format(schedule)
+        fig = create_pipeline_figure(vis_data, show_progress=False) # Disable progress bar in server mode
+    except AssertionError as e:
+        error_message = f"Configuration Error: {e}"
+        fig = go.Figure() # Return empty figure on error
+    except ValueError as e:
+        error_message = f"Input Error: {e}"
+        fig = go.Figure()
+    except Exception as e:
+        error_message = f"An unexpected error occurred: {e}"
+        fig = go.Figure()
+    return fig, error_message
+if __name__ == '__main__':
+    port = 8050
+    # Timer(1, open_browser, args=(port,)).start() # Optional: automatically open browser
+    print(f"Dash server running on http://127.0.0.1:{port}")
+    app.run_server(debug=True, port=port)