Spaces:
Running
Running
""" | |
Main module for the WhisperKit Evaluation Dashboard. | |
This module sets up and runs the Gradio interface for the WhisperKit Evaluation Dashboard, | |
allowing users to explore and compare speech recognition model performance across different | |
devices, operating systems, and datasets. | |
""" | |
import json | |
import os | |
import re | |
from math import ceil, floor | |
import gradio as gr | |
import pandas as pd | |
from argmax_gradio_components import RangeSlider | |
from dotenv import load_dotenv | |
from huggingface_hub import login | |
# Import custom constants and utility functions | |
from constants import ( | |
BANNER_TEXT, | |
CITATION_BUTTON_LABEL, | |
CITATION_BUTTON_TEXT, | |
COL_NAMES, | |
HEADER, | |
METHODOLOGY_TEXT, | |
PERFORMANCE_TEXT, | |
) | |
from utils import ( | |
add_datasets_to_performance_columns, | |
calculate_quality_parity, | |
create_initial_performance_column_dict, | |
css, | |
fields, | |
get_os_name_and_version, | |
make_model_name_clickable_link, | |
plot_metric, | |
read_json_line_by_line, | |
) | |
# Read settings (such as HF_TOKEN) from a local .env file when one exists
load_dotenv()

# Authenticate with the Hugging Face Hub using the token from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN, add_to_git_credential=True)

# Repository and directory information
repo_id = "argmaxinc/whisperkit-evals-dataset"
directory = "xcresults/benchmark_results"
local_dir = ""

# Benchmark (performance) records
PERFORMANCE_DATA = read_json_line_by_line("dashboard_data/performance_data.json")

# Release metadata: parallel "releases" (commit hashes) and "versions" lists
with open("dashboard_data/version.json", "r") as file:
    VERSION_DATA = json.load(file)

# Quality records (ground truth WER)
QUALITY_DATA = read_json_line_by_line("dashboard_data/quality_data.json")

# Pair every version with the release hash stored at the same position
SHA_TO_VERSION = {
    VERSION_DATA["releases"][idx]: version
    for idx, version in enumerate(VERSION_DATA["versions"])
}
# Flatten the performance records into a DataFrame (performance data only)
benchmark_df = pd.json_normalize(PERFORMANCE_DATA)
releases = VERSION_DATA["releases"]

# Parse timestamps and drop any timezone information
parsed_timestamps = pd.to_datetime(benchmark_df["timestamp"])
benchmark_df["timestamp"] = parsed_timestamps.dt.tz_localize(None)

# The measured WER is taken straight from the performance data
benchmark_df["english_wer"] = benchmark_df["average_wer"]

# Order rows by model-name length, then model/device/os, newest timestamp first,
# so that drop_duplicates keeps the latest row per (model, device, os).
sorted_performance_df = benchmark_df.assign(
    model_len=benchmark_df["model"].str.len()
)
sorted_performance_df = sorted_performance_df.sort_values(
    by=["model_len", "model", "device", "os", "timestamp"],
    ascending=[True, True, True, True, False],
)
sorted_performance_df = sorted_performance_df.drop(columns=["model_len"])
sorted_performance_df = sorted_performance_df.drop_duplicates(
    subset=["model", "device", "os"], keep="first"
)
sorted_performance_df = sorted_performance_df.reset_index(drop=True)
# Dataset-specific metric columns, named "<metric>.<dataset>"
dataset_speed_columns = [
    name
    for name in sorted_performance_df.columns
    if name.startswith("dataset_speed.")
]
dataset_toks_columns = [
    name
    for name in sorted_performance_df.columns
    if name.startswith("dataset_tokens_per_second.")
]

# Dataset names are the part after the last dot of each speed column
PERFORMANCE_DATASETS = [name.rsplit(".", 1)[-1] for name in dataset_speed_columns]

# Columns shown in the leaderboard: fixed metrics followed by per-dataset metrics
base_display_columns = [
    "model",
    "device",
    "os",
    "english_wer",
    "qoi",
    "speed",
    "tokens_per_second",
    "timestamp",
    "commit_hash",
]
performance_df = sorted_performance_df[
    base_display_columns + dataset_speed_columns + dataset_toks_columns
].copy()

# Parity is computed per row from the quality (ground truth WER) data
performance_df["parity"] = performance_df.apply(
    lambda record: calculate_quality_parity(QUALITY_DATA, record), axis=1
)

# Apply display names; columns without an entry in COL_NAMES keep their name
performance_df = performance_df.rename(
    columns=lambda name: COL_NAMES.get(name, name)
)

# Give dataset-specific columns their display names: "librispeech-10mins" is
# labelled Short-Form, every other dataset Long-Form.
dataset_column_renames = {}
for metric_label, metric_columns in (
    ("Speed", dataset_speed_columns),
    ("Tok/s", dataset_toks_columns),
):
    for col in metric_columns:
        dataset_name = col.split(".")[-1]
        form_label = (
            "Short-Form" if dataset_name == "librispeech-10mins" else "Long-Form"
        )
        dataset_column_renames[col] = f"{form_label} {metric_label}"
performance_df = performance_df.rename(columns=dataset_column_renames)

# Keep the raw model name and turn the displayed one into a clickable link
performance_df["model_raw"] = performance_df["Model"].copy()
performance_df["Model"] = performance_df["Model"].apply(
    make_model_name_clickable_link
)
# Devices and OS versions present in the last listed release
latest_release_mask = benchmark_df["commit_hash"] == releases[-1]
initial_release_df = benchmark_df[latest_release_mask]
PERFORMANCE_DEVICES = initial_release_df["device"].unique().tolist()
PERFORMANCE_OS = sorted(
    initial_release_df["os"].apply(get_os_name_and_version).unique().tolist()
)

# Build the base column dictionary, then extend it with dataset information
initial_performance_column_dict = create_initial_performance_column_dict()
performance_column_info = add_datasets_to_performance_columns(
    initial_performance_column_dict, PERFORMANCE_DATASETS
)

# Column dictionary and column descriptor class returned by the helper
updated_performance_column_dict = performance_column_info["column_dict"]
PerformanceAutoEvalColumn = performance_column_info["AutoEvalColumn"]

# Column sets used by the different views
PERFORMANCE_COLS = performance_column_info["COLS"]
PERFORMANCE_TYPES = performance_column_info["TYPES"]
PERFORMANCE_ALWAYS_HERE_COLS = performance_column_info["ALWAYS_HERE_COLS"]
PERFORMANCE_TOGGLE_COLS = performance_column_info["TOGGLE_COLS"]
PERFORMANCE_SELECTED_COLS = performance_column_info["SELECTED_COLS"]
def get_release_devices(release):
    """
    Return the distinct devices benchmarked for a release.

    :param release: Commit hash of the selected release
    :return: List of unique device names found in ``benchmark_df`` for that release
    """
    in_release = benchmark_df["commit_hash"] == release
    return benchmark_df.loc[in_release, "device"].unique().tolist()
def _query_pattern(raw_terms, literal=False):
    """
    Build a regex alternation from search terms.

    Blank terms are dropped so that a trailing ';' does not create an empty
    alternative (which would match every row).

    :param raw_terms: Iterable of raw term strings.
    :param literal: When True, terms are always matched literally.
    :return: The pattern string, or None when no usable term remains.
    """
    terms = [term.strip() for term in raw_terms if term and term.strip()]
    if not terms:
        return None
    if not literal:
        pattern = "|".join(terms)
        try:
            re.compile(pattern)
        except re.error:
            # Half-typed input such as "tiny(" is not valid regex; match it
            # literally instead of failing the UI callback.
            pass
        else:
            return pattern
    return "|".join(re.escape(term) for term in terms)


def performance_filter(
    df,
    columns,
    model_query,
    exclude_models,
    devices,
    os,
    short_speed_slider,
    long_speed_slider,
    short_toks_slider,
    long_toks_slider,
    release,
):
    """
    Filters the performance DataFrame based on specified criteria.

    The input DataFrame is not modified.

    :param df: The DataFrame to be filtered.
    :param columns: The columns to be included in the filtered DataFrame.
    :param model_query: ';'-separated (regex) queries matched against 'Model'.
    :param exclude_models: ';'-separated (regex) queries for models to exclude.
    :param devices: The devices to filter the 'Device' column.
    :param os: The list of operating systems to filter the 'OS' column
        (substring match, case-insensitive). Shadows the ``os`` module inside
        this function; the name is kept for interface compatibility.
    :param short_speed_slider: (min, max) range for the 'Short-Form Speed' column.
    :param long_speed_slider: (min, max) range for the 'Long-Form Speed' column.
    :param short_toks_slider: (min, max) range for the 'Short-Form Tok/s' column.
    :param long_toks_slider: (min, max) range for the 'Long-Form Tok/s' column.
    :param release: Commit hash of the release whose rows are kept.
    :return: The filtered DataFrame.
    """
    filtered_df = df[df["commit_hash"] == release]

    # Select columns based on input and always-present columns
    filtered_df = filtered_df[
        PERFORMANCE_ALWAYS_HERE_COLS
        + [c for c in PERFORMANCE_COLS if c in df.columns and c in columns]
    ]

    # No device or no OS selected means nothing can match
    os_pattern = _query_pattern(os, literal=True) if os else None
    if not devices or os_pattern is None:
        return pd.DataFrame(columns=filtered_df.columns)

    # Filter models based on query
    include_pattern = _query_pattern(model_query.split(";")) if model_query else None
    if include_pattern:
        filtered_df = filtered_df[
            filtered_df["Model"].str.contains(
                include_pattern, case=False, regex=True, na=False
            )
        ]

    # Exclude specified models
    exclude_pattern = (
        _query_pattern(exclude_models.split(";")) if exclude_models else None
    )
    if exclude_pattern:
        filtered_df = filtered_df[
            ~filtered_df["Model"].str.contains(
                exclude_pattern, case=False, regex=True, na=False
            )
        ]

    # Filter by devices and operating systems
    filtered_df = filtered_df[filtered_df["Device"].isin(devices)]
    filtered_df = filtered_df[
        filtered_df["OS"].str.contains(os_pattern, case=False, regex=True, na=False)
    ]

    # Apply short-form and long-form speed and tokens per second filters.
    # Values are coerced to numbers only for the comparison, on the rows being
    # filtered; columns that are not displayed are simply skipped.
    range_filters = (
        ("Short-Form Speed", short_speed_slider),
        ("Long-Form Speed", long_speed_slider),
        ("Short-Form Tok/s", short_toks_slider),
        ("Long-Form Tok/s", long_toks_slider),
    )
    for column, (lower, upper) in range_filters:
        if column not in filtered_df.columns:
            continue
        values = pd.to_numeric(filtered_df[column], errors="coerce")
        filtered_df = filtered_df[(values >= lower) & (values <= upper)]

    return filtered_df
def update_performance_filters(release):
    """
    Refresh the device and OS filter choices for the selected release.

    Both filters are reset so that every available choice is selected.

    :param release: Selected release hash
    :return: Tuple of Gradio updates for (devices, OS versions)
    """
    in_release = benchmark_df["commit_hash"] == release

    # Unique devices and sorted, normalized OS versions seen in this release
    devices_for_release = benchmark_df.loc[in_release, "device"].unique().tolist()
    os_for_release = sorted(
        benchmark_df.loc[in_release, "os"]
        .apply(get_os_name_and_version)
        .unique()
        .tolist()
    )

    devices_update = gr.update(choices=devices_for_release, value=devices_for_release)
    os_update = gr.update(choices=os_for_release, value=os_for_release)
    return devices_update, os_update
def update_support_table(release):
    """
    Reload the device support table for a given release.

    Reads ``dashboard_data/support_data_<first 7 chars of release>.csv``, uses
    its first column as the index, converts model names into clickable links and
    orders rows by the length of the resulting 'Model' value.

    :param release: Selected release hash
    :return: Tuple of Gradio updates for (visible table, column checkbox group,
        hidden copy of the table)
    """
    table = pd.read_csv(f"dashboard_data/support_data_{release[:7]}.csv")
    table = table.set_index(table.columns[0])

    # Model names use '_' where the link helper expects '/'
    table["Model"] = table["Model"].apply(
        lambda name: make_model_name_clickable_link(name.replace("_", "/"))
    )

    # Sort by model name length using a temporary helper column
    table["model_len"] = table["Model"].str.len()
    table = table.sort_values(by=["model_len"], ascending=[True])
    table = table.drop(columns=["model_len"])

    # Every column after the first ('Model') can be toggled
    toggle_columns = table.columns.tolist()[1:]
    html_types = ["html"] * len(table.columns)

    return (
        gr.update(value=table, datatype=html_types),
        gr.update(choices=toggle_columns, value=toggle_columns),
        gr.update(value=table),
    )
# Tab components are created before the layout; `tabs` is rendered inside the
# Blocks context further down.
diff_tab = gr.TabItem("Difference Checker", elem_id="diff_checker", id=2)
text_diff_elems = []
tabs = gr.Tabs(elem_id="tab-elems")

# Theme font stack: local font first, then a monospace font, then generic fallbacks
font = ["Zwizz Regular", "IBM Plex Mono", "ui-sans-serif", "system-ui", "sans-serif"]
# macOS 14, 15, 26
# iOS 17, 18, 26
# Define the Gradio interface | |
with gr.Blocks(css=css, theme=gr.themes.Base(font=font)) as demo: | |
# Add header and banner to the interface | |
gr.HTML(HEADER) | |
gr.HTML(BANNER_TEXT, elem_classes="markdown-text") | |
gr.Markdown("### Release") | |
release_dropdown = gr.Dropdown( | |
choices=[ | |
(f"{release} v{SHA_TO_VERSION[release]}", release) for release in releases | |
], | |
label="Select Release", | |
value=releases[-1] if releases else None, | |
elem_id="release-dropdown", | |
container=False, | |
) | |
# Create tabs for different sections of the dashboard | |
with tabs.render(): | |
# Performance Tab | |
with gr.TabItem("Benchmark", elem_id="benchmark", id=0): | |
with gr.Row(): | |
with gr.Column(scale=1): | |
with gr.Row(): | |
with gr.Column(scale=6, elem_classes="filter_models_column"): | |
filter_performance_models = gr.Textbox( | |
placeholder="π Filter Model (separate multiple queries with ';')", | |
label="Filter Models", | |
) | |
with gr.Column(scale=4, elem_classes="exclude_models_column"): | |
exclude_performance_models = gr.Textbox( | |
placeholder="π Exclude Model", | |
label="Exclude Model", | |
) | |
with gr.Row(): | |
with gr.Accordion("See All Columns", open=False): | |
with gr.Row(): | |
with gr.Column(scale=9, elem_id="performance_columns"): | |
performance_shown_columns = gr.CheckboxGroup( | |
choices=PERFORMANCE_TOGGLE_COLS, | |
value=PERFORMANCE_SELECTED_COLS, | |
label="Toggle Columns", | |
elem_id="column-select", | |
interactive=True, | |
) | |
with gr.Column( | |
scale=1, | |
min_width=200, | |
elem_id="performance_select_columns", | |
): | |
with gr.Row(): | |
select_all_button = gr.Button( | |
"Select All", | |
elem_id="select-all-button", | |
interactive=True, | |
) | |
deselect_all_button = gr.Button( | |
"Deselect All", | |
elem_id="deselect-all-button", | |
interactive=True, | |
) | |
def select_all_columns(): | |
return PERFORMANCE_TOGGLE_COLS | |
def deselect_all_columns(): | |
return [] | |
select_all_button.click( | |
select_all_columns, | |
inputs=[], | |
outputs=performance_shown_columns, | |
) | |
deselect_all_button.click( | |
deselect_all_columns, | |
inputs=[], | |
outputs=performance_shown_columns, | |
) | |
with gr.Row(): | |
with gr.Accordion("Filter Devices", open=False): | |
with gr.Row(): | |
with gr.Column( | |
scale=9, elem_id="filter_devices_column" | |
): | |
performance_shown_devices = gr.CheckboxGroup( | |
choices=get_release_devices(releases[-1]), | |
value=get_release_devices(releases[-1]), | |
label="Filter Devices", | |
interactive=True, | |
) | |
with gr.Column( | |
scale=1, | |
min_width=200, | |
elem_id="filter_select_devices", | |
): | |
with gr.Row(): | |
select_all_devices_button = gr.Button( | |
"Select All", | |
elem_id="select-all-devices-button", | |
interactive=True, | |
) | |
deselect_all_devices_button = gr.Button( | |
"Deselect All", | |
elem_id="deselect-all-devices-button", | |
interactive=True, | |
) | |
def select_all_devices(release): | |
"""Returns all devices available in the current release""" | |
return get_release_devices(release) | |
def deselect_all_devices(): | |
"""Returns an empty list for deselecting all devices""" | |
return [] | |
select_all_devices_button.click( | |
select_all_devices, | |
inputs=[release_dropdown], | |
outputs=performance_shown_devices, | |
) | |
deselect_all_devices_button.click( | |
deselect_all_devices, | |
inputs=[], | |
outputs=performance_shown_devices, | |
) | |
with gr.Row(): | |
performance_shown_os = gr.CheckboxGroup( | |
choices=PERFORMANCE_OS, | |
value=PERFORMANCE_OS, | |
label="Filter OS", | |
interactive=True, | |
) | |
with gr.Column(scale=1): | |
with gr.Accordion("See Performance Filters"): | |
with gr.Row(): | |
with gr.Row(): | |
min_short_speed, max_short_speed = floor( | |
min(performance_df["Short-Form Speed"]) | |
), ceil(max(performance_df["Short-Form Speed"])) | |
short_speed_slider = RangeSlider( | |
value=[min_short_speed, max_short_speed], | |
minimum=min_short_speed, | |
maximum=max_short_speed, | |
step=0.001, | |
label="Short-Form Speed", | |
) | |
with gr.Row(): | |
min_long_speed, max_long_speed = floor( | |
min(performance_df["Long-Form Speed"]) | |
), ceil(max(performance_df["Long-Form Speed"])) | |
long_speed_slider = RangeSlider( | |
value=[min_long_speed, max_long_speed], | |
minimum=min_long_speed, | |
maximum=max_long_speed, | |
step=0.001, | |
label="Long-Form Speed", | |
) | |
with gr.Row(): | |
with gr.Row(): | |
min_short_toks, max_short_toks = floor( | |
min(performance_df["Short-Form Tok/s"]) | |
), ceil(max(performance_df["Short-Form Tok/s"])) | |
short_toks_slider = RangeSlider( | |
value=[min_short_toks, max_short_toks], | |
minimum=min_short_toks, | |
maximum=max_short_toks, | |
step=0.001, | |
label="Short-Form Tok/s", | |
) | |
with gr.Row(): | |
min_long_toks, max_long_toks = floor( | |
min(performance_df["Long-Form Tok/s"]) | |
), ceil(max(performance_df["Long-Form Tok/s"])) | |
long_toks_slider = RangeSlider( | |
value=[min_long_toks, max_long_toks], | |
minimum=min_long_toks, | |
maximum=max_long_toks, | |
step=0.001, | |
label="Long-Form Tok/s", | |
) | |
with gr.Row(): | |
gr.Markdown(PERFORMANCE_TEXT, elem_classes="markdown-text") | |
with gr.Row(): | |
initial_df = performance_df[ | |
performance_df["commit_hash"] == releases[-1] | |
] | |
leaderboard_df = gr.components.Dataframe( | |
value=initial_df[ | |
PERFORMANCE_ALWAYS_HERE_COLS + performance_shown_columns.value | |
], | |
headers=[ | |
PERFORMANCE_ALWAYS_HERE_COLS + performance_shown_columns.value | |
], | |
datatype=[ | |
c.type | |
for c in fields(PerformanceAutoEvalColumn) | |
if c.name in PERFORMANCE_COLS | |
], | |
elem_id="leaderboard-table", | |
elem_classes="large-table", | |
interactive=False, | |
) | |
# Copy of the leaderboard dataframe to apply filters to | |
hidden_leaderboard_df = gr.components.Dataframe( | |
value=performance_df, | |
headers=PERFORMANCE_COLS, | |
datatype=[ | |
c.type | |
for c in fields(PerformanceAutoEvalColumn) | |
if c.name in PERFORMANCE_COLS | |
], | |
visible=False, | |
) | |
# Inputs for the dataframe filter function | |
performance_filter_inputs = [ | |
hidden_leaderboard_df, | |
performance_shown_columns, | |
filter_performance_models, | |
exclude_performance_models, | |
performance_shown_devices, | |
performance_shown_os, | |
short_speed_slider, | |
long_speed_slider, | |
short_toks_slider, | |
long_toks_slider, | |
release_dropdown, | |
] | |
filter_output = leaderboard_df | |
filter_performance_models.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
exclude_performance_models.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
performance_shown_columns.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
performance_shown_devices.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
performance_shown_os.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
short_speed_slider.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
long_speed_slider.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
short_toks_slider.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
long_toks_slider.change( | |
performance_filter, performance_filter_inputs, filter_output | |
) | |
release_dropdown.change( | |
fn=update_performance_filters, | |
inputs=[release_dropdown], | |
outputs=[performance_shown_devices, performance_shown_os], | |
queue=False, | |
).then( | |
fn=performance_filter, | |
inputs=performance_filter_inputs, | |
outputs=filter_output, | |
) | |
# Timeline Tab | |
with gr.TabItem("Timeline", elem_id="timeline", id=4): | |
# Create subtabs for different metrics | |
with gr.Tabs(): | |
with gr.TabItem("QoI", id=0): | |
with gr.Row(): | |
with gr.Column(scale=6): | |
filter_qoi = gr.Textbox( | |
placeholder="π Filter Model-Device-OS (separate multiple queries with ';')", | |
label="Filter", | |
) | |
with gr.Column(scale=4): | |
exclude_qoi = gr.Textbox( | |
placeholder="π Exclude Model-Device-OS", | |
label="Exclude", | |
) | |
with gr.Row(): | |
with gr.Column(): | |
qoi_plot = gr.Plot(container=True) | |
demo.load( | |
lambda x, y, z: plot_metric( | |
x, | |
"qoi", | |
"QoI", | |
"QoI Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_qoi, | |
exclude_qoi, | |
], | |
qoi_plot, | |
) | |
filter_qoi.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"qoi", | |
"QoI", | |
"QoI Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_qoi, | |
exclude_qoi, | |
], | |
qoi_plot, | |
) | |
exclude_qoi.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"qoi", | |
"QoI", | |
"QoI Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_qoi, | |
exclude_qoi, | |
], | |
qoi_plot, | |
) | |
with gr.TabItem("Average WER", id=1): | |
with gr.Row(): | |
with gr.Column(scale=6): | |
filter_average_wer = gr.Textbox( | |
placeholder="π Filter Model-Device-OS (separate multiple queries with ';')", | |
label="Filter", | |
) | |
with gr.Column(scale=4): | |
exclude_average_wer = gr.Textbox( | |
placeholder="π Exclude Model-Device-OS", | |
label="Exclude", | |
) | |
with gr.Row(): | |
with gr.Column(): | |
average_wer_plot = gr.Plot(container=True) | |
demo.load( | |
lambda x, y, z: plot_metric( | |
x, | |
"average_wer", | |
"Average WER", | |
"Average WER Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_average_wer, | |
exclude_average_wer, | |
], | |
average_wer_plot, | |
) | |
filter_average_wer.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"average_wer", | |
"Average WER", | |
"Average WER Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_average_wer, | |
exclude_average_wer, | |
], | |
average_wer_plot, | |
) | |
exclude_average_wer.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"average_wer", | |
"Average WER", | |
"Average WER Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_average_wer, | |
exclude_average_wer, | |
], | |
average_wer_plot, | |
) | |
with gr.TabItem("Speed", id=2): | |
with gr.Row(): | |
with gr.Column(scale=6): | |
filter_speed = gr.Textbox( | |
placeholder="π Filter Model-Device-OS (separate multiple queries with ';')", | |
label="Filter", | |
) | |
with gr.Column(scale=4): | |
exclude_speed = gr.Textbox( | |
placeholder="π Exclude Model-Device-OS", | |
label="Exclude", | |
) | |
with gr.Row(): | |
with gr.Column(): | |
speed_plot = gr.Plot(container=True) | |
demo.load( | |
lambda x, y, z: plot_metric( | |
x, | |
"speed", | |
"Speed", | |
"Speed Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_speed, | |
exclude_speed, | |
], | |
speed_plot, | |
) | |
filter_speed.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"speed", | |
"Speed", | |
"Speed Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_speed, | |
exclude_speed, | |
], | |
speed_plot, | |
) | |
exclude_speed.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"speed", | |
"Speed", | |
"Speed Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_speed, | |
exclude_speed, | |
], | |
speed_plot, | |
) | |
with gr.TabItem("Tok/s", id=3): | |
with gr.Row(): | |
with gr.Column(scale=6): | |
filter_toks = gr.Textbox( | |
placeholder="π Filter Model-Device-OS (separate multiple queries with ';')", | |
label="Filter", | |
) | |
with gr.Column(scale=4): | |
exclude_toks = gr.Textbox( | |
placeholder="π Exclude Model-Device-OS", | |
label="Exclude", | |
) | |
with gr.Row(): | |
with gr.Column(): | |
toks_plot = gr.Plot(container=True) | |
demo.load( | |
lambda x, y, z: plot_metric( | |
x, | |
"tokens_per_second", | |
"Tok/s", | |
"Tok/s Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_toks, | |
exclude_toks, | |
], | |
toks_plot, | |
) | |
filter_toks.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"tokens_per_second", | |
"Tok/s", | |
"Tok/s Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_toks, | |
exclude_toks, | |
], | |
toks_plot, | |
) | |
exclude_toks.change( | |
lambda x, y, z: plot_metric( | |
x, | |
"tokens_per_second", | |
"Tok/s", | |
"Tok/s Over Time for Model-Device-OS Combinations", | |
y, | |
z, | |
), | |
[ | |
gr.Dataframe(benchmark_df, visible=False), | |
filter_toks, | |
exclude_toks, | |
], | |
toks_plot, | |
) | |
# Device Support Tab | |
with gr.TabItem("Device Support", elem_id="device_support", id=6): | |
# Add clear description of what Device Support means | |
gr.Markdown( | |
""" | |
## Device Support | |
This tab shows **test results for SKUs that we actually attempted to test**. It tells you whether tests passed, failed, or couldn't be completed for the devices we tried to run tests on. | |
### Please Note: | |
**This tab only shows devices we attempted to test** - it doesn't show the full universe of available devices. | |
**π For comprehensive coverage analysis**, see the **Test Coverage** tab which shows ALL available SKUs. | |
""", | |
elem_classes="markdown-text" | |
) | |
# Load device support data from CSV | |
support_data = pd.read_csv( | |
f"dashboard_data/support_data_{releases[-1][:7]}.csv" | |
) | |
support_data.set_index(support_data.columns[0], inplace=True) | |
support_data["Model"] = support_data["Model"].apply( | |
lambda x: x.replace("_", "/") | |
) | |
support_data["Model"] = support_data["Model"].apply( | |
lambda x: make_model_name_clickable_link(x) | |
) | |
support_data = ( | |
support_data.assign(model_len=support_data["Model"].str.len()) | |
.sort_values( | |
by=["model_len"], | |
ascending=[True], | |
) | |
.drop(columns=["model_len"]) | |
) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
with gr.Row(): | |
with gr.Column(scale=6, elem_id="filter_models_column"): | |
filter_support_models = gr.Textbox( | |
placeholder="π Filter Model (separate multiple queries with ';')", | |
label="Filter Models", | |
) | |
with gr.Column(scale=4, elem_classes="exclude_models_column"): | |
exclude_support_models = gr.Textbox( | |
placeholder="π Exclude Model", | |
label="Exclude Model", | |
) | |
with gr.Row(): | |
with gr.Accordion("See All Columns", open=False): | |
with gr.Row(): | |
with gr.Column(scale=9): | |
support_shown_columns = gr.CheckboxGroup( | |
choices=support_data.columns.tolist()[ | |
1: | |
], # Exclude 'Model' column | |
value=support_data.columns.tolist()[1:], | |
label="Toggle Columns", | |
elem_id="support-column-select", | |
interactive=True, | |
) | |
with gr.Column(scale=1, min_width=200): | |
with gr.Row(): | |
select_all_support_button = gr.Button( | |
"Select All", | |
elem_id="select-all-support-button", | |
interactive=True, | |
) | |
deselect_all_support_button = gr.Button( | |
"Deselect All", | |
elem_id="deselect-all-support-button", | |
interactive=True, | |
) | |
with gr.Column(): | |
gr.Markdown( | |
""" | |
### Legend | |
- β Supported: The model is supported and tested on this device. | |
- β οΈ Failed: Either the model tests failed on this device or the Speed Factor for the test is less than 1. | |
- ? Not Tested: The model is supported on this device but no test information available. | |
- Not Supported: The model is not supported on this device as per the [WhisperKit configuration](https://huggingface.co/argmaxinc/whisperkit-coreml/blob/main/config.json). | |
""" | |
) | |
# Display device support data in a table | |
device_support_table = gr.Dataframe( | |
value=support_data, | |
headers=support_data.columns.tolist(), | |
datatype=["html" for _ in support_data.columns], | |
elem_id="device-support-table", | |
elem_classes="large-table", | |
interactive=False, | |
) | |
# Hidden dataframe to store the original data | |
hidden_support_df = gr.Dataframe(value=support_data, visible=False) | |
def filter_support_data(df, columns, model_query, exclude_models):
    """
    Filters the device support data based on specified criteria.

    :param df: The DataFrame to be filtered (not modified)
    :param columns: Columns to include in the output, in addition to 'Model'
    :param model_query: ';'-separated queries; each is treated as a
        case-insensitive regex, or matched literally when the input is not a
        valid regex
    :param exclude_models: ';'-separated model names to exclude (literal,
        case-insensitive substring match)
    :return: Filtered DataFrame
    """

    def _split_terms(raw):
        # Drop blank terms: an empty alternative would match every row, so a
        # trailing ';' would otherwise disable the filter or exclude everything.
        return [term.strip() for term in raw.split(";") if term.strip()]

    filtered_df = df.copy()

    # Filter models based on query
    include_terms = _split_terms(model_query) if model_query else []
    if include_terms:
        include_pattern = "|".join(include_terms)
        try:
            re.compile(include_pattern)
        except re.error:
            # Half-typed input such as "large(" is matched literally instead
            # of failing the callback.
            include_pattern = "|".join(re.escape(term) for term in include_terms)
        filtered_df = filtered_df[
            filtered_df["Model"].str.contains(
                include_pattern, case=False, regex=True, na=False
            )
        ]

    # Exclude specified models
    exclude_terms = _split_terms(exclude_models) if exclude_models else []
    if exclude_terms:
        exclude_pattern = "|".join(re.escape(term) for term in exclude_terms)
        filtered_df = filtered_df[
            ~filtered_df["Model"].str.contains(
                exclude_pattern, case=False, regex=True, na=False
            )
        ]

    # Select columns; unknown names are ignored and 'Model' is never duplicated
    selected_columns = ["Model"] + [
        col for col in (columns or []) if col in df.columns and col != "Model"
    ]
    return filtered_df[selected_columns]
def select_all_support_columns(release):
    """
    List every toggleable support column for a release.

    Reads the release's support CSV (named after the first 7 characters of the
    hash), uses its first column as the index and returns the remaining column
    names except 'Model'.

    :param release: Selected release hash
    :return: List of all available column choices
    """
    csv_path = f"dashboard_data/support_data_{release[:7]}.csv"
    table = pd.read_csv(csv_path)
    table = table.set_index(table.columns[0])
    return [name for name in table.columns if name != "Model"]
def deselect_all_support_columns(): | |
return [] | |
# Connect select all and deselect all buttons | |
select_all_support_button.click( | |
select_all_support_columns, | |
inputs=[release_dropdown], | |
outputs=support_shown_columns, | |
) | |
deselect_all_support_button.click( | |
deselect_all_support_columns, | |
inputs=[], | |
outputs=support_shown_columns, | |
) | |
# Connect release dropdown to support data update | |
release_dropdown.change( | |
update_support_table, | |
inputs=[release_dropdown], | |
outputs=[ | |
device_support_table, | |
support_shown_columns, | |
hidden_support_df, | |
], | |
).then( | |
filter_support_data, | |
inputs=[ | |
hidden_support_df, | |
support_shown_columns, | |
filter_support_models, | |
exclude_support_models, | |
], | |
outputs=device_support_table, | |
) | |
# Also connect the filter inputs to update the table | |
for input_elem in [ | |
filter_support_models, | |
exclude_support_models, | |
support_shown_columns, | |
]: | |
input_elem.change( | |
filter_support_data, | |
inputs=[ | |
hidden_support_df, | |
support_shown_columns, | |
filter_support_models, | |
exclude_support_models, | |
], | |
outputs=device_support_table, | |
) | |
# Test Coverage Tab | |
with gr.TabItem("Test Coverage", elem_id="test_coverage", id=7): | |
# Add clear description of what Test Coverage means | |
gr.Markdown( | |
""" | |
## Test Coverage | |
This tab shows **ALL available SKUs** and our testing coverage across the entire device ecosystem. Uses chip-based expansion where testing one device covers all devices with the same chip. | |
""", | |
elem_classes="markdown-text" | |
) | |
def load_coverage_data(release):
    """Load test coverage data for a specific release.

    Reads ``dashboard_data/test_coverage_<release>.json``. Keys missing from
    the report are filled with zero/empty defaults so callers can index every
    summary field directly. When the file does not exist, the defaults alone
    are returned.

    :param release: Release identifier used in the report file name
    :return: Dict with the coverage counters, device lists and target OS fields
    """
    defaults = {
        "commit_hash": release,
        "total_devices": 0,
        "tested_devices": 0,
        "skipped_devices": 0,
        "coverage_percentage": 0.0,
        "tested_device_list": [],
        "skipped_device_list": [],
        "tested_os_versions": [],
        "has_target_os_coverage": False,
        "covered_target_versions": [],
        "missing_target_versions": [],
    }
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(
            f"dashboard_data/test_coverage_{release}.json", "r", encoding="utf-8"
        ) as f:
            loaded = json.load(f)
    except FileNotFoundError:
        return defaults
    if not isinstance(loaded, dict):
        # Unexpected top-level type: hand it back untouched, as before.
        return loaded
    # Values from the report win; defaults only fill the gaps.
    return {**defaults, **loaded}
def format_coverage_devices(device_list):
    """Build the single-column "Device" table for a list of devices.

    :param device_list: Device names; may be empty or None
    :return: DataFrame with one "Device" column, sorted by device name
        (empty when no devices are given)
    """
    if device_list:
        return pd.DataFrame({"Device": device_list}).sort_values(by="Device")
    return pd.DataFrame(columns=["Device"])
def update_coverage_data(release):
    """Rebuild the Test Coverage tab contents for the selected release.

    :param release: Release identifier whose coverage report is loaded
    :return: Tuple of (summary Markdown update, tested table update,
        skipped table update, tested DataFrame, skipped DataFrame)
    """
    data = load_coverage_data(release)

    # Device tables
    tested = format_coverage_devices(data["tested_device_list"])
    skipped = format_coverage_devices(data["skipped_device_list"])

    # Optional "Target OS Coverage" section; stays empty when the report
    # lists neither covered nor missing versions.
    covered = data.get("covered_target_versions", [])
    missing = data.get("missing_target_versions", [])
    os_parts = []
    if covered or missing:
        os_parts.append("\n- **Target OS Coverage**:\n")
        # NOTE(review): the "β" markers look like mis-decoded emoji; kept
        # verbatim - confirm against the deployed file.
        if covered:
            os_parts.append(f" - β **Tested**: {', '.join(sorted(set(covered)))}\n")
        if missing:
            os_parts.append(f" - β **Missing**: {', '.join(missing)}")
    target_os_status = "".join(os_parts)

    # Summary Markdown
    version = SHA_TO_VERSION.get(release, 'Unknown')
    summary = "\n".join(
        [
            f"## Test Coverage Summary for Release {release} (v{version})",
            f"- **Total Devices**: {data['total_devices']}",
            f"- **Tested Devices**: {data['tested_devices']}",
            f"- **Skipped Devices**: {data['skipped_devices']}",
            f"- **Coverage Percentage**: {data['coverage_percentage']:.1f}%",
            target_os_status,
        ]
    )

    # The two trailing DataFrames feed the hidden, unfiltered copies.
    return (
        gr.update(value=summary),
        gr.update(value=tested),
        gr.update(value=skipped),
        tested,
        skipped,
    )
def filter_coverage_devices(df, device_query, exclude_devices):
    """Filter a coverage device table by include and exclude queries.

    Both inputs are ';'-separated lists of terms matched case-insensitively
    against the "Device" column. Include terms are regular expressions; if
    the combined pattern is not a valid regex, the terms are matched
    literally instead of raising. Exclude terms are always literal. Blank
    terms (e.g. from a trailing ';' or whitespace-only input) are ignored;
    otherwise they would match every row.

    :param df: DataFrame with a "Device" column, or None
    :param device_query: Terms a device must match at least one of
    :param exclude_devices: Terms that remove a device when contained in it
    :return: Filtered copy of ``df``; ``df`` itself when it is None or empty
    """
    if df is None or df.empty:
        return df
    filtered_df = df.copy()

    include_terms = [
        term.strip() for term in (device_query or "").split(";") if term.strip()
    ]
    if include_terms:
        pattern = "|".join(include_terms)
        try:
            re.compile(pattern)
        except re.error:
            # Half-typed input such as "(M2" is not a regex; match it as text.
            pattern = "|".join(re.escape(term) for term in include_terms)
        filtered_df = filtered_df[
            filtered_df["Device"].str.contains(pattern, case=False, regex=True)
        ]

    exclude_terms = [
        re.escape(term.strip())
        for term in (exclude_devices or "").split(";")
        if term.strip()
    ]
    if exclude_terms:
        filtered_df = filtered_df[
            ~filtered_df["Device"].str.contains(
                "|".join(exclude_terms), case=False, regex=True
            )
        ]
    return filtered_df
# Initial coverage data comes from the last entry of `releases`.
initial_release = releases[-1]
initial_coverage = load_coverage_data(initial_release)
initial_tested_df = format_coverage_devices(initial_coverage["tested_device_list"])
initial_skipped_df = format_coverage_devices(initial_coverage["skipped_device_list"])

# Optional "Target OS Coverage" section of the initial summary.
covered_versions = initial_coverage.get("covered_target_versions", [])
missing_versions = initial_coverage.get("missing_target_versions", [])
initial_os_parts = []
if covered_versions or missing_versions:
    initial_os_parts.append("\n- **Target OS Coverage**:\n")
    # NOTE(review): the "β" markers look like mis-decoded emoji; kept
    # verbatim - confirm against the deployed file.
    if covered_versions:
        initial_os_parts.append(
            f" - β **Tested**: {', '.join(sorted(set(covered_versions)))}\n"
        )
    if missing_versions:
        initial_os_parts.append(f" - β **Missing**: {', '.join(missing_versions)}")
initial_target_os_status = "".join(initial_os_parts)

# Initial coverage summary content
initial_version = SHA_TO_VERSION.get(initial_release, 'Unknown')
initial_summary_content = "\n".join(
    [
        f"## Test Coverage Summary for Release {initial_release} (v{initial_version})",
        f"- **Total Devices**: {initial_coverage['total_devices']}",
        f"- **Tested Devices**: {initial_coverage['tested_devices']}",
        f"- **Skipped Devices**: {initial_coverage['skipped_devices']}",
        f"- **Coverage Percentage**: {initial_coverage['coverage_percentage']:.1f}%",
        initial_target_os_status,
    ]
)

# Coverage summary component
coverage_summary_text = gr.Markdown(
    value=initial_summary_content,
    elem_classes="markdown-text",
)
# NOTE(review): the nesting of the `with` blocks below is reconstructed from
# a source whose indentation was stripped - confirm the layout (in particular
# whether the Tabs sit beside or inside the filter Row) against the deployed
# file.
# NOTE(review): the "π" prefixes in the placeholders look like mis-decoded
# emoji; kept verbatim.
with gr.Row():
    with gr.Column(scale=1):
        with gr.Row():
            with gr.Column(scale=6):
                filter_coverage_devices_input = gr.Textbox(
                    label="Filter Devices",
                    placeholder="π Filter Device (separate multiple queries with ';')",
                )
            with gr.Column(scale=4):
                exclude_coverage_devices_input = gr.Textbox(
                    label="Exclude Device",
                    placeholder="π Exclude Device",
                )


def _coverage_device_table(value, elem_id):
    """Create one read-only, single-column device table."""
    return gr.Dataframe(
        value=value,
        headers=["Device"],
        datatype=["str"],
        elem_id=elem_id,
        elem_classes="large-table",
        interactive=False,
    )


# Tabs for tested vs skipped devices
with gr.Tabs():
    with gr.TabItem("Tested Devices", id=0):
        tested_devices_table = _coverage_device_table(
            initial_tested_df, "tested-devices-table"
        )
    with gr.TabItem("Skipped Devices", id=1):
        skipped_devices_table = _coverage_device_table(
            initial_skipped_df, "skipped-devices-table"
        )

# Invisible copies holding the unfiltered data that the filters start from.
hidden_tested_df = gr.Dataframe(value=initial_tested_df, visible=False)
hidden_skipped_df = gr.Dataframe(value=initial_skipped_df, visible=False)
def _filter_both_coverage_tables(tested_df, skipped_df, device_query, exclude_devices):
    """Apply the same device filters to the tested and the skipped table."""
    return (
        filter_coverage_devices(tested_df, device_query, exclude_devices),
        filter_coverage_devices(skipped_df, device_query, exclude_devices),
    )


# Inputs/outputs shared by every handler that re-filters the visible tables.
coverage_filter_inputs = [
    hidden_tested_df,
    hidden_skipped_df,
    filter_coverage_devices_input,
    exclude_coverage_devices_input,
]
coverage_table_outputs = [tested_devices_table, skipped_devices_table]

# Connect release dropdown to coverage data update. The update writes the
# unfiltered tables, so the filters are re-applied afterwards; otherwise the
# visible tables would ignore filter text that is still in the textboxes.
# This mirrors how the device support table is refreshed on release change.
release_dropdown.change(
    update_coverage_data,
    inputs=[release_dropdown],
    outputs=[
        coverage_summary_text,
        tested_devices_table,
        skipped_devices_table,
        hidden_tested_df,
        hidden_skipped_df,
    ],
    queue=False,
).then(
    _filter_both_coverage_tables,
    inputs=coverage_filter_inputs,
    outputs=coverage_table_outputs,
)

# Connect filter inputs to update both tables
for input_elem in [
    filter_coverage_devices_input,
    exclude_coverage_devices_input,
]:
    input_elem.change(
        _filter_both_coverage_tables,
        inputs=coverage_filter_inputs,
        outputs=coverage_table_outputs,
    )
# Methodology Tab: renders the METHODOLOGY_TEXT constant as Markdown.
with gr.TabItem(label="Methodology", elem_id="methodology", id=8):
    gr.Markdown(value=METHODOLOGY_TEXT, elem_id="methodology-text")
# Citation section: collapsed accordion with a copyable citation text box.
# NOTE(review): the "π" in the accordion label looks like a mis-decoded
# emoji; kept verbatim - confirm against the deployed file.
with gr.Accordion(label="π Citation", open=False):
    citation_button = gr.Textbox(
        label=CITATION_BUTTON_LABEL,
        value=CITATION_BUTTON_TEXT,
        elem_id="citation-button",
        lines=7,
        show_copy_button=True,
    )
# Launch the Gradio interface
launch_options = {
    "debug": True,
    "share": True,
    "ssr_mode": False,
}
demo.launch(**launch_options)