Commit: 00dd4ff
Parent(s): 3900100
Remove quality and multilingual tabs
Files changed:
- Makefile +0 -4
- constants.py +0 -10
- dashboard_data/multilingual_confusion_matrices.json +0 -0
- dashboard_data/quality_data.json +0 -23
- main.py +12 -387
- multilingual_generate.py +0 -133
- utils.py +0 -35
Makefile
CHANGED
@@ -4,12 +4,8 @@ format:
 	@pre-commit run --all-files
 
 use-huggingface-data:
-	@python multilingual_generate.py download
 	@python performance_generate.py download
-	@python quality_generate.py
 
 use-local-data:
 	@python performance_generate.py
 
-update-performance-data:
-	@python performance_generate.py download
constants.py
CHANGED
@@ -40,11 +40,6 @@ On-device: <a href='https://github.com/argmaxinc/WhisperKit'>WhisperKit</a> (var
 <a href='https://huggingface.co/datasets/argmaxinc/librispeech'>LibriSpeech</a>: ~5 hours of short English audio clips
 <a href='https://huggingface.co/datasets/argmaxinc/earnings22'>Earnings22</a>: ~120 hours of English audio from earnings calls
 
-🌐 Multilingual Benchmarks:
-These benchmarks aim to demonstrate WhisperKit's capabilities across diverse languages, helping developers assess its suitability for multilingual applications.
-\nDataset:
-<a href='https://huggingface.co/datasets/argmaxinc/whisperkit-evals-multilingual'>Common Voice 17.0</a>: Short-form audio files (<30s/clip) for a maximum of 400 samples per language from Common Voice 17.0. Test set covers a wide range of languages to test model's versatility.
-
 \nMetrics:
 Average WER: Provides an overall measure of model performance across all languages.
 Language-specific WER: Allows for detailed analysis of model performance for each supported language.
@@ -59,7 +54,6 @@ Results are shown for both forced (correct language given as input) and unforced
 - <a href='https://github.com/argmaxinc/whisperkittools'>whisperkittools</a>
 - <a href='https://huggingface.co/datasets/argmaxinc/librispeech'>LibriSpeech</a>
 - <a href='https://huggingface.co/datasets/argmaxinc/earnings22'>Earnings22</a>
-- <a href='https://huggingface.co/datasets/argmaxinc/whisperkit-evals-multilingual'>Common Voice 17.0</a>
 - <a href='https://platform.openai.com/docs/guides/speech-to-text'>WhisperOpenAIAPI</a>
 """
 
@@ -79,14 +73,12 @@ METHODOLOGY_TEXT = dedent(
 - **WER (Word Error Rate)** (⬇️): The ratio of words incorrectly transcribed when comparing the model's output to reference transcriptions, with lower values indicating better accuracy.
 - **QoI (Quality of Inference)** (⬆️): The ratio of examples where WhisperKit performs no worse than the reference model.
 - This metric does not capture improvements to the reference. It only measures potential regressions.
-- **Multilingual results**: Separated into "language hinted" and "language predicted" categories to evaluate performance with and without prior knowledge of the input language.
 
 ## Data
 
 - **Short-form**: 5 hours of English audiobook clips with 30s/clip comprising the [librispeech test set](https://huggingface.co/datasets/argmaxinc/librispeech). Proxy for average streaming performance.
 - **Long-form**: 12 hours of earnings call recordings with ~1hr/clip in English with various accents. Built by randomly selecting 10% of the [earnings22 test set](https://huggingface.co/datasets/argmaxinc/earnings22-12hours). Proxy for average from-file performance.
 - Full datasets are used for English Quality tests and random 10-minute subsets are used for Performance tests.
-- **Multilingual**: Max 400 samples per language with <30s/clip from [Common Voice 17.0 Test Set](https://huggingface.co/datasets/argmaxinc/common_voice_17_0-argmax_subset-400). Common Voice covers 77 of the 99 languages supported by Whisper.
 
 ## Performance Measurement
 
@@ -101,7 +93,6 @@ METHODOLOGY_TEXT = dedent(
 - Performance: Interactive filtering by model, device, OS, and performance metrics
 - Timeline: Visualizations of performance trends
 - English Quality: English transcription quality on short- and long-form audio
-- Multilingual Quality: Multilingual (77) transcription quality on short-form audio with and without language prediction
 - Device Support: Matrix of supported device, OS and model version combinations. Unsupported combinations are marked with :warning:.
 - This methodology ensures a comprehensive and fair evaluation of speech recognition models supported by WhisperKit across a wide range of scenarios and use cases.
 """
@@ -141,7 +132,6 @@ COL_NAMES = {
     "device": "Device",
     "os": "OS",
     "english_wer": "English WER",
-    "multilingual_wer": "Multilingual WER",
 }
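The QoI metric that METHODOLOGY_TEXT retains above is defined as the ratio of examples where WhisperKit performs no worse than the reference model. A minimal sketch of that ratio, assuming parallel lists of per-example WER values for WhisperKit and the reference; the helper name qoi_ratio is illustrative and not part of this repository:

# Illustrative sketch, not repository code: QoI as the fraction of examples
# where WhisperKit's WER is no worse (no higher) than the reference model's.
def qoi_ratio(whisperkit_wers, reference_wers):
    pairs = list(zip(whisperkit_wers, reference_wers))
    if not pairs:
        return 0.0
    no_worse = sum(1 for wk, ref in pairs if wk <= ref)
    return no_worse / len(pairs)

# 3 of 4 examples are no worse than the reference, so QoI = 0.75
print(qoi_ratio([2.0, 5.0, 3.1, 9.0], [2.5, 5.0, 3.0, 10.0]))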
dashboard_data/multilingual_confusion_matrices.json
DELETED
The diff for this file is too large to render.
dashboard_data/quality_data.json
DELETED
@@ -1,23 +0,0 @@
-{"model": "openai/whisper-large-v3/947MB", "timestamp": "2024-10-18_16:59:10_GMT-0700", "average_wer": 9.74, "dataset_wer": {"librispeech": 2.41, "earnings22-12hours": 17.08}, "qoi": 0.94}
-{"model": "openai/whisper-large-v2/turbo/955MB", "timestamp": "2024-10-18_16:52:35_GMT-0700", "average_wer": 7.27, "dataset_wer": {"librispeech": 2.4, "earnings22-12hours": 12.14}, "qoi": 0.94}
-{"model": "openai/whisper-tiny.en", "timestamp": "2024-10-19_15:40:06_GMT-0700", "average_wer": 12.23, "dataset_wer": {"librispeech": 5.61, "earnings22-12hours": 18.86}, "qoi": 0.63}
-{"model": "distil-whisper/distil-large-v3/594MB", "timestamp": "2024-10-20_13:02:33_GMT-0700", "average_wer": 8.96, "dataset_wer": {"librispeech": 2.87, "earnings22-12hours": 15.06}, "qoi": 0.86}
-{"model": "openai/whisper-large-v2/949MB", "timestamp": "2024-10-18_19:51:30_GMT-0400", "average_wer": 7.88, "dataset_wer": {"librispeech": 2.38, "earnings22-12hours": 13.39}, "qoi": 0.94}
-{"model": "openai/whisper-large-v3/turbo/954MB", "timestamp": "2024-10-20_13:49:26_GMT-0700", "average_wer": 22.75, "dataset_wer": {"librispeech": 2.51, "earnings22-12hours": 43.0}, "qoi": 0.93}
-{"model": "distil-whisper/distil-large-v3", "timestamp": "2024-10-20_20:32:22_GMT-0700", "average_wer": 7.2, "dataset_wer": {"librispeech": 2.38, "earnings22-12hours": 12.02}, "qoi": 0.9}
-{"model": "openai/whisper-large-v3-v20240930", "timestamp": "2024-10-18_18:35:46_GMT-0700", "average_wer": 6.74, "dataset_wer": {"librispeech": 1.93, "earnings22-12hours": 11.55}, "qoi": 0.94}
-{"model": "openai/whisper-tiny", "timestamp": "2024-10-20_20:19:04_GMT-0700", "average_wer": 14.21, "dataset_wer": {"librispeech": 7.46, "earnings22-12hours": 20.97}, "qoi": 0.52}
-{"model": "openai/whisper-large-v3-v20240930/turbo/632MB", "timestamp": "2024-10-18_20:10:30_GMT-0700", "average_wer": 6.86, "dataset_wer": {"librispeech": 1.95, "earnings22-12hours": 11.77}, "qoi": 0.93}
-{"model": "openai/whisper-large-v2/turbo", "timestamp": "2024-10-18_14:58:38_GMT-0700", "average_wer": 7.25, "dataset_wer": {"librispeech": 2.4, "earnings22-12hours": 12.1}, "qoi": 0.96}
-{"model": "openai/whisper-small", "timestamp": "2024-10-18_12:40:03_GMT-0700", "average_wer": 8.11, "dataset_wer": {"librispeech": 3.21, "earnings22-12hours": 13.0}, "qoi": 0.83}
-{"model": "openai/whisper-large-v3-v20240930/turbo", "timestamp": "2024-10-18_19:37:26_GMT-0700", "average_wer": 6.72, "dataset_wer": {"librispeech": 1.92, "earnings22-12hours": 11.52}, "qoi": 0.94}
-{"model": "openai/whisper-large-v3", "timestamp": "2024-10-18_18:01:14_GMT-0400", "average_wer": 6.85, "dataset_wer": {"librispeech": 2.02, "earnings22-12hours": 11.69}, "qoi": 0.95}
-{"model": "openai/whisper-large-v3-v20240930/626MB", "timestamp": "2024-10-18_19:21:06_GMT-0700", "average_wer": 7.15, "dataset_wer": {"librispeech": 1.96, "earnings22-12hours": 12.35}, "qoi": 0.93}
-{"model": "openai/whisper-base.en", "timestamp": "2024-10-20_12:31:44_GMT-0700", "average_wer": 9.59, "dataset_wer": {"librispeech": 3.98, "earnings22-12hours": 15.2}, "qoi": 0.75}
-{"model": "openai/whisper-large-v3-v20240930/547MB", "timestamp": "2024-10-18_21:59:11_GMT-0400", "average_wer": 16.82, "dataset_wer": {"librispeech": 2.16, "earnings22-12hours": 31.49}, "qoi": 0.92}
-{"model": "distil-whisper/distil-large-v3/turbo/600MB", "timestamp": "2024-10-18_17:50:17_GMT-0700", "average_wer": 8.33, "dataset_wer": {"librispeech": 2.8, "earnings22-12hours": 13.87}, "qoi": 0.86}
-{"model": "openai/whisper-large-v2", "timestamp": "2024-10-18_17:07:15_GMT-0400", "average_wer": 7.32, "dataset_wer": {"librispeech": 2.36, "earnings22-12hours": 12.28}, "qoi": 0.97}
-{"model": "openai/whisper-small.en", "timestamp": "2024-10-18_15:39:48_GMT-0400", "average_wer": 7.85, "dataset_wer": {"librispeech": 2.88, "earnings22-12hours": 12.82}, "qoi": 0.86}
-{"model": "distil-whisper/distil-large-v3/turbo", "timestamp": "2024-10-20_12:45:20_GMT-0700", "average_wer": 7.2, "dataset_wer": {"librispeech": 2.35, "earnings22-12hours": 12.05}, "qoi": 0.9}
-{"model": "openai/whisper-base", "timestamp": "2024-10-18_20:25:50_GMT-0700", "average_wer": 10.67, "dataset_wer": {"librispeech": 4.94, "earnings22-12hours": 16.4}, "qoi": 0.67}
-{"model": "openai/whisper-large-v3/turbo", "timestamp": "2024-10-20_16:58:25_GMT-0400", "average_wer": 6.86, "dataset_wer": {"librispeech": 1.97, "earnings22-12hours": 11.74}, "qoi": 0.95}
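The deleted quality_data.json stored one JSON object per line, which main.py previously loaded via read_json_line_by_line. A sketch of reading that newline-delimited format with the standard library plus pandas; this illustrates the file layout shown above and is not the repository's read_json_line_by_line implementation:

# Illustrative sketch, not repository code: parse newline-delimited JSON
# records and flatten the nested "dataset_wer" object into columns.
import json
import pandas as pd

def load_quality_records(path):
    with open(path, "r") as f:
        records = [json.loads(line) for line in f if line.strip()]
    # json_normalize yields columns such as dataset_wer.librispeech
    return pd.json_normalize(records)

# Example usage (path assumes the pre-deletion file layout):
# df = load_quality_records("dashboard_data/quality_data.json")
# print(df[["model", "average_wer", "qoi"]].sort_values("average_wer"))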
main.py
CHANGED
@@ -23,24 +23,16 @@ from constants import (
     CITATION_BUTTON_TEXT,
     COL_NAMES,
     HEADER,
-    LANGUAGE_MAP,
     METHODOLOGY_TEXT,
     PERFORMANCE_TEXT,
-    QUALITY_TEXT,
-    # SHA_TO_VERSION,
 )
 from utils import (
     add_datasets_to_performance_columns,
-    add_datasets_to_quality_columns,
-    create_confusion_matrix_plot,
     create_initial_performance_column_dict,
-    create_initial_quality_column_dict,
     css,
     fields,
     get_os_name_and_version,
-    make_dataset_wer_clickable_link,
     make_model_name_clickable_link,
-    make_multilingual_model_clickable_link,
     plot_metric,
     read_json_line_by_line,
 )
@@ -61,72 +53,23 @@ local_dir = ""
 
 # Load benchmark data from JSON files
 PERFORMANCE_DATA = read_json_line_by_line("dashboard_data/performance_data.json")
-QUALITY_DATA = read_json_line_by_line("dashboard_data/quality_data.json")
 with open("dashboard_data/version.json", "r") as file:
     VERSION_DATA = json.load(file)
 
 SHA_TO_VERSION = {
-    VERSION_DATA["releases"][i]: VERSION_DATA["versions"][i]
+    VERSION_DATA["releases"][i]: VERSION_DATA["versions"][i]
+    for i in range(len(VERSION_DATA["versions"]))
 }
 
-# Convert JSON data to pandas DataFrames
-quality_df = pd.json_normalize(QUALITY_DATA)
+# Convert JSON data to pandas DataFrames - performance only
 benchmark_df = pd.json_normalize(PERFORMANCE_DATA)
 releases = VERSION_DATA["releases"]
 
 # Process timestamp data
-benchmark_df["timestamp"] = pd.to_datetime(benchmark_df["timestamp"]).dt.tz_localize(
-    None
-)
-benchmark_df["timestamp"] = pd.to_datetime(benchmark_df["timestamp"]).dt.tz_localize(
-    None
-)
-
-# First create a temporary column for model length
-sorted_quality_df = (
-    quality_df.assign(model_len=quality_df["model"].str.len())
-    .sort_values(
-        by=["model_len", "model", "timestamp"],
-        ascending=[True, True, False],
-    )
-    .drop(columns=["model_len"])
-    .drop_duplicates(subset=["model"], keep="first")
-    .reset_index(drop=True)
-)
-
-multilingual_df = pd.read_csv("dashboard_data/multilingual_results.csv")
-multilingual_models_df = multilingual_df[["Model"]].drop_duplicates()
-multilingual_models_buttons = []
-for model in multilingual_models_df["Model"]:
-    elem_id = (
-        f"{model}".replace(" ", "_").replace('"', "").replace("'", "").replace(",", "")
-    )
-    multilingual_models_buttons.append(
-        gr.Button(value=model, elem_id=elem_id, visible=False)
-    )
-multilingual_models_df["Model"] = multilingual_models_df["Model"].apply(
-    lambda x: make_multilingual_model_clickable_link(x)
-)
-
-with open("dashboard_data/multilingual_confusion_matrices.json", "r") as file:
-    confusion_matrix_map = dict(json.load(file))
-
-# Create a mapping of model to average WER
-model_to_english_wer = dict(zip(sorted_quality_df["model"], sorted_quality_df["average_wer"]))
-model_to_multilingual_wer = dict(
-    zip(multilingual_df["Model"], multilingual_df["Average WER"])
-)
-
-# Add English WER and Multilingual WER to performance_df
-benchmark_df["english_wer"] = benchmark_df["model"].map(model_to_english_wer)
-benchmark_df["multilingual_wer"] = benchmark_df["model"].map(model_to_multilingual_wer)
-benchmark_df.fillna({"multilingual_wer": "N/A"}, inplace=True)  # Mark all untested models as N/A
-
-# Mark English-only models
-english_only_mask = benchmark_df["model"].str.contains(r"\.en$|distil-whisper", case=False, na=False)
-benchmark_df.loc[english_only_mask, "multilingual_wer"] = "English-only model"
+benchmark_df["timestamp"] = pd.to_datetime(benchmark_df["timestamp"]).dt.tz_localize(None)
 
-
+# Use average_wer directly from performance data
+benchmark_df["english_wer"] = benchmark_df["average_wer"]
 
 sorted_performance_df = (
     benchmark_df.assign(model_len=benchmark_df["model"].str.len())
@@ -140,9 +83,6 @@ sorted_performance_df = (
 )
 
 # Identify dataset-specific columns
-dataset_wer_columns = [
-    col for col in sorted_quality_df.columns if col.startswith("dataset_wer.")
-]
 dataset_speed_columns = [
     col for col in sorted_performance_df.columns if col.startswith("dataset_speed.")
 ]
@@ -153,20 +93,15 @@ dataset_toks_columns = [
 ]
 
 # Extract dataset names
-QUALITY_DATASETS = [col.split(".")[-1] for col in dataset_wer_columns]
 PERFORMANCE_DATASETS = [col.split(".")[-1] for col in dataset_speed_columns]
 
 # Prepare DataFrames for display
-model_df = sorted_quality_df[
-    ["model", "average_wer", "qoi", "timestamp"] + dataset_wer_columns
-]
 performance_df = sorted_performance_df[
     [
         "model",
         "device",
         "os",
         "english_wer",
-        "multilingual_wer",
         "qoi",
         "speed",
         "tokens_per_second",
@@ -181,18 +116,8 @@ performance_df = sorted_performance_df[
 performance_df = performance_df.rename(
     lambda x: COL_NAMES[x] if x in COL_NAMES else x, axis="columns"
 )
-model_df = model_df.rename(
-    lambda x: COL_NAMES[x] if x in COL_NAMES else x, axis="columns"
-)
 
 # Process dataset-specific columns
-for col in dataset_wer_columns:
-    dataset_name = col.split(".")[-1]
-    model_df = model_df.rename(columns={col: dataset_name})
-    model_df[dataset_name] = model_df.apply(
-        lambda x: make_dataset_wer_clickable_link(x, dataset_name), axis=1
-    )
-
 for col in dataset_speed_columns:
     dataset_name = col.split(".")[-1]
     performance_df = performance_df.rename(
@@ -210,12 +135,8 @@ for col in dataset_toks_columns:
     )
 
 # Process model names for display
-model_df["model_raw"] = model_df["Model"].copy()
 performance_df["model_raw"] = performance_df["Model"].copy()
-
-performance_df["Model"] = performance_df["Model"].apply(
-    lambda x: make_model_name_clickable_link(x)
-)
+performance_df["Model"] = performance_df["Model"].apply(lambda x: make_model_name_clickable_link(x))
 
 # Extract unique devices and OS versions
 initial_release_df = benchmark_df[benchmark_df["commit_hash"] == releases[-1]]
@@ -225,33 +146,22 @@ PERFORMANCE_OS.sort()
 
 # Create initial column dictionaries and update with dataset information
 initial_performance_column_dict = create_initial_performance_column_dict()
-initial_quality_column_dict = create_initial_quality_column_dict()
 
 performance_column_info = add_datasets_to_performance_columns(
     initial_performance_column_dict, PERFORMANCE_DATASETS
 )
-quality_column_info = add_datasets_to_quality_columns(
-    initial_quality_column_dict, QUALITY_DATASETS
-)
 
 # Unpack the returned dictionaries
 updated_performance_column_dict = performance_column_info["column_dict"]
-updated_quality_column_dict = quality_column_info["column_dict"]
 
 PerformanceAutoEvalColumn = performance_column_info["AutoEvalColumn"]
-QualityAutoEvalColumn = quality_column_info["AutoEvalColumn"]
 
 # Define column sets for different views
 PERFORMANCE_COLS = performance_column_info["COLS"]
-QUALITY_COLS = quality_column_info["COLS"]
 PERFORMANCE_TYPES = performance_column_info["TYPES"]
-QUALITY_TYPES = quality_column_info["TYPES"]
 PERFORMANCE_ALWAYS_HERE_COLS = performance_column_info["ALWAYS_HERE_COLS"]
-QUALITY_ALWAYS_HERE_COLS = quality_column_info["ALWAYS_HERE_COLS"]
 PERFORMANCE_TOGGLE_COLS = performance_column_info["TOGGLE_COLS"]
-QUALITY_TOGGLE_COLS = quality_column_info["TOGGLE_COLS"]
 PERFORMANCE_SELECTED_COLS = performance_column_info["SELECTED_COLS"]
-QUALITY_SELECTED_COLS = quality_column_info["SELECTED_COLS"]
 
 def get_release_devices(release):
     """
@@ -367,55 +277,6 @@ def performance_filter(
     return filtered_df
 
 
-def quality_filter(df, columns, model_query, wer_slider, qoi_slider, exclude_models):
-    """
-    Filters the quality DataFrame based on specified criteria.
-    :param df: The DataFrame to be filtered.
-    :param columns: The columns to be included in the filtered DataFrame.
-    :param model_query: The query string to filter the 'Model' column.
-    :param wer_slider: The range of values to filter the 'Average WER' column.
-    :param qoi_slider: The range of values to filter the 'QoI' column.
-    :param exclude_models: Models to exclude from the results.
-    :return: The filtered DataFrame.
-    """
-    # Select columns based on input and always-present columns
-    filtered_df = df[
-        QUALITY_ALWAYS_HERE_COLS
-        + [c for c in QUALITY_COLS if c in df.columns and c in columns]
-    ]
-
-    # Filter models based on query
-    if model_query:
-        filtered_df = filtered_df[
-            filtered_df["Model"].str.contains(
-                "|".join(q.strip() for q in model_query.split(";")), case=False
-            )
-        ]
-
-    # Exclude specified models
-    if exclude_models:
-        exclude_list = [m.strip() for m in exclude_models.split(";")]
-        filtered_df = filtered_df[
-            ~filtered_df["Model"].str.contains("|".join(exclude_list), case=False)
-        ]
-
-    # Apply WER and QoI filters
-    min_wer_slider, max_wer_slider = wer_slider
-    min_qoi_slider, max_qoi_slider = qoi_slider
-    if "Average WER" in filtered_df.columns:
-        filtered_df = filtered_df[
-            (filtered_df["Average WER"] >= min_wer_slider)
-            & (filtered_df["Average WER"] <= max_wer_slider)
-        ]
-    if "QoI" in filtered_df.columns:
-        filtered_df = filtered_df[
-            (filtered_df["QoI"] >= min_qoi_slider)
-            & (filtered_df["QoI"] <= max_qoi_slider)
-        ]
-
-    return filtered_df
-
-
 def update_performance_filters(release):
     """
     Updates the performance filters (devices and OS) based on the selected release.
@@ -481,96 +342,6 @@ text_diff_elems = []
 
 tabs = gr.Tabs(elem_id="tab-elems")
 
-
-def update_multilingual_results(selected_model):
-    """
-    Updates the multilingual results display based on the selected model.
-
-    This function processes the multilingual data for the chosen model,
-    calculates average WER for different scenarios (language hinted vs. predicted),
-    and prepares language-specific WER data for display.
-
-    :param selected_model: The name of the selected model
-    :return: A list containing updated components for the Gradio interface
-    """
-    if selected_model is None:
-        return "# Select a model from the dropdown to view results."
-
-    # Filter data for the selected model
-    model_data = multilingual_df[multilingual_df["Model"] == selected_model]
-
-    if model_data.empty:
-        return f"# No data available for model: {selected_model}"
-
-    # Separate data for forced and not forced scenarios
-    forced_data = model_data[model_data["Forced Tokens"] == True]
-    not_forced_data = model_data[model_data["Forced Tokens"] == False]
-
-    result_text = f"# Model: {selected_model}\n\n"
-
-    # Prepare average WER data
-    average_wer_data = []
-    if not forced_data.empty:
-        average_wer_data.append(
-            {
-                "Scenario": "Language Hinted",
-                "Average WER": forced_data.iloc[0]["Average WER"],
-            }
-        )
-    if not not_forced_data.empty:
-        average_wer_data.append(
-            {
-                "Scenario": "Language Predicted",
-                "Average WER": not_forced_data.iloc[0]["Average WER"],
-            }
-        )
-    average_wer_df = pd.DataFrame(average_wer_data)
-    average_wer_df["Average WER"] = average_wer_df["Average WER"].apply(
-        lambda x: round(x, 2)
-    )
-
-    # Prepare language-specific WER data
-    lang_columns = [col for col in model_data.columns if col.startswith("WER_")]
-    lang_wer_data = []
-    for column in lang_columns:
-        lang = column.split("_")[1]
-        forced_wer = forced_data[column].iloc[0] if not forced_data.empty else None
-        not_forced_wer = (
-            not_forced_data[column].iloc[0] if not not_forced_data.empty else None
-        )
-        if forced_wer is not None or not_forced_wer is not None:
-            lang_wer_data.append(
-                {
-                    "Language": LANGUAGE_MAP[lang],
-                    "Language Hinted WER": round(forced_wer, 2)
-                    if forced_wer is not None
-                    else "N/A",
-                    "Language Predicted WER": round(not_forced_wer, 2)
-                    if not_forced_wer is not None
-                    else "N/A",
-                }
-            )
-    lang_wer_df = pd.DataFrame(lang_wer_data)
-    lang_wer_df = lang_wer_df.fillna("No Data")
-
-    # Create confusion matrix plot for unforced scenario
-    unforced_plot = None
-    if selected_model in confusion_matrix_map:
-        if "not_forced" in confusion_matrix_map[selected_model]:
-            unforced_plot = create_confusion_matrix_plot(
-                confusion_matrix_map[selected_model]["not_forced"]["matrix"],
-                confusion_matrix_map[selected_model]["not_forced"]["labels"],
-                False,
-            )
-
-    # Return updated components for Gradio interface
-    return [
-        gr.update(value=result_text),
-        gr.update(visible=True, value=average_wer_df),
-        gr.update(visible=True, value=lang_wer_df),
-        gr.update(visible=unforced_plot is not None, value=unforced_plot),
-    ]
-
 font = [
     "Zwizz Regular",  # Local font
     "IBM Plex Mono",  # Monospace font
@@ -579,6 +350,9 @@ font = [
     "sans-serif",
 ]
 
+# Macos 14, 15, 26
+# ios 17, 18, 26
+
 # Define the Gradio interface
 with gr.Blocks(css=css, theme=gr.themes.Base(font=font)) as demo:
     # Add header and banner to the interface
@@ -596,7 +370,7 @@ with gr.Blocks(css=css, theme=gr.themes.Base(font=font)) as demo:
     # Create tabs for different sections of the dashboard
    with tabs.render():
        # Performance Tab
-        with gr.TabItem("
+        with gr.TabItem("Benchmark", elem_id="benchmark", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
@@ -844,106 +618,6 @@
                outputs=filter_output
            )

-            # English Quality Tab
-            with gr.TabItem("English Quality", elem_id="timeline", id=1):
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        with gr.Row():
-                            with gr.Column(scale=6, elem_classes="filter_models_column"):
-                                filter_quality_models = gr.Textbox(
-                                    placeholder="🔍 Filter Model (separate multiple queries with ';')",
-                                    label="Filter Models",
-                                )
-                            with gr.Column(scale=4, elem_classes="exclude_models_column"):
-                                exclude_quality_models = gr.Textbox(
-                                    placeholder="🔍 Exclude Model",
-                                    label="Exclude Model",
-                                )
-                        with gr.Row():
-                            with gr.Accordion("See All Columns", open=False):
-                                quality_shown_columns = gr.CheckboxGroup(
-                                    choices=QUALITY_TOGGLE_COLS,
-                                    value=QUALITY_SELECTED_COLS,
-                                    label="Toggle Columns",
-                                    elem_id="column-select",
-                                    interactive=True,
-                                )
-                    with gr.Column(scale=1):
-                        with gr.Accordion("See Quality Filters"):
-                            with gr.Row():
-                                with gr.Row():
-                                    quality_min_avg_wer, quality_max_avg_wer = (
-                                        floor(min(model_df["Average WER"])),
-                                        ceil(max(model_df["Average WER"])) + 1,
-                                    )
-                                    wer_slider = RangeSlider(
-                                        value=[quality_min_avg_wer, quality_max_avg_wer],
-                                        minimum=quality_min_avg_wer,
-                                        maximum=quality_max_avg_wer,
-                                        label="Average WER",
-                                    )
-                                with gr.Row():
-                                    quality_min_qoi, quality_max_qoi = floor(
-                                        min(model_df["QoI"])
-                                    ), ceil(max(model_df["QoI"] + 1))
-                                    qoi_slider = RangeSlider(
-                                        value=[quality_min_qoi, quality_max_qoi],
-                                        minimum=quality_min_qoi,
-                                        maximum=quality_max_qoi,
-                                        label="QoI",
-                                    )
-                with gr.Row():
-                    gr.Markdown(QUALITY_TEXT)
-                with gr.Row():
-                    quality_leaderboard_df = gr.components.Dataframe(
-                        value=model_df[
-                            QUALITY_ALWAYS_HERE_COLS + quality_shown_columns.value
-                        ],
-                        headers=[QUALITY_ALWAYS_HERE_COLS + quality_shown_columns.value],
-                        datatype=[
-                            c.type
-                            for c in fields(QualityAutoEvalColumn)
-                            if c.name in QUALITY_COLS
-                        ],
-                        elem_id="leaderboard-table",
-                        elem_classes="large-table",
-                        interactive=False,
-                    )
-
-                # Copy of the leaderboard dataframe to apply filters to
-                hidden_quality_leaderboard_df = gr.components.Dataframe(
-                    value=model_df,
-                    headers=QUALITY_COLS,
-                    datatype=[
-                        c.type
-                        for c in fields(QualityAutoEvalColumn)
-                        if c.name in QUALITY_COLS
-                    ],
-                    visible=False,
-                )
-
-                # Inputs for the dataframe filter function
-                filter_inputs = [
-                    hidden_quality_leaderboard_df,
-                    quality_shown_columns,
-                    filter_quality_models,
-                    wer_slider,
-                    qoi_slider,
-                    exclude_quality_models,
-                ]
-                filter_output = quality_leaderboard_df
-                filter_quality_models.change(
-                    quality_filter, filter_inputs, filter_output
-                )
-                exclude_quality_models.change(
-                    quality_filter, filter_inputs, filter_output
-                )
-                quality_shown_columns.change(
-                    quality_filter, filter_inputs, filter_output
-                )
-                wer_slider.change(quality_filter, filter_inputs, filter_output)
-                qoi_slider.change(quality_filter, filter_inputs, filter_output)
-
            # Timeline Tab
            with gr.TabItem("Timeline", elem_id="timeline", id=4):
                # Create subtabs for different metrics
@@ -1204,55 +878,6 @@
                    toks_plot,
                )

-            # Multilingual Quality Tab
-            with gr.TabItem("Multilingual Quality", elem_id="multilingual", id=5):
-                if multilingual_df is not None:
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            # Display table of multilingual models
-                            model_table = gr.Dataframe(
-                                value=multilingual_models_df,
-                                headers=["Model"],
-                                datatype=["html"],
-                                elem_classes="left-side-table",
-                            )
-                            # Placeholders for confusion matrix plots
-                            with gr.Row():
-                                unforced_confusion_matrix = gr.Plot(visible=False)
-                            with gr.Row():
-                                forced_confusion_matrix = gr.Plot(visible=False)
-
-                        with gr.Column(scale=1):
-                            # Display area for selected model results
-                            results_markdown = gr.Markdown(
-                                "# Select a model from the table on the left to view results.",
-                                elem_id="multilingual-results",
-                            )
-                            # Tables for displaying average WER and language-specific WER
-                            average_wer_table = gr.Dataframe(
-                                value=None, elem_id="average-wer-table", visible=False
-                            )
-                            language_wer_table = gr.Dataframe(
-                                value=None, elem_id="general-wer-table", visible=False
-                            )
-
-                    # Set up click event to update results when a model is selected
-                    for button in multilingual_models_buttons:
-                        button.render()
-                        button.click(
-                            fn=lambda x: update_multilingual_results(x),
-                            inputs=[button],
-                            outputs=[
-                                results_markdown,
-                                average_wer_table,
-                                language_wer_table,
-                                unforced_confusion_matrix,
-                            ],
-                        )
-                else:
-                    # Display message if no multilingual data is available
-                    gr.Markdown("No multilingual benchmark results available.")
-
            # Device Support Tab
            with gr.TabItem("Device Support", elem_id="device_support", id=6):
                # Load device support data from CSV
@@ -1433,4 +1058,4 @@
 )
 
 # Launch the Gradio interface
-demo.launch(debug=True
+demo.launch(debug=True)
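One incidental fix inside the main.py hunks above: the old SHA_TO_VERSION dict literal references an index i that is not bound anywhere in the lines shown, and the new code completes it into a dict comprehension over the parallel releases and versions lists. An equivalent construction with zip, using made-up placeholder values:

# Illustrative sketch with placeholder data: build the same mapping with zip
# instead of an index-based comprehension. Values below are not real releases.
VERSION_DATA = {
    "releases": ["sha_aaa", "sha_bbb"],
    "versions": ["v1.0", "v1.1"],
}
SHA_TO_VERSION = dict(zip(VERSION_DATA["releases"], VERSION_DATA["versions"]))
print(SHA_TO_VERSION)  # {'sha_aaa': 'v1.0', 'sha_bbb': 'v1.1'}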
multilingual_generate.py
DELETED
@@ -1,133 +0,0 @@
-import json
-import os
-import shutil
-import sys
-from collections import defaultdict
-
-import numpy as np
-import pandas as pd
-from sklearn.metrics import confusion_matrix
-
-from utils import compute_average_wer, download_dataset
-
-
-def main():
-    """
-    Main function to orchestrate the multilingual data generation process.
-
-    This function performs the following steps:
-    1. Downloads multilingual evaluation data if requested.
-    2. Processes multilingual evaluation files.
-    3. Calculates and saves results, including Word Error Rate (WER) and
-    language detection confusion matrices.
-    """
-    source_repo = "argmaxinc/whisperkit-evals-multilingual"
-    source_subfolder = "WhisperKit"
-    source_directory = f"{source_repo}/{source_subfolder}"
-    if len(sys.argv) > 1 and sys.argv[1] == "download":
-        try:
-            shutil.rmtree(source_repo)
-        except:
-            print("Nothing to remove.")
-        download_dataset(source_repo, source_repo, source_subfolder)
-
-    results = defaultdict(
-        lambda: {
-            "average_wer": [],
-            "language_wer": defaultdict(list),
-            "language_detection": [],
-        }
-    )
-
-    confusion_matrices = {}
-
-    for subdir, _, files in os.walk(source_directory):
-        for filename in files:
-            if not filename.endswith(".json") or "summary" in filename:
-                continue
-
-            file_path = os.path.join(subdir, filename)
-            with open(file_path, "r") as f:
-                data = json.load(f)
-
-            subdir_components = subdir.split(os.path.sep)
-            is_forced = "forced" in subdir_components
-            model = subdir_components[-3] if not is_forced else subdir_components[-4]
-
-            key = f"{model}/{'forced' if is_forced else 'not_forced'}"
-
-            for item in data["results"]:
-                if "reference_language" not in item:
-                    continue
-                reference_language = item["reference_language"]
-                wer = item["wer"]
-                detected_language = item["predicted_language"]
-
-                result = {
-                    "reference": item["reference"],
-                    "prediction": item["prediction"],
-                }
-
-                results[key]["average_wer"].append(result)
-                results[key]["language_wer"][reference_language].append(result)
-                results[key]["language_detection"].append(
-                    (reference_language, detected_language)
-                )
-
-    calculate_and_save_results(results, confusion_matrices)
-
-
-def calculate_and_save_results(results, confusion_matrices):
-    """
-    Calculates final multilingual metrics and saves them to CSV and JSON files.
-
-    :param results: Dictionary containing raw multilingual evaluation data.
-    :param confusion_matrices: Dictionary to store confusion matrices for language detection.
-
-    This function processes the raw multilingual data, calculates average metrics,
-    creates confusion matrices for language detection, and saves the results to:
-    1. A CSV file with WER data for each model and language.
-    2. A JSON file with confusion matrices for language detection.
-    """
-    wer_data = []
-    for key, data in results.items():
-        model, forced = key.rsplit("/", 1)
-        model = model.replace("_", "/")
-        row = {
-            "Model": model,
-            "Forced Tokens": forced == "forced",
-            "Average WER": compute_average_wer(data["average_wer"]),
-        }
-        for lang, wers in data["language_wer"].items():
-            row[f"WER_{lang}"] = compute_average_wer(wers)
-        wer_data.append(row)
-
-        true_languages, detected_languages = zip(*data["language_detection"])
-        unique_languages = sorted(set(true_languages))
-        cm = confusion_matrix(
-            true_languages, detected_languages, labels=unique_languages
-        )
-
-        row_sums = cm.sum(axis=1)
-        cm_normalized = np.zeros_like(cm, dtype=float)
-        non_zero_rows = row_sums != 0
-        cm_normalized[non_zero_rows] = (
-            cm[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
-        )
-
-        if model not in confusion_matrices:
-            confusion_matrices[model] = {}
-        confusion_matrices[model][forced] = {
-            "matrix": cm_normalized.tolist(),
-            "labels": unique_languages,
-        }
-
-    df = pd.DataFrame(wer_data)
-    df.to_csv("dashboard_data/multilingual_results.csv", index=False)
-
-    with open("dashboard_data/multilingual_confusion_matrices.json", "w") as f:
-        json.dump(confusion_matrices, f, indent=2)
-
-
-if __name__ == "__main__":
-    main()
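For reference, the deleted calculate_and_save_results row-normalizes the language-detection confusion matrix and leaves all-zero rows (languages never seen as reference) untouched. A small self-contained example of that normalization step, with made-up counts:

# Worked example of the row normalization used by the deleted script:
# divide each row by its sum, skipping rows whose sum is zero.
import numpy as np

cm = np.array([[8, 2],
               [0, 0]])  # second row: language never appeared as reference
row_sums = cm.sum(axis=1)
cm_normalized = np.zeros_like(cm, dtype=float)
non_zero_rows = row_sums != 0
cm_normalized[non_zero_rows] = cm[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
print(cm_normalized)  # rows: [0.8, 0.2] and [0.0, 0.0]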
utils.py
CHANGED
@@ -84,23 +84,6 @@ def group_wer(group):
     )
 
 
-def load_multilingual_results(csv_file):
-    """
-    Load multilingual results from a CSV file into a pandas DataFrame.
-
-    :param csv_file: Path to the CSV file containing multilingual results
-    :return: DataFrame with the loaded results, or None if the file is not found
-
-    This function attempts to load a CSV file using pandas, handling potential
-    FileNotFoundError exceptions.
-    """
-    try:
-        df = pd.json_normalize(csv_file)
-        return df
-    except FileNotFoundError:
-        return None
-
-
 def download_dataset(repo_id, local_dir, remote_dir, path_includes=""):
     """
     Download benchmark result files from a specified Hugging Face repository to a local directory.
@@ -365,23 +348,6 @@ def make_timestamp_clickable_link(model, dataset, timestamp):
     return f'<div style="color: #3B82F6; text-decoration: underline; text-decoration-style: dotted;" {onclick} href="#">{timestamp}</div>'
 
 
-def make_multilingual_model_clickable_link(model):
-    """
-    Creates a clickable link for a multilingual model name.
-
-    :param model: String representing the model name
-    :return: An HTML string containing a clickable div for the model name
-
-    This function generates a formatted HTML div that can be used as a clickable
-    element in web interfaces, typically for displaying and interacting with multilingual model names.
-    """
-    elem_id = (
-        f"{model}".replace(" ", "_").replace('"', "").replace("'", "").replace(",", "")
-    )
-    onclick = f"onclick=\"document.getElementById('{elem_id}').click();console.log('hello');\""
-    return f'<div style="color: #3B82F6; text-decoration: underline; text-decoration-style: dotted;" {onclick} href="#">{model}</div>'
-
-
 def plot_metric(
     df, y_axis_col, y_axis_title, fig_title, filter_input=None, exclude_input=None
 ):
@@ -560,7 +526,6 @@ def create_initial_performance_column_dict():
         ],
         ["os", ColumnContent, ColumnContent("OS", "html", True, never_hidden=True)],
         ["english_wer", ColumnContent, ColumnContent("English WER", "html", True)],
-        ["multilingual_wer", ColumnContent, ColumnContent("Multilingual WER", "str", True)],
         ["qoi", ColumnContent, ColumnContent("QoI", "html", False)],
         ["speed", ColumnContent, ColumnContent("Speed", "html", False)],
         ["toks", ColumnContent, ColumnContent("Tok / s", "html", False)],