davidpomerenke committed on
Commit
a0d1624
·
verified ·
1 Parent(s): c790fdb

Upload from GitHub Actions: Merge pull request #18 from datenlabor-bmz/pr-17

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.github/workflows/nightly-evals.yml CHANGED
@@ -8,6 +8,8 @@ on:
8
  jobs:
9
  run-evals:
10
  runs-on: ubuntu-latest
 
 
11
  steps:
12
  - uses: actions/checkout@v3
13
 
@@ -25,6 +27,8 @@ jobs:
25
  env:
26
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
27
  HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
 
 
28
  run: |
29
  uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
30
  uv run evals/download_data.py
 
8
  jobs:
9
  run-evals:
10
  runs-on: ubuntu-latest
11
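+ # Experiment: raise the job timeout in case eval runs exceed 6 hours. NOTE(review): GitHub-hosted runners reportedly cap each job at 6 hours regardless of timeout-minutes (confirm); a self-hosted runner may be required to actually run longer.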
12
+ timeout-minutes: 1440 # 24 hours timeout
13
  steps:
14
  - uses: actions/checkout@v3
15
 
 
27
  env:
28
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
29
  HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
30
+ N_SENTENCES: 20
31
+ MAX_LANGUAGES: 150
32
  run: |
33
  uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
34
  uv run evals/download_data.py
.gitignore CHANGED
@@ -20,3 +20,5 @@ wheels/
20
  # folders and files to be ignored
21
  .specstory/
22
  .cursorindexingignore
 
 
 
20
  # folders and files to be ignored
21
  .specstory/
22
  .cursorindexingignore
23
+
24
+
Dockerfile CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
14
  RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
15
  USER user
16
  WORKDIR $HOME/app
17
- COPY --chown=user pyproject.toml uv.lock ./
18
  RUN uv sync --frozen --no-dev
19
  COPY --chown=user evals/ evals/
20
  COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
 
14
  RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
15
  USER user
16
  WORKDIR $HOME/app
17
+ COPY --chown=user pyproject.toml uv.lock README.md ./
18
  RUN uv sync --frozen --no-dev
19
  COPY --chown=user evals/ evals/
20
  COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md CHANGED
@@ -43,8 +43,21 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
43
 
44
  _Tracking language proficiency of AI models for every language_
45
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  ## Evaluate
47
 
 
48
  ```bash
49
  uv run --extra dev evals/main.py
50
  ```
 
43
 
44
  _Tracking language proficiency of AI models for every language_
45
 
46
+ ## System Architecture
47
+
48
+ The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
49
+
50
+ > **Detailed Architecture**: See [system_architecture_diagram.md](system_architecture_diagram.md) for the complete system architecture diagram and component descriptions.
51
+
52
+ **Key Features:**
53
+ - **Model Discovery**: Combines curated models with real-time trending models via web scraping
54
+ - **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
55
+ - **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
56
+ - **Real-time Visualization**: Interactive web interface with country-level insights
57
+
58
  ## Evaluate
59
 
60
+ ### Local Development
61
  ```bash
62
  uv run --extra dev evals/main.py
63
  ```
datasets.json CHANGED
@@ -219,7 +219,7 @@
219
  "parallel": true,
220
  "translation": "machine",
221
  "base": "MMLU",
222
- "implemented": true,
223
  "group": "Multitask Language Understanding"
224
  },
225
  {
@@ -256,7 +256,7 @@
256
  "parallel": true,
257
  "translation": "machine",
258
  "base": "MMLU",
259
- "implemented": true,
260
  "group": "Multitask Language Understanding"
261
  },
262
  {
@@ -360,7 +360,7 @@
360
  "parallel": true,
361
  "translation": "machine",
362
  "base": "AI2 ARC",
363
- "implemented": true,
364
  "group": "ARC Question Answering"
365
  },
366
  {
@@ -375,7 +375,7 @@
375
  "parallel": true,
376
  "translation": "machine",
377
  "base": "AI2 ARC",
378
- "implemented": true,
379
  "group": "ARC Question Answering"
380
  },
381
  {
@@ -420,7 +420,7 @@
420
  "parallel": true,
421
  "translation": "machine",
422
  "base": "TruthfulQA",
423
- "implemented": true,
424
  "group": "Truthfulness"
425
  },
426
  {
@@ -435,7 +435,7 @@
435
  "parallel": true,
436
  "translation": "machine",
437
  "base": "TruthfulQA",
438
- "implemented": true,
439
  "group": "Truthfulness"
440
  },
441
  {
 
219
  "parallel": true,
220
  "translation": "machine",
221
  "base": "MMLU",
222
+ "implemented": false,
223
  "group": "Multitask Language Understanding"
224
  },
225
  {
 
256
  "parallel": true,
257
  "translation": "machine",
258
  "base": "MMLU",
259
+ "implemented": false,
260
  "group": "Multitask Language Understanding"
261
  },
262
  {
 
360
  "parallel": true,
361
  "translation": "machine",
362
  "base": "AI2 ARC",
363
+ "implemented": false,
364
  "group": "ARC Question Answering"
365
  },
366
  {
 
375
  "parallel": true,
376
  "translation": "machine",
377
  "base": "AI2 ARC",
378
+ "implemented": false,
379
  "group": "ARC Question Answering"
380
  },
381
  {
 
420
  "parallel": true,
421
  "translation": "machine",
422
  "base": "TruthfulQA",
423
+ "implemented": false,
424
  "group": "Truthfulness"
425
  },
426
  {
 
435
  "parallel": true,
436
  "translation": "machine",
437
  "base": "TruthfulQA",
438
+ "implemented": false,
439
  "group": "Truthfulness"
440
  },
441
  {
evals/__init__.py CHANGED
@@ -1 +0,0 @@
1
-
 
 
evals/backend.py CHANGED
@@ -4,7 +4,8 @@ import os
4
  import numpy as np
5
  import pandas as pd
6
  import uvicorn
7
- from countries import make_country_table
 
8
  from fastapi import FastAPI, Request
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.middleware.gzip import GZipMiddleware
@@ -26,7 +27,7 @@ task_metrics = [
26
  "classification_accuracy",
27
  "mmlu_accuracy",
28
  "arc_accuracy",
29
- # "truthfulqa_accuracy",
30
  "mgsm_accuracy",
31
  ]
32
 
@@ -39,28 +40,77 @@ def compute_normalized_average(df, metrics):
39
  col_min = normalized_df[col].min()
40
  col_max = normalized_df[col].max()
41
  if col_max > col_min: # Avoid division by zero
42
- normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
 
 
43
  else:
44
  normalized_df[col] = 0 # If all values are the same, set to 0
45
  return normalized_df.mean(axis=1, skipna=False)
46
 
47
 
48
- def make_model_table(df, models):
49
- df = (
50
- df.groupby(["model", "task", "metric"])
51
- .agg({"score": "mean", "bcp_47": "nunique"})
52
- .reset_index()
 
 
 
 
 
 
 
53
  )
54
- df["task_metric"] = df["task"] + "_" + df["metric"]
55
- df = df.drop(columns=["task", "metric"])
56
- df = df.pivot(index="model", columns="task_metric", values="score")
 
 
 
 
 
 
 
57
  for metric in task_metrics:
58
  if metric not in df.columns:
59
  df[metric] = np.nan
 
60
  df["average"] = compute_normalized_average(df, task_metrics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  df = df.sort_values(by="average", ascending=False).reset_index()
62
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
63
  df["rank"] = df.index + 1
 
 
 
 
 
64
  df = df[
65
  [
66
  "rank",
@@ -74,27 +124,81 @@ def make_model_table(df, models):
74
  "license",
75
  "cost",
76
  "average",
77
- *task_metrics,
78
  ]
79
  ]
80
  return df
81
 
82
 
83
- def make_language_table(df, languages):
84
- df = (
85
- df.groupby(["bcp_47", "task", "metric"])
86
- .agg({"score": "mean", "model": "nunique"})
87
- .reset_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  )
89
- df["task_metric"] = df["task"] + "_" + df["metric"]
90
- df = df.drop(columns=["task", "metric"])
91
- df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
 
92
  for metric in task_metrics:
93
  if metric not in df.columns:
94
  df[metric] = np.nan
 
95
  df["average"] = compute_normalized_average(df, task_metrics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  df = pd.merge(languages, df, on="bcp_47", how="outer")
97
  df = df.sort_values(by="speakers", ascending=False)
 
 
 
 
 
98
  df = df[
99
  [
100
  "bcp_47",
@@ -104,7 +208,7 @@ def make_language_table(df, languages):
104
  "family",
105
  "average",
106
  "in_benchmark",
107
- *task_metrics,
108
  ]
109
  ]
110
  return df
@@ -125,10 +229,22 @@ async def data(request: Request):
125
  body = await request.body()
126
  data = json.loads(body)
127
  selected_languages = data.get("selectedLanguages", {})
128
- df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
 
 
 
 
129
  # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
130
  language_table = make_language_table(df, languages)
131
  datasets_df = pd.read_json("datasets.json")
 
 
 
 
 
 
 
 
132
  if selected_languages:
133
  # the filtering is only applied for the model table and the country data
134
  df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -143,6 +259,7 @@ async def data(request: Request):
143
  "language_table": serialize(language_table),
144
  "dataset_table": serialize(datasets_df),
145
  "countries": serialize(countries),
 
146
  }
147
  return JSONResponse(content=all_tables)
148
 
 
4
  import numpy as np
5
  import pandas as pd
6
  import uvicorn
7
+
8
+ from evals.countries import make_country_table
9
  from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.middleware.gzip import GZipMiddleware
 
27
  "classification_accuracy",
28
  "mmlu_accuracy",
29
  "arc_accuracy",
30
+ "truthfulqa_accuracy",
31
  "mgsm_accuracy",
32
  ]
33
 
 
40
  col_min = normalized_df[col].min()
41
  col_max = normalized_df[col].max()
42
  if col_max > col_min: # Avoid division by zero
43
+ normalized_df[col] = (normalized_df[col] - col_min) / (
44
+ col_max - col_min
45
+ )
46
  else:
47
  normalized_df[col] = 0 # If all values are the same, set to 0
48
  return normalized_df.mean(axis=1, skipna=False)
49
 
50
 
51
+ def make_model_table(scores_df, models):
52
+ # Create a combined task_metric for origin
53
+ scores_df["task_metric_origin"] = (
54
+ scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
55
+ )
56
+
57
+ # Pivot to get scores for each origin-specific metric
58
+ scores_pivot = scores_df.pivot_table(
59
+ index="model",
60
+ columns="task_metric_origin",
61
+ values="score",
62
+ aggfunc="mean",
63
  )
64
+
65
+ # Create the regular task_metric for the main average calculation
66
+ scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
67
+ main_pivot = scores_df.pivot_table(
68
+ index="model", columns="task_metric", values="score", aggfunc="mean"
69
+ )
70
+
71
+ # Merge the two pivots
72
+ df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
73
+
74
  for metric in task_metrics:
75
  if metric not in df.columns:
76
  df[metric] = np.nan
77
+
78
  df["average"] = compute_normalized_average(df, task_metrics)
79
+
80
+ # Compute origin presence per model+metric
81
+ origin_presence = (
82
+ scores_df.groupby(["model", "task_metric", "origin"])
83
+ .size()
84
+ .unstack(fill_value=0)
85
+ )
86
+ # Add boolean flags: show asterisk only if exclusively machine-origin contributed
87
+ for metric in task_metrics:
88
+ human_col_name = "human" if "human" in origin_presence.columns else None
89
+ machine_col_name = "machine" if "machine" in origin_presence.columns else None
90
+ if human_col_name or machine_col_name:
91
+ flags = []
92
+ for model in df.index:
93
+ try:
94
+ counts = origin_presence.loc[(model, metric)]
95
+ except KeyError:
96
+ flags.append(False)
97
+ continue
98
+ human_count = counts.get(human_col_name, 0) if human_col_name else 0
99
+ machine_count = (
100
+ counts.get(machine_col_name, 0) if machine_col_name else 0
101
+ )
102
+ flags.append(machine_count > 0 and human_count == 0)
103
+ df[f"{metric}_is_machine"] = flags
104
+ else:
105
+ df[f"{metric}_is_machine"] = False
106
  df = df.sort_values(by="average", ascending=False).reset_index()
107
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
108
  df["rank"] = df.index + 1
109
+
110
+ # Dynamically find all metric columns to include
111
+ final_cols = df.columns
112
+ metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
113
+
114
  df = df[
115
  [
116
  "rank",
 
124
  "license",
125
  "cost",
126
  "average",
127
+ *sorted(list(set(metric_cols))),
128
  ]
129
  ]
130
  return df
131
 
132
 
133
+ def make_language_table(scores_df, languages):
134
+ # Create a combined task_metric for origin
135
+ scores_df["task_metric_origin"] = (
136
+ scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
137
+ )
138
+
139
+ # Pivot to get scores for each origin-specific metric
140
+ scores_pivot = scores_df.pivot_table(
141
+ index="bcp_47",
142
+ columns="task_metric_origin",
143
+ values="score",
144
+ aggfunc="mean",
145
+ )
146
+
147
+ # Create the regular task_metric for the main average calculation
148
+ scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
149
+ main_pivot = scores_df.pivot_table(
150
+ index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
151
  )
152
+
153
+ # Merge the two pivots
154
+ df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
155
+
156
  for metric in task_metrics:
157
  if metric not in df.columns:
158
  df[metric] = np.nan
159
+
160
  df["average"] = compute_normalized_average(df, task_metrics)
161
+
162
+ # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
163
+ origin_presence = (
164
+ scores_df.groupby(["bcp_47", "task_metric", "origin"])
165
+ .size()
166
+ .unstack(fill_value=0)
167
+ )
168
+ for metric in task_metrics:
169
+ human_col_name = "human" if "human" in origin_presence.columns else None
170
+ machine_col_name = "machine" if "machine" in origin_presence.columns else None
171
+ if human_col_name or machine_col_name:
172
+ flags = []
173
+ for bcp in df.index:
174
+ try:
175
+ counts = origin_presence.loc[(bcp, metric)]
176
+ except KeyError:
177
+ flags.append(False)
178
+ continue
179
+ human_count = counts.get(human_col_name, 0) if human_col_name else 0
180
+ machine_count = (
181
+ counts.get(machine_col_name, 0) if machine_col_name else 0
182
+ )
183
+ flags.append(machine_count > 0 and human_count == 0)
184
+ df[f"{metric}_is_machine"] = flags
185
+ else:
186
+ df[f"{metric}_is_machine"] = False
187
+
188
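+ # NOTE(review): this loop overwrites the `{metric}_is_machine` flags computed just above (true only if exclusively machine-origin scores contributed) with a weaker rule (true if any machine-origin score exists for the language); confirm which rule is intended and drop the other loop.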
189
+ for metric in task_metrics:
190
+ machine_col = f"{metric}_machine"
191
+ if machine_col in df.columns:
192
+ df[f"{metric}_is_machine"] = df[machine_col].notna()
193
+ else:
194
+ df[f"{metric}_is_machine"] = False
195
  df = pd.merge(languages, df, on="bcp_47", how="outer")
196
  df = df.sort_values(by="speakers", ascending=False)
197
+
198
+ # Dynamically find all metric columns to include
199
+ final_cols = df.columns
200
+ metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
201
+
202
  df = df[
203
  [
204
  "bcp_47",
 
208
  "family",
209
  "average",
210
  "in_benchmark",
211
+ *sorted(list(set(metric_cols))),
212
  ]
213
  ]
214
  return df
 
229
  body = await request.body()
230
  data = json.loads(body)
231
  selected_languages = data.get("selectedLanguages", {})
232
+ df = (
233
+ scores.groupby(["model", "bcp_47", "task", "metric", "origin"])
234
+ .mean()
235
+ .reset_index()
236
+ )
237
  # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
238
  language_table = make_language_table(df, languages)
239
  datasets_df = pd.read_json("datasets.json")
240
+
241
+ # Identify which metrics have machine translations available
242
+ machine_translated_metrics = set()
243
+ for _, row in df.iterrows():
244
+ if row["origin"] == "machine":
245
+ metric_name = f"{row['task']}_{row['metric']}"
246
+ machine_translated_metrics.add(metric_name)
247
+
248
  if selected_languages:
249
  # the filtering is only applied for the model table and the country data
250
  df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
 
259
  "language_table": serialize(language_table),
260
  "dataset_table": serialize(datasets_df),
261
  "countries": serialize(countries),
262
+ "machine_translated_metrics": list(machine_translated_metrics),
263
  }
264
  return JSONResponse(content=all_tables)
265
 
evals/countries.py CHANGED
@@ -15,6 +15,7 @@ def population(bcp_47):
15
  }
16
  return items
17
 
 
18
  @cache
19
  def make_country_table(language_table):
20
  countries = defaultdict(list)
@@ -30,10 +31,15 @@ def make_country_table(language_table):
30
  )
31
  for country, languages in countries.items():
32
  speaker_pop = sum(entry["population"] for entry in languages)
33
- score = (
34
- sum(entry["score"] * entry["population"] for entry in languages)
35
- / speaker_pop
36
- )
 
 
 
 
 
37
  countries[country] = {
38
  "score": score,
39
  "languages": languages,
 
15
  }
16
  return items
17
 
18
+
19
  @cache
20
  def make_country_table(language_table):
21
  countries = defaultdict(list)
 
31
  )
32
  for country, languages in countries.items():
33
  speaker_pop = sum(entry["population"] for entry in languages)
34
+
35
+ if speaker_pop < 1000: # Grey out low-population countries
36
+ score = None # This will make them appear grey on the map
37
+ else:
38
+ score = (
39
+ sum(entry["score"] * entry["population"] for entry in languages)
40
+ / speaker_pop
41
+ )
42
+
43
  countries[country] = {
44
  "score": score,
45
  "languages": languages,
evals/datasets_/__init__.py CHANGED
@@ -1 +1 @@
1
- # This file makes datasets_ a Python package
 
1
+
evals/datasets_/arc.py CHANGED
@@ -1,11 +1,10 @@
1
  import random
2
- from collections import Counter, defaultdict
3
 
4
- from langcodes import Language, standardize_tag
5
  from rich import print
6
- from models import translate_google, google_supported_languages
7
  from tqdm import tqdm
8
- from datasets import Dataset, load_dataset
9
  import asyncio
10
  from tqdm.asyncio import tqdm_asyncio
11
  import os
@@ -14,27 +13,33 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
14
 
15
  slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
16
  tags_uhura_arc_easy = {
17
- standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
 
18
  if not a.endswith("unmatched")
19
  }
20
 
21
 
22
  random.seed(42)
23
- id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 
 
 
24
  common_ids_train = list(sorted(set.intersection(*id_sets_train)))
25
  random.shuffle(common_ids_train)
26
- id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 
 
 
27
  common_ids_test = list(sorted(set.intersection(*id_sets_test)))
28
  random.shuffle(common_ids_test)
29
 
30
  slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
31
  tags_uhura_arc_easy_translated = {
32
- standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 
33
  }
34
 
35
 
36
-
37
-
38
  def add_choices(row):
39
  row["choices"] = row["choices"]["text"]
40
  return row
@@ -45,37 +50,40 @@ def load_uhura_arc_easy(language_bcp_47, nr):
45
  ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
46
  ds = ds.map(add_choices)
47
  ds = ds.rename_column("answerKey", "answer")
48
- train_ids = common_ids_train[nr:nr+3]
49
- examples = ds["train"].filter(lambda x: x["id"] in train_ids)
50
  task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
51
- return "masakhane/uhura-arc-easy", examples, task
52
  if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
53
- ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
 
 
 
54
  ds = ds.rename_column("answerKey", "answer")
55
- train_ids = common_ids_train[nr:nr+3]
56
- examples = ds["train"].filter(lambda x: x["id"] in train_ids)
57
- # raise Exception(language_bcp_47)
58
  task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
59
- return "fair-forward/arc-easy-autotranslated", examples, task
60
  else:
61
  return None, None, None
62
 
 
63
  def translate_arc(languages):
64
  human_translated = tags_uhura_arc_easy.keys()
65
  untranslated = [
66
  lang
67
  for lang in languages["bcp_47"].values[:100]
68
- if lang not in human_translated and lang in google_supported_languages
69
  ]
70
  n_samples = 10
71
- train_ids = common_ids_train[:n_samples+3]
72
- en_train = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train")
 
 
73
  en_train = en_train.filter(lambda x: x["id"] in train_ids)
74
  test_ids = common_ids_test[:n_samples]
75
- en_test = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test")
 
 
76
  en_test = en_test.filter(lambda x: x["id"] in test_ids)
77
  data = {"train": en_train, "test": en_test}
78
-
79
  slug = "fair-forward/arc-easy-autotranslated"
80
  for lang in tqdm(untranslated):
81
  # check if already exists on hub
@@ -84,16 +92,22 @@ def translate_arc(languages):
84
  except (ValueError, Exception):
85
  print(f"Translating {lang}...")
86
  for split, data_en in data.items():
87
- questions_tr = [translate_google(q, "en", lang) for q in data_en["question"]]
 
 
88
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
89
  choices_texts_concatenated = []
90
  for choice in data_en["choices"]:
91
  for option in choice["text"]:
92
  choices_texts_concatenated.append(option)
93
- choices_tr = [translate_google(c, "en", lang) for c in choices_texts_concatenated]
 
 
94
  choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
95
  # group into chunks of 4
96
- choices_tr = [choices_tr[i:i+4] for i in range(0, len(choices_tr), 4)]
 
 
97
 
98
  ds_lang = Dataset.from_dict(
99
  {
@@ -110,5 +124,8 @@ def translate_arc(languages):
110
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
111
  )
112
  ds_lang.to_json(
113
- f"data/translations/arc/{lang}_{split}.json", lines=False, force_ascii=False, indent=2
 
 
 
114
  )
 
1
  import random
 
2
 
3
+ from langcodes import standardize_tag
4
  from rich import print
5
+ from models import translate_google, get_google_supported_languages
6
  from tqdm import tqdm
7
+ from datasets import load_dataset, Dataset
8
  import asyncio
9
  from tqdm.asyncio import tqdm_asyncio
10
  import os
 
13
 
14
  slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
15
  tags_uhura_arc_easy = {
16
+ standardize_tag(a.split("_")[0], macro=True): a
17
+ for a in _get_dataset_config_names(slug_uhura_arc_easy)
18
  if not a.endswith("unmatched")
19
  }
20
 
21
 
22
  random.seed(42)
23
+ id_sets_train = [
24
+ set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
25
+ for tag in tags_uhura_arc_easy.values()
26
+ ]
27
  common_ids_train = list(sorted(set.intersection(*id_sets_train)))
28
  random.shuffle(common_ids_train)
29
+ id_sets_test = [
30
+ set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
31
+ for tag in tags_uhura_arc_easy.values()
32
+ ]
33
  common_ids_test = list(sorted(set.intersection(*id_sets_test)))
34
  random.shuffle(common_ids_test)
35
 
36
  slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
37
  tags_uhura_arc_easy_translated = {
38
+ standardize_tag(a.split("_")[0], macro=True): a
39
+ for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
40
  }
41
 
42
 
 
 
43
  def add_choices(row):
44
  row["choices"] = row["choices"]["text"]
45
  return row
 
50
  ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
51
  ds = ds.map(add_choices)
52
  ds = ds.rename_column("answerKey", "answer")
 
 
53
  task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
54
+ return "masakhane/uhura-arc-easy", task, "human"
55
  if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
56
+ ds = _load_dataset(
57
+ slug_uhura_arc_easy_translated,
58
+ tags_uhura_arc_easy_translated[language_bcp_47],
59
+ )
60
  ds = ds.rename_column("answerKey", "answer")
 
 
 
61
  task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
62
+ return "fair-forward/arc-easy-autotranslated", task, "machine"
63
  else:
64
  return None, None, None
65
 
66
+
67
  def translate_arc(languages):
68
  human_translated = tags_uhura_arc_easy.keys()
69
  untranslated = [
70
  lang
71
  for lang in languages["bcp_47"].values[:100]
72
+ if lang not in human_translated and lang in get_google_supported_languages()
73
  ]
74
  n_samples = 10
75
+ train_ids = common_ids_train[: n_samples + 3]
76
+ en_train = _load_dataset(
77
+ slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
78
+ )
79
  en_train = en_train.filter(lambda x: x["id"] in train_ids)
80
  test_ids = common_ids_test[:n_samples]
81
+ en_test = _load_dataset(
82
+ slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
83
+ )
84
  en_test = en_test.filter(lambda x: x["id"] in test_ids)
85
  data = {"train": en_train, "test": en_test}
86
+
87
  slug = "fair-forward/arc-easy-autotranslated"
88
  for lang in tqdm(untranslated):
89
  # check if already exists on hub
 
92
  except (ValueError, Exception):
93
  print(f"Translating {lang}...")
94
  for split, data_en in data.items():
95
+ questions_tr = [
96
+ translate_google(q, "en", lang) for q in data_en["question"]
97
+ ]
98
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
99
  choices_texts_concatenated = []
100
  for choice in data_en["choices"]:
101
  for option in choice["text"]:
102
  choices_texts_concatenated.append(option)
103
+ choices_tr = [
104
+ translate_google(c, "en", lang) for c in choices_texts_concatenated
105
+ ]
106
  choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
107
  # group into chunks of 4
108
+ choices_tr = [
109
+ choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
110
+ ]
111
 
112
  ds_lang = Dataset.from_dict(
113
  {
 
124
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
125
  )
126
  ds_lang.to_json(
127
+ f"data/translations/arc/{lang}_{split}.json",
128
+ lines=False,
129
+ force_ascii=False,
130
+ indent=2,
131
  )
evals/datasets_/fleurs.py CHANGED
@@ -11,6 +11,7 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
11
  lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
12
  )
13
 
 
14
  def download_file(url, path):
15
  response = requests.get(url)
16
  with open(path, "wb") as f:
@@ -34,4 +35,4 @@ def download_fleurs(transcription_langs_eval):
34
  if not tsv_path.exists():
35
  print(f"Downloading {tsv_url} to {tsv_path}")
36
  tsv_path.parent.mkdir(parents=True, exist_ok=True)
37
- download_file(tsv_url, tsv_path)
 
11
  lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
12
  )
13
 
14
+
15
  def download_file(url, path):
16
  response = requests.get(url)
17
  with open(path, "wb") as f:
 
35
  if not tsv_path.exists():
36
  print(f"Downloading {tsv_url} to {tsv_path}")
37
  tsv_path.parent.mkdir(parents=True, exist_ok=True)
38
+ download_file(tsv_url, tsv_path)
evals/datasets_/mgsm.py CHANGED
@@ -1,10 +1,12 @@
1
  import asyncio
2
  import os
 
3
 
4
  from datasets import Dataset, load_dataset
5
- from datasets_.util import _get_dataset_config_names, _load_dataset
6
- from langcodes import standardize_tag
7
- from models import google_supported_languages, translate_google
 
8
  from tqdm import tqdm
9
  from tqdm.asyncio import tqdm_asyncio
10
 
@@ -37,31 +39,50 @@ def parse_number(i):
37
  return None
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def load_mgsm(language_bcp_47, nr):
41
  if language_bcp_47 in tags_mgsm.keys():
42
- ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
43
- return slug_mgsm, ds[nr]
44
  elif language_bcp_47 in tags_afrimgsm.keys():
45
- ds = _load_dataset(
46
- slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
 
 
 
47
  )
48
- return slug_afrimgsm, ds[nr]
49
  elif language_bcp_47 in tags_gsm_autotranslated.keys():
50
- ds = _load_dataset(
51
- slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
52
  )
53
- return slug_gsm_autotranslated, ds[nr]
54
- elif language_bcp_47 in tags_gsm8kx.keys():
55
- row = _load_dataset(
56
- slug_gsm8kx,
57
- subset=tags_gsm8kx[language_bcp_47],
58
- split="test",
59
- trust_remote_code=True,
60
- )[nr]
61
- row["answer_number"] = row["answer"].split("####")[1].strip()
62
- return slug_gsm8kx, row
63
  else:
64
- return None, None
65
 
66
 
67
  def translate_mgsm(languages):
@@ -69,7 +90,7 @@ def translate_mgsm(languages):
69
  untranslated = [
70
  lang
71
  for lang in languages["bcp_47"].values[:100]
72
- if lang not in human_translated and lang in google_supported_languages
73
  ]
74
  en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
75
  slug = "fair-forward/gsm-autotranslated"
@@ -96,5 +117,8 @@ def translate_mgsm(languages):
96
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
97
  )
98
  ds_lang.to_json(
99
- f"data/translations/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
 
 
 
100
  )
 
1
  import asyncio
2
  import os
3
+ import random
4
 
5
  from datasets import Dataset, load_dataset
6
+ from datasets_.util import _get_dataset_config_names, _load_dataset, cache
7
+ from langcodes import Language, standardize_tag
8
+ from models import get_google_supported_languages, translate_google
9
+ from rich import print
10
  from tqdm import tqdm
11
  from tqdm.asyncio import tqdm_asyncio
12
 
 
39
  return None
40
 
41
 
42
@cache
def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
    """Fetch row ``nr`` from the ``test`` split of one dataset subset.

    The function is wrapped by ``cache`` (imported from ``datasets_.util``).

    Returns the row, or ``None`` when ``nr`` is past the end of the split or
    when loading/processing raises any exception (e.g. the dataset or its
    test split is unavailable).
    """
    try:
        test_split = _load_dataset(
            dataset_slug,
            subset=subset_tag,
            split="test",
            trust_remote_code=trust_remote_code,
        )
        if len(test_split) <= nr:
            return None

        item = test_split[nr]
        if dataset_slug == slug_gsm8kx:
            # For this dataset the numeric answer is taken from the segment
            # following the first "####" marker in the answer text.
            item["answer_number"] = item["answer"].split("####")[1].strip()
        return item
    except Exception:
        # Best-effort lookup: any failure is reported as "no item".
        return None
65
+
66
+
67
  def load_mgsm(language_bcp_47, nr):
68
  if language_bcp_47 in tags_mgsm.keys():
69
+ item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
70
+ return slug_mgsm, item, "human" if item else (None, None, None)
71
  elif language_bcp_47 in tags_afrimgsm.keys():
72
+ item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
73
+ return slug_afrimgsm, item, "human" if item else (None, None, None)
74
+ elif language_bcp_47 in tags_gsm8kx.keys():
75
+ item = _get_mgsm_item(
76
+ slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
77
  )
78
+ return slug_gsm8kx, item, "machine" if item else (None, None, None)
79
  elif language_bcp_47 in tags_gsm_autotranslated.keys():
80
+ item = _get_mgsm_item(
81
+ slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
82
  )
83
+ return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
 
 
 
 
 
 
 
 
 
84
  else:
85
+ return None, None, None
86
 
87
 
88
  def translate_mgsm(languages):
 
90
  untranslated = [
91
  lang
92
  for lang in languages["bcp_47"].values[:100]
93
+ if lang not in human_translated and lang in get_google_supported_languages()
94
  ]
95
  en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
96
  slug = "fair-forward/gsm-autotranslated"
 
117
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
118
  )
119
  ds_lang.to_json(
120
+ f"data/translations/mgsm/{lang}.json",
121
+ lines=False,
122
+ force_ascii=False,
123
+ indent=2,
124
  )
evals/datasets_/mmlu.py CHANGED
@@ -4,9 +4,9 @@ import random
4
  from collections import Counter, defaultdict
5
 
6
  from datasets import Dataset, load_dataset
7
- from datasets_.util import _get_dataset_config_names, _load_dataset
8
  from langcodes import Language, standardize_tag
9
- from models import google_supported_languages, translate_google
10
  from rich import print
11
  from tqdm import tqdm
12
  from tqdm.asyncio import tqdm_asyncio
@@ -111,6 +111,7 @@ def print_datasets_analysis():
111
  # MMLUX is translated using DeepL
112
  # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
113
 
 
114
  # print_datasets_analysis()
115
 
116
 
@@ -143,32 +144,61 @@ tags_mmlux = set(
143
  a.rsplit("_", 1)[1].split("-")[0].lower()
144
  for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
145
  )
146
- tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 
 
 
147
 
148
  categories = sorted(
149
- list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
150
- )
 
 
 
 
 
 
 
 
 
 
 
151
 
152
 
153
- def load_mmlu(language_bcp_47, nr):
 
 
 
 
 
 
 
 
 
 
 
 
154
  category = categories[nr % len(categories)]
155
  if language_bcp_47 in tags_afrimmlu.keys():
156
- ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
157
- ds = ds.map(parse_choices)
158
- examples = ds["dev"].filter(lambda x: x["subject"] == category)
159
- task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
160
- return "masakhane/afrimmlu", examples, task
161
  elif language_bcp_47 in tags_global_mmlu.keys():
162
- ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
163
- ds = ds.map(add_choices)
164
- examples = ds["dev"].filter(lambda x: x["subject"] == category)
165
- task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
166
- return "CohereForAI/Global-MMLU", examples, task
167
  elif language_bcp_47 in tags_mmlu_autotranslated:
168
- ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
169
- examples = ds["dev"].filter(lambda x: x["subject"] == category)
170
- task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
171
- return "fair-forward/mmlu-autotranslated", examples, task
 
 
 
 
172
  else:
173
  return None, None, None
174
 
@@ -177,10 +207,10 @@ def translate_mmlu(languages):
177
  human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
178
  untranslated = [
179
  lang
180
- for lang in languages["bcp_47"].values[:100]
181
- if lang not in human_translated and lang in google_supported_languages
182
  ]
183
- n_samples = 10
184
 
185
  slug = "fair-forward/mmlu-autotranslated"
186
  for lang in tqdm(untranslated):
@@ -196,8 +226,10 @@ def translate_mmlu(languages):
196
  if split == "dev":
197
  samples.extend(ds.filter(lambda x: x["subject"] == category))
198
  else:
199
- for i in range(n_samples):
200
- task = ds.filter(lambda x: x["subject"] == category)[i]
 
 
201
  samples.append(task)
202
  questions_tr = [
203
  translate_google(s["question"], "en", lang) for s in samples
 
4
  from collections import Counter, defaultdict
5
 
6
  from datasets import Dataset, load_dataset
7
+ from datasets_.util import _get_dataset_config_names, _load_dataset, cache
8
  from langcodes import Language, standardize_tag
9
+ from models import get_google_supported_languages, translate_google
10
  from rich import print
11
  from tqdm import tqdm
12
  from tqdm.asyncio import tqdm_asyncio
 
111
  # MMLUX is translated using DeepL
112
  # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
113
 
114
+
115
  # print_datasets_analysis()
116
 
117
 
 
144
  a.rsplit("_", 1)[1].split("-")[0].lower()
145
  for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
146
  )
147
+ tags_mmlu_autotranslated = {
148
+ standardize_tag(a, macro=True): a
149
+ for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
150
+ }
151
 
152
  categories = sorted(
153
+ list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
154
+ )
155
+
156
+
157
@cache
def _get_processed_mmlu_dataset(dataset_name, subset_tag):
    """Load one MMLU dataset subset and apply its dataset-specific row mapping.

    AfriMMLU rows are passed through `parse_choices` and Global-MMLU rows
    through `add_choices`; any other dataset is returned exactly as loaded.
    The function is wrapped in the module's `cache` decorator.
    """
    loaded = _load_dataset(dataset_name, subset_tag)
    if dataset_name == "masakhane/afrimmlu":
        return loaded.map(parse_choices)
    if dataset_name == "CohereForAI/Global-MMLU":
        return loaded.map(add_choices)
    return loaded
166
 
167
 
168
@cache
def _get_mmlu_item(dataset_name, subset_tag, category, nr):
    """Return the `nr`-th test row of `category` from an MMLU dataset subset.

    The test split of the processed dataset is filtered to rows whose
    "subject" equals `category`; the row at position `nr` of that filtered
    view is returned, or None when `nr` is past its end.

    The selection logic is the same for every supported dataset, so there is
    a single code path (the per-dataset differences are handled in
    `_get_processed_mmlu_dataset`).
    """
    test_split = _get_processed_mmlu_dataset(dataset_name, subset_tag)["test"]
    in_category = test_split.filter(lambda row: row["subject"] == category)
    return in_category[nr] if nr < len(in_category) else None
178
+
179
+
180
async def load_mmlu(language_bcp_47, nr):
    """Return the `nr`-th MMLU task available for a language.

    The subject category is chosen round-robin from `categories` using `nr`.
    Datasets are tried in priority order: AfriMMLU, Global-MMLU, then the
    auto-translated MMLU set.

    Returns:
        A `(dataset_slug, task, origin)` triple, where `origin` is "human" or
        "machine". If no dataset covers the language, or the chosen dataset
        has no task at index `nr` for the category, `(None, None, None)` is
        returned.
    """
    category = categories[nr % len(categories)]
    if language_bcp_47 in tags_afrimmlu.keys():
        slug = "masakhane/afrimmlu"
        subset, origin = tags_afrimmlu[language_bcp_47], "human"
    elif language_bcp_47 in tags_global_mmlu.keys():
        slug = "CohereForAI/Global-MMLU"
        subset, origin = tags_global_mmlu[language_bcp_47], "human"
    # TODO: add in Okapi, MMLUX @Jonas
    elif language_bcp_47 in tags_mmlu_autotranslated:
        slug = "fair-forward/mmlu-autotranslated"
        # The table maps the standardized tag to the dataset's own config
        # name; load by the config name, as the other branches do.
        subset, origin = tags_mmlu_autotranslated[language_bcp_47], "machine"
    else:
        return None, None, None

    task = _get_mmlu_item(slug, subset, category, nr)
    # Make the whole triple conditional so a missing task yields the same
    # (None, None, None) shape as an unsupported language.
    return (slug, task, origin) if task else (None, None, None)
204
 
 
207
  human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
208
  untranslated = [
209
  lang
210
+ for lang in languages["bcp_47"].values[:150]
211
+ if lang not in human_translated and lang in get_google_supported_languages()
212
  ]
213
+ n_samples = 20
214
 
215
  slug = "fair-forward/mmlu-autotranslated"
216
  for lang in tqdm(untranslated):
 
226
  if split == "dev":
227
  samples.extend(ds.filter(lambda x: x["subject"] == category))
228
  else:
229
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
230
+ filtered = ds.filter(lambda x: x["subject"] == category)
231
+ for i in range(min(n_samples, len(filtered))):
232
+ task = filtered[i]
233
  samples.append(task)
234
  questions_tr = [
235
  translate_google(s["question"], "en", lang) for s in samples
evals/datasets_/truthfulqa.py CHANGED
@@ -8,17 +8,29 @@ import asyncio
8
  from tqdm.asyncio import tqdm_asyncio
9
  import os
10
 
11
- from datasets import Dataset, load_dataset
12
- from models import translate_google, google_supported_languages
13
 
14
  from datasets_.util import _get_dataset_config_names, _load_dataset
15
 
16
  slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
 
 
17
  tags_uhura_truthfulqa = {
18
- standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
 
19
  if a.endswith("multiple_choice")
20
  }
21
 
 
 
 
 
 
 
 
 
 
22
 
23
  def add_choices(row):
24
  row["choices"] = row["mc1_targets"]["choices"]
@@ -26,26 +38,42 @@ def add_choices(row):
26
  return row
27
 
28
 
29
- def load_truthfulqa(language_bcp_47, nr):
30
  if language_bcp_47 in tags_uhura_truthfulqa.keys():
31
- ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
 
 
32
  ds = ds.map(add_choices)
33
- examples = ds["train"]
34
  task = ds["test"][nr]
35
- return "masakhane/uhura-truthfulqa", examples, task
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  else:
37
  return None, None, None
38
 
39
 
40
-
41
  def translate_truthfulqa(languages):
42
  human_translated = [*tags_uhura_truthfulqa.keys()]
43
  untranslated = [
44
  lang
45
- for lang in languages["bcp_47"].values[:100]
46
- if lang not in human_translated and lang in google_supported_languages
47
  ]
48
- n_samples = 10
 
 
 
49
 
50
  slug = "fair-forward/truthfulqa-autotranslated"
51
  for lang in tqdm(untranslated):
@@ -55,37 +83,47 @@ def translate_truthfulqa(languages):
55
  except (ValueError, Exception):
56
  print(f"Translating {lang}...")
57
  for split in ["train", "test"]:
58
- ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split)
 
 
59
  samples = []
60
  if split == "train":
61
  samples.extend(ds)
62
  else:
63
- for i in range(n_samples):
 
64
  task = ds[i]
65
  samples.append(task)
 
 
66
  questions_tr = [
67
  translate_google(s["question"], "en", lang) for s in samples
68
  ]
69
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
70
- choices_texts_concatenated = []
 
 
 
 
71
  for s in samples:
72
- for choice in eval(s["choices"]):
73
- choices_texts_concatenated.append(choice)
74
- choices_tr = [
75
- translate_google(c, "en", lang) for c in choices_texts_concatenated
76
- ]
77
- choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
78
- # group into chunks of 4
79
- choices_tr = [
80
- choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
81
- ]
 
 
82
 
83
  ds_lang = Dataset.from_dict(
84
  {
85
- "subject": [s["subject"] for s in samples],
86
  "question": questions_tr,
87
- "choices": choices_tr,
88
- "answer": [s["answer"] for s in samples],
89
  }
90
  )
91
  ds_lang.push_to_hub(
@@ -95,7 +133,7 @@ def translate_truthfulqa(languages):
95
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
96
  )
97
  ds_lang.to_json(
98
- f"data/translations/mmlu/{lang}_{split}.json",
99
  lines=False,
100
  force_ascii=False,
101
  indent=2,
 
8
  from tqdm.asyncio import tqdm_asyncio
9
  import os
10
 
11
+ from datasets import Dataset, load_dataset, DatasetNotFoundError
12
+ from models import translate_google, get_google_supported_languages
13
 
14
  from datasets_.util import _get_dataset_config_names, _load_dataset
15
 
16
  slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
17
+ slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
18
+
19
  tags_uhura_truthfulqa = {
20
+ standardize_tag(a.split("_")[0], macro=True): a
21
+ for a in _get_dataset_config_names(slug_uhura_truthfulqa)
22
  if a.endswith("multiple_choice")
23
  }
24
 
25
+ # Get available auto-translated languages
26
+ try:
27
+ tags_truthfulqa_autotranslated = {
28
+ standardize_tag(a, macro=True): a
29
+ for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
30
+ }
31
+ except DatasetNotFoundError:
32
+ tags_truthfulqa_autotranslated = {}
33
+
34
 
35
  def add_choices(row):
36
  row["choices"] = row["mc1_targets"]["choices"]
 
38
  return row
39
 
40
 
41
async def load_truthfulqa(language_bcp_47, nr):
    """Return the `nr`-th TruthfulQA multiple-choice task for a language.

    The Uhura TruthfulQA dataset is preferred; the auto-translated dataset is
    used for languages only available there.

    Returns:
        A `(dataset_slug, task, origin)` triple with `origin` "human" or
        "machine". `(None, None, None)` is returned when the language is not
        covered, when `nr` is past the end of the test split, or when the
        task's labels contain no correct answer (no `1`).
    """
    if language_bcp_47 in tags_uhura_truthfulqa.keys():
        ds = _load_dataset(
            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
        )
        ds = ds.map(add_choices)
        test_split = ds["test"]
        # Out-of-range indices are reported as "no task" instead of raising.
        if nr >= len(test_split):
            return None, None, None
        task = test_split[nr]
        # Ensure there is a correct answer before returning the task
        if 1 not in task["labels"]:
            return None, None, None
        return "masakhane/uhura-truthfulqa", task, "human"
    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
        # Load from auto-translated dataset (same samples as translation),
        # addressing it by the config name stored for the standardized tag.
        ds = _load_dataset(
            slug_truthfulqa_autotranslated,
            tags_truthfulqa_autotranslated[language_bcp_47],
        )
        test_split = ds["test"] if "test" in ds else ds
        if nr >= len(test_split):
            return None, None, None
        task = test_split[nr]
        # Ensure there is a correct answer before returning the task
        if 1 not in task.get("labels", []):
            return None, None, None
        return slug_truthfulqa_autotranslated, task, "machine"
    # TODO: add Okapi, TruthfulQA-X @Jonas
    else:
        return None, None, None
64
 
65
 
 
66
  def translate_truthfulqa(languages):
67
  human_translated = [*tags_uhura_truthfulqa.keys()]
68
  untranslated = [
69
  lang
70
+ for lang in languages["bcp_47"].values[:150]
71
+ if lang not in human_translated and lang in get_google_supported_languages()
72
  ]
73
+ n_samples = 20
74
+
75
+ # Set fixed seed for consistent sample selection across all languages
76
+ random.seed(42)
77
 
78
  slug = "fair-forward/truthfulqa-autotranslated"
79
  for lang in tqdm(untranslated):
 
83
  except (ValueError, Exception):
84
  print(f"Translating {lang}...")
85
  for split in ["train", "test"]:
86
+ ds = _load_dataset(
87
+ slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
88
+ )
89
  samples = []
90
  if split == "train":
91
  samples.extend(ds)
92
  else:
93
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
94
+ for i in range(min(n_samples, len(ds))):
95
  task = ds[i]
96
  samples.append(task)
97
+
98
+ # Translate questions
99
  questions_tr = [
100
  translate_google(s["question"], "en", lang) for s in samples
101
  ]
102
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
103
+
104
+ # Translate choices for each sample
105
+ all_choices_tr = []
106
+ all_labels = []
107
+
108
  for s in samples:
109
+ # Get choices from mc1_targets
110
+ choices = s["mc1_targets"]["choices"]
111
+ labels = s["mc1_targets"]["labels"]
112
+
113
+ # Translate choices
114
+ choices_tr = [
115
+ translate_google(choice, "en", lang) for choice in choices
116
+ ]
117
+ choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
118
+
119
+ all_choices_tr.append(choices_tr)
120
+ all_labels.append(labels)
121
 
122
  ds_lang = Dataset.from_dict(
123
  {
 
124
  "question": questions_tr,
125
+ "choices": all_choices_tr,
126
+ "labels": all_labels,
127
  }
128
  )
129
  ds_lang.push_to_hub(
 
133
  token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
134
  )
135
  ds_lang.to_json(
136
+ f"data/translations/truthfulqa/{lang}_{split}.json",
137
  lines=False,
138
  force_ascii=False,
139
  indent=2,
evals/datasets_/util.py CHANGED
@@ -12,3 +12,11 @@ def _get_dataset_config_names(dataset, **kwargs):
12
  @cache
13
  def _load_dataset(dataset, subset, **kwargs):
14
  return load_dataset(dataset, subset, **kwargs)
 
 
 
 
 
 
 
 
 
12
  @cache
13
  def _load_dataset(dataset, subset, **kwargs):
14
  return load_dataset(dataset, subset, **kwargs)
15
+
16
+
17
# Item-level lookup wrapped in the module's `cache` decorator
@cache
def _get_dataset_item(dataset, subset, split, index, **kwargs):
    """Return row `index` of one split of a dataset, or None if out of range.

    The requested split is loaded with `load_dataset`; extra keyword
    arguments are forwarded to it unchanged.
    """
    rows = load_dataset(dataset, subset, split=split, **kwargs)
    if index < len(rows):
        return rows[index]
    return None
evals/download_data.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
8
  import sys
9
  import huggingface_hub
10
  from datasets import load_dataset, DatasetDict
 
11
  # Import fleurs DataFrame directly from its source module
12
  from datasets_.fleurs import fleurs
13
 
@@ -24,22 +25,25 @@ DATA_DIR = project_root / "data"
24
  FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
25
  FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
26
 
27
- GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
28
  GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
29
  GLOTTOLOG_CSV_NAME = "languoid.csv"
30
 
31
- SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
32
  SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
33
 
34
- SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
35
  SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
36
  SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
37
- SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
 
 
38
  SPBLEU_DICT_NAME = "dictionary.txt"
39
 
40
 
41
  # --- Helper Functions ---
42
 
 
43
  def download_file(url, path: Path):
44
  """Downloads a file from a URL to a local path."""
45
  print(f"Downloading {url} to {path}...")
@@ -84,11 +88,16 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
84
  break
85
 
86
  if target_zip_path:
87
- with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
 
 
 
88
  target.write(source.read())
89
  print(f"Successfully extracted {target_filename}.")
90
  else:
91
- print(f"Error: Could not find {target_filename} within the zip archive.")
 
 
92
 
93
  except zipfile.BadZipFile:
94
  print("Error: Downloaded file is not a valid zip archive.")
@@ -98,13 +107,14 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
98
 
99
  # --- Download Functions ---
100
 
 
101
  def download_fleurs_data():
102
  """Downloads Fleurs audio and text data."""
103
  print("\n--- Downloading Fleurs Data ---")
104
  FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
105
 
106
  # Use the fleurs_tag column from the imported DataFrame
107
- fleurs_tags_list = fleurs['fleurs_tag'].tolist()
108
 
109
  if not fleurs_tags_list:
110
  print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
@@ -117,7 +127,9 @@ def download_fleurs_data():
117
  audio_dir = lang_dir / "audio"
118
  dev_tsv_path = lang_dir / "dev.tsv"
119
  dev_audio_archive_path = audio_dir / "dev.tar.gz"
120
- audio_extracted_marker = audio_dir / "dev" # Check if extraction likely happened
 
 
121
 
122
  # Download TSV
123
  if not dev_tsv_path.exists():
@@ -129,15 +141,15 @@ def download_fleurs_data():
129
  # Download and Extract Audio
130
  if not audio_extracted_marker.exists():
131
  if not dev_audio_archive_path.exists():
132
- tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
133
- download_file(tar_url, dev_audio_archive_path)
134
 
135
  if dev_audio_archive_path.exists():
136
- extract_tar_gz(dev_audio_archive_path, audio_dir)
137
  else:
138
  print(f"Audio archive missing, cannot extract for {lang_tag}")
139
  else:
140
- print(f"Found extracted audio: {audio_extracted_marker}")
141
 
142
 
143
  def download_glottolog_data():
@@ -165,7 +177,9 @@ def download_scriptcodes_data():
165
  # The URL points to an HTML page, not a direct CSV link.
166
  # Manual download is likely required for ScriptCodes.csv.
167
  print(f"Cannot automatically download from {SCRIPTCODES_URL}")
168
- print(f"Please manually download the ISO 15924 codes list (often available as a .txt file)")
 
 
169
  print("from the Unicode website or related sources and save it as:")
170
  print(f"{SCRIPTCODES_TARGET_FILE}")
171
  if SCRIPTCODES_TARGET_FILE.exists():
@@ -196,21 +210,24 @@ def download_spbleu_data():
196
 
197
  # --- Main Execution ---
198
 
 
199
  def main():
200
  """Runs all download functions and the conversion step."""
201
  print("Starting data download process...")
202
  DATA_DIR.mkdir(exist_ok=True)
203
 
204
- #download_fleurs_data()
205
  download_glottolog_data()
206
  download_scriptcodes_data()
207
  download_spbleu_data()
208
 
209
  print("\nData download process finished.")
210
  print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
211
- print("Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well")
 
 
212
  print("in 'evals/datasets_/flores.py' to be read correctly.")
213
 
214
 
215
  if __name__ == "__main__":
216
- main()
 
8
  import sys
9
  import huggingface_hub
10
  from datasets import load_dataset, DatasetDict
11
+
12
  # Import fleurs DataFrame directly from its source module
13
  from datasets_.fleurs import fleurs
14
 
 
25
  FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
26
  FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
27
 
28
+ GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
29
  GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
30
  GLOTTOLOG_CSV_NAME = "languoid.csv"
31
 
32
+ SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
33
  SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
34
 
35
+ SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
36
  SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
37
  SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
38
+ SPBLEU_DICT_URL = (
39
+ "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
40
+ )
41
  SPBLEU_DICT_NAME = "dictionary.txt"
42
 
43
 
44
  # --- Helper Functions ---
45
 
46
+
47
  def download_file(url, path: Path):
48
  """Downloads a file from a URL to a local path."""
49
  print(f"Downloading {url} to {path}...")
 
88
  break
89
 
90
  if target_zip_path:
91
+ with (
92
+ z.open(target_zip_path) as source,
93
+ open(extract_path / target_filename, "wb") as target,
94
+ ):
95
  target.write(source.read())
96
  print(f"Successfully extracted {target_filename}.")
97
  else:
98
+ print(
99
+ f"Error: Could not find {target_filename} within the zip archive."
100
+ )
101
 
102
  except zipfile.BadZipFile:
103
  print("Error: Downloaded file is not a valid zip archive.")
 
107
 
108
  # --- Download Functions ---
109
 
110
+
111
  def download_fleurs_data():
112
  """Downloads Fleurs audio and text data."""
113
  print("\n--- Downloading Fleurs Data ---")
114
  FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
115
 
116
  # Use the fleurs_tag column from the imported DataFrame
117
+ fleurs_tags_list = fleurs["fleurs_tag"].tolist()
118
 
119
  if not fleurs_tags_list:
120
  print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
 
127
  audio_dir = lang_dir / "audio"
128
  dev_tsv_path = lang_dir / "dev.tsv"
129
  dev_audio_archive_path = audio_dir / "dev.tar.gz"
130
+ audio_extracted_marker = (
131
+ audio_dir / "dev"
132
+ ) # Check if extraction likely happened
133
 
134
  # Download TSV
135
  if not dev_tsv_path.exists():
 
141
  # Download and Extract Audio
142
  if not audio_extracted_marker.exists():
143
  if not dev_audio_archive_path.exists():
144
+ tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
145
+ download_file(tar_url, dev_audio_archive_path)
146
 
147
  if dev_audio_archive_path.exists():
148
+ extract_tar_gz(dev_audio_archive_path, audio_dir)
149
  else:
150
  print(f"Audio archive missing, cannot extract for {lang_tag}")
151
  else:
152
+ print(f"Found extracted audio: {audio_extracted_marker}")
153
 
154
 
155
  def download_glottolog_data():
 
177
  # The URL points to an HTML page, not a direct CSV link.
178
  # Manual download is likely required for ScriptCodes.csv.
179
  print(f"Cannot automatically download from {SCRIPTCODES_URL}")
180
+ print(
181
+ "Please manually download the ISO 15924 codes list (often available as a .txt file)"
182
+ )
183
  print("from the Unicode website or related sources and save it as:")
184
  print(f"{SCRIPTCODES_TARGET_FILE}")
185
  if SCRIPTCODES_TARGET_FILE.exists():
 
210
 
211
  # --- Main Execution ---
212
 
213
+
214
  def main():
215
  """Runs all download functions and the conversion step."""
216
  print("Starting data download process...")
217
  DATA_DIR.mkdir(exist_ok=True)
218
 
219
+ # download_fleurs_data()
220
  download_glottolog_data()
221
  download_scriptcodes_data()
222
  download_spbleu_data()
223
 
224
  print("\nData download process finished.")
225
  print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
226
+ print(
227
+ "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
228
+ )
229
  print("in 'evals/datasets_/flores.py' to be read correctly.")
230
 
231
 
232
  if __name__ == "__main__":
233
+ main()
evals/languages.py CHANGED
@@ -31,6 +31,7 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
31
  lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
32
  )
33
 
 
34
  @cache
35
  def language_family(bcp_47):
36
  languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -39,6 +40,7 @@ def language_family(bcp_47):
39
  family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
40
  return family["name"]
41
 
 
42
  languages["family"] = languages["bcp_47"].apply(language_family)
43
 
44
  # load script codes and names
@@ -46,6 +48,7 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
46
  columns={"Code": "iso15924", "English Name": "script_name"}
47
  )
48
 
 
49
  def script_name(iso15924):
50
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
51
 
 
31
  lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
32
  )
33
 
34
+
35
  @cache
36
  def language_family(bcp_47):
37
  languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
 
40
  family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
41
  return family["name"]
42
 
43
+
44
  languages["family"] = languages["bcp_47"].apply(language_family)
45
 
46
  # load script codes and names
 
48
  columns={"Code": "iso15924", "English Name": "script_name"}
49
  )
50
 
51
+
52
  def script_name(iso15924):
53
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
54
 
evals/main.py CHANGED
@@ -1,62 +1,190 @@
1
  import asyncio
2
-
3
  import pandas as pd
4
- from languages import languages
 
5
  from models import models
6
  from tasks import tasks
7
- from tqdm.asyncio import tqdm_asyncio
 
 
8
 
9
- # ===== config =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- n_sentences = 10
 
 
12
 
13
- # ===== run evaluation and aggregate results =====
 
 
 
14
 
 
 
 
 
 
 
 
 
15
 
16
- async def evaluate():
17
- # FIXME we should not need this for-loop, but it helps
18
- for n_languages in range(10, 101, 10):
19
- print(f"running evaluations for {n_languages} languages")
 
 
 
 
 
 
 
 
 
20
  old_results = pd.read_json("results.json")
21
- old_models = pd.read_json("models.json")
22
- # get all combinations of model, language and task
23
- combis = [
24
- (model, lang.bcp_47, task_name)
25
- for model in models["id"]
26
- for lang in languages.iloc[:n_languages].itertuples()
27
- for task_name, task in tasks.items()
28
- if task_name in models[models["id"] == model]["tasks"].iloc[0]
29
- ]
30
- # filter out combinations that have already been evaluated
31
- combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
32
- combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
33
- combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
34
- # run evaluations
35
- results = [
36
- tasks[task_name](model, bcp_47, i)
37
- for i in range(n_sentences)
38
- for model, bcp_47, task_name in combis.itertuples(index=False)
39
- ]
40
- results = await tqdm_asyncio.gather(*results, miniters=1)
41
- results = [r for group in results for r in group]
42
- args = dict(orient="records", indent=2, force_ascii=False)
43
- if results:
44
- # aggregate results
45
- results = pd.DataFrame(results)
46
- results = (
47
- results.groupby(["model", "bcp_47", "task", "metric"])
48
- .agg({"score": "mean"})
49
- .reset_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  )
51
- # save results
52
- results = pd.concat([old_results, results])
53
- results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
54
- results.to_json("results.json", **args)
55
- # save up-to-date info on models and languages
56
- all_models = pd.concat([pd.DataFrame(models), old_models])
57
- all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
58
- all_models.to_json("models.json", **args)
59
- pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  if __name__ == "__main__":
 
1
  import asyncio
 
2
  import pandas as pd
3
+ import time
4
+ from datetime import datetime, timedelta
5
  from models import models
6
  from tasks import tasks
7
+ from languages import languages
8
+ import os
9
+
10
 
11
async def evaluate():
    """Run all pending (model, language, task) evaluations and store mean scores.

    Configuration comes from environment variables:
        N_SENTENCES:   sentence indices evaluated per combination (default 20).
        MAX_LANGUAGES: number of leading rows of `languages` to use (default 150).
        SINGLE_MODEL:  optional model id; restricts the run to that model and
                       makes the first task exception abort the whole run.
        TEST:          "1"/"true"/"yes" skips reading results.json and skips
                       writing results.json, models.json and languages.json.

    Combinations already present in results.json are not re-run. Task
    exceptions (when not aborting) are written to errors.log as CSV with the
    columns model, bcp_47, task, error.

    Returns:
        The aggregated results DataFrame (merged with the previous results
        unless in test mode), or an empty DataFrame when the requested model
        is unknown, the run was aborted, or no task produced a result.
    """
    import csv  # local import: only used for the error log below

    # Configuration - easily adjustable defaults
    n_sentences = int(os.environ.get("N_SENTENCES", 20))
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))
    single_model = os.environ.get("SINGLE_MODEL")
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")

    # Unfiltered metadata is what gets saved, even for single-model runs.
    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)

    # Working copy that may be narrowed to a single model.
    models_df = original_models_df.copy()
    top_languages = languages.head(max_languages)

    # Filter to single model if specified (only affects evaluation, not saving)
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()

    # Load existing results to avoid re-evaluation (skip in test mode)
    if test_mode:
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
        )
    else:
        old_results = pd.read_json("results.json")

    # Get all combinations that need evaluation. The supported-task lookup is
    # done once per model rather than once per (model, language, task).
    combis = []
    for model in models_df["id"]:
        supported_tasks = models_df[models_df["id"] == model]["tasks"].iloc[0]
        for lang in top_languages.itertuples():
            for task_name in tasks:
                if task_name in supported_tasks:
                    combis.append((model, lang.bcp_47, task_name))

    # Filter out already evaluated combinations
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    if not old_results.empty:
        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
        # set membership is faster than a merge on the full results table
        mask = ~combis.apply(
            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
        )
        combis = combis[mask]

    # Create all evaluation tasks; the task *name* is kept so that failures
    # can be attributed to a task in the error log.
    all_tasks = []
    for sentence_nr in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((task_name, model, bcp_47, sentence_nr))

    print(f"Running {len(all_tasks)} evaluation tasks...")

    # For single model runs, we stop immediately on first API error to inspect.
    # For full evaluations, we continue despite errors to get maximum coverage.
    stop_on_error = single_model is not None

    # Process tasks in batches to avoid memory issues
    batch_size = 1000
    all_results = []

    try:
        for start in range(0, len(all_tasks), batch_size):
            batch = all_tasks[start : start + batch_size]
            batch_results = await asyncio.gather(
                *[
                    tasks[task_name](model, bcp_47, sentence_nr)
                    for task_name, model, bcp_47, sentence_nr in batch
                ],
                return_exceptions=not stop_on_error,
            )
            all_results.extend(batch_results)

        # Separate real results from returned exceptions. Results are in the
        # same order as `all_tasks`, so the two can be walked together.
        valid_results = []
        errors = []

        for (task_name, model, bcp_47, _), r in zip(all_tasks, all_results):
            # BaseException also covers CancelledError, which gather() can
            # return and which must not be mistaken for a result.
            if isinstance(r, BaseException):
                errors.append((model, bcp_47, task_name, str(r)))
            elif isinstance(r, list):
                valid_results.extend(r)
            elif r is not None:
                valid_results.append(r)

        # log errors and store; csv.writer quotes commas/newlines in messages
        if errors:
            with open("errors.log", "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["model", "bcp_47", "task", "error"])
                writer.writerows(errors)

        # Track model completion (TO BE DELETED - was for local run only)
        if valid_results:
            completed_models = set()
            for result in valid_results:
                if isinstance(result, dict) and "model" in result:
                    model = result["model"]
                    if model not in completed_models:
                        completed_models.add(model)
                        print(f"Completed: {model}")

        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")

    # Only reachable when exceptions propagate, i.e. in single-model runs.
    except Exception as e:
        print("EVALUATION STOPPED - API Error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        return pd.DataFrame()

    # Save results (skipped in test mode as we do not want to overwrite existing results)
    if valid_results:
        results_df = pd.DataFrame(valid_results)

        # Aggregate results
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )

        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)

            # Merge with existing results
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(
                    subset=["model", "bcp_47", "task", "metric", "origin"]
                )

            results_df = results_df.sort_values(
                by=["model", "bcp_47", "task", "metric"]
            )
            results_df.to_json("results.json", **args)

            # Save model and language info (always save complete metadata, not filtered)
            original_models_df.to_json("models.json", **args)
            original_languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")

        return results_df

    return pd.DataFrame()
188
 
189
 
190
  if __name__ == "__main__":
evals/models.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
  import re
3
  from collections import defaultdict
@@ -7,7 +8,6 @@ from os import getenv
7
  import pandas as pd
8
  from aiolimiter import AsyncLimiter
9
  from dotenv import load_dotenv
10
- from elevenlabs import AsyncElevenLabs
11
  from google.cloud import translate_v2 as translate
12
  from huggingface_hub import AsyncInferenceClient, HfApi
13
  from joblib.memory import Memory
@@ -22,14 +22,17 @@ important_models = [
22
  "meta-llama/llama-3.1-70b-instruct", # 0.3$
23
  "meta-llama/llama-3-70b-instruct", # 0.4$
24
  # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
 
 
25
  "openai/gpt-4.1", # 8$
26
  "openai/gpt-4.1-mini", # 1.6$
27
  "openai/gpt-4.1-nano", # 0.4$
28
  "openai/gpt-4o-mini", # 0.6$
29
- # "openai/gpt-4o-2024-11-20", # 10$
30
- "openai/gpt-3.5-turbo-0613", # 2$
31
- # "openai/gpt-3.5-turbo", # 1.5$
32
- # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
 
33
  "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
34
  "mistralai/mistral-saba", # 0.6$
35
  "mistralai/mistral-nemo", # 0.08$
@@ -48,10 +51,13 @@ important_models = [
48
  "microsoft/phi-4", # 0.07$
49
  "microsoft/phi-4-multimodal-instruct", # 0.1$
50
  "amazon/nova-micro-v1", # 0.09$
 
 
51
  ]
52
 
53
  blocklist = [
54
  "google/gemini-2.5-pro-preview",
 
55
  "google/gemini-2.5-flash-preview",
56
  "google/gemini-2.5-flash-lite-preview",
57
  "google/gemini-2.5-flash-preview-04-17",
@@ -59,6 +65,7 @@ blocklist = [
59
  "google/gemini-2.5-flash-lite-preview-06-17",
60
  "google/gemini-2.5-pro-preview-06-05",
61
  "google/gemini-2.5-pro-preview-05-06",
 
62
  ]
63
 
64
  transcription_models = [
@@ -93,28 +100,81 @@ def get_model(permaslug):
93
 
94
  @cache
95
  def get_historical_popular_models(date: date):
96
- raw = get("https://openrouter.ai/rankings").text
97
- data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
98
- data = json.loads(data.replace("\\", ""))
99
- counts = defaultdict(int)
100
- for day in data:
101
- for model, count in day["ys"].items():
102
- if model.startswith("openrouter") or model == "Others":
103
- continue
104
- counts[model.split(":")[0]] += count
105
- counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
106
- models = [get_model(model) for model, _ in counts]
107
- return [m for m in models if m]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  @cache
111
  def get_current_popular_models(date: date):
112
- raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
113
- data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
114
- data = json.loads(data)
115
- data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
116
- models = [get_model(model["model_permaslug"]) for model in data]
117
- return [m for m in models if m]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
 
120
  def get_translation_models():
@@ -161,7 +221,10 @@ async def complete(**kwargs) -> str | None:
161
 
162
 
163
  translate_client = translate.Client()
164
- google_supported_languages = [l["language"] for l in translate_client.get_languages()]
 
 
 
165
 
166
 
167
  @cache
@@ -231,12 +294,15 @@ def get_hf_metadata(row):
231
  return empty
232
  try:
233
  info = api.model_info(id)
234
- license = (
235
- (info.card_data.license or "")
236
- .replace("-", " ")
237
- .replace("mit", "MIT")
238
- .title()
239
- )
 
 
 
240
  return {
241
  "hf_id": info.id,
242
  "creation_date": info.created_at,
@@ -249,8 +315,14 @@ def get_hf_metadata(row):
249
 
250
 
251
  def get_cost(row):
252
- cost = float(row["endpoint"]["pricing"]["completion"])
253
- return round(cost * 1_000_000, 2)
 
 
 
 
 
 
254
 
255
 
256
  @cache
@@ -260,8 +332,17 @@ def load_models(date: date):
260
  + get_current_popular_models(date.today())[:10]
261
  )
262
  popular_models = [m["slug"] for m in popular_models]
263
- models = set(important_models + popular_models) - set(blocklist)
264
- models = pd.DataFrame(sorted(list(models)), columns=["id"])
 
 
 
 
 
 
 
 
 
265
  or_metadata = models["id"].apply(get_or_metadata)
266
  hf_metadata = or_metadata.apply(get_hf_metadata)
267
  creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -281,9 +362,18 @@ def load_models(date: date):
281
  license=hf_metadata.str["license"],
282
  creation_date=creation_date_hf.combine_first(creation_date_or),
283
  )
284
- # models = models[models["cost"] <= 2.0].reset_index(drop=True)
 
285
  models["tasks"] = [
286
- ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
 
 
 
 
 
 
 
 
287
  ] * len(models)
288
  models = pd.concat([models, get_translation_models()])
289
  return models
 
1
+ import asyncio
2
  import json
3
  import re
4
  from collections import defaultdict
 
8
  import pandas as pd
9
  from aiolimiter import AsyncLimiter
10
  from dotenv import load_dotenv
 
11
  from google.cloud import translate_v2 as translate
12
  from huggingface_hub import AsyncInferenceClient, HfApi
13
  from joblib.memory import Memory
 
22
  "meta-llama/llama-3.1-70b-instruct", # 0.3$
23
  "meta-llama/llama-3-70b-instruct", # 0.4$
24
  # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
25
+ "openai/gpt-5",
26
+ "openai/gpt-5-nano", # include if/when available
27
  "openai/gpt-4.1", # 8$
28
  "openai/gpt-4.1-mini", # 1.6$
29
  "openai/gpt-4.1-nano", # 0.4$
30
  "openai/gpt-4o-mini", # 0.6$
31
+ "openai/gpt-4o-2024-11-20", # 10$
32
+ "openai/gpt-oss-120b",
33
+ "anthropic/claude-3.7-sonnet", # 15$ - added for full coverage
34
+ "anthropic/claude-sonnet-4", # 15$ - added for full coverage
35
+ "anthropic/claude-opus-4.1", # 15$ - added for full coverage
36
  "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
37
  "mistralai/mistral-saba", # 0.6$
38
  "mistralai/mistral-nemo", # 0.08$
 
51
  "microsoft/phi-4", # 0.07$
52
  "microsoft/phi-4-multimodal-instruct", # 0.1$
53
  "amazon/nova-micro-v1", # 0.09$
54
+ "moonshotai/kimi-k2", # 0.6$ - added to prevent missing from models.json
55
+ "x-ai/grok-4",
56
  ]
57
 
58
  blocklist = [
59
  "google/gemini-2.5-pro-preview",
60
+ "google/gemini-2.5-pro",
61
  "google/gemini-2.5-flash-preview",
62
  "google/gemini-2.5-flash-lite-preview",
63
  "google/gemini-2.5-flash-preview-04-17",
 
65
  "google/gemini-2.5-flash-lite-preview-06-17",
66
  "google/gemini-2.5-pro-preview-06-05",
67
  "google/gemini-2.5-pro-preview-05-06",
68
+ "perplexity/sonar-deep-research",
69
  ]
70
 
71
  transcription_models = [
 
100
 
101
  @cache
102
  def get_historical_popular_models(date: date):
103
+ try:
104
+ raw = get("https://openrouter.ai/rankings").text
105
+
106
+ # Extract model data from rankingData using regex
107
+ import re
108
+ import json
109
+
110
+ # Find all count and model_permaslug pairs in the data
111
+ # Format: "count":number,"model_permaslug":"model/name"
112
+ pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
113
+ matches = re.findall(pattern, raw)
114
+
115
+ if matches:
116
+ # Aggregate model counts
117
+ model_counts = {}
118
+ for count_str, model_slug in matches:
119
+ count = float(count_str)
120
+ if not model_slug.startswith("openrouter") and model_slug != "Others":
121
+ # Remove variant suffixes for aggregation
122
+ base_model = model_slug.split(":")[0]
123
+ model_counts[base_model] = model_counts.get(base_model, 0) + count
124
+
125
+ # Sort by popularity and return top models
126
+ sorted_models = sorted(
127
+ model_counts.items(), key=lambda x: x[1], reverse=True
128
+ )
129
+ result = []
130
+ for model_slug, count in sorted_models[:20]: # Top 20
131
+ result.append({"slug": model_slug, "count": int(count)})
132
+
133
+ return result
134
+ else:
135
+ return []
136
+
137
+ except Exception as e:
138
+ return []
139
 
140
 
141
  @cache
142
  def get_current_popular_models(date: date):
143
+ try:
144
+ raw = get("https://openrouter.ai/rankings?view=day").text
145
+
146
+ # Extract model data from daily rankings
147
+ import re
148
+ import json
149
+
150
+ # Find all count and model_permaslug pairs in the daily data
151
+ pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
152
+ matches = re.findall(pattern, raw)
153
+
154
+ if matches:
155
+ # Aggregate model counts
156
+ model_counts = {}
157
+ for count_str, model_slug in matches:
158
+ count = float(count_str)
159
+ if not model_slug.startswith("openrouter") and model_slug != "Others":
160
+ # Remove variant suffixes for aggregation
161
+ base_model = model_slug.split(":")[0]
162
+ model_counts[base_model] = model_counts.get(base_model, 0) + count
163
+
164
+ # Sort by popularity and return top models
165
+ sorted_models = sorted(
166
+ model_counts.items(), key=lambda x: x[1], reverse=True
167
+ )
168
+ result = []
169
+ for model_slug, count in sorted_models[:10]: # Top 10
170
+ result.append({"slug": model_slug, "count": int(count)})
171
+
172
+ return result
173
+ else:
174
+ return []
175
+
176
+ except Exception as e:
177
+ return []
178
 
179
 
180
  def get_translation_models():
 
221
 
222
 
223
  translate_client = translate.Client()
224
+
225
+
226
+ def get_google_supported_languages():
227
+ return [l["language"] for l in translate_client.get_languages()]
228
 
229
 
230
  @cache
 
294
  return empty
295
  try:
296
  info = api.model_info(id)
297
+ license = ""
298
+ if (
299
+ info.card_data
300
+ and hasattr(info.card_data, "license")
301
+ and info.card_data.license
302
+ ):
303
+ license = (
304
+ info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
305
+ )
306
  return {
307
  "hf_id": info.id,
308
  "creation_date": info.created_at,
 
315
 
316
 
317
  def get_cost(row):
318
+ """
319
+ row: a row from the OpenRouter models dataframe
320
+ """
321
+ try:
322
+ cost = float(row["endpoint"]["pricing"]["completion"])
323
+ return round(cost * 1_000_000, 2)
324
+ except (TypeError, KeyError):
325
+ return None
326
 
327
 
328
  @cache
 
332
  + get_current_popular_models(date.today())[:10]
333
  )
334
  popular_models = [m["slug"] for m in popular_models]
335
+ all_model_candidates = set(important_models + popular_models) - set(blocklist)
336
+
337
+ # Validate models exist on OpenRouter before including them
338
+ valid_models = []
339
+
340
+ for model_id in all_model_candidates:
341
+ metadata = get_or_metadata(model_id)
342
+ if metadata is not None:
343
+ valid_models.append(model_id)
344
+
345
+ models = pd.DataFrame(sorted(valid_models), columns=["id"])
346
  or_metadata = models["id"].apply(get_or_metadata)
347
  hf_metadata = or_metadata.apply(get_hf_metadata)
348
  creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
 
362
  license=hf_metadata.str["license"],
363
  creation_date=creation_date_hf.combine_first(creation_date_or),
364
  )
365
+ # Filter out expensive models to keep costs reasonable
366
+ models = models[models["cost"] <= 15.0].reset_index(drop=True)
367
  models["tasks"] = [
368
+ [
369
+ "translation_from",
370
+ "translation_to",
371
+ "classification",
372
+ "mmlu",
373
+ "arc",
374
+ "truthfulqa",
375
+ "mgsm",
376
+ ]
377
  ] * len(models)
378
  models = pd.concat([models, get_translation_models()])
379
  return models
evals/plots.py CHANGED
@@ -9,34 +9,33 @@ df = pd.read_json("../results.json")
9
  df = df[df["metric"] != "chrf"]
10
  df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
11
 
 
12
  # Apply logit transformation to classification scores to reduce skewness
13
  def transform_classification_scores(row):
14
- if row['task'] == 'classification':
15
  # Avoid division by zero and infinite values by clipping
16
- score = np.clip(row['score'], 0.001, 0.999)
17
  # Apply logit transformation (log(p/(1-p)))
18
  return logit(score)
19
  else:
20
- return row['score']
 
21
 
22
- df['score'] = df.apply(transform_classification_scores, axis=1)
23
 
24
  # Create a pivot table with tasks as columns and languages as rows
25
  pivot_df = df.pivot_table(
26
- values='score',
27
- index='bcp_47',
28
- columns='task',
29
- aggfunc='mean'
30
  )
31
 
32
  # Sort and filter tasks
33
  ordered_tasks = [
34
- 'translation_from',
35
- 'translation_to',
36
- 'classification',
37
- 'mmlu',
38
- 'arc',
39
- 'mgsm',
40
  ]
41
  # Drop 'truthfulqa' if present and reindex columns
42
  pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
@@ -46,29 +45,29 @@ correlation_matrix = pivot_df.corr()
46
 
47
  # Create the correlation plot
48
  plt.figure(figsize=(8, 6))
49
- # Create mask for upper triangle including diagonal to show only lower triangle
50
  mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
51
 
52
  # Create a heatmap
53
  sns.heatmap(
54
- correlation_matrix,
55
- annot=True,
56
- cmap='Blues',
57
  center=0,
58
  square=True,
59
  mask=mask,
60
- cbar_kws={"shrink": .8},
61
- fmt='.3f'
62
  )
63
 
64
- plt.xlabel('Tasks', fontsize=12)
65
- plt.ylabel('Tasks', fontsize=12)
66
- plt.xticks(rotation=45, ha='right')
67
  plt.yticks(rotation=0)
68
  plt.tight_layout()
69
 
70
  # Save the plot
71
- plt.savefig('task_correlation_matrix.png', dpi=300, bbox_inches='tight')
72
  plt.show()
73
 
74
  # Print correlation values for reference
@@ -77,56 +76,91 @@ print("Note: Classification scores have been logit-transformed to reduce skewnes
77
  print(correlation_matrix.round(3))
78
 
79
  # Also create a scatter plot matrix for pairwise relationships with highlighted languages
80
- highlighted_languages = ['en', 'zh', 'hi', 'es', 'ar']
 
81
 
82
  # Create color mapping
83
  def get_color_and_label(lang_code):
84
  if lang_code in highlighted_languages:
85
- color_map = {'en': 'red', 'zh': 'blue', 'hi': 'green', 'es': 'orange', 'ar': 'purple'}
 
 
 
 
 
 
86
  return color_map[lang_code], lang_code
87
  else:
88
- return 'lightgray', 'Other'
 
89
 
90
  # Create custom scatter plot matrix
91
  tasks = pivot_df.columns.tolist()
92
  n_tasks = len(tasks)
93
 
94
  fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
95
- fig.suptitle('Pairwise Task Performance', fontsize=16, fontweight='bold')
96
 
97
  # Create legend elements
98
  legend_elements = []
99
  for lang in highlighted_languages:
100
  color, _ = get_color_and_label(lang)
101
- legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=8, label=lang))
102
- legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgray', markersize=8, label='Other'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  for i, task_y in enumerate(tasks):
105
  for j, task_x in enumerate(tasks):
106
  ax = axes[i, j]
107
-
108
  if i == j:
109
  # Diagonal: histogram
110
  task_data = pivot_df[task_y].dropna()
111
  colors = [get_color_and_label(lang)[0] for lang in task_data.index]
112
- ax.hist(task_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
113
- ax.set_title(f'{task_y}', fontsize=10)
114
  else:
115
  # Off-diagonal: scatter plot
116
  for lang_code in pivot_df.index:
117
- if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(pivot_df.loc[lang_code, task_y]):
 
 
118
  color, _ = get_color_and_label(lang_code)
119
  alpha = 0.8 if lang_code in highlighted_languages else 0.3
120
  size = 50 if lang_code in highlighted_languages else 20
121
- ax.scatter(pivot_df.loc[lang_code, task_x], pivot_df.loc[lang_code, task_y],
122
- c=color, alpha=alpha, s=size)
123
-
 
 
 
 
 
124
  # Set labels
125
  if i == n_tasks - 1:
126
  ax.set_xlabel(task_x, fontsize=10)
127
  if j == 0:
128
  ax.set_ylabel(task_y, fontsize=10)
129
-
130
  # Remove tick labels except for edges
131
  if i != n_tasks - 1:
132
  ax.set_xticklabels([])
@@ -136,15 +170,15 @@ for i, task_y in enumerate(tasks):
136
  # Add legend
137
  fig.legend(
138
  handles=legend_elements,
139
- loc='lower center',
140
  bbox_to_anchor=(0.5, -0.05),
141
  ncol=len(legend_elements),
142
  frameon=False,
143
  fontsize=10,
144
  handletextpad=0.5,
145
- columnspacing=1.0
146
  )
147
 
148
  plt.tight_layout()
149
- plt.savefig('task_scatter_matrix.png', dpi=300, bbox_inches='tight')
150
  plt.show()
 
9
  df = df[df["metric"] != "chrf"]
10
  df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
11
 
12
+
13
  # Apply logit transformation to classification scores to reduce skewness
14
  def transform_classification_scores(row):
15
+ if row["task"] == "classification":
16
  # Avoid division by zero and infinite values by clipping
17
+ score = np.clip(row["score"], 0.001, 0.999)
18
  # Apply logit transformation (log(p/(1-p)))
19
  return logit(score)
20
  else:
21
+ return row["score"]
22
+
23
 
24
+ df["score"] = df.apply(transform_classification_scores, axis=1)
25
 
26
  # Create a pivot table with tasks as columns and languages as rows
27
  pivot_df = df.pivot_table(
28
+ values="score", index="bcp_47", columns="task", aggfunc="mean"
 
 
 
29
  )
30
 
31
  # Sort and filter tasks
32
  ordered_tasks = [
33
+ "translation_from",
34
+ "translation_to",
35
+ "classification",
36
+ "mmlu",
37
+ "arc",
38
+ "mgsm",
39
  ]
40
  # Drop 'truthfulqa' if present and reindex columns
41
  pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
 
45
 
46
  # Create the correlation plot
47
  plt.figure(figsize=(8, 6))
48
+ # Create mask for upper triangle including diagonal to show only lower triangle
49
  mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
50
 
51
  # Create a heatmap
52
  sns.heatmap(
53
+ correlation_matrix,
54
+ annot=True,
55
+ cmap="Blues",
56
  center=0,
57
  square=True,
58
  mask=mask,
59
+ cbar_kws={"shrink": 0.8},
60
+ fmt=".3f",
61
  )
62
 
63
+ plt.xlabel("Tasks", fontsize=12)
64
+ plt.ylabel("Tasks", fontsize=12)
65
+ plt.xticks(rotation=45, ha="right")
66
  plt.yticks(rotation=0)
67
  plt.tight_layout()
68
 
69
  # Save the plot
70
+ plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
71
  plt.show()
72
 
73
  # Print correlation values for reference
 
76
  print(correlation_matrix.round(3))
77
 
78
  # Also create a scatter plot matrix for pairwise relationships with highlighted languages
79
+ highlighted_languages = ["en", "zh", "hi", "es", "ar"]
80
+
81
 
82
  # Create color mapping
83
  def get_color_and_label(lang_code):
84
  if lang_code in highlighted_languages:
85
+ color_map = {
86
+ "en": "red",
87
+ "zh": "blue",
88
+ "hi": "green",
89
+ "es": "orange",
90
+ "ar": "purple",
91
+ }
92
  return color_map[lang_code], lang_code
93
  else:
94
+ return "lightgray", "Other"
95
+
96
 
97
  # Create custom scatter plot matrix
98
  tasks = pivot_df.columns.tolist()
99
  n_tasks = len(tasks)
100
 
101
  fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
102
+ fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")
103
 
104
  # Create legend elements
105
  legend_elements = []
106
  for lang in highlighted_languages:
107
  color, _ = get_color_and_label(lang)
108
+ legend_elements.append(
109
+ plt.Line2D(
110
+ [0],
111
+ [0],
112
+ marker="o",
113
+ color="w",
114
+ markerfacecolor=color,
115
+ markersize=8,
116
+ label=lang,
117
+ )
118
+ )
119
+ legend_elements.append(
120
+ plt.Line2D(
121
+ [0],
122
+ [0],
123
+ marker="o",
124
+ color="w",
125
+ markerfacecolor="lightgray",
126
+ markersize=8,
127
+ label="Other",
128
+ )
129
+ )
130
 
131
  for i, task_y in enumerate(tasks):
132
  for j, task_x in enumerate(tasks):
133
  ax = axes[i, j]
134
+
135
  if i == j:
136
  # Diagonal: histogram
137
  task_data = pivot_df[task_y].dropna()
138
  colors = [get_color_and_label(lang)[0] for lang in task_data.index]
139
+ ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
140
+ ax.set_title(f"{task_y}", fontsize=10)
141
  else:
142
  # Off-diagonal: scatter plot
143
  for lang_code in pivot_df.index:
144
+ if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
145
+ pivot_df.loc[lang_code, task_y]
146
+ ):
147
  color, _ = get_color_and_label(lang_code)
148
  alpha = 0.8 if lang_code in highlighted_languages else 0.3
149
  size = 50 if lang_code in highlighted_languages else 20
150
+ ax.scatter(
151
+ pivot_df.loc[lang_code, task_x],
152
+ pivot_df.loc[lang_code, task_y],
153
+ c=color,
154
+ alpha=alpha,
155
+ s=size,
156
+ )
157
+
158
  # Set labels
159
  if i == n_tasks - 1:
160
  ax.set_xlabel(task_x, fontsize=10)
161
  if j == 0:
162
  ax.set_ylabel(task_y, fontsize=10)
163
+
164
  # Remove tick labels except for edges
165
  if i != n_tasks - 1:
166
  ax.set_xticklabels([])
 
170
  # Add legend
171
  fig.legend(
172
  handles=legend_elements,
173
+ loc="lower center",
174
  bbox_to_anchor=(0.5, -0.05),
175
  ncol=len(legend_elements),
176
  frameon=False,
177
  fontsize=10,
178
  handletextpad=0.5,
179
+ columnspacing=1.0,
180
  )
181
 
182
  plt.tight_layout()
183
+ plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
184
  plt.show()
evals/tasks.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import random
2
  from functools import partial
3
  from textwrap import dedent
@@ -5,10 +6,10 @@ from textwrap import dedent
5
  import evaluate
6
  import pandas as pd
7
  import sentencepiece as spm
 
8
  from datasets_.flores import flores_sentences
9
  from datasets_.mgsm import load_mgsm, parse_number
10
  from datasets_.mmlu import load_mmlu
11
- from datasets_.arc import load_uhura_arc_easy
12
  from datasets_.truthfulqa import load_truthfulqa
13
  from google.cloud import translate_v2 as translate
14
  from langcodes import closest_supported_match
@@ -47,6 +48,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
47
  original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
48
  target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
49
  script = script_name(target_language.flores_path.split("_")[1])
 
50
  if model == "google/translate-v2":
51
  original_language = closest_supported_match(
52
  original_language, supported_languages
@@ -66,7 +68,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
66
  messages=[
67
  {
68
  "role": "user",
69
- "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
70
  }
71
  ],
72
  temperature=0,
@@ -91,6 +93,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
91
  "task": f"translation_{mode}",
92
  "metric": metric,
93
  "score": score,
 
94
  "sentence_nr": sentence_nr,
95
  }
96
  for metric, score in (
@@ -112,57 +115,33 @@ async def classify_and_evaluate(model, bcp_47, nr):
112
  )
113
  top_topics = paragraphs.value_counts("topic").head(5).index
114
  paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
115
- examples = pd.concat(
116
- [
117
- paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
118
- for t in top_topics
119
- ]
120
- ).sample(frac=1, random_state=nr)
121
- test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
122
- frac=1, random_state=42
123
- )
124
- test_paragraph = test_paragraphs.iloc[nr]
125
 
126
- def format_prompt(text):
127
- return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
128
 
129
- messages = []
130
- for example in examples.itertuples():
131
- messages += [
132
- {"role": "user", "content": format_prompt(example.text)},
133
- {"role": "assistant", "content": example.topic},
134
- ]
135
- # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
136
- # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
137
- try:
138
- pred = await complete(
139
- model=model,
140
- messages=[
141
- *messages,
142
- {
143
- "role": "user",
144
- "content": format_prompt(test_paragraph.text),
145
- },
146
- ],
147
- temperature=0,
148
- max_tokens=30,
149
- )
150
- true = test_paragraph.topic
151
- others = [t for t in top_topics if t != true]
152
- acc = (
153
- int(
154
- pred.startswith(true)
155
- or (true in pred and not any(o in pred for o in others))
156
- )
157
- if pred
158
- else 0
159
  )
160
- except Exception as e:
161
- if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
162
- print(f"Max tokens exceeded for {model} in {bcp_47}")
163
- acc = 0
164
- else:
165
- raise e
166
  return [
167
  {
168
  "model": model,
@@ -170,6 +149,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
170
  "task": "classification",
171
  "metric": "accuracy",
172
  "score": acc,
 
173
  "sentence_nr": nr,
174
  }
175
  ]
@@ -232,39 +212,38 @@ def format_multiple_choice(item):
232
  A: {item["choices"][0]}
233
  B: {item["choices"][1]}
234
  C: {item["choices"][2]}
235
- D: {item["choices"][3]}
236
-
237
- A|B|C|D?"""
238
 
239
 
240
  async def mmlu_and_evaluate(model, language_bcp_47, nr):
241
- ds_name, examples, task = load_mmlu(language_bcp_47, nr)
242
  if not task:
243
  return []
244
 
245
- messages = []
246
- for example in examples:
247
- messages += [
248
- {"role": "user", "content": format_multiple_choice(example)},
249
- {"role": "assistant", "content": example["answer"]},
250
- ]
251
- messages += [{"role": "user", "content": format_multiple_choice(task)}]
252
- try:
253
- response = await complete(
254
- model=model,
255
- messages=messages,
256
- temperature=0,
257
- max_tokens=1,
258
- )
259
- if response:
260
- acc = int(response[:1].strip() == task["answer"])
261
- else:
262
- acc = 0
263
- except Exception as e:
264
- if "ResponsibleAIPolicyViolation" in str(e):
265
- acc = 0
266
- else:
267
- raise e
 
268
  return [
269
  {
270
  "model": model,
@@ -272,39 +251,40 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
272
  "task": "mmlu",
273
  "metric": "accuracy",
274
  "score": acc,
 
275
  "sentence_nr": nr,
276
  }
277
  ]
278
 
279
 
280
  async def arc_and_evaluate(model, language_bcp_47, nr):
281
- ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
282
  if not task:
283
  return []
284
 
285
- messages = []
286
- for example in examples:
287
- messages += [
288
- {"role": "user", "content": format_multiple_choice(example)},
289
- {"role": "assistant", "content": example["answer"]},
290
- ]
291
- messages += [{"role": "user", "content": format_multiple_choice(task)}]
292
- try:
293
- response = await complete(
294
- model=model,
295
- messages=messages,
296
- temperature=0,
297
- max_tokens=1,
298
- )
299
- if response:
300
- acc = int(response[:1].strip() == task["answer"])
301
- else:
302
- acc = 0
303
- except Exception as e:
304
- if "ResponsibleAIPolicyViolation" in str(e):
305
- acc = 0
306
- else:
307
- raise e
308
  return [
309
  {
310
  "model": model,
@@ -312,6 +292,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
312
  "task": "arc",
313
  "metric": "accuracy",
314
  "score": acc,
 
315
  "sentence_nr": nr,
316
  }
317
  ]
@@ -332,40 +313,42 @@ def format_multiple_choice_truthfulqa(item):
332
  text = item["question"] + "\n\n"
333
  for i, choice in enumerate(item["choices"]):
334
  text += f"{letters[i]}: {choice}\n"
335
- text += "|".join(letters[: len(item["choices"])]) + "?"
336
  return text
337
 
338
 
339
  async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
340
- ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
341
  if not task:
342
  return []
343
- task = shuffle_choices_and_labels(task)
344
- answer = letters[task["labels"].index(1)]
345
- messages = []
346
- for example in examples:
347
- example = shuffle_choices_and_labels(example)
348
- messages += [
349
- {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
350
- {"role": "assistant", "content": letters[example["labels"].index(1)]},
351
- ]
352
- messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
353
- try:
354
- response = await complete(
355
- model=model,
356
- messages=messages,
357
- temperature=0,
358
- max_tokens=1,
359
- )
360
- if response:
361
- acc = int(response[:1].strip() == answer)
362
- else:
363
- acc = 0
364
- except Exception as e:
365
- if "ResponsibleAIPolicyViolation" in str(e):
366
- acc = 0
367
- else:
368
- raise e
 
 
 
369
  return [
370
  {
371
  "model": model,
@@ -373,30 +356,36 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
373
  "task": "truthfulqa",
374
  "metric": "accuracy",
375
  "score": acc,
 
376
  "sentence_nr": nr,
377
  }
378
  ]
379
 
380
 
381
  async def mgsm_and_evaluate(model, language_bcp_47, nr):
382
- system_prompt = """
383
- Solve the math problem. Use reasoning, and finally give the answer as a number.
384
- Response format: <reasoning> #### <number>
385
- """
386
- system_prompt = dedent(system_prompt).strip()
387
- ds_slug, question = load_mgsm(language_bcp_47, nr)
388
  if not question:
389
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  response = await complete(
391
  model=model,
392
- messages=[
393
- {"role": "system", "content": system_prompt},
394
- {"role": "user", "content": question["question"]},
395
- ],
396
  temperature=0,
397
  max_tokens=1024,
398
  )
399
- if response and len(response.split("####")) == 2:
400
  number = response.split("####")[1].strip()
401
  accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
402
  else:
@@ -409,6 +398,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
409
  "task": "mgsm",
410
  "metric": "accuracy",
411
  "score": accuracy,
 
412
  "sentence_nr": nr,
413
  }
414
  ]
@@ -449,10 +439,8 @@ tasks = {
449
  "translation_from": partial(translate_and_evaluate, mode="from"),
450
  "translation_to": partial(translate_and_evaluate, mode="to"),
451
  "classification": classify_and_evaluate,
452
- # "mlm": mlm_and_evaluate,
453
  "mmlu": mmlu_and_evaluate,
454
  "arc": arc_and_evaluate,
455
  "truthfulqa": truthfulqa_and_evaluate,
456
  "mgsm": mgsm_and_evaluate,
457
- # "asr": transcribe_and_evaluate,
458
  }
 
1
+ import asyncio
2
  import random
3
  from functools import partial
4
  from textwrap import dedent
 
6
  import evaluate
7
  import pandas as pd
8
  import sentencepiece as spm
9
+ from datasets_.arc import load_uhura_arc_easy
10
  from datasets_.flores import flores_sentences
11
  from datasets_.mgsm import load_mgsm, parse_number
12
  from datasets_.mmlu import load_mmlu
 
13
  from datasets_.truthfulqa import load_truthfulqa
14
  from google.cloud import translate_v2 as translate
15
  from langcodes import closest_supported_match
 
48
  original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
49
  target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
50
  script = script_name(target_language.flores_path.split("_")[1])
51
+ translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
52
  if model == "google/translate-v2":
53
  original_language = closest_supported_match(
54
  original_language, supported_languages
 
68
  messages=[
69
  {
70
  "role": "user",
71
+ "content": translation_prompt,
72
  }
73
  ],
74
  temperature=0,
 
93
  "task": f"translation_{mode}",
94
  "metric": metric,
95
  "score": score,
96
+ "origin": "human", # FLORES+ is human-translated
97
  "sentence_nr": sentence_nr,
98
  }
99
  for metric, score in (
 
115
  )
116
  top_topics = paragraphs.value_counts("topic").head(5).index
117
  paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
118
+ test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
 
 
 
 
 
 
 
 
 
119
 
120
+ prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
121
+ Reply with only the topic name.
122
 
123
+ Text:
124
+ {test_paragraph.text}
125
+ """
126
+ response = await complete(
127
+ model=model,
128
+ messages=[{"role": "user", "content": prompt}],
129
+ temperature=0,
130
+ max_tokens=30,
131
+ )
132
+
133
+ pred = response.lower().strip() if response else ""
134
+ true = test_paragraph.topic.lower().strip()
135
+ others = [t for t in top_topics if t != true]
136
+ acc = (
137
+ int(
138
+ pred.startswith(true)
139
+ or (true in pred and not any(o in pred for o in others))
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  )
141
+ if pred
142
+ else 0
143
+ )
144
+
 
 
145
  return [
146
  {
147
  "model": model,
 
149
  "task": "classification",
150
  "metric": "accuracy",
151
  "score": acc,
152
+ "origin": "human", # FLORES+ is human-translated
153
  "sentence_nr": nr,
154
  }
155
  ]
 
212
  A: {item["choices"][0]}
213
  B: {item["choices"][1]}
214
  C: {item["choices"][2]}
215
+ D: {item["choices"][3]}"""
 
 
216
 
217
 
218
  async def mmlu_and_evaluate(model, language_bcp_47, nr):
219
+ ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
220
  if not task:
221
  return []
222
 
223
+ messages = [
224
+ {
225
+ "role": "user",
226
+ "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
227
+
228
+ Response format: <reasoning> #### <letter>
229
+
230
+ ---
231
+
232
+ {format_multiple_choice(task)}""",
233
+ },
234
+ ]
235
+ response = await complete(
236
+ model=model,
237
+ messages=messages,
238
+ temperature=0,
239
+ max_tokens=1024,
240
+ )
241
+ if response and "####" in response:
242
+ answer = response.split("####")[-1].strip()
243
+ acc = int(answer[:1] == task["answer"])
244
+ else:
245
+ acc = 0
246
+
247
  return [
248
  {
249
  "model": model,
 
251
  "task": "mmlu",
252
  "metric": "accuracy",
253
  "score": acc,
254
+ "origin": origin, # Add origin tag to results
255
  "sentence_nr": nr,
256
  }
257
  ]
258
 
259
 
260
  async def arc_and_evaluate(model, language_bcp_47, nr):
261
+ ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
262
  if not task:
263
  return []
264
 
265
+ messages = [
266
+ {
267
+ "role": "user",
268
+ "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
269
+
270
+ Response format: <reasoning> #### <letter>
271
+
272
+ ---
273
+
274
+ {format_multiple_choice(task)}""",
275
+ },
276
+ ]
277
+ response = await complete(
278
+ model=model,
279
+ messages=messages,
280
+ temperature=0,
281
+ max_tokens=1024,
282
+ )
283
+ if response and "####" in response:
284
+ answer = response.split("####")[-1].strip()
285
+ acc = int(answer[:1] == task["answer"])
286
+ else:
287
+ acc = 0
288
  return [
289
  {
290
  "model": model,
 
292
  "task": "arc",
293
  "metric": "accuracy",
294
  "score": acc,
295
+ "origin": origin,
296
  "sentence_nr": nr,
297
  }
298
  ]
 
313
  text = item["question"] + "\n\n"
314
  for i, choice in enumerate(item["choices"]):
315
  text += f"{letters[i]}: {choice}\n"
 
316
  return text
317
 
318
 
319
  async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
320
+ ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
321
  if not task:
322
  return []
323
+
324
+ # Find the correct answer
325
+ correct_choice_index = task["labels"].index(1)
326
+ answer = letters[correct_choice_index]
327
+
328
+ messages = [
329
+ {
330
+ "role": "user",
331
+ "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
332
+
333
+ Response format: <reasoning> #### <letter>
334
+
335
+ ---
336
+
337
+ {format_multiple_choice_truthfulqa(task)}""",
338
+ },
339
+ ]
340
+ response = await complete(
341
+ model=model,
342
+ messages=messages,
343
+ temperature=0,
344
+ max_tokens=1024, # Increased for reasoning
345
+ )
346
+ if response and "####" in response:
347
+ pred_answer = response.split("####")[-1].strip()
348
+ acc = int(pred_answer[:1].upper() == answer)
349
+ else:
350
+ acc = 0
351
+
352
  return [
353
  {
354
  "model": model,
 
356
  "task": "truthfulqa",
357
  "metric": "accuracy",
358
  "score": acc,
359
+ "origin": origin,
360
  "sentence_nr": nr,
361
  }
362
  ]
363
 
364
 
365
  async def mgsm_and_evaluate(model, language_bcp_47, nr):
366
+ ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
 
 
 
 
 
367
  if not question:
368
  return []
369
+
370
+ messages = [
371
+ {
372
+ "role": "user",
373
+ "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
374
+
375
+ Response format: <reasoning> #### <number>
376
+
377
+ ---
378
+
379
+ {question["question"]}""",
380
+ },
381
+ ]
382
  response = await complete(
383
  model=model,
384
+ messages=messages,
 
 
 
385
  temperature=0,
386
  max_tokens=1024,
387
  )
388
+ if response and "####" in response:
389
  number = response.split("####")[1].strip()
390
  accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
391
  else:
 
398
  "task": "mgsm",
399
  "metric": "accuracy",
400
  "score": accuracy,
401
+ "origin": origin,
402
  "sentence_nr": nr,
403
  }
404
  ]
 
439
  "translation_from": partial(translate_and_evaluate, mode="from"),
440
  "translation_to": partial(translate_and_evaluate, mode="to"),
441
  "classification": classify_and_evaluate,
 
442
  "mmlu": mmlu_and_evaluate,
443
  "arc": arc_and_evaluate,
444
  "truthfulqa": truthfulqa_and_evaluate,
445
  "mgsm": mgsm_and_evaluate,
 
446
  }
evals/translate.py CHANGED
@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
6
  if __name__ == "__main__":
7
  translate_mmlu(languages)
8
  translate_mgsm(languages)
9
- translate_arc(languages)
 
6
  if __name__ == "__main__":
7
  translate_mmlu(languages)
8
  translate_mgsm(languages)
9
+ translate_arc(languages)
frontend/package-lock.json CHANGED
The diff for this file is too large to render. See raw diff
 
frontend/package.json CHANGED
@@ -6,13 +6,12 @@
6
  "@observablehq/plot": "^0.6.17",
7
  "@testing-library/dom": "^10.4.0",
8
  "@testing-library/jest-dom": "^6.6.3",
9
- "@testing-library/react": "^16.2.0",
10
  "@testing-library/user-event": "^13.5.0",
11
  "primeicons": "^7.0.0",
12
  "primereact": "^10.9.3",
13
- "react": "^19.0.0",
14
- "react-dom": "^19.0.0",
15
- "react-scripts": "5.0.1",
16
  "topojson-simplify": "^3.0.3",
17
  "web-vitals": "^2.1.4"
18
  },
@@ -41,5 +40,8 @@
41
  "last 1 safari version"
42
  ]
43
  },
44
- "proxy": "http://localhost:8000"
 
 
 
45
  }
 
6
  "@observablehq/plot": "^0.6.17",
7
  "@testing-library/dom": "^10.4.0",
8
  "@testing-library/jest-dom": "^6.6.3",
9
+ "@testing-library/react": "^15.0.0",
10
  "@testing-library/user-event": "^13.5.0",
11
  "primeicons": "^7.0.0",
12
  "primereact": "^10.9.3",
13
+ "react": "^18.2.0",
14
+ "react-dom": "^18.2.0",
 
15
  "topojson-simplify": "^3.0.3",
16
  "web-vitals": "^2.1.4"
17
  },
 
40
  "last 1 safari version"
41
  ]
42
  },
43
+ "proxy": "http://localhost:8000",
44
+ "devDependencies": {
45
+ "react-scripts": "^5.0.1"
46
+ }
47
  }
frontend/src/App.js CHANGED
@@ -19,9 +19,14 @@ function App () {
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState(null)
21
  const [selectedLanguages, setSelectedLanguages] = useState([])
 
22
  const [dialogVisible, setDialogVisible] = useState(false)
23
  const [aboutVisible, setAboutVisible] = useState(false)
24
  const [contributeVisible, setContributeVisible] = useState(false)
 
 
 
 
25
 
26
  useEffect(() => {
27
  fetch('/api/data', {
@@ -36,6 +41,7 @@ function App () {
36
  })
37
  .then(jsonData => {
38
  setData(jsonData)
 
39
  setLoading(false)
40
  })
41
  .catch(err => {
@@ -44,8 +50,27 @@ function App () {
44
  })
45
  }, [selectedLanguages])
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
48
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
 
49
  useEffect(() => {
50
  const handleResize = () => {
51
  setWindowWidth(window.innerWidth)
@@ -55,6 +80,44 @@ function App () {
55
  return () => window.removeEventListener('resize', handleResize)
56
  }, [])
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  return (
59
  <PrimeReactProvider>
60
  <div
@@ -69,35 +132,50 @@ function App () {
69
  style={{
70
  backgroundColor: '#fff3cd',
71
  color: '#856404',
72
- padding: '0.75rem 1.25rem',
73
  marginBottom: '1rem',
74
  border: '1px solid #ffeeba',
75
  borderRadius: '0.25rem',
76
- textAlign: 'center'
 
 
77
  }}
78
  >
79
  <strong>Work in Progress:</strong> This dashboard is currently under
80
- active development. Evaluation results are not yet final.
 
 
 
 
 
 
 
 
 
81
  <a
82
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
83
  target='_blank'
84
  rel='noopener noreferrer'
85
  style={{
86
  textDecoration: 'none',
87
- color: '#856404',
88
- float: 'right',
89
- fontSize: '1.2rem',
90
- fontWeight: 'bold',
91
- padding: '0 0.5rem',
92
- borderRadius: '3px',
93
- backgroundColor: 'rgba(255,255,255,0.3)'
 
 
 
 
 
 
 
 
94
  }}
95
  >
96
- <i
97
- className='pi pi-github'
98
- title='View on GitHub'
99
- style={{ marginRight: '0.3rem' }}
100
- />
101
  GitHub
102
  </a>
103
  </div>
@@ -149,39 +227,88 @@ function App () {
149
  <div
150
  style={{
151
  display: 'flex',
152
- gap: '1rem',
153
- marginBottom: '1.5rem',
154
  flexWrap: 'wrap',
155
  justifyContent: 'center'
156
  }}
157
  >
158
- <Button
159
- label='📚 About this tool'
160
- className='p-button-text'
161
  onClick={() => setAboutVisible(true)}
162
  style={{
163
- color: '#666',
164
- border: '1px solid #ddd',
165
- padding: '0.5rem 1rem',
166
- borderRadius: '4px',
167
- fontSize: '0.9rem'
 
 
 
 
 
 
 
 
 
 
 
 
168
  }}
169
- />
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- <Button
172
- label='🚀 Add your model (soon)'
173
- className='p-button-text'
174
  onClick={() => setContributeVisible(true)}
175
- tooltip='This feature is on our roadmap and will be available soon.'
176
- tooltipOptions={{ position: 'bottom' }}
177
  style={{
178
- color: '#666',
179
- border: '1px solid #ddd',
180
- padding: '0.5rem 1rem',
181
- borderRadius: '4px',
182
- fontSize: '0.9rem'
 
 
 
 
 
 
 
 
 
 
183
  }}
184
- />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  </div>
186
 
187
  {data && (
@@ -220,6 +347,7 @@ function App () {
220
  data={data.model_table}
221
  selectedLanguages={selectedLanguages}
222
  allLanguages={data.language_table || []}
 
223
  />
224
  <LanguageTable
225
  data={data.language_table}
@@ -248,20 +376,18 @@ function App () {
248
  color: '#666'
249
  }}
250
  />
251
- <Carousel
252
- value={[
253
- <WorldMap data={data.countries} />,
254
- <LanguagePlot data={data} />,
255
- <SpeakerPlot data={data} />,
256
- <HistoryPlot data={data} />,
257
- <CostPlot data={data} />
258
- ]}
259
- numScroll={1}
260
- numVisible={1}
261
- itemTemplate={item => item}
262
- circular
263
- style={{ width: '100%', minHeight: '650px' }}
264
- />
265
  </div>
266
  </>
267
  )}
@@ -409,36 +535,16 @@ function App () {
409
  modal
410
  header={null}
411
  >
412
- {data && (
413
  <div style={{ width: '100%', height: '100%' }}>
414
  <Carousel
415
- value={[
416
- <WorldMap
417
- data={data.countries}
418
- width={windowWidth * 0.7}
419
- height={windowHeight * 0.6}
420
- />,
421
- <LanguagePlot
422
- data={data}
423
- width={windowWidth * 0.7}
424
- height={windowHeight * 0.6}
425
- />,
426
- <SpeakerPlot
427
- data={data}
428
- width={windowWidth * 0.7}
429
- height={windowHeight * 0.6}
430
- />,
431
- <HistoryPlot
432
- data={data}
433
- width={windowWidth * 0.7}
434
- height={windowHeight * 0.6}
435
- />,
436
- <CostPlot data={data} />
437
- ]}
438
  numScroll={1}
439
  numVisible={1}
440
  itemTemplate={item => item}
441
- circular
 
442
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
443
  />
444
  </div>
@@ -449,4 +555,4 @@ function App () {
449
  )
450
  }
451
 
452
- export default App
 
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState(null)
21
  const [selectedLanguages, setSelectedLanguages] = useState([])
22
+ const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
23
  const [dialogVisible, setDialogVisible] = useState(false)
24
  const [aboutVisible, setAboutVisible] = useState(false)
25
  const [contributeVisible, setContributeVisible] = useState(false)
26
+
27
+ // Add state for carousel items
28
+ const [carouselItems, setCarouselItems] = useState([])
29
+ const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
30
 
31
  useEffect(() => {
32
  fetch('/api/data', {
 
41
  })
42
  .then(jsonData => {
43
  setData(jsonData)
44
+ setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
45
  setLoading(false)
46
  })
47
  .catch(err => {
 
50
  })
51
  }, [selectedLanguages])
52
 
53
+ // Create carousel items when data is loaded
54
+ useEffect(() => {
55
+ if (data) {
56
+ // Add a small delay to ensure components are ready
57
+ const timer = setTimeout(() => {
58
+ setCarouselItems([
59
+ <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
60
+ <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
61
+ <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
62
+ <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
63
+ <CostPlot key="costplot-4" data={data} width={750} height={500} />
64
+ ]);
65
+ }, 100);
66
+
67
+ return () => clearTimeout(timer);
68
+ }
69
+ }, [data])
70
+
71
  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
72
  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
73
+
74
  useEffect(() => {
75
  const handleResize = () => {
76
  setWindowWidth(window.innerWidth)
 
80
  return () => window.removeEventListener('resize', handleResize)
81
  }, [])
82
 
83
+ // Create full-screen carousel items when data or window size changes
84
+ useEffect(() => {
85
+ if (data) {
86
+ const timer = setTimeout(() => {
87
+ setFullScreenCarouselItems([
88
+ <WorldMap
89
+ key="fs-worldmap-0"
90
+ data={data.countries}
91
+ allLanguages={data.language_table}
92
+ width={windowWidth * 0.7}
93
+ height={windowHeight * 0.6}
94
+ />,
95
+ <LanguagePlot
96
+ key="fs-langplot-1"
97
+ data={data}
98
+ width={windowWidth * 0.7}
99
+ height={windowHeight * 0.6}
100
+ />,
101
+ <SpeakerPlot
102
+ key="fs-speakerplot-2"
103
+ data={data}
104
+ width={windowWidth * 0.7}
105
+ height={windowHeight * 0.6}
106
+ />,
107
+ <HistoryPlot
108
+ key="fs-histplot-3"
109
+ data={data}
110
+ width={windowWidth * 0.7}
111
+ height={windowHeight * 0.6}
112
+ />,
113
+ <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
114
+ ]);
115
+ }, 100);
116
+
117
+ return () => clearTimeout(timer);
118
+ }
119
+ }, [data, windowWidth, windowHeight])
120
+
121
  return (
122
  <PrimeReactProvider>
123
  <div
 
132
  style={{
133
  backgroundColor: '#fff3cd',
134
  color: '#856404',
135
+ padding: '1rem 1.5rem',
136
  marginBottom: '1rem',
137
  border: '1px solid #ffeeba',
138
  borderRadius: '0.25rem',
139
+ textAlign: 'center',
140
+ lineHeight: '1.5',
141
+ position: 'relative'
142
  }}
143
  >
144
  <strong>Work in Progress:</strong> This dashboard is currently under
145
+ active development. Evaluation results are not yet final. Note that the visualised results currently stem from sampling 20 instances per combination of model, task, and language. We have evaluated 139 languages across 41 models and 7 tasks, totaling over 300,000 individual evaluations. Only the top 150 languages by speaker count are included in the current evaluation scope. More extensive evaluation runs will be released later this year.
146
+ </div>
147
+ <div
148
+ style={{
149
+ display: 'flex',
150
+ justifyContent: 'flex-end',
151
+ padding: '0 1.5rem',
152
+ marginBottom: '1rem'
153
+ }}
154
+ >
155
  <a
156
  href='https://github.com/datenlabor-bmz/ai-language-monitor'
157
  target='_blank'
158
  rel='noopener noreferrer'
159
  style={{
160
  textDecoration: 'none',
161
+ color: '#6c757d',
162
+ fontSize: '1rem',
163
+ fontWeight: '500',
164
+ padding: '0.5rem 1rem',
165
+ borderRadius: '0.375rem',
166
+ backgroundColor: '#f8f9fa',
167
+ border: '1px solid #e9ecef',
168
+ display: 'flex',
169
+ alignItems: 'center',
170
+ gap: '0.5rem',
171
+ transition: 'all 0.2s ease',
172
+ ':hover': {
173
+ backgroundColor: '#e9ecef',
174
+ color: '#495057'
175
+ }
176
  }}
177
  >
178
+ <i className='pi pi-github' title='View on GitHub' />
 
 
 
 
179
  GitHub
180
  </a>
181
  </div>
 
227
  <div
228
  style={{
229
  display: 'flex',
230
+ gap: '0.75rem',
231
+ marginBottom: '2rem',
232
  flexWrap: 'wrap',
233
  justifyContent: 'center'
234
  }}
235
  >
236
+ <button
 
 
237
  onClick={() => setAboutVisible(true)}
238
  style={{
239
+ background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
240
+ color: 'white',
241
+ border: 'none',
242
+ padding: '0.75rem 1.5rem',
243
+ borderRadius: '12px',
244
+ fontSize: '0.95rem',
245
+ fontWeight: '500',
246
+ cursor: 'pointer',
247
+ display: 'flex',
248
+ alignItems: 'center',
249
+ gap: '0.5rem',
250
+ boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
251
+ transition: 'all 0.3s ease',
252
+ ':hover': {
253
+ transform: 'translateY(-2px)',
254
+ boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
255
+ }
256
  }}
257
+ onMouseEnter={(e) => {
258
+ e.target.style.transform = 'translateY(-2px)';
259
+ e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
260
+ }}
261
+ onMouseLeave={(e) => {
262
+ e.target.style.transform = 'translateY(0)';
263
+ e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
264
+ }}
265
+ >
266
+ <span style={{ fontSize: '1.1rem' }}>📚</span>
267
+ About this tool
268
+ </button>
269
 
270
+ <button
 
 
271
  onClick={() => setContributeVisible(true)}
272
+ title='This feature is on our roadmap and will be available soon.'
 
273
  style={{
274
+ background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
275
+ color: '#6b46c1',
276
+ border: 'none',
277
+ padding: '0.75rem 1.5rem',
278
+ borderRadius: '12px',
279
+ fontSize: '0.95rem',
280
+ fontWeight: '500',
281
+ cursor: 'pointer',
282
+ display: 'flex',
283
+ alignItems: 'center',
284
+ gap: '0.5rem',
285
+ boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
286
+ transition: 'all 0.3s ease',
287
+ position: 'relative',
288
+ overflow: 'hidden'
289
  }}
290
+ onMouseEnter={(e) => {
291
+ e.target.style.transform = 'translateY(-2px)';
292
+ e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
293
+ }}
294
+ onMouseLeave={(e) => {
295
+ e.target.style.transform = 'translateY(0)';
296
+ e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
297
+ }}
298
+ >
299
+ <span style={{ fontSize: '1.1rem' }}>🚀</span>
300
+ Add your model
301
+ <span style={{
302
+ fontSize: '0.75rem',
303
+ backgroundColor: 'rgba(107, 70, 193, 0.15)',
304
+ padding: '0.2rem 0.5rem',
305
+ borderRadius: '6px',
306
+ marginLeft: '0.5rem',
307
+ fontWeight: '600'
308
+ }}>
309
+ soon
310
+ </span>
311
+ </button>
312
  </div>
313
 
314
  {data && (
 
347
  data={data.model_table}
348
  selectedLanguages={selectedLanguages}
349
  allLanguages={data.language_table || []}
350
+ machineTranslatedMetrics={machineTranslatedMetrics}
351
  />
352
  <LanguageTable
353
  data={data.language_table}
 
376
  color: '#666'
377
  }}
378
  />
379
+ {carouselItems.length > 0 && (
380
+ <Carousel
381
+ key={`main-carousel-${carouselItems.length}-${Date.now()}`}
382
+ value={carouselItems}
383
+ numScroll={1}
384
+ numVisible={1}
385
+ itemTemplate={item => item}
386
+ circular={false}
387
+ activeIndex={0}
388
+ style={{ width: '100%', minHeight: '650px' }}
389
+ />
390
+ )}
 
 
391
  </div>
392
  </>
393
  )}
 
535
  modal
536
  header={null}
537
  >
538
+ {fullScreenCarouselItems.length > 0 && (
539
  <div style={{ width: '100%', height: '100%' }}>
540
  <Carousel
541
+ key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
542
+ value={fullScreenCarouselItems}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  numScroll={1}
544
  numVisible={1}
545
  itemTemplate={item => item}
546
+ circular={false}
547
+ activeIndex={0}
548
  style={{ width: '100%', height: 'calc(90vh - 120px)' }}
549
  />
550
  </div>
 
555
  )
556
  }
557
 
558
+ export default App
frontend/src/components/HistoryPlot.js CHANGED
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
50
  ...models.filter(d => d.newRecord),
51
  {
52
  creation_date: new Date(),
53
- maxAverage: models[models.length - 1].maxAverage
54
  }
55
  ],
56
  {
57
  x: d => d.creation_date,
58
- y: d => d.maxAverage,
59
  curve: 'step-after',
60
  strokeOpacity: 0.3
61
  }
 
50
  ...models.filter(d => d.newRecord),
51
  {
52
  creation_date: new Date(),
53
+ maxAverage: models[models.length - 1]?.maxAverage || 0
54
  }
55
  ],
56
  {
57
  x: d => d.creation_date,
58
+ y: d => d.maxAverage || 0,
59
  curve: 'step-after',
60
  strokeOpacity: 0.3
61
  }
frontend/src/components/LanguageTable.js CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
172
  filterElement={familyRowFilterTemplate}
173
  style={{ minWidth: '10rem' }}
174
  />
175
- {ScoreColumns}
176
  </DataTable>
177
  )
178
  }
 
172
  filterElement={familyRowFilterTemplate}
173
  style={{ minWidth: '10rem' }}
174
  />
175
+ {ScoreColumns()}
176
  </DataTable>
177
  )
178
  }
frontend/src/components/ModelTable.js CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
6
  import Medal from './Medal'
7
  import { Slider } from 'primereact/slider'
8
  import ScoreColumns from './ScoreColumns'
9
- const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
10
  const [filters, setFilters] = useState({
11
  type: { value: null, matchMode: FilterMatchMode.IN },
12
  size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
50
  }
51
 
52
  const SliderWithLabel = ({ value, onChange, min, max }) => {
53
- const p = 10
54
- const start = value === null ? min : Math.log(value[0]) / Math.log(p)
55
- const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
56
- const [_value, _setValue] = useState([start, stop])
57
  useEffect(() => {
58
  const timer = setTimeout(() => {
59
  onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
61
  // set to "no filter" when (almost) the whole range is selected
62
  _value[0] <= min + 0.1 && _value[1] >= max - 0.1
63
  ? null
64
- : [p ** _value[0], p ** _value[1]]
65
- })
66
- }, 1000)
67
- return () => clearTimeout(timer)
68
- }, [_value, onChange, min, max])
69
  return (
70
  <div style={{ minWidth: '20rem' }}>
71
  <div>{formatSize(p ** _value[0])}</div>
@@ -147,21 +147,35 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
147
  }
148
 
149
  const costBodyTemplate = rowData => {
150
- return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
 
 
 
 
151
  }
152
 
153
  const getHeaderText = () => {
154
- // Count languages that have evaluation data (average score available)
155
- const evaluatedLanguagesCount = allLanguages.filter(lang =>
156
- lang.average !== null && lang.average !== undefined
157
- ).length
 
 
 
 
 
 
 
 
 
 
158
 
159
  if (selectedLanguages.length === 0) {
160
  return (
161
  <span>
162
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
163
  <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
164
- Average performance across {evaluatedLanguagesCount} evaluated languages
165
  </span>
166
  </span>
167
  )
@@ -245,7 +259,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
245
  body={costBodyTemplate}
246
  style={{ minWidth: '5rem' }}
247
  />
248
- {ScoreColumns}
249
  </DataTable>
250
  )
251
  }
 
6
  import Medal from './Medal'
7
  import { Slider } from 'primereact/slider'
8
  import ScoreColumns from './ScoreColumns'
9
+ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
10
  const [filters, setFilters] = useState({
11
  type: { value: null, matchMode: FilterMatchMode.IN },
12
  size: { value: null, matchMode: FilterMatchMode.BETWEEN },
 
50
  }
51
 
52
  const SliderWithLabel = ({ value, onChange, min, max }) => {
53
+ const p = 10;
54
+ const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
55
+ const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
56
+ const [_value, _setValue] = useState([start, stop]);
57
  useEffect(() => {
58
  const timer = setTimeout(() => {
59
  onChange({
 
61
  // set to "no filter" when (almost) the whole range is selected
62
  _value[0] <= min + 0.1 && _value[1] >= max - 0.1
63
  ? null
64
+ : [p ** _value[0], p ** _value[1]],
65
+ });
66
+ }, 1000);
67
+ return () => clearTimeout(timer);
68
+ }, [_value, onChange, min, max]);
69
  return (
70
  <div style={{ minWidth: '20rem' }}>
71
  <div>{formatSize(p ** _value[0])}</div>
 
147
  }
148
 
149
  const costBodyTemplate = rowData => {
150
+ return (
151
+ <div style={{ textAlign: 'center' }}>
152
+ {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
153
+ </div>
154
+ )
155
  }
156
 
157
  const getHeaderText = () => {
158
+ // Count languages that have any evaluation data (any task scores available)
159
+ const evaluatedLanguagesCount = allLanguages.filter(lang => {
160
+ // Check if language has any task scores (not just average)
161
+ const hasAnyScores = [
162
+ 'translation_from_bleu',
163
+ 'translation_to_bleu',
164
+ 'classification_accuracy',
165
+ 'mmlu_accuracy',
166
+ 'arc_accuracy',
167
+ 'truthfulqa_accuracy',
168
+ 'mgsm_accuracy'
169
+ ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
170
+ return hasAnyScores
171
+ }).length
172
 
173
  if (selectedLanguages.length === 0) {
174
  return (
175
  <span>
176
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
177
  <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
178
+ Performance across {evaluatedLanguagesCount} evaluated languages
179
  </span>
180
  </span>
181
  )
 
259
  body={costBodyTemplate}
260
  style={{ minWidth: '5rem' }}
261
  />
262
+ {ScoreColumns(machineTranslatedMetrics)}
263
  </DataTable>
264
  )
265
  }
frontend/src/components/ScoreColumns.js CHANGED
@@ -2,21 +2,28 @@ import { Column } from 'primereact/column'
2
  import ScoreField from './ScoreField'
3
 
4
  const scoreBodyTemplate = (field, options = {}) => {
5
- const { minScore = 0, maxScore = 1 } = options
6
 
7
  return rowData => {
8
  const score = rowData[field]
9
- return ScoreField(score, minScore, maxScore)
 
 
 
 
 
 
 
10
  }
11
  }
12
 
13
- const ScoreColumns = [
14
  <Column
15
  field='average'
16
  header='Proficiency'
17
  headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
18
  sortable
19
- body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
20
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
21
  />,
22
  <Column
@@ -26,7 +33,8 @@ const ScoreColumns = [
26
  sortable
27
  body={scoreBodyTemplate('translation_from_bleu', {
28
  minScore: 0,
29
- maxScore: 0.5
 
30
  })}
31
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
32
  />,
@@ -37,7 +45,8 @@ const ScoreColumns = [
37
  sortable
38
  body={scoreBodyTemplate('translation_to_bleu', {
39
  minScore: 0,
40
- maxScore: 0.5
 
41
  })}
42
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
43
  />,
@@ -48,7 +57,8 @@ const ScoreColumns = [
48
  sortable
49
  body={scoreBodyTemplate('classification_accuracy', {
50
  minScore: 0,
51
- maxScore: 0.5
 
52
  })}
53
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
54
  />,
@@ -69,7 +79,8 @@ const ScoreColumns = [
69
  sortable
70
  body={scoreBodyTemplate('mmlu_accuracy', {
71
  minScore: 0,
72
- maxScore: 1
 
73
  })}
74
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
75
  />,
@@ -80,7 +91,8 @@ const ScoreColumns = [
80
  sortable
81
  body={scoreBodyTemplate('arc_accuracy', {
82
  minScore: 0,
83
- maxScore: 1
 
84
  })}
85
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
86
  />,
@@ -91,7 +103,8 @@ const ScoreColumns = [
91
  sortable
92
  body={scoreBodyTemplate('mgsm_accuracy', {
93
  minScore: 0,
94
- maxScore: 1
 
95
  })}
96
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
97
  />,
 
2
  import ScoreField from './ScoreField'
3
 
4
  const scoreBodyTemplate = (field, options = {}) => {
5
+ const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options
6
 
7
  return rowData => {
8
  const score = rowData[field]
9
+ // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
10
+ // otherwise fall back to global list
11
+ const rowFlagKey = `${field}_is_machine`
12
+ const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
13
+ const isMachineTranslated = hasRowFlag
14
+ ? !!rowData[rowFlagKey]
15
+ : machineTranslatedMetrics.includes(field)
16
+ return ScoreField(score, minScore, maxScore, isMachineTranslated)
17
  }
18
  }
19
 
20
+ const ScoreColumns = (machineTranslatedMetrics = []) => [
21
  <Column
22
  field='average'
23
  header='Proficiency'
24
  headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
25
  sortable
26
+ body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
27
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
28
  />,
29
  <Column
 
33
  sortable
34
  body={scoreBodyTemplate('translation_from_bleu', {
35
  minScore: 0,
36
+ maxScore: 0.5,
37
+ machineTranslatedMetrics
38
  })}
39
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
40
  />,
 
45
  sortable
46
  body={scoreBodyTemplate('translation_to_bleu', {
47
  minScore: 0,
48
+ maxScore: 0.5,
49
+ machineTranslatedMetrics
50
  })}
51
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
52
  />,
 
57
  sortable
58
  body={scoreBodyTemplate('classification_accuracy', {
59
  minScore: 0,
60
+ maxScore: 0.5,
61
+ machineTranslatedMetrics
62
  })}
63
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
64
  />,
 
79
  sortable
80
  body={scoreBodyTemplate('mmlu_accuracy', {
81
  minScore: 0,
82
+ maxScore: 1,
83
+ machineTranslatedMetrics
84
  })}
85
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
86
  />,
 
91
  sortable
92
  body={scoreBodyTemplate('arc_accuracy', {
93
  minScore: 0,
94
+ maxScore: 1,
95
+ machineTranslatedMetrics
96
  })}
97
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
98
  />,
 
103
  sortable
104
  body={scoreBodyTemplate('mgsm_accuracy', {
105
  minScore: 0,
106
+ maxScore: 1,
107
+ machineTranslatedMetrics
108
  })}
109
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
110
  />,
frontend/src/components/ScoreField.js CHANGED
@@ -1,4 +1,4 @@
1
- const ScoreField = (score, minScore, maxScore) => {
2
  let percentage = 100
3
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
4
  if (score !== null) {
@@ -50,6 +50,7 @@ const ScoreField = (score, minScore, maxScore) => {
50
  }}
51
  >
52
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
 
53
  </span>
54
  </div>
55
  )
 
1
+ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
2
  let percentage = 100
3
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
4
  if (score !== null) {
 
50
  }}
51
  >
52
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
53
+ {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
54
  </span>
55
  </div>
56
  )
frontend/src/components/SpeakerPlot.js CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
73
  textStrokeOpacity: 0,
74
  textFillOpacity: 0
75
  }),
76
- Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
77
  x: 40,
78
  y: languages[39].cumSpeakers / 1e6
79
- })
80
  ]
81
  })
82
  containerRef.current.append(plot)
 
73
  textStrokeOpacity: 0,
74
  textFillOpacity: 0
75
  }),
76
+ ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
77
  x: 40,
78
  y: languages[39].cumSpeakers / 1e6
79
+ })] : [])
80
  ]
81
  })
82
  containerRef.current.append(plot)
frontend/src/components/WorldMap.js CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
26
  a =>
27
  `${smoothProgressBar(a.population / pop)} ${
28
  a.name
29
- } – ${a.score.toFixed(2)}`
30
  )
31
  .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
32
- return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
33
  }
34
 
35
- const WorldMap = ({ data, width = 750, height = 500 }) => {
36
  const containerRef = useRef()
37
  const [mapData, setMapData] = useState()
38
 
@@ -48,8 +48,22 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
48
  acc[country.iso2] = country
49
  return acc
50
  }, {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  const plot = Plot.plot({
52
- subtitle: 'Language Proficiency Score by Country',
53
  width: width,
54
  height: height,
55
  projection: 'equal-earth',
@@ -61,11 +75,12 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
61
  })
62
  ],
63
  color: {
64
- scheme: 'Greens',
65
- unknown: 'gray',
66
  label: 'Score',
67
  legend: true,
68
- domain: [0, 1]
 
69
  },
70
  style: {
71
  fontFamily: 'monospace'
 
26
  a =>
27
  `${smoothProgressBar(a.population / pop)} ${
28
  a.name
29
+ } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
30
  )
31
  .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
32
+ return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
33
  }
34
 
35
+ const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
36
  const containerRef = useRef()
37
  const [mapData, setMapData] = useState()
38
 
 
48
  acc[country.iso2] = country
49
  return acc
50
  }, {})
51
+ // Count languages that have any evaluation data
52
+ const evaluatedLanguagesCount = allLanguages.filter(lang => {
53
+ const hasAnyScores = [
54
+ 'translation_from_bleu',
55
+ 'translation_to_bleu',
56
+ 'classification_accuracy',
57
+ 'mmlu_accuracy',
58
+ 'arc_accuracy',
59
+ 'truthfulqa_accuracy',
60
+ 'mgsm_accuracy'
61
+ ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
62
+ return hasAnyScores
63
+ }).length
64
+
65
  const plot = Plot.plot({
66
+ subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
67
  width: width,
68
  height: height,
69
  projection: 'equal-earth',
 
75
  })
76
  ],
77
  color: {
78
+ scheme: 'RdYlGn',
79
+ unknown: '#d0d0d0',
80
  label: 'Score',
81
  legend: true,
82
+ domain: [0, 1],
83
+ pivot: 0.5
84
  },
85
  style: {
86
  fontFamily: 'monospace'
languages.json CHANGED
@@ -7,7 +7,7 @@
7
  "family":"Indo-European",
8
  "flores_path":"eng_Latn",
9
  "fleurs_tag":"en_us",
10
- "commonvoice_hours":2674.0,
11
  "commonvoice_locale":"en",
12
  "in_benchmark":true
13
  },
@@ -32,7 +32,7 @@
32
  "flores_path":"hin_Deva",
33
  "fleurs_tag":"hi_in",
34
  "commonvoice_hours":16.0,
35
- "commonvoice_locale":"hi-IN",
36
  "in_benchmark":true
37
  },
38
  {
@@ -43,7 +43,7 @@
43
  "family":"Indo-European",
44
  "flores_path":"spa_Latn",
45
  "fleurs_tag":"es_419",
46
- "commonvoice_hours":448.0,
47
  "commonvoice_locale":"es",
48
  "in_benchmark":true
49
  },
@@ -79,7 +79,7 @@
79
  "family":"Indo-European",
80
  "flores_path":"fra_Latn",
81
  "fleurs_tag":"fr_fr",
82
- "commonvoice_hours":1065.0,
83
  "commonvoice_locale":"fr",
84
  "in_benchmark":true
85
  },
@@ -103,7 +103,7 @@
103
  "family":"Indo-European",
104
  "flores_path":"por_Latn",
105
  "fleurs_tag":"pt_br",
106
- "commonvoice_hours":180.0,
107
  "commonvoice_locale":"pt",
108
  "in_benchmark":true
109
  },
@@ -115,7 +115,7 @@
115
  "family":"Indo-European",
116
  "flores_path":"pan_Guru",
117
  "fleurs_tag":"pa_in",
118
- "commonvoice_hours":2.3,
119
  "commonvoice_locale":"pa-IN",
120
  "in_benchmark":true
121
  },
@@ -127,7 +127,7 @@
127
  "family":"Indo-European",
128
  "flores_path":"rus_Cyrl",
129
  "fleurs_tag":"ru_ru",
130
- "commonvoice_hours":245.0,
131
  "commonvoice_locale":"ru",
132
  "in_benchmark":true
133
  },
@@ -139,7 +139,7 @@
139
  "family":"Atlantic-Congo",
140
  "flores_path":"swh_Latn",
141
  "fleurs_tag":"sw_ke",
142
- "commonvoice_hours":411.0,
143
  "commonvoice_locale":"sw",
144
  "in_benchmark":true
145
  },
@@ -151,7 +151,7 @@
151
  "family":"Austronesian",
152
  "flores_path":"ind_Latn",
153
  "fleurs_tag":"id_id",
154
- "commonvoice_hours":33.0,
155
  "commonvoice_locale":"id",
156
  "in_benchmark":true
157
  },
@@ -163,7 +163,7 @@
163
  "family":"Indo-European",
164
  "flores_path":"deu_Latn",
165
  "fleurs_tag":"de_de",
166
- "commonvoice_hours":1369.0,
167
  "commonvoice_locale":"de",
168
  "in_benchmark":true
169
  },
@@ -379,7 +379,7 @@
379
  "family":"Indo-European",
380
  "flores_path":null,
381
  "fleurs_tag":"ps_af",
382
- "commonvoice_hours":81.0,
383
  "commonvoice_locale":"ps",
384
  "in_benchmark":false
385
  },
@@ -439,7 +439,7 @@
439
  "family":"Indo-European",
440
  "flores_path":"pol_Latn",
441
  "fleurs_tag":"pl_pl",
442
- "commonvoice_hours":175.0,
443
  "commonvoice_locale":"pl",
444
  "in_benchmark":true
445
  },
@@ -619,7 +619,7 @@
619
  "family":"Indo-European",
620
  "flores_path":"nld_Latn",
621
  "fleurs_tag":"nl_nl",
622
- "commonvoice_hours":120.0,
623
  "commonvoice_locale":"nl",
624
  "in_benchmark":true
625
  },
@@ -655,7 +655,7 @@
655
  "family":"Atlantic-Congo",
656
  "flores_path":"yor_Latn",
657
  "fleurs_tag":"yo_ng",
658
- "commonvoice_hours":6.3,
659
  "commonvoice_locale":"yo",
660
  "in_benchmark":true
661
  },
@@ -979,7 +979,7 @@
979
  "family":"Turkic",
980
  "flores_path":"kaz_Cyrl",
981
  "fleurs_tag":"kk_kz",
982
- "commonvoice_hours":2.2,
983
  "commonvoice_locale":"kk",
984
  "in_benchmark":true
985
  },
@@ -1027,7 +1027,7 @@
1027
  "family":"Uralic",
1028
  "flores_path":"hun_Latn",
1029
  "fleurs_tag":"hu_hu",
1030
- "commonvoice_hours":93.0,
1031
  "commonvoice_locale":"hu",
1032
  "in_benchmark":true
1033
  },
@@ -1099,7 +1099,7 @@
1099
  "family":"Indo-European",
1100
  "flores_path":"ckb_Arab",
1101
  "fleurs_tag":"ckb_iq",
1102
- "commonvoice_hours":135.0,
1103
  "commonvoice_locale":"ckb",
1104
  "in_benchmark":true
1105
  },
@@ -1183,7 +1183,7 @@
1183
  "family":"Indo-European",
1184
  "flores_path":"bel_Cyrl",
1185
  "fleurs_tag":"be_by",
1186
- "commonvoice_hours":1810.0,
1187
  "commonvoice_locale":"be",
1188
  "in_benchmark":true
1189
  },
@@ -1207,7 +1207,7 @@
1207
  "family":"Indo-European",
1208
  "flores_path":"tgk_Cyrl",
1209
  "fleurs_tag":"tg_tj",
1210
- "commonvoice_hours":0.4,
1211
  "commonvoice_locale":"tg",
1212
  "in_benchmark":true
1213
  },
@@ -1243,7 +1243,7 @@
1243
  "family":"Indo-European",
1244
  "flores_path":"afr_Latn",
1245
  "fleurs_tag":"af_za",
1246
- "commonvoice_hours":0.5,
1247
  "commonvoice_locale":"af",
1248
  "in_benchmark":true
1249
  },
@@ -1291,7 +1291,7 @@
1291
  "family":"Indo-European",
1292
  "flores_path":"cat_Latn",
1293
  "fleurs_tag":"ca_es",
1294
- "commonvoice_hours":2863.0,
1295
  "commonvoice_locale":"ca",
1296
  "in_benchmark":true
1297
  },
@@ -1303,7 +1303,7 @@
1303
  "family":"Afro-Asiatic",
1304
  "flores_path":"heb_Hebr",
1305
  "fleurs_tag":"he_il",
1306
- "commonvoice_hours":1.4,
1307
  "commonvoice_locale":"he",
1308
  "in_benchmark":true
1309
  },
@@ -1375,7 +1375,7 @@
1375
  "family":"Turkic",
1376
  "flores_path":"uig_Arab",
1377
  "fleurs_tag":null,
1378
- "commonvoice_hours":411.0,
1379
  "commonvoice_locale":"ug",
1380
  "in_benchmark":true
1381
  },
@@ -1519,7 +1519,7 @@
1519
  "family":"Indo-European",
1520
  "flores_path":"kmr_Latn",
1521
  "fleurs_tag":null,
1522
- "commonvoice_hours":69.0,
1523
  "commonvoice_locale":"kmr",
1524
  "in_benchmark":true
1525
  },
@@ -1555,7 +1555,7 @@
1555
  "family":"Indo-European",
1556
  "flores_path":"slk_Latn",
1557
  "fleurs_tag":"sk_sk",
1558
- "commonvoice_hours":51.0,
1559
  "commonvoice_locale":"sk",
1560
  "in_benchmark":true
1561
  },
@@ -1675,7 +1675,7 @@
1675
  "family":"Tupian",
1676
  "flores_path":"gug_Latn",
1677
  "fleurs_tag":null,
1678
- "commonvoice_hours":4.0,
1679
  "commonvoice_locale":"gn",
1680
  "in_benchmark":true
1681
  },
@@ -1747,7 +1747,7 @@
1747
  "family":"Indo-European",
1748
  "flores_path":"nob_Latn",
1749
  "fleurs_tag":"nb_no",
1750
- "commonvoice_hours":0.5,
1751
  "commonvoice_locale":"nb-NO",
1752
  "in_benchmark":true
1753
  },
@@ -2155,7 +2155,7 @@
2155
  "family":"Kartvelian",
2156
  "flores_path":"kat_Geor",
2157
  "fleurs_tag":"ka_ge",
2158
- "commonvoice_hours":166.0,
2159
  "commonvoice_locale":"ka",
2160
  "in_benchmark":true
2161
  },
@@ -2167,7 +2167,7 @@
2167
  "family":"Indo-European",
2168
  "flores_path":"glg_Latn",
2169
  "fleurs_tag":"gl_es",
2170
- "commonvoice_hours":117.0,
2171
  "commonvoice_locale":"gl",
2172
  "in_benchmark":true
2173
  },
@@ -2323,7 +2323,7 @@
2323
  "family":"Dravidian",
2324
  "flores_path":null,
2325
  "fleurs_tag":null,
2326
- "commonvoice_hours":1.2,
2327
  "commonvoice_locale":"brh",
2328
  "in_benchmark":false
2329
  },
@@ -2623,7 +2623,7 @@
2623
  "family":"Indo-European",
2624
  "flores_path":null,
2625
  "fleurs_tag":null,
2626
- "commonvoice_hours":0.9,
2627
  "commonvoice_locale":"haz",
2628
  "in_benchmark":false
2629
  },
@@ -2695,7 +2695,7 @@
2695
  "family":"Indo-European",
2696
  "flores_path":"oci_Latn",
2697
  "fleurs_tag":"oc_fr",
2698
- "commonvoice_hours":1.8,
2699
  "commonvoice_locale":"oc",
2700
  "in_benchmark":true
2701
  },
@@ -3175,8 +3175,8 @@
3175
  "family":"Atlantic-Congo",
3176
  "flores_path":null,
3177
  "fleurs_tag":null,
3178
- "commonvoice_hours":null,
3179
- "commonvoice_locale":null,
3180
  "in_benchmark":false
3181
  },
3182
  {
@@ -3319,8 +3319,8 @@
3319
  "family":"Indo-European",
3320
  "flores_path":null,
3321
  "fleurs_tag":null,
3322
- "commonvoice_hours":null,
3323
- "commonvoice_locale":null,
3324
  "in_benchmark":false
3325
  },
3326
  {
@@ -3331,7 +3331,7 @@
3331
  "family":"Indo-European",
3332
  "flores_path":"gle_Latn",
3333
  "fleurs_tag":"ga_ie",
3334
- "commonvoice_hours":8.3,
3335
  "commonvoice_locale":"ga-IE",
3336
  "in_benchmark":true
3337
  },
@@ -3487,7 +3487,7 @@
3487
  "family":"Indo-European",
3488
  "flores_path":"lvs_Latn",
3489
  "fleurs_tag":"lv_lv",
3490
- "commonvoice_hours":262.0,
3491
  "commonvoice_locale":"lv",
3492
  "in_benchmark":true
3493
  },
@@ -3535,7 +3535,7 @@
3535
  "family":null,
3536
  "flores_path":"eus_Latn",
3537
  "fleurs_tag":null,
3538
- "commonvoice_hours":440.0,
3539
  "commonvoice_locale":"eu",
3540
  "in_benchmark":true
3541
  },
@@ -3559,7 +3559,7 @@
3559
  "family":"Abkhaz-Adyge",
3560
  "flores_path":null,
3561
  "fleurs_tag":null,
3562
- "commonvoice_hours":83.0,
3563
  "commonvoice_locale":"kbd",
3564
  "in_benchmark":false
3565
  },
@@ -3679,7 +3679,7 @@
3679
  "family":"Indo-European",
3680
  "flores_path":"ydd_Hebr",
3681
  "fleurs_tag":null,
3682
- "commonvoice_hours":0.7,
3683
  "commonvoice_locale":"yi",
3684
  "in_benchmark":true
3685
  },
@@ -3991,8 +3991,8 @@
3991
  "family":"Atlantic-Congo",
3992
  "flores_path":null,
3993
  "fleurs_tag":null,
3994
- "commonvoice_hours":null,
3995
- "commonvoice_locale":null,
3996
  "in_benchmark":false
3997
  },
3998
  {
@@ -4099,8 +4099,8 @@
4099
  "family":"Indo-European",
4100
  "flores_path":null,
4101
  "fleurs_tag":null,
4102
- "commonvoice_hours":null,
4103
- "commonvoice_locale":null,
4104
  "in_benchmark":false
4105
  },
4106
  {
@@ -4351,7 +4351,7 @@
4351
  "family":"Indo-European",
4352
  "flores_path":null,
4353
  "fleurs_tag":null,
4354
- "commonvoice_hours":29.0,
4355
  "commonvoice_locale":"br",
4356
  "in_benchmark":false
4357
  },
@@ -4651,7 +4651,7 @@
4651
  "family":"Abkhaz-Adyge",
4652
  "flores_path":null,
4653
  "fleurs_tag":null,
4654
- "commonvoice_hours":30.0,
4655
  "commonvoice_locale":"ady",
4656
  "in_benchmark":false
4657
  },
@@ -5011,7 +5011,7 @@
5011
  "family":"Nakh-Daghestanian",
5012
  "flores_path":"dar_Cyrl",
5013
  "fleurs_tag":null,
5014
- "commonvoice_hours":0.0,
5015
  "commonvoice_locale":"dar",
5016
  "in_benchmark":true
5017
  },
@@ -7879,7 +7879,7 @@
7879
  "family":"Artificial Language",
7880
  "flores_path":"epo_Latn",
7881
  "fleurs_tag":null,
7882
- "commonvoice_hours":1436.0,
7883
  "commonvoice_locale":"eo",
7884
  "in_benchmark":true
7885
  },
 
7
  "family":"Indo-European",
8
  "flores_path":"eng_Latn",
9
  "fleurs_tag":"en_us",
10
+ "commonvoice_hours":2683.0,
11
  "commonvoice_locale":"en",
12
  "in_benchmark":true
13
  },
 
32
  "flores_path":"hin_Deva",
33
  "fleurs_tag":"hi_in",
34
  "commonvoice_hours":16.0,
35
+ "commonvoice_locale":"hi",
36
  "in_benchmark":true
37
  },
38
  {
 
43
  "family":"Indo-European",
44
  "flores_path":"spa_Latn",
45
  "fleurs_tag":"es_419",
46
+ "commonvoice_hours":449.0,
47
  "commonvoice_locale":"es",
48
  "in_benchmark":true
49
  },
 
79
  "family":"Indo-European",
80
  "flores_path":"fra_Latn",
81
  "fleurs_tag":"fr_fr",
82
+ "commonvoice_hours":1073.0,
83
  "commonvoice_locale":"fr",
84
  "in_benchmark":true
85
  },
 
103
  "family":"Indo-European",
104
  "flores_path":"por_Latn",
105
  "fleurs_tag":"pt_br",
106
+ "commonvoice_hours":181.0,
107
  "commonvoice_locale":"pt",
108
  "in_benchmark":true
109
  },
 
115
  "family":"Indo-European",
116
  "flores_path":"pan_Guru",
117
  "fleurs_tag":"pa_in",
118
+ "commonvoice_hours":2.5,
119
  "commonvoice_locale":"pa-IN",
120
  "in_benchmark":true
121
  },
 
127
  "family":"Indo-European",
128
  "flores_path":"rus_Cyrl",
129
  "fleurs_tag":"ru_ru",
130
+ "commonvoice_hours":247.0,
131
  "commonvoice_locale":"ru",
132
  "in_benchmark":true
133
  },
 
139
  "family":"Atlantic-Congo",
140
  "flores_path":"swh_Latn",
141
  "fleurs_tag":"sw_ke",
142
+ "commonvoice_hours":412.0,
143
  "commonvoice_locale":"sw",
144
  "in_benchmark":true
145
  },
 
151
  "family":"Austronesian",
152
  "flores_path":"ind_Latn",
153
  "fleurs_tag":"id_id",
154
+ "commonvoice_hours":34.0,
155
  "commonvoice_locale":"id",
156
  "in_benchmark":true
157
  },
 
163
  "family":"Indo-European",
164
  "flores_path":"deu_Latn",
165
  "fleurs_tag":"de_de",
166
+ "commonvoice_hours":1372.0,
167
  "commonvoice_locale":"de",
168
  "in_benchmark":true
169
  },
 
379
  "family":"Indo-European",
380
  "flores_path":null,
381
  "fleurs_tag":"ps_af",
382
+ "commonvoice_hours":82.0,
383
  "commonvoice_locale":"ps",
384
  "in_benchmark":false
385
  },
 
439
  "family":"Indo-European",
440
  "flores_path":"pol_Latn",
441
  "fleurs_tag":"pl_pl",
442
+ "commonvoice_hours":176.0,
443
  "commonvoice_locale":"pl",
444
  "in_benchmark":true
445
  },
 
619
  "family":"Indo-European",
620
  "flores_path":"nld_Latn",
621
  "fleurs_tag":"nl_nl",
622
+ "commonvoice_hours":123.0,
623
  "commonvoice_locale":"nl",
624
  "in_benchmark":true
625
  },
 
655
  "family":"Atlantic-Congo",
656
  "flores_path":"yor_Latn",
657
  "fleurs_tag":"yo_ng",
658
+ "commonvoice_hours":6.4,
659
  "commonvoice_locale":"yo",
660
  "in_benchmark":true
661
  },
 
979
  "family":"Turkic",
980
  "flores_path":"kaz_Cyrl",
981
  "fleurs_tag":"kk_kz",
982
+ "commonvoice_hours":2.3,
983
  "commonvoice_locale":"kk",
984
  "in_benchmark":true
985
  },
 
1027
  "family":"Uralic",
1028
  "flores_path":"hun_Latn",
1029
  "fleurs_tag":"hu_hu",
1030
+ "commonvoice_hours":94.0,
1031
  "commonvoice_locale":"hu",
1032
  "in_benchmark":true
1033
  },
 
1099
  "family":"Indo-European",
1100
  "flores_path":"ckb_Arab",
1101
  "fleurs_tag":"ckb_iq",
1102
+ "commonvoice_hours":136.0,
1103
  "commonvoice_locale":"ckb",
1104
  "in_benchmark":true
1105
  },
 
1183
  "family":"Indo-European",
1184
  "flores_path":"bel_Cyrl",
1185
  "fleurs_tag":"be_by",
1186
+ "commonvoice_hours":1812.0,
1187
  "commonvoice_locale":"be",
1188
  "in_benchmark":true
1189
  },
 
1207
  "family":"Indo-European",
1208
  "flores_path":"tgk_Cyrl",
1209
  "fleurs_tag":"tg_tj",
1210
+ "commonvoice_hours":0.6,
1211
  "commonvoice_locale":"tg",
1212
  "in_benchmark":true
1213
  },
 
1243
  "family":"Indo-European",
1244
  "flores_path":"afr_Latn",
1245
  "fleurs_tag":"af_za",
1246
+ "commonvoice_hours":0.6,
1247
  "commonvoice_locale":"af",
1248
  "in_benchmark":true
1249
  },
 
1291
  "family":"Indo-European",
1292
  "flores_path":"cat_Latn",
1293
  "fleurs_tag":"ca_es",
1294
+ "commonvoice_hours":2883.0,
1295
  "commonvoice_locale":"ca",
1296
  "in_benchmark":true
1297
  },
 
1303
  "family":"Afro-Asiatic",
1304
  "flores_path":"heb_Hebr",
1305
  "fleurs_tag":"he_il",
1306
+ "commonvoice_hours":2.0,
1307
  "commonvoice_locale":"he",
1308
  "in_benchmark":true
1309
  },
 
1375
  "family":"Turkic",
1376
  "flores_path":"uig_Arab",
1377
  "fleurs_tag":null,
1378
+ "commonvoice_hours":437.0,
1379
  "commonvoice_locale":"ug",
1380
  "in_benchmark":true
1381
  },
 
1519
  "family":"Indo-European",
1520
  "flores_path":"kmr_Latn",
1521
  "fleurs_tag":null,
1522
+ "commonvoice_hours":71.0,
1523
  "commonvoice_locale":"kmr",
1524
  "in_benchmark":true
1525
  },
 
1555
  "family":"Indo-European",
1556
  "flores_path":"slk_Latn",
1557
  "fleurs_tag":"sk_sk",
1558
+ "commonvoice_hours":52.0,
1559
  "commonvoice_locale":"sk",
1560
  "in_benchmark":true
1561
  },
 
1675
  "family":"Tupian",
1676
  "flores_path":"gug_Latn",
1677
  "fleurs_tag":null,
1678
+ "commonvoice_hours":4.5,
1679
  "commonvoice_locale":"gn",
1680
  "in_benchmark":true
1681
  },
 
1747
  "family":"Indo-European",
1748
  "flores_path":"nob_Latn",
1749
  "fleurs_tag":"nb_no",
1750
+ "commonvoice_hours":1.8,
1751
  "commonvoice_locale":"nb-NO",
1752
  "in_benchmark":true
1753
  },
 
2155
  "family":"Kartvelian",
2156
  "flores_path":"kat_Geor",
2157
  "fleurs_tag":"ka_ge",
2158
+ "commonvoice_hours":167.0,
2159
  "commonvoice_locale":"ka",
2160
  "in_benchmark":true
2161
  },
 
2167
  "family":"Indo-European",
2168
  "flores_path":"glg_Latn",
2169
  "fleurs_tag":"gl_es",
2170
+ "commonvoice_hours":164.0,
2171
  "commonvoice_locale":"gl",
2172
  "in_benchmark":true
2173
  },
 
2323
  "family":"Dravidian",
2324
  "flores_path":null,
2325
  "fleurs_tag":null,
2326
+ "commonvoice_hours":11.0,
2327
  "commonvoice_locale":"brh",
2328
  "in_benchmark":false
2329
  },
 
2623
  "family":"Indo-European",
2624
  "flores_path":null,
2625
  "fleurs_tag":null,
2626
+ "commonvoice_hours":11.0,
2627
  "commonvoice_locale":"haz",
2628
  "in_benchmark":false
2629
  },
 
2695
  "family":"Indo-European",
2696
  "flores_path":"oci_Latn",
2697
  "fleurs_tag":"oc_fr",
2698
+ "commonvoice_hours":1.9,
2699
  "commonvoice_locale":"oc",
2700
  "in_benchmark":true
2701
  },
 
3175
  "family":"Atlantic-Congo",
3176
  "flores_path":null,
3177
  "fleurs_tag":null,
3178
+ "commonvoice_hours":0.0,
3179
+ "commonvoice_locale":"seh",
3180
  "in_benchmark":false
3181
  },
3182
  {
 
3319
  "family":"Indo-European",
3320
  "flores_path":null,
3321
  "fleurs_tag":null,
3322
+ "commonvoice_hours":0.0,
3323
+ "commonvoice_locale":"mfe",
3324
  "in_benchmark":false
3325
  },
3326
  {
 
3331
  "family":"Indo-European",
3332
  "flores_path":"gle_Latn",
3333
  "fleurs_tag":"ga_ie",
3334
+ "commonvoice_hours":9.3,
3335
  "commonvoice_locale":"ga-IE",
3336
  "in_benchmark":true
3337
  },
 
3487
  "family":"Indo-European",
3488
  "flores_path":"lvs_Latn",
3489
  "fleurs_tag":"lv_lv",
3490
+ "commonvoice_hours":263.0,
3491
  "commonvoice_locale":"lv",
3492
  "in_benchmark":true
3493
  },
 
3535
  "family":null,
3536
  "flores_path":"eus_Latn",
3537
  "fleurs_tag":null,
3538
+ "commonvoice_hours":453.0,
3539
  "commonvoice_locale":"eu",
3540
  "in_benchmark":true
3541
  },
 
3559
  "family":"Abkhaz-Adyge",
3560
  "flores_path":null,
3561
  "fleurs_tag":null,
3562
+ "commonvoice_hours":106.0,
3563
  "commonvoice_locale":"kbd",
3564
  "in_benchmark":false
3565
  },
 
3679
  "family":"Indo-European",
3680
  "flores_path":"ydd_Hebr",
3681
  "fleurs_tag":null,
3682
+ "commonvoice_hours":1.8,
3683
  "commonvoice_locale":"yi",
3684
  "in_benchmark":true
3685
  },
 
3991
  "family":"Atlantic-Congo",
3992
  "flores_path":null,
3993
  "fleurs_tag":null,
3994
+ "commonvoice_hours":0.0,
3995
+ "commonvoice_locale":"gaa",
3996
  "in_benchmark":false
3997
  },
3998
  {
 
4099
  "family":"Indo-European",
4100
  "flores_path":null,
4101
  "fleurs_tag":null,
4102
+ "commonvoice_hours":0.0,
4103
+ "commonvoice_locale":"pcd",
4104
  "in_benchmark":false
4105
  },
4106
  {
 
4351
  "family":"Indo-European",
4352
  "flores_path":null,
4353
  "fleurs_tag":null,
4354
+ "commonvoice_hours":30.0,
4355
  "commonvoice_locale":"br",
4356
  "in_benchmark":false
4357
  },
 
4651
  "family":"Abkhaz-Adyge",
4652
  "flores_path":null,
4653
  "fleurs_tag":null,
4654
+ "commonvoice_hours":32.0,
4655
  "commonvoice_locale":"ady",
4656
  "in_benchmark":false
4657
  },
 
5011
  "family":"Nakh-Daghestanian",
5012
  "flores_path":"dar_Cyrl",
5013
  "fleurs_tag":null,
5014
+ "commonvoice_hours":1.3,
5015
  "commonvoice_locale":"dar",
5016
  "in_benchmark":true
5017
  },
 
7879
  "family":"Artificial Language",
7880
  "flores_path":"epo_Latn",
7881
  "fleurs_tag":null,
7882
+ "commonvoice_hours":1437.0,
7883
  "commonvoice_locale":"eo",
7884
  "in_benchmark":true
7885
  },
models.json CHANGED
@@ -20,15 +20,15 @@
20
  ]
21
  },
22
  {
23
- "id":"anthropic\/claude-3.5-sonnet",
24
- "name":"Claude 3.5 Sonnet",
25
  "provider_name":"Anthropic",
26
- "cost":15.0,
27
  "hf_id":null,
28
  "size":null,
29
  "type":"closed-source",
30
  "license":null,
31
- "creation_date":1729555200000,
32
  "tasks":[
33
  "translation_from",
34
  "translation_to",
@@ -80,15 +80,15 @@
80
  ]
81
  },
82
  {
83
- "id":"deepseek\/deepseek-chat",
84
- "name":"DeepSeek V3",
85
- "provider_name":"DeepSeek",
86
  "cost":0.0,
87
- "hf_id":"deepseek-ai\/DeepSeek-V3",
88
- "size":684531386000.0,
89
  "type":"open-source",
90
- "license":"",
91
- "creation_date":1735084800000,
92
  "tasks":[
93
  "translation_from",
94
  "translation_to",
@@ -100,15 +100,15 @@
100
  ]
101
  },
102
  {
103
- "id":"deepseek\/deepseek-chat-v3-0324",
104
- "name":"DeepSeek V3 0324",
105
- "provider_name":"DeepSeek",
106
- "cost":0.0,
107
- "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
108
- "size":684531386000.0,
109
- "type":"open-source",
110
- "license":"Mit",
111
- "creation_date":1742774400000,
112
  "tasks":[
113
  "translation_from",
114
  "translation_to",
@@ -120,15 +120,15 @@
120
  ]
121
  },
122
  {
123
- "id":"deepseek\/deepseek-r1",
124
- "name":"R1",
125
  "provider_name":"DeepSeek",
126
- "cost":0.0,
127
- "hf_id":"deepseek-ai\/DeepSeek-R1",
128
  "size":684531386000.0,
129
  "type":"open-source",
130
- "license":"Mit",
131
- "creation_date":1737331200000,
132
  "tasks":[
133
  "translation_from",
134
  "translation_to",
@@ -140,15 +140,15 @@
140
  ]
141
  },
142
  {
143
- "id":"deepseek\/deepseek-r1-0528",
144
- "name":"R1 0528",
145
  "provider_name":"DeepSeek",
146
  "cost":0.0,
147
- "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
148
  "size":684531386000.0,
149
  "type":"open-source",
150
  "license":"Mit",
151
- "creation_date":1748390400000.0,
152
  "tasks":[
153
  "translation_from",
154
  "translation_to",
@@ -160,15 +160,15 @@
160
  ]
161
  },
162
  {
163
- "id":"google\/gemini-2.0-flash-001",
164
- "name":"Gemini 2.0 Flash",
165
- "provider_name":"Google",
166
- "cost":0.4,
167
- "hf_id":null,
168
- "size":null,
169
- "type":"closed-source",
170
- "license":null,
171
- "creation_date":1738713600000,
172
  "tasks":[
173
  "translation_from",
174
  "translation_to",
@@ -180,15 +180,15 @@
180
  ]
181
  },
182
  {
183
- "id":"google\/gemini-2.0-flash-lite-001",
184
- "name":"Gemini 2.0 Flash Lite",
185
- "provider_name":"Google",
186
- "cost":0.3,
187
- "hf_id":null,
188
- "size":null,
189
- "type":"closed-source",
190
- "license":null,
191
- "creation_date":1740441600000,
192
  "tasks":[
193
  "translation_from",
194
  "translation_to",
@@ -200,15 +200,15 @@
200
  ]
201
  },
202
  {
203
- "id":"google\/gemini-2.5-flash",
204
- "name":"Gemini 2.5 Flash",
205
- "provider_name":"Google",
206
- "cost":2.5,
207
- "hf_id":null,
208
- "size":null,
209
- "type":"closed-source",
210
- "license":null,
211
- "creation_date":1750118400000,
212
  "tasks":[
213
  "translation_from",
214
  "translation_to",
@@ -220,69 +220,15 @@
220
  ]
221
  },
222
  {
223
- "id":"google\/gemini-2.5-flash-lite-preview-06-17",
224
- "name":"Gemini 2.5 Flash Lite Preview 06-17",
225
  "provider_name":"Google",
226
  "cost":0.4,
227
  "hf_id":null,
228
  "size":null,
229
  "type":"closed-source",
230
  "license":null,
231
- "creation_date":1750118400000.0,
232
- "tasks":[
233
- "translation_from",
234
- "translation_to",
235
- "classification",
236
- "mmlu",
237
- "mgsm"
238
- ]
239
- },
240
- {
241
- "id":"google\/gemini-2.5-flash-preview",
242
- "name":"Gemini 2.5 Flash Preview 04-17",
243
- "provider_name":"Google",
244
- "cost":0.6,
245
- "hf_id":null,
246
- "size":null,
247
- "type":"closed-source",
248
- "license":null,
249
- "creation_date":1744848000000.0,
250
- "tasks":[
251
- "translation_from",
252
- "translation_to",
253
- "classification",
254
- "mmlu",
255
- "mgsm"
256
- ]
257
- },
258
- {
259
- "id":"google\/gemini-2.5-flash-preview-05-20",
260
- "name":"Gemini 2.5 Flash Preview 05-20",
261
- "provider_name":"Google",
262
- "cost":0.6,
263
- "hf_id":null,
264
- "size":null,
265
- "type":"closed-source",
266
- "license":null,
267
- "creation_date":1747699200000.0,
268
- "tasks":[
269
- "translation_from",
270
- "translation_to",
271
- "classification",
272
- "mmlu",
273
- "mgsm"
274
- ]
275
- },
276
- {
277
- "id":"google\/gemini-2.5-pro",
278
- "name":"Gemini 2.5 Pro",
279
- "provider_name":"Google",
280
- "cost":10.0,
281
- "hf_id":null,
282
- "size":null,
283
- "type":"closed-source",
284
- "license":null,
285
- "creation_date":1750118400000,
286
  "tasks":[
287
  "translation_from",
288
  "translation_to",
@@ -294,51 +240,15 @@
294
  ]
295
  },
296
  {
297
- "id":"google\/gemini-2.5-pro-preview",
298
- "name":"Gemini 2.5 Pro Preview 06-05",
299
- "provider_name":"Google",
300
- "cost":10.0,
301
- "hf_id":null,
302
- "size":null,
303
- "type":"closed-source",
304
- "license":null,
305
- "creation_date":1749081600000.0,
306
- "tasks":[
307
- "translation_from",
308
- "translation_to",
309
- "classification",
310
- "mmlu",
311
- "mgsm"
312
- ]
313
- },
314
- {
315
- "id":"google\/gemini-2.5-pro-preview-05-06",
316
- "name":"Gemini 2.5 Pro Preview 05-06",
317
- "provider_name":"Google",
318
- "cost":10.0,
319
- "hf_id":null,
320
- "size":null,
321
- "type":"closed-source",
322
- "license":null,
323
- "creation_date":1746576000000.0,
324
- "tasks":[
325
- "translation_from",
326
- "translation_to",
327
- "classification",
328
- "mmlu",
329
- "mgsm"
330
- ]
331
- },
332
- {
333
- "id":"google\/gemini-flash-1.5",
334
- "name":"Gemini 1.5 Flash ",
335
  "provider_name":"Google",
336
  "cost":0.3,
337
  "hf_id":null,
338
  "size":null,
339
  "type":"closed-source",
340
  "license":null,
341
- "creation_date":1715644800000,
342
  "tasks":[
343
  "translation_from",
344
  "translation_to",
@@ -350,15 +260,15 @@
350
  ]
351
  },
352
  {
353
- "id":"google\/gemini-flash-1.5-8b",
354
- "name":"Gemini 1.5 Flash 8B",
355
  "provider_name":"Google",
356
- "cost":0.15,
357
  "hf_id":null,
358
  "size":null,
359
  "type":"closed-source",
360
  "license":null,
361
- "creation_date":1727913600000,
362
  "tasks":[
363
  "translation_from",
364
  "translation_to",
@@ -370,12 +280,12 @@
370
  ]
371
  },
372
  {
373
- "id":"google\/gemma-3-27b-it",
374
- "name":"Gemma 3 27B",
375
  "provider_name":"Google",
376
  "cost":0.0,
377
- "hf_id":"google\/gemma-3-27b-it",
378
- "size":27432406640.0,
379
  "type":"open-source",
380
  "license":"Gemma",
381
  "creation_date":1740787200000,
@@ -390,30 +300,15 @@
390
  ]
391
  },
392
  {
393
- "id":"google\/translate-v2",
394
- "name":"Google Translate",
395
  "provider_name":"Google",
396
- "cost":20.0,
397
- "hf_id":null,
398
- "size":null,
399
- "type":"closed-source",
400
- "license":null,
401
- "creation_date":null,
402
- "tasks":[
403
- "translation_from",
404
- "translation_to"
405
- ]
406
- },
407
- {
408
- "id":"gryphe\/mythomax-l2-13b",
409
- "name":"MythoMax 13B",
410
- "provider_name":"MythoMax 13B",
411
- "cost":0.07,
412
- "hf_id":"Gryphe\/MythoMax-L2-13b",
413
- "size":null,
414
  "type":"open-source",
415
- "license":"Other",
416
- "creation_date":1691625600000,
417
  "tasks":[
418
  "translation_from",
419
  "translation_to",
@@ -464,30 +359,6 @@
464
  "mgsm"
465
  ]
466
  },
467
- {
468
- "id":"meta-llama\/llama-3.1-8b-instruct",
469
- "name":"Llama 3.1 8B Instruct",
470
- "provider_name":"Meta",
471
- "cost":0.0,
472
- "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
473
- "size":8030261248.0,
474
- "type":"open-source",
475
- "license":"Llama3.1",
476
- "creation_date":1721260800000.0,
477
- "tasks":null
478
- },
479
- {
480
- "id":"meta-llama\/llama-3.2-1b-instruct",
481
- "name":"Llama 3.2 1B Instruct",
482
- "provider_name":"Meta",
483
- "cost":0.0,
484
- "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
485
- "size":1235814400.0,
486
- "type":"open-source",
487
- "license":"Llama3.2",
488
- "creation_date":1726617600000.0,
489
- "tasks":null
490
- },
491
  {
492
  "id":"meta-llama\/llama-3.3-70b-instruct",
493
  "name":"Llama 3.3 70B Instruct",
@@ -568,6 +439,26 @@
568
  "mgsm"
569
  ]
570
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  {
572
  "id":"mistralai\/mistral-nemo",
573
  "name":"Mistral Nemo",
@@ -629,15 +520,55 @@
629
  ]
630
  },
631
  {
632
- "id":"openai\/gpt-3.5-turbo-0613",
633
- "name":"GPT-3.5 Turbo (older v0613)",
634
- "provider_name":"OpenAI",
635
- "cost":2.0,
636
- "hf_id":null,
637
  "size":null,
638
- "type":"closed-source",
639
- "license":null,
640
- "creation_date":1706140800000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
  "tasks":[
642
  "translation_from",
643
  "translation_to",
@@ -708,6 +639,26 @@
708
  "mgsm"
709
  ]
710
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  {
712
  "id":"openai\/gpt-4o-mini",
713
  "name":"GPT-4o-mini",
@@ -728,6 +679,86 @@
728
  "mgsm"
729
  ]
730
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  {
732
  "id":"qwen\/qwen3-235b-a22b",
733
  "name":"Qwen3 235B A22B",
@@ -772,7 +803,7 @@
772
  "id":"qwen\/qwen3-32b",
773
  "name":"Qwen3 32B",
774
  "provider_name":"Qwen",
775
- "cost":0.0,
776
  "hf_id":"Qwen\/Qwen3-32B",
777
  "size":32762123264.0,
778
  "type":"open-source",
@@ -787,5 +818,120 @@
787
  "truthfulqa",
788
  "mgsm"
789
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  }
791
  ]
 
20
  ]
21
  },
22
  {
23
+ "id":"anthropic\/claude-3-haiku",
24
+ "name":"Claude 3 Haiku",
25
  "provider_name":"Anthropic",
26
+ "cost":1.25,
27
  "hf_id":null,
28
  "size":null,
29
  "type":"closed-source",
30
  "license":null,
31
+ "creation_date":1710288000000,
32
  "tasks":[
33
  "translation_from",
34
  "translation_to",
 
80
  ]
81
  },
82
  {
83
+ "id":"arliai\/qwq-32b-arliai-rpr-v1",
84
+ "name":"QwQ 32B RpR v1",
85
+ "provider_name":"ArliAI",
86
  "cost":0.0,
87
+ "hf_id":"ArliAI\/QwQ-32B-ArliAI-RpR-v1",
88
+ "size":32763876352.0,
89
  "type":"open-source",
90
+ "license":"Apache 2.0",
91
+ "creation_date":1743984000000,
92
  "tasks":[
93
  "translation_from",
94
  "translation_to",
 
100
  ]
101
  },
102
  {
103
+ "id":"cohere\/command-r-08-2024",
104
+ "name":"Command R (08-2024)",
105
+ "provider_name":"Cohere",
106
+ "cost":0.6,
107
+ "hf_id":null,
108
+ "size":null,
109
+ "type":"closed-source",
110
+ "license":null,
111
+ "creation_date":1724976000000,
112
  "tasks":[
113
  "translation_from",
114
  "translation_to",
 
120
  ]
121
  },
122
  {
123
+ "id":"deepseek\/deepseek-chat",
124
+ "name":"DeepSeek V3",
125
  "provider_name":"DeepSeek",
126
+ "cost":0.8,
127
+ "hf_id":"deepseek-ai\/DeepSeek-V3",
128
  "size":684531386000.0,
129
  "type":"open-source",
130
+ "license":"",
131
+ "creation_date":1735084800000,
132
  "tasks":[
133
  "translation_from",
134
  "translation_to",
 
140
  ]
141
  },
142
  {
143
+ "id":"deepseek\/deepseek-chat-v3-0324",
144
+ "name":"DeepSeek V3 0324",
145
  "provider_name":"DeepSeek",
146
  "cost":0.0,
147
+ "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
148
  "size":684531386000.0,
149
  "type":"open-source",
150
  "license":"Mit",
151
+ "creation_date":1742774400000,
152
  "tasks":[
153
  "translation_from",
154
  "translation_to",
 
160
  ]
161
  },
162
  {
163
+ "id":"deepseek\/deepseek-chat-v3.1",
164
+ "name":"DeepSeek V3.1",
165
+ "provider_name":"DeepSeek",
166
+ "cost":0.0,
167
+ "hf_id":"deepseek-ai\/DeepSeek-V3.1",
168
+ "size":684531386000.0,
169
+ "type":"open-source",
170
+ "license":"Mit",
171
+ "creation_date":1755734400000,
172
  "tasks":[
173
  "translation_from",
174
  "translation_to",
 
180
  ]
181
  },
182
  {
183
+ "id":"deepseek\/deepseek-r1",
184
+ "name":"R1",
185
+ "provider_name":"DeepSeek",
186
+ "cost":0.0,
187
+ "hf_id":"deepseek-ai\/DeepSeek-R1",
188
+ "size":684531386000.0,
189
+ "type":"open-source",
190
+ "license":"Mit",
191
+ "creation_date":1737331200000,
192
  "tasks":[
193
  "translation_from",
194
  "translation_to",
 
200
  ]
201
  },
202
  {
203
+ "id":"deepseek\/deepseek-r1-0528-qwen3-8b",
204
+ "name":"Deepseek R1 0528 Qwen3 8B",
205
+ "provider_name":"DeepSeek",
206
+ "cost":0.0,
207
+ "hf_id":"deepseek-ai\/DeepSeek-R1-0528-Qwen3-8B",
208
+ "size":8190735360.0,
209
+ "type":"open-source",
210
+ "license":"Mit",
211
+ "creation_date":1748476800000,
212
  "tasks":[
213
  "translation_from",
214
  "translation_to",
 
220
  ]
221
  },
222
  {
223
+ "id":"google\/gemini-2.0-flash-001",
224
+ "name":"Gemini 2.0 Flash",
225
  "provider_name":"Google",
226
  "cost":0.4,
227
  "hf_id":null,
228
  "size":null,
229
  "type":"closed-source",
230
  "license":null,
231
+ "creation_date":1738713600000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  "tasks":[
233
  "translation_from",
234
  "translation_to",
 
240
  ]
241
  },
242
  {
243
+ "id":"google\/gemini-2.0-flash-lite-001",
244
+ "name":"Gemini 2.0 Flash Lite",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  "provider_name":"Google",
246
  "cost":0.3,
247
  "hf_id":null,
248
  "size":null,
249
  "type":"closed-source",
250
  "license":null,
251
+ "creation_date":1740441600000,
252
  "tasks":[
253
  "translation_from",
254
  "translation_to",
 
260
  ]
261
  },
262
  {
263
+ "id":"google\/gemini-2.5-flash",
264
+ "name":"Gemini 2.5 Flash",
265
  "provider_name":"Google",
266
+ "cost":2.5,
267
  "hf_id":null,
268
  "size":null,
269
  "type":"closed-source",
270
  "license":null,
271
+ "creation_date":1750118400000,
272
  "tasks":[
273
  "translation_from",
274
  "translation_to",
 
280
  ]
281
  },
282
  {
283
+ "id":"google\/gemma-3-12b-it",
284
+ "name":"Gemma 3 12B",
285
  "provider_name":"Google",
286
  "cost":0.0,
287
+ "hf_id":"google\/gemma-3-12b-it",
288
+ "size":12187325040.0,
289
  "type":"open-source",
290
  "license":"Gemma",
291
  "creation_date":1740787200000,
 
300
  ]
301
  },
302
  {
303
+ "id":"google\/gemma-3-27b-it",
304
+ "name":"Gemma 3 27B",
305
  "provider_name":"Google",
306
+ "cost":0.0,
307
+ "hf_id":"google\/gemma-3-27b-it",
308
+ "size":27432406640.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  "type":"open-source",
310
+ "license":"Gemma",
311
+ "creation_date":1740787200000,
312
  "tasks":[
313
  "translation_from",
314
  "translation_to",
 
359
  "mgsm"
360
  ]
361
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  {
363
  "id":"meta-llama\/llama-3.3-70b-instruct",
364
  "name":"Llama 3.3 70B Instruct",
 
439
  "mgsm"
440
  ]
441
  },
442
+ {
443
+ "id":"mistralai\/mistral-7b-instruct-v0.3",
444
+ "name":"Mistral 7B Instruct v0.3",
445
+ "provider_name":"Mistral",
446
+ "cost":0.05,
447
+ "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
448
+ "size":7248023552.0,
449
+ "type":"open-source",
450
+ "license":"Apache 2.0",
451
+ "creation_date":1716336000000,
452
+ "tasks":[
453
+ "translation_from",
454
+ "translation_to",
455
+ "classification",
456
+ "mmlu",
457
+ "arc",
458
+ "truthfulqa",
459
+ "mgsm"
460
+ ]
461
+ },
462
  {
463
  "id":"mistralai\/mistral-nemo",
464
  "name":"Mistral Nemo",
 
520
  ]
521
  },
522
  {
523
+ "id":"moonshotai\/kimi-k2",
524
+ "name":"Kimi K2",
525
+ "provider_name":"MoonshotAI",
526
+ "cost":0.0,
527
+ "hf_id":"moonshotai\/Kimi-K2-Instruct",
528
  "size":null,
529
+ "type":"open-source",
530
+ "license":"Other",
531
+ "creation_date":1752192000000,
532
+ "tasks":[
533
+ "translation_from",
534
+ "translation_to",
535
+ "classification",
536
+ "mmlu",
537
+ "arc",
538
+ "truthfulqa",
539
+ "mgsm"
540
+ ]
541
+ },
542
+ {
543
+ "id":"neversleep\/llama-3-lumimaid-70b",
544
+ "name":"Llama 3 Lumimaid 70B",
545
+ "provider_name":"NeverSleep",
546
+ "cost":6.0,
547
+ "hf_id":"NeverSleep\/Llama-3-Lumimaid-70B-v0.1",
548
+ "size":70553706496.0,
549
+ "type":"open-source",
550
+ "license":"Cc By Nc 4.0",
551
+ "creation_date":1714262400000,
552
+ "tasks":[
553
+ "translation_from",
554
+ "translation_to",
555
+ "classification",
556
+ "mmlu",
557
+ "arc",
558
+ "truthfulqa",
559
+ "mgsm"
560
+ ]
561
+ },
562
+ {
563
+ "id":"nvidia\/llama-3.1-nemotron-70b-instruct",
564
+ "name":"Llama 3.1 Nemotron 70B Instruct",
565
+ "provider_name":"NVIDIA",
566
+ "cost":0.3,
567
+ "hf_id":"nvidia\/Llama-3.1-Nemotron-70B-Instruct-HF",
568
+ "size":70553706496.0,
569
+ "type":"open-source",
570
+ "license":"Llama3.1",
571
+ "creation_date":1728691200000,
572
  "tasks":[
573
  "translation_from",
574
  "translation_to",
 
639
  "mgsm"
640
  ]
641
  },
642
+ {
643
+ "id":"openai\/gpt-4o-2024-11-20",
644
+ "name":"GPT-4o (2024-11-20)",
645
+ "provider_name":"OpenAI",
646
+ "cost":10.0,
647
+ "hf_id":null,
648
+ "size":null,
649
+ "type":"closed-source",
650
+ "license":null,
651
+ "creation_date":1732060800000,
652
+ "tasks":[
653
+ "translation_from",
654
+ "translation_to",
655
+ "classification",
656
+ "mmlu",
657
+ "arc",
658
+ "truthfulqa",
659
+ "mgsm"
660
+ ]
661
+ },
662
  {
663
  "id":"openai\/gpt-4o-mini",
664
  "name":"GPT-4o-mini",
 
679
  "mgsm"
680
  ]
681
  },
682
+ {
683
+ "id":"openai\/gpt-5",
684
+ "name":"GPT-5",
685
+ "provider_name":"OpenAI",
686
+ "cost":10.0,
687
+ "hf_id":null,
688
+ "size":null,
689
+ "type":"closed-source",
690
+ "license":null,
691
+ "creation_date":1754524800000,
692
+ "tasks":[
693
+ "translation_from",
694
+ "translation_to",
695
+ "classification",
696
+ "mmlu",
697
+ "arc",
698
+ "truthfulqa",
699
+ "mgsm"
700
+ ]
701
+ },
702
+ {
703
+ "id":"openai\/gpt-5-nano",
704
+ "name":"GPT-5 Nano",
705
+ "provider_name":"OpenAI",
706
+ "cost":0.4,
707
+ "hf_id":null,
708
+ "size":null,
709
+ "type":"closed-source",
710
+ "license":null,
711
+ "creation_date":1754524800000,
712
+ "tasks":[
713
+ "translation_from",
714
+ "translation_to",
715
+ "classification",
716
+ "mmlu",
717
+ "arc",
718
+ "truthfulqa",
719
+ "mgsm"
720
+ ]
721
+ },
722
+ {
723
+ "id":"openai\/gpt-oss-120b",
724
+ "name":"gpt-oss-120b",
725
+ "provider_name":"OpenAI",
726
+ "cost":0.0,
727
+ "hf_id":"openai\/gpt-oss-120b",
728
+ "size":120412337472.0,
729
+ "type":"open-source",
730
+ "license":"Apache 2.0",
731
+ "creation_date":1754265600000,
732
+ "tasks":[
733
+ "translation_from",
734
+ "translation_to",
735
+ "classification",
736
+ "mmlu",
737
+ "arc",
738
+ "truthfulqa",
739
+ "mgsm"
740
+ ]
741
+ },
742
+ {
743
+ "id":"qwen\/qwen-2.5-coder-32b-instruct",
744
+ "name":"Qwen2.5 Coder 32B Instruct",
745
+ "provider_name":"Qwen2.5 Coder 32B Instruct (free)",
746
+ "cost":0.0,
747
+ "hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
748
+ "size":32763876352.0,
749
+ "type":"open-source",
750
+ "license":"Apache 2.0",
751
+ "creation_date":1730851200000,
752
+ "tasks":[
753
+ "translation_from",
754
+ "translation_to",
755
+ "classification",
756
+ "mmlu",
757
+ "arc",
758
+ "truthfulqa",
759
+ "mgsm"
760
+ ]
761
+ },
762
  {
763
  "id":"qwen\/qwen3-235b-a22b",
764
  "name":"Qwen3 235B A22B",
 
803
  "id":"qwen\/qwen3-32b",
804
  "name":"Qwen3 32B",
805
  "provider_name":"Qwen",
806
+ "cost":0.07,
807
  "hf_id":"Qwen\/Qwen3-32B",
808
  "size":32762123264.0,
809
  "type":"open-source",
 
818
  "truthfulqa",
819
  "mgsm"
820
  ]
821
+ },
822
+ {
823
+ "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
824
+ "name":"Typhoon2 70B Instruct",
825
+ "provider_name":"Typhoon2 70B Instruct",
826
+ "cost":0.88,
827
+ "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
828
+ "size":70553706496.0,
829
+ "type":"open-source",
830
+ "license":"Llama3.1",
831
+ "creation_date":1734220800000,
832
+ "tasks":[
833
+ "translation_from",
834
+ "translation_to",
835
+ "classification",
836
+ "mmlu",
837
+ "arc",
838
+ "truthfulqa",
839
+ "mgsm"
840
+ ]
841
+ },
842
+ {
843
+ "id":"tencent\/hunyuan-a13b-instruct",
844
+ "name":"Hunyuan A13B Instruct",
845
+ "provider_name":"Tencent",
846
+ "cost":0.0,
847
+ "hf_id":"tencent\/Hunyuan-A13B-Instruct",
848
+ "size":80393183232.0,
849
+ "type":"open-source",
850
+ "license":"Other",
851
+ "creation_date":1750809600000,
852
+ "tasks":[
853
+ "translation_from",
854
+ "translation_to",
855
+ "classification",
856
+ "mmlu",
857
+ "arc",
858
+ "truthfulqa",
859
+ "mgsm"
860
+ ]
861
+ },
862
+ {
863
+ "id":"thedrummer\/anubis-pro-105b-v1",
864
+ "name":"Anubis Pro 105B V1",
865
+ "provider_name":"TheDrummer",
866
+ "cost":1.0,
867
+ "hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
868
+ "size":104779882496.0,
869
+ "type":"open-source",
870
+ "license":"Other",
871
+ "creation_date":1738454400000,
872
+ "tasks":[
873
+ "translation_from",
874
+ "translation_to",
875
+ "classification",
876
+ "mmlu",
877
+ "arc",
878
+ "truthfulqa",
879
+ "mgsm"
880
+ ]
881
+ },
882
+ {
883
+ "id":"x-ai\/grok-4",
884
+ "name":"Grok 4",
885
+ "provider_name":"xAI",
886
+ "cost":15.0,
887
+ "hf_id":null,
888
+ "size":null,
889
+ "type":"closed-source",
890
+ "license":null,
891
+ "creation_date":1752019200000,
892
+ "tasks":[
893
+ "translation_from",
894
+ "translation_to",
895
+ "classification",
896
+ "mmlu",
897
+ "arc",
898
+ "truthfulqa",
899
+ "mgsm"
900
+ ]
901
+ },
902
+ {
903
+ "id":"z-ai\/glm-4.5v",
904
+ "name":"GLM 4.5V",
905
+ "provider_name":"Z.AI",
906
+ "cost":1.8,
907
+ "hf_id":"zai-org\/GLM-4.5V",
908
+ "size":107710933120.0,
909
+ "type":"open-source",
910
+ "license":"Mit",
911
+ "creation_date":1754784000000,
912
+ "tasks":[
913
+ "translation_from",
914
+ "translation_to",
915
+ "classification",
916
+ "mmlu",
917
+ "arc",
918
+ "truthfulqa",
919
+ "mgsm"
920
+ ]
921
+ },
922
+ {
923
+ "id":"google\/translate-v2",
924
+ "name":"Google Translate",
925
+ "provider_name":"Google",
926
+ "cost":20.0,
927
+ "hf_id":null,
928
+ "size":null,
929
+ "type":"closed-source",
930
+ "license":null,
931
+ "creation_date":null,
932
+ "tasks":[
933
+ "translation_from",
934
+ "translation_to"
935
+ ]
936
  }
937
  ]
pyproject.toml CHANGED
@@ -44,3 +44,13 @@ dev = [
44
  "scipy>=1.16.0",
45
  "seaborn>=0.13.2",
46
  ]
 
 
 
 
 
 
 
 
 
 
 
44
  "scipy>=1.16.0",
45
  "seaborn>=0.13.2",
46
  ]
47
+
48
+ [build-system]
49
+ requires = ["hatchling"]
50
+ build-backend = "hatchling.build"
51
+
52
+ [tool.hatch.build.targets.wheel]
53
+ packages = ["evals"]
54
+
55
+ [tool.uv]
56
+ package = true
results.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dbe020a1941a0e49c05f81aeee40ba37d3e2f9f3d83303fcfe1b5711676d1d8
3
- size 2978273
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:649509b8373b76e51a79809fdab77badff44e5536ca3bd8e3eb409f406b6ecda
3
+ size 13260774
uv.lock CHANGED
The diff for this file is too large to render. See raw diff