Upload from GitHub Actions: Merge pull request #18 from datenlabor-bmz/pr-17
- .DS_Store +0 -0
- .github/workflows/nightly-evals.yml +4 -0
- .gitignore +2 -0
- Dockerfile +1 -1
- README.md +13 -0
- datasets.json +6 -6
- evals/__init__.py +0 -1
- evals/backend.py +139 -22
- evals/countries.py +10 -4
- evals/datasets_/__init__.py +1 -1
- evals/datasets_/arc.py +44 -27
- evals/datasets_/fleurs.py +2 -1
- evals/datasets_/mgsm.py +47 -23
- evals/datasets_/mmlu.py +57 -25
- evals/datasets_/truthfulqa.py +66 -28
- evals/datasets_/util.py +8 -0
- evals/download_data.py +33 -16
- evals/languages.py +3 -0
- evals/main.py +176 -48
- evals/models.py +126 -36
- evals/plots.py +75 -41
- evals/tasks.py +130 -142
- evals/translate.py +1 -1
- frontend/package-lock.json +0 -0
- frontend/package.json +7 -5
- frontend/src/App.js +183 -77
- frontend/src/components/HistoryPlot.js +2 -2
- frontend/src/components/LanguageTable.js +1 -1
- frontend/src/components/ModelTable.js +31 -17
- frontend/src/components/ScoreColumns.js +23 -10
- frontend/src/components/ScoreField.js +2 -1
- frontend/src/components/SpeakerPlot.js +2 -2
- frontend/src/components/WorldMap.js +22 -7
- languages.json +49 -49
- models.json +362 -216
- pyproject.toml +10 -0
- results.json +2 -2
- uv.lock +0 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
.github/workflows/nightly-evals.yml
CHANGED
@@ -8,6 +8,8 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
+    timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
@@ -25,6 +27,8 @@ jobs:
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+         N_SENTENCES: 20
+         MAX_LANGUAGES: 150
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
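The new `N_SENTENCES` and `MAX_LANGUAGES` values are plain environment variables; how the evaluation scripts consume them is not visible in this diff. A minimal sketch of the assumed pattern (the variable names come from the workflow, the fallback defaults are invented):

```python
# Assumed consumption of the workflow's new env vars; the real parsing in
# evals/ is not shown in this diff, and the defaults below are placeholders.
import os

n_sentences = int(os.getenv("N_SENTENCES", "10"))       # samples per task and language
max_languages = int(os.getenv("MAX_LANGUAGES", "100"))  # cap on evaluated languages

print(f"Evaluating up to {max_languages} languages, {n_sentences} samples each")
```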
.gitignore
CHANGED
@@ -20,3 +20,5 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
+
+
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock ./
+COPY --chown=user pyproject.toml uv.lock README.md ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md
CHANGED
@@ -43,8 +43,21 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 
 _Tracking language proficiency of AI models for every language_
 
+## System Architecture
+
+The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
+
+> **Detailed Architecture**: See [system_architecture_diagram.md](system_architecture_diagram.md) for the complete system architecture diagram and component descriptions.
+
+**Key Features:**
+- **Model Discovery**: Combines curated models with real-time trending models via web scraping
+- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
+- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
+- **Real-time Visualization**: Interactive web interface with country-level insights
+
 ## Evaluate
 
+### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
datasets.json
CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": false,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented":
+    "implemented": false,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": false,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented":
+    "implemented": false,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": false,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented":
+    "implemented": false,
     "group": "Truthfulness"
   },
   {
evals/__init__.py
CHANGED
@@ -1 +0,0 @@
-
evals/backend.py
CHANGED
@@ -4,7 +4,8 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
+
+from evals.countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -26,7 +27,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
@@ -39,28 +40,77 @@ def compute_normalized_average(df, metrics):
         col_min = normalized_df[col].min()
         col_max = normalized_df[col].max()
         if col_max > col_min:  # Avoid division by zero
-            normalized_df[col] = (normalized_df[col] - col_min) / (
+            normalized_df[col] = (normalized_df[col] - col_min) / (
+                col_max - col_min
+            )
         else:
             normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)


-def make_model_table(df, models):
+def make_model_table(scores_df, models):
+    # Create a combined task_metric for origin
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = scores_df.pivot_table(
+        index="model",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
     )
+
+    # Create the regular task_metric for the main average calculation
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="model", columns="task_metric", values="score", aggfunc="mean"
+    )
+
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
+
+    # Compute origin presence per model+metric
+    origin_presence = (
+        scores_df.groupby(["model", "task_metric", "origin"])
+        .size()
+        .unstack(fill_value=0)
+    )
+    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for model in df.index:
+                try:
+                    counts = origin_presence.loc[(model, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = (
+                    counts.get(machine_col_name, 0) if machine_col_name else 0
+                )
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
     df = df[
         [
             "rank",
@@ -74,27 +124,81 @@ def make_model_table(df, models):
             "license",
             "cost",
             "average",
-            *
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df


-def make_language_table(df, languages):
+def make_language_table(scores_df, languages):
+    # Create a combined task_metric for origin
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = scores_df.pivot_table(
+        index="bcp_47",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
+    )
+
+    # Create the regular task_metric for the main average calculation
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
     )
-    df =
+
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
+
+    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
+    origin_presence = (
+        scores_df.groupby(["bcp_47", "task_metric", "origin"])
+        .size()
+        .unstack(fill_value=0)
+    )
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for bcp in df.index:
+                try:
+                    counts = origin_presence.loc[(bcp, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = (
+                    counts.get(machine_col_name, 0) if machine_col_name else 0
+                )
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
+
+    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
+    for metric in task_metrics:
+        machine_col = f"{metric}_machine"
+        if machine_col in df.columns:
+            df[f"{metric}_is_machine"] = df[machine_col].notna()
+        else:
+            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
     df = df[
         [
             "bcp_47",
@@ -104,7 +208,7 @@ def make_language_table(df, languages):
             "family",
             "average",
             "in_benchmark",
-            *
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df
@@ -125,10 +229,22 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df =
+    df = (
+        scores.groupby(["model", "bcp_47", "task", "metric", "origin"])
+        .mean()
+        .reset_index()
+    )
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
+
+    # Identify which metrics have machine translations available
+    machine_translated_metrics = set()
+    for _, row in df.iterrows():
+        if row["origin"] == "machine":
+            metric_name = f"{row['task']}_{row['metric']}"
+            machine_translated_metrics.add(metric_name)
+
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -143,6 +259,7 @@
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
+        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)
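For reference, a toy run of the `compute_normalized_average` logic shown above (illustration only, not part of the commit): each metric column is min-max scaled across models, a constant column collapses to 0, and `skipna=False` means a model missing any metric gets a NaN average.

```python
import numpy as np
import pandas as pd

scores = pd.DataFrame(
    {
        "mmlu_accuracy": [0.2, 0.5, 0.8],
        "arc_accuracy": [0.4, 0.4, 0.4],      # constant column -> normalized to 0
        "mgsm_accuracy": [0.1, np.nan, 0.9],  # missing value -> NaN row average
    },
    index=["model-a", "model-b", "model-c"],
)

normalized = scores.copy()
for col in normalized.columns:
    lo, hi = normalized[col].min(), normalized[col].max()
    normalized[col] = (normalized[col] - lo) / (hi - lo) if hi > lo else 0

print(normalized.mean(axis=1, skipna=False))
# model-a    0.000000
# model-b         NaN
# model-c    0.666667
```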
evals/countries.py
CHANGED
@@ -15,6 +15,7 @@ def population(bcp_47):
     }
     return items

+
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
@@ -30,10 +31,15 @@ def make_country_table(language_table):
     )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
+
+        if speaker_pop < 1000:  # Grey out low-population countries
+            score = None  # This will make them appear grey on the map
+        else:
+            score = (
+                sum(entry["score"] * entry["population"] for entry in languages)
+                / speaker_pop
+            )
+
         countries[country] = {
             "score": score,
             "languages": languages,
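A worked example (illustration only) of the population-weighted country score introduced above: each language contributes in proportion to its speaker count, and countries with fewer than 1000 total speakers get `score = None`, which renders grey on the map.

```python
# Toy data; the real entries come from make_country_table's language_table input.
languages = [
    {"population": 8_000_000, "score": 0.6},
    {"population": 2_000_000, "score": 0.3},
]

speaker_pop = sum(entry["population"] for entry in languages)
if speaker_pop < 1000:  # Grey out low-population countries
    score = None
else:
    score = sum(entry["score"] * entry["population"] for entry in languages) / speaker_pop

print(score)  # (0.6 * 8M + 0.3 * 2M) / 10M = 0.54
```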
evals/datasets_/__init__.py
CHANGED
@@ -1 +1 @@
-
+
evals/datasets_/arc.py
CHANGED
@@ -1,11 +1,10 @@
 import random
-from collections import Counter, defaultdict

-from langcodes import
+from langcodes import standardize_tag
 from rich import print
-from models import translate_google,
+from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
-from datasets import
+from datasets import load_dataset, Dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -14,27 +13,33 @@ from datasets_.util import _get_dataset_config_names, _load_dataset

 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }


 random.seed(42)
-id_sets_train = [
+id_sets_train = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [
+id_sets_test = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)

 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }


 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -45,37 +50,40 @@ def load_uhura_arc_easy(language_bcp_47, nr):
     ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
     ds = ds.map(add_choices)
     ds = ds.rename_column("answerKey", "answer")
-    train_ids = common_ids_train[nr:nr+3]
-    examples = ds["train"].filter(lambda x: x["id"] in train_ids)
     task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-    return "masakhane/uhura-arc-easy",
+    return "masakhane/uhura-arc-easy", task, "human"
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(
+        ds = _load_dataset(
+            slug_uhura_arc_easy_translated,
+            tags_uhura_arc_easy_translated[language_bcp_47],
+        )
         ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
-        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated",
+        return "fair-forward/arc-easy-autotranslated", task, "machine"
     else:
         return None, None, None

+
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
-    train_ids = common_ids_train[:n_samples+3]
-    en_train = _load_dataset(
+    train_ids = common_ids_train[: n_samples + 3]
+    en_train = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
+    )
     en_train = en_train.filter(lambda x: x["id"] in train_ids)
     test_ids = common_ids_test[:n_samples]
-    en_test = _load_dataset(
+    en_test = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
+    )
     en_test = en_test.filter(lambda x: x["id"] in test_ids)
     data = {"train": en_train, "test": en_test}
-
+
     slug = "fair-forward/arc-easy-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -84,16 +92,22 @@ def translate_arc(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split, data_en in data.items():
-                questions_tr = [
+                questions_tr = [
+                    translate_google(q, "en", lang) for q in data_en["question"]
+                ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                 choices_texts_concatenated = []
                 for choice in data_en["choices"]:
                     for option in choice["text"]:
                         choices_texts_concatenated.append(option)
-                choices_tr = [
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
                 choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                 # group into chunks of 4
-                choices_tr = [
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]

                 ds_lang = Dataset.from_dict(
                     {
@@ -110,5 +124,8 @@ def translate_arc(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/arc/{lang}_{split}.json",
+                    f"data/translations/arc/{lang}_{split}.json",
+                    lines=False,
+                    force_ascii=False,
+                    indent=2,
                 )
evals/datasets_/fleurs.py
CHANGED
@@ -11,6 +11,7 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )

+
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
@@ -34,4 +35,4 @@ def download_fleurs(transcription_langs_eval):
     if not tsv_path.exists():
         print(f"Downloading {tsv_url} to {tsv_path}")
         tsv_path.parent.mkdir(parents=True, exist_ok=True)
-        download_file(tsv_url, tsv_path)
+        download_file(tsv_url, tsv_path)
evals/datasets_/mgsm.py
CHANGED
@@ -1,10 +1,12 @@
 import asyncio
 import os
+import random

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import standardize_tag
-from models import
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from langcodes import Language, standardize_tag
+from models import get_google_supported_languages, translate_google
+from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -37,31 +39,50 @@ def parse_number(i):
     return None


+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Cache individual MGSM items efficiently"""
+    try:
+        ds = _load_dataset(
+            dataset_slug,
+            subset=subset_tag,
+            split="test",
+            trust_remote_code=trust_remote_code,
+        )
+        if nr >= len(ds):
+            return None
+
+        row = ds[nr]
+
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+
+        return row
+    except Exception:
+        # Dataset doesn't exist or doesn't have test split
+        return None
+
+
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        return slug_mgsm,
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return slug_mgsm, item, "human" if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
-        return
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return slug_afrimgsm, item, "human" if item else (None, None, None)
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        item = _get_mgsm_item(
+            slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
+        )
+        return slug_gsm8kx, item, "machine" if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        slug_gsm_autotranslated,
-        return slug_gsm_autotranslated,
-    elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row
+        item = _get_mgsm_item(
+            slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
+        )
+        return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
     else:
-        return None, None
+        return None, None, None


 def translate_mgsm(languages):
@@ -69,7 +90,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
@@ -96,5 +117,8 @@ def translate_mgsm(languages):
             token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
         )
         ds_lang.to_json(
-            f"data/translations/mgsm/{lang}.json",
+            f"data/translations/mgsm/{lang}.json",
+            lines=False,
+            force_ascii=False,
+            indent=2,
         )
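The gsm8kx post-processing in `_get_mgsm_item` relies on the GSM8K answer convention, where the final numeric answer follows a `####` delimiter. A tiny illustration (the example answer text is made up):

```python
# GSM8K-style answer string: reasoning steps, then "#### <final answer>".
answer = "She sold 48 / 2 = 24 clips in May, so 48 + 24 = 72 in total.\n#### 72"
answer_number = answer.split("####")[1].strip()
print(answer_number)  # "72"
```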
evals/datasets_/mmlu.py
CHANGED
@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
-from models import
+from models import get_google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -111,6 +111,7 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU

+
 # print_datasets_analysis()

@@ -143,32 +144,61 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated =
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}

 categories = sorted(
+    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
+)
+
+
+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Cache processed datasets to avoid reprocessing"""
+    ds = _load_dataset(dataset_name, subset_tag)
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds


+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Cache individual MMLU items efficiently"""
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
+        filtered = ds["test"].filter(lambda x: x["subject"] == category)
+        return filtered[nr] if nr < len(filtered) else None
+    else:  # fair-forward/mmlu-autotranslated
+        filtered = ds["test"].filter(lambda x: x["subject"] == category)
+        return filtered[nr] if nr < len(filtered) else None
+
+
+async def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        return "masakhane/afrimmlu", examples, task
+        task = _get_mmlu_item(
+            "masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr
+        )
+        return "masakhane/afrimmlu", task, "human" if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
+        task = _get_mmlu_item(
+            "CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr
+        )
+        return "CohereForAI/Global-MMLU", task, "human" if task else (None, None, None)
+    # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        return
+        task = _get_mmlu_item(
+            "fair-forward/mmlu-autotranslated", language_bcp_47, category, nr
+        )
+        return (
+            "fair-forward/mmlu-autotranslated",
+            task,
+            "machine" if task else (None, None, None),
+        )
     else:
         return None, None, None
@@ -177,10 +207,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:150]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
-    n_samples =
+    n_samples = 20

     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -196,8 +226,10 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
+                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                filtered = ds.filter(lambda x: x["subject"] == category)
+                for i in range(min(n_samples, len(filtered))):
+                    task = filtered[i]
                     samples.append(task)
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
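`load_mmlu` picks the subject round-robin via `categories[nr % len(categories)]`. A small illustration with placeholder subject names (the real list is derived from the AfriMMLU `dev` split):

```python
# Placeholder subjects; only the modulo rotation is the point here.
categories = ["elementary_mathematics", "global_facts", "high_school_geography"]
for nr in range(5):
    print(nr, categories[nr % len(categories)])
# 0 elementary_mathematics
# 1 global_facts
# 2 high_school_geography
# 3 elementary_mathematics
# 4 global_facts
```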
evals/datasets_/truthfulqa.py
CHANGED
@@ -8,17 +8,29 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os

-from datasets import Dataset, load_dataset
-from models import translate_google,
+from datasets import Dataset, load_dataset, DatasetNotFoundError
+from models import translate_google, get_google_supported_languages

 from datasets_.util import _get_dataset_config_names, _load_dataset

 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
+
 tags_uhura_truthfulqa = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }

+# Get available auto-translated languages
+try:
+    tags_truthfulqa_autotranslated = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
+    }
+except DatasetNotFoundError:
+    tags_truthfulqa_autotranslated = {}
+

 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -26,26 +38,42 @@ def add_choices(row):
     return row


-def load_truthfulqa(language_bcp_47, nr):
+async def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(
-        examples = ds["train"]
+        ds = _load_dataset(
+            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
+        )
         ds = ds.map(add_choices)
         task = ds["test"][nr]
+        # Ensure there is a correct answer before returning the task
+        if 1 not in task["labels"]:
+            return None, None, None
+        return "masakhane/uhura-truthfulqa", task, "human"
+    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
+        # Load from auto-translated dataset (same samples as translation)
+        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
+        test_split = ds["test"] if "test" in ds else ds
+        task = test_split[nr]
+        # Ensure there is a correct answer before returning the task
+        if 1 not in task.get("labels", []):
+            return None, None, None
+        return slug_truthfulqa_autotranslated, task, "machine"
+    # TODO: add Okapi, TruthfulQA-X @Jonas
     else:
         return None, None, None


 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:
-        if lang not in human_translated and lang in
+        for lang in languages["bcp_47"].values[:150]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
-    n_samples =
+    n_samples = 20
+
+    # Set fixed seed for consistent sample selection across all languages
+    random.seed(42)

     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
@@ -55,37 +83,47 @@ def translate_truthfulqa(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split in ["train", "test"]:
-                ds = _load_dataset(
+                ds = _load_dataset(
+                    slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
+                )
                 samples = []
                 if split == "train":
                     samples.extend(ds)
                 else:
+                    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                    for i in range(min(n_samples, len(ds))):
                         task = ds[i]
                         samples.append(task)
+
+                # Translate questions
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples
                 ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
+
+                # Translate choices for each sample
+                all_choices_tr = []
+                all_labels = []
+
                 for s in samples:
+                    # Get choices from mc1_targets
+                    choices = s["mc1_targets"]["choices"]
+                    labels = s["mc1_targets"]["labels"]
+
+                    # Translate choices
+                    choices_tr = [
+                        translate_google(choice, "en", lang) for choice in choices
+                    ]
+                    choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+
+                    all_choices_tr.append(choices_tr)
+                    all_labels.append(labels)

                 ds_lang = Dataset.from_dict(
                     {
-                        "subject": [s["subject"] for s in samples],
                         "question": questions_tr,
-                        "choices":
+                        "choices": all_choices_tr,
+                        "labels": all_labels,
                     }
                 )
                 ds_lang.push_to_hub(
@@ -95,7 +133,7 @@ def translate_truthfulqa(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/
+                    f"data/translations/truthfulqa/{lang}_{split}.json",
                     lines=False,
                     force_ascii=False,
                     indent=2,
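The new `1 not in task["labels"]` guard assumes the TruthfulQA `mc1_targets` layout, where exactly one choice carries the label 1. A sketch with an invented item:

```python
# Invented example item; only the mc1_targets structure mirrors the dataset.
task = {
    "question": "What is the capital of France?",
    "mc1_targets": {
        "choices": ["Paris", "Lyon", "Marseille"],
        "labels": [1, 0, 0],
    },
}

labels = task["mc1_targets"]["labels"]
if 1 not in labels:
    print("skip: no correct answer recorded")
else:
    print(task["mc1_targets"]["choices"][labels.index(1)])  # Paris
```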
evals/datasets_/util.py
CHANGED
@@ -12,3 +12,11 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
+
+
+# Cache individual dataset items to avoid reloading entire datasets
+@cache
+def _get_dataset_item(dataset, subset, split, index, **kwargs):
+    """Load a single item from a dataset efficiently"""
+    ds = load_dataset(dataset, subset, split=split, **kwargs)
+    return ds[index] if index < len(ds) else None
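Usage sketch for the new `_get_dataset_item` helper (the dataset and config names below are placeholders, not taken from the diff): because the function sits behind the module's `cache` decorator, repeated calls with the same arguments reuse the already loaded split instead of reloading it.

```python
from datasets_.util import _get_dataset_item

# Placeholder dataset/config; any Hugging Face dataset with a "test" split works.
row = _get_dataset_item("masakhane/uhura-arc-easy", "eng", split="test", index=0)
if row is not None:
    print(row["question"])
```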
evals/download_data.py
CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 import sys
 import huggingface_hub
 from datasets import load_dataset, DatasetDict
+
 # Import fleurs DataFrame directly from its source module
 from datasets_.fleurs import fleurs
@@ -24,22 +25,25 @@ DATA_DIR = project_root / "data"
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"

-GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"
+GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"

-SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"
+SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
 SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"

-SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"
+SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
 SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
 SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
-SPBLEU_DICT_URL =
+SPBLEU_DICT_URL = (
+    "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
+)
 SPBLEU_DICT_NAME = "dictionary.txt"


 # --- Helper Functions ---

+
 def download_file(url, path: Path):
     """Downloads a file from a URL to a local path."""
     print(f"Downloading {url} to {path}...")
@@ -84,11 +88,16 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
                 break

         if target_zip_path:
-            with
+            with (
+                z.open(target_zip_path) as source,
+                open(extract_path / target_filename, "wb") as target,
+            ):
                 target.write(source.read())
             print(f"Successfully extracted {target_filename}.")
         else:
-            print(
+            print(
+                f"Error: Could not find {target_filename} within the zip archive."
+            )

     except zipfile.BadZipFile:
         print("Error: Downloaded file is not a valid zip archive.")
@@ -98,13 +107,14 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):

 # --- Download Functions ---

+
 def download_fleurs_data():
     """Downloads Fleurs audio and text data."""
     print("\n--- Downloading Fleurs Data ---")
     FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)

     # Use the fleurs_tag column from the imported DataFrame
-    fleurs_tags_list = fleurs[
+    fleurs_tags_list = fleurs["fleurs_tag"].tolist()

     if not fleurs_tags_list:
         print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
@@ -117,7 +127,9 @@ def download_fleurs_data():
         audio_dir = lang_dir / "audio"
         dev_tsv_path = lang_dir / "dev.tsv"
         dev_audio_archive_path = audio_dir / "dev.tar.gz"
-        audio_extracted_marker =
+        audio_extracted_marker = (
+            audio_dir / "dev"
+        )  # Check if extraction likely happened

         # Download TSV
         if not dev_tsv_path.exists():
@@ -129,15 +141,15 @@ def download_fleurs_data():
         # Download and Extract Audio
         if not audio_extracted_marker.exists():
             if not dev_audio_archive_path.exists():
+                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
+                download_file(tar_url, dev_audio_archive_path)

             if dev_audio_archive_path.exists():
+                extract_tar_gz(dev_audio_archive_path, audio_dir)
             else:
                 print(f"Audio archive missing, cannot extract for {lang_tag}")
         else:
+            print(f"Found extracted audio: {audio_extracted_marker}")


 def download_glottolog_data():
@@ -165,7 +177,9 @@ def download_scriptcodes_data():
     # The URL points to an HTML page, not a direct CSV link.
     # Manual download is likely required for ScriptCodes.csv.
     print(f"Cannot automatically download from {SCRIPTCODES_URL}")
-    print(
+    print(
+        "Please manually download the ISO 15924 codes list (often available as a .txt file)"
+    )
     print("from the Unicode website or related sources and save it as:")
     print(f"{SCRIPTCODES_TARGET_FILE}")
     if SCRIPTCODES_TARGET_FILE.exists():
@@ -196,21 +210,24 @@ def download_spbleu_data():

 # --- Main Execution ---

+
 def main():
     """Runs all download functions and the conversion step."""
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)

-    #download_fleurs_data()
+    # download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()
     download_spbleu_data()

     print("\nData download process finished.")
     print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
-    print(
+    print(
+        "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
+    )
     print("in 'evals/datasets_/flores.py' to be read correctly.")


 if __name__ == "__main__":
-    main()
+    main()
evals/languages.py
CHANGED
@@ -31,6 +31,7 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )
 
+
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -39,6 +40,7 @@ def language_family(bcp_47):
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]
 
+
 languages["family"] = languages["bcp_47"].apply(language_family)
 
 # load script codes and names
@@ -46,6 +48,7 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
     columns={"Code": "iso15924", "English Name": "script_name"}
 )
 
+
 def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
 
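For orientation, the two lookups defined above are used roughly as follows; the inputs and the values in the comments are illustrative only, not taken from the data files.

family = language_family("de")   # e.g. "Indo-European" (Glottolog family name)
script = script_name("Latn")     # e.g. "Latin" (from ScriptCodes.csv)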
evals/main.py
CHANGED
@@ -1,62 +1,190 @@ (old hunks; only fragments rendered) The previous 62-line script — which imported asyncio, pandas, models, and tasks and loaded existing scores from results.json before running the evaluation loop — is rewritten wholesale. The new version:
import asyncio
import pandas as pd
import time
from datetime import datetime, timedelta
from models import models
from tasks import tasks
from languages import languages
import os


async def evaluate():
    # Configuration - easily adjustable defaults
    n_sentences = int(
        os.environ.get("N_SENTENCES", 20)
    )  # Default: 20 sentences per task
    max_languages = int(
        os.environ.get("MAX_LANGUAGES", 150)
    )  # Default: 150 top languages
    single_model = os.environ.get(
        "SINGLE_MODEL"
    )  # Optional: run only one specific model
    test_mode = os.environ.get("TEST", "").lower() in (
        "1",
        "true",
        "yes",
    )  # Optional: skip results loading/saving

    # Keep original DataFrames for saving metadata - distinction added for single model test runs.
    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)

    # Create working copies for single evaluation runs
    models_df = original_models_df.copy()
    languages_df = original_languages_df.copy()
    top_languages = languages.head(max_languages)

    # Filter to single model if specified (only affects evaluation, not saving)
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()

    # Load existing results to avoid re-evaluation (skip in test mode)
    if test_mode:
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
        )
    else:
        old_results = pd.read_json("results.json")

    # Get all combinations that need evaluation
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name, task in tasks.items()
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]

    # Filter out already evaluated combinations
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    if not old_results.empty:
        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
        # set + combis is faster than merge (locally it made a difference for me when loading all data/tasks into memory)
        mask = ~combis.apply(
            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
        )
        combis = combis[mask]

    # Create all evaluation tasks
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))

    print(f"Running {len(all_tasks)} evaluation tasks...")

    # For single model runs, we stop immediately on first API error to inspect.
    # For full evaluations, we continue despite errors to get maximum coverage.
    stop_on_error = single_model is not None

    # Process tasks in batches to avoid memory issues (for full evaluation locally that helped a lot)
    batch_size = 1000
    all_results = []

    try:
        for i in range(0, len(all_tasks), batch_size):
            batch = all_tasks[i : i + batch_size]
            batch_results = await asyncio.gather(
                *[
                    task_func(model, bcp_47, sentence_nr)
                    for task_func, model, bcp_47, sentence_nr in batch
                ],
                return_exceptions=not stop_on_error,
            )
            all_results.extend(batch_results)

        results = all_results

        # Process results and logging API errors separately to understand what are the main issues.
        valid_results = []
        errors = []

        for i, r in enumerate(results):
            if isinstance(r, Exception):
                if i < len(all_tasks):
                    task_info = all_tasks[i]
                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
            elif isinstance(r, list):
                valid_results.extend(r)
            elif r is not None:
                valid_results.append(r)

        # log errors and store
        if errors:
            with open("errors.log", "w") as f:
                f.write("model,task,error\n")
                for error in errors:
                    f.write(error + "\n")

        # Track model completion (TO BE DELETED - was for local run only)
        if valid_results:
            completed_models = set()
            for result in valid_results:
                if isinstance(result, dict) and "model" in result:
                    model = result["model"]
                    if model not in completed_models:
                        completed_models.add(model)
                        print(f"Completed: {model}")

        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")

    # this is for local single model runs - for testing and development
    except Exception as e:
        print(f"EVALUATION STOPPED - API Error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        return pd.DataFrame()

    # Save results (skipped in test mode as we do not want to overwrite existing results)
    if valid_results:
        results_df = pd.DataFrame(valid_results)

        # Aggregate results
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )

        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)

            # Merge with existing results
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(
                    subset=["model", "bcp_47", "task", "metric", "origin"]
                )

            results_df = results_df.sort_values(
                by=["model", "bcp_47", "task", "metric"]
            )
            results_df.to_json("results.json", **args)

            # Save model and language info (always save complete metadata, not filtered)
            original_models_df.to_json("models.json", **args)
            original_languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")

        return results_df

    return pd.DataFrame()


if __name__ == "__main__":
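A quick way to exercise the rewritten entry point locally is to drive it entirely through the environment variables read at the top of `evaluate()`. The snippet below is a hypothetical invocation: the values are examples only, and `SINGLE_MODEL` must name a model id that actually exists in models.json.

# Hypothetical smoke-test invocation; variable values are illustrative.
import os
import subprocess

os.environ.update(
    {
        "N_SENTENCES": "5",                     # fewer samples per task for a quick run
        "MAX_LANGUAGES": "10",                  # only the 10 largest languages
        "SINGLE_MODEL": "openai/gpt-4.1-nano",  # single-model mode stops on the first API error
        "TEST": "1",                            # skip loading/saving results.json
    }
)
subprocess.run(["uv", "run", "evals/main.py"], check=True)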
evals/models.py
CHANGED
@@ (old hunks; only fragments rendered) Relative to the previous version: `import asyncio` is added and `from elevenlabs import AsyncElevenLabs` is removed; several entries are added to important_models and blocklist while a few OpenAI entries are dropped (their text is not rendered in this view); and the bodies of get_historical_popular_models, get_current_popular_models, the license handling in get_hf_metadata, get_cost, and the model-selection logic in load_models are rewritten. The rewritten hunks:
import asyncio
import json
import re
from collections import defaultdict
...
import pandas as pd
from aiolimiter import AsyncLimiter
from dotenv import load_dotenv
from google.cloud import translate_v2 as translate
from huggingface_hub import AsyncInferenceClient, HfApi
from joblib.memory import Memory
...
    "meta-llama/llama-3.1-70b-instruct",  # 0.3$
    "meta-llama/llama-3-70b-instruct",  # 0.4$
    # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
    "openai/gpt-5",
    "openai/gpt-5-nano",  # include if/when available
    "openai/gpt-4.1",  # 8$
    "openai/gpt-4.1-mini",  # 1.6$
    "openai/gpt-4.1-nano",  # 0.4$
    "openai/gpt-4o-mini",  # 0.6$
    "openai/gpt-4o-2024-11-20",  # 10$
    "openai/gpt-oss-120b",
    "anthropic/claude-3.7-sonnet",  # 15$ - added for full coverage
    "anthropic/claude-sonnet-4",  # 15$ - added for full coverage
    "anthropic/claude-opus-4.1",  # 15$ - added for full coverage
    "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
    "mistralai/mistral-saba",  # 0.6$
    "mistralai/mistral-nemo",  # 0.08$
...
    "microsoft/phi-4",  # 0.07$
    "microsoft/phi-4-multimodal-instruct",  # 0.1$
    "amazon/nova-micro-v1",  # 0.09$
    "moonshotai/kimi-k2",  # 0.6$ - added to prevent missing from models.json
    "x-ai/grok-4",
]

blocklist = [
    "google/gemini-2.5-pro-preview",
    "google/gemini-2.5-pro",
    "google/gemini-2.5-flash-preview",
    "google/gemini-2.5-flash-lite-preview",
    "google/gemini-2.5-flash-preview-04-17",
...
    "google/gemini-2.5-flash-lite-preview-06-17",
    "google/gemini-2.5-pro-preview-06-05",
    "google/gemini-2.5-pro-preview-05-06",
    "perplexity/sonar-deep-research",
]

transcription_models = [
...

@cache
def get_historical_popular_models(date: date):
    try:
        raw = get("https://openrouter.ai/rankings").text

        # Extract model data from rankingData using regex
        import re
        import json

        # Find all count and model_permaslug pairs in the data
        # Format: "count":number,"model_permaslug":"model/name"
        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
        matches = re.findall(pattern, raw)

        if matches:
            # Aggregate model counts
            model_counts = {}
            for count_str, model_slug in matches:
                count = float(count_str)
                if not model_slug.startswith("openrouter") and model_slug != "Others":
                    # Remove variant suffixes for aggregation
                    base_model = model_slug.split(":")[0]
                    model_counts[base_model] = model_counts.get(base_model, 0) + count

            # Sort by popularity and return top models
            sorted_models = sorted(
                model_counts.items(), key=lambda x: x[1], reverse=True
            )
            result = []
            for model_slug, count in sorted_models[:20]:  # Top 20
                result.append({"slug": model_slug, "count": int(count)})

            return result
        else:
            return []

    except Exception as e:
        return []


@cache
def get_current_popular_models(date: date):
    try:
        raw = get("https://openrouter.ai/rankings?view=day").text

        # Extract model data from daily rankings
        import re
        import json

        # Find all count and model_permaslug pairs in the daily data
        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
        matches = re.findall(pattern, raw)

        if matches:
            # Aggregate model counts
            model_counts = {}
            for count_str, model_slug in matches:
                count = float(count_str)
                if not model_slug.startswith("openrouter") and model_slug != "Others":
                    # Remove variant suffixes for aggregation
                    base_model = model_slug.split(":")[0]
                    model_counts[base_model] = model_counts.get(base_model, 0) + count

            # Sort by popularity and return top models
            sorted_models = sorted(
                model_counts.items(), key=lambda x: x[1], reverse=True
            )
            result = []
            for model_slug, count in sorted_models[:10]:  # Top 10
                result.append({"slug": model_slug, "count": int(count)})

            return result
        else:
            return []

    except Exception as e:
        return []


def get_translation_models():
...

translate_client = translate.Client()


def get_google_supported_languages():
    return [l["language"] for l in translate_client.get_languages()]


@cache
...
        return empty
    try:
        info = api.model_info(id)
        license = ""
        if (
            info.card_data
            and hasattr(info.card_data, "license")
            and info.card_data.license
        ):
            license = (
                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
            )
        return {
            "hf_id": info.id,
            "creation_date": info.created_at,
...

def get_cost(row):
    """
    row: a row from the OpenRouter models dataframe
    """
    try:
        cost = float(row["endpoint"]["pricing"]["completion"])
        return round(cost * 1_000_000, 2)
    except (TypeError, KeyError):
        return None


@cache
...
        + get_current_popular_models(date.today())[:10]
    )
    popular_models = [m["slug"] for m in popular_models]
    all_model_candidates = set(important_models + popular_models) - set(blocklist)

    # Validate models exist on OpenRouter before including them
    valid_models = []

    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)

    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    or_metadata = models["id"].apply(get_or_metadata)
    hf_metadata = or_metadata.apply(get_hf_metadata)
    creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
...
        license=hf_metadata.str["license"],
        creation_date=creation_date_hf.combine_first(creation_date_or),
    )
    # Filter out expensive models to keep costs reasonable
    models = models[models["cost"] <= 15.0].reset_index(drop=True)
    models["tasks"] = [
        [
            "translation_from",
            "translation_to",
            "classification",
            "mmlu",
            "arc",
            "truthfulqa",
            "mgsm",
        ]
    ] * len(models)
    models = pd.concat([models, get_translation_models()])
    return models
evals/plots.py
CHANGED
@@ (old hunks; only fragments rendered) The changes are largely quoting and layout reformatting of the plotting code, plus an updated task ordering, the highlighted-language list and color map, and the legend construction. The rewritten hunks:
df = df[df["metric"] != "chrf"]
df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()


# Apply logit transformation to classification scores to reduce skewness
def transform_classification_scores(row):
    if row["task"] == "classification":
        # Avoid division by zero and infinite values by clipping
        score = np.clip(row["score"], 0.001, 0.999)
        # Apply logit transformation (log(p/(1-p)))
        return logit(score)
    else:
        return row["score"]


df["score"] = df.apply(transform_classification_scores, axis=1)

# Create a pivot table with tasks as columns and languages as rows
pivot_df = df.pivot_table(
    values="score", index="bcp_47", columns="task", aggfunc="mean"
)

# Sort and filter tasks
ordered_tasks = [
    "translation_from",
    "translation_to",
    "classification",
    "mmlu",
    "arc",
    "mgsm",
]
# Drop 'truthfulqa' if present and reindex columns
pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
...

# Create the correlation plot
plt.figure(figsize=(8, 6))
# Create mask for upper triangle including diagonal to show only lower triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Create a heatmap
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="Blues",
    center=0,
    square=True,
    mask=mask,
    cbar_kws={"shrink": 0.8},
    fmt=".3f",
)

plt.xlabel("Tasks", fontsize=12)
plt.ylabel("Tasks", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

# Save the plot
plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# Print correlation values for reference
...
print(correlation_matrix.round(3))

# Also create a scatter plot matrix for pairwise relationships with highlighted languages
highlighted_languages = ["en", "zh", "hi", "es", "ar"]


# Create color mapping
def get_color_and_label(lang_code):
    if lang_code in highlighted_languages:
        color_map = {
            "en": "red",
            "zh": "blue",
            "hi": "green",
            "es": "orange",
            "ar": "purple",
        }
        return color_map[lang_code], lang_code
    else:
        return "lightgray", "Other"


# Create custom scatter plot matrix
tasks = pivot_df.columns.tolist()
n_tasks = len(tasks)

fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")

# Create legend elements
legend_elements = []
for lang in highlighted_languages:
    color, _ = get_color_and_label(lang)
    legend_elements.append(
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=color,
            markersize=8,
            label=lang,
        )
    )
legend_elements.append(
    plt.Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor="lightgray",
        markersize=8,
        label="Other",
    )
)

for i, task_y in enumerate(tasks):
    for j, task_x in enumerate(tasks):
        ax = axes[i, j]

        if i == j:
            # Diagonal: histogram
            task_data = pivot_df[task_y].dropna()
            colors = [get_color_and_label(lang)[0] for lang in task_data.index]
            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
            ax.set_title(f"{task_y}", fontsize=10)
        else:
            # Off-diagonal: scatter plot
            for lang_code in pivot_df.index:
                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
                    pivot_df.loc[lang_code, task_y]
                ):
                    color, _ = get_color_and_label(lang_code)
                    alpha = 0.8 if lang_code in highlighted_languages else 0.3
                    size = 50 if lang_code in highlighted_languages else 20
                    ax.scatter(
                        pivot_df.loc[lang_code, task_x],
                        pivot_df.loc[lang_code, task_y],
                        c=color,
                        alpha=alpha,
                        s=size,
                    )

        # Set labels
        if i == n_tasks - 1:
            ax.set_xlabel(task_x, fontsize=10)
        if j == 0:
            ax.set_ylabel(task_y, fontsize=10)

        # Remove tick labels except for edges
        if i != n_tasks - 1:
            ax.set_xticklabels([])
...
# Add legend
fig.legend(
    handles=legend_elements,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.05),
    ncol=len(legend_elements),
    frameon=False,
    fontsize=10,
    handletextpad=0.5,
    columnspacing=1.0,
)

plt.tight_layout()
plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
plt.show()
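The clipping bounds in `transform_classification_scores` matter because the logit is undefined at exactly 0 and 1; a small worked example of the transform used above:

import numpy as np
from scipy.special import logit

scores = np.array([0.0, 0.5, 0.9, 1.0])
clipped = np.clip(scores, 0.001, 0.999)   # keep values strictly inside (0, 1)
print(logit(clipped))                     # approx. [-6.91  0.    2.20  6.91]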
evals/tasks.py
CHANGED
@@ (old hunks; only fragments rendered) Relative to the previous version: the arc import moves into the datasets_ import block, the inline prompts for translation, classification, MMLU, ARC, TruthfulQA, and MGSM are rewritten (classification previously sampled few-shot example paragraphs, and MGSM used a separate system prompt), an "origin" field is added to every result record, and the commented-out "mlm" and "asr" entries are dropped from the tasks dict. The rewritten hunks:
import asyncio
import random
from functools import partial
from textwrap import dedent

import evaluate
import pandas as pd
import sentencepiece as spm
from datasets_.arc import load_uhura_arc_easy
from datasets_.flores import flores_sentences
from datasets_.mgsm import load_mgsm, parse_number
from datasets_.mmlu import load_mmlu
from datasets_.truthfulqa import load_truthfulqa
from google.cloud import translate_v2 as translate
from langcodes import closest_supported_match
...
    original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
    target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
    script = script_name(target_language.flores_path.split("_")[1])
    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
    if model == "google/translate-v2":
        original_language = closest_supported_match(
            original_language, supported_languages
...
        messages=[
            {
                "role": "user",
                "content": translation_prompt,
            }
        ],
        temperature=0,
...
            "task": f"translation_{mode}",
            "metric": metric,
            "score": score,
            "origin": "human",  # FLORES+ is human-translated
            "sentence_nr": sentence_nr,
        }
        for metric, score in (
...
    )
    top_topics = paragraphs.value_counts("topic").head(5).index
    paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]

    prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
Reply with only the topic name.

Text:
{test_paragraph.text}
"""
    response = await complete(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=30,
    )

    pred = response.lower().strip() if response else ""
    true = test_paragraph.topic.lower().strip()
    others = [t for t in top_topics if t != true]
    acc = (
        int(
            pred.startswith(true)
            or (true in pred and not any(o in pred for o in others))
        )
        if pred
        else 0
    )

    return [
        {
            "model": model,
...
            "task": "classification",
            "metric": "accuracy",
            "score": acc,
            "origin": "human",  # FLORES+ is human-translated
            "sentence_nr": nr,
        }
    ]
...
    A: {item["choices"][0]}
    B: {item["choices"][1]}
    C: {item["choices"][2]}
    D: {item["choices"][3]}"""


async def mmlu_and_evaluate(model, language_bcp_47, nr):
    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
    if not task:
        return []

    messages = [
        {
            "role": "user",
            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.

Response format: <reasoning> #### <letter>

---

{format_multiple_choice(task)}""",
        },
    ]
    response = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    if response and "####" in response:
        answer = response.split("####")[-1].strip()
        acc = int(answer[:1] == task["answer"])
    else:
        acc = 0

    return [
        {
            "model": model,
...
            "task": "mmlu",
            "metric": "accuracy",
            "score": acc,
            "origin": origin,  # Add origin tag to results
            "sentence_nr": nr,
        }
    ]


async def arc_and_evaluate(model, language_bcp_47, nr):
    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
    if not task:
        return []

    messages = [
        {
            "role": "user",
            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.

Response format: <reasoning> #### <letter>

---

{format_multiple_choice(task)}""",
        },
    ]
    response = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    if response and "####" in response:
        answer = response.split("####")[-1].strip()
        acc = int(answer[:1] == task["answer"])
    else:
        acc = 0
    return [
        {
            "model": model,
...
            "task": "arc",
            "metric": "accuracy",
            "score": acc,
            "origin": origin,
            "sentence_nr": nr,
        }
    ]
...
    text = item["question"] + "\n\n"
    for i, choice in enumerate(item["choices"]):
        text += f"{letters[i]}: {choice}\n"
    return text


async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
    if not task:
        return []

    # Find the correct answer
    correct_choice_index = task["labels"].index(1)
    answer = letters[correct_choice_index]

    messages = [
        {
            "role": "user",
            "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.

Response format: <reasoning> #### <letter>

---

{format_multiple_choice_truthfulqa(task)}""",
        },
    ]
    response = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1024,  # Increased for reasoning
    )
    if response and "####" in response:
        pred_answer = response.split("####")[-1].strip()
        acc = int(pred_answer[:1].upper() == answer)
    else:
        acc = 0

    return [
        {
            "model": model,
...
            "task": "truthfulqa",
            "metric": "accuracy",
            "score": acc,
            "origin": origin,
            "sentence_nr": nr,
        }
    ]


async def mgsm_and_evaluate(model, language_bcp_47, nr):
    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
    if not question:
        return []

    messages = [
        {
            "role": "user",
            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.

Response format: <reasoning> #### <number>

---

{question["question"]}""",
        },
    ]
    response = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    if response and "####" in response:
        number = response.split("####")[1].strip()
        accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
    else:
...
            "task": "mgsm",
            "metric": "accuracy",
            "score": accuracy,
            "origin": origin,
            "sentence_nr": nr,
        }
    ]
...
    "translation_from": partial(translate_and_evaluate, mode="from"),
    "translation_to": partial(translate_and_evaluate, mode="to"),
    "classification": classify_and_evaluate,
    "mmlu": mmlu_and_evaluate,
    "arc": arc_and_evaluate,
    "truthfulqa": truthfulqa_and_evaluate,
    "mgsm": mgsm_and_evaluate,
}
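All of the graders above share one convention: the model reasons freely and then emits "#### <final answer>", and only the text after the marker is compared to the reference (the MMLU/ARC/TruthfulQA graders take the text after the last marker, MGSM the first segment after it). A minimal standalone parser for that format, named here for illustration only since the task functions inline this logic rather than calling a helper:

def parse_final_answer(response: str | None) -> str | None:
    """Return the text after the last '####' marker, or None if absent."""
    if response and "####" in response:
        return response.split("####")[-1].strip()
    return None


assert parse_final_answer("Let's reason step by step... #### B") == "B"
assert parse_final_answer(None) is None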
evals/translate.py
CHANGED
@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
-    translate_arc(languages)
+    translate_arc(languages)
frontend/package-lock.json
CHANGED
The diff for this file is too large to render. See raw diff.
frontend/package.json
CHANGED
@@ -6,13 +6,12 @@
     "@observablehq/plot": "^0.6.17",
     "@testing-library/dom": "^10.4.0",
     "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^…",
+    "@testing-library/react": "^15.0.0",
     "@testing-library/user-event": "^13.5.0",
     "primeicons": "^7.0.0",
     "primereact": "^10.9.3",
-    "react": "^…",
-    "react-dom": "^…",
-    "react-scripts": "5.0.1",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0",
     "topojson-simplify": "^3.0.3",
     "web-vitals": "^2.1.4"
   },
@@ -41,5 +40,8 @@
       "last 1 safari version"
     ]
   },
-  "proxy": "http://localhost:8000"
+  "proxy": "http://localhost:8000",
+  "devDependencies": {
+    "react-scripts": "^5.0.1"
+  }
 }
frontend/src/App.js
CHANGED
@@ (old hunks; only fragments rendered) Relative to the previous version, App.js adds state for the machine_translated_metrics returned by the API and for the carousel items, replaces the PrimeReact Button components for "📚 About this tool" and "🚀 Add your model (soon)" with custom-styled <button> elements, restyles the work-in-progress banner and the GitHub link, and moves the full-screen Carousel plots (WorldMap, LanguagePlot, SpeakerPlot, HistoryPlot, CostPlot) into state with explicit keys instead of building them inline. The new hunks:
19 |
const [loading, setLoading] = useState(true)
|
20 |
const [error, setError] = useState(null)
|
21 |
const [selectedLanguages, setSelectedLanguages] = useState([])
|
22 |
+
const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
|
23 |
const [dialogVisible, setDialogVisible] = useState(false)
|
24 |
const [aboutVisible, setAboutVisible] = useState(false)
|
25 |
const [contributeVisible, setContributeVisible] = useState(false)
|
26 |
+
|
27 |
+
// Add state for carousel items
|
28 |
+
const [carouselItems, setCarouselItems] = useState([])
|
29 |
+
const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
|
30 |
|
31 |
useEffect(() => {
|
32 |
fetch('/api/data', {
|
|
|
41 |
})
|
42 |
.then(jsonData => {
|
43 |
setData(jsonData)
|
44 |
+
setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
|
45 |
setLoading(false)
|
46 |
})
|
47 |
.catch(err => {
|
|
|
50 |
})
|
51 |
}, [selectedLanguages])
|
52 |
|
53 |
+
// Create carousel items when data is loaded
|
54 |
+
useEffect(() => {
|
55 |
+
if (data) {
|
56 |
+
// Add a small delay to ensure components are ready
|
57 |
+
const timer = setTimeout(() => {
|
58 |
+
setCarouselItems([
|
59 |
+
<WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
|
60 |
+
<LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
|
61 |
+
<SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
|
62 |
+
<HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
|
63 |
+
<CostPlot key="costplot-4" data={data} width={750} height={500} />
|
64 |
+
]);
|
65 |
+
}, 100);
|
66 |
+
|
67 |
+
return () => clearTimeout(timer);
|
68 |
+
}
|
69 |
+
}, [data])
|
70 |
+
|
71 |
const [windowWidth, setWindowWidth] = useState(window.innerWidth)
|
72 |
const [windowHeight, setWindowHeight] = useState(window.innerHeight)
|
73 |
+
|
74 |
useEffect(() => {
|
75 |
const handleResize = () => {
|
76 |
setWindowWidth(window.innerWidth)
|
|
|
80 |
return () => window.removeEventListener('resize', handleResize)
|
81 |
}, [])
|
82 |
|
83 |
+
// Create full-screen carousel items when data or window size changes
|
84 |
+
useEffect(() => {
|
85 |
+
if (data) {
|
86 |
+
const timer = setTimeout(() => {
|
87 |
+
setFullScreenCarouselItems([
|
88 |
+
<WorldMap
|
89 |
+
key="fs-worldmap-0"
|
90 |
+
data={data.countries}
|
91 |
+
allLanguages={data.language_table}
|
92 |
+
width={windowWidth * 0.7}
|
93 |
+
height={windowHeight * 0.6}
|
94 |
+
/>,
|
95 |
+
<LanguagePlot
|
96 |
+
key="fs-langplot-1"
|
97 |
+
data={data}
|
98 |
+
width={windowWidth * 0.7}
|
99 |
+
height={windowHeight * 0.6}
|
100 |
+
/>,
|
101 |
+
<SpeakerPlot
|
102 |
+
key="fs-speakerplot-2"
|
103 |
+
data={data}
|
104 |
+
width={windowWidth * 0.7}
|
105 |
+
height={windowHeight * 0.6}
|
106 |
+
/>,
|
107 |
+
<HistoryPlot
|
108 |
+
key="fs-histplot-3"
|
109 |
+
data={data}
|
110 |
+
width={windowWidth * 0.7}
|
111 |
+
height={windowHeight * 0.6}
|
112 |
+
/>,
|
113 |
+
<CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
|
114 |
+
]);
|
115 |
+
}, 100);
|
116 |
+
|
117 |
+
return () => clearTimeout(timer);
|
118 |
+
}
|
119 |
+
}, [data, windowWidth, windowHeight])
|
120 |
+
|
121 |
return (
|
122 |
<PrimeReactProvider>
|
123 |
<div
|
|
|
132 |
style={{
|
133 |
backgroundColor: '#fff3cd',
|
134 |
color: '#856404',
|
135 |
+
padding: '1rem 1.5rem',
|
136 |
marginBottom: '1rem',
|
137 |
border: '1px solid #ffeeba',
|
138 |
borderRadius: '0.25rem',
|
139 |
+
textAlign: 'center',
|
140 |
+
lineHeight: '1.5',
|
141 |
+
position: 'relative'
|
142 |
}}
|
143 |
>
|
144 |
<strong>Work in Progress:</strong> This dashboard is currently under
|
145 |
+
active development. Evaluation results are not yet final. Note that the visualised results currently stem from sampling 20 instances per combination of model, task, and language. We have evaluated 139 languages across 41 models and 7 tasks, totaling over 300,000 individual evaluations. Only the top 150 languages by speaker count are included in the current evaluation scope. More extensive evaluation runs will be released later this year.
|
146 |
+
</div>
|
147 |
+
<div
|
148 |
+
style={{
|
149 |
+
display: 'flex',
|
150 |
+
justifyContent: 'flex-end',
|
151 |
+
padding: '0 1.5rem',
|
152 |
+
marginBottom: '1rem'
|
153 |
+
}}
|
154 |
+
>
|
155 |
<a
|
156 |
href='https://github.com/datenlabor-bmz/ai-language-monitor'
|
157 |
target='_blank'
|
158 |
rel='noopener noreferrer'
|
159 |
style={{
|
160 |
textDecoration: 'none',
|
161 |
+
color: '#6c757d',
|
162 |
+
fontSize: '1rem',
|
163 |
+
fontWeight: '500',
|
164 |
+
padding: '0.5rem 1rem',
|
165 |
+
borderRadius: '0.375rem',
|
166 |
+
backgroundColor: '#f8f9fa',
|
167 |
+
border: '1px solid #e9ecef',
|
168 |
+
display: 'flex',
|
169 |
+
alignItems: 'center',
|
170 |
+
gap: '0.5rem',
|
171 |
+
transition: 'all 0.2s ease',
|
172 |
+
':hover': {
|
173 |
+
backgroundColor: '#e9ecef',
|
174 |
+
color: '#495057'
|
175 |
+
}
|
176 |
}}
|
177 |
>
|
178 |
+
<i className='pi pi-github' title='View on GitHub' />
|
|
|
|
|
|
|
|
|
179 |
GitHub
|
180 |
</a>
|
181 |
</div>
|
|
|
227 |
<div
|
228 |
style={{
|
229 |
display: 'flex',
|
230 |
+
gap: '0.75rem',
|
231 |
+
marginBottom: '2rem',
|
232 |
flexWrap: 'wrap',
|
233 |
justifyContent: 'center'
|
234 |
}}
|
235 |
>
|
236 |
+
<button
|
|
|
|
|
237 |
onClick={() => setAboutVisible(true)}
|
238 |
style={{
|
239 |
+
background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
|
240 |
+
color: 'white',
|
241 |
+
border: 'none',
|
242 |
+
padding: '0.75rem 1.5rem',
|
243 |
+
borderRadius: '12px',
|
244 |
+
fontSize: '0.95rem',
|
245 |
+
fontWeight: '500',
|
246 |
+
cursor: 'pointer',
|
247 |
+
display: 'flex',
|
248 |
+
alignItems: 'center',
|
249 |
+
gap: '0.5rem',
|
250 |
+
boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
|
251 |
+
transition: 'all 0.3s ease',
|
252 |
+
':hover': {
|
253 |
+
transform: 'translateY(-2px)',
|
254 |
+
boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
|
255 |
+
}
|
256 |
}}
|
257 |
+
onMouseEnter={(e) => {
|
258 |
+
e.target.style.transform = 'translateY(-2px)';
|
259 |
+
e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
|
260 |
+
}}
|
261 |
+
onMouseLeave={(e) => {
|
262 |
+
e.target.style.transform = 'translateY(0)';
|
263 |
+
e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
|
264 |
+
}}
|
265 |
+
>
|
266 |
+
<span style={{ fontSize: '1.1rem' }}>📚</span>
|
267 |
+
About this tool
|
268 |
+
</button>
|
269 |
|
270 |
+
<button
|
|
|
|
|
271 |
onClick={() => setContributeVisible(true)}
|
272 |
+
title='This feature is on our roadmap and will be available soon.'
|
|
|
273 |
style={{
|
274 |
+
background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
|
275 |
+
color: '#6b46c1',
|
276 |
+
border: 'none',
|
277 |
+
padding: '0.75rem 1.5rem',
|
278 |
+
borderRadius: '12px',
|
279 |
+
fontSize: '0.95rem',
|
280 |
+
fontWeight: '500',
|
281 |
+
cursor: 'pointer',
|
282 |
+
display: 'flex',
|
283 |
+
alignItems: 'center',
|
284 |
+
gap: '0.5rem',
|
285 |
+
boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
|
286 |
+
transition: 'all 0.3s ease',
|
287 |
+
position: 'relative',
|
288 |
+
overflow: 'hidden'
|
289 |
}}
|
290 |
+
onMouseEnter={(e) => {
|
291 |
+
e.target.style.transform = 'translateY(-2px)';
|
292 |
+
e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
|
293 |
+
}}
|
294 |
+
onMouseLeave={(e) => {
|
295 |
+
e.target.style.transform = 'translateY(0)';
|
296 |
+
e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
|
297 |
+
}}
|
298 |
+
>
|
299 |
+
<span style={{ fontSize: '1.1rem' }}>🚀</span>
|
300 |
+
Add your model
|
301 |
+
<span style={{
|
302 |
+
fontSize: '0.75rem',
|
303 |
+
backgroundColor: 'rgba(107, 70, 193, 0.15)',
|
304 |
+
padding: '0.2rem 0.5rem',
|
305 |
+
borderRadius: '6px',
|
306 |
+
marginLeft: '0.5rem',
|
307 |
+
fontWeight: '600'
|
308 |
+
}}>
|
309 |
+
soon
|
310 |
+
</span>
|
311 |
+
</button>
|
312 |
</div>
|
313 |
|
314 |
{data && (
|
|
|
347 |
data={data.model_table}
|
348 |
selectedLanguages={selectedLanguages}
|
349 |
allLanguages={data.language_table || []}
|
350 |
+
machineTranslatedMetrics={machineTranslatedMetrics}
|
351 |
/>
|
352 |
<LanguageTable
|
353 |
data={data.language_table}
|
|
|
376 |
color: '#666'
|
377 |
}}
|
378 |
/>
|
379 |
+
{carouselItems.length > 0 && (
|
380 |
+
<Carousel
|
381 |
+
key={`main-carousel-${carouselItems.length}-${Date.now()}`}
|
382 |
+
value={carouselItems}
|
383 |
+
numScroll={1}
|
384 |
+
numVisible={1}
|
385 |
+
itemTemplate={item => item}
|
386 |
+
circular={false}
|
387 |
+
activeIndex={0}
|
388 |
+
style={{ width: '100%', minHeight: '650px' }}
|
389 |
+
/>
|
390 |
+
)}
|
|
|
|
|
391 |
</div>
|
392 |
</>
|
393 |
)}
|
|
|
535 |
modal
|
536 |
header={null}
|
537 |
>
|
538 |
+
{fullScreenCarouselItems.length > 0 && (
|
539 |
<div style={{ width: '100%', height: '100%' }}>
|
540 |
<Carousel
|
541 |
+
key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
|
542 |
+
value={fullScreenCarouselItems}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
numScroll={1}
|
544 |
numVisible={1}
|
545 |
itemTemplate={item => item}
|
546 |
+
circular={false}
|
547 |
+
activeIndex={0}
|
548 |
style={{ width: '100%', height: 'calc(90vh - 120px)' }}
|
549 |
/>
|
550 |
</div>
|
|
|
555 |
)
|
556 |
}
|
557 |
|
558 |
+
export default App
|
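The two carousel effects in the updated App.js above follow the same pattern: the plot components are built inside a short setTimeout once data is available, and the timer is cleared in the effect cleanup so a newer render (fresh data or a window resize) cannot be overwritten by a stale update. A minimal self-contained sketch of that pattern, with a hypothetical component and item shape that are not part of the PR:

// DelayedItems.js - hypothetical illustration of the delayed-setState pattern
// used for the carousels above; component name and data shape are made up.
import React, { useEffect, useState } from 'react'

const DelayedItems = ({ data }) => {
  const [items, setItems] = useState([])

  useEffect(() => {
    if (!data) return
    // Defer building the elements briefly so layout has settled.
    const timer = setTimeout(() => {
      setItems(data.map(d => <div key={d.id}>{d.label}</div>))
    }, 100)
    // Cleanup: a stale timer never fires after data changes or unmount.
    return () => clearTimeout(timer)
  }, [data])

  return <>{items}</>
}

export default DelayedItems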
frontend/src/components/HistoryPlot.js
CHANGED
@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
       ...models.filter(d => d.newRecord),
       {
         creation_date: new Date(),
-        maxAverage: models[models.length - 1]
+        maxAverage: models[models.length - 1]?.maxAverage || 0
       }
     ],
     {
       x: d => d.creation_date,
-      y: d => d.maxAverage,
+      y: d => d.maxAverage || 0,
       curve: 'step-after',
       strokeOpacity: 0.3
     }
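The HistoryPlot fix above addresses a real bug: the synthetic last point previously received the whole model object instead of its maxAverage number, so the y accessor returned undefined. A small plain-JavaScript sketch of the guarded access (sample data is hypothetical):

// Hypothetical data to illustrate the HistoryPlot change above.
const models = [
  { creation_date: new Date('2024-01-01'), maxAverage: 0.42 }
]

// Before: the object itself was assigned, so d.maxAverage is undefined downstream.
const broken = models[models.length - 1]

// After: optional chaining plus a fallback keeps the value numeric,
// even when the array is empty or the field is missing.
const fixed = models[models.length - 1]?.maxAverage || 0

console.log(typeof broken, typeof fixed) // 'object' 'number'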
frontend/src/components/LanguageTable.js
CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
       filterElement={familyRowFilterTemplate}
       style={{ minWidth: '10rem' }}
     />
-    {ScoreColumns}
+    {ScoreColumns()}
   </DataTable>
 )
}
frontend/src/components/ModelTable.js
CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
 import ScoreColumns from './ScoreColumns'
-const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
+const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
   const [filters, setFilters] = useState({
     type: { value: null, matchMode: FilterMatchMode.IN },
     size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }

   const SliderWithLabel = ({ value, onChange, min, max }) => {
-    const p = 10
-    const start = value === null ? min : Math.log(value[0]) / Math.log(p)
-    const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
-    const [_value, _setValue] = useState([start, stop])
+    const p = 10;
+    const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
+    const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
+    const [_value, _setValue] = useState([start, stop]);
     useEffect(() => {
       const timer = setTimeout(() => {
         onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
           // set to "no filter" when (almost) the whole range is selected
           _value[0] <= min + 0.1 && _value[1] >= max - 0.1
             ? null
-            : [p ** _value[0], p ** _value[1]]
-        })
-      }, 1000)
-      return () => clearTimeout(timer)
-    }, [_value, onChange, min, max])
+            : [p ** _value[0], p ** _value[1]],
+        });
+      }, 1000);
+      return () => clearTimeout(timer);
+    }, [_value, onChange, min, max]);
     return (
       <div style={{ minWidth: '20rem' }}>
         <div>{formatSize(p ** _value[0])}</div>
@@ -147,21 +147,35 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }

   const costBodyTemplate = rowData => {
-    return
+    return (
+      <div style={{ textAlign: 'center' }}>
+        {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
+      </div>
+    )
   }

   const getHeaderText = () => {
-    // Count languages that have evaluation data (
-    const evaluatedLanguagesCount = allLanguages.filter(lang =>
-      …
+    // Count languages that have any evaluation data (any task scores available)
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      // Check if language has any task scores (not just average)
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length

     if (selectedLanguages.length === 0) {
       return (
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
           <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
-            …
+            Performance across {evaluatedLanguagesCount} evaluated languages
           </span>
         </span>
       )
@@ -245,7 +259,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
         body={costBodyTemplate}
         style={{ minWidth: '5rem' }}
       />
-      {ScoreColumns}
+      {ScoreColumns(machineTranslatedMetrics)}
     </DataTable>
   )
 }
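The SliderWithLabel change above keeps the model-size filter on a log10 scale: the slider position is the base-10 exponent, and the actual parameter count is recovered as p ** exponent; the new null checks only guard against half-open ranges. A small stand-alone sketch of that mapping (helper names are illustrative, not from the PR):

// Log10 mapping used by the size slider above (helper names are hypothetical).
const p = 10

// Raw parameter count (e.g. 8e9) -> slider position (exponent).
const toSliderPosition = size => Math.log(size) / Math.log(p)

// Slider position -> parameter count for the table filter.
const fromSliderPosition = exponent => p ** exponent

console.log(toSliderPosition(8030261248).toFixed(2))  // "9.90"
console.log(fromSliderPosition(9.9).toExponential(2)) // "7.94e+9"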
frontend/src/components/ScoreColumns.js
CHANGED
@@ -2,21 +2,28 @@ import { Column } from 'primereact/column'
 import ScoreField from './ScoreField'

 const scoreBodyTemplate = (field, options = {}) => {
-  const { minScore = 0, maxScore = 1 } = options
+  const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options

   return rowData => {
     const score = rowData[field]
-    …
+    // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
+    // otherwise fall back to global list
+    const rowFlagKey = `${field}_is_machine`
+    const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
+    const isMachineTranslated = hasRowFlag
+      ? !!rowData[rowFlagKey]
+      : machineTranslatedMetrics.includes(field)
+    return ScoreField(score, minScore, maxScore, isMachineTranslated)
   }
 }

-const ScoreColumns = [
+const ScoreColumns = (machineTranslatedMetrics = []) => [
   <Column
     field='average'
     header='Proficiency'
     headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
-    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
+    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
@@ -26,7 +33,8 @@
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -37,7 +45,8 @@
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -48,7 +57,8 @@
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -69,7 +79,8 @@
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -80,7 +91,8 @@
     sortable
     body={scoreBodyTemplate('arc_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -91,7 +103,8 @@
     sortable
     body={scoreBodyTemplate('mgsm_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
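Since ScoreColumns is now a factory rather than a constant array, each table passes the metric names it wants flagged; ModelTable.js forwards the backend's machine_translated_metrics list and LanguageTable.js calls it with no arguments, as the hunks above show. A hedged usage sketch (the wrapper component and the sample metric list below are illustrative only):

// Hypothetical wrapper showing the new call pattern; not part of the PR.
import React from 'react'
import { DataTable } from 'primereact/datatable'
import ScoreColumns from './ScoreColumns'

const exampleMachineTranslatedMetrics = ['arc_accuracy', 'truthfulqa_accuracy'] // sample values

const ExampleTable = ({ rows }) => (
  <DataTable value={rows}>
    {/* Returns an array of <Column> elements; flagged metrics render with an asterisk. */}
    {ScoreColumns(exampleMachineTranslatedMetrics)}
  </DataTable>
)

export default ExampleTable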
frontend/src/components/ScoreField.js
CHANGED
@@ -1,4 +1,4 @@
-const ScoreField = (score, minScore, maxScore) => {
+const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
   if (score !== null) {
@@ -50,6 +50,7 @@ const ScoreField = (score, minScore, maxScore) => {
       }}
     >
       {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
+      {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
     </span>
   </div>
 )
frontend/src/components/SpeakerPlot.js
CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
       textStrokeOpacity: 0,
       textFillOpacity: 0
     }),
-    Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+    ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
       x: 40,
       y: languages[39].cumSpeakers / 1e6
-    })
+    })] : [])
   ]
 })
 containerRef.current.append(plot)
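The SpeakerPlot guard above relies on conditionally spreading an optional mark into the marks array, so the tip that indexes languages[39] is only added when at least 40 languages are present. The same pattern in isolation (plain JavaScript, sample data hypothetical):

// Conditionally including an element in an array via spread (sample data is made up).
const languages = Array.from({ length: 25 }, (_, i) => ({ cumSpeakers: (i + 1) * 1e8 }))

const marks = [
  'baseline-mark',
  // The annotation is only added when the element it reads (index 39) exists.
  ...(languages.length >= 40 ? ['tip-mark-for-40th-language'] : [])
]

console.log(marks.length) // 1 - the tip mark is skipped for short lists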
frontend/src/components/WorldMap.js
CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
       a =>
         `${smoothProgressBar(a.population / pop)} ${
           a.name
-        } – ${a.score.toFixed(2)}`
+        } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
     )
     .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
-  return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
+  return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
 }

-const WorldMap = ({ data, width = 750, height = 500 }) => {
+const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()

@@ -48,8 +48,22 @@
     acc[country.iso2] = country
     return acc
   }, {})
+  // Count languages that have any evaluation data
+  const evaluatedLanguagesCount = allLanguages.filter(lang => {
+    const hasAnyScores = [
+      'translation_from_bleu',
+      'translation_to_bleu',
+      'classification_accuracy',
+      'mmlu_accuracy',
+      'arc_accuracy',
+      'truthfulqa_accuracy',
+      'mgsm_accuracy'
+    ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+    return hasAnyScores
+  }).length
+
   const plot = Plot.plot({
-    subtitle:
+    subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
     width: width,
     height: height,
     projection: 'equal-earth',
@@ -61,11 +75,12 @@
       })
     ],
     color: {
-      scheme: '
-      unknown: '
+      scheme: 'RdYlGn',
+      unknown: '#d0d0d0',
       label: 'Score',
       legend: true,
-      domain: [0, 1]
+      domain: [0, 1],
+      pivot: 0.5
     },
     style: {
       fontFamily: 'monospace'
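The evaluated-language count above duplicates the filter added to ModelTable.js. Not part of this PR, but as a hedged refactoring sketch the check could live in one shared module (file path and export names below are hypothetical):

// frontend/src/lib/evaluation.js - hypothetical shared helper, not part of the PR.
const SCORE_FIELDS = [
  'translation_from_bleu',
  'translation_to_bleu',
  'classification_accuracy',
  'mmlu_accuracy',
  'arc_accuracy',
  'truthfulqa_accuracy',
  'mgsm_accuracy'
]

// True when a language row carries at least one task score.
export const hasEvaluationScores = lang =>
  SCORE_FIELDS.some(metric => lang[metric] !== null && lang[metric] !== undefined)

// Count used for the table header and the map subtitle.
export const countEvaluatedLanguages = (allLanguages = []) =>
  allLanguages.filter(hasEvaluationScores).length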
languages.json
CHANGED
Common Voice metadata refresh. Each touched entry updates its "commonvoice_hours" value (the previous values are cut off in this extract); the new hours per Common Voice locale are:

  en 2683.0; es 449.0; fr 1073.0; pt 181.0; ru 247.0; de 1372.0; sw 412.0; id 34.0;
  pa-IN 2.5; ps 82.0; pl 176.0; nl 123.0; yo 6.4; kk 2.3; hu 94.0; ckb 136.0;
  be 1812.0; tg 0.6; af 0.6; ca 2883.0; he 2.0; ug 437.0; kmr 71.0; sk 52.0;
  gn 4.5; nb-NO 1.8; ka 167.0; gl 164.0; brh 11.0; haz 11.0; oc 1.9; ga-IE 9.3;
  lv 263.0; eu 453.0; kbd 106.0; yi 1.8; br 30.0; ady 32.0; dar 1.3; eo 1437.0

In addition, "commonvoice_locale" is filled in for four entries that previously lacked it (seh, mfe, gaa, pcd, each with 0.0 recorded hours), and the Hindi entry's "commonvoice_locale" is set to "hi" (its 16.0 hours are unchanged).
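Because these Common Voice fields are filled in by a data refresh, a small check can flag benchmark languages that still lack hours or a locale before results are published. A hedged sketch (Node.js; the script name is hypothetical and it assumes it is run from the repository root where languages.json lives):

// check-commonvoice.js - hypothetical helper script, not part of the PR.
const fs = require('fs')

const languages = JSON.parse(fs.readFileSync('languages.json', 'utf8'))

const incomplete = languages.filter(
  lang =>
    lang.in_benchmark &&
    (lang.commonvoice_hours === null || lang.commonvoice_locale === null)
)

for (const lang of incomplete) {
  // flores_path / fleurs_tag are the identifiers visible in this file.
  console.log(`${lang.flores_path || lang.fleurs_tag || 'unknown'}: missing Common Voice metadata`)
}
console.log(`${incomplete.length} incomplete benchmark entries`)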
models.json
CHANGED
@@ -20,15 +20,15 @@
|
|
20 |
]
|
21 |
},
|
22 |
{
|
23 |
-
"id":"anthropic\/claude-3
|
24 |
-
"name":"Claude 3
|
25 |
"provider_name":"Anthropic",
|
26 |
-
"cost":
|
27 |
"hf_id":null,
|
28 |
"size":null,
|
29 |
"type":"closed-source",
|
30 |
"license":null,
|
31 |
-
"creation_date":
|
32 |
"tasks":[
|
33 |
"translation_from",
|
34 |
"translation_to",
|
@@ -80,15 +80,15 @@
|
|
80 |
]
|
81 |
},
|
82 |
{
|
83 |
-
"id":"
|
84 |
-
"name":"
|
85 |
-
"provider_name":"
|
86 |
"cost":0.0,
|
87 |
-
"hf_id":"
|
88 |
-
"size":
|
89 |
"type":"open-source",
|
90 |
-
"license":"",
|
91 |
-
"creation_date":
|
92 |
"tasks":[
|
93 |
"translation_from",
|
94 |
"translation_to",
|
@@ -100,15 +100,15 @@
|
|
100 |
]
|
101 |
},
|
102 |
{
|
103 |
-
"id":"
|
104 |
-
"name":"
|
105 |
-
"provider_name":"
|
106 |
-
"cost":0.
|
107 |
-
"hf_id":
|
108 |
-
"size":
|
109 |
-
"type":"
|
110 |
-
"license":
|
111 |
-
"creation_date":
|
112 |
"tasks":[
|
113 |
"translation_from",
|
114 |
"translation_to",
|
@@ -120,15 +120,15 @@
|
|
120 |
]
|
121 |
},
|
122 |
{
|
123 |
-
"id":"deepseek\/deepseek-
|
124 |
-
"name":"
|
125 |
"provider_name":"DeepSeek",
|
126 |
-
"cost":0.
|
127 |
-
"hf_id":"deepseek-ai\/DeepSeek-
|
128 |
"size":684531386000.0,
|
129 |
"type":"open-source",
|
130 |
-
"license":"
|
131 |
-
"creation_date":
|
132 |
"tasks":[
|
133 |
"translation_from",
|
134 |
"translation_to",
|
@@ -140,15 +140,15 @@
|
|
140 |
]
|
141 |
},
|
142 |
{
|
143 |
-
"id":"deepseek\/deepseek-
|
144 |
-
"name":"
|
145 |
"provider_name":"DeepSeek",
|
146 |
"cost":0.0,
|
147 |
-
"hf_id":"deepseek-ai\/DeepSeek-
|
148 |
"size":684531386000.0,
|
149 |
"type":"open-source",
|
150 |
"license":"Mit",
|
151 |
-
"creation_date":
|
152 |
"tasks":[
|
153 |
"translation_from",
|
154 |
"translation_to",
|
@@ -160,15 +160,15 @@
|
|
160 |
]
|
161 |
},
|
162 |
{
|
163 |
-
"id":"
|
164 |
-
"name":"
|
165 |
-
"provider_name":"
|
166 |
-
"cost":0.
|
167 |
-
"hf_id":
|
168 |
-
"size":
|
169 |
-
"type":"
|
170 |
-
"license":
|
171 |
-
"creation_date":
|
172 |
"tasks":[
|
173 |
"translation_from",
|
174 |
"translation_to",
|
@@ -180,15 +180,15 @@
|
|
180 |
]
|
181 |
},
|
182 |
{
|
183 |
-
"id":"
|
184 |
-
"name":"
|
185 |
-
"provider_name":"
|
186 |
-
"cost":0.
|
187 |
-
"hf_id":
|
188 |
-
"size":
|
189 |
-
"type":"
|
190 |
-
"license":
|
191 |
-
"creation_date":
|
192 |
"tasks":[
|
193 |
"translation_from",
|
194 |
"translation_to",
|
@@ -200,15 +200,15 @@
|
|
200 |
]
|
201 |
},
|
202 |
{
|
203 |
-
"id":"
|
204 |
-
"name":"
|
205 |
-
"provider_name":"
|
206 |
-
"cost":
|
207 |
-
"hf_id":
|
208 |
-
"size":
|
209 |
-
"type":"
|
210 |
-
"license":
|
211 |
-
"creation_date":
|
212 |
"tasks":[
|
213 |
"translation_from",
|
214 |
"translation_to",
|
@@ -220,69 +220,15 @@
|
|
220 |
]
|
221 |
},
|
222 |
{
|
223 |
-
"id":"google\/gemini-2.
|
224 |
-
"name":"Gemini 2.
|
225 |
"provider_name":"Google",
|
226 |
"cost":0.4,
|
227 |
"hf_id":null,
|
228 |
"size":null,
|
229 |
"type":"closed-source",
|
230 |
"license":null,
|
231 |
-
"creation_date":
|
232 |
-
"tasks":[
|
233 |
-
"translation_from",
|
234 |
-
"translation_to",
|
235 |
-
"classification",
|
236 |
-
"mmlu",
|
237 |
-
"mgsm"
|
238 |
-
]
|
239 |
-
},
|
240 |
-
{
|
241 |
-
"id":"google\/gemini-2.5-flash-preview",
|
242 |
-
"name":"Gemini 2.5 Flash Preview 04-17",
|
243 |
-
"provider_name":"Google",
|
244 |
-
"cost":0.6,
|
245 |
-
"hf_id":null,
|
246 |
-
"size":null,
|
247 |
-
"type":"closed-source",
|
248 |
-
"license":null,
|
249 |
-
"creation_date":1744848000000.0,
|
250 |
-
"tasks":[
|
251 |
-
"translation_from",
|
252 |
-
"translation_to",
|
253 |
-
"classification",
|
254 |
-
"mmlu",
|
255 |
-
"mgsm"
|
256 |
-
]
|
257 |
-
},
|
258 |
-
{
|
259 |
-
"id":"google\/gemini-2.5-flash-preview-05-20",
|
260 |
-
"name":"Gemini 2.5 Flash Preview 05-20",
|
261 |
-
"provider_name":"Google",
|
262 |
-
"cost":0.6,
|
263 |
-
"hf_id":null,
|
264 |
-
"size":null,
|
265 |
-
"type":"closed-source",
|
266 |
-
"license":null,
|
267 |
-
"creation_date":1747699200000.0,
|
268 |
-
"tasks":[
|
269 |
-
"translation_from",
|
270 |
-
"translation_to",
|
271 |
-
"classification",
|
272 |
-
"mmlu",
|
273 |
-
"mgsm"
|
274 |
-
]
|
275 |
-
},
|
276 |
-
{
|
277 |
-
"id":"google\/gemini-2.5-pro",
|
278 |
-
"name":"Gemini 2.5 Pro",
|
279 |
-
"provider_name":"Google",
|
280 |
-
"cost":10.0,
|
281 |
-
"hf_id":null,
|
282 |
-
"size":null,
|
283 |
-
"type":"closed-source",
|
284 |
-
"license":null,
|
285 |
-
"creation_date":1750118400000,
|
286 |
"tasks":[
|
287 |
"translation_from",
|
288 |
"translation_to",
|
@@ -294,51 +240,15 @@
|
|
294 |
]
|
295 |
},
|
296 |
{
|
297 |
-
"id":"google\/gemini-2.
|
298 |
-
"name":"Gemini 2.
|
299 |
-
"provider_name":"Google",
|
300 |
-
"cost":10.0,
|
301 |
-
"hf_id":null,
|
302 |
-
"size":null,
|
303 |
-
"type":"closed-source",
|
304 |
-
"license":null,
|
305 |
-
"creation_date":1749081600000.0,
|
306 |
-
"tasks":[
|
307 |
-
"translation_from",
|
308 |
-
"translation_to",
|
309 |
-
"classification",
|
310 |
-
"mmlu",
|
311 |
-
"mgsm"
|
312 |
-
]
|
313 |
-
},
|
314 |
-
{
|
315 |
-
"id":"google\/gemini-2.5-pro-preview-05-06",
|
316 |
-
"name":"Gemini 2.5 Pro Preview 05-06",
|
317 |
-
"provider_name":"Google",
|
318 |
-
"cost":10.0,
|
319 |
-
"hf_id":null,
|
320 |
-
"size":null,
|
321 |
-
"type":"closed-source",
|
322 |
-
"license":null,
|
323 |
-
"creation_date":1746576000000.0,
|
324 |
-
"tasks":[
|
325 |
-
"translation_from",
|
326 |
-
"translation_to",
|
327 |
-
"classification",
|
328 |
-
"mmlu",
|
329 |
-
"mgsm"
|
330 |
-
]
|
331 |
-
},
|
332 |
-
{
|
333 |
-
"id":"google\/gemini-flash-1.5",
|
334 |
-
"name":"Gemini 1.5 Flash ",
|
335 |
"provider_name":"Google",
|
336 |
"cost":0.3,
|
337 |
"hf_id":null,
|
338 |
"size":null,
|
339 |
"type":"closed-source",
|
340 |
"license":null,
|
341 |
-
"creation_date":
|
342 |
"tasks":[
|
343 |
"translation_from",
|
344 |
"translation_to",
|
@@ -350,15 +260,15 @@
|
|
350 |
]
|
351 |
},
|
352 |
{
|
353 |
-
"id":"google\/gemini-
|
354 |
-
"name":"Gemini
|
355 |
"provider_name":"Google",
|
356 |
-
"cost":
|
357 |
"hf_id":null,
|
358 |
"size":null,
|
359 |
"type":"closed-source",
|
360 |
"license":null,
|
361 |
-
"creation_date":
|
362 |
"tasks":[
|
363 |
"translation_from",
|
364 |
"translation_to",
|
@@ -370,12 +280,12 @@
|
|
370 |
]
|
371 |
},
|
372 |
{
|
373 |
-
"id":"google\/gemma-3-
|
374 |
-
"name":"Gemma 3
|
375 |
"provider_name":"Google",
|
376 |
"cost":0.0,
|
377 |
-
"hf_id":"google\/gemma-3-
|
378 |
-
"size":
|
379 |
"type":"open-source",
|
380 |
"license":"Gemma",
|
381 |
"creation_date":1740787200000,
|
@@ -390,30 +300,15 @@
|
|
390 |
]
|
391 |
},
|
392 |
{
|
393 |
-
"id":"google\/
|
394 |
-
"name":"
|
395 |
"provider_name":"Google",
|
396 |
-
"cost":
|
397 |
-
"hf_id":
|
398 |
-
"size":
|
399 |
-
"type":"closed-source",
|
400 |
-
"license":null,
|
401 |
-
"creation_date":null,
|
402 |
-
"tasks":[
|
403 |
-
"translation_from",
|
404 |
-
"translation_to"
|
405 |
-
]
|
406 |
-
},
|
407 |
-
{
|
408 |
-
"id":"gryphe\/mythomax-l2-13b",
|
409 |
-
"name":"MythoMax 13B",
|
410 |
-
"provider_name":"MythoMax 13B",
|
411 |
-
"cost":0.07,
|
412 |
-
"hf_id":"Gryphe\/MythoMax-L2-13b",
|
413 |
-
"size":null,
|
414 |
"type":"open-source",
|
415 |
-
"license":"
|
416 |
-
"creation_date":
|
417 |
"tasks":[
|
418 |
"translation_from",
|
419 |
"translation_to",
|
@@ -464,30 +359,6 @@
|
|
464 |
"mgsm"
|
465 |
]
|
466 |
},
|
467 |
-
{
|
468 |
-
"id":"meta-llama\/llama-3.1-8b-instruct",
|
469 |
-
"name":"Llama 3.1 8B Instruct",
|
470 |
-
"provider_name":"Meta",
|
471 |
-
"cost":0.0,
|
472 |
-
"hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
|
473 |
-
"size":8030261248.0,
|
474 |
-
"type":"open-source",
|
475 |
-
"license":"Llama3.1",
|
476 |
-
"creation_date":1721260800000.0,
|
477 |
-
"tasks":null
|
478 |
-
},
|
479 |
-
{
|
480 |
-
"id":"meta-llama\/llama-3.2-1b-instruct",
|
481 |
-
"name":"Llama 3.2 1B Instruct",
|
482 |
-
"provider_name":"Meta",
|
483 |
-
"cost":0.0,
|
484 |
-
"hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
|
485 |
-
"size":1235814400.0,
|
486 |
-
"type":"open-source",
|
487 |
-
"license":"Llama3.2",
|
488 |
-
"creation_date":1726617600000.0,
|
489 |
-
"tasks":null
|
490 |
-
},
|
491 |
{
|
492 |
"id":"meta-llama\/llama-3.3-70b-instruct",
|
493 |
"name":"Llama 3.3 70B Instruct",
|
@@ -568,6 +439,26 @@
|
|
568 |
"mgsm"
|
569 |
]
|
570 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
571 |
{
|
572 |
"id":"mistralai\/mistral-nemo",
|
573 |
"name":"Mistral Nemo",
|
@@ -629,15 +520,55 @@
|
|
629 |
]
|
630 |
},
|
631 |
{
|
632 |
-
"id":"
|
633 |
-
"name":"
|
634 |
-
"provider_name":"
|
635 |
-
"cost":
|
636 |
-
"hf_id":
|
637 |
"size":null,
|
638 |
-
"type":"
|
639 |
-
"license":
|
640 |
-
"creation_date":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
641 |
"tasks":[
|
642 |
"translation_from",
|
643 |
"translation_to",
|
@@ -708,6 +639,26 @@
|
|
708 |
"mgsm"
|
709 |
]
|
710 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
711 |
{
|
712 |
"id":"openai\/gpt-4o-mini",
|
713 |
"name":"GPT-4o-mini",
|
@@ -728,6 +679,86 @@
|
|
728 |
"mgsm"
|
729 |
]
|
730 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
{
|
732 |
"id":"qwen\/qwen3-235b-a22b",
|
733 |
"name":"Qwen3 235B A22B",
|
@@ -772,7 +803,7 @@
|
|
772 |
"id":"qwen\/qwen3-32b",
|
773 |
"name":"Qwen3 32B",
|
774 |
"provider_name":"Qwen",
|
775 |
-
"cost":0.
|
776 |
"hf_id":"Qwen\/Qwen3-32B",
|
777 |
"size":32762123264.0,
|
778 |
"type":"open-source",
|
@@ -787,5 +818,120 @@
|
|
787 |
"truthfulqa",
|
788 |
"mgsm"
|
789 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
790 |
}
|
791 |
]
|
|
|
20 |
]
|
21 |
},
|
22 |
{
|
23 |
+
"id":"anthropic\/claude-3-haiku",
|
24 |
+
"name":"Claude 3 Haiku",
|
25 |
"provider_name":"Anthropic",
|
26 |
+
"cost":1.25,
|
27 |
"hf_id":null,
|
28 |
"size":null,
|
29 |
"type":"closed-source",
|
30 |
"license":null,
|
31 |
+
"creation_date":1710288000000,
|
32 |
"tasks":[
|
33 |
"translation_from",
|
34 |
"translation_to",
|
|
|
80 |
]
|
81 |
},
|
82 |
{
|
83 |
+
"id":"arliai\/qwq-32b-arliai-rpr-v1",
|
84 |
+
"name":"QwQ 32B RpR v1",
|
85 |
+
"provider_name":"ArliAI",
|
86 |
"cost":0.0,
|
87 |
+
"hf_id":"ArliAI\/QwQ-32B-ArliAI-RpR-v1",
|
88 |
+
"size":32763876352.0,
|
89 |
"type":"open-source",
|
90 |
+
"license":"Apache 2.0",
|
91 |
+
"creation_date":1743984000000,
|
92 |
"tasks":[
|
93 |
"translation_from",
|
94 |
"translation_to",
|
|
|
100 |
]
|
101 |
},
|
102 |
{
|
103 |
+
"id":"cohere\/command-r-08-2024",
|
104 |
+
"name":"Command R (08-2024)",
|
105 |
+
"provider_name":"Cohere",
|
106 |
+
"cost":0.6,
|
107 |
+
"hf_id":null,
|
108 |
+
"size":null,
|
109 |
+
"type":"closed-source",
|
110 |
+
"license":null,
|
111 |
+
"creation_date":1724976000000,
|
112 |
"tasks":[
|
113 |
"translation_from",
|
114 |
"translation_to",
|
|
|
120 |
]
|
121 |
},
|
122 |
{
|
123 |
+
"id":"deepseek\/deepseek-chat",
|
124 |
+
"name":"DeepSeek V3",
|
125 |
"provider_name":"DeepSeek",
|
126 |
+
"cost":0.8,
|
127 |
+
"hf_id":"deepseek-ai\/DeepSeek-V3",
|
128 |
"size":684531386000.0,
|
129 |
"type":"open-source",
|
130 |
+
"license":"",
|
131 |
+
"creation_date":1735084800000,
|
132 |
"tasks":[
|
133 |
"translation_from",
|
134 |
"translation_to",
|
|
|
140 |
]
|
141 |
},
|
142 |
{
|
143 |
+
"id":"deepseek\/deepseek-chat-v3-0324",
|
144 |
+
"name":"DeepSeek V3 0324",
|
145 |
"provider_name":"DeepSeek",
|
146 |
"cost":0.0,
|
147 |
+
"hf_id":"deepseek-ai\/DeepSeek-V3-0324",
|
148 |
"size":684531386000.0,
|
149 |
"type":"open-source",
|
150 |
"license":"Mit",
|
151 |
+
"creation_date":1742774400000,
|
152 |
"tasks":[
|
153 |
"translation_from",
|
154 |
"translation_to",
|
|
|
160 |
]
|
161 |
},
|
162 |
{
|
163 |
+
"id":"deepseek\/deepseek-chat-v3.1",
|
164 |
+
"name":"DeepSeek V3.1",
|
165 |
+
"provider_name":"DeepSeek",
|
166 |
+
"cost":0.0,
|
167 |
+
"hf_id":"deepseek-ai\/DeepSeek-V3.1",
|
168 |
+
"size":684531386000.0,
|
169 |
+
"type":"open-source",
|
170 |
+
"license":"Mit",
|
171 |
+
"creation_date":1755734400000,
|
172 |
"tasks":[
|
173 |
"translation_from",
|
174 |
"translation_to",
|
|
|
180 |
]
|
181 |
},
|
182 |
{
|
183 |
+
"id":"deepseek\/deepseek-r1",
|
184 |
+
"name":"R1",
|
185 |
+
"provider_name":"DeepSeek",
|
186 |
+
"cost":0.0,
|
187 |
+
"hf_id":"deepseek-ai\/DeepSeek-R1",
|
188 |
+
"size":684531386000.0,
|
189 |
+
"type":"open-source",
|
190 |
+
"license":"Mit",
|
191 |
+
"creation_date":1737331200000,
|
192 |
"tasks":[
|
193 |
"translation_from",
|
194 |
"translation_to",
|
|
|
200 |
]
|
201 |
},
|
202 |
{
|
203 |
+
"id":"deepseek\/deepseek-r1-0528-qwen3-8b",
|
204 |
+
"name":"Deepseek R1 0528 Qwen3 8B",
|
205 |
+
"provider_name":"DeepSeek",
|
206 |
+
"cost":0.0,
|
207 |
+
"hf_id":"deepseek-ai\/DeepSeek-R1-0528-Qwen3-8B",
|
208 |
+
"size":8190735360.0,
|
209 |
+
"type":"open-source",
|
210 |
+
"license":"Mit",
|
211 |
+
"creation_date":1748476800000,
|
212 |
"tasks":[
|
213 |
"translation_from",
|
214 |
"translation_to",
|
|
|
220 |
]
|
221 |
},
|
222 |
{
|
223 |
+
"id":"google\/gemini-2.0-flash-001",
|
224 |
+
"name":"Gemini 2.0 Flash",
|
225 |
"provider_name":"Google",
|
226 |
"cost":0.4,
|
227 |
"hf_id":null,
|
228 |
"size":null,
|
229 |
"type":"closed-source",
|
230 |
"license":null,
|
231 |
+
"creation_date":1738713600000,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
"tasks":[
|
233 |
"translation_from",
|
234 |
"translation_to",
|
|
|
240 |
]
|
241 |
},
|
242 |
{
|
243 |
+
"id":"google\/gemini-2.0-flash-lite-001",
|
244 |
+
"name":"Gemini 2.0 Flash Lite",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245    "provider_name":"Google",
246    "cost":0.3,
247    "hf_id":null,
248    "size":null,
249    "type":"closed-source",
250    "license":null,
251 +  "creation_date":1740441600000,
252    "tasks":[
253    "translation_from",
254    "translation_to",
260    ]
261    },
262    {
263 +  "id":"google\/gemini-2.5-flash",
264 +  "name":"Gemini 2.5 Flash",
265    "provider_name":"Google",
266 +  "cost":2.5,
267    "hf_id":null,
268    "size":null,
269    "type":"closed-source",
270    "license":null,
271 +  "creation_date":1750118400000,
272    "tasks":[
273    "translation_from",
274    "translation_to",
280    ]
281    },
282    {
283 +  "id":"google\/gemma-3-12b-it",
284 +  "name":"Gemma 3 12B",
285    "provider_name":"Google",
286    "cost":0.0,
287 +  "hf_id":"google\/gemma-3-12b-it",
288 +  "size":12187325040.0,
289    "type":"open-source",
290    "license":"Gemma",
291    "creation_date":1740787200000,
300    ]
301    },
302    {
303 +  "id":"google\/gemma-3-27b-it",
304 +  "name":"Gemma 3 27B",
305    "provider_name":"Google",
306 +  "cost":0.0,
307 +  "hf_id":"google\/gemma-3-27b-it",
308 +  "size":27432406640.0,
309    "type":"open-source",
310 +  "license":"Gemma",
311 +  "creation_date":1740787200000,
312    "tasks":[
313    "translation_from",
314    "translation_to",
359    "mgsm"
360    ]
361    },
362    {
363    "id":"meta-llama\/llama-3.3-70b-instruct",
364    "name":"Llama 3.3 70B Instruct",
439    "mgsm"
440    ]
441    },
442 +  {
443 +  "id":"mistralai\/mistral-7b-instruct-v0.3",
444 +  "name":"Mistral 7B Instruct v0.3",
445 +  "provider_name":"Mistral",
446 +  "cost":0.05,
447 +  "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
448 +  "size":7248023552.0,
449 +  "type":"open-source",
450 +  "license":"Apache 2.0",
451 +  "creation_date":1716336000000,
452 +  "tasks":[
453 +  "translation_from",
454 +  "translation_to",
455 +  "classification",
456 +  "mmlu",
457 +  "arc",
458 +  "truthfulqa",
459 +  "mgsm"
460 +  ]
461 +  },
462    {
463    "id":"mistralai\/mistral-nemo",
464    "name":"Mistral Nemo",
520    ]
521    },
522    {
523 +  "id":"moonshotai\/kimi-k2",
524 +  "name":"Kimi K2",
525 +  "provider_name":"MoonshotAI",
526 +  "cost":0.0,
527 +  "hf_id":"moonshotai\/Kimi-K2-Instruct",
528    "size":null,
529 +  "type":"open-source",
530 +  "license":"Other",
531 +  "creation_date":1752192000000,
532 +  "tasks":[
533 +  "translation_from",
534 +  "translation_to",
535 +  "classification",
536 +  "mmlu",
537 +  "arc",
538 +  "truthfulqa",
539 +  "mgsm"
540 +  ]
541 +  },
542 +  {
543 +  "id":"neversleep\/llama-3-lumimaid-70b",
544 +  "name":"Llama 3 Lumimaid 70B",
545 +  "provider_name":"NeverSleep",
546 +  "cost":6.0,
547 +  "hf_id":"NeverSleep\/Llama-3-Lumimaid-70B-v0.1",
548 +  "size":70553706496.0,
549 +  "type":"open-source",
550 +  "license":"Cc By Nc 4.0",
551 +  "creation_date":1714262400000,
552 +  "tasks":[
553 +  "translation_from",
554 +  "translation_to",
555 +  "classification",
556 +  "mmlu",
557 +  "arc",
558 +  "truthfulqa",
559 +  "mgsm"
560 +  ]
561 +  },
562 +  {
563 +  "id":"nvidia\/llama-3.1-nemotron-70b-instruct",
564 +  "name":"Llama 3.1 Nemotron 70B Instruct",
565 +  "provider_name":"NVIDIA",
566 +  "cost":0.3,
567 +  "hf_id":"nvidia\/Llama-3.1-Nemotron-70B-Instruct-HF",
568 +  "size":70553706496.0,
569 +  "type":"open-source",
570 +  "license":"Llama3.1",
571 +  "creation_date":1728691200000,
572    "tasks":[
573    "translation_from",
574    "translation_to",
639    "mgsm"
640    ]
641    },
642 +  {
643 +  "id":"openai\/gpt-4o-2024-11-20",
644 +  "name":"GPT-4o (2024-11-20)",
645 +  "provider_name":"OpenAI",
646 +  "cost":10.0,
647 +  "hf_id":null,
648 +  "size":null,
649 +  "type":"closed-source",
650 +  "license":null,
651 +  "creation_date":1732060800000,
652 +  "tasks":[
653 +  "translation_from",
654 +  "translation_to",
655 +  "classification",
656 +  "mmlu",
657 +  "arc",
658 +  "truthfulqa",
659 +  "mgsm"
660 +  ]
661 +  },
662    {
663    "id":"openai\/gpt-4o-mini",
664    "name":"GPT-4o-mini",
679    "mgsm"
680    ]
681    },
682 +  {
683 +  "id":"openai\/gpt-5",
684 +  "name":"GPT-5",
685 +  "provider_name":"OpenAI",
686 +  "cost":10.0,
687 +  "hf_id":null,
688 +  "size":null,
689 +  "type":"closed-source",
690 +  "license":null,
691 +  "creation_date":1754524800000,
692 +  "tasks":[
693 +  "translation_from",
694 +  "translation_to",
695 +  "classification",
696 +  "mmlu",
697 +  "arc",
698 +  "truthfulqa",
699 +  "mgsm"
700 +  ]
701 +  },
702 +  {
703 +  "id":"openai\/gpt-5-nano",
704 +  "name":"GPT-5 Nano",
705 +  "provider_name":"OpenAI",
706 +  "cost":0.4,
707 +  "hf_id":null,
708 +  "size":null,
709 +  "type":"closed-source",
710 +  "license":null,
711 +  "creation_date":1754524800000,
712 +  "tasks":[
713 +  "translation_from",
714 +  "translation_to",
715 +  "classification",
716 +  "mmlu",
717 +  "arc",
718 +  "truthfulqa",
719 +  "mgsm"
720 +  ]
721 +  },
722 +  {
723 +  "id":"openai\/gpt-oss-120b",
724 +  "name":"gpt-oss-120b",
725 +  "provider_name":"OpenAI",
726 +  "cost":0.0,
727 +  "hf_id":"openai\/gpt-oss-120b",
728 +  "size":120412337472.0,
729 +  "type":"open-source",
730 +  "license":"Apache 2.0",
731 +  "creation_date":1754265600000,
732 +  "tasks":[
733 +  "translation_from",
734 +  "translation_to",
735 +  "classification",
736 +  "mmlu",
737 +  "arc",
738 +  "truthfulqa",
739 +  "mgsm"
740 +  ]
741 +  },
742 +  {
743 +  "id":"qwen\/qwen-2.5-coder-32b-instruct",
744 +  "name":"Qwen2.5 Coder 32B Instruct",
745 +  "provider_name":"Qwen2.5 Coder 32B Instruct (free)",
746 +  "cost":0.0,
747 +  "hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
748 +  "size":32763876352.0,
749 +  "type":"open-source",
750 +  "license":"Apache 2.0",
751 +  "creation_date":1730851200000,
752 +  "tasks":[
753 +  "translation_from",
754 +  "translation_to",
755 +  "classification",
756 +  "mmlu",
757 +  "arc",
758 +  "truthfulqa",
759 +  "mgsm"
760 +  ]
761 +  },
762    {
763    "id":"qwen\/qwen3-235b-a22b",
764    "name":"Qwen3 235B A22B",
803    "id":"qwen\/qwen3-32b",
804    "name":"Qwen3 32B",
805    "provider_name":"Qwen",
806 +  "cost":0.07,
807    "hf_id":"Qwen\/Qwen3-32B",
808    "size":32762123264.0,
809    "type":"open-source",
818    "truthfulqa",
819    "mgsm"
820    ]
821 +  },
822 +  {
823 +  "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
824 +  "name":"Typhoon2 70B Instruct",
825 +  "provider_name":"Typhoon2 70B Instruct",
826 +  "cost":0.88,
827 +  "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
828 +  "size":70553706496.0,
829 +  "type":"open-source",
830 +  "license":"Llama3.1",
831 +  "creation_date":1734220800000,
832 +  "tasks":[
833 +  "translation_from",
834 +  "translation_to",
835 +  "classification",
836 +  "mmlu",
837 +  "arc",
838 +  "truthfulqa",
839 +  "mgsm"
840 +  ]
841 +  },
842 +  {
843 +  "id":"tencent\/hunyuan-a13b-instruct",
844 +  "name":"Hunyuan A13B Instruct",
845 +  "provider_name":"Tencent",
846 +  "cost":0.0,
847 +  "hf_id":"tencent\/Hunyuan-A13B-Instruct",
848 +  "size":80393183232.0,
849 +  "type":"open-source",
850 +  "license":"Other",
851 +  "creation_date":1750809600000,
852 +  "tasks":[
853 +  "translation_from",
854 +  "translation_to",
855 +  "classification",
856 +  "mmlu",
857 +  "arc",
858 +  "truthfulqa",
859 +  "mgsm"
860 +  ]
861 +  },
862 +  {
863 +  "id":"thedrummer\/anubis-pro-105b-v1",
864 +  "name":"Anubis Pro 105B V1",
865 +  "provider_name":"TheDrummer",
866 +  "cost":1.0,
867 +  "hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
868 +  "size":104779882496.0,
869 +  "type":"open-source",
870 +  "license":"Other",
871 +  "creation_date":1738454400000,
872 +  "tasks":[
873 +  "translation_from",
874 +  "translation_to",
875 +  "classification",
876 +  "mmlu",
877 +  "arc",
878 +  "truthfulqa",
879 +  "mgsm"
880 +  ]
881 +  },
882 +  {
883 +  "id":"x-ai\/grok-4",
884 +  "name":"Grok 4",
885 +  "provider_name":"xAI",
886 +  "cost":15.0,
887 +  "hf_id":null,
888 +  "size":null,
889 +  "type":"closed-source",
890 +  "license":null,
891 +  "creation_date":1752019200000,
892 +  "tasks":[
893 +  "translation_from",
894 +  "translation_to",
895 +  "classification",
896 +  "mmlu",
897 +  "arc",
898 +  "truthfulqa",
899 +  "mgsm"
900 +  ]
901 +  },
902 +  {
903 +  "id":"z-ai\/glm-4.5v",
904 +  "name":"GLM 4.5V",
905 +  "provider_name":"Z.AI",
906 +  "cost":1.8,
907 +  "hf_id":"zai-org\/GLM-4.5V",
908 +  "size":107710933120.0,
909 +  "type":"open-source",
910 +  "license":"Mit",
911 +  "creation_date":1754784000000,
912 +  "tasks":[
913 +  "translation_from",
914 +  "translation_to",
915 +  "classification",
916 +  "mmlu",
917 +  "arc",
918 +  "truthfulqa",
919 +  "mgsm"
920 +  ]
921 +  },
922 +  {
923 +  "id":"google\/translate-v2",
924 +  "name":"Google Translate",
925 +  "provider_name":"Google",
926 +  "cost":20.0,
927 +  "hf_id":null,
928 +  "size":null,
929 +  "type":"closed-source",
930 +  "license":null,
931 +  "creation_date":null,
932 +  "tasks":[
933 +  "translation_from",
934 +  "translation_to"
935 +  ]
936    }
937    ]
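
For orientation (editorial note, not part of the diff): every entry above follows the same schema — id, name, provider_name, cost, hf_id, size, type, license, creation_date (a Unix timestamp in milliseconds, or null), and a tasks list. A minimal Python sketch of how the file can be consumed, assuming models.json sits at the repository root:

# Sketch only (not part of this PR): read models.json and inspect the fields
# shown in the diff above. Assumes the file sits at the repository root.
import json
from datetime import datetime, timezone

with open("models.json") as f:
    models = json.load(f)

def created(model):
    # creation_date is stored as milliseconds since the Unix epoch (or null)
    ts = model.get("creation_date")
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc).date() if ts else None

# e.g. list the open-source models that are evaluated on MGSM
for m in models:
    if m["type"] == "open-source" and "mgsm" in m["tasks"]:
        print(m["id"], m["cost"], created(m))
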
pyproject.toml
CHANGED
@@ -44,3 +44,13 @@ dev = [
44     "scipy>=1.16.0",
45     "seaborn>=0.13.2",
46     ]
47 +
48 +   [build-system]
49 +   requires = ["hatchling"]
50 +   build-backend = "hatchling.build"
51 +
52 +   [tool.hatch.build.targets.wheel]
53 +   packages = ["evals"]
54 +
55 +   [tool.uv]
56 +   package = true
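
Editorial note (not part of the diff): the new [build-system] and [tool.hatch.build.targets.wheel] tables make the repo an installable package containing the evals code, and [tool.uv] package = true tells uv to build and install the project itself during uv sync. A quick, illustrative check that the package resolves in a synced environment:

# Sketch only: after `uv sync`, the local `evals` package should be importable.
from importlib.util import find_spec

print("evals importable:", find_spec("evals") is not None)
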
results.json
CHANGED
@@ -1,3 +1,3 @@
1      version https://git-lfs.github.com/spec/v1
2 -    oid sha256:
3 -    size
2 +    oid sha256:649509b8373b76e51a79809fdab77badff44e5536ca3bd8e3eb409f406b6ecda
3 +    size 13260774
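
Editorial note (not part of the diff): results.json is tracked with Git LFS, so the repository stores only this pointer file — oid is the SHA-256 of the real payload and size is its length in bytes (about 13 MB here); git lfs pull fetches the actual data. A small illustrative parser for such pointer files:

# Sketch only: read a Git LFS pointer file (version / oid / size lines).
def parse_lfs_pointer(path="results.json"):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer()
print(pointer["oid"], pointer["size"])  # e.g. sha256:6495... 13260774
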
uv.lock
CHANGED
The diff for this file is too large to render.
See raw diff