Spaces:

fair-forward
/

evals-for-every-language

Runtime error

App Files Community

davidpomerenke committed about 13 hours ago

Commit

a0d1624

verified ·

1 Parent(s): c790fdb

Upload from GitHub Actions: Merge pull request #18 from datenlabor-bmz/pr-17

Browse files

Files changed (38) hide show

.DS_Store +0 -0
.github/workflows/nightly-evals.yml +4 -0
.gitignore +2 -0
Dockerfile +1 -1
README.md +13 -0
datasets.json +6 -6
evals/__init__.py +0 -1
evals/backend.py +139 -22
evals/countries.py +10 -4
evals/datasets_/__init__.py +1 -1
evals/datasets_/arc.py +44 -27
evals/datasets_/fleurs.py +2 -1
evals/datasets_/mgsm.py +47 -23
evals/datasets_/mmlu.py +57 -25
evals/datasets_/truthfulqa.py +66 -28
evals/datasets_/util.py +8 -0
evals/download_data.py +33 -16
evals/languages.py +3 -0
evals/main.py +176 -48
evals/models.py +126 -36
evals/plots.py +75 -41
evals/tasks.py +130 -142
evals/translate.py +1 -1
frontend/package-lock.json +0 -0
frontend/package.json +7 -5
frontend/src/App.js +183 -77
frontend/src/components/HistoryPlot.js +2 -2
frontend/src/components/LanguageTable.js +1 -1
frontend/src/components/ModelTable.js +31 -17
frontend/src/components/ScoreColumns.js +23 -10
frontend/src/components/ScoreField.js +2 -1
frontend/src/components/SpeakerPlot.js +2 -2
frontend/src/components/WorldMap.js +22 -7
languages.json +49 -49
models.json +362 -216
pyproject.toml +10 -0
results.json +2 -2
uv.lock +0 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.github/workflows/nightly-evals.yml CHANGED Viewed

@@ -8,6 +8,8 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -25,6 +27,8 @@ jobs:
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py

 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # Experimental: testing whether a longer timeout takes effect, in case eval runs exceed the 6-hour GitHub Actions allowance
+    timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+          N_SENTENCES: 20
+          MAX_LANGUAGES: 150
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py

.gitignore CHANGED Viewed

@@ -20,3 +20,5 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore

 # folders and files to be ignored
 .specstory/
 .cursorindexingignore

Dockerfile CHANGED Viewed

@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build

 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
+COPY --chown=user pyproject.toml uv.lock README.md ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build

README.md CHANGED Viewed

@@ -43,8 +43,21 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 _Tracking language proficiency of AI models for every language_
 ## Evaluate
 ```bash
 uv run --extra dev evals/main.py
 ```

 _Tracking language proficiency of AI models for every language_
+## System Architecture
+The AI Language Monitor evaluates language models across 100+ languages using a comprehensive pipeline that combines model discovery, automated evaluation, and real-time visualization.
+> **Detailed Architecture**: See [system_architecture_diagram.md](system_architecture_diagram.md) for the complete system architecture diagram and component descriptions.
+**Key Features:**
+- **Model Discovery**: Combines curated models with real-time trending models via web scraping
+- **Multi-Task Evaluation**: 7 tasks across 100+ languages with origin tracking (human vs machine-translated)
+- **Scalable Architecture**: Dual deployment (local/GitHub vs Google Cloud)
+- **Real-time Visualization**: Interactive web interface with country-level insights
 ## Evaluate
+### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```

datasets.json CHANGED Viewed

@@ -219,7 +219,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
-        "implemented": true,
         "group": "Multitask Language Understanding"
     },
     {
@@ -256,7 +256,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
-        "implemented": true,
         "group": "Multitask Language Understanding"
     },
     {
@@ -360,7 +360,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
-        "implemented": true,
         "group": "ARC Question Answering"
     },
     {
@@ -375,7 +375,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
-        "implemented": true,
         "group": "ARC Question Answering"
     },
     {
@@ -420,7 +420,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
-        "implemented": true,
         "group": "Truthfulness"
     },
     {
@@ -435,7 +435,7 @@
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
-        "implemented": true,
         "group": "Truthfulness"
     },
     {

         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
+        "implemented": false,
         "group": "Multitask Language Understanding"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "MMLU",
+        "implemented": false,
         "group": "Multitask Language Understanding"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
+        "implemented": false,
         "group": "ARC Question Answering"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "AI2 ARC",
+        "implemented": false,
         "group": "ARC Question Answering"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
+        "implemented": false,
         "group": "Truthfulness"
     },
     {
         "parallel": true,
         "translation": "machine",
         "base": "TruthfulQA",
+        "implemented": false,
         "group": "Truthfulness"
     },
     {

evals/__init__.py CHANGED Viewed

	@@ -1 +0,0 @@
1	-

evals/backend.py CHANGED Viewed

@@ -4,7 +4,8 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-from countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -26,7 +27,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    # "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
@@ -39,28 +40,77 @@ def compute_normalized_average(df, metrics):
             col_min = normalized_df[col].min()
             col_max = normalized_df[col].max()
             if col_max > col_min:  # Avoid division by zero
-                normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
             else:
                 normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)
-def make_model_table(df, models):
-    df = (
-        df.groupby(["model", "task", "metric"])
-        .agg({"score": "mean", "bcp_47": "nunique"})
-        .reset_index()
     )
-    df["task_metric"] = df["task"] + "_" + df["metric"]
-    df = df.drop(columns=["task", "metric"])
-    df = df.pivot(index="model", columns="task_metric", values="score")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
     df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
     df = df[
         [
             "rank",
@@ -74,27 +124,81 @@ def make_model_table(df, models):
             "license",
             "cost",
             "average",
-            *task_metrics,
         ]
     ]
     return df
-def make_language_table(df, languages):
-    df = (
-        df.groupby(["bcp_47", "task", "metric"])
-        .agg({"score": "mean", "model": "nunique"})
-        .reset_index()
     )
-    df["task_metric"] = df["task"] + "_" + df["metric"]
-    df = df.drop(columns=["task", "metric"])
-    df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
     df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[
         [
             "bcp_47",
@@ -104,7 +208,7 @@ def make_language_table(df, languages):
             "family",
             "average",
             "in_benchmark",
-            *task_metrics,
         ]
     ]
     return df
@@ -125,10 +229,22 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
@@ -143,6 +259,7 @@ async def data(request: Request):
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
     }
     return JSONResponse(content=all_tables)

 import numpy as np
 import pandas as pd
 import uvicorn
+from evals.countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
             col_min = normalized_df[col].min()
             col_max = normalized_df[col].max()
             if col_max > col_min:  # Avoid division by zero
+                normalized_df[col] = (normalized_df[col] - col_min) / (
+                    col_max - col_min
+                )
             else:
                 normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)
+def make_model_table(scores_df, models):
+    # Create a combined task_metric for origin
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = scores_df.pivot_table(
+        index="model",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
     )
+    # Create the regular task_metric for the main average calculation
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="model", columns="task_metric", values="score", aggfunc="mean"
+    )
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
     df["average"] = compute_normalized_average(df, task_metrics)
+    # Compute origin presence per model+metric
+    origin_presence = (
+        scores_df.groupby(["model", "task_metric", "origin"])
+        .size()
+        .unstack(fill_value=0)
+    )
+    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for model in df.index:
+                try:
+                    counts = origin_presence.loc[(model, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = (
+                    counts.get(machine_col_name, 0) if machine_col_name else 0
+                )
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
     df = df[
         [
             "rank",
             "license",
             "cost",
             "average",
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df
+def make_language_table(scores_df, languages):
+    # Create a combined task_metric for origin
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = scores_df.pivot_table(
+        index="bcp_47",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
+    )
+    # Create the regular task_metric for the main average calculation
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
     )
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
     df["average"] = compute_normalized_average(df, task_metrics)
+    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
+    origin_presence = (
+        scores_df.groupby(["bcp_47", "task_metric", "origin"])
+        .size()
+        .unstack(fill_value=0)
+    )
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for bcp in df.index:
+                try:
+                    counts = origin_presence.loc[(bcp, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = (
+                    counts.get(machine_col_name, 0) if machine_col_name else 0
+                )
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
+    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
+    for metric in task_metrics:
+        machine_col = f"{metric}_machine"
+        if machine_col in df.columns:
+            df[f"{metric}_is_machine"] = df[machine_col].notna()
+        else:
+            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
     df = df[
         [
             "bcp_47",
             "family",
             "average",
             "in_benchmark",
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
+    df = (
+        scores.groupby(["model", "bcp_47", "task", "metric", "origin"])
+        .mean()
+        .reset_index()
+    )
     # lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
     language_table = make_language_table(df, languages)
     datasets_df = pd.read_json("datasets.json")
+    # Identify which metrics have machine translations available
+    machine_translated_metrics = set()
+    for _, row in df.iterrows():
+        if row["origin"] == "machine":
+            metric_name = f"{row['task']}_{row['metric']}"
+            machine_translated_metrics.add(metric_name)
     if selected_languages:
         # the filtering is only applied for the model table and the country data
         df = df[df["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
+        "machine_translated_metrics": list(machine_translated_metrics),
     }
     return JSONResponse(content=all_tables)

evals/countries.py CHANGED Viewed

@@ -15,6 +15,7 @@ def population(bcp_47):
     }
     return items
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
@@ -30,10 +31,15 @@ def make_country_table(language_table):
             )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-        score = (
-            sum(entry["score"] * entry["population"] for entry in languages)
-            / speaker_pop
-        )
         countries[country] = {
             "score": score,
             "languages": languages,

     }
     return items
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
             )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
+        if speaker_pop < 1000:  # Grey out low-population countries
+            score = None  # This will make them appear grey on the map
+        else:
+            score = (
+                sum(entry["score"] * entry["population"] for entry in languages)
+                / speaker_pop
+            )
         countries[country] = {
             "score": score,
             "languages": languages,

evals/datasets_/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- # ~~This file makes datasets_ a Python package~~


1	+

evals/datasets_/arc.py CHANGED Viewed

@@ -1,11 +1,10 @@
 import random
-from collections import Counter, defaultdict
-from langcodes import Language, standardize_tag
 from rich import print
-from models import translate_google, google_supported_languages
 from tqdm import tqdm
-from datasets import Dataset, load_dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -14,27 +13,33 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 random.seed(42)
-id_sets_train = [set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"]) for tag in tags_uhura_arc_easy.values()]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -45,37 +50,40 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy", examples, task
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(slug_uhura_arc_easy_translated, tags_uhura_arc_easy_translated[language_bcp_47])
         ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
-        # raise Exception(language_bcp_47)
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated", examples, task
     else:
         return None, None, None
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in google_supported_languages
     ]
     n_samples = 10
-    train_ids = common_ids_train[:n_samples+3]
-    en_train = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train")
     en_train = en_train.filter(lambda x: x["id"] in train_ids)
     test_ids = common_ids_test[:n_samples]
-    en_test = _load_dataset(slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test")
     en_test = en_test.filter(lambda x: x["id"] in test_ids)
     data = {"train": en_train, "test": en_test}
     slug = "fair-forward/arc-easy-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -84,16 +92,22 @@ def translate_arc(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split, data_en in data.items():
-                questions_tr = [translate_google(q, "en", lang) for q in data_en["question"]]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                 choices_texts_concatenated = []
                 for choice in data_en["choices"]:
                     for option in choice["text"]:
                         choices_texts_concatenated.append(option)
-                choices_tr = [translate_google(c, "en", lang) for c in choices_texts_concatenated]
                 choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                 # group into chunks of 4
-                choices_tr = [choices_tr[i:i+4] for i in range(0, len(choices_tr), 4)]
                 ds_lang = Dataset.from_dict(
                     {
@@ -110,5 +124,8 @@ def translate_arc(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/arc/{lang}_{split}.json", lines=False, force_ascii=False, indent=2
                 )

 import random
+from langcodes import standardize_tag
 from rich import print
+from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
+from datasets import load_dataset, Dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 random.seed(42)
+id_sets_train = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
+id_sets_test = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
         ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
         ds = ds.map(add_choices)
         ds = ds.rename_column("answerKey", "answer")
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
+        return "masakhane/uhura-arc-easy", task, "human"
     if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
+        ds = _load_dataset(
+            slug_uhura_arc_easy_translated,
+            tags_uhura_arc_easy_translated[language_bcp_47],
+        )
         ds = ds.rename_column("answerKey", "answer")
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
+        return "fair-forward/arc-easy-autotranslated", task, "machine"
     else:
         return None, None, None
 def translate_arc(languages):
     human_translated = tags_uhura_arc_easy.keys()
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
+    train_ids = common_ids_train[: n_samples + 3]
+    en_train = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
+    )
     en_train = en_train.filter(lambda x: x["id"] in train_ids)
     test_ids = common_ids_test[:n_samples]
+    en_test = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
+    )
     en_test = en_test.filter(lambda x: x["id"] in test_ids)
     data = {"train": en_train, "test": en_test}
     slug = "fair-forward/arc-easy-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split, data_en in data.items():
+                questions_tr = [
+                    translate_google(q, "en", lang) for q in data_en["question"]
+                ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                 choices_texts_concatenated = []
                 for choice in data_en["choices"]:
                     for option in choice["text"]:
                         choices_texts_concatenated.append(option)
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
                 choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                 # group into chunks of 4
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]
                 ds_lang = Dataset.from_dict(
                     {
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
+                    f"data/translations/arc/{lang}_{split}.json",
+                    lines=False,
+                    force_ascii=False,
+                    indent=2,
                 )

evals/datasets_/fleurs.py CHANGED Viewed

@@ -11,6 +11,7 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
@@ -34,4 +35,4 @@ def download_fleurs(transcription_langs_eval):
         if not tsv_path.exists():
             print(f"Downloading {tsv_url} to {tsv_path}")
             tsv_path.parent.mkdir(parents=True, exist_ok=True)
-            download_file(tsv_url, tsv_path)

     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
         if not tsv_path.exists():
             print(f"Downloading {tsv_url} to {tsv_path}")
             tsv_path.parent.mkdir(parents=True, exist_ok=True)
+            download_file(tsv_url, tsv_path)

evals/datasets_/mgsm.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import asyncio
 import os
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
-from langcodes import standardize_tag
-from models import google_supported_languages, translate_google
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -37,31 +39,50 @@ def parse_number(i):
         return None
 def load_mgsm(language_bcp_47, nr):
     if language_bcp_47 in tags_mgsm.keys():
-        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr]
     elif language_bcp_47 in tags_afrimgsm.keys():
-        ds = _load_dataset(
-            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
-        return slug_afrimgsm, ds[nr]
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated, subset=tags_gsm_autotranslated[language_bcp_47], split="test"
         )
-        return slug_gsm_autotranslated, ds[nr]
-    elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row
     else:
-        return None, None
 def translate_mgsm(languages):
@@ -69,7 +90,7 @@ def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in google_supported_languages
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
@@ -96,5 +117,8 @@ def translate_mgsm(languages):
                 token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
             )
             ds_lang.to_json(
-                f"data/translations/mgsm/{lang}.json", lines=False, force_ascii=False, indent=2
             )

 import asyncio
 import os
+import random
 from datasets import Dataset, load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from langcodes import Language, standardize_tag
+from models import get_google_supported_languages, translate_google
+from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
         return None
+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Load row ``nr`` of the ``test`` split of one MGSM-style dataset subset.
+
+    Args:
+        dataset_slug: Dataset identifier passed to ``_load_dataset``.
+        subset_tag: Subset/config name passed to ``_load_dataset``.
+        nr: Zero-based row index.
+        trust_remote_code: Forwarded unchanged to ``_load_dataset``.
+
+    Returns:
+        The row, or ``None`` when ``nr`` is out of range or loading fails.
+        Rows from ``slug_gsm8kx`` additionally get an ``answer_number`` field
+        holding the text that follows ``####`` in ``answer``.
+
+    Results (including ``None``) are stored by the ``cache`` decorator.
+    """
+    if nr < 0:
+        # A negative index would wrap around and return a row from the end.
+        return None
+    try:
+        ds = _load_dataset(
+            dataset_slug,
+            subset=subset_tag,
+            split="test",
+            trust_remote_code=trust_remote_code,
+        )
+        if nr >= len(ds):
+            return None
+        row = ds[nr]
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+        return row
+    except Exception as e:
+        # Best effort: the dataset may not exist or may lack a test split.
+        # Report the failure instead of hiding it, so missing data is traceable.
+        print(f"Could not load {dataset_slug}/{subset_tag} item {nr}: {e!r}")
+        return None
 def load_mgsm(language_bcp_47, nr):
+    """Return ``(dataset_slug, item, origin)`` for sample ``nr`` of a language.
+
+    Sources are tried in this order: MGSM, AfriMGSM, GSM8K-X, then the
+    auto-translated GSM set. ``origin`` is the label ``"human"`` or
+    ``"machine"`` attached to the source. ``(None, None, None)`` is returned
+    when no source covers the language or the item could not be loaded.
+    """
+    # NOTE: the result tuple must be parenthesised. Without the parentheses
+    # the conditional expression binds only to the last element and a miss
+    # yields ``(slug, None, (None, None, None))``.
     if language_bcp_47 in tags_mgsm.keys():
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return (slug_mgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return (slug_afrimgsm, item, "human") if item else (None, None, None)
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        item = _get_mgsm_item(
+            slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
         )
+        return (slug_gsm8kx, item, "machine") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
+        item = _get_mgsm_item(
+            slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
         )
+        return (
+            (slug_gsm_autotranslated, item, "machine")
+            if item
+            else (None, None, None)
+        )
     else:
+        return None, None, None
 def translate_mgsm(languages):
     untranslated = [
         lang
         for lang in languages["bcp_47"].values[:100]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
     slug = "fair-forward/gsm-autotranslated"
                 token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
             )
             ds_lang.to_json(
+                f"data/translations/mgsm/{lang}.json",
+                lines=False,
+                force_ascii=False,
+                indent=2,
             )

evals/datasets_/mmlu.py CHANGED Viewed

@@ -4,9 +4,9 @@ import random
 from collections import Counter, defaultdict
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
 from langcodes import Language, standardize_tag
-from models import google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
@@ -111,6 +111,7 @@ def print_datasets_analysis():
 # MMLUX is translated using DeepL
 # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 # print_datasets_analysis()
@@ -143,32 +144,61 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 categories = sorted(
-        list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
-    )
-def load_mmlu(language_bcp_47, nr):
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
-        ds = ds.map(parse_choices)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu", examples, task
     elif language_bcp_47 in tags_global_mmlu.keys():
-        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
-        ds = ds.map(add_choices)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        examples = ds["dev"].filter(lambda x: x["subject"] == category)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "fair-forward/mmlu-autotranslated", examples, task
     else:
         return None, None, None
@@ -177,10 +207,10 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 10
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -196,8 +226,10 @@ def translate_mmlu(languages):
                     if split == "dev":
                         samples.extend(ds.filter(lambda x: x["subject"] == category))
                     else:
-                        for i in range(n_samples):
-                            task = ds.filter(lambda x: x["subject"] == category)[i]
                             samples.append(task)
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples

 from collections import Counter, defaultdict
 from datasets import Dataset, load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
+from models import get_google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio
 # MMLUX is translated using DeepL
 # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 # print_datasets_analysis()
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}
 categories = sorted(
+    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
+)
+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Load one MMLU dataset subset and apply its dataset-specific row mapping.
+
+    ``masakhane/afrimmlu`` rows are mapped through ``parse_choices`` and
+    ``CohereForAI/Global-MMLU`` rows through ``add_choices``; any other
+    dataset is returned as loaded. The result is stored by the ``cache``
+    decorator so the mapping runs once per ``(dataset_name, subset_tag)``.
+    """
+    ds = _load_dataset(dataset_name, subset_tag)
+    # NOTE(review): both helpers presumably normalise the answer options into
+    # a common ``choices`` field - confirm against their definitions.
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds
+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Return test row ``nr`` among the rows whose ``subject`` is ``category``.
+
+    Args:
+        dataset_name: Dataset identifier passed to ``_get_processed_mmlu_dataset``.
+        subset_tag: Subset/config name of that dataset.
+        category: Value the ``subject`` column must equal.
+        nr: Zero-based index into the filtered ``test`` split.
+
+    Returns:
+        The row, or ``None`` when ``nr`` is outside the filtered split.
+    """
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    # Every dataset handled here is filtered the same way, so no per-dataset
+    # branching is needed.
+    filtered = ds["test"].filter(lambda x: x["subject"] == category)
+    # Reject negative indices too: they would wrap around to the end.
+    return filtered[nr] if 0 <= nr < len(filtered) else None
+async def load_mmlu(language_bcp_47, nr):
+    """Return ``(dataset_slug, task, origin)`` for MMLU sample ``nr``.
+
+    The subject is chosen as ``categories[nr % len(categories)]``. Sources
+    are tried in this order: AfriMMLU, Global-MMLU, then the auto-translated
+    set. ``origin`` is the label ``"human"`` or ``"machine"`` attached to the
+    source. ``(None, None, None)`` is returned when no source covers the
+    language or the item does not exist.
+    """
+    # NOTE: the result tuple must be parenthesised. Without the parentheses
+    # the conditional expression binds only to the last element and a miss
+    # yields ``(slug, None, (None, None, None))``.
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
+        task = _get_mmlu_item(
+            "masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr
+        )
+        return ("masakhane/afrimmlu", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
+        task = _get_mmlu_item(
+            "CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr
+        )
+        return (
+            ("CohereForAI/Global-MMLU", task, "human")
+            if task
+            else (None, None, None)
+        )
+    # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
+        # Look up the real config name: the dict key is the standardized tag,
+        # which need not equal the name the dataset was published under.
+        task = _get_mmlu_item(
+            "fair-forward/mmlu-autotranslated",
+            tags_mmlu_autotranslated[language_bcp_47],
+            category,
+            nr,
+        )
+        return (
+            ("fair-forward/mmlu-autotranslated", task, "machine")
+            if task
+            else (None, None, None)
+        )
     else:
         return None, None, None
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
+        for lang in languages["bcp_47"].values[:150]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
+    n_samples = 20
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
                     if split == "dev":
                         samples.extend(ds.filter(lambda x: x["subject"] == category))
                     else:
+                        # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                        filtered = ds.filter(lambda x: x["subject"] == category)
+                        for i in range(min(n_samples, len(filtered))):
+                            task = filtered[i]
                             samples.append(task)
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples

evals/datasets_/truthfulqa.py CHANGED Viewed

@@ -8,17 +8,29 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
-from datasets import Dataset, load_dataset
-from models import translate_google, google_supported_languages
 from datasets_.util import _get_dataset_config_names, _load_dataset
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
 tags_uhura_truthfulqa = {
-    standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -26,26 +38,42 @@ def add_choices(row):
     return row
-def load_truthfulqa(language_bcp_47, nr):
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
-        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47])
         ds = ds.map(add_choices)
-        examples = ds["train"]
         task = ds["test"][nr]
-        return "masakhane/uhura-truthfulqa", examples, task
     else:
         return None, None, None
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:100]
-        if lang not in human_translated and lang in google_supported_languages
     ]
-    n_samples = 10
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
@@ -55,37 +83,47 @@ def translate_truthfulqa(languages):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split in ["train", "test"]:
-                ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split)
                 samples = []
                 if split == "train":
                     samples.extend(ds)
                 else:
-                    for i in range(n_samples):
                         task = ds[i]
                         samples.append(task)
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples
                 ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-                choices_texts_concatenated = []
                 for s in samples:
-                    for choice in eval(s["choices"]):
-                        choices_texts_concatenated.append(choice)
-                choices_tr = [
-                    translate_google(c, "en", lang) for c in choices_texts_concatenated
-                ]
-                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
-                # group into chunks of 4
-                choices_tr = [
-                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
-                ]
                 ds_lang = Dataset.from_dict(
                     {
-                        "subject": [s["subject"] for s in samples],
                         "question": questions_tr,
-                        "choices": choices_tr,
-                        "answer": [s["answer"] for s in samples],
                     }
                 )
                 ds_lang.push_to_hub(
@@ -95,7 +133,7 @@ def translate_truthfulqa(languages):
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
-                    f"data/translations/mmlu/{lang}_{split}.json",
                     lines=False,
                     force_ascii=False,
                     indent=2,

 from tqdm.asyncio import tqdm_asyncio
 import os
+from datasets import Dataset, load_dataset, DatasetNotFoundError
+from models import translate_google, get_google_supported_languages
 from datasets_.util import _get_dataset_config_names, _load_dataset
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
 tags_uhura_truthfulqa = {
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
+# Get available auto-translated languages
+try:
+    tags_truthfulqa_autotranslated = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
+    }
+except DatasetNotFoundError:
+    tags_truthfulqa_autotranslated = {}
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
     return row
+async def load_truthfulqa(language_bcp_47, nr):
+    """Return ``(dataset_slug, task, origin)`` for TruthfulQA sample ``nr``.
+
+    The Uhura TruthfulQA set is preferred (``origin == "human"``); otherwise
+    the auto-translated set is used (``origin == "machine"``).
+    ``(None, None, None)`` is returned when no source covers the language,
+    when ``nr`` is outside the test split, or when the task has no choice
+    labelled ``1`` (i.e. no correct answer).
+    """
     if language_bcp_47 in tags_uhura_truthfulqa.keys():
+        ds = _load_dataset(
+            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
+        )
         ds = ds.map(add_choices)
+        test_split = ds["test"]
+        # Out-of-range samples are reported as missing, not raised.
+        if not 0 <= nr < len(test_split):
+            return None, None, None
+        task = test_split[nr]
+        # Ensure there is a correct answer before returning the task
+        if 1 not in task["labels"]:
+            return None, None, None
+        return "masakhane/uhura-truthfulqa", task, "human"
+    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
+        # Load from auto-translated dataset (same samples as translation).
+        # Use the stored config name: the dict key is the standardized tag,
+        # which need not equal the name the dataset was published under.
+        ds = _load_dataset(
+            slug_truthfulqa_autotranslated,
+            tags_truthfulqa_autotranslated[language_bcp_47],
+        )
+        test_split = ds["test"] if "test" in ds else ds
+        if not 0 <= nr < len(test_split):
+            return None, None, None
+        task = test_split[nr]
+        # Ensure there is a correct answer before returning the task
+        if 1 not in task.get("labels", []):
+            return None, None, None
+        return slug_truthfulqa_autotranslated, task, "machine"
+    # TODO: add Okapi, TruthfulQA-X @Jonas
     else:
         return None, None, None
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
+        for lang in languages["bcp_47"].values[:150]
+        if lang not in human_translated and lang in get_google_supported_languages()
     ]
+    n_samples = 20
+    # Set fixed seed for consistent sample selection across all languages
+    random.seed(42)
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         except (ValueError, Exception):
             print(f"Translating {lang}...")
             for split in ["train", "test"]:
+                ds = _load_dataset(
+                    slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
+                )
                 samples = []
                 if split == "train":
                     samples.extend(ds)
                 else:
+                    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                    for i in range(min(n_samples, len(ds))):
                         task = ds[i]
                         samples.append(task)
+                # Translate questions
                 questions_tr = [
                     translate_google(s["question"], "en", lang) for s in samples
                 ]
                 questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
+                # Translate choices for each sample
+                all_choices_tr = []
+                all_labels = []
                 for s in samples:
+                    # Get choices from mc1_targets
+                    choices = s["mc1_targets"]["choices"]
+                    labels = s["mc1_targets"]["labels"]
+                    # Translate choices
+                    choices_tr = [
+                        translate_google(choice, "en", lang) for choice in choices
+                    ]
+                    choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+                    all_choices_tr.append(choices_tr)
+                    all_labels.append(labels)
                 ds_lang = Dataset.from_dict(
                     {
                         "question": questions_tr,
+                        "choices": all_choices_tr,
+                        "labels": all_labels,
                     }
                 )
                 ds_lang.push_to_hub(
                     token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                 )
                 ds_lang.to_json(
+                    f"data/translations/truthfulqa/{lang}_{split}.json",
                     lines=False,
                     force_ascii=False,
                     indent=2,

evals/datasets_/util.py CHANGED Viewed

@@ -12,3 +12,11 @@ def _get_dataset_config_names(dataset, **kwargs):
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)

 @cache
 def _load_dataset(dataset, subset, **kwargs):
+    # Thin wrapper around ``load_dataset`` whose results are stored by the
+    # ``cache`` decorator, so repeated calls with the same arguments reuse
+    # the already-loaded dataset.
+    # NOTE(review): presumably ``cache`` is ``functools.cache``, which needs
+    # hashable argument values - confirm against the imports of this module.
     return load_dataset(dataset, subset, **kwargs)
+# Cache individual dataset items to avoid reloading entire datasets
+@cache
+def _get_dataset_item(dataset, subset, split, index, **kwargs):
+    """Return row ``index`` of one dataset split, or ``None`` if out of range.
+
+    Args:
+        dataset: Dataset identifier passed to ``_load_dataset``.
+        subset: Subset/config name passed to ``_load_dataset``.
+        split: Split name, forwarded as the ``split`` keyword.
+        index: Zero-based row index.
+        **kwargs: Extra keyword arguments forwarded to ``_load_dataset``.
+    """
+    # Go through the cached loader: calling ``load_dataset`` directly would
+    # load the split again for every distinct ``index``.
+    ds = _load_dataset(dataset, subset, split=split, **kwargs)
+    # Reject negative indices too: they would wrap around to the end.
+    return ds[index] if 0 <= index < len(ds) else None

evals/download_data.py CHANGED Viewed

@@ -8,6 +8,7 @@ from pathlib import Path
 import sys
 import huggingface_hub
 from datasets import load_dataset, DatasetDict
 # Import fleurs DataFrame directly from its source module
 from datasets_.fleurs import fleurs
@@ -24,22 +25,25 @@ DATA_DIR = project_root / "data"
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
-GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"
-SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
 SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
-SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
 SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
 SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
-SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
 SPBLEU_DICT_NAME = "dictionary.txt"
 # --- Helper Functions ---
 def download_file(url, path: Path):
     """Downloads a file from a URL to a local path."""
     print(f"Downloading {url} to {path}...")
@@ -84,11 +88,16 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
                     break
             if target_zip_path:
-                with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
                     target.write(source.read())
                 print(f"Successfully extracted {target_filename}.")
             else:
-                print(f"Error: Could not find {target_filename} within the zip archive.")
     except zipfile.BadZipFile:
         print("Error: Downloaded file is not a valid zip archive.")
@@ -98,13 +107,14 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
 # --- Download Functions ---
 def download_fleurs_data():
     """Downloads Fleurs audio and text data."""
     print("\n--- Downloading Fleurs Data ---")
     FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
     # Use the fleurs_tag column from the imported DataFrame
-    fleurs_tags_list = fleurs['fleurs_tag'].tolist()
     if not fleurs_tags_list:
         print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
@@ -117,7 +127,9 @@ def download_fleurs_data():
         audio_dir = lang_dir / "audio"
         dev_tsv_path = lang_dir / "dev.tsv"
         dev_audio_archive_path = audio_dir / "dev.tar.gz"
-        audio_extracted_marker = audio_dir / "dev" # Check if extraction likely happened
         # Download TSV
         if not dev_tsv_path.exists():
@@ -129,15 +141,15 @@ def download_fleurs_data():
         # Download and Extract Audio
         if not audio_extracted_marker.exists():
             if not dev_audio_archive_path.exists():
-                 tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
-                 download_file(tar_url, dev_audio_archive_path)
             if dev_audio_archive_path.exists():
-                 extract_tar_gz(dev_audio_archive_path, audio_dir)
             else:
                 print(f"Audio archive missing, cannot extract for {lang_tag}")
         else:
-             print(f"Found extracted audio: {audio_extracted_marker}")
 def download_glottolog_data():
@@ -165,7 +177,9 @@ def download_scriptcodes_data():
     # The URL points to an HTML page, not a direct CSV link.
     # Manual download is likely required for ScriptCodes.csv.
     print(f"Cannot automatically download from {SCRIPTCODES_URL}")
-    print(f"Please manually download the ISO 15924 codes list (often available as a .txt file)")
     print("from the Unicode website or related sources and save it as:")
     print(f"{SCRIPTCODES_TARGET_FILE}")
     if SCRIPTCODES_TARGET_FILE.exists():
@@ -196,21 +210,24 @@ def download_spbleu_data():
 # --- Main Execution ---
 def main():
     """Runs all download functions and the conversion step."""
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)
-    #download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()
     download_spbleu_data()
     print("\nData download process finished.")
     print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
-    print("Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well")
     print("in 'evals/datasets_/flores.py' to be read correctly.")
 if __name__ == "__main__":
-    main()

 import sys
 import huggingface_hub
 from datasets import load_dataset, DatasetDict
 # Import fleurs DataFrame directly from its source module
 from datasets_.fleurs import fleurs
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
+GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"
+SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
 SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
+SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
 SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
 SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
+SPBLEU_DICT_URL = (
+    "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
+)
 SPBLEU_DICT_NAME = "dictionary.txt"
 # --- Helper Functions ---
 def download_file(url, path: Path):
     """Downloads a file from a URL to a local path."""
     print(f"Downloading {url} to {path}...")
                     break
             if target_zip_path:
+                with (
+                    z.open(target_zip_path) as source,
+                    open(extract_path / target_filename, "wb") as target,
+                ):
                     target.write(source.read())
                 print(f"Successfully extracted {target_filename}.")
             else:
+                print(
+                    f"Error: Could not find {target_filename} within the zip archive."
+                )
     except zipfile.BadZipFile:
         print("Error: Downloaded file is not a valid zip archive.")
 # --- Download Functions ---
 def download_fleurs_data():
     """Downloads Fleurs audio and text data."""
     print("\n--- Downloading Fleurs Data ---")
     FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
     # Use the fleurs_tag column from the imported DataFrame
+    fleurs_tags_list = fleurs["fleurs_tag"].tolist()
     if not fleurs_tags_list:
         print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
         audio_dir = lang_dir / "audio"
         dev_tsv_path = lang_dir / "dev.tsv"
         dev_audio_archive_path = audio_dir / "dev.tar.gz"
+        audio_extracted_marker = (
+            audio_dir / "dev"
+        )  # Check if extraction likely happened
         # Download TSV
         if not dev_tsv_path.exists():
         # Download and Extract Audio
         if not audio_extracted_marker.exists():
             if not dev_audio_archive_path.exists():
+                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
+                download_file(tar_url, dev_audio_archive_path)
             if dev_audio_archive_path.exists():
+                extract_tar_gz(dev_audio_archive_path, audio_dir)
             else:
                 print(f"Audio archive missing, cannot extract for {lang_tag}")
         else:
+            print(f"Found extracted audio: {audio_extracted_marker}")
 def download_glottolog_data():
     # The URL points to an HTML page, not a direct CSV link.
     # Manual download is likely required for ScriptCodes.csv.
     print(f"Cannot automatically download from {SCRIPTCODES_URL}")
+    print(
+        "Please manually download the ISO 15924 codes list (often available as a .txt file)"
+    )
     print("from the Unicode website or related sources and save it as:")
     print(f"{SCRIPTCODES_TARGET_FILE}")
     if SCRIPTCODES_TARGET_FILE.exists():
 # --- Main Execution ---
 def main():
+    """Create the data directory, run the enabled download steps, print notes.
+
+    Glottolog, script-code and spBLEU downloads are run in that order; the
+    Fleurs download is currently disabled (commented out below).
+    """
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)
+    # download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()
     download_spbleu_data()
     print("\nData download process finished.")
     print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
+    print(
+        "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
+    )
     print("in 'evals/datasets_/flores.py' to be read correctly.")
 if __name__ == "__main__":
+    main()

evals/languages.py CHANGED Viewed

@@ -31,6 +31,7 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -39,6 +40,7 @@ def language_family(bcp_47):
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]
 languages["family"] = languages["bcp_47"].apply(language_family)
 # load script codes and names
@@ -46,6 +48,7 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
     columns={"Code": "iso15924", "English Name": "script_name"}
 )
 def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]

     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]
 languages["family"] = languages["bcp_47"].apply(language_family)
 # load script codes and names
     columns={"Code": "iso15924", "English Name": "script_name"}
 )
 def script_name(iso15924):
+    # Look up the ``script_name`` of the first row in ``scripts`` whose
+    # ``iso15924`` column equals the given code. Indexing ``.values[0]``
+    # raises IndexError when no row matches.
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]

evals/main.py CHANGED Viewed

@@ -1,62 +1,190 @@
 import asyncio
 import pandas as pd
-from languages import languages
 from models import models
 from tasks import tasks
-from tqdm.asyncio import tqdm_asyncio
-# ===== config =====
-n_sentences = 10
-# ===== run evaluation and aggregate results =====
-async def evaluate():
-    # FIXME we should not need this for-loop, but it helps
-    for n_languages in range(10, 101, 10):
-        print(f"running evaluations for {n_languages} languages")
         old_results = pd.read_json("results.json")
-        old_models = pd.read_json("models.json")
-        # get all combinations of model, language and task
-        combis = [
-            (model, lang.bcp_47, task_name)
-            for model in models["id"]
-            for lang in languages.iloc[:n_languages].itertuples()
-            for task_name, task in tasks.items()
-            if task_name in models[models["id"] == model]["tasks"].iloc[0]
-        ]
-        # filter out combinations that have already been evaluated
-        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-        # run evaluations
-        results = [
-            tasks[task_name](model, bcp_47, i)
-            for i in range(n_sentences)
-            for model, bcp_47, task_name in combis.itertuples(index=False)
-        ]
-        results = await tqdm_asyncio.gather(*results, miniters=1)
-        results = [r for group in results for r in group]
-        args = dict(orient="records", indent=2, force_ascii=False)
-        if results:
-            # aggregate results
-            results = pd.DataFrame(results)
-            results = (
-                results.groupby(["model", "bcp_47", "task", "metric"])
-                .agg({"score": "mean"})
-                .reset_index()
             )
-            # save results
-            results = pd.concat([old_results, results])
-            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
-            results.to_json("results.json", **args)
-        # save up-to-date info on models and languages
-        all_models = pd.concat([pd.DataFrame(models), old_models])
-        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-        all_models.to_json("models.json", **args)
-        pd.DataFrame(languages).to_json("languages.json", **args)
 if __name__ == "__main__":

 import asyncio
 import pandas as pd
+import time
+from datetime import datetime, timedelta
 from models import models
 from tasks import tasks
+from languages import languages
+import os
async def evaluate():
    """Run all pending (model, language, task) evaluations and aggregate the scores.

    Configuration is read from environment variables:

    - ``N_SENTENCES`` (default 20): number of sentences evaluated per task.
    - ``MAX_LANGUAGES`` (default 150): only the first N rows of ``languages`` are used.
    - ``SINGLE_MODEL`` (optional): restrict the run to this model id and abort on
      the first failing task instead of collecting errors.
    - ``TEST`` (``1``/``true``/``yes``): ``results.json`` is neither read nor
      written, and ``models.json`` / ``languages.json`` are not written.

    Combinations already present in ``results.json`` are skipped. Failed tasks of a
    run that continues past errors are written to ``errors.log`` as CSV.

    Returns:
        A DataFrame with the mean score per (model, bcp_47, task, metric, origin),
        merged with the previously stored results unless in test mode. An empty
        DataFrame is returned when the requested model is unknown, when the run
        was aborted by an error, or when no task produced a result.
    """
    import csv  # local import: only needed for writing the error log

    # Configuration - easily adjustable defaults
    n_sentences = int(os.environ.get("N_SENTENCES", 20))
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))
    single_model = os.environ.get("SINGLE_MODEL")
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")

    # The unfiltered tables are what gets written to models.json / languages.json,
    # even when SINGLE_MODEL restricts which models are evaluated.
    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)
    models_df = original_models_df.copy()
    top_languages = languages.head(max_languages)

    # Filter to single model if specified (only affects evaluation, not saving)
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()

    # Load existing results to avoid re-evaluation (skip in test mode)
    if test_mode:
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
        )
    else:
        old_results = pd.read_json("results.json")

    # Look up each model's task list once instead of once per combination;
    # setdefault keeps the first row for a model id, like the former `.iloc[0]`.
    tasks_by_model = {}
    for model_id, model_tasks in zip(models_df["id"], models_df["tasks"]):
        tasks_by_model.setdefault(model_id, model_tasks)

    # Combinations that already have stored results are not evaluated again.
    completed = set()
    if not old_results.empty:
        completed = set(
            zip(old_results["model"], old_results["bcp_47"], old_results["task"])
        )
    pending = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name in tasks
        if task_name in tasks_by_model[model]
        and (model, lang.bcp_47, task_name) not in completed
    ]

    # One work item per sentence and pending combination. The task *name* is kept
    # (not just the function) so that failures can be attributed to a task.
    all_tasks = [
        (task_name, model, bcp_47, sentence_nr)
        for sentence_nr in range(n_sentences)
        for model, bcp_47, task_name in pending
    ]
    print(f"Running {len(all_tasks)} evaluation tasks...")

    # For single model runs, we stop immediately on first API error to inspect.
    # For full evaluations, we continue despite errors to get maximum coverage.
    # bool() keeps this consistent with the `if single_model:` filter above when
    # SINGLE_MODEL is set to an empty string.
    stop_on_error = bool(single_model)

    # Process tasks in batches to limit how many coroutines are alive at once.
    batch_size = 1000
    results = []
    try:
        for start in range(0, len(all_tasks), batch_size):
            batch = all_tasks[start : start + batch_size]
            batch_results = await asyncio.gather(
                *[
                    tasks[task_name](model, bcp_47, sentence_nr)
                    for task_name, model, bcp_47, sentence_nr in batch
                ],
                return_exceptions=not stop_on_error,
            )
            results.extend(batch_results)
    except Exception as e:
        # Only reachable when stop_on_error is set (single model runs).
        print("EVALUATION STOPPED - API Error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        return pd.DataFrame()

    # Separate successful results from errors; `results` is index-aligned with
    # `all_tasks` because batches are gathered and appended in order.
    valid_results = []
    errors = []
    for (task_name, model, bcp_47, _), outcome in zip(all_tasks, results):
        if isinstance(outcome, BaseException):
            errors.append((model, bcp_47, task_name, str(outcome)))
        elif isinstance(outcome, list):
            valid_results.extend(outcome)
        elif outcome is not None:
            valid_results.append(outcome)

    # Write errors as real CSV so that commas/newlines in messages stay parseable
    # and the header matches the columns that are actually written.
    if errors:
        with open("errors.log", "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["model", "bcp_47", "task", "error"])
            writer.writerows(errors)

    # Track model completion (TO BE DELETED - was for local run only)
    reported_models = set()
    for result in valid_results:
        if isinstance(result, dict) and "model" in result:
            if result["model"] not in reported_models:
                reported_models.add(result["model"])
                print(f"Completed: {result['model']}")
    print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")

    if not valid_results:
        return pd.DataFrame()

    # Aggregate per-sentence scores into one mean score per combination
    results_df = (
        pd.DataFrame(valid_results)
        .groupby(["model", "bcp_47", "task", "metric", "origin"])
        .agg({"score": "mean"})
        .reset_index()
    )

    # Save results (skipped in test mode as we do not want to overwrite existing results)
    if not test_mode:
        args = dict(orient="records", indent=2, force_ascii=False)
        # Merge with existing results; on duplicates the stored row wins
        if not old_results.empty:
            results_df = pd.concat([old_results, results_df])
            results_df = results_df.drop_duplicates(
                subset=["model", "bcp_47", "task", "metric", "origin"]
            )
        results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
        results_df.to_json("results.json", **args)
        # Save model and language info (always save complete metadata, not filtered)
        # NOTE(review): metadata is only refreshed when this run produced new
        # results - confirm that this is intended.
        original_models_df.to_json("models.json", **args)
        original_languages_df.to_json("languages.json", **args)
    else:
        print("TEST MODE: Skipping results saving")

    elapsed = time.time() - start_time
    print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
    return results_df
 if __name__ == "__main__":

evals/models.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import json
 import re
 from collections import defaultdict
@@ -7,7 +8,6 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
-from elevenlabs import AsyncElevenLabs
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
@@ -22,14 +22,17 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
-    # "openai/gpt-4o-2024-11-20", # 10$
-    "openai/gpt-3.5-turbo-0613",  # 2$
-    # "openai/gpt-3.5-turbo",  # 1.5$
-    # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "mistralai/mistral-saba",  # 0.6$
     "mistralai/mistral-nemo",  # 0.08$
@@ -48,10 +51,13 @@ important_models = [
     "microsoft/phi-4",  # 0.07$
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
 ]
 blocklist = [
     "google/gemini-2.5-pro-preview",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",
@@ -59,6 +65,7 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
 ]
 transcription_models = [
@@ -93,28 +100,81 @@ def get_model(permaslug):
 @cache
 def get_historical_popular_models(date: date):
-    raw = get("https://openrouter.ai/rankings").text
-    data = re.search(r'{\\"data\\":(.*),\\"isPercentage\\"', raw).group(1)
-    data = json.loads(data.replace("\\", ""))
-    counts = defaultdict(int)
-    for day in data:
-        for model, count in day["ys"].items():
-            if model.startswith("openrouter") or model == "Others":
-                continue
-            counts[model.split(":")[0]] += count
-    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
-    models = [get_model(model) for model, _ in counts]
-    return [m for m in models if m]
 @cache
 def get_current_popular_models(date: date):
-    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
-    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
-    data = json.loads(data)
-    data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
-    models = [get_model(model["model_permaslug"]) for model in data]
-    return [m for m in models if m]
 def get_translation_models():
@@ -161,7 +221,10 @@ async def complete(**kwargs) -> str | None:
 translate_client = translate.Client()
-google_supported_languages = [l["language"] for l in translate_client.get_languages()]
 @cache
@@ -231,12 +294,15 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
-        license = (
-            (info.card_data.license or "")
-            .replace("-", " ")
-            .replace("mit", "MIT")
-            .title()
-        )
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,
@@ -249,8 +315,14 @@ def get_hf_metadata(row):
 def get_cost(row):
-    cost = float(row["endpoint"]["pricing"]["completion"])
-    return round(cost * 1_000_000, 2)
 @cache
@@ -260,8 +332,17 @@ def load_models(date: date):
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
-    models = set(important_models + popular_models) - set(blocklist)
-    models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
@@ -281,9 +362,18 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     models["tasks"] = [
-        ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
     return models

+import asyncio
 import json
 import re
 from collections import defaultdict
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    "openai/gpt-5",
+    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
+    "openai/gpt-4o-2024-11-20",  # 10$
+    "openai/gpt-oss-120b",
+    "anthropic/claude-3.7-sonnet",  # 15$ - added for full coverage
+    "anthropic/claude-sonnet-4",  # 15$ - added for full coverage
+    "anthropic/claude-opus-4.1",  # 15$ - added for full coverage
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
     "mistralai/mistral-saba",  # 0.6$
     "mistralai/mistral-nemo",  # 0.08$
     "microsoft/phi-4",  # 0.07$
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
+    "moonshotai/kimi-k2",  # 0.6$ - added to prevent missing from models.json
+    "x-ai/grok-4",
 ]
 blocklist = [
     "google/gemini-2.5-pro-preview",
+    "google/gemini-2.5-pro",
     "google/gemini-2.5-flash-preview",
     "google/gemini-2.5-flash-lite-preview",
     "google/gemini-2.5-flash-preview-04-17",
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
+    "perplexity/sonar-deep-research",
 ]
 transcription_models = [
 def _top_models_from_rankings(raw, limit):
     """Extract the ``limit`` most-counted models from the text of a rankings page.
     ``raw`` is searched for escaped ``\"count\":<number> ... \"model_permaslug\":\"<slug>\"``
     pairs. Counts are summed per base model (the part of the slug before ``:``),
     slugs starting with ``openrouter`` and the ``Others`` bucket are ignored.
     Returns a list of ``{"slug": str, "count": int}`` dicts sorted by descending
     count; the list is empty when nothing matches.
     """
     # Format in the page source: \"count\":number,...,\"model_permaslug\":\"model/name\"
     pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
     model_counts = defaultdict(float)
     for count_str, model_slug in re.findall(pattern, raw):
         if model_slug.startswith("openrouter") or model_slug == "Others":
             continue
         # Remove variant suffixes (e.g. ":free") for aggregation
         model_counts[model_slug.split(":")[0]] += float(count_str)
     ranked = sorted(model_counts.items(), key=lambda item: item[1], reverse=True)
     return [{"slug": slug, "count": int(count)} for slug, count in ranked[:limit]]
 @cache
 def get_historical_popular_models(date: date):
     """Return up to 20 models ranked by count on the OpenRouter rankings page.
     ``date`` is not used in the body; presumably it only serves as the key for
     the ``@cache`` decorator - TODO confirm. Each entry is a
     ``{"slug": str, "count": int}`` dict. On any failure a warning is printed
     and an empty list is returned.
     """
     try:
         raw = get("https://openrouter.ai/rankings").text
         return _top_models_from_rankings(raw, limit=20)
     except Exception as e:
         # Best effort: the rankings page is scraped, so failures must not abort
         # model loading - but they should be visible instead of silently ignored.
         # NOTE(review): the empty result is still handed to the cache decorator.
         print(f"Warning: could not load historical OpenRouter rankings: {e}")
         return []
 @cache
 def get_current_popular_models(date: date):
     """Return up to 10 models ranked by count on the daily OpenRouter rankings page.
     ``date`` is not used in the body; presumably it only serves as the key for
     the ``@cache`` decorator - TODO confirm. Each entry is a
     ``{"slug": str, "count": int}`` dict. On any failure a warning is printed
     and an empty list is returned.
     """
     try:
         raw = get("https://openrouter.ai/rankings?view=day").text
         return _top_models_from_rankings(raw, limit=10)
     except Exception as e:
         # Best effort, see get_historical_popular_models.
         print(f"Warning: could not load current OpenRouter rankings: {e}")
         return []
 def get_translation_models():
 translate_client = translate.Client()
def get_google_supported_languages():
    """Return the ``language`` value of every entry the Google Translate client lists.

    The list is fetched from the module-level ``translate_client`` on each call.
    """
    language_codes = []
    for entry in translate_client.get_languages():
        language_codes.append(entry["language"])
    return language_codes
 @cache
         return empty
     try:
         info = api.model_info(id)
+        license = ""
+        if (
+            info.card_data
+            and hasattr(info.card_data, "license")
+            and info.card_data.license
+        ):
+            license = (
+                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
+            )
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,
 def get_cost(row):
     """
     Return the completion price scaled by one million, rounded to 2 decimals.
     row: a row from the OpenRouter models dataframe; the price is read from
     row["endpoint"]["pricing"]["completion"] (presumably a per-token price in
     USD - TODO confirm).
     Returns None when the endpoint or pricing information is missing, or when
     the price cannot be parsed as a number.
     """
     try:
         cost = float(row["endpoint"]["pricing"]["completion"])
     except (TypeError, KeyError, ValueError):
         # TypeError: endpoint/pricing/price is None; KeyError: key is missing;
         # ValueError: price is a non-numeric string.
         return None
     return round(cost * 1_000_000, 2)
 @cache
         + get_current_popular_models(date.today())[:10]
     )
     popular_models = [m["slug"] for m in popular_models]
+    all_model_candidates = set(important_models + popular_models) - set(blocklist)
+    # Validate models exist on OpenRouter before including them
+    valid_models = []
+    for model_id in all_model_candidates:
+        metadata = get_or_metadata(model_id)
+        if metadata is not None:
+            valid_models.append(model_id)
+    models = pd.DataFrame(sorted(valid_models), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
+    # Filter out expensive models to keep costs reasonable
+    models = models[models["cost"] <= 15.0].reset_index(drop=True)
     models["tasks"] = [
+        [
+            "translation_from",
+            "translation_to",
+            "classification",
+            "mmlu",
+            "arc",
+            "truthfulqa",
+            "mgsm",
+        ]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
     return models

evals/plots.py CHANGED Viewed

@@ -9,34 +9,33 @@ df = pd.read_json("../results.json")
 df = df[df["metric"] != "chrf"]
 df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
 # Apply logit transformation to classification scores to reduce skewness
 def transform_classification_scores(row):
-    if row['task'] == 'classification':
         # Avoid division by zero and infinite values by clipping
-        score = np.clip(row['score'], 0.001, 0.999)
         # Apply logit transformation (log(p/(1-p)))
         return logit(score)
     else:
-        return row['score']
-df['score'] = df.apply(transform_classification_scores, axis=1)
 # Create a pivot table with tasks as columns and languages as rows
 pivot_df = df.pivot_table(
-    values='score',
-    index='bcp_47',
-    columns='task',
-    aggfunc='mean'
 )
 # Sort and filter tasks
 ordered_tasks = [
-    'translation_from',
-    'translation_to',
-    'classification',
-    'mmlu',
-    'arc',
-    'mgsm',
 ]
 # Drop 'truthfulqa' if present and reindex columns
 pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
@@ -46,29 +45,29 @@ correlation_matrix = pivot_df.corr()
 # Create the correlation plot
 plt.figure(figsize=(8, 6))
-# Create mask for upper triangle including diagonal to show only lower triangle
 mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
 # Create a heatmap
 sns.heatmap(
-    correlation_matrix,
-    annot=True,
-    cmap='Blues',
     center=0,
     square=True,
     mask=mask,
-    cbar_kws={"shrink": .8},
-    fmt='.3f'
 )
-plt.xlabel('Tasks', fontsize=12)
-plt.ylabel('Tasks', fontsize=12)
-plt.xticks(rotation=45, ha='right')
 plt.yticks(rotation=0)
 plt.tight_layout()
 # Save the plot
-plt.savefig('task_correlation_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()
 # Print correlation values for reference
@@ -77,56 +76,91 @@ print("Note: Classification scores have been logit-transformed to reduce skewnes
 print(correlation_matrix.round(3))
 # Also create a scatter plot matrix for pairwise relationships with highlighted languages
-highlighted_languages = ['en', 'zh', 'hi', 'es', 'ar']
 # Create color mapping
 def get_color_and_label(lang_code):
     if lang_code in highlighted_languages:
-        color_map = {'en': 'red', 'zh': 'blue', 'hi': 'green', 'es': 'orange', 'ar': 'purple'}
         return color_map[lang_code], lang_code
     else:
-        return 'lightgray', 'Other'
 # Create custom scatter plot matrix
 tasks = pivot_df.columns.tolist()
 n_tasks = len(tasks)
 fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
-fig.suptitle('Pairwise Task Performance', fontsize=16, fontweight='bold')
 # Create legend elements
 legend_elements = []
 for lang in highlighted_languages:
     color, _ = get_color_and_label(lang)
-    legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=8, label=lang))
-legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgray', markersize=8, label='Other'))
 for i, task_y in enumerate(tasks):
     for j, task_x in enumerate(tasks):
         ax = axes[i, j]
         if i == j:
             # Diagonal: histogram
             task_data = pivot_df[task_y].dropna()
             colors = [get_color_and_label(lang)[0] for lang in task_data.index]
-            ax.hist(task_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
-            ax.set_title(f'{task_y}', fontsize=10)
         else:
             # Off-diagonal: scatter plot
             for lang_code in pivot_df.index:
-                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(pivot_df.loc[lang_code, task_y]):
                     color, _ = get_color_and_label(lang_code)
                     alpha = 0.8 if lang_code in highlighted_languages else 0.3
                     size = 50 if lang_code in highlighted_languages else 20
-                    ax.scatter(pivot_df.loc[lang_code, task_x], pivot_df.loc[lang_code, task_y],
-                             c=color, alpha=alpha, s=size)
         # Set labels
         if i == n_tasks - 1:
             ax.set_xlabel(task_x, fontsize=10)
         if j == 0:
             ax.set_ylabel(task_y, fontsize=10)
         # Remove tick labels except for edges
         if i != n_tasks - 1:
             ax.set_xticklabels([])
@@ -136,15 +170,15 @@ for i, task_y in enumerate(tasks):
 # Add legend
 fig.legend(
     handles=legend_elements,
-    loc='lower center',
     bbox_to_anchor=(0.5, -0.05),
     ncol=len(legend_elements),
     frameon=False,
     fontsize=10,
     handletextpad=0.5,
-    columnspacing=1.0
 )
 plt.tight_layout()
-plt.savefig('task_scatter_matrix.png', dpi=300, bbox_inches='tight')
 plt.show()

 df = df[df["metric"] != "chrf"]
 df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
 # Apply logit transformation to classification scores to reduce skewness
 def transform_classification_scores(row):
     """Return the logit of a classification score; pass other scores through.
     ``row`` must provide ``"task"`` and ``"score"``. Only rows whose task is
     ``"classification"`` are transformed.
     """
     if row["task"] != "classification":
         return row["score"]
     # Clip to [0.001, 0.999] so that log(p/(1-p)) stays finite
     clipped_score = np.clip(row["score"], 0.001, 0.999)
     return logit(clipped_score)
+df["score"] = df.apply(transform_classification_scores, axis=1)
 # Create a pivot table with tasks as columns and languages as rows
 pivot_df = df.pivot_table(
+    values="score", index="bcp_47", columns="task", aggfunc="mean"
 )
 # Sort and filter tasks
 ordered_tasks = [
+    "translation_from",
+    "translation_to",
+    "classification",
+    "mmlu",
+    "arc",
+    "mgsm",
 ]
 # Drop 'truthfulqa' if present and reindex columns
 pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
 # Create the correlation plot
 plt.figure(figsize=(8, 6))
+# Create mask for upper triangle including diagonal to show only lower triangle
 mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
 # Create a heatmap
 sns.heatmap(
+    correlation_matrix,
+    annot=True,
+    cmap="Blues",
     center=0,
     square=True,
     mask=mask,
+    cbar_kws={"shrink": 0.8},
+    fmt=".3f",
 )
+plt.xlabel("Tasks", fontsize=12)
+plt.ylabel("Tasks", fontsize=12)
+plt.xticks(rotation=45, ha="right")
 plt.yticks(rotation=0)
 plt.tight_layout()
 # Save the plot
+plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
 plt.show()
 # Print correlation values for reference
 print(correlation_matrix.round(3))
 # Also create a scatter plot matrix for pairwise relationships with highlighted languages
+highlighted_languages = ["en", "zh", "hi", "es", "ar"]
 # Create color mapping
 def get_color_and_label(lang_code):
     """Return the ``(color, label)`` pair used to draw a language in the plots.
     Languages listed in the module-level ``highlighted_languages`` get their own
     color and are labelled with their code; every other language is drawn in
     light gray with the label ``"Other"``.
     """
     if lang_code not in highlighted_languages:
         return "lightgray", "Other"
     highlight_colors = dict(
         en="red",
         zh="blue",
         hi="green",
         es="orange",
         ar="purple",
     )
     return highlight_colors[lang_code], lang_code
 # Create custom scatter plot matrix
 tasks = pivot_df.columns.tolist()
 n_tasks = len(tasks)
 fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
+fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")
 # Create legend elements
 legend_elements = []
 for lang in highlighted_languages:
     color, _ = get_color_and_label(lang)
+    legend_elements.append(
+        plt.Line2D(
+            [0],
+            [0],
+            marker="o",
+            color="w",
+            markerfacecolor=color,
+            markersize=8,
+            label=lang,
+        )
+    )
+legend_elements.append(
+    plt.Line2D(
+        [0],
+        [0],
+        marker="o",
+        color="w",
+        markerfacecolor="lightgray",
+        markersize=8,
+        label="Other",
+    )
+)
 for i, task_y in enumerate(tasks):
     for j, task_x in enumerate(tasks):
         ax = axes[i, j]
         if i == j:
             # Diagonal: histogram
             task_data = pivot_df[task_y].dropna()
             colors = [get_color_and_label(lang)[0] for lang in task_data.index]
+            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
+            ax.set_title(f"{task_y}", fontsize=10)
         else:
             # Off-diagonal: scatter plot
             for lang_code in pivot_df.index:
+                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
+                    pivot_df.loc[lang_code, task_y]
+                ):
                     color, _ = get_color_and_label(lang_code)
                     alpha = 0.8 if lang_code in highlighted_languages else 0.3
                     size = 50 if lang_code in highlighted_languages else 20
+                    ax.scatter(
+                        pivot_df.loc[lang_code, task_x],
+                        pivot_df.loc[lang_code, task_y],
+                        c=color,
+                        alpha=alpha,
+                        s=size,
+                    )
         # Set labels
         if i == n_tasks - 1:
             ax.set_xlabel(task_x, fontsize=10)
         if j == 0:
             ax.set_ylabel(task_y, fontsize=10)
         # Remove tick labels except for edges
         if i != n_tasks - 1:
             ax.set_xticklabels([])
 # Add legend
 fig.legend(
     handles=legend_elements,
+    loc="lower center",
     bbox_to_anchor=(0.5, -0.05),
     ncol=len(legend_elements),
     frameon=False,
     fontsize=10,
     handletextpad=0.5,
+    columnspacing=1.0,
 )
 plt.tight_layout()
+plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
 plt.show()

evals/tasks.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import random
 from functools import partial
 from textwrap import dedent
@@ -5,10 +6,10 @@ from textwrap import dedent
 import evaluate
 import pandas as pd
 import sentencepiece as spm
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
-from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
@@ -47,6 +48,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
     if model == "google/translate-v2":
         original_language = closest_supported_match(
             original_language, supported_languages
@@ -66,7 +68,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             messages=[
                 {
                     "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
                 }
             ],
             temperature=0,
@@ -91,6 +93,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
             "sentence_nr": sentence_nr,
         }
         for metric, score in (
@@ -112,57 +115,33 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    examples = pd.concat(
-        [
-            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
-            for t in top_topics
-        ]
-    ).sample(frac=1, random_state=nr)
-    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
-        frac=1, random_state=42
-    )
-    test_paragraph = test_paragraphs.iloc[nr]
-    def format_prompt(text):
-        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"
-    messages = []
-    for example in examples.itertuples():
-        messages += [
-            {"role": "user", "content": format_prompt(example.text)},
-            {"role": "assistant", "content": example.topic},
-        ]
-    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
-    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
-    try:
-        pred = await complete(
-            model=model,
-            messages=[
-                *messages,
-                {
-                    "role": "user",
-                    "content": format_prompt(test_paragraph.text),
-                },
-            ],
-            temperature=0,
-            max_tokens=30,
-        )
-        true = test_paragraph.topic
-        others = [t for t in top_topics if t != true]
-        acc = (
-            int(
-                pred.startswith(true)
-                or (true in pred and not any(o in pred for o in others))
-            )
-            if pred
-            else 0
         )
-    except Exception as e:
-        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
-            print(f"Max tokens exceeded for {model} in {bcp_47}")
-            acc = 0
-        else:
-            raise e
     return [
         {
             "model": model,
@@ -170,6 +149,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
@@ -232,39 +212,38 @@ def format_multiple_choice(item):
     A: {item["choices"][0]}
     B: {item["choices"][1]}
     C: {item["choices"][2]}
-    D: {item["choices"][3]}
-    A|B|C|D?"""
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
         return []
-    messages = []
-    for example in examples:
-        messages += [
-            {"role": "user", "content": format_multiple_choice(example)},
-            {"role": "assistant", "content": example["answer"]},
-        ]
-    messages += [{"role": "user", "content": format_multiple_choice(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
     return [
         {
             "model": model,
@@ -272,39 +251,40 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
-    messages = []
-    for example in examples:
-        messages += [
-            {"role": "user", "content": format_multiple_choice(example)},
-            {"role": "assistant", "content": example["answer"]},
-        ]
-    messages += [{"role": "user", "content": format_multiple_choice(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
     return [
         {
             "model": model,
@@ -312,6 +292,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
@@ -332,40 +313,42 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
-    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text
 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name, examples, task = load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-    task = shuffle_choices_and_labels(task)
-    answer = letters[task["labels"].index(1)]
-    messages = []
-    for example in examples:
-        example = shuffle_choices_and_labels(example)
-        messages += [
-            {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
-            {"role": "assistant", "content": letters[example["labels"].index(1)]},
-        ]
-    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == answer)
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
     return [
         {
             "model": model,
@@ -373,30 +356,36 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
-    system_prompt = """
-    Solve the math problem. Use reasoning, and finally give the answer as a number.
-    Response format: <reasoning> #### <number>
-    """
-    system_prompt = dedent(system_prompt).strip()
-    ds_slug, question = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
     response = await complete(
         model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": question["question"]},
-        ],
         temperature=0,
         max_tokens=1024,
     )
-    if response and len(response.split("####")) == 2:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
@@ -409,6 +398,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
             "sentence_nr": nr,
         }
     ]
@@ -449,10 +439,8 @@ tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
-    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
-    # "asr": transcribe_and_evaluate,
 }

+import asyncio
 import random
 from functools import partial
 from textwrap import dedent
 import evaluate
 import pandas as pd
 import sentencepiece as spm
+from datasets_.arc import load_uhura_arc_easy
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
     if model == "google/translate-v2":
         original_language = closest_supported_match(
             original_language, supported_languages
             messages=[
                 {
                     "role": "user",
+                    "content": translation_prompt,
                 }
             ],
             temperature=0,
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
         }
         for metric, score in (
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
+    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
+    prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
+Reply with only the topic name.
+Text:
+{test_paragraph.text}
+"""
+    response = await complete(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0,
+        max_tokens=30,
+    )
+    pred = response.lower().strip() if response else ""
+    true = test_paragraph.topic.lower().strip()
+    others = [t for t in top_topics if t != true]
+    acc = (
+        int(
+            pred.startswith(true)
+            or (true in pred and not any(o in pred for o in others))
         )
+        if pred
+        else 0
+    )
     return [
         {
             "model": model,
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": nr,
         }
     ]
     A: {item["choices"][0]}
     B: {item["choices"][1]}
     C: {item["choices"][2]}
+    D: {item["choices"][3]}"""
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
     if not task:
         return []
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+Response format: <reasoning> #### <letter>
+---
+{format_multiple_choice(task)}""",
+        },
+    ]
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
     return [
         {
             "model": model,
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,  # Add origin tag to results
             "sentence_nr": nr,
         }
     ]
 async def arc_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+Response format: <reasoning> #### <letter>
+---
+{format_multiple_choice(task)}""",
+        },
+    ]
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
     return [
         {
             "model": model,
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
     return text
 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
+    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
+    # Find the correct answer
+    correct_choice_index = task["labels"].index(1)
+    answer = letters[correct_choice_index]
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.
+Response format: <reasoning> #### <letter>
+---
+{format_multiple_choice_truthfulqa(task)}""",
+        },
+    ]
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,  # Increased for reasoning
+    )
+    if response and "####" in response:
+        pred_answer = response.split("####")[-1].strip()
+        acc = int(pred_answer[:1].upper() == answer)
+    else:
+        acc = 0
     return [
         {
             "model": model,
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]
 async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
+    messages = [
+        {
+            "role": "user",
+            "content": f"""Solve the following math problem. Reason step-by-step and then write the final answer as a number.
+Response format: <reasoning> #### <number>
+---
+{question["question"]}""",
+        },
+    ]
     response = await complete(
         model=model,
+        messages=messages,
         temperature=0,
         max_tokens=1024,
     )
+    if response and "####" in response:
         number = response.split("####")[1].strip()
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
             "task": "mgsm",
             "metric": "accuracy",
             "score": accuracy,
+            "origin": origin,
             "sentence_nr": nr,
         }
     ]
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
 }

evals/translate.py CHANGED Viewed

@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
-    translate_arc(languages)

 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
+    translate_arc(languages)

frontend/package-lock.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

frontend/package.json CHANGED Viewed

@@ -6,13 +6,12 @@
     "@observablehq/plot": "^0.6.17",
     "@testing-library/dom": "^10.4.0",
     "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^16.2.0",
     "@testing-library/user-event": "^13.5.0",
     "primeicons": "^7.0.0",
     "primereact": "^10.9.3",
-    "react": "^19.0.0",
-    "react-dom": "^19.0.0",
-    "react-scripts": "5.0.1",
     "topojson-simplify": "^3.0.3",
     "web-vitals": "^2.1.4"
   },
@@ -41,5 +40,8 @@
       "last 1 safari version"
     ]
   },
-  "proxy": "http://localhost:8000"
 }

     "@observablehq/plot": "^0.6.17",
     "@testing-library/dom": "^10.4.0",
     "@testing-library/jest-dom": "^6.6.3",
+    "@testing-library/react": "^15.0.0",
     "@testing-library/user-event": "^13.5.0",
     "primeicons": "^7.0.0",
     "primereact": "^10.9.3",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0",
     "topojson-simplify": "^3.0.3",
     "web-vitals": "^2.1.4"
   },
       "last 1 safari version"
     ]
   },
+  "proxy": "http://localhost:8000",
+  "devDependencies": {
+    "react-scripts": "^5.0.1"
+  }
 }

frontend/src/App.js CHANGED Viewed

@@ -19,9 +19,14 @@ function App () {
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
   useEffect(() => {
     fetch('/api/data', {
@@ -36,6 +41,7 @@ function App () {
       })
       .then(jsonData => {
         setData(jsonData)
         setLoading(false)
       })
       .catch(err => {
@@ -44,8 +50,27 @@ function App () {
       })
   }, [selectedLanguages])
   const [windowWidth, setWindowWidth] = useState(window.innerWidth)
   const [windowHeight, setWindowHeight] = useState(window.innerHeight)
   useEffect(() => {
     const handleResize = () => {
       setWindowWidth(window.innerWidth)
@@ -55,6 +80,44 @@ function App () {
     return () => window.removeEventListener('resize', handleResize)
   }, [])
   return (
     <PrimeReactProvider>
       <div
@@ -69,35 +132,50 @@ function App () {
           style={{
             backgroundColor: '#fff3cd',
             color: '#856404',
-            padding: '0.75rem 1.25rem',
             marginBottom: '1rem',
             border: '1px solid #ffeeba',
             borderRadius: '0.25rem',
-            textAlign: 'center'
           }}
         >
           <strong>Work in Progress:</strong> This dashboard is currently under
-          active development. Evaluation results are not yet final.
           <a
             href='https://github.com/datenlabor-bmz/ai-language-monitor'
             target='_blank'
             rel='noopener noreferrer'
             style={{
               textDecoration: 'none',
-              color: '#856404',
-              float: 'right',
-              fontSize: '1.2rem',
-              fontWeight: 'bold',
-              padding: '0 0.5rem',
-              borderRadius: '3px',
-              backgroundColor: 'rgba(255,255,255,0.3)'
             }}
           >
-            <i
-              className='pi pi-github'
-              title='View on GitHub'
-              style={{ marginRight: '0.3rem' }}
-            />
             GitHub
           </a>
         </div>
@@ -149,39 +227,88 @@ function App () {
           <div
             style={{
               display: 'flex',
-              gap: '1rem',
-              marginBottom: '1.5rem',
               flexWrap: 'wrap',
               justifyContent: 'center'
             }}
           >
-            <Button
-              label='📚 About this tool'
-              className='p-button-text'
               onClick={() => setAboutVisible(true)}
               style={{
-                color: '#666',
-                border: '1px solid #ddd',
-                padding: '0.5rem 1rem',
-                borderRadius: '4px',
-                fontSize: '0.9rem'
               }}
-            />
-            <Button
-              label='🚀 Add your model (soon)'
-              className='p-button-text'
               onClick={() => setContributeVisible(true)}
-              tooltip='This feature is on our roadmap and will be available soon.'
-              tooltipOptions={{ position: 'bottom' }}
               style={{
-                color: '#666',
-                border: '1px solid #ddd',
-                padding: '0.5rem 1rem',
-                borderRadius: '4px',
-                fontSize: '0.9rem'
               }}
-            />
           </div>
           {data && (
@@ -220,6 +347,7 @@ function App () {
                 data={data.model_table}
                 selectedLanguages={selectedLanguages}
                 allLanguages={data.language_table || []}
               />
               <LanguageTable
                 data={data.language_table}
@@ -248,20 +376,18 @@ function App () {
                     color: '#666'
                   }}
                 />
-                <Carousel
-                  value={[
-                    <WorldMap data={data.countries} />,
-                    <LanguagePlot data={data} />,
-                    <SpeakerPlot data={data} />,
-                    <HistoryPlot data={data} />,
-                    <CostPlot data={data} />
-                  ]}
-                  numScroll={1}
-                  numVisible={1}
-                  itemTemplate={item => item}
-                  circular
-                  style={{ width: '100%', minHeight: '650px' }}
-                />
               </div>
             </>
           )}
@@ -409,36 +535,16 @@ function App () {
           modal
           header={null}
         >
-          {data && (
             <div style={{ width: '100%', height: '100%' }}>
               <Carousel
-                value={[
-                  <WorldMap
-                    data={data.countries}
-                    width={windowWidth * 0.7}
-                    height={windowHeight * 0.6}
-                  />,
-                  <LanguagePlot
-                    data={data}
-                    width={windowWidth * 0.7}
-                    height={windowHeight * 0.6}
-                  />,
-                  <SpeakerPlot
-                    data={data}
-                    width={windowWidth * 0.7}
-                    height={windowHeight * 0.6}
-                  />,
-                  <HistoryPlot
-                    data={data}
-                    width={windowWidth * 0.7}
-                    height={windowHeight * 0.6}
-                  />,
-                  <CostPlot data={data} />
-                ]}
                 numScroll={1}
                 numVisible={1}
                 itemTemplate={item => item}
-                circular
                 style={{ width: '100%', height: 'calc(90vh - 120px)' }}
               />
             </div>
@@ -449,4 +555,4 @@ function App () {
   )
 }
-export default App

   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
+  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
+  // Add state for carousel items
+  const [carouselItems, setCarouselItems] = useState([])
+  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
   useEffect(() => {
     fetch('/api/data', {
       })
       .then(jsonData => {
         setData(jsonData)
+        setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
         setLoading(false)
       })
       .catch(err => {
       })
   }, [selectedLanguages])
+  // Create carousel items when data is loaded
+  useEffect(() => {
+    if (data) {
+      // Add a small delay to ensure components are ready
+      const timer = setTimeout(() => {
+        setCarouselItems([
+          <WorldMap key="worldmap-0" data={data.countries} allLanguages={data.language_table} width={750} height={500} />,
+          <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
+          <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
+          <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
+          <CostPlot key="costplot-4" data={data} width={750} height={500} />
+        ]);
+      }, 100);
+      return () => clearTimeout(timer);
+    }
+  }, [data])
   const [windowWidth, setWindowWidth] = useState(window.innerWidth)
   const [windowHeight, setWindowHeight] = useState(window.innerHeight)
   useEffect(() => {
     const handleResize = () => {
       setWindowWidth(window.innerWidth)
     return () => window.removeEventListener('resize', handleResize)
   }, [])
+  // Create full-screen carousel items when data or window size changes
+  useEffect(() => {
+    if (data) {
+      const timer = setTimeout(() => {
+        setFullScreenCarouselItems([
+          <WorldMap
+            key="fs-worldmap-0"
+            data={data.countries}
+            allLanguages={data.language_table}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <LanguagePlot
+            key="fs-langplot-1"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <SpeakerPlot
+            key="fs-speakerplot-2"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <HistoryPlot
+            key="fs-histplot-3"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
+        ]);
+      }, 100);
+      return () => clearTimeout(timer);
+    }
+  }, [data, windowWidth, windowHeight])
   return (
     <PrimeReactProvider>
       <div
           style={{
             backgroundColor: '#fff3cd',
             color: '#856404',
+            padding: '1rem 1.5rem',
             marginBottom: '1rem',
             border: '1px solid #ffeeba',
             borderRadius: '0.25rem',
+            textAlign: 'center',
+            lineHeight: '1.5',
+            position: 'relative'
           }}
         >
           <strong>Work in Progress:</strong> This dashboard is currently under
+          active development. Evaluation results are not yet final. Note that the visualised results currently stem from sampling 20 instances per combination of model, task, and language. We have evaluated 139 languages across 41 models and 7 tasks, totaling over 300,000 individual evaluations. Only the top 150 languages by speaker count are included in the current evaluation scope. More extensive evaluation runs will be released later this year.
+        </div>
+        <div
+          style={{
+            display: 'flex',
+            justifyContent: 'flex-end',
+            padding: '0 1.5rem',
+            marginBottom: '1rem'
+          }}
+        >
           <a
             href='https://github.com/datenlabor-bmz/ai-language-monitor'
             target='_blank'
             rel='noopener noreferrer'
             style={{
               textDecoration: 'none',
+              color: '#6c757d',
+              fontSize: '1rem',
+              fontWeight: '500',
+              padding: '0.5rem 1rem',
+              borderRadius: '0.375rem',
+              backgroundColor: '#f8f9fa',
+              border: '1px solid #e9ecef',
+              display: 'flex',
+              alignItems: 'center',
+              gap: '0.5rem',
+              transition: 'all 0.2s ease',
+              ':hover': {
+                backgroundColor: '#e9ecef',
+                color: '#495057'
+              }
             }}
           >
+            <i className='pi pi-github' title='View on GitHub' />
             GitHub
           </a>
         </div>
           <div
             style={{
               display: 'flex',
+              gap: '0.75rem',
+              marginBottom: '2rem',
               flexWrap: 'wrap',
               justifyContent: 'center'
             }}
           >
+            <button
               onClick={() => setAboutVisible(true)}
               style={{
+                background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
+                color: 'white',
+                border: 'none',
+                padding: '0.75rem 1.5rem',
+                borderRadius: '12px',
+                fontSize: '0.95rem',
+                fontWeight: '500',
+                cursor: 'pointer',
+                display: 'flex',
+                alignItems: 'center',
+                gap: '0.5rem',
+                boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
+                transition: 'all 0.3s ease',
+                ':hover': {
+                  transform: 'translateY(-2px)',
+                  boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
+                }
               }}
+              onMouseEnter={(e) => {
+                e.target.style.transform = 'translateY(-2px)';
+                e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
+              }}
+              onMouseLeave={(e) => {
+                e.target.style.transform = 'translateY(0)';
+                e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
+              }}
+            >
+              <span style={{ fontSize: '1.1rem' }}>📚</span>
+              About this tool
+            </button>
+            <button
               onClick={() => setContributeVisible(true)}
+              title='This feature is on our roadmap and will be available soon.'
               style={{
+                background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
+                color: '#6b46c1',
+                border: 'none',
+                padding: '0.75rem 1.5rem',
+                borderRadius: '12px',
+                fontSize: '0.95rem',
+                fontWeight: '500',
+                cursor: 'pointer',
+                display: 'flex',
+                alignItems: 'center',
+                gap: '0.5rem',
+                boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
+                transition: 'all 0.3s ease',
+                position: 'relative',
+                overflow: 'hidden'
               }}
+              onMouseEnter={(e) => {
+                e.target.style.transform = 'translateY(-2px)';
+                e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
+              }}
+              onMouseLeave={(e) => {
+                e.target.style.transform = 'translateY(0)';
+                e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
+              }}
+            >
+              <span style={{ fontSize: '1.1rem' }}>🚀</span>
+              Add your model
+              <span style={{
+                fontSize: '0.75rem',
+                backgroundColor: 'rgba(107, 70, 193, 0.15)',
+                padding: '0.2rem 0.5rem',
+                borderRadius: '6px',
+                marginLeft: '0.5rem',
+                fontWeight: '600'
+              }}>
+                soon
+              </span>
+            </button>
           </div>
           {data && (
                 data={data.model_table}
                 selectedLanguages={selectedLanguages}
                 allLanguages={data.language_table || []}
+                machineTranslatedMetrics={machineTranslatedMetrics}
               />
               <LanguageTable
                 data={data.language_table}
                     color: '#666'
                   }}
                 />
+                {carouselItems.length > 0 && (
+                  <Carousel
+                    key={`main-carousel-${carouselItems.length}-${Date.now()}`}
+                    value={carouselItems}
+                    numScroll={1}
+                    numVisible={1}
+                    itemTemplate={item => item}
+                    circular={false}
+                    activeIndex={0}
+                    style={{ width: '100%', minHeight: '650px' }}
+                  />
+                )}
               </div>
             </>
           )}
           modal
           header={null}
         >
+          {fullScreenCarouselItems.length > 0 && (
             <div style={{ width: '100%', height: '100%' }}>
               <Carousel
+                key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
+                value={fullScreenCarouselItems}
                 numScroll={1}
                 numVisible={1}
                 itemTemplate={item => item}
+                circular={false}
+                activeIndex={0}
                 style={{ width: '100%', height: 'calc(90vh - 120px)' }}
               />
             </div>
   )
 }
+export default App

frontend/src/components/HistoryPlot.js CHANGED Viewed

@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
             ...models.filter(d => d.newRecord),
             {
               creation_date: new Date(),
-              maxAverage: models[models.length - 1].maxAverage
             }
           ],
           {
             x: d => d.creation_date,
-            y: d => d.maxAverage,
             curve: 'step-after',
             strokeOpacity: 0.3
           }

             ...models.filter(d => d.newRecord),
             {
               creation_date: new Date(),
+              maxAverage: models[models.length - 1]?.maxAverage || 0
             }
           ],
           {
             x: d => d.creation_date,
+            y: d => d.maxAverage || 0,
             curve: 'step-after',
             strokeOpacity: 0.3
           }

frontend/src/components/LanguageTable.js CHANGED Viewed

@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
         filterElement={familyRowFilterTemplate}
         style={{ minWidth: '10rem' }}
       />
-      {ScoreColumns}
     </DataTable>
   )
 }

         filterElement={familyRowFilterTemplate}
         style={{ minWidth: '10rem' }}
       />
+      {ScoreColumns()}
     </DataTable>
   )
 }

frontend/src/components/ModelTable.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
 import ScoreColumns from './ScoreColumns'
-const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   const [filters, setFilters] = useState({
     type: { value: null, matchMode: FilterMatchMode.IN },
     size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }
   const SliderWithLabel = ({ value, onChange, min, max }) => {
-    const p = 10
-    const start = value === null ? min : Math.log(value[0]) / Math.log(p)
-    const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
-    const [_value, _setValue] = useState([start, stop])
     useEffect(() => {
       const timer = setTimeout(() => {
         onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
             // set to "no filter" when (almost) the whole range is selected
             _value[0] <= min + 0.1 && _value[1] >= max - 0.1
               ? null
-              : [p ** _value[0], p ** _value[1]]
-        })
-      }, 1000)
-      return () => clearTimeout(timer)
-    }, [_value, onChange, min, max])
     return (
       <div style={{ minWidth: '20rem' }}>
         <div>{formatSize(p ** _value[0])}</div>
@@ -147,21 +147,35 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }
   const costBodyTemplate = rowData => {
-    return <div style={{ textAlign: 'center' }}>${rowData.cost?.toFixed(2)}</div>
   }
   const getHeaderText = () => {
-    // Count languages that have evaluation data (average score available)
-    const evaluatedLanguagesCount = allLanguages.filter(lang =>
-      lang.average !== null && lang.average !== undefined
-    ).length
     if (selectedLanguages.length === 0) {
       return (
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
           <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
-            Average performance across {evaluatedLanguagesCount} evaluated languages
           </span>
         </span>
       )
@@ -245,7 +259,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
         body={costBodyTemplate}
         style={{ minWidth: '5rem' }}
       />
-      {ScoreColumns}
     </DataTable>
   )
 }

 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
 import ScoreColumns from './ScoreColumns'
+const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
   const [filters, setFilters] = useState({
     type: { value: null, matchMode: FilterMatchMode.IN },
     size: { value: null, matchMode: FilterMatchMode.BETWEEN },
   }
   const SliderWithLabel = ({ value, onChange, min, max }) => {
+    const p = 10;
+    const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
+    const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
+    const [_value, _setValue] = useState([start, stop]);
     useEffect(() => {
       const timer = setTimeout(() => {
         onChange({
             // set to "no filter" when (almost) the whole range is selected
             _value[0] <= min + 0.1 && _value[1] >= max - 0.1
               ? null
+              : [p ** _value[0], p ** _value[1]],
+        });
+      }, 1000);
+      return () => clearTimeout(timer);
+    }, [_value, onChange, min, max]);
     return (
       <div style={{ minWidth: '20rem' }}>
         <div>{formatSize(p ** _value[0])}</div>
   }
   const costBodyTemplate = rowData => {
+    return (
+      <div style={{ textAlign: 'center' }}>
+        {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
+      </div>
+    )
   }
   const getHeaderText = () => {
+    // Count languages that have any evaluation data (any task scores available)
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      // Check if language has any task scores (not just average)
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length
     if (selectedLanguages.length === 0) {
       return (
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
           <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
+            Performance across {evaluatedLanguagesCount} evaluated languages
           </span>
         </span>
       )
         body={costBodyTemplate}
         style={{ minWidth: '5rem' }}
       />
+      {ScoreColumns(machineTranslatedMetrics)}
     </DataTable>
   )
 }

frontend/src/components/ScoreColumns.js CHANGED Viewed

@@ -2,21 +2,28 @@ import { Column } from 'primereact/column'
 import ScoreField from './ScoreField'
 const scoreBodyTemplate = (field, options = {}) => {
-  const { minScore = 0, maxScore = 1 } = options
   return rowData => {
     const score = rowData[field]
-    return ScoreField(score, minScore, maxScore)
   }
 }
-const ScoreColumns = [
   <Column
     field='average'
     header='Proficiency'
     headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
-    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
@@ -26,7 +33,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
-      maxScore: 0.5
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -37,7 +45,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
-      maxScore: 0.5
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -48,7 +57,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
       minScore: 0,
-      maxScore: 0.5
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -69,7 +79,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
       minScore: 0,
-      maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -80,7 +91,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('arc_accuracy', {
       minScore: 0,
-      maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -91,7 +103,8 @@ const ScoreColumns = [
     sortable
     body={scoreBodyTemplate('mgsm_accuracy', {
       minScore: 0,
-      maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,

 import ScoreField from './ScoreField'
 const scoreBodyTemplate = (field, options = {}) => {
+  const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options
   return rowData => {
     const score = rowData[field]
+    // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
+    // otherwise fall back to global list
+    const rowFlagKey = `${field}_is_machine`
+    const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
+    const isMachineTranslated = hasRowFlag
+      ? !!rowData[rowFlagKey]
+      : machineTranslatedMetrics.includes(field)
+    return ScoreField(score, minScore, maxScore, isMachineTranslated)
   }
 }
+const ScoreColumns = (machineTranslatedMetrics = []) => [
   <Column
     field='average'
     header='Proficiency'
     headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
+    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
       minScore: 0,
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
       minScore: 0,
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
     sortable
     body={scoreBodyTemplate('arc_accuracy', {
       minScore: 0,
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
     sortable
     body={scoreBodyTemplate('mgsm_accuracy', {
       minScore: 0,
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,

frontend/src/components/ScoreField.js CHANGED Viewed

@@ -1,4 +1,4 @@
-const ScoreField = (score, minScore, maxScore) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
   if (score !== null) {
@@ -50,6 +50,7 @@ const ScoreField = (score, minScore, maxScore) => {
         }}
       >
         {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
       </span>
     </div>
   )

+const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
   if (score !== null) {
         }}
       >
         {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
+        {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
       </span>
     </div>
   )

frontend/src/components/SpeakerPlot.js CHANGED Viewed

@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
           textStrokeOpacity: 0,
           textFillOpacity: 0
         }),
-        Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
           x: 40,
           y: languages[39].cumSpeakers / 1e6
-        })
       ]
     })
     containerRef.current.append(plot)

           textStrokeOpacity: 0,
           textFillOpacity: 0
         }),
+        ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
           x: 40,
           y: languages[39].cumSpeakers / 1e6
+        })] : [])
       ]
     })
     containerRef.current.append(plot)

frontend/src/components/WorldMap.js CHANGED Viewed

@@ -26,13 +26,13 @@ const makeTitle = data => d => {
         a =>
           `${smoothProgressBar(a.population / pop)} ${
             a.name
-          } – ${a.score.toFixed(2)}`
       )
       .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
-  return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
 }
-const WorldMap = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()
@@ -48,8 +48,22 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
       acc[country.iso2] = country
       return acc
     }, {})
     const plot = Plot.plot({
-      subtitle: 'Language Proficiency Score by Country',
       width: width,
       height: height,
       projection: 'equal-earth',
@@ -61,11 +75,12 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
         })
       ],
       color: {
-        scheme: 'Greens',
-        unknown: 'gray',
         label: 'Score',
         legend: true,
-        domain: [0, 1]
       },
       style: {
         fontFamily: 'monospace'

         a =>
           `${smoothProgressBar(a.population / pop)} ${
             a.name
+          } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
       )
       .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
+  return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
 }
+const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()
       acc[country.iso2] = country
       return acc
     }, {})
+    // Count languages that have any evaluation data
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length
     const plot = Plot.plot({
+      subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
       width: width,
       height: height,
       projection: 'equal-earth',
         })
       ],
       color: {
+        scheme: 'RdYlGn',
+        unknown: '#d0d0d0',
         label: 'Score',
         legend: true,
+        domain: [0, 1],
+        pivot: 0.5
       },
       style: {
         fontFamily: 'monospace'

languages.json CHANGED Viewed

@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":2674.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -32,7 +32,7 @@
     "flores_path":"hin_Deva",
     "fleurs_tag":"hi_in",
     "commonvoice_hours":16.0,
-    "commonvoice_locale":"hi-IN",
     "in_benchmark":true
   },
   {
@@ -43,7 +43,7 @@
     "family":"Indo-European",
     "flores_path":"spa_Latn",
     "fleurs_tag":"es_419",
-    "commonvoice_hours":448.0,
     "commonvoice_locale":"es",
     "in_benchmark":true
   },
@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":1065.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -103,7 +103,7 @@
     "family":"Indo-European",
     "flores_path":"por_Latn",
     "fleurs_tag":"pt_br",
-    "commonvoice_hours":180.0,
     "commonvoice_locale":"pt",
     "in_benchmark":true
   },
@@ -115,7 +115,7 @@
     "family":"Indo-European",
     "flores_path":"pan_Guru",
     "fleurs_tag":"pa_in",
-    "commonvoice_hours":2.3,
     "commonvoice_locale":"pa-IN",
     "in_benchmark":true
   },
@@ -127,7 +127,7 @@
     "family":"Indo-European",
     "flores_path":"rus_Cyrl",
     "fleurs_tag":"ru_ru",
-    "commonvoice_hours":245.0,
     "commonvoice_locale":"ru",
     "in_benchmark":true
   },
@@ -139,7 +139,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"swh_Latn",
     "fleurs_tag":"sw_ke",
-    "commonvoice_hours":411.0,
     "commonvoice_locale":"sw",
     "in_benchmark":true
   },
@@ -151,7 +151,7 @@
     "family":"Austronesian",
     "flores_path":"ind_Latn",
     "fleurs_tag":"id_id",
-    "commonvoice_hours":33.0,
     "commonvoice_locale":"id",
     "in_benchmark":true
   },
@@ -163,7 +163,7 @@
     "family":"Indo-European",
     "flores_path":"deu_Latn",
     "fleurs_tag":"de_de",
-    "commonvoice_hours":1369.0,
     "commonvoice_locale":"de",
     "in_benchmark":true
   },
@@ -379,7 +379,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":"ps_af",
-    "commonvoice_hours":81.0,
     "commonvoice_locale":"ps",
     "in_benchmark":false
   },
@@ -439,7 +439,7 @@
     "family":"Indo-European",
     "flores_path":"pol_Latn",
     "fleurs_tag":"pl_pl",
-    "commonvoice_hours":175.0,
     "commonvoice_locale":"pl",
     "in_benchmark":true
   },
@@ -619,7 +619,7 @@
     "family":"Indo-European",
     "flores_path":"nld_Latn",
     "fleurs_tag":"nl_nl",
-    "commonvoice_hours":120.0,
     "commonvoice_locale":"nl",
     "in_benchmark":true
   },
@@ -655,7 +655,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"yor_Latn",
     "fleurs_tag":"yo_ng",
-    "commonvoice_hours":6.3,
     "commonvoice_locale":"yo",
     "in_benchmark":true
   },
@@ -979,7 +979,7 @@
     "family":"Turkic",
     "flores_path":"kaz_Cyrl",
     "fleurs_tag":"kk_kz",
-    "commonvoice_hours":2.2,
     "commonvoice_locale":"kk",
     "in_benchmark":true
   },
@@ -1027,7 +1027,7 @@
     "family":"Uralic",
     "flores_path":"hun_Latn",
     "fleurs_tag":"hu_hu",
-    "commonvoice_hours":93.0,
     "commonvoice_locale":"hu",
     "in_benchmark":true
   },
@@ -1099,7 +1099,7 @@
     "family":"Indo-European",
     "flores_path":"ckb_Arab",
     "fleurs_tag":"ckb_iq",
-    "commonvoice_hours":135.0,
     "commonvoice_locale":"ckb",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":1810.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -1207,7 +1207,7 @@
     "family":"Indo-European",
     "flores_path":"tgk_Cyrl",
     "fleurs_tag":"tg_tj",
-    "commonvoice_hours":0.4,
     "commonvoice_locale":"tg",
     "in_benchmark":true
   },
@@ -1243,7 +1243,7 @@
     "family":"Indo-European",
     "flores_path":"afr_Latn",
     "fleurs_tag":"af_za",
-    "commonvoice_hours":0.5,
     "commonvoice_locale":"af",
     "in_benchmark":true
   },
@@ -1291,7 +1291,7 @@
     "family":"Indo-European",
     "flores_path":"cat_Latn",
     "fleurs_tag":"ca_es",
-    "commonvoice_hours":2863.0,
     "commonvoice_locale":"ca",
     "in_benchmark":true
   },
@@ -1303,7 +1303,7 @@
     "family":"Afro-Asiatic",
     "flores_path":"heb_Hebr",
     "fleurs_tag":"he_il",
-    "commonvoice_hours":1.4,
     "commonvoice_locale":"he",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":411.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1519,7 +1519,7 @@
     "family":"Indo-European",
     "flores_path":"kmr_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":69.0,
     "commonvoice_locale":"kmr",
     "in_benchmark":true
   },
@@ -1555,7 +1555,7 @@
     "family":"Indo-European",
     "flores_path":"slk_Latn",
     "fleurs_tag":"sk_sk",
-    "commonvoice_hours":51.0,
     "commonvoice_locale":"sk",
     "in_benchmark":true
   },
@@ -1675,7 +1675,7 @@
     "family":"Tupian",
     "flores_path":"gug_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":4.0,
     "commonvoice_locale":"gn",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":0.5,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2155,7 +2155,7 @@
     "family":"Kartvelian",
     "flores_path":"kat_Geor",
     "fleurs_tag":"ka_ge",
-    "commonvoice_hours":166.0,
     "commonvoice_locale":"ka",
     "in_benchmark":true
   },
@@ -2167,7 +2167,7 @@
     "family":"Indo-European",
     "flores_path":"glg_Latn",
     "fleurs_tag":"gl_es",
-    "commonvoice_hours":117.0,
     "commonvoice_locale":"gl",
     "in_benchmark":true
   },
@@ -2323,7 +2323,7 @@
     "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":1.2,
     "commonvoice_locale":"brh",
     "in_benchmark":false
   },
@@ -2623,7 +2623,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.9,
     "commonvoice_locale":"haz",
     "in_benchmark":false
   },
@@ -2695,7 +2695,7 @@
     "family":"Indo-European",
     "flores_path":"oci_Latn",
     "fleurs_tag":"oc_fr",
-    "commonvoice_hours":1.8,
     "commonvoice_locale":"oc",
     "in_benchmark":true
   },
@@ -3175,8 +3175,8 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
@@ -3319,8 +3319,8 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":8.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3487,7 +3487,7 @@
     "family":"Indo-European",
     "flores_path":"lvs_Latn",
     "fleurs_tag":"lv_lv",
-    "commonvoice_hours":262.0,
     "commonvoice_locale":"lv",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":440.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
@@ -3559,7 +3559,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":83.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
@@ -3679,7 +3679,7 @@
     "family":"Indo-European",
     "flores_path":"ydd_Hebr",
     "fleurs_tag":null,
-    "commonvoice_hours":0.7,
     "commonvoice_locale":"yi",
     "in_benchmark":true
   },
@@ -3991,8 +3991,8 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
@@ -4099,8 +4099,8 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
@@ -4351,7 +4351,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":29.0,
     "commonvoice_locale":"br",
     "in_benchmark":false
   },
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":30.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
@@ -5011,7 +5011,7 @@
     "family":"Nakh-Daghestanian",
     "flores_path":"dar_Cyrl",
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
     "commonvoice_locale":"dar",
     "in_benchmark":true
   },
@@ -7879,7 +7879,7 @@
     "family":"Artificial Language",
     "flores_path":"epo_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":1436.0,
     "commonvoice_locale":"eo",
     "in_benchmark":true
   },

     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
+    "commonvoice_hours":2683.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
     "flores_path":"hin_Deva",
     "fleurs_tag":"hi_in",
     "commonvoice_hours":16.0,
+    "commonvoice_locale":"hi",
     "in_benchmark":true
   },
   {
     "family":"Indo-European",
     "flores_path":"spa_Latn",
     "fleurs_tag":"es_419",
+    "commonvoice_hours":449.0,
     "commonvoice_locale":"es",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
+    "commonvoice_hours":1073.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"por_Latn",
     "fleurs_tag":"pt_br",
+    "commonvoice_hours":181.0,
     "commonvoice_locale":"pt",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"pan_Guru",
     "fleurs_tag":"pa_in",
+    "commonvoice_hours":2.5,
     "commonvoice_locale":"pa-IN",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"rus_Cyrl",
     "fleurs_tag":"ru_ru",
+    "commonvoice_hours":247.0,
     "commonvoice_locale":"ru",
     "in_benchmark":true
   },
     "family":"Atlantic-Congo",
     "flores_path":"swh_Latn",
     "fleurs_tag":"sw_ke",
+    "commonvoice_hours":412.0,
     "commonvoice_locale":"sw",
     "in_benchmark":true
   },
     "family":"Austronesian",
     "flores_path":"ind_Latn",
     "fleurs_tag":"id_id",
+    "commonvoice_hours":34.0,
     "commonvoice_locale":"id",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"deu_Latn",
     "fleurs_tag":"de_de",
+    "commonvoice_hours":1372.0,
     "commonvoice_locale":"de",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":"ps_af",
+    "commonvoice_hours":82.0,
     "commonvoice_locale":"ps",
     "in_benchmark":false
   },
     "family":"Indo-European",
     "flores_path":"pol_Latn",
     "fleurs_tag":"pl_pl",
+    "commonvoice_hours":176.0,
     "commonvoice_locale":"pl",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"nld_Latn",
     "fleurs_tag":"nl_nl",
+    "commonvoice_hours":123.0,
     "commonvoice_locale":"nl",
     "in_benchmark":true
   },
     "family":"Atlantic-Congo",
     "flores_path":"yor_Latn",
     "fleurs_tag":"yo_ng",
+    "commonvoice_hours":6.4,
     "commonvoice_locale":"yo",
     "in_benchmark":true
   },
     "family":"Turkic",
     "flores_path":"kaz_Cyrl",
     "fleurs_tag":"kk_kz",
+    "commonvoice_hours":2.3,
     "commonvoice_locale":"kk",
     "in_benchmark":true
   },
     "family":"Uralic",
     "flores_path":"hun_Latn",
     "fleurs_tag":"hu_hu",
+    "commonvoice_hours":94.0,
     "commonvoice_locale":"hu",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"ckb_Arab",
     "fleurs_tag":"ckb_iq",
+    "commonvoice_hours":136.0,
     "commonvoice_locale":"ckb",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
+    "commonvoice_hours":1812.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"tgk_Cyrl",
     "fleurs_tag":"tg_tj",
+    "commonvoice_hours":0.6,
     "commonvoice_locale":"tg",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"afr_Latn",
     "fleurs_tag":"af_za",
+    "commonvoice_hours":0.6,
     "commonvoice_locale":"af",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"cat_Latn",
     "fleurs_tag":"ca_es",
+    "commonvoice_hours":2883.0,
     "commonvoice_locale":"ca",
     "in_benchmark":true
   },
     "family":"Afro-Asiatic",
     "flores_path":"heb_Hebr",
     "fleurs_tag":"he_il",
+    "commonvoice_hours":2.0,
     "commonvoice_locale":"he",
     "in_benchmark":true
   },
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
+    "commonvoice_hours":437.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"kmr_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":71.0,
     "commonvoice_locale":"kmr",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"slk_Latn",
     "fleurs_tag":"sk_sk",
+    "commonvoice_hours":52.0,
     "commonvoice_locale":"sk",
     "in_benchmark":true
   },
     "family":"Tupian",
     "flores_path":"gug_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":4.5,
     "commonvoice_locale":"gn",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
+    "commonvoice_hours":1.8,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
     "family":"Kartvelian",
     "flores_path":"kat_Geor",
     "fleurs_tag":"ka_ge",
+    "commonvoice_hours":167.0,
     "commonvoice_locale":"ka",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"glg_Latn",
     "fleurs_tag":"gl_es",
+    "commonvoice_hours":164.0,
     "commonvoice_locale":"gl",
     "in_benchmark":true
   },
     "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":11.0,
     "commonvoice_locale":"brh",
     "in_benchmark":false
   },
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":11.0,
     "commonvoice_locale":"haz",
     "in_benchmark":false
   },
     "family":"Indo-European",
     "flores_path":"oci_Latn",
     "fleurs_tag":"oc_fr",
+    "commonvoice_hours":1.9,
     "commonvoice_locale":"oc",
     "in_benchmark":true
   },
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"seh",
     "in_benchmark":false
   },
   {
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"mfe",
     "in_benchmark":false
   },
   {
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
+    "commonvoice_hours":9.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
     "family":"Indo-European",
     "flores_path":"lvs_Latn",
     "fleurs_tag":"lv_lv",
+    "commonvoice_hours":263.0,
     "commonvoice_locale":"lv",
     "in_benchmark":true
   },
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":453.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":106.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
     "family":"Indo-European",
     "flores_path":"ydd_Hebr",
     "fleurs_tag":null,
+    "commonvoice_hours":1.8,
     "commonvoice_locale":"yi",
     "in_benchmark":true
   },
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"gaa",
     "in_benchmark":false
   },
   {
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"pcd",
     "in_benchmark":false
   },
   {
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":30.0,
     "commonvoice_locale":"br",
     "in_benchmark":false
   },
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":32.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
     "family":"Nakh-Daghestanian",
     "flores_path":"dar_Cyrl",
     "fleurs_tag":null,
+    "commonvoice_hours":1.3,
     "commonvoice_locale":"dar",
     "in_benchmark":true
   },
     "family":"Artificial Language",
     "flores_path":"epo_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":1437.0,
     "commonvoice_locale":"eo",
     "in_benchmark":true
   },

models.json CHANGED Viewed

@@ -20,15 +20,15 @@
     ]
   },
   {
-    "id":"anthropic\/claude-3.5-sonnet",
-    "name":"Claude 3.5 Sonnet",
     "provider_name":"Anthropic",
-    "cost":15.0,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1729555200000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -80,15 +80,15 @@
     ]
   },
   {
-    "id":"deepseek\/deepseek-chat",
-    "name":"DeepSeek V3",
-    "provider_name":"DeepSeek",
     "cost":0.0,
-    "hf_id":"deepseek-ai\/DeepSeek-V3",
-    "size":684531386000.0,
     "type":"open-source",
-    "license":"",
-    "creation_date":1735084800000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -100,15 +100,15 @@
     ]
   },
   {
-    "id":"deepseek\/deepseek-chat-v3-0324",
-    "name":"DeepSeek V3 0324",
-    "provider_name":"DeepSeek",
-    "cost":0.0,
-    "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
-    "size":684531386000.0,
-    "type":"open-source",
-    "license":"Mit",
-    "creation_date":1742774400000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -120,15 +120,15 @@
     ]
   },
   {
-    "id":"deepseek\/deepseek-r1",
-    "name":"R1",
     "provider_name":"DeepSeek",
-    "cost":0.0,
-    "hf_id":"deepseek-ai\/DeepSeek-R1",
     "size":684531386000.0,
     "type":"open-source",
-    "license":"Mit",
-    "creation_date":1737331200000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -140,15 +140,15 @@
     ]
   },
   {
-    "id":"deepseek\/deepseek-r1-0528",
-    "name":"R1 0528",
     "provider_name":"DeepSeek",
     "cost":0.0,
-    "hf_id":"deepseek-ai\/DeepSeek-R1-0528",
     "size":684531386000.0,
     "type":"open-source",
     "license":"Mit",
-    "creation_date":1748390400000.0,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -160,15 +160,15 @@
     ]
   },
   {
-    "id":"google\/gemini-2.0-flash-001",
-    "name":"Gemini 2.0 Flash",
-    "provider_name":"Google",
-    "cost":0.4,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1738713600000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -180,15 +180,15 @@
     ]
   },
   {
-    "id":"google\/gemini-2.0-flash-lite-001",
-    "name":"Gemini 2.0 Flash Lite",
-    "provider_name":"Google",
-    "cost":0.3,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1740441600000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -200,15 +200,15 @@
     ]
   },
   {
-    "id":"google\/gemini-2.5-flash",
-    "name":"Gemini 2.5 Flash",
-    "provider_name":"Google",
-    "cost":2.5,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1750118400000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -220,69 +220,15 @@
     ]
   },
   {
-    "id":"google\/gemini-2.5-flash-lite-preview-06-17",
-    "name":"Gemini 2.5 Flash Lite Preview 06-17",
     "provider_name":"Google",
     "cost":0.4,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1750118400000.0,
-    "tasks":[
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "mgsm"
-    ]
-  },
-  {
-    "id":"google\/gemini-2.5-flash-preview",
-    "name":"Gemini 2.5 Flash Preview 04-17",
-    "provider_name":"Google",
-    "cost":0.6,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1744848000000.0,
-    "tasks":[
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "mgsm"
-    ]
-  },
-  {
-    "id":"google\/gemini-2.5-flash-preview-05-20",
-    "name":"Gemini 2.5 Flash Preview 05-20",
-    "provider_name":"Google",
-    "cost":0.6,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1747699200000.0,
-    "tasks":[
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "mgsm"
-    ]
-  },
-  {
-    "id":"google\/gemini-2.5-pro",
-    "name":"Gemini 2.5 Pro",
-    "provider_name":"Google",
-    "cost":10.0,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1750118400000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -294,51 +240,15 @@
     ]
   },
   {
-    "id":"google\/gemini-2.5-pro-preview",
-    "name":"Gemini 2.5 Pro Preview 06-05",
-    "provider_name":"Google",
-    "cost":10.0,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1749081600000.0,
-    "tasks":[
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "mgsm"
-    ]
-  },
-  {
-    "id":"google\/gemini-2.5-pro-preview-05-06",
-    "name":"Gemini 2.5 Pro Preview 05-06",
-    "provider_name":"Google",
-    "cost":10.0,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1746576000000.0,
-    "tasks":[
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "mgsm"
-    ]
-  },
-  {
-    "id":"google\/gemini-flash-1.5",
-    "name":"Gemini 1.5 Flash ",
     "provider_name":"Google",
     "cost":0.3,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1715644800000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -350,15 +260,15 @@
     ]
   },
   {
-    "id":"google\/gemini-flash-1.5-8b",
-    "name":"Gemini 1.5 Flash 8B",
     "provider_name":"Google",
-    "cost":0.15,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
-    "creation_date":1727913600000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -370,12 +280,12 @@
     ]
   },
   {
-    "id":"google\/gemma-3-27b-it",
-    "name":"Gemma 3 27B",
     "provider_name":"Google",
     "cost":0.0,
-    "hf_id":"google\/gemma-3-27b-it",
-    "size":27432406640.0,
     "type":"open-source",
     "license":"Gemma",
     "creation_date":1740787200000,
@@ -390,30 +300,15 @@
     ]
   },
   {
-    "id":"google\/translate-v2",
-    "name":"Google Translate",
     "provider_name":"Google",
-    "cost":20.0,
-    "hf_id":null,
-    "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":null,
-    "tasks":[
-      "translation_from",
-      "translation_to"
-    ]
-  },
-  {
-    "id":"gryphe\/mythomax-l2-13b",
-    "name":"MythoMax 13B",
-    "provider_name":"MythoMax 13B",
-    "cost":0.07,
-    "hf_id":"Gryphe\/MythoMax-L2-13b",
-    "size":null,
     "type":"open-source",
-    "license":"Other",
-    "creation_date":1691625600000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -464,30 +359,6 @@
       "mgsm"
     ]
   },
-  {
-    "id":"meta-llama\/llama-3.1-8b-instruct",
-    "name":"Llama 3.1 8B Instruct",
-    "provider_name":"Meta",
-    "cost":0.0,
-    "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
-    "size":8030261248.0,
-    "type":"open-source",
-    "license":"Llama3.1",
-    "creation_date":1721260800000.0,
-    "tasks":null
-  },
-  {
-    "id":"meta-llama\/llama-3.2-1b-instruct",
-    "name":"Llama 3.2 1B Instruct",
-    "provider_name":"Meta",
-    "cost":0.0,
-    "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
-    "size":1235814400.0,
-    "type":"open-source",
-    "license":"Llama3.2",
-    "creation_date":1726617600000.0,
-    "tasks":null
-  },
   {
     "id":"meta-llama\/llama-3.3-70b-instruct",
     "name":"Llama 3.3 70B Instruct",
@@ -568,6 +439,26 @@
       "mgsm"
     ]
   },
   {
     "id":"mistralai\/mistral-nemo",
     "name":"Mistral Nemo",
@@ -629,15 +520,55 @@
     ]
   },
   {
-    "id":"openai\/gpt-3.5-turbo-0613",
-    "name":"GPT-3.5 Turbo (older v0613)",
-    "provider_name":"OpenAI",
-    "cost":2.0,
-    "hf_id":null,
     "size":null,
-    "type":"closed-source",
-    "license":null,
-    "creation_date":1706140800000,
     "tasks":[
       "translation_from",
       "translation_to",
@@ -708,6 +639,26 @@
       "mgsm"
     ]
   },
   {
     "id":"openai\/gpt-4o-mini",
     "name":"GPT-4o-mini",
@@ -728,6 +679,86 @@
       "mgsm"
     ]
   },
   {
     "id":"qwen\/qwen3-235b-a22b",
     "name":"Qwen3 235B A22B",
@@ -772,7 +803,7 @@
     "id":"qwen\/qwen3-32b",
     "name":"Qwen3 32B",
     "provider_name":"Qwen",
-    "cost":0.0,
     "hf_id":"Qwen\/Qwen3-32B",
     "size":32762123264.0,
     "type":"open-source",
@@ -787,5 +818,120 @@
       "truthfulqa",
       "mgsm"
     ]
   }
 ]

     ]
   },
   {
+    "id":"anthropic\/claude-3-haiku",
+    "name":"Claude 3 Haiku",
     "provider_name":"Anthropic",
+    "cost":1.25,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1710288000000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"arliai\/qwq-32b-arliai-rpr-v1",
+    "name":"QwQ 32B RpR v1",
+    "provider_name":"ArliAI",
     "cost":0.0,
+    "hf_id":"ArliAI\/QwQ-32B-ArliAI-RpR-v1",
+    "size":32763876352.0,
     "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1743984000000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"cohere\/command-r-08-2024",
+    "name":"Command R (08-2024)",
+    "provider_name":"Cohere",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1724976000000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"deepseek\/deepseek-chat",
+    "name":"DeepSeek V3",
     "provider_name":"DeepSeek",
+    "cost":0.8,
+    "hf_id":"deepseek-ai\/DeepSeek-V3",
     "size":684531386000.0,
     "type":"open-source",
+    "license":"",
+    "creation_date":1735084800000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"deepseek\/deepseek-chat-v3-0324",
+    "name":"DeepSeek V3 0324",
     "provider_name":"DeepSeek",
     "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
     "size":684531386000.0,
     "type":"open-source",
     "license":"Mit",
+    "creation_date":1742774400000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"deepseek\/deepseek-chat-v3.1",
+    "name":"DeepSeek V3.1",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-V3.1",
+    "size":684531386000.0,
+    "type":"open-source",
+    "license":"Mit",
+    "creation_date":1755734400000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"deepseek\/deepseek-r1",
+    "name":"R1",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-R1",
+    "size":684531386000.0,
+    "type":"open-source",
+    "license":"Mit",
+    "creation_date":1737331200000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"deepseek\/deepseek-r1-0528-qwen3-8b",
+    "name":"Deepseek R1 0528 Qwen3 8B",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-R1-0528-Qwen3-8B",
+    "size":8190735360.0,
+    "type":"open-source",
+    "license":"Mit",
+    "creation_date":1748476800000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"google\/gemini-2.0-flash-001",
+    "name":"Gemini 2.0 Flash",
     "provider_name":"Google",
     "cost":0.4,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1738713600000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"google\/gemini-2.0-flash-lite-001",
+    "name":"Gemini 2.0 Flash Lite",
     "provider_name":"Google",
     "cost":0.3,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1740441600000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"google\/gemini-2.5-flash",
+    "name":"Gemini 2.5 Flash",
     "provider_name":"Google",
+    "cost":2.5,
     "hf_id":null,
     "size":null,
     "type":"closed-source",
     "license":null,
+    "creation_date":1750118400000,
     "tasks":[
       "translation_from",
       "translation_to",
     ]
   },
   {
+    "id":"google\/gemma-3-12b-it",
+    "name":"Gemma 3 12B",
     "provider_name":"Google",
     "cost":0.0,
+    "hf_id":"google\/gemma-3-12b-it",
+    "size":12187325040.0,
     "type":"open-source",
     "license":"Gemma",
     "creation_date":1740787200000,
     ]
   },
   {
+    "id":"google\/gemma-3-27b-it",
+    "name":"Gemma 3 27B",
     "provider_name":"Google",
+    "cost":0.0,
+    "hf_id":"google\/gemma-3-27b-it",
+    "size":27432406640.0,
     "type":"open-source",
+    "license":"Gemma",
+    "creation_date":1740787200000,
     "tasks":[
       "translation_from",
       "translation_to",
       "mgsm"
     ]
   },
   {
     "id":"meta-llama\/llama-3.3-70b-instruct",
     "name":"Llama 3.3 70B Instruct",
       "mgsm"
     ]
   },
+  {
+    "id":"mistralai\/mistral-7b-instruct-v0.3",
+    "name":"Mistral 7B Instruct v0.3",
+    "provider_name":"Mistral",
+    "cost":0.05,
+    "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
+    "size":7248023552.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1716336000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
   {
     "id":"mistralai\/mistral-nemo",
     "name":"Mistral Nemo",
     ]
   },
   {
+    "id":"moonshotai\/kimi-k2",
+    "name":"Kimi K2",
+    "provider_name":"MoonshotAI",
+    "cost":0.0,
+    "hf_id":"moonshotai\/Kimi-K2-Instruct",
     "size":null,
+    "type":"open-source",
+    "license":"Other",
+    "creation_date":1752192000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"neversleep\/llama-3-lumimaid-70b",
+    "name":"Llama 3 Lumimaid 70B",
+    "provider_name":"NeverSleep",
+    "cost":6.0,
+    "hf_id":"NeverSleep\/Llama-3-Lumimaid-70B-v0.1",
+    "size":70553706496.0,
+    "type":"open-source",
+    "license":"Cc By Nc 4.0",
+    "creation_date":1714262400000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"nvidia\/llama-3.1-nemotron-70b-instruct",
+    "name":"Llama 3.1 Nemotron 70B Instruct",
+    "provider_name":"NVIDIA",
+    "cost":0.3,
+    "hf_id":"nvidia\/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "size":70553706496.0,
+    "type":"open-source",
+    "license":"Llama3.1",
+    "creation_date":1728691200000,
     "tasks":[
       "translation_from",
       "translation_to",
       "mgsm"
     ]
   },
+  {
+    "id":"openai\/gpt-4o-2024-11-20",
+    "name":"GPT-4o (2024-11-20)",
+    "provider_name":"OpenAI",
+    "cost":10.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1732060800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
   {
     "id":"openai\/gpt-4o-mini",
     "name":"GPT-4o-mini",
       "mgsm"
     ]
   },
+  {
+    "id":"openai\/gpt-5",
+    "name":"GPT-5",
+    "provider_name":"OpenAI",
+    "cost":10.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1754524800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"openai\/gpt-5-nano",
+    "name":"GPT-5 Nano",
+    "provider_name":"OpenAI",
+    "cost":0.4,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1754524800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"openai\/gpt-oss-120b",
+    "name":"gpt-oss-120b",
+    "provider_name":"OpenAI",
+    "cost":0.0,
+    "hf_id":"openai\/gpt-oss-120b",
+    "size":120412337472.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1754265600000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"qwen\/qwen-2.5-coder-32b-instruct",
+    "name":"Qwen2.5 Coder 32B Instruct",
+    "provider_name":"Qwen2.5 Coder 32B Instruct (free)",
+    "cost":0.0,
+    "hf_id":"Qwen\/Qwen2.5-Coder-32B-Instruct",
+    "size":32763876352.0,
+    "type":"open-source",
+    "license":"Apache 2.0",
+    "creation_date":1730851200000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
   {
     "id":"qwen\/qwen3-235b-a22b",
     "name":"Qwen3 235B A22B",
     "id":"qwen\/qwen3-32b",
     "name":"Qwen3 32B",
     "provider_name":"Qwen",
+    "cost":0.07,
     "hf_id":"Qwen\/Qwen3-32B",
     "size":32762123264.0,
     "type":"open-source",
       "truthfulqa",
       "mgsm"
     ]
+  },
+  {
+    "id":"scb10x\/llama3.1-typhoon2-70b-instruct",
+    "name":"Typhoon2 70B Instruct",
+    "provider_name":"Typhoon2 70B Instruct",
+    "cost":0.88,
+    "hf_id":"scb10x\/llama3.1-typhoon2-70b-instruct",
+    "size":70553706496.0,
+    "type":"open-source",
+    "license":"Llama3.1",
+    "creation_date":1734220800000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"tencent\/hunyuan-a13b-instruct",
+    "name":"Hunyuan A13B Instruct",
+    "provider_name":"Tencent",
+    "cost":0.0,
+    "hf_id":"tencent\/Hunyuan-A13B-Instruct",
+    "size":80393183232.0,
+    "type":"open-source",
+    "license":"Other",
+    "creation_date":1750809600000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"thedrummer\/anubis-pro-105b-v1",
+    "name":"Anubis Pro 105B V1",
+    "provider_name":"TheDrummer",
+    "cost":1.0,
+    "hf_id":"TheDrummer\/Anubis-Pro-105B-v1",
+    "size":104779882496.0,
+    "type":"open-source",
+    "license":"Other",
+    "creation_date":1738454400000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"x-ai\/grok-4",
+    "name":"Grok 4",
+    "provider_name":"xAI",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1752019200000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"z-ai\/glm-4.5v",
+    "name":"GLM 4.5V",
+    "provider_name":"Z.AI",
+    "cost":1.8,
+    "hf_id":"zai-org\/GLM-4.5V",
+    "size":107710933120.0,
+    "type":"open-source",
+    "license":"Mit",
+    "creation_date":1754784000000,
+    "tasks":[
+      "translation_from",
+      "translation_to",
+      "classification",
+      "mmlu",
+      "arc",
+      "truthfulqa",
+      "mgsm"
+    ]
+  },
+  {
+    "id":"google\/translate-v2",
+    "name":"Google Translate",
+    "provider_name":"Google",
+    "cost":20.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":null,
+    "tasks":[
+      "translation_from",
+      "translation_to"
+    ]
   }
 ]

pyproject.toml CHANGED Viewed

@@ -44,3 +44,13 @@ dev = [
     "scipy>=1.16.0",
     "seaborn>=0.13.2",
 ]

     "scipy>=1.16.0",
     "seaborn>=0.13.2",
 ]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["evals"]
+[tool.uv]
+package = true

results.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8dbe020a1941a0e49c05f81aeee40ba37d3e2f9f3d83303fcfe1b5711676d1d8
-size 2978273

 version https://git-lfs.github.com/spec/v1
+oid sha256:649509b8373b76e51a79809fdab77badff44e5536ca3bd8e3eb409f406b6ecda
+size 13260774

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff