Commit 963cb78 (verified) · davidpomerenke · Parent: 8eebb41

Upload from GitHub Actions: updated and cleaned up scripts for new eval runs

.github/workflows/nightly-evals.yml CHANGED

@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
     timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
@@ -22,7 +23,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev
 
-      - name: Run evaluations with checkpointing
+      - name: Run evaluations
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -31,28 +32,7 @@ jobs:
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-
-          # Run evaluations with periodic checkpointing
-          uv run python -c "
-          import time
-          import subprocess
-          import json
-          import os
-
-          # Check if we have existing results to resume from
-          if os.path.exists('results.json'):
-              print('Found existing results.json, will resume from checkpoint')
-
-          # Run the main evaluation
-          try:
-              subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
-          except subprocess.CalledProcessError as e:
-              print(f'Evaluation failed: {e}')
-              # Save current state even if failed
-              if os.path.exists('results.json'):
-                  print('Saving checkpoint before exit...')
-              exit(1)
-          "
+          uv run evals/main.py
 
       - name: Commit changes
        env:
@@ -62,7 +42,7 @@ jobs:
          git config --local user.name "github-actions[bot]"
          git config --local --unset-all http.https://github.com/.extraheader
          git remote set-url origin https://${GH_PAT}@github.com/datenlabor-bmz/ai-language-monitor.git
-          git add results.json models.json languages.json checkpoint.json
+          git add results.json models.json languages.json
          git commit -m "Update evaluation results" || echo "No changes to commit"
          git push origin HEAD:main
evals/datasets_/mgsm.py CHANGED

@@ -3,7 +3,7 @@ import os
 import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -39,32 +39,39 @@ def parse_number(i):
     return None
 
 
+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Load and cache a single MGSM item."""
+    try:
+        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
+        if nr >= len(ds):
+            return None
+        row = ds[nr]
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+        return row
+    except Exception:
+        # Dataset doesn't exist or doesn't have a test split
+        return None
+
+
 def load_mgsm(language_bcp_47, nr):
-    print(f"Loading MGSM data for {language_bcp_47}...")
     if language_bcp_47 in tags_mgsm.keys():
-        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return (slug_mgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
-        ds = _load_dataset(
-            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
-        )
-        return slug_afrimgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return (slug_afrimgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row, "machine"
+        item = _get_mgsm_item(slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True)
+        return (slug_gsm8kx, item, "machine") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated,
-            subset=tags_gsm_autotranslated[language_bcp_47],
-            split="test",
-        )
-        return slug_gsm_autotranslated, ds[nr], "machine"
+        item = _get_mgsm_item(slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr)
+        return (slug_gsm_autotranslated, item, "machine") if item else (None, None, None)
     else:
         return None, None, None
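A subtlety in the conditional returns above: Python's conditional expression binds tighter than the tuple comma, so the parentheses around the success tuple matter. Written as `return slug, item, "human" if item else (None, None, None)`, only the third element is conditional and the caller receives a nested tuple whenever `item` is falsy. A minimal sketch of the two parses:

```python
# Without parentheses, the trailing conditional is parsed as the third
# tuple element, so the "missing item" case returns a nested tuple.
def unparenthesized(item):
    return "slug", item, "human" if item else (None, None, None)

# With parentheses, the whole success tuple is conditional.
def parenthesized(item):
    return ("slug", item, "human") if item else (None, None, None)

assert unparenthesized(None) == ("slug", None, (None, None, None))
assert parenthesized(None) == (None, None, None)
```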
evals/datasets_/mmlu.py CHANGED

@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -144,32 +144,51 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Cache processed datasets to avoid reprocessing"""
+    ds = _load_dataset(dataset_name, subset_tag)
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds
+
+
+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Cache individual MMLU items efficiently"""
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    filtered = ds["test"].filter(lambda x: x["subject"] == category)
+    return filtered[nr] if nr < len(filtered) else None
+
+
 async def load_mmlu(language_bcp_47, nr):
-    print(f"Loading MMLU data for {language_bcp_47}...")
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
-        ds = ds.map(parse_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu", task, "human"
+        task = _get_mmlu_item("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr)
+        return ("masakhane/afrimmlu", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
-        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
-        ds = ds.map(add_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU", task, "human"
+        task = _get_mmlu_item("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr)
+        return ("CohereForAI/Global-MMLU", task, "human") if task else (None, None, None)
     # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        task = filtered[nr]
-        return "fair-forward/mmlu-autotranslated", task, "machine"
+        task = _get_mmlu_item("fair-forward/mmlu-autotranslated", language_bcp_47, category, nr)
+        return ("fair-forward/mmlu-autotranslated", task, "machine") if task else (None, None, None)
     else:
        return None, None, None
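The `cache` decorator imported from `datasets_.util` is project-specific and its exact semantics are not shown in this diff; assuming it memoizes on hashable arguments the way `functools.cache` does, the two-layer split above means the expensive `.map(...)` preprocessing runs once per (dataset, subset) pair while per-item lookups get their own cache entries. A minimal sketch of that layering, with `functools.cache` as a stand-in and illustrative names:

```python
from functools import cache


@cache
def _processed(dataset_name: str, subset: str) -> tuple:
    # Expensive one-time work per (dataset, subset) pair; runs once.
    print(f"processing {dataset_name}/{subset}")
    return tuple(f"{subset}-item-{i}" for i in range(3))


@cache
def _item(dataset_name: str, subset: str, nr: int):
    # Cheap per-item lookup; also memoized, keyed on all arguments.
    items = _processed(dataset_name, subset)
    return items[nr] if nr < len(items) else None


print(_item("demo", "eng", 0))  # triggers processing once
print(_item("demo", "eng", 1))  # reuses the processed dataset
```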
evals/main.py CHANGED

@@ -1,271 +1,127 @@
 import asyncio
 import pandas as pd
 import time
-import os
 from datetime import datetime, timedelta
-from tqdm.asyncio import tqdm_asyncio
 from models import models
 from tasks import tasks
 from languages import languages
-import json
-
-results = pd.DataFrame()
-
-def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
-    """Save current progress as checkpoint"""
-    try:
-        args = dict(orient="records", indent=2, force_ascii=False)
-
-        # Save current results
-        if len(results_df) > 0:
-            results_df.to_json("results.json", **args)
-            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
-
-        # Save model and language info
-        models_df.to_json("models.json", **args)
-        languages_df.to_json("languages.json", **args)
-
-        # Save checkpoint metadata
-        checkpoint_info = {
-            "last_batch": batch_num,
-            "total_batches": total_batches,
-            "timestamp": datetime.now().isoformat(),
-            "results_count": len(results_df)
-        }
-        with open("checkpoint.json", "w") as f:
-            json.dump(checkpoint_info, f, indent=2)
-
-    except Exception as e:
-        print(f"⚠️ Failed to save checkpoint: {e}")
-
-def load_checkpoint():
-    """Load previous checkpoint if available"""
-    try:
-        if os.path.exists("checkpoint.json"):
-            with open("checkpoint.json", "r") as f:
-                checkpoint = json.load(f)
-            print(f"📂 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
-            return checkpoint
-    except Exception as e:
-        print(f"⚠️ Failed to load checkpoint: {e}")
-    return None
+import os
 
 async def evaluate():
-    # FIXME we should not need this for-loop, but it helps
-    n_sentences = int(os.environ.get("N_SENTENCES", 15))  # Default 1 for quick testing
-
-    # Load models and languages
+    # Configuration - easily adjustable defaults
+    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
+    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
+    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
+    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
+
     models_df = pd.DataFrame(models)
     languages_df = pd.DataFrame(languages)
+    top_languages = languages.head(max_languages)
+
+    # Filter to single model if specified
+    if single_model:
+        models_df = models_df[models_df["id"] == single_model]
+        if len(models_df) == 0:
+            print(f"Error: Model '{single_model}' not found. Available models:")
+            for model_id in pd.DataFrame(models)["id"]:
+                print(f"  {model_id}")
+            return pd.DataFrame()
 
-    print(f"🚀 Running full evaluation with {len(models_df)} models.")
+    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
+    if test_mode:
+        print("TEST MODE: Skipping results loading/saving")
     start_time = time.time()
-    print(f"🚀 Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"📊 Evaluating {n_sentences} sentences per task")
 
-    # Evaluate top languages by speakers (configurable via MAX_LANGUAGES env var)
-    max_languages = int(os.environ.get("MAX_LANGUAGES", 2))  # Default 2 for quick testing
-    top_languages = languages.head(max_languages)  # Top N by population
-    print(f"🌍 Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
-
-    # Load checkpoint if available
-    checkpoint = load_checkpoint()
-    start_batch = 0
-    if checkpoint:
-        start_batch = checkpoint['last_batch']
-        print(f"🔄 Resuming from batch {start_batch}")
-
-    # For testing, just use all available languages up to max_languages
-    for n_languages in [min(max_languages, len(top_languages))]:
-        print(f"running evaluations for {n_languages} languages")
-
-        # Load existing results
-        try:
-            old_results = pd.read_json("results.json")
-            if old_results.empty:
-                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        except FileNotFoundError:
-            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
-        try:
-            old_models = pd.read_json("models.json")
-        except FileNotFoundError:
-            old_models = pd.DataFrame()
-
-        # get all combinations of model, language and task
-        combis = [
-            (model, lang.bcp_47, task_name)
-            for model in models_df["id"]
-            for lang in top_languages.iloc[:n_languages].itertuples()
-            for task_name, task in tasks.items()
-            if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-        ]
-        # filter out combinations that have already been evaluated
-        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-        # run evaluations in batches to prevent HTTP pool exhaustion
-        all_tasks = []
-        for i in range(n_sentences):
-            for model, bcp_47, task_name in combis.itertuples(index=False):
-                # All tasks now use the same signature
-                all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-        print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
-
-        batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
-        all_results = []
-
-        # Calculate total batches for progress tracking
-        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
-
-        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
-            batch = all_tasks[i:i+batch_size]
-            current_batch = i // batch_size + 1
-
-            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
-
-            # Show what's being evaluated in this batch
-            batch_summary = {}
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                # Extract task name from function - handle both partial functions and regular functions
-                if hasattr(task_func, 'func'):
-                    task_name = task_func.func.__name__.replace('_and_evaluate', '')
-                else:
-                    task_name = task_func.__name__.replace('_and_evaluate', '')
-
-                if task_name not in batch_summary:
-                    batch_summary[task_name] = set()
-                batch_summary[task_name].add(bcp_47)
-
-            for task_name, languages_set in batch_summary.items():
-                lang_list = ', '.join(sorted(languages_set))
-                print(f"   🔄 {task_name}: {lang_list}")
-
-            batch_coroutines = []
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-
-            try:
-                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-                all_results.extend(batch_results)
-
-                # Save checkpoint after each batch
-                valid_results = []
-                exception_count = 0
-                for r in batch_results:
-                    if isinstance(r, Exception):
-                        exception_count += 1
-                        continue
-                    if isinstance(r, list):
-                        valid_results.extend(r)
-                    else:
-                        valid_results.append(r)
-
-                if valid_results:
-                    # Aggregate results
-                    batch_df = pd.DataFrame(valid_results)
-                    if len(batch_df) > 0:
-                        batch_df = (
-                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                            .agg({"score": "mean"})
-                            .reset_index()
-                        )
-                        # Merge with existing results
-                        all_results_df = pd.concat([old_results, batch_df])
-                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-
-                        # Save checkpoint
-                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
-
-                        # Update old_results for next batch
-                        old_results = all_results_df
-
-                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
-
-            except Exception as e:
-                print(f"❌ Batch {current_batch} failed: {e}")
-                # Save checkpoint even on failure
-                if len(all_results) > 0:
-                    results_df = pd.DataFrame(all_results)
-                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
-                continue
-
-            # Reduced delay between batches (optimized for GitHub Actions)
-            await asyncio.sleep(0.5)
-
-        # Final aggregation and save
-        results = all_results
-        # Filter out exceptions and flatten results
-        valid_results = []
-        exception_count = 0
-        for r in results:
-            if isinstance(r, Exception):
-                exception_count += 1
-                continue
-            if isinstance(r, list):
-                valid_results.extend(r)
-            else:
-                valid_results.append(r)
-
-        print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
-        print(f"   Successfully processed {len(valid_results)} evaluations")
-
-        # Save final results
-        if valid_results:
-            results = valid_results
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Aggregate results like main branch
-            results_df = pd.DataFrame(results)
-            if len(results_df) > 0:
-                results_df = (
-                    results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                    .agg({"score": "mean"})
-                    .reset_index()
-                )
-                # Merge with old results
-                old_results = pd.read_json("results.json")
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-                results_df.to_json("results.json", **args)
-                print(f"💾 Saved {len(results_df)} aggregated results to results.json")
-            else:
-                print("⚠️ No valid results to aggregate")
-        else:
-            print("⚠️ No valid results to save - all API calls failed")
-
-        # Save up-to-date info on models and languages (like main branch)
-        all_models = pd.concat([pd.DataFrame(models), old_models])
-        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-        all_models.to_json("models.json", **args)
-        pd.DataFrame(languages).to_json("languages.json", **args)
-
-        # Time estimation
-        elapsed = time.time() - start_time
-        elapsed_str = str(timedelta(seconds=int(elapsed)))
-        if n_languages < max_languages:
-            remaining_batches = (max_languages - n_languages) // 10
-            batch_count = max(1, n_languages // 10)  # Avoid division by zero
-            estimated_remaining = elapsed * remaining_batches / batch_count
-            eta = datetime.now() + timedelta(seconds=estimated_remaining)
-            print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
-        else:
-            print(f"✅ Full evaluation completed in {elapsed_str}")
-            print(f"🎉 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # Clean up checkpoint file on successful completion
-    if os.path.exists("checkpoint.json"):
-        os.remove("checkpoint.json")
-        print("🧹 Cleaned up checkpoint file")
-
-    return results
+    # Load existing results to avoid re-evaluation (skip in test mode)
+    if test_mode:
+        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+    else:
+        try:
+            old_results = pd.read_json("results.json")
+            if old_results.empty:
+                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        except FileNotFoundError:
+            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+
+    # Get all combinations that need evaluation
+    combis = [
+        (model, lang.bcp_47, task_name)
+        for model in models_df["id"]
+        for lang in top_languages.itertuples()
+        for task_name, task in tasks.items()
+        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
+    ]
+
+    # Filter out already evaluated combinations
+    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+
+    # Create all evaluation tasks
+    all_tasks = []
+    for i in range(n_sentences):
+        for model, bcp_47, task_name in combis.itertuples(index=False):
+            all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+    print(f"Running {len(all_tasks)} evaluation tasks...")
+
+    # Run all tasks with simple asyncio.gather, but stop on first error
+    try:
+        results = await asyncio.gather(
+            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
+            return_exceptions=False  # This will raise on first exception
+        )
+
+        # Process results - no exceptions should reach here
+        valid_results = []
+        for r in results:
+            if isinstance(r, list):
+                valid_results.extend(r)
+            else:
+                valid_results.append(r)
+
+        print(f"Completed: {len(valid_results)} valid results")
+
+    except Exception as e:
+        print("EVALUATION STOPPED - API error occurred:")
+        print(f"Error type: {type(e).__name__}")
+        print(f"Error message: {str(e)}")
+        return pd.DataFrame()
+
+    # Save results (skip in test mode)
+    if valid_results:
+        results_df = pd.DataFrame(valid_results)
+
+        # Aggregate results
+        results_df = (
+            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+            .agg({"score": "mean"})
+            .reset_index()
+        )
+
+        if not test_mode:
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Merge with existing results
+            if not old_results.empty:
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+
+            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results_df.to_json("results.json", **args)
+
+            # Save model and language info
+            models_df.to_json("models.json", **args)
+            languages_df.to_json("languages.json", **args)
+        else:
+            print("TEST MODE: Skipping results saving")
+
+        elapsed = time.time() - start_time
+        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
+
+        return results_df
+
+    return pd.DataFrame()
 
 
 if __name__ == "__main__":
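With the batching and checkpoint machinery removed, resumability now rests entirely on the merge-and-filter step: any (model, language, task) combination already present in results.json drops out of the work list, so a rerun only evaluates what is missing. A minimal sketch of that pandas anti-join pattern, with made-up rows:

```python
import pandas as pd

# Previously saved scores (what results.json would hold).
old_results = pd.DataFrame([
    {"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9},
])

# Everything we would like to evaluate this run.
combis = pd.DataFrame([
    {"model": "m1", "bcp_47": "en", "task": "mmlu"},
    {"model": "m1", "bcp_47": "de", "task": "mmlu"},
])

# Left-join, then keep only rows with no existing metric: the anti-join.
merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
print(todo)  # only the (m1, de, mmlu) combination remains
```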
evals/models.py CHANGED

@@ -27,7 +27,8 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",  # include if/when available
+    "openai/gpt-5",
+    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
@@ -96,9 +97,6 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
-    if len(slugs) == 0:
-        # the problem is that free models typically have very high rate-limiting
-        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 
 
@@ -132,18 +130,11 @@ def get_historical_popular_models(date: date):
             for model_slug, count in sorted_models[:20]:  # Top 20
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Historical OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"   Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"   Sample counts: {[m['count'] for m in result[:3]]}")
             return result
         else:
-            print("⚠️ Could not find model ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter historical rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
 
@@ -176,18 +167,11 @@ def get_current_popular_models(date: date):
             for model_slug, count in sorted_models[:10]:  # Top 10
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Current OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"   Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"   Sample counts: {[m['count'] for m in result[:3]]}")
            return result
        else:
-            print("⚠️ Could not find daily ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter current rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
 
@@ -244,16 +228,13 @@ async def complete(**kwargs) -> str | None:
            return None
        raise e
    except asyncio.TimeoutError:
-        print(f"⏰ Timeout after {timeout}s for model {model_id}")
        return None
    if not response.choices:
        raise Exception(response)
    return response.choices[0].message.content.strip()
 
-
 translate_client = None
 
-
 def get_google_translate_client():
    global translate_client
    if translate_client is None:
@@ -364,7 +345,7 @@ def get_cost(row):
    return None
 
 
-@cache
+#@cache
 def load_models(date: date):
    popular_models = (
        get_historical_popular_models(date.today())[:20]
@@ -374,25 +355,12 @@ def load_models(date: date):
    all_model_candidates = set(important_models + popular_models) - set(blocklist)
 
    # Validate models exist on OpenRouter before including them
-    print(f"🔍 Validating {len(all_model_candidates)} model candidates...")
    valid_models = []
-    invalid_models = []
 
    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)
-        else:
-            invalid_models.append(model_id)
-
-    if invalid_models:
-        print(f"⚠️ Excluded {len(invalid_models)} invalid models:")
-        for model in sorted(invalid_models)[:5]:  # Show first 5
-            print(f"   - {model}")
-        if len(invalid_models) > 5:
-            print(f"   ... and {len(invalid_models) - 5} more")
-
-    print(f"✅ Using {len(valid_models)} valid models for evaluation")
 
    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    or_metadata = models["id"].apply(get_or_metadata)
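Note that `@cache` on `load_models` is now commented out, so the OpenRouter catalogue is re-fetched and re-validated on every call. If the project's `cache` decorator memoizes like `functools.cache`, keying the function on a date argument would refresh the list at most once per calendar day; a sketch of that pattern (`fetch_model_list` is a hypothetical stand-in for the validation round-trip):

```python
from datetime import date
from functools import cache


def fetch_model_list() -> list[str]:
    # Stand-in for the expensive OpenRouter fetch-and-validate step.
    print("fetching...")
    return ["openai/gpt-4.1", "meta-llama/llama-3.3-70b-instruct"]


@cache
def load_models(day: date) -> list[str]:
    # Memoized per calendar day: same `day` argument, same cached result.
    return fetch_model_list()


models = load_models(date.today())  # fetches
models = load_models(date.today())  # cache hit within the same day
```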
evals/tasks.py CHANGED

@@ -11,10 +11,8 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
-from google.cloud import translate_v2 as translate
-from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe, translate_google, get_google_supported_languages
+from models import complete, transcribe
 
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -45,32 +43,20 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    if model == "google/translate-v2":
-        supported_languages = get_google_supported_languages()
-        original_language = closest_supported_match(
-            original_language, supported_languages
-        )
-        target_language = closest_supported_match(target_language, supported_languages)
-        if original_language == target_language:
-            prediction = original_sentence
-        elif original_language is None or target_language is None:
-            prediction = None
-        else:
-            prediction = await translate_google(
-                original_sentence, original_language.bcp_47, target_language.bcp_47
-            )
-    else:
-        prediction = await complete(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
-                }
-            ],
-            temperature=0,
-            max_tokens=1024,
-        )
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
+    prediction = await complete(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": translation_prompt,
+            }
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+
+
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -83,6 +69,9 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
+
+
+
     return [
         {
             "model": model,
@@ -120,12 +109,16 @@ Reply with only the topic name.
 Text:
 {test_paragraph.text}
 """
-    pred = await complete(
+    response = await complete(
         model=model,
         messages=[{"role": "user", "content": prompt}],
         temperature=0,
         max_tokens=30,
-    ).lower().strip()
+    )
+
+
+
+    pred = response.lower().strip() if response else ""
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -136,6 +129,8 @@ Text:
         if pred
         else 0
     )
+
+
     return [
         {
             "model": model,
@@ -228,23 +223,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,
-        )
-        if response and "####" in response:
-            answer = response.split("####")[-1].strip()
-            acc = int(answer[:1] == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
+
+
 
     return [
         {
@@ -276,23 +268,18 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,
-        )
-        if response and "####" in response:
-            answer = response.split("####")[-1].strip()
-            acc = int(answer[:1] == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
     return [
         {
             "model": model,
@@ -349,23 +336,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice_truthfulqa(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,  # Increased for reasoning
-        )
-        if response and "####" in response:
-            pred_answer = response.split("####")[-1].strip()
-            acc = int(pred_answer[:1].upper() == answer)
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,  # Increased for reasoning
+    )
+    if response and "####" in response:
+        pred_answer = response.split("####")[-1].strip()
+        acc = int(pred_answer[:1].upper() == answer)
+    else:
+        acc = 0
+        pred_answer = "NO_ANSWER"
+
+
     return [
        {
            "model": model,
@@ -407,6 +391,9 @@ Response format: <reasoning> #### <number>
        accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
    else:
        accuracy = 0
+        number = "NO_ANSWER"
+
+
 
    return [
        {
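With the per-task try/except around `ResponsibleAIPolicyViolation` removed here, and main.py now gathering with `return_exceptions=False`, a single failing API call aborts the whole run instead of being scored as 0. A minimal sketch of that fail-fast behaviour of `asyncio.gather`:

```python
import asyncio


async def ok(i: int) -> int:
    return i


async def boom() -> int:
    raise RuntimeError("API error")


async def main() -> None:
    try:
        # return_exceptions=False (the default): the first exception
        # propagates immediately; other awaitables keep running in the
        # background rather than being collected into the result list.
        await asyncio.gather(ok(1), boom(), ok(2))
    except RuntimeError as e:
        print(f"run aborted: {e}")


asyncio.run(main())
```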
languages.json CHANGED

@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":2679.0,
+    "commonvoice_hours":2683.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -32,7 +32,7 @@
     "flores_path":"hin_Deva",
     "fleurs_tag":"hi_in",
     "commonvoice_hours":16.0,
-    "commonvoice_locale":"hi-IN",
+    "commonvoice_locale":"hi",
     "in_benchmark":true
   },
   {
@@ -43,7 +43,7 @@
     "family":"Indo-European",
     "flores_path":"spa_Latn",
     "fleurs_tag":"es_419",
-    "commonvoice_hours":448.0,
+    "commonvoice_hours":449.0,
     "commonvoice_locale":"es",
     "in_benchmark":true
   },
@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":1068.0,
+    "commonvoice_hours":1072.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -127,7 +127,7 @@
     "family":"Indo-European",
     "flores_path":"rus_Cyrl",
     "fleurs_tag":"ru_ru",
-    "commonvoice_hours":245.0,
+    "commonvoice_hours":247.0,
     "commonvoice_locale":"ru",
     "in_benchmark":true
   },
@@ -139,7 +139,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"swh_Latn",
     "fleurs_tag":"sw_ke",
-    "commonvoice_hours":411.0,
+    "commonvoice_hours":412.0,
     "commonvoice_locale":"sw",
     "in_benchmark":true
   },
@@ -163,7 +163,7 @@
     "family":"Indo-European",
     "flores_path":"deu_Latn",
     "fleurs_tag":"de_de",
-    "commonvoice_hours":1371.0,
+    "commonvoice_hours":1372.0,
     "commonvoice_locale":"de",
     "in_benchmark":true
   },
@@ -1027,7 +1027,7 @@
     "family":"Uralic",
     "flores_path":"hun_Latn",
     "fleurs_tag":"hu_hu",
-    "commonvoice_hours":93.0,
+    "commonvoice_hours":94.0,
     "commonvoice_locale":"hu",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":1811.0,
+    "commonvoice_hours":1812.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -1207,7 +1207,7 @@
     "family":"Indo-European",
     "flores_path":"tgk_Cyrl",
     "fleurs_tag":"tg_tj",
-    "commonvoice_hours":0.4,
+    "commonvoice_hours":0.6,
     "commonvoice_locale":"tg",
     "in_benchmark":true
   },
@@ -1291,7 +1291,7 @@
     "family":"Indo-European",
     "flores_path":"cat_Latn",
     "fleurs_tag":"ca_es",
-    "commonvoice_hours":2878.0,
+    "commonvoice_hours":2883.0,
     "commonvoice_locale":"ca",
     "in_benchmark":true
   },
@@ -1303,7 +1303,7 @@
     "family":"Afro-Asiatic",
     "flores_path":"heb_Hebr",
     "fleurs_tag":"he_il",
-    "commonvoice_hours":1.7,
+    "commonvoice_hours":2.0,
     "commonvoice_locale":"he",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":427.0,
+    "commonvoice_hours":437.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1519,7 +1519,7 @@
     "family":"Indo-European",
     "flores_path":"kmr_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":69.0,
+    "commonvoice_hours":71.0,
     "commonvoice_locale":"kmr",
     "in_benchmark":true
   },
@@ -1555,7 +1555,7 @@
     "family":"Indo-European",
     "flores_path":"slk_Latn",
     "fleurs_tag":"sk_sk",
-    "commonvoice_hours":51.0,
+    "commonvoice_hours":52.0,
     "commonvoice_locale":"sk",
     "in_benchmark":true
   },
@@ -1675,7 +1675,7 @@
     "family":"Tupian",
     "flores_path":"gug_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":4.1,
+    "commonvoice_hours":4.5,
     "commonvoice_locale":"gn",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":1.5,
+    "commonvoice_hours":1.8,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2167,7 +2167,7 @@
     "family":"Indo-European",
     "flores_path":"glg_Latn",
     "fleurs_tag":"gl_es",
-    "commonvoice_hours":129.0,
+    "commonvoice_hours":162.0,
     "commonvoice_locale":"gl",
     "in_benchmark":true
   },
@@ -3175,8 +3175,8 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"seh",
     "in_benchmark":false
   },
   {
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":9.1,
+    "commonvoice_hours":9.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":452.0,
+    "commonvoice_hours":453.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
@@ -3559,7 +3559,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":94.0,
+    "commonvoice_hours":106.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
@@ -3679,7 +3679,7 @@
     "family":"Indo-European",
     "flores_path":"ydd_Hebr",
     "fleurs_tag":null,
-    "commonvoice_hours":1.4,
+    "commonvoice_hours":1.7,
     "commonvoice_locale":"yi",
     "in_benchmark":true
   },
@@ -4099,8 +4099,8 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"pcd",
     "in_benchmark":false
   },
   {
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":31.0,
+    "commonvoice_hours":32.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
@@ -5011,7 +5011,7 @@
     "family":"Nakh-Daghestanian",
     "flores_path":"dar_Cyrl",
     "fleurs_tag":null,
-    "commonvoice_hours":0.9,
+    "commonvoice_hours":1.3,
     "commonvoice_locale":"dar",
     "in_benchmark":true
   },
models.json CHANGED

@@ -1,15 +1,15 @@
 [
   {
-    "id": "amazon/nova-micro-v1",
-    "name": "Nova Micro 1.0",
-    "provider_name": "Amazon",
-    "cost": 0.14,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1733356800000,
-    "tasks": [
       "translation_from",
       "translation_to",
       "classification",
@@ -18,971 +18,5 @@
       "truthfulqa",
       "mgsm"
     ]
-  },
-  {
-    "id": "anthracite-org/magnum-v4-72b",
-    "name": "Magnum v4 72B",
-    "provider_name": "Magnum v4 72B",
-    "cost": 3.0,
-    "hf_id": "anthracite-org/magnum-v4-72b",
-    "size": 72706203648.0,
-    "type": "open-source",
-    "license": "Apache 2.0",
-    "creation_date": 1726790400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "anthropic/claude-sonnet-4",
-    "name": "Claude Sonnet 4",
-    "provider_name": "Anthropic",
-    "cost": 15.0,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1747872000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-chat",
-    "name": "DeepSeek V3",
-    "provider_name": "DeepSeek",
-    "cost": 0.72,
-    "hf_id": "deepseek-ai/DeepSeek-V3",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "",
-    "creation_date": 1735084800000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-chat-v3-0324",
-    "name": "DeepSeek V3 0324",
-    "provider_name": "DeepSeek",
-    "cost": 0.0,
-    "hf_id": "deepseek-ai/DeepSeek-V3-0324",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1742774400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-r1-0528",
-    "name": "R1 0528",
-    "provider_name": "DeepSeek",
-    "cost": 0.0,
-    "hf_id": "deepseek-ai/DeepSeek-R1-0528",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1748390400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemini-2.0-flash-lite-001",
-    "name": "Gemini 2.0 Flash Lite",
-    "provider_name": "Google",
-    "cost": 0.3,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1740441600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemini-2.5-flash",
-    "name": "Gemini 2.5 Flash",
-    "provider_name": "Google",
-    "cost": 2.5,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1750118400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemma-2-9b-it",
-    "name": "Gemma 2 9B",
-    "provider_name": "Google",
-    "cost": 0.0,
-    "hf_id": "google/gemma-2-9b-it",
-    "size": 9241705984.0,
-    "type": "open-source",
-    "license": "Gemma",
-    "creation_date": 1719187200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemma-3-27b-it",
-    "name": "Gemma 3 27B",
-    "provider_name": "Google",
-    "cost": 0.0,
-    "hf_id": "google/gemma-3-27b-it",
-    "size": 27432406640.0,
-    "type": "open-source",
-    "license": "Gemma",
-    "creation_date": 1740787200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3-70b-instruct",
-    "name": "Llama 3 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.4,
-    "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3",
-    "creation_date": 1713312000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.1-70b-instruct",
-    "name": "Llama 3.1 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.28,
-    "hf_id": "meta-llama/Llama-3.1-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3.1",
-    "creation_date": 1721088000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.2-3b-instruct",
-    "name": "Llama 3.2 3B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.0,
-    "hf_id": "meta-llama/Llama-3.2-3B-Instruct",
-    "size": 3212749824.0,
-    "type": "open-source",
-    "license": "Llama3.2",
-    "creation_date": 1726617600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.3-70b-instruct",
-    "name": "Llama 3.3 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.0,
-    "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3.3",
-    "creation_date": 1732579200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-4-maverick",
-    "name": "Llama 4 Maverick",
-    "provider_name": "Meta",
-    "cost": 0.6,
-    "hf_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
-    "size": 401583781376.0,
-    "type": "open-source",
-    "license": "Other",
-    "creation_date": 1743465600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-guard-4-12b",
-    "name": "Llama Guard 4 12B",
-    "provider_name": "Meta",
-    "cost": 0.18,
-    "hf_id": "meta-llama/Llama-Guard-4-12B",
-    "size": 12001097216.0,
-    "type": "open-source",
-    "license": "Other",
-    "creation_date": 1745366400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-3-medium-128k-instruct",
-    "name": "Phi-3 Medium 128K Instruct",
-    "provider_name": "Microsoft",
-    "cost": 1.0,
-    "hf_id": "microsoft/Phi-3-medium-128k-instruct",
-    "size": 13960238080.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1715040000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-3.5-mini-128k-instruct",
-    "name": "Phi-3.5 Mini 128K Instruct",
-    "provider_name": "Microsoft",
-    "cost": 0.1,
-    "hf_id": "microsoft/Phi-3.5-mini-instruct",
-    "size": 3821079552.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1723766400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-4",
-    "name": "Phi 4",
-    "provider_name": "Microsoft",
-    "cost": 0.14,
-    "hf_id": "microsoft/phi-4",
-    "size": 14659507200.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1733875200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
- {
383
- "id": "microsoft/phi-4-multimodal-instruct",
384
- "name": "Phi 4 Multimodal Instruct",
385
- "provider_name": "Microsoft",
386
- "cost": 0.1,
387
- "hf_id": "microsoft/Phi-4-multimodal-instruct",
388
- "size": 5574460384.0,
389
- "type": "open-source",
390
- "license": "Mit",
391
- "creation_date": 1740355200000,
392
- "tasks": [
393
- "translation_from",
394
- "translation_to",
395
- "classification",
396
- "mmlu",
397
- "arc",
398
- "truthfulqa",
399
- "mgsm"
400
- ]
401
- },
402
- {
403
- "id": "mistralai/magistral-medium-2506",
404
- "name": "Magistral Medium 2506",
405
- "provider_name": "Mistral",
406
- "cost": 5.0,
407
- "hf_id": null,
408
- "size": null,
409
- "type": "closed-source",
410
- "license": null,
411
- "creation_date": 1749340800000,
412
- "tasks": [
413
- "translation_from",
414
- "translation_to",
415
- "classification",
416
- "mmlu",
417
- "arc",
418
- "truthfulqa",
419
- "mgsm"
420
- ]
421
- },
422
- {
423
- "id": "mistralai/mistral-7b-instruct",
424
- "name": "Mistral 7B Instruct",
425
- "provider_name": "Mistral",
426
- "cost": 0.0,
427
- "hf_id": "mistralai/Mistral-7B-Instruct-v0.3",
428
- "size": 7248023552.0,
429
- "type": "open-source",
430
- "license": "Apache 2.0",
431
- "creation_date": 1716336000000,
432
- "tasks": [
433
- "translation_from",
434
- "translation_to",
435
- "classification",
436
- "mmlu",
437
- "arc",
438
- "truthfulqa",
439
- "mgsm"
440
- ]
441
- },
442
- {
443
- "id": "mistralai/mistral-nemo",
444
- "name": "Mistral Nemo",
445
- "provider_name": "Mistral",
446
- "cost": 0.0,
447
- "hf_id": "mistralai/Mistral-Nemo-Instruct-2407",
448
- "size": 12247782400.0,
449
- "type": "open-source",
450
- "license": "Apache 2.0",
451
- "creation_date": 1721174400000,
452
- "tasks": [
453
- "translation_from",
454
- "translation_to",
455
- "classification",
456
- "mmlu",
457
- "arc",
458
- "truthfulqa",
459
- "mgsm"
460
- ]
461
- },
462
- {
463
- "id": "mistralai/mistral-saba",
464
- "name": "Saba",
465
- "provider_name": "Mistral",
466
- "cost": 0.6,
467
- "hf_id": null,
468
- "size": null,
469
- "type": "closed-source",
470
- "license": null,
471
- "creation_date": 1739750400000,
472
- "tasks": [
473
- "translation_from",
474
- "translation_to",
475
- "classification",
476
- "mmlu",
477
- "arc",
478
- "truthfulqa",
479
- "mgsm"
480
- ]
481
- },
482
- {
483
- "id": "mistralai/mistral-small-3.1-24b-instruct",
484
- "name": "Mistral Small 3.1 24B",
485
- "provider_name": "Mistral",
486
- "cost": 0.0,
487
- "hf_id": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
488
- "size": 24011361280.0,
489
- "type": "open-source",
490
- "license": "Apache 2.0",
491
- "creation_date": 1741651200000,
492
- "tasks": [
493
- "translation_from",
494
- "translation_to",
495
- "classification",
496
- "mmlu",
497
- "arc",
498
- "truthfulqa",
499
- "mgsm"
500
- ]
501
- },
502
- {
503
- "id": "mistralai/mixtral-8x7b-instruct",
504
- "name": "Mixtral 8x7B Instruct",
505
- "provider_name": "Mistral",
506
- "cost": 0.24,
507
- "hf_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
508
- "size": 46702792704.0,
509
- "type": "open-source",
510
- "license": "Apache 2.0",
511
- "creation_date": 1702166400000,
512
- "tasks": [
513
- "translation_from",
514
- "translation_to",
515
- "classification",
516
- "mmlu",
517
- "arc",
518
- "truthfulqa",
519
- "mgsm"
520
- ]
521
- },
522
- {
523
- "id": "neversleep/llama-3-lumimaid-70b",
524
- "name": "Llama 3 Lumimaid 70B",
525
- "provider_name": "NeverSleep",
526
- "cost": 6.0,
527
- "hf_id": "NeverSleep/Llama-3-Lumimaid-70B-v0.1",
528
- "size": 70553706496.0,
529
- "type": "open-source",
530
- "license": "Cc By Nc 4.0",
531
- "creation_date": 1714262400000,
532
- "tasks": [
533
- "translation_from",
534
- "translation_to",
535
- "classification",
536
- "mmlu",
537
- "arc",
538
- "truthfulqa",
539
- "mgsm"
540
- ]
541
- },
542
- {
543
- "id": "nvidia/llama-3.1-nemotron-70b-instruct",
544
- "name": "Llama 3.1 Nemotron 70B Instruct",
545
- "provider_name": "NVIDIA",
546
- "cost": 0.3,
547
- "hf_id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
548
- "size": 70553706496.0,
549
- "type": "open-source",
550
- "license": "Llama3.1",
551
- "creation_date": 1728691200000,
552
- "tasks": [
553
- "translation_from",
554
- "translation_to",
555
- "classification",
556
- "mmlu",
557
- "arc",
558
- "truthfulqa",
559
- "mgsm"
560
- ]
561
- },
562
- {
563
- "id": "openai/chatgpt-4o-latest",
564
- "name": "ChatGPT-4o",
565
- "provider_name": "OpenAI",
566
- "cost": 15.0,
567
- "hf_id": null,
568
- "size": null,
569
- "type": "closed-source",
570
- "license": null,
571
- "creation_date": 1723593600000,
572
- "tasks": [
573
- "translation_from",
574
- "translation_to",
575
- "classification",
576
- "mmlu",
577
- "arc",
578
- "truthfulqa",
579
- "mgsm"
580
- ]
581
- },
582
- {
583
- "id": "openai/gpt-3.5-turbo",
584
- "name": "GPT-3.5 Turbo",
585
- "provider_name": "OpenAI",
586
- "cost": 1.5,
587
- "hf_id": null,
588
- "size": null,
589
- "type": "closed-source",
590
- "license": null,
591
- "creation_date": 1685232000000,
592
- "tasks": [
593
- "translation_from",
594
- "translation_to",
595
- "classification",
596
- "mmlu",
597
- "arc",
598
- "truthfulqa",
599
- "mgsm"
600
- ]
601
- },
602
- {
603
- "id": "openai/gpt-3.5-turbo-0613",
604
- "name": "GPT-3.5 Turbo (older v0613)",
605
- "provider_name": "OpenAI",
606
- "cost": 2.0,
607
- "hf_id": null,
608
- "size": null,
609
- "type": "closed-source",
610
- "license": null,
611
- "creation_date": 1706140800000,
612
- "tasks": [
613
- "translation_from",
614
- "translation_to",
615
- "classification",
616
- "mmlu",
617
- "arc",
618
- "truthfulqa",
619
- "mgsm"
620
- ]
621
- },
622
- {
623
- "id": "openai/gpt-4.1",
624
- "name": "GPT-4.1",
625
- "provider_name": "OpenAI",
626
- "cost": 8.0,
627
- "hf_id": null,
628
- "size": null,
629
- "type": "closed-source",
630
- "license": null,
631
- "creation_date": 1744588800000,
632
- "tasks": [
633
- "translation_from",
634
- "translation_to",
635
- "classification",
636
- "mmlu",
637
- "arc",
638
- "truthfulqa",
639
- "mgsm"
640
- ]
641
- },
642
- {
643
- "id": "openai/gpt-4.1-mini",
644
- "name": "GPT-4.1 Mini",
645
- "provider_name": "OpenAI",
646
- "cost": 1.6,
647
- "hf_id": null,
648
- "size": null,
649
- "type": "closed-source",
650
- "license": null,
651
- "creation_date": 1744588800000,
652
- "tasks": [
653
- "translation_from",
654
- "translation_to",
655
- "classification",
656
- "mmlu",
657
- "arc",
658
- "truthfulqa",
659
- "mgsm"
660
- ]
661
- },
662
- {
663
- "id": "openai/gpt-4.1-nano",
664
- "name": "GPT-4.1 Nano",
665
- "provider_name": "OpenAI",
666
- "cost": 0.4,
667
- "hf_id": null,
668
- "size": null,
669
- "type": "closed-source",
670
- "license": null,
671
- "creation_date": 1744588800000,
672
- "tasks": [
673
- "translation_from",
674
- "translation_to",
675
- "classification",
676
- "mmlu",
677
- "arc",
678
- "truthfulqa",
679
- "mgsm"
680
- ]
681
- },
682
- {
683
- "id": "openai/gpt-4o-2024-11-20",
684
- "name": "GPT-4o (2024-11-20)",
685
- "provider_name": "OpenAI",
686
- "cost": 10.0,
687
- "hf_id": null,
688
- "size": null,
689
- "type": "closed-source",
690
- "license": null,
691
- "creation_date": 1732060800000,
692
- "tasks": [
693
- "translation_from",
694
- "translation_to",
695
- "classification",
696
- "mmlu",
697
- "arc",
698
- "truthfulqa",
699
- "mgsm"
700
- ]
701
- },
702
- {
703
- "id": "openai/gpt-4o-mini",
704
- "name": "GPT-4o-mini",
705
- "provider_name": "OpenAI",
706
- "cost": 0.6,
707
- "hf_id": null,
708
- "size": null,
709
- "type": "closed-source",
710
- "license": null,
711
- "creation_date": 1721260800000,
712
- "tasks": [
713
- "translation_from",
714
- "translation_to",
715
- "classification",
716
- "mmlu",
717
- "arc",
718
- "truthfulqa",
719
- "mgsm"
720
- ]
721
- },
722
- {
723
- "id": "openai/gpt-5",
724
- "name": "GPT-5",
725
- "provider_name": "OpenAI",
726
- "cost": 10.0,
727
- "hf_id": null,
728
- "size": null,
729
- "type": "closed-source",
730
- "license": null,
731
- "creation_date": 1754524800000,
732
- "tasks": [
733
- "translation_from",
734
- "translation_to",
735
- "classification",
736
- "mmlu",
737
- "arc",
738
- "truthfulqa",
739
- "mgsm"
740
- ]
741
- },
742
- {
743
- "id": "opengvlab/internvl3-14b",
744
- "name": "InternVL3 14B",
745
- "provider_name": "OpenGVLab",
746
- "cost": 0.4,
747
- "hf_id": "OpenGVLab/InternVL3-14B",
748
- "size": 15117256704.0,
749
- "type": "open-source",
750
- "license": "Apache 2.0",
751
- "creation_date": 1744243200000,
752
- "tasks": [
753
- "translation_from",
754
- "translation_to",
755
- "classification",
756
- "mmlu",
757
- "arc",
758
- "truthfulqa",
759
- "mgsm"
760
- ]
761
- },
762
- {
763
- "id": "qwen/qwen3-235b-a22b",
764
- "name": "Qwen3 235B A22B",
765
- "provider_name": "Qwen",
766
- "cost": 0.0,
767
- "hf_id": "Qwen/Qwen3-235B-A22B",
768
- "size": 235093634560.0,
769
- "type": "open-source",
770
- "license": "Apache 2.0",
771
- "creation_date": 1745712000000,
772
- "tasks": [
773
- "translation_from",
774
- "translation_to",
775
- "classification",
776
- "mmlu",
777
- "arc",
778
- "truthfulqa",
779
- "mgsm"
780
- ]
781
- },
782
- {
783
- "id": "qwen/qwen3-30b-a3b",
784
- "name": "Qwen3 30B A3B",
785
- "provider_name": "Qwen",
786
- "cost": 0.0,
787
- "hf_id": "Qwen/Qwen3-30B-A3B",
788
- "size": 30532122624.0,
789
- "type": "open-source",
790
- "license": "Apache 2.0",
791
- "creation_date": 1745712000000,
792
- "tasks": [
793
- "translation_from",
794
- "translation_to",
795
- "classification",
796
- "mmlu",
797
- "arc",
798
- "truthfulqa",
799
- "mgsm"
800
- ]
801
- },
802
- {
803
- "id": "qwen/qwen3-32b",
804
- "name": "Qwen3 32B",
805
- "provider_name": "Qwen",
806
- "cost": 0.07,
807
- "hf_id": "Qwen/Qwen3-32B",
808
- "size": 32762123264.0,
809
- "type": "open-source",
810
- "license": "Apache 2.0",
811
- "creation_date": 1745712000000,
812
- "tasks": [
813
- "translation_from",
814
- "translation_to",
815
- "classification",
816
- "mmlu",
817
- "arc",
818
- "truthfulqa",
819
- "mgsm"
820
- ]
821
- },
822
- {
823
- "id": "qwen/qwq-32b",
824
- "name": "QwQ 32B",
825
- "provider_name": "Qwen",
826
- "cost": 0.0,
827
- "hf_id": "Qwen/QwQ-32B",
828
- "size": 32763876352.0,
829
- "type": "open-source",
830
- "license": "Apache 2.0",
831
- "creation_date": 1741132800000,
832
- "tasks": [
833
- "translation_from",
834
- "translation_to",
835
- "classification",
836
- "mmlu",
837
- "arc",
838
- "truthfulqa",
839
- "mgsm"
840
- ]
841
- },
842
- {
843
- "id": "switchpoint/router",
844
- "name": "Switchpoint Router",
845
- "provider_name": "Switchpoint Router",
846
- "cost": 3.4,
847
- "hf_id": null,
848
- "size": null,
849
- "type": "closed-source",
850
- "license": null,
851
- "creation_date": 1752192000000,
852
- "tasks": [
853
- "translation_from",
854
- "translation_to",
855
- "classification",
856
- "mmlu",
857
- "arc",
858
- "truthfulqa",
859
- "mgsm"
860
- ]
861
- },
862
- {
863
- "id": "thedrummer/anubis-pro-105b-v1",
864
- "name": "Anubis Pro 105B V1",
865
- "provider_name": "TheDrummer",
866
- "cost": 1.0,
867
- "hf_id": "TheDrummer/Anubis-Pro-105B-v1",
868
- "size": 104779882496.0,
869
- "type": "open-source",
870
- "license": "Other",
871
- "creation_date": 1738454400000,
872
- "tasks": [
873
- "translation_from",
874
- "translation_to",
875
- "classification",
876
- "mmlu",
877
- "arc",
878
- "truthfulqa",
879
- "mgsm"
880
- ]
881
- },
882
- {
883
- "id": "thedrummer/skyfall-36b-v2",
884
- "name": "Skyfall 36B V2",
885
- "provider_name": "TheDrummer",
886
- "cost": 0.19,
887
- "hf_id": "TheDrummer/Skyfall-36B-v2",
888
- "size": 36910535680.0,
889
- "type": "open-source",
890
- "license": "Other",
891
- "creation_date": 1738540800000,
892
- "tasks": [
893
- "translation_from",
894
- "translation_to",
895
- "classification",
896
- "mmlu",
897
- "arc",
898
- "truthfulqa",
899
- "mgsm"
900
- ]
901
- },
902
- {
903
- "id": "tngtech/deepseek-r1t-chimera",
904
- "name": "DeepSeek R1T Chimera",
905
- "provider_name": "TNG",
906
- "cost": 0.0,
907
- "hf_id": "tngtech/DeepSeek-R1T-Chimera",
908
- "size": 684531386000.0,
909
- "type": "open-source",
910
- "license": "Mit",
911
- "creation_date": 1745625600000,
912
- "tasks": [
913
- "translation_from",
914
- "translation_to",
915
- "classification",
916
- "mmlu",
917
- "arc",
918
- "truthfulqa",
919
- "mgsm"
920
- ]
921
- },
922
- {
923
- "id": "tngtech/deepseek-r1t2-chimera",
924
- "name": "DeepSeek R1T2 Chimera",
925
- "provider_name": "TNG",
926
- "cost": 0.0,
927
- "hf_id": "tngtech/DeepSeek-TNG-R1T2-Chimera",
928
- "size": 684531386000.0,
929
- "type": "open-source",
930
- "license": "Mit",
931
- "creation_date": 1751414400000,
932
- "tasks": [
933
- "translation_from",
934
- "translation_to",
935
- "classification",
936
- "mmlu",
937
- "arc",
938
- "truthfulqa",
939
- "mgsm"
940
- ]
941
- },
942
- {
943
- "id": "x-ai/grok-2-1212",
944
- "name": "Grok 2 1212",
945
- "provider_name": "xAI",
946
- "cost": 10.0,
947
- "hf_id": null,
948
- "size": null,
949
- "type": "closed-source",
950
- "license": null,
951
- "creation_date": 1734220800000,
952
- "tasks": [
953
- "translation_from",
954
- "translation_to",
955
- "classification",
956
- "mmlu",
957
- "arc",
958
- "truthfulqa",
959
- "mgsm"
960
- ]
961
- },
962
- {
963
- "id": "google/translate-v2",
964
- "name": "Google Translate",
965
- "provider_name": "Google",
966
- "cost": 20.0,
967
- "hf_id": null,
968
- "size": null,
969
- "type": "closed-source",
970
- "license": null,
971
- "creation_date": null,
972
- "tasks": [
973
- "translation_from",
974
- "translation_to"
975
- ]
976
- },
977
- {
978
- "id": "moonshotai/kimi-k2",
979
- "name": "Kimi K2",
980
- "provider_name": "Moonshot AI",
981
- "size": null,
982
- "type": "closed-source",
983
- "cost": 0.6,
984
- "hf_id": null,
985
- "creation_date": null,
986
- "license": null
987
  }
988
  ]
 
  [
  {
+ "id":"openai\/gpt-5-nano",
+ "name":"GPT-5 Nano",
+ "provider_name":"OpenAI",
+ "cost":0.4,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1754524800000,
+ "tasks":[
  "translation_from",
  "translation_to",
  "classification",
  "mmlu",
  "arc",
  "truthfulqa",
  "mgsm"
  ]
  }
  ]
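
Every entry in models.json follows the same schema: `id` (the OpenRouter model slug), `name`, `provider_name`, `cost`, `hf_id` and `size` (null for closed-source models), `type`, `license`, `creation_date` (Unix epoch in milliseconds, null when unknown), and a `tasks` list (absent on some entries, e.g. Kimi K2). A minimal sketch of reading the registry, assuming models.json sits in the repo root as the workflow's `git add` step suggests:

```python
import json
from datetime import datetime, timezone

# Load the model registry committed above.
with open("models.json") as f:
    models = json.load(f)

# List all models evaluated on MGSM, with their release date where known.
for m in models:
    if "mgsm" not in m.get("tasks", []):  # "tasks" is missing on some entries
        continue
    if m["creation_date"] is not None:
        # creation_date is a Unix timestamp in milliseconds
        released = datetime.fromtimestamp(
            m["creation_date"] / 1000, tz=timezone.utc
        ).date()
    else:
        released = "unknown"
    print(f"{m['name']:<35} {m['provider_name']:<20} {released}")
```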
results.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1a3f388fd054fc570366705f1b8d6cb65bd6353164482d3d2c71ccec742d6158
- size 57534940

  version https://git-lfs.github.com/spec/v1
+ oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
+ size 8030558
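
results.json is tracked with Git LFS, so the diff shows only the pointer file: the content hash changed and the payload shrank from 57,534,940 to 8,030,558 bytes (~57.5 MB to ~8.0 MB), plausibly reflecting the cleaned-up eval runs. A pointer file is just three `key value` lines; a minimal sketch for reading one:

```python
# Minimal sketch: parse a Git LFS pointer file such as the one above.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, oid = fields["oid"].split(":", 1)
    return {
        "version": fields["version"],  # LFS spec URL
        "algo": algo,                  # e.g. "sha256"
        "oid": oid,                    # content hash of the real file
        "size": int(fields["size"]),   # true file size in bytes
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
size 8030558"""
assert parse_lfs_pointer(pointer)["size"] == 8030558
```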