Commit 963cb78 (verified) · davidpomerenke · Parent: 8eebb41

Upload from GitHub Actions: updated and cleaned up scripts for new eval runs

.github/workflows/nightly-evals.yml CHANGED

@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
     timeout-minutes: 1440 # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
@@ -22,7 +23,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev
 
-      - name: Run evaluations with checkpointing
+      - name: Run evaluations
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -31,28 +32,7 @@ jobs:
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-
-          # Run evaluations with periodic checkpointing
-          uv run python -c "
-          import time
-          import subprocess
-          import json
-          import os
-
-          # Check if we have existing results to resume from
-          if os.path.exists('results.json'):
-              print('Found existing results.json, will resume from checkpoint')
-
-          # Run the main evaluation
-          try:
-              subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
-          except subprocess.CalledProcessError as e:
-              print(f'Evaluation failed: {e}')
-              # Save current state even if failed
-              if os.path.exists('results.json'):
-                  print('Saving checkpoint before exit...')
-              exit(1)
-          "
+          uv run evals/main.py
 
       - name: Commit changes
        env:
@@ -62,7 +42,7 @@ jobs:
          git config --local user.name "github-actions[bot]"
          git config --local --unset-all http.https://github.com/.extraheader
          git remote set-url origin https://${GH_PAT}@github.com/datenlabor-bmz/ai-language-monitor.git
-          git add results.json models.json languages.json checkpoint.json
+          git add results.json models.json languages.json
          git commit -m "Update evaluation results" || echo "No changes to commit"
          git push origin HEAD:main
evals/datasets_/mgsm.py CHANGED

@@ -3,7 +3,7 @@ import os
 import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -39,32 +39,39 @@ def parse_number(i):
     return None
 
 
+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Load and cache a single MGSM item."""
+    try:
+        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
+        if nr >= len(ds):
+            return None
+        row = ds[nr]
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+        return row
+    except Exception:
+        # Dataset doesn't exist or doesn't have a test split
+        return None
+
+
 def load_mgsm(language_bcp_47, nr):
-    print(f"Loading MGSM data for {language_bcp_47}...")
     if language_bcp_47 in tags_mgsm.keys():
-        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return (slug_mgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
-        ds = _load_dataset(
-            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
-        )
-        return slug_afrimgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return (slug_afrimgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row, "machine"
+        item = _get_mgsm_item(slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True)
+        return (slug_gsm8kx, item, "machine") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated,
-            subset=tags_gsm_autotranslated[language_bcp_47],
-            split="test",
-        )
-        return slug_gsm_autotranslated, ds[nr], "machine"
+        item = _get_mgsm_item(slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr)
+        return (slug_gsm_autotranslated, item, "machine") if item else (None, None, None)
     else:
         return None, None, None
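A subtlety in the conditional returns above: Python's conditional expression binds tighter than the tuple comma, so the parentheses around the success tuple matter. Written as `return slug, item, "human" if item else (None, None, None)`, only the third element is conditional and the caller receives a nested tuple whenever `item` is falsy. A minimal sketch of the two parses:

```python
# Without parentheses, the trailing conditional is parsed as the third
# tuple element, so the "missing item" case returns a nested tuple.
def unparenthesized(item):
    return "slug", item, "human" if item else (None, None, None)

# With parentheses, the whole success tuple is conditional.
def parenthesized(item):
    return ("slug", item, "human") if item else (None, None, None)

assert unparenthesized(None) == ("slug", None, (None, None, None))
assert parenthesized(None) == (None, None, None)
```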
evals/datasets_/mmlu.py CHANGED

@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -144,32 +144,51 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated = _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Cache processed datasets to avoid reprocessing"""
+    ds = _load_dataset(dataset_name, subset_tag)
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds
+
+
+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Cache individual MMLU items efficiently"""
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    filtered = ds["test"].filter(lambda x: x["subject"] == category)
+    return filtered[nr] if nr < len(filtered) else None
+
+
 async def load_mmlu(language_bcp_47, nr):
-    print(f"Loading MMLU data for {language_bcp_47}...")
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
-        ds = ds.map(parse_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu", task, "human"
+        task = _get_mmlu_item("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr)
+        return ("masakhane/afrimmlu", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
-        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
-        ds = ds.map(add_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU", task, "human"
+        task = _get_mmlu_item("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr)
+        return ("CohereForAI/Global-MMLU", task, "human") if task else (None, None, None)
     # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        task = filtered[nr]
-        return "fair-forward/mmlu-autotranslated", task, "machine"
+        task = _get_mmlu_item("fair-forward/mmlu-autotranslated", language_bcp_47, category, nr)
+        return ("fair-forward/mmlu-autotranslated", task, "machine") if task else (None, None, None)
     else:
        return None, None, None
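The `cache` decorator imported from `datasets_.util` is project-specific and its exact semantics are not shown in this diff; assuming it memoizes on hashable arguments the way `functools.cache` does, the two-layer split above means the expensive `.map(...)` preprocessing runs once per (dataset, subset) pair while per-item lookups get their own cache entries. A minimal sketch of that layering, with `functools.cache` as a stand-in and illustrative names:

```python
from functools import cache


@cache
def _processed(dataset_name: str, subset: str) -> tuple:
    # Expensive one-time work per (dataset, subset) pair; runs once.
    print(f"processing {dataset_name}/{subset}")
    return tuple(f"{subset}-item-{i}" for i in range(3))


@cache
def _item(dataset_name: str, subset: str, nr: int):
    # Cheap per-item lookup; also memoized, keyed on all arguments.
    items = _processed(dataset_name, subset)
    return items[nr] if nr < len(items) else None


print(_item("demo", "eng", 0))  # triggers processing once
print(_item("demo", "eng", 1))  # reuses the processed dataset
```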
evals/main.py CHANGED

@@ -1,271 +1,127 @@
 import asyncio
 import pandas as pd
 import time
-import os
 from datetime import datetime, timedelta
-from tqdm.asyncio import tqdm_asyncio
 from models import models
 from tasks import tasks
 from languages import languages
-import json
-
-results = pd.DataFrame()
-
-def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
-    """Save current progress as checkpoint"""
-    try:
-        args = dict(orient="records", indent=2, force_ascii=False)
-
-        # Save current results
-        if len(results_df) > 0:
-            results_df.to_json("results.json", **args)
-            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
-
-        # Save model and language info
-        models_df.to_json("models.json", **args)
-        languages_df.to_json("languages.json", **args)
-
-        # Save checkpoint metadata
-        checkpoint_info = {
-            "last_batch": batch_num,
-            "total_batches": total_batches,
-            "timestamp": datetime.now().isoformat(),
-            "results_count": len(results_df)
-        }
-        with open("checkpoint.json", "w") as f:
-            json.dump(checkpoint_info, f, indent=2)
-
-    except Exception as e:
-        print(f"⚠️ Failed to save checkpoint: {e}")
-
-def load_checkpoint():
-    """Load previous checkpoint if available"""
-    try:
-        if os.path.exists("checkpoint.json"):
-            with open("checkpoint.json", "r") as f:
-                checkpoint = json.load(f)
-            print(f"📂 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
-            return checkpoint
-    except Exception as e:
-        print(f"⚠️ Failed to load checkpoint: {e}")
-    return None
+import os
 
 async def evaluate():
-    # FIXME we should not need this for-loop, but it helps
-    n_sentences = int(os.environ.get("N_SENTENCES", 15))  # Default 1 for quick testing
-
-    # Load models and languages
+    # Configuration - easily adjustable defaults
+    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
+    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
+    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
+    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
+
     models_df = pd.DataFrame(models)
     languages_df = pd.DataFrame(languages)
+    top_languages = languages.head(max_languages)
+
+    # Filter to single model if specified
+    if single_model:
+        models_df = models_df[models_df["id"] == single_model]
+        if len(models_df) == 0:
+            print(f"Error: Model '{single_model}' not found. Available models:")
+            for model_id in pd.DataFrame(models)["id"]:
+                print(f"  {model_id}")
+            return pd.DataFrame()
 
-    print(f"🚀 Running full evaluation with {len(models_df)} models.")
+    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
+    if test_mode:
+        print("TEST MODE: Skipping results loading/saving")
     start_time = time.time()
-    print(f"🚀 Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"📊 Evaluating {n_sentences} sentences per task")
 
-    # Evaluate top languages by speakers (configurable via MAX_LANGUAGES env var)
-    max_languages = int(os.environ.get("MAX_LANGUAGES", 2))  # Default 2 for quick testing
-    top_languages = languages.head(max_languages)  # Top N by population
-    print(f"🌍 Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
-
-    # Load checkpoint if available
-    checkpoint = load_checkpoint()
-    start_batch = 0
-    if checkpoint:
-        start_batch = checkpoint['last_batch']
-        print(f"🔄 Resuming from batch {start_batch}")
-
-    # For testing, just use all available languages up to max_languages
-    for n_languages in [min(max_languages, len(top_languages))]:
-        print(f"running evaluations for {n_languages} languages")
-
-        # Load existing results
-        try:
-            old_results = pd.read_json("results.json")
-            if old_results.empty:
-                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        except FileNotFoundError:
-            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
-        try:
-            old_models = pd.read_json("models.json")
-        except FileNotFoundError:
-            old_models = pd.DataFrame()
-
-        # get all combinations of model, language and task
-        combis = [
-            (model, lang.bcp_47, task_name)
-            for model in models_df["id"]
-            for lang in top_languages.iloc[:n_languages].itertuples()
-            for task_name, task in tasks.items()
-            if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-        ]
-        # filter out combinations that have already been evaluated
-        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-        # run evaluations in batches to prevent HTTP pool exhaustion
-        all_tasks = []
-        for i in range(n_sentences):
-            for model, bcp_47, task_name in combis.itertuples(index=False):
-                # All tasks now use the same signature
-                all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-        print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
-
-        batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
-        all_results = []
-
-        # Calculate total batches for progress tracking
-        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
-
-        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
-            batch = all_tasks[i:i+batch_size]
-            current_batch = i // batch_size + 1
-
-            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
-
-            # Show what's being evaluated in this batch
-            batch_summary = {}
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                # Extract task name from function - handle both partial functions and regular functions
-                if hasattr(task_func, 'func'):
-                    task_name = task_func.func.__name__.replace('_and_evaluate', '')
-                else:
-                    task_name = task_func.__name__.replace('_and_evaluate', '')
-
-                if task_name not in batch_summary:
-                    batch_summary[task_name] = set()
-                batch_summary[task_name].add(bcp_47)
-
-            for task_name, languages_set in batch_summary.items():
-                lang_list = ', '.join(sorted(languages_set))
-                print(f"   🔄 {task_name}: {lang_list}")
-
-            batch_coroutines = []
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-
-            try:
-                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-                all_results.extend(batch_results)
-
-                # Save checkpoint after each batch
-                valid_results = []
-                exception_count = 0
-                for r in batch_results:
-                    if isinstance(r, Exception):
-                        exception_count += 1
-                        continue
-                    if isinstance(r, list):
-                        valid_results.extend(r)
-                    else:
-                        valid_results.append(r)
-
-                if valid_results:
-                    # Aggregate results
-                    batch_df = pd.DataFrame(valid_results)
-                    if len(batch_df) > 0:
-                        batch_df = (
-                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                            .agg({"score": "mean"})
-                            .reset_index()
-                        )
-                        # Merge with existing results
-                        all_results_df = pd.concat([old_results, batch_df])
-                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-
-                        # Save checkpoint
-                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
-
-                        # Update old_results for next batch
-                        old_results = all_results_df
-
-                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
-
-            except Exception as e:
-                print(f"❌ Batch {current_batch} failed: {e}")
-                # Save checkpoint even on failure
-                if len(all_results) > 0:
-                    results_df = pd.DataFrame(all_results)
-                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
-                continue
-
-            # Reduced delay between batches (optimized for GitHub Actions)
-            await asyncio.sleep(0.5)
-
-        # Final aggregation and save
-        results = all_results
-        # Filter out exceptions and flatten results
-        valid_results = []
-        exception_count = 0
-        for r in results:
-            if isinstance(r, Exception):
-                exception_count += 1
-                continue
-            if isinstance(r, list):
-                valid_results.extend(r)
-            else:
-                valid_results.append(r)
-
-        print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
-        print(f"   Successfully processed {len(valid_results)} evaluations")
-
-        # Save final results
-        if valid_results:
-            results = valid_results
-            args = dict(orient="records", indent=2, force_ascii=False)
-
-            # Aggregate results like main branch
-            results_df = pd.DataFrame(results)
-            if len(results_df) > 0:
-                results_df = (
-                    results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                    .agg({"score": "mean"})
-                    .reset_index()
-                )
-                # Merge with old results
-                old_results = pd.read_json("results.json")
-                results_df = pd.concat([old_results, results_df])
-                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-                results_df.to_json("results.json", **args)
-                print(f"💾 Saved {len(results_df)} aggregated results to results.json")
-            else:
-                print("⚠️ No valid results to aggregate")
-        else:
-            print("⚠️ No valid results to save - all API calls failed")
-
-        # Save up-to-date info on models and languages (like main branch)
-        all_models = pd.concat([pd.DataFrame(models), old_models])
-        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-        all_models.to_json("models.json", **args)
-        pd.DataFrame(languages).to_json("languages.json", **args)
-
-        # Time estimation
-        elapsed = time.time() - start_time
-        elapsed_str = str(timedelta(seconds=int(elapsed)))
-        if n_languages < max_languages:
-            remaining_batches = (max_languages - n_languages) // 10
-            batch_count = max(1, n_languages // 10)  # Avoid division by zero
-            estimated_remaining = elapsed * remaining_batches / batch_count
-            eta = datetime.now() + timedelta(seconds=estimated_remaining)
-            print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
-        else:
-            print(f"✅ Full evaluation completed in {elapsed_str}")
-            print(f"🎉 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # Clean up checkpoint file on successful completion
-    if os.path.exists("checkpoint.json"):
-        os.remove("checkpoint.json")
-        print("🧹 Cleaned up checkpoint file")
-
-    return results
+    # Load existing results to avoid re-evaluation (skip in test mode)
+    if test_mode:
+        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+    else:
+        try:
+            old_results = pd.read_json("results.json")
+            if old_results.empty:
+                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        except FileNotFoundError:
+            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+
+    # Get all combinations that need evaluation
+    combis = [
+        (model, lang.bcp_47, task_name)
+        for model in models_df["id"]
+        for lang in top_languages.itertuples()
+        for task_name, task in tasks.items()
+        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
+    ]
+
+    # Filter out already evaluated combinations
+    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+
+    # Create all evaluation tasks
+    all_tasks = []
+    for i in range(n_sentences):
+        for model, bcp_47, task_name in combis.itertuples(index=False):
+            all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+    print(f"Running {len(all_tasks)} evaluation tasks...")
+
+    # Run all tasks with simple asyncio.gather, but stop on first error
+    try:
+        results = await asyncio.gather(
+            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
+            return_exceptions=False  # This will raise on first exception
+        )
+
+        # Process results - no exceptions should reach here
+        valid_results = []
+        for r in results:
+            if isinstance(r, list):
+                valid_results.extend(r)
+            else:
+                valid_results.append(r)
+
+        print(f"Completed: {len(valid_results)} valid results")
+
+    except Exception as e:
+        print("EVALUATION STOPPED - API error occurred:")
+        print(f"Error type: {type(e).__name__}")
+        print(f"Error message: {str(e)}")
+        return pd.DataFrame()
+
+    # Save results (skip in test mode)
+    if valid_results:
+        results_df = pd.DataFrame(valid_results)
+
+        # Aggregate results
+        results_df = (
+            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+            .agg({"score": "mean"})
+            .reset_index()
+        )
+
+        if not test_mode:
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Merge with existing results
+            if not old_results.empty:
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+
+            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results_df.to_json("results.json", **args)
+
+            # Save model and language info
+            models_df.to_json("models.json", **args)
+            languages_df.to_json("languages.json", **args)
+        else:
+            print("TEST MODE: Skipping results saving")
+
+        elapsed = time.time() - start_time
+        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
+
+        return results_df
+
+    return pd.DataFrame()
 
 
 if __name__ == "__main__":
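With the batching and checkpoint machinery removed, resumability now rests entirely on the merge-and-filter step: any (model, language, task) combination already present in results.json drops out of the work list, so a rerun only evaluates what is missing. A minimal sketch of that pandas anti-join pattern, with made-up rows:

```python
import pandas as pd

# Previously saved scores (what results.json would hold).
old_results = pd.DataFrame([
    {"model": "m1", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.9},
])

# Everything we would like to evaluate this run.
combis = pd.DataFrame([
    {"model": "m1", "bcp_47": "en", "task": "mmlu"},
    {"model": "m1", "bcp_47": "de", "task": "mmlu"},
])

# Left-join, then keep only rows with no existing metric: the anti-join.
merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
print(todo)  # only the (m1, de, mmlu) combination remains
```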
evals/models.py CHANGED

@@ -27,7 +27,8 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",  # include if/when available
+    "openai/gpt-5",
+    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
@@ -96,9 +97,6 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
-    if len(slugs) == 0:
-        # the problem is that free models typically have very high rate-limiting
-        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 
 
@@ -132,18 +130,11 @@ def get_historical_popular_models(date: date):
             for model_slug, count in sorted_models[:20]:  # Top 20
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Historical OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"   Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"   Sample counts: {[m['count'] for m in result[:3]]}")
             return result
         else:
-            print("⚠️ Could not find model ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter historical rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
 
@@ -176,18 +167,11 @@ def get_current_popular_models(date: date):
             for model_slug, count in sorted_models[:10]:  # Top 10
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Current OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"   Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"   Sample counts: {[m['count'] for m in result[:3]]}")
            return result
        else:
-            print("⚠️ Could not find daily ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter current rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
 
@@ -244,16 +228,13 @@ async def complete(**kwargs) -> str | None:
            return None
        raise e
    except asyncio.TimeoutError:
-        print(f"⏰ Timeout after {timeout}s for model {model_id}")
        return None
    if not response.choices:
        raise Exception(response)
    return response.choices[0].message.content.strip()
 
-
 translate_client = None
 
-
 def get_google_translate_client():
    global translate_client
    if translate_client is None:
@@ -364,7 +345,7 @@ def get_cost(row):
    return None
 
 
-@cache
+#@cache
 def load_models(date: date):
    popular_models = (
        get_historical_popular_models(date.today())[:20]
@@ -374,25 +355,12 @@ def load_models(date: date):
    all_model_candidates = set(important_models + popular_models) - set(blocklist)
 
    # Validate models exist on OpenRouter before including them
-    print(f"🔍 Validating {len(all_model_candidates)} model candidates...")
    valid_models = []
-    invalid_models = []
 
    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)
-        else:
-            invalid_models.append(model_id)
-
-    if invalid_models:
-        print(f"⚠️ Excluded {len(invalid_models)} invalid models:")
-        for model in sorted(invalid_models)[:5]:  # Show first 5
-            print(f"   - {model}")
-        if len(invalid_models) > 5:
-            print(f"   ... and {len(invalid_models) - 5} more")
-
-    print(f"✅ Using {len(valid_models)} valid models for evaluation")
 
    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    or_metadata = models["id"].apply(get_or_metadata)
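Note that `@cache` on `load_models` is now commented out, so the OpenRouter catalogue is re-fetched and re-validated on every call. If the project's `cache` decorator memoizes like `functools.cache`, keying the function on a date argument would refresh the list at most once per calendar day; a sketch of that pattern (`fetch_model_list` is a hypothetical stand-in for the validation round-trip):

```python
from datetime import date
from functools import cache


def fetch_model_list() -> list[str]:
    # Stand-in for the expensive OpenRouter fetch-and-validate step.
    print("fetching...")
    return ["openai/gpt-4.1", "meta-llama/llama-3.3-70b-instruct"]


@cache
def load_models(day: date) -> list[str]:
    # Memoized per calendar day: same `day` argument, same cached result.
    return fetch_model_list()


models = load_models(date.today())  # fetches
models = load_models(date.today())  # cache hit within the same day
```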
evals/tasks.py CHANGED

@@ -11,10 +11,8 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
-from google.cloud import translate_v2 as translate
-from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe, translate_google, get_google_supported_languages
+from models import complete, transcribe
 
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -45,32 +43,20 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    if model == "google/translate-v2":
-        supported_languages = get_google_supported_languages()
-        original_language = closest_supported_match(
-            original_language, supported_languages
-        )
-        target_language = closest_supported_match(target_language, supported_languages)
-        if original_language == target_language:
-            prediction = original_sentence
-        elif original_language is None or target_language is None:
-            prediction = None
-        else:
-            prediction = await translate_google(
-                original_sentence, original_language.bcp_47, target_language.bcp_47
-            )
-    else:
-        prediction = await complete(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
-                }
-            ],
-            temperature=0,
-            max_tokens=1024,
-        )
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
+    prediction = await complete(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": translation_prompt,
+            }
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+
+
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -83,6 +69,9 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
+
+
+
     return [
         {
             "model": model,
@@ -120,12 +109,16 @@ Reply with only the topic name.
 Text:
 {test_paragraph.text}
 """
-    pred = await complete(
+    response = await complete(
         model=model,
         messages=[{"role": "user", "content": prompt}],
         temperature=0,
         max_tokens=30,
-    ).lower().strip()
+    )
+
+
+
+    pred = response.lower().strip() if response else ""
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -136,6 +129,8 @@ Text:
         if pred
         else 0
     )
+
+
     return [
         {
             "model": model,
@@ -228,23 +223,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,
-        )
-        if response and "####" in response:
-            answer = response.split("####")[-1].strip()
-            acc = int(answer[:1] == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
+
+
 
     return [
         {
@@ -276,23 +268,18 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,
-        )
-        if response and "####" in response:
-            answer = response.split("####")[-1].strip()
-            acc = int(answer[:1] == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
     return [
         {
             "model": model,
@@ -349,23 +336,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice_truthfulqa(task)}""",
         },
     ]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1024,  # Increased for reasoning
-        )
-        if response and "####" in response:
-            pred_answer = response.split("####")[-1].strip()
-            acc = int(pred_answer[:1].upper() == answer)
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,  # Increased for reasoning
+    )
+    if response and "####" in response:
+        pred_answer = response.split("####")[-1].strip()
+        acc = int(pred_answer[:1].upper() == answer)
+    else:
+        acc = 0
+        pred_answer = "NO_ANSWER"
+
+
     return [
        {
            "model": model,
@@ -407,6 +391,9 @@ Response format: <reasoning> #### <number>
        accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
    else:
        accuracy = 0
+        number = "NO_ANSWER"
+
+
 
    return [
        {
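With the per-task try/except around `ResponsibleAIPolicyViolation` removed here, and main.py now gathering with `return_exceptions=False`, a single failing API call aborts the whole run instead of being scored as 0. A minimal sketch of that fail-fast behaviour of `asyncio.gather`:

```python
import asyncio


async def ok(i: int) -> int:
    return i


async def boom() -> int:
    raise RuntimeError("API error")


async def main() -> None:
    try:
        # return_exceptions=False (the default): the first exception
        # propagates immediately; other awaitables keep running in the
        # background rather than being collected into the result list.
        await asyncio.gather(ok(1), boom(), ok(2))
    except RuntimeError as e:
        print(f"run aborted: {e}")


asyncio.run(main())
```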
languages.json CHANGED

@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":2679.0,
+    "commonvoice_hours":2683.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -32,7 +32,7 @@
     "flores_path":"hin_Deva",
     "fleurs_tag":"hi_in",
     "commonvoice_hours":16.0,
-    "commonvoice_locale":"hi-IN",
+    "commonvoice_locale":"hi",
     "in_benchmark":true
   },
   {
@@ -43,7 +43,7 @@
     "family":"Indo-European",
     "flores_path":"spa_Latn",
     "fleurs_tag":"es_419",
-    "commonvoice_hours":448.0,
+    "commonvoice_hours":449.0,
     "commonvoice_locale":"es",
     "in_benchmark":true
   },
@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":1068.0,
+    "commonvoice_hours":1072.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -127,7 +127,7 @@
     "family":"Indo-European",
     "flores_path":"rus_Cyrl",
     "fleurs_tag":"ru_ru",
-    "commonvoice_hours":245.0,
+    "commonvoice_hours":247.0,
     "commonvoice_locale":"ru",
     "in_benchmark":true
   },
@@ -139,7 +139,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"swh_Latn",
     "fleurs_tag":"sw_ke",
-    "commonvoice_hours":411.0,
+    "commonvoice_hours":412.0,
     "commonvoice_locale":"sw",
     "in_benchmark":true
   },
@@ -163,7 +163,7 @@
     "family":"Indo-European",
     "flores_path":"deu_Latn",
     "fleurs_tag":"de_de",
-    "commonvoice_hours":1371.0,
+    "commonvoice_hours":1372.0,
     "commonvoice_locale":"de",
     "in_benchmark":true
   },
@@ -1027,7 +1027,7 @@
     "family":"Uralic",
     "flores_path":"hun_Latn",
     "fleurs_tag":"hu_hu",
-    "commonvoice_hours":93.0,
+    "commonvoice_hours":94.0,
     "commonvoice_locale":"hu",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":1811.0,
+    "commonvoice_hours":1812.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -1207,7 +1207,7 @@
     "family":"Indo-European",
     "flores_path":"tgk_Cyrl",
     "fleurs_tag":"tg_tj",
-    "commonvoice_hours":0.4,
+    "commonvoice_hours":0.6,
     "commonvoice_locale":"tg",
     "in_benchmark":true
   },
@@ -1291,7 +1291,7 @@
     "family":"Indo-European",
     "flores_path":"cat_Latn",
     "fleurs_tag":"ca_es",
-    "commonvoice_hours":2878.0,
+    "commonvoice_hours":2883.0,
     "commonvoice_locale":"ca",
     "in_benchmark":true
   },
@@ -1303,7 +1303,7 @@
     "family":"Afro-Asiatic",
     "flores_path":"heb_Hebr",
     "fleurs_tag":"he_il",
-    "commonvoice_hours":1.7,
+    "commonvoice_hours":2.0,
     "commonvoice_locale":"he",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":427.0,
+    "commonvoice_hours":437.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1519,7 +1519,7 @@
     "family":"Indo-European",
     "flores_path":"kmr_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":69.0,
+    "commonvoice_hours":71.0,
     "commonvoice_locale":"kmr",
     "in_benchmark":true
   },
@@ -1555,7 +1555,7 @@
     "family":"Indo-European",
     "flores_path":"slk_Latn",
     "fleurs_tag":"sk_sk",
-    "commonvoice_hours":51.0,
+    "commonvoice_hours":52.0,
     "commonvoice_locale":"sk",
     "in_benchmark":true
   },
@@ -1675,7 +1675,7 @@
     "family":"Tupian",
     "flores_path":"gug_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":4.1,
+    "commonvoice_hours":4.5,
     "commonvoice_locale":"gn",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":1.5,
+    "commonvoice_hours":1.8,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2167,7 +2167,7 @@
     "family":"Indo-European",
     "flores_path":"glg_Latn",
     "fleurs_tag":"gl_es",
-    "commonvoice_hours":129.0,
+    "commonvoice_hours":162.0,
     "commonvoice_locale":"gl",
     "in_benchmark":true
   },
@@ -3175,8 +3175,8 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"seh",
     "in_benchmark":false
   },
   {
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":9.1,
+    "commonvoice_hours":9.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":452.0,
+    "commonvoice_hours":453.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
@@ -3559,7 +3559,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":94.0,
+    "commonvoice_hours":106.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
@@ -3679,7 +3679,7 @@
     "family":"Indo-European",
     "flores_path":"ydd_Hebr",
     "fleurs_tag":null,
-    "commonvoice_hours":1.4,
+    "commonvoice_hours":1.7,
     "commonvoice_locale":"yi",
     "in_benchmark":true
   },
@@ -4099,8 +4099,8 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"pcd",
     "in_benchmark":false
   },
   {
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":31.0,
+    "commonvoice_hours":32.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
@@ -5011,7 +5011,7 @@
     "family":"Nakh-Daghestanian",
     "flores_path":"dar_Cyrl",
     "fleurs_tag":null,
-    "commonvoice_hours":0.9,
+    "commonvoice_hours":1.3,
     "commonvoice_locale":"dar",
     "in_benchmark":true
   },
models.json CHANGED

@@ -1,15 +1,15 @@
 [
   {
-    "id": "amazon/nova-micro-v1",
-    "name": "Nova Micro 1.0",
-    "provider_name": "Amazon",
-    "cost": 0.14,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1733356800000,
-    "tasks": [
       "translation_from",
       "translation_to",
       "classification",
@@ -18,971 +18,5 @@
       "truthfulqa",
       "mgsm"
     ]
-  },
-  {
-    "id": "anthracite-org/magnum-v4-72b",
-    "name": "Magnum v4 72B",
-    "provider_name": "Magnum v4 72B",
-    "cost": 3.0,
-    "hf_id": "anthracite-org/magnum-v4-72b",
-    "size": 72706203648.0,
-    "type": "open-source",
-    "license": "Apache 2.0",
-    "creation_date": 1726790400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "anthropic/claude-sonnet-4",
-    "name": "Claude Sonnet 4",
-    "provider_name": "Anthropic",
-    "cost": 15.0,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1747872000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-chat",
-    "name": "DeepSeek V3",
-    "provider_name": "DeepSeek",
-    "cost": 0.72,
-    "hf_id": "deepseek-ai/DeepSeek-V3",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "",
-    "creation_date": 1735084800000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-chat-v3-0324",
-    "name": "DeepSeek V3 0324",
-    "provider_name": "DeepSeek",
-    "cost": 0.0,
-    "hf_id": "deepseek-ai/DeepSeek-V3-0324",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1742774400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "deepseek/deepseek-r1-0528",
-    "name": "R1 0528",
-    "provider_name": "DeepSeek",
-    "cost": 0.0,
-    "hf_id": "deepseek-ai/DeepSeek-R1-0528",
-    "size": 684531386000.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1748390400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemini-2.0-flash-lite-001",
-    "name": "Gemini 2.0 Flash Lite",
-    "provider_name": "Google",
-    "cost": 0.3,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1740441600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemini-2.5-flash",
-    "name": "Gemini 2.5 Flash",
-    "provider_name": "Google",
-    "cost": 2.5,
-    "hf_id": null,
-    "size": null,
-    "type": "closed-source",
-    "license": null,
-    "creation_date": 1750118400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemma-2-9b-it",
-    "name": "Gemma 2 9B",
-    "provider_name": "Google",
-    "cost": 0.0,
-    "hf_id": "google/gemma-2-9b-it",
-    "size": 9241705984.0,
-    "type": "open-source",
-    "license": "Gemma",
-    "creation_date": 1719187200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "google/gemma-3-27b-it",
-    "name": "Gemma 3 27B",
-    "provider_name": "Google",
-    "cost": 0.0,
-    "hf_id": "google/gemma-3-27b-it",
-    "size": 27432406640.0,
-    "type": "open-source",
-    "license": "Gemma",
-    "creation_date": 1740787200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3-70b-instruct",
-    "name": "Llama 3 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.4,
-    "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3",
-    "creation_date": 1713312000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.1-70b-instruct",
-    "name": "Llama 3.1 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.28,
-    "hf_id": "meta-llama/Llama-3.1-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3.1",
-    "creation_date": 1721088000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.2-3b-instruct",
-    "name": "Llama 3.2 3B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.0,
-    "hf_id": "meta-llama/Llama-3.2-3B-Instruct",
-    "size": 3212749824.0,
-    "type": "open-source",
-    "license": "Llama3.2",
-    "creation_date": 1726617600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-3.3-70b-instruct",
-    "name": "Llama 3.3 70B Instruct",
-    "provider_name": "Meta",
-    "cost": 0.0,
-    "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
-    "size": 70553706496.0,
-    "type": "open-source",
-    "license": "Llama3.3",
-    "creation_date": 1732579200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-4-maverick",
-    "name": "Llama 4 Maverick",
-    "provider_name": "Meta",
-    "cost": 0.6,
-    "hf_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
-    "size": 401583781376.0,
-    "type": "open-source",
-    "license": "Other",
-    "creation_date": 1743465600000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "meta-llama/llama-guard-4-12b",
-    "name": "Llama Guard 4 12B",
-    "provider_name": "Meta",
-    "cost": 0.18,
-    "hf_id": "meta-llama/Llama-Guard-4-12B",
-    "size": 12001097216.0,
-    "type": "open-source",
-    "license": "Other",
-    "creation_date": 1745366400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-3-medium-128k-instruct",
-    "name": "Phi-3 Medium 128K Instruct",
-    "provider_name": "Microsoft",
-    "cost": 1.0,
-    "hf_id": "microsoft/Phi-3-medium-128k-instruct",
-    "size": 13960238080.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1715040000000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-3.5-mini-128k-instruct",
-    "name": "Phi-3.5 Mini 128K Instruct",
-    "provider_name": "Microsoft",
-    "cost": 0.1,
-    "hf_id": "microsoft/Phi-3.5-mini-instruct",
-    "size": 3821079552.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1723766400000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
-    "id": "microsoft/phi-4",
-    "name": "Phi 4",
-    "provider_name": "Microsoft",
-    "cost": 0.14,
-    "hf_id": "microsoft/phi-4",
-    "size": 14659507200.0,
-    "type": "open-source",
-    "license": "Mit",
-    "creation_date": 1733875200000,
-    "tasks": [
-      "translation_from",
-      "translation_to",
-      "classification",
-      "mmlu",
-      "arc",
-      "truthfulqa",
-      "mgsm"
-    ]
-  },
-  {
- {
383
- "id": "microsoft/phi-4-multimodal-instruct",
384
- "name": "Phi 4 Multimodal Instruct",
385
- "provider_name": "Microsoft",
386
- "cost": 0.1,
387
- "hf_id": "microsoft/Phi-4-multimodal-instruct",
388
- "size": 5574460384.0,
389
- "type": "open-source",
390
- "license": "Mit",
391
- "creation_date": 1740355200000,
392
- "tasks": [
393
- "translation_from",
394
- "translation_to",
395
- "classification",
396
- "mmlu",
397
- "arc",
398
- "truthfulqa",
399
- "mgsm"
400
- ]
401
- },
402
- {
403
- "id": "mistralai/magistral-medium-2506",
404
- "name": "Magistral Medium 2506",
405
- "provider_name": "Mistral",
406
- "cost": 5.0,
407
- "hf_id": null,
408
- "size": null,
409
- "type": "closed-source",
410
- "license": null,
411
- "creation_date": 1749340800000,
412
- "tasks": [
413
- "translation_from",
414
- "translation_to",
415
- "classification",
416
- "mmlu",
417
- "arc",
418
- "truthfulqa",
419
- "mgsm"
420
- ]
421
- },
422
- {
423
- "id": "mistralai/mistral-7b-instruct",
424
- "name": "Mistral 7B Instruct",
425
- "provider_name": "Mistral",
426
- "cost": 0.0,
427
- "hf_id": "mistralai/Mistral-7B-Instruct-v0.3",
428
- "size": 7248023552.0,
429
- "type": "open-source",
430
- "license": "Apache 2.0",
431
- "creation_date": 1716336000000,
432
- "tasks": [
433
- "translation_from",
434
- "translation_to",
435
- "classification",
436
- "mmlu",
437
- "arc",
438
- "truthfulqa",
439
- "mgsm"
440
- ]
441
- },
442
- {
443
- "id": "mistralai/mistral-nemo",
444
- "name": "Mistral Nemo",
445
- "provider_name": "Mistral",
446
- "cost": 0.0,
447
- "hf_id": "mistralai/Mistral-Nemo-Instruct-2407",
448
- "size": 12247782400.0,
449
- "type": "open-source",
450
- "license": "Apache 2.0",
451
- "creation_date": 1721174400000,
452
- "tasks": [
453
- "translation_from",
454
- "translation_to",
455
- "classification",
456
- "mmlu",
457
- "arc",
458
- "truthfulqa",
459
- "mgsm"
460
- ]
461
- },
462
- {
463
- "id": "mistralai/mistral-saba",
464
- "name": "Saba",
465
- "provider_name": "Mistral",
466
- "cost": 0.6,
467
- "hf_id": null,
468
- "size": null,
469
- "type": "closed-source",
470
- "license": null,
471
- "creation_date": 1739750400000,
472
- "tasks": [
473
- "translation_from",
474
- "translation_to",
475
- "classification",
476
- "mmlu",
477
- "arc",
478
- "truthfulqa",
479
- "mgsm"
480
- ]
481
- },
482
- {
483
- "id": "mistralai/mistral-small-3.1-24b-instruct",
484
- "name": "Mistral Small 3.1 24B",
485
- "provider_name": "Mistral",
486
- "cost": 0.0,
487
- "hf_id": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
488
- "size": 24011361280.0,
489
- "type": "open-source",
490
- "license": "Apache 2.0",
491
- "creation_date": 1741651200000,
492
- "tasks": [
493
- "translation_from",
494
- "translation_to",
495
- "classification",
496
- "mmlu",
497
- "arc",
498
- "truthfulqa",
499
- "mgsm"
500
- ]
501
- },
502
- {
503
- "id": "mistralai/mixtral-8x7b-instruct",
504
- "name": "Mixtral 8x7B Instruct",
505
- "provider_name": "Mistral",
506
- "cost": 0.24,
507
- "hf_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
508
- "size": 46702792704.0,
509
- "type": "open-source",
510
- "license": "Apache 2.0",
511
- "creation_date": 1702166400000,
512
- "tasks": [
513
- "translation_from",
514
- "translation_to",
515
- "classification",
516
- "mmlu",
517
- "arc",
518
- "truthfulqa",
519
- "mgsm"
520
- ]
521
- },
522
- {
523
- "id": "neversleep/llama-3-lumimaid-70b",
524
- "name": "Llama 3 Lumimaid 70B",
525
- "provider_name": "NeverSleep",
526
- "cost": 6.0,
527
- "hf_id": "NeverSleep/Llama-3-Lumimaid-70B-v0.1",
528
- "size": 70553706496.0,
529
- "type": "open-source",
530
- "license": "Cc By Nc 4.0",
531
- "creation_date": 1714262400000,
532
- "tasks": [
533
- "translation_from",
534
- "translation_to",
535
- "classification",
536
- "mmlu",
537
- "arc",
538
- "truthfulqa",
539
- "mgsm"
540
- ]
541
- },
542
- {
543
- "id": "nvidia/llama-3.1-nemotron-70b-instruct",
544
- "name": "Llama 3.1 Nemotron 70B Instruct",
545
- "provider_name": "NVIDIA",
546
- "cost": 0.3,
547
- "hf_id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
548
- "size": 70553706496.0,
549
- "type": "open-source",
550
- "license": "Llama3.1",
551
- "creation_date": 1728691200000,
552
- "tasks": [
553
- "translation_from",
554
- "translation_to",
555
- "classification",
556
- "mmlu",
557
- "arc",
558
- "truthfulqa",
559
- "mgsm"
560
- ]
561
- },
562
- {
563
- "id": "openai/chatgpt-4o-latest",
564
- "name": "ChatGPT-4o",
565
- "provider_name": "OpenAI",
566
- "cost": 15.0,
567
- "hf_id": null,
568
- "size": null,
569
- "type": "closed-source",
570
- "license": null,
571
- "creation_date": 1723593600000,
572
- "tasks": [
573
- "translation_from",
574
- "translation_to",
575
- "classification",
576
- "mmlu",
577
- "arc",
578
- "truthfulqa",
579
- "mgsm"
580
- ]
581
- },
582
- {
583
- "id": "openai/gpt-3.5-turbo",
584
- "name": "GPT-3.5 Turbo",
585
- "provider_name": "OpenAI",
586
- "cost": 1.5,
587
- "hf_id": null,
588
- "size": null,
589
- "type": "closed-source",
590
- "license": null,
591
- "creation_date": 1685232000000,
592
- "tasks": [
593
- "translation_from",
594
- "translation_to",
595
- "classification",
596
- "mmlu",
597
- "arc",
598
- "truthfulqa",
599
- "mgsm"
600
- ]
601
- },
602
- {
603
- "id": "openai/gpt-3.5-turbo-0613",
604
- "name": "GPT-3.5 Turbo (older v0613)",
605
- "provider_name": "OpenAI",
606
- "cost": 2.0,
607
- "hf_id": null,
608
- "size": null,
609
- "type": "closed-source",
610
- "license": null,
611
- "creation_date": 1706140800000,
612
- "tasks": [
613
- "translation_from",
614
- "translation_to",
615
- "classification",
616
- "mmlu",
617
- "arc",
618
- "truthfulqa",
619
- "mgsm"
620
- ]
621
- },
622
- {
623
- "id": "openai/gpt-4.1",
624
- "name": "GPT-4.1",
625
- "provider_name": "OpenAI",
626
- "cost": 8.0,
627
- "hf_id": null,
628
- "size": null,
629
- "type": "closed-source",
630
- "license": null,
631
- "creation_date": 1744588800000,
632
- "tasks": [
633
- "translation_from",
634
- "translation_to",
635
- "classification",
636
- "mmlu",
637
- "arc",
638
- "truthfulqa",
639
- "mgsm"
640
- ]
641
- },
642
- {
643
- "id": "openai/gpt-4.1-mini",
644
- "name": "GPT-4.1 Mini",
645
- "provider_name": "OpenAI",
646
- "cost": 1.6,
647
- "hf_id": null,
648
- "size": null,
649
- "type": "closed-source",
650
- "license": null,
651
- "creation_date": 1744588800000,
652
- "tasks": [
653
- "translation_from",
654
- "translation_to",
655
- "classification",
656
- "mmlu",
657
- "arc",
658
- "truthfulqa",
659
- "mgsm"
660
- ]
661
- },
662
- {
663
- "id": "openai/gpt-4.1-nano",
664
- "name": "GPT-4.1 Nano",
665
- "provider_name": "OpenAI",
666
- "cost": 0.4,
667
- "hf_id": null,
668
- "size": null,
669
- "type": "closed-source",
670
- "license": null,
671
- "creation_date": 1744588800000,
672
- "tasks": [
673
- "translation_from",
674
- "translation_to",
675
- "classification",
676
- "mmlu",
677
- "arc",
678
- "truthfulqa",
679
- "mgsm"
680
- ]
681
- },
682
- {
683
- "id": "openai/gpt-4o-2024-11-20",
684
- "name": "GPT-4o (2024-11-20)",
685
- "provider_name": "OpenAI",
686
- "cost": 10.0,
687
- "hf_id": null,
688
- "size": null,
689
- "type": "closed-source",
690
- "license": null,
691
- "creation_date": 1732060800000,
692
- "tasks": [
693
- "translation_from",
694
- "translation_to",
695
- "classification",
696
- "mmlu",
697
- "arc",
698
- "truthfulqa",
699
- "mgsm"
700
- ]
701
- },
702
- {
703
- "id": "openai/gpt-4o-mini",
704
- "name": "GPT-4o-mini",
705
- "provider_name": "OpenAI",
706
- "cost": 0.6,
707
- "hf_id": null,
708
- "size": null,
709
- "type": "closed-source",
710
- "license": null,
711
- "creation_date": 1721260800000,
712
- "tasks": [
713
- "translation_from",
714
- "translation_to",
715
- "classification",
716
- "mmlu",
717
- "arc",
718
- "truthfulqa",
719
- "mgsm"
720
- ]
721
- },
722
- {
723
- "id": "openai/gpt-5",
724
- "name": "GPT-5",
725
- "provider_name": "OpenAI",
726
- "cost": 10.0,
727
- "hf_id": null,
728
- "size": null,
729
- "type": "closed-source",
730
- "license": null,
731
- "creation_date": 1754524800000,
732
- "tasks": [
733
- "translation_from",
734
- "translation_to",
735
- "classification",
736
- "mmlu",
737
- "arc",
738
- "truthfulqa",
739
- "mgsm"
740
- ]
741
- },
742
- {
743
- "id": "opengvlab/internvl3-14b",
744
- "name": "InternVL3 14B",
745
- "provider_name": "OpenGVLab",
746
- "cost": 0.4,
747
- "hf_id": "OpenGVLab/InternVL3-14B",
748
- "size": 15117256704.0,
749
- "type": "open-source",
750
- "license": "Apache 2.0",
751
- "creation_date": 1744243200000,
752
- "tasks": [
753
- "translation_from",
754
- "translation_to",
755
- "classification",
756
- "mmlu",
757
- "arc",
758
- "truthfulqa",
759
- "mgsm"
760
- ]
761
- },
762
- {
763
- "id": "qwen/qwen3-235b-a22b",
764
- "name": "Qwen3 235B A22B",
765
- "provider_name": "Qwen",
766
- "cost": 0.0,
767
- "hf_id": "Qwen/Qwen3-235B-A22B",
768
- "size": 235093634560.0,
769
- "type": "open-source",
770
- "license": "Apache 2.0",
771
- "creation_date": 1745712000000,
772
- "tasks": [
773
- "translation_from",
774
- "translation_to",
775
- "classification",
776
- "mmlu",
777
- "arc",
778
- "truthfulqa",
779
- "mgsm"
780
- ]
781
- },
782
- {
783
- "id": "qwen/qwen3-30b-a3b",
784
- "name": "Qwen3 30B A3B",
785
- "provider_name": "Qwen",
786
- "cost": 0.0,
787
- "hf_id": "Qwen/Qwen3-30B-A3B",
788
- "size": 30532122624.0,
789
- "type": "open-source",
790
- "license": "Apache 2.0",
791
- "creation_date": 1745712000000,
792
- "tasks": [
793
- "translation_from",
794
- "translation_to",
795
- "classification",
796
- "mmlu",
797
- "arc",
798
- "truthfulqa",
799
- "mgsm"
800
- ]
801
- },
802
- {
803
- "id": "qwen/qwen3-32b",
804
- "name": "Qwen3 32B",
805
- "provider_name": "Qwen",
806
- "cost": 0.07,
807
- "hf_id": "Qwen/Qwen3-32B",
808
- "size": 32762123264.0,
809
- "type": "open-source",
810
- "license": "Apache 2.0",
811
- "creation_date": 1745712000000,
812
- "tasks": [
813
- "translation_from",
814
- "translation_to",
815
- "classification",
816
- "mmlu",
817
- "arc",
818
- "truthfulqa",
819
- "mgsm"
820
- ]
821
- },
822
- {
823
- "id": "qwen/qwq-32b",
824
- "name": "QwQ 32B",
825
- "provider_name": "Qwen",
826
- "cost": 0.0,
827
- "hf_id": "Qwen/QwQ-32B",
828
- "size": 32763876352.0,
829
- "type": "open-source",
830
- "license": "Apache 2.0",
831
- "creation_date": 1741132800000,
832
- "tasks": [
833
- "translation_from",
834
- "translation_to",
835
- "classification",
836
- "mmlu",
837
- "arc",
838
- "truthfulqa",
839
- "mgsm"
840
- ]
841
- },
842
- {
843
- "id": "switchpoint/router",
844
- "name": "Switchpoint Router",
845
- "provider_name": "Switchpoint Router",
846
- "cost": 3.4,
847
- "hf_id": null,
848
- "size": null,
849
- "type": "closed-source",
850
- "license": null,
851
- "creation_date": 1752192000000,
852
- "tasks": [
853
- "translation_from",
854
- "translation_to",
855
- "classification",
856
- "mmlu",
857
- "arc",
858
- "truthfulqa",
859
- "mgsm"
860
- ]
861
- },
862
- {
863
- "id": "thedrummer/anubis-pro-105b-v1",
864
- "name": "Anubis Pro 105B V1",
865
- "provider_name": "TheDrummer",
866
- "cost": 1.0,
867
- "hf_id": "TheDrummer/Anubis-Pro-105B-v1",
868
- "size": 104779882496.0,
869
- "type": "open-source",
870
- "license": "Other",
871
- "creation_date": 1738454400000,
872
- "tasks": [
873
- "translation_from",
874
- "translation_to",
875
- "classification",
876
- "mmlu",
877
- "arc",
878
- "truthfulqa",
879
- "mgsm"
880
- ]
881
- },
882
- {
883
- "id": "thedrummer/skyfall-36b-v2",
884
- "name": "Skyfall 36B V2",
885
- "provider_name": "TheDrummer",
886
- "cost": 0.19,
887
- "hf_id": "TheDrummer/Skyfall-36B-v2",
888
- "size": 36910535680.0,
889
- "type": "open-source",
890
- "license": "Other",
891
- "creation_date": 1738540800000,
892
- "tasks": [
893
- "translation_from",
894
- "translation_to",
895
- "classification",
896
- "mmlu",
897
- "arc",
898
- "truthfulqa",
899
- "mgsm"
900
- ]
901
- },
902
- {
903
- "id": "tngtech/deepseek-r1t-chimera",
904
- "name": "DeepSeek R1T Chimera",
905
- "provider_name": "TNG",
906
- "cost": 0.0,
907
- "hf_id": "tngtech/DeepSeek-R1T-Chimera",
908
- "size": 684531386000.0,
909
- "type": "open-source",
910
- "license": "Mit",
911
- "creation_date": 1745625600000,
912
- "tasks": [
913
- "translation_from",
914
- "translation_to",
915
- "classification",
916
- "mmlu",
917
- "arc",
918
- "truthfulqa",
919
- "mgsm"
920
- ]
921
- },
922
- {
923
- "id": "tngtech/deepseek-r1t2-chimera",
924
- "name": "DeepSeek R1T2 Chimera",
925
- "provider_name": "TNG",
926
- "cost": 0.0,
927
- "hf_id": "tngtech/DeepSeek-TNG-R1T2-Chimera",
928
- "size": 684531386000.0,
929
- "type": "open-source",
930
- "license": "Mit",
931
- "creation_date": 1751414400000,
932
- "tasks": [
933
- "translation_from",
934
- "translation_to",
935
- "classification",
936
- "mmlu",
937
- "arc",
938
- "truthfulqa",
939
- "mgsm"
940
- ]
941
- },
942
- {
943
- "id": "x-ai/grok-2-1212",
944
- "name": "Grok 2 1212",
945
- "provider_name": "xAI",
946
- "cost": 10.0,
947
- "hf_id": null,
948
- "size": null,
949
- "type": "closed-source",
950
- "license": null,
951
- "creation_date": 1734220800000,
952
- "tasks": [
953
- "translation_from",
954
- "translation_to",
955
- "classification",
956
- "mmlu",
957
- "arc",
958
- "truthfulqa",
959
- "mgsm"
960
- ]
961
- },
962
- {
963
- "id": "google/translate-v2",
964
- "name": "Google Translate",
965
- "provider_name": "Google",
966
- "cost": 20.0,
967
- "hf_id": null,
968
- "size": null,
969
- "type": "closed-source",
970
- "license": null,
971
- "creation_date": null,
972
- "tasks": [
973
- "translation_from",
974
- "translation_to"
975
- ]
976
- },
977
- {
978
- "id": "moonshotai/kimi-k2",
979
- "name": "Kimi K2",
980
- "provider_name": "Moonshot AI",
981
- "size": null,
982
- "type": "closed-source",
983
- "cost": 0.6,
984
- "hf_id": null,
985
- "creation_date": null,
986
- "license": null
987
  }
988
  ]
 
  [
  {
+ "id":"openai\/gpt-5-nano",
+ "name":"GPT-5 Nano",
+ "provider_name":"OpenAI",
+ "cost":0.4,
+ "hf_id":null,
+ "size":null,
+ "type":"closed-source",
+ "license":null,
+ "creation_date":1754524800000,
+ "tasks":[
  "translation_from",
  "translation_to",
  "classification",
  "mmlu",
  "arc",
  "truthfulqa",
  "mgsm"
  ]
  }
  ]
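
Every entry in models.json follows the same schema: `id` (the OpenRouter model slug), `name`, `provider_name`, `cost`, `hf_id` and `size` (null for closed-source models), `type`, `license`, `creation_date` (Unix epoch in milliseconds, null when unknown), and a `tasks` list (absent on some entries, e.g. Kimi K2). A minimal sketch of reading the registry, assuming models.json sits in the repo root as the workflow's `git add` step suggests:

```python
import json
from datetime import datetime, timezone

# Load the model registry committed above.
with open("models.json") as f:
    models = json.load(f)

# List all models evaluated on MGSM, with their release date where known.
for m in models:
    if "mgsm" not in m.get("tasks", []):  # "tasks" is missing on some entries
        continue
    if m["creation_date"] is not None:
        # creation_date is a Unix timestamp in milliseconds
        released = datetime.fromtimestamp(
            m["creation_date"] / 1000, tz=timezone.utc
        ).date()
    else:
        released = "unknown"
    print(f"{m['name']:<35} {m['provider_name']:<20} {released}")
```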
results.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1a3f388fd054fc570366705f1b8d6cb65bd6353164482d3d2c71ccec742d6158
- size 57534940

  version https://git-lfs.github.com/spec/v1
+ oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
+ size 8030558
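
results.json is tracked with Git LFS, so the diff shows only the pointer file: the content hash changed and the payload shrank from 57,534,940 to 8,030,558 bytes (~57.5 MB to ~8.0 MB), plausibly reflecting the cleaned-up eval runs. A pointer file is just three `key value` lines; a minimal sketch for reading one:

```python
# Minimal sketch: parse a Git LFS pointer file such as the one above.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, oid = fields["oid"].split(":", 1)
    return {
        "version": fields["version"],  # LFS spec URL
        "algo": algo,                  # e.g. "sha256"
        "oid": oid,                    # content hash of the real file
        "size": int(fields["size"]),   # true file size in bytes
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
size 8030558"""
assert parse_lfs_pointer(pointer)["size"] == 8030558
```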