Upload from GitHub Actions: updated and cleaned up scripts for new eval runs
Files changed:
- .github/workflows/nightly-evals.yml +4 -24
- evals/datasets_/mgsm.py +29 -22
- evals/datasets_/mmlu.py +34 -15
- evals/main.py +88 -232
- evals/models.py +3 -35
- evals/tasks.py +69 -82
- languages.json +28 -28
- models.json +10 -976
- results.json +2 -2
.github/workflows/nightly-evals.yml
CHANGED
@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
     timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
@@ -22,7 +23,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev

-      - name: Run evaluations
+      - name: Run evaluations
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -31,28 +32,7 @@
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-
-         # Run evaluations with periodic checkpointing
-         uv run python -c "
-         import time
-         import subprocess
-         import json
-         import os
-
-         # Check if we have existing results to resume from
-         if os.path.exists('results.json'):
-             print('Found existing results.json, will resume from checkpoint')
-
-         # Run the main evaluation
-         try:
-             subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
-         except subprocess.CalledProcessError as e:
-             print(f'Evaluation failed: {e}')
-             # Save current state even if failed
-             if os.path.exists('results.json'):
-                 print('Saving checkpoint before exit...')
-             exit(1)
-         "
+         uv run evals/main.py

       - name: Commit changes
         env:
@@ -62,7 +42,7 @@ jobs:
          git config --local user.name "github-actions[bot]"
          git config --local --unset-all http.https://github.com/.extraheader
          git remote set-url origin https://${GH_PAT}@github.com/datenlabor-bmz/ai-language-monitor.git
-         git add results.json models.json languages.json
+         git add results.json models.json languages.json
          git commit -m "Update evaluation results" || echo "No changes to commit"
          git push origin HEAD:main
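With the inline checkpoint wrapper gone, resuming an interrupted run now rests entirely on the merge step inside evals/main.py, which drops (model, language, task) combinations that already have a metric recorded in results.json. A minimal, self-contained sketch of that filter; the toy rows here are illustrative, not from the repo:

import pandas as pd

# One combination already scored in results.json ...
old_results = pd.DataFrame(
    [{"model": "m", "bcp_47": "en", "task": "mmlu", "metric": "accuracy", "score": 0.8}]
)
# ... and two candidate combinations for the next run.
combis = pd.DataFrame(
    [("m", "en", "mmlu"), ("m", "de", "mmlu")], columns=["model", "bcp_47", "task"]
)

# Left-join against previous results; rows without a metric still need evaluating.
combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
print(combis)  # only the ("m", "de", "mmlu") row remains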
evals/datasets_/mgsm.py
CHANGED
@@ -3,7 +3,7 @@ import os
 import random

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -39,32 +39,39 @@ def parse_number(i):
     return None


+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Cache individual MGSM items efficiently"""
+    try:
+        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
+        if nr >= len(ds):
+            return None
+
+        row = ds[nr]
+
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+
+        return row
+    except Exception:
+        # Dataset doesn't exist or doesn't have test split
+        return None
+
+
 def load_mgsm(language_bcp_47, nr):
-    print(f"Loading MGSM data for {language_bcp_47}...")
     if language_bcp_47 in tags_mgsm.keys():
-        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return (slug_mgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
-        ds = _load_dataset(
-            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
-        )
-        return slug_afrimgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return (slug_afrimgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row, "machine"
+        item = _get_mgsm_item(slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True)
+        return (slug_gsm8kx, item, "machine") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated,
-            subset=tags_gsm_autotranslated[language_bcp_47],
-            split="test",
-        )
-        return slug_gsm_autotranslated, ds[nr], "machine"
+        item = _get_mgsm_item(slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr)
+        return (slug_gsm_autotranslated, item, "machine") if item else (None, None, None)
     else:
         return None, None, None
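The new _get_mgsm_item helper leans on a `cache` decorator imported from datasets_.util, whose implementation is not part of this diff. A minimal sketch of an equivalent disk-backed decorator, assuming joblib; the actual utility may differ:

from joblib import Memory

memory = Memory(".cache", verbose=0)
cache = memory.cache  # decorator: memoizes return values on disk, keyed by arguments

@cache
def get_item(dataset_slug: str, subset: str, nr: int):
    # The first call per (dataset_slug, subset, nr) does the expensive load;
    # later calls with the same arguments are served from .cache/ on disk.
    ...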
evals/datasets_/mmlu.py
CHANGED
@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict

 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -144,32 +144,51 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated =
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}

 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )


+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Cache processed datasets to avoid reprocessing"""
+    ds = _load_dataset(dataset_name, subset_tag)
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds
+
+
+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Cache individual MMLU items efficiently"""
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    filtered = ds["test"].filter(lambda x: x["subject"] == category)
+    return filtered[nr] if nr < len(filtered) else None
+
+
 async def load_mmlu(language_bcp_47, nr):
-    print(f"Loading MMLU data for {language_bcp_47}...")
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
-        ds = ds.map(parse_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu", task, "human"
+        task = _get_mmlu_item("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr)
+        return ("masakhane/afrimmlu", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
-        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
-        ds = ds.map(add_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU", task, "human"
     # TODO: add in Okapi, MMLUX @Jonas
+        task = _get_mmlu_item("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr)
+        return ("CohereForAI/Global-MMLU", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        task = filtered[nr]
-        return "fair-forward/mmlu-autotranslated", task, "machine"
+        task = _get_mmlu_item("fair-forward/mmlu-autotranslated", language_bcp_47, category, nr)
+        return ("fair-forward/mmlu-autotranslated", task, "machine") if task else (None, None, None)
     else:
         return None, None, None
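Both loaders now share one calling convention: a (dataset, task, origin) triple where origin is "human" or "machine", and all three values are None when a language or index has no data. A hypothetical caller; the language tag and index are illustrative:

import asyncio

from datasets_.mmlu import load_mmlu

async def main():
    dataset, task, origin = await load_mmlu("sw", 7)  # Swahili, 8th item
    if task is None:
        print("no MMLU data for this language")
    else:
        print(dataset, origin, task["subject"])

asyncio.run(main())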
evals/main.py
CHANGED
@@ -1,271 +1,127 @@
 import asyncio
 import pandas as pd
 import time
-import os
 from datetime import datetime, timedelta
-from tqdm.asyncio import tqdm_asyncio
 from models import models
 from tasks import tasks
 from languages import languages
-import json
-
-results = pd.DataFrame()
-
-def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
-    """Save current progress as checkpoint"""
-    try:
-        args = dict(orient="records", indent=2, force_ascii=False)
-
-        # Save current results
-        if len(results_df) > 0:
-            results_df.to_json("results.json", **args)
-            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
-
-        # Save model and language info
-        models_df.to_json("models.json", **args)
-        languages_df.to_json("languages.json", **args)
-
-        # Save checkpoint metadata
-        checkpoint_info = {
-            "last_batch": batch_num,
-            "total_batches": total_batches,
-            "timestamp": datetime.now().isoformat(),
-            "results_count": len(results_df)
-        }
-        with open("checkpoint.json", "w") as f:
-            json.dump(checkpoint_info, f, indent=2)
-
-    except Exception as e:
-        print(f"⚠️ Failed to save checkpoint: {e}")
-
-def load_checkpoint():
-    """Load previous checkpoint if available"""
-    try:
-        if os.path.exists("checkpoint.json"):
-            with open("checkpoint.json", "r") as f:
-                checkpoint = json.load(f)
-            print(f"📂 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
-            return checkpoint
-    except Exception as e:
-        print(f"⚠️ Failed to load checkpoint: {e}")
-    return None
+import os

 async def evaluate():
-    n_sentences = int(os.environ.get("N_SENTENCES",
+    # Configuration - easily adjustable defaults
+    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
+    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
+    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
+    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving

-    # Load models and languages
     models_df = pd.DataFrame(models)
     languages_df = pd.DataFrame(languages)
+    top_languages = languages.head(max_languages)
+
+    # Filter to single model if specified
+    if single_model:
+        models_df = models_df[models_df["id"] == single_model]
+        if len(models_df) == 0:
+            print(f"Error: Model '{single_model}' not found. Available models:")
+            for model_id in pd.DataFrame(models)["id"]:
+                print(f"  {model_id}")
+            return pd.DataFrame()

+    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
+    if test_mode:
+        print("TEST MODE: Skipping results loading/saving")
     start_time = time.time()
-    print(f"🚀 Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"📊 Evaluating {n_sentences} sentences per task")

-    # Load checkpoint if available
-    checkpoint = load_checkpoint()
-    start_batch = 0
-    if checkpoint:
-        start_batch = checkpoint['last_batch']
-        print(f"🔄 Resuming from batch {start_batch}")
-
-    # For testing, just use all available languages up to max_languages
-    for n_languages in [min(max_languages, len(top_languages))]:
-        print(f"running evaluations for {n_languages} languages")
-
-        # Load existing results
+    # Load existing results to avoid re-evaluation (skip in test mode)
+    if test_mode:
+        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+    else:
         try:
             old_results = pd.read_json("results.json")
             if old_results.empty:
                 old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
         except FileNotFoundError:
             old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])

-        try:
-            old_models = pd.read_json("models.json")
-        except FileNotFoundError:
-            old_models = pd.DataFrame()
-
-        # get all combinations of model, language and task
-        combis = [
-            (model, lang.bcp_47, task_name)
-            for model in models_df["id"]
-            for lang in top_languages.iloc[:n_languages].itertuples()
-            for task_name, task in tasks.items()
-            if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-        ]
-        # filter out combinations that have already been evaluated
-        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-        # run evaluations in batches to prevent HTTP pool exhaustion
-        all_tasks = []
-        for i in range(n_sentences):
-            for model, bcp_47, task_name in combis.itertuples(index=False):
-                # All tasks now use the same signature
-                all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-        print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
-
-        batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
-        all_results = []
-
-        # Calculate total batches for progress tracking
-        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
-
-        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
-            batch = all_tasks[i:i+batch_size]
-            current_batch = i // batch_size + 1
-
-            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
-
-            # Show what's being evaluated in this batch
-            batch_summary = {}
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                # Extract task name from function - handle both partial functions and regular functions
-                if hasattr(task_func, 'func'):
-                    task_name = task_func.func.__name__.replace('_and_evaluate', '')
-                else:
-                    task_name = task_func.__name__.replace('_and_evaluate', '')
-
-                if task_name not in batch_summary:
-                    batch_summary[task_name] = set()
-                batch_summary[task_name].add(bcp_47)
-
-            for task_name, languages_set in batch_summary.items():
-                lang_list = ', '.join(sorted(languages_set))
-                print(f"  🔄 {task_name}: {lang_list}")
-
-            batch_coroutines = []
-            for task_data in batch:
-                task_func, model, bcp_47, sentence_nr = task_data
-                batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-
-            try:
-                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-                all_results.extend(batch_results)
-
-                # Save checkpoint after each batch
-                valid_results = []
-                exception_count = 0
-                for r in batch_results:
-                    if isinstance(r, Exception):
-                        exception_count += 1
-                        continue
-                    if isinstance(r, list):
-                        valid_results.extend(r)
-                    else:
-                        valid_results.append(r)
-
-                if valid_results:
-                    # Aggregate results
-                    batch_df = pd.DataFrame(valid_results)
-                    if len(batch_df) > 0:
-                        batch_df = (
-                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                            .agg({"score": "mean"})
-                            .reset_index()
-                        )
-                        # Merge with existing results
-                        all_results_df = pd.concat([old_results, batch_df])
-                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-
-                        # Save checkpoint
-                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
-
-                        # Update old_results for next batch
-                        old_results = all_results_df
-
-                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
-
-            except Exception as e:
-                print(f"❌ Batch {current_batch} failed: {e}")
-                # Save checkpoint even on failure
-                if len(all_results) > 0:
-                    results_df = pd.DataFrame(all_results)
-                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
-                continue
-
-            # Reduced delay between batches (optimized for GitHub Actions)
-            await asyncio.sleep(0.5)
-
-        # Final aggregation and save
-        results = all_results
-        # Filter out exceptions and flatten results
+    # Get all combinations that need evaluation
+    combis = [
+        (model, lang.bcp_47, task_name)
+        for model in models_df["id"]
+        for lang in top_languages.itertuples()
+        for task_name, task in tasks.items()
+        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
+    ]
+
+    # Filter out already evaluated combinations
+    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+
+    # Create all evaluation tasks
+    all_tasks = []
+    for i in range(n_sentences):
+        for model, bcp_47, task_name in combis.itertuples(index=False):
+            all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+    print(f"Running {len(all_tasks)} evaluation tasks...")
+
+    # Run all tasks with simple asyncio.gather, but stop on first error
+    try:
+        results = await asyncio.gather(
+            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
+            return_exceptions=False  # This will raise on first exception
+        )

+        # Process results - no exceptions should reach here
         valid_results = []
-        exception_count = 0
         for r in results:
-            if isinstance(r, Exception):
-                exception_count += 1
-                continue
             if isinstance(r, list):
                 valid_results.extend(r)
             else:
                 valid_results.append(r)

+        print(f"Completed: {len(valid_results)} valid results")

+    except Exception as e:
+        print(f"EVALUATION STOPPED - API Error occurred:")
+        print(f"Error type: {type(e).__name__}")
+        print(f"Error message: {str(e)}")
+        return pd.DataFrame()
+
+    # Save results (skip in test mode)
     if valid_results:
-        results_df = pd.DataFrame(valid_results)
-        results_df = (
-            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-        # Merge with old results
-        old_results = pd.read_json("results.json")
-        results_df = pd.concat([old_results, results_df])
-        results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-        results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-        results_df.to_json("results.json", **args)
-    else:
-        print("⚠️ No valid results to save - all API calls failed")
-
-    # Save up-to-date info on models and languages (like main branch)
-    all_models = pd.concat([pd.DataFrame(models), old_models])
-    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-    all_models.to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
-
-    # Time estimation
-    elapsed = time.time() - start_time
-    elapsed_str = str(timedelta(seconds=int(elapsed)))
-    if n_languages < max_languages:
-        remaining_batches = (max_languages - n_languages) // 10
-        batch_count = max(1, n_languages // 10)  # Avoid division by zero
-        estimated_remaining = elapsed * remaining_batches / batch_count
-        eta = datetime.now() + timedelta(seconds=estimated_remaining)
-        print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
-    else:
-        print(f"✅ Full evaluation completed in {elapsed_str}")
-        print(f"🎉 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # Clean up checkpoint file on successful completion
-    if os.path.exists("checkpoint.json"):
-        os.remove("checkpoint.json")
-        print("🧹 Cleaned up checkpoint file")
+        results_df = pd.DataFrame(valid_results)
+
+        # Aggregate results
+        results_df = (
+            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+            .agg({"score": "mean"})
+            .reset_index()
+        )
+
+        if not test_mode:
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Merge with existing results
+            if not old_results.empty:
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+
+            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results_df.to_json("results.json", **args)
+
+            # Save model and language info
+            models_df.to_json("models.json", **args)
+            languages_df.to_json("languages.json", **args)
+        else:
+            print("TEST MODE: Skipping results saving")

+        elapsed = time.time() - start_time
+        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")

+        return results_df
+
+    return pd.DataFrame()


 if __name__ == "__main__":
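The new environment knobs make a quick smoke test possible without touching results.json. A hypothetical local invocation; the bare "main" import path assumes the working directory is evals/:

import asyncio
import os

# Shrink the run and skip all results.json reads/writes.
os.environ["N_SENTENCES"] = "2"
os.environ["MAX_LANGUAGES"] = "3"
os.environ["SINGLE_MODEL"] = "openai/gpt-4.1-nano"
os.environ["TEST"] = "1"

from main import evaluate  # evals/main.py

df = asyncio.run(evaluate())
print(df)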
evals/models.py
CHANGED
@@ -27,7 +27,8 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
+    "openai/gpt-5",
+    "openai/gpt-5-nano", # include if/when available
     "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
@@ -96,9 +97,6 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
-    if len(slugs) == 0:
-        # the problem is that free models typically have very high rate-limiting
-        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None


@@ -132,18 +130,11 @@ def get_historical_popular_models(date: date):
            for model_slug, count in sorted_models[:20]: # Top 20
                result.append({"slug": model_slug, "count": int(count)})

-           print(f"✅ Historical OpenRouter models: {len(result)} models fetched")
-           if result:
-               print(f"  Top 5: {[m['slug'] for m in result[:5]]}")
-               print(f"  Sample counts: {[m['count'] for m in result[:3]]}")
            return result
        else:
-           print("⚠️ Could not find model ranking data in OpenRouter response")
            return []

    except Exception as e:
-       print(f"⚠️ Error fetching OpenRouter historical rankings: {e}")
-       print("🔄 Falling back to static model list")
        return []

@@ -176,18 +167,11 @@ def get_current_popular_models(date: date):
            for model_slug, count in sorted_models[:10]: # Top 10
                result.append({"slug": model_slug, "count": int(count)})

-           print(f"✅ Current OpenRouter models: {len(result)} models fetched")
-           if result:
-               print(f"  Top 5: {[m['slug'] for m in result[:5]]}")
-               print(f"  Sample counts: {[m['count'] for m in result[:3]]}")
            return result
        else:
-           print("⚠️ Could not find daily ranking data in OpenRouter response")
            return []

    except Exception as e:
-       print(f"⚠️ Error fetching OpenRouter current rankings: {e}")
-       print("🔄 Falling back to static model list")
        return []

@@ -244,16 +228,13 @@ async def complete(**kwargs) -> str | None:
            return None
        raise e
    except asyncio.TimeoutError:
-       print(f"⏰ Timeout after {timeout}s for model {model_id}")
        return None
    if not response.choices:
        raise Exception(response)
    return response.choices[0].message.content.strip()

-
 translate_client = None

-
 def get_google_translate_client():
     global translate_client
     if translate_client is None:
@@ -364,7 +345,7 @@ def get_cost(row):
     return None


-
+#@cache
 def load_models(date: date):
     popular_models = (
         get_historical_popular_models(date.today())[:20]
@@ -374,25 +355,12 @@ def load_models(date: date):
     all_model_candidates = set(important_models + popular_models) - set(blocklist)

     # Validate models exist on OpenRouter before including them
-    print(f"🔍 Validating {len(all_model_candidates)} model candidates...")
     valid_models = []
-    invalid_models = []

     for model_id in all_model_candidates:
         metadata = get_or_metadata(model_id)
         if metadata is not None:
             valid_models.append(model_id)
-        else:
-            invalid_models.append(model_id)
-
-    if invalid_models:
-        print(f"⚠️ Excluded {len(invalid_models)} invalid models:")
-        for model in sorted(invalid_models)[:5]: # Show first 5
-            print(f"  - {model}")
-        if len(invalid_models) > 5:
-            print(f"  ... and {len(invalid_models) - 5} more")
-
-    print(f"✅ Using {len(valid_models)} valid models for evaluation")

     models = pd.DataFrame(sorted(valid_models), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
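load_models keeps the validation loop but drops its logging: get_or_metadata returns None for ids unknown to OpenRouter, and those candidates are now silently excluded. A sketch of the pattern; the candidate ids are illustrative, and the import assumes evals/ is on the path:

from models import get_or_metadata  # evals/models.py

candidates = {"openai/gpt-4.1-nano", "openai/gpt-5", "not/a-real-model"}

valid_models = []
for model_id in candidates:
    metadata = get_or_metadata(model_id)  # None if OpenRouter doesn't know the id
    if metadata is not None:
        valid_models.append(model_id)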
evals/tasks.py
CHANGED
@@ -11,10 +11,8 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
-from google.cloud import translate_v2 as translate
-from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe
+from models import complete, transcribe

 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -45,32 +43,20 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    else:
-        prediction = await complete(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
-                }
-            ],
-            temperature=0,
-            max_tokens=1024,
-        )
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
+    prediction = await complete(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": translation_prompt,
+            }
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -83,6 +69,9 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
+
+
+
     return [
         {
             "model": model,
@@ -120,12 +109,16 @@ Reply with only the topic name.
 Text:
 {test_paragraph.text}
 """
-    pred = await complete(
+    response = await complete(
         model=model,
         messages=[{"role": "user", "content": prompt}],
         temperature=0,
         max_tokens=30,
-    )
+    )
+
+    pred = response.lower().strip() if response else ""
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -136,6 +129,8 @@ Text:
         if pred
         else 0
     )
+
+
     return [
         {
             "model": model,
@@ -228,23 +223,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
+
+
     return [
         {
@@ -276,23 +268,18 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
     return [
         {
             "model": model,
@@ -349,23 +336,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice_truthfulqa(task)}""",
         },
     ]
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,  # Increased for reasoning
+    )
+    if response and "####" in response:
+        pred_answer = response.split("####")[-1].strip()
+        acc = int(pred_answer[:1].upper() == answer)
+    else:
+        acc = 0
+        pred_answer = "NO_ANSWER"
+
+
     return [
         {
             "model": model,
@@ -407,6 +391,9 @@ Response format: <reasoning> #### <number>
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
         accuracy = 0
+        number = "NO_ANSWER"
+
+
     return [
         {
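All multiple-choice tasks in this commit converge on one reply contract: the model answers "<reasoning> #### <letter>", everything after the last "####" is the answer, and "NO_ANSWER" is recorded otherwise. The rule in isolation, as a standalone sketch:

def parse_choice(response):
    """Return the first character after the last '####', or 'NO_ANSWER'."""
    if response and "####" in response:
        return response.split("####")[-1].strip()[:1]
    return "NO_ANSWER"

assert parse_choice("B is the only prime. #### B") == "B"
assert parse_choice(None) == "NO_ANSWER"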
languages.json
CHANGED
@@ -7,7 +7,7 @@
     "family":"Indo-European",
     "flores_path":"eng_Latn",
     "fleurs_tag":"en_us",
-    "commonvoice_hours":
+    "commonvoice_hours":2683.0,
     "commonvoice_locale":"en",
     "in_benchmark":true
   },
@@ -32,7 +32,7 @@
     "flores_path":"hin_Deva",
     "fleurs_tag":"hi_in",
     "commonvoice_hours":16.0,
-    "commonvoice_locale":"hi
+    "commonvoice_locale":"hi",
     "in_benchmark":true
   },
   {
@@ -43,7 +43,7 @@
     "family":"Indo-European",
     "flores_path":"spa_Latn",
     "fleurs_tag":"es_419",
-    "commonvoice_hours":
+    "commonvoice_hours":449.0,
     "commonvoice_locale":"es",
     "in_benchmark":true
   },
@@ -79,7 +79,7 @@
     "family":"Indo-European",
     "flores_path":"fra_Latn",
     "fleurs_tag":"fr_fr",
-    "commonvoice_hours":
+    "commonvoice_hours":1072.0,
     "commonvoice_locale":"fr",
     "in_benchmark":true
   },
@@ -127,7 +127,7 @@
     "family":"Indo-European",
     "flores_path":"rus_Cyrl",
     "fleurs_tag":"ru_ru",
-    "commonvoice_hours":
+    "commonvoice_hours":247.0,
     "commonvoice_locale":"ru",
     "in_benchmark":true
   },
@@ -139,7 +139,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"swh_Latn",
     "fleurs_tag":"sw_ke",
-    "commonvoice_hours":
+    "commonvoice_hours":412.0,
     "commonvoice_locale":"sw",
     "in_benchmark":true
   },
@@ -163,7 +163,7 @@
     "family":"Indo-European",
     "flores_path":"deu_Latn",
     "fleurs_tag":"de_de",
-    "commonvoice_hours":
+    "commonvoice_hours":1372.0,
     "commonvoice_locale":"de",
     "in_benchmark":true
   },
@@ -1027,7 +1027,7 @@
     "family":"Uralic",
     "flores_path":"hun_Latn",
     "fleurs_tag":"hu_hu",
-    "commonvoice_hours":
+    "commonvoice_hours":94.0,
     "commonvoice_locale":"hu",
     "in_benchmark":true
   },
@@ -1183,7 +1183,7 @@
     "family":"Indo-European",
     "flores_path":"bel_Cyrl",
     "fleurs_tag":"be_by",
-    "commonvoice_hours":
+    "commonvoice_hours":1812.0,
     "commonvoice_locale":"be",
     "in_benchmark":true
   },
@@ -1207,7 +1207,7 @@
     "family":"Indo-European",
     "flores_path":"tgk_Cyrl",
     "fleurs_tag":"tg_tj",
-    "commonvoice_hours":0.
+    "commonvoice_hours":0.6,
     "commonvoice_locale":"tg",
     "in_benchmark":true
   },
@@ -1291,7 +1291,7 @@
     "family":"Indo-European",
     "flores_path":"cat_Latn",
     "fleurs_tag":"ca_es",
-    "commonvoice_hours":
+    "commonvoice_hours":2883.0,
     "commonvoice_locale":"ca",
     "in_benchmark":true
   },
@@ -1303,7 +1303,7 @@
     "family":"Afro-Asiatic",
     "flores_path":"heb_Hebr",
     "fleurs_tag":"he_il",
-    "commonvoice_hours":
+    "commonvoice_hours":2.0,
     "commonvoice_locale":"he",
     "in_benchmark":true
   },
@@ -1375,7 +1375,7 @@
     "family":"Turkic",
     "flores_path":"uig_Arab",
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":437.0,
     "commonvoice_locale":"ug",
     "in_benchmark":true
   },
@@ -1519,7 +1519,7 @@
     "family":"Indo-European",
     "flores_path":"kmr_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":71.0,
     "commonvoice_locale":"kmr",
     "in_benchmark":true
   },
@@ -1555,7 +1555,7 @@
     "family":"Indo-European",
     "flores_path":"slk_Latn",
     "fleurs_tag":"sk_sk",
-    "commonvoice_hours":
+    "commonvoice_hours":52.0,
     "commonvoice_locale":"sk",
     "in_benchmark":true
   },
@@ -1675,7 +1675,7 @@
     "family":"Tupian",
     "flores_path":"gug_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":4.
+    "commonvoice_hours":4.5,
     "commonvoice_locale":"gn",
     "in_benchmark":true
   },
@@ -1747,7 +1747,7 @@
     "family":"Indo-European",
     "flores_path":"nob_Latn",
     "fleurs_tag":"nb_no",
-    "commonvoice_hours":1.
+    "commonvoice_hours":1.8,
     "commonvoice_locale":"nb-NO",
     "in_benchmark":true
   },
@@ -2167,7 +2167,7 @@
     "family":"Indo-European",
     "flores_path":"glg_Latn",
     "fleurs_tag":"gl_es",
-    "commonvoice_hours":
+    "commonvoice_hours":162.0,
     "commonvoice_locale":"gl",
     "in_benchmark":true
   },
@@ -3175,8 +3175,8 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":
-    "commonvoice_locale":
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"seh",
     "in_benchmark":false
   },
   {
@@ -3331,7 +3331,7 @@
     "family":"Indo-European",
     "flores_path":"gle_Latn",
     "fleurs_tag":"ga_ie",
-    "commonvoice_hours":9.
+    "commonvoice_hours":9.3,
     "commonvoice_locale":"ga-IE",
     "in_benchmark":true
   },
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":453.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
@@ -3559,7 +3559,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":106.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
@@ -3679,7 +3679,7 @@
     "family":"Indo-European",
     "flores_path":"ydd_Hebr",
     "fleurs_tag":null,
-    "commonvoice_hours":1.
+    "commonvoice_hours":1.7,
     "commonvoice_locale":"yi",
     "in_benchmark":true
   },
@@ -4099,8 +4099,8 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":
-    "commonvoice_locale":
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"pcd",
     "in_benchmark":false
   },
   {
@@ -4651,7 +4651,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":32.0,
     "commonvoice_locale":"ady",
     "in_benchmark":false
   },
@@ -5011,7 +5011,7 @@
     "family":"Nakh-Daghestanian",
     "flores_path":"dar_Cyrl",
     "fleurs_tag":null,
-    "commonvoice_hours":
+    "commonvoice_hours":1.3,
     "commonvoice_locale":"dar",
     "in_benchmark":true
   },
models.json
CHANGED
@@ -1,15 +1,15 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"id":
|
4 |
-
"name":
|
5 |
-
"provider_name":
|
6 |
-
"cost":
|
7 |
-
"hf_id":
|
8 |
-
"size":
|
9 |
-
"type":
|
10 |
-
"license":
|
11 |
-
"creation_date":
|
12 |
-
"tasks":
|
13 |
"translation_from",
|
14 |
"translation_to",
|
15 |
"classification",
|
@@ -18,971 +18,5 @@
    "truthfulqa",
    "mgsm"
    ]
- },
- { "id": "anthracite-org/magnum-v4-72b", "name": "Magnum v4 72B", "provider_name": "Magnum v4 72B", "cost": 3.0, "hf_id": "anthracite-org/magnum-v4-72b", "size": 72706203648.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1726790400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "anthropic/claude-sonnet-4", "name": "Claude Sonnet 4", "provider_name": "Anthropic", "cost": 15.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1747872000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "deepseek/deepseek-chat", "name": "DeepSeek V3", "provider_name": "DeepSeek", "cost": 0.72, "hf_id": "deepseek-ai/DeepSeek-V3", "size": 684531386000.0, "type": "open-source", "license": "", "creation_date": 1735084800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "deepseek/deepseek-chat-v3-0324", "name": "DeepSeek V3 0324", "provider_name": "DeepSeek", "cost": 0.0, "hf_id": "deepseek-ai/DeepSeek-V3-0324", "size": 684531386000.0, "type": "open-source", "license": "Mit", "creation_date": 1742774400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "deepseek/deepseek-r1-0528", "name": "R1 0528", "provider_name": "DeepSeek", "cost": 0.0, "hf_id": "deepseek-ai/DeepSeek-R1-0528", "size": 684531386000.0, "type": "open-source", "license": "Mit", "creation_date": 1748390400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "google/gemini-2.0-flash-lite-001", "name": "Gemini 2.0 Flash Lite", "provider_name": "Google", "cost": 0.3, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1740441600000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "google/gemini-2.5-flash", "name": "Gemini 2.5 Flash", "provider_name": "Google", "cost": 2.5, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1750118400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "google/gemma-2-9b-it", "name": "Gemma 2 9B", "provider_name": "Google", "cost": 0.0, "hf_id": "google/gemma-2-9b-it", "size": 9241705984.0, "type": "open-source", "license": "Gemma", "creation_date": 1719187200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "google/gemma-3-27b-it", "name": "Gemma 3 27B", "provider_name": "Google", "cost": 0.0, "hf_id": "google/gemma-3-27b-it", "size": 27432406640.0, "type": "open-source", "license": "Gemma", "creation_date": 1740787200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-3-70b-instruct", "name": "Llama 3 70B Instruct", "provider_name": "Meta", "cost": 0.4, "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct", "size": 70553706496.0, "type": "open-source", "license": "Llama3", "creation_date": 1713312000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-3.1-70b-instruct", "name": "Llama 3.1 70B Instruct", "provider_name": "Meta", "cost": 0.28, "hf_id": "meta-llama/Llama-3.1-70B-Instruct", "size": 70553706496.0, "type": "open-source", "license": "Llama3.1", "creation_date": 1721088000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-3.2-3b-instruct", "name": "Llama 3.2 3B Instruct", "provider_name": "Meta", "cost": 0.0, "hf_id": "meta-llama/Llama-3.2-3B-Instruct", "size": 3212749824.0, "type": "open-source", "license": "Llama3.2", "creation_date": 1726617600000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-3.3-70b-instruct", "name": "Llama 3.3 70B Instruct", "provider_name": "Meta", "cost": 0.0, "hf_id": "meta-llama/Llama-3.3-70B-Instruct", "size": 70553706496.0, "type": "open-source", "license": "Llama3.3", "creation_date": 1732579200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-4-maverick", "name": "Llama 4 Maverick", "provider_name": "Meta", "cost": 0.6, "hf_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "size": 401583781376.0, "type": "open-source", "license": "Other", "creation_date": 1743465600000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "meta-llama/llama-guard-4-12b", "name": "Llama Guard 4 12B", "provider_name": "Meta", "cost": 0.18, "hf_id": "meta-llama/Llama-Guard-4-12B", "size": 12001097216.0, "type": "open-source", "license": "Other", "creation_date": 1745366400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "microsoft/phi-3-medium-128k-instruct", "name": "Phi-3 Medium 128K Instruct", "provider_name": "Microsoft", "cost": 1.0, "hf_id": "microsoft/Phi-3-medium-128k-instruct", "size": 13960238080.0, "type": "open-source", "license": "Mit", "creation_date": 1715040000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "microsoft/phi-3.5-mini-128k-instruct", "name": "Phi-3.5 Mini 128K Instruct", "provider_name": "Microsoft", "cost": 0.1, "hf_id": "microsoft/Phi-3.5-mini-instruct", "size": 3821079552.0, "type": "open-source", "license": "Mit", "creation_date": 1723766400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "microsoft/phi-4", "name": "Phi 4", "provider_name": "Microsoft", "cost": 0.14, "hf_id": "microsoft/phi-4", "size": 14659507200.0, "type": "open-source", "license": "Mit", "creation_date": 1733875200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "microsoft/phi-4-multimodal-instruct", "name": "Phi 4 Multimodal Instruct", "provider_name": "Microsoft", "cost": 0.1, "hf_id": "microsoft/Phi-4-multimodal-instruct", "size": 5574460384.0, "type": "open-source", "license": "Mit", "creation_date": 1740355200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/magistral-medium-2506", "name": "Magistral Medium 2506", "provider_name": "Mistral", "cost": 5.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1749340800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/mistral-7b-instruct", "name": "Mistral 7B Instruct", "provider_name": "Mistral", "cost": 0.0, "hf_id": "mistralai/Mistral-7B-Instruct-v0.3", "size": 7248023552.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1716336000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/mistral-nemo", "name": "Mistral Nemo", "provider_name": "Mistral", "cost": 0.0, "hf_id": "mistralai/Mistral-Nemo-Instruct-2407", "size": 12247782400.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1721174400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/mistral-saba", "name": "Saba", "provider_name": "Mistral", "cost": 0.6, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1739750400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/mistral-small-3.1-24b-instruct", "name": "Mistral Small 3.1 24B", "provider_name": "Mistral", "cost": 0.0, "hf_id": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", "size": 24011361280.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1741651200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "mistralai/mixtral-8x7b-instruct", "name": "Mixtral 8x7B Instruct", "provider_name": "Mistral", "cost": 0.24, "hf_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", "size": 46702792704.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1702166400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "neversleep/llama-3-lumimaid-70b", "name": "Llama 3 Lumimaid 70B", "provider_name": "NeverSleep", "cost": 6.0, "hf_id": "NeverSleep/Llama-3-Lumimaid-70B-v0.1", "size": 70553706496.0, "type": "open-source", "license": "Cc By Nc 4.0", "creation_date": 1714262400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "nvidia/llama-3.1-nemotron-70b-instruct", "name": "Llama 3.1 Nemotron 70B Instruct", "provider_name": "NVIDIA", "cost": 0.3, "hf_id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", "size": 70553706496.0, "type": "open-source", "license": "Llama3.1", "creation_date": 1728691200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/chatgpt-4o-latest", "name": "ChatGPT-4o", "provider_name": "OpenAI", "cost": 15.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1723593600000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "provider_name": "OpenAI", "cost": 1.5, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1685232000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-3.5-turbo-0613", "name": "GPT-3.5 Turbo (older v0613)", "provider_name": "OpenAI", "cost": 2.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1706140800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-4.1", "name": "GPT-4.1", "provider_name": "OpenAI", "cost": 8.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1744588800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-4.1-mini", "name": "GPT-4.1 Mini", "provider_name": "OpenAI", "cost": 1.6, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1744588800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-4.1-nano", "name": "GPT-4.1 Nano", "provider_name": "OpenAI", "cost": 0.4, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1744588800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-4o-2024-11-20", "name": "GPT-4o (2024-11-20)", "provider_name": "OpenAI", "cost": 10.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1732060800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-4o-mini", "name": "GPT-4o-mini", "provider_name": "OpenAI", "cost": 0.6, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1721260800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "openai/gpt-5", "name": "GPT-5", "provider_name": "OpenAI", "cost": 10.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1754524800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "opengvlab/internvl3-14b", "name": "InternVL3 14B", "provider_name": "OpenGVLab", "cost": 0.4, "hf_id": "OpenGVLab/InternVL3-14B", "size": 15117256704.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1744243200000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "qwen/qwen3-235b-a22b", "name": "Qwen3 235B A22B", "provider_name": "Qwen", "cost": 0.0, "hf_id": "Qwen/Qwen3-235B-A22B", "size": 235093634560.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1745712000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "qwen/qwen3-30b-a3b", "name": "Qwen3 30B A3B", "provider_name": "Qwen", "cost": 0.0, "hf_id": "Qwen/Qwen3-30B-A3B", "size": 30532122624.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1745712000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "qwen/qwen3-32b", "name": "Qwen3 32B", "provider_name": "Qwen", "cost": 0.07, "hf_id": "Qwen/Qwen3-32B", "size": 32762123264.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1745712000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "qwen/qwq-32b", "name": "QwQ 32B", "provider_name": "Qwen", "cost": 0.0, "hf_id": "Qwen/QwQ-32B", "size": 32763876352.0, "type": "open-source", "license": "Apache 2.0", "creation_date": 1741132800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "switchpoint/router", "name": "Switchpoint Router", "provider_name": "Switchpoint Router", "cost": 3.4, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1752192000000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "thedrummer/anubis-pro-105b-v1", "name": "Anubis Pro 105B V1", "provider_name": "TheDrummer", "cost": 1.0, "hf_id": "TheDrummer/Anubis-Pro-105B-v1", "size": 104779882496.0, "type": "open-source", "license": "Other", "creation_date": 1738454400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "thedrummer/skyfall-36b-v2", "name": "Skyfall 36B V2", "provider_name": "TheDrummer", "cost": 0.19, "hf_id": "TheDrummer/Skyfall-36B-v2", "size": 36910535680.0, "type": "open-source", "license": "Other", "creation_date": 1738540800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "tngtech/deepseek-r1t-chimera", "name": "DeepSeek R1T Chimera", "provider_name": "TNG", "cost": 0.0, "hf_id": "tngtech/DeepSeek-R1T-Chimera", "size": 684531386000.0, "type": "open-source", "license": "Mit", "creation_date": 1745625600000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "tngtech/deepseek-r1t2-chimera", "name": "DeepSeek R1T2 Chimera", "provider_name": "TNG", "cost": 0.0, "hf_id": "tngtech/DeepSeek-TNG-R1T2-Chimera", "size": 684531386000.0, "type": "open-source", "license": "Mit", "creation_date": 1751414400000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "x-ai/grok-2-1212", "name": "Grok 2 1212", "provider_name": "xAI", "cost": 10.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": 1734220800000, "tasks": ["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"] },
- { "id": "google/translate-v2", "name": "Google Translate", "provider_name": "Google", "cost": 20.0, "hf_id": null, "size": null, "type": "closed-source", "license": null, "creation_date": null, "tasks": ["translation_from", "translation_to"] },
- { "id": "moonshotai/kimi-k2", "name": "Kimi K2", "provider_name": "Moonshot AI", "size": null, "type": "closed-source", "cost": 0.6, "hf_id": null, "creation_date": null, "license": null
  }
  ]
results.json
CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:
- size
+ oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
+ size 8030558
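results.json is stored via Git LFS, so the diff only touches the pointer file: the `oid` (SHA-256 of the real payload) and `size` (bytes; here 8030558, roughly 8 MB) change while the `version` line stays fixed. A small sketch of reading such a pointer — the three-line "key value" format is the standard git-lfs pointer spec; the parser itself is a hypothetical helper:

    def parse_lfs_pointer(text: str) -> dict:
        # Git LFS pointer files are short "key value" lines, e.g.
        #   version https://git-lfs.github.com/spec/v1
        #   oid sha256:<64 hex chars>
        #   size <bytes>
        fields = {}
        for line in text.strip().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    pointer = parse_lfs_pointer(
        "version https://git-lfs.github.com/spec/v1\n"
        "oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4\n"
        "size 8030558\n"
    )
    assert pointer["size"] == "8030558"  # new results.json payload, in bytes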