Upload from GitHub Actions: More models and languages
Browse files- evals/main.py +57 -41
- evals/models.py +15 -7
- frontend/src/App.js +2 -0
- frontend/src/components/CostPlot.js +88 -0
- frontend/src/components/HistoryPlot.js +5 -4
- frontend/src/components/WorldMap.js +0 -1
- models.json +55 -0
- results.json +0 -0
evals/main.py
CHANGED
@@ -1,15 +1,29 @@
|
|
1 |
import asyncio
|
|
|
|
|
|
|
2 |
|
3 |
import pandas as pd
|
4 |
from languages import languages
|
|
|
|
|
|
|
|
|
5 |
from models import models
|
|
|
|
|
|
|
|
|
6 |
from tasks import tasks
|
|
|
|
|
|
|
|
|
7 |
from tqdm.asyncio import tqdm_asyncio
|
8 |
|
9 |
# ===== config =====
|
10 |
|
11 |
n_sentences = 10
|
12 |
-
n_languages = 40
|
13 |
n_models = 35
|
14 |
|
15 |
|
@@ -17,46 +31,48 @@ n_models = 35
|
|
17 |
|
18 |
|
19 |
async def evaluate():
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
results.
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
60 |
|
61 |
|
62 |
if __name__ == "__main__":
|
|
|
1 |
import asyncio
|
2 |
+
from time import time
|
3 |
+
|
4 |
+
t0 = time()
|
5 |
|
6 |
import pandas as pd
|
7 |
from languages import languages
|
8 |
+
|
9 |
+
print(f"loaded languages in {time() - t0:.2f}s")
|
10 |
+
t0 = time()
|
11 |
+
|
12 |
from models import models
|
13 |
+
|
14 |
+
print(f"loaded models in {time() - t0:.2f}s")
|
15 |
+
t0 = time()
|
16 |
+
|
17 |
from tasks import tasks
|
18 |
+
|
19 |
+
print(f"loaded tasks in {time() - t0:.2f}s")
|
20 |
+
t0 = time()
|
21 |
+
|
22 |
from tqdm.asyncio import tqdm_asyncio
|
23 |
|
24 |
# ===== config =====
|
25 |
|
26 |
n_sentences = 10
|
|
|
27 |
n_models = 35
|
28 |
|
29 |
|
|
|
31 |
|
32 |
|
33 |
async def evaluate():
    """Run all evaluations that are still missing and persist the results.

    For an increasing number of languages, every (model, language, task)
    combination that has no entry in ``results.json`` yet is evaluated on
    ``n_sentences`` sentences. The new scores are averaged per model,
    language, task and metric, appended to the previous results and written
    back to ``results.json``. ``models.json`` and ``languages.json`` are
    rewritten on every pass.
    """
    # FIXME we should not need this for loop, but it helps
    # NOTE(review): presumably the loop makes results get saved after each
    # additional language instead of only once at the very end -- confirm.
    for n_languages in range(0, 200):
        print(f"running evaluations for {n_languages} languages")
        # re-read on every pass so results saved by the previous pass are seen
        old_results = pd.read_json("results.json")
        old_models = pd.read_json("models.json")
        # get all combinations of model, language and task
        combis = [
            (model, lang.bcp_47, task_name)
            for task_name in tasks
            for lang in languages.iloc[:n_languages].itertuples()
            for model in models["id"].iloc[:n_models]
        ]
        # filter out combinations that have already been evaluated:
        # after the left merge, "metric" is NaN exactly for the combinations
        # that have no matching row in the old results
        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
        # run evaluations: one awaitable per sentence and combination
        results = [
            tasks[task_name](model, bcp_47, i)
            for i in range(n_sentences)
            for model, bcp_47, task_name in combis.itertuples(index=False)
        ]
        results = await tqdm_asyncio.gather(*results, miniters=1)
        # every task call yields a group of result records; flatten them
        results = [r for group in results for r in group]
        args = {"orient": "records", "indent": 2, "force_ascii": False}
        if results:
            # aggregate results
            results = pd.DataFrame(results)
            results = (
                results.groupby(["model", "bcp_47", "task", "metric"])
                .agg({"score": "mean"})
                .reset_index()
            )
            # save results
            results = pd.concat([old_results, results])
            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
            results.to_json("results.json", **args)
        # save up-to-date info on models and languages
        # NOTE(review): assumed to run on every pass, outside the
        # `if results` branch -- confirm against the original indentation.
        all_models = pd.concat([old_models, pd.DataFrame(models)])
        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
        all_models.to_json("models.json", **args)
        pd.DataFrame(languages).to_json("languages.json", **args)
|
76 |
|
77 |
|
78 |
if __name__ == "__main__":
|
evals/models.py
CHANGED
@@ -10,9 +10,8 @@ from dotenv import load_dotenv
|
|
10 |
from elevenlabs import AsyncElevenLabs
|
11 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
12 |
from joblib.memory import Memory
|
13 |
-
from openai import AsyncOpenAI
|
14 |
from requests import HTTPError, get
|
15 |
-
from openai import PermissionDeniedError
|
16 |
|
17 |
# for development purposes, all languages will be evaluated on the fast models
|
18 |
# and only a sample of languages will be evaluated on all models
|
@@ -48,7 +47,7 @@ important_models = [
|
|
48 |
]
|
49 |
|
50 |
blocklist = [
|
51 |
-
"microsoft/wizardlm-2-8x22b"
|
52 |
]
|
53 |
|
54 |
transcription_models = [
|
@@ -68,7 +67,13 @@ def get_models(date: date):
|
|
68 |
|
69 |
def get_model(permaslug):
|
70 |
models = get_models(date.today())
|
71 |
-
slugs = [
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
if len(slugs) == 0:
|
73 |
# the problem is that free models typically have very high rate-limiting
|
74 |
print(f"no non-free model found for {permaslug}")
|
@@ -124,6 +129,7 @@ async def complete(**kwargs) -> str | None:
|
|
124 |
raise Exception(response)
|
125 |
return response.choices[0].message.content.strip()
|
126 |
|
|
|
127 |
@cache
|
128 |
async def transcribe_elevenlabs(path, model):
|
129 |
modelname = model.split("/")[-1]
|
@@ -221,7 +227,9 @@ def load_models(date: date):
|
|
221 |
).dt.date
|
222 |
|
223 |
models = models.assign(
|
224 |
-
name=or_metadata.str["short_name"]
|
|
|
|
|
225 |
provider_name=or_metadata.str["name"].str.split(": ").str[0],
|
226 |
cost=or_metadata.apply(get_cost),
|
227 |
hf_id=hf_metadata.str["hf_id"],
|
@@ -230,8 +238,8 @@ def load_models(date: date):
|
|
230 |
license=hf_metadata.str["license"],
|
231 |
creation_date=creation_date_hf.combine_first(creation_date_or),
|
232 |
)
|
233 |
-
models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
234 |
return models
|
235 |
|
236 |
|
237 |
-
models = load_models(date.today())
|
|
|
10 |
from elevenlabs import AsyncElevenLabs
|
11 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
12 |
from joblib.memory import Memory
|
13 |
+
from openai import AsyncOpenAI, PermissionDeniedError
|
14 |
from requests import HTTPError, get
|
|
|
15 |
|
16 |
# for development purposes, all languages will be evaluated on the fast models
|
17 |
# and only a sample of languages will be evaluated on all models
|
|
|
47 |
]
|
48 |
|
49 |
blocklist = [
|
50 |
+
"microsoft/wizardlm-2-8x22b" # temporarily rate-limited
|
51 |
]
|
52 |
|
53 |
transcription_models = [
|
|
|
67 |
|
68 |
def get_model(permaslug):
|
69 |
models = get_models(date.today())
|
70 |
+
slugs = [
|
71 |
+
m
|
72 |
+
for m in models
|
73 |
+
if m["permaslug"] == permaslug
|
74 |
+
and m["endpoint"]
|
75 |
+
and not m["endpoint"]["is_free"]
|
76 |
+
]
|
77 |
if len(slugs) == 0:
|
78 |
# the problem is that free models typically have very high rate-limiting
|
79 |
print(f"no non-free model found for {permaslug}")
|
|
|
129 |
raise Exception(response)
|
130 |
return response.choices[0].message.content.strip()
|
131 |
|
132 |
+
|
133 |
@cache
|
134 |
async def transcribe_elevenlabs(path, model):
|
135 |
modelname = model.split("/")[-1]
|
|
|
227 |
).dt.date
|
228 |
|
229 |
models = models.assign(
|
230 |
+
name=or_metadata.str["short_name"]
|
231 |
+
.str.replace(" (free)", "")
|
232 |
+
.str.replace(" (self-moderated)", ""),
|
233 |
provider_name=or_metadata.str["name"].str.split(": ").str[0],
|
234 |
cost=or_metadata.apply(get_cost),
|
235 |
hf_id=hf_metadata.str["hf_id"],
|
|
|
238 |
license=hf_metadata.str["license"],
|
239 |
creation_date=creation_date_hf.combine_first(creation_date_or),
|
240 |
)
|
241 |
+
# models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
242 |
return models
|
243 |
|
244 |
|
245 |
+
models = load_models(date.today())
|
frontend/src/App.js
CHANGED
@@ -9,6 +9,7 @@ import AutoComplete from './components/AutoComplete'
|
|
9 |
import LanguagePlot from './components/LanguagePlot'
|
10 |
import SpeakerPlot from './components/SpeakerPlot'
|
11 |
import HistoryPlot from './components/HistoryPlot'
|
|
|
12 |
import { Carousel } from 'primereact/carousel'
|
13 |
import { Dialog } from 'primereact/dialog'
|
14 |
import { Button } from 'primereact/button'
|
@@ -211,6 +212,7 @@ function App () {
|
|
211 |
<LanguagePlot data={data} />,
|
212 |
<SpeakerPlot data={data} />,
|
213 |
<HistoryPlot data={data} />,
|
|
|
214 |
]}
|
215 |
numScroll={1}
|
216 |
numVisible={1}
|
|
|
9 |
import LanguagePlot from './components/LanguagePlot'
|
10 |
import SpeakerPlot from './components/SpeakerPlot'
|
11 |
import HistoryPlot from './components/HistoryPlot'
|
12 |
+
import CostPlot from './components/CostPlot'
|
13 |
import { Carousel } from 'primereact/carousel'
|
14 |
import { Dialog } from 'primereact/dialog'
|
15 |
import { Button } from 'primereact/button'
|
|
|
212 |
<LanguagePlot data={data} />,
|
213 |
<SpeakerPlot data={data} />,
|
214 |
<HistoryPlot data={data} />,
|
215 |
+
<CostPlot data={data} />,
|
216 |
]}
|
217 |
numScroll={1}
|
218 |
numVisible={1}
|
frontend/src/components/CostPlot.js
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { useRef, useEffect } from 'react'
|
2 |
+
import * as Plot from '@observablehq/plot'
|
3 |
+
|
4 |
+
const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
5 |
+
const containerRef = useRef()
|
6 |
+
useEffect(() => {
|
7 |
+
const models = [...data.model_table] // sort copy, not in place
|
8 |
+
.filter(d => d.average !== null && d.cost > 0)
|
9 |
+
.sort((a, b) => a.cost - b.cost)
|
10 |
+
.reduce((acc, curr) => {
|
11 |
+
const last = acc[acc.length - 1]?.maxAverage || 0
|
12 |
+
acc.push({
|
13 |
+
...curr,
|
14 |
+
maxAverage: Math.max(last, curr.average),
|
15 |
+
newRecord: curr.average > last
|
16 |
+
})
|
17 |
+
return acc
|
18 |
+
}, [])
|
19 |
+
let USDollar = new Intl.NumberFormat('en-US', {
|
20 |
+
style: 'currency',
|
21 |
+
currency: 'USD'
|
22 |
+
})
|
23 |
+
const plot = Plot.plot({
|
24 |
+
width: width,
|
25 |
+
height: height,
|
26 |
+
subtitle: 'Cost vs Performance',
|
27 |
+
x: {
|
28 |
+
label: 'Cost (USD)',
|
29 |
+
type: 'log',
|
30 |
+
// format dollar / ct
|
31 |
+
tickFormat: d => USDollar.format(d)
|
32 |
+
},
|
33 |
+
y: {
|
34 |
+
label: 'Language Proficiency Score'
|
35 |
+
},
|
36 |
+
symbol: {
|
37 |
+
legend: true
|
38 |
+
},
|
39 |
+
marks: [
|
40 |
+
Plot.dot(models, {
|
41 |
+
x: d => d.cost,
|
42 |
+
y: d => d.average,
|
43 |
+
symbol: 'provider_name',
|
44 |
+
stroke: 'provider_name',
|
45 |
+
title: d =>
|
46 |
+
`${d.provider_name} - ${d.name} (${
|
47 |
+
d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
|
48 |
+
})\nCost: ${USDollar.format(d.cost)}\nScore: ${d.average.toFixed(
|
49 |
+
2
|
50 |
+
)}`,
|
51 |
+
tip: true
|
52 |
+
}),
|
53 |
+
Plot.line(
|
54 |
+
[
|
55 |
+
...models.filter(d => d.newRecord),
|
56 |
+
{
|
57 |
+
cost: models.map(d => d.cost).reduce((a, b) => Math.max(a, b), 0),
|
58 |
+
maxAverage: models[models.length - 1].maxAverage
|
59 |
+
}
|
60 |
+
],
|
61 |
+
{
|
62 |
+
x: d => d.cost,
|
63 |
+
y: d => d.maxAverage,
|
64 |
+
curve: 'catmull-rom',
|
65 |
+
strokeOpacity: 0.3
|
66 |
+
}
|
67 |
+
)
|
68 |
+
]
|
69 |
+
})
|
70 |
+
containerRef.current.append(plot)
|
71 |
+
return () => plot.remove()
|
72 |
+
}, [data])
|
73 |
+
|
74 |
+
return (
|
75 |
+
<div
|
76 |
+
ref={containerRef}
|
77 |
+
style={{
|
78 |
+
width: '100%',
|
79 |
+
height: '100%',
|
80 |
+
display: 'flex',
|
81 |
+
alignItems: 'center',
|
82 |
+
justifyContent: 'center'
|
83 |
+
}}
|
84 |
+
/>
|
85 |
+
)
|
86 |
+
}
|
87 |
+
|
88 |
+
export default HistoryPlot
|
frontend/src/components/HistoryPlot.js
CHANGED
@@ -15,7 +15,6 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
15 |
})
|
16 |
return acc
|
17 |
}, [])
|
18 |
-
console.log(models)
|
19 |
useEffect(() => {
|
20 |
const plot = Plot.plot({
|
21 |
width: width,
|
@@ -36,12 +35,14 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
|
|
36 |
Plot.dot(models, {
|
37 |
x: d => d.creation_date,
|
38 |
y: d => d.average,
|
39 |
-
symbol:
|
40 |
-
stroke:
|
41 |
title: d =>
|
42 |
`${d.provider_name} - ${d.name} (${
|
43 |
d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
|
44 |
-
})\nPublished: ${
|
|
|
|
|
45 |
tip: true
|
46 |
}),
|
47 |
Plot.line(
|
|
|
15 |
})
|
16 |
return acc
|
17 |
}, [])
|
|
|
18 |
useEffect(() => {
|
19 |
const plot = Plot.plot({
|
20 |
width: width,
|
|
|
35 |
Plot.dot(models, {
|
36 |
x: d => d.creation_date,
|
37 |
y: d => d.average,
|
38 |
+
symbol: 'provider_name',
|
39 |
+
stroke: 'provider_name',
|
40 |
title: d =>
|
41 |
`${d.provider_name} - ${d.name} (${
|
42 |
d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
|
43 |
+
})\nPublished: ${new Date(
|
44 |
+
d.creation_date
|
45 |
+
).toLocaleDateString()}\nScore: ${d.average.toFixed(2)}`,
|
46 |
tip: true
|
47 |
}),
|
48 |
Plot.line(
|
frontend/src/components/WorldMap.js
CHANGED
@@ -43,7 +43,6 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
|
|
43 |
}, [])
|
44 |
|
45 |
useEffect(() => {
|
46 |
-
console.log('countries', data)
|
47 |
if (mapData === undefined || data === undefined) return
|
48 |
const countriesDict = data.reduce((acc, country) => {
|
49 |
acc[country.iso2] = country
|
|
|
43 |
}, [])
|
44 |
|
45 |
useEffect(() => {
|
|
|
46 |
if (mapData === undefined || data === undefined) return
|
47 |
const countriesDict = data.reduce((acc, country) => {
|
48 |
acc[country.iso2] = country
|
models.json
CHANGED
@@ -10,6 +10,39 @@
|
|
10 |
"license":null,
|
11 |
"creation_date":1733356800000
|
12 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
{
|
14 |
"id":"deepseek\/deepseek-chat",
|
15 |
"name":"DeepSeek V3",
|
@@ -98,6 +131,17 @@
|
|
98 |
"license":null,
|
99 |
"creation_date":1747699200000
|
100 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
{
|
102 |
"id":"google\/gemini-flash-1.5",
|
103 |
"name":"Gemini 1.5 Flash ",
|
@@ -263,6 +307,17 @@
|
|
263 |
"license":"Apache 2.0",
|
264 |
"creation_date":1741651200000
|
265 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
{
|
267 |
"id":"openai\/gpt-4.1-mini",
|
268 |
"name":"GPT-4.1 Mini",
|
|
|
10 |
"license":null,
|
11 |
"creation_date":1733356800000
|
12 |
},
|
13 |
+
{
|
14 |
+
"id":"anthropic\/claude-3.5-sonnet",
|
15 |
+
"name":"Claude 3.5 Sonnet",
|
16 |
+
"provider_name":"Anthropic",
|
17 |
+
"cost":15.0,
|
18 |
+
"hf_id":null,
|
19 |
+
"size":null,
|
20 |
+
"type":"closed-source",
|
21 |
+
"license":null,
|
22 |
+
"creation_date":1729555200000
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"id":"anthropic\/claude-3.7-sonnet",
|
26 |
+
"name":"Claude 3.7 Sonnet",
|
27 |
+
"provider_name":"Anthropic",
|
28 |
+
"cost":15.0,
|
29 |
+
"hf_id":null,
|
30 |
+
"size":null,
|
31 |
+
"type":"closed-source",
|
32 |
+
"license":null,
|
33 |
+
"creation_date":1740355200000
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"id":"anthropic\/claude-sonnet-4",
|
37 |
+
"name":"Claude Sonnet 4",
|
38 |
+
"provider_name":"Anthropic",
|
39 |
+
"cost":15.0,
|
40 |
+
"hf_id":null,
|
41 |
+
"size":null,
|
42 |
+
"type":"closed-source",
|
43 |
+
"license":null,
|
44 |
+
"creation_date":1747872000000
|
45 |
+
},
|
46 |
{
|
47 |
"id":"deepseek\/deepseek-chat",
|
48 |
"name":"DeepSeek V3",
|
|
|
131 |
"license":null,
|
132 |
"creation_date":1747699200000
|
133 |
},
|
134 |
+
{
|
135 |
+
"id":"google\/gemini-2.5-pro-preview-05-06",
|
136 |
+
"name":"Gemini 2.5 Pro Preview 05-06",
|
137 |
+
"provider_name":"Google",
|
138 |
+
"cost":10.0,
|
139 |
+
"hf_id":null,
|
140 |
+
"size":null,
|
141 |
+
"type":"closed-source",
|
142 |
+
"license":null,
|
143 |
+
"creation_date":1746576000000
|
144 |
+
},
|
145 |
{
|
146 |
"id":"google\/gemini-flash-1.5",
|
147 |
"name":"Gemini 1.5 Flash ",
|
|
|
307 |
"license":"Apache 2.0",
|
308 |
"creation_date":1741651200000
|
309 |
},
|
310 |
+
{
|
311 |
+
"id":"openai\/gpt-4.1",
|
312 |
+
"name":"GPT-4.1",
|
313 |
+
"provider_name":"OpenAI",
|
314 |
+
"cost":8.0,
|
315 |
+
"hf_id":null,
|
316 |
+
"size":null,
|
317 |
+
"type":"closed-source",
|
318 |
+
"license":null,
|
319 |
+
"creation_date":1744588800000
|
320 |
+
},
|
321 |
{
|
322 |
"id":"openai\/gpt-4.1-mini",
|
323 |
"name":"GPT-4.1 Mini",
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|