davidpomerenke committed
Commit a73f888 · verified · 1 Parent(s): 3dfd880

Upload from GitHub Actions: More models and languages

evals/main.py CHANGED
@@ -1,15 +1,29 @@
 import asyncio
+from time import time
+
+t0 = time()
 
 import pandas as pd
 from languages import languages
+
+print(f"loaded languages in {time() - t0:.2f}s")
+t0 = time()
+
 from models import models
+
+print(f"loaded models in {time() - t0:.2f}s")
+t0 = time()
+
 from tasks import tasks
+
+print(f"loaded tasks in {time() - t0:.2f}s")
+t0 = time()
+
 from tqdm.asyncio import tqdm_asyncio
 
 # ===== config =====
 
 n_sentences = 10
-n_languages = 40
 n_models = 35
 
 
@@ -17,46 +31,48 @@ n_models = 35
 
 
 async def evaluate():
-    print("running evaluations")
-    old_results = pd.read_json("results.json")
-    old_models = pd.read_json("models.json")
-    # get all combinations of model, language and task
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for task_name, task in tasks.items()
-        for lang in languages.iloc[:n_languages].itertuples()
-        for model in models["id"].iloc[:n_models]
-    ]
-    # filter out combinations that have already been evaluated
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-    # run evaluations
-    results = [
-        tasks[task_name](model, bcp_47, i)
-        for i in range(n_sentences)
-        for model, bcp_47, task_name in combis.itertuples(index=False)
-    ]
-    results = await tqdm_asyncio.gather(*results, miniters=1)
-    results = [r for group in results for r in group]
-    args = dict(orient="records", indent=2, force_ascii=False)
-    if results:
-        # aggregate results
-        results = pd.DataFrame(results)
-        results = (
-            results.groupby(["model", "bcp_47", "task", "metric"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-        # save results
-        results = pd.concat([old_results, results])
-        results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
-        results.to_json("results.json", **args)
-    # save up-to-date info on models and languages
-    all_models = pd.concat([old_models, pd.DataFrame(models)])
-    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-    all_models.to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
+    # FIXME we should not need this for loop, but it helps
+    for n_languages in range(0, 200):
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for task_name, task in tasks.items()
+            for lang in languages.iloc[:n_languages].itertuples()
+            for model in models["id"].iloc[:n_models]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
+            )
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+        # save up-to-date info on models and languages
+        all_models = pd.concat([old_models, pd.DataFrame(models)])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
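
Note: the "filter out combinations that have already been evaluated" step above is a left-merge anti-join — requested (model, language, task) triples are matched against prior results, and only rows with no recorded metric survive. A minimal sketch of the pattern, with toy data invented for illustration:

    import pandas as pd

    # toy stand-ins for combis and old_results
    combis = pd.DataFrame(
        [("m1", "en", "t1"), ("m1", "de", "t1")],
        columns=["model", "bcp_47", "task"],
    )
    old_results = pd.DataFrame(
        [("m1", "en", "t1", "bleu", 0.5)],
        columns=["model", "bcp_47", "task", "metric", "score"],
    )

    # left merge: unmatched rows get NaN in "metric", i.e. still to be evaluated
    merged = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    todo = merged[merged["metric"].isna()][["model", "bcp_47", "task"]]
    print(todo)  # only the ("m1", "de", "t1") row remains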
evals/models.py CHANGED
@@ -10,9 +10,8 @@ from dotenv import load_dotenv
 from elevenlabs import AsyncElevenLabs
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
-from openai import AsyncOpenAI
+from openai import AsyncOpenAI, PermissionDeniedError
 from requests import HTTPError, get
-from openai import PermissionDeniedError
 
 # for development purposes, all languages will be evaluated on the fast models
 # and only a sample of languages will be evaluated on all models
@@ -48,7 +47,7 @@ important_models = [
 ]
 
 blocklist = [
-    "microsoft/wizardlm-2-8x22b" # temporarily rate-limited
+    "microsoft/wizardlm-2-8x22b"  # temporarily rate-limited
 ]
 
 transcription_models = [
@@ -68,7 +67,13 @@ def get_models(date: date):
 
 def get_model(permaslug):
     models = get_models(date.today())
-    slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
+    slugs = [
+        m
+        for m in models
+        if m["permaslug"] == permaslug
+        and m["endpoint"]
+        and not m["endpoint"]["is_free"]
+    ]
     if len(slugs) == 0:
         # the problem is that free models typically have very high rate-limiting
         print(f"no non-free model found for {permaslug}")
@@ -124,6 +129,7 @@ async def complete(**kwargs) -> str | None:
         raise Exception(response)
     return response.choices[0].message.content.strip()
 
+
 @cache
 async def transcribe_elevenlabs(path, model):
     modelname = model.split("/")[-1]
@@ -221,7 +227,9 @@ def load_models(date: date):
     ).dt.date
 
     models = models.assign(
-        name=or_metadata.str["short_name"].str.replace(" (free)", ""),
+        name=or_metadata.str["short_name"]
+        .str.replace(" (free)", "")
+        .str.replace(" (self-moderated)", ""),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         cost=or_metadata.apply(get_cost),
         hf_id=hf_metadata.str["hf_id"],
@@ -230,8 +238,8 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    models = models[models["cost"] <= 2.0].reset_index(drop=True)
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     return models
 
 
-models = load_models(date.today())
+models = load_models(date.today())
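
Note: the name= assignment above relies on pandas' .str accessor, which supports element-wise [key] lookup on a Series of dicts before the chained .str.replace calls strip the OpenRouter suffixes. A minimal sketch with invented metadata (regex=False is spelled out here because the suffixes contain parentheses, which would otherwise be regex metacharacters):

    import pandas as pd

    or_metadata = pd.Series([
        {"short_name": "Some Model (free)"},
        {"short_name": "Other Model (self-moderated)"},
    ])

    names = (
        or_metadata.str["short_name"]  # element-wise dict lookup
        .str.replace(" (free)", "", regex=False)
        .str.replace(" (self-moderated)", "", regex=False)
    )
    print(names.tolist())  # ['Some Model', 'Other Model']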
frontend/src/App.js CHANGED
@@ -9,6 +9,7 @@ import AutoComplete from './components/AutoComplete'
 import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
+import CostPlot from './components/CostPlot'
 import { Carousel } from 'primereact/carousel'
 import { Dialog } from 'primereact/dialog'
 import { Button } from 'primereact/button'
@@ -211,6 +212,7 @@ function App () {
             <LanguagePlot data={data} />,
             <SpeakerPlot data={data} />,
             <HistoryPlot data={data} />,
+            <CostPlot data={data} />,
           ]}
           numScroll={1}
           numVisible={1}
frontend/src/components/CostPlot.js ADDED
@@ -0,0 +1,88 @@
+import { useRef, useEffect } from 'react'
+import * as Plot from '@observablehq/plot'
+
+const HistoryPlot = ({ data, width = 750, height = 500 }) => {
+  const containerRef = useRef()
+  useEffect(() => {
+    const models = [...data.model_table] // sort copy, not in place
+      .filter(d => d.average !== null && d.cost > 0)
+      .sort((a, b) => a.cost - b.cost)
+      .reduce((acc, curr) => {
+        const last = acc[acc.length - 1]?.maxAverage || 0
+        acc.push({
+          ...curr,
+          maxAverage: Math.max(last, curr.average),
+          newRecord: curr.average > last
+        })
+        return acc
+      }, [])
+    let USDollar = new Intl.NumberFormat('en-US', {
+      style: 'currency',
+      currency: 'USD'
+    })
+    const plot = Plot.plot({
+      width: width,
+      height: height,
+      subtitle: 'Cost vs Performance',
+      x: {
+        label: 'Cost (USD)',
+        type: 'log',
+        // format dollar / ct
+        tickFormat: d => USDollar.format(d)
+      },
+      y: {
+        label: 'Language Proficiency Score'
+      },
+      symbol: {
+        legend: true
+      },
+      marks: [
+        Plot.dot(models, {
+          x: d => d.cost,
+          y: d => d.average,
+          symbol: 'provider_name',
+          stroke: 'provider_name',
+          title: d =>
+            `${d.provider_name} - ${d.name} (${
+              d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
+            })\nCost: ${USDollar.format(d.cost)}\nScore: ${d.average.toFixed(
+              2
+            )}`,
+          tip: true
+        }),
+        Plot.line(
+          [
+            ...models.filter(d => d.newRecord),
+            {
+              cost: models.map(d => d.cost).reduce((a, b) => Math.max(a, b), 0),
+              maxAverage: models[models.length - 1].maxAverage
+            }
+          ],
+          {
+            x: d => d.cost,
+            y: d => d.maxAverage,
+            curve: 'catmull-rom',
+            strokeOpacity: 0.3
+          }
+        )
+      ]
+    })
+    containerRef.current.append(plot)
+    return () => plot.remove()
+  }, [data])
+
+  return (
+    <div
+      ref={containerRef}
+      style={{
+        width: '100%',
+        height: '100%',
+        display: 'flex',
+        alignItems: 'center',
+        justifyContent: 'center'
+      }}
+    />
+  )
+}
+
+export default HistoryPlot
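
Note: the reduce above carries a running maximum of the average score over models sorted by cost, flagging each model that sets a new record; those flagged points are what the faint Plot.line traces as a cost-performance frontier. The same idea in a few lines of Python, with invented data:

    models = [
        {"name": "a", "cost": 0.1, "average": 0.4},
        {"name": "b", "cost": 0.5, "average": 0.7},
        {"name": "c", "cost": 1.0, "average": 0.6},  # cheaper "b" already scores higher
    ]

    frontier, best = [], 0.0
    for m in sorted(models, key=lambda m: m["cost"]):
        if m["average"] > best:  # new record at this price point
            best = m["average"]
            frontier.append(m)
    print([m["name"] for m in frontier])  # ['a', 'b']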
frontend/src/components/HistoryPlot.js CHANGED
@@ -15,7 +15,6 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
       })
       return acc
     }, [])
-  console.log(models)
   useEffect(() => {
     const plot = Plot.plot({
       width: width,
@@ -36,12 +35,14 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
       Plot.dot(models, {
         x: d => d.creation_date,
         y: d => d.average,
-        symbol: "provider_name",
-        stroke: "provider_name",
+        symbol: 'provider_name',
+        stroke: 'provider_name',
         title: d =>
           `${d.provider_name} - ${d.name} (${
             d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
-          })\nPublished: ${d.creation_date}\nScore: ${d.average.toFixed(2)}`,
+          })\nPublished: ${new Date(
+            d.creation_date
+          ).toLocaleDateString()}\nScore: ${d.average.toFixed(2)}`,
         tip: true
       }),
      Plot.line(
frontend/src/components/WorldMap.js CHANGED
@@ -43,7 +43,6 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
   }, [])
 
   useEffect(() => {
-    console.log('countries', data)
     if (mapData === undefined || data === undefined) return
     const countriesDict = data.reduce((acc, country) => {
       acc[country.iso2] = country
models.json CHANGED
@@ -10,6 +10,39 @@
     "license":null,
     "creation_date":1733356800000
   },
+  {
+    "id":"anthropic\/claude-3.5-sonnet",
+    "name":"Claude 3.5 Sonnet",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1729555200000
+  },
+  {
+    "id":"anthropic\/claude-3.7-sonnet",
+    "name":"Claude 3.7 Sonnet",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1740355200000
+  },
+  {
+    "id":"anthropic\/claude-sonnet-4",
+    "name":"Claude Sonnet 4",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1747872000000
+  },
   {
     "id":"deepseek\/deepseek-chat",
     "name":"DeepSeek V3",
@@ -98,6 +131,17 @@
     "license":null,
     "creation_date":1747699200000
   },
+  {
+    "id":"google\/gemini-2.5-pro-preview-05-06",
+    "name":"Gemini 2.5 Pro Preview 05-06",
+    "provider_name":"Google",
+    "cost":10.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1746576000000
+  },
   {
     "id":"google\/gemini-flash-1.5",
     "name":"Gemini 1.5 Flash ",
@@ -263,6 +307,17 @@
     "license":"Apache 2.0",
     "creation_date":1741651200000
   },
+  {
+    "id":"openai\/gpt-4.1",
+    "name":"GPT-4.1",
+    "provider_name":"OpenAI",
+    "cost":8.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1744588800000
+  },
   {
     "id":"openai\/gpt-4.1-mini",
     "name":"GPT-4.1 Mini",
results.json CHANGED
The diff for this file is too large to render. See raw diff