davidpomerenke committed on
Commit
f840423
·
verified ·
1 Parent(s): 913253a

Upload from GitHub Actions: Update model ranking fetching

Browse files
Files changed (5) hide show
  1. evals/main.py +5 -5
  2. evals/models.py +14 -22
  3. evals/tasks.py +12 -4
  4. models.json +194 -7
  5. results.json +0 -0
evals/main.py CHANGED
@@ -9,8 +9,8 @@ from tqdm.asyncio import tqdm_asyncio
9
  # ===== config =====
10
 
11
  n_sentences = 10
12
- n_languages = 10
13
- n_models = 10
14
 
15
  # ===== run evaluation and aggregate results =====
16
 
@@ -31,8 +31,8 @@ async def evaluate():
31
  ]
32
  # filter out combinations that have already been evaluated
33
  combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
34
- # combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
35
- # combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
36
  # run evaluations
37
  results = [
38
  tasks[task_name](model, bcp_47, i)
@@ -50,7 +50,7 @@ async def evaluate():
50
  .reset_index()
51
  )
52
  # save results
53
- # results = pd.concat([old_results, results])
54
  results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
55
  results.to_json("results.json", **args)
56
 
 
9
  # ===== config =====
10
 
11
  n_sentences = 10
12
+ n_languages = 18
13
+ n_models = 22
14
 
15
  # ===== run evaluation and aggregate results =====
16
 
 
31
  ]
32
  # filter out combinations that have already been evaluated
33
  combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
34
+ combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
35
+ combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
36
  # run evaluations
37
  results = [
38
  tasks[task_name](model, bcp_47, i)
 
50
  .reset_index()
51
  )
52
  # save results
53
+ results = pd.concat([old_results, results])
54
  results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
55
  results.to_json("results.json", **args)
56
 
evals/models.py CHANGED
@@ -45,10 +45,6 @@ important_models = [
45
  "amazon/nova-micro-v1", # 0.09$
46
  ]
47
 
48
- blocklist = [
49
- "google/gemini-2.5-pro-exp-03-25" # rate limit too low
50
- ]
51
-
52
  transcription_models = [
53
  "elevenlabs/scribe_v1",
54
  "openai/whisper-large-v3",
@@ -68,7 +64,7 @@ def get_model(permaslug):
68
  models = get_models(date.today())
69
  slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
70
  if len(slugs) == 0:
71
- print(f"no model found for {permaslug}")
72
  return slugs[0] if len(slugs) >= 1 else None
73
 
74
 
@@ -90,9 +86,9 @@ def get_historical_popular_models(date: date):
90
 
91
  @cache
92
  def get_current_popular_models(date: date):
93
- raw = get("https://openrouter.ai/rankings").text
94
- data = re.search(r'{\\"rankMap\\":(.*)\}\]\\n"\]\)</script>', raw).group(1)
95
- data = json.loads(data.replace("\\", ""))["day"]
96
  data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
97
  models = [get_model(model["model_permaslug"]) for model in data]
98
  return [m for m in models if m]
@@ -110,16 +106,13 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
110
 
111
 
112
  @cache
113
- async def complete(**kwargs):
114
  async with openrouter_rate_limit:
115
  try:
116
  response = await client.chat.completions.create(**kwargs)
117
  except PermissionDeniedError as e:
118
- if e["error"]["metadata"]["reason"] in ["violence", "hate", "sexual", "self-harm", "harassment"]:
119
- print(e)
120
- return None
121
- else:
122
- raise e
123
  if not response.choices:
124
  raise Exception(response)
125
  return response.choices[0].message.content.strip()
@@ -206,13 +199,12 @@ def get_cost(row):
206
 
207
  @cache
208
  def load_models(date: date):
209
- # popular_models = (
210
- # get_historical_popular_models(date.today())[:15]
211
- # + get_current_popular_models(date.today())[:15]
212
- # )
213
- # popular_models = [m["slug"] for m in popular_models]
214
- # models = set(important_models + popular_models) - set(blocklist)
215
- models = set(important_models) - set(blocklist)
216
  models = pd.DataFrame(sorted(list(models)), columns=["id"])
217
  or_metadata = models["id"].apply(get_or_metadata)
218
  hf_metadata = or_metadata.apply(get_hf_metadata)
@@ -222,7 +214,7 @@ def load_models(date: date):
222
  ).dt.date
223
 
224
  models = models.assign(
225
- name=or_metadata.str["short_name"],
226
  provider_name=or_metadata.str["name"].str.split(": ").str[0],
227
  cost=or_metadata.apply(get_cost),
228
  hf_id=hf_metadata.str["hf_id"],
 
45
  "amazon/nova-micro-v1", # 0.09$
46
  ]
47
 
 
 
 
 
48
  transcription_models = [
49
  "elevenlabs/scribe_v1",
50
  "openai/whisper-large-v3",
 
64
  models = get_models(date.today())
65
  slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
66
  if len(slugs) == 0:
67
+ print(f"no non-free model found for {permaslug}")
68
  return slugs[0] if len(slugs) >= 1 else None
69
 
70
 
 
86
 
87
  @cache
88
  def get_current_popular_models(date: date):
89
+ raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
90
+ data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
91
+ data = json.loads(data)
92
  data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
93
  models = [get_model(model["model_permaslug"]) for model in data]
94
  return [m for m in models if m]
 
106
 
107
 
108
  @cache
109
+ async def complete(**kwargs) -> str | None:
110
  async with openrouter_rate_limit:
111
  try:
112
  response = await client.chat.completions.create(**kwargs)
113
  except PermissionDeniedError as e:
114
+ print(e)
115
+ return None
 
 
 
116
  if not response.choices:
117
  raise Exception(response)
118
  return response.choices[0].message.content.strip()
 
199
 
200
  @cache
201
  def load_models(date: date):
202
+ popular_models = (
203
+ get_historical_popular_models(date.today())[:30]
204
+ + get_current_popular_models(date.today())[:10]
205
+ )
206
+ popular_models = [m["slug"] for m in popular_models]
207
+ models = set(important_models + popular_models)
 
208
  models = pd.DataFrame(sorted(list(models)), columns=["id"])
209
  or_metadata = models["id"].apply(get_or_metadata)
210
  hf_metadata = or_metadata.apply(get_hf_metadata)
 
214
  ).dt.date
215
 
216
  models = models.assign(
217
+ name=or_metadata.str["short_name"].str.replace(" (free)", ""),
218
  provider_name=or_metadata.str["name"].str.split(": ").str[0],
219
  cost=or_metadata.apply(get_cost),
220
  hf_id=hf_metadata.str["hf_id"],
evals/tasks.py CHANGED
@@ -30,7 +30,10 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
30
  pass
31
  case "to":
32
  original_language, target_language = target_language, original_language
33
- if flores_sentences(original_language) is None or flores_sentences(target_language) is None:
 
 
 
34
  return []
35
  original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
36
  target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
@@ -70,6 +73,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
70
  )
71
  ]
72
 
 
73
  async def classify_and_evaluate(model, bcp_47, nr):
74
  language = languages[languages["bcp_47"] == bcp_47].iloc[0]
75
  sentences = flores_sentences(language)
@@ -119,9 +123,13 @@ async def classify_and_evaluate(model, bcp_47, nr):
119
  )
120
  true = test_paragraph.topic
121
  others = [t for t in top_topics if t != true]
122
- acc = int(
123
- pred.startswith(true)
124
- or (true in pred and not any(o in pred for o in others))
 
 
 
 
125
  )
126
  except Exception as e:
127
  if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
 
30
  pass
31
  case "to":
32
  original_language, target_language = target_language, original_language
33
+ if (
34
+ flores_sentences(original_language) is None
35
+ or flores_sentences(target_language) is None
36
+ ):
37
  return []
38
  original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
39
  target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
 
73
  )
74
  ]
75
 
76
+
77
  async def classify_and_evaluate(model, bcp_47, nr):
78
  language = languages[languages["bcp_47"] == bcp_47].iloc[0]
79
  sentences = flores_sentences(language)
 
123
  )
124
  true = test_paragraph.topic
125
  others = [t for t in top_topics if t != true]
126
+ acc = (
127
+ int(
128
+ pred.startswith(true)
129
+ or (true in pred and not any(o in pred for o in others))
130
+ )
131
+ if pred
132
+ else 0
133
  )
134
  except Exception as e:
135
  if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
models.json CHANGED
@@ -10,9 +10,31 @@
10
  "license":null,
11
  "creation_date":1733356800000
12
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  {
14
  "id":"deepseek\/deepseek-chat",
15
- "name":"DeepSeek V3 (free)",
16
  "provider_name":"DeepSeek",
17
  "cost":0.0,
18
  "hf_id":"deepseek-ai\/DeepSeek-V3",
@@ -23,7 +45,7 @@
23
  },
24
  {
25
  "id":"deepseek\/deepseek-chat-v3-0324",
26
- "name":"DeepSeek V3 0324 (free)",
27
  "provider_name":"DeepSeek",
28
  "cost":0.0,
29
  "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
@@ -32,6 +54,28 @@
32
  "license":"Mit",
33
  "creation_date":1742774400000
34
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  {
36
  "id":"google\/gemini-2.0-flash-lite-001",
37
  "name":"Gemini 2.0 Flash Lite",
@@ -54,9 +98,53 @@
54
  "license":null,
55
  "creation_date":1744848000000
56
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  {
58
  "id":"google\/gemma-3-27b-it",
59
- "name":"Gemma 3 27B (free)",
60
  "provider_name":"Google",
61
  "cost":0.0,
62
  "hf_id":"google\/gemma-3-27b-it",
@@ -65,6 +153,17 @@
65
  "license":"Gemma",
66
  "creation_date":1740787200000
67
  },
 
 
 
 
 
 
 
 
 
 
 
68
  {
69
  "id":"meta-llama\/llama-3-70b-instruct",
70
  "name":"Llama 3 70B Instruct",
@@ -76,6 +175,17 @@
76
  "license":"Llama3",
77
  "creation_date":1713312000000
78
  },
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "id":"meta-llama\/llama-3.1-70b-instruct",
81
  "name":"Llama 3.1 70B Instruct",
@@ -87,9 +197,31 @@
87
  "license":"Llama3.1",
88
  "creation_date":1721088000000
89
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  {
91
  "id":"meta-llama\/llama-3.3-70b-instruct",
92
- "name":"Llama 3.3 70B Instruct (free)",
93
  "provider_name":"Meta",
94
  "cost":0.0,
95
  "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
@@ -100,7 +232,7 @@
100
  },
101
  {
102
  "id":"meta-llama\/llama-4-maverick",
103
- "name":"Llama 4 Maverick (free)",
104
  "provider_name":"Meta",
105
  "cost":0.0,
106
  "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
@@ -131,9 +263,31 @@
131
  "license":"Mit",
132
  "creation_date":1740355200000
133
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  {
135
  "id":"mistralai\/mistral-nemo",
136
- "name":"Mistral Nemo (free)",
137
  "provider_name":"Mistral",
138
  "cost":0.0,
139
  "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
@@ -155,7 +309,7 @@
155
  },
156
  {
157
  "id":"mistralai\/mistral-small-3.1-24b-instruct",
158
- "name":"Mistral Small 3.1 24B (free)",
159
  "provider_name":"Mistral",
160
  "cost":0.0,
161
  "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
@@ -164,6 +318,28 @@
164
  "license":"Apache 2.0",
165
  "creation_date":1741651200000
166
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  {
168
  "id":"openai\/gpt-4.1-mini",
169
  "name":"GPT-4.1 Mini",
@@ -196,5 +372,16 @@
196
  "type":"Commercial",
197
  "license":null,
198
  "creation_date":1721260800000
 
 
 
 
 
 
 
 
 
 
 
199
  }
200
  ]
 
10
  "license":null,
11
  "creation_date":1733356800000
12
  },
13
+ {
14
+ "id":"anthropic\/claude-3-haiku",
15
+ "name":"Claude 3 Haiku (self-moderated)",
16
+ "provider_name":"Anthropic",
17
+ "cost":1.25,
18
+ "hf_id":null,
19
+ "size":null,
20
+ "type":"Commercial",
21
+ "license":null,
22
+ "creation_date":1710288000000
23
+ },
24
+ {
25
+ "id":"cohere\/command-r",
26
+ "name":"Command R",
27
+ "provider_name":"Cohere",
28
+ "cost":1.5,
29
+ "hf_id":null,
30
+ "size":null,
31
+ "type":"Commercial",
32
+ "license":null,
33
+ "creation_date":1710374400000
34
+ },
35
  {
36
  "id":"deepseek\/deepseek-chat",
37
+ "name":"DeepSeek V3",
38
  "provider_name":"DeepSeek",
39
  "cost":0.0,
40
  "hf_id":"deepseek-ai\/DeepSeek-V3",
 
45
  },
46
  {
47
  "id":"deepseek\/deepseek-chat-v3-0324",
48
+ "name":"DeepSeek V3 0324",
49
  "provider_name":"DeepSeek",
50
  "cost":0.0,
51
  "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
 
54
  "license":"Mit",
55
  "creation_date":1742774400000
56
  },
57
+ {
58
+ "id":"deepseek\/deepseek-r1",
59
+ "name":"R1",
60
+ "provider_name":"DeepSeek",
61
+ "cost":0.0,
62
+ "hf_id":"deepseek-ai\/DeepSeek-R1",
63
+ "size":684531386000.0,
64
+ "type":"Open",
65
+ "license":"Mit",
66
+ "creation_date":1737331200000
67
+ },
68
+ {
69
+ "id":"google\/gemini-2.0-flash-001",
70
+ "name":"Gemini 2.0 Flash",
71
+ "provider_name":"Google",
72
+ "cost":0.4,
73
+ "hf_id":null,
74
+ "size":null,
75
+ "type":"Commercial",
76
+ "license":null,
77
+ "creation_date":1738713600000
78
+ },
79
  {
80
  "id":"google\/gemini-2.0-flash-lite-001",
81
  "name":"Gemini 2.0 Flash Lite",
 
98
  "license":null,
99
  "creation_date":1744848000000
100
  },
101
+ {
102
+ "id":"google\/gemini-2.5-flash-preview-05-20",
103
+ "name":"Gemini 2.5 Flash Preview 05-20",
104
+ "provider_name":"Google",
105
+ "cost":0.6,
106
+ "hf_id":null,
107
+ "size":null,
108
+ "type":"Commercial",
109
+ "license":null,
110
+ "creation_date":1747699200000
111
+ },
112
+ {
113
+ "id":"google\/gemini-flash-1.5",
114
+ "name":"Gemini 1.5 Flash ",
115
+ "provider_name":"Google",
116
+ "cost":0.3,
117
+ "hf_id":null,
118
+ "size":null,
119
+ "type":"Commercial",
120
+ "license":null,
121
+ "creation_date":1715644800000
122
+ },
123
+ {
124
+ "id":"google\/gemini-flash-1.5-8b",
125
+ "name":"Gemini 1.5 Flash 8B",
126
+ "provider_name":"Google",
127
+ "cost":0.15,
128
+ "hf_id":null,
129
+ "size":null,
130
+ "type":"Commercial",
131
+ "license":null,
132
+ "creation_date":1727913600000
133
+ },
134
+ {
135
+ "id":"google\/gemma-2-9b-it",
136
+ "name":"Gemma 2 9B",
137
+ "provider_name":"Google",
138
+ "cost":0.0,
139
+ "hf_id":"google\/gemma-2-9b-it",
140
+ "size":9241705984.0,
141
+ "type":"Open",
142
+ "license":"Gemma",
143
+ "creation_date":1719187200000
144
+ },
145
  {
146
  "id":"google\/gemma-3-27b-it",
147
+ "name":"Gemma 3 27B",
148
  "provider_name":"Google",
149
  "cost":0.0,
150
  "hf_id":"google\/gemma-3-27b-it",
 
153
  "license":"Gemma",
154
  "creation_date":1740787200000
155
  },
156
+ {
157
+ "id":"gryphe\/mythomax-l2-13b",
158
+ "name":"MythoMax 13B",
159
+ "provider_name":"MythoMax 13B",
160
+ "cost":0.07,
161
+ "hf_id":"Gryphe\/MythoMax-L2-13b",
162
+ "size":null,
163
+ "type":"Open",
164
+ "license":"Other",
165
+ "creation_date":1691625600000
166
+ },
167
  {
168
  "id":"meta-llama\/llama-3-70b-instruct",
169
  "name":"Llama 3 70B Instruct",
 
175
  "license":"Llama3",
176
  "creation_date":1713312000000
177
  },
178
+ {
179
+ "id":"meta-llama\/llama-3-8b-instruct",
180
+ "name":"Llama 3 8B Instruct",
181
+ "provider_name":"Meta",
182
+ "cost":0.06,
183
+ "hf_id":"meta-llama\/Meta-Llama-3-8B-Instruct",
184
+ "size":8030261248.0,
185
+ "type":"Open",
186
+ "license":"Llama3",
187
+ "creation_date":1713312000000
188
+ },
189
  {
190
  "id":"meta-llama\/llama-3.1-70b-instruct",
191
  "name":"Llama 3.1 70B Instruct",
 
197
  "license":"Llama3.1",
198
  "creation_date":1721088000000
199
  },
200
+ {
201
+ "id":"meta-llama\/llama-3.1-8b-instruct",
202
+ "name":"Llama 3.1 8B Instruct",
203
+ "provider_name":"Meta",
204
+ "cost":0.0,
205
+ "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
206
+ "size":8030261248.0,
207
+ "type":"Open",
208
+ "license":"Llama3.1",
209
+ "creation_date":1721260800000
210
+ },
211
+ {
212
+ "id":"meta-llama\/llama-3.2-1b-instruct",
213
+ "name":"Llama 3.2 1B Instruct",
214
+ "provider_name":"Meta",
215
+ "cost":0.0,
216
+ "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
217
+ "size":1235814400.0,
218
+ "type":"Open",
219
+ "license":"Llama3.2",
220
+ "creation_date":1726617600000
221
+ },
222
  {
223
  "id":"meta-llama\/llama-3.3-70b-instruct",
224
+ "name":"Llama 3.3 70B Instruct",
225
  "provider_name":"Meta",
226
  "cost":0.0,
227
  "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
 
232
  },
233
  {
234
  "id":"meta-llama\/llama-4-maverick",
235
+ "name":"Llama 4 Maverick",
236
  "provider_name":"Meta",
237
  "cost":0.0,
238
  "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
 
263
  "license":"Mit",
264
  "creation_date":1740355200000
265
  },
266
+ {
267
+ "id":"microsoft\/wizardlm-2-8x22b",
268
+ "name":"WizardLM-2 8x22B",
269
+ "provider_name":"WizardLM-2 8x22B",
270
+ "cost":0.5,
271
+ "hf_id":null,
272
+ "size":null,
273
+ "type":"Commercial",
274
+ "license":null,
275
+ "creation_date":1713225600000
276
+ },
277
+ {
278
+ "id":"mistralai\/mistral-7b-instruct",
279
+ "name":"Mistral 7B Instruct",
280
+ "provider_name":"Mistral",
281
+ "cost":0.0,
282
+ "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
283
+ "size":7248023552.0,
284
+ "type":"Open",
285
+ "license":"Apache 2.0",
286
+ "creation_date":1716336000000
287
+ },
288
  {
289
  "id":"mistralai\/mistral-nemo",
290
+ "name":"Mistral Nemo",
291
  "provider_name":"Mistral",
292
  "cost":0.0,
293
  "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
 
309
  },
310
  {
311
  "id":"mistralai\/mistral-small-3.1-24b-instruct",
312
+ "name":"Mistral Small 3.1 24B",
313
  "provider_name":"Mistral",
314
  "cost":0.0,
315
  "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
 
318
  "license":"Apache 2.0",
319
  "creation_date":1741651200000
320
  },
321
+ {
322
+ "id":"mistralai\/mistral-tiny",
323
+ "name":"Mistral Tiny",
324
+ "provider_name":"Mistral Tiny",
325
+ "cost":0.25,
326
+ "hf_id":null,
327
+ "size":null,
328
+ "type":"Commercial",
329
+ "license":null,
330
+ "creation_date":1704844800000
331
+ },
332
+ {
333
+ "id":"nousresearch\/hermes-3-llama-3.1-405b",
334
+ "name":"Hermes 3 405B Instruct",
335
+ "provider_name":"Nous",
336
+ "cost":0.8,
337
+ "hf_id":"NousResearch\/Hermes-3-Llama-3.1-405B",
338
+ "size":405853388800.0,
339
+ "type":"Open",
340
+ "license":"Llama3",
341
+ "creation_date":1723507200000
342
+ },
343
  {
344
  "id":"openai\/gpt-4.1-mini",
345
  "name":"GPT-4.1 Mini",
 
372
  "type":"Commercial",
373
  "license":null,
374
  "creation_date":1721260800000
375
+ },
376
+ {
377
+ "id":"openai\/gpt-4o-mini-2024-07-18",
378
+ "name":"GPT-4o-mini (2024-07-18)",
379
+ "provider_name":"OpenAI",
380
+ "cost":0.6,
381
+ "hf_id":null,
382
+ "size":null,
383
+ "type":"Commercial",
384
+ "license":null,
385
+ "creation_date":1721260800000
386
  }
387
  ]
results.json CHANGED
The diff for this file is too large to render. See raw diff