Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Community

davidpomerenke committed on May 22

Commit

f840423

verified ·

1 Parent(s): 913253a

Upload from GitHub Actions: Update model ranking fetching

Browse files

Files changed (5) hide show

evals/main.py +5 -5
evals/models.py +14 -22
evals/tasks.py +12 -4
models.json +194 -7
results.json +0 -0

evals/main.py CHANGED Viewed

@@ -9,8 +9,8 @@ from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 n_sentences = 10
-n_languages = 10
-n_models = 10
 # ===== run evaluation and aggregate results =====
@@ -31,8 +31,8 @@ async def evaluate():
     ]
     # filter out combinations that have already been evaluated
     combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    # combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-    # combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
     # run evaluations
     results = [
         tasks[task_name](model, bcp_47, i)
@@ -50,7 +50,7 @@ async def evaluate():
             .reset_index()
         )
         # save results
-        # results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)

 # ===== config =====
 n_sentences = 10
+n_languages = 18
+n_models = 22
 # ===== run evaluation and aggregate results =====
     ]
     # filter out combinations that have already been evaluated
     combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
     # run evaluations
     results = [
         tasks[task_name](model, bcp_47, i)
             .reset_index()
         )
         # save results
+        results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)

evals/models.py CHANGED Viewed

@@ -45,10 +45,6 @@ important_models = [
     "amazon/nova-micro-v1",  # 0.09$
 ]
-blocklist = [
-    "google/gemini-2.5-pro-exp-03-25"  # rate limit too low
-]
 transcription_models = [
     "elevenlabs/scribe_v1",
     "openai/whisper-large-v3",
@@ -68,7 +64,7 @@ def get_model(permaslug):
     models = get_models(date.today())
     slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
     if len(slugs) == 0:
-        print(f"no model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
@@ -90,9 +86,9 @@ def get_historical_popular_models(date: date):
 @cache
 def get_current_popular_models(date: date):
-    raw = get("https://openrouter.ai/rankings").text
-    data = re.search(r'{\\"rankMap\\":(.*)\}\]\\n"\]\)</script>', raw).group(1)
-    data = json.loads(data.replace("\\", ""))["day"]
     data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
     models = [get_model(model["model_permaslug"]) for model in data]
     return [m for m in models if m]
@@ -110,16 +106,13 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
 @cache
-async def complete(**kwargs):
     async with openrouter_rate_limit:
         try:
             response = await client.chat.completions.create(**kwargs)
         except PermissionDeniedError as e:
-            if e["error"]["metadata"]["reason"] in ["violence", "hate", "sexual", "self-harm", "harassment"]:
-                print(e)
-                return None
-            else:
-                raise e
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()
@@ -206,13 +199,12 @@ def get_cost(row):
 @cache
 def load_models(date: date):
-    # popular_models = (
-    #     get_historical_popular_models(date.today())[:15]
-    #     + get_current_popular_models(date.today())[:15]
-    # )
-    # popular_models = [m["slug"] for m in popular_models]
-    # models = set(important_models + popular_models) - set(blocklist)
-    models = set(important_models) - set(blocklist)
     models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
@@ -222,7 +214,7 @@ def load_models(date: date):
     ).dt.date
     models = models.assign(
-        name=or_metadata.str["short_name"],
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         cost=or_metadata.apply(get_cost),
         hf_id=hf_metadata.str["hf_id"],

     "amazon/nova-micro-v1",  # 0.09$
 ]
 transcription_models = [
     "elevenlabs/scribe_v1",
     "openai/whisper-large-v3",
     models = get_models(date.today())
     slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
     if len(slugs) == 0:
+        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 @cache
 def get_current_popular_models(date: date):
+    raw = get("https://openrouter.ai/rankings?view=day").text.replace("\\", "")
+    data = re.search(r'"rankingData":(.*),"rankingType":"day"', raw).group(1)
+    data = json.loads(data)
     data = sorted(data, key=lambda x: x["total_prompt_tokens"], reverse=True)
     models = [get_model(model["model_permaslug"]) for model in data]
     return [m for m in models if m]
 @cache
+async def complete(**kwargs) -> str | None:
     async with openrouter_rate_limit:
         try:
             response = await client.chat.completions.create(**kwargs)
         except PermissionDeniedError as e:
+            print(e)
+            return None
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()
 @cache
 def load_models(date: date):
+    popular_models = (
+        get_historical_popular_models(date.today())[:30]
+        + get_current_popular_models(date.today())[:10]
+    )
+    popular_models = [m["slug"] for m in popular_models]
+    models = set(important_models + popular_models)
     models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)
     ).dt.date
     models = models.assign(
+        name=or_metadata.str["short_name"].str.replace(" (free)", ""),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         cost=or_metadata.apply(get_cost),
         hf_id=hf_metadata.str["hf_id"],

evals/tasks.py CHANGED Viewed

@@ -30,7 +30,10 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             pass
         case "to":
             original_language, target_language = target_language, original_language
-    if flores_sentences(original_language) is None or flores_sentences(target_language) is None:
         return []
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
@@ -70,6 +73,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
         )
     ]
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = flores_sentences(language)
@@ -119,9 +123,13 @@ async def classify_and_evaluate(model, bcp_47, nr):
         )
         true = test_paragraph.topic
         others = [t for t in top_topics if t != true]
-        acc = int(
-            pred.startswith(true)
-            or (true in pred and not any(o in pred for o in others))
         )
     except Exception as e:
         if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):

             pass
         case "to":
             original_language, target_language = target_language, original_language
+    if (
+        flores_sentences(original_language) is None
+        or flores_sentences(target_language) is None
+    ):
         return []
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
         )
     ]
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = flores_sentences(language)
         )
         true = test_paragraph.topic
         others = [t for t in top_topics if t != true]
+        acc = (
+            int(
+                pred.startswith(true)
+                or (true in pred and not any(o in pred for o in others))
+            )
+            if pred
+            else 0
         )
     except Exception as e:
         if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):

models.json CHANGED Viewed

@@ -10,9 +10,31 @@
     "license":null,
     "creation_date":1733356800000
   },
   {
     "id":"deepseek\/deepseek-chat",
-    "name":"DeepSeek V3 (free)",
     "provider_name":"DeepSeek",
     "cost":0.0,
     "hf_id":"deepseek-ai\/DeepSeek-V3",
@@ -23,7 +45,7 @@
   },
   {
     "id":"deepseek\/deepseek-chat-v3-0324",
-    "name":"DeepSeek V3 0324 (free)",
     "provider_name":"DeepSeek",
     "cost":0.0,
     "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
@@ -32,6 +54,28 @@
     "license":"Mit",
     "creation_date":1742774400000
   },
   {
     "id":"google\/gemini-2.0-flash-lite-001",
     "name":"Gemini 2.0 Flash Lite",
@@ -54,9 +98,53 @@
     "license":null,
     "creation_date":1744848000000
   },
   {
     "id":"google\/gemma-3-27b-it",
-    "name":"Gemma 3 27B (free)",
     "provider_name":"Google",
     "cost":0.0,
     "hf_id":"google\/gemma-3-27b-it",
@@ -65,6 +153,17 @@
     "license":"Gemma",
     "creation_date":1740787200000
   },
   {
     "id":"meta-llama\/llama-3-70b-instruct",
     "name":"Llama 3 70B Instruct",
@@ -76,6 +175,17 @@
     "license":"Llama3",
     "creation_date":1713312000000
   },
   {
     "id":"meta-llama\/llama-3.1-70b-instruct",
     "name":"Llama 3.1 70B Instruct",
@@ -87,9 +197,31 @@
     "license":"Llama3.1",
     "creation_date":1721088000000
   },
   {
     "id":"meta-llama\/llama-3.3-70b-instruct",
-    "name":"Llama 3.3 70B Instruct (free)",
     "provider_name":"Meta",
     "cost":0.0,
     "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
@@ -100,7 +232,7 @@
   },
   {
     "id":"meta-llama\/llama-4-maverick",
-    "name":"Llama 4 Maverick (free)",
     "provider_name":"Meta",
     "cost":0.0,
     "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
@@ -131,9 +263,31 @@
     "license":"Mit",
     "creation_date":1740355200000
   },
   {
     "id":"mistralai\/mistral-nemo",
-    "name":"Mistral Nemo (free)",
     "provider_name":"Mistral",
     "cost":0.0,
     "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
@@ -155,7 +309,7 @@
   },
   {
     "id":"mistralai\/mistral-small-3.1-24b-instruct",
-    "name":"Mistral Small 3.1 24B (free)",
     "provider_name":"Mistral",
     "cost":0.0,
     "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
@@ -164,6 +318,28 @@
     "license":"Apache 2.0",
     "creation_date":1741651200000
   },
   {
     "id":"openai\/gpt-4.1-mini",
     "name":"GPT-4.1 Mini",
@@ -196,5 +372,16 @@
     "type":"Commercial",
     "license":null,
     "creation_date":1721260800000
   }
 ]

     "license":null,
     "creation_date":1733356800000
   },
+  {
+    "id":"anthropic\/claude-3-haiku",
+    "name":"Claude 3 Haiku (self-moderated)",
+    "provider_name":"Anthropic",
+    "cost":1.25,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1710288000000
+  },
+  {
+    "id":"cohere\/command-r",
+    "name":"Command R",
+    "provider_name":"Cohere",
+    "cost":1.5,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1710374400000
+  },
   {
     "id":"deepseek\/deepseek-chat",
+    "name":"DeepSeek V3",
     "provider_name":"DeepSeek",
     "cost":0.0,
     "hf_id":"deepseek-ai\/DeepSeek-V3",
   },
   {
     "id":"deepseek\/deepseek-chat-v3-0324",
+    "name":"DeepSeek V3 0324",
     "provider_name":"DeepSeek",
     "cost":0.0,
     "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
     "license":"Mit",
     "creation_date":1742774400000
   },
+  {
+    "id":"deepseek\/deepseek-r1",
+    "name":"R1",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-R1",
+    "size":684531386000.0,
+    "type":"Open",
+    "license":"Mit",
+    "creation_date":1737331200000
+  },
+  {
+    "id":"google\/gemini-2.0-flash-001",
+    "name":"Gemini 2.0 Flash",
+    "provider_name":"Google",
+    "cost":0.4,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1738713600000
+  },
   {
     "id":"google\/gemini-2.0-flash-lite-001",
     "name":"Gemini 2.0 Flash Lite",
     "license":null,
     "creation_date":1744848000000
   },
+  {
+    "id":"google\/gemini-2.5-flash-preview-05-20",
+    "name":"Gemini 2.5 Flash Preview 05-20",
+    "provider_name":"Google",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1747699200000
+  },
+  {
+    "id":"google\/gemini-flash-1.5",
+    "name":"Gemini 1.5 Flash ",
+    "provider_name":"Google",
+    "cost":0.3,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1715644800000
+  },
+  {
+    "id":"google\/gemini-flash-1.5-8b",
+    "name":"Gemini 1.5 Flash 8B",
+    "provider_name":"Google",
+    "cost":0.15,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1727913600000
+  },
+  {
+    "id":"google\/gemma-2-9b-it",
+    "name":"Gemma 2 9B",
+    "provider_name":"Google",
+    "cost":0.0,
+    "hf_id":"google\/gemma-2-9b-it",
+    "size":9241705984.0,
+    "type":"Open",
+    "license":"Gemma",
+    "creation_date":1719187200000
+  },
   {
     "id":"google\/gemma-3-27b-it",
+    "name":"Gemma 3 27B",
     "provider_name":"Google",
     "cost":0.0,
     "hf_id":"google\/gemma-3-27b-it",
     "license":"Gemma",
     "creation_date":1740787200000
   },
+  {
+    "id":"gryphe\/mythomax-l2-13b",
+    "name":"MythoMax 13B",
+    "provider_name":"MythoMax 13B",
+    "cost":0.07,
+    "hf_id":"Gryphe\/MythoMax-L2-13b",
+    "size":null,
+    "type":"Open",
+    "license":"Other",
+    "creation_date":1691625600000
+  },
   {
     "id":"meta-llama\/llama-3-70b-instruct",
     "name":"Llama 3 70B Instruct",
     "license":"Llama3",
     "creation_date":1713312000000
   },
+  {
+    "id":"meta-llama\/llama-3-8b-instruct",
+    "name":"Llama 3 8B Instruct",
+    "provider_name":"Meta",
+    "cost":0.06,
+    "hf_id":"meta-llama\/Meta-Llama-3-8B-Instruct",
+    "size":8030261248.0,
+    "type":"Open",
+    "license":"Llama3",
+    "creation_date":1713312000000
+  },
   {
     "id":"meta-llama\/llama-3.1-70b-instruct",
     "name":"Llama 3.1 70B Instruct",
     "license":"Llama3.1",
     "creation_date":1721088000000
   },
+  {
+    "id":"meta-llama\/llama-3.1-8b-instruct",
+    "name":"Llama 3.1 8B Instruct",
+    "provider_name":"Meta",
+    "cost":0.0,
+    "hf_id":"meta-llama\/Llama-3.1-8B-Instruct",
+    "size":8030261248.0,
+    "type":"Open",
+    "license":"Llama3.1",
+    "creation_date":1721260800000
+  },
+  {
+    "id":"meta-llama\/llama-3.2-1b-instruct",
+    "name":"Llama 3.2 1B Instruct",
+    "provider_name":"Meta",
+    "cost":0.0,
+    "hf_id":"meta-llama\/Llama-3.2-1B-Instruct",
+    "size":1235814400.0,
+    "type":"Open",
+    "license":"Llama3.2",
+    "creation_date":1726617600000
+  },
   {
     "id":"meta-llama\/llama-3.3-70b-instruct",
+    "name":"Llama 3.3 70B Instruct",
     "provider_name":"Meta",
     "cost":0.0,
     "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
   },
   {
     "id":"meta-llama\/llama-4-maverick",
+    "name":"Llama 4 Maverick",
     "provider_name":"Meta",
     "cost":0.0,
     "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
     "license":"Mit",
     "creation_date":1740355200000
   },
+  {
+    "id":"microsoft\/wizardlm-2-8x22b",
+    "name":"WizardLM-2 8x22B",
+    "provider_name":"WizardLM-2 8x22B",
+    "cost":0.5,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1713225600000
+  },
+  {
+    "id":"mistralai\/mistral-7b-instruct",
+    "name":"Mistral 7B Instruct",
+    "provider_name":"Mistral",
+    "cost":0.0,
+    "hf_id":"mistralai\/Mistral-7B-Instruct-v0.3",
+    "size":7248023552.0,
+    "type":"Open",
+    "license":"Apache 2.0",
+    "creation_date":1716336000000
+  },
   {
     "id":"mistralai\/mistral-nemo",
+    "name":"Mistral Nemo",
     "provider_name":"Mistral",
     "cost":0.0,
     "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
   },
   {
     "id":"mistralai\/mistral-small-3.1-24b-instruct",
+    "name":"Mistral Small 3.1 24B",
     "provider_name":"Mistral",
     "cost":0.0,
     "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
     "license":"Apache 2.0",
     "creation_date":1741651200000
   },
+  {
+    "id":"mistralai\/mistral-tiny",
+    "name":"Mistral Tiny",
+    "provider_name":"Mistral Tiny",
+    "cost":0.25,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1704844800000
+  },
+  {
+    "id":"nousresearch\/hermes-3-llama-3.1-405b",
+    "name":"Hermes 3 405B Instruct",
+    "provider_name":"Nous",
+    "cost":0.8,
+    "hf_id":"NousResearch\/Hermes-3-Llama-3.1-405B",
+    "size":405853388800.0,
+    "type":"Open",
+    "license":"Llama3",
+    "creation_date":1723507200000
+  },
   {
     "id":"openai\/gpt-4.1-mini",
     "name":"GPT-4.1 Mini",
     "type":"Commercial",
     "license":null,
     "creation_date":1721260800000
+  },
+  {
+    "id":"openai\/gpt-4o-mini-2024-07-18",
+    "name":"GPT-4o-mini (2024-07-18)",
+    "provider_name":"OpenAI",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1721260800000
   }
 ]

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff