Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Community

davidpomerenke committed about 17 hours ago

Commit

a73f888

verified ·

1 Parent(s): 3dfd880

Upload from GitHub Actions: More models and languages

Browse files

Files changed (8) hide show

evals/main.py +57 -41
evals/models.py +15 -7
frontend/src/App.js +2 -0
frontend/src/components/CostPlot.js +88 -0
frontend/src/components/HistoryPlot.js +5 -4
frontend/src/components/WorldMap.js +0 -1
models.json +55 -0
results.json +0 -0

evals/main.py CHANGED Viewed

@@ -1,15 +1,29 @@
 import asyncio
 import pandas as pd
 from languages import languages
 from models import models
 from tasks import tasks
 from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 n_sentences = 10
-n_languages = 40
 n_models = 35
@@ -17,46 +31,48 @@ n_models = 35
 async def evaluate():
-    print("running evaluations")
-    old_results = pd.read_json("results.json")
-    old_models = pd.read_json("models.json")
-    # get all combinations of model, language and task
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for task_name, task in tasks.items()
-        for lang in languages.iloc[:n_languages].itertuples()
-        for model in models["id"].iloc[:n_models]
-    ]
-    # filter out combinations that have already been evaluated
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-    # run evaluations
-    results = [
-        tasks[task_name](model, bcp_47, i)
-        for i in range(n_sentences)
-        for model, bcp_47, task_name in combis.itertuples(index=False)
-    ]
-    results = await tqdm_asyncio.gather(*results, miniters=1)
-    results = [r for group in results for r in group]
-    args = dict(orient="records", indent=2, force_ascii=False)
-    if results:
-        # aggregate results
-        results = pd.DataFrame(results)
-        results = (
-            results.groupby(["model", "bcp_47", "task", "metric"])
-            .agg({"score": "mean"})
-            .reset_index()
-        )
-        # save results
-        results = pd.concat([old_results, results])
-        results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
-        results.to_json("results.json", **args)
-    # save up-to-date info on models and languages
-    all_models = pd.concat([old_models, pd.DataFrame(models)])
-    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-    all_models.to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
 if __name__ == "__main__":

 import asyncio
+from time import time
+t0 = time()
 import pandas as pd
 from languages import languages
+print(f"loaded languages in {time() - t0:.2f}s")
+t0 = time()
 from models import models
+print(f"loaded models in {time() - t0:.2f}s")
+t0 = time()
 from tasks import tasks
+print(f"loaded tasks in {time() - t0:.2f}s")
+t0 = time()
 from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 n_sentences = 10
 n_models = 35
 async def evaluate():
+    # FIXME we should not need this for loop, but it helps
+    for n_languages in range(0, 200):
+        print(f"running evaluations for {n_languages} languages")
+        old_results = pd.read_json("results.json")
+        old_models = pd.read_json("models.json")
+        # get all combinations of model, language and task
+        combis = [
+            (model, lang.bcp_47, task_name)
+            for task_name, task in tasks.items()
+            for lang in languages.iloc[:n_languages].itertuples()
+            for model in models["id"].iloc[:n_models]
+        ]
+        # filter out combinations that have already been evaluated
+        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+        # run evaluations
+        results = [
+            tasks[task_name](model, bcp_47, i)
+            for i in range(n_sentences)
+            for model, bcp_47, task_name in combis.itertuples(index=False)
+        ]
+        results = await tqdm_asyncio.gather(*results, miniters=1)
+        results = [r for group in results for r in group]
+        args = dict(orient="records", indent=2, force_ascii=False)
+        if results:
+            # aggregate results
+            results = pd.DataFrame(results)
+            results = (
+                results.groupby(["model", "bcp_47", "task", "metric"])
+                .agg({"score": "mean"})
+                .reset_index()
+            )
+            # save results
+            results = pd.concat([old_results, results])
+            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results.to_json("results.json", **args)
+        # save up-to-date info on models and languages
+        all_models = pd.concat([old_models, pd.DataFrame(models)])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
 if __name__ == "__main__":

evals/models.py CHANGED Viewed

@@ -10,9 +10,8 @@ from dotenv import load_dotenv
 from elevenlabs import AsyncElevenLabs
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
-from openai import AsyncOpenAI
 from requests import HTTPError, get
-from openai import PermissionDeniedError
 # for development purposes, all languages will be evaluated on the fast models
 # and only a sample of languages will be evaluated on all models
@@ -48,7 +47,7 @@ important_models = [
 ]
 blocklist = [
-    "microsoft/wizardlm-2-8x22b" # temporarily rate-limited
 ]
 transcription_models = [
@@ -68,7 +67,13 @@ def get_models(date: date):
 def get_model(permaslug):
     models = get_models(date.today())
-    slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
     if len(slugs) == 0:
         # the problem is that free models typically have very high rate-limiting
         print(f"no non-free model found for {permaslug}")
@@ -124,6 +129,7 @@ async def complete(**kwargs) -> str | None:
         raise Exception(response)
     return response.choices[0].message.content.strip()
 @cache
 async def transcribe_elevenlabs(path, model):
     modelname = model.split("/")[-1]
@@ -221,7 +227,9 @@ def load_models(date: date):
     ).dt.date
     models = models.assign(
-        name=or_metadata.str["short_name"].str.replace(" (free)", ""),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         cost=or_metadata.apply(get_cost),
         hf_id=hf_metadata.str["hf_id"],
@@ -230,8 +238,8 @@ def load_models(date: date):
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
-    models = models[models["cost"] <= 2.0].reset_index(drop=True)
     return models
-models = load_models(date.today())

 from elevenlabs import AsyncElevenLabs
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
+from openai import AsyncOpenAI, PermissionDeniedError
 from requests import HTTPError, get
 # for development purposes, all languages will be evaluated on the fast models
 # and only a sample of languages will be evaluated on all models
 ]
 blocklist = [
+    "microsoft/wizardlm-2-8x22b"  # temporarily rate-limited
 ]
 transcription_models = [
 def get_model(permaslug):
     models = get_models(date.today())
+    slugs = [
+        m
+        for m in models
+        if m["permaslug"] == permaslug
+        and m["endpoint"]
+        and not m["endpoint"]["is_free"]
+    ]
     if len(slugs) == 0:
         # the problem is that free models typically have very high rate-limiting
         print(f"no non-free model found for {permaslug}")
         raise Exception(response)
     return response.choices[0].message.content.strip()
 @cache
 async def transcribe_elevenlabs(path, model):
     modelname = model.split("/")[-1]
     ).dt.date
     models = models.assign(
+        name=or_metadata.str["short_name"]
+        .str.replace(" (free)", "")
+        .str.replace(" (self-moderated)", ""),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         cost=or_metadata.apply(get_cost),
         hf_id=hf_metadata.str["hf_id"],
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
+    # models = models[models["cost"] <= 2.0].reset_index(drop=True)
     return models
+models = load_models(date.today())

frontend/src/App.js CHANGED Viewed

@@ -9,6 +9,7 @@ import AutoComplete from './components/AutoComplete'
 import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
 import { Carousel } from 'primereact/carousel'
 import { Dialog } from 'primereact/dialog'
 import { Button } from 'primereact/button'
@@ -211,6 +212,7 @@ function App () {
                     <LanguagePlot data={data} />,
                     <SpeakerPlot data={data} />,
                     <HistoryPlot data={data} />,
                   ]}
                   numScroll={1}
                   numVisible={1}

 import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
+import CostPlot from './components/CostPlot'
 import { Carousel } from 'primereact/carousel'
 import { Dialog } from 'primereact/dialog'
 import { Button } from 'primereact/button'
                     <LanguagePlot data={data} />,
                     <SpeakerPlot data={data} />,
                     <HistoryPlot data={data} />,
+                    <CostPlot data={data} />,
                   ]}
                   numScroll={1}
                   numVisible={1}

frontend/src/components/CostPlot.js ADDED Viewed

	@@ -0,0 +1,88 @@

+import { useRef, useEffect } from 'react'
+import * as Plot from '@observablehq/plot'
+const HistoryPlot = ({ data, width = 750, height = 500 }) => {
+  const containerRef = useRef()
+  useEffect(() => {
+    const models = [...data.model_table] // sort copy, not in place
+      .filter(d => d.average !== null && d.cost > 0)
+      .sort((a, b) => a.cost - b.cost)
+      .reduce((acc, curr) => {
+        const last = acc[acc.length - 1]?.maxAverage || 0
+        acc.push({
+          ...curr,
+          maxAverage: Math.max(last, curr.average),
+          newRecord: curr.average > last
+        })
+        return acc
+      }, [])
+    let USDollar = new Intl.NumberFormat('en-US', {
+      style: 'currency',
+      currency: 'USD'
+    })
+    const plot = Plot.plot({
+      width: width,
+      height: height,
+      subtitle: 'Cost vs Performance',
+      x: {
+        label: 'Cost (USD)',
+        type: 'log',
+        // format dollar / ct
+        tickFormat: d => USDollar.format(d)
+      },
+      y: {
+        label: 'Language Proficiency Score'
+      },
+      symbol: {
+        legend: true
+      },
+      marks: [
+        Plot.dot(models, {
+          x: d => d.cost,
+          y: d => d.average,
+          symbol: 'provider_name',
+          stroke: 'provider_name',
+          title: d =>
+            `${d.provider_name} - ${d.name} (${
+              d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
+            })\nCost: ${USDollar.format(d.cost)}\nScore: ${d.average.toFixed(
+              2
+            )}`,
+          tip: true
+        }),
+        Plot.line(
+          [
+            ...models.filter(d => d.newRecord),
+            {
+              cost: models.map(d => d.cost).reduce((a, b) => Math.max(a, b), 0),
+              maxAverage: models[models.length - 1].maxAverage
+            }
+          ],
+          {
+            x: d => d.cost,
+            y: d => d.maxAverage,
+            curve: 'catmull-rom',
+            strokeOpacity: 0.3
+          }
+        )
+      ]
+    })
+    containerRef.current.append(plot)
+    return () => plot.remove()
+  }, [data])
+  return (
+    <div
+      ref={containerRef}
+      style={{
+        width: '100%',
+        height: '100%',
+        display: 'flex',
+        alignItems: 'center',
+        justifyContent: 'center'
+      }}
+    />
+  )
+}
+export default HistoryPlot

frontend/src/components/HistoryPlot.js CHANGED Viewed

@@ -15,7 +15,6 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
       })
       return acc
     }, [])
-  console.log(models)
   useEffect(() => {
     const plot = Plot.plot({
       width: width,
@@ -36,12 +35,14 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
         Plot.dot(models, {
           x: d => d.creation_date,
           y: d => d.average,
-          symbol: "provider_name",
-          stroke: "provider_name",
           title: d =>
             `${d.provider_name} - ${d.name} (${
               d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
-            })\nPublished: ${d.creation_date}\nScore: ${d.average.toFixed(2)}`,
           tip: true
         }),
         Plot.line(

       })
       return acc
     }, [])
   useEffect(() => {
     const plot = Plot.plot({
       width: width,
         Plot.dot(models, {
           x: d => d.creation_date,
           y: d => d.average,
+          symbol: 'provider_name',
+          stroke: 'provider_name',
           title: d =>
             `${d.provider_name} - ${d.name} (${
               d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
+            })\nPublished: ${new Date(
+              d.creation_date
+            ).toLocaleDateString()}\nScore: ${d.average.toFixed(2)}`,
           tip: true
         }),
         Plot.line(

frontend/src/components/WorldMap.js CHANGED Viewed

@@ -43,7 +43,6 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
   }, [])
   useEffect(() => {
-    console.log('countries', data)
     if (mapData === undefined || data === undefined) return
     const countriesDict = data.reduce((acc, country) => {
       acc[country.iso2] = country

   }, [])
   useEffect(() => {
     if (mapData === undefined || data === undefined) return
     const countriesDict = data.reduce((acc, country) => {
       acc[country.iso2] = country

models.json CHANGED Viewed

@@ -10,6 +10,39 @@
     "license":null,
     "creation_date":1733356800000
   },
   {
     "id":"deepseek\/deepseek-chat",
     "name":"DeepSeek V3",
@@ -98,6 +131,17 @@
     "license":null,
     "creation_date":1747699200000
   },
   {
     "id":"google\/gemini-flash-1.5",
     "name":"Gemini 1.5 Flash ",
@@ -263,6 +307,17 @@
     "license":"Apache 2.0",
     "creation_date":1741651200000
   },
   {
     "id":"openai\/gpt-4.1-mini",
     "name":"GPT-4.1 Mini",

     "license":null,
     "creation_date":1733356800000
   },
+  {
+    "id":"anthropic\/claude-3.5-sonnet",
+    "name":"Claude 3.5 Sonnet",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1729555200000
+  },
+  {
+    "id":"anthropic\/claude-3.7-sonnet",
+    "name":"Claude 3.7 Sonnet",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1740355200000
+  },
+  {
+    "id":"anthropic\/claude-sonnet-4",
+    "name":"Claude Sonnet 4",
+    "provider_name":"Anthropic",
+    "cost":15.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1747872000000
+  },
   {
     "id":"deepseek\/deepseek-chat",
     "name":"DeepSeek V3",
     "license":null,
     "creation_date":1747699200000
   },
+  {
+    "id":"google\/gemini-2.5-pro-preview-05-06",
+    "name":"Gemini 2.5 Pro Preview 05-06",
+    "provider_name":"Google",
+    "cost":10.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1746576000000
+  },
   {
     "id":"google\/gemini-flash-1.5",
     "name":"Gemini 1.5 Flash ",
     "license":"Apache 2.0",
     "creation_date":1741651200000
   },
+  {
+    "id":"openai\/gpt-4.1",
+    "name":"GPT-4.1",
+    "provider_name":"OpenAI",
+    "cost":8.0,
+    "hf_id":null,
+    "size":null,
+    "type":"closed-source",
+    "license":null,
+    "creation_date":1744588800000
+  },
   {
     "id":"openai\/gpt-4.1-mini",
     "name":"GPT-4.1 Mini",

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff