Upload from GitHub Actions: Evaluate on 40 languages
Changed files:

- datasets.json +14 -4
- evals/backend.py +16 -2
- evals/main.py +2 -1
- evals/models.py +3 -0
- evals/tasks.py +2 -1
- frontend/src/App.js +57 -1
- frontend/src/components/HistoryPlot.js +3 -3
- frontend/src/components/LanguagePlot.js +9 -9
- frontend/src/components/ScoreColumns.js +2 -2
- frontend/src/components/SpeakerPlot.js +3 -3
- frontend/src/components/WorldMap.js +3 -3
- languages.json +3 -3
- notes/biases.md +24 -0
- notes/maps.md +6 -0
- results.json +0 -0
datasets.json (CHANGED)

```diff
@@ -5,10 +5,20 @@
       "author_url": "https://ai.meta.com",
       "url": "https://huggingface.co/datasets/openlanguagedata/flores_plus",
       "n_languages": 200,
-      "tasks": [
-      …
-      …
-      …
+      "tasks": ["translation"],
+      "parallel": true,
+      "translation": "human",
+      "base": "FLORES",
+      "implemented": true,
+      "group": "Translation"
+    },
+    {
+      "name": "SIB-200",
+      "author": "Academic",
+      "author_url": null,
+      "url": "https://huggingface.co/datasets/Davlan/sib200",
+      "n_languages": 200,
+      "tasks": ["classification"],
       "parallel": true,
       "translation": "human",
       "base": "FLORES",
```
evals/backend.py (CHANGED)

```diff
@@ -29,6 +29,20 @@ task_metrics = [
 ]
 
 
+def compute_normalized_average(df, metrics):
+    """Compute average of min-max normalized metric columns."""
+    normalized_df = df[metrics].copy()
+    for col in metrics:
+        if col in normalized_df.columns:
+            col_min = normalized_df[col].min()
+            col_max = normalized_df[col].max()
+            if col_max > col_min:  # Avoid division by zero
+                normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
+            else:
+                normalized_df[col] = 0  # If all values are the same, set to 0
+    return normalized_df.mean(axis=1, skipna=False)
+
+
 def make_model_table(df, models):
     df = (
         df.groupby(["model", "task", "metric"])
@@ -38,7 +52,7 @@ def make_model_table(df, models):
     df["task_metric"] = df["task"] + "_" + df["metric"]
     df = df.drop(columns=["task", "metric"])
     df = df.pivot(index="model", columns="task_metric", values="score")
-    df["average"] = df…
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
@@ -70,7 +84,7 @@ def make_language_table(df, languages):
     df["task_metric"] = df["task"] + "_" + df["metric"]
     df = df.drop(columns=["task", "metric"])
     df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
-    df["average"] = df…
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[
```
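For reference, here is how the new helper behaves on a toy table. This is a minimal sketch; the column names and scores below are made up, while the real ones come from the `task_metrics` list:

```python
import pandas as pd

def compute_normalized_average(df, metrics):
    """Min-max normalize each metric column, then average row-wise (as in evals/backend.py)."""
    normalized = df[metrics].copy()
    for col in metrics:
        lo, hi = normalized[col].min(), normalized[col].max()
        normalized[col] = (normalized[col] - lo) / (hi - lo) if hi > lo else 0
    return normalized.mean(axis=1, skipna=False)

# Hypothetical scores for two task metrics:
df = pd.DataFrame(
    {"translation_bleu": [0.2, 0.5, 0.8], "classification_accuracy": [0.4, 0.4, 0.9]},
    index=["model-a", "model-b", "model-c"],
)
df["average"] = compute_normalized_average(df, ["translation_bleu", "classification_accuracy"])
print(df["average"])  # model-a: 0.00, model-b: 0.25, model-c: 1.00
```

Rescaling each metric to [0, 1] before averaging keeps a wide-range column (raw BLEU) from dominating a narrow-range one, which is what the updated "Overall" tooltip in ScoreColumns.js refers to.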
evals/main.py (CHANGED)

```diff
@@ -9,9 +9,10 @@ from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 
 n_sentences = 10
-n_languages = …
+n_languages = 40
 n_models = 35
 
+
 # ===== run evaluation and aggregate results =====
 
 
```
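A back-of-the-envelope sketch of the run size under the new config; the exact task fan-out in main.py is not visible in this diff, so the per-task figure below is an assumption:

```python
n_sentences = 10   # sentences per language
n_languages = 40   # languages now evaluated
n_models = 35      # models compared

# Roughly one model call per (model, language, sentence) combination,
# per task and translation direction:
print(n_models * n_languages * n_sentences)  # 14000
```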
evals/models.py (CHANGED)

```diff
@@ -22,9 +22,11 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    # "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
+    # "openai/gpt-4o-2024-11-20", # 10$
     # "openai/gpt-3.5-turbo-0613", # 2$
     # "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
@@ -68,6 +70,7 @@ def get_model(permaslug):
     models = get_models(date.today())
     slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
     if len(slugs) == 0:
+        # the problem is that free models typically have very high rate-limiting
        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 
```
evals/tasks.py (CHANGED)

```diff
@@ -57,9 +57,10 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             references=[target_sentence],
             tokenizer=tokenizer.tokenize,
         )
+        chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
     else:
         bleu_score = {"bleu": 0}
-
+        chrf_score = {"score": 0}
     return [
         {
             "model": model,
```
frontend/src/App.js (CHANGED)

```diff
@@ -10,12 +10,16 @@ import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
 import { Carousel } from 'primereact/carousel'
+import { Dialog } from 'primereact/dialog'
+import { Button } from 'primereact/button'
 
 function App () {
   const [data, setData] = useState(null)
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
+  const [dialogVisible, setDialogVisible] = useState(false)
+
   useEffect(() => {
     fetch('/api/data', {
       method: 'POST',
@@ -37,6 +41,17 @@ function App () {
     })
   }, [selectedLanguages])
 
+  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
+  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
+  useEffect(() => {
+    const handleResize = () => {
+      setWindowWidth(window.innerWidth)
+      setWindowHeight(window.innerHeight)
+    }
+    window.addEventListener('resize', handleResize)
+    return () => window.removeEventListener('resize', handleResize)
+  }, [])
+
   return (
     <PrimeReactProvider>
       <div style={{ height: '100%', display: 'flex', flexDirection: 'column' }}>
@@ -143,9 +158,23 @@ function App () {
             maxWidth: 'min(100vw, 800px)',
             alignItems: 'center',
             justifyContent: 'center',
-            width: '100%'
+            width: '100%',
+            position: 'relative'
           }}
         >
+          <Button
+            icon="pi pi-external-link"
+            className="p-button-text p-button-plain"
+            onClick={() => setDialogVisible(true)}
+            tooltip="Open in larger view"
+            style={{
+              position: 'absolute',
+              top: '10px',
+              right: '10px',
+              zIndex: 1,
+              color: '#666'
+            }}
+          />
           <Carousel
             value={[
               <WorldMap data={data.countries} />,
@@ -163,6 +192,33 @@ function App () {
           </>
         )}
       </main>
+
+      <Dialog
+        visible={dialogVisible}
+        onHide={() => setDialogVisible(false)}
+        style={{ width: '90vw', height: '90vh' }}
+        maximizable
+        modal
+        header="Charts"
+      >
+        {data && (
+          <div style={{ width: '100%', height: '100%' }}>
+            <Carousel
+              value={[
+                <WorldMap data={data.countries} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <LanguagePlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <SpeakerPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <HistoryPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+              ]}
+              numScroll={1}
+              numVisible={1}
+              itemTemplate={item => item}
+              circular
+              style={{ width: '100%', height: 'calc(90vh - 120px)' }}
+            />
+          </div>
+        )}
+      </Dialog>
       </div>
     </PrimeReactProvider>
   )
```
frontend/src/components/HistoryPlot.js (CHANGED)

```diff
@@ -1,7 +1,7 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const HistoryPlot = ({ data }) => {
+const HistoryPlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const models = [...data.model_table] // sort copy, not in place
     .filter(d => d.average !== null)
@@ -18,8 +18,8 @@ const HistoryPlot = ({ data }) => {
   console.log(models)
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
+      width: width,
+      height: height,
       subtitle: 'Model performance over time',
       x: {
         label: 'Date',
```
frontend/src/components/LanguagePlot.js (CHANGED)

```diff
@@ -1,40 +1,40 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const LanguagePlot = ({ data }) => {
+const LanguagePlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
-  const languages = data.language_table.filter(a => a.…
+  const languages = data.language_table.filter(a => a.translation_from_bleu > 0)
   const families = [...new Set(languages.map(a => a.family))]
 
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
-      subtitle: '…
+      width: width,
+      height: height,
+      subtitle: 'Translation quality by language',
       x: {
         label: 'Number of Speakers',
         type: 'log'
       },
       y: {
-        label: '…
+        label: 'Translation quality (spBLEU score for translating from the given language to other languages)'
       },
       marks: [
         Plot.dot(languages, {
           x: 'speakers',
-          y: d => d.…
+          y: d => d.translation_from_bleu,
           r: 'speakers',
           fill: 'family',
           title: d =>
             `${d.language_name}\n${d.speakers.toLocaleString('en-US', {
               notation: 'compact'
-            })} speakers\nScore: ${d.…
+            })} speakers\nScore: ${d.translation_from_bleu.toFixed(2)}`,
           tip: true
         }),
         Plot.text(
           languages.filter(a => a.speakers > 1e8),
           {
             x: 'speakers',
-            y: d => d.…
+            y: d => d.translation_from_bleu,
             text: d => d.language_name,
             fill: 'black',
             frameAnchor: 'left',
```
frontend/src/components/ScoreColumns.js (CHANGED)

```diff
@@ -13,8 +13,8 @@ const scoreBodyTemplate = (field, options = {}) => {
 const ScoreColumns = [
   <Column
     field='average'
-    header='…
-    headerTooltip='Language Proficiency Score (average of all displayed scores)'
+    header='Overall'
+    headerTooltip='Language Proficiency Score (average of all displayed scores, after min-max normalization)'
     sortable
     body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
```
frontend/src/components/SpeakerPlot.js (CHANGED)

```diff
@@ -1,7 +1,7 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const SpeakerPlot = ({ data }) => {
+const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const allSpeakers = data.language_table.reduce(
     (sum, curr) => sum + curr.speakers,
@@ -25,8 +25,8 @@ const SpeakerPlot = ({ data }) => {
 
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
+      width: width,
+      height: height,
       subtitle: 'Number of languages vs speakers covered',
       x: {
         label: 'Languages',
```
frontend/src/components/WorldMap.js (CHANGED)

```diff
@@ -32,7 +32,7 @@ const makeTitle = data => d => {
   return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
 }
 
-const WorldMap = ({ data }) => {
+const WorldMap = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()
 
@@ -51,8 +51,8 @@ const WorldMap = ({ data }) => {
   }, {})
   const plot = Plot.plot({
     subtitle: 'Language Proficiency Score by Country',
-    width: …
-    height: …
+    width: width,
+    height: height,
     projection: 'equal-earth',
     marks: [
       Plot.geo(mapData, {
```
languages.json (CHANGED)

```diff
@@ -235,7 +235,7 @@
     "family":"Austroasiatic",
     "flores_path":"vie_Latn",
     "fleurs_tag":"vi_vn",
-    "commonvoice_hours":6.…
+    "commonvoice_hours":6.3,
     "commonvoice_locale":"vi",
     "in_benchmark":true
   },
@@ -655,7 +655,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"yor_Latn",
     "fleurs_tag":"yo_ng",
-    "commonvoice_hours":6.…
+    "commonvoice_hours":6.3,
     "commonvoice_locale":"yo",
     "in_benchmark":true
   },
@@ -5815,7 +5815,7 @@
     "family":"Indo-European",
     "flores_path":"ltg_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":…
+    "commonvoice_hours":30.0,
     "commonvoice_locale":"ltg",
     "in_benchmark":true
   },
```
notes/biases.md (ADDED)

We want to make a benchmark that objectively

- shows how well-supported or neglected different languages are;
- shows, for each language, the performance of different AI models;
- shows how well different AI models generally perform across languages.

It turns out this is really difficult to do without introducing some kind of bias.

Here's a list of approaches:

- Translate **from English** to `evaluated_language`: This gives an advantage to languages that are similar to English, and makes it impossible to evaluate English itself.
- Translate from **every other language** to `evaluated_language`: This gives an advantage to language families with many members.
- Translate from every other language **(weighted by population)** to `evaluated_language`: The metrics are usually not comparable across evaluated languages; specifically:
  - **BLEU**: May support some languages better than others.
  - **BertScore**: Gives an advantage to English, which it is primarily trained on.
  - **Multilingual BertScore**: Gives an advantage to the languages it is primarily trained on; may also recognize semantic similarity even for untranslated words.
  - **ChrF++**: Better than BLEU.
  - **SpBLEU**, with SentencePiece tokenizers trained separately on `evaluated_language`, as provided for the FLORES+ dataset: Seems okay.
- Translate **from** `evaluated_language` to every other language (weighted by population; a weighted-average sketch follows this list), evaluated with any metric: For two very similar sister languages that are equally well translated, this gives an advantage to the smaller one, because it has the big sister language as an easy target, whereas the big language only has the small sister language as an easy target.
- Translate from `evaluated_language` to every language (`evaluated_language` **itself included**, weighted by population), evaluated with any metric: Gives an advantage to big languages, which trivially get a high score for translating to themselves; but this is fair in the sense of objectively showing "to how many people can I communicate, and to what extent, given this AI model".
- Rather than translation, use **masked language modeling** on `evaluated_language` itself: This still depends on an evaluation metric, which is usually not comparable across languages; see the problems above.
- Use **categorization** of sentences in `evaluated_language` (with the same categories for all languages): Categories may be tied to certain cultures and thus languages, giving an advantage to the language/culture in which the categories were created.
- Use **culture-independent categories** for categorizing sentences in `evaluated_language` via zero-shot prompting with a given set of category labels: The labels should be language-independent yet consistent, which may be difficult; in practice, English labels may be just fine.
- Use culture-independent categorization of sentences in `evaluated_language` via **few-shot prompting**: This seems okay.
notes/maps.md (ADDED)

Possible sources for maps:

- [Natural Earth](https://www.naturalearthdata.com/): Their main version is not politically correct; they do provide additional "world view" data, including for Germany, but not for the UN or other international organizations, and it's not very straightforward to use. Also has some issues with ISO2 codes; one can use [`ISO_A2_EH`](https://github.com/nvkelso/natural-earth-vector/issues/284) to work around that, though Somalia is still lacking.
- [UN](https://geoportal.un.org/arcgis/apps/sites/#/geohub/datasets/d7caaff3ef4b4f7c82689b7c4694ad92/about): Has some countries inverted; we can mostly [correct for that](https://observablehq.com/@bumbeishvili/rewind-geojson) (see the rewinding sketch below), but it still leaves some artifacts in Norway and the Gulf of Amerxico.
- [World Bank](https://datacatalog.worldbank.org/search/dataset/0038272): Has missing ISO2 country codes for France and Norway.
- [EU](https://ec.europa.eu/eurostat/web/gisco/geodata/administrative-units/countries): Displays very weirdly; haven't looked into the details.
results.json (CHANGED)

The diff for this file is too large to render; see the raw diff.