Upload from GitHub Actions: Evaluate on 40 languages
Changed files:

- datasets.json +14 -4
- evals/backend.py +16 -2
- evals/main.py +2 -1
- evals/models.py +3 -0
- evals/tasks.py +2 -1
- frontend/src/App.js +57 -1
- frontend/src/components/HistoryPlot.js +3 -3
- frontend/src/components/LanguagePlot.js +9 -9
- frontend/src/components/ScoreColumns.js +2 -2
- frontend/src/components/SpeakerPlot.js +3 -3
- frontend/src/components/WorldMap.js +3 -3
- languages.json +3 -3
- notes/biases.md +24 -0
- notes/maps.md +6 -0
- results.json +0 -0
datasets.json (CHANGED)

```diff
@@ -5,10 +5,20 @@
       "author_url": "https://ai.meta.com",
       "url": "https://huggingface.co/datasets/openlanguagedata/flores_plus",
       "n_languages": 200,
-      "tasks": [
-      …
-      …
-      …
+      "tasks": ["translation"],
+      "parallel": true,
+      "translation": "human",
+      "base": "FLORES",
+      "implemented": true,
+      "group": "Translation"
+    },
+    {
+      "name": "SIB-200",
+      "author": "Academic",
+      "author_url": null,
+      "url": "https://huggingface.co/datasets/Davlan/sib200",
+      "n_languages": 200,
+      "tasks": ["classification"],
       "parallel": true,
       "translation": "human",
       "base": "FLORES",
```
evals/backend.py (CHANGED)

```diff
@@ -29,6 +29,20 @@ task_metrics = [
 ]
 
 
+def compute_normalized_average(df, metrics):
+    """Compute average of min-max normalized metric columns."""
+    normalized_df = df[metrics].copy()
+    for col in metrics:
+        if col in normalized_df.columns:
+            col_min = normalized_df[col].min()
+            col_max = normalized_df[col].max()
+            if col_max > col_min:  # Avoid division by zero
+                normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
+            else:
+                normalized_df[col] = 0  # If all values are the same, set to 0
+    return normalized_df.mean(axis=1, skipna=False)
+
+
 def make_model_table(df, models):
     df = (
         df.groupby(["model", "task", "metric"])
@@ -38,7 +52,7 @@ def make_model_table(df, models):
     df["task_metric"] = df["task"] + "_" + df["metric"]
     df = df.drop(columns=["task", "metric"])
     df = df.pivot(index="model", columns="task_metric", values="score")
-    df["average"] = df…
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
@@ -70,7 +84,7 @@ def make_language_table(df, languages):
     df["task_metric"] = df["task"] + "_" + df["metric"]
     df = df.drop(columns=["task", "metric"])
     df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
-    df["average"] = df…
+    df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
     df = df[
```
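For reference, here is how the new helper behaves on a toy table. This is a minimal sketch; the column names and scores below are made up, while the real ones come from the `task_metrics` list:

```python
import pandas as pd

def compute_normalized_average(df, metrics):
    """Min-max normalize each metric column, then average row-wise (as in evals/backend.py)."""
    normalized = df[metrics].copy()
    for col in metrics:
        lo, hi = normalized[col].min(), normalized[col].max()
        normalized[col] = (normalized[col] - lo) / (hi - lo) if hi > lo else 0
    return normalized.mean(axis=1, skipna=False)

# Hypothetical scores for two task metrics:
df = pd.DataFrame(
    {"translation_bleu": [0.2, 0.5, 0.8], "classification_accuracy": [0.4, 0.4, 0.9]},
    index=["model-a", "model-b", "model-c"],
)
df["average"] = compute_normalized_average(df, ["translation_bleu", "classification_accuracy"])
print(df["average"])  # model-a: 0.00, model-b: 0.25, model-c: 1.00
```

Rescaling each metric to [0, 1] before averaging keeps a wide-range column (raw BLEU) from dominating a narrow-range one, which is what the updated "Overall" tooltip in ScoreColumns.js refers to.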
evals/main.py (CHANGED)

```diff
@@ -9,9 +9,10 @@ from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 
 n_sentences = 10
-n_languages = …
+n_languages = 40
 n_models = 35
 
+
 # ===== run evaluation and aggregate results =====
 
 
```
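A back-of-the-envelope sketch of the run size under the new config; the exact task fan-out in main.py is not visible in this diff, so the per-task figure below is an assumption:

```python
n_sentences = 10   # sentences per language
n_languages = 40   # languages now evaluated
n_models = 35      # models compared

# Roughly one model call per (model, language, sentence) combination,
# per task and translation direction:
print(n_models * n_languages * n_sentences)  # 14000
```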
evals/models.py (CHANGED)

```diff
@@ -22,9 +22,11 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
     # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    # "openai/gpt-4.1", # 8$
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
+    # "openai/gpt-4o-2024-11-20", # 10$
     # "openai/gpt-3.5-turbo-0613", # 2$
     # "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
@@ -68,6 +70,7 @@ def get_model(permaslug):
     models = get_models(date.today())
     slugs = [m for m in models if m["permaslug"] == permaslug and m["endpoint"] and not m["endpoint"]["is_free"]]
     if len(slugs) == 0:
+        # the problem is that free models typically have very high rate-limiting
        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 
```
evals/tasks.py (CHANGED)

```diff
@@ -57,9 +57,10 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             references=[target_sentence],
             tokenizer=tokenizer.tokenize,
         )
+        chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
     else:
         bleu_score = {"bleu": 0}
-
+        chrf_score = {"score": 0}
     return [
         {
             "model": model,
```
frontend/src/App.js (CHANGED)

```diff
@@ -10,12 +10,16 @@ import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
 import { Carousel } from 'primereact/carousel'
+import { Dialog } from 'primereact/dialog'
+import { Button } from 'primereact/button'
 
 function App () {
   const [data, setData] = useState(null)
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
+  const [dialogVisible, setDialogVisible] = useState(false)
+
   useEffect(() => {
     fetch('/api/data', {
       method: 'POST',
@@ -37,6 +41,17 @@ function App () {
     })
   }, [selectedLanguages])
 
+  const [windowWidth, setWindowWidth] = useState(window.innerWidth)
+  const [windowHeight, setWindowHeight] = useState(window.innerHeight)
+  useEffect(() => {
+    const handleResize = () => {
+      setWindowWidth(window.innerWidth)
+      setWindowHeight(window.innerHeight)
+    }
+    window.addEventListener('resize', handleResize)
+    return () => window.removeEventListener('resize', handleResize)
+  }, [])
+
   return (
     <PrimeReactProvider>
       <div style={{ height: '100%', display: 'flex', flexDirection: 'column' }}>
@@ -143,9 +158,23 @@ function App () {
             maxWidth: 'min(100vw, 800px)',
             alignItems: 'center',
             justifyContent: 'center',
-            width: '100%'
+            width: '100%',
+            position: 'relative'
           }}
         >
+          <Button
+            icon="pi pi-external-link"
+            className="p-button-text p-button-plain"
+            onClick={() => setDialogVisible(true)}
+            tooltip="Open in larger view"
+            style={{
+              position: 'absolute',
+              top: '10px',
+              right: '10px',
+              zIndex: 1,
+              color: '#666'
+            }}
+          />
           <Carousel
             value={[
               <WorldMap data={data.countries} />,
@@ -163,6 +192,33 @@ function App () {
           </>
         )}
       </main>
+
+      <Dialog
+        visible={dialogVisible}
+        onHide={() => setDialogVisible(false)}
+        style={{ width: '90vw', height: '90vh' }}
+        maximizable
+        modal
+        header="Charts"
+      >
+        {data && (
+          <div style={{ width: '100%', height: '100%' }}>
+            <Carousel
+              value={[
+                <WorldMap data={data.countries} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <LanguagePlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <SpeakerPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+                <HistoryPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
+              ]}
+              numScroll={1}
+              numVisible={1}
+              itemTemplate={item => item}
+              circular
+              style={{ width: '100%', height: 'calc(90vh - 120px)' }}
+            />
+          </div>
+        )}
+      </Dialog>
       </div>
     </PrimeReactProvider>
   )
```
frontend/src/components/HistoryPlot.js (CHANGED)

```diff
@@ -1,7 +1,7 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const HistoryPlot = ({ data }) => {
+const HistoryPlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const models = [...data.model_table] // sort copy, not in place
     .filter(d => d.average !== null)
@@ -18,8 +18,8 @@ const HistoryPlot = ({ data }) => {
   console.log(models)
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
+      width: width,
+      height: height,
       subtitle: 'Model performance over time',
       x: {
         label: 'Date',
```
frontend/src/components/LanguagePlot.js (CHANGED)

```diff
@@ -1,40 +1,40 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const LanguagePlot = ({ data }) => {
+const LanguagePlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
-  const languages = data.language_table.filter(a => a.…
+  const languages = data.language_table.filter(a => a.translation_from_bleu > 0)
   const families = [...new Set(languages.map(a => a.family))]
 
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
-      subtitle: '…
+      width: width,
+      height: height,
+      subtitle: 'Translation quality by language',
       x: {
         label: 'Number of Speakers',
         type: 'log'
       },
       y: {
-        label: '…
+        label: 'Translation quality (spBLEU score for translating from the given language to other languages)'
       },
       marks: [
         Plot.dot(languages, {
           x: 'speakers',
-          y: d => d.…
+          y: d => d.translation_from_bleu,
           r: 'speakers',
           fill: 'family',
           title: d =>
             `${d.language_name}\n${d.speakers.toLocaleString('en-US', {
               notation: 'compact'
-            })} speakers\nScore: ${d.…
+            })} speakers\nScore: ${d.translation_from_bleu.toFixed(2)}`,
           tip: true
         }),
         Plot.text(
           languages.filter(a => a.speakers > 1e8),
           {
             x: 'speakers',
-            y: d => d.…
+            y: d => d.translation_from_bleu,
             text: d => d.language_name,
             fill: 'black',
             frameAnchor: 'left',
```
frontend/src/components/ScoreColumns.js (CHANGED)

```diff
@@ -13,8 +13,8 @@ const scoreBodyTemplate = (field, options = {}) => {
 const ScoreColumns = [
   <Column
     field='average'
-    header='…
-    headerTooltip='Language Proficiency Score (average of all displayed scores)'
+    header='Overall'
+    headerTooltip='Language Proficiency Score (average of all displayed scores, after min-max normalization)'
     sortable
     body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
```
frontend/src/components/SpeakerPlot.js (CHANGED)

```diff
@@ -1,7 +1,7 @@
 import { useRef, useEffect } from 'react'
 import * as Plot from '@observablehq/plot'
 
-const SpeakerPlot = ({ data }) => {
+const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const allSpeakers = data.language_table.reduce(
     (sum, curr) => sum + curr.speakers,
@@ -25,8 +25,8 @@ const SpeakerPlot = ({ data }) => {
 
   useEffect(() => {
     const plot = Plot.plot({
-      width: …
-      height: …
+      width: width,
+      height: height,
       subtitle: 'Number of languages vs speakers covered',
       x: {
         label: 'Languages',
```
frontend/src/components/WorldMap.js (CHANGED)

```diff
@@ -32,7 +32,7 @@ const makeTitle = data => d => {
   return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
 }
 
-const WorldMap = ({ data }) => {
+const WorldMap = ({ data, width = 750, height = 500 }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()
 
@@ -51,8 +51,8 @@ const WorldMap = ({ data }) => {
   }, {})
   const plot = Plot.plot({
     subtitle: 'Language Proficiency Score by Country',
-    width: …
-    height: …
+    width: width,
+    height: height,
     projection: 'equal-earth',
     marks: [
       Plot.geo(mapData, {
```
languages.json (CHANGED)

```diff
@@ -235,7 +235,7 @@
     "family":"Austroasiatic",
     "flores_path":"vie_Latn",
     "fleurs_tag":"vi_vn",
-    "commonvoice_hours":6.…
+    "commonvoice_hours":6.3,
     "commonvoice_locale":"vi",
     "in_benchmark":true
   },
@@ -655,7 +655,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"yor_Latn",
     "fleurs_tag":"yo_ng",
-    "commonvoice_hours":6.…
+    "commonvoice_hours":6.3,
     "commonvoice_locale":"yo",
     "in_benchmark":true
   },
@@ -5815,7 +5815,7 @@
     "family":"Indo-European",
     "flores_path":"ltg_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":…
+    "commonvoice_hours":30.0,
     "commonvoice_locale":"ltg",
     "in_benchmark":true
   },
```
notes/biases.md (ADDED)

We want to make a benchmark that objectively

- shows how well-supported or neglected different languages are;
- shows, for each language, the performance of different AI models;
- shows how well different AI models generally perform across languages.

It turns out this is really difficult to do without introducing some kind of bias.

Here's a list of approaches:

- Translate **from English** to `evaluated_language`: This gives an advantage to languages that are similar to English, and makes it impossible to evaluate English itself.
- Translate from **every other language** to `evaluated_language`: This gives an advantage to language families with many members.
- Translate from every other language **(weighted by population)** to `evaluated_language`: The metrics are usually not comparable across evaluated languages; specifically:
  - **BLEU**: May support some languages better than others.
  - **BertScore**: Gives an advantage to English, which it is primarily trained on.
  - **Multilingual BertScore**: Gives an advantage to the languages it is primarily trained on; may also recognize semantic similarity even for untranslated words.
  - **ChrF++**: Better than BLEU.
  - **SpBLEU**, with SentencePiece tokenizers trained separately on `evaluated_language`, as provided for the FLORES+ dataset: Seems okay.
- Translate **from** `evaluated_language` to every other language (weighted by population; a weighted-average sketch follows this list), evaluated with any metric: For two very similar sister languages that are equally well translated, this gives an advantage to the smaller one, because it has the big sister language as an easy target, whereas the big language only has the small sister language as an easy target.
- Translate from `evaluated_language` to every language (`evaluated_language` **itself included**, weighted by population), evaluated with any metric: Gives an advantage to big languages, which trivially get a high score for translating to themselves; but this is fair in the sense of objectively showing "to how many people can I communicate, and to what extent, given this AI model".
- Rather than translation, use **masked language modeling** on `evaluated_language` itself: This still depends on an evaluation metric, which is usually not comparable across languages; see the problems above.
- Use **categorization** of sentences in `evaluated_language` (with the same categories for all languages): Categories may be tied to certain cultures and thus languages, giving an advantage to the language/culture in which the categories were created.
- Use **culture-independent categories** for categorizing sentences in `evaluated_language` via zero-shot prompting with a given set of category labels: The labels should be language-independent yet consistent, which may be difficult; in practice, English labels may be just fine.
- Use culture-independent categorization of sentences in `evaluated_language` via **few-shot prompting**: This seems okay.
notes/maps.md (ADDED)

Possible sources for maps:

- [Natural Earth](https://www.naturalearthdata.com/): Their main version is not politically correct; they do provide additional "world view" data, including for Germany, but not for the UN or other international organizations, and it's not very straightforward to use. Also has some issues with ISO2 codes; one can use [`ISO_A2_EH`](https://github.com/nvkelso/natural-earth-vector/issues/284) to work around that, though Somalia is still lacking.
- [UN](https://geoportal.un.org/arcgis/apps/sites/#/geohub/datasets/d7caaff3ef4b4f7c82689b7c4694ad92/about): Has some countries inverted; we can mostly [correct for that](https://observablehq.com/@bumbeishvili/rewind-geojson) (see the rewinding sketch below), but it still leaves some artifacts in Norway and the Gulf of Amerxico.
- [World Bank](https://datacatalog.worldbank.org/search/dataset/0038272): Has missing ISO2 country codes for France and Norway.
- [EU](https://ec.europa.eu/eurostat/web/gisco/geodata/administrative-units/countries): Displays very weirdly; haven't looked into the details.
results.json (CHANGED)

The diff for this file is too large to render; see the raw diff.