import asyncio
from time import time
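# the project modules below are imported one at a time so the timing prints
# show which of languages / models / tasks is slow to load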

t0 = time()

import pandas as pd
from languages import languages

print(f"loaded languages in {time() - t0:.2f}s")
t0 = time()

from models import models

print(f"loaded models in {time() - t0:.2f}s")
t0 = time()

from tasks import tasks

print(f"loaded tasks in {time() - t0:.2f}s")
t0 = time()

from tqdm.asyncio import tqdm_asyncio

# ===== config =====

n_sentences = 10  # sentences evaluated per (model, language, task) combination
n_models = 35  # only the first n_models entries of the models table are evaluated


# ===== run evaluation and aggregate results =====


async def evaluate():
    # FIXME: this outer loop should not be necessary, but growing the language set
    # one language per iteration keeps each batch small and saves results incrementally
    for n_languages in range(0, 200):
        print(f"running evaluations for {n_languages} languages")
        old_results = pd.read_json("results.json")
        old_models = pd.read_json("models.json")
        # get all combinations of model, language and task
        combis = [
            (model, lang.bcp_47, task_name)
            for task_name, task in tasks.items()
            for lang in languages.iloc[:n_languages].itertuples()
            for model in models["id"].iloc[:n_models]
        ]
        # filter out combinations that have already been evaluated
        combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
        combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
        combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
        # run evaluations
        results = [
            tasks[task_name](model, bcp_47, i)
            for i in range(n_sentences)
            for model, bcp_47, task_name in combis.itertuples(index=False)
        ]
        results = await tqdm_asyncio.gather(*results, miniters=1)
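        # each task call returns a list of score records; flatten them into one list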
        results = [r for group in results for r in group]
        args = dict(orient="records", indent=2, force_ascii=False)  # shared to_json kwargs
        if results:
            # aggregate results
            results = pd.DataFrame(results)
            results = (
                results.groupby(["model", "bcp_47", "task", "metric"])
                .agg({"score": "mean"})
                .reset_index()
            )
            # save results
            results = pd.concat([old_results, results])
            results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
            results.to_json("results.json", **args)
        # save up-to-date info on models and languages
        all_models = pd.concat([old_models, pd.DataFrame(models)])
        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
        all_models.to_json("models.json", **args)
        pd.DataFrame(languages).to_json("languages.json", **args)


if __name__ == "__main__":
    asyncio.run(evaluate())