# This module handles interfacing with the Hugging Face Hub API
# to read and update the leaderboard scores dataset.

from typing import Literal, Optional
from datetime import datetime, timezone

from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value
from datasets.exceptions import DatasetNotFoundError

api = HfApi()

LEADERBOARD_ID = "KoelLabs/_IPA-TRANSCRIPTION-EN-SCORES"
LEADERBOARD_FEATURES = Features(
    {
        "display_name": Value("string"),
        "repo_id": Value("string"),
        "repo_hash": Value("string"),
        "repo_last_modified": Value("timestamp[s, tz=UTC]"),
        "submission_timestamp": Value("timestamp[s, tz=UTC]"),
        "average_per": Value("float32"),
        "average_fer": Value("float32"),
        "url": Value("string"),
        "fer_TIMIT": Value("float32"),
        "fer_EpaDB": Value("float32"),
        "fer_PSST": Value("float32"),
        "fer_SpeechOcean": Value("float32"),
        "fer_ISLE": Value("float32"),
    }
)
LEADERBOARD_DEFAULTS = {
    "url": "",
    "fer_TIMIT": None,
    "fer_EpaDB": None,
    "fer_PSST": None,
    "fer_SpeechOcean": None,
    "fer_ISLE": None,
}
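
# For reference, a populated leaderboard row would look roughly like the
# following (hypothetical values, shown only to illustrate the schema above):
#
#     {
#         "display_name": "Example Phoneme Model",
#         "repo_id": "user/example-phoneme-model",
#         "repo_hash": "abc123",
#         "repo_last_modified": datetime(2024, 1, 1, tzinfo=timezone.utc),
#         "submission_timestamp": datetime(2024, 1, 2, tzinfo=timezone.utc),
#         "average_per": 0.25,
#         "average_fer": 0.10,
#         "url": "https://example.com/model-card",
#         "fer_TIMIT": 0.09,
#         "fer_EpaDB": 0.11,
#         "fer_PSST": 0.12,
#         "fer_SpeechOcean": 0.08,
#         "fer_ISLE": 0.10,
#     }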


def get_repo_info(
    repo_id, repo_type: Literal["model", "dataset", "space"] = "model"
) -> tuple[str, datetime]:
    """Return (commit sha, last modified) for a hub repo, or sentinels if it is missing."""
    try:
        repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return repo_info.sha, repo_info.last_modified  # type: ignore
    except RepositoryNotFoundError:
        # Sentinels: empty sha and the UTC epoch. The epoch is timezone-aware so it
        # stays comparable with the tz-aware datetimes the hub returns and matches
        # the timestamp[s, tz=UTC] leaderboard features.
        return "", datetime(1970, 1, 1, tzinfo=timezone.utc)


def get_or_create_leaderboard() -> Dataset:
    """Load the leaderboard dataset from the hub, creating or migrating it as needed."""
    modified = False
    try:
        dataset: Dataset = load_dataset(LEADERBOARD_ID)["train"]  # type: ignore
    except DatasetNotFoundError:
        # The repo does not exist yet: start from an empty dataset and push it below.
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)
        modified = True
    except ValueError:
        # The repo exists but has no loadable data files yet: build the empty
        # dataset locally without re-pushing; the first entry will populate it.
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)

    # Schema migration: add any columns introduced after existing rows were pushed,
    # filling them with their defaults and casting to the declared feature type.
    for col in LEADERBOARD_FEATURES.keys():
        if col not in dataset.column_names:
            modified = True
            dataset = dataset.add_column(col, [LEADERBOARD_DEFAULTS.get(col)] * len(dataset))  # type: ignore
            dataset = dataset.cast_column(col, feature=LEADERBOARD_FEATURES[col])

    if modified:
        dataset.push_to_hub(LEADERBOARD_ID, private=True)

    return dataset


def add_leaderboard_entry(
    display_name: str,
    repo_id: str,
    repo_hash: str,
    repo_last_modified: datetime,
    submission_timestamp: datetime,
    average_per: float,
    average_fer: float,
    url: str,
    per_dataset_fers: Optional[dict] = None,  # None instead of a mutable {} default
):
    """Append one entry to the leaderboard and push the combined dataset to the hub."""
    per_dataset_fers = per_dataset_fers or {}
    existing_dataset = get_or_create_leaderboard()
    new_row = Dataset.from_dict(
        dict(
            display_name=[display_name],
            repo_id=[repo_id],
            repo_hash=[repo_hash],
            repo_last_modified=[repo_last_modified.replace(microsecond=0)],
            submission_timestamp=[submission_timestamp.replace(microsecond=0)],
            average_per=[average_per],
            average_fer=[average_fer],
            url=[url],
            fer_TIMIT=[per_dataset_fers.get("TIMIT")],
            fer_EpaDB=[per_dataset_fers.get("EpaDB")],
            fer_PSST=[per_dataset_fers.get("PSST")],
            fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
            fer_ISLE=[per_dataset_fers.get("ISLE")],
        ),
        features=LEADERBOARD_FEATURES,
    )
    combined_dataset = concatenate_datasets([existing_dataset, new_row])
    combined_dataset.push_to_hub(LEADERBOARD_ID, private=True)
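
# A minimal usage sketch with hypothetical values (left commented out because
# calling it pushes to the hub):
#
#     sha, last_modified = get_repo_info("user/example-phoneme-model")
#     add_leaderboard_entry(
#         display_name="Example Phoneme Model",
#         repo_id="user/example-phoneme-model",
#         repo_hash=sha,
#         repo_last_modified=last_modified,
#         submission_timestamp=datetime.now(timezone.utc),
#         average_per=0.25,
#         average_fer=0.10,
#         url="https://example.com/model-card",
#         per_dataset_fers={"TIMIT": 0.09, "PSST": 0.12},
#     )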


if __name__ == "__main__":
    print(get_repo_info(LEADERBOARD_ID, repo_type="dataset"))
    print(get_or_create_leaderboard().to_pandas().head(5))  # type: ignore