Spaces:
Running
Running
File size: 3,826 Bytes
c2e60bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# This module handles interfacing with the huggingface api
from typing import Literal
from datetime import datetime
from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value
from datasets.exceptions import DatasetNotFoundError
# Shared Hub API client used by every function in this module.
api = HfApi()
# Private Hub dataset repo that stores the leaderboard rows.
LEADERBOARD_ID = "KoelLabs/_IPA-TRANSCRIPTION-EN-SCORES"
# Schema for one leaderboard row. Timestamps are stored tz-aware (UTC) with
# second precision; per-benchmark FER columns are nullable floats.
LEADERBOARD_FEATURES = Features(
    {
        "display_name": Value("string"),
        "repo_id": Value("string"),
        "repo_hash": Value("string"),
        "repo_last_modified": Value("timestamp[s, tz=UTC]"),
        "submission_timestamp": Value("timestamp[s, tz=UTC]"),
        "average_per": Value("float32"),
        "average_fer": Value("float32"),
        "url": Value("string"),
        "fer_TIMIT": Value("float32"),
        "fer_EpaDB": Value("float32"),
        "fer_PSST": Value("float32"),
        "fer_SpeechOcean": Value("float32"),
        "fer_ISLE": Value("float32"),
    }
)
# Fill values used when back-filling columns that were added to the schema
# after rows already existed (see get_or_create_leaderboard).
LEADERBOARD_DEFAULTS = {
    "url": "",
    "fer_TIMIT": None,
    "fer_EpaDB": None,
    "fer_PSST": None,
    "fer_SpeechOcean": None,
    "fer_ISLE": None,
}
def get_repo_info(
    repo_id, type: Literal["model", "dataset", "space"] = "model"
) -> tuple[str, datetime]:
    """Return ``(commit_sha, last_modified)`` for a Hub repo.

    Falls back to ``("", epoch)`` when the repo does not exist, instead of
    raising, so callers can treat "missing" as "infinitely old".

    Args:
        repo_id: Hub repo identifier, e.g. ``"org/name"``.
        type: Kind of repo to look up (``type`` shadows the builtin, but the
            name is part of the public signature, so it is kept).

    Returns:
        Tuple of the repo's commit sha and its last-modified timestamp.
    """
    # Local import keeps the module's top-level import block untouched.
    from datetime import timezone

    try:
        repo_info = api.repo_info(repo_id=repo_id, repo_type=type)
        return repo_info.sha, repo_info.last_modified  # type: ignore
    except RepositoryNotFoundError:
        # Fix: the sentinel must be tz-aware — repo_info.last_modified and the
        # leaderboard's timestamp columns are UTC-aware, and comparing a naive
        # datetime against an aware one raises TypeError.
        return "", datetime(year=1970, month=1, day=1, tzinfo=timezone.utc)
def _empty_leaderboard() -> Dataset:
    """Build an empty leaderboard dataset carrying the full current schema."""
    empty_data = {col: [] for col in LEADERBOARD_FEATURES}
    return Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)


def get_or_create_leaderboard() -> Dataset:
    """Load the leaderboard from the Hub, creating/migrating it if needed.

    Returns:
        The ``train`` split of the leaderboard dataset, guaranteed to contain
        every column in ``LEADERBOARD_FEATURES`` (missing columns are
        back-filled with ``LEADERBOARD_DEFAULTS``).
    """
    modified = False
    try:
        dataset: Dataset = load_dataset(LEADERBOARD_ID)["train"]  # type: ignore
    except DatasetNotFoundError:
        # Repo doesn't exist yet: create it and push the empty dataset below.
        dataset = _empty_leaderboard()
        modified = True
    except ValueError:
        # Repo exists but holds no loadable data (e.g. empty). Start fresh
        # locally but do NOT mark modified — preserves the original behavior
        # of not overwriting an existing repo in this case.
        dataset = _empty_leaderboard()
    # Schema migration: back-fill any column added after rows already existed.
    for col in LEADERBOARD_FEATURES.keys():
        if col not in dataset.column_names:
            modified = True
            dataset = dataset.add_column(col, [LEADERBOARD_DEFAULTS.get(col)] * len(dataset))  # type: ignore
            dataset = dataset.cast_column(col, feature=LEADERBOARD_FEATURES[col])
    if modified:
        dataset.push_to_hub(LEADERBOARD_ID, private=True)
    return dataset
def add_leaderboard_entry(
    display_name: str,
    repo_id: str,
    repo_hash: str,
    repo_last_modified: datetime,
    submission_timestamp: datetime,
    average_per: float,
    average_fer: float,
    url: str,
    per_dataset_fers: dict | None = None,
):
    """Append one submission row to the leaderboard and push it to the Hub.

    Args:
        display_name: Human-readable model name shown on the leaderboard.
        repo_id: Hub repo id of the submitted model.
        repo_hash: Commit sha of the submitted model.
        repo_last_modified: When the model repo was last modified (tz-aware).
        submission_timestamp: When this submission was made (tz-aware).
        average_per: Average phoneme error rate across benchmarks.
        average_fer: Average feature error rate across benchmarks.
        url: Optional link associated with the submission.
        per_dataset_fers: Optional mapping of benchmark name -> FER; missing
            benchmarks are stored as null.
    """
    # Fix: the original used a mutable default ({}), which is shared across
    # calls — use a None sentinel instead (backward-compatible).
    if per_dataset_fers is None:
        per_dataset_fers = {}
    existing_dataset = get_or_create_leaderboard()
    new_row = Dataset.from_dict(
        dict(
            display_name=[display_name],
            repo_id=[repo_id],
            repo_hash=[repo_hash],
            # Schema stores second-precision timestamps; drop microseconds.
            repo_last_modified=[repo_last_modified.replace(microsecond=0)],
            submission_timestamp=[submission_timestamp.replace(microsecond=0)],
            average_per=[average_per],
            average_fer=[average_fer],
            url=[url],
            fer_TIMIT=[per_dataset_fers.get("TIMIT")],
            fer_EpaDB=[per_dataset_fers.get("EpaDB")],
            fer_PSST=[per_dataset_fers.get("PSST")],
            fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
            fer_ISLE=[per_dataset_fers.get("ISLE")],
        ),
        features=LEADERBOARD_FEATURES,
    )
    combined_dataset = concatenate_datasets([existing_dataset, new_row])
    combined_dataset.push_to_hub(LEADERBOARD_ID, private=True)
if __name__ == "__main__":
    # Smoke test: show the leaderboard repo's metadata, then preview rows.
    info = get_repo_info(LEADERBOARD_ID, type="dataset")
    print(info)
    leaderboard = get_or_create_leaderboard()
    print(leaderboard.to_pandas().head(5))  # type: ignore
|