# This module handles interfacing with the Hugging Face Hub API.
from typing import Literal
from datetime import datetime

from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value
from datasets.exceptions import DatasetNotFoundError

api = HfApi()
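# Dataset repo that stores the leaderboard; each row is one scored submission.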
LEADERBOARD_ID = "KoelLabs/_IPA-TRANSCRIPTION-EN-SCORES"
LEADERBOARD_FEATURES = Features(
    {
        "display_name": Value("string"),
        "repo_id": Value("string"),
        "repo_hash": Value("string"),
        "repo_last_modified": Value("timestamp[s, tz=UTC]"),
        "submission_timestamp": Value("timestamp[s, tz=UTC]"),
        "average_per": Value("float32"),
        "average_fer": Value("float32"),
        "url": Value("string"),
        "fer_TIMIT": Value("float32"),
        "fer_EpaDB": Value("float32"),
        "fer_PSST": Value("float32"),
        "fer_SpeechOcean": Value("float32"),
        "fer_ISLE": Value("float32"),
    }
)
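# Defaults used to backfill columns that are missing from an already-published
# leaderboard (e.g. when a new per-dataset FER column is added to the schema).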
LEADERBOARD_DEFAULTS = {
    "url": "",
    "fer_TIMIT": None,
    "fer_EpaDB": None,
    "fer_PSST": None,
    "fer_SpeechOcean": None,
    "fer_ISLE": None,
}


def get_repo_info(
    repo_id, type: Literal["model", "dataset", "space"] = "model"
) -> tuple[str, datetime]:
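    """Return the commit sha and last-modified time for a Hub repo.

    Falls back to ("", Unix epoch) if the repo does not exist.
    """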
    try:
        repo_info = api.repo_info(repo_id=repo_id, repo_type=type)
        return repo_info.sha, repo_info.last_modified  # type: ignore
    except RepositoryNotFoundError:
        return "", datetime(year=1970, month=1, day=1)


def get_or_create_leaderboard() -> Dataset:
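    """Load the leaderboard dataset from the Hub.

    Creates an empty leaderboard if none exists and backfills any missing columns.
    """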
    modified = False
    try:
        dataset: Dataset = load_dataset(LEADERBOARD_ID)["train"]  # type: ignore
    except DatasetNotFoundError:
        # The leaderboard dataset does not exist yet: start from an empty table and push it.
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)
        modified = True
    except ValueError:
        # Loading failed for another reason (e.g. the repo exists but has no loadable
        # data); fall back to an empty in-memory table (not pushed in this case).
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)
    # Backfill any columns added to the schema after earlier rows were pushed.
    for col in LEADERBOARD_FEATURES.keys():
        if col not in dataset.column_names:
            modified = True
            dataset = dataset.add_column(col, [LEADERBOARD_DEFAULTS.get(col)] * len(dataset))  # type: ignore
            dataset = dataset.cast_column(col, feature=LEADERBOARD_FEATURES[col])
    if modified:
        dataset.push_to_hub(LEADERBOARD_ID, private=True)
    return dataset

def add_leaderboard_entry(
    display_name: str,
    repo_id: str,
    repo_hash: str,
    repo_last_modified: datetime,
    submission_timestamp: datetime,
    average_per: float,
    average_fer: float,
    url: str,
    per_dataset_fers: dict | None = None,
):
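    """Append one submission row to the leaderboard and push it to the Hub."""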
    if per_dataset_fers is None:
        per_dataset_fers = {}
    existing_dataset = get_or_create_leaderboard()
    # Build a one-row dataset with the same schema and append it to the existing rows.
    new_row = Dataset.from_dict(
        dict(
            display_name=[display_name],
            repo_id=[repo_id],
            repo_hash=[repo_hash],
            repo_last_modified=[repo_last_modified.replace(microsecond=0)],
            submission_timestamp=[submission_timestamp.replace(microsecond=0)],
            average_per=[average_per],
            average_fer=[average_fer],
            url=[url],
            fer_TIMIT=[per_dataset_fers.get("TIMIT")],
            fer_EpaDB=[per_dataset_fers.get("EpaDB")],
            fer_PSST=[per_dataset_fers.get("PSST")],
            fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
            fer_ISLE=[per_dataset_fers.get("ISLE")],
        ),
        features=LEADERBOARD_FEATURES,
    )
    combined_dataset = concatenate_datasets([existing_dataset, new_row])
    combined_dataset.push_to_hub(LEADERBOARD_ID, private=True)

if __name__ == "__main__":
    print(get_repo_info(LEADERBOARD_ID, type="dataset"))
    print(get_or_create_leaderboard().to_pandas().head(5))  # type: ignore
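    # Hypothetical usage sketch (example repo id and scores only; kept commented out
    # so this smoke test does not push a fake entry to the Hub):
    #
    #   from datetime import timezone
    #   repo_hash, repo_last_modified = get_repo_info("example-org/example-model")
    #   add_leaderboard_entry(
    #       display_name="Example Model",
    #       repo_id="example-org/example-model",
    #       repo_hash=repo_hash,
    #       repo_last_modified=repo_last_modified,
    #       submission_timestamp=datetime.now(timezone.utc),
    #       average_per=0.25,
    #       average_fer=0.10,
    #       url="https://huggingface.co/example-org/example-model",
    #       per_dataset_fers={"TIMIT": 0.09, "PSST": 0.12},
    #   )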