# This module handles interfacing with the Hugging Face Hub API
from typing import Literal
from datetime import datetime, timezone
from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value
from datasets.exceptions import DatasetNotFoundError

api = HfApi()

LEADERBOARD_ID = "KoelLabs/_IPA-TRANSCRIPTION-EN-SCORES"
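# Schema for the leaderboard dataset: one row per submitted model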
LEADERBOARD_FEATURES = Features(
{
"display_name": Value("string"),
"repo_id": Value("string"),
"repo_hash": Value("string"),
"repo_last_modified": Value("timestamp[s, tz=UTC]"),
"submission_timestamp": Value("timestamp[s, tz=UTC]"),
"average_per": Value("float32"),
"average_fer": Value("float32"),
"url": Value("string"),
"fer_TIMIT": Value("float32"),
"fer_EpaDB": Value("float32"),
"fer_PSST": Value("float32"),
"fer_SpeechOcean": Value("float32"),
"fer_ISLE": Value("float32"),
}
)
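# Values used to backfill columns added to the schema after rows already exist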
LEADERBOARD_DEFAULTS = {
"url": "",
"fer_TIMIT": None,
"fer_EpaDB": None,
"fer_PSST": None,
"fer_SpeechOcean": None,
"fer_ISLE": None,
}


def get_repo_info(
    repo_id, repo_type: Literal["model", "dataset", "space"] = "model"
) -> tuple[str, datetime]:
    """Return a repo's (commit hash, last modified time), or a sentinel if it does not exist."""
    try:
        repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return repo_info.sha, repo_info.last_modified  # type: ignore
    except RepositoryNotFoundError:
        # Sentinel: empty hash and the Unix epoch (UTC-aware, matching Hub timestamps)
        return "", datetime(1970, 1, 1, tzinfo=timezone.utc)


def get_or_create_leaderboard() -> Dataset:
    """Load the leaderboard dataset from the Hub, creating or migrating it as needed."""
    modified = False
    try:
        dataset: Dataset = load_dataset(LEADERBOARD_ID)["train"]  # type: ignore
    except DatasetNotFoundError:
        # The dataset repo does not exist yet: start empty and push it below
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)
        modified = True
    except ValueError:
        # The repo exists but contains no data files yet: start empty without re-pushing
        empty_data = {col: [] for col in LEADERBOARD_FEATURES.keys()}
        dataset = Dataset.from_dict(empty_data, features=LEADERBOARD_FEATURES)
    # Backfill any columns that were added to the schema after the dataset was created
    for col in LEADERBOARD_FEATURES.keys():
        if col not in dataset.column_names:
            modified = True
            dataset = dataset.add_column(col, [LEADERBOARD_DEFAULTS.get(col)] * len(dataset))  # type: ignore
            dataset = dataset.cast_column(col, feature=LEADERBOARD_FEATURES[col])
    if modified:
        dataset.push_to_hub(LEADERBOARD_ID, private=True)
    return dataset


def add_leaderboard_entry(
    display_name: str,
    repo_id: str,
    repo_hash: str,
    repo_last_modified: datetime,
    submission_timestamp: datetime,
    average_per: float,
    average_fer: float,
    url: str,
    per_dataset_fers: dict | None = None,
):
    """Append one submission row to the leaderboard and push the result to the Hub."""
    # Avoid a mutable default argument; treat a missing mapping as empty
    per_dataset_fers = per_dataset_fers or {}
    existing_dataset = get_or_create_leaderboard()
new_row = Dataset.from_dict(
dict(
display_name=[display_name],
repo_id=[repo_id],
repo_hash=[repo_hash],
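            # Truncate to whole seconds to match the timestamp[s, tz=UTC] feature type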
repo_last_modified=[repo_last_modified.replace(microsecond=0)],
submission_timestamp=[submission_timestamp.replace(microsecond=0)],
average_per=[average_per],
average_fer=[average_fer],
url=[url],
fer_TIMIT=[per_dataset_fers.get("TIMIT")],
fer_EpaDB=[per_dataset_fers.get("EpaDB")],
fer_PSST=[per_dataset_fers.get("PSST")],
fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
fer_ISLE=[per_dataset_fers.get("ISLE")],
),
features=LEADERBOARD_FEATURES,
)
combined_dataset = concatenate_datasets([existing_dataset, new_row])
combined_dataset.push_to_hub(LEADERBOARD_ID, private=True)


if __name__ == "__main__":
    print(get_repo_info(LEADERBOARD_ID, repo_type="dataset"))
    print(get_or_create_leaderboard().to_pandas().head(5))  # type: ignore
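
    # Sketch of recording a new submission; the repo id and scores below are
    # hypothetical, shown only to illustrate the expected arguments:
    # repo_hash, repo_last_modified = get_repo_info("some-org/some-model")
    # add_leaderboard_entry(
    #     display_name="Some Model",
    #     repo_id="some-org/some-model",
    #     repo_hash=repo_hash,
    #     repo_last_modified=repo_last_modified,
    #     submission_timestamp=datetime.now(timezone.utc),
    #     average_per=0.25,
    #     average_fer=0.15,
    #     url="https://huggingface.co/some-org/some-model",
    #     per_dataset_fers={"TIMIT": 0.12, "PSST": 0.18},
    # )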