# whisperkit-benchmarks / .github/scripts/check_dataset_update.py
# Commit d3a1d6c by ardaatahan: "Add parity, test coverage and distinguish
# between device support and test coverage" (3.72 kB).
import json
import os
from datetime import datetime, timedelta
from github import Github
from huggingface_hub import HfApi, login
def check_dataset_updates(dataset_id):
    """Detect new benchmark data in a HF dataset and update the local cache.

    Compares the dataset's current revision sha against the one cached in
    ``dashboard_data/version.json``. When the sha is unchanged, writes
    ``has_updates=false`` to GITHUB_OUTPUT and returns. Otherwise, collects
    the benchmark commit hashes from the last six weeks, maps each to a
    whisperkit release version via the GitHub API, appends the new entries
    to the cache file, and writes ``has_updates=true``.

    Args:
        dataset_id: Hugging Face dataset repo id, e.g. "org/name".

    Environment:
        GH_TOKEN: GitHub token for the argmaxinc/whisperkit repo.
        GITHUB_OUTPUT: Path of the GitHub Actions step-output file.
    """
    api = HfApi()
    github = Github(os.environ["GH_TOKEN"])
    repo = github.get_repo("argmaxinc/whisperkit")

    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha

    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")

    # Load the cached state once, tolerating a missing cache on the first
    # run (the previous version read the file unconditionally and crashed
    # when it did not exist yet).
    cached_data = {}
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)

    # Short-circuit before any GitHub API calls when nothing changed.
    if cached_data.get("sha") == current_sha:
        with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
            print("has_updates=false", file=fh)
        return

    releases = cached_data.get("releases", [])
    versions = cached_data.get("versions", [])

    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )

    # Directory names look like "<iso-date>_<commit-hash>"; keep only
    # entries committed within the last six weeks.
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)
    new_commit_hashes = []
    for entry_name in (item.path.split("/")[-1] for item in repo_tree):
        date_str, commit_hash = entry_name.split("_")
        entry_date = datetime.strptime(date_str, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        if entry_date >= cutoff_date:
            new_commit_hashes.append(commit_hash)

    # Resolve each recent commit to (hash, author-date, release version);
    # skip commits GitHub cannot resolve or that map to no release.
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            commit_date = commit.commit.author.date
            release_version = get_commit_version(repo, commit_hash)
            if release_version:
                commit_info.append((commit_hash, commit_date, release_version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {str(e)}")
            continue

    # Sort chronologically so cached lists stay in commit-date order.
    commit_info.sort(key=lambda info: info[1])
    new_releases = [info[0] for info in commit_info]
    new_versions = [info[2] for info in commit_info]

    # Append only hashes not already present in the cache.
    updated_releases = []
    updated_versions = []
    for release_hash, release_version in zip(new_releases, new_versions):
        if release_hash not in releases:
            updated_releases.append(release_hash)
            updated_versions.append(release_version)

    # Persist the new state; create the cache directory on first run.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": releases + updated_releases,
                "versions": versions + updated_versions,
            },
            f,
        )

    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)
def get_commit_version(repo, commit_hash):
    """Map a whisperkit commit to the release version that contains it.

    Releases are scanned in ascending creation order; the first release
    created at or after the commit's author date wins. A commit newer than
    every release maps to the latest release.

    Args:
        repo: PyGithub Repository object (argmaxinc/whisperkit).
        commit_hash: Commit sha to resolve.

    Returns:
        The matching release tag without its leading "v", or None when the
        repository has no releases or any API call fails.
    """
    try:
        releases = sorted(repo.get_releases(), key=lambda r: r.created_at)
        if not releases:
            # Previously this fell through to releases[-1] and surfaced as
            # a swallowed IndexError; handle the empty case explicitly.
            return None
        commit_date = repo.get_commit(commit_hash).commit.author.date
        matched = next(
            (release for release in releases if commit_date <= release.created_at),
            releases[-1],  # commit is newer than every release
        )
        tag = matched.tag_name
        # Strip exactly one leading "v": lstrip("v") would remove *all*
        # leading v's and mangle tags such as "vv1".
        return tag[1:] if tag.startswith("v") else tag
    except Exception as e:
        print(f"Error processing commit {commit_hash}: {str(e)}")
        return None
if __name__ == "__main__":
    # Authenticate with the Hugging Face Hub first; HF_TOKEN must be set in
    # the environment (e.g. as a GitHub Actions secret).
    login(token=os.environ["HF_TOKEN"])
    # Check the evals dataset for new benchmark commits and emit the
    # has_updates flag consumed by downstream workflow steps.
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")