File size: 5,211 Bytes
47e279a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import json
import re
from pathlib import Path

import requests
import streamlit as st
import yaml
from huggingface_hub import hf_hub_download
from streamlit_tags import st_tags

# Exact same regex as in the Hub server. Please keep in sync.
# Group 1 captures the text found between two `---` delimiter lines.
REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")

# Mapping used by `main` to turn language codes into display names.
# Decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("languages.json", encoding="utf-8") as f:
    lang2name = json.load(f)


def try_parse_yaml(yaml_block):
    """Parse ``yaml_block`` with ``yaml.SafeLoader``.

    Args:
        yaml_block: Text of the metadata section to parse.

    Returns:
        Whatever ``yaml.load`` produced, or ``None`` when a
        ``yaml.YAMLError`` was raised. In the failure case a message is
        printed and an explanation is shown with ``st.error``.
    """
    try:
        return yaml.load(yaml_block, yaml.SafeLoader)
    except yaml.YAMLError as err:
        print("Error while parsing the metadata YAML:")

        # Errors without position information only get a generic hint.
        if not hasattr(err, "problem_mark"):
            st.error(
                "Something went wrong while parsing the metadata. "
                "Make sure it's written according to the YAML spec!"
            )
            return None

        # Build "<mark>\n  <problem>[ <context>]\n<advice>" piece by piece;
        # the context part is only included when the error carries one.
        context = err.context
        pieces = [str(err.problem_mark), "\n  ", str(err.problem)]
        if context is not None:
            pieces.extend([" ", str(context)])
        pieces.append("\nPlease correct the README.md and retry.")
        st.error("".join(pieces))
        return None


def _ensure_list(value):
    """Return ``value`` as a list: ``None`` -> ``[]``, a non-list -> ``[value]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def main():
    """Render the Streamlit page that loads and edits a model's metadata.

    Step 1 asks for a model id, downloads that repository's ``README.md``
    with ``hf_hub_download``, extracts the section matched by
    ``REGEX_YAML_BLOCK`` and parses it with ``try_parse_yaml``. When any of
    these steps fails, the script run is ended with ``st.stop()``.

    Step 2 shows widgets pre-filled from the parsed metadata for the
    language codes, the training datasets and the model name.
    """
    st.markdown("## 1. Load your model's metadata")
    st.markdown("Enter your model's path below.")
    # Strip once so that the emptiness check, the download and the default
    # model name all see the same value.
    model_id = st.text_input("", placeholder="<username>/<model>").strip()
    if not model_id:
        st.stop()
    try:
        readme_path = hf_hub_download(model_id, filename="README.md")
    except requests.exceptions.HTTPError:
        st.error(
            f"ERROR: https://huggingface.co/{model_id}/blob/main/README.md "
            f"not found, make sure you've entered a correct model path!"
        )
        st.stop()

    # Read as UTF-8 explicitly instead of relying on the locale default.
    content = Path(readme_path).read_text(encoding="utf-8")
    match = REGEX_YAML_BLOCK.search(content)
    if match:
        meta_yaml = match.group(1)
    else:
        st.error(
            "ERROR: Couldn't find the metadata section inside your model's `README.md`. Do you have some basic metadata "
            "enclosed in `---` as described in [the Hub documentation](https://huggingface.co/docs/hub/model-repos#model-card-metadata)?"
        )
        st.stop()

    metadata = try_parse_yaml(meta_yaml)
    if metadata is None:
        st.stop()
    if not isinstance(metadata, dict):
        # Everything below indexes the metadata by key, so a YAML list or
        # scalar cannot be edited here.
        st.error(
            "ERROR: The metadata section of your model's `README.md` must be "
            "a YAML mapping of `key: value` pairs."
        )
        st.stop()
    st.success("Successfully loaded the metadata!")
    with st.expander("Inspect the parsed metadata for debugging"):
        st.json(metadata)

    st.markdown("## 2. Edit the data")

    ############################
    # LANGUAGES
    ############################
    st.markdown("### Language(s)")
    st.markdown(
        "For each spoken language that your model handles, enter an "
        "[ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) language code, or "
        "find an appropriate alternative from "
        "[our list here](https://huggingface.co/spaces/huggingface/hf-speech-bench/blob/main/languages.json). "
        "When in doubt, use the most generic language code, e.g. `en` instead of `en-GB` and `en-US`."
    )
    st.markdown("*Example*: `cs, hsb, pl`")
    # The field may be missing, a single code or a list of codes.
    metadata["language"] = _ensure_list(metadata.get("language"))

    languages = st_tags(
        label="", text="add more if needed, and press enter", value=metadata["language"]
    )
    # Codes without an entry in the lookup table are displayed unchanged.
    lang_names = [lang2name.get(lang, lang) for lang in languages]
    st.markdown("These languages will be parsed by the leaderboard as: ")
    st.code(", ".join(lang_names))

    ############################
    # TRAIN DATASETS
    ############################
    st.markdown("### Training dataset(s)")
    st.markdown("List the datasets that your model was trained on.")
    st.markdown("*Example*: `librispeech_asr, mozilla-foundation/common_voice_8_0`")

    # Same normalisation as for the languages: a single dataset written as a
    # plain string must become a one-element list, otherwise the membership
    # test below would turn into a substring test.
    metadata["datasets"] = _ensure_list(metadata.get("datasets"))

    train_datasets = st_tags(
        label="", text="add more if needed, and press enter", value=metadata["datasets"]
    )
    if "common_voice" in train_datasets:
        st.warning(
            "WARNING: `common_voice` is deprecated, please replace it with its equivalent: "
            "`mozilla-foundation/common_voice_6_1`"
        )

    ############################
    # MODEL NAME
    ############################
    st.markdown("### Model name")
    st.markdown("Enter a descriptive name for your model.")
    st.markdown("*Example*: `XLS-R Wav2Vec2 LM Spanish by Jane Doe`")

    # NOTE(review): Hub model cards appear to spell this key `model-index`
    # (with a hyphen) - confirm whether `model_index` is intended here.
    if not metadata.get("model_index"):
        # Missing, empty or null: start from a single empty entry.
        metadata["model_index"] = [{}]
    # Only fall back to the repository name when the first entry has no name.
    # (The previous check tested the literal string "model_index" and
    # therefore always overwrote an existing name.)
    if "name" not in metadata["model_index"][0]:
        metadata["model_index"][0]["name"] = model_id.split("/")[-1]
    model_name = st.text_input("", value=metadata["model_index"][0]["name"])

    ############################
    # EVAL DATASETS
    ############################
    st.markdown("### Evaluation metrics")

# Script entry point: build the page when this file is run directly.
if __name__ == "__main__":
    main()