|
import glob |
|
import os |
|
import random |
|
|
|
import torch |
|
|
|
|
|
|
|
|
|
def split_dictionary_into_chunks(input_dict, split_n):
    """
    Partition a dictionary into a list of smaller dictionaries.

    Items keep their insertion order. Each chunk holds at most
    ``len(input_dict) // split_n + 1`` items, so for a positive ``split_n``
    the result has at most ``split_n`` chunks and may have fewer. An empty
    input produces a list containing a single empty dictionary.
    """
    chunk_capacity = (len(input_dict) // split_n) + 1
    chunks = [{}]
    for key, value in input_dict.items():
        # open a fresh chunk once the current one has reached its capacity
        if len(chunks[-1]) >= chunk_capacity:
            chunks.append({})
        chunks[-1][key] = value
    return chunks
|
|
|
|
|
def limit_to_n(path_to_transcript_dict, n=40000):
    """
    Cap a path-to-transcript dictionary at ``n`` entries.

    If the dictionary has more than ``n`` entries, a new dictionary with
    ``n`` randomly sampled entries is returned. Otherwise the very same
    dictionary object is handed back unchanged.
    """
    if len(path_to_transcript_dict) <= n:
        return path_to_transcript_dict
    sampled_paths = random.sample(list(path_to_transcript_dict.keys()), n)
    return {path: path_to_transcript_dict[path] for path in sampled_paths}
|
|
|
|
|
def build_path_to_transcript_dict_multi_ling_librispeech_template(root):
    """
    https://arxiv.org/abs/2012.03411

    Read ``transcripts.txt`` inside ``root`` and map audio paths to transcripts.

    Each non-blank line is split on tabs: the first column is the utterance
    id and the second the transcript. The first two underscore-separated
    parts of the id are used as nested folder names below ``root/audio``.
    """
    with open(os.path.join(root, "transcripts.txt"), "r", encoding="utf8") as handle:
        rows = handle.read().split("\n")
    path_to_transcript = dict()
    for row in rows:
        if row.strip() == "":
            continue
        columns = row.split("\t")
        utterance_id = columns[0]
        id_parts = utterance_id.split("_")
        flac_path = f"{root}/audio/{id_parts[0]}/{id_parts[1]}/{utterance_id}.flac"
        path_to_transcript[flac_path] = columns[1]
    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_hui_template(root):
    """
    https://arxiv.org/abs/2106.06309

    Collect transcripts from every sub-directory of ``root``.

    Each sub-directory must contain a ``metadata.csv`` whose non-blank lines
    are pipe-separated: the first field is the audio file stem, the second
    the transcript. Only entries whose ``wavs/<stem>.wav`` file exists in
    that sub-directory end up in the returned dictionary.
    """
    path_to_transcript = dict()
    for entry in os.listdir(root):
        sub_dir = os.path.join(root, entry)
        if not os.path.isdir(sub_dir):
            continue
        with open(os.path.join(sub_dir, "metadata.csv"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            norm_transcript = columns[1]
            wav_path = os.path.join(sub_dir, "wavs", columns[0] + ".wav")
            if os.path.exists(wav_path):
                path_to_transcript[wav_path] = norm_transcript
    return path_to_transcript
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_english(re_cache=False):
    """
    Return the path-to-transcript dictionary of the English MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "english"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = f"{root}/pttd_cache.pt"
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_gigaspeech(re_cache=False):
    """
    Return the path-to-transcript dictionary of the GigaSpeech corpus.

    When the cache file is missing or ``re_cache`` is set, the tab-separated
    file ``transcripts_only_clean_samples.txt`` is parsed (first column is
    the audio path, second column the transcript) and the result is cached
    with torch.save. The return value is loaded from the cache file.
    """
    root = "/mount/resources/speech/corpora/GigaSpeech/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(os.path.join(root, "transcripts_only_clean_samples.txt"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("\t")
            norm_transcript = columns[1]
            path_to_transcript[columns[0]] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_elizabeth(re_cache=False):
    """
    Return the path-to-transcript dictionary of the MAILabs Elizabeth corpus.

    On a cache miss (or when ``re_cache`` is set) every sub-directory of the
    corpus root is scanned: its pipe-separated ``metadata.csv`` supplies the
    audio stem (first field) and the transcript (third field). Entries are
    kept only if the matching file below ``wavs`` exists. The dictionary is
    cached with torch.save and the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/MAILabs_british_single_speaker_elizabeth"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        path_to_transcript = dict()
        for entry in os.listdir(root):
            sub_dir = os.path.join(root, entry)
            if not os.path.isdir(sub_dir):
                continue
            with open(os.path.join(sub_dir, "metadata.csv"), "r", encoding="utf8") as handle:
                rows = handle.read().split("\n")
            for row in rows:
                if row.strip() == "":
                    continue
                columns = row.split("|")
                norm_transcript = columns[2]
                wav_path = os.path.join(sub_dir, "wavs", columns[0] + ".wav")
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_nancy(re_cache=False):
    """
    Return the path-to-transcript dictionary of the NancyKrebs corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``metadata.csv`` is parsed: first field is the audio stem, second field
    the transcript. Only entries whose file exists in the ``wav`` folder are
    kept. The result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/NancyKrebs"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            norm_transcript = columns[1]
            wav_path = os.path.join(root, "wav", columns[0] + ".wav")
            if os.path.exists(wav_path):
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_integration_test(re_cache=True):
    """
    Return a small path-to-transcript dictionary for integration testing.

    Only the first 500 lines of the NancyKrebs ``metadata.csv`` are
    considered. Lines are pipe-separated (audio stem, transcript) and an
    entry is kept only if its file exists in the ``wav`` folder.

    The subset is cached in its own file. It must not share the cache file
    of the full NancyKrebs builder: because ``re_cache`` defaults to True
    here, a shared file would be overwritten with the truncated subset on
    every call and the full builder would then silently return it.
    """
    root = "/mount/resources/speech/corpora/NancyKrebs"
    # dedicated file name so the full-corpus cache "pttd_cache.pt" stays intact
    cache_path = os.path.join(root, "pttd_cache_integration_test.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n")[:500]:
            if line.strip() != "":
                fields = line.split("|")
                norm_transcript = fields[1]
                wav_path = os.path.join(root, "wav", fields[0] + ".wav")
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_CREMA_D(re_cache=False):
    """
    Return the path-to-transcript dictionary of the CREMA_D corpus.

    On a cache miss (or when ``re_cache`` is set) every ``.wav`` file in the
    corpus root is mapped to a fixed sentence, selected by the second
    underscore-separated part of the file name. The result is cached with
    torch.save and the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/CREMA_D/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        # sentence code in the file name -> spoken sentence
        identifier_to_sent = {
            "IEO": "It's eleven o'clock.",
            "TIE": "That is exactly what happened.",
            "IOM": "I'm on my way to the meeting.",
            "IWW": "I wonder what this is about.",
            "TAI": "The airplane is almost full.",
            "MTI": "Maybe tomorrow it will be cold.",
            "IWL": "I would like a new alarm clock.",
            "ITH": "I think, I have a doctor's appointment.",
            "DFA": "Don't forget a jacket.",
            "ITS": "I think, I've seen this before.",
            "TSI": "The surface is slick.",
            "WSI": "We'll stop in a couple of minutes.",
        }
        path_to_transcript = dict()
        for file_name in os.listdir(root):
            if not file_name.endswith(".wav"):
                continue
            sentence_code = file_name.split("_")[1]
            path_to_transcript[root + file_name] = identifier_to_sent[sentence_code]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_EmoV_DB(re_cache=False):
    """
    Return the path-to-transcript dictionary of the EmoV_DB corpus.

    On a cache miss (or when ``re_cache`` is set) ``labels.txt`` is read:
    the first whitespace-separated token of each non-blank line is an
    identifier, the remaining tokens (re-joined with single spaces) form the
    sentence. Every ``.wav`` file in the corpus root is then looked up by
    the four characters at positions -14 to -10 of its file name. The result
    is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/EmoV_DB/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(os.path.join(root, "labels.txt"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        identifier_to_sent = dict()
        for row in rows:
            if row.strip() == "":
                continue
            identifier, *words = row.split()
            identifier_to_sent[identifier] = " ".join(words)
        path_to_transcript = dict()
        for file_name in os.listdir(root):
            if file_name.endswith(".wav"):
                sentence_id = file_name[-14:-10]
                path_to_transcript[root + file_name] = identifier_to_sent[sentence_id]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_ryanspeech(re_cache=False):
    """
    Return the path-to-transcript dictionary of the RyanSpeech corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``metadata.csv`` is parsed: the first field names the file in ``wavs``,
    the third field is used as transcript. The result is cached with
    torch.save and the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/RyanSpeech"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(root + "/metadata.csv", mode="r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript_dict = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            wav_path = f"{root}/wavs/{columns[0]}.wav"
            path_to_transcript_dict[wav_path] = columns[2]
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_RAVDESS(re_cache=False):
    """
    Return the path-to-transcript dictionary of the RAVDESS corpus.

    On a cache miss (or when ``re_cache`` is set) every speaker directory
    below the corpus root is scanned. A file whose fifth dash-separated name
    field is "01" is mapped to "Kids are talking by the door.", every other
    file to "Dogs are sitting by the door.". The result is cached with
    torch.save and the return value is loaded from that cache.

    Non-directory entries of the root are skipped. The cache file itself is
    stored in the root, so without this guard a rebuild would call
    os.listdir on it and fail.
    """
    root = "/mount/resources/speech/corpora/RAVDESS"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        for speaker_dir in os.listdir(root):
            speaker_path = os.path.join(root, speaker_dir)
            if not os.path.isdir(speaker_path):
                # e.g. pttd_cache.pt left behind by an earlier run
                continue
            for audio_file in os.listdir(speaker_path):
                if audio_file.split("-")[4] == "01":
                    transcript = "Kids are talking by the door."
                else:
                    transcript = "Dogs are sitting by the door."
                path_to_transcript_dict[os.path.join(speaker_path, audio_file)] = transcript
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_ESDS(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Emotional Speech Dataset.

    On a cache miss (or when ``re_cache`` is set) only root entries whose
    name starts with "00" and whose integer value is greater than 10 are
    processed. Their ``fixed_unicode.txt`` is normalised (double newlines
    collapsed, a space added after every comma) and each non-blank line is
    split on tabs into file name, text and emotion folder. The result is
    cached with torch.save and the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/Emotional_Speech_Dataset_Singapore"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        path_to_transcript_dict = dict()
        for speaker_dir in os.listdir(root):
            if not speaker_dir.startswith("00"):
                continue
            if int(speaker_dir) <= 10:
                continue
            with open(f"{root}/{speaker_dir}/fixed_unicode.txt", mode="r", encoding="utf8") as handle:
                raw_text = handle.read()
            cleaned_text = raw_text.replace("\n\n", "\n").replace(",", ", ")
            for row in cleaned_text.split("\n"):
                if row.strip() == "":
                    continue
                filename, text, emo_dir = row.split("\t")
                # rebuild the file stem from the speaker id and the utterance number
                filename = speaker_dir + "_" + filename.split("_")[1]
                path_to_transcript_dict[f"{root}/{speaker_dir}/{emo_dir}/{filename}.wav"] = text
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_nvidia_hifitts(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Hi-Fi TTS corpus.

    On a cache miss (or when ``re_cache`` is set) the clean dev, test and
    train manifests with the prefixes 6097, 9017 and 92 are read. Every
    non-blank manifest line is parsed as JSON; its "audio_filepath" is
    joined to the corpus root and mapped to its "text_normalized". The
    result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/hi_fi_tts_v0"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        import json

        manifest_paths = [
            f"{root}/{prefix}_manifest_clean_{split}.json"
            for prefix in ("6097", "9017", "92")
            for split in ("dev", "test", "train")
        ]
        records = list()
        for manifest_path in manifest_paths:
            with open(manifest_path, encoding='utf-8', mode='r') as manifest:
                for row in manifest.read().split("\n"):
                    if row.strip() != "":
                        records.append(json.loads(row))
        path_to_transcript = dict()
        for record in records:
            relative_path = record["audio_filepath"]
            norm_text = record["text_normalized"]
            path_to_transcript[f"{root}/{relative_path}"] = norm_text
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_blizzard_2013(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Blizzard 2013 corpus.

    On a cache miss (or when ``re_cache`` is set) ``prompts.gui`` is split
    into blocks at "||" followed by a newline. In each block the first line
    is the file stem below ``wavn`` and the second line is the raw
    transcript, which is cleaned by a fixed chain of replacements. The
    result is cached with torch.save and the return value is loaded from
    that cache.
    """
    root = "/mount/resources/speech/corpora/Blizzard2013/train/segmented/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "prompts.gui", encoding="utf8") as f:
            transcriptions = f.read()
        blocks = transcriptions.split("||\n")
        for block in blocks:
            trans_lines = block.split("\n")
            # blocks with a blank first line (e.g. the remainder after the last separator) are skipped
            if trans_lines[0].strip() != "":
                # The order of the replacements matters: markup symbols are removed or turned
                # into commas first, then spacing around punctuation is tidied up, and finally
                # trailing spaces and commas are stripped.
                # NOTE(review): replace(" ", " ") is a no-op as written; presumably it was meant
                # to collapse a double space into a single one - confirm against the upstream file.
                transcript = trans_lines[1].replace("@", "").replace("#", ",").replace("|", "").replace(";", ",").replace(
                    ":", ",").replace(" 's", "'s").replace(", ,", ",").replace(" ", " ").replace(" ,", ",").replace(" .",
                                                                                                                   ".").replace(
                    " ?", "?").replace(" !", "!").rstrip(" ,")
                path_to_transcript[root + "wavn/" + trans_lines[0] + ".wav"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_vctk(re_cache=False):
    """
    Return the path-to-transcript dictionary of the VCTK corpus.

    On a cache miss (or when ``re_cache`` is set) every ``.txt`` file in
    every sub-directory of ``txt`` is read in full as a transcript. It is
    mapped to the ``<stem>_mic2.flac`` file in the equally named
    sub-directory of ``wav48_silence_trimmed``, provided that file exists.
    The result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/VCTK"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        txt_suffix = ".txt"
        for transcript_dir in os.listdir("/mount/resources/speech/corpora/VCTK/txt"):
            for transcript_file in os.listdir(f"/mount/resources/speech/corpora/VCTK/txt/{transcript_dir}"):
                if transcript_file.endswith(txt_suffix):
                    with open(f"/mount/resources/speech/corpora/VCTK/txt/{transcript_dir}/" + transcript_file, 'r',
                              encoding='utf8') as tf:
                        transcript = tf.read()
                    # Cut off exactly the ".txt" suffix. str.rstrip(".txt") would strip any
                    # trailing run of the characters ".", "t" and "x" and could eat into the stem.
                    stem = transcript_file[:-len(txt_suffix)]
                    wav_path = f"/mount/resources/speech/corpora/VCTK/wav48_silence_trimmed/{transcript_dir}/" + stem + "_mic2.flac"
                    if os.path.exists(wav_path):
                        path_to_transcript[wav_path] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_libritts_all_clean(re_cache=False):
    """
    Return the path-to-transcript dictionary of the LibriTTS_R corpus.

    On a cache miss (or when ``re_cache`` is set) the tree
    ``root/<speaker>/<chapter>/`` is walked. Every file ending in
    "normalized.txt" is read in full as a transcript and mapped to the
    ``.wav`` file in the same folder whose name is the part of the
    transcript file name before the first dot. The result is cached with
    torch.save and the return value is loaded from that cache.

    Entries that are not directories are skipped at the speaker and chapter
    level. The cache file itself is stored in the root, so without this
    guard a rebuild would call os.listdir on it and fail.
    """
    root = "/mount/resources/speech/corpora/LibriTTS_R/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_train = "/mount/resources/speech/corpora/LibriTTS_R/"
        path_to_transcript = dict()
        for speaker in os.listdir(path_train):
            speaker_path = os.path.join(path_train, speaker)
            if not os.path.isdir(speaker_path):
                # e.g. pttd_cache.pt left behind by an earlier run
                continue
            for chapter in os.listdir(speaker_path):
                chapter_path = os.path.join(speaker_path, chapter)
                if not os.path.isdir(chapter_path):
                    continue
                for file in os.listdir(chapter_path):
                    if file.endswith("normalized.txt"):
                        with open(os.path.join(chapter_path, file), 'r', encoding='utf8') as tf:
                            transcript = tf.read()
                        wav_file = file.split(".")[0] + ".wav"
                        path_to_transcript[os.path.join(chapter_path, wav_file)] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_libritts_other500(re_cache=False):
    """
    Return the path-to-transcript dictionary of LibriTTS train-other-500.

    On a cache miss (or when ``re_cache`` is set) the tree
    ``root/<speaker>/<chapter>/`` is walked. Every file ending in
    "normalized.txt" is read in full as a transcript and mapped to the
    ``.wav`` file in the same folder whose name is the part of the
    transcript file name before the first dot. The result is cached with
    torch.save and the return value is loaded from that cache.

    Entries that are not directories are skipped at the speaker and chapter
    level. The cache file itself is stored in the root, so without this
    guard a rebuild would call os.listdir on it and fail.
    """
    root = "/mount/resources/asr-data/LibriTTS/train-other-500"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_train = "/mount/resources/asr-data/LibriTTS/train-other-500"
        path_to_transcript = dict()
        for speaker in os.listdir(path_train):
            speaker_path = os.path.join(path_train, speaker)
            if not os.path.isdir(speaker_path):
                # e.g. pttd_cache.pt left behind by an earlier run
                continue
            for chapter in os.listdir(speaker_path):
                chapter_path = os.path.join(speaker_path, chapter)
                if not os.path.isdir(chapter_path):
                    continue
                for file in os.listdir(chapter_path):
                    if file.endswith("normalized.txt"):
                        with open(os.path.join(chapter_path, file), 'r', encoding='utf8') as tf:
                            transcript = tf.read()
                        wav_file = file.split(".")[0] + ".wav"
                        path_to_transcript[os.path.join(chapter_path, wav_file)] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_ljspeech(re_cache=False):
    """
    Return the path-to-transcript dictionary of the LJSpeech corpus.

    On a cache miss (or when ``re_cache`` is set) every file in
    ``16kHz/txt`` is read in full as a transcript and mapped to the equally
    named ``.wav`` path in ``16kHz/wav``. The existence of the audio file is
    not checked. The result is cached with torch.save and the return value
    is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/LJSpeech/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        txt_suffix = ".txt"
        for transcript_file in os.listdir("/mount/resources/speech/corpora/LJSpeech/16kHz/txt"):
            with open("/mount/resources/speech/corpora/LJSpeech/16kHz/txt/" + transcript_file, 'r', encoding='utf8') as tf:
                transcript = tf.read()
            # Remove exactly one ".txt" suffix. str.rstrip(".txt") would strip any trailing
            # run of the characters ".", "t" and "x" and could eat into the stem.
            if transcript_file.endswith(txt_suffix):
                stem = transcript_file[:-len(txt_suffix)]
            else:
                stem = transcript_file
            wav_path = "/mount/resources/speech/corpora/LJSpeech/16kHz/wav/" + stem + ".wav"
            path_to_transcript[wav_path] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_jenny(re_cache=False):
    """
    https://www.kaggle.com/datasets/noml4u/jenny-tts-dataset
    https://github.com/dioco-group/jenny-tts-dataset

    Dataset of Speaker Jenny (Dioco) with an Irish accent

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``metadata.csv`` is parsed: the first field plus "_silence.flac" gives
    the audio path relative to the corpus root, the second field is the
    transcript. The result is cached with torch.save and the return value
    is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/Jenny/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open("/mount/resources/speech/corpora/Jenny/metadata.csv", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[1]
            audio_path = "/mount/resources/speech/corpora/Jenny/" + columns[0] + "_silence.flac"
            path_to_transcript[audio_path] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_german(re_cache=False):
    """
    Return the path-to-transcript dictionary of the German MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "german"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_karlsson(re_cache=False):
    """
    Return the path-to-transcript dictionary of the HUI speaker Karlsson.

    The dictionary is rebuilt with the HUI template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    root = "/mount/resources/speech/corpora/HUI_German/Karlsson"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_eva(re_cache=False):
    """
    Return the path-to-transcript dictionary of the HUI speaker Eva.

    The dictionary is rebuilt with the HUI template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    root = "/mount/resources/speech/corpora/HUI_German/Eva"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_bernd(re_cache=False):
    """
    Return the path-to-transcript dictionary of the HUI speaker Bernd.

    The dictionary is rebuilt with the HUI template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    root = "/mount/resources/speech/corpora/HUI_German/Bernd"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_friedrich(re_cache=False):
    """
    Return the path-to-transcript dictionary of the HUI speaker Friedrich.

    The dictionary is rebuilt with the HUI template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    root = "/mount/resources/speech/corpora/HUI_German/Friedrich"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_hokus(re_cache=False):
    """
    Return the path-to-transcript dictionary of the HUI speaker Hokus.

    The dictionary is rebuilt with the HUI template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    root = "/mount/resources/speech/corpora/HUI_German/Hokus"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_hui_others(re_cache=False):
    """
    Return the merged path-to-transcript dictionary of the HUI "others" set.

    On a cache miss (or when ``re_cache`` is set) the HUI template is run on
    every speaker directory below the root and the results are merged into
    one dictionary, which is cached with torch.save. The return value is
    loaded from that cache.

    Non-directory entries of the root are skipped. The cache file itself is
    stored in the root, so without this guard a rebuild would hand it to the
    template as if it were a speaker directory and fail.
    """
    root = "/mount/resources/speech/corpora/HUI_German/others"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        pttd = dict()
        for speaker in os.listdir(root):
            speaker_root = f"{root}/{speaker}"
            if not os.path.isdir(speaker_root):
                # e.g. pttd_cache.pt left behind by an earlier run
                continue
            pttd.update(build_path_to_transcript_dict_hui_template(root=speaker_root))
        torch.save(pttd, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_thorsten_neutral(re_cache=False):
    """
    Return the path-to-transcript dictionary of the thorsten-de_v03 corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated files
    ``metadata_train.csv`` and ``metadata_val.csv`` are read one after the
    other: the first field names the file in ``wavs``, the second field is
    the transcript. The result is cached with torch.save and the return
    value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-de_v03"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        metadata_texts = []
        for metadata_name in ("/metadata_train.csv", "/metadata_val.csv"):
            with open(root + metadata_name, encoding="utf8") as handle:
                metadata_texts.append(handle.read())
        path_to_transcript = dict()
        for row in "\n".join(metadata_texts).split("\n"):
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[1]
            path_to_transcript[root + "/wavs/" + columns[0] + ".wav"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_thorsten_2022_10(re_cache=False):
    """
    Return the path-to-transcript dictionary of ThorstenVoice-Dataset_2022.10.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated files
    ``metadata_train.csv``, ``metadata_dev.csv`` and ``metadata_test.csv``
    are read one after the other: the first field names the file in
    ``wavs``, the second field is the transcript. The result is cached with
    torch.save and the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/ThorstenDatasets/ThorstenVoice-Dataset_2022.10"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        metadata_texts = []
        for metadata_name in ("/metadata_train.csv", "/metadata_dev.csv", "/metadata_test.csv"):
            with open(root + metadata_name, encoding="utf8") as handle:
                metadata_texts.append(handle.read())
        path_to_transcript = dict()
        for row in "\n".join(metadata_texts).split("\n"):
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[1]
            path_to_transcript[root + "/wavs/" + columns[0] + ".wav"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_thorsten_emotional(re_cache=False):
    """
    Return the path-to-transcript dictionary of thorsten-emotional_v02.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``thorsten-emotional-metadata.csv`` is parsed. Every non-blank line
    yields one entry per emotion folder (amused, angry, disgusted, neutral,
    sleepy, surprised), all sharing the same transcript. The existence of
    the audio files is not checked. The result is cached with torch.save and
    the return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-emotional_v02"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(root + "/thorsten-emotional-metadata.csv", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        emotion_folders = ("/amused/", "/angry/", "/disgusted/", "/neutral/", "/sleepy/", "/surprised/")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[1]
            for emotion_folder in emotion_folders:
                path_to_transcript[root + emotion_folder + columns[0] + ".wav"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_french(re_cache=False):
    """
    Return the path-to-transcript dictionary of the French MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "french"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_blizzard2023_ad_silence_removed(re_cache=False):
    """
    Return the path-to-transcript dictionary of Blizzard2023 AD_silence_removed.

    On a cache miss (or when ``re_cache`` is set) the tab-separated
    ``transcript.tsv`` is parsed: the last path component of the first
    column is looked up directly in the corpus root, the second column is
    the transcript. For existing audio files the symbols "§", "#" and "~"
    are removed from the transcript and guillemets become double quotes.
    The result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/Blizzard2023/AD_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        # applied in this order; the spaced guillemet variants must come first
        symbol_replacements = (("§", ""), ("#", ""), ("~", ""), (" »", '"'), ("« ", '"'), ("»", '"'), ("«", '"'))
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("\t")
            norm_transcript = columns[1]
            wav_path = os.path.join(root, columns[0].split("/")[-1])
            if os.path.exists(wav_path):
                for old, new in symbol_replacements:
                    norm_transcript = norm_transcript.replace(old, new)
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_blizzard2023_neb_silence_removed(re_cache=False):
    """
    Return the path-to-transcript dictionary of Blizzard2023 NEB_silence_removed.

    On a cache miss (or when ``re_cache`` is set) the tab-separated
    ``transcript.tsv`` is parsed: the last path component of the first
    column is looked up directly in the corpus root, the second column is
    the transcript. For existing audio files the symbols "§", "#" and "~"
    are removed from the transcript and guillemets become double quotes.
    The result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/Blizzard2023/NEB_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        # applied in this order; the spaced guillemet variants must come first
        symbol_replacements = (("§", ""), ("#", ""), ("~", ""), (" »", '"'), ("« ", '"'), ("»", '"'), ("«", '"'))
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("\t")
            norm_transcript = columns[1]
            wav_path = os.path.join(root, columns[0].split("/")[-1])
            if os.path.exists(wav_path):
                for old, new in symbol_replacements:
                    norm_transcript = norm_transcript.replace(old, new)
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_blizzard2023_neb_e_silence_removed(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Blizzard2023
    enhanced_NEB_subset_silence_removed data.

    On a cache miss (or when ``re_cache`` is set) the tab-separated
    ``transcript.tsv`` is parsed: the last path component of the first
    column is looked up directly in the corpus root, the second column is
    the transcript. For existing audio files the symbols "§", "#" and "~"
    are removed from the transcript and guillemets become double quotes.
    The result is cached with torch.save and read back with torch.load.
    """
    root = "/mount/resources/speech/corpora/Blizzard2023/enhanced_NEB_subset_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        # applied in this order; the spaced guillemet variants must come first
        symbol_replacements = (("§", ""), ("#", ""), ("~", ""), (" »", '"'), ("« ", '"'), ("»", '"'), ("«", '"'))
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("\t")
            norm_transcript = columns[1]
            wav_path = os.path.join(root, columns[0].split("/")[-1])
            if os.path.exists(wav_path):
                for old, new in symbol_replacements:
                    norm_transcript = norm_transcript.replace(old, new)
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_synpaflex_norm_subset(re_cache=False):
    """
    Contributed by https://github.com/tomschelsen

    Return the path-to-transcript dictionary of the synpaflex corpus.

    On a cache miss (or when ``re_cache`` is set) every ``*_norm.txt`` file
    found recursively below the root is read in full as a transcript. It is
    mapped to the ``.wav`` file that carries the same name without the
    "_norm.txt" ending and sits one directory above the text file's folder,
    provided that file exists. The result is cached with torch.save and the
    return value is loaded from that cache.
    """
    # imported locally because the module-level imports do not provide Path
    from pathlib import Path

    root = "/mount/resources/speech/corpora/synpaflex-corpus/5/v0.1/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        norm_suffix = "_norm.txt"
        for text_path in glob.iglob(os.path.join(root, "**/*_norm.txt"), recursive=True):
            with open(text_path, "r", encoding="utf8") as file:
                norm_transcript = file.read()
            path_obj = Path(text_path)
            # Append ".wav" explicitly: with_suffix() would replace everything after the
            # last dot of a stem that itself contains a dot.
            wav_name = path_obj.name[:-len(norm_suffix)] + ".wav"
            wav_path = str(path_obj.parent.parent / wav_name)
            if Path(wav_path).exists():
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_siwis_subset(re_cache=False):
    """
    Contributed by https://github.com/tomschelsen

    Return the path-to-transcript dictionary of a subset of the SIWIS corpus.

    On a cache miss (or when ``re_cache`` is set) the ``.txt`` files in
    ``text/part1``, ``text/part2`` and ``text/part3`` are read in full as
    transcripts. Each one is mapped to the equally named ``.wav`` file in
    the matching ``wavs/<part>`` folder, provided that file exists. The
    result is cached with torch.save and the return value is loaded from
    that cache.
    """
    # imported locally because the module-level imports do not provide Path
    from pathlib import Path

    root = "/mount/resources/speech/corpora/SiwisFrenchSpeechSynthesisDatabase/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        sub_dirs = ["part1", "part2", "part3"]
        path_to_transcript = dict()
        for sd in sub_dirs:
            for text_path in glob.iglob(os.path.join(root, "text", sd, "*.txt")):
                with open(text_path, "r", encoding="utf8") as file:
                    norm_transcript = file.read()
                path_obj = Path(text_path)
                # Append ".wav" explicitly: with_suffix() would replace everything after the
                # last dot of a stem that itself contains a dot.
                wav_path = str(path_obj.parent.parent.parent / "wavs" / sd / (path_obj.stem + ".wav"))
                if Path(wav_path).exists():
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_css10fr(re_cache=False):
    """
    Return the path-to-transcript dictionary of the French CSS10 corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``transcript.txt`` is parsed: the first field is the audio path relative
    to the corpus root, the third field is used as transcript. The result is
    cached with torch.save and the return value is loaded from that cache.
    """
    language = "french"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"{root}/transcript.txt", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[2]
            path_to_transcript[f"{root}/{columns[0]}"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_spanish(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Spanish MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "spanish"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_css10es(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Spanish CSS10 corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``transcript.txt`` is parsed: the first field is the audio path relative
    to the corpus root, the third field is used as transcript. The result is
    cached with torch.save and the return value is loaded from that cache.
    """
    language = "spanish"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"{root}/transcript.txt", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[2]
            path_to_transcript[f"{root}/{columns[0]}"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_spanish_blizzard_train(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Spanish Blizzard 2021
    training data.

    On a cache miss (or when ``re_cache`` is set) the tab-separated
    ``train_text.txt`` is parsed: the first column names the file in
    ``train_wav``, the second column is the transcript. Only entries whose
    audio file exists are kept. The result is cached with torch.save and the
    return value is loaded from that cache.
    """
    root = "/mount/resources/speech/corpora/Blizzard2021/spanish_blizzard_release_2021_v2/hub"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(os.path.join(root, "train_text.txt"), "r", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("\t")
            norm_transcript = columns[1]
            wav_path = os.path.join(root, "train_wav", columns[0] + ".wav")
            if os.path.exists(wav_path):
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_portuguese(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Portuguese MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "portuguese"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_polish(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Polish MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "polish"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_italian(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Italian MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "italian"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_mls_dutch(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Dutch MLS train split.

    The dictionary is rebuilt with the MLS template and written with
    torch.save when the cache file is missing or ``re_cache`` is set. The
    returned value is always what torch.load reads from the cache file.
    """
    lang = "dutch"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        freshly_built = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(freshly_built, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_css10nl(re_cache=False):
    """
    Return the path-to-transcript dictionary of the Dutch CSS10 corpus.

    On a cache miss (or when ``re_cache`` is set) the pipe-separated
    ``transcript.txt`` is parsed: the first field is the audio path relative
    to the corpus root, the third field is used as transcript. The result is
    cached with torch.save and the return value is loaded from that cache.
    """
    language = "dutch"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"{root}/transcript.txt", encoding="utf8") as handle:
            rows = handle.read().split("\n")
        path_to_transcript = dict()
        for row in rows:
            if row.strip() == "":
                continue
            columns = row.split("|")
            transcript = columns[2]
            path_to_transcript[f"{root}/{columns[0]}"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_css10el(re_cache=False):
    """
    Return the path-to-transcript dict for the CSS10 ``greek`` directory.

    Each non-blank row of ``transcript.txt`` is split on ``|``; field 0 is
    appended to the corpus directory to form the key and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    language = "greek"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt", encoding="utf8") as f:
            rows = f.read().split("\n")
        split_rows = [row.split("|") for row in rows if row.strip() != ""]
        path_to_transcript = {
            f"/mount/resources/speech/corpora/CSS10/{language}/{fields[0]}": fields[2]
            for fields in split_rows
        }
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_css10fi(re_cache=False):
    """
    Return the path-to-transcript dict for the CSS10 ``finnish`` directory.

    Each non-blank row of ``transcript.txt`` is split on ``|``; field 0 is
    appended to the corpus directory to form the key and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    language = "finnish"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt", encoding="utf8") as f:
            rows = f.read().split("\n")
        split_rows = [row.split("|") for row in rows if row.strip() != ""]
        path_to_transcript = {
            f"/mount/resources/speech/corpora/CSS10/{language}/{fields[0]}": fields[2]
            for fields in split_rows
        }
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_VIVOS_viet(re_cache=False):
    """
    Return the path-to-transcript dict for the VIVOS train directory.

    Each non-blank row of ``prompts.txt`` is split on single spaces: the first
    token names the wav file (its first 10 characters name the sub-directory
    under ``waves``) and the remaining tokens, lower-cased, form the transcript.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    root = "/mount/resources/speech/corpora/VIVOS_vietnamese/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        mapping = {}
        with open(root + "/prompts.txt", mode="r", encoding="utf8") as f:
            rows = f.read().split("\n")
        for row in rows:
            if row.strip() == "":
                continue
            utt_id, *words = row.split(" ")
            mapping[f"{root}/waves/{utt_id[:10]}/{utt_id}.wav"] = " ".join(words).lower()
        torch.save(mapping, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_vietTTS(re_cache=False):
    """
    Return the path-to-transcript dict for the VietTTS directory.

    Each non-blank row of ``meta_data.tsv`` is split on the literal ``.wav``:
    the part before it (with ``.wav`` re-appended) is joined onto the corpus
    root as the key, and the part after it, stripped, is the transcript.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    root = "/mount/resources/speech/corpora/VietTTS"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        mapping = {}
        with open(root + "/meta_data.tsv", encoding="utf8") as f:
            rows = f.read().split("\n")
        for row in rows:
            if row.strip() == "":
                continue
            pieces = row.split(".wav")
            audio_stem, raw_text = pieces[0], pieces[1]
            mapping[os.path.join(root, audio_stem + ".wav")] = raw_text.strip()
        torch.save(mapping, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_aishell3(re_cache=False):
    """
    Return the path-to-transcript dict for the aishell3 train directory.

    ``label_train-set.txt`` is read with every ``$`` removed and every ``%``
    replaced by a space. Rows that are blank or start with ``#`` are ignored;
    the rest are split on ``|``. Field 0 names the wav file (its first 7
    characters name the sub-directory under ``wav``) and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    root = "/mount/resources/speech/corpora/aishell3/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        mapping = {}
        with open(root + "/label_train-set.txt", mode="r", encoding="utf8") as f:
            cleaned = f.read().replace("$", "").replace("%", " ")
        for row in cleaned.split("\n"):
            if row.strip() == "" or row.startswith("#"):
                continue
            fields = row.split("|")
            utt_id = fields[0]
            mapping[f"{root}/wav/{utt_id[:7]}/{utt_id}.wav"] = fields[2]
        torch.save(mapping, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
def build_path_to_transcript_dict_css10cmn(re_cache=False):
    """
    Return the path-to-transcript dict for the CSS10 ``chinese`` directory.

    Each non-blank row of ``transcript.txt`` is split on ``|``; field 0 is
    appended to the corpus directory to form the key and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    language = "chinese"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open("/mount/resources/speech/corpora/CSS10/chinese/transcript.txt", encoding="utf8") as f:
            rows = f.read().split("\n")
        split_rows = [row.split("|") for row in rows if row.strip() != ""]
        path_to_transcript = {
            "/mount/resources/speech/corpora/CSS10/chinese/" + fields[0]: fields[2]
            for fields in split_rows
        }
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_css10ru(re_cache=False):
    """
    Return the path-to-transcript dict for the CSS10 ``russian`` directory.

    Each non-blank row of ``transcript.txt`` is split on ``|``; field 0 is
    appended to the corpus directory to form the key and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    language = "russian"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt", encoding="utf8") as f:
            rows = f.read().split("\n")
        split_rows = [row.split("|") for row in rows if row.strip() != ""]
        path_to_transcript = {
            f"/mount/resources/speech/corpora/CSS10/{language}/{fields[0]}": fields[2]
            for fields in split_rows
        }
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_path_to_transcript_dict_css10hu(re_cache=False):
    """
    Return the path-to-transcript dict for the CSS10 ``hungarian`` directory.

    Each non-blank row of ``transcript.txt`` is split on ``|``; field 0 is
    appended to the corpus directory to form the key and field 2 is the value.
    The dict is written to ``pttd_cache.pt`` when that file is missing or
    ``re_cache`` is true, and the result is always loaded from the cache.
    """
    language = "hungarian"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if re_cache or not os.path.exists(cache_path):
        with open(f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt", encoding="utf8") as f:
            rows = f.read().split("\n")
        split_rows = [row.split("|") for row in rows if row.strip() != ""]
        path_to_transcript = {
            f"/mount/resources/speech/corpora/CSS10/{language}/{fields[0]}": fields[2]
            for fields in split_rows
        }
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
|
|
|
|
def build_file_list_singing_voice_audio_database(re_cache=False):
    """
    Return the list of file paths found two directory levels below the corpus root.

    The root is walked as ``root/<first level>/<second level>/<file>`` and every
    entry of each second-level directory is collected. The list is written to
    ``pttd_cache.pt`` inside the root when that file is missing or ``re_cache``
    is true, and the result is always loaded from the cache.

    :param re_cache: rebuild the list even if a cache file already exists
    :return: list of path strings
    """
    root = "/mount/resources/speech/corpora/singing_voice_audio_dataset/monophonic"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        file_list = list()
        for corw in os.listdir(root):
            corw_dir = os.path.join(root, corw)
            # The cache file is stored directly in root, so on a forced re-cache it
            # shows up in this listing; only directories may be descended into.
            if not os.path.isdir(corw_dir):
                continue
            for singer in os.listdir(corw_dir):
                singer_dir = os.path.join(corw_dir, singer)
                if not os.path.isdir(singer_dir):
                    continue
                for audio in os.listdir(singer_dir):
                    file_list.append(os.path.join(singer_dir, audio))
        torch.save(file_list, cache_path)
    return torch.load(cache_path)
|
|
|
|
|
from pathlib import Path |
|
import xml.etree.ElementTree as ET |
|
from csv import DictReader |
|
import json |
|
|
|
|
|
def build_path_to_transcript_dict_nst_norwegian():
    """
    Pair the sorted ``*.pcm`` files of the NST Norwegian directory with script lines.

    The n-th line of ``SCRIPTS/CTTS_core`` (read as latin-1, stripped, with
    ``\\xad`` characters removed) becomes the transcript of the n-th audio file
    in sorted order. Keys are absolute path strings.
    """
    root = '/resources/speech/corpora/NST_norwegian/pcm/cs'
    audio_paths = sorted(Path(root).glob('*.pcm'))
    path_to_transcript = {}
    with open(Path(root, 'SCRIPTS/CTTS_core'), encoding='latin-1') as f:
        for index, line in enumerate(f):
            transcript = line.strip().replace('\xad', '')
            path_to_transcript[str(audio_paths[index].absolute())] = transcript
    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_nst_swedish():
    """
    Pair the sorted ``mf/*.pcm`` files of the NST Swedish directory with script lines.

    Placeholders are inserted into the sorted audio list at positions 4154 and
    5144, and the script lines at those two positions are skipped, so the
    remaining lines of ``scripts/mf/sw_all`` (read as latin-1, stripped, with
    ``\\xad`` removed) line up with the audio files. Keys are absolute path
    strings.
    """
    root = '/resources/speech/corpora/NST_swedish/sw_pcms'
    skipped_positions = (4154, 5144)
    audio_paths = sorted(Path(root, 'mf').glob('*.pcm'))
    for position in skipped_positions:
        audio_paths.insert(position, None)
    path_to_transcript = {}
    with open(Path(root, 'scripts/mf/sw_all'), encoding='latin-1') as f:
        for index, line in enumerate(f):
            if index in skipped_positions:
                continue
            transcript = line.strip().replace('\xad', '')
            path_to_transcript[str(audio_paths[index].absolute())] = transcript
    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_afr():
    """Build the path-to-transcript dict for the NCHLT ``afr`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_afr',
        lang_code='afr',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_nbl():
    """Build the path-to-transcript dict for the NCHLT ``nbl`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_nbl',
        lang_code='nbl',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_nso():
    """Build the path-to-transcript dict for the NCHLT ``nso`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_nso',
        lang_code='nso',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_sot():
    """Build the path-to-transcript dict for the NCHLT ``sot`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_sot',
        lang_code='sot',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_ssw():
    """Build the path-to-transcript dict for the NCHLT ``ssw`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_ssw',
        lang_code='ssw',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_tsn():
    """Build the path-to-transcript dict for the NCHLT ``tsn`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_tsn',
        lang_code='tsn',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_tso():
    """Build the path-to-transcript dict for the NCHLT ``tso`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_tso',
        lang_code='tso',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_ven():
    """Build the path-to-transcript dict for the NCHLT ``ven`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_ven',
        lang_code='ven',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_xho():
    """Build the path-to-transcript dict for the NCHLT ``xho`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_xho',
        lang_code='xho',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_zul():
    """Build the path-to-transcript dict for the NCHLT ``zul`` corpus directory."""
    return build_path_to_transcript_dict_nchlt_template(
        '/resources/speech/corpora/nchlt_zul',
        lang_code='zul',
    )
|
|
|
|
|
def build_path_to_transcript_dict_nchlt_template(root, lang_code):
    """
    Build a path-to-transcript dict from the NCHLT transcription XML files.

    For both the ``trn`` and ``tst`` splits, the file
    ``{root}/transcriptions/nchlt_{lang_code}.{split}.xml`` is parsed and every
    ``recording`` element is visited. The text of its ``orth`` child is the
    transcript and its ``audio`` attribute, resolved against the parent
    directory of ``root``, is the key.

    Recordings are skipped when the transcript contains ``[s]`` or when the
    ``orth`` element is missing or has no text.

    :param root: corpus directory containing the ``transcriptions`` folder
    :param lang_code: language code used in the XML file names
    :return: dict mapping audio path strings to transcripts
    """
    path_to_transcript = dict()
    base_dir = Path(root).parent

    for split in ['trn', 'tst']:
        tree = ET.parse(f'{root}/transcriptions/nchlt_{lang_code}.{split}.xml')
        for rec in tree.getroot().iter('recording'):
            orth = rec.find('orth')
            # find() yields None for a missing child and .text is None for an
            # empty element; neither can be searched for '[s]'.
            transcript = orth.text if orth is not None else None
            if not transcript or '[s]' in transcript:
                continue
            path_to_transcript[str(base_dir / rec.get('audio'))] = transcript

    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_bibletts_akuapem_twi():
    """
    Build the path-to-transcript dict for the BibleTTS ``akuapem-twi`` directory.

    For each of the ``train``, ``dev`` and ``test`` splits, every ``*.txt`` file
    inside each entry of the split folder is read. Its stripped lines, joined by
    single spaces, become the transcript of the same-named ``.flac`` file.
    """
    root = '/resources/speech/corpora/BibleTTS/akuapem-twi'
    path_to_transcript = {}
    for split in ('train', 'dev', 'test'):
        for book in Path(root, split).glob('*'):
            for textfile in book.glob('*.txt'):
                with open(textfile, 'r', encoding='utf-8') as f:
                    stripped_lines = [line.strip() for line in f]
                # NOTE(review): the key is a Path object here, while the sibling
                # builders in this file use str keys -- confirm which one callers expect.
                path_to_transcript[textfile.with_suffix('.flac')] = ' '.join(stripped_lines)

    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_bembaspeech():
    """
    Build the path-to-transcript dict for the BembaSpeech ``bem`` directory.

    The tab-separated files ``train.tsv``, ``dev.tsv`` and ``test.tsv`` are read
    with their header row; the ``audio`` column, placed under ``audio/``, is the
    key and the ``sentence`` column is the transcript.
    """
    root = '/resources/speech/corpora/BembaSpeech/bem'
    mapping = {}

    for split in ('train', 'dev', 'test'):
        with open(Path(root, f'{split}.tsv'), 'r', encoding='utf-8') as f:
            for row in DictReader(f, delimiter='\t'):
                sentence = row['sentence']
                mapping[str(Path(root, 'audio', row['audio']))] = sentence

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_alffa_sw():
    """
    Build the path-to-transcript dict for the ALFFA ``data_broadcastnews_sw`` data.

    The ``train`` and ``test`` splits are read with the Kaldi template, each
    with its own substring replacement applied to the paths in ``wav.scp``,
    and the test entries are merged into the train entries.
    """
    root = '/resources/speech/corpora/ALFFA/data_broadcastnews_sw/data'

    merged = build_path_to_transcript_dict_kaldi_template(
        root=root, split='train', replace_in_path=('asr_swahili/data/', ''))
    test_part = build_path_to_transcript_dict_kaldi_template(
        root=root, split='test', replace_in_path=('/my_dir/wav', 'test/wav5'))
    merged.update(test_part)
    return merged
|
|
|
|
|
def build_path_to_transcript_dict_alffa_am():
    """
    Build the path-to-transcript dict for the ALFFA ``data_readspeech_am`` data.

    The ``train`` and ``test`` splits are read with the Kaldi template, removing
    the ``/home/melese/kaldi/data/`` substring from the paths in ``wav.scp``,
    and merged in that order.
    """
    root = '/resources/speech/corpora/ALFFA/data_readspeech_am/data'
    path_fix = ('/home/melese/kaldi/data/', '')

    merged = {}
    for split in ('train', 'test'):
        merged.update(build_path_to_transcript_dict_kaldi_template(root=root, split=split, replace_in_path=path_fix))

    return merged
|
|
|
|
|
def build_path_to_transcript_dict_alffa_wo():
    """
    Build the path-to-transcript dict for the ALFFA ``data_readspeech_wo`` data.

    For the ``train``, ``dev`` and ``test`` splits, each line of the ``text``
    file is split on whitespace: the first token is the file id and the rest is
    the transcript. The second ``_``-separated part of the id names the
    sub-directory of the split that holds ``<id>.wav``.
    """
    root = '/resources/speech/corpora/ALFFA/data_readspeech_wo/data'
    mapping = {}

    for split in ('train', 'dev', 'test'):
        with open(Path(root, split, 'text'), 'r', encoding='utf-8') as f:
            for raw in f:
                tokens = raw.strip().split()
                file_id = tokens[0]
                transcript = ' '.join(tokens[1:])
                folder = file_id.split('_')[1]
                mapping[str(Path(root, split, folder, f'{file_id}.wav'))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_malayalam():
    """
    Build the path-to-transcript dict for the ``malayalam`` corpus directory.

    ``line_index_female.tsv`` and ``line_index_male.tsv`` are read line by line;
    each stripped line must split on tabs into exactly a file id and a
    transcript. The audio is expected at ``<gender>/<id>.wav`` under the root.
    """
    root = '/resources/speech/corpora/malayalam'
    mapping = {}

    for gender in ('female', 'male'):
        index_file = Path(root, f'line_index_{gender}.tsv')
        with open(index_file, 'r', encoding='utf-8') as f:
            for raw in f:
                file_id, transcript = raw.strip().split('\t')
                mapping[str(Path(root, gender, f'{file_id}.wav'))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_msc():
    """
    Build the path-to-transcript dict for the ``msc_reviewed_speech`` directory.

    ``metadata.tsv`` is read as a tab-separated table with a header row; the
    ``speechpath`` column, resolved against the root, is the key and the
    ``transcript`` column is the value.
    """
    root = '/resources/speech/corpora/msc_reviewed_speech'
    mapping = {}

    with open(Path(root, 'metadata.tsv'), 'r', encoding='utf-8') as f:
        for row in DictReader(f, delimiter='\t'):
            transcript = row['transcript']
            mapping[str(Path(root, row['speechpath']))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_chuvash():
    """
    Build the path-to-transcript dict for the ``chuvash`` corpus directory.

    Every ``*.txt`` file under ``transcripts/txt`` is read line by line. The
    first whitespace-separated token of a line identifies the segment and the
    rest, with ``«`` and ``»`` removed, is the transcript. An entry is kept only
    if ``audio/split/trim_clean_<file stem>.<segment>.flac`` exists.
    """
    root = '/resources/speech/corpora/chuvash'
    mapping = {}

    for textfile in Path(root, 'transcripts', 'txt').glob('*.txt'):
        with open(textfile, 'r', encoding='utf-8') as f:
            for raw in f:
                tokens = raw.strip().split()
                transcript = ' '.join(tokens[1:]).replace('«', '').replace('»', '')
                audio = Path(root, 'audio', 'split', f'trim_clean_{textfile.stem}.{tokens[0]}.flac')
                if not audio.exists():
                    continue
                mapping[str(audio)] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_iban():
    """
    Build the path-to-transcript dict for the ``iban`` data directory.

    The ``train`` and ``dev`` splits are read with the Kaldi template, removing
    the ``asr_iban/data/`` substring from the paths in ``wav.scp``, and merged
    in that order.
    """
    root = '/resources/speech/corpora/iban/data'
    path_fix = ('asr_iban/data/', '')
    merged = {}
    for split in ('train', 'dev'):
        merged.update(build_path_to_transcript_dict_kaldi_template(root, split, replace_in_path=path_fix))
    return merged
|
|
|
|
|
def build_path_to_transcript_dict_kaldi_template(root, split, replace_in_path=None):
    """
    Build a path-to-transcript dict from a Kaldi-style ``wav.scp`` / ``text`` pair.

    ``{root}/{split}/wav.scp`` supplies ``<id> <path>`` pairs and
    ``{root}/{split}/text`` supplies ``<id> <transcript words...>`` lines.
    Audio paths are resolved against ``root``. Transcripts containing ``<``
    are dropped. Blank lines in either file are ignored.

    :param root: directory that contains the split folders and against which
                 the paths from ``wav.scp`` are resolved
    :param split: name of the split folder holding ``wav.scp`` and ``text``
    :param replace_in_path: optional ``(old, new)`` pair; every occurrence of
                            ``old`` in a wav path is replaced by ``new``
    :return: dict mapping audio path strings to transcripts
    :raises ValueError: if a non-blank ``wav.scp`` line does not consist of
                        exactly two whitespace-separated fields
    :raises KeyError: if ``text`` references an id that is absent from ``wav.scp``
    """
    path_to_transcript = dict()

    wav_scp = {}
    # Read with the same encoding as the text file so the ids match reliably.
    with open(Path(root, split, 'wav.scp'), 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # e.g. trailing empty line
            wav_id, wav_path = line.split()
            if replace_in_path:
                wav_path = wav_path.replace(replace_in_path[0], replace_in_path[1])
            wav_scp[wav_id] = str(Path(root, wav_path))

    with open(Path(root, split, 'text'), 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line carries neither id nor transcript
            wav_id = fields[0]
            text = ' '.join(fields[1:])
            if '<' in text:
                continue
            path_to_transcript[wav_scp[wav_id]] = text

    return path_to_transcript
|
|
|
|
|
def build_path_to_transcript_dict_sundanese_speech():
    """Build the path-to-transcript dict for the ``sundanese_speech/asr_sundanese`` directory."""
    return build_path_to_transcript_dict_south_asian_languages_template(
        '/resources/speech/corpora/sundanese_speech/asr_sundanese')
|
|
|
|
|
def build_path_to_transcript_dict_sinhala_speech():
    """Build the path-to-transcript dict for the ``sinhala_speech/asr_sinhala`` directory."""
    return build_path_to_transcript_dict_south_asian_languages_template(
        '/resources/speech/corpora/sinhala_speech/asr_sinhala')
|
|
|
|
|
def build_path_to_transcript_dict_bengali_speech():
    """Build the path-to-transcript dict for the ``bengali_speech/asr_bengali`` directory."""
    return build_path_to_transcript_dict_south_asian_languages_template(
        '/resources/speech/corpora/bengali_speech/asr_bengali')
|
|
|
|
|
def build_path_to_transcript_dict_nepali_speech():
    """Build the path-to-transcript dict for the ``nepali_speech/asr_nepali`` directory."""
    return build_path_to_transcript_dict_south_asian_languages_template(
        '/resources/speech/corpora/nepali_speech/asr_nepali')
|
|
|
|
|
def build_path_to_transcript_dict_javanese_speech():
    """Build the path-to-transcript dict for the ``javanese_speech/asr_javanese`` directory."""
    return build_path_to_transcript_dict_south_asian_languages_template(
        '/resources/speech/corpora/javanese_speech/asr_javanese')
|
|
|
|
|
def build_path_to_transcript_dict_south_asian_languages_template(root):
    """
    Build a path-to-transcript dict from ``utt_spk_text.tsv`` under ``root``.

    Each stripped line must split on tabs into exactly three fields: utterance
    id, speaker and text. The audio is expected at
    ``data/<first two characters of the id>/<id>.flac`` under ``root``; the
    speaker field is not used.

    :param root: corpus directory containing ``utt_spk_text.tsv``
    :return: dict mapping audio path strings to transcripts
    """
    mapping = {}

    with open(Path(root, 'utt_spk_text.tsv'), 'r', encoding='utf-8') as f:
        for raw in f:
            utt, _speaker, text = raw.strip().split('\t')
            audio = Path(root, 'data', utt[:2], f'{utt}.flac')
            mapping[str(audio)] = text

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_kenyan_afv():
    """Build the path-to-transcript dict for the AfricanVoices ``afv_enke`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/afv_enke')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_fon_alf():
    """Build the path-to-transcript dict for the AfricanVoices ``fon_alf`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/fon_alf')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_hausa_cmv():
    """
    Build the path-to-transcript dict for the AfricanVoices Hausa directories.

    The entries of ``hau_cmv_f`` are collected first and the entries of
    ``hau_cmv_m`` are merged into them.
    """
    main_root = '/resources/speech/corpora/AfricanVoices'
    merged = build_path_to_transcript_dict_african_voices_template(f'{main_root}/hau_cmv_f')
    second_part = build_path_to_transcript_dict_african_voices_template(f'{main_root}/hau_cmv_m')
    merged.update(second_part)
    return merged
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_ibibio_lst():
    """Build the path-to-transcript dict for the AfricanVoices ``ibb_lst`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/ibb_lst')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_kikuyu_opb():
    """Build the path-to-transcript dict for the AfricanVoices ``kik_opb`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/kik_opb')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_lingala_opb():
    """Build the path-to-transcript dict for the AfricanVoices ``lin_opb`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/lin_opb')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_ganda_cmv():
    """Build the path-to-transcript dict for the AfricanVoices ``lug_cmv`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/lug_cmv')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_luo_afv():
    """Build the path-to-transcript dict for the AfricanVoices ``luo_afv`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/luo_afv')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_luo_opb():
    """Build the path-to-transcript dict for the AfricanVoices ``luo_opb`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/luo_opb')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_swahili_llsti():
    """Build the path-to-transcript dict for the AfricanVoices ``swa_llsti`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/swa_llsti')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_suba_afv():
    """Build the path-to-transcript dict for the AfricanVoices ``sxb_afv`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/sxb_afv')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_wolof_alf():
    """Build the path-to-transcript dict for the AfricanVoices ``wol_alf`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/wol_alf')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_yoruba_opb():
    """Build the path-to-transcript dict for the AfricanVoices ``yor_opb`` directory."""
    return build_path_to_transcript_dict_african_voices_template(
        '/resources/speech/corpora/AfricanVoices/yor_opb')
|
|
|
|
|
def build_path_to_transcript_dict_african_voices_template(root):
    """
    Build a path-to-transcript dict from ``txt.done.data`` under ``root``.

    In each line, escaped quotes (``\\"``) are first turned into ``'`` and the
    line is then split on ``"``. The text between the first pair of quotes is
    the transcript; the last whitespace-separated token before the first quote
    is the file id, whose audio is expected at ``wav/<id>.wav`` under ``root``.

    :param root: corpus directory containing ``txt.done.data``
    :return: dict mapping audio path strings to transcripts
    """
    mapping = {}

    with open(Path(root, 'txt.done.data'), 'r', encoding='utf-8') as f:
        for raw in f:
            segments = raw.replace('\\"', "'").split('"')
            transcript = segments[1]
            file_id = segments[0].split()[-1]
            mapping[str(Path(root, 'wav', f'{file_id}.wav'))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_zambezi_voice_nyanja():
    """Build the path-to-transcript dict for the ZambeziVoice ``nyanja/nya`` directory."""
    return build_path_to_transcript_dict_zambezi_voice_template(
        '/resources/speech/corpora/ZambeziVoice/nyanja/nya')
|
|
|
|
|
def build_path_to_transcript_dict_zambezi_voice_lozi():
    """Build the path-to-transcript dict for the ZambeziVoice ``lozi/loz`` directory."""
    return build_path_to_transcript_dict_zambezi_voice_template(
        '/resources/speech/corpora/ZambeziVoice/lozi/loz')
|
|
|
|
|
def build_path_to_transcript_dict_zambezi_voice_tonga():
    """Build the path-to-transcript dict for the ZambeziVoice ``tonga/toi`` directory."""
    return build_path_to_transcript_dict_zambezi_voice_template(
        '/resources/speech/corpora/ZambeziVoice/tonga/toi')
|
|
|
|
|
def build_path_to_transcript_dict_zambezi_voice_template(root):
    """
    Build a path-to-transcript dict from the split TSV files under ``root``.

    ``train.tsv``, ``dev.tsv`` and ``test.tsv`` are read as tab-separated tables
    with a header row. The ``audio_id`` column, placed under ``audio/``, is the
    key and the stripped ``sentence`` column is the transcript.

    :param root: corpus directory containing the TSV files
    :return: dict mapping audio path strings to transcripts
    """
    mapping = {}

    for split in ('train', 'dev', 'test'):
        with open(Path(root, f'{split}.tsv'), 'r', encoding='utf-8') as f:
            for row in DictReader(f, delimiter='\t'):
                sentence = row['sentence'].strip()
                mapping[str(Path(root, 'audio', row['audio_id']))] = sentence

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_template(root):
    """
    Build a path-to-transcript dict from the header-less split TSV files under ``root``.

    ``train.tsv``, ``dev.tsv`` and ``test.tsv`` are read as tab-separated tables
    whose columns are named, in order: id, filename, transcription_raw,
    transcription, words, speaker, gender. The ``filename`` column, placed
    under ``audio/<split>/``, is the key and the stripped ``transcription_raw``
    column is the transcript.

    :param root: corpus directory containing the TSV files
    :return: dict mapping audio path strings to transcripts
    """
    columns = ['id', 'filename', 'transcription_raw', 'transcription', 'words', 'speaker', 'gender']
    mapping = {}

    for split in ('train', 'dev', 'test'):
        with open(Path(root, f'{split}.tsv'), 'r', encoding='utf-8') as f:
            for row in DictReader(f, delimiter='\t', fieldnames=columns):
                raw_text = row['transcription_raw'].strip()
                mapping[str(Path(root, 'audio', split, row['filename']))] = raw_text

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_afrikaans():
    """Build the path-to-transcript dict for the FLEURS ``af_za`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/af_za')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_amharic():
    """Build the path-to-transcript dict for the FLEURS ``am_et`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/am_et')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_arabic():
    """Build the path-to-transcript dict for the FLEURS ``ar_eg`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ar_eg')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_assamese():
    """Build the path-to-transcript dict for the FLEURS ``as_in`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/as_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_asturian():
    """Build the path-to-transcript dict for the FLEURS ``ast_es`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ast_es')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_azerbaijani():
    """Build the path-to-transcript dict for the FLEURS ``az_az`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/az_az')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_belarusian():
    """Build the path-to-transcript dict for the FLEURS ``be_by`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/be_by')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_bulgarian():
    """Build the path-to-transcript dict for the FLEURS ``bg_bg`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/bg_bg')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_bengali():
    """Build the path-to-transcript dict for the FLEURS ``bn_in`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/bn_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_bosnian():
    """Build the path-to-transcript dict for the FLEURS ``bs_ba`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/bs_ba')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_catalan():
    """Build the path-to-transcript dict for the FLEURS ``ca_es`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ca_es')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_cebuano():
    """Build the path-to-transcript dict for the FLEURS ``ceb_ph`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ceb_ph')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_sorani_kurdish():
    """Build the path-to-transcript dict for the FLEURS ``ckb_iq`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ckb_iq')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_mandarin():
    """Build the path-to-transcript dict for the FLEURS ``cmn_hans_cn`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/cmn_hans_cn')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_czech():
    """Build the path-to-transcript dict for the FLEURS ``cs_cz`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/cs_cz')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_welsh():
    """Build the path-to-transcript dict for the FLEURS ``cy_gb`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/cy_gb')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_danish():
    """Build the path-to-transcript dict for the FLEURS ``da_dk`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/da_dk')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_german():
    """Build the path-to-transcript dict for the FLEURS ``de_de`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/de_de')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_greek():
    """Build the path-to-transcript dict for the FLEURS ``el_gr`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/el_gr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_english():
    """Build the path-to-transcript dict for the FLEURS ``en_us`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/en_us')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_spanish():
    """Build the path-to-transcript dict for the FLEURS ``es_419`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/es_419')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_estonian():
    """Build the path-to-transcript dict for the FLEURS ``et_ee`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/et_ee')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_persian():
    """Build the path-to-transcript dict for the FLEURS ``fa_ir`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/fa_ir')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_fula():
    """Build the path-to-transcript dict for the FLEURS ``ff_sn`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ff_sn')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_finnish():
    """Build the path-to-transcript dict for the FLEURS ``fi_fi`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/fi_fi')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_filipino():
    """Build the path-to-transcript dict for the FLEURS ``fil_ph`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/fil_ph')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_french():
    """Build the path-to-transcript dict for the FLEURS ``fr_fr`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/fr_fr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_irish():
    """Build the path-to-transcript dict for the FLEURS ``ga_ie`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ga_ie')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_galician():
    """Build the path-to-transcript dict for the FLEURS ``gl_es`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/gl_es')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_gujarati():
    """Build the path-to-transcript dict for the FLEURS ``gu_in`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/gu_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_hausa():
    """Build the path-to-transcript dict for the FLEURS ``ha_ng`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ha_ng')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_hebrew():
    """Build the path-to-transcript dict for the FLEURS ``he_il`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/he_il')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_hindi():
    """Build the path-to-transcript dict for the FLEURS ``hi_in`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/hi_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_croatian():
    """Build the path-to-transcript dict for the FLEURS ``hr_hr`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/hr_hr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_hungarian():
    """Build the path-to-transcript dict for the FLEURS ``hu_hu`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/hu_hu')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_armenian():
    """Build the path-to-transcript dict for the FLEURS ``hy_am`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/hy_am')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_indonesian():
    """Build the path-to-transcript dict for the FLEURS ``id_id`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/id_id')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_igbo():
    """Build the path-to-transcript dict for the FLEURS ``ig_ng`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ig_ng')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_icelandic():
    """Build the path-to-transcript dict for the FLEURS ``is_is`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/is_is')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_italian():
    """Build the path-to-transcript dict for the FLEURS ``it_it`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/it_it')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_japanese():
    """Build the path-to-transcript dict for the FLEURS ``ja_jp`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ja_jp')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_javanese():
    """Build the path-to-transcript dict for the FLEURS ``jv_id`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/jv_id')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_georgian():
    """Build the path-to-transcript dict for the FLEURS ``ka_ge`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ka_ge')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_kamba():
    """Build the path-to-transcript dict for the FLEURS ``kam_ke`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/kam_ke')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_kabuverdianu():
    """Build the path-to-transcript dict for the FLEURS ``kea_cv`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/kea_cv')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_kazakh():
    """Build the path-to-transcript dict for the FLEURS ``kk_kz`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/kk_kz')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_khmer():
    """Build the path-to-transcript dict for the FLEURS ``km_kh`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/km_kh')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_kannada():
    """Build the path-to-transcript dict for the FLEURS ``kn_in`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/kn_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_korean():
    """Build the path-to-transcript dict for the FLEURS ``ko_kr`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ko_kr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_kyrgyz():
    """Build the path-to-transcript dict for the FLEURS ``ky_kg`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ky_kg')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_luxembourgish():
    """Build the path-to-transcript dict for the FLEURS ``lb_lu`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/lb_lu')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_ganda():
    """Build the path-to-transcript dict for the FLEURS ``lg_ug`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/lg_ug')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_lingala():
    """Build the path-to-transcript dict for the FLEURS ``ln_cd`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ln_cd')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_lao():
    """Build the path-to-transcript dict for the FLEURS ``lo_la`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/lo_la')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_lithuanian():
    """Build the path-to-transcript dict for the FLEURS ``lt_lt`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/lt_lt')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_luo():
    """Build the path-to-transcript dict for the FLEURS ``luo_ke`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/luo_ke')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_latvian():
    """Build the path-to-transcript dict for the FLEURS ``lv_lv`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/lv_lv')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_maori():
    """Build the path-to-transcript dict for the FLEURS ``mi_nz`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/mi_nz')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_macedonian():
    """Build the path-to-transcript dict for the FLEURS ``mk_mk`` directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/mk_mk')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_malayalam():
    """Run the FLEURS template builder on the Malayalam (ml_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ml_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_mongolian():
    """Run the FLEURS template builder on the Mongolian (mn_mn) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/mn_mn')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_marathi():
    """Run the FLEURS template builder on the Marathi (mr_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/mr_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_malay():
    """Run the FLEURS template builder on the Malay (ms_my) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ms_my')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_maltese():
    """Run the FLEURS template builder on the Maltese (mt_mt) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/mt_mt')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_burmese():
    """Run the FLEURS template builder on the Burmese (my_mm) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/my_mm')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_norwegian():
    """Run the FLEURS template builder on the Norwegian (nb_no) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/nb_no')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_nepali():
    """Run the FLEURS template builder on the Nepali (ne_np) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ne_np')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_dutch():
    """Run the FLEURS template builder on the Dutch (nl_nl) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/nl_nl')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_northern_sotho():
    """Run the FLEURS template builder on the Northern Sotho (nso_za) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/nso_za')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_nyanja():
    """Run the FLEURS template builder on the Nyanja (ny_mw) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ny_mw')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_occitan():
    """Run the FLEURS template builder on the Occitan (oc_fr) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/oc_fr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_oroma():
    """Run the FLEURS template builder on the om_et corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/om_et')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_oriya():
    """Run the FLEURS template builder on the Oriya (or_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/or_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_punjabi():
    """Run the FLEURS template builder on the Punjabi (pa_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/pa_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_polish():
    """Run the FLEURS template builder on the Polish (pl_pl) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/pl_pl')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_pashto():
    """Run the FLEURS template builder on the Pashto (ps_af) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ps_af')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_portuguese():
    """Run the FLEURS template builder on the Portuguese (pt_br) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/pt_br')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_romanian():
    """Run the FLEURS template builder on the Romanian (ro_ro) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ro_ro')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_russian():
    """Run the FLEURS template builder on the Russian (ru_ru) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ru_ru')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_sindhi():
    """Run the FLEURS template builder on the Sindhi (sd_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sd_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_slovak():
    """Run the FLEURS template builder on the Slovak (sk_sk) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sk_sk')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_slovenian():
    """Run the FLEURS template builder on the Slovenian (sl_si) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sl_si')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_shona():
    """Run the FLEURS template builder on the Shona (sn_zw) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sn_zw')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_somali():
    """Run the FLEURS template builder on the Somali (so_so) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/so_so')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_serbian():
    """Run the FLEURS template builder on the Serbian (sr_rs) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sr_rs')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_swedish():
    """Run the FLEURS template builder on the Swedish (sv_se) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sv_se')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_swahili():
    """Run the FLEURS template builder on the Swahili (sw_ke) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/sw_ke')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_tamil():
    """Run the FLEURS template builder on the Tamil (ta_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ta_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_telugu():
    """Run the FLEURS template builder on the Telugu (te_in) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/te_in')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_tajik():
    """Run the FLEURS template builder on the Tajik (tg_tj) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/tg_tj')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_thai():
    """Run the FLEURS template builder on the Thai (th_th) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/th_th')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_turkish():
    """Run the FLEURS template builder on the Turkish (tr_tr) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/tr_tr')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_ukrainian():
    """Run the FLEURS template builder on the Ukrainian (uk_ua) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/uk_ua')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_umbundu():
    """Run the FLEURS template builder on the Umbundu (umb_ao) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/umb_ao')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_urdu():
    """Run the FLEURS template builder on the Urdu (ur_pk) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/ur_pk')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_uzbek():
    """Run the FLEURS template builder on the Uzbek (uz_uz) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/uz_uz')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_vietnamese():
    """Run the FLEURS template builder on the Vietnamese (vi_vn) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/vi_vn')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_wolof():
    """Run the FLEURS template builder on the Wolof (wo_sn) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/wo_sn')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_xhosa():
    """Run the FLEURS template builder on the Xhosa (xh_za) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/xh_za')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_yoruba():
    """Run the FLEURS template builder on the Yoruba (yo_ng) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/yo_ng')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_cantonese():
    """Run the FLEURS template builder on the Cantonese (yue_hant_hk) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/yue_hant_hk')
|
|
|
|
|
def build_path_to_transcript_dict_fleurs_zulu():
    """Run the FLEURS template builder on the Zulu (zu_za) corpus directory."""
    return build_path_to_transcript_dict_fleurs_template('/resources/speech/corpora/fleurs/zu_za')
|
|
|
|
|
def build_path_to_transcript_dict_living_audio_dataset_template(root):
    """
    Map wav paths to transcripts using ``<root>/text.xml``.

    Every ``fileid`` element found below a ``recording_script`` element
    contributes one entry: the key is ``<root>/48000_orig/<id attribute>.wav``
    and the value is the element's text with surrounding whitespace removed.
    Files are not checked for existence.
    """
    xml_root = ET.parse(f'{root}/text.xml').getroot()
    return {
        str(Path(root, '48000_orig', f'{entry.get("id")}.wav')): entry.text.strip()
        for script in xml_root.iter('recording_script')
        for entry in script.iter('fileid')
    }
|
|
|
|
|
def build_path_to_transcript_dict_living_audio_dataset_irish():
    """Run the Living Audio Dataset template builder on the Irish (ga) directory."""
    return build_path_to_transcript_dict_living_audio_dataset_template('/resources/speech/corpora/LivingAudioDataset/ga')
|
|
|
|
|
def build_path_to_transcript_dict_living_audio_dataset_dutch():
    """Run the Living Audio Dataset template builder on the Dutch (nl) directory."""
    return build_path_to_transcript_dict_living_audio_dataset_template('/resources/speech/corpora/LivingAudioDataset/nl')
|
|
|
|
|
def build_path_to_transcript_dict_living_audio_dataset_russian():
    """Run the Living Audio Dataset template builder on the Russian (ru) directory."""
    return build_path_to_transcript_dict_living_audio_dataset_template('/resources/speech/corpora/LivingAudioDataset/ru')
|
|
|
|
|
def build_path_to_transcript_dict_romanian_db():
    """
    Map wav paths of the RomanianDB corpus to their transcripts.

    For each of the four splits, every ``*.txt`` file in ``<split>/text`` is
    read line by line. The wav name is built from a per-split prefix, the
    transcript file's stem and an id taken from the start of the line; the
    transcript is the stripped line from index 5 onwards. Files are not
    checked for existence.
    """
    root = '/resources/speech/corpora/RomanianDB'
    mapping = {}

    for split in ('training', 'testing', 'elena', 'georgiana'):
        # The wav name prefix depends only on the split, so pick it once.
        if split == 'elena':
            prefix = 'ele'
        elif split == 'georgiana':
            prefix = 'geo'
        else:
            prefix = 'adr'

        for transcript_file in Path(root, split, 'text').glob('*.txt'):
            subset = transcript_file.stem
            with open(transcript_file, 'r', encoding='utf-8') as handle:
                for raw_line in handle:
                    stripped = raw_line.strip()
                    utt_id = stripped[:2]
                    # NOTE(review): with a two-character slice this padding fires for
                    # every line of length >= 2 -- confirm against the transcript format.
                    if len(utt_id) == 2:
                        utt_id = '0' + utt_id
                    wav_name = f'{prefix}_{subset}_{utt_id}.wav'
                    mapping[str(Path(root, split, 'wav', subset, wav_name))] = stripped[5:]

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_shemo():
    """
    Map wav paths of the ShEMO corpus to their transcripts.

    ``shemo.json`` is loaded as a dict keyed by file id; each entry's
    ``gender`` value selects the sub-directory and its ``transcript`` value
    becomes the text. Only wav files that exist on disk are included.
    """
    root = '/resources/speech/corpora/ShEMO'

    with open('/resources/speech/corpora/ShEMO/shemo.json', 'r', encoding='utf-8') as handle:
        metadata = json.load(handle)

    mapping = {}
    for utt_id, info in metadata.items():
        wav = Path(root, info['gender'], f'{utt_id}.wav')
        if not wav.exists():
            continue
        mapping[str(wav)] = info['transcript']

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_mslt_template(root, lang='en'):
    """
    Map wav paths below ``root`` to a transcript line from the matching ``.snt`` file.

    Each sub-directory of ``root`` is scanned for ``*.wav`` files. The text
    file path is derived by replacing ``T0.<lang>.wav`` with
    ``T1.<lang>.snt`` in the wav path and is read as UTF-16. The first
    stripped line containing neither ``<`` nor ``[`` becomes the transcript;
    if no line qualifies, the wav file gets no entry.
    """
    mapping = {}
    audio_suffix = f'T0.{lang}.wav'
    text_suffix = f'T1.{lang}.snt'

    for split_dir in Path(root).glob('*'):
        if not split_dir.is_dir():
            continue
        for wav in split_dir.glob('*.wav'):
            wav_str = str(wav)
            with open(wav_str.replace(audio_suffix, text_suffix), 'r', encoding='utf-16') as handle:
                for raw_line in handle:
                    candidate = raw_line.strip()
                    # Lines carrying '<' or '[' are skipped in favour of a later line.
                    if '<' not in candidate and '[' not in candidate:
                        mapping[wav_str] = candidate
                        break

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_mslt_english():
    """Run the MSLT template builder on the EN data directory with lang='en'."""
    return build_path_to_transcript_dict_mslt_template('/resources/speech/corpora/MSLT/Data/EN', lang='en')
|
|
|
|
|
def build_path_to_transcript_dict_mslt_japanese():
    """Run the MSLT template builder on the JA data directory with lang='jp'."""
    return build_path_to_transcript_dict_mslt_template('/resources/speech/corpora/MSLT/Data/JA', lang='jp')
|
|
|
|
|
def build_path_to_transcript_dict_mslt_chinese():
    """Run the MSLT template builder on the ZH data directory with lang='ch'."""
    return build_path_to_transcript_dict_mslt_template('/resources/speech/corpora/MSLT/Data/ZH', lang='ch')
|
|
|
|
|
def build_path_to_transcript_dict_rajasthani_hindi_speech():
    """
    Map ``.3gp`` audio paths of the Rajasthani Hindi Speech corpus to transcripts.

    The transcript is read from the ``.txt`` file sharing the audio file's
    name. Every line overwrites the previous one, so the last line of the
    text file (stripped) is what ends up in the mapping; an empty text file
    produces no entry.
    """
    root = '/resources/speech/corpora/Rajasthani_Hindi_Speech/Hindi-Speech-Data'
    mapping = {}

    for audio in Path(root).glob('*.3gp'):
        audio_key = str(audio)
        with open(audio.with_suffix('.txt'), 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                mapping[audio_key] = raw_line.strip()

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_cmu_arctic():
    """
    Map wav paths of the CMU Arctic corpus to their transcripts.

    For every speaker directory, ``etc/txt.done.data`` is parsed line by
    line: escaped double quotes are turned into single quotes, the line is
    split on ``"``, the text between the first pair of quotes is the
    transcript and the last whitespace-separated token before it is the
    file id. Files are not checked for existence.
    """
    root = '/resources/speech/corpora/cmu_arctic'
    mapping = {}

    for speaker_dir in Path(root).glob('*'):
        if not speaker_dir.is_dir():
            continue
        with open(Path(speaker_dir, 'etc', 'txt.done.data'), 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                pieces = raw_line.replace('\\"', "'").split('"')
                transcript = pieces[1]
                utt_id = pieces[0].split()[-1]
                mapping[str(Path(speaker_dir, 'wav', f'{utt_id}.wav'))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_sevil_tatar():
    """
    Map audio paths of the Sevil Tatar corpus to their transcripts.

    Each line of ``metadata.jsonl`` is parsed as JSON. The ``file`` value,
    joined onto the corpus root, is the key; the ``orig_text`` value,
    stripped and with soft hyphens (``\\xad``) removed, is the transcript.
    """
    root = '/resources/speech/corpora/sevil_tatar/sevil'
    mapping = {}

    with open(Path(root, 'metadata.jsonl'), 'r', encoding='utf-8') as handle:
        for record in map(json.loads, handle):
            transcript = record['orig_text'].strip().replace('\xad', '')
            mapping[str(Path(root, record['file']))] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_clartts():
    """
    Map wav paths of the ClArTTS corpus to their transcripts.

    ``training.txt`` and ``validation.txt`` (both UTF-16) hold one
    ``fileid|transcript`` pair per line; the wav files are addressed under
    ``wav/train`` and ``wav/val`` respectively. Files are not checked for
    existence.
    """
    root = '/resources/speech/corpora/ClArTTS'
    mapping = {}

    # Training entries are processed first, then validation.
    for list_name, wav_subdir in (('training.txt', 'train'), ('validation.txt', 'val')):
        with open(Path(root, list_name), 'r', encoding='utf-16') as handle:
            for raw_line in handle:
                utt_id, text = raw_line.strip().split('|')
                mapping[str(Path(root, 'wav', wav_subdir, f'{utt_id}.wav'))] = text

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_template(root, lang):
    """
    Map audio paths to transcripts from the Snow Mountain experiment lists.

    Reads ``train_full.csv``, ``val_full.csv`` and ``test_common.csv`` from
    ``<root>/experiments/<lang>``. In each row, every ``data/`` in the
    ``path`` column is replaced with ``<root>/`` to form the key, and the
    stripped ``sentence`` column is the transcript.
    """
    mapping = {}
    list_dir = Path(root, 'experiments', lang)

    for split in ('train_full', 'val_full', 'test_common'):
        with open(Path(list_dir, f'{split}.csv'), 'r', encoding='utf-8') as handle:
            for row in DictReader(handle, delimiter=','):
                audio_path = row['path'].replace('data/', f'{root}/')
                mapping[audio_path] = row['sentence'].strip()

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_bhadrawahi():
    """Run the Snow Mountain template builder for the 'bhadrawahi' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'bhadrawahi')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_bilaspuri():
    """Run the Snow Mountain template builder for the 'bilaspuri' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'bilaspuri')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_dogri():
    """Run the Snow Mountain template builder for the 'dogri' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'dogri')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_gaddi():
    """Run the Snow Mountain template builder for the 'gaddi' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'gaddi')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_haryanvi():
    """Run the Snow Mountain template builder for the 'haryanvi' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'haryanvi')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_hindi():
    """Run the Snow Mountain template builder for the 'hindi' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'hindi')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_kangri():
    """Run the Snow Mountain template builder for the 'kangri' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'kangri')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_kannada():
    """Run the Snow Mountain template builder for the 'kannada' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'kannada')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_kulvi():
    """Run the Snow Mountain template builder for the 'kulvi' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'kulvi')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_kulvi_outer_seraji():
    """Run the Snow Mountain template builder for the 'kulvi_outer_seraji' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'kulvi_outer_seraji')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_malayalam():
    """Run the Snow Mountain template builder for the 'malayalam' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'malayalam')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_mandeali():
    """Run the Snow Mountain template builder for the 'mandeali' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'mandeali')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_pahari_mahasui():
    """Run the Snow Mountain template builder for the 'pahari_mahasui' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'pahari_mahasui')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_tamil():
    """Run the Snow Mountain template builder for the 'tamil' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'tamil')
|
|
|
|
|
def build_path_to_transcript_dict_snow_mountain_telugu():
    """Run the Snow Mountain template builder for the 'telugu' language lists."""
    return build_path_to_transcript_dict_snow_mountain_template('/resources/speech/corpora/snow_mountain', 'telugu')
|
|
|
|
|
def build_path_to_transcript_dict_ukrainian_lada():
    """
    Map wav paths of the Ukrainian Lada corpus to their transcripts.

    Each line of ``metadata.jsonl`` is parsed as JSON. The ``file`` value is
    joined onto the corpus root and its suffix replaced by ``.wav`` to form
    the key; the ``orig_text`` value, stripped and with soft hyphens
    (``\\xad``) removed, is the transcript.
    """
    root = '/resources/speech/corpora/ukrainian_lada/dataset_lada/accept'
    mapping = {}

    with open(Path(root, 'metadata.jsonl'), 'r', encoding='utf-8') as handle:
        for record in map(json.loads, handle):
            transcript = record['orig_text'].strip().replace('\xad', '')
            wav = Path(root, record['file']).with_suffix('.wav')
            mapping[str(wav)] = transcript

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_template(root):
    """
    Map existing wav paths below ``root`` to their normalized transcripts.

    The tree is walked as ``<root>/<dir>/<speaker>``. A speaker directory
    that contains ``wavs`` is read directly; otherwise each of its
    sub-directories is read. Every directory that is read must provide a
    ``metadata.csv`` with ``fileid|text|normalized text`` rows; the third
    field is used as the transcript and entries whose wav file is missing
    are dropped.
    """
    mapping = {}

    def collect(book_dir):
        # Read one metadata.csv and register the wav files found next to it.
        with open(Path(book_dir, 'metadata.csv'), 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                utt_id, _raw_text, normalized = raw_line.strip().split('|')
                wav = Path(book_dir, 'wavs', f'{utt_id}.wav')
                if wav.exists():
                    mapping[str(wav)] = normalized

    for gender_dir in Path(root).glob('*'):
        if gender_dir.is_dir():
            for speaker_dir in gender_dir.glob('*'):
                if speaker_dir.is_dir():
                    if (speaker_dir / 'wavs').exists():
                        collect(speaker_dir)
                    else:
                        # No wavs directly under the speaker: descend one more level.
                        for session_dir in speaker_dir.glob('*'):
                            if session_dir.is_dir():
                                collect(session_dir)

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_german():
    """Run the M-AILABS template builder on the German (de_DE) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/de_DE')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_uk_english():
    """Run the M-AILABS template builder on the UK English (en_UK) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/en_UK')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_us_english():
    """Run the M-AILABS template builder on the US English (en_US) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/en_US')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_spanish():
    """Run the M-AILABS template builder on the Spanish (es_ES) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/es_ES')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_french():
    """Run the M-AILABS template builder on the French (fr_FR) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/fr_FR')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_italian():
    """Run the M-AILABS template builder on the Italian (it_IT) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/it_IT')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_polish():
    """Run the M-AILABS template builder on the Polish (pl_PL) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/pl_PL')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_russian():
    """Run the M-AILABS template builder on the Russian (ru_RU) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/ru_RU')
|
|
|
|
|
def build_path_to_transcript_dict_m_ailabs_ukrainian():
    """Run the M-AILABS template builder on the Ukrainian (uk_UK) directory."""
    return build_path_to_transcript_dict_m_ailabs_template('/resources/speech/corpora/m-ailabs-speech/uk_UK')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_template(root):
    """
    Map wav paths to transcripts from the CML-TTS split lists below ``root``.

    ``train.csv``, ``dev.csv`` and ``test.csv`` are read as ``|``-delimited
    tables with a header row. The ``wav_filename`` column, joined onto
    ``root``, is the key and the stripped ``transcript`` column is the
    value. Files are not checked for existence.
    """
    mapping = {}

    for split in ('train', 'dev', 'test'):
        with open(Path(root, f'{split}.csv'), 'r', encoding='utf-8') as handle:
            for row in DictReader(handle, delimiter='|'):
                wav = Path(root, row['wav_filename'])
                mapping[str(wav)] = row['transcript'].strip()

    return mapping
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_dutch():
    """Run the CML-TTS template builder on the Dutch v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_dutch_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_french():
    """Run the CML-TTS template builder on the French v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_french_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_german():
    """Run the CML-TTS template builder on the German v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_german_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_italian():
    """Run the CML-TTS template builder on the Italian v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_italian_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_polish():
    """Run the CML-TTS template builder on the Polish v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_polish_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_portuguese():
    """Run the CML-TTS template builder on the Portuguese v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_portuguese_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_cml_tts_spanish():
    """Run the CML-TTS template builder on the Spanish v0.1 dataset directory."""
    return build_path_to_transcript_dict_cml_tts_template('/resources/speech/corpora/cml_tts/cml_tts_dataset_spanish_v0.1')
|
|
|
|
|
def build_path_to_transcript_dict_mms_template(lang, root='/resources/speech/corpora/mms_synthesized_bible_speech'):
    """
    Map existing wav paths to the lines of ``<root>/bible_texts/<lang>.txt``.

    The zero-based line number selects the audio file
    ``<root>/bible_audios/<lang>/<line number>.wav``. Lines whose wav file is
    missing are skipped but still advance the numbering.
    """
    mapping = {}

    with open(Path(root, 'bible_texts', f'{lang}.txt'), 'r', encoding='utf-8') as handle:
        for index, verse in enumerate(handle):
            wav = Path(root, 'bible_audios', lang, f'{index}.wav')
            if wav.exists():
                mapping[str(wav)] = verse.strip()

    return mapping
|
|
|
|
|
# Running this module as a script does nothing; it only provides the builder functions above.
if __name__ == '__main__':
    pass
|
|