|
import json |
|
import re |
|
import csv |
|
import shutil |
|
import os |
|
import argparse |
|
|
|
# Working directory at import time; save_files() builds the output tree
# ("dataset/persian_data/...") underneath this directory.
main_path = os.getcwd()
|
|
|
def get_duration(row):
    """Sum the bracketed durations of a phoneme string, scaled by 1/1000.

    ``row`` is split on whitespace. Each token shaped like ``name[123]``
    contributes ``123 / 1000`` to the total (presumably milliseconds
    converted to seconds -- confirm against the dataset). Tokens that
    contain no ``[`` -- the ``|`` separator or an unannotated symbol --
    contribute nothing instead of raising ``IndexError``.

    Args:
        row: Phoneme string, e.g. ``"a[500] | b[250]"``.

    Returns:
        The total as a float; ``0.0`` when no token carries a duration.

    Raises:
        ValueError: If the text following ``[`` is not a number.
    """
    total = 0.0
    for token in row.split():
        pieces = token.split('[')
        if len(pieces) < 2:
            # Separator ('|') or a token without a duration annotation.
            continue
        # Strip the closing bracket explicitly rather than slicing off the
        # last character, so a token missing its "]" keeps its final digit.
        total += float(pieces[1].rstrip(']')) / 1000
    return total
|
|
|
def prepare_data_for_model(path, duration_lim):
    """Load utterances from an info CSV, dropping those that are too long.

    The CSV must have a header row with the columns ``phenome`` (spelled
    that way in the file), ``seg_id`` and ``speaker_id``. A row is skipped
    when ``get_duration`` of its ``phenome`` value exceeds ``duration_lim``.
    For the remaining rows the phoneme string is cleaned: bracketed integer
    annotations such as ``[120]`` are removed and every ``|`` surrounded by
    whitespace is collapsed to a single space.

    Args:
        path: Path of the CSV file to read.
        duration_lim: Largest duration (as returned by ``get_duration``)
            that is still kept.

    Returns:
        A list of ``[phoneme, seg_id, speaker_id]`` lists, in file order.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Raw strings keep the backslashes without invalid-escape warnings;
    # compiling once hoists the pattern lookup out of the row loop.
    duration_tag = re.compile(r"\[([0-9]+)\]")
    bar_separator = re.compile(r"\s+\|\s+")

    data_lines = []
    # `with` closes the file even if a row raises; newline='' is what the
    # csv module asks for so it can handle line endings itself.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            annotated = row['phenome']
            if get_duration(annotated) > duration_lim:
                continue
            phoneme = duration_tag.sub('', annotated)
            phoneme = bar_separator.sub(' ', phoneme)
            data_lines.append([phoneme, row['seg_id'], row['speaker_id']])
    return data_lines
|
|
|
|
|
def _save_split(records, data_path, split):
    """Copy the wavs and write the transcripts of one split ('train' or 'test').

    Returns False as soon as creating a directory or copying a wav fails
    with OSError (the error is printed); returns True otherwise.
    """
    for line in records:
        phoneme, utterance_name, speaker_id = line[:3]
        original = os.path.join(
            data_path, '{}_wav/{}.wav'.format(split, utterance_name))
        # Shared stem of the copied wav and its transcript file.
        target_stem = os.path.join(
            main_path,
            'dataset/persian_data/{}_data/speaker-{}/book-1/utterance-{}'.format(
                split, speaker_id, utterance_name))
        target = target_stem + '.wav'
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copyfile(original, target)
        except OSError as e:
            print(e)
            return False

        # The transcript is written only after the audio copy succeeded.
        with open(target_stem + '.txt', 'w') as fp:
            fp.write(phoneme)
    return True


def save_files(train_data, test_data, data_path):
    """Lay out the train and test data as per-speaker wav/txt pairs.

    For every record ``[phoneme, utterance_name, speaker_id]`` the file
    ``<data_path>/<split>_wav/<utterance_name>.wav`` is copied to
    ``<main_path>/dataset/persian_data/<split>_data/speaker-<speaker_id>/
    book-1/utterance-<utterance_name>.wav`` and the phoneme string is
    written next to it with a ``.txt`` extension. ``main_path`` is the
    module-level working directory. Train records are processed first;
    processing stops at the first failed copy, so the test split is not
    touched when a train copy fails.

    Args:
        train_data: Records for the train split.
        test_data: Records for the test split.
        data_path: Directory containing ``train_wav`` and ``test_wav``.

    Returns:
        True when every record was written, False after the first
        directory-creation or copy error (which is printed).
    """
    return (_save_split(train_data, data_path, 'train')
            and _save_split(test_data, data_path, 'test'))
|
|
|
def main():
    """Command-line entry point: convert the dataset found under --data_path.

    Expects ``train_info.csv`` and ``test_info.csv`` inside ``--data_path``.
    Train utterances are kept up to a duration limit of 12 and test
    utterances up to 15 (in the units returned by ``get_duration``).

    Returns:
        -1 when either CSV file is missing; otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', required=True)
    data_path = parser.parse_args().data_path

    # Guard clauses: give up as soon as one of the info files is missing.
    train_data_path = os.path.join(data_path, 'train_info.csv')
    if not os.path.isfile(train_data_path):
        print('data_path is not correct!')
        return -1

    test_data_path = os.path.join(data_path, 'test_info.csv')
    if not os.path.isfile(test_data_path):
        print('data_path is not correct!')
        return -1

    train_data = prepare_data_for_model(train_data_path, 12)
    test_data = prepare_data_for_model(test_data_path, 15)
    print('number of train data: ' + str(len(train_data)))
    print('number of test data: ' + str(len(test_data)))

    if save_files(train_data, test_data, data_path):
        print('Data is created.')
|
|
|
if __name__ == "__main__":
    # Hand main()'s result to the interpreter as the exit status: the -1
    # returned for a bad --data_path becomes a non-zero exit code, while the
    # None returned otherwise exits with status 0.
    raise SystemExit(main())
|
|