# coding=utf8
import argparse
import os
import pathlib
import sys

# Make the repository root importable before loading project modules.
root_dir = pathlib.Path(__file__).parent.parent.resolve()
os.environ['PYTHONPATH'] = str(root_dir)
sys.path.insert(0, str(root_dir))

import numpy as np
import torch
import tqdm

from inference.ds_acoustic import DiffSingerAcousticInfer
from utils.infer_utils import cross_fade, save_wav
from utils.hparams import set_hparams, hparams

parser = argparse.ArgumentParser(description='Run DiffSinger vocoder')
parser.add_argument('mel', type=str, help='Path to the input file')
parser.add_argument('--exp', type=str, required=False, help='Read vocoder class and path from chosen experiment')
parser.add_argument('--config', type=str, required=False, help='Read vocoder class and path from config file')
parser.add_argument('--class', type=str, required=False, help='Specify vocoder class')
parser.add_argument('--ckpt', type=str, required=False, help='Specify vocoder checkpoint path')
parser.add_argument('--out', type=str, required=False, help='Path of the output folder')
parser.add_argument('--title', type=str, required=False, help='Title of output file')
args = parser.parse_args()

mel = pathlib.Path(args.mel)
name = mel.stem if not args.title else args.title
config = None
if args.exp:
    config = root_dir / 'checkpoints' / args.exp / 'config.yaml'
elif args.config:
    config = pathlib.Path(args.config)
else:
    assert False, 'Either argument \'--exp\' or \'--config\' should be specified.'

# Load hparams from the chosen config as if it were passed on the command line.
sys.argv = [
    sys.argv[0],
    '--config',
    str(config)
]
set_hparams(print_hparams=False)

# Optional overrides of vocoder class and checkpoint path.
cls = getattr(args, 'class')
if cls:
    hparams['vocoder'] = cls
if args.ckpt:
    hparams['vocoder_ckpt'] = args.ckpt

out = args.out
if args.out:
    out = pathlib.Path(args.out)
else:
    out = mel.parent

mel_seq = torch.load(mel)
assert isinstance(mel_seq, list), 'Not a valid mel sequence.'
assert len(mel_seq) > 0, 'Mel sequence is empty.'

sample_rate = hparams['audio_sample_rate']

infer_ins = DiffSingerAcousticInfer(load_model=False)


def run_vocoder(path: pathlib.Path):
    # Concatenate vocoded segments at their offsets, padding gaps with silence
    # and cross-fading segments that overlap the audio rendered so far.
    result = np.zeros(0)
    current_length = 0

    for seg_mel in tqdm.tqdm(mel_seq, desc='mel segment', total=len(mel_seq)):
        seg_audio = infer_ins.run_vocoder(seg_mel['mel'].to(infer_ins.device), f0=seg_mel['f0'].to(infer_ins.device))
        seg_audio = seg_audio.squeeze(0).cpu().numpy()
        silent_length = round(seg_mel['offset'] * sample_rate) - current_length
        if silent_length >= 0:
            result = np.append(result, np.zeros(silent_length))
            result = np.append(result, seg_audio)
        else:
            result = cross_fade(result, seg_audio, current_length + silent_length)
        current_length = current_length + silent_length + seg_audio.shape[0]

    print(f'| save audio: {path}')
    save_wav(result, path, sample_rate)


os.makedirs(out, exist_ok=True)
try:
    run_vocoder(out / (name + '.wav'))
except KeyboardInterrupt:
    exit(-1)
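# Example invocation (a sketch, not taken from the repository docs; the script
# path 'scripts/vocode.py' and the experiment name are assumptions):
#   python scripts/vocode.py segments.mel.pt --exp my_acoustic_exp --out output --title demo
# The input file is expected to be a torch-saved list of dicts, each holding
# 'mel' and 'f0' tensors plus an 'offset' in seconds, as consumed by run_vocoder().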