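"""Gradio demo: Nepali text-to-speech with Tacotron 2 (acoustic model) and
WaveGlow (vocoder), running on CPU."""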
import sys
sys.path.append('tacotron2/')          # provides the `text` module used below
sys.path.append('tacotron2/waveglow')  # provides the `glow` module WaveGlow needs

import os

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio
from scipy.io.wavfile import write

from text import text_to_sequence
device = "cpu"  # inference runs on CPU; switch to "cuda" if a GPU is available
# Load the NVIDIA Tacotron 2 architecture from Torch Hub
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tacotron2",
    model_math='fp32',
    pretrained=False,  # weights come from the Nepali checkpoint below
)
# Load the fine-tuned Nepali Tacotron 2 weights
tacotron2_checkpoint_path = os.path.join(os.getcwd(), 'model_E45.ckpt')
state_dict = torch.load(tacotron2_checkpoint_path, map_location=device)
tacotron2.load_state_dict(state_dict)
tacotron2 = tacotron2.to(device)
tacotron2.eval()
# Load Nvidia Waveglow from Hub
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp32",
    pretrained=False,
)
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    progress=False,
    map_location=device,
)
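# The downloaded checkpoint stores parameters under a "module." prefix (the
# signature of (Distributed)DataParallel training), so strip it before loading.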
state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}
waveglow.load_state_dict(state_dict)
waveglow = waveglow.remove_weightnorm(waveglow)  # fold weight norm into the weights for faster inference
waveglow = waveglow.to(device)
waveglow.eval()
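# Pipeline recap: Tacotron 2 turns a sequence of symbol IDs into a mel
# spectrogram; WaveGlow then inverts that spectrogram into a 22.05 kHz waveform.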
# Loading a serialized WaveGlow model object directly fails with
# "ModuleNotFoundError: No module named 'glow'" (torch.load needs the original
# glow module on the path), so the Hub architecture plus downloaded weights
# above are used instead:
# waveglow_pretrained_model = os.path.join(os.getcwd(), 'waveglow_256channels_ljs_v3.pt')
# waveglow = torch.load(waveglow_pretrained_model, map_location=device)['model']
# waveglow = waveglow.to(device)
# waveglow.eval()
# NVIDIA's bundled TTS utils could replace the manual text_to_sequence call
# below, but they are geared toward English text, so they are left disabled:
# utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
# sequences, lengths = utils.prepare_input_sequence([text])
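# Illustrative sketch of the preprocessing done below (the symbol IDs shown
# are hypothetical; real values depend on the symbol table in tacotron2/text):
#   text_to_sequence("namaste;", ['transliteration_cleaners'])
#   -> [27, 11, 24, 11, ...]   # one integer ID per cleaned character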
def inference(text):
    # Synthesize each non-empty input line separately, then concatenate the
    # audio so that multi-line input is not reduced to just its last line.
    audio_chunks = []
    mel_outputs_postnet = None
    for line in [x for x in text.split("\n") if len(x)]:
        # Ensure each line ends with ";", the sentence terminator this
        # checkpoint expects.
        if line[-1] != ";":
            line = line + ";"
        with torch.no_grad():
            sequence = np.array(text_to_sequence(line, ['transliteration_cleaners']))[None, :]
            sequence = torch.from_numpy(sequence).to(device).long()
            mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
            # sigma scales the Gaussian prior WaveGlow samples from; values
            # below 1.0 reduce audible noise.
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
        audio_chunks.append(audio[0].data.cpu())
    # Save the mel spectrogram of the last synthesized line
    plt.imshow(mel_outputs_postnet[0].cpu().detach())
    plt.axis('off')
    plt.savefig("test.png", bbox_inches='tight')
    # Save the concatenated waveform (22.05 kHz, as used in training)
    full_audio = torch.cat(audio_chunks)
    rate = 22050
    write("output1.wav", rate, full_audio.numpy())
    torchaudio.save("output2.wav", full_audio.unsqueeze(0), sample_rate=rate)
    # Alternative vocoder path (HiFi-GAN), kept for reference; `hifigan`,
    # `MAX_WAV_VALUE` and `concatenate_audio_wave` are not defined in this script:
    # audio = hifigan(mel_outputs_postnet.float()).to("cpu")
    # audio = audio * MAX_WAV_VALUE
    # data = audio.squeeze().detach().cpu().numpy()
    # scaled = np.int16(data / np.max(np.abs(data)) * 32767)
    # write('test.wav', 22050, scaled)
    # concatenate_audio_wave(["output.wav", "test.wav"], "output.wav")
return "output1.wav", "output2.wav", "test.png"
title="TACOTRON 2"
description = "Nepali Speech Tacotron 2: generates a mel spectrogram from Nepali text, then synthesizes audio with WaveGlow. To use it, simply enter your text or click on one of the examples to load it. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"
examples=[["म नेपाली टिटिएस हुँ"]]
# inference() returns three files, so declare three matching output components.
gr.Interface(
    inference,
    "text",
    [
        gr.outputs.Audio(type="file", label="Audio (scipy)"),
        gr.outputs.Audio(type="file", label="Audio (torchaudio)"),
        gr.outputs.Image(type="file", label="Spectrogram"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(enable_queue=True)