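# Nepali text-to-speech demo: Tacotron 2 (mel-spectrogram prediction) + WaveGlow (vocoder), served with Gradio.
# Make the bundled tacotron2 repo and its waveglow submodule importable before the other imports.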
import sys
sys.path.append('tacotron2/')
sys.path.append('tacotron2/waveglow')

import torch
import os
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from scipy.io.wavfile import write
from text import symbols, text_to_sequence
import wave

device = "cpu"
# Load the NVIDIA Tacotron 2 architecture from Torch Hub (weights are loaded separately below)
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tacotron2",
    model_math='fp32',
    pretrained=False,
)

# Load the checkpoint fine-tuned on Nepali text
tacotron2_checkpoint_path = os.path.join(os.getcwd(), 'model_E45.ckpt')
state_dict = torch.load(tacotron2_checkpoint_path, map_location=device)
tacotron2.load_state_dict(state_dict)
tacotron2 = tacotron2.to(device)
tacotron2.eval()
# Load the NVIDIA WaveGlow architecture from Torch Hub
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp32",
    pretrained=False,
)
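# Download the pretrained WaveGlow (LJSpeech, FP32) weights from NVIDIA NGC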
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    progress=False,
    map_location=device,
)
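# The checkpoint was saved from a DataParallel-wrapped model, so parameter names carry a
# "module." prefix; strip it before loading the state dict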
state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}
waveglow.load_state_dict(state_dict)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()
# Alternative: loading a local WaveGlow checkpoint directly fails with "module glow not found",
# so the Torch Hub route above is used instead.
# waveglow_pretrained_model = os.path.join(os.getcwd(), 'waveglow_256channels_ljs_v3.pt')
# waveglow = torch.load(waveglow_pretrained_model, map_location=device)['model']
# waveglow = waveglow.to(device)
# waveglow.eval()

# NVIDIA TTS utils from Torch Hub (unused; the input text is cleaned with text_to_sequence instead)
# utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
# sequences, lengths = utils.prepare_input_sequence([text])
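# Synthesize each non-empty input line: Tacotron 2 predicts a mel spectrogram from the
# character sequence and WaveGlow vocodes it to a waveform. The spectrogram image and the
# two audio files are rewritten on every iteration, so the returned files hold the last
# line that was synthesized.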
def inference(text):
    for i in [x for x in text.split("\n") if len(x)]:
        if i[-1] != ";":
            i = i + ";"
        with torch.no_grad():
            sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
            sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
            mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
            # plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T))
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)

            # Save mel spectrogram
            plt.imshow(mel_outputs_postnet[0].cpu().detach())
            plt.axis('off')
            plt.savefig("test.png", bbox_inches='tight')

            # Save audio
            audio_numpy = audio[0].data.cpu().numpy()
            rate = 22050
            write("output1.wav", rate, audio_numpy)
            torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=rate)
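    # Earlier experiments kept for reference: HiFi-GAN vocoding and the NVIDIA tts_utils preprocessing path.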
    # sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
    # sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
    # mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
    # audio = hifigan(mel_outputs_postnet.float()).to("cpu")
    # audio = audio * MAX_WAV_VALUE
    # data = audio.squeeze().detach().cpu().numpy()
    # rate = 22050
    # scaled = np.int16(data / np.max(np.abs(data)) * 32767)
    # write('test.wav', rate, scaled)
    # concatenate_audio_wave(["output.wav","test.wav"],"output.wav")

    # with torch.no_grad():
    #     sequences, lengths = utils.prepare_input_sequence([text])
    #     sequences = sequences.to(device)
    #     lengths = lengths.to(device)
    #     mel, _, _ = tacotron2.infer(sequences, lengths)
    #     audio = waveglow.infer(mel)
    #     # Save mel spectrogram
    #     plt.imshow(mel[0].cpu().detach())
    #     plt.axis('off')
    #     plt.savefig("test.png", bbox_inches='tight')
    #     # Save audio
    #     audio_numpy = audio[0].data.cpu().numpy()
    #     rate = 22050
    #     write("output1.wav", rate, audio_numpy)
    #     torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=22050)

    return "output1.wav", "output2.wav", "test.png"
title="TACOTRON 2" | |
description="Nepali Speech TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. To use it, simply add you text or click on one of the examples to load them. Read more at the links below." | |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>" | |
examples=[["म नेपाली टिटिएस हुँ"]] | |
# inference() returns three files, so the interface needs three output components
gr.Interface(
    inference,
    "text",
    [
        gr.outputs.Audio(type="file", label="Audio (scipy)"),
        gr.outputs.Audio(type="file", label="Audio (torchaudio)"),
        gr.outputs.Image(type="file", label="Spectrogram"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(enable_queue=True)