import sys
sys.path.append('tacotron2/')
sys.path.append('tacotron2/waveglow')

import torch
import os
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from scipy.io.wavfile import write
from text import text_to_sequence
import wave

device = "cpu"

# Load the Tacotron 2 architecture from torch.hub (pretrained=False; the Nepali
# weights are loaded from a local checkpoint below)
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tacotron2",
    model_math='fp32',
    pretrained=False,
)

# Load the weights and biases of the Nepali Tacotron 2 checkpoint
tacotron2_checkpoint_path = os.path.join(os.getcwd(), 'model_E45.ckpt')
state_dict = torch.load(tacotron2_checkpoint_path, map_location=device)
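# Some trainers save checkpoints nested under a "state_dict" key; unwrap if so.
# (This is an assumption about model_E45.ckpt's layout; it is a no-op when the
# file already holds the flat state dict that load_state_dict below expects.)
if isinstance(state_dict, dict) and "state_dict" in state_dict:
    state_dict = state_dict["state_dict"]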

tacotron2.load_state_dict(state_dict)
tacotron2 = tacotron2.to(device)
tacotron2.eval()

# Load Nvidia Waveglow from Hub
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp32",
    pretrained=False,
)
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    progress=False,
    map_location=device,
)
state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}

waveglow.load_state_dict(state_dict)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()

# ERR: module glow not found — torch.load below unpickles a full WaveGlow model
# object, so Python must be able to import the original repo's `glow` module
# (hence the sys.path appends at the top). Loading the architecture from
# torch.hub and the weights from a state dict, as done above, sidesteps this.
# waveglow_pretrained_model = os.path.join(os.getcwd(), 'waveglow_256channels_ljs_v3.pt')
# waveglow = torch.load(waveglow_pretrained_model, map_location=device)['model']
# waveglow = waveglow.to(device)
# waveglow.eval()

# Load Nvidia Utils from Hub
# utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
# sequences, lengths = utils.prepare_input_sequence([text])
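
# A minimal sketch of the concatenate_audio_wave helper referenced (commented
# out) inside inference() below; only its name and call signature come from
# this script. Assumes all inputs are PCM WAV files sharing the same sample
# rate, sample width, and channel count.
def concatenate_audio_wave(input_paths, output_path):
    params, frames = None, []
    for path in input_paths:
        with wave.open(path, "rb") as wf:
            if params is None:
                params = wf.getparams()  # take header params from the first file
            frames.append(wf.readframes(wf.getnframes()))
    with wave.open(output_path, "wb") as out:
        out.setparams(params)  # nframes is patched on close to match what we write
        for chunk in frames:
            out.writeframes(chunk)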

def inference(text):
    # Synthesize each non-empty input line. NOTE: the output files are rewritten
    # on every iteration, so only the last line's audio and spectrogram survive
    # (the commented concatenate_audio_wave call below was meant to stitch lines).
    for i in [x for x in text.split("\n") if len(x)]:
        # Ensure a trailing ";" (presumably the end-of-sentence token the model expects)
        if i[-1] != ";":
            i = i + ";"

        with torch.no_grad():
            sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
            sequence = torch.from_numpy(sequence).to(device).long()
            mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
            # plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T))
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
            
            # Save mel spectrogram
            plt.imshow(mel_outputs_postnet[0].cpu().detach().numpy())
            plt.axis('off')
            plt.savefig("test.png", bbox_inches='tight')
        
            # Save audio with both scipy and torchaudio
            audio_numpy = audio[0].data.cpu().numpy()
            rate = 22050
            write("output1.wav", rate, audio_numpy)
            torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=rate)

            
            # Alternative vocoder path using HiFi-GAN (left disabled: hifigan and
            # MAX_WAV_VALUE are not defined in this script):
            # sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
            # sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
            # mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
            # audio = hifigan(mel_outputs_postnet.float()).to("cpu")
            # audio = audio * MAX_WAV_VALUE
            # data = audio.squeeze().detach().cpu().numpy()
            # rate = 22050
            # scaled = np.int16(data / np.max(np.abs(data)) * 32767)
            # write('test.wav', rate, scaled)
            # concatenate_audio_wave(["output.wav","test.wav"],"output.wav")

    # Variant using NVIDIA's TTS utils (disabled; pairs with the commented
    # torch.hub utils load above):
    # with torch.no_grad():
    #         sequences, lengths = utils.prepare_input_sequence([text])
    #         sequences = sequences.to(device)
    #         lengths = lengths.to(device)
    #         mel, _, _ = tacotron2.infer(sequences, lengths)
    #         audio = waveglow.infer(mel)

    # #Save Mel Spectrogram
    # plt.imshow(mel[0].cpu().detach())
    # plt.axis('off')
    # plt.savefig("test.png", bbox_inches='tight')

    # #Save Audio
    # audio_numpy = audio[0].data.cpu().numpy()
    # rate = 22050
    # write("output1.wav", rate, audio_numpy)
    # torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=22050)
    
    return "output1.wav", "output2.wav", "test.png"
  
title="TACOTRON 2"
description="Nepali Speech TACOTRON 2: a Tacotron 2 model that generates mel spectrograms from Nepali text, vocoded to audio with WaveGlow. To use it, simply add your text or click on one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"
examples=[["म नेपाली टिटिएस हुँ"]]
gr.Interface(
    inference,
    "text",
    [
        gr.outputs.Audio(type="file", label="Audio (scipy)"),
        gr.outputs.Audio(type="file", label="Audio (torchaudio)"),
        gr.outputs.Image(type="file", label="Spectrogram"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(enable_queue=True)