"""Gradio demo: multi-speaker English TTS with NeMo FastPitch + HiFi-GAN."""

import tempfile  # noqa: F401  (kept: may be needed for file-based audio output)

import gradio as gr
import numpy as np  # noqa: F401
import torch
import torchaudio  # noqa: F401
from nemo.collections.tts.models import FastPitchModel, HifiGanModel, MixerTTSModel  # noqa: F401

# Loaded once at import time so every request reuses the same weights.
# FastPitch (multi-speaker) produces mel spectrograms; HiFi-GAN vocodes them.
spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
spec_generator.eval()
voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
voc_model.eval()


def greet(name):
    """Return a toy greeting for *name* (unused demo helper)."""
    return "Hello " + name + "!!"


def generate_tts(text: str, speaker: int = 0):
    """Synthesize *text* into speech with the given *speaker* identity.

    Args:
        text: Text to synthesize.
        speaker: Speaker index for the multi-speaker FastPitch model.

    Returns:
        Tuple ``(sample_rate, waveform)`` where ``waveform`` is a NumPy
        array, matching Gradio's ``Audio(type="numpy")`` contract.
    """
    # HiFi-TTS checkpoints are trained at 44.1 kHz.
    sr = 44100
    # inference_mode: no autograd bookkeeping during synthesis.
    with torch.inference_mode():
        parsed = spec_generator.parse(text)
        spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
        audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
    # Detach before moving off-device, then hand Gradio a NumPy array.
    return (sr, audio.detach().cpu().numpy())


def run():
    """Build the Gradio interface and serve it on all interfaces, port 7860."""
    demo = gr.Interface(
        fn=generate_tts,
        inputs=[
            gr.Textbox(value="This is a test.", label="Text to Synthesize"),
            gr.Slider(0, 10, step=1, label="Speaker"),
        ],
        outputs=gr.Audio(label="Output", type="numpy"),
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    run()