"""Gradio demo that synthesizes speech from text with SpeechBrain models.

Text is converted to a mel spectrogram by Tacotron2 and then to a waveform
by the HiFi-GAN vocoder; the result is played back in a Gradio audio widget.
"""

import gradio as gr
import torch
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN

# Sample rate (Hz) of the LJSpeech-trained checkpoints loaded below.
SAMPLE_RATE = 22050

# Load the models once at start-up so every request reuses them.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmpdir_vocoder",
)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)


def synthesize_speech(text):
    """Synthesize speech for *text* and return it in a form Gradio can play.

    Args:
        text: The sentence(s) to speak.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is a 1-D NumPy
        array holding the waveform, or ``None`` when *text* is empty or only
        whitespace (Gradio then shows no audio).
    """
    # Nothing to say: skip inference instead of feeding the model empty input.
    if not text or not text.strip():
        return None

    with torch.no_grad():
        # Tacotron2 turns the text into a mel spectrogram.
        mel_output, _, _ = tacotron2.encode_text(text)
        # HiFi-GAN turns the mel spectrogram into a waveform.
        waveforms = hifi_gan.decode_batch(mel_output)

    # decode_batch is documented to return [batch, 1, time]; with a single
    # input sentence, squeezing leaves the 1-D sample array Gradio expects.
    # Returning the samples directly (rather than torch.save-ing a pickled
    # tensor to a fixed path) gives Gradio real audio data and avoids
    # concurrent requests overwriting each other's output file.
    samples = waveforms.detach().squeeze().cpu().numpy()
    return SAMPLE_RATE, samples


# Build the Gradio interface.
iface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    # The function hands back (sample_rate, ndarray), which the Audio
    # component accepts as output.
    outputs=gr.Audio(label="Output Audio", type="numpy"),
    title="TTS Demo",
    description="Enter text to synthesize speech.",
)

if __name__ == "__main__":
    iface.launch()