Spaces:

miya3333
/

TTSDemo

Sleeping

File size: 1,518 Bytes

a152706
 
9fae8dd
978608b
 
dfbe110
a152706
dc4f75a
 
 
a152706
 
 
9fae8dd
dfbe110
 
e3f6ffa
 
 
dfbe110
 
 
9fae8dd
dc4f75a
dfbe110
dc4f75a
 
 
 
200e5f9
 
 
a152706
 
 
 
 
dc4f75a
a152706
 
 
 
9fae8dd

import gradio as gr
import torch
import soundfile as sf
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.utils.text_to_sequence import text_to_sequence

# Load the pretrained models once at import time: the HiFi-GAN vocoder
# first, then the Tacotron2 text-to-spectrogram model.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmpdir_vocoder",
)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)

# Inference function
def synthesize_speech(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    The text is turned into a mel spectrogram by the module-level
    ``tacotron2`` model, the spectrogram is converted to a waveform by the
    module-level ``hifi_gan`` vocoder, and the waveform is written to
    ``speech.wav`` in the current working directory.

    Args:
        text: The text to speak.

    Returns:
        The string ``"speech.wav"``, the path of the file that was written.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject empty input up front so the user gets a readable message in the
    # UI instead of a failure from inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Generate the mel spectrogram with Tacotron2. ``encode_text`` is the
    # documented single-utterance entry point of the pretrained interface;
    # it performs text cleaning, tokenization and batching itself, so no
    # manual sequence building or padding is needed here.
    mel_output, _mel_length, _alignment = tacotron2.encode_text(text)

    # Convert the mel spectrogram into a waveform with HiFi-GAN.
    waveforms = hifi_gan.decode_batch(mel_output)

    # Use the vocoder's configured sample rate when it exposes one; otherwise
    # fall back to 22050 Hz, the rate documented on the LJSpeech model card.
    sample_rate = getattr(hifi_gan.hparams, "sample_rate", 22050)

    # Save the audio as a .wav file.
    # NOTE: every call writes to the same fixed filename.
    sf.write("speech.wav", waveforms.squeeze().cpu().numpy(), samplerate=sample_rate)
    return "speech.wav"

# Build the Gradio interface: a multi-line text box as input and an audio
# component (fed with a file path) as output.
text_input = gr.Textbox(lines=5, label="Input Text")
audio_output = gr.Audio(label="Output Audio", type="filepath")

iface = gr.Interface(
    fn=synthesize_speech,
    inputs=text_input,
    outputs=audio_output,
    title="TTS Demo",
    description="Enter text to synthesize speech.",
)

# Start serving the demo.
iface.launch()