Spaces:

miya3333
/

TTSDemo

Running

miya3333 committed on Jan 3

Commit

9fae8dd

verified ·

1 Parent(s): 55ca9a4

Upload app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import gradio as gr
 import torch
 from speechbrain.inference.TTS import Tacotron2
 from speechbrain.inference.vocoders import HIFIGAN
-import soundfile as sf
 # Load the models
 hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
@@ -10,17 +11,18 @@ tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech",
 # Define the inference function
 def synthesize_speech(text):
     # Generate a mel spectrogram with Tacotron2
-    # Take the text directly as input and wrap it in a LongTensor
-    mel_output, mel_length, alignment = tacotron2.encode_batch([text])
     # Generate audio from the mel spectrogram with HiFi-GAN
     waveforms = hifi_gan.decode_batch(mel_output)
-    # # Save the torch tensor as a wav file
-    # torch.save(waveforms, "speech.pt")
-    # return "speech.pt"
     # Save the audio in .wav format
     sf.write("speech.wav", waveforms.squeeze().cpu().numpy(), samplerate=hifi_gan.hparams.sample_rate)
     return "speech.wav"
@@ -34,12 +36,4 @@ iface = gr.Interface(
     description="Enter text to synthesize speech."
 )
-iface.launch()

 import gradio as gr
 import torch
+import soundfile as sf
 from speechbrain.inference.TTS import Tacotron2
 from speechbrain.inference.vocoders import HIFIGAN
+from speechbrain.dataio.dataio import read_audio
 # Load the models
 hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
 # Define the inference function: synthesize `text`, write the audio to "speech.wav", and return that path
 def synthesize_speech(text):
+    # Convert the text to token IDs (NOTE(review): assumes tacotron2.hparams exposes `tokenize` - confirm)
+    text = text.lower()
+    tokenized = tacotron2.hparams.tokenize(text, phonemize=False)
+    # Convert the token IDs to a Long tensor (NOTE(review): the pre-commit code passed [text] to encode_batch - confirm it accepts a token tensor)
+    tokens = torch.LongTensor(tokenized)
     # Generate a mel spectrogram with Tacotron2
+    mel_output, mel_length, alignment = tacotron2.encode_batch(tokens)
     # Generate audio from the mel spectrogram with HiFi-GAN
     waveforms = hifi_gan.decode_batch(mel_output)
     # Save the audio in .wav format
     sf.write("speech.wav", waveforms.squeeze().cpu().numpy(), samplerate=hifi_gan.hparams.sample_rate)
     return "speech.wav"
     description="Enter text to synthesize speech."
 )
+iface.launch()