miya3333 commited on
Commit
200e5f9
·
verified ·
1 Parent(s): 22b35db

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +52 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import torch
 
3
  from speechbrain.inference.TTS import Tacotron2
4
  from speechbrain.inference.vocoders import HIFIGAN
5
 
@@ -9,16 +10,20 @@ tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech",
9
 
10
  # 推論関数の定義
11
  def synthesize_speech(text):
 
 
 
 
 
12
  # Tacotron2でmel spectrogramを生成
13
- # テキストを直接入力として、LongTensorでラップする
14
- mel_output, mel_length, alignment = tacotron2.encode_batch([text])
15
 
16
  # HiFi-GANでmel spectrogramから音声を生成
17
  waveforms = hifi_gan.decode_batch(mel_output)
18
 
19
- # torch tensorをwavfileとして保存
20
- torch.save(waveforms, "speech.pt")
21
- return "speech.pt"
22
 
23
  # Gradioインターフェースの作成
24
  iface = gr.Interface(
@@ -29,4 +34,45 @@ iface = gr.Interface(
29
  description="Enter text to synthesize speech."
30
  )
31
 
32
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
+ import soundfile as sf
4
  from speechbrain.inference.TTS import Tacotron2
5
  from speechbrain.inference.vocoders import HIFIGAN
6
 
 
10
 
11
  # 推論関数の定義
12
def synthesize_speech(text):
    """Synthesize speech from input text and save it as a WAV file.

    Args:
        text: Input text to synthesize (a single string).

    Returns:
        str: Path to the generated audio file ("speech.wav"), suitable
        for a Gradio ``Audio(type="filepath")`` output.
    """
    # Generate the mel spectrogram with Tacotron2.
    # NOTE(fix): SpeechBrain's Tacotron2 inference class has no public
    # `tokenizer` attribute — `encode_batch` takes a list of raw text
    # strings and performs tokenization internally, so we pass [text]
    # directly instead of tokenizing manually.
    mel_output, mel_length, alignment = tacotron2.encode_batch([text])

    # Generate the waveform from the mel spectrogram with HiFi-GAN.
    waveforms = hifi_gan.decode_batch(mel_output)

    # Save the audio as a .wav file. The sample rate comes from the
    # vocoder's hyperparameters so it always matches the model output.
    sf.write(
        "speech.wav",
        waveforms.squeeze().cpu().numpy(),
        samplerate=hifi_gan.hparams.sample_rate,
    )
    return "speech.wav"
27
 
28
  # Gradioインターフェースの作成
29
  iface = gr.Interface(
 
34
  description="Enter text to synthesize speech."
35
  )
36
 
37
+ iface.launch()
38
+
39
+ # import gradio as gr
40
+ # import torch
41
+ # from speechbrain.inference.TTS import Tacotron2
42
+ # from speechbrain.inference.vocoders import HIFIGAN
43
+
44
+ # # モデルのロード
45
+ # hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
46
+ # tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
47
+
48
+ # # 推論関数の定義
49
+ # def synthesize_speech(text):
50
+ # # Tacotron2でmel spectrogramを生成
51
+ # # テキストを直接入力として、LongTensorでラップする
52
+ # mel_output, mel_length, alignment = tacotron2.encode_batch([text])
53
+
54
+ # # HiFi-GANでmel spectrogramから音声を生成
55
+ # waveforms = hifi_gan.decode_batch(mel_output)
56
+
57
+ # # torch tensorをwavfileとして保存
58
+ # torch.save(waveforms, "speech.pt")
59
+ # return "speech.pt"
60
+
61
+ # # Gradioインターフェースの作成
62
+ # iface = gr.Interface(
63
+ # fn=synthesize_speech,
64
+ # inputs=gr.Textbox(lines=5, label="Input Text"),
65
+ # outputs=gr.Audio(label="Output Audio", type="filepath"),
66
+ # title="TTS Demo",
67
+ # description="Enter text to synthesize speech."
68
+ # )
69
+
70
+ # iface.launch()
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  gradio
2
  speechbrain
3
  torch
 
 
1
  gradio
2
  speechbrain
3
  torch
4
+ soundfile