miya3333 committed on
Commit
9fae8dd
·
verified ·
1 Parent(s): 55ca9a4

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -16
app.py CHANGED
@@ -1,8 +1,9 @@
1
  import gradio as gr
2
  import torch
 
3
  from speechbrain.inference.TTS import Tacotron2
4
  from speechbrain.inference.vocoders import HIFIGAN
5
- import soundfile as sf
6
 
7
  # モデルのロード
8
  hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
@@ -10,17 +11,18 @@ tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech",
10
 
11
  # 推論関数の定義
12
def synthesize_speech(text):
    """Turn *text* into speech and write it to a WAV file.

    Args:
        text: The input string to synthesize.

    Returns:
        The path of the generated audio file ("speech.wav").
    """
    # Tacotron2 takes a list of raw strings and produces mel spectrograms.
    mel_output, mel_length, alignment = tacotron2.encode_batch([text])

    # HiFi-GAN vocoder turns the mel spectrogram into a waveform.
    waveforms = hifi_gan.decode_batch(mel_output)

    # Squeeze out the batch dimension, move to CPU, and write as .wav.
    audio = waveforms.squeeze().cpu().numpy()
    sf.write("speech.wav", audio, samplerate=hifi_gan.hparams.sample_rate)
    return "speech.wav"
@@ -34,12 +36,4 @@ iface = gr.Interface(
34
  description="Enter text to synthesize speech."
35
  )
36
 
37
- iface.launch()
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
 
1
  import gradio as gr
2
  import torch
3
+ import soundfile as sf
4
  from speechbrain.inference.TTS import Tacotron2
5
  from speechbrain.inference.vocoders import HIFIGAN
6
+ from speechbrain.dataio.dataio import read_audio
7
 
8
  # モデルのロード
9
  hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
 
11
 
12
  # 推論関数の定義
13
def synthesize_speech(text):
    """Synthesize speech from text and save it as a WAV file.

    Args:
        text: Input text to synthesize.

    Returns:
        Path to the generated WAV file ("speech.wav").
    """
    # SpeechBrain's Tacotron2.encode_batch expects a *list of text strings*
    # and performs tokenization internally. The manual
    # `tacotron2.hparams.tokenize(...)` + bare 1-D LongTensor path is not
    # part of the documented interface (and lacks a batch dimension), so we
    # pass the lower-cased text directly.
    mel_output, mel_length, alignment = tacotron2.encode_batch([text.lower()])

    # HiFi-GAN vocoder: mel spectrogram -> waveform.
    waveforms = hifi_gan.decode_batch(mel_output)

    # Drop batch/channel dims and move to CPU so soundfile can write it.
    sf.write(
        "speech.wav",
        waveforms.squeeze().cpu().numpy(),
        samplerate=hifi_gan.hparams.sample_rate,
    )
    return "speech.wav"
 
36
  description="Enter text to synthesize speech."
37
  )
38
 
39
+ iface.launch()