miya3333 commited on
Commit
200e5f9
·
verified ·
1 Parent(s): 22b35db

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +52 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import torch
 
3
  from speechbrain.inference.TTS import Tacotron2
4
  from speechbrain.inference.vocoders import HIFIGAN
5
 
@@ -9,16 +10,20 @@ tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech",
9
 
10
  # 推論関数の定義
11
  def synthesize_speech(text):
 
 
 
 
 
12
  # Tacotron2でmel spectrogramを生成
13
- # テキストを直接入力として、LongTensorでラップする
14
- mel_output, mel_length, alignment = tacotron2.encode_batch([text])
15
 
16
  # HiFi-GANでmel spectrogramから音声を生成
17
  waveforms = hifi_gan.decode_batch(mel_output)
18
 
19
- # torch tensorをwavfileとして保存
20
- torch.save(waveforms, "speech.pt")
21
- return "speech.pt"
22
 
23
  # Gradioインターフェースの作成
24
  iface = gr.Interface(
@@ -29,4 +34,45 @@ iface = gr.Interface(
29
  description="Enter text to synthesize speech."
30
  )
31
 
32
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
+ import soundfile as sf
4
  from speechbrain.inference.TTS import Tacotron2
5
  from speechbrain.inference.vocoders import HIFIGAN
6
 
 
10
 
11
  # 推論関数の定義
12
def synthesize_speech(text):
    """Synthesize speech from input text and save it as a WAV file.

    Args:
        text: Input text to synthesize (a single string).

    Returns:
        str: Path to the generated audio file ("speech.wav"), suitable
        for a Gradio ``Audio(type="filepath")`` output.
    """
    # Generate the mel spectrogram with Tacotron2.
    # NOTE(fix): SpeechBrain's Tacotron2 inference class has no public
    # `tokenizer` attribute — `encode_batch` takes a list of raw text
    # strings and performs tokenization internally, so we pass [text]
    # directly instead of tokenizing manually.
    mel_output, mel_length, alignment = tacotron2.encode_batch([text])

    # Generate the waveform from the mel spectrogram with HiFi-GAN.
    waveforms = hifi_gan.decode_batch(mel_output)

    # Save the audio as a .wav file. The sample rate comes from the
    # vocoder's hyperparameters so it always matches the model output.
    sf.write(
        "speech.wav",
        waveforms.squeeze().cpu().numpy(),
        samplerate=hifi_gan.hparams.sample_rate,
    )
    return "speech.wav"
27
 
28
  # Gradioインターフェースの作成
29
  iface = gr.Interface(
 
34
  description="Enter text to synthesize speech."
35
  )
36
 
37
+ iface.launch()
38
+
39
+ # import gradio as gr
40
+ # import torch
41
+ # from speechbrain.inference.TTS import Tacotron2
42
+ # from speechbrain.inference.vocoders import HIFIGAN
43
+
44
+ # # モデルのロード
45
+ # hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
46
+ # tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
47
+
48
+ # # 推論関数の定義
49
+ # def synthesize_speech(text):
50
+ # # Tacotron2でmel spectrogramを生成
51
+ # # テキストを直接入力として、LongTensorでラップする
52
+ # mel_output, mel_length, alignment = tacotron2.encode_batch([text])
53
+
54
+ # # HiFi-GANでmel spectrogramから音声を生成
55
+ # waveforms = hifi_gan.decode_batch(mel_output)
56
+
57
+ # # torch tensorをwavfileとして保存
58
+ # torch.save(waveforms, "speech.pt")
59
+ # return "speech.pt"
60
+
61
+ # # Gradioインターフェースの作成
62
+ # iface = gr.Interface(
63
+ # fn=synthesize_speech,
64
+ # inputs=gr.Textbox(lines=5, label="Input Text"),
65
+ # outputs=gr.Audio(label="Output Audio", type="filepath"),
66
+ # title="TTS Demo",
67
+ # description="Enter text to synthesize speech."
68
+ # )
69
+
70
+ # iface.launch()
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  gradio
2
  speechbrain
3
  torch
 
 
1
  gradio
2
  speechbrain
3
  torch
4
+ soundfile