speed improvements and documentation
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -59,31 +59,31 @@ class ControllableInterface:
                                                            0.0], dtype=torch.float32)
             embedding = self.wgan.modify_embed(controllability_vector)
             self.model.set_utterance_embedding(embedding=embedding)
-            wav, sr, …
+            wav, sr, pitch, energy, durations = self.model(prompt,
+                                                           input_is_phones=True,
+                                                           duration_scaling_factor=1.0,
+                                                           pitch_variance_scale=1.0,
+                                                           energy_variance_scale=1.0,
+                                                           pause_duration_scaling_factor=1.0,
+                                                           return_plot_as_filepath=False,
+                                                           prosody_creativity=prosody_creativity,
+                                                           loudness_in_db=loudness_in_db,
+                                                           pitch=pitch,
+                                                           energy=energy,
+                                                           durations=durations)
             wavs.append(wav)
-            wav = sum(wavs)/len(wavs)
+            wav = sum(wavs) / len(wavs)
         else:
             self.model.set_utterance_embedding(reference_audio)

         if not voice_seed and reference_audio is not None:
-            wav, sr, …
-            return sr, wav
+            wav, sr, pitch, energy, durations = self.model(prompt,
+                                                           input_is_phones=True,
+                                                           duration_scaling_factor=1.0,
+                                                           pitch_variance_scale=1.0,
+                                                           energy_variance_scale=1.0,
+                                                           pause_duration_scaling_factor=1.0,
+                                                           return_plot_as_filepath=False,
+                                                           prosody_creativity=prosody_creativity,
+                                                           loudness_in_db=loudness_in_db)
+            return sr, wav
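
The seed branch above renders the same prompt once per seed embedding and then averages the waveforms sample by sample; feeding pitch, energy and durations back into every call keeps the renditions time-aligned, so the element-wise mean is well defined. Below is a minimal sketch of that pattern, assuming a hypothetical helper name and that passing None for the prosody values makes the first pass predict them; the keyword arguments are taken from the call in the diff.

# Sketch only: average_over_seeds is a hypothetical helper illustrating the
# pattern above. Reusing the first pass's pitch/energy/durations is assumed to
# keep every rendered waveform the same length, so the mean is valid.
import numpy as np

def average_over_seeds(model, prompt, seed_embeddings, prosody_creativity=0.1, loudness_in_db=-24.):
    wavs = []
    pitch = energy = durations = None  # assumed to mean "predict" on the first pass
    for embedding in seed_embeddings:
        model.set_utterance_embedding(embedding=embedding)
        wav, sr, pitch, energy, durations = model(prompt,
                                                  input_is_phones=True,
                                                  return_plot_as_filepath=False,
                                                  prosody_creativity=prosody_creativity,
                                                  loudness_in_db=loudness_in_db,
                                                  pitch=pitch,
                                                  energy=energy,
                                                  durations=durations)
        wavs.append(np.asarray(wav))
    return sr, sum(wavs) / len(wavs)  # element-wise mean, as in the diff
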
InferenceInterfaces/ToucanTTSInterface.py
CHANGED
@@ -232,7 +232,7 @@ class ToucanTTSInterface(torch.nn.Module):
             plt.savefig("tmp.png")
             plt.close()
             return wave, sr, "tmp.png", pitch, energy, durations
-        return wave, sr
+        return wave, sr, pitch, energy, durations

     def read_to_file(self,
                      text_list,
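
With the extra return values, callers now receive the predicted prosody on both paths, not only when a plot is requested. A usage sketch follows; the constructor argument and the plain-text input are assumptions, while the keyword arguments and return shapes come from the diff above.

# Sketch only; ToucanTTSInterface(device=...) and text input are assumptions.
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface

tts = ToucanTTSInterface(device="cpu")  # assumed constructor call

# Without the plot path, five values come back now.
wave, sr, pitch, energy, durations = tts("Hello world!",
                                         return_plot_as_filepath=False)

# The prosody can be fed back in to re-render with identical timing,
# which is what ControllableInterface.py relies on above.
wave2, sr, _, _, _ = tts("Hello world!",
                         return_plot_as_filepath=False,
                         pitch=pitch,
                         energy=energy,
                         durations=durations)
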
app.py
CHANGED
@@ -43,8 +43,7 @@ class TTSWebUI:
                                               # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
                                               # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
                                               ],
-                                     outputs=[gr.Audio(type="numpy", label="Speech"),
-                                              gr.Image(label="Visualization")],
+                                     outputs=[gr.Audio(type="numpy", label="Speech")],
                                      title=title,
                                      allow_flagging="never",
                                      description=article,
@@ -57,12 +56,12 @@ class TTSWebUI:
              voice_seed,
              reference_audio,
              ):
-        sr, wav …
+        sr, wav = self.controllable_ui.read(prompt,
                                             reference_audio,
                                             voice_seed,
                                             prosody_creativity,
                                             -24.)
-        return (sr, float2pcm(wav))
+        return (sr, float2pcm(wav))


 if __name__ == '__main__':
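
The web UI now returns a plain (sample_rate, samples) tuple for its single gr.Audio(type="numpy") output, with the visualization image dropped. float2pcm itself is not part of this diff; the snippet below is only a typical implementation of that conversion, turning float samples in [-1, 1] into 16-bit integers that Gradio accepts.

# Not from the diff: a common float-to-PCM conversion matching how the name
# float2pcm is used above. Assumes the waveform is a float array in [-1, 1].
import numpy as np

def float2pcm(sig, dtype="int16"):
    sig = np.asarray(sig, dtype=np.float32)
    info = np.iinfo(dtype)
    scale = 2 ** (info.bits - 1)  # 32768 for int16
    return (sig * scale).clip(info.min, info.max).astype(dtype)

# Usage as in the callback above: return (sr, float2pcm(wav))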