Implement changes to speak accent-agnostic and in many languages
Changed files:
- InferenceInterfaces/ControllableInterface.py +50 -77
- InferenceInterfaces/ToucanTTSInterface.py +1 -1
- Modules/GeneralLayers/Conformer.py +7 -4
- app.py +4 -28
- requirements.txt +0 -3
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-
 import torch
 from huggingface_hub import hf_hub_download
 
@@ -30,88 +29,62 @@ class ControllableInterface:
     def read(self,
              prompt,
              reference_audio,
-             language,
-             accent,
              voice_seed,
              prosody_creativity,
-             duration_scaling_factor,
-             pause_duration_scaling_factor,
-             pitch_variance_scale,
-             energy_variance_scale,
-             emb_slider_1,
-             emb_slider_2,
-             emb_slider_3,
-             emb_slider_4,
-             emb_slider_5,
-             emb_slider_6,
              loudness_in_db
              ):
-        if self.current_language != language:
-            self.model.set_phonemizer_language(language)
-            print(f"switched phonemizer language to {language}")
-            self.current_language = language
-        if self.current_accent != accent:
-            self.model.set_accent_language(accent)
-            print(f"switched accent language to {accent}")
-            self.current_accent = accent
+        print(prompt + "\n\n")
+        self.model.set_accent_language(accent)
+
         if reference_audio is None:
-            self.wgan.set_latent(voice_seed)
-            controllability_vector = torch.tensor([emb_slider_1,
-                                                   emb_slider_2,
-                                                   emb_slider_3,
-                                                   emb_slider_4,
-                                                   emb_slider_5,
-                                                   emb_slider_6], dtype=torch.float32)
-            embedding = self.wgan.modify_embed(controllability_vector)
-            self.model.set_utterance_embedding(embedding=embedding)
+            if not voice_seed:
+                self.wgan.set_latent(7)
+                controllability_vector = torch.tensor([0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0], dtype=torch.float32)
+                embedding = self.wgan.modify_embed(controllability_vector)
+                self.model.set_utterance_embedding(embedding=embedding)
+            else:
+                wavs = list()
+                pitch, energy, durations = None, None, None
+                for i in range(10):
+                    self.wgan.set_latent(i)
+                    controllability_vector = torch.tensor([0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0], dtype=torch.float32)
+                    embedding = self.wgan.modify_embed(controllability_vector)
+                    self.model.set_utterance_embedding(embedding=embedding)
+                    wav, sr, fig, pitch, energy, durations = self.model(prompt,
+                                                                        input_is_phones=True,
+                                                                        duration_scaling_factor=1.0,
+                                                                        pitch_variance_scale=1.0,
+                                                                        energy_variance_scale=1.0,
+                                                                        pause_duration_scaling_factor=1.0,
+                                                                        return_plot_as_filepath=True,
+                                                                        prosody_creativity=prosody_creativity,
+                                                                        loudness_in_db=loudness_in_db,
+                                                                        pitch=pitch,
+                                                                        energy=energy,
+                                                                        durations=durations)
+                    wavs.append(wav)
+                wav = wavs.mean()
         else:
             self.model.set_utterance_embedding(reference_audio)
 
-        [… not recoverable from this view: the over-length check and its first few language-specific messages …]
-            elif language == "rus":
-                prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
-            elif language == "hun":
-                prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
-            elif language == "nld":
-                prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
-            elif language == "fra":
-                prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
-            elif language == 'pol':
-                prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
-            elif language == 'por':
-                prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
-            elif language == 'ita':
-                prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
-            elif language == 'cmn':
-                prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
-            elif language == 'vie':
-                prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
-            else:
-                prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
-            if self.current_language != "eng":
-                self.model.set_phonemizer_language("eng")
-                self.current_language = "eng"
-            if self.current_accent != "eng":
-                self.model.set_accent_language("eng")
-                self.current_accent = "eng"
-
-        print(prompt + "\n\n")
-        wav, sr, fig = self.model(prompt,
-                                  input_is_phones=False,
-                                  duration_scaling_factor=duration_scaling_factor,
-                                  pitch_variance_scale=pitch_variance_scale,
-                                  energy_variance_scale=energy_variance_scale,
-                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
-                                  return_plot_as_filepath=True,
-                                  prosody_creativity=prosody_creativity,
-                                  loudness_in_db=loudness_in_db)
+        if not voice_seed and reference_audio is not None:
+            wav, sr, fig, pitch, energy, durations = self.model(prompt,
+                                                                input_is_phones=True,
+                                                                duration_scaling_factor=1.0,
+                                                                pitch_variance_scale=1.0,
+                                                                energy_variance_scale=1.0,
+                                                                pause_duration_scaling_factor=1.0,
+                                                                return_plot_as_filepath=True,
+                                                                prosody_creativity=prosody_creativity,
+                                                                loudness_in_db=loudness_in_db)
         return sr, wav, fig
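Two details in the new read() are worth noting. wavs is a plain Python list, which has no .mean() method, and accent is no longer a parameter of read(), so self.model.set_accent_language(accent) would need a fixed code such as "eng" or some other source for that value. A minimal sketch of the averaging step, assuming the ten renditions come back as equal-length 1-D torch tensors (plausible here, since every pass after the first reuses the pitch, energy and durations returned by the first pass):

    import torch

    def average_renditions(wavs):
        # wavs: list of equal-length 1-D waveform tensors; the lengths match
        # because all passes share the durations predicted in the first pass
        stacked = torch.stack(wavs)  # shape: (num_voices, num_samples)
        return stacked.mean(dim=0)   # average across voices, keep samples

    # inside read(), in place of `wav = wavs.mean()`:
    # wav = average_renditions(wavs)

If the model returns NumPy arrays instead, np.stack(wavs).mean(axis=0) is the equivalent.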
InferenceInterfaces/ToucanTTSInterface.py
CHANGED
@@ -231,7 +231,7 @@ class ToucanTTSInterface(torch.nn.Module):
         if return_plot_as_filepath:
             plt.savefig("tmp.png")
             plt.close()
-            return wave, sr, "tmp.png"
+            return wave, sr, "tmp.png", pitch, energy, durations
         return wave, sr
 
     def read_to_file(self,
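Callers that unpack three values while passing return_plot_as_filepath=True will now raise a ValueError, since that branch returns a 6-tuple. A hypothetical caller-side guard, assuming a ToucanTTSInterface instance named tts:

    result = tts(prompt, return_plot_as_filepath=True)
    if len(result) == 6:  # new contract: wave, sr, path, pitch, energy, durations
        wave, sr, fig_path, pitch, energy, durations = result
    else:                 # old 3-tuple contract from before this commit
        wave, sr, fig_path = result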
Modules/GeneralLayers/Conformer.py
CHANGED
@@ -130,10 +130,13 @@ class Conformer(torch.nn.Module):
         xs = self.art_embed_norm(xs)
 
         if lang_ids is not None:
-            lang_embs = self.language_embedding(lang_ids)
-            projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
-            projected_lang_embs = self.language_emb_norm(projected_lang_embs)
-            xs = xs + projected_lang_embs  # offset phoneme representation by language specific offset
+            proj_lang_embs_s = list()
+            for lang_id in [1448, 1709, 1250, 6356, 1809, 2450, 5540, 4447, 334, 1685, 2211, 2574, 5222]:
+                lang_embs = self.language_embedding(torch.LongTensor([lang_id]))
+                projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
+                projected_lang_embs = self.language_emb_norm(projected_lang_embs)
+                proj_lang_embs_s.append(projected_lang_embs)
+            xs = xs + proj_lang_embs_s.mean()  # offset phoneme representation by language specific offset
 
         xs = self.pos_enc(xs)
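The same list pitfall appears here: proj_lang_embs_s is a plain Python list, so .mean() would raise an AttributeError. The intent is to average the thirteen projected language offsets into a single language-agnostic offset, which a stack-then-mean accomplishes. A sketch, assuming each projected embedding has shape (1, 1, attention_dim):

    # average the per-language offsets into a single offset tensor
    avg_offset = torch.stack(proj_lang_embs_s, dim=0).mean(dim=0)
    xs = xs + avg_offset  # broadcasts over the time axis like the single offset did before

Because the thirteen language IDs are hard-coded, this averaged offset could also be computed once and cached instead of being rebuilt on every forward pass; on a GPU deployment the torch.LongTensor([lang_id]) lookups would additionally need .to(xs.device).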
app.py
CHANGED
@@ -33,17 +33,11 @@ class TTSWebUI:
                                            embedding_gan_path=embedding_gan_path)
         self.iface = gr.Interface(fn=self.read,
                                   inputs=[gr.Textbox(lines=2,
-                                                     placeholder="[…]",
-                                                     value="[…]",
-                                                     label="[…]"),
-                                          gr.Dropdown(text_selection,
-                                                      type="value",
-                                                      value='English (eng)',
-                                                      label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                                                     placeholder="put in IPA symbols here...",
+                                                     value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
+                                                     label="IPA Input"),
                                           gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
-                                          gr.[…],
-                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=5, label="Random Seed for the artificial Voice"),
-                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"),
+                                          gr.Checkbox(value=True, label="Speak in many Voices"),
                                           gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
@@ -59,32 +53,14 @@
 
     def read(self,
              prompt,
-             language,
              prosody_creativity,
-             duration_scaling_factor,
              voice_seed,
-             emb1,
              reference_audio,
-             # pitch_variance_scale,
-             # energy_variance_scale,
-             # emb2
              ):
         sr, wav, fig = self.controllable_ui.read(prompt,
                                                  reference_audio,
-                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
-                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
                                                  voice_seed,
                                                  prosody_creativity,
-                                                 duration_scaling_factor,
-                                                 1.,
-                                                 1.0,
-                                                 1.0,
-                                                 emb1,
-                                                 0.,
-                                                 0.,
-                                                 0.,
-                                                 0.,
-                                                 0.,
                                                  -24.)
         return (sr, float2pcm(wav)), fig
 
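The remaining Gradio inputs now map positionally onto the slimmed-down read(prompt, prosody_creativity, voice_seed, reference_audio), with the language/accent dropdown gone and the seed slider replaced by the checkbox. A hypothetical direct call, assuming a constructed TTSWebUI-like object named ui:

    (sr, pcm), fig = ui.read("~həlˈoʊ wˈɜːld.~#",  # IPA prompt; ~ marks a pause
                             0.5,                   # prosody creativity
                             True,                  # "Speak in many Voices" checkbox
                             None)                  # no reference audio -> artificial voices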
requirements.txt
CHANGED
@@ -1,6 +1,3 @@
-torch==2.2.0
-torchaudio==2.2.0
-torchvision==0.17.0
 torch_complex~=0.4.3
 epitran==1.24
 tqdm~=4.64.1