Flux9665 committed
Commit 401875a · 1 Parent(s): e1cc9e4

Implement changes to speak accent-agnostically and in many languages

InferenceInterfaces/ControllableInterface.py CHANGED
```diff
@@ -1,5 +1,4 @@
 import os
-
 import torch
 from huggingface_hub import hf_hub_download
 
@@ -30,88 +29,62 @@ class ControllableInterface:
     def read(self,
              prompt,
              reference_audio,
-             language,
-             accent,
              voice_seed,
              prosody_creativity,
-             duration_scaling_factor,
-             pause_duration_scaling_factor,
-             pitch_variance_scale,
-             energy_variance_scale,
-             emb_slider_1,
-             emb_slider_2,
-             emb_slider_3,
-             emb_slider_4,
-             emb_slider_5,
-             emb_slider_6,
              loudness_in_db
              ):
-        if self.current_language != language:
-            self.model.set_phonemizer_language(language)
-            print(f"switched phonemizer language to {language}")
-            self.current_language = language
-        if self.current_accent != accent:
-            self.model.set_accent_language(accent)
-            print(f"switched accent language to {accent}")
-            self.current_accent = accent
+        print(prompt + "\n\n")
+        self.model.set_accent_language(accent)
+
         if reference_audio is None:
-            self.wgan.set_latent(voice_seed)
-            controllability_vector = torch.tensor([emb_slider_1,
-                                                   emb_slider_2,
-                                                   emb_slider_3,
-                                                   emb_slider_4,
-                                                   emb_slider_5,
-                                                   emb_slider_6], dtype=torch.float32)
-            embedding = self.wgan.modify_embed(controllability_vector)
-            self.model.set_utterance_embedding(embedding=embedding)
+            if not voice_seed:
+                self.wgan.set_latent(7)
+                controllability_vector = torch.tensor([0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0,
+                                                       0.0], dtype=torch.float32)
+                embedding = self.wgan.modify_embed(controllability_vector)
+                self.model.set_utterance_embedding(embedding=embedding)
+            else:
+                wavs = list()
+                pitch, energy, durations = None, None, None
+                for i in range(10):
+                    self.wgan.set_latent(i)
+                    controllability_vector = torch.tensor([0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0,
+                                                           0.0], dtype=torch.float32)
+                    embedding = self.wgan.modify_embed(controllability_vector)
+                    self.model.set_utterance_embedding(embedding=embedding)
+                    wav, sr, fig, pitch, energy, durations = self.model(prompt,
+                                                                        input_is_phones=True,
+                                                                        duration_scaling_factor=1.0,
+                                                                        pitch_variance_scale=1.0,
+                                                                        energy_variance_scale=1.0,
+                                                                        pause_duration_scaling_factor=1.0,
+                                                                        return_plot_as_filepath=True,
+                                                                        prosody_creativity=prosody_creativity,
+                                                                        loudness_in_db=loudness_in_db,
+                                                                        pitch=pitch,
+                                                                        energy=energy,
+                                                                        durations=durations)
+                    wavs.append(wav)
+                wav = wavs.mean()
         else:
             self.model.set_utterance_embedding(reference_audio)
 
-        phones = self.model.text2phone.get_phone_string(prompt)
-        if len(phones) > 1800:
-            if language == "deu":
-                prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf."
-            elif language == "ell":
-                prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη."
-            elif language == "spa":
-                prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes."
-            elif language == "fin":
-                prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan."
-            elif language == "rus":
-                prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
-            elif language == "hun":
-                prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
-            elif language == "nld":
-                prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
-            elif language == "fra":
-                prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
-            elif language == 'pol':
-                prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
-            elif language == 'por':
-                prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
-            elif language == 'ita':
-                prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
-            elif language == 'cmn':
-                prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
-            elif language == 'vie':
-                prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
-            else:
-                prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
-            if self.current_language != "eng":
-                self.model.set_phonemizer_language("eng")
-                self.current_language = "eng"
-            if self.current_accent != "eng":
-                self.model.set_accent_language("eng")
-                self.current_accent = "eng"
-
-        print(prompt + "\n\n")
-        wav, sr, fig = self.model(prompt,
-                                  input_is_phones=False,
-                                  duration_scaling_factor=duration_scaling_factor,
-                                  pitch_variance_scale=pitch_variance_scale,
-                                  energy_variance_scale=energy_variance_scale,
-                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
-                                  return_plot_as_filepath=True,
-                                  prosody_creativity=prosody_creativity,
-                                  loudness_in_db=loudness_in_db)
+        if not voice_seed and reference_audio is not None:
+            wav, sr, fig, pitch, energy, durations = self.model(prompt,
+                                                                input_is_phones=True,
+                                                                duration_scaling_factor=1.0,
+                                                                pitch_variance_scale=1.0,
+                                                                energy_variance_scale=1.0,
+                                                                pause_duration_scaling_factor=1.0,
+                                                                return_plot_as_filepath=True,
+                                                                prosody_creativity=prosody_creativity,
+                                                                loudness_in_db=loudness_in_db)
        return sr, wav, fig
```
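Taken together, the new `read` flow renders the same IPA prompt once per artificial voice and averages the results: the first pass predicts `pitch`, `energy` and `durations`, and every later pass reuses them, so all ten waveforms are time-aligned and a sample-wise mean is meaningful. Two caveats in the committed version: `accent` is referenced but is no longer a parameter of `read`, and `wavs.mean()` is called on a plain Python list, which has no `.mean()`. Below is a minimal sketch of the averaging branch only, assuming the `model`/`wgan` interfaces shown in the diff; the function name and the stack-based mean are illustrative additions, not part of the commit.

```python
import torch

def synthesize_average_voice(model, wgan, prompt,
                             n_seeds=10, prosody_creativity=0.5, loudness_in_db=-24.0):
    # Sketch of the "Speak in many Voices" branch; `model` and `wgan`
    # stand in for self.model and self.wgan from ControllableInterface.
    wavs = []
    pitch, energy, durations = None, None, None  # filled in by the first pass
    neutral_controls = torch.zeros(6, dtype=torch.float32)  # all sliders at 0.0
    for seed in range(n_seeds):
        wgan.set_latent(seed)  # a different artificial speaker each iteration
        model.set_utterance_embedding(embedding=wgan.modify_embed(neutral_controls))
        wav, sr, fig, pitch, energy, durations = model(prompt,
                                                       input_is_phones=True,
                                                       return_plot_as_filepath=True,
                                                       prosody_creativity=prosody_creativity,
                                                       loudness_in_db=loudness_in_db,
                                                       pitch=pitch,
                                                       energy=energy,
                                                       durations=durations)
        wavs.append(torch.as_tensor(wav))
    # All renderings share pitch/energy/durations after the first pass, so they
    # are time-aligned; stack into a tensor before taking the sample-wise mean
    # (the plain list used in the diff would need this fix).
    return torch.stack(wavs).mean(dim=0), sr
```

Note also that the single-voice path (`not voice_seed` with no reference audio) sets an embedding but never invokes the model in this diff, so only the averaging and voice-cloning paths actually produce audio as committed.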
InferenceInterfaces/ToucanTTSInterface.py CHANGED
```diff
@@ -231,7 +231,7 @@ class ToucanTTSInterface(torch.nn.Module):
         if return_plot_as_filepath:
             plt.savefig("tmp.png")
             plt.close()
-            return wave, sr, "tmp.png"
+            return wave, sr, "tmp.png", pitch, energy, durations
         return wave, sr
 
     def read_to_file(self,
```
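The only change here is the wider return signature: when a plot is requested, the interface now also hands back its `pitch`, `energy` and `durations` predictions, which is what lets `ControllableInterface` pin the prosody across seeds. A hypothetical caller (`tts` stands in for a `ToucanTTSInterface` instance; the IPA string is illustrative):

```python
# First call: six return values instead of three when a plot is requested.
wav, sr, fig_path, pitch, energy, durations = tts("~həlˈəʊ wˈɜːld~#",
                                                  input_is_phones=True,
                                                  return_plot_as_filepath=True)
# The prosody predictions can be fed back in, so a second call with a
# different utterance embedding reuses the exact same timing, pitch and
# energy curves (the kwargs match the call in ControllableInterface above).
wav2, sr, fig_path, *_ = tts("~həlˈəʊ wˈɜːld~#",
                             input_is_phones=True,
                             return_plot_as_filepath=True,
                             pitch=pitch, energy=energy, durations=durations)
```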
Modules/GeneralLayers/Conformer.py CHANGED
```diff
@@ -130,10 +130,13 @@ class Conformer(torch.nn.Module):
         xs = self.art_embed_norm(xs)
 
         if lang_ids is not None:
-            lang_embs = self.language_embedding(lang_ids)
-            projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
-            projected_lang_embs = self.language_emb_norm(projected_lang_embs)
-            xs = xs + projected_lang_embs  # offset phoneme representation by language specific offset
+            proj_lang_embs_s = list()
+            for lang_id in [1448, 1709, 1250, 6356, 1809, 2450, 5540, 4447, 334, 1685, 2211, 2574, 5222]:
+                lang_embs = self.language_embedding(torch.LongTensor([lang_id]))
+                projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
+                projected_lang_embs = self.language_emb_norm(projected_lang_embs)
+                proj_lang_embs_s.append(projected_lang_embs)
+            xs = xs + proj_lang_embs_s.mean()  # offset phoneme representation by language specific offset
 
         xs = self.pos_enc(xs)
```
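Rather than offsetting the phoneme representations with the embedding of the one requested language, the Conformer now builds the offset from a fixed set of thirteen hard-coded language IDs, so it no longer carries language-specific cues; this is the accent-agnostic half of the commit. As with the waveform averaging, `.mean()` is called on a Python list here. A sketch of the tensor version (the helper name is mine; the layer names are from the diff):

```python
import torch

# The thirteen language IDs hard-coded in the diff above.
LANG_IDS = [1448, 1709, 1250, 6356, 1809, 2450, 5540, 4447, 334, 1685, 2211, 2574, 5222]

def averaged_language_offset(conformer):
    # `conformer` stands in for the Conformer module. Averaging the projected
    # embeddings of many languages yields one language-neutral offset for the
    # phoneme representations.
    projected = []
    for lang_id in LANG_IDS:
        emb = conformer.language_embedding(torch.LongTensor([lang_id]))
        proj = conformer.language_embedding_projection(emb).unsqueeze(-1).transpose(1, 2)
        projected.append(conformer.language_emb_norm(proj))
    # Stack before averaging: the plain list used in the diff has no .mean().
    return torch.stack(projected).mean(dim=0)

# usage inside forward():  xs = xs + averaged_language_offset(self)
```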
app.py CHANGED
```diff
@@ -33,17 +33,11 @@ class TTSWebUI:
                                                   embedding_gan_path=embedding_gan_path)
         self.iface = gr.Interface(fn=self.read,
                                   inputs=[gr.Textbox(lines=2,
-                                                     placeholder="write what you want the synthesis to read here...",
-                                                     value="What I cannot create, I do not understand.",
-                                                     label="Text input"),
-                                          gr.Dropdown(text_selection,
-                                                      type="value",
-                                                      value='English (eng)',
-                                                      label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                                                     placeholder="put in IPA symbols here...",
+                                                     value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
+                                                     label="IPA Input"),
                                           gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
-                                          gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"),
-                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=5, label="Random Seed for the artificial Voice"),
-                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"),
+                                          gr.Checkbox(value=True, label="Speak in many Voices"),
                                           gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                                           # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
@@ -59,32 +53,14 @@ class TTSWebUI:
 
     def read(self,
              prompt,
-             language,
              prosody_creativity,
-             duration_scaling_factor,
              voice_seed,
-             emb1,
              reference_audio,
-             # pitch_variance_scale,
-             # energy_variance_scale,
-             # emb2
              ):
         sr, wav, fig = self.controllable_ui.read(prompt,
                                                  reference_audio,
-                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
-                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
                                                  voice_seed,
                                                  prosody_creativity,
-                                                 duration_scaling_factor,
-                                                 1.,
-                                                 1.0,
-                                                 1.0,
-                                                 emb1,
-                                                 0.,
-                                                 0.,
-                                                 0.,
-                                                 0.,
-                                                 0.,
                                                  -24.)
         return (sr, float2pcm(wav)), fig
 
```
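With the language dropdown and the prosody sliders gone, the web UI wires four inputs straight through to `ControllableInterface.read`, and the checkbox value lands in the old `voice_seed` slot. A condensed sketch of the resulting wiring (`controllable_ui` and `float2pcm` exist in app.py; the output components are assumed):

```python
import gradio as gr

def read(prompt, prosody_creativity, many_voices, reference_audio):
    # The checkbox arrives where the seed slider used to be, so a ticked
    # "Speak in many Voices" box selects the ten-seed averaging branch.
    sr, wav, fig = controllable_ui.read(prompt,
                                        reference_audio,
                                        many_voices,
                                        prosody_creativity,
                                        -24.)
    return (sr, float2pcm(wav)), fig

iface = gr.Interface(fn=read,
                     inputs=[gr.Textbox(lines=2, label="IPA Input"),
                             gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                             gr.Checkbox(value=True, label="Speak in many Voices"),
                             gr.Audio(type="filepath", label="[OPTIONAL] Voice to Clone")],
                     outputs=[gr.Audio(), gr.Image()])
```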
requirements.txt CHANGED
```diff
@@ -1,6 +1,3 @@
-torch==2.2.0
-torchaudio==2.2.0
-torchvision==0.17.0
 torch_complex~=0.4.3
 epitran==1.24
 tqdm~=4.64.1
```