rakhlin committed
Commit 64a316c · 1 Parent(s): f016728

Upload folder using huggingface_hub

.ipynb_checkpoints/Coqui.ai-Copy1-checkpoint.ipynb ADDED
@@ -0,0 +1,386 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "14a326bb",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "\n",
16
+ "from TTS.api import TTS\n",
17
+ "from TTS.utils.manage import ModelManager"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 17,
23
+ "id": "2cfd77d1",
24
+ "metadata": {
25
+ "scrolled": false
26
+ },
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
33
+ " > Model's license - MIT\n",
34
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
35
+ " > Using model: freevc\n",
36
+ " > Loading pretrained speaker encoder model ...\n",
37
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
38
+ "Running on local URL: http://127.0.0.1:7873\n",
39
+ "\n",
40
+ "To create a public link, set `share=True` in `launch()`.\n"
41
+ ]
42
+ },
43
+ {
44
+ "data": {
45
+ "text/html": [
46
+ "<div><iframe src=\"http://127.0.0.1:7873/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
47
+ ],
48
+ "text/plain": [
49
+ "<IPython.core.display.HTML object>"
50
+ ]
51
+ },
52
+ "metadata": {},
53
+ "output_type": "display_data"
54
+ },
55
+ {
56
+ "data": {
57
+ "text/plain": []
58
+ },
59
+ "execution_count": 17,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ },
63
+ {
64
+ "name": "stdout",
65
+ "output_type": "stream",
66
+ "text": [
67
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
68
+ " > Model's license - apache 2.0\n",
69
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
70
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
71
+ " > Model's license - apache 2.0\n",
72
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
73
+ " > Using model: Tacotron2\n",
74
+ " > Setting up Audio Processor...\n",
75
+ " | > sample_rate:22050\n",
76
+ " | > resample:False\n",
77
+ " | > num_mels:80\n",
78
+ " | > log_func:np.log10\n",
79
+ " | > min_level_db:-100\n",
80
+ " | > frame_shift_ms:None\n",
81
+ " | > frame_length_ms:None\n",
82
+ " | > ref_level_db:20\n",
83
+ " | > fft_size:1024\n",
84
+ " | > power:1.5\n",
85
+ " | > preemphasis:0.0\n",
86
+ " | > griffin_lim_iters:60\n",
87
+ " | > signal_norm:True\n",
88
+ " | > symmetric_norm:True\n",
89
+ " | > mel_fmin:50.0\n",
90
+ " | > mel_fmax:7600.0\n",
91
+ " | > pitch_fmin:0.0\n",
92
+ " | > pitch_fmax:640.0\n",
93
+ " | > spec_gain:1.0\n",
94
+ " | > stft_pad_mode:reflect\n",
95
+ " | > max_norm:4.0\n",
96
+ " | > clip_norm:True\n",
97
+ " | > do_trim_silence:True\n",
98
+ " | > trim_db:60\n",
99
+ " | > do_sound_norm:False\n",
100
+ " | > do_amp_to_db_linear:True\n",
101
+ " | > do_amp_to_db_mel:True\n",
102
+ " | > do_rms_norm:False\n",
103
+ " | > db_level:None\n",
104
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
105
+ " | > base:10\n",
106
+ " | > hop_length:256\n",
107
+ " | > win_length:1024\n",
108
+ " > Model's reduction rate `r` is set to: 2\n",
109
+ " > Vocoder Model: univnet\n",
110
+ " > Setting up Audio Processor...\n",
111
+ " | > sample_rate:22050\n",
112
+ " | > resample:False\n",
113
+ " | > num_mels:80\n",
114
+ " | > log_func:np.log10\n",
115
+ " | > min_level_db:-100\n",
116
+ " | > frame_shift_ms:None\n",
117
+ " | > frame_length_ms:None\n",
118
+ " | > ref_level_db:20\n",
119
+ " | > fft_size:1024\n",
120
+ " | > power:1.5\n",
121
+ " | > preemphasis:0.0\n",
122
+ " | > griffin_lim_iters:60\n",
123
+ " | > signal_norm:True\n",
124
+ " | > symmetric_norm:True\n",
125
+ " | > mel_fmin:50.0\n",
126
+ " | > mel_fmax:7600.0\n",
127
+ " | > pitch_fmin:1.0\n",
128
+ " | > pitch_fmax:640.0\n",
129
+ " | > spec_gain:1.0\n",
130
+ " | > stft_pad_mode:reflect\n",
131
+ " | > max_norm:4.0\n",
132
+ " | > clip_norm:True\n",
133
+ " | > do_trim_silence:True\n",
134
+ " | > trim_db:60\n",
135
+ " | > do_sound_norm:False\n",
136
+ " | > do_amp_to_db_linear:True\n",
137
+ " | > do_amp_to_db_mel:True\n",
138
+ " | > do_rms_norm:False\n",
139
+ " | > db_level:None\n",
140
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
141
+ " | > base:10\n",
142
+ " | > hop_length:256\n",
143
+ " | > win_length:1024\n",
144
+ " > Generator Model: univnet_generator\n",
145
+ " > Discriminator Model: univnet_discriminator\n",
146
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
147
+ "language: \n",
148
+ "speaker: \n",
149
+ "voice cloning with the voice conversion model\n",
150
+ " > Text splitted to sentences.\n",
151
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
152
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
153
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
154
+ " > Processing time: 3.2769999504089355\n",
155
+ " > Real-time factor: 0.37722315040572285\n",
156
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
157
+ " > Model's license - MIT\n",
158
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
159
+ " > Using model: freevc\n",
160
+ " > Loading pretrained speaker encoder model ...\n",
161
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
162
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
163
+ "language: \n",
164
+ "speaker: \n",
165
+ "voice cloning with the voice conversion model\n",
166
+ " > Text splitted to sentences.\n",
167
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
168
+ " > Processing time: 2.8229997158050537\n",
169
+ " > Real-time factor: 0.3249621185552823\n",
170
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
171
+ "language: \n",
172
+ "speaker: \n",
173
+ "voice cloning with the voice conversion model\n",
174
+ " > Text splitted to sentences.\n",
175
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
176
+ " > Processing time: 2.858999729156494\n",
177
+ " > Real-time factor: 0.32910616452921765\n",
178
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
179
+ "language: \n",
180
+ "speaker: \n",
181
+ "voice cloning with the voice conversion model\n",
182
+ " > Text splitted to sentences.\n",
183
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
184
+ " > Processing time: 2.8419997692108154\n",
185
+ " > Real-time factor: 0.3271492592669274\n",
186
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
187
+ "language: \n",
188
+ "speaker: \n",
189
+ "voice cloning with the voice conversion model\n",
190
+ " > Text splitted to sentences.\n",
191
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
192
+ " > Processing time: 2.922999858856201\n",
193
+ " > Real-time factor: 0.3364733695695124\n"
194
+ ]
195
+ },
196
+ {
197
+ "name": "stderr",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "Traceback (most recent call last):\n",
201
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\routes.py\", line 434, in run_predict\n",
202
+ " event_data=event_data,\n",
203
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1324, in process_api\n",
204
+ " fn_index, inputs, iterator, request, event_id, event_data\n",
205
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1052, in call_function\n",
206
+ " fn, *processed_input, limiter=self.limiter\n",
207
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\to_thread.py\", line 34, in run_sync\n",
208
+ " func, *args, cancellable=cancellable, limiter=limiter\n",
209
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 877, in run_sync_in_worker_thread\n",
210
+ " return await future\n",
211
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
212
+ " result = context.run(func, *args)\n",
213
+ "TypeError: voice_clone() takes 2 positional arguments but 3 were given\n",
214
+ "Traceback (most recent call last):\n",
215
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\routes.py\", line 434, in run_predict\n",
216
+ " event_data=event_data,\n",
217
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1324, in process_api\n",
218
+ " fn_index, inputs, iterator, request, event_id, event_data\n",
219
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1052, in call_function\n",
220
+ " fn, *processed_input, limiter=self.limiter\n",
221
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\to_thread.py\", line 34, in run_sync\n",
222
+ " func, *args, cancellable=cancellable, limiter=limiter\n",
223
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 877, in run_sync_in_worker_thread\n",
224
+ " return await future\n",
225
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
226
+ " result = context.run(func, *args)\n",
227
+ "TypeError: voice_clone() takes 2 positional arguments but 3 were given\n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "title = \"\"\n",
233
+ "description = \"\"\"\"\"\"\n",
234
+ "article = \"\"\"\"\"\"\n",
235
+ "\n",
236
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
237
+ "GPU = device == \"cuda\"\n",
238
+ "INT16MAX = np.iinfo(np.int16).max\n",
239
+ "VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)\n",
240
+ "\n",
241
+ "\n",
242
+ "model_ids = ModelManager(verbose=False).list_models()\n",
243
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
244
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
245
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
246
+ "examples_pt = 'examples'\n",
247
+ "allowed_extentions = ['.mp3', '.wav']\n",
248
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
249
+ "verse = \"\"\"Mary had a little lamb,\n",
250
+ "Its fleece was white as snow.\n",
251
+ "Everywhere the child went,\n",
252
+ "The little lamb was sure to go.\"\"\"\n",
253
+ "\n",
254
+ "\n",
255
+ "def on_model_tts_select(model_name):\n",
256
+ " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
257
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
258
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
259
+ " language = languages[0]\n",
260
+ " speaker = speakers[0]\n",
261
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
262
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
263
+ "\n",
264
+ "\n",
265
+ "def on_voicedropdown(x):\n",
266
+ " return examples[x]\n",
267
+ "\n",
268
+ "\n",
269
+ "def voice_clone(source_wav, target_wav):\n",
270
+ " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
271
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
272
+ " if vc_model is None or source_wav is None or target_wav is None:\n",
273
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
274
+ "\n",
275
+ " speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
276
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
277
+ " return (sample_rate, speech)\n",
278
+ "\n",
279
+ "\n",
280
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
281
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
282
+ " return (16000, np.zeros(0).astype(np.int16))\n",
283
+ "\n",
284
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
285
+ " if tts_model.is_multi_speaker:\n",
286
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
287
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
288
+ "\n",
289
+ " language = None if language == '' else language\n",
290
+ " speaker = None if speaker == '' else speaker\n",
291
+ " if use_original_voice:\n",
292
+ " print('Using original voice')\n",
293
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
294
+ " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
295
+ " print('voice cloning with the tts')\n",
296
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
297
+ " else:\n",
298
+ " print('voice cloning with the voice conversion model')\n",
299
+ " speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
300
+ "\n",
301
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
302
+ " return (sample_rate, speech)\n",
303
+ "\n",
304
+ "\n",
305
+ "with gr.Blocks() as demo:\n",
306
+ " tts_model = gr.State(None)\n",
307
+ " vc_model = gr.State(None)\n",
308
+ " def activate(*args):\n",
309
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
310
+ " def deactivate(*args):\n",
311
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
312
+ "\n",
313
+ " \n",
314
+ " gr.Markdown(description)\n",
315
+ "\n",
316
+ " with gr.Row(equal_height=True):\n",
317
+ " with gr.Column(scale=5, min_width=50):\n",
318
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)\n",
319
+ " with gr.Column(scale=1, min_width=10):\n",
320
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
321
+ " with gr.Column(scale=1, min_width=10):\n",
322
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
323
+ " \n",
324
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
325
+ " gr.Markdown(\"Upload target voice...\")\n",
326
+ " with gr.Row(equal_height=True):\n",
327
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
328
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
329
+ "\n",
330
+ " with gr.Row(equal_height=True):\n",
331
+ " with gr.Column(scale=2):\n",
332
+ " with gr.Row(equal_height=True):\n",
333
+ " with gr.Column():\n",
334
+ " text_to_convert = gr.Textbox(verse)\n",
335
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
336
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
337
+ " with gr.Row(equal_height=True):\n",
338
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
339
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
340
+ " with gr.Row(equal_height=True):\n",
341
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
342
+ " \n",
343
+ " # actions\n",
344
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
345
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
346
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
347
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
348
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
349
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
350
+ "\n",
351
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
352
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
353
+ " outputs=speech).\\\n",
354
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
355
+ "\n",
356
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
357
+ " then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\\\n",
358
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
359
+ " \n",
360
+ " gr.HTML(article)\n",
361
+ "demo.launch(share=False)"
362
+ ]
363
+ }
364
+ ],
365
+ "metadata": {
366
+ "kernelspec": {
367
+ "display_name": "Python 3",
368
+ "language": "python",
369
+ "name": "python3"
370
+ },
371
+ "language_info": {
372
+ "codemirror_mode": {
373
+ "name": "ipython",
374
+ "version": 3
375
+ },
376
+ "file_extension": ".py",
377
+ "mimetype": "text/x-python",
378
+ "name": "python",
379
+ "nbconvert_exporter": "python",
380
+ "pygments_lexer": "ipython3",
381
+ "version": "3.7.9"
382
+ }
383
+ },
384
+ "nbformat": 4,
385
+ "nbformat_minor": 5
386
+ }
.ipynb_checkpoints/Coqui.ai-checkpoint.ipynb CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
- "id": "6065d339",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -19,8 +19,8 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 8,
23
- "id": "1e64dfd7",
24
  "metadata": {
25
  "scrolled": false
26
  },
@@ -29,7 +29,7 @@
29
  "name": "stdout",
30
  "output_type": "stream",
31
  "text": [
32
- "Running on local URL: http://127.0.0.1:7863\n",
33
  "\n",
34
  "To create a public link, set `share=True` in `launch()`.\n"
35
  ]
@@ -37,7 +37,7 @@
37
  {
38
  "data": {
39
  "text/html": [
40
- "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
41
  ],
42
  "text/plain": [
43
  "<IPython.core.display.HTML object>"
@@ -50,7 +50,7 @@
50
  "data": {
51
  "text/plain": []
52
  },
53
- "execution_count": 8,
54
  "metadata": {},
55
  "output_type": "execute_result"
56
  },
@@ -138,22 +138,27 @@
138
  " | > win_length:1024\n",
139
  " > Generator Model: univnet_generator\n",
140
  " > Discriminator Model: univnet_discriminator\n",
141
- "Passing through TTS model tts_models/en/ljspeech/tacotron2-DDC_ph\n",
142
  "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
143
  "language: \n",
144
  "speaker: \n",
145
- "Using original voice\n",
146
  " > Text splitted to sentences.\n",
147
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
148
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
149
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
150
- " > Processing time: 3.316999912261963\n",
151
- " > Real-time factor: 0.38182763983344614\n",
152
- "Loading TTS model from tts_models/en/ek1/tacotron2\n",
153
- " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
 
 
 
 
 
 
154
  " > Model's license - apache 2.0\n",
155
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
156
- " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
157
  " > Model's license - apache 2.0\n",
158
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
159
  " > Using model: Tacotron2\n",
@@ -162,19 +167,19 @@
162
  " | > resample:False\n",
163
  " | > num_mels:80\n",
164
  " | > log_func:np.log10\n",
165
- " | > min_level_db:-10\n",
166
  " | > frame_shift_ms:None\n",
167
  " | > frame_length_ms:None\n",
168
- " | > ref_level_db:0\n",
169
  " | > fft_size:1024\n",
170
- " | > power:1.8\n",
171
- " | > preemphasis:0.99\n",
172
  " | > griffin_lim_iters:60\n",
173
  " | > signal_norm:True\n",
174
  " | > symmetric_norm:True\n",
175
- " | > mel_fmin:0\n",
176
- " | > mel_fmax:8000.0\n",
177
- " | > pitch_fmin:1.0\n",
178
  " | > pitch_fmax:640.0\n",
179
  " | > spec_gain:1.0\n",
180
  " | > stft_pad_mode:reflect\n",
@@ -187,19 +192,58 @@
187
  " | > do_amp_to_db_mel:True\n",
188
  " | > do_rms_norm:False\n",
189
  " | > db_level:None\n",
190
- " | > stats_path:None\n",
191
  " | > base:10\n",
192
  " | > hop_length:256\n",
193
  " | > win_length:1024\n",
194
  " > Model's reduction rate `r` is set to: 2\n",
195
- " > Vocoder Model: wavegrad\n",
196
- "Passing through TTS model tts_models/en/ek1/tacotron2\n",
197
- "model: tts_models/en/ek1/tacotron2\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  "language: \n",
199
  "speaker: \n",
200
  "Using original voice\n",
201
  " > Text splitted to sentences.\n",
202
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n"
 
 
 
 
203
  ]
204
  }
205
  ],
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
+ "id": "e65fcd73",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 2,
23
+ "id": "f902a92c",
24
  "metadata": {
25
  "scrolled": false
26
  },
 
29
  "name": "stdout",
30
  "output_type": "stream",
31
  "text": [
32
+ "Running on local URL: http://127.0.0.1:7860\n",
33
  "\n",
34
  "To create a public link, set `share=True` in `launch()`.\n"
35
  ]
 
37
  {
38
  "data": {
39
  "text/html": [
40
+ "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
41
  ],
42
  "text/plain": [
43
  "<IPython.core.display.HTML object>"
 
50
  "data": {
51
  "text/plain": []
52
  },
53
+ "execution_count": 2,
54
  "metadata": {},
55
  "output_type": "execute_result"
56
  },
 
138
  " | > win_length:1024\n",
139
  " > Generator Model: univnet_generator\n",
140
  " > Discriminator Model: univnet_discriminator\n",
 
141
  "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
142
  "language: \n",
143
  "speaker: \n",
144
+ "voice cloning with the voice conversion model\n",
145
  " > Text splitted to sentences.\n",
146
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
147
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
148
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
149
+ " > Processing time: 3.4810001850128174\n",
150
+ " > Real-time factor: 0.400706095887971\n",
151
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
152
+ " > Model's license - MIT\n",
153
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
154
+ " > Using model: freevc\n",
155
+ " > Loading pretrained speaker encoder model ...\n",
156
+ "Loaded the voice encoder model on cpu in 0.09 seconds.\n",
157
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
159
  " > Model's license - apache 2.0\n",
160
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
161
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
162
  " > Model's license - apache 2.0\n",
163
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
164
  " > Using model: Tacotron2\n",
 
167
  " | > resample:False\n",
168
  " | > num_mels:80\n",
169
  " | > log_func:np.log10\n",
170
+ " | > min_level_db:-100\n",
171
  " | > frame_shift_ms:None\n",
172
  " | > frame_length_ms:None\n",
173
+ " | > ref_level_db:20\n",
174
  " | > fft_size:1024\n",
175
+ " | > power:1.5\n",
176
+ " | > preemphasis:0.0\n",
177
  " | > griffin_lim_iters:60\n",
178
  " | > signal_norm:True\n",
179
  " | > symmetric_norm:True\n",
180
+ " | > mel_fmin:50.0\n",
181
+ " | > mel_fmax:7600.0\n",
182
+ " | > pitch_fmin:0.0\n",
183
  " | > pitch_fmax:640.0\n",
184
  " | > spec_gain:1.0\n",
185
  " | > stft_pad_mode:reflect\n",
 
192
  " | > do_amp_to_db_mel:True\n",
193
  " | > do_rms_norm:False\n",
194
  " | > db_level:None\n",
195
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
196
  " | > base:10\n",
197
  " | > hop_length:256\n",
198
  " | > win_length:1024\n",
199
  " > Model's reduction rate `r` is set to: 2\n",
200
+ " > Vocoder Model: univnet\n",
201
+ " > Setting up Audio Processor...\n",
202
+ " | > sample_rate:22050\n",
203
+ " | > resample:False\n",
204
+ " | > num_mels:80\n",
205
+ " | > log_func:np.log10\n",
206
+ " | > min_level_db:-100\n",
207
+ " | > frame_shift_ms:None\n",
208
+ " | > frame_length_ms:None\n",
209
+ " | > ref_level_db:20\n",
210
+ " | > fft_size:1024\n",
211
+ " | > power:1.5\n",
212
+ " | > preemphasis:0.0\n",
213
+ " | > griffin_lim_iters:60\n",
214
+ " | > signal_norm:True\n",
215
+ " | > symmetric_norm:True\n",
216
+ " | > mel_fmin:50.0\n",
217
+ " | > mel_fmax:7600.0\n",
218
+ " | > pitch_fmin:1.0\n",
219
+ " | > pitch_fmax:640.0\n",
220
+ " | > spec_gain:1.0\n",
221
+ " | > stft_pad_mode:reflect\n",
222
+ " | > max_norm:4.0\n",
223
+ " | > clip_norm:True\n",
224
+ " | > do_trim_silence:True\n",
225
+ " | > trim_db:60\n",
226
+ " | > do_sound_norm:False\n",
227
+ " | > do_amp_to_db_linear:True\n",
228
+ " | > do_amp_to_db_mel:True\n",
229
+ " | > do_rms_norm:False\n",
230
+ " | > db_level:None\n",
231
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
232
+ " | > base:10\n",
233
+ " | > hop_length:256\n",
234
+ " | > win_length:1024\n",
235
+ " > Generator Model: univnet_generator\n",
236
+ " > Discriminator Model: univnet_discriminator\n",
237
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
238
  "language: \n",
239
  "speaker: \n",
240
  "Using original voice\n",
241
  " > Text splitted to sentences.\n",
242
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
243
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
244
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
245
+ " > Processing time: 2.931999921798706\n",
246
+ " > Real-time factor: 0.3375093879242267\n"
247
  ]
248
  }
249
  ],
Coqui.ai-Copy1.ipynb ADDED
@@ -0,0 +1,1139 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "80ca0f5c",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "import tempfile\n",
16
+ "\n",
17
+ "from TTS.api import TTS\n",
18
+ "from TTS.utils.manage import ModelManager"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 6,
24
+ "id": "4ad622cb",
25
+ "metadata": {
26
+ "scrolled": false
27
+ },
28
+ "outputs": [
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
34
+ " > Model's license - MIT\n",
35
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
36
+ " > Using model: freevc\n",
37
+ " > Loading pretrained speaker encoder model ...\n",
38
+ "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
39
+ "Running on local URL: http://127.0.0.1:7863\n",
40
+ "\n",
41
+ "To create a public link, set `share=True` in `launch()`.\n"
42
+ ]
43
+ },
44
+ {
45
+ "data": {
46
+ "text/html": [
47
+ "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
48
+ ],
49
+ "text/plain": [
50
+ "<IPython.core.display.HTML object>"
51
+ ]
52
+ },
53
+ "metadata": {},
54
+ "output_type": "display_data"
55
+ },
56
+ {
57
+ "data": {
58
+ "text/plain": []
59
+ },
60
+ "execution_count": 6,
61
+ "metadata": {},
62
+ "output_type": "execute_result"
63
+ },
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
69
+ " > Model's license - apache 2.0\n",
70
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
71
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
72
+ " > Model's license - apache 2.0\n",
73
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
74
+ " > Using model: Tacotron2\n",
75
+ " > Setting up Audio Processor...\n",
76
+ " | > sample_rate:22050\n",
77
+ " | > resample:False\n",
78
+ " | > num_mels:80\n",
79
+ " | > log_func:np.log10\n",
80
+ " | > min_level_db:-100\n",
81
+ " | > frame_shift_ms:None\n",
82
+ " | > frame_length_ms:None\n",
83
+ " | > ref_level_db:20\n",
84
+ " | > fft_size:1024\n",
85
+ " | > power:1.5\n",
86
+ " | > preemphasis:0.0\n",
87
+ " | > griffin_lim_iters:60\n",
88
+ " | > signal_norm:True\n",
89
+ " | > symmetric_norm:True\n",
90
+ " | > mel_fmin:50.0\n",
91
+ " | > mel_fmax:7600.0\n",
92
+ " | > pitch_fmin:0.0\n",
93
+ " | > pitch_fmax:640.0\n",
94
+ " | > spec_gain:1.0\n",
95
+ " | > stft_pad_mode:reflect\n",
96
+ " | > max_norm:4.0\n",
97
+ " | > clip_norm:True\n",
98
+ " | > do_trim_silence:True\n",
99
+ " | > trim_db:60\n",
100
+ " | > do_sound_norm:False\n",
101
+ " | > do_amp_to_db_linear:True\n",
102
+ " | > do_amp_to_db_mel:True\n",
103
+ " | > do_rms_norm:False\n",
104
+ " | > db_level:None\n",
105
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
106
+ " | > base:10\n",
107
+ " | > hop_length:256\n",
108
+ " | > win_length:1024\n",
109
+ " > Model's reduction rate `r` is set to: 2\n",
110
+ " > Vocoder Model: univnet\n",
111
+ " > Setting up Audio Processor...\n",
112
+ " | > sample_rate:22050\n",
113
+ " | > resample:False\n",
114
+ " | > num_mels:80\n",
115
+ " | > log_func:np.log10\n",
116
+ " | > min_level_db:-100\n",
117
+ " | > frame_shift_ms:None\n",
118
+ " | > frame_length_ms:None\n",
119
+ " | > ref_level_db:20\n",
120
+ " | > fft_size:1024\n",
121
+ " | > power:1.5\n",
122
+ " | > preemphasis:0.0\n",
123
+ " | > griffin_lim_iters:60\n",
124
+ " | > signal_norm:True\n",
125
+ " | > symmetric_norm:True\n",
126
+ " | > mel_fmin:50.0\n",
127
+ " | > mel_fmax:7600.0\n",
128
+ " | > pitch_fmin:1.0\n",
129
+ " | > pitch_fmax:640.0\n",
130
+ " | > spec_gain:1.0\n",
131
+ " | > stft_pad_mode:reflect\n",
132
+ " | > max_norm:4.0\n",
133
+ " | > clip_norm:True\n",
134
+ " | > do_trim_silence:True\n",
135
+ " | > trim_db:60\n",
136
+ " | > do_sound_norm:False\n",
137
+ " | > do_amp_to_db_linear:True\n",
138
+ " | > do_amp_to_db_mel:True\n",
139
+ " | > do_rms_norm:False\n",
140
+ " | > db_level:None\n",
141
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
142
+ " | > base:10\n",
143
+ " | > hop_length:256\n",
144
+ " | > win_length:1024\n",
145
+ " > Generator Model: univnet_generator\n",
146
+ " > Discriminator Model: univnet_discriminator\n",
147
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
148
+ "language: \n",
149
+ "speaker: \n",
150
+ "voice cloning with the voice conversion model\n",
151
+ " > Text splitted to sentences.\n",
152
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
153
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
154
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
155
+ " > Processing time: 3.3410003185272217\n",
156
+ " > Real-time factor: 0.38459038289093944\n",
157
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
+ "language: \n",
159
+ "speaker: \n",
160
+ "voice cloning with the voice conversion model\n",
161
+ " > Text splitted to sentences.\n",
162
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
163
+ " > Processing time: 2.9179999828338623\n",
164
+ " > Real-time factor: 0.3358978221135079\n",
165
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
166
+ " > Model's license - apache 2.0\n",
167
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
168
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
169
+ " > Model's license - apache 2.0\n",
170
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
171
+ " > Using model: Tacotron2\n",
172
+ " > Setting up Audio Processor...\n",
173
+ " | > sample_rate:22050\n",
174
+ " | > resample:False\n",
175
+ " | > num_mels:80\n",
176
+ " | > log_func:np.log10\n",
177
+ " | > min_level_db:-100\n",
178
+ " | > frame_shift_ms:None\n",
179
+ " | > frame_length_ms:None\n",
180
+ " | > ref_level_db:20\n",
181
+ " | > fft_size:1024\n",
182
+ " | > power:1.5\n",
183
+ " | > preemphasis:0.0\n",
184
+ " | > griffin_lim_iters:60\n",
185
+ " | > signal_norm:True\n",
186
+ " | > symmetric_norm:True\n",
187
+ " | > mel_fmin:50.0\n",
188
+ " | > mel_fmax:7600.0\n",
189
+ " | > pitch_fmin:0.0\n",
190
+ " | > pitch_fmax:640.0\n",
191
+ " | > spec_gain:1.0\n",
192
+ " | > stft_pad_mode:reflect\n",
193
+ " | > max_norm:4.0\n",
194
+ " | > clip_norm:True\n",
195
+ " | > do_trim_silence:True\n",
196
+ " | > trim_db:60\n",
197
+ " | > do_sound_norm:False\n",
198
+ " | > do_amp_to_db_linear:True\n",
199
+ " | > do_amp_to_db_mel:True\n",
200
+ " | > do_rms_norm:False\n",
201
+ " | > db_level:None\n",
202
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
203
+ " | > base:10\n",
204
+ " | > hop_length:256\n",
205
+ " | > win_length:1024\n",
206
+ " > Model's reduction rate `r` is set to: 2\n",
207
+ " > Vocoder Model: univnet\n",
208
+ " > Setting up Audio Processor...\n",
209
+ " | > sample_rate:22050\n",
210
+ " | > resample:False\n",
211
+ " | > num_mels:80\n",
212
+ " | > log_func:np.log10\n",
213
+ " | > min_level_db:-100\n",
214
+ " | > frame_shift_ms:None\n",
215
+ " | > frame_length_ms:None\n",
216
+ " | > ref_level_db:20\n",
217
+ " | > fft_size:1024\n",
218
+ " | > power:1.5\n",
219
+ " | > preemphasis:0.0\n",
220
+ " | > griffin_lim_iters:60\n",
221
+ " | > signal_norm:True\n",
222
+ " | > symmetric_norm:True\n",
223
+ " | > mel_fmin:50.0\n",
224
+ " | > mel_fmax:7600.0\n",
225
+ " | > pitch_fmin:1.0\n",
226
+ " | > pitch_fmax:640.0\n",
227
+ " | > spec_gain:1.0\n",
228
+ " | > stft_pad_mode:reflect\n",
229
+ " | > max_norm:4.0\n",
230
+ " | > clip_norm:True\n",
231
+ " | > do_trim_silence:True\n",
232
+ " | > trim_db:60\n",
233
+ " | > do_sound_norm:False\n",
234
+ " | > do_amp_to_db_linear:True\n",
235
+ " | > do_amp_to_db_mel:True\n",
236
+ " | > do_rms_norm:False\n",
237
+ " | > db_level:None\n",
238
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
239
+ " | > base:10\n",
240
+ " | > hop_length:256\n",
241
+ " | > win_length:1024\n",
242
+ " > Generator Model: univnet_generator\n",
243
+ " > Discriminator Model: univnet_discriminator\n",
244
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
245
+ "language: \n",
246
+ "speaker: \n",
247
+ "voice cloning with the voice conversion model\n",
248
+ " > Text splitted to sentences.\n",
249
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
250
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
251
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
252
+ " > Processing time: 3.021000385284424\n",
253
+ " > Real-time factor: 0.3477544400242312\n",
254
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
255
+ "language: \n",
256
+ "speaker: \n",
257
+ "voice cloning with the voice conversion model\n",
258
+ " > Text splitted to sentences.\n",
259
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
260
+ " > Processing time: 2.9099998474121094\n",
261
+ " > Real-time factor: 0.33497690776101013\n",
262
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
263
+ "language: \n",
264
+ "speaker: \n",
265
+ "voice cloning with the voice conversion model\n",
266
+ " > Text splitted to sentences.\n",
267
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
268
+ " > Processing time: 2.933000087738037\n",
269
+ " > Real-time factor: 0.33762451937136506\n",
270
+ " > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.\n",
271
+ " > Model's license - apache 2.0\n",
272
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
273
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
274
+ " > Model's license - apache 2.0\n",
275
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
276
+ " > Using model: Tacotron2\n",
277
+ " > Setting up Audio Processor...\n",
278
+ " | > sample_rate:22050\n",
279
+ " | > resample:False\n",
280
+ " | > num_mels:80\n",
281
+ " | > log_func:np.log\n",
282
+ " | > min_level_db:-100\n",
283
+ " | > frame_shift_ms:None\n",
284
+ " | > frame_length_ms:None\n",
285
+ " | > ref_level_db:20\n",
286
+ " | > fft_size:1024\n",
287
+ " | > power:1.5\n",
288
+ " | > preemphasis:0.0\n",
289
+ " | > griffin_lim_iters:60\n",
290
+ " | > signal_norm:False\n",
291
+ " | > symmetric_norm:True\n",
292
+ " | > mel_fmin:0\n",
293
+ " | > mel_fmax:8000.0\n",
294
+ " | > pitch_fmin:1.0\n",
295
+ " | > pitch_fmax:640.0\n",
296
+ " | > spec_gain:1.0\n",
297
+ " | > stft_pad_mode:reflect\n",
298
+ " | > max_norm:4.0\n",
299
+ " | > clip_norm:True\n",
300
+ " | > do_trim_silence:True\n",
301
+ " | > trim_db:60\n",
302
+ " | > do_sound_norm:False\n",
303
+ " | > do_amp_to_db_linear:True\n",
304
+ " | > do_amp_to_db_mel:True\n",
305
+ " | > do_rms_norm:False\n",
306
+ " | > db_level:None\n",
307
+ " | > stats_path:None\n",
308
+ " | > base:2.718281828459045\n",
309
+ " | > hop_length:256\n",
310
+ " | > win_length:1024\n",
311
+ " > Model's reduction rate `r` is set to: 1\n",
312
+ " > Vocoder Model: hifigan\n",
313
+ " > Setting up Audio Processor...\n",
314
+ " | > sample_rate:22050\n",
315
+ " | > resample:False\n",
316
+ " | > num_mels:80\n",
317
+ " | > log_func:np.log\n",
318
+ " | > min_level_db:-100\n",
319
+ " | > frame_shift_ms:None\n",
320
+ " | > frame_length_ms:None\n",
321
+ " | > ref_level_db:20\n",
322
+ " | > fft_size:1024\n",
323
+ " | > power:1.5\n",
324
+ " | > preemphasis:0.0\n",
325
+ " | > griffin_lim_iters:60\n",
326
+ " | > signal_norm:False\n",
327
+ " | > symmetric_norm:True\n",
328
+ " | > mel_fmin:0\n",
329
+ " | > mel_fmax:8000.0\n",
330
+ " | > pitch_fmin:1.0\n",
331
+ " | > pitch_fmax:640.0\n",
332
+ " | > spec_gain:1.0\n",
333
+ " | > stft_pad_mode:reflect\n",
334
+ " | > max_norm:4.0\n",
335
+ " | > clip_norm:True\n",
336
+ " | > do_trim_silence:False\n",
337
+ " | > trim_db:60\n",
338
+ " | > do_sound_norm:False\n",
339
+ " | > do_amp_to_db_linear:True\n",
340
+ " | > do_amp_to_db_mel:True\n",
341
+ " | > do_rms_norm:False\n",
342
+ " | > db_level:None\n",
343
+ " | > stats_path:None\n",
344
+ " | > base:2.718281828459045\n",
345
+ " | > hop_length:256\n",
346
+ " | > win_length:1024\n",
347
+ " > Generator Model: hifigan_generator\n",
348
+ " > Discriminator Model: hifigan_discriminator\n"
349
+ ]
350
+ },
351
+ {
352
+ "name": "stdout",
353
+ "output_type": "stream",
354
+ "text": [
355
+ "Removing weight norm...\n",
356
+ "model: tts_models/en/ljspeech/tacotron2-DDC\n",
357
+ "language: \n",
358
+ "speaker: \n",
359
+ "voice cloning with the voice conversion model\n",
360
+ " > Text splitted to sentences.\n",
361
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
362
+ " > Processing time: 4.28600001335144\n",
363
+ " > Real-time factor: 0.42371906516498953\n",
364
+ " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
365
+ " > Model's license - apache 2.0\n",
366
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
367
+ " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
368
+ " > Model's license - apache 2.0\n",
369
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
370
+ " > Using model: Tacotron2\n",
371
+ " > Setting up Audio Processor...\n",
372
+ " | > sample_rate:22050\n",
373
+ " | > resample:False\n",
374
+ " | > num_mels:80\n",
375
+ " | > log_func:np.log10\n",
376
+ " | > min_level_db:-10\n",
377
+ " | > frame_shift_ms:None\n",
378
+ " | > frame_length_ms:None\n",
379
+ " | > ref_level_db:0\n",
380
+ " | > fft_size:1024\n",
381
+ " | > power:1.8\n",
382
+ " | > preemphasis:0.99\n",
383
+ " | > griffin_lim_iters:60\n",
384
+ " | > signal_norm:True\n",
385
+ " | > symmetric_norm:True\n",
386
+ " | > mel_fmin:0\n",
387
+ " | > mel_fmax:8000.0\n",
388
+ " | > pitch_fmin:1.0\n",
389
+ " | > pitch_fmax:640.0\n",
390
+ " | > spec_gain:1.0\n",
391
+ " | > stft_pad_mode:reflect\n",
392
+ " | > max_norm:4.0\n",
393
+ " | > clip_norm:True\n",
394
+ " | > do_trim_silence:True\n",
395
+ " | > trim_db:60\n",
396
+ " | > do_sound_norm:False\n",
397
+ " | > do_amp_to_db_linear:True\n",
398
+ " | > do_amp_to_db_mel:True\n",
399
+ " | > do_rms_norm:False\n",
400
+ " | > db_level:None\n",
401
+ " | > stats_path:None\n",
402
+ " | > base:10\n",
403
+ " | > hop_length:256\n",
404
+ " | > win_length:1024\n",
405
+ " > Model's reduction rate `r` is set to: 2\n",
406
+ " > Vocoder Model: wavegrad\n",
407
+ "model: tts_models/en/ek1/tacotron2\n",
408
+ "language: \n",
409
+ "speaker: \n",
410
+ "voice cloning with the voice conversion model\n",
411
+ " > Text splitted to sentences.\n",
412
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
413
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
414
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
415
+ " > Processing time: 224.84099984169006\n",
416
+ " > Real-time factor: 29.51038122922182\n",
417
+ " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
418
+ " > Model's license - apache 2.0\n",
419
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
420
+ " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
421
+ " > Model's license - apache 2.0\n",
422
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
423
+ " > Using model: Tacotron2\n",
424
+ " > Setting up Audio Processor...\n",
425
+ " | > sample_rate:22050\n",
426
+ " | > resample:False\n",
427
+ " | > num_mels:80\n",
428
+ " | > log_func:np.log10\n",
429
+ " | > min_level_db:-10\n",
430
+ " | > frame_shift_ms:None\n",
431
+ " | > frame_length_ms:None\n",
432
+ " | > ref_level_db:0\n",
433
+ " | > fft_size:1024\n",
434
+ " | > power:1.8\n",
435
+ " | > preemphasis:0.99\n",
436
+ " | > griffin_lim_iters:60\n",
437
+ " | > signal_norm:True\n",
438
+ " | > symmetric_norm:True\n",
439
+ " | > mel_fmin:0\n",
440
+ " | > mel_fmax:8000.0\n",
441
+ " | > pitch_fmin:1.0\n",
442
+ " | > pitch_fmax:640.0\n",
443
+ " | > spec_gain:1.0\n",
444
+ " | > stft_pad_mode:reflect\n",
445
+ " | > max_norm:4.0\n",
446
+ " | > clip_norm:True\n",
447
+ " | > do_trim_silence:True\n",
448
+ " | > trim_db:60\n",
449
+ " | > do_sound_norm:False\n",
450
+ " | > do_amp_to_db_linear:True\n",
451
+ " | > do_amp_to_db_mel:True\n",
452
+ " | > do_rms_norm:False\n",
453
+ " | > db_level:None\n",
454
+ " | > stats_path:None\n",
455
+ " | > base:10\n",
456
+ " | > hop_length:256\n",
457
+ " | > win_length:1024\n",
458
+ " > Model's reduction rate `r` is set to: 2\n",
459
+ " > Vocoder Model: wavegrad\n",
460
+ "model: tts_models/en/ek1/tacotron2\n",
461
+ "language: \n",
462
+ "speaker: \n",
463
+ "voice cloning with the voice conversion model\n",
464
+ " > Text splitted to sentences.\n",
465
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
466
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
467
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
468
+ " > Processing time: 266.6489999294281\n",
469
+ " > Real-time factor: 34.99768124073744\n",
470
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
471
+ " > Model's license - apache 2.0\n",
472
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
473
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
474
+ " > Model's license - apache 2.0\n",
475
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
476
+ " > Using model: Tacotron2\n",
477
+ " > Setting up Audio Processor...\n",
478
+ " | > sample_rate:22050\n",
479
+ " | > resample:False\n",
480
+ " | > num_mels:80\n",
481
+ " | > log_func:np.log10\n",
482
+ " | > min_level_db:-100\n",
483
+ " | > frame_shift_ms:None\n",
484
+ " | > frame_length_ms:None\n",
485
+ " | > ref_level_db:20\n",
486
+ " | > fft_size:1024\n",
487
+ " | > power:1.5\n",
488
+ " | > preemphasis:0.0\n",
489
+ " | > griffin_lim_iters:60\n",
490
+ " | > signal_norm:True\n",
491
+ " | > symmetric_norm:True\n",
492
+ " | > mel_fmin:50.0\n",
493
+ " | > mel_fmax:7600.0\n",
494
+ " | > pitch_fmin:0.0\n",
495
+ " | > pitch_fmax:640.0\n",
496
+ " | > spec_gain:1.0\n",
497
+ " | > stft_pad_mode:reflect\n",
498
+ " | > max_norm:4.0\n",
499
+ " | > clip_norm:True\n",
500
+ " | > do_trim_silence:True\n",
501
+ " | > trim_db:60\n",
502
+ " | > do_sound_norm:False\n",
503
+ " | > do_amp_to_db_linear:True\n",
504
+ " | > do_amp_to_db_mel:True\n",
505
+ " | > do_rms_norm:False\n",
506
+ " | > db_level:None\n",
507
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
508
+ " | > base:10\n",
509
+ " | > hop_length:256\n",
510
+ " | > win_length:1024\n",
511
+ " > Model's reduction rate `r` is set to: 2\n",
512
+ " > Vocoder Model: univnet\n",
513
+ " > Setting up Audio Processor...\n",
514
+ " | > sample_rate:22050\n",
515
+ " | > resample:False\n",
516
+ " | > num_mels:80\n",
517
+ " | > log_func:np.log10\n",
518
+ " | > min_level_db:-100\n",
519
+ " | > frame_shift_ms:None\n",
520
+ " | > frame_length_ms:None\n",
521
+ " | > ref_level_db:20\n",
522
+ " | > fft_size:1024\n",
523
+ " | > power:1.5\n",
524
+ " | > preemphasis:0.0\n",
525
+ " | > griffin_lim_iters:60\n",
526
+ " | > signal_norm:True\n",
527
+ " | > symmetric_norm:True\n",
528
+ " | > mel_fmin:50.0\n",
529
+ " | > mel_fmax:7600.0\n",
530
+ " | > pitch_fmin:1.0\n",
531
+ " | > pitch_fmax:640.0\n",
532
+ " | > spec_gain:1.0\n",
533
+ " | > stft_pad_mode:reflect\n",
534
+ " | > max_norm:4.0\n",
535
+ " | > clip_norm:True\n",
536
+ " | > do_trim_silence:True\n",
537
+ " | > trim_db:60\n",
538
+ " | > do_sound_norm:False\n",
539
+ " | > do_amp_to_db_linear:True\n",
540
+ " | > do_amp_to_db_mel:True\n",
541
+ " | > do_rms_norm:False\n",
542
+ " | > db_level:None\n",
543
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
544
+ " | > base:10\n",
545
+ " | > hop_length:256\n",
546
+ " | > win_length:1024\n",
547
+ " > Generator Model: univnet_generator\n",
548
+ " > Discriminator Model: univnet_discriminator\n",
549
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
550
+ "language: \n",
551
+ "speaker: \n",
552
+ "voice cloning with the voice conversion model\n",
553
+ " > Text splitted to sentences.\n",
554
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
555
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
556
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
557
+ " > Processing time: 2.885999917984009\n",
558
+ " > Real-time factor: 0.3322142195933605\n",
559
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--speedy-speech\n",
560
+ " > Model's license - apache 2.0\n",
561
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
562
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
563
+ " > Model's license - apache 2.0\n",
564
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
565
+ " > Using model: speedy_speech\n",
566
+ " > Setting up Audio Processor...\n",
567
+ " | > sample_rate:22050\n",
568
+ " | > resample:False\n",
569
+ " | > num_mels:80\n",
570
+ " | > log_func:np.log\n",
571
+ " | > min_level_db:-100\n",
572
+ " | > frame_shift_ms:None\n",
573
+ " | > frame_length_ms:None\n",
574
+ " | > ref_level_db:20\n",
575
+ " | > fft_size:1024\n",
576
+ " | > power:1.5\n",
577
+ " | > preemphasis:0.0\n",
578
+ " | > griffin_lim_iters:60\n",
579
+ " | > signal_norm:False\n",
580
+ " | > symmetric_norm:True\n",
581
+ " | > mel_fmin:0\n",
582
+ " | > mel_fmax:8000.0\n",
583
+ " | > pitch_fmin:1.0\n",
584
+ " | > pitch_fmax:640.0\n",
585
+ " | > spec_gain:1.0\n",
586
+ " | > stft_pad_mode:reflect\n",
587
+ " | > max_norm:4.0\n",
588
+ " | > clip_norm:True\n",
589
+ " | > do_trim_silence:True\n",
590
+ " | > trim_db:60\n",
591
+ " | > do_sound_norm:False\n",
592
+ " | > do_amp_to_db_linear:True\n",
593
+ " | > do_amp_to_db_mel:True\n",
594
+ " | > do_rms_norm:False\n",
595
+ " | > db_level:None\n",
596
+ " | > stats_path:None\n",
597
+ " | > base:2.718281828459045\n",
598
+ " | > hop_length:256\n",
599
+ " | > win_length:1024\n",
600
+ " > Vocoder Model: hifigan\n",
601
+ " > Setting up Audio Processor...\n",
602
+ " | > sample_rate:22050\n",
603
+ " | > resample:False\n",
604
+ " | > num_mels:80\n",
605
+ " | > log_func:np.log\n",
606
+ " | > min_level_db:-100\n",
607
+ " | > frame_shift_ms:None\n",
608
+ " | > frame_length_ms:None\n",
609
+ " | > ref_level_db:20\n",
610
+ " | > fft_size:1024\n",
611
+ " | > power:1.5\n",
612
+ " | > preemphasis:0.0\n",
613
+ " | > griffin_lim_iters:60\n",
614
+ " | > signal_norm:False\n",
615
+ " | > symmetric_norm:True\n",
616
+ " | > mel_fmin:0\n",
617
+ " | > mel_fmax:8000.0\n",
618
+ " | > pitch_fmin:1.0\n",
619
+ " | > pitch_fmax:640.0\n",
620
+ " | > spec_gain:1.0\n",
621
+ " | > stft_pad_mode:reflect\n",
622
+ " | > max_norm:4.0\n",
623
+ " | > clip_norm:True\n",
624
+ " | > do_trim_silence:False\n",
625
+ " | > trim_db:60\n",
626
+ " | > do_sound_norm:False\n",
627
+ " | > do_amp_to_db_linear:True\n",
628
+ " | > do_amp_to_db_mel:True\n",
629
+ " | > do_rms_norm:False\n",
630
+ " | > db_level:None\n",
631
+ " | > stats_path:None\n",
632
+ " | > base:2.718281828459045\n",
633
+ " | > hop_length:256\n",
634
+ " | > win_length:1024\n",
635
+ " > Generator Model: hifigan_generator\n",
636
+ " > Discriminator Model: hifigan_discriminator\n",
637
+ "Removing weight norm...\n"
638
+ ]
639
+ },
640
+ {
641
+ "name": "stdout",
642
+ "output_type": "stream",
643
+ "text": [
644
+ "model: tts_models/en/ljspeech/speedy-speech\n",
645
+ "language: \n",
646
+ "speaker: \n",
647
+ "Using original voice\n",
648
+ " > Text splitted to sentences.\n",
649
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
650
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
651
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
652
+ " > Processing time: 0.9679999351501465\n",
653
+ " > Real-time factor: 0.11673301633083617\n",
654
+ "model: tts_models/en/ljspeech/speedy-speech\n",
655
+ "language: \n",
656
+ "speaker: \n",
657
+ "voice cloning with the voice conversion model\n",
658
+ " > Text splitted to sentences.\n",
659
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
660
+ " > Processing time: 0.9630000591278076\n",
661
+ " > Real-time factor: 0.11613007144605443\n",
662
+ " > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.\n",
663
+ " > Model's license - MPL\n",
664
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
665
+ " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n",
666
+ " > Model's license - MPL\n",
667
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
668
+ " > Using model: Tacotron2\n",
669
+ " > Setting up Audio Processor...\n",
670
+ " | > sample_rate:22050\n",
671
+ " | > resample:False\n",
672
+ " | > num_mels:80\n",
673
+ " | > log_func:np.log10\n",
674
+ " | > min_level_db:-100\n",
675
+ " | > frame_shift_ms:None\n",
676
+ " | > frame_length_ms:None\n",
677
+ " | > ref_level_db:20\n",
678
+ " | > fft_size:1024\n",
679
+ " | > power:1.5\n",
680
+ " | > preemphasis:0.0\n",
681
+ " | > griffin_lim_iters:60\n",
682
+ " | > signal_norm:True\n",
683
+ " | > symmetric_norm:True\n",
684
+ " | > mel_fmin:50.0\n",
685
+ " | > mel_fmax:7600.0\n",
686
+ " | > pitch_fmin:0.0\n",
687
+ " | > pitch_fmax:640.0\n",
688
+ " | > spec_gain:1.0\n",
689
+ " | > stft_pad_mode:reflect\n",
690
+ " | > max_norm:4.0\n",
691
+ " | > clip_norm:True\n",
692
+ " | > do_trim_silence:True\n",
693
+ " | > trim_db:60\n",
694
+ " | > do_sound_norm:False\n",
695
+ " | > do_amp_to_db_linear:True\n",
696
+ " | > do_amp_to_db_mel:True\n",
697
+ " | > do_rms_norm:False\n",
698
+ " | > db_level:None\n",
699
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DCA\\scale_stats.npy\n",
700
+ " | > base:10\n",
701
+ " | > hop_length:256\n",
702
+ " | > win_length:1024\n",
703
+ " > Model's reduction rate `r` is set to: 2\n",
704
+ " > Vocoder Model: multiband_melgan\n",
705
+ " > Setting up Audio Processor...\n",
706
+ " | > sample_rate:22050\n",
707
+ " | > resample:False\n",
708
+ " | > num_mels:80\n",
709
+ " | > log_func:np.log10\n",
710
+ " | > min_level_db:-100\n",
711
+ " | > frame_shift_ms:None\n",
712
+ " | > frame_length_ms:None\n",
713
+ " | > ref_level_db:0\n",
714
+ " | > fft_size:1024\n",
715
+ " | > power:1.5\n",
716
+ " | > preemphasis:0.0\n",
717
+ " | > griffin_lim_iters:60\n",
718
+ " | > signal_norm:True\n",
719
+ " | > symmetric_norm:True\n",
720
+ " | > mel_fmin:50.0\n",
721
+ " | > mel_fmax:7600.0\n",
722
+ " | > pitch_fmin:0.0\n",
723
+ " | > pitch_fmax:640.0\n",
724
+ " | > spec_gain:1.0\n",
725
+ " | > stft_pad_mode:reflect\n",
726
+ " | > max_norm:4.0\n",
727
+ " | > clip_norm:True\n",
728
+ " | > do_trim_silence:True\n",
729
+ " | > trim_db:60\n",
730
+ " | > do_sound_norm:False\n",
731
+ " | > do_amp_to_db_linear:True\n",
732
+ " | > do_amp_to_db_mel:True\n",
733
+ " | > do_rms_norm:False\n",
734
+ " | > db_level:None\n",
735
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--multiband-melgan\\scale_stats.npy\n",
736
+ " | > base:10\n",
737
+ " | > hop_length:256\n",
738
+ " | > win_length:1024\n",
739
+ " > Generator Model: multiband_melgan_generator\n",
740
+ " > Discriminator Model: melgan_multiscale_discriminator\n",
741
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
742
+ "language: \n",
743
+ "speaker: \n",
744
+ "Using original voice\n",
745
+ " > Text splitted to sentences.\n",
746
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
747
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
748
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
749
+ " > Processing time: 2.067000150680542\n",
750
+ " > Real-time factor: 0.23295588670728015\n",
751
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
752
+ "language: \n",
753
+ "speaker: \n",
754
+ "voice cloning with the voice conversion model\n",
755
+ " > Text splitted to sentences.\n",
756
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
757
+ " > Processing time: 2.1570000648498535\n",
758
+ " > Real-time factor: 0.2430990934225715\n",
759
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
760
+ "language: \n",
761
+ "speaker: \n",
762
+ "voice cloning with the voice conversion model\n",
763
+ " > Text splitted to sentences.\n",
764
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
765
+ " > Processing time: 2.0920000076293945\n",
766
+ " > Real-time factor: 0.23577343069302087\n",
767
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--fast_pitch\n",
768
+ " > Model's license - apache 2.0\n",
769
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
770
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
771
+ " > Model's license - apache 2.0\n",
772
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
773
+ " > Using model: fast_pitch\n",
774
+ " > Setting up Audio Processor...\n",
775
+ " | > sample_rate:22050\n",
776
+ " | > resample:False\n",
777
+ " | > num_mels:80\n",
778
+ " | > log_func:np.log\n",
779
+ " | > min_level_db:-100\n",
780
+ " | > frame_shift_ms:None\n",
781
+ " | > frame_length_ms:None\n",
782
+ " | > ref_level_db:20\n",
783
+ " | > fft_size:1024\n",
784
+ " | > power:1.5\n",
785
+ " | > preemphasis:0.0\n",
786
+ " | > griffin_lim_iters:60\n",
787
+ " | > signal_norm:False\n",
788
+ " | > symmetric_norm:True\n",
789
+ " | > mel_fmin:0\n",
790
+ " | > mel_fmax:8000.0\n",
791
+ " | > pitch_fmin:1.0\n",
792
+ " | > pitch_fmax:640.0\n",
793
+ " | > spec_gain:1.0\n",
794
+ " | > stft_pad_mode:reflect\n",
795
+ " | > max_norm:4.0\n",
796
+ " | > clip_norm:True\n",
797
+ " | > do_trim_silence:True\n",
798
+ " | > trim_db:60\n",
799
+ " | > do_sound_norm:False\n",
800
+ " | > do_amp_to_db_linear:True\n",
801
+ " | > do_amp_to_db_mel:True\n",
802
+ " | > do_rms_norm:False\n",
803
+ " | > db_level:None\n",
804
+ " | > stats_path:None\n",
805
+ " | > base:2.718281828459045\n",
806
+ " | > hop_length:256\n",
807
+ " | > win_length:1024\n",
808
+ " > Vocoder Model: hifigan\n",
809
+ " > Setting up Audio Processor...\n",
810
+ " | > sample_rate:22050\n",
811
+ " | > resample:False\n",
812
+ " | > num_mels:80\n",
813
+ " | > log_func:np.log\n",
814
+ " | > min_level_db:-100\n",
815
+ " | > frame_shift_ms:None\n",
816
+ " | > frame_length_ms:None\n",
817
+ " | > ref_level_db:20\n",
818
+ " | > fft_size:1024\n",
819
+ " | > power:1.5\n",
820
+ " | > preemphasis:0.0\n",
821
+ " | > griffin_lim_iters:60\n",
822
+ " | > signal_norm:False\n",
823
+ " | > symmetric_norm:True\n",
824
+ " | > mel_fmin:0\n",
825
+ " | > mel_fmax:8000.0\n",
826
+ " | > pitch_fmin:1.0\n",
827
+ " | > pitch_fmax:640.0\n",
828
+ " | > spec_gain:1.0\n",
829
+ " | > stft_pad_mode:reflect\n",
830
+ " | > max_norm:4.0\n",
831
+ " | > clip_norm:True\n",
832
+ " | > do_trim_silence:False\n",
833
+ " | > trim_db:60\n",
834
+ " | > do_sound_norm:False\n",
835
+ " | > do_amp_to_db_linear:True\n",
836
+ " | > do_amp_to_db_mel:True\n",
837
+ " | > do_rms_norm:False\n",
838
+ " | > db_level:None\n",
839
+ " | > stats_path:None\n",
840
+ " | > base:2.718281828459045\n",
841
+ " | > hop_length:256\n",
842
+ " | > win_length:1024\n",
843
+ " > Generator Model: hifigan_generator\n",
844
+ " > Discriminator Model: hifigan_discriminator\n",
845
+ "Removing weight norm...\n",
846
+ "model: tts_models/en/ljspeech/fast_pitch\n",
847
+ "language: \n",
848
+ "speaker: \n",
849
+ "Using original voice\n",
850
+ " > Text splitted to sentences.\n",
851
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
852
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
853
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
854
+ " > Processing time: 1.8829996585845947\n",
855
+ " > Real-time factor: 0.19894272496832988\n",
856
+ "model: tts_models/en/ljspeech/fast_pitch\n",
857
+ "language: \n",
858
+ "speaker: \n",
859
+ "voice cloning with the voice conversion model\n",
860
+ " > Text splitted to sentences.\n",
861
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
862
+ " > Processing time: 1.8359999656677246\n",
863
+ " > Real-time factor: 0.19397711228808903\n",
864
+ "model: tts_models/en/ljspeech/fast_pitch\n",
865
+ "language: \n",
866
+ "speaker: \n",
867
+ "voice cloning with the voice conversion model\n",
868
+ " > Text splitted to sentences.\n",
869
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
870
+ " > Processing time: 1.8659999370574951\n",
871
+ " > Real-time factor: 0.19714666998293168\n",
872
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
873
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
874
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\11c82c70d145ea630f81dfa541de52bf615719ae\\yearn_for_time-0-100.wav\n",
875
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--overflow\n",
876
+ " > Model's license - apache 2.0\n",
877
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
878
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
879
+ " > Model's license - apache 2.0\n",
880
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
881
+ " > Using model: OverFlow\n",
882
+ " > Setting up Audio Processor...\n",
883
+ " | > sample_rate:22050\n",
884
+ " | > resample:False\n",
885
+ " | > num_mels:80\n",
886
+ " | > log_func:np.log\n",
887
+ " | > min_level_db:-100\n",
888
+ " | > frame_shift_ms:None\n",
889
+ " | > frame_length_ms:None\n",
890
+ " | > ref_level_db:20\n",
891
+ " | > fft_size:1024\n",
892
+ " | > power:1.5\n",
893
+ " | > preemphasis:0.0\n",
894
+ " | > griffin_lim_iters:60\n",
895
+ " | > signal_norm:False\n",
896
+ " | > symmetric_norm:True\n",
897
+ " | > mel_fmin:0\n",
898
+ " | > mel_fmax:8000.0\n",
899
+ " | > pitch_fmin:1.0\n",
900
+ " | > pitch_fmax:640.0\n",
901
+ " | > spec_gain:1.0\n",
902
+ " | > stft_pad_mode:reflect\n",
903
+ " | > max_norm:4.0\n",
904
+ " | > clip_norm:True\n",
905
+ " | > do_trim_silence:True\n",
906
+ " | > trim_db:60\n",
907
+ " | > do_sound_norm:False\n",
908
+ " | > do_amp_to_db_linear:True\n",
909
+ " | > do_amp_to_db_mel:True\n",
910
+ " | > do_rms_norm:False\n",
911
+ " | > db_level:None\n",
912
+ " | > stats_path:None\n",
913
+ " | > base:2.718281828459045\n",
914
+ " | > hop_length:256\n",
915
+ " | > win_length:1024\n"
916
+ ]
917
+ },
918
+ {
919
+ "name": "stdout",
920
+ "output_type": "stream",
921
+ "text": [
922
+ " > Vocoder Model: hifigan\n",
923
+ " > Setting up Audio Processor...\n",
924
+ " | > sample_rate:22050\n",
925
+ " | > resample:False\n",
926
+ " | > num_mels:80\n",
927
+ " | > log_func:np.log\n",
928
+ " | > min_level_db:-100\n",
929
+ " | > frame_shift_ms:None\n",
930
+ " | > frame_length_ms:None\n",
931
+ " | > ref_level_db:20\n",
932
+ " | > fft_size:1024\n",
933
+ " | > power:1.5\n",
934
+ " | > preemphasis:0.0\n",
935
+ " | > griffin_lim_iters:60\n",
936
+ " | > signal_norm:False\n",
937
+ " | > symmetric_norm:True\n",
938
+ " | > mel_fmin:0\n",
939
+ " | > mel_fmax:8000.0\n",
940
+ " | > pitch_fmin:1.0\n",
941
+ " | > pitch_fmax:640.0\n",
942
+ " | > spec_gain:1.0\n",
943
+ " | > stft_pad_mode:reflect\n",
944
+ " | > max_norm:4.0\n",
945
+ " | > clip_norm:True\n",
946
+ " | > do_trim_silence:False\n",
947
+ " | > trim_db:60\n",
948
+ " | > do_sound_norm:False\n",
949
+ " | > do_amp_to_db_linear:True\n",
950
+ " | > do_amp_to_db_mel:True\n",
951
+ " | > do_rms_norm:False\n",
952
+ " | > db_level:None\n",
953
+ " | > stats_path:None\n",
954
+ " | > base:2.718281828459045\n",
955
+ " | > hop_length:256\n",
956
+ " | > win_length:1024\n",
957
+ " > Generator Model: hifigan_generator\n",
958
+ " > Discriminator Model: hifigan_discriminator\n",
959
+ "Removing weight norm...\n",
960
+ "model: tts_models/en/ljspeech/overflow\n",
961
+ "language: \n",
962
+ "speaker: \n",
963
+ "Using original voice\n",
964
+ " > Text splitted to sentences.\n",
965
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
966
+ " > Processing time: 2.4030001163482666\n",
967
+ " > Real-time factor: 0.26459208495864933\n",
968
+ "model: tts_models/en/ljspeech/overflow\n",
969
+ "language: \n",
970
+ "speaker: \n",
971
+ "voice cloning with the voice conversion model\n",
972
+ " > Text splitted to sentences.\n",
973
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
974
+ " > Processing time: 2.4769999980926514\n",
975
+ " > Real-time factor: 0.27343925203231617\n"
976
+ ]
977
+ }
978
+ ],
979
+ "source": [
980
+ "title = \"\"\n",
981
+ "description = \"\"\"\"\"\"\n",
982
+ "article = \"\"\"\"\"\"\n",
983
+ "\n",
984
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
985
+ "GPU = device == \"cuda\"\n",
986
+ "INT16MAX = np.iinfo(np.int16).max\n",
987
+ "VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)\n",
988
+ "\n",
989
+ "\n",
990
+ "model_ids = ModelManager(verbose=False).list_models()\n",
991
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
992
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
993
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
994
+ "examples_pt = 'examples'\n",
995
+ "allowed_extentions = ['.mp3', '.wav']\n",
996
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
997
+ "verse = \"\"\"Mary had a little lamb,\n",
998
+ "Its fleece was white as snow.\n",
999
+ "Everywhere the child went,\n",
1000
+ "The little lamb was sure to go.\"\"\"\n",
1001
+ "\n",
1002
+ "\n",
1003
+ "def on_model_tts_select(model_name):\n",
1004
+ " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
1005
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
1006
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
1007
+ " language = languages[0]\n",
1008
+ " speaker = speakers[0]\n",
1009
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
1010
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
1011
+ "\n",
1012
+ "\n",
1013
+ "def on_voicedropdown(x):\n",
1014
+ " return examples[x]\n",
1015
+ "\n",
1016
+ "\n",
1017
+ "def voice_clone(source_wav, target_wav):\n",
1018
+ " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
1019
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
1020
+ " if source_wav is None or target_wav is None:\n",
1021
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
1022
+ "\n",
1023
+ " speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
1024
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
1025
+ " return (sample_rate, speech)\n",
1026
+ "\n",
1027
+ "\n",
1028
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
1029
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
1030
+ " return (16000, np.zeros(0).astype(np.int16))\n",
1031
+ "\n",
1032
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
1033
+ " if tts_model.is_multi_speaker:\n",
1034
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
1035
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
1036
+ "\n",
1037
+ " language = None if language == '' else language\n",
1038
+ " speaker = None if speaker == '' else speaker\n",
1039
+ " if use_original_voice:\n",
1040
+ " print('Using original voice')\n",
1041
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
1042
+ " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
1043
+ " print('voice cloning with the tts')\n",
1044
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
1045
+ " else:\n",
1046
+ " print('voice cloning with the voice conversion model')\n",
1047
+ "# speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
1048
+ " with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False) as fp:\n",
1049
+ " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
1050
+ " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
1051
+ " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
1052
+ " \n",
1053
+ "\n",
1054
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
1055
+ " return (sample_rate, speech)\n",
1056
+ "\n",
1057
+ "\n",
1058
+ "with gr.Blocks() as demo:\n",
1059
+ " tts_model = gr.State(None)\n",
1060
+ "# vc_model = gr.State(None)\n",
1061
+ " def activate(*args):\n",
1062
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
1063
+ " def deactivate(*args):\n",
1064
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
1065
+ "\n",
1066
+ " \n",
1067
+ " gr.Markdown(description)\n",
1068
+ "\n",
1069
+ " with gr.Row(equal_height=True):\n",
1070
+ " with gr.Column(scale=5, min_width=50):\n",
1071
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)\n",
1072
+ " with gr.Column(scale=1, min_width=10):\n",
1073
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
1074
+ " with gr.Column(scale=1, min_width=10):\n",
1075
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
1076
+ " \n",
1077
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
1078
+ " gr.Markdown(\"Upload target voice...\")\n",
1079
+ " with gr.Row(equal_height=True):\n",
1080
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
1081
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
1082
+ "\n",
1083
+ " with gr.Row(equal_height=True):\n",
1084
+ " with gr.Column(scale=2):\n",
1085
+ " with gr.Row(equal_height=True):\n",
1086
+ " with gr.Column():\n",
1087
+ " text_to_convert = gr.Textbox(verse)\n",
1088
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
1089
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
1090
+ " with gr.Row(equal_height=True):\n",
1091
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
1092
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
1093
+ " with gr.Row(equal_height=True):\n",
1094
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
1095
+ " \n",
1096
+ " # actions\n",
1097
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
1098
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
1099
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
1100
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
1101
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
1102
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
1103
+ "\n",
1104
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
1105
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
1106
+ " outputs=speech).\\\n",
1107
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
1108
+ "\n",
1109
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
1110
+ " then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\\\n",
1111
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
1112
+ " \n",
1113
+ " gr.HTML(article)\n",
1114
+ "demo.launch(share=False)"
1115
+ ]
1116
+ }
1117
+ ],
1118
+ "metadata": {
1119
+ "kernelspec": {
1120
+ "display_name": "Python 3",
1121
+ "language": "python",
1122
+ "name": "python3"
1123
+ },
1124
+ "language_info": {
1125
+ "codemirror_mode": {
1126
+ "name": "ipython",
1127
+ "version": 3
1128
+ },
1129
+ "file_extension": ".py",
1130
+ "mimetype": "text/x-python",
1131
+ "name": "python",
1132
+ "nbconvert_exporter": "python",
1133
+ "pygments_lexer": "ipython3",
1134
+ "version": "3.7.9"
1135
+ }
1136
+ },
1137
+ "nbformat": 4,
1138
+ "nbformat_minor": 5
1139
+ }
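
Note on the cell above: it never requires the TTS model itself to know the target speaker. For single-speaker models it synthesizes normally, writes the result to a temporary wav, and hands that file to FreeVC for conversion. A minimal standalone sketch of that two-step pipeline, using the same TTS.api calls as the notebook (the example text and the target wav path are hypothetical):

import tempfile
from TTS.api import TTS

tts = TTS(model_name='tts_models/en/ljspeech/speedy-speech', progress_bar=False, gpu=False)
vc = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=False)

with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
    # step 1: plain synthesis in the model's own voice, saved to disk
    tts.tts_to_file('Mary had a little lamb.', file_path=fp.name)
# step 2: FreeVC re-renders the synthesized wav in the target speaker's voice
wave = vc.voice_conversion(source_wav=fp.name, target_wav='target_voice.wav')  # 'target_voice.wav' is a hypothetical recording
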
Coqui.ai.ipynb CHANGED
@@ -3,7 +3,7 @@
  {
  "cell_type": "code",
  "execution_count": 1,
- "id": "57fc627d",
+ "id": "e65fcd73",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -19,8 +19,8 @@
  },
  {
  "cell_type": "code",
- "execution_count": 9,
- "id": "a5789dee",
+ "execution_count": 2,
+ "id": "f902a92c",
  "metadata": {
  "scrolled": false
  },
@@ -29,7 +29,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Running on local URL: http://127.0.0.1:7864\n",
+ "Running on local URL: http://127.0.0.1:7860\n",
  "\n",
  "To create a public link, set `share=True` in `launch()`.\n"
  ]
@@ -37,7 +37,7 @@
  {
  "data": {
  "text/html": [
- "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
  ],
  "text/plain": [
  "<IPython.core.display.HTML object>"
@@ -50,7 +50,7 @@
  "data": {
  "text/plain": []
  },
- "execution_count": 9,
+ "execution_count": 2,
  "metadata": {},
  "output_type": "execute_result"
  },
@@ -58,6 +58,102 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: Tacotron2\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:0.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Model's reduction rate `r` is set to: 2\n",
+ " > Vocoder Model: univnet\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > log_func:np.log10\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:20\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > pitch_fmin:1.0\n",
+ " | > pitch_fmax:640.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > do_amp_to_db_linear:True\n",
+ " | > do_amp_to_db_mel:True\n",
+ " | > do_rms_norm:False\n",
+ " | > db_level:None\n",
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
+ " | > base:10\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ " > Generator Model: univnet_generator\n",
+ " > Discriminator Model: univnet_discriminator\n",
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
+ "language: \n",
+ "speaker: \n",
+ "voice cloning with the voice conversion model\n",
+ " > Text splitted to sentences.\n",
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
+ " > Processing time: 3.4810001850128174\n",
+ " > Real-time factor: 0.400706095887971\n",
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
+ " > Model's license - MIT\n",
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
+ " > Using model: freevc\n",
+ " > Loading pretrained speaker encoder model ...\n",
+ "Loaded the voice encoder model on cpu in 0.09 seconds.\n",
  "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
  " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
  " > Model's license - apache 2.0\n",
@@ -146,8 +242,8 @@
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
- " > Processing time: 24.694000244140625\n",
- " > Real-time factor: 2.8425842872081772\n"
+ " > Processing time: 2.931999921798706\n",
+ " > Real-time factor: 0.3375093879242267\n"
  ]
  }
  ],
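
Both notebook variants hand audio back to gr.Audio(type='numpy') as a (sample_rate, int16 array) tuple; the `* INT16MAX` scaling assumes the float waveform stays inside [-1, 1]. A slightly safer variant (my addition, not part of this commit) clips before casting so out-of-range samples cannot wrap around:

import numpy as np

INT16MAX = np.iinfo(np.int16).max  # 32767

def float_to_pcm16(wave):
    # clip first: a sample like 1.01 would otherwise overflow the int16 cast
    wave = np.clip(np.asarray(wave, dtype=np.float32), -1.0, 1.0)
    return (wave * INT16MAX).astype(np.int16)
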
app.bak.py ADDED
@@ -0,0 +1,160 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from pathlib import Path
+ 
+ from TTS.api import TTS
+ from TTS.utils.manage import ModelManager
+ 
+ 
+ title = ""
+ description = """"""
+ article = """"""
+ 
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ GPU = device == "cuda"
+ INT16MAX = np.iinfo(np.int16).max
+ 
+ model_ids = ModelManager(verbose=False).list_models()
+ model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
+ model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
+ model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
+ examples_pt = 'examples'
+ allowed_extentions = ['.mp3', '.wav']
+ examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
+ verse = """Mary had a little lamb,
+ Its fleece was white as snow.
+ Everywhere the child went,
+ The little lamb was sure to go."""
+ 
+ 
+ 
+ def on_model_tts_select(model_name, tts_var):
+     if tts_var is None or tts_var.model_name != model_name:
+         print(f'Loading TTS model from {model_name}')
+         tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+     else:
+         print(f'Passing through TTS model {tts_var.model_name}')
+     languages = tts_var.languages if tts_var.is_multi_lingual else ['']
+     speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']  # there's weird speaker formatting
+     language = languages[0]
+     speaker = speakers[0]
+     return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
+            gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
+ 
+ 
+ def on_model_vc_select(model_name, vc_var):
+     if vc_var is None or vc_var.model_name != model_name:
+         print(f'Loading voice conversion model from {model_name}')
+         vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
+     else:
+         print(f'Passing through voice conversion model {vc_var.model_name}')
+     return vc_var
+ 
+ 
+ def on_voicedropdown(x):
+     return examples[x]
+ 
+ 
+ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
+     if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
+         return (16000, np.zeros(0).astype(np.int16))
+ 
+     sample_rate = tts_model.synthesizer.output_sample_rate
+     if tts_model.is_multi_speaker:
+         speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]  # there's weird speaker formatting
+     print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
+ 
+     language = None if language == '' else language
+     speaker = None if speaker == '' else speaker
+     if use_original_voice:
+         print('Using original voice')
+         speech = tts_model.tts(text, language=language, speaker=speaker)
+     elif tts_model.synthesizer.tts_model.speaker_manager:
+         print('voice cloning with the tts')
+         speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
+     else:
+         print('voice cloning with the voice conversion model')
+         speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
+ 
+     speech = (np.array(speech) * INT16MAX).astype(np.int16)
+     return (sample_rate, speech)
+ 
+ 
+ def voice_clone(vc_model, source_wav, target_wav):
+     print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
+     sample_rate = vc_model.voice_converter.output_sample_rate  # NB: dereferences vc_model before the None check below; latent bug removed by the app.py refactor
+     if vc_model is None or source_wav is None or target_wav is None:
+         return (sample_rate, np.zeros(0).astype(np.int16))
+ 
+     speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+     speech = (np.array(speech) * INT16MAX).astype(np.int16)
+     return (sample_rate, speech)
+ 
+ 
+ with gr.Blocks() as demo:
+     tts_model = gr.State(None)
+     vc_model = gr.State(None)
+     def activate(*args):
+         return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
+     def deactivate(*args):
+         return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
+ 
+     gr.Markdown(description)
+ 
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=5, min_width=50):
+             model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
+         with gr.Column(scale=1, min_width=10):
+             language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
+         with gr.Column(scale=1, min_width=10):
+             speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
+         with gr.Column(scale=5, min_width=50):
+             with gr.Row(equal_height=True):
+                 # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
+                 model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
+ 
+     with gr.Accordion("Target voice", open=False) as accordion:
+         gr.Markdown("Upload target voice...")
+         with gr.Row(equal_height=True):
+             voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
+             voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
+ 
+     with gr.Row(equal_height=True):
+         with gr.Column(scale=2):
+             with gr.Row(equal_height=True):
+                 with gr.Column():
+                     text_to_convert = gr.Textbox(verse)
+                     orig_voice = gr.Checkbox(label='Use original voice')
+                 voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
+             with gr.Row(equal_height=True):
+                 button_text = gr.Button('Text to speech', interactive=True)
+                 button_audio = gr.Button('Convert audio', interactive=True)
+     with gr.Row(equal_height=True):
+         speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)
+ 
+     # actions
+     model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+     model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+     voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+ 
+     button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
+         then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
+              outputs=speech).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+ 
+     button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
+         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+         then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
+         then(activate, [button_text, button_audio], [button_text, button_audio])
+ 
+     gr.HTML(article)
+ demo.launch(share=False)
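
app.bak.py wraps every long-running callback in the same deactivate → work → activate chain, so the buttons cannot be clicked again while a model is loading or synthesizing. A self-contained sketch of that locking pattern, assuming the gradio 3.x API used throughout this repo:

import time
import gradio as gr

def slow_job(text):
    time.sleep(2)  # stand-in for model loading / inference
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label='Input')
    out = gr.Textbox(label='Output')
    btn = gr.Button('Run', interactive=True)
    # disable the button, run the job, re-enable: the same chain the app builds
    btn.click(lambda: gr.update(interactive=False), None, btn).\
        then(slow_job, inp, out).\
        then(lambda: gr.update(interactive=True), None, btn)

demo.launch()
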
app.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
  import torch
  import torch.nn.functional as F
  from pathlib import Path
+ import tempfile
  
  from TTS.api import TTS
  from TTS.utils.manage import ModelManager
@@ -15,6 +16,8 @@ article = """"""
  device = "cuda" if torch.cuda.is_available() else "cpu"
  GPU = device == "cuda"
  INT16MAX = np.iinfo(np.int16).max
+ VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)
+ 
  
  model_ids = ModelManager(verbose=False).list_models()
  model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
@@ -29,13 +32,8 @@ Everywhere the child went,
  The little lamb was sure to go."""
  
  
- 
- def on_model_tts_select(model_name, tts_var):
-     if tts_var is None or tts_var.model_name != model_name:
-         print(f'Loading TTS model from {model_name}')
-         tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
-     else:
-         print(f'Passing through TTS model {tts_var.model_name}')
+ def on_model_tts_select(model_name):
+     tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
      languages = tts_var.languages if tts_var.is_multi_lingual else ['']
      speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']  # there's weird speaker formatting
      language = languages[0]
@@ -44,28 +42,30 @@ def on_model_tts_select(model_name):
             gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
  
  
- def on_model_vc_select(model_name, vc_var):
-     if vc_var is None or vc_var.model_name != model_name:
-         print(f'Loading voice conversion model from {model_name}')
-         vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
-     else:
-         print(f'Passing through voice conversion model {vc_var.model_name}')
-     return vc_var
- 
- 
  def on_voicedropdown(x):
      return examples[x]
  
  
+ def voice_clone(source_wav, target_wav):
+     print(f'model: {VC_MODEL.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
+     sample_rate = VC_MODEL.voice_converter.output_sample_rate
+     if source_wav is None or target_wav is None:
+         return (sample_rate, np.zeros(0).astype(np.int16))
+ 
+     speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+     speech = (np.array(speech) * INT16MAX).astype(np.int16)
+     return (sample_rate, speech)
+ 
+ 
  def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
      if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
          return (16000, np.zeros(0).astype(np.int16))
- 
+ 
      sample_rate = tts_model.synthesizer.output_sample_rate
      if tts_model.is_multi_speaker:
          speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]  # there's weird speaker formatting
      print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
- 
+ 
      language = None if language == '' else language
      speaker = None if speaker == '' else speaker
      if use_original_voice:
@@ -76,44 +76,34 @@ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_
          speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
      else:
          print('voice cloning with the voice conversion model')
-         speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
- 
-     speech = (np.array(speech) * INT16MAX).astype(np.int16)
-     return (sample_rate, speech)
- 
- 
- def voice_clone(vc_model, source_wav, target_wav):
-     print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
-     sample_rate = vc_model.voice_converter.output_sample_rate
-     if vc_model is None or source_wav is None or target_wav is None:
-         return (sample_rate, np.zeros(0).astype(np.int16))
+         # speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+             # Lazy code... save it to a temp file to resample it while reading it for VC
+             tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)
+         speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)
+ 
  
-     speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
      speech = (np.array(speech) * INT16MAX).astype(np.int16)
      return (sample_rate, speech)
  
  
  with gr.Blocks() as demo:
      tts_model = gr.State(None)
-     vc_model = gr.State(None)
      def activate(*args):
          return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
      def deactivate(*args):
          return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
  
+ 
      gr.Markdown(description)
  
      with gr.Row(equal_height=True):
          with gr.Column(scale=5, min_width=50):
-             model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
+             model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)
          with gr.Column(scale=1, min_width=10):
              language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
          with gr.Column(scale=1, min_width=10):
              speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
-         with gr.Column(scale=5, min_width=50):
-             with gr.Row(equal_height=True):
-                 # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
-                 model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
  
      with gr.Accordion("Target voice", open=False) as accordion:
          gr.Markdown("Upload target voice...")
@@ -136,24 +126,19 @@ with gr.Blocks() as demo:
  
      # actions
      model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
-         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
-         then(activate, [button_text, button_audio], [button_text, button_audio])
-     model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
-         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
+         then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
          then(activate, [button_text, button_audio], [button_text, button_audio])
      voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
          then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
          then(activate, [button_text, button_audio], [button_text, button_audio])
- 
+ 
      button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
-         then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
          then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
               outputs=speech).\
          then(activate, [button_text, button_audio], [button_text, button_audio])
  
      button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
-         then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
-         then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
+         then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\
          then(activate, [button_text, button_audio], [button_text, button_audio])
  
      gr.HTML(article)
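
After this refactor, voice_clone depends only on the module-level VC_MODEL, so the conversion path can be smoke-tested without the UI. A minimal headless check, reusing only calls that appear in app.py (the wav paths are hypothetical):

import numpy as np
from TTS.api import TTS

vc = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=False)
wave = vc.voice_conversion(source_wav='source.wav', target_wav='target.wav')  # hypothetical files
sample_rate = vc.voice_converter.output_sample_rate
pcm = (np.clip(np.array(wave), -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)
print(f'{sample_rate} Hz, {pcm.shape[0]} samples')
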