rakhlin committed
Commit b2d3c53 · 1 Parent(s): 9d94b06

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,6 @@
+ #checkpoints
+ tts/
+
+ #notebooks and bak files
+ *.bak.py
+ *.ipynb
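The ignored `tts/` folder is where Coqui TTS drops checkpoints it downloads at runtime; the notebook logs below show them under `C:\Users\Torch\AppData\Local\tts`, one directory per model with `/` in the model id encoded as `--`. A minimal sketch of inspecting that cache; the `MODEL_DIR` value is the one used in the notebook, adjust for your machine:

```python
from pathlib import Path

# Value used in the notebook (Windows per-user app data); adjust as needed.
MODEL_DIR = Path('C:/Users/Torch/AppData/Local')

# Downloaded checkpoints live in <MODEL_DIR>/tts, one directory per model,
# with '/' in the model id encoded as '--' in the directory name.
for p in sorted((MODEL_DIR / 'tts').glob('*')):
    if p.is_dir():
        print(p.name.replace('--', '/'))  # e.g. tts_models/en/ljspeech/tacotron2-DDC_ph
```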
.ipynb_checkpoints/Coqui.ai-Copy1-checkpoint.ipynb CHANGED
@@ -2,8 +2,8 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 4,
- "id": "156133fe",
+ "execution_count": 41,
+ "id": "9a1c46ff",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -15,13 +15,14 @@
  "import tempfile\n",
  "\n",
  "from TTS.api import TTS\n",
- "from TTS.utils.manage import ModelManager"
+ "from TTS.utils.manage import ModelManager\n",
+ "from TTS.utils.synthesizer import Synthesizer"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 6,
- "id": "5e5af800",
+ "execution_count": 76,
+ "id": "a6339716",
  "metadata": {
  "scrolled": false
  },
@@ -35,8 +36,8 @@
  " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
  " > Using model: freevc\n",
  " > Loading pretrained speaker encoder model ...\n",
- "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
- "Running on local URL: http://127.0.0.1:7863\n",
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
+ "Running on local URL: http://127.0.0.1:7867\n",
  "\n",
  "To create a public link, set `share=True` in `launch()`.\n"
  ]
@@ -44,7 +45,7 @@
  {
  "data": {
  "text/html": [
- "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ "<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
  ],
  "text/plain": [
  "<IPython.core.display.HTML object>"
@@ -57,7 +58,7 @@
  "data": {
  "text/plain": []
  },
- "execution_count": 6,
+ "execution_count": 76,
  "metadata": {},
  "output_type": "execute_result"
  },
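The `share=True` note in the captured output refers to Gradio's tunneling feature. A minimal, self-contained sketch (hypothetical `echo` function; the notebook's actual app is the Blocks demo defined further down):

```python
import gradio as gr

def echo(text: str) -> str:
    return text

demo = gr.Interface(fn=echo, inputs='text', outputs='text')
# share=False (the notebook's choice) serves only on http://127.0.0.1:<port>;
# share=True additionally opens a temporary public link.
demo.launch(share=False)
```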
@@ -71,1066 +72,7 @@
  " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
  " > Model's license - apache 2.0\n",
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
- " > Using model: Tacotron2\n",
- " > Setting up Audio Processor...\n",
- " | > sample_rate:22050\n",
- " | > resample:False\n",
- " | > num_mels:80\n",
- " | > log_func:np.log10\n",
- " | > min_level_db:-100\n",
- " | > frame_shift_ms:None\n",
- " | > frame_length_ms:None\n",
- " | > ref_level_db:20\n",
- " | > fft_size:1024\n",
- " | > power:1.5\n",
- " | > preemphasis:0.0\n",
- " | > griffin_lim_iters:60\n",
- " | > signal_norm:True\n",
- " | > symmetric_norm:True\n",
- " | > mel_fmin:50.0\n",
- " | > mel_fmax:7600.0\n",
- " | > pitch_fmin:0.0\n",
- " | > pitch_fmax:640.0\n",
- " | > spec_gain:1.0\n",
- " | > stft_pad_mode:reflect\n",
- " | > max_norm:4.0\n",
- " | > clip_norm:True\n",
- " | > do_trim_silence:True\n",
- " | > trim_db:60\n",
- " | > do_sound_norm:False\n",
- " | > do_amp_to_db_linear:True\n",
- " | > do_amp_to_db_mel:True\n",
- " | > do_rms_norm:False\n",
- " | > db_level:None\n",
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
- " | > base:10\n",
- " | > hop_length:256\n",
- " | > win_length:1024\n",
- " > Model's reduction rate `r` is set to: 2\n",
- " > Vocoder Model: univnet\n",
- [... ~1,000 further removed output lines: near-identical "Setting up Audio Processor..." parameter dumps plus synthesis logs (sentence splits, processing times, real-time factors) for tts_models/en/ljspeech/{tacotron2-DDC_ph, tacotron2-DDC, speedy-speech, tacotron2-DCA, fast_pitch, overflow, neural_hmm}, tts_models/en/ek1/tacotron2 (wavegrad vocoder, RTF ~30), tts_models/en/vctk/fast_pitch, and a voice_conversion_models/multilingual/vctk/freevc24 run ...]
+ " > Using model: Tacotron2\n"
  ]
  },
  {
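The removed output above is mostly per-model synthesis logs; the two numbers worth noting are the processing time and the real-time factor, where RTF = processing time / duration of generated audio (RTF < 1 means faster than real time; ek1/tacotron2 with the WaveGrad vocoder logged RTF ≈ 30). A hedged sketch of reproducing the measurement; reading `synthesizer.output_sample_rate` mirrors how the notebook reads the VC model's sample rate, but treat it as an assumption for your TTS version:

```python
import time
from TTS.api import TTS

tts = TTS(model_name='tts_models/en/ljspeech/speedy-speech', progress_bar=False)

t0 = time.time()
wav = tts.tts("Mary had a little lamb, its fleece was white as snow.")
processing_time = time.time() - t0

# assumption: the synthesizer exposes its output sample rate, the same
# attribute the notebook uses on the voice-conversion model
audio_seconds = len(wav) / tts.synthesizer.output_sample_rate
print(f"processing: {processing_time:.2f}s  RTF: {processing_time / audio_seconds:.3f}")
```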
@@ -1150,17 +92,19 @@
  " return await future\n",
  " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
  " result = context.run(func, *args)\n",
- " File \"<ipython-input-6-20fd07aa6e62>\", line 65, in text_to_speech\n",
- " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\", line 548, in tts\n",
- " **kwargs,\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 340, in tts\n",
- " speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 365, in compute_embedding_from_clip\n",
- " embedding = _compute(wav_file)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 342, in _compute\n",
- " waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)\n",
- "AttributeError: 'NoneType' object has no attribute 'load_wav'\n"
+ " File \"<ipython-input-76-b1dd8c5769eb>\", line 44, in on_model_tts_select\n",
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
+ " File \"<ipython-input-76-b1dd8c5769eb>\", line 17, in __init__\n",
+ " self.load_vc_model_by_name(model_name=model_name, gpu=gpu)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\", line 363, in load_vc_model_by_name\n",
+ " self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 97, in __init__\n",
+ " self._load_vc(vc_checkpoint, vc_config, use_cuda)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 131, in _load_vc\n",
+ " self.vc_model = setup_vc_model(config=self.vc_config)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\", line 17, in setup_model\n",
+ " return model\n",
+ "UnboundLocalError: local variable 'model' referenced before assignment\n"
  ]
  }
  ],
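Both tracebacks in this hunk come from the same UI path: the old `AttributeError` fires when `speaker_wav` cloning is requested from a model whose `speaker_manager` has no speaker encoder (`encoder_ap` is `None`), and the new `UnboundLocalError` fires when `setup_vc_model` receives a config it does not recognize, which listing only genuine voice-conversion models avoids. A hedged sketch of a guard against the first failure; the attribute chain mirrors the frames in the traceback above, and the fallback is the notebook's own route (synthesize, then FreeVC conversion):

```python
import tempfile

def clone_voice(text, target_wav, tts_model, vc_model):
    """Clone target_wav's voice onto text, falling back to voice conversion."""
    mgr = getattr(tts_model.synthesizer.tts_model, 'speaker_manager', None)
    if mgr is not None and getattr(mgr, 'encoder_ap', None) is not None:
        # The TTS model ships its own speaker encoder: clone directly.
        return tts_model.tts(text, speaker_wav=target_wav)
    # Otherwise synthesize normally and convert the result (the notebook's route).
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
        tts_model.tts_to_file(text, file_path=fp.name)
    return vc_model.voice_conversion(source_wav=fp.name, target_wav=target_wav)
```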
@@ -1169,16 +113,40 @@
  "description = \"\"\"\"\"\"\n",
  "article = \"\"\"\"\"\"\n",
  "\n",
+ "class TTS_local(TTS):\n",
+ " def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):\n",
+ " super().__init__(\n",
+ " model_name=None,\n",
+ " model_path=None,\n",
+ " config_path=None,\n",
+ " vocoder_path=None,\n",
+ " vocoder_config_path=None,\n",
+ " progress_bar=progress_bar,\n",
+ " gpu=False,\n",
+ " )\n",
+ " self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)\n",
+ " if model_name is not None:\n",
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
+ " self.load_tts_model_by_name(model_name, gpu)\n",
+ " elif \"voice_conversion_models\" in model_name:\n",
+ " self.load_vc_model_by_name(model_name, gpu) \n",
+ "\n",
+ " \n",
  "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
  "GPU = device == \"cuda\"\n",
  "INT16MAX = np.iinfo(np.int16).max\n",
- "VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)\n",
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
+ "MANAGER = ModelManager(verbose=False)\n",
  "\n",
+ "model_ids = MANAGER.list_models()\n",
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+ "model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "\n",
+ "VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24', \n",
+ " output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
  "\n",
- "model_ids = ModelManager(verbose=False).list_models()\n",
- "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
- "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
- "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
  "examples_pt = 'examples'\n",
  "allowed_extentions = ['.mp3', '.wav']\n",
  "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
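The `TTS_local` wrapper added here re-creates `ModelManager` with an explicit `output_prefix`, so checkpoints resolve from the existing local cache instead of the API's default per-user download directory, and the dropdowns are now built only from model ids that are actually present on disk. Usage sketch, assuming the class and constants from the cell above are in scope:

```python
# Load from the local cache under MODEL_DIR rather than re-downloading.
tts = TTS_local(model_name='tts_models/en/ljspeech/speedy-speech',
                output_prefix=MODEL_DIR, progress_bar=False, gpu=False)
wav = tts.tts("Mary had a little lamb.")
```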
@@ -1189,7 +157,7 @@
  "\n",
  "\n",
  "def on_model_tts_select(model_name):\n",
- " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
  " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
  " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
  " language = languages[0]\n",
@@ -1237,6 +205,7 @@
  " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
  " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
  " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
  " \n",
  "\n",
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
@@ -1301,6 +270,193 @@
  " gr.HTML(article)\n",
  "demo.launch(share=False)"
  ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "c2dc0da8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > tts_models/en/blizzard2013/capacitron-t2-c50 is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/blizzard2013/hifigan_v2 is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: tacotron2\n",
+ [... two further "Setting up Audio Processor..." parameter dumps (sample_rate 24000, mel_fmin 80.0, mel_fmax 12000.0, spec_gain 25.0 and 20.0), reduction rate `r` = 2, " > Vocoder Model: hifigan" ...]
+ " > Generator Model: hifigan_generator\n",
+ " > Discriminator Model: hifigan_discriminator\n",
+ "Removing weight norm...\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<TTS.utils.synthesizer.Synthesizer at 0x498b2588>"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from TTS.utils.synthesizer import Synthesizer\n",
+ "\n",
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
+ "MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)\n",
  }
  ],
  "metadata": {
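The new final cell (cut off by the page here) drives `ModelManager` directly: point it at the existing cache with `output_prefix` and resolve a model from there, ending in a `Synthesizer` per the cell's output. A hedged sketch of that pattern, consistent with the visible output; the `download_model` return tuple is an assumption that may vary across TTS versions:

```python
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

MODEL_DIR = 'C:/Users/Torch/AppData/Local'
MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)

# assumption: download_model returns (model_path, config_path, model_item)
model_path, config_path, model_item = MANAGER.download_model(
    'tts_models/en/blizzard2013/capacitron-t2-c50')
synthesizer = Synthesizer(tts_checkpoint=model_path, tts_config_path=config_path)
```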
382
+ "\n",
383
+ "model_ids = manager.list_models()\n",
384
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
385
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
386
+ "\n",
387
+ "\n",
388
+ "def load_local_checkpoint(model_name, use_cuda):\n",
389
+ " model_path = None\n",
390
+ " config_path = None\n",
391
+ " speakers_file_path = None\n",
392
+ " vocoder_path = None\n",
393
+ " vocoder_config_path = None\n",
394
+ "\n",
395
+ " model_path, config_path, model_item = MANAGER.download_model(model_name)\n",
396
+ " vocoder_name = model_item[\"default_vocoder\"]\n",
397
+ " if vocoder_name is not None:\n",
398
+ " vocoder_path, vocoder_config_path, _ = MANAGER.download_model(vocoder_name)\n",
399
+ " \n",
400
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
401
+ " synthesizer = Synthesizer(\n",
402
+ " tts_checkpoint=model_path,\n",
403
+ " tts_config_path=config_path,\n",
404
+ " tts_speakers_file=speakers_file_path,\n",
405
+ " tts_languages_file=None,\n",
406
+ " vocoder_checkpoint=vocoder_path,\n",
407
+ " vocoder_config=vocoder_config_path,\n",
408
+ " encoder_checkpoint=\"\",\n",
409
+ " encoder_config=\"\",\n",
410
+ " use_cuda=use_cuda,\n",
411
+ " )\n",
412
+ " elif \"voice_conversion_models\" in model_name:\n",
413
+ " self.load_vc_model_by_name(model_name, gpu)\n",
414
+ "\n",
415
+ " return synthesizer\n",
416
+ "\n",
417
+ "model_name = model_tts_ids[0]\n",
418
+ "load_local_checkpoint(model_name, use_cuda=False)"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 77,
424
+ "id": "98c1d5a8",
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "name": "stdout",
429
+ "output_type": "stream",
430
+ "text": [
431
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
432
+ " > Model's license - apache 2.0\n",
433
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
434
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
435
+ " > Model's license - apache 2.0\n",
436
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
437
+ " > Using model: Tacotron2\n"
438
+ ]
439
+ },
440
+ {
441
+ "ename": "UnboundLocalError",
442
+ "evalue": "local variable 'model' referenced before assignment",
443
+ "output_type": "error",
444
+ "traceback": [
445
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
446
+ "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
447
+ "\u001b[1;32m<ipython-input-77-6dbf83b539b0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTTS_local\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'tts_models/en/ljspeech/tacotron2-DDC_ph'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mMODEL_DIR\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mGPU\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
448
+ "\u001b[1;32m<ipython-input-76-b1dd8c5769eb>\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, model_name, output_prefix, progress_bar, gpu)\u001b[0m\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmanager\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mModelManager\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodels_file\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_models_file_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_prefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprogress_bar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_vc_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mdevice\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cuda\"\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_available\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;34m\"cpu\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
449
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\u001b[0m in \u001b[0;36mload_vc_model_by_name\u001b[1;34m(self, model_name, gpu)\u001b[0m\n\u001b[0;32m 361\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 362\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 363\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvoice_converter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSynthesizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 365\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_tts_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
450
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, tts_checkpoint, tts_config_path, tts_speakers_file, tts_languages_file, vocoder_checkpoint, vocoder_config, encoder_checkpoint, encoder_config, vc_checkpoint, vc_config, model_dir, voice_dir, use_cuda)\u001b[0m\n\u001b[0;32m 95\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_load_vc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutput_sample_rate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maudio\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"output_sample_rate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
451
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m_load_vc\u001b[1;34m(self, vc_checkpoint, vc_config_path, use_cuda)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;31m# pylint: disable=global-statement\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_config_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msetup_vc_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
452
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\u001b[0m in \u001b[0;36msetup_model\u001b[1;34m(config, samples)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mMyModel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimport_module\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"TTS.vc.models.freevc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mFreeVC\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMyModel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minit_from_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
453
+ "\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'model' referenced before assignment"
454
+ ]
455
+ }
456
+ ],
457
+ "source": [
458
+ "TTS_local(model_name='tts_models/en/ljspeech/tacotron2-DDC_ph', output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)"
459
+ ]
460
  }
461
  ],
462
  "metadata": {
Coqui.ai-Copy1.ipynb ADDED
@@ -0,0 +1,880 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 41,
6
+ "id": "4110138e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "import tempfile\n",
16
+ "\n",
17
+ "from TTS.api import TTS\n",
18
+ "from TTS.utils.manage import ModelManager\n",
19
+ "from TTS.utils.synthesizer import Synthesizer"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 78,
25
+ "id": "b7f07cd9",
26
+ "metadata": {
27
+ "scrolled": false
28
+ },
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
35
+ " > Model's license - MIT\n",
36
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
37
+ " > Using model: freevc\n",
38
+ " > Loading pretrained speaker encoder model ...\n",
39
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
40
+ "Running on local URL: http://127.0.0.1:7868\n",
41
+ "\n",
42
+ "To create a public link, set `share=True` in `launch()`.\n"
43
+ ]
44
+ },
45
+ {
46
+ "data": {
47
+ "text/html": [
48
+ "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
49
+ ],
50
+ "text/plain": [
51
+ "<IPython.core.display.HTML object>"
52
+ ]
53
+ },
54
+ "metadata": {},
55
+ "output_type": "display_data"
56
+ },
57
+ {
58
+ "data": {
59
+ "text/plain": []
60
+ },
61
+ "execution_count": 78,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ },
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
70
+ " > Model's license - apache 2.0\n",
71
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
72
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
73
+ " > Model's license - apache 2.0\n",
74
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
75
+ " > Using model: Tacotron2\n",
76
+ " > Setting up Audio Processor...\n",
77
+ " | > sample_rate:22050\n",
78
+ " | > resample:False\n",
79
+ " | > num_mels:80\n",
80
+ " | > log_func:np.log10\n",
81
+ " | > min_level_db:-100\n",
82
+ " | > frame_shift_ms:None\n",
83
+ " | > frame_length_ms:None\n",
84
+ " | > ref_level_db:20\n",
85
+ " | > fft_size:1024\n",
86
+ " | > power:1.5\n",
87
+ " | > preemphasis:0.0\n",
88
+ " | > griffin_lim_iters:60\n",
89
+ " | > signal_norm:True\n",
90
+ " | > symmetric_norm:True\n",
91
+ " | > mel_fmin:50.0\n",
92
+ " | > mel_fmax:7600.0\n",
93
+ " | > pitch_fmin:0.0\n",
94
+ " | > pitch_fmax:640.0\n",
95
+ " | > spec_gain:1.0\n",
96
+ " | > stft_pad_mode:reflect\n",
97
+ " | > max_norm:4.0\n",
98
+ " | > clip_norm:True\n",
99
+ " | > do_trim_silence:True\n",
100
+ " | > trim_db:60\n",
101
+ " | > do_sound_norm:False\n",
102
+ " | > do_amp_to_db_linear:True\n",
103
+ " | > do_amp_to_db_mel:True\n",
104
+ " | > do_rms_norm:False\n",
105
+ " | > db_level:None\n",
106
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
107
+ " | > base:10\n",
108
+ " | > hop_length:256\n",
109
+ " | > win_length:1024\n",
110
+ " > Model's reduction rate `r` is set to: 2\n",
111
+ " > Vocoder Model: univnet\n",
112
+ " > Setting up Audio Processor...\n",
113
+ " | > sample_rate:22050\n",
114
+ " | > resample:False\n",
115
+ " | > num_mels:80\n",
116
+ " | > log_func:np.log10\n",
117
+ " | > min_level_db:-100\n",
118
+ " | > frame_shift_ms:None\n",
119
+ " | > frame_length_ms:None\n",
120
+ " | > ref_level_db:20\n",
121
+ " | > fft_size:1024\n",
122
+ " | > power:1.5\n",
123
+ " | > preemphasis:0.0\n",
124
+ " | > griffin_lim_iters:60\n",
125
+ " | > signal_norm:True\n",
126
+ " | > symmetric_norm:True\n",
127
+ " | > mel_fmin:50.0\n",
128
+ " | > mel_fmax:7600.0\n",
129
+ " | > pitch_fmin:1.0\n",
130
+ " | > pitch_fmax:640.0\n",
131
+ " | > spec_gain:1.0\n",
132
+ " | > stft_pad_mode:reflect\n",
133
+ " | > max_norm:4.0\n",
134
+ " | > clip_norm:True\n",
135
+ " | > do_trim_silence:True\n",
136
+ " | > trim_db:60\n",
137
+ " | > do_sound_norm:False\n",
138
+ " | > do_amp_to_db_linear:True\n",
139
+ " | > do_amp_to_db_mel:True\n",
140
+ " | > do_rms_norm:False\n",
141
+ " | > db_level:None\n",
142
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
143
+ " | > base:10\n",
144
+ " | > hop_length:256\n",
145
+ " | > win_length:1024\n",
146
+ " > Generator Model: univnet_generator\n",
147
+ " > Discriminator Model: univnet_discriminator\n",
148
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
149
+ "language: \n",
150
+ "speaker: \n",
151
+ "Using original voice\n",
152
+ " > Text splitted to sentences.\n",
153
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
154
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
155
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
156
+ " > Processing time: 3.2799999713897705\n",
157
+ " > Real-time factor: 0.3775684898572943\n",
158
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
159
+ "language: \n",
160
+ "speaker: \n",
161
+ "voice cloning with the voice conversion model\n",
162
+ " > Text splitted to sentences.\n",
163
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
164
+ " > Processing time: 3.2300000190734863\n",
165
+ " > Real-time factor: 0.3718128780726402\n",
166
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
167
+ "language: \n",
168
+ "speaker: \n",
169
+ "voice cloning with the voice conversion model\n",
170
+ " > Text splitted to sentences.\n",
171
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
172
+ " > Processing time: 3.065000295639038\n",
173
+ " > Real-time factor: 0.3528193729057425\n",
174
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
175
+ "language: \n",
176
+ "speaker: \n",
177
+ "voice cloning with the voice conversion model\n",
178
+ " > Text splitted to sentences.\n",
179
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
180
+ " > Processing time: 2.9799997806549072\n",
181
+ " > Real-time factor: 0.3430347642595259\n",
182
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
183
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
184
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\9a558946172057b073ebcd01c8bec7e2d1ff998e\\hmm_i_dont_know-0-100.wav\n",
185
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
186
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
187
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\f730b71860c5932c67deaae15949118446d6c7d7\\arctic_a0407_clb-0-100.wav\n",
188
+ " > tts_models/multilingual/multi-dataset/your_tts is already downloaded.\n",
189
+ " > Model's license - CC BY-NC-ND 4.0\n",
190
+ " > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.\n",
191
+ " > Using model: vits\n",
192
+ " > Setting up Audio Processor...\n",
193
+ " | > sample_rate:16000\n",
194
+ " | > resample:False\n",
195
+ " | > num_mels:80\n",
196
+ " | > log_func:np.log10\n",
197
+ " | > min_level_db:0\n",
198
+ " | > frame_shift_ms:None\n",
199
+ " | > frame_length_ms:None\n",
200
+ " | > ref_level_db:None\n",
201
+ " | > fft_size:1024\n",
202
+ " | > power:None\n",
203
+ " | > preemphasis:0.0\n",
204
+ " | > griffin_lim_iters:None\n",
205
+ " | > signal_norm:None\n",
206
+ " | > symmetric_norm:None\n",
207
+ " | > mel_fmin:0\n",
208
+ " | > mel_fmax:None\n",
209
+ " | > pitch_fmin:None\n",
210
+ " | > pitch_fmax:None\n",
211
+ " | > spec_gain:20.0\n",
212
+ " | > stft_pad_mode:reflect\n",
213
+ " | > max_norm:1.0\n",
214
+ " | > clip_norm:True\n",
215
+ " | > do_trim_silence:False\n",
216
+ " | > trim_db:60\n",
217
+ " | > do_sound_norm:False\n",
218
+ " | > do_amp_to_db_linear:True\n",
219
+ " | > do_amp_to_db_mel:True\n",
220
+ " | > do_rms_norm:False\n",
221
+ " | > db_level:None\n",
222
+ " | > stats_path:None\n",
223
+ " | > base:10\n",
224
+ " | > hop_length:256\n",
225
+ " | > win_length:1024\n",
226
+ " > Model fully restored. \n",
227
+ " > Setting up Audio Processor...\n",
228
+ " | > sample_rate:16000\n",
229
+ " | > resample:False\n",
230
+ " | > num_mels:64\n",
231
+ " | > log_func:np.log10\n",
232
+ " | > min_level_db:-100\n",
233
+ " | > frame_shift_ms:None\n",
234
+ " | > frame_length_ms:None\n",
235
+ " | > ref_level_db:20\n",
236
+ " | > fft_size:512\n",
237
+ " | > power:1.5\n",
238
+ " | > preemphasis:0.97\n",
239
+ " | > griffin_lim_iters:60\n",
240
+ " | > signal_norm:False\n",
241
+ " | > symmetric_norm:False\n",
242
+ " | > mel_fmin:0\n",
243
+ " | > mel_fmax:8000.0\n",
244
+ " | > pitch_fmin:1.0\n",
245
+ " | > pitch_fmax:640.0\n",
246
+ " | > spec_gain:20.0\n",
247
+ " | > stft_pad_mode:reflect\n",
248
+ " | > max_norm:4.0\n",
249
+ " | > clip_norm:False\n",
250
+ " | > do_trim_silence:False\n",
251
+ " | > trim_db:60\n",
252
+ " | > do_sound_norm:False\n",
253
+ " | > do_amp_to_db_linear:True\n",
254
+ " | > do_amp_to_db_mel:True\n",
255
+ " | > do_rms_norm:True\n",
256
+ " | > db_level:-27.0\n",
257
+ " | > stats_path:None\n",
258
+ " | > base:10\n",
259
+ " | > hop_length:160\n",
260
+ " | > win_length:400\n",
261
+ " > External Speaker Encoder Loaded !!\n",
262
+ " > initialization of language-embedding layers.\n",
263
+ " > Model fully restored. \n",
264
+ " > Setting up Audio Processor...\n",
265
+ " | > sample_rate:16000\n",
266
+ " | > resample:False\n",
267
+ " | > num_mels:64\n",
268
+ " | > log_func:np.log10\n",
269
+ " | > min_level_db:-100\n",
270
+ " | > frame_shift_ms:None\n",
271
+ " | > frame_length_ms:None\n",
272
+ " | > ref_level_db:20\n",
273
+ " | > fft_size:512\n",
274
+ " | > power:1.5\n",
275
+ " | > preemphasis:0.97\n",
276
+ " | > griffin_lim_iters:60\n",
277
+ " | > signal_norm:False\n",
278
+ " | > symmetric_norm:False\n",
279
+ " | > mel_fmin:0\n",
280
+ " | > mel_fmax:8000.0\n",
281
+ " | > pitch_fmin:1.0\n",
282
+ " | > pitch_fmax:640.0\n",
283
+ " | > spec_gain:20.0\n",
284
+ " | > stft_pad_mode:reflect\n",
285
+ " | > max_norm:4.0\n",
286
+ " | > clip_norm:False\n",
287
+ " | > do_trim_silence:False\n",
288
+ " | > trim_db:60\n",
289
+ " | > do_sound_norm:False\n",
290
+ " | > do_amp_to_db_linear:True\n",
291
+ " | > do_amp_to_db_mel:True\n",
292
+ " | > do_rms_norm:True\n",
293
+ " | > db_level:-27.0\n",
294
+ " | > stats_path:None\n",
295
+ " | > base:10\n",
296
+ " | > hop_length:160\n",
297
+ " | > win_length:400\n",
298
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
299
+ "language: en\n",
300
+ "speaker: female-en-5\n",
301
+ "Using original voice\n",
302
+ " > Text splitted to sentences.\n",
303
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
304
+ " > Processing time: 1.8219997882843018\n",
305
+ " > Real-time factor: 0.19457494535287287\n",
306
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
307
+ "language: en\n",
308
+ "speaker: female-en-5\n",
309
+ "voice cloning with the tts\n",
310
+ " > Text splitted to sentences.\n",
311
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
312
+ " > Processing time: 2.863999843597412\n",
313
+ " > Real-time factor: 0.3084877039635299\n",
314
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
315
+ "language: fr-fr\n",
316
+ "speaker: female-en-5\n",
317
+ "voice cloning with the tts\n",
318
+ " > Text splitted to sentences.\n",
319
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
320
+ " > Processing time: 2.546999931335449\n",
321
+ " > Real-time factor: 0.326036857569822\n",
322
+ " > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.\n",
323
+ " > Model's license - apache 2.0\n",
324
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
325
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
326
+ " > Model's license - apache 2.0\n",
327
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
328
+ " > Using model: Tacotron2\n",
329
+ " > Setting up Audio Processor...\n",
330
+ " | > sample_rate:22050\n",
331
+ " | > resample:False\n",
332
+ " | > num_mels:80\n",
333
+ " | > log_func:np.log\n",
334
+ " | > min_level_db:-100\n",
335
+ " | > frame_shift_ms:None\n",
336
+ " | > frame_length_ms:None\n",
337
+ " | > ref_level_db:20\n",
338
+ " | > fft_size:1024\n",
339
+ " | > power:1.5\n",
340
+ " | > preemphasis:0.0\n",
341
+ " | > griffin_lim_iters:60\n",
342
+ " | > signal_norm:False\n",
343
+ " | > symmetric_norm:True\n",
344
+ " | > mel_fmin:0\n",
345
+ " | > mel_fmax:8000.0\n",
346
+ " | > pitch_fmin:1.0\n",
347
+ " | > pitch_fmax:640.0\n",
348
+ " | > spec_gain:1.0\n",
349
+ " | > stft_pad_mode:reflect\n",
350
+ " | > max_norm:4.0\n",
351
+ " | > clip_norm:True\n",
352
+ " | > do_trim_silence:True\n",
353
+ " | > trim_db:60\n",
354
+ " | > do_sound_norm:False\n",
355
+ " | > do_amp_to_db_linear:True\n",
356
+ " | > do_amp_to_db_mel:True\n",
357
+ " | > do_rms_norm:False\n",
358
+ " | > db_level:None\n",
359
+ " | > stats_path:None\n",
360
+ " | > base:2.718281828459045\n",
361
+ " | > hop_length:256\n",
362
+ " | > win_length:1024\n"
363
+ ]
364
+ },
365
+ {
366
+ "name": "stdout",
367
+ "output_type": "stream",
368
+ "text": [
369
+ " > Model's reduction rate `r` is set to: 1\n",
370
+ " > Vocoder Model: hifigan\n",
371
+ " > Setting up Audio Processor...\n",
372
+ " | > sample_rate:22050\n",
373
+ " | > resample:False\n",
374
+ " | > num_mels:80\n",
375
+ " | > log_func:np.log\n",
376
+ " | > min_level_db:-100\n",
377
+ " | > frame_shift_ms:None\n",
378
+ " | > frame_length_ms:None\n",
379
+ " | > ref_level_db:20\n",
380
+ " | > fft_size:1024\n",
381
+ " | > power:1.5\n",
382
+ " | > preemphasis:0.0\n",
383
+ " | > griffin_lim_iters:60\n",
384
+ " | > signal_norm:False\n",
385
+ " | > symmetric_norm:True\n",
386
+ " | > mel_fmin:0\n",
387
+ " | > mel_fmax:8000.0\n",
388
+ " | > pitch_fmin:1.0\n",
389
+ " | > pitch_fmax:640.0\n",
390
+ " | > spec_gain:1.0\n",
391
+ " | > stft_pad_mode:reflect\n",
392
+ " | > max_norm:4.0\n",
393
+ " | > clip_norm:True\n",
394
+ " | > do_trim_silence:False\n",
395
+ " | > trim_db:60\n",
396
+ " | > do_sound_norm:False\n",
397
+ " | > do_amp_to_db_linear:True\n",
398
+ " | > do_amp_to_db_mel:True\n",
399
+ " | > do_rms_norm:False\n",
400
+ " | > db_level:None\n",
401
+ " | > stats_path:None\n",
402
+ " | > base:2.718281828459045\n",
403
+ " | > hop_length:256\n",
404
+ " | > win_length:1024\n",
405
+ " > Generator Model: hifigan_generator\n",
406
+ " > Discriminator Model: hifigan_discriminator\n",
407
+ "Removing weight norm...\n",
408
+ "model: tts_models/en/ljspeech/tacotron2-DDC\n",
409
+ "language: \n",
410
+ "speaker: \n",
411
+ "voice cloning with the voice conversion model\n",
412
+ " > Text splitted to sentences.\n",
413
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
414
+ " > Processing time: 4.205999851226807\n",
415
+ " > Real-time factor: 0.4124959824204343\n",
416
+ " > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.\n",
417
+ " > Model's license - MPL\n",
418
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
419
+ " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n",
420
+ " > Model's license - MPL\n",
421
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
422
+ " > Using model: Tacotron2\n",
423
+ " > Setting up Audio Processor...\n",
424
+ " | > sample_rate:22050\n",
425
+ " | > resample:False\n",
426
+ " | > num_mels:80\n",
427
+ " | > log_func:np.log10\n",
428
+ " | > min_level_db:-100\n",
429
+ " | > frame_shift_ms:None\n",
430
+ " | > frame_length_ms:None\n",
431
+ " | > ref_level_db:20\n",
432
+ " | > fft_size:1024\n",
433
+ " | > power:1.5\n",
434
+ " | > preemphasis:0.0\n",
435
+ " | > griffin_lim_iters:60\n",
436
+ " | > signal_norm:True\n",
437
+ " | > symmetric_norm:True\n",
438
+ " | > mel_fmin:50.0\n",
439
+ " | > mel_fmax:7600.0\n",
440
+ " | > pitch_fmin:0.0\n",
441
+ " | > pitch_fmax:640.0\n",
442
+ " | > spec_gain:1.0\n",
443
+ " | > stft_pad_mode:reflect\n",
444
+ " | > max_norm:4.0\n",
445
+ " | > clip_norm:True\n",
446
+ " | > do_trim_silence:True\n",
447
+ " | > trim_db:60\n",
448
+ " | > do_sound_norm:False\n",
449
+ " | > do_amp_to_db_linear:True\n",
450
+ " | > do_amp_to_db_mel:True\n",
451
+ " | > do_rms_norm:False\n",
452
+ " | > db_level:None\n",
453
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\tts_models--en--ljspeech--tacotron2-DCA\\scale_stats.npy\n",
454
+ " | > base:10\n",
455
+ " | > hop_length:256\n",
456
+ " | > win_length:1024\n",
457
+ " > Model's reduction rate `r` is set to: 2\n",
458
+ " > Vocoder Model: multiband_melgan\n",
459
+ " > Setting up Audio Processor...\n",
460
+ " | > sample_rate:22050\n",
461
+ " | > resample:False\n",
462
+ " | > num_mels:80\n",
463
+ " | > log_func:np.log10\n",
464
+ " | > min_level_db:-100\n",
465
+ " | > frame_shift_ms:None\n",
466
+ " | > frame_length_ms:None\n",
467
+ " | > ref_level_db:0\n",
468
+ " | > fft_size:1024\n",
469
+ " | > power:1.5\n",
470
+ " | > preemphasis:0.0\n",
471
+ " | > griffin_lim_iters:60\n",
472
+ " | > signal_norm:True\n",
473
+ " | > symmetric_norm:True\n",
474
+ " | > mel_fmin:50.0\n",
475
+ " | > mel_fmax:7600.0\n",
476
+ " | > pitch_fmin:0.0\n",
477
+ " | > pitch_fmax:640.0\n",
478
+ " | > spec_gain:1.0\n",
479
+ " | > stft_pad_mode:reflect\n",
480
+ " | > max_norm:4.0\n",
481
+ " | > clip_norm:True\n",
482
+ " | > do_trim_silence:True\n",
483
+ " | > trim_db:60\n",
484
+ " | > do_sound_norm:False\n",
485
+ " | > do_amp_to_db_linear:True\n",
486
+ " | > do_amp_to_db_mel:True\n",
487
+ " | > do_rms_norm:False\n",
488
+ " | > db_level:None\n",
489
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\vocoder_models--en--ljspeech--multiband-melgan\\scale_stats.npy\n",
490
+ " | > base:10\n",
491
+ " | > hop_length:256\n",
492
+ " | > win_length:1024\n",
493
+ " > Generator Model: multiband_melgan_generator\n",
494
+ " > Discriminator Model: melgan_multiscale_discriminator\n",
495
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
496
+ "language: \n",
497
+ "speaker: \n",
498
+ "voice cloning with the voice conversion model\n",
499
+ " > Text splitted to sentences.\n",
500
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
501
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
502
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
503
+ " > Processing time: 2.384999990463257\n",
504
+ " > Real-time factor: 0.2687952332235178\n"
505
+ ]
506
+ }
507
+ ],
508
+ "source": [
509
+ "title = \"\"\n",
510
+ "description = \"\"\"\"\"\"\n",
511
+ "article = \"\"\"\"\"\"\n",
512
+ "\n",
513
+ "class TTS_local(TTS):\n",
514
+ " def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):\n",
515
+ " super().__init__(\n",
516
+ " model_name=None,\n",
517
+ " model_path=None,\n",
518
+ " config_path=None,\n",
519
+ " vocoder_path=None,\n",
520
+ " vocoder_config_path=None,\n",
521
+ " progress_bar=progress_bar,\n",
522
+ " gpu=False,\n",
523
+ " )\n",
524
+ " self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)\n",
525
+ " if model_name is not None:\n",
526
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
527
+ " self.load_tts_model_by_name(model_name, gpu)\n",
528
+ " elif \"voice_conversion_models\" in model_name:\n",
529
+ " self.load_vc_model_by_name(model_name, gpu) \n",
530
+ "\n",
531
+ " \n",
532
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
533
+ "GPU = device == \"cuda\"\n",
534
+ "INT16MAX = np.iinfo(np.int16).max\n",
535
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
536
+ "MANAGER = ModelManager(verbose=False)\n",
537
+ "\n",
538
+ "model_ids = MANAGER.list_models()\n",
539
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
540
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
541
+ "model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
542
+ "model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
543
+ "\n",
544
+ "VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24', \n",
545
+ " output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
546
+ "\n",
547
+ "examples_pt = 'examples'\n",
548
+ "allowed_extentions = ['.mp3', '.wav']\n",
549
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
550
+ "verse = \"\"\"Mary had a little lamb,\n",
551
+ "Its fleece was white as snow.\n",
552
+ "Everywhere the child went,\n",
553
+ "The little lamb was sure to go.\"\"\"\n",
554
+ "\n",
555
+ "\n",
556
+ "def on_model_tts_select(model_name):\n",
557
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
558
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
559
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
560
+ " language = languages[0]\n",
561
+ " speaker = speakers[0]\n",
562
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
563
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
564
+ "\n",
565
+ "\n",
566
+ "def on_voicedropdown(x):\n",
567
+ " return examples[x]\n",
568
+ "\n",
569
+ "\n",
570
+ "def voice_clone(source_wav, target_wav):\n",
571
+ " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
572
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
573
+ " if source_wav is None or target_wav is None:\n",
574
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
575
+ "\n",
576
+ " speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
577
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
578
+ " return (sample_rate, speech)\n",
579
+ "\n",
580
+ "\n",
581
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
582
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
583
+ " return (16000, np.zeros(0).astype(np.int16))\n",
584
+ "\n",
585
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
586
+ " if tts_model.is_multi_speaker:\n",
587
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
588
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
589
+ "\n",
590
+ " language = None if language == '' else language\n",
591
+ " speaker = None if speaker == '' else speaker\n",
592
+ " if use_original_voice:\n",
593
+ " print('Using original voice')\n",
594
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
595
+ " elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:\n",
596
+ " print('voice cloning with the tts')\n",
597
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
598
+ " else:\n",
599
+ " print('voice cloning with the voice conversion model')\n",
600
+ "# speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
601
+ " with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False) as fp:\n",
602
+ " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
603
+ " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
604
+ " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
605
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
606
+ " \n",
607
+ "\n",
608
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
609
+ " return (sample_rate, speech)\n",
610
+ "\n",
611
+ "\n",
612
+ "with gr.Blocks() as demo:\n",
613
+ " tts_model = gr.State(None)\n",
614
+ "# vc_model = gr.State(None)\n",
615
+ " def activate(*args):\n",
616
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
617
+ " def deactivate(*args):\n",
618
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
619
+ "\n",
620
+ " \n",
621
+ " gr.Markdown(description)\n",
622
+ "\n",
623
+ " with gr.Row(equal_height=True):\n",
624
+ " with gr.Column(scale=5, min_width=50):\n",
625
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)\n",
626
+ " with gr.Column(scale=1, min_width=10):\n",
627
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
628
+ " with gr.Column(scale=1, min_width=10):\n",
629
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
630
+ " \n",
631
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
632
+ " gr.Markdown(\"Upload target voice...\")\n",
633
+ " with gr.Row(equal_height=True):\n",
634
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
635
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
636
+ "\n",
637
+ " with gr.Row(equal_height=True):\n",
638
+ " with gr.Column(scale=2):\n",
639
+ " with gr.Row(equal_height=True):\n",
640
+ " with gr.Column():\n",
641
+ " text_to_convert = gr.Textbox(verse)\n",
642
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
643
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
644
+ " with gr.Row(equal_height=True):\n",
645
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
646
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
647
+ " with gr.Row(equal_height=True):\n",
648
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
649
+ " \n",
650
+ " # actions\n",
651
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
652
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
653
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
654
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
655
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
656
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
657
+ "\n",
658
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
659
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
660
+ " outputs=speech).\\\n",
661
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
662
+ "\n",
663
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
664
+ " then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\\\n",
665
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
666
+ " \n",
667
+ " gr.HTML(article)\n",
668
+ "demo.launch(share=False)"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": 40,
674
+ "id": "d97a1ab5",
675
+ "metadata": {},
676
+ "outputs": [
677
+ {
678
+ "name": "stdout",
679
+ "output_type": "stream",
680
+ "text": [
681
+ " > tts_models/en/blizzard2013/capacitron-t2-c50 is already downloaded.\n",
682
+ " > Model's license - apache 2.0\n",
683
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
684
+ " > vocoder_models/en/blizzard2013/hifigan_v2 is already downloaded.\n",
685
+ " > Model's license - apache 2.0\n",
686
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
687
+ " > Using model: tacotron2\n",
688
+ " > Setting up Audio Processor...\n",
689
+ " | > sample_rate:24000\n",
690
+ " | > resample:False\n",
691
+ " | > num_mels:80\n",
692
+ " | > log_func:np.log10\n",
693
+ " | > min_level_db:-100\n",
694
+ " | > frame_shift_ms:None\n",
695
+ " | > frame_length_ms:None\n",
696
+ " | > ref_level_db:20\n",
697
+ " | > fft_size:1024\n",
698
+ " | > power:1.5\n",
699
+ " | > preemphasis:0.0\n",
700
+ " | > griffin_lim_iters:60\n",
701
+ " | > signal_norm:True\n",
702
+ " | > symmetric_norm:True\n",
703
+ " | > mel_fmin:80.0\n",
704
+ " | > mel_fmax:12000.0\n",
705
+ " | > pitch_fmin:0.0\n",
706
+ " | > pitch_fmax:640.0\n",
707
+ " | > spec_gain:25.0\n",
708
+ " | > stft_pad_mode:reflect\n",
709
+ " | > max_norm:4.0\n",
710
+ " | > clip_norm:True\n",
711
+ " | > do_trim_silence:True\n",
712
+ " | > trim_db:60\n",
713
+ " | > do_sound_norm:False\n",
714
+ " | > do_amp_to_db_linear:True\n",
715
+ " | > do_amp_to_db_mel:True\n",
716
+ " | > do_rms_norm:False\n",
717
+ " | > db_level:None\n",
718
+ " | > stats_path:None\n",
719
+ " | > base:10\n",
720
+ " | > hop_length:256\n",
721
+ " | > win_length:1024\n",
722
+ " > Model's reduction rate `r` is set to: 2\n",
723
+ " > Vocoder Model: hifigan\n",
724
+ " > Setting up Audio Processor...\n",
725
+ " | > sample_rate:24000\n",
726
+ " | > resample:False\n",
727
+ " | > num_mels:80\n",
728
+ " | > log_func:np.log10\n",
729
+ " | > min_level_db:-100\n",
730
+ " | > frame_shift_ms:None\n",
731
+ " | > frame_length_ms:None\n",
732
+ " | > ref_level_db:20\n",
733
+ " | > fft_size:1024\n",
734
+ " | > power:1.5\n",
735
+ " | > preemphasis:0.0\n",
736
+ " | > griffin_lim_iters:60\n",
737
+ " | > signal_norm:True\n",
738
+ " | > symmetric_norm:True\n",
739
+ " | > mel_fmin:80.0\n",
740
+ " | > mel_fmax:12000.0\n",
741
+ " | > pitch_fmin:1.0\n",
742
+ " | > pitch_fmax:640.0\n",
743
+ " | > spec_gain:20.0\n",
744
+ " | > stft_pad_mode:reflect\n",
745
+ " | > max_norm:4.0\n",
746
+ " | > clip_norm:True\n",
747
+ " | > do_trim_silence:False\n",
748
+ " | > trim_db:60\n",
749
+ " | > do_sound_norm:True\n",
750
+ " | > do_amp_to_db_linear:True\n",
751
+ " | > do_amp_to_db_mel:True\n",
752
+ " | > do_rms_norm:False\n",
753
+ " | > db_level:None\n",
754
+ " | > stats_path:None\n",
755
+ " | > base:10\n",
756
+ " | > hop_length:256\n",
757
+ " | > win_length:1024\n",
758
+ " > Generator Model: hifigan_generator\n",
759
+ " > Discriminator Model: hifigan_discriminator\n",
760
+ "Removing weight norm...\n"
761
+ ]
762
+ },
763
+ {
764
+ "data": {
765
+ "text/plain": [
766
+ "<TTS.utils.synthesizer.Synthesizer at 0x498b2588>"
767
+ ]
768
+ },
769
+ "execution_count": 40,
770
+ "metadata": {},
771
+ "output_type": "execute_result"
772
+ }
773
+ ],
774
+ "source": [
775
+ "from TTS.utils.synthesizer import Synthesizer\n",
776
+ "\n",
777
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
778
+ "MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)\n",
779
+ "\n",
780
+ "model_ids = manager.list_models()\n",
781
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
782
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
783
+ "\n",
784
+ "\n",
785
+ "def load_local_checkpoint(model_name, use_cuda):\n",
786
+ " model_path = None\n",
787
+ " config_path = None\n",
788
+ " speakers_file_path = None\n",
789
+ " vocoder_path = None\n",
790
+ " vocoder_config_path = None\n",
791
+ "\n",
792
+ " model_path, config_path, model_item = MANAGER.download_model(model_name)\n",
793
+ " vocoder_name = model_item[\"default_vocoder\"]\n",
794
+ " if vocoder_name is not None:\n",
795
+ " vocoder_path, vocoder_config_path, _ = MANAGER.download_model(vocoder_name)\n",
796
+ " \n",
797
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
798
+ " synthesizer = Synthesizer(\n",
799
+ " tts_checkpoint=model_path,\n",
800
+ " tts_config_path=config_path,\n",
801
+ " tts_speakers_file=speakers_file_path,\n",
802
+ " tts_languages_file=None,\n",
803
+ " vocoder_checkpoint=vocoder_path,\n",
804
+ " vocoder_config=vocoder_config_path,\n",
805
+ " encoder_checkpoint=\"\",\n",
806
+ " encoder_config=\"\",\n",
807
+ " use_cuda=use_cuda,\n",
808
+ " )\n",
809
+ " elif \"voice_conversion_models\" in model_name:\n",
810
+ " self.load_vc_model_by_name(model_name, gpu)\n",
811
+ "\n",
812
+ " return synthesizer\n",
813
+ "\n",
814
+ "model_name = model_tts_ids[0]\n",
815
+ "load_local_checkpoint(model_name, use_cuda=False)"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "execution_count": 77,
821
+ "id": "35c8a08c",
822
+ "metadata": {},
823
+ "outputs": [
824
+ {
825
+ "name": "stdout",
826
+ "output_type": "stream",
827
+ "text": [
828
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
829
+ " > Model's license - apache 2.0\n",
830
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
831
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
832
+ " > Model's license - apache 2.0\n",
833
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
834
+ " > Using model: Tacotron2\n"
835
+ ]
836
+ },
837
+ {
838
+ "ename": "UnboundLocalError",
839
+ "evalue": "local variable 'model' referenced before assignment",
840
+ "output_type": "error",
841
+ "traceback": [
842
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
843
+ "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
844
+ "\u001b[1;32m<ipython-input-77-6dbf83b539b0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTTS_local\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'tts_models/en/ljspeech/tacotron2-DDC_ph'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mMODEL_DIR\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mGPU\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
845
+ "\u001b[1;32m<ipython-input-76-b1dd8c5769eb>\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, model_name, output_prefix, progress_bar, gpu)\u001b[0m\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmanager\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mModelManager\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodels_file\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_models_file_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_prefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprogress_bar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_vc_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mdevice\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cuda\"\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_available\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;34m\"cpu\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
846
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\u001b[0m in \u001b[0;36mload_vc_model_by_name\u001b[1;34m(self, model_name, gpu)\u001b[0m\n\u001b[0;32m 361\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 362\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 363\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvoice_converter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSynthesizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 365\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_tts_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
847
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, tts_checkpoint, tts_config_path, tts_speakers_file, tts_languages_file, vocoder_checkpoint, vocoder_config, encoder_checkpoint, encoder_config, vc_checkpoint, vc_config, model_dir, voice_dir, use_cuda)\u001b[0m\n\u001b[0;32m 95\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_load_vc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutput_sample_rate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maudio\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"output_sample_rate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
848
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m_load_vc\u001b[1;34m(self, vc_checkpoint, vc_config_path, use_cuda)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;31m# pylint: disable=global-statement\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_config_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msetup_vc_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
849
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\u001b[0m in \u001b[0;36msetup_model\u001b[1;34m(config, samples)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mMyModel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimport_module\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"TTS.vc.models.freevc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mFreeVC\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMyModel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minit_from_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
850
+ "\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'model' referenced before assignment"
851
+ ]
852
+ }
853
+ ],
854
+ "source": [
855
+ "TTS_local(model_name='tts_models/en/ljspeech/tacotron2-DDC_ph', output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)"
856
+ ]
857
+ }
858
+ ],
859
+ "metadata": {
860
+ "kernelspec": {
861
+ "display_name": "Python 3",
862
+ "language": "python",
863
+ "name": "python3"
864
+ },
865
+ "language_info": {
866
+ "codemirror_mode": {
867
+ "name": "ipython",
868
+ "version": 3
869
+ },
870
+ "file_extension": ".py",
871
+ "mimetype": "text/x-python",
872
+ "name": "python",
873
+ "nbconvert_exporter": "python",
874
+ "pygments_lexer": "ipython3",
875
+ "version": "3.7.9"
876
+ }
877
+ },
878
+ "nbformat": 4,
879
+ "nbformat_minor": 5
880
+ }
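
The UnboundLocalError above originates in TTS.vc.models.setup_model: `model` is only assigned on the freevc branch (the guard itself is elided from the traceback), so routing a `tts_models/...` name through `load_vc_model_by_name` (which this early draft of TTS_local did unconditionally) reaches `return model` with nothing bound. A minimal, self-contained sketch of the failure mode; the names below are illustrative stand-ins, not the actual TTS source:

from types import SimpleNamespace

def setup_vc_model_sketch(config):
    # Mirrors the shape of TTS.vc.models.setup_model seen in the traceback:
    # `model` is bound only when the config describes a freevc model.
    if config.model == "freevc":
        model = "FreeVC instance"  # stand-in for FreeVC.init_from_config(config)
    return model  # UnboundLocalError for any non-freevc config

setup_vc_model_sketch(SimpleNamespace(model="tacotron2"))  # raises UnboundLocalError

The TTS_local class committed in app.py below avoids this by dispatching on the model-name prefix ("tts_models" vs "voice_conversion_models") before choosing a loader.
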
Coqui.ai.ipynb ADDED
@@ -0,0 +1,425 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "e65fcd73",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "\n",
16
+ "from TTS.api import TTS\n",
17
+ "from TTS.utils.manage import ModelManager"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "id": "f902a92c",
24
+ "metadata": {
25
+ "scrolled": false
26
+ },
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "Running on local URL: http://127.0.0.1:7860\n",
33
+ "\n",
34
+ "To create a public link, set `share=True` in `launch()`.\n"
35
+ ]
36
+ },
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
41
+ ],
42
+ "text/plain": [
43
+ "<IPython.core.display.HTML object>"
44
+ ]
45
+ },
46
+ "metadata": {},
47
+ "output_type": "display_data"
48
+ },
49
+ {
50
+ "data": {
51
+ "text/plain": []
52
+ },
53
+ "execution_count": 2,
54
+ "metadata": {},
55
+ "output_type": "execute_result"
56
+ },
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
62
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
63
+ " > Model's license - apache 2.0\n",
64
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
65
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
66
+ " > Model's license - apache 2.0\n",
67
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
68
+ " > Using model: Tacotron2\n",
69
+ " > Setting up Audio Processor...\n",
70
+ " | > sample_rate:22050\n",
71
+ " | > resample:False\n",
72
+ " | > num_mels:80\n",
73
+ " | > log_func:np.log10\n",
74
+ " | > min_level_db:-100\n",
75
+ " | > frame_shift_ms:None\n",
76
+ " | > frame_length_ms:None\n",
77
+ " | > ref_level_db:20\n",
78
+ " | > fft_size:1024\n",
79
+ " | > power:1.5\n",
80
+ " | > preemphasis:0.0\n",
81
+ " | > griffin_lim_iters:60\n",
82
+ " | > signal_norm:True\n",
83
+ " | > symmetric_norm:True\n",
84
+ " | > mel_fmin:50.0\n",
85
+ " | > mel_fmax:7600.0\n",
86
+ " | > pitch_fmin:0.0\n",
87
+ " | > pitch_fmax:640.0\n",
88
+ " | > spec_gain:1.0\n",
89
+ " | > stft_pad_mode:reflect\n",
90
+ " | > max_norm:4.0\n",
91
+ " | > clip_norm:True\n",
92
+ " | > do_trim_silence:True\n",
93
+ " | > trim_db:60\n",
94
+ " | > do_sound_norm:False\n",
95
+ " | > do_amp_to_db_linear:True\n",
96
+ " | > do_amp_to_db_mel:True\n",
97
+ " | > do_rms_norm:False\n",
98
+ " | > db_level:None\n",
99
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
100
+ " | > base:10\n",
101
+ " | > hop_length:256\n",
102
+ " | > win_length:1024\n",
103
+ " > Model's reduction rate `r` is set to: 2\n",
104
+ " > Vocoder Model: univnet\n",
105
+ " > Setting up Audio Processor...\n",
106
+ " | > sample_rate:22050\n",
107
+ " | > resample:False\n",
108
+ " | > num_mels:80\n",
109
+ " | > log_func:np.log10\n",
110
+ " | > min_level_db:-100\n",
111
+ " | > frame_shift_ms:None\n",
112
+ " | > frame_length_ms:None\n",
113
+ " | > ref_level_db:20\n",
114
+ " | > fft_size:1024\n",
115
+ " | > power:1.5\n",
116
+ " | > preemphasis:0.0\n",
117
+ " | > griffin_lim_iters:60\n",
118
+ " | > signal_norm:True\n",
119
+ " | > symmetric_norm:True\n",
120
+ " | > mel_fmin:50.0\n",
121
+ " | > mel_fmax:7600.0\n",
122
+ " | > pitch_fmin:1.0\n",
123
+ " | > pitch_fmax:640.0\n",
124
+ " | > spec_gain:1.0\n",
125
+ " | > stft_pad_mode:reflect\n",
126
+ " | > max_norm:4.0\n",
127
+ " | > clip_norm:True\n",
128
+ " | > do_trim_silence:True\n",
129
+ " | > trim_db:60\n",
130
+ " | > do_sound_norm:False\n",
131
+ " | > do_amp_to_db_linear:True\n",
132
+ " | > do_amp_to_db_mel:True\n",
133
+ " | > do_rms_norm:False\n",
134
+ " | > db_level:None\n",
135
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
136
+ " | > base:10\n",
137
+ " | > hop_length:256\n",
138
+ " | > win_length:1024\n",
139
+ " > Generator Model: univnet_generator\n",
140
+ " > Discriminator Model: univnet_discriminator\n",
141
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
142
+ "language: \n",
143
+ "speaker: \n",
144
+ "voice cloning with the voice conversion model\n",
145
+ " > Text splitted to sentences.\n",
146
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
147
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
148
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
149
+ " > Processing time: 3.4810001850128174\n",
150
+ " > Real-time factor: 0.400706095887971\n",
151
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
152
+ " > Model's license - MIT\n",
153
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
154
+ " > Using model: freevc\n",
155
+ " > Loading pretrained speaker encoder model ...\n",
156
+ "Loaded the voice encoder model on cpu in 0.09 seconds.\n",
157
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
159
+ " > Model's license - apache 2.0\n",
160
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
161
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
162
+ " > Model's license - apache 2.0\n",
163
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
164
+ " > Using model: Tacotron2\n",
165
+ " > Setting up Audio Processor...\n",
166
+ " | > sample_rate:22050\n",
167
+ " | > resample:False\n",
168
+ " | > num_mels:80\n",
169
+ " | > log_func:np.log10\n",
170
+ " | > min_level_db:-100\n",
171
+ " | > frame_shift_ms:None\n",
172
+ " | > frame_length_ms:None\n",
173
+ " | > ref_level_db:20\n",
174
+ " | > fft_size:1024\n",
175
+ " | > power:1.5\n",
176
+ " | > preemphasis:0.0\n",
177
+ " | > griffin_lim_iters:60\n",
178
+ " | > signal_norm:True\n",
179
+ " | > symmetric_norm:True\n",
180
+ " | > mel_fmin:50.0\n",
181
+ " | > mel_fmax:7600.0\n",
182
+ " | > pitch_fmin:0.0\n",
183
+ " | > pitch_fmax:640.0\n",
184
+ " | > spec_gain:1.0\n",
185
+ " | > stft_pad_mode:reflect\n",
186
+ " | > max_norm:4.0\n",
187
+ " | > clip_norm:True\n",
188
+ " | > do_trim_silence:True\n",
189
+ " | > trim_db:60\n",
190
+ " | > do_sound_norm:False\n",
191
+ " | > do_amp_to_db_linear:True\n",
192
+ " | > do_amp_to_db_mel:True\n",
193
+ " | > do_rms_norm:False\n",
194
+ " | > db_level:None\n",
195
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
196
+ " | > base:10\n",
197
+ " | > hop_length:256\n",
198
+ " | > win_length:1024\n",
199
+ " > Model's reduction rate `r` is set to: 2\n",
200
+ " > Vocoder Model: univnet\n",
201
+ " > Setting up Audio Processor...\n",
202
+ " | > sample_rate:22050\n",
203
+ " | > resample:False\n",
204
+ " | > num_mels:80\n",
205
+ " | > log_func:np.log10\n",
206
+ " | > min_level_db:-100\n",
207
+ " | > frame_shift_ms:None\n",
208
+ " | > frame_length_ms:None\n",
209
+ " | > ref_level_db:20\n",
210
+ " | > fft_size:1024\n",
211
+ " | > power:1.5\n",
212
+ " | > preemphasis:0.0\n",
213
+ " | > griffin_lim_iters:60\n",
214
+ " | > signal_norm:True\n",
215
+ " | > symmetric_norm:True\n",
216
+ " | > mel_fmin:50.0\n",
217
+ " | > mel_fmax:7600.0\n",
218
+ " | > pitch_fmin:1.0\n",
219
+ " | > pitch_fmax:640.0\n",
220
+ " | > spec_gain:1.0\n",
221
+ " | > stft_pad_mode:reflect\n",
222
+ " | > max_norm:4.0\n",
223
+ " | > clip_norm:True\n",
224
+ " | > do_trim_silence:True\n",
225
+ " | > trim_db:60\n",
226
+ " | > do_sound_norm:False\n",
227
+ " | > do_amp_to_db_linear:True\n",
228
+ " | > do_amp_to_db_mel:True\n",
229
+ " | > do_rms_norm:False\n",
230
+ " | > db_level:None\n",
231
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
232
+ " | > base:10\n",
233
+ " | > hop_length:256\n",
234
+ " | > win_length:1024\n",
235
+ " > Generator Model: univnet_generator\n",
236
+ " > Discriminator Model: univnet_discriminator\n",
237
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
238
+ "language: \n",
239
+ "speaker: \n",
240
+ "Using original voice\n",
241
+ " > Text splitted to sentences.\n",
242
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
243
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
244
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
245
+ " > Processing time: 2.931999921798706\n",
246
+ " > Real-time factor: 0.3375093879242267\n"
247
+ ]
248
+ }
249
+ ],
250
+ "source": [
251
+ "title = \"\"\n",
252
+ "description = \"\"\"\"\"\"\n",
253
+ "article = \"\"\"\"\"\"\n",
254
+ "\n",
255
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
256
+ "GPU = device == \"cuda\"\n",
257
+ "INT16MAX = np.iinfo(np.int16).max\n",
258
+ "\n",
259
+ "model_ids = ModelManager(verbose=False).list_models()\n",
260
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
261
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
262
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
263
+ "examples_pt = 'examples'\n",
264
+ "allowed_extentions = ['.mp3', '.wav']\n",
265
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
266
+ "verse = \"\"\"Mary had a little lamb,\n",
267
+ "Its fleece was white as snow.\n",
268
+ "Everywhere the child went,\n",
269
+ "The little lamb was sure to go.\"\"\"\n",
270
+ "\n",
271
+ "\n",
272
+ "\n",
273
+ "def on_model_tts_select(model_name, tts_var):\n",
274
+ " if tts_var is None or tts_var.model_name != model_name:\n",
275
+ " print(f'Loading TTS model from {model_name}')\n",
276
+ " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
277
+ " else:\n",
278
+ " print(f'Passing through TTS model {tts_var.model_name}')\n",
279
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
280
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
281
+ " language = languages[0]\n",
282
+ " speaker = speakers[0]\n",
283
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
284
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
285
+ "\n",
286
+ "\n",
287
+ "def on_model_vc_select(model_name, vc_var):\n",
288
+ " if vc_var is None or vc_var.model_name != model_name:\n",
289
+ " print(f'Loading voice conversion model from {model_name}')\n",
290
+ " vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
291
+ " else:\n",
292
+ " print(f'Passing through voice conversion model {vc_var.model_name}')\n",
293
+ " return vc_var\n",
294
+ "\n",
295
+ "\n",
296
+ "def on_voicedropdown(x):\n",
297
+ " return examples[x]\n",
298
+ "\n",
299
+ "\n",
300
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
301
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
302
+ " return (16000, np.zeros(0).astype(np.int16))\n",
303
+ " \n",
304
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
305
+ " if tts_model.is_multi_speaker:\n",
306
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
307
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
308
+ " \n",
309
+ " language = None if language == '' else language\n",
310
+ " speaker = None if speaker == '' else speaker\n",
311
+ " if use_original_voice:\n",
312
+ " print('Using original voice')\n",
313
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
314
+ " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
315
+ " print('voice cloning with the tts')\n",
316
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
317
+ " else:\n",
318
+ " print('voice cloning with the voice conversion model')\n",
319
+ " speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
320
+ "\n",
321
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
322
+ " return (sample_rate, speech)\n",
323
+ "\n",
324
+ "\n",
325
+ "def voice_clone(vc_model, source_wav, target_wav):\n",
326
+ " print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
327
+ " sample_rate = vc_model.voice_converter.output_sample_rate\n",
328
+ " if vc_model is None or source_wav is None or target_wav is None:\n",
329
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
330
+ "\n",
331
+ " speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
332
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
333
+ " return (sample_rate, speech)\n",
334
+ "\n",
335
+ "\n",
336
+ "with gr.Blocks() as demo:\n",
337
+ " tts_model = gr.State(None)\n",
338
+ " vc_model = gr.State(None)\n",
339
+ " def activate(*args):\n",
340
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
341
+ " def deactivate(*args):\n",
342
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
343
+ "\n",
344
+ " gr.Markdown(description)\n",
345
+ "\n",
346
+ " with gr.Row(equal_height=True):\n",
347
+ " with gr.Column(scale=5, min_width=50):\n",
348
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
349
+ " with gr.Column(scale=1, min_width=10):\n",
350
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
351
+ " with gr.Column(scale=1, min_width=10):\n",
352
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
353
+ " with gr.Column(scale=5, min_width=50):\n",
354
+ " with gr.Row(equal_height=True):\n",
355
+ "# model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
356
+ " model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
357
+ " \n",
358
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
359
+ " gr.Markdown(\"Upload target voice...\")\n",
360
+ " with gr.Row(equal_height=True):\n",
361
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
362
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
363
+ "\n",
364
+ " with gr.Row(equal_height=True):\n",
365
+ " with gr.Column(scale=2):\n",
366
+ " with gr.Row(equal_height=True):\n",
367
+ " with gr.Column():\n",
368
+ " text_to_convert = gr.Textbox(verse)\n",
369
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
370
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
371
+ " with gr.Row(equal_height=True):\n",
372
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
373
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
374
+ " with gr.Row(equal_height=True):\n",
375
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
376
+ " \n",
377
+ " # actions\n",
378
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
379
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
380
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
381
+ " model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
382
+ " then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
383
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
384
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
385
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
386
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
387
+ " \n",
388
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
389
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
390
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
391
+ " outputs=speech).\\\n",
392
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
393
+ "\n",
394
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
395
+ " then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
396
+ " then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
397
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
398
+ " \n",
399
+ " gr.HTML(article)\n",
400
+ "demo.launch(share=False)"
401
+ ]
402
+ }
403
+ ],
404
+ "metadata": {
405
+ "kernelspec": {
406
+ "display_name": "Python 3",
407
+ "language": "python",
408
+ "name": "python3"
409
+ },
410
+ "language_info": {
411
+ "codemirror_mode": {
412
+ "name": "ipython",
413
+ "version": 3
414
+ },
415
+ "file_extension": ".py",
416
+ "mimetype": "text/x-python",
417
+ "name": "python",
418
+ "nbconvert_exporter": "python",
419
+ "pygments_lexer": "ipython3",
420
+ "version": "3.7.9"
421
+ }
422
+ },
423
+ "nbformat": 4,
424
+ "nbformat_minor": 5
425
+ }
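
Both notebooks hand audio to gr.Audio(type='numpy') as a (sample rate, int16 array) pair by scaling the float waveform with np.iinfo(np.int16).max. A self-contained sketch of that conversion; the clipping step is an addition here (samples slightly outside [-1, 1] would otherwise wrap on the cast), not part of the committed code:

import numpy as np

INT16MAX = np.iinfo(np.int16).max  # 32767

def float_to_int16(speech):
    # Scale a float waveform in [-1, 1] to int16, as the notebooks do,
    # but clip first so out-of-range samples cannot overflow.
    speech = np.clip(np.asarray(speech, dtype=np.float32), -1.0, 1.0)
    return (speech * INT16MAX).astype(np.int16)

# Example: a 440 Hz tone at 22050 Hz, shaped like the notebooks' return value
sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
audio = (sr, float_to_int16(0.5 * np.sin(2 * np.pi * 440 * t)))
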
app.bak.py ADDED
@@ -0,0 +1,160 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from pathlib import Path
6
+
7
+ from TTS.api import TTS
8
+ from TTS.utils.manage import ModelManager
9
+
10
+
11
+ title = ""
12
+ description = """"""
13
+ article = """"""
14
+
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ GPU = device == "cuda"
17
+ INT16MAX = np.iinfo(np.int16).max
18
+
19
+ model_ids = ModelManager(verbose=False).list_models()
20
+ model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
21
+ model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
22
+ model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
23
+ examples_pt = 'examples'
24
+ allowed_extentions = ['.mp3', '.wav']
25
+ examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
26
+ verse = """Mary had a little lamb,
27
+ Its fleece was white as snow.
28
+ Everywhere the child went,
29
+ The little lamb was sure to go."""
30
+
31
+
32
+
33
+ def on_model_tts_select(model_name, tts_var):
34
+ if tts_var is None or tts_var.model_name != model_name:
35
+ print(f'Loading TTS model from {model_name}')
36
+ tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
37
+ else:
38
+ print(f'Passing through TTS model {tts_var.model_name}')
39
+ languages = tts_var.languages if tts_var.is_multi_lingual else ['']
40
+ speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting
41
+ language = languages[0]
42
+ speaker = speakers[0]
43
+ return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
44
+ gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
45
+
46
+
47
+ def on_model_vc_select(model_name, vc_var):
48
+ if vc_var is None or vc_var.model_name != model_name:
49
+ print(f'Loading voice conversion model from {model_name}')
50
+ vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
51
+ else:
52
+ print(f'Passing through voice conversion model {vc_var.model_name}')
53
+ return vc_var
54
+
55
+
56
+ def on_voicedropdown(x):
57
+ return examples[x]
58
+
59
+
60
+ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
61
+ if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
62
+ return (16000, np.zeros(0).astype(np.int16))
63
+
64
+ sample_rate = tts_model.synthesizer.output_sample_rate
65
+ if tts_model.is_multi_speaker:
66
+ speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting
67
+ print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
68
+
69
+ language = None if language == '' else language
70
+ speaker = None if speaker == '' else speaker
71
+ if use_original_voice:
72
+ print('Using original voice')
73
+ speech = tts_model.tts(text, language=language, speaker=speaker)
74
+ elif tts_model.synthesizer.tts_model.speaker_manager:
75
+ print('voice cloning with the tts')
76
+ speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
77
+ else:
78
+ print('voice cloning with the voice conversion model')
79
+ speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
80
+
81
+ speech = (np.array(speech) * INT16MAX).astype(np.int16)
82
+ return (sample_rate, speech)
83
+
84
+
85
+ def voice_clone(vc_model, source_wav, target_wav):
86
+ print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
87
+ sample_rate = vc_model.voice_converter.output_sample_rate
88
+ if vc_model is None or source_wav is None or target_wav is None:
89
+ return (sample_rate, np.zeros(0).astype(np.int16))
90
+
91
+ speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
92
+ speech = (np.array(speech) * INT16MAX).astype(np.int16)
93
+ return (sample_rate, speech)
94
+
95
+
96
+ with gr.Blocks() as demo:
97
+ tts_model = gr.State(None)
98
+ vc_model = gr.State(None)
99
+ def activate(*args):
100
+ return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
101
+ def deactivate(*args):
102
+ return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
103
+
104
+ gr.Markdown(description)
105
+
106
+ with gr.Row(equal_height=True):
107
+ with gr.Column(scale=5, min_width=50):
108
+ model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
109
+ with gr.Column(scale=1, min_width=10):
110
+ language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
111
+ with gr.Column(scale=1, min_width=10):
112
+ speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
113
+ with gr.Column(scale=5, min_width=50):
114
+ with gr.Row(equal_height=True):
115
+ # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
116
+ model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
117
+
118
+ with gr.Accordion("Target voice", open=False) as accordion:
119
+ gr.Markdown("Upload target voice...")
120
+ with gr.Row(equal_height=True):
121
+ voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
122
+ voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
123
+
124
+ with gr.Row(equal_height=True):
125
+ with gr.Column(scale=2):
126
+ with gr.Row(equal_height=True):
127
+ with gr.Column():
128
+ text_to_convert = gr.Textbox(verse)
129
+ orig_voice = gr.Checkbox(label='Use original voice')
130
+ voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
131
+ with gr.Row(equal_height=True):
132
+ button_text = gr.Button('Text to speech', interactive=True)
133
+ button_audio = gr.Button('Convert audio', interactive=True)
134
+ with gr.Row(equal_height=True):
135
+ speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)
136
+
137
+ # actions
138
+ model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
139
+ then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
140
+ then(activate, [button_text, button_audio], [button_text, button_audio])
141
+ model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
142
+ then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
143
+ then(activate, [button_text, button_audio], [button_text, button_audio])
144
+ voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
145
+ then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
146
+ then(activate, [button_text, button_audio], [button_text, button_audio])
147
+
148
+ button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
149
+ then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
150
+ then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
151
+ outputs=speech).\
152
+ then(activate, [button_text, button_audio], [button_text, button_audio])
153
+
154
+ button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
155
+ then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
156
+ then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
157
+ then(activate, [button_text, button_audio], [button_text, button_audio])
158
+
159
+ gr.HTML(article)
160
+ demo.launch(share=False)
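
One ordering bug is worth flagging in voice_clone (it appears in both Coqui.ai.ipynb and app.bak.py above): vc_model.model_name and vc_model.voice_converter are dereferenced before the `vc_model is None` guard, so a missing model raises AttributeError instead of returning the empty clip. A reordered sketch, assuming a placeholder 16 kHz rate for the empty return since no model is available at that point:

import numpy as np

INT16MAX = np.iinfo(np.int16).max

def voice_clone_fixed(vc_model, source_wav, target_wav):
    # Guard first; 16000 is an arbitrary placeholder rate for the empty
    # clip (an assumption, not taken from the committed code).
    if vc_model is None or source_wav is None or target_wav is None:
        return (16000, np.zeros(0, dtype=np.int16))
    sample_rate = vc_model.voice_converter.output_sample_rate
    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    return (sample_rate, (np.array(speech) * INT16MAX).astype(np.int16))
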
app.py CHANGED
@@ -13,16 +13,40 @@ title = ""
 description = """"""
 article = """"""
 
+class TTS_local(TTS):
+    def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):
+        super().__init__(
+            model_name=None,
+            model_path=None,
+            config_path=None,
+            vocoder_path=None,
+            vocoder_config_path=None,
+            progress_bar=progress_bar,
+            gpu=False,
+        )
+        self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)
+        if model_name is not None:
+            if "tts_models" in model_name or "coqui_studio" in model_name:
+                self.load_tts_model_by_name(model_name, gpu)
+            elif "voice_conversion_models" in model_name:
+                self.load_vc_model_by_name(model_name, gpu)
+
+
 device = "cuda" if torch.cuda.is_available() else "cpu"
 GPU = device == "cuda"
 INT16MAX = np.iinfo(np.int16).max
-VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)
+MODEL_DIR = 'C:/Users/Torch/AppData/Local'
+MANAGER = ModelManager(verbose=False)
+
+model_ids = MANAGER.list_models()
+local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]
+model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
+model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
+model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
 
+VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24',
+                     output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)
 
-model_ids = ModelManager(verbose=False).list_models()
-model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
-model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
-model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
 examples_pt = 'examples'
 allowed_extentions = ['.mp3', '.wav']
 examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
@@ -81,6 +105,7 @@ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_
         # Lazy code... save it to a temp file to resample it while reading it for VC
         tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)
         speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)
+        sample_rate = VC_MODEL.voice_converter.output_sample_rate
 
 
     speech = (np.array(speech) * INT16MAX).astype(np.int16)
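
The new local_model_ids line above recovers model ids from ModelManager's on-disk layout: each model is cached under <output_prefix>/tts/ with '/' replaced by '--' in the directory name (compare the tts/voice_conversion_models--multilingual--vctk--freevc24 folder below). A standalone sketch of that mapping, using the same MODEL_DIR as app.py:

from pathlib import Path

MODEL_DIR = 'C:/Users/Torch/AppData/Local'

def local_model_ids(model_dir=MODEL_DIR):
    # 'voice_conversion_models--multilingual--vctk--freevc24'
    #   -> 'voice_conversion_models/multilingual/vctk/freevc24'
    return [p.name.replace('--', '/')
            for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir()]

app.py additionally filters this list against MANAGER.list_models(), so only directories that correspond to known model ids survive.
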
tts/voice_conversion_models--multilingual--vctk--freevc24/._config.json ADDED
Binary file (386 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/._model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa468ed77a9726751b4d321242e069c77dbcd8ecb2e30a212dc0f38f69b852a
3
+ size 230
tts/voice_conversion_models--multilingual--vctk--freevc24/._voice_conversion_models--multilingual--vctk--freevc24 ADDED
Binary file (330 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/._voice_conversion_models--multilingual--vctk--freevc24 ADDED
Binary file (330 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/voice_conversion_models--multilingual--vctk--freevc24/._config.json ADDED
Binary file (386 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/voice_conversion_models--multilingual--vctk--freevc24/._model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa468ed77a9726751b4d321242e069c77dbcd8ecb2e30a212dc0f38f69b852a
3
+ size 230
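
The `._*` files and the `__MACOSX/` directory committed alongside the model are macOS zip-extraction artifacts (AppleDouble resource forks), not model data. A purely illustrative sweep that would list such cruft before committing; nothing like it exists in this repo:

from pathlib import Path

def macos_cruft(root='tts'):
    # AppleDouble companions start with '._'; '__MACOSX' holds zip copies of them.
    return [p for p in Path(root).rglob('*')
            if p.name.startswith('._') or '__MACOSX' in p.parts]
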
tts/voice_conversion_models--multilingual--vctk--freevc24/config.json ADDED
@@ -0,0 +1,204 @@
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 32,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": [
30
+ 1000,
31
+ 1000
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": null,
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "freevc",
52
+ "num_loader_workers": 0,
53
+ "num_eval_loader_workers": 0,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "max_wav_value": 32768.0,
57
+ "input_sample_rate": 16000,
58
+ "output_sample_rate": 24000,
59
+ "filter_length": 1280,
60
+ "hop_length": 320,
61
+ "win_length": 1280,
62
+ "n_mel_channels": 80,
63
+ "mel_fmin": 0.0,
64
+ "mel_fmax": null
65
+ },
66
+ "batch_group_size": 0,
67
+ "loss_masking": null,
68
+ "min_audio_len": 1,
69
+ "max_audio_len": Infinity,
70
+ "min_text_len": 1,
71
+ "max_text_len": Infinity,
72
+ "compute_f0": false,
73
+ "compute_energy": false,
74
+ "compute_linear_spec": true,
75
+ "precompute_num_workers": 0,
76
+ "start_by_longest": false,
77
+ "shuffle": false,
78
+ "drop_last": false,
79
+ "datasets": [
80
+ {
81
+ "formatter": "",
82
+ "dataset_name": "",
83
+ "path": "",
84
+ "meta_file_train": "",
85
+ "ignored_speakers": null,
86
+ "language": "",
87
+ "phonemizer": "",
88
+ "meta_file_val": "",
89
+ "meta_file_attn_mask": ""
90
+ }
91
+ ],
92
+ "test_sentences": [
93
+ [
94
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
95
+ ],
96
+ [
97
+ "Be a voice, not an echo."
98
+ ],
99
+ [
100
+ "I'm sorry Dave. I'm afraid I can't do that."
101
+ ],
102
+ [
103
+ "This cake is great. It's so delicious and moist."
104
+ ],
105
+ [
106
+ "Prior to November 22, 1963."
107
+ ]
108
+ ],
109
+ "eval_split_max_size": null,
110
+ "eval_split_size": 0.01,
111
+ "use_speaker_weighted_sampler": false,
112
+ "speaker_weighted_sampler_alpha": 1.0,
113
+ "use_language_weighted_sampler": false,
114
+ "language_weighted_sampler_alpha": 1.0,
115
+ "use_length_weighted_sampler": false,
116
+ "length_weighted_sampler_alpha": 1.0,
117
+ "model_args": {
118
+ "spec_channels": 641,
119
+ "inter_channels": 192,
120
+ "hidden_channels": 192,
121
+ "filter_channels": 768,
122
+ "n_heads": 2,
123
+ "n_layers": 6,
124
+ "kernel_size": 3,
125
+ "p_dropout": 0.1,
126
+ "resblock": "1",
127
+ "resblock_kernel_sizes": [
128
+ 3,
129
+ 7,
130
+ 11
131
+ ],
132
+ "resblock_dilation_sizes": [
133
+ [
134
+ 1,
135
+ 3,
136
+ 5
137
+ ],
138
+ [
139
+ 1,
140
+ 3,
141
+ 5
142
+ ],
143
+ [
144
+ 1,
145
+ 3,
146
+ 5
147
+ ]
148
+ ],
149
+ "upsample_rates": [
150
+ 10,
151
+ 6,
152
+ 4,
153
+ 2
154
+ ],
155
+ "upsample_initial_channel": 512,
156
+ "upsample_kernel_sizes": [
157
+ 16,
158
+ 16,
159
+ 4,
160
+ 4
161
+ ],
162
+ "n_layers_q": 3,
163
+ "use_spectral_norm": false,
164
+ "gin_channels": 256,
165
+ "ssl_dim": 1024,
166
+ "use_spk": true,
167
+ "num_spks": 0,
168
+ "segment_size": 8960
169
+ },
170
+ "lr_gen": 0.0002,
171
+ "lr_disc": 0.0002,
172
+ "lr_scheduler_gen": "ExponentialLR",
173
+ "lr_scheduler_gen_params": {
174
+ "gamma": 0.999875,
175
+ "last_epoch": -1
176
+ },
177
+ "lr_scheduler_disc": "ExponentialLR",
178
+ "lr_scheduler_disc_params": {
179
+ "gamma": 0.999875,
180
+ "last_epoch": -1
181
+ },
182
+ "kl_loss_alpha": 1.0,
183
+ "disc_loss_alpha": 1.0,
184
+ "gen_loss_alpha": 1.0,
185
+ "feat_loss_alpha": 1.0,
186
+ "mel_loss_alpha": 45.0,
187
+ "dur_loss_alpha": 1.0,
188
+ "speaker_encoder_loss_alpha": 1.0,
189
+ "return_wav": true,
190
+ "use_weighted_sampler": false,
191
+ "weighted_sampler_attrs": {},
192
+ "weighted_sampler_multipliers": {},
193
+ "r": 1,
194
+ "add_blank": true,
195
+ "num_speakers": 0,
196
+ "use_speaker_embedding": false,
197
+ "speakers_file": null,
198
+ "speaker_embedding_channels": 256,
199
+ "language_ids_file": null,
200
+ "use_language_embedding": false,
201
+ "use_d_vector_file": false,
202
+ "d_vector_file": null,
203
+ "d_vector_dim": null
204
+ }
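
Note the "audio" block above: FreeVC consumes 16 kHz input but emits 24 kHz audio, which is why app.py reads VC_MODEL.voice_converter.output_sample_rate instead of hard-coding a rate. A sketch that pulls both values from this config; the path assumes the layout committed here, and Python's json module happens to accept the bare Infinity values:

import json

cfg_path = 'tts/voice_conversion_models--multilingual--vctk--freevc24/config.json'
with open(cfg_path) as f:
    audio_cfg = json.load(f)['audio']
print(audio_cfg['input_sample_rate'], audio_cfg['output_sample_rate'])  # 16000 24000
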
tts/voice_conversion_models--multilingual--vctk--freevc24/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d4ce44e7c803d675be1984b174e0f7bf05ce937419f19a818877e83f197007
3
+ size 1425242419
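
model.pth above is stored as a Git LFS pointer: three plain-text key/value lines (spec version, sha256 oid, byte size) stand in for the roughly 1.4 GB payload, which lives in LFS storage. A tiny parser sketch for that pointer format:

def parse_lfs_pointer(text):
    # Each pointer line is '<key> <value>'.
    fields = dict(line.split(' ', 1) for line in text.strip().splitlines())
    return fields['version'], fields['oid'], int(fields['size'])

ptr = """version https://git-lfs.github.com/spec/v1
oid sha256:18d4ce44e7c803d675be1984b174e0f7bf05ce937419f19a818877e83f197007
size 1425242419"""
assert parse_lfs_pointer(ptr)[2] == 1425242419
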