rakhlin commited on
Commit
f8f0961
·
1 Parent(s): 64a316c

Upload folder using huggingface_hub

Browse files
.ipynb_checkpoints/Coqui.ai-Copy1-checkpoint.ipynb CHANGED
@@ -2,8 +2,8 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "14a326bb",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -12,6 +12,7 @@
12
  "import torch\n",
13
  "import torch.nn.functional as F\n",
14
  "from pathlib import Path\n",
 
15
  "\n",
16
  "from TTS.api import TTS\n",
17
  "from TTS.utils.manage import ModelManager"
@@ -19,8 +20,8 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 17,
23
- "id": "2cfd77d1",
24
  "metadata": {
25
  "scrolled": false
26
  },
@@ -34,8 +35,8 @@
34
  " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
35
  " > Using model: freevc\n",
36
  " > Loading pretrained speaker encoder model ...\n",
37
- "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
38
- "Running on local URL: http://127.0.0.1:7873\n",
39
  "\n",
40
  "To create a public link, set `share=True` in `launch()`.\n"
41
  ]
@@ -43,7 +44,7 @@
43
  {
44
  "data": {
45
  "text/html": [
46
- "<div><iframe src=\"http://127.0.0.1:7873/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
47
  ],
48
  "text/plain": [
49
  "<IPython.core.display.HTML object>"
@@ -56,7 +57,7 @@
56
  "data": {
57
  "text/plain": []
58
  },
59
- "execution_count": 17,
60
  "metadata": {},
61
  "output_type": "execute_result"
62
  },
@@ -151,46 +152,985 @@
151
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
152
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
153
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
154
- " > Processing time: 3.2769999504089355\n",
155
- " > Real-time factor: 0.37722315040572285\n",
156
- " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
157
- " > Model's license - MIT\n",
158
- " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
159
- " > Using model: freevc\n",
160
- " > Loading pretrained speaker encoder model ...\n",
161
- "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
162
- "model: voice_conversion_models/multilingual/vctk/freevc24\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  "language: \n",
164
  "speaker: \n",
165
  "voice cloning with the voice conversion model\n",
166
  " > Text splitted to sentences.\n",
167
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
168
- " > Processing time: 2.8229997158050537\n",
169
- " > Real-time factor: 0.3249621185552823\n",
170
- "model: voice_conversion_models/multilingual/vctk/freevc24\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  "language: \n",
172
  "speaker: \n",
173
  "voice cloning with the voice conversion model\n",
174
  " > Text splitted to sentences.\n",
175
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
176
- " > Processing time: 2.858999729156494\n",
177
- " > Real-time factor: 0.32910616452921765\n",
178
- "model: voice_conversion_models/multilingual/vctk/freevc24\n",
179
  "language: \n",
180
  "speaker: \n",
181
  "voice cloning with the voice conversion model\n",
182
  " > Text splitted to sentences.\n",
183
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
184
- " > Processing time: 2.8419997692108154\n",
185
- " > Real-time factor: 0.3271492592669274\n",
186
  "model: voice_conversion_models/multilingual/vctk/freevc24\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  "language: \n",
188
  "speaker: \n",
189
  "voice cloning with the voice conversion model\n",
190
  " > Text splitted to sentences.\n",
191
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
192
- " > Processing time: 2.922999858856201\n",
193
- " > Real-time factor: 0.3364733695695124\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  ]
195
  },
196
  {
@@ -210,21 +1150,17 @@
210
  " return await future\n",
211
  " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
212
  " result = context.run(func, *args)\n",
213
- "TypeError: voice_clone() takes 2 positional arguments but 3 were given\n",
214
- "Traceback (most recent call last):\n",
215
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\routes.py\", line 434, in run_predict\n",
216
- " event_data=event_data,\n",
217
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1324, in process_api\n",
218
- " fn_index, inputs, iterator, request, event_id, event_data\n",
219
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gradio\\blocks.py\", line 1052, in call_function\n",
220
- " fn, *processed_input, limiter=self.limiter\n",
221
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\to_thread.py\", line 34, in run_sync\n",
222
- " func, *args, cancellable=cancellable, limiter=limiter\n",
223
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 877, in run_sync_in_worker_thread\n",
224
- " return await future\n",
225
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
226
- " result = context.run(func, *args)\n",
227
- "TypeError: voice_clone() takes 2 positional arguments but 3 were given\n"
228
  ]
229
  }
230
  ],
@@ -269,10 +1205,10 @@
269
  "def voice_clone(source_wav, target_wav):\n",
270
  " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
271
  " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
272
- " if vc_model is None or source_wav is None or target_wav is None:\n",
273
  " return (sample_rate, np.zeros(0).astype(np.int16))\n",
274
  "\n",
275
- " speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
276
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
277
  " return (sample_rate, speech)\n",
278
  "\n",
@@ -291,12 +1227,17 @@
291
  " if use_original_voice:\n",
292
  " print('Using original voice')\n",
293
  " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
294
- " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
295
  " print('voice cloning with the tts')\n",
296
  " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
297
  " else:\n",
298
  " print('voice cloning with the voice conversion model')\n",
299
- " speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
 
 
 
 
 
300
  "\n",
301
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
302
  " return (sample_rate, speech)\n",
@@ -304,7 +1245,7 @@
304
  "\n",
305
  "with gr.Blocks() as demo:\n",
306
  " tts_model = gr.State(None)\n",
307
- " vc_model = gr.State(None)\n",
308
  " def activate(*args):\n",
309
  " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
310
  " def deactivate(*args):\n",
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "156133fe",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
12
  "import torch\n",
13
  "import torch.nn.functional as F\n",
14
  "from pathlib import Path\n",
15
+ "import tempfile\n",
16
  "\n",
17
  "from TTS.api import TTS\n",
18
  "from TTS.utils.manage import ModelManager"
 
20
  },
21
  {
22
  "cell_type": "code",
23
+ "execution_count": 6,
24
+ "id": "5e5af800",
25
  "metadata": {
26
  "scrolled": false
27
  },
 
35
  " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
36
  " > Using model: freevc\n",
37
  " > Loading pretrained speaker encoder model ...\n",
38
+ "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
39
+ "Running on local URL: http://127.0.0.1:7863\n",
40
  "\n",
41
  "To create a public link, set `share=True` in `launch()`.\n"
42
  ]
 
44
  {
45
  "data": {
46
  "text/html": [
47
+ "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
48
  ],
49
  "text/plain": [
50
  "<IPython.core.display.HTML object>"
 
57
  "data": {
58
  "text/plain": []
59
  },
60
+ "execution_count": 6,
61
  "metadata": {},
62
  "output_type": "execute_result"
63
  },
 
152
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
153
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
154
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
155
+ " > Processing time: 3.3410003185272217\n",
156
+ " > Real-time factor: 0.38459038289093944\n",
157
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
+ "language: \n",
159
+ "speaker: \n",
160
+ "voice cloning with the voice conversion model\n",
161
+ " > Text splitted to sentences.\n",
162
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
163
+ " > Processing time: 2.9179999828338623\n",
164
+ " > Real-time factor: 0.3358978221135079\n",
165
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
166
+ " > Model's license - apache 2.0\n",
167
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
168
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
169
+ " > Model's license - apache 2.0\n",
170
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
171
+ " > Using model: Tacotron2\n",
172
+ " > Setting up Audio Processor...\n",
173
+ " | > sample_rate:22050\n",
174
+ " | > resample:False\n",
175
+ " | > num_mels:80\n",
176
+ " | > log_func:np.log10\n",
177
+ " | > min_level_db:-100\n",
178
+ " | > frame_shift_ms:None\n",
179
+ " | > frame_length_ms:None\n",
180
+ " | > ref_level_db:20\n",
181
+ " | > fft_size:1024\n",
182
+ " | > power:1.5\n",
183
+ " | > preemphasis:0.0\n",
184
+ " | > griffin_lim_iters:60\n",
185
+ " | > signal_norm:True\n",
186
+ " | > symmetric_norm:True\n",
187
+ " | > mel_fmin:50.0\n",
188
+ " | > mel_fmax:7600.0\n",
189
+ " | > pitch_fmin:0.0\n",
190
+ " | > pitch_fmax:640.0\n",
191
+ " | > spec_gain:1.0\n",
192
+ " | > stft_pad_mode:reflect\n",
193
+ " | > max_norm:4.0\n",
194
+ " | > clip_norm:True\n",
195
+ " | > do_trim_silence:True\n",
196
+ " | > trim_db:60\n",
197
+ " | > do_sound_norm:False\n",
198
+ " | > do_amp_to_db_linear:True\n",
199
+ " | > do_amp_to_db_mel:True\n",
200
+ " | > do_rms_norm:False\n",
201
+ " | > db_level:None\n",
202
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
203
+ " | > base:10\n",
204
+ " | > hop_length:256\n",
205
+ " | > win_length:1024\n",
206
+ " > Model's reduction rate `r` is set to: 2\n",
207
+ " > Vocoder Model: univnet\n",
208
+ " > Setting up Audio Processor...\n",
209
+ " | > sample_rate:22050\n",
210
+ " | > resample:False\n",
211
+ " | > num_mels:80\n",
212
+ " | > log_func:np.log10\n",
213
+ " | > min_level_db:-100\n",
214
+ " | > frame_shift_ms:None\n",
215
+ " | > frame_length_ms:None\n",
216
+ " | > ref_level_db:20\n",
217
+ " | > fft_size:1024\n",
218
+ " | > power:1.5\n",
219
+ " | > preemphasis:0.0\n",
220
+ " | > griffin_lim_iters:60\n",
221
+ " | > signal_norm:True\n",
222
+ " | > symmetric_norm:True\n",
223
+ " | > mel_fmin:50.0\n",
224
+ " | > mel_fmax:7600.0\n",
225
+ " | > pitch_fmin:1.0\n",
226
+ " | > pitch_fmax:640.0\n",
227
+ " | > spec_gain:1.0\n",
228
+ " | > stft_pad_mode:reflect\n",
229
+ " | > max_norm:4.0\n",
230
+ " | > clip_norm:True\n",
231
+ " | > do_trim_silence:True\n",
232
+ " | > trim_db:60\n",
233
+ " | > do_sound_norm:False\n",
234
+ " | > do_amp_to_db_linear:True\n",
235
+ " | > do_amp_to_db_mel:True\n",
236
+ " | > do_rms_norm:False\n",
237
+ " | > db_level:None\n",
238
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
239
+ " | > base:10\n",
240
+ " | > hop_length:256\n",
241
+ " | > win_length:1024\n",
242
+ " > Generator Model: univnet_generator\n",
243
+ " > Discriminator Model: univnet_discriminator\n",
244
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
245
+ "language: \n",
246
+ "speaker: \n",
247
+ "voice cloning with the voice conversion model\n",
248
+ " > Text splitted to sentences.\n",
249
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
250
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
251
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
252
+ " > Processing time: 3.021000385284424\n",
253
+ " > Real-time factor: 0.3477544400242312\n",
254
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
255
+ "language: \n",
256
+ "speaker: \n",
257
+ "voice cloning with the voice conversion model\n",
258
+ " > Text splitted to sentences.\n",
259
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
260
+ " > Processing time: 2.9099998474121094\n",
261
+ " > Real-time factor: 0.33497690776101013\n",
262
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
263
+ "language: \n",
264
+ "speaker: \n",
265
+ "voice cloning with the voice conversion model\n",
266
+ " > Text splitted to sentences.\n",
267
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
268
+ " > Processing time: 2.933000087738037\n",
269
+ " > Real-time factor: 0.33762451937136506\n",
270
+ " > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.\n",
271
+ " > Model's license - apache 2.0\n",
272
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
273
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
274
+ " > Model's license - apache 2.0\n",
275
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
276
+ " > Using model: Tacotron2\n",
277
+ " > Setting up Audio Processor...\n",
278
+ " | > sample_rate:22050\n",
279
+ " | > resample:False\n",
280
+ " | > num_mels:80\n",
281
+ " | > log_func:np.log\n",
282
+ " | > min_level_db:-100\n",
283
+ " | > frame_shift_ms:None\n",
284
+ " | > frame_length_ms:None\n",
285
+ " | > ref_level_db:20\n",
286
+ " | > fft_size:1024\n",
287
+ " | > power:1.5\n",
288
+ " | > preemphasis:0.0\n",
289
+ " | > griffin_lim_iters:60\n",
290
+ " | > signal_norm:False\n",
291
+ " | > symmetric_norm:True\n",
292
+ " | > mel_fmin:0\n",
293
+ " | > mel_fmax:8000.0\n",
294
+ " | > pitch_fmin:1.0\n",
295
+ " | > pitch_fmax:640.0\n",
296
+ " | > spec_gain:1.0\n",
297
+ " | > stft_pad_mode:reflect\n",
298
+ " | > max_norm:4.0\n",
299
+ " | > clip_norm:True\n",
300
+ " | > do_trim_silence:True\n",
301
+ " | > trim_db:60\n",
302
+ " | > do_sound_norm:False\n",
303
+ " | > do_amp_to_db_linear:True\n",
304
+ " | > do_amp_to_db_mel:True\n",
305
+ " | > do_rms_norm:False\n",
306
+ " | > db_level:None\n",
307
+ " | > stats_path:None\n",
308
+ " | > base:2.718281828459045\n",
309
+ " | > hop_length:256\n",
310
+ " | > win_length:1024\n",
311
+ " > Model's reduction rate `r` is set to: 1\n",
312
+ " > Vocoder Model: hifigan\n",
313
+ " > Setting up Audio Processor...\n",
314
+ " | > sample_rate:22050\n",
315
+ " | > resample:False\n",
316
+ " | > num_mels:80\n",
317
+ " | > log_func:np.log\n",
318
+ " | > min_level_db:-100\n",
319
+ " | > frame_shift_ms:None\n",
320
+ " | > frame_length_ms:None\n",
321
+ " | > ref_level_db:20\n",
322
+ " | > fft_size:1024\n",
323
+ " | > power:1.5\n",
324
+ " | > preemphasis:0.0\n",
325
+ " | > griffin_lim_iters:60\n",
326
+ " | > signal_norm:False\n",
327
+ " | > symmetric_norm:True\n",
328
+ " | > mel_fmin:0\n",
329
+ " | > mel_fmax:8000.0\n",
330
+ " | > pitch_fmin:1.0\n",
331
+ " | > pitch_fmax:640.0\n",
332
+ " | > spec_gain:1.0\n",
333
+ " | > stft_pad_mode:reflect\n",
334
+ " | > max_norm:4.0\n",
335
+ " | > clip_norm:True\n",
336
+ " | > do_trim_silence:False\n",
337
+ " | > trim_db:60\n",
338
+ " | > do_sound_norm:False\n",
339
+ " | > do_amp_to_db_linear:True\n",
340
+ " | > do_amp_to_db_mel:True\n",
341
+ " | > do_rms_norm:False\n",
342
+ " | > db_level:None\n",
343
+ " | > stats_path:None\n",
344
+ " | > base:2.718281828459045\n",
345
+ " | > hop_length:256\n",
346
+ " | > win_length:1024\n",
347
+ " > Generator Model: hifigan_generator\n",
348
+ " > Discriminator Model: hifigan_discriminator\n"
349
+ ]
350
+ },
351
+ {
352
+ "name": "stdout",
353
+ "output_type": "stream",
354
+ "text": [
355
+ "Removing weight norm...\n",
356
+ "model: tts_models/en/ljspeech/tacotron2-DDC\n",
357
+ "language: \n",
358
+ "speaker: \n",
359
+ "voice cloning with the voice conversion model\n",
360
+ " > Text splitted to sentences.\n",
361
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
362
+ " > Processing time: 4.28600001335144\n",
363
+ " > Real-time factor: 0.42371906516498953\n",
364
+ " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
365
+ " > Model's license - apache 2.0\n",
366
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
367
+ " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
368
+ " > Model's license - apache 2.0\n",
369
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
370
+ " > Using model: Tacotron2\n",
371
+ " > Setting up Audio Processor...\n",
372
+ " | > sample_rate:22050\n",
373
+ " | > resample:False\n",
374
+ " | > num_mels:80\n",
375
+ " | > log_func:np.log10\n",
376
+ " | > min_level_db:-10\n",
377
+ " | > frame_shift_ms:None\n",
378
+ " | > frame_length_ms:None\n",
379
+ " | > ref_level_db:0\n",
380
+ " | > fft_size:1024\n",
381
+ " | > power:1.8\n",
382
+ " | > preemphasis:0.99\n",
383
+ " | > griffin_lim_iters:60\n",
384
+ " | > signal_norm:True\n",
385
+ " | > symmetric_norm:True\n",
386
+ " | > mel_fmin:0\n",
387
+ " | > mel_fmax:8000.0\n",
388
+ " | > pitch_fmin:1.0\n",
389
+ " | > pitch_fmax:640.0\n",
390
+ " | > spec_gain:1.0\n",
391
+ " | > stft_pad_mode:reflect\n",
392
+ " | > max_norm:4.0\n",
393
+ " | > clip_norm:True\n",
394
+ " | > do_trim_silence:True\n",
395
+ " | > trim_db:60\n",
396
+ " | > do_sound_norm:False\n",
397
+ " | > do_amp_to_db_linear:True\n",
398
+ " | > do_amp_to_db_mel:True\n",
399
+ " | > do_rms_norm:False\n",
400
+ " | > db_level:None\n",
401
+ " | > stats_path:None\n",
402
+ " | > base:10\n",
403
+ " | > hop_length:256\n",
404
+ " | > win_length:1024\n",
405
+ " > Model's reduction rate `r` is set to: 2\n",
406
+ " > Vocoder Model: wavegrad\n",
407
+ "model: tts_models/en/ek1/tacotron2\n",
408
+ "language: \n",
409
+ "speaker: \n",
410
+ "voice cloning with the voice conversion model\n",
411
+ " > Text splitted to sentences.\n",
412
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
413
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
414
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
415
+ " > Processing time: 224.84099984169006\n",
416
+ " > Real-time factor: 29.51038122922182\n",
417
+ " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
418
+ " > Model's license - apache 2.0\n",
419
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
420
+ " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
421
+ " > Model's license - apache 2.0\n",
422
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
423
+ " > Using model: Tacotron2\n",
424
+ " > Setting up Audio Processor...\n",
425
+ " | > sample_rate:22050\n",
426
+ " | > resample:False\n",
427
+ " | > num_mels:80\n",
428
+ " | > log_func:np.log10\n",
429
+ " | > min_level_db:-10\n",
430
+ " | > frame_shift_ms:None\n",
431
+ " | > frame_length_ms:None\n",
432
+ " | > ref_level_db:0\n",
433
+ " | > fft_size:1024\n",
434
+ " | > power:1.8\n",
435
+ " | > preemphasis:0.99\n",
436
+ " | > griffin_lim_iters:60\n",
437
+ " | > signal_norm:True\n",
438
+ " | > symmetric_norm:True\n",
439
+ " | > mel_fmin:0\n",
440
+ " | > mel_fmax:8000.0\n",
441
+ " | > pitch_fmin:1.0\n",
442
+ " | > pitch_fmax:640.0\n",
443
+ " | > spec_gain:1.0\n",
444
+ " | > stft_pad_mode:reflect\n",
445
+ " | > max_norm:4.0\n",
446
+ " | > clip_norm:True\n",
447
+ " | > do_trim_silence:True\n",
448
+ " | > trim_db:60\n",
449
+ " | > do_sound_norm:False\n",
450
+ " | > do_amp_to_db_linear:True\n",
451
+ " | > do_amp_to_db_mel:True\n",
452
+ " | > do_rms_norm:False\n",
453
+ " | > db_level:None\n",
454
+ " | > stats_path:None\n",
455
+ " | > base:10\n",
456
+ " | > hop_length:256\n",
457
+ " | > win_length:1024\n",
458
+ " > Model's reduction rate `r` is set to: 2\n",
459
+ " > Vocoder Model: wavegrad\n",
460
+ "model: tts_models/en/ek1/tacotron2\n",
461
+ "language: \n",
462
+ "speaker: \n",
463
+ "voice cloning with the voice conversion model\n",
464
+ " > Text splitted to sentences.\n",
465
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
466
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
467
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
468
+ " > Processing time: 266.6489999294281\n",
469
+ " > Real-time factor: 34.99768124073744\n",
470
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
471
+ " > Model's license - apache 2.0\n",
472
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
473
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
474
+ " > Model's license - apache 2.0\n",
475
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
476
+ " > Using model: Tacotron2\n",
477
+ " > Setting up Audio Processor...\n",
478
+ " | > sample_rate:22050\n",
479
+ " | > resample:False\n",
480
+ " | > num_mels:80\n",
481
+ " | > log_func:np.log10\n",
482
+ " | > min_level_db:-100\n",
483
+ " | > frame_shift_ms:None\n",
484
+ " | > frame_length_ms:None\n",
485
+ " | > ref_level_db:20\n",
486
+ " | > fft_size:1024\n",
487
+ " | > power:1.5\n",
488
+ " | > preemphasis:0.0\n",
489
+ " | > griffin_lim_iters:60\n",
490
+ " | > signal_norm:True\n",
491
+ " | > symmetric_norm:True\n",
492
+ " | > mel_fmin:50.0\n",
493
+ " | > mel_fmax:7600.0\n",
494
+ " | > pitch_fmin:0.0\n",
495
+ " | > pitch_fmax:640.0\n",
496
+ " | > spec_gain:1.0\n",
497
+ " | > stft_pad_mode:reflect\n",
498
+ " | > max_norm:4.0\n",
499
+ " | > clip_norm:True\n",
500
+ " | > do_trim_silence:True\n",
501
+ " | > trim_db:60\n",
502
+ " | > do_sound_norm:False\n",
503
+ " | > do_amp_to_db_linear:True\n",
504
+ " | > do_amp_to_db_mel:True\n",
505
+ " | > do_rms_norm:False\n",
506
+ " | > db_level:None\n",
507
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
508
+ " | > base:10\n",
509
+ " | > hop_length:256\n",
510
+ " | > win_length:1024\n",
511
+ " > Model's reduction rate `r` is set to: 2\n",
512
+ " > Vocoder Model: univnet\n",
513
+ " > Setting up Audio Processor...\n",
514
+ " | > sample_rate:22050\n",
515
+ " | > resample:False\n",
516
+ " | > num_mels:80\n",
517
+ " | > log_func:np.log10\n",
518
+ " | > min_level_db:-100\n",
519
+ " | > frame_shift_ms:None\n",
520
+ " | > frame_length_ms:None\n",
521
+ " | > ref_level_db:20\n",
522
+ " | > fft_size:1024\n",
523
+ " | > power:1.5\n",
524
+ " | > preemphasis:0.0\n",
525
+ " | > griffin_lim_iters:60\n",
526
+ " | > signal_norm:True\n",
527
+ " | > symmetric_norm:True\n",
528
+ " | > mel_fmin:50.0\n",
529
+ " | > mel_fmax:7600.0\n",
530
+ " | > pitch_fmin:1.0\n",
531
+ " | > pitch_fmax:640.0\n",
532
+ " | > spec_gain:1.0\n",
533
+ " | > stft_pad_mode:reflect\n",
534
+ " | > max_norm:4.0\n",
535
+ " | > clip_norm:True\n",
536
+ " | > do_trim_silence:True\n",
537
+ " | > trim_db:60\n",
538
+ " | > do_sound_norm:False\n",
539
+ " | > do_amp_to_db_linear:True\n",
540
+ " | > do_amp_to_db_mel:True\n",
541
+ " | > do_rms_norm:False\n",
542
+ " | > db_level:None\n",
543
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
544
+ " | > base:10\n",
545
+ " | > hop_length:256\n",
546
+ " | > win_length:1024\n",
547
+ " > Generator Model: univnet_generator\n",
548
+ " > Discriminator Model: univnet_discriminator\n",
549
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
550
+ "language: \n",
551
+ "speaker: \n",
552
+ "voice cloning with the voice conversion model\n",
553
+ " > Text splitted to sentences.\n",
554
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
555
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
556
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
557
+ " > Processing time: 2.885999917984009\n",
558
+ " > Real-time factor: 0.3322142195933605\n",
559
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--speedy-speech\n",
560
+ " > Model's license - apache 2.0\n",
561
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
562
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
563
+ " > Model's license - apache 2.0\n",
564
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
565
+ " > Using model: speedy_speech\n",
566
+ " > Setting up Audio Processor...\n",
567
+ " | > sample_rate:22050\n",
568
+ " | > resample:False\n",
569
+ " | > num_mels:80\n",
570
+ " | > log_func:np.log\n",
571
+ " | > min_level_db:-100\n",
572
+ " | > frame_shift_ms:None\n",
573
+ " | > frame_length_ms:None\n",
574
+ " | > ref_level_db:20\n",
575
+ " | > fft_size:1024\n",
576
+ " | > power:1.5\n",
577
+ " | > preemphasis:0.0\n",
578
+ " | > griffin_lim_iters:60\n",
579
+ " | > signal_norm:False\n",
580
+ " | > symmetric_norm:True\n",
581
+ " | > mel_fmin:0\n",
582
+ " | > mel_fmax:8000.0\n",
583
+ " | > pitch_fmin:1.0\n",
584
+ " | > pitch_fmax:640.0\n",
585
+ " | > spec_gain:1.0\n",
586
+ " | > stft_pad_mode:reflect\n",
587
+ " | > max_norm:4.0\n",
588
+ " | > clip_norm:True\n",
589
+ " | > do_trim_silence:True\n",
590
+ " | > trim_db:60\n",
591
+ " | > do_sound_norm:False\n",
592
+ " | > do_amp_to_db_linear:True\n",
593
+ " | > do_amp_to_db_mel:True\n",
594
+ " | > do_rms_norm:False\n",
595
+ " | > db_level:None\n",
596
+ " | > stats_path:None\n",
597
+ " | > base:2.718281828459045\n",
598
+ " | > hop_length:256\n",
599
+ " | > win_length:1024\n",
600
+ " > Vocoder Model: hifigan\n",
601
+ " > Setting up Audio Processor...\n",
602
+ " | > sample_rate:22050\n",
603
+ " | > resample:False\n",
604
+ " | > num_mels:80\n",
605
+ " | > log_func:np.log\n",
606
+ " | > min_level_db:-100\n",
607
+ " | > frame_shift_ms:None\n",
608
+ " | > frame_length_ms:None\n",
609
+ " | > ref_level_db:20\n",
610
+ " | > fft_size:1024\n",
611
+ " | > power:1.5\n",
612
+ " | > preemphasis:0.0\n",
613
+ " | > griffin_lim_iters:60\n",
614
+ " | > signal_norm:False\n",
615
+ " | > symmetric_norm:True\n",
616
+ " | > mel_fmin:0\n",
617
+ " | > mel_fmax:8000.0\n",
618
+ " | > pitch_fmin:1.0\n",
619
+ " | > pitch_fmax:640.0\n",
620
+ " | > spec_gain:1.0\n",
621
+ " | > stft_pad_mode:reflect\n",
622
+ " | > max_norm:4.0\n",
623
+ " | > clip_norm:True\n",
624
+ " | > do_trim_silence:False\n",
625
+ " | > trim_db:60\n",
626
+ " | > do_sound_norm:False\n",
627
+ " | > do_amp_to_db_linear:True\n",
628
+ " | > do_amp_to_db_mel:True\n",
629
+ " | > do_rms_norm:False\n",
630
+ " | > db_level:None\n",
631
+ " | > stats_path:None\n",
632
+ " | > base:2.718281828459045\n",
633
+ " | > hop_length:256\n",
634
+ " | > win_length:1024\n",
635
+ " > Generator Model: hifigan_generator\n",
636
+ " > Discriminator Model: hifigan_discriminator\n",
637
+ "Removing weight norm...\n"
638
+ ]
639
+ },
640
+ {
641
+ "name": "stdout",
642
+ "output_type": "stream",
643
+ "text": [
644
+ "model: tts_models/en/ljspeech/speedy-speech\n",
645
+ "language: \n",
646
+ "speaker: \n",
647
+ "Using original voice\n",
648
+ " > Text splitted to sentences.\n",
649
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
650
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
651
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
652
+ " > Processing time: 0.9679999351501465\n",
653
+ " > Real-time factor: 0.11673301633083617\n",
654
+ "model: tts_models/en/ljspeech/speedy-speech\n",
655
+ "language: \n",
656
+ "speaker: \n",
657
+ "voice cloning with the voice conversion model\n",
658
+ " > Text splitted to sentences.\n",
659
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
660
+ " > Processing time: 0.9630000591278076\n",
661
+ " > Real-time factor: 0.11613007144605443\n",
662
+ " > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.\n",
663
+ " > Model's license - MPL\n",
664
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
665
+ " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n",
666
+ " > Model's license - MPL\n",
667
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
668
+ " > Using model: Tacotron2\n",
669
+ " > Setting up Audio Processor...\n",
670
+ " | > sample_rate:22050\n",
671
+ " | > resample:False\n",
672
+ " | > num_mels:80\n",
673
+ " | > log_func:np.log10\n",
674
+ " | > min_level_db:-100\n",
675
+ " | > frame_shift_ms:None\n",
676
+ " | > frame_length_ms:None\n",
677
+ " | > ref_level_db:20\n",
678
+ " | > fft_size:1024\n",
679
+ " | > power:1.5\n",
680
+ " | > preemphasis:0.0\n",
681
+ " | > griffin_lim_iters:60\n",
682
+ " | > signal_norm:True\n",
683
+ " | > symmetric_norm:True\n",
684
+ " | > mel_fmin:50.0\n",
685
+ " | > mel_fmax:7600.0\n",
686
+ " | > pitch_fmin:0.0\n",
687
+ " | > pitch_fmax:640.0\n",
688
+ " | > spec_gain:1.0\n",
689
+ " | > stft_pad_mode:reflect\n",
690
+ " | > max_norm:4.0\n",
691
+ " | > clip_norm:True\n",
692
+ " | > do_trim_silence:True\n",
693
+ " | > trim_db:60\n",
694
+ " | > do_sound_norm:False\n",
695
+ " | > do_amp_to_db_linear:True\n",
696
+ " | > do_amp_to_db_mel:True\n",
697
+ " | > do_rms_norm:False\n",
698
+ " | > db_level:None\n",
699
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DCA\\scale_stats.npy\n",
700
+ " | > base:10\n",
701
+ " | > hop_length:256\n",
702
+ " | > win_length:1024\n",
703
+ " > Model's reduction rate `r` is set to: 2\n",
704
+ " > Vocoder Model: multiband_melgan\n",
705
+ " > Setting up Audio Processor...\n",
706
+ " | > sample_rate:22050\n",
707
+ " | > resample:False\n",
708
+ " | > num_mels:80\n",
709
+ " | > log_func:np.log10\n",
710
+ " | > min_level_db:-100\n",
711
+ " | > frame_shift_ms:None\n",
712
+ " | > frame_length_ms:None\n",
713
+ " | > ref_level_db:0\n",
714
+ " | > fft_size:1024\n",
715
+ " | > power:1.5\n",
716
+ " | > preemphasis:0.0\n",
717
+ " | > griffin_lim_iters:60\n",
718
+ " | > signal_norm:True\n",
719
+ " | > symmetric_norm:True\n",
720
+ " | > mel_fmin:50.0\n",
721
+ " | > mel_fmax:7600.0\n",
722
+ " | > pitch_fmin:0.0\n",
723
+ " | > pitch_fmax:640.0\n",
724
+ " | > spec_gain:1.0\n",
725
+ " | > stft_pad_mode:reflect\n",
726
+ " | > max_norm:4.0\n",
727
+ " | > clip_norm:True\n",
728
+ " | > do_trim_silence:True\n",
729
+ " | > trim_db:60\n",
730
+ " | > do_sound_norm:False\n",
731
+ " | > do_amp_to_db_linear:True\n",
732
+ " | > do_amp_to_db_mel:True\n",
733
+ " | > do_rms_norm:False\n",
734
+ " | > db_level:None\n",
735
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--multiband-melgan\\scale_stats.npy\n",
736
+ " | > base:10\n",
737
+ " | > hop_length:256\n",
738
+ " | > win_length:1024\n",
739
+ " > Generator Model: multiband_melgan_generator\n",
740
+ " > Discriminator Model: melgan_multiscale_discriminator\n",
741
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
742
+ "language: \n",
743
+ "speaker: \n",
744
+ "Using original voice\n",
745
+ " > Text splitted to sentences.\n",
746
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
747
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
748
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
749
+ " > Processing time: 2.067000150680542\n",
750
+ " > Real-time factor: 0.23295588670728015\n",
751
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
752
+ "language: \n",
753
+ "speaker: \n",
754
+ "voice cloning with the voice conversion model\n",
755
+ " > Text splitted to sentences.\n",
756
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
757
+ " > Processing time: 2.1570000648498535\n",
758
+ " > Real-time factor: 0.2430990934225715\n",
759
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
760
  "language: \n",
761
  "speaker: \n",
762
  "voice cloning with the voice conversion model\n",
763
  " > Text splitted to sentences.\n",
764
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
765
+ " > Processing time: 2.0920000076293945\n",
766
+ " > Real-time factor: 0.23577343069302087\n",
767
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--fast_pitch\n",
768
+ " > Model's license - apache 2.0\n",
769
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
770
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
771
+ " > Model's license - apache 2.0\n",
772
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
773
+ " > Using model: fast_pitch\n",
774
+ " > Setting up Audio Processor...\n",
775
+ " | > sample_rate:22050\n",
776
+ " | > resample:False\n",
777
+ " | > num_mels:80\n",
778
+ " | > log_func:np.log\n",
779
+ " | > min_level_db:-100\n",
780
+ " | > frame_shift_ms:None\n",
781
+ " | > frame_length_ms:None\n",
782
+ " | > ref_level_db:20\n",
783
+ " | > fft_size:1024\n",
784
+ " | > power:1.5\n",
785
+ " | > preemphasis:0.0\n",
786
+ " | > griffin_lim_iters:60\n",
787
+ " | > signal_norm:False\n",
788
+ " | > symmetric_norm:True\n",
789
+ " | > mel_fmin:0\n",
790
+ " | > mel_fmax:8000.0\n",
791
+ " | > pitch_fmin:1.0\n",
792
+ " | > pitch_fmax:640.0\n",
793
+ " | > spec_gain:1.0\n",
794
+ " | > stft_pad_mode:reflect\n",
795
+ " | > max_norm:4.0\n",
796
+ " | > clip_norm:True\n",
797
+ " | > do_trim_silence:True\n",
798
+ " | > trim_db:60\n",
799
+ " | > do_sound_norm:False\n",
800
+ " | > do_amp_to_db_linear:True\n",
801
+ " | > do_amp_to_db_mel:True\n",
802
+ " | > do_rms_norm:False\n",
803
+ " | > db_level:None\n",
804
+ " | > stats_path:None\n",
805
+ " | > base:2.718281828459045\n",
806
+ " | > hop_length:256\n",
807
+ " | > win_length:1024\n",
808
+ " > Vocoder Model: hifigan\n",
809
+ " > Setting up Audio Processor...\n",
810
+ " | > sample_rate:22050\n",
811
+ " | > resample:False\n",
812
+ " | > num_mels:80\n",
813
+ " | > log_func:np.log\n",
814
+ " | > min_level_db:-100\n",
815
+ " | > frame_shift_ms:None\n",
816
+ " | > frame_length_ms:None\n",
817
+ " | > ref_level_db:20\n",
818
+ " | > fft_size:1024\n",
819
+ " | > power:1.5\n",
820
+ " | > preemphasis:0.0\n",
821
+ " | > griffin_lim_iters:60\n",
822
+ " | > signal_norm:False\n",
823
+ " | > symmetric_norm:True\n",
824
+ " | > mel_fmin:0\n",
825
+ " | > mel_fmax:8000.0\n",
826
+ " | > pitch_fmin:1.0\n",
827
+ " | > pitch_fmax:640.0\n",
828
+ " | > spec_gain:1.0\n",
829
+ " | > stft_pad_mode:reflect\n",
830
+ " | > max_norm:4.0\n",
831
+ " | > clip_norm:True\n",
832
+ " | > do_trim_silence:False\n",
833
+ " | > trim_db:60\n",
834
+ " | > do_sound_norm:False\n",
835
+ " | > do_amp_to_db_linear:True\n",
836
+ " | > do_amp_to_db_mel:True\n",
837
+ " | > do_rms_norm:False\n",
838
+ " | > db_level:None\n",
839
+ " | > stats_path:None\n",
840
+ " | > base:2.718281828459045\n",
841
+ " | > hop_length:256\n",
842
+ " | > win_length:1024\n",
843
+ " > Generator Model: hifigan_generator\n",
844
+ " > Discriminator Model: hifigan_discriminator\n",
845
+ "Removing weight norm...\n",
846
+ "model: tts_models/en/ljspeech/fast_pitch\n",
847
+ "language: \n",
848
+ "speaker: \n",
849
+ "Using original voice\n",
850
+ " > Text splitted to sentences.\n",
851
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
852
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
853
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
854
+ " > Processing time: 1.8829996585845947\n",
855
+ " > Real-time factor: 0.19894272496832988\n",
856
+ "model: tts_models/en/ljspeech/fast_pitch\n",
857
  "language: \n",
858
  "speaker: \n",
859
  "voice cloning with the voice conversion model\n",
860
  " > Text splitted to sentences.\n",
861
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
862
+ " > Processing time: 1.8359999656677246\n",
863
+ " > Real-time factor: 0.19397711228808903\n",
864
+ "model: tts_models/en/ljspeech/fast_pitch\n",
865
  "language: \n",
866
  "speaker: \n",
867
  "voice cloning with the voice conversion model\n",
868
  " > Text splitted to sentences.\n",
869
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
870
+ " > Processing time: 1.8659999370574951\n",
871
+ " > Real-time factor: 0.19714666998293168\n",
872
  "model: voice_conversion_models/multilingual/vctk/freevc24\n",
873
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
874
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\11c82c70d145ea630f81dfa541de52bf615719ae\\yearn_for_time-0-100.wav\n",
875
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--overflow\n",
876
+ " > Model's license - apache 2.0\n",
877
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
878
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
879
+ " > Model's license - apache 2.0\n",
880
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
881
+ " > Using model: OverFlow\n",
882
+ " > Setting up Audio Processor...\n",
883
+ " | > sample_rate:22050\n",
884
+ " | > resample:False\n",
885
+ " | > num_mels:80\n",
886
+ " | > log_func:np.log\n",
887
+ " | > min_level_db:-100\n",
888
+ " | > frame_shift_ms:None\n",
889
+ " | > frame_length_ms:None\n",
890
+ " | > ref_level_db:20\n",
891
+ " | > fft_size:1024\n",
892
+ " | > power:1.5\n",
893
+ " | > preemphasis:0.0\n",
894
+ " | > griffin_lim_iters:60\n",
895
+ " | > signal_norm:False\n",
896
+ " | > symmetric_norm:True\n",
897
+ " | > mel_fmin:0\n",
898
+ " | > mel_fmax:8000.0\n",
899
+ " | > pitch_fmin:1.0\n",
900
+ " | > pitch_fmax:640.0\n",
901
+ " | > spec_gain:1.0\n",
902
+ " | > stft_pad_mode:reflect\n",
903
+ " | > max_norm:4.0\n",
904
+ " | > clip_norm:True\n",
905
+ " | > do_trim_silence:True\n",
906
+ " | > trim_db:60\n",
907
+ " | > do_sound_norm:False\n",
908
+ " | > do_amp_to_db_linear:True\n",
909
+ " | > do_amp_to_db_mel:True\n",
910
+ " | > do_rms_norm:False\n",
911
+ " | > db_level:None\n",
912
+ " | > stats_path:None\n",
913
+ " | > base:2.718281828459045\n",
914
+ " | > hop_length:256\n",
915
+ " | > win_length:1024\n"
916
+ ]
917
+ },
918
+ {
919
+ "name": "stdout",
920
+ "output_type": "stream",
921
+ "text": [
922
+ " > Vocoder Model: hifigan\n",
923
+ " > Setting up Audio Processor...\n",
924
+ " | > sample_rate:22050\n",
925
+ " | > resample:False\n",
926
+ " | > num_mels:80\n",
927
+ " | > log_func:np.log\n",
928
+ " | > min_level_db:-100\n",
929
+ " | > frame_shift_ms:None\n",
930
+ " | > frame_length_ms:None\n",
931
+ " | > ref_level_db:20\n",
932
+ " | > fft_size:1024\n",
933
+ " | > power:1.5\n",
934
+ " | > preemphasis:0.0\n",
935
+ " | > griffin_lim_iters:60\n",
936
+ " | > signal_norm:False\n",
937
+ " | > symmetric_norm:True\n",
938
+ " | > mel_fmin:0\n",
939
+ " | > mel_fmax:8000.0\n",
940
+ " | > pitch_fmin:1.0\n",
941
+ " | > pitch_fmax:640.0\n",
942
+ " | > spec_gain:1.0\n",
943
+ " | > stft_pad_mode:reflect\n",
944
+ " | > max_norm:4.0\n",
945
+ " | > clip_norm:True\n",
946
+ " | > do_trim_silence:False\n",
947
+ " | > trim_db:60\n",
948
+ " | > do_sound_norm:False\n",
949
+ " | > do_amp_to_db_linear:True\n",
950
+ " | > do_amp_to_db_mel:True\n",
951
+ " | > do_rms_norm:False\n",
952
+ " | > db_level:None\n",
953
+ " | > stats_path:None\n",
954
+ " | > base:2.718281828459045\n",
955
+ " | > hop_length:256\n",
956
+ " | > win_length:1024\n",
957
+ " > Generator Model: hifigan_generator\n",
958
+ " > Discriminator Model: hifigan_discriminator\n",
959
+ "Removing weight norm...\n",
960
+ "model: tts_models/en/ljspeech/overflow\n",
961
+ "language: \n",
962
+ "speaker: \n",
963
+ "Using original voice\n",
964
+ " > Text splitted to sentences.\n",
965
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
966
+ " > Processing time: 2.4030001163482666\n",
967
+ " > Real-time factor: 0.26459208495864933\n",
968
+ "model: tts_models/en/ljspeech/overflow\n",
969
+ "language: \n",
970
+ "speaker: \n",
971
+ "voice cloning with the voice conversion model\n",
972
+ " > Text splitted to sentences.\n",
973
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
974
+ " > Processing time: 2.4769999980926514\n",
975
+ " > Real-time factor: 0.27343925203231617\n",
976
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--neural_hmm\n",
977
+ " > Model's license - apache 2.0\n",
978
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
979
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
980
+ " > Model's license - apache 2.0\n",
981
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
982
+ " > Using model: NeuralHMM_TTS\n",
983
+ " > Setting up Audio Processor...\n",
984
+ " | > sample_rate:22050\n",
985
+ " | > resample:False\n",
986
+ " | > num_mels:80\n",
987
+ " | > log_func:np.log\n",
988
+ " | > min_level_db:-100\n",
989
+ " | > frame_shift_ms:None\n",
990
+ " | > frame_length_ms:None\n",
991
+ " | > ref_level_db:20\n",
992
+ " | > fft_size:1024\n",
993
+ " | > power:1.5\n",
994
+ " | > preemphasis:0.0\n",
995
+ " | > griffin_lim_iters:60\n",
996
+ " | > signal_norm:False\n",
997
+ " | > symmetric_norm:True\n",
998
+ " | > mel_fmin:0\n",
999
+ " | > mel_fmax:8000.0\n",
1000
+ " | > pitch_fmin:1.0\n",
1001
+ " | > pitch_fmax:640.0\n",
1002
+ " | > spec_gain:1.0\n",
1003
+ " | > stft_pad_mode:reflect\n",
1004
+ " | > max_norm:4.0\n",
1005
+ " | > clip_norm:True\n",
1006
+ " | > do_trim_silence:True\n",
1007
+ " | > trim_db:60\n",
1008
+ " | > do_sound_norm:False\n",
1009
+ " | > do_amp_to_db_linear:True\n",
1010
+ " | > do_amp_to_db_mel:True\n",
1011
+ " | > do_rms_norm:False\n",
1012
+ " | > db_level:None\n",
1013
+ " | > stats_path:None\n",
1014
+ " | > base:2.718281828459045\n",
1015
+ " | > hop_length:256\n",
1016
+ " | > win_length:1024\n",
1017
+ " > Vocoder Model: hifigan\n",
1018
+ " > Setting up Audio Processor...\n",
1019
+ " | > sample_rate:22050\n",
1020
+ " | > resample:False\n",
1021
+ " | > num_mels:80\n",
1022
+ " | > log_func:np.log\n",
1023
+ " | > min_level_db:-100\n",
1024
+ " | > frame_shift_ms:None\n",
1025
+ " | > frame_length_ms:None\n",
1026
+ " | > ref_level_db:20\n",
1027
+ " | > fft_size:1024\n",
1028
+ " | > power:1.5\n",
1029
+ " | > preemphasis:0.0\n",
1030
+ " | > griffin_lim_iters:60\n",
1031
+ " | > signal_norm:False\n",
1032
+ " | > symmetric_norm:True\n",
1033
+ " | > mel_fmin:0\n",
1034
+ " | > mel_fmax:8000.0\n",
1035
+ " | > pitch_fmin:1.0\n",
1036
+ " | > pitch_fmax:640.0\n",
1037
+ " | > spec_gain:1.0\n",
1038
+ " | > stft_pad_mode:reflect\n",
1039
+ " | > max_norm:4.0\n",
1040
+ " | > clip_norm:True\n",
1041
+ " | > do_trim_silence:False\n",
1042
+ " | > trim_db:60\n",
1043
+ " | > do_sound_norm:False\n",
1044
+ " | > do_amp_to_db_linear:True\n",
1045
+ " | > do_amp_to_db_mel:True\n",
1046
+ " | > do_rms_norm:False\n",
1047
+ " | > db_level:None\n",
1048
+ " | > stats_path:None\n",
1049
+ " | > base:2.718281828459045\n",
1050
+ " | > hop_length:256\n",
1051
+ " | > win_length:1024\n",
1052
+ " > Generator Model: hifigan_generator\n",
1053
+ " > Discriminator Model: hifigan_discriminator\n",
1054
+ "Removing weight norm...\n",
1055
+ "model: tts_models/en/ljspeech/neural_hmm\n",
1056
+ "language: \n",
1057
+ "speaker: \n",
1058
+ "Using original voice\n",
1059
+ " > Text splitted to sentences.\n",
1060
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
1061
+ " > Processing time: 2.3940000534057617\n",
1062
+ " > Real-time factor: 0.27230367477713896\n",
1063
+ "model: tts_models/en/ljspeech/neural_hmm\n",
1064
  "language: \n",
1065
  "speaker: \n",
1066
  "voice cloning with the voice conversion model\n",
1067
  " > Text splitted to sentences.\n",
1068
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
1069
+ " > Processing time: 2.628000020980835\n",
1070
+ " > Real-time factor: 0.2965699745262212\n",
1071
+ " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--vctk--fast_pitch\n",
1072
+ " > Model's license - CC BY-NC-ND 4.0\n",
1073
+ " > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.\n",
1074
+ " > Using model: fast_pitch\n",
1075
+ " > Setting up Audio Processor...\n",
1076
+ " | > sample_rate:22050\n",
1077
+ " | > resample:False\n",
1078
+ " | > num_mels:80\n",
1079
+ " | > log_func:np.log\n",
1080
+ " | > min_level_db:-100\n",
1081
+ " | > frame_shift_ms:None\n",
1082
+ " | > frame_length_ms:None\n",
1083
+ " | > ref_level_db:20\n",
1084
+ " | > fft_size:1024\n",
1085
+ " | > power:1.5\n",
1086
+ " | > preemphasis:0.0\n",
1087
+ " | > griffin_lim_iters:60\n",
1088
+ " | > signal_norm:False\n",
1089
+ " | > symmetric_norm:True\n",
1090
+ " | > mel_fmin:0\n",
1091
+ " | > mel_fmax:8000.0\n",
1092
+ " | > pitch_fmin:0.0\n",
1093
+ " | > pitch_fmax:640.0\n",
1094
+ " | > spec_gain:1.0\n",
1095
+ " | > stft_pad_mode:reflect\n",
1096
+ " | > max_norm:4.0\n",
1097
+ " | > clip_norm:True\n",
1098
+ " | > do_trim_silence:True\n",
1099
+ " | > trim_db:23\n",
1100
+ " | > do_sound_norm:False\n",
1101
+ " | > do_amp_to_db_linear:True\n",
1102
+ " | > do_amp_to_db_mel:True\n",
1103
+ " | > do_rms_norm:False\n",
1104
+ " | > db_level:None\n",
1105
+ " | > stats_path:None\n",
1106
+ " | > base:2.718281828459045\n",
1107
+ " | > hop_length:256\n",
1108
+ " | > win_length:1024\n",
1109
+ " > Init speaker_embedding layer.\n",
1110
+ "model: tts_models/en/vctk/fast_pitch\n",
1111
+ "language: \n",
1112
+ "speaker: VCTK_p225\n",
1113
+ "Using original voice\n",
1114
+ " > Text splitted to sentences.\n",
1115
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
1116
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
1117
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
1118
+ " > Processing time: 4.122999906539917\n",
1119
+ " > Real-time factor: 0.6120216766695737\n",
1120
+ "model: tts_models/en/vctk/fast_pitch\n",
1121
+ "language: \n",
1122
+ "speaker: VCTK_p227\n",
1123
+ "Using original voice\n",
1124
+ " > Text splitted to sentences.\n",
1125
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
1126
+ " > Processing time: 3.615000009536743\n",
1127
+ " > Real-time factor: 0.5239715910962163\n",
1128
+ "model: tts_models/en/vctk/fast_pitch\n",
1129
+ "language: \n",
1130
+ "speaker: VCTK_p227\n",
1131
+ "voice cloning with the tts\n",
1132
+ " > Text splitted to sentences.\n",
1133
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n"
1134
  ]
1135
  },
1136
  {
 
1150
  " return await future\n",
1151
  " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
1152
  " result = context.run(func, *args)\n",
1153
+ " File \"<ipython-input-6-20fd07aa6e62>\", line 65, in text_to_speech\n",
1154
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
1155
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\", line 548, in tts\n",
1156
+ " **kwargs,\n",
1157
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 340, in tts\n",
1158
+ " speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)\n",
1159
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 365, in compute_embedding_from_clip\n",
1160
+ " embedding = _compute(wav_file)\n",
1161
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 342, in _compute\n",
1162
+ " waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)\n",
1163
+ "AttributeError: 'NoneType' object has no attribute 'load_wav'\n"
 
 
 
 
1164
  ]
1165
  }
1166
  ],
 
1205
  "def voice_clone(source_wav, target_wav):\n",
1206
  " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
1207
  " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
1208
+ " if source_wav is None or target_wav is None:\n",
1209
  " return (sample_rate, np.zeros(0).astype(np.int16))\n",
1210
  "\n",
1211
+ " speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
1212
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
1213
  " return (sample_rate, speech)\n",
1214
  "\n",
 
1227
  " if use_original_voice:\n",
1228
  " print('Using original voice')\n",
1229
  " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
1230
+ " elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:\n",
1231
  " print('voice cloning with the tts')\n",
1232
  " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
1233
  " else:\n",
1234
  " print('voice cloning with the voice conversion model')\n",
1235
+ "# speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
1236
+ " with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False) as fp:\n",
1237
+ " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
1238
+ " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
1239
+ " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
1240
+ " \n",
1241
  "\n",
1242
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
1243
  " return (sample_rate, speech)\n",
 
1245
  "\n",
1246
  "with gr.Blocks() as demo:\n",
1247
  " tts_model = gr.State(None)\n",
1248
+ "# vc_model = gr.State(None)\n",
1249
  " def activate(*args):\n",
1250
  " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
1251
  " def deactivate(*args):\n",
Coqui.ai-Copy1.ipynb CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 4,
6
- "id": "80ca0f5c",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -20,8 +20,8 @@
20
  },
21
  {
22
  "cell_type": "code",
23
- "execution_count": 6,
24
- "id": "4ad622cb",
25
  "metadata": {
26
  "scrolled": false
27
  },
@@ -36,7 +36,7 @@
36
  " > Using model: freevc\n",
37
  " > Loading pretrained speaker encoder model ...\n",
38
  "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
39
- "Running on local URL: http://127.0.0.1:7863\n",
40
  "\n",
41
  "To create a public link, set `share=True` in `launch()`.\n"
42
  ]
@@ -44,7 +44,7 @@
44
  {
45
  "data": {
46
  "text/html": [
47
- "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
48
  ],
49
  "text/plain": [
50
  "<IPython.core.display.HTML object>"
@@ -57,7 +57,7 @@
57
  "data": {
58
  "text/plain": []
59
  },
60
- "execution_count": 6,
61
  "metadata": {},
62
  "output_type": "execute_result"
63
  },
@@ -65,504 +65,10 @@
65
  "name": "stdout",
66
  "output_type": "stream",
67
  "text": [
68
- " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
69
- " > Model's license - apache 2.0\n",
70
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
71
- " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
72
- " > Model's license - apache 2.0\n",
73
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
74
- " > Using model: Tacotron2\n",
75
- " > Setting up Audio Processor...\n",
76
- " | > sample_rate:22050\n",
77
- " | > resample:False\n",
78
- " | > num_mels:80\n",
79
- " | > log_func:np.log10\n",
80
- " | > min_level_db:-100\n",
81
- " | > frame_shift_ms:None\n",
82
- " | > frame_length_ms:None\n",
83
- " | > ref_level_db:20\n",
84
- " | > fft_size:1024\n",
85
- " | > power:1.5\n",
86
- " | > preemphasis:0.0\n",
87
- " | > griffin_lim_iters:60\n",
88
- " | > signal_norm:True\n",
89
- " | > symmetric_norm:True\n",
90
- " | > mel_fmin:50.0\n",
91
- " | > mel_fmax:7600.0\n",
92
- " | > pitch_fmin:0.0\n",
93
- " | > pitch_fmax:640.0\n",
94
- " | > spec_gain:1.0\n",
95
- " | > stft_pad_mode:reflect\n",
96
- " | > max_norm:4.0\n",
97
- " | > clip_norm:True\n",
98
- " | > do_trim_silence:True\n",
99
- " | > trim_db:60\n",
100
- " | > do_sound_norm:False\n",
101
- " | > do_amp_to_db_linear:True\n",
102
- " | > do_amp_to_db_mel:True\n",
103
- " | > do_rms_norm:False\n",
104
- " | > db_level:None\n",
105
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
106
- " | > base:10\n",
107
- " | > hop_length:256\n",
108
- " | > win_length:1024\n",
109
- " > Model's reduction rate `r` is set to: 2\n",
110
- " > Vocoder Model: univnet\n",
111
- " > Setting up Audio Processor...\n",
112
- " | > sample_rate:22050\n",
113
- " | > resample:False\n",
114
- " | > num_mels:80\n",
115
- " | > log_func:np.log10\n",
116
- " | > min_level_db:-100\n",
117
- " | > frame_shift_ms:None\n",
118
- " | > frame_length_ms:None\n",
119
- " | > ref_level_db:20\n",
120
- " | > fft_size:1024\n",
121
- " | > power:1.5\n",
122
- " | > preemphasis:0.0\n",
123
- " | > griffin_lim_iters:60\n",
124
- " | > signal_norm:True\n",
125
- " | > symmetric_norm:True\n",
126
- " | > mel_fmin:50.0\n",
127
- " | > mel_fmax:7600.0\n",
128
- " | > pitch_fmin:1.0\n",
129
- " | > pitch_fmax:640.0\n",
130
- " | > spec_gain:1.0\n",
131
- " | > stft_pad_mode:reflect\n",
132
- " | > max_norm:4.0\n",
133
- " | > clip_norm:True\n",
134
- " | > do_trim_silence:True\n",
135
- " | > trim_db:60\n",
136
- " | > do_sound_norm:False\n",
137
- " | > do_amp_to_db_linear:True\n",
138
- " | > do_amp_to_db_mel:True\n",
139
- " | > do_rms_norm:False\n",
140
- " | > db_level:None\n",
141
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
142
- " | > base:10\n",
143
- " | > hop_length:256\n",
144
- " | > win_length:1024\n",
145
- " > Generator Model: univnet_generator\n",
146
- " > Discriminator Model: univnet_discriminator\n",
147
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
148
- "language: \n",
149
- "speaker: \n",
150
- "voice cloning with the voice conversion model\n",
151
- " > Text splitted to sentences.\n",
152
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
153
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
154
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
155
- " > Processing time: 3.3410003185272217\n",
156
- " > Real-time factor: 0.38459038289093944\n",
157
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
- "language: \n",
159
- "speaker: \n",
160
- "voice cloning with the voice conversion model\n",
161
- " > Text splitted to sentences.\n",
162
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
163
- " > Processing time: 2.9179999828338623\n",
164
- " > Real-time factor: 0.3358978221135079\n",
165
- " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
166
- " > Model's license - apache 2.0\n",
167
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
168
- " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
169
- " > Model's license - apache 2.0\n",
170
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
171
- " > Using model: Tacotron2\n",
172
- " > Setting up Audio Processor...\n",
173
- " | > sample_rate:22050\n",
174
- " | > resample:False\n",
175
- " | > num_mels:80\n",
176
- " | > log_func:np.log10\n",
177
- " | > min_level_db:-100\n",
178
- " | > frame_shift_ms:None\n",
179
- " | > frame_length_ms:None\n",
180
- " | > ref_level_db:20\n",
181
- " | > fft_size:1024\n",
182
- " | > power:1.5\n",
183
- " | > preemphasis:0.0\n",
184
- " | > griffin_lim_iters:60\n",
185
- " | > signal_norm:True\n",
186
- " | > symmetric_norm:True\n",
187
- " | > mel_fmin:50.0\n",
188
- " | > mel_fmax:7600.0\n",
189
- " | > pitch_fmin:0.0\n",
190
- " | > pitch_fmax:640.0\n",
191
- " | > spec_gain:1.0\n",
192
- " | > stft_pad_mode:reflect\n",
193
- " | > max_norm:4.0\n",
194
- " | > clip_norm:True\n",
195
- " | > do_trim_silence:True\n",
196
- " | > trim_db:60\n",
197
- " | > do_sound_norm:False\n",
198
- " | > do_amp_to_db_linear:True\n",
199
- " | > do_amp_to_db_mel:True\n",
200
- " | > do_rms_norm:False\n",
201
- " | > db_level:None\n",
202
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
203
- " | > base:10\n",
204
- " | > hop_length:256\n",
205
- " | > win_length:1024\n",
206
- " > Model's reduction rate `r` is set to: 2\n",
207
- " > Vocoder Model: univnet\n",
208
- " > Setting up Audio Processor...\n",
209
- " | > sample_rate:22050\n",
210
- " | > resample:False\n",
211
- " | > num_mels:80\n",
212
- " | > log_func:np.log10\n",
213
- " | > min_level_db:-100\n",
214
- " | > frame_shift_ms:None\n",
215
- " | > frame_length_ms:None\n",
216
- " | > ref_level_db:20\n",
217
- " | > fft_size:1024\n",
218
- " | > power:1.5\n",
219
- " | > preemphasis:0.0\n",
220
- " | > griffin_lim_iters:60\n",
221
- " | > signal_norm:True\n",
222
- " | > symmetric_norm:True\n",
223
- " | > mel_fmin:50.0\n",
224
- " | > mel_fmax:7600.0\n",
225
- " | > pitch_fmin:1.0\n",
226
- " | > pitch_fmax:640.0\n",
227
- " | > spec_gain:1.0\n",
228
- " | > stft_pad_mode:reflect\n",
229
- " | > max_norm:4.0\n",
230
- " | > clip_norm:True\n",
231
- " | > do_trim_silence:True\n",
232
- " | > trim_db:60\n",
233
- " | > do_sound_norm:False\n",
234
- " | > do_amp_to_db_linear:True\n",
235
- " | > do_amp_to_db_mel:True\n",
236
- " | > do_rms_norm:False\n",
237
- " | > db_level:None\n",
238
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
239
- " | > base:10\n",
240
- " | > hop_length:256\n",
241
- " | > win_length:1024\n",
242
- " > Generator Model: univnet_generator\n",
243
- " > Discriminator Model: univnet_discriminator\n",
244
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
245
- "language: \n",
246
- "speaker: \n",
247
- "voice cloning with the voice conversion model\n",
248
- " > Text splitted to sentences.\n",
249
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
250
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
251
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
252
- " > Processing time: 3.021000385284424\n",
253
- " > Real-time factor: 0.3477544400242312\n",
254
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
255
- "language: \n",
256
- "speaker: \n",
257
- "voice cloning with the voice conversion model\n",
258
- " > Text splitted to sentences.\n",
259
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
260
- " > Processing time: 2.9099998474121094\n",
261
- " > Real-time factor: 0.33497690776101013\n",
262
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
263
- "language: \n",
264
- "speaker: \n",
265
- "voice cloning with the voice conversion model\n",
266
- " > Text splitted to sentences.\n",
267
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
268
- " > Processing time: 2.933000087738037\n",
269
- " > Real-time factor: 0.33762451937136506\n",
270
- " > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.\n",
271
- " > Model's license - apache 2.0\n",
272
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
273
- " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
274
- " > Model's license - apache 2.0\n",
275
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
276
- " > Using model: Tacotron2\n",
277
- " > Setting up Audio Processor...\n",
278
- " | > sample_rate:22050\n",
279
- " | > resample:False\n",
280
- " | > num_mels:80\n",
281
- " | > log_func:np.log\n",
282
- " | > min_level_db:-100\n",
283
- " | > frame_shift_ms:None\n",
284
- " | > frame_length_ms:None\n",
285
- " | > ref_level_db:20\n",
286
- " | > fft_size:1024\n",
287
- " | > power:1.5\n",
288
- " | > preemphasis:0.0\n",
289
- " | > griffin_lim_iters:60\n",
290
- " | > signal_norm:False\n",
291
- " | > symmetric_norm:True\n",
292
- " | > mel_fmin:0\n",
293
- " | > mel_fmax:8000.0\n",
294
- " | > pitch_fmin:1.0\n",
295
- " | > pitch_fmax:640.0\n",
296
- " | > spec_gain:1.0\n",
297
- " | > stft_pad_mode:reflect\n",
298
- " | > max_norm:4.0\n",
299
- " | > clip_norm:True\n",
300
- " | > do_trim_silence:True\n",
301
- " | > trim_db:60\n",
302
- " | > do_sound_norm:False\n",
303
- " | > do_amp_to_db_linear:True\n",
304
- " | > do_amp_to_db_mel:True\n",
305
- " | > do_rms_norm:False\n",
306
- " | > db_level:None\n",
307
- " | > stats_path:None\n",
308
- " | > base:2.718281828459045\n",
309
- " | > hop_length:256\n",
310
- " | > win_length:1024\n",
311
- " > Model's reduction rate `r` is set to: 1\n",
312
- " > Vocoder Model: hifigan\n",
313
- " > Setting up Audio Processor...\n",
314
- " | > sample_rate:22050\n",
315
- " | > resample:False\n",
316
- " | > num_mels:80\n",
317
- " | > log_func:np.log\n",
318
- " | > min_level_db:-100\n",
319
- " | > frame_shift_ms:None\n",
320
- " | > frame_length_ms:None\n",
321
- " | > ref_level_db:20\n",
322
- " | > fft_size:1024\n",
323
- " | > power:1.5\n",
324
- " | > preemphasis:0.0\n",
325
- " | > griffin_lim_iters:60\n",
326
- " | > signal_norm:False\n",
327
- " | > symmetric_norm:True\n",
328
- " | > mel_fmin:0\n",
329
- " | > mel_fmax:8000.0\n",
330
- " | > pitch_fmin:1.0\n",
331
- " | > pitch_fmax:640.0\n",
332
- " | > spec_gain:1.0\n",
333
- " | > stft_pad_mode:reflect\n",
334
- " | > max_norm:4.0\n",
335
- " | > clip_norm:True\n",
336
- " | > do_trim_silence:False\n",
337
- " | > trim_db:60\n",
338
- " | > do_sound_norm:False\n",
339
- " | > do_amp_to_db_linear:True\n",
340
- " | > do_amp_to_db_mel:True\n",
341
- " | > do_rms_norm:False\n",
342
- " | > db_level:None\n",
343
- " | > stats_path:None\n",
344
- " | > base:2.718281828459045\n",
345
- " | > hop_length:256\n",
346
- " | > win_length:1024\n",
347
- " > Generator Model: hifigan_generator\n",
348
- " > Discriminator Model: hifigan_discriminator\n"
349
- ]
350
- },
351
- {
352
- "name": "stdout",
353
- "output_type": "stream",
354
- "text": [
355
- "Removing weight norm...\n",
356
- "model: tts_models/en/ljspeech/tacotron2-DDC\n",
357
- "language: \n",
358
- "speaker: \n",
359
- "voice cloning with the voice conversion model\n",
360
- " > Text splitted to sentences.\n",
361
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
362
- " > Processing time: 4.28600001335144\n",
363
- " > Real-time factor: 0.42371906516498953\n",
364
- " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
365
- " > Model's license - apache 2.0\n",
366
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
367
- " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
368
- " > Model's license - apache 2.0\n",
369
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
370
- " > Using model: Tacotron2\n",
371
- " > Setting up Audio Processor...\n",
372
- " | > sample_rate:22050\n",
373
- " | > resample:False\n",
374
- " | > num_mels:80\n",
375
- " | > log_func:np.log10\n",
376
- " | > min_level_db:-10\n",
377
- " | > frame_shift_ms:None\n",
378
- " | > frame_length_ms:None\n",
379
- " | > ref_level_db:0\n",
380
- " | > fft_size:1024\n",
381
- " | > power:1.8\n",
382
- " | > preemphasis:0.99\n",
383
- " | > griffin_lim_iters:60\n",
384
- " | > signal_norm:True\n",
385
- " | > symmetric_norm:True\n",
386
- " | > mel_fmin:0\n",
387
- " | > mel_fmax:8000.0\n",
388
- " | > pitch_fmin:1.0\n",
389
- " | > pitch_fmax:640.0\n",
390
- " | > spec_gain:1.0\n",
391
- " | > stft_pad_mode:reflect\n",
392
- " | > max_norm:4.0\n",
393
- " | > clip_norm:True\n",
394
- " | > do_trim_silence:True\n",
395
- " | > trim_db:60\n",
396
- " | > do_sound_norm:False\n",
397
- " | > do_amp_to_db_linear:True\n",
398
- " | > do_amp_to_db_mel:True\n",
399
- " | > do_rms_norm:False\n",
400
- " | > db_level:None\n",
401
- " | > stats_path:None\n",
402
- " | > base:10\n",
403
- " | > hop_length:256\n",
404
- " | > win_length:1024\n",
405
- " > Model's reduction rate `r` is set to: 2\n",
406
- " > Vocoder Model: wavegrad\n",
407
- "model: tts_models/en/ek1/tacotron2\n",
408
- "language: \n",
409
- "speaker: \n",
410
- "voice cloning with the voice conversion model\n",
411
- " > Text splitted to sentences.\n",
412
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
413
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
414
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
415
- " > Processing time: 224.84099984169006\n",
416
- " > Real-time factor: 29.51038122922182\n",
417
- " > tts_models/en/ek1/tacotron2 is already downloaded.\n",
418
- " > Model's license - apache 2.0\n",
419
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
420
- " > vocoder_models/en/ek1/wavegrad is already downloaded.\n",
421
- " > Model's license - apache 2.0\n",
422
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
423
- " > Using model: Tacotron2\n",
424
- " > Setting up Audio Processor...\n",
425
- " | > sample_rate:22050\n",
426
- " | > resample:False\n",
427
- " | > num_mels:80\n",
428
- " | > log_func:np.log10\n",
429
- " | > min_level_db:-10\n",
430
- " | > frame_shift_ms:None\n",
431
- " | > frame_length_ms:None\n",
432
- " | > ref_level_db:0\n",
433
- " | > fft_size:1024\n",
434
- " | > power:1.8\n",
435
- " | > preemphasis:0.99\n",
436
- " | > griffin_lim_iters:60\n",
437
- " | > signal_norm:True\n",
438
- " | > symmetric_norm:True\n",
439
- " | > mel_fmin:0\n",
440
- " | > mel_fmax:8000.0\n",
441
- " | > pitch_fmin:1.0\n",
442
- " | > pitch_fmax:640.0\n",
443
- " | > spec_gain:1.0\n",
444
- " | > stft_pad_mode:reflect\n",
445
- " | > max_norm:4.0\n",
446
- " | > clip_norm:True\n",
447
- " | > do_trim_silence:True\n",
448
- " | > trim_db:60\n",
449
- " | > do_sound_norm:False\n",
450
- " | > do_amp_to_db_linear:True\n",
451
- " | > do_amp_to_db_mel:True\n",
452
- " | > do_rms_norm:False\n",
453
- " | > db_level:None\n",
454
- " | > stats_path:None\n",
455
- " | > base:10\n",
456
- " | > hop_length:256\n",
457
- " | > win_length:1024\n",
458
- " > Model's reduction rate `r` is set to: 2\n",
459
- " > Vocoder Model: wavegrad\n",
460
- "model: tts_models/en/ek1/tacotron2\n",
461
- "language: \n",
462
- "speaker: \n",
463
- "voice cloning with the voice conversion model\n",
464
- " > Text splitted to sentences.\n",
465
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
466
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
467
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
468
- " > Processing time: 266.6489999294281\n",
469
- " > Real-time factor: 34.99768124073744\n",
470
- " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
471
- " > Model's license - apache 2.0\n",
472
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
473
- " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
474
- " > Model's license - apache 2.0\n",
475
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
476
- " > Using model: Tacotron2\n",
477
- " > Setting up Audio Processor...\n",
478
- " | > sample_rate:22050\n",
479
- " | > resample:False\n",
480
- " | > num_mels:80\n",
481
- " | > log_func:np.log10\n",
482
- " | > min_level_db:-100\n",
483
- " | > frame_shift_ms:None\n",
484
- " | > frame_length_ms:None\n",
485
- " | > ref_level_db:20\n",
486
- " | > fft_size:1024\n",
487
- " | > power:1.5\n",
488
- " | > preemphasis:0.0\n",
489
- " | > griffin_lim_iters:60\n",
490
- " | > signal_norm:True\n",
491
- " | > symmetric_norm:True\n",
492
- " | > mel_fmin:50.0\n",
493
- " | > mel_fmax:7600.0\n",
494
- " | > pitch_fmin:0.0\n",
495
- " | > pitch_fmax:640.0\n",
496
- " | > spec_gain:1.0\n",
497
- " | > stft_pad_mode:reflect\n",
498
- " | > max_norm:4.0\n",
499
- " | > clip_norm:True\n",
500
- " | > do_trim_silence:True\n",
501
- " | > trim_db:60\n",
502
- " | > do_sound_norm:False\n",
503
- " | > do_amp_to_db_linear:True\n",
504
- " | > do_amp_to_db_mel:True\n",
505
- " | > do_rms_norm:False\n",
506
- " | > db_level:None\n",
507
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
508
- " | > base:10\n",
509
- " | > hop_length:256\n",
510
- " | > win_length:1024\n",
511
- " > Model's reduction rate `r` is set to: 2\n",
512
- " > Vocoder Model: univnet\n",
513
- " > Setting up Audio Processor...\n",
514
- " | > sample_rate:22050\n",
515
- " | > resample:False\n",
516
- " | > num_mels:80\n",
517
- " | > log_func:np.log10\n",
518
- " | > min_level_db:-100\n",
519
- " | > frame_shift_ms:None\n",
520
- " | > frame_length_ms:None\n",
521
- " | > ref_level_db:20\n",
522
- " | > fft_size:1024\n",
523
- " | > power:1.5\n",
524
- " | > preemphasis:0.0\n",
525
- " | > griffin_lim_iters:60\n",
526
- " | > signal_norm:True\n",
527
- " | > symmetric_norm:True\n",
528
- " | > mel_fmin:50.0\n",
529
- " | > mel_fmax:7600.0\n",
530
- " | > pitch_fmin:1.0\n",
531
- " | > pitch_fmax:640.0\n",
532
- " | > spec_gain:1.0\n",
533
- " | > stft_pad_mode:reflect\n",
534
- " | > max_norm:4.0\n",
535
- " | > clip_norm:True\n",
536
- " | > do_trim_silence:True\n",
537
- " | > trim_db:60\n",
538
- " | > do_sound_norm:False\n",
539
- " | > do_amp_to_db_linear:True\n",
540
- " | > do_amp_to_db_mel:True\n",
541
- " | > do_rms_norm:False\n",
542
- " | > db_level:None\n",
543
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
544
- " | > base:10\n",
545
- " | > hop_length:256\n",
546
- " | > win_length:1024\n",
547
- " > Generator Model: univnet_generator\n",
548
- " > Discriminator Model: univnet_discriminator\n",
549
- "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
550
- "language: \n",
551
- "speaker: \n",
552
- "voice cloning with the voice conversion model\n",
553
- " > Text splitted to sentences.\n",
554
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
555
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
556
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
557
- " > Processing time: 2.885999917984009\n",
558
- " > Real-time factor: 0.3322142195933605\n",
559
- " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--speedy-speech\n",
560
- " > Model's license - apache 2.0\n",
561
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
562
- " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
563
- " > Model's license - apache 2.0\n",
564
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
565
- " > Using model: speedy_speech\n",
566
  " > Setting up Audio Processor...\n",
567
  " | > sample_rate:22050\n",
568
  " | > resample:False\n",
@@ -580,49 +86,14 @@
580
  " | > symmetric_norm:True\n",
581
  " | > mel_fmin:0\n",
582
  " | > mel_fmax:8000.0\n",
583
- " | > pitch_fmin:1.0\n",
584
  " | > pitch_fmax:640.0\n",
585
  " | > spec_gain:1.0\n",
586
  " | > stft_pad_mode:reflect\n",
587
  " | > max_norm:4.0\n",
588
  " | > clip_norm:True\n",
589
  " | > do_trim_silence:True\n",
590
- " | > trim_db:60\n",
591
- " | > do_sound_norm:False\n",
592
- " | > do_amp_to_db_linear:True\n",
593
- " | > do_amp_to_db_mel:True\n",
594
- " | > do_rms_norm:False\n",
595
- " | > db_level:None\n",
596
- " | > stats_path:None\n",
597
- " | > base:2.718281828459045\n",
598
- " | > hop_length:256\n",
599
- " | > win_length:1024\n",
600
- " > Vocoder Model: hifigan\n",
601
- " > Setting up Audio Processor...\n",
602
- " | > sample_rate:22050\n",
603
- " | > resample:False\n",
604
- " | > num_mels:80\n",
605
- " | > log_func:np.log\n",
606
- " | > min_level_db:-100\n",
607
- " | > frame_shift_ms:None\n",
608
- " | > frame_length_ms:None\n",
609
- " | > ref_level_db:20\n",
610
- " | > fft_size:1024\n",
611
- " | > power:1.5\n",
612
- " | > preemphasis:0.0\n",
613
- " | > griffin_lim_iters:60\n",
614
- " | > signal_norm:False\n",
615
- " | > symmetric_norm:True\n",
616
- " | > mel_fmin:0\n",
617
- " | > mel_fmax:8000.0\n",
618
- " | > pitch_fmin:1.0\n",
619
- " | > pitch_fmax:640.0\n",
620
- " | > spec_gain:1.0\n",
621
- " | > stft_pad_mode:reflect\n",
622
- " | > max_norm:4.0\n",
623
- " | > clip_norm:True\n",
624
- " | > do_trim_silence:False\n",
625
- " | > trim_db:60\n",
626
  " | > do_sound_norm:False\n",
627
  " | > do_amp_to_db_linear:True\n",
628
  " | > do_amp_to_db_mel:True\n",
@@ -632,347 +103,65 @@
632
  " | > base:2.718281828459045\n",
633
  " | > hop_length:256\n",
634
  " | > win_length:1024\n",
635
- " > Generator Model: hifigan_generator\n",
636
- " > Discriminator Model: hifigan_discriminator\n",
637
- "Removing weight norm...\n"
638
- ]
639
- },
640
- {
641
- "name": "stdout",
642
- "output_type": "stream",
643
- "text": [
644
- "model: tts_models/en/ljspeech/speedy-speech\n",
645
  "language: \n",
646
- "speaker: \n",
647
  "Using original voice\n",
648
  " > Text splitted to sentences.\n",
649
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
650
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
651
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
652
- " > Processing time: 0.9679999351501465\n",
653
- " > Real-time factor: 0.11673301633083617\n",
654
- "model: tts_models/en/ljspeech/speedy-speech\n",
655
- "language: \n",
656
- "speaker: \n",
657
- "voice cloning with the voice conversion model\n",
658
- " > Text splitted to sentences.\n",
659
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
660
- " > Processing time: 0.9630000591278076\n",
661
- " > Real-time factor: 0.11613007144605443\n",
662
- " > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.\n",
663
- " > Model's license - MPL\n",
664
- " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
665
- " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n",
666
- " > Model's license - MPL\n",
667
- " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
668
- " > Using model: Tacotron2\n",
669
- " > Setting up Audio Processor...\n",
670
- " | > sample_rate:22050\n",
671
- " | > resample:False\n",
672
- " | > num_mels:80\n",
673
- " | > log_func:np.log10\n",
674
- " | > min_level_db:-100\n",
675
- " | > frame_shift_ms:None\n",
676
- " | > frame_length_ms:None\n",
677
- " | > ref_level_db:20\n",
678
- " | > fft_size:1024\n",
679
- " | > power:1.5\n",
680
- " | > preemphasis:0.0\n",
681
- " | > griffin_lim_iters:60\n",
682
- " | > signal_norm:True\n",
683
- " | > symmetric_norm:True\n",
684
- " | > mel_fmin:50.0\n",
685
- " | > mel_fmax:7600.0\n",
686
- " | > pitch_fmin:0.0\n",
687
- " | > pitch_fmax:640.0\n",
688
- " | > spec_gain:1.0\n",
689
- " | > stft_pad_mode:reflect\n",
690
- " | > max_norm:4.0\n",
691
- " | > clip_norm:True\n",
692
- " | > do_trim_silence:True\n",
693
- " | > trim_db:60\n",
694
- " | > do_sound_norm:False\n",
695
- " | > do_amp_to_db_linear:True\n",
696
- " | > do_amp_to_db_mel:True\n",
697
- " | > do_rms_norm:False\n",
698
- " | > db_level:None\n",
699
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DCA\\scale_stats.npy\n",
700
- " | > base:10\n",
701
- " | > hop_length:256\n",
702
- " | > win_length:1024\n",
703
- " > Model's reduction rate `r` is set to: 2\n",
704
- " > Vocoder Model: multiband_melgan\n",
705
- " > Setting up Audio Processor...\n",
706
- " | > sample_rate:22050\n",
707
- " | > resample:False\n",
708
- " | > num_mels:80\n",
709
- " | > log_func:np.log10\n",
710
- " | > min_level_db:-100\n",
711
- " | > frame_shift_ms:None\n",
712
- " | > frame_length_ms:None\n",
713
- " | > ref_level_db:0\n",
714
- " | > fft_size:1024\n",
715
- " | > power:1.5\n",
716
- " | > preemphasis:0.0\n",
717
- " | > griffin_lim_iters:60\n",
718
- " | > signal_norm:True\n",
719
- " | > symmetric_norm:True\n",
720
- " | > mel_fmin:50.0\n",
721
- " | > mel_fmax:7600.0\n",
722
- " | > pitch_fmin:0.0\n",
723
- " | > pitch_fmax:640.0\n",
724
- " | > spec_gain:1.0\n",
725
- " | > stft_pad_mode:reflect\n",
726
- " | > max_norm:4.0\n",
727
- " | > clip_norm:True\n",
728
- " | > do_trim_silence:True\n",
729
- " | > trim_db:60\n",
730
- " | > do_sound_norm:False\n",
731
- " | > do_amp_to_db_linear:True\n",
732
- " | > do_amp_to_db_mel:True\n",
733
- " | > do_rms_norm:False\n",
734
- " | > db_level:None\n",
735
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--multiband-melgan\\scale_stats.npy\n",
736
- " | > base:10\n",
737
- " | > hop_length:256\n",
738
- " | > win_length:1024\n",
739
- " > Generator Model: multiband_melgan_generator\n",
740
- " > Discriminator Model: melgan_multiscale_discriminator\n",
741
- "model: tts_models/en/ljspeech/tacotron2-DCA\n",
742
  "language: \n",
743
- "speaker: \n",
744
  "Using original voice\n",
745
  " > Text splitted to sentences.\n",
746
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
747
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
748
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
749
- " > Processing time: 2.067000150680542\n",
750
- " > Real-time factor: 0.23295588670728015\n",
751
- "model: tts_models/en/ljspeech/tacotron2-DCA\n",
752
- "language: \n",
753
- "speaker: \n",
754
- "voice cloning with the voice conversion model\n",
755
- " > Text splitted to sentences.\n",
756
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
757
- " > Processing time: 2.1570000648498535\n",
758
- " > Real-time factor: 0.2430990934225715\n",
759
- "model: tts_models/en/ljspeech/tacotron2-DCA\n",
760
  "language: \n",
761
- "speaker: \n",
762
- "voice cloning with the voice conversion model\n",
763
  " > Text splitted to sentences.\n",
764
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
765
- " > Processing time: 2.0920000076293945\n",
766
- " > Real-time factor: 0.23577343069302087\n",
767
- " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--fast_pitch\n",
768
- " > Model's license - apache 2.0\n",
769
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
770
- " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
771
- " > Model's license - apache 2.0\n",
772
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
773
- " > Using model: fast_pitch\n",
774
- " > Setting up Audio Processor...\n",
775
- " | > sample_rate:22050\n",
776
- " | > resample:False\n",
777
- " | > num_mels:80\n",
778
- " | > log_func:np.log\n",
779
- " | > min_level_db:-100\n",
780
- " | > frame_shift_ms:None\n",
781
- " | > frame_length_ms:None\n",
782
- " | > ref_level_db:20\n",
783
- " | > fft_size:1024\n",
784
- " | > power:1.5\n",
785
- " | > preemphasis:0.0\n",
786
- " | > griffin_lim_iters:60\n",
787
- " | > signal_norm:False\n",
788
- " | > symmetric_norm:True\n",
789
- " | > mel_fmin:0\n",
790
- " | > mel_fmax:8000.0\n",
791
- " | > pitch_fmin:1.0\n",
792
- " | > pitch_fmax:640.0\n",
793
- " | > spec_gain:1.0\n",
794
- " | > stft_pad_mode:reflect\n",
795
- " | > max_norm:4.0\n",
796
- " | > clip_norm:True\n",
797
- " | > do_trim_silence:True\n",
798
- " | > trim_db:60\n",
799
- " | > do_sound_norm:False\n",
800
- " | > do_amp_to_db_linear:True\n",
801
- " | > do_amp_to_db_mel:True\n",
802
- " | > do_rms_norm:False\n",
803
- " | > db_level:None\n",
804
- " | > stats_path:None\n",
805
- " | > base:2.718281828459045\n",
806
- " | > hop_length:256\n",
807
- " | > win_length:1024\n",
808
- " > Vocoder Model: hifigan\n",
809
- " > Setting up Audio Processor...\n",
810
- " | > sample_rate:22050\n",
811
- " | > resample:False\n",
812
- " | > num_mels:80\n",
813
- " | > log_func:np.log\n",
814
- " | > min_level_db:-100\n",
815
- " | > frame_shift_ms:None\n",
816
- " | > frame_length_ms:None\n",
817
- " | > ref_level_db:20\n",
818
- " | > fft_size:1024\n",
819
- " | > power:1.5\n",
820
- " | > preemphasis:0.0\n",
821
- " | > griffin_lim_iters:60\n",
822
- " | > signal_norm:False\n",
823
- " | > symmetric_norm:True\n",
824
- " | > mel_fmin:0\n",
825
- " | > mel_fmax:8000.0\n",
826
- " | > pitch_fmin:1.0\n",
827
- " | > pitch_fmax:640.0\n",
828
- " | > spec_gain:1.0\n",
829
- " | > stft_pad_mode:reflect\n",
830
- " | > max_norm:4.0\n",
831
- " | > clip_norm:True\n",
832
- " | > do_trim_silence:False\n",
833
- " | > trim_db:60\n",
834
- " | > do_sound_norm:False\n",
835
- " | > do_amp_to_db_linear:True\n",
836
- " | > do_amp_to_db_mel:True\n",
837
- " | > do_rms_norm:False\n",
838
- " | > db_level:None\n",
839
- " | > stats_path:None\n",
840
- " | > base:2.718281828459045\n",
841
- " | > hop_length:256\n",
842
- " | > win_length:1024\n",
843
- " > Generator Model: hifigan_generator\n",
844
- " > Discriminator Model: hifigan_discriminator\n",
845
- "Removing weight norm...\n",
846
- "model: tts_models/en/ljspeech/fast_pitch\n",
847
  "language: \n",
848
- "speaker: \n",
849
  "Using original voice\n",
850
  " > Text splitted to sentences.\n",
851
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
852
- "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
853
- " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
854
- " > Processing time: 1.8829996585845947\n",
855
- " > Real-time factor: 0.19894272496832988\n",
856
- "model: tts_models/en/ljspeech/fast_pitch\n",
857
  "language: \n",
858
- "speaker: \n",
859
- "voice cloning with the voice conversion model\n",
860
  " > Text splitted to sentences.\n",
861
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
862
- " > Processing time: 1.8359999656677246\n",
863
- " > Real-time factor: 0.19397711228808903\n",
864
- "model: tts_models/en/ljspeech/fast_pitch\n",
865
  "language: \n",
866
- "speaker: \n",
867
  "voice cloning with the voice conversion model\n",
868
  " > Text splitted to sentences.\n",
869
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
870
- " > Processing time: 1.8659999370574951\n",
871
- " > Real-time factor: 0.19714666998293168\n",
872
- "model: voice_conversion_models/multilingual/vctk/freevc24\n",
873
- "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
874
- "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\11c82c70d145ea630f81dfa541de52bf615719ae\\yearn_for_time-0-100.wav\n",
875
- " > Downloading model to C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--overflow\n",
876
- " > Model's license - apache 2.0\n",
877
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
878
- " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
879
- " > Model's license - apache 2.0\n",
880
- " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
881
- " > Using model: OverFlow\n",
882
- " > Setting up Audio Processor...\n",
883
- " | > sample_rate:22050\n",
884
- " | > resample:False\n",
885
- " | > num_mels:80\n",
886
- " | > log_func:np.log\n",
887
- " | > min_level_db:-100\n",
888
- " | > frame_shift_ms:None\n",
889
- " | > frame_length_ms:None\n",
890
- " | > ref_level_db:20\n",
891
- " | > fft_size:1024\n",
892
- " | > power:1.5\n",
893
- " | > preemphasis:0.0\n",
894
- " | > griffin_lim_iters:60\n",
895
- " | > signal_norm:False\n",
896
- " | > symmetric_norm:True\n",
897
- " | > mel_fmin:0\n",
898
- " | > mel_fmax:8000.0\n",
899
- " | > pitch_fmin:1.0\n",
900
- " | > pitch_fmax:640.0\n",
901
- " | > spec_gain:1.0\n",
902
- " | > stft_pad_mode:reflect\n",
903
- " | > max_norm:4.0\n",
904
- " | > clip_norm:True\n",
905
- " | > do_trim_silence:True\n",
906
- " | > trim_db:60\n",
907
- " | > do_sound_norm:False\n",
908
- " | > do_amp_to_db_linear:True\n",
909
- " | > do_amp_to_db_mel:True\n",
910
- " | > do_rms_norm:False\n",
911
- " | > db_level:None\n",
912
- " | > stats_path:None\n",
913
- " | > base:2.718281828459045\n",
914
- " | > hop_length:256\n",
915
- " | > win_length:1024\n"
916
- ]
917
- },
918
- {
919
- "name": "stdout",
920
- "output_type": "stream",
921
- "text": [
922
- " > Vocoder Model: hifigan\n",
923
- " > Setting up Audio Processor...\n",
924
- " | > sample_rate:22050\n",
925
- " | > resample:False\n",
926
- " | > num_mels:80\n",
927
- " | > log_func:np.log\n",
928
- " | > min_level_db:-100\n",
929
- " | > frame_shift_ms:None\n",
930
- " | > frame_length_ms:None\n",
931
- " | > ref_level_db:20\n",
932
- " | > fft_size:1024\n",
933
- " | > power:1.5\n",
934
- " | > preemphasis:0.0\n",
935
- " | > griffin_lim_iters:60\n",
936
- " | > signal_norm:False\n",
937
- " | > symmetric_norm:True\n",
938
- " | > mel_fmin:0\n",
939
- " | > mel_fmax:8000.0\n",
940
- " | > pitch_fmin:1.0\n",
941
- " | > pitch_fmax:640.0\n",
942
- " | > spec_gain:1.0\n",
943
- " | > stft_pad_mode:reflect\n",
944
- " | > max_norm:4.0\n",
945
- " | > clip_norm:True\n",
946
- " | > do_trim_silence:False\n",
947
- " | > trim_db:60\n",
948
- " | > do_sound_norm:False\n",
949
- " | > do_amp_to_db_linear:True\n",
950
- " | > do_amp_to_db_mel:True\n",
951
- " | > do_rms_norm:False\n",
952
- " | > db_level:None\n",
953
- " | > stats_path:None\n",
954
- " | > base:2.718281828459045\n",
955
- " | > hop_length:256\n",
956
- " | > win_length:1024\n",
957
- " > Generator Model: hifigan_generator\n",
958
- " > Discriminator Model: hifigan_discriminator\n",
959
- "Removing weight norm...\n",
960
- "model: tts_models/en/ljspeech/overflow\n",
961
- "language: \n",
962
- "speaker: \n",
963
- "Using original voice\n",
964
- " > Text splitted to sentences.\n",
965
- "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
966
- " > Processing time: 2.4030001163482666\n",
967
- " > Real-time factor: 0.26459208495864933\n",
968
- "model: tts_models/en/ljspeech/overflow\n",
969
  "language: \n",
970
- "speaker: \n",
971
  "voice cloning with the voice conversion model\n",
972
  " > Text splitted to sentences.\n",
973
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
974
- " > Processing time: 2.4769999980926514\n",
975
- " > Real-time factor: 0.27343925203231617\n"
976
  ]
977
  }
978
  ],
@@ -1039,7 +228,7 @@
1039
  " if use_original_voice:\n",
1040
  " print('Using original voice')\n",
1041
  " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
1042
- " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
1043
  " print('voice cloning with the tts')\n",
1044
  " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
1045
  " else:\n",
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 4,
6
+ "id": "9b361b8e",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
20
  },
21
  {
22
  "cell_type": "code",
23
+ "execution_count": 7,
24
+ "id": "38f89cca",
25
  "metadata": {
26
  "scrolled": false
27
  },
 
36
  " > Using model: freevc\n",
37
  " > Loading pretrained speaker encoder model ...\n",
38
  "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
39
+ "Running on local URL: http://127.0.0.1:7864\n",
40
  "\n",
41
  "To create a public link, set `share=True` in `launch()`.\n"
42
  ]
 
44
  {
45
  "data": {
46
  "text/html": [
47
+ "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
48
  ],
49
  "text/plain": [
50
  "<IPython.core.display.HTML object>"
 
57
  "data": {
58
  "text/plain": []
59
  },
60
+ "execution_count": 7,
61
  "metadata": {},
62
  "output_type": "execute_result"
63
  },
 
65
  "name": "stdout",
66
  "output_type": "stream",
67
  "text": [
68
+ " > tts_models/en/vctk/fast_pitch is already downloaded.\n",
69
+ " > Model's license - CC BY-NC-ND 4.0\n",
70
+ " > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.\n",
71
+ " > Using model: fast_pitch\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  " > Setting up Audio Processor...\n",
73
  " | > sample_rate:22050\n",
74
  " | > resample:False\n",
 
86
  " | > symmetric_norm:True\n",
87
  " | > mel_fmin:0\n",
88
  " | > mel_fmax:8000.0\n",
89
+ " | > pitch_fmin:0.0\n",
90
  " | > pitch_fmax:640.0\n",
91
  " | > spec_gain:1.0\n",
92
  " | > stft_pad_mode:reflect\n",
93
  " | > max_norm:4.0\n",
94
  " | > clip_norm:True\n",
95
  " | > do_trim_silence:True\n",
96
+ " | > trim_db:23\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  " | > do_sound_norm:False\n",
98
  " | > do_amp_to_db_linear:True\n",
99
  " | > do_amp_to_db_mel:True\n",
 
103
  " | > base:2.718281828459045\n",
104
  " | > hop_length:256\n",
105
  " | > win_length:1024\n",
106
+ " > Init speaker_embedding layer.\n",
107
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
 
 
 
 
 
 
108
  "language: \n",
109
+ "speaker: VCTK_p225\n",
110
  "Using original voice\n",
111
  " > Text splitted to sentences.\n",
112
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
113
  "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
114
  " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
115
+ " > Processing time: 3.823000192642212\n",
116
+ " > Real-time factor: 0.5674894593370367\n",
117
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  "language: \n",
119
+ "speaker: VCTK_p248\n",
120
  "Using original voice\n",
121
  " > Text splitted to sentences.\n",
122
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
123
+ " > Processing time: 3.246999979019165\n",
124
+ " > Real-time factor: 0.5186337327405872\n",
125
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
 
 
 
 
 
 
 
 
126
  "language: \n",
127
+ "speaker: VCTK_p267\n",
128
+ "Using original voice\n",
129
  " > Text splitted to sentences.\n",
130
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
131
+ " > Processing time: 3.6530003547668457\n",
132
+ " > Real-time factor: 0.5277034710600691\n",
133
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  "language: \n",
135
+ "speaker: VCTK_p334\n",
136
  "Using original voice\n",
137
  " > Text splitted to sentences.\n",
138
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
139
+ " > Processing time: 3.421999931335449\n",
140
+ " > Real-time factor: 0.5260981320137958\n",
141
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
142
  "language: \n",
143
+ "speaker: VCTK_p362\n",
144
+ "Using original voice\n",
145
  " > Text splitted to sentences.\n",
146
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
147
+ " > Processing time: 3.9649999141693115\n",
148
+ " > Real-time factor: 0.545335878913631\n",
149
+ "model: tts_models/en/vctk/fast_pitch\n",
150
  "language: \n",
151
+ "speaker: VCTK_p362\n",
152
  "voice cloning with the voice conversion model\n",
153
  " > Text splitted to sentences.\n",
154
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
155
+ " > Processing time: 3.944000005722046\n",
156
+ " > Real-time factor: 0.5424476055774146\n",
157
+ "model: tts_models/en/vctk/fast_pitch\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  "language: \n",
159
+ "speaker: VCTK_p305\n",
160
  "voice cloning with the voice conversion model\n",
161
  " > Text splitted to sentences.\n",
162
  "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
163
+ " > Processing time: 3.994999647140503\n",
164
+ " > Real-time factor: 0.5442476165199195\n"
165
  ]
166
  }
167
  ],
 
228
  " if use_original_voice:\n",
229
  " print('Using original voice')\n",
230
  " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
231
+ " elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:\n",
232
  " print('voice cloning with the tts')\n",
233
  " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
234
  " else:\n",
app.py CHANGED
@@ -71,7 +71,7 @@ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_
71
  if use_original_voice:
72
  print('Using original voice')
73
  speech = tts_model.tts(text, language=language, speaker=speaker)
74
- elif tts_model.synthesizer.tts_model.speaker_manager:
75
  print('voice cloning with the tts')
76
  speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
77
  else:
 
71
  if use_original_voice:
72
  print('Using original voice')
73
  speech = tts_model.tts(text, language=language, speaker=speaker)
74
+ elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:
75
  print('voice cloning with the tts')
76
  speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
77
  else: