rakhlin committed
Commit b2d3c53 · 1 Parent(s): 9d94b06

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,6 @@
+ #checkpoints
+ tts/
+
+ #notebooks and bak files
+ *.bak.py
+ *.ipynb
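The ignored `tts/` folder is where Coqui TTS drops checkpoints it downloads at runtime; the notebook logs below show them under `C:\Users\Torch\AppData\Local\tts`, one directory per model with `/` in the model id encoded as `--`. A minimal sketch of inspecting that cache; the `MODEL_DIR` value is the one used in the notebook, adjust for your machine:

```python
from pathlib import Path

# Value used in the notebook (Windows per-user app data); adjust as needed.
MODEL_DIR = Path('C:/Users/Torch/AppData/Local')

# Downloaded checkpoints live in <MODEL_DIR>/tts, one directory per model,
# with '/' in the model id encoded as '--' in the directory name.
for p in sorted((MODEL_DIR / 'tts').glob('*')):
    if p.is_dir():
        print(p.name.replace('--', '/'))  # e.g. tts_models/en/ljspeech/tacotron2-DDC_ph
```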
.ipynb_checkpoints/Coqui.ai-Copy1-checkpoint.ipynb CHANGED
@@ -2,8 +2,8 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 4,
- "id": "156133fe",
+ "execution_count": 41,
+ "id": "9a1c46ff",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -15,13 +15,14 @@
  "import tempfile\n",
  "\n",
  "from TTS.api import TTS\n",
- "from TTS.utils.manage import ModelManager"
+ "from TTS.utils.manage import ModelManager\n",
+ "from TTS.utils.synthesizer import Synthesizer"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 6,
- "id": "5e5af800",
+ "execution_count": 76,
+ "id": "a6339716",
  "metadata": {
  "scrolled": false
  },
@@ -35,8 +36,8 @@
  " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
  " > Using model: freevc\n",
  " > Loading pretrained speaker encoder model ...\n",
- "Loaded the voice encoder model on cpu in 0.01 seconds.\n",
- "Running on local URL: http://127.0.0.1:7863\n",
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
+ "Running on local URL: http://127.0.0.1:7867\n",
  "\n",
  "To create a public link, set `share=True` in `launch()`.\n"
  ]
@@ -44,7 +45,7 @@
  {
  "data": {
  "text/html": [
- "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+ "<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
  ],
  "text/plain": [
  "<IPython.core.display.HTML object>"
@@ -57,7 +58,7 @@
  "data": {
  "text/plain": []
  },
- "execution_count": 6,
+ "execution_count": 76,
  "metadata": {},
  "output_type": "execute_result"
  },
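The `share=True` note in the captured output refers to Gradio's tunneling feature. A minimal, self-contained sketch (hypothetical `echo` function; the notebook's actual app is the Blocks demo defined further down):

```python
import gradio as gr

def echo(text: str) -> str:
    return text

demo = gr.Interface(fn=echo, inputs='text', outputs='text')
# share=False (the notebook's choice) serves only on http://127.0.0.1:<port>;
# share=True additionally opens a temporary public link.
demo.launch(share=False)
```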
@@ -71,1066 +72,7 @@
  " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
  " > Model's license - apache 2.0\n",
  " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
- " > Using model: Tacotron2\n",
- " > Setting up Audio Processor...\n",
- " | > sample_rate:22050\n",
- " | > resample:False\n",
- " | > num_mels:80\n",
- " | > log_func:np.log10\n",
- " | > min_level_db:-100\n",
- " | > frame_shift_ms:None\n",
- " | > frame_length_ms:None\n",
- " | > ref_level_db:20\n",
- " | > fft_size:1024\n",
- " | > power:1.5\n",
- " | > preemphasis:0.0\n",
- " | > griffin_lim_iters:60\n",
- " | > signal_norm:True\n",
- " | > symmetric_norm:True\n",
- " | > mel_fmin:50.0\n",
- " | > mel_fmax:7600.0\n",
- " | > pitch_fmin:0.0\n",
- " | > pitch_fmax:640.0\n",
- " | > spec_gain:1.0\n",
- " | > stft_pad_mode:reflect\n",
- " | > max_norm:4.0\n",
- " | > clip_norm:True\n",
- " | > do_trim_silence:True\n",
- " | > trim_db:60\n",
- " | > do_sound_norm:False\n",
- " | > do_amp_to_db_linear:True\n",
- " | > do_amp_to_db_mel:True\n",
- " | > do_rms_norm:False\n",
- " | > db_level:None\n",
- " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
- " | > base:10\n",
- " | > hop_length:256\n",
- " | > win_length:1024\n",
- " > Model's reduction rate `r` is set to: 2\n",
- " > Vocoder Model: univnet\n",
- [... ~1,000 further removed output lines: near-identical "Setting up Audio Processor..." parameter dumps plus synthesis logs (sentence splits, processing times, real-time factors) for tts_models/en/ljspeech/{tacotron2-DDC_ph, tacotron2-DDC, speedy-speech, tacotron2-DCA, fast_pitch, overflow, neural_hmm}, tts_models/en/ek1/tacotron2 (wavegrad vocoder, RTF ~30), tts_models/en/vctk/fast_pitch, and a voice_conversion_models/multilingual/vctk/freevc24 run ...]
+ " > Using model: Tacotron2\n"
  ]
  },
  {
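The removed output above is mostly per-model synthesis logs; the two numbers worth noting are the processing time and the real-time factor, where RTF = processing time / duration of generated audio (RTF < 1 means faster than real time; ek1/tacotron2 with the WaveGrad vocoder logged RTF ≈ 30). A hedged sketch of reproducing the measurement; reading `synthesizer.output_sample_rate` mirrors how the notebook reads the VC model's sample rate, but treat it as an assumption for your TTS version:

```python
import time
from TTS.api import TTS

tts = TTS(model_name='tts_models/en/ljspeech/speedy-speech', progress_bar=False)

t0 = time.time()
wav = tts.tts("Mary had a little lamb, its fleece was white as snow.")
processing_time = time.time() - t0

# assumption: the synthesizer exposes its output sample rate, the same
# attribute the notebook uses on the voice-conversion model
audio_seconds = len(wav) / tts.synthesizer.output_sample_rate
print(f"processing: {processing_time:.2f}s  RTF: {processing_time / audio_seconds:.3f}")
```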
@@ -1150,17 +92,19 @@
  " return await future\n",
  " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 807, in run\n",
  " result = context.run(func, *args)\n",
- " File \"<ipython-input-6-20fd07aa6e62>\", line 65, in text_to_speech\n",
- " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\", line 548, in tts\n",
- " **kwargs,\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 340, in tts\n",
- " speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 365, in compute_embedding_from_clip\n",
- " embedding = _compute(wav_file)\n",
- " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\tts\\utils\\managers.py\", line 342, in _compute\n",
- " waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)\n",
- "AttributeError: 'NoneType' object has no attribute 'load_wav'\n"
+ " File \"<ipython-input-76-b1dd8c5769eb>\", line 44, in on_model_tts_select\n",
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
+ " File \"<ipython-input-76-b1dd8c5769eb>\", line 17, in __init__\n",
+ " self.load_vc_model_by_name(model_name=model_name, gpu=gpu)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\", line 363, in load_vc_model_by_name\n",
+ " self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 97, in __init__\n",
+ " self._load_vc(vc_checkpoint, vc_config, use_cuda)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\", line 131, in _load_vc\n",
+ " self.vc_model = setup_vc_model(config=self.vc_config)\n",
+ " File \"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\", line 17, in setup_model\n",
+ " return model\n",
+ "UnboundLocalError: local variable 'model' referenced before assignment\n"
  ]
  }
  ],
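Both tracebacks in this hunk come from the same UI path: the old `AttributeError` fires when `speaker_wav` cloning is requested from a model whose `speaker_manager` has no speaker encoder (`encoder_ap` is `None`), and the new `UnboundLocalError` fires when `setup_vc_model` receives a config it does not recognize, which listing only genuine voice-conversion models avoids. A hedged sketch of a guard against the first failure; the attribute chain mirrors the frames in the traceback above, and the fallback is the notebook's own route (synthesize, then FreeVC conversion):

```python
import tempfile

def clone_voice(text, target_wav, tts_model, vc_model):
    """Clone target_wav's voice onto text, falling back to voice conversion."""
    mgr = getattr(tts_model.synthesizer.tts_model, 'speaker_manager', None)
    if mgr is not None and getattr(mgr, 'encoder_ap', None) is not None:
        # The TTS model ships its own speaker encoder: clone directly.
        return tts_model.tts(text, speaker_wav=target_wav)
    # Otherwise synthesize normally and convert the result (the notebook's route).
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
        tts_model.tts_to_file(text, file_path=fp.name)
    return vc_model.voice_conversion(source_wav=fp.name, target_wav=target_wav)
```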
@@ -1169,16 +113,40 @@
  "description = \"\"\"\"\"\"\n",
  "article = \"\"\"\"\"\"\n",
  "\n",
+ "class TTS_local(TTS):\n",
+ " def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):\n",
+ " super().__init__(\n",
+ " model_name=None,\n",
+ " model_path=None,\n",
+ " config_path=None,\n",
+ " vocoder_path=None,\n",
+ " vocoder_config_path=None,\n",
+ " progress_bar=progress_bar,\n",
+ " gpu=False,\n",
+ " )\n",
+ " self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)\n",
+ " if model_name is not None:\n",
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
+ " self.load_tts_model_by_name(model_name, gpu)\n",
+ " elif \"voice_conversion_models\" in model_name:\n",
+ " self.load_vc_model_by_name(model_name, gpu) \n",
+ "\n",
+ " \n",
  "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
  "GPU = device == \"cuda\"\n",
  "INT16MAX = np.iinfo(np.int16).max\n",
- "VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)\n",
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
+ "MANAGER = ModelManager(verbose=False)\n",
  "\n",
+ "model_ids = MANAGER.list_models()\n",
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
+ "model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
+ "\n",
+ "VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24', \n",
+ " output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
  "\n",
- "model_ids = ModelManager(verbose=False).list_models()\n",
- "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
- "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
- "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
  "examples_pt = 'examples'\n",
  "allowed_extentions = ['.mp3', '.wav']\n",
  "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
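The `TTS_local` wrapper added here re-creates `ModelManager` with an explicit `output_prefix`, so checkpoints resolve from the existing local cache instead of the API's default per-user download directory, and the dropdowns are now built only from model ids that are actually present on disk. Usage sketch, assuming the class and constants from the cell above are in scope:

```python
# Load from the local cache under MODEL_DIR rather than re-downloading.
tts = TTS_local(model_name='tts_models/en/ljspeech/speedy-speech',
                output_prefix=MODEL_DIR, progress_bar=False, gpu=False)
wav = tts.tts("Mary had a little lamb.")
```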
@@ -1189,7 +157,7 @@
  "\n",
  "\n",
  "def on_model_tts_select(model_name):\n",
- " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
  " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
  " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
  " language = languages[0]\n",
@@ -1237,6 +205,7 @@
  " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
  " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
  " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
  " \n",
  "\n",
  " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
@@ -1301,6 +270,193 @@
  " gr.HTML(article)\n",
  "demo.launch(share=False)"
  ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "c2dc0da8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > tts_models/en/blizzard2013/capacitron-t2-c50 is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > vocoder_models/en/blizzard2013/hifigan_v2 is already downloaded.\n",
+ " > Model's license - apache 2.0\n",
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
+ " > Using model: tacotron2\n",
+ [... two further "Setting up Audio Processor..." parameter dumps (sample_rate 24000, mel_fmin 80.0, mel_fmax 12000.0, spec_gain 25.0 and 20.0), reduction rate `r` = 2, " > Vocoder Model: hifigan" ...]
+ " > Generator Model: hifigan_generator\n",
+ " > Discriminator Model: hifigan_discriminator\n",
+ "Removing weight norm...\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<TTS.utils.synthesizer.Synthesizer at 0x498b2588>"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from TTS.utils.synthesizer import Synthesizer\n",
+ "\n",
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
+ "MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)\n",
  }
  ],
  "metadata": {
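The new final cell (cut off by the page here) drives `ModelManager` directly: point it at the existing cache with `output_prefix` and resolve a model from there, ending in a `Synthesizer` per the cell's output. A hedged sketch of that pattern, consistent with the visible output; the `download_model` return tuple is an assumption that may vary across TTS versions:

```python
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

MODEL_DIR = 'C:/Users/Torch/AppData/Local'
MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)

# assumption: download_model returns (model_path, config_path, model_item)
model_path, config_path, model_item = MANAGER.download_model(
    'tts_models/en/blizzard2013/capacitron-t2-c50')
synthesizer = Synthesizer(tts_checkpoint=model_path, tts_config_path=config_path)
```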
382
+ "\n",
383
+ "model_ids = manager.list_models()\n",
384
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
385
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
386
+ "\n",
387
+ "\n",
388
+ "def load_local_checkpoint(model_name, use_cuda):\n",
389
+ " model_path = None\n",
390
+ " config_path = None\n",
391
+ " speakers_file_path = None\n",
392
+ " vocoder_path = None\n",
393
+ " vocoder_config_path = None\n",
394
+ "\n",
395
+ " model_path, config_path, model_item = MANAGER.download_model(model_name)\n",
396
+ " vocoder_name = model_item[\"default_vocoder\"]\n",
397
+ " if vocoder_name is not None:\n",
398
+ " vocoder_path, vocoder_config_path, _ = MANAGER.download_model(vocoder_name)\n",
399
+ " \n",
400
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
401
+ " synthesizer = Synthesizer(\n",
402
+ " tts_checkpoint=model_path,\n",
403
+ " tts_config_path=config_path,\n",
404
+ " tts_speakers_file=speakers_file_path,\n",
405
+ " tts_languages_file=None,\n",
406
+ " vocoder_checkpoint=vocoder_path,\n",
407
+ " vocoder_config=vocoder_config_path,\n",
408
+ " encoder_checkpoint=\"\",\n",
409
+ " encoder_config=\"\",\n",
410
+ " use_cuda=use_cuda,\n",
411
+ " )\n",
412
+ " elif \"voice_conversion_models\" in model_name:\n",
413
+ " self.load_vc_model_by_name(model_name, gpu)\n",
414
+ "\n",
415
+ " return synthesizer\n",
416
+ "\n",
417
+ "model_name = model_tts_ids[0]\n",
418
+ "load_local_checkpoint(model_name, use_cuda=False)"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 77,
424
+ "id": "98c1d5a8",
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "name": "stdout",
429
+ "output_type": "stream",
430
+ "text": [
431
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
432
+ " > Model's license - apache 2.0\n",
433
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
434
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
435
+ " > Model's license - apache 2.0\n",
436
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
437
+ " > Using model: Tacotron2\n"
438
+ ]
439
+ },
440
+ {
441
+ "ename": "UnboundLocalError",
442
+ "evalue": "local variable 'model' referenced before assignment",
443
+ "output_type": "error",
444
+ "traceback": [
445
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
446
+ "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
447
+ "\u001b[1;32m<ipython-input-77-6dbf83b539b0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTTS_local\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'tts_models/en/ljspeech/tacotron2-DDC_ph'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mMODEL_DIR\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mGPU\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
448
+ "\u001b[1;32m<ipython-input-76-b1dd8c5769eb>\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, model_name, output_prefix, progress_bar, gpu)\u001b[0m\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmanager\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mModelManager\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodels_file\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_models_file_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_prefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprogress_bar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_vc_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mdevice\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cuda\"\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_available\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;34m\"cpu\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
449
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\u001b[0m in \u001b[0;36mload_vc_model_by_name\u001b[1;34m(self, model_name, gpu)\u001b[0m\n\u001b[0;32m 361\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 362\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 363\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvoice_converter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSynthesizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 365\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_tts_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
450
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, tts_checkpoint, tts_config_path, tts_speakers_file, tts_languages_file, vocoder_checkpoint, vocoder_config, encoder_checkpoint, encoder_config, vc_checkpoint, vc_config, model_dir, voice_dir, use_cuda)\u001b[0m\n\u001b[0;32m 95\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_load_vc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutput_sample_rate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maudio\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"output_sample_rate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
451
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m_load_vc\u001b[1;34m(self, vc_checkpoint, vc_config_path, use_cuda)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;31m# pylint: disable=global-statement\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_config_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msetup_vc_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
452
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\u001b[0m in \u001b[0;36msetup_model\u001b[1;34m(config, samples)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mMyModel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimport_module\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"TTS.vc.models.freevc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mFreeVC\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMyModel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minit_from_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
453
+ "\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'model' referenced before assignment"
454
+ ]
455
+ }
456
+ ],
457
+ "source": [
458
+ "TTS_local(model_name='tts_models/en/ljspeech/tacotron2-DDC_ph', output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)"
459
+ ]
460
  }
461
  ],
462
  "metadata": {
Coqui.ai-Copy1.ipynb ADDED
@@ -0,0 +1,880 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 41,
6
+ "id": "4110138e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "import tempfile\n",
16
+ "\n",
17
+ "from TTS.api import TTS\n",
18
+ "from TTS.utils.manage import ModelManager\n",
19
+ "from TTS.utils.synthesizer import Synthesizer"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 78,
25
+ "id": "b7f07cd9",
26
+ "metadata": {
27
+ "scrolled": false
28
+ },
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
35
+ " > Model's license - MIT\n",
36
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
37
+ " > Using model: freevc\n",
38
+ " > Loading pretrained speaker encoder model ...\n",
39
+ "Loaded the voice encoder model on cpu in 0.02 seconds.\n",
40
+ "Running on local URL: http://127.0.0.1:7868\n",
41
+ "\n",
42
+ "To create a public link, set `share=True` in `launch()`.\n"
43
+ ]
44
+ },
45
+ {
46
+ "data": {
47
+ "text/html": [
48
+ "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
49
+ ],
50
+ "text/plain": [
51
+ "<IPython.core.display.HTML object>"
52
+ ]
53
+ },
54
+ "metadata": {},
55
+ "output_type": "display_data"
56
+ },
57
+ {
58
+ "data": {
59
+ "text/plain": []
60
+ },
61
+ "execution_count": 78,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ },
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
70
+ " > Model's license - apache 2.0\n",
71
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
72
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
73
+ " > Model's license - apache 2.0\n",
74
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
75
+ " > Using model: Tacotron2\n",
76
+ " > Setting up Audio Processor...\n",
77
+ " | > sample_rate:22050\n",
78
+ " | > resample:False\n",
79
+ " | > num_mels:80\n",
80
+ " | > log_func:np.log10\n",
81
+ " | > min_level_db:-100\n",
82
+ " | > frame_shift_ms:None\n",
83
+ " | > frame_length_ms:None\n",
84
+ " | > ref_level_db:20\n",
85
+ " | > fft_size:1024\n",
86
+ " | > power:1.5\n",
87
+ " | > preemphasis:0.0\n",
88
+ " | > griffin_lim_iters:60\n",
89
+ " | > signal_norm:True\n",
90
+ " | > symmetric_norm:True\n",
91
+ " | > mel_fmin:50.0\n",
92
+ " | > mel_fmax:7600.0\n",
93
+ " | > pitch_fmin:0.0\n",
94
+ " | > pitch_fmax:640.0\n",
95
+ " | > spec_gain:1.0\n",
96
+ " | > stft_pad_mode:reflect\n",
97
+ " | > max_norm:4.0\n",
98
+ " | > clip_norm:True\n",
99
+ " | > do_trim_silence:True\n",
100
+ " | > trim_db:60\n",
101
+ " | > do_sound_norm:False\n",
102
+ " | > do_amp_to_db_linear:True\n",
103
+ " | > do_amp_to_db_mel:True\n",
104
+ " | > do_rms_norm:False\n",
105
+ " | > db_level:None\n",
106
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
107
+ " | > base:10\n",
108
+ " | > hop_length:256\n",
109
+ " | > win_length:1024\n",
110
+ " > Model's reduction rate `r` is set to: 2\n",
111
+ " > Vocoder Model: univnet\n",
112
+ " > Setting up Audio Processor...\n",
113
+ " | > sample_rate:22050\n",
114
+ " | > resample:False\n",
115
+ " | > num_mels:80\n",
116
+ " | > log_func:np.log10\n",
117
+ " | > min_level_db:-100\n",
118
+ " | > frame_shift_ms:None\n",
119
+ " | > frame_length_ms:None\n",
120
+ " | > ref_level_db:20\n",
121
+ " | > fft_size:1024\n",
122
+ " | > power:1.5\n",
123
+ " | > preemphasis:0.0\n",
124
+ " | > griffin_lim_iters:60\n",
125
+ " | > signal_norm:True\n",
126
+ " | > symmetric_norm:True\n",
127
+ " | > mel_fmin:50.0\n",
128
+ " | > mel_fmax:7600.0\n",
129
+ " | > pitch_fmin:1.0\n",
130
+ " | > pitch_fmax:640.0\n",
131
+ " | > spec_gain:1.0\n",
132
+ " | > stft_pad_mode:reflect\n",
133
+ " | > max_norm:4.0\n",
134
+ " | > clip_norm:True\n",
135
+ " | > do_trim_silence:True\n",
136
+ " | > trim_db:60\n",
137
+ " | > do_sound_norm:False\n",
138
+ " | > do_amp_to_db_linear:True\n",
139
+ " | > do_amp_to_db_mel:True\n",
140
+ " | > do_rms_norm:False\n",
141
+ " | > db_level:None\n",
142
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
143
+ " | > base:10\n",
144
+ " | > hop_length:256\n",
145
+ " | > win_length:1024\n",
146
+ " > Generator Model: univnet_generator\n",
147
+ " > Discriminator Model: univnet_discriminator\n",
148
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
149
+ "language: \n",
150
+ "speaker: \n",
151
+ "Using original voice\n",
152
+ " > Text splitted to sentences.\n",
153
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
154
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
155
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
156
+ " > Processing time: 3.2799999713897705\n",
157
+ " > Real-time factor: 0.3775684898572943\n",
158
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
159
+ "language: \n",
160
+ "speaker: \n",
161
+ "voice cloning with the voice conversion model\n",
162
+ " > Text splitted to sentences.\n",
163
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
164
+ " > Processing time: 3.2300000190734863\n",
165
+ " > Real-time factor: 0.3718128780726402\n",
166
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
167
+ "language: \n",
168
+ "speaker: \n",
169
+ "voice cloning with the voice conversion model\n",
170
+ " > Text splitted to sentences.\n",
171
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
172
+ " > Processing time: 3.065000295639038\n",
173
+ " > Real-time factor: 0.3528193729057425\n",
174
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
175
+ "language: \n",
176
+ "speaker: \n",
177
+ "voice cloning with the voice conversion model\n",
178
+ " > Text splitted to sentences.\n",
179
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
180
+ " > Processing time: 2.9799997806549072\n",
181
+ " > Real-time factor: 0.3430347642595259\n",
182
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
183
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
184
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\9a558946172057b073ebcd01c8bec7e2d1ff998e\\hmm_i_dont_know-0-100.wav\n",
185
+ "model: voice_conversion_models/multilingual/vctk/freevc24\n",
186
+ "source_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\b6e9c24083a878478ebbecd7bc42e1f631c05df6\\henry5-0-100.wav\n",
187
+ "target_wav: C:\\Users\\Torch\\AppData\\Local\\Temp\\gradio\\f730b71860c5932c67deaae15949118446d6c7d7\\arctic_a0407_clb-0-100.wav\n",
188
+ " > tts_models/multilingual/multi-dataset/your_tts is already downloaded.\n",
189
+ " > Model's license - CC BY-NC-ND 4.0\n",
190
+ " > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.\n",
191
+ " > Using model: vits\n",
192
+ " > Setting up Audio Processor...\n",
193
+ " | > sample_rate:16000\n",
194
+ " | > resample:False\n",
195
+ " | > num_mels:80\n",
196
+ " | > log_func:np.log10\n",
197
+ " | > min_level_db:0\n",
198
+ " | > frame_shift_ms:None\n",
199
+ " | > frame_length_ms:None\n",
200
+ " | > ref_level_db:None\n",
201
+ " | > fft_size:1024\n",
202
+ " | > power:None\n",
203
+ " | > preemphasis:0.0\n",
204
+ " | > griffin_lim_iters:None\n",
205
+ " | > signal_norm:None\n",
206
+ " | > symmetric_norm:None\n",
207
+ " | > mel_fmin:0\n",
208
+ " | > mel_fmax:None\n",
209
+ " | > pitch_fmin:None\n",
210
+ " | > pitch_fmax:None\n",
211
+ " | > spec_gain:20.0\n",
212
+ " | > stft_pad_mode:reflect\n",
213
+ " | > max_norm:1.0\n",
214
+ " | > clip_norm:True\n",
215
+ " | > do_trim_silence:False\n",
216
+ " | > trim_db:60\n",
217
+ " | > do_sound_norm:False\n",
218
+ " | > do_amp_to_db_linear:True\n",
219
+ " | > do_amp_to_db_mel:True\n",
220
+ " | > do_rms_norm:False\n",
221
+ " | > db_level:None\n",
222
+ " | > stats_path:None\n",
223
+ " | > base:10\n",
224
+ " | > hop_length:256\n",
225
+ " | > win_length:1024\n",
226
+ " > Model fully restored. \n",
227
+ " > Setting up Audio Processor...\n",
228
+ " | > sample_rate:16000\n",
229
+ " | > resample:False\n",
230
+ " | > num_mels:64\n",
231
+ " | > log_func:np.log10\n",
232
+ " | > min_level_db:-100\n",
233
+ " | > frame_shift_ms:None\n",
234
+ " | > frame_length_ms:None\n",
235
+ " | > ref_level_db:20\n",
236
+ " | > fft_size:512\n",
237
+ " | > power:1.5\n",
238
+ " | > preemphasis:0.97\n",
239
+ " | > griffin_lim_iters:60\n",
240
+ " | > signal_norm:False\n",
241
+ " | > symmetric_norm:False\n",
242
+ " | > mel_fmin:0\n",
243
+ " | > mel_fmax:8000.0\n",
244
+ " | > pitch_fmin:1.0\n",
245
+ " | > pitch_fmax:640.0\n",
246
+ " | > spec_gain:20.0\n",
247
+ " | > stft_pad_mode:reflect\n",
248
+ " | > max_norm:4.0\n",
249
+ " | > clip_norm:False\n",
250
+ " | > do_trim_silence:False\n",
251
+ " | > trim_db:60\n",
252
+ " | > do_sound_norm:False\n",
253
+ " | > do_amp_to_db_linear:True\n",
254
+ " | > do_amp_to_db_mel:True\n",
255
+ " | > do_rms_norm:True\n",
256
+ " | > db_level:-27.0\n",
257
+ " | > stats_path:None\n",
258
+ " | > base:10\n",
259
+ " | > hop_length:160\n",
260
+ " | > win_length:400\n",
261
+ " > External Speaker Encoder Loaded !!\n",
262
+ " > initialization of language-embedding layers.\n",
263
+ " > Model fully restored. \n",
264
+ " > Setting up Audio Processor...\n",
265
+ " | > sample_rate:16000\n",
266
+ " | > resample:False\n",
267
+ " | > num_mels:64\n",
268
+ " | > log_func:np.log10\n",
269
+ " | > min_level_db:-100\n",
270
+ " | > frame_shift_ms:None\n",
271
+ " | > frame_length_ms:None\n",
272
+ " | > ref_level_db:20\n",
273
+ " | > fft_size:512\n",
274
+ " | > power:1.5\n",
275
+ " | > preemphasis:0.97\n",
276
+ " | > griffin_lim_iters:60\n",
277
+ " | > signal_norm:False\n",
278
+ " | > symmetric_norm:False\n",
279
+ " | > mel_fmin:0\n",
280
+ " | > mel_fmax:8000.0\n",
281
+ " | > pitch_fmin:1.0\n",
282
+ " | > pitch_fmax:640.0\n",
283
+ " | > spec_gain:20.0\n",
284
+ " | > stft_pad_mode:reflect\n",
285
+ " | > max_norm:4.0\n",
286
+ " | > clip_norm:False\n",
287
+ " | > do_trim_silence:False\n",
288
+ " | > trim_db:60\n",
289
+ " | > do_sound_norm:False\n",
290
+ " | > do_amp_to_db_linear:True\n",
291
+ " | > do_amp_to_db_mel:True\n",
292
+ " | > do_rms_norm:True\n",
293
+ " | > db_level:-27.0\n",
294
+ " | > stats_path:None\n",
295
+ " | > base:10\n",
296
+ " | > hop_length:160\n",
297
+ " | > win_length:400\n",
298
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
299
+ "language: en\n",
300
+ "speaker: female-en-5\n",
301
+ "Using original voice\n",
302
+ " > Text splitted to sentences.\n",
303
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
304
+ " > Processing time: 1.8219997882843018\n",
305
+ " > Real-time factor: 0.19457494535287287\n",
306
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
307
+ "language: en\n",
308
+ "speaker: female-en-5\n",
309
+ "voice cloning with the tts\n",
310
+ " > Text splitted to sentences.\n",
311
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
312
+ " > Processing time: 2.863999843597412\n",
313
+ " > Real-time factor: 0.3084877039635299\n",
314
+ "model: tts_models/multilingual/multi-dataset/your_tts\n",
315
+ "language: fr-fr\n",
316
+ "speaker: female-en-5\n",
317
+ "voice cloning with the tts\n",
318
+ " > Text splitted to sentences.\n",
319
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
320
+ " > Processing time: 2.546999931335449\n",
321
+ " > Real-time factor: 0.326036857569822\n",
322
+ " > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.\n",
323
+ " > Model's license - apache 2.0\n",
324
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
325
+ " > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.\n",
326
+ " > Model's license - apache 2.0\n",
327
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
328
+ " > Using model: Tacotron2\n",
329
+ " > Setting up Audio Processor...\n",
330
+ " | > sample_rate:22050\n",
331
+ " | > resample:False\n",
332
+ " | > num_mels:80\n",
333
+ " | > log_func:np.log\n",
334
+ " | > min_level_db:-100\n",
335
+ " | > frame_shift_ms:None\n",
336
+ " | > frame_length_ms:None\n",
337
+ " | > ref_level_db:20\n",
338
+ " | > fft_size:1024\n",
339
+ " | > power:1.5\n",
340
+ " | > preemphasis:0.0\n",
341
+ " | > griffin_lim_iters:60\n",
342
+ " | > signal_norm:False\n",
343
+ " | > symmetric_norm:True\n",
344
+ " | > mel_fmin:0\n",
345
+ " | > mel_fmax:8000.0\n",
346
+ " | > pitch_fmin:1.0\n",
347
+ " | > pitch_fmax:640.0\n",
348
+ " | > spec_gain:1.0\n",
349
+ " | > stft_pad_mode:reflect\n",
350
+ " | > max_norm:4.0\n",
351
+ " | > clip_norm:True\n",
352
+ " | > do_trim_silence:True\n",
353
+ " | > trim_db:60\n",
354
+ " | > do_sound_norm:False\n",
355
+ " | > do_amp_to_db_linear:True\n",
356
+ " | > do_amp_to_db_mel:True\n",
357
+ " | > do_rms_norm:False\n",
358
+ " | > db_level:None\n",
359
+ " | > stats_path:None\n",
360
+ " | > base:2.718281828459045\n",
361
+ " | > hop_length:256\n",
362
+ " | > win_length:1024\n"
363
+ ]
364
+ },
365
+ {
366
+ "name": "stdout",
367
+ "output_type": "stream",
368
+ "text": [
369
+ " > Model's reduction rate `r` is set to: 1\n",
370
+ " > Vocoder Model: hifigan\n",
371
+ " > Setting up Audio Processor...\n",
372
+ " | > sample_rate:22050\n",
373
+ " | > resample:False\n",
374
+ " | > num_mels:80\n",
375
+ " | > log_func:np.log\n",
376
+ " | > min_level_db:-100\n",
377
+ " | > frame_shift_ms:None\n",
378
+ " | > frame_length_ms:None\n",
379
+ " | > ref_level_db:20\n",
380
+ " | > fft_size:1024\n",
381
+ " | > power:1.5\n",
382
+ " | > preemphasis:0.0\n",
383
+ " | > griffin_lim_iters:60\n",
384
+ " | > signal_norm:False\n",
385
+ " | > symmetric_norm:True\n",
386
+ " | > mel_fmin:0\n",
387
+ " | > mel_fmax:8000.0\n",
388
+ " | > pitch_fmin:1.0\n",
389
+ " | > pitch_fmax:640.0\n",
390
+ " | > spec_gain:1.0\n",
391
+ " | > stft_pad_mode:reflect\n",
392
+ " | > max_norm:4.0\n",
393
+ " | > clip_norm:True\n",
394
+ " | > do_trim_silence:False\n",
395
+ " | > trim_db:60\n",
396
+ " | > do_sound_norm:False\n",
397
+ " | > do_amp_to_db_linear:True\n",
398
+ " | > do_amp_to_db_mel:True\n",
399
+ " | > do_rms_norm:False\n",
400
+ " | > db_level:None\n",
401
+ " | > stats_path:None\n",
402
+ " | > base:2.718281828459045\n",
403
+ " | > hop_length:256\n",
404
+ " | > win_length:1024\n",
405
+ " > Generator Model: hifigan_generator\n",
406
+ " > Discriminator Model: hifigan_discriminator\n",
407
+ "Removing weight norm...\n",
408
+ "model: tts_models/en/ljspeech/tacotron2-DDC\n",
409
+ "language: \n",
410
+ "speaker: \n",
411
+ "voice cloning with the voice conversion model\n",
412
+ " > Text splitted to sentences.\n",
413
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
414
+ " > Processing time: 4.205999851226807\n",
415
+ " > Real-time factor: 0.4124959824204343\n",
416
+ " > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.\n",
417
+ " > Model's license - MPL\n",
418
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
419
+ " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n",
420
+ " > Model's license - MPL\n",
421
+ " > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.\n",
422
+ " > Using model: Tacotron2\n",
423
+ " > Setting up Audio Processor...\n",
424
+ " | > sample_rate:22050\n",
425
+ " | > resample:False\n",
426
+ " | > num_mels:80\n",
427
+ " | > log_func:np.log10\n",
428
+ " | > min_level_db:-100\n",
429
+ " | > frame_shift_ms:None\n",
430
+ " | > frame_length_ms:None\n",
431
+ " | > ref_level_db:20\n",
432
+ " | > fft_size:1024\n",
433
+ " | > power:1.5\n",
434
+ " | > preemphasis:0.0\n",
435
+ " | > griffin_lim_iters:60\n",
436
+ " | > signal_norm:True\n",
437
+ " | > symmetric_norm:True\n",
438
+ " | > mel_fmin:50.0\n",
439
+ " | > mel_fmax:7600.0\n",
440
+ " | > pitch_fmin:0.0\n",
441
+ " | > pitch_fmax:640.0\n",
442
+ " | > spec_gain:1.0\n",
443
+ " | > stft_pad_mode:reflect\n",
444
+ " | > max_norm:4.0\n",
445
+ " | > clip_norm:True\n",
446
+ " | > do_trim_silence:True\n",
447
+ " | > trim_db:60\n",
448
+ " | > do_sound_norm:False\n",
449
+ " | > do_amp_to_db_linear:True\n",
450
+ " | > do_amp_to_db_mel:True\n",
451
+ " | > do_rms_norm:False\n",
452
+ " | > db_level:None\n",
453
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\tts_models--en--ljspeech--tacotron2-DCA\\scale_stats.npy\n",
454
+ " | > base:10\n",
455
+ " | > hop_length:256\n",
456
+ " | > win_length:1024\n",
457
+ " > Model's reduction rate `r` is set to: 2\n",
458
+ " > Vocoder Model: multiband_melgan\n",
459
+ " > Setting up Audio Processor...\n",
460
+ " | > sample_rate:22050\n",
461
+ " | > resample:False\n",
462
+ " | > num_mels:80\n",
463
+ " | > log_func:np.log10\n",
464
+ " | > min_level_db:-100\n",
465
+ " | > frame_shift_ms:None\n",
466
+ " | > frame_length_ms:None\n",
467
+ " | > ref_level_db:0\n",
468
+ " | > fft_size:1024\n",
469
+ " | > power:1.5\n",
470
+ " | > preemphasis:0.0\n",
471
+ " | > griffin_lim_iters:60\n",
472
+ " | > signal_norm:True\n",
473
+ " | > symmetric_norm:True\n",
474
+ " | > mel_fmin:50.0\n",
475
+ " | > mel_fmax:7600.0\n",
476
+ " | > pitch_fmin:0.0\n",
477
+ " | > pitch_fmax:640.0\n",
478
+ " | > spec_gain:1.0\n",
479
+ " | > stft_pad_mode:reflect\n",
480
+ " | > max_norm:4.0\n",
481
+ " | > clip_norm:True\n",
482
+ " | > do_trim_silence:True\n",
483
+ " | > trim_db:60\n",
484
+ " | > do_sound_norm:False\n",
485
+ " | > do_amp_to_db_linear:True\n",
486
+ " | > do_amp_to_db_mel:True\n",
487
+ " | > do_rms_norm:False\n",
488
+ " | > db_level:None\n",
489
+ " | > stats_path:C:/Users/Torch/AppData/Local\\tts\\vocoder_models--en--ljspeech--multiband-melgan\\scale_stats.npy\n",
490
+ " | > base:10\n",
491
+ " | > hop_length:256\n",
492
+ " | > win_length:1024\n",
493
+ " > Generator Model: multiband_melgan_generator\n",
494
+ " > Discriminator Model: melgan_multiscale_discriminator\n",
495
+ "model: tts_models/en/ljspeech/tacotron2-DCA\n",
496
+ "language: \n",
497
+ "speaker: \n",
498
+ "voice cloning with the voice conversion model\n",
499
+ " > Text splitted to sentences.\n",
500
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
501
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
502
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
503
+ " > Processing time: 2.384999990463257\n",
504
+ " > Real-time factor: 0.2687952332235178\n"
505
+ ]
506
+ }
507
+ ],
508
+ "source": [
509
+ "title = \"\"\n",
510
+ "description = \"\"\"\"\"\"\n",
511
+ "article = \"\"\"\"\"\"\n",
512
+ "\n",
513
+ "class TTS_local(TTS):\n",
514
+ " def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):\n",
515
+ " super().__init__(\n",
516
+ " model_name=None,\n",
517
+ " model_path=None,\n",
518
+ " config_path=None,\n",
519
+ " vocoder_path=None,\n",
520
+ " vocoder_config_path=None,\n",
521
+ " progress_bar=progress_bar,\n",
522
+ " gpu=False,\n",
523
+ " )\n",
524
+ " self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)\n",
525
+ " if model_name is not None:\n",
526
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
527
+ " self.load_tts_model_by_name(model_name, gpu)\n",
528
+ " elif \"voice_conversion_models\" in model_name:\n",
529
+ " self.load_vc_model_by_name(model_name, gpu) \n",
530
+ "\n",
531
+ " \n",
532
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
533
+ "GPU = device == \"cuda\"\n",
534
+ "INT16MAX = np.iinfo(np.int16).max\n",
535
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
536
+ "MANAGER = ModelManager(verbose=False)\n",
537
+ "\n",
538
+ "model_ids = MANAGER.list_models()\n",
539
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
540
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
541
+ "model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
542
+ "model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
543
+ "\n",
544
+ "VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24', \n",
545
+ " output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
546
+ "\n",
547
+ "examples_pt = 'examples'\n",
548
+ "allowed_extentions = ['.mp3', '.wav']\n",
549
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
550
+ "verse = \"\"\"Mary had a little lamb,\n",
551
+ "Its fleece was white as snow.\n",
552
+ "Everywhere the child went,\n",
553
+ "The little lamb was sure to go.\"\"\"\n",
554
+ "\n",
555
+ "\n",
556
+ "def on_model_tts_select(model_name):\n",
557
+ " tts_var = TTS_local(model_name=model_name, output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)\n",
558
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
559
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
560
+ " language = languages[0]\n",
561
+ " speaker = speakers[0]\n",
562
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
563
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
564
+ "\n",
565
+ "\n",
566
+ "def on_voicedropdown(x):\n",
567
+ " return examples[x]\n",
568
+ "\n",
569
+ "\n",
570
+ "def voice_clone(source_wav, target_wav):\n",
571
+ " print(f'model: {VC_MODEL.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
572
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
573
+ " if source_wav is None or target_wav is None:\n",
574
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
575
+ "\n",
576
+ " speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
577
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
578
+ " return (sample_rate, speech)\n",
579
+ "\n",
580
+ "\n",
581
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
582
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
583
+ " return (16000, np.zeros(0).astype(np.int16))\n",
584
+ "\n",
585
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
586
+ " if tts_model.is_multi_speaker:\n",
587
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
588
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
589
+ "\n",
590
+ " language = None if language == '' else language\n",
591
+ " speaker = None if speaker == '' else speaker\n",
592
+ " if use_original_voice:\n",
593
+ " print('Using original voice')\n",
594
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
595
+ " elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:\n",
596
+ " print('voice cloning with the tts')\n",
597
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
598
+ " else:\n",
599
+ " print('voice cloning with the voice conversion model')\n",
600
+ "# speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
601
+ " with tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False) as fp:\n",
602
+ " # Lazy code... save it to a temp file to resample it while reading it for VC\n",
603
+ " tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)\n",
604
+ " speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)\n",
605
+ " sample_rate = VC_MODEL.voice_converter.output_sample_rate\n",
606
+ " \n",
607
+ "\n",
608
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
609
+ " return (sample_rate, speech)\n",
610
+ "\n",
611
+ "\n",
612
+ "with gr.Blocks() as demo:\n",
613
+ " tts_model = gr.State(None)\n",
614
+ "# vc_model = gr.State(None)\n",
615
+ " def activate(*args):\n",
616
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
617
+ " def deactivate(*args):\n",
618
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
619
+ "\n",
620
+ " \n",
621
+ " gr.Markdown(description)\n",
622
+ "\n",
623
+ " with gr.Row(equal_height=True):\n",
624
+ " with gr.Column(scale=5, min_width=50):\n",
625
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)\n",
626
+ " with gr.Column(scale=1, min_width=10):\n",
627
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
628
+ " with gr.Column(scale=1, min_width=10):\n",
629
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
630
+ " \n",
631
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
632
+ " gr.Markdown(\"Upload target voice...\")\n",
633
+ " with gr.Row(equal_height=True):\n",
634
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
635
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
636
+ "\n",
637
+ " with gr.Row(equal_height=True):\n",
638
+ " with gr.Column(scale=2):\n",
639
+ " with gr.Row(equal_height=True):\n",
640
+ " with gr.Column():\n",
641
+ " text_to_convert = gr.Textbox(verse)\n",
642
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
643
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
644
+ " with gr.Row(equal_height=True):\n",
645
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
646
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
647
+ " with gr.Row(equal_height=True):\n",
648
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
649
+ " \n",
650
+ " # actions\n",
651
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
652
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
653
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
654
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
655
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
656
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
657
+ "\n",
658
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
659
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
660
+ " outputs=speech).\\\n",
661
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
662
+ "\n",
663
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
664
+ " then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\\\n",
665
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
666
+ " \n",
667
+ " gr.HTML(article)\n",
668
+ "demo.launch(share=False)"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": 40,
674
+ "id": "d97a1ab5",
675
+ "metadata": {},
676
+ "outputs": [
677
+ {
678
+ "name": "stdout",
679
+ "output_type": "stream",
680
+ "text": [
681
+ " > tts_models/en/blizzard2013/capacitron-t2-c50 is already downloaded.\n",
682
+ " > Model's license - apache 2.0\n",
683
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
684
+ " > vocoder_models/en/blizzard2013/hifigan_v2 is already downloaded.\n",
685
+ " > Model's license - apache 2.0\n",
686
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
687
+ " > Using model: tacotron2\n",
688
+ " > Setting up Audio Processor...\n",
689
+ " | > sample_rate:24000\n",
690
+ " | > resample:False\n",
691
+ " | > num_mels:80\n",
692
+ " | > log_func:np.log10\n",
693
+ " | > min_level_db:-100\n",
694
+ " | > frame_shift_ms:None\n",
695
+ " | > frame_length_ms:None\n",
696
+ " | > ref_level_db:20\n",
697
+ " | > fft_size:1024\n",
698
+ " | > power:1.5\n",
699
+ " | > preemphasis:0.0\n",
700
+ " | > griffin_lim_iters:60\n",
701
+ " | > signal_norm:True\n",
702
+ " | > symmetric_norm:True\n",
703
+ " | > mel_fmin:80.0\n",
704
+ " | > mel_fmax:12000.0\n",
705
+ " | > pitch_fmin:0.0\n",
706
+ " | > pitch_fmax:640.0\n",
707
+ " | > spec_gain:25.0\n",
708
+ " | > stft_pad_mode:reflect\n",
709
+ " | > max_norm:4.0\n",
710
+ " | > clip_norm:True\n",
711
+ " | > do_trim_silence:True\n",
712
+ " | > trim_db:60\n",
713
+ " | > do_sound_norm:False\n",
714
+ " | > do_amp_to_db_linear:True\n",
715
+ " | > do_amp_to_db_mel:True\n",
716
+ " | > do_rms_norm:False\n",
717
+ " | > db_level:None\n",
718
+ " | > stats_path:None\n",
719
+ " | > base:10\n",
720
+ " | > hop_length:256\n",
721
+ " | > win_length:1024\n",
722
+ " > Model's reduction rate `r` is set to: 2\n",
723
+ " > Vocoder Model: hifigan\n",
724
+ " > Setting up Audio Processor...\n",
725
+ " | > sample_rate:24000\n",
726
+ " | > resample:False\n",
727
+ " | > num_mels:80\n",
728
+ " | > log_func:np.log10\n",
729
+ " | > min_level_db:-100\n",
730
+ " | > frame_shift_ms:None\n",
731
+ " | > frame_length_ms:None\n",
732
+ " | > ref_level_db:20\n",
733
+ " | > fft_size:1024\n",
734
+ " | > power:1.5\n",
735
+ " | > preemphasis:0.0\n",
736
+ " | > griffin_lim_iters:60\n",
737
+ " | > signal_norm:True\n",
738
+ " | > symmetric_norm:True\n",
739
+ " | > mel_fmin:80.0\n",
740
+ " | > mel_fmax:12000.0\n",
741
+ " | > pitch_fmin:1.0\n",
742
+ " | > pitch_fmax:640.0\n",
743
+ " | > spec_gain:20.0\n",
744
+ " | > stft_pad_mode:reflect\n",
745
+ " | > max_norm:4.0\n",
746
+ " | > clip_norm:True\n",
747
+ " | > do_trim_silence:False\n",
748
+ " | > trim_db:60\n",
749
+ " | > do_sound_norm:True\n",
750
+ " | > do_amp_to_db_linear:True\n",
751
+ " | > do_amp_to_db_mel:True\n",
752
+ " | > do_rms_norm:False\n",
753
+ " | > db_level:None\n",
754
+ " | > stats_path:None\n",
755
+ " | > base:10\n",
756
+ " | > hop_length:256\n",
757
+ " | > win_length:1024\n",
758
+ " > Generator Model: hifigan_generator\n",
759
+ " > Discriminator Model: hifigan_discriminator\n",
760
+ "Removing weight norm...\n"
761
+ ]
762
+ },
763
+ {
764
+ "data": {
765
+ "text/plain": [
766
+ "<TTS.utils.synthesizer.Synthesizer at 0x498b2588>"
767
+ ]
768
+ },
769
+ "execution_count": 40,
770
+ "metadata": {},
771
+ "output_type": "execute_result"
772
+ }
773
+ ],
774
+ "source": [
775
+ "from TTS.utils.synthesizer import Synthesizer\n",
776
+ "\n",
777
+ "MODEL_DIR = 'C:/Users/Torch/AppData/Local'\n",
778
+ "MANAGER = ModelManager(output_prefix=MODEL_DIR, verbose=False)\n",
779
+ "\n",
780
+ "model_ids = manager.list_models()\n",
781
+ "local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]\n",
782
+ "model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
783
+ "\n",
784
+ "\n",
785
+ "def load_local_checkpoint(model_name, use_cuda):\n",
786
+ " model_path = None\n",
787
+ " config_path = None\n",
788
+ " speakers_file_path = None\n",
789
+ " vocoder_path = None\n",
790
+ " vocoder_config_path = None\n",
791
+ "\n",
792
+ " model_path, config_path, model_item = MANAGER.download_model(model_name)\n",
793
+ " vocoder_name = model_item[\"default_vocoder\"]\n",
794
+ " if vocoder_name is not None:\n",
795
+ " vocoder_path, vocoder_config_path, _ = MANAGER.download_model(vocoder_name)\n",
796
+ " \n",
797
+ " if \"tts_models\" in model_name or \"coqui_studio\" in model_name:\n",
798
+ " synthesizer = Synthesizer(\n",
799
+ " tts_checkpoint=model_path,\n",
800
+ " tts_config_path=config_path,\n",
801
+ " tts_speakers_file=speakers_file_path,\n",
802
+ " tts_languages_file=None,\n",
803
+ " vocoder_checkpoint=vocoder_path,\n",
804
+ " vocoder_config=vocoder_config_path,\n",
805
+ " encoder_checkpoint=\"\",\n",
806
+ " encoder_config=\"\",\n",
807
+ " use_cuda=use_cuda,\n",
808
+ " )\n",
809
+ " elif \"voice_conversion_models\" in model_name:\n",
810
+ " self.load_vc_model_by_name(model_name, gpu)\n",
811
+ "\n",
812
+ " return synthesizer\n",
813
+ "\n",
814
+ "model_name = model_tts_ids[0]\n",
815
+ "load_local_checkpoint(model_name, use_cuda=False)"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "execution_count": 77,
821
+ "id": "35c8a08c",
822
+ "metadata": {},
823
+ "outputs": [
824
+ {
825
+ "name": "stdout",
826
+ "output_type": "stream",
827
+ "text": [
828
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
829
+ " > Model's license - apache 2.0\n",
830
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
831
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
832
+ " > Model's license - apache 2.0\n",
833
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
834
+ " > Using model: Tacotron2\n"
835
+ ]
836
+ },
837
+ {
838
+ "ename": "UnboundLocalError",
839
+ "evalue": "local variable 'model' referenced before assignment",
840
+ "output_type": "error",
841
+ "traceback": [
842
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
843
+ "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
844
+ "\u001b[1;32m<ipython-input-77-6dbf83b539b0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTTS_local\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'tts_models/en/ljspeech/tacotron2-DDC_ph'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mMODEL_DIR\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mGPU\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
845
+ "\u001b[1;32m<ipython-input-76-b1dd8c5769eb>\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, model_name, output_prefix, progress_bar, gpu)\u001b[0m\n\u001b[0;32m 15\u001b[0m )\n\u001b[0;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmanager\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mModelManager\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodels_file\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_models_file_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_prefix\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_prefix\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprogress_bar\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprogress_bar\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_vc_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mdevice\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"cuda\"\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_available\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;34m\"cpu\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
846
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\api.py\u001b[0m in \u001b[0;36mload_vc_model_by_name\u001b[1;34m(self, model_name, gpu)\u001b[0m\n\u001b[0;32m 361\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 362\u001b[0m \u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 363\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvoice_converter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSynthesizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mgpu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 365\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mload_tts_model_by_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgpu\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
847
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, tts_checkpoint, tts_config_path, tts_speakers_file, tts_languages_file, vocoder_checkpoint, vocoder_config, encoder_checkpoint, encoder_config, vc_checkpoint, vc_config, model_dir, voice_dir, use_cuda)\u001b[0m\n\u001b[0;32m 95\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_load_vc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutput_sample_rate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maudio\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"output_sample_rate\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
848
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\utils\\synthesizer.py\u001b[0m in \u001b[0;36m_load_vc\u001b[1;34m(self, vc_checkpoint, vc_config_path, use_cuda)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;31m# pylint: disable=global-statement\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvc_config_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msetup_vc_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_checkpoint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvc_config\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvc_checkpoint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
849
+ "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\TTS\\vc\\models\\__init__.py\u001b[0m in \u001b[0;36msetup_model\u001b[1;34m(config, samples)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mMyModel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimport_module\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"TTS.vc.models.freevc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mFreeVC\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMyModel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minit_from_config\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
850
+ "\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'model' referenced before assignment"
851
+ ]
852
+ }
853
+ ],
854
+ "source": [
855
+ "TTS_local(model_name='tts_models/en/ljspeech/tacotron2-DDC_ph', output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)"
856
+ ]
857
+ }
858
+ ],
859
+ "metadata": {
860
+ "kernelspec": {
861
+ "display_name": "Python 3",
862
+ "language": "python",
863
+ "name": "python3"
864
+ },
865
+ "language_info": {
866
+ "codemirror_mode": {
867
+ "name": "ipython",
868
+ "version": 3
869
+ },
870
+ "file_extension": ".py",
871
+ "mimetype": "text/x-python",
872
+ "name": "python",
873
+ "nbconvert_exporter": "python",
874
+ "pygments_lexer": "ipython3",
875
+ "version": "3.7.9"
876
+ }
877
+ },
878
+ "nbformat": 4,
879
+ "nbformat_minor": 5
880
+ }
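
The UnboundLocalError above originates in TTS.vc.models.setup_model: `model` is only assigned on the freevc branch (the guard itself is elided from the traceback), so routing a `tts_models/...` name through `load_vc_model_by_name` (which this early draft of TTS_local did unconditionally) reaches `return model` with nothing bound. A minimal, self-contained sketch of the failure mode; the names below are illustrative stand-ins, not the actual TTS source:

from types import SimpleNamespace

def setup_vc_model_sketch(config):
    # Mirrors the shape of TTS.vc.models.setup_model seen in the traceback:
    # `model` is bound only when the config describes a freevc model.
    if config.model == "freevc":
        model = "FreeVC instance"  # stand-in for FreeVC.init_from_config(config)
    return model  # UnboundLocalError for any non-freevc config

setup_vc_model_sketch(SimpleNamespace(model="tacotron2"))  # raises UnboundLocalError

The TTS_local class committed in app.py below avoids this by dispatching on the model-name prefix ("tts_models" vs "voice_conversion_models") before choosing a loader.
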
Coqui.ai.ipynb ADDED
@@ -0,0 +1,425 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "e65fcd73",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import gradio as gr\n",
11
+ "import numpy as np\n",
12
+ "import torch\n",
13
+ "import torch.nn.functional as F\n",
14
+ "from pathlib import Path\n",
15
+ "\n",
16
+ "from TTS.api import TTS\n",
17
+ "from TTS.utils.manage import ModelManager"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "id": "f902a92c",
24
+ "metadata": {
25
+ "scrolled": false
26
+ },
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "Running on local URL: http://127.0.0.1:7860\n",
33
+ "\n",
34
+ "To create a public link, set `share=True` in `launch()`.\n"
35
+ ]
36
+ },
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
41
+ ],
42
+ "text/plain": [
43
+ "<IPython.core.display.HTML object>"
44
+ ]
45
+ },
46
+ "metadata": {},
47
+ "output_type": "display_data"
48
+ },
49
+ {
50
+ "data": {
51
+ "text/plain": []
52
+ },
53
+ "execution_count": 2,
54
+ "metadata": {},
55
+ "output_type": "execute_result"
56
+ },
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
62
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
63
+ " > Model's license - apache 2.0\n",
64
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
65
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
66
+ " > Model's license - apache 2.0\n",
67
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
68
+ " > Using model: Tacotron2\n",
69
+ " > Setting up Audio Processor...\n",
70
+ " | > sample_rate:22050\n",
71
+ " | > resample:False\n",
72
+ " | > num_mels:80\n",
73
+ " | > log_func:np.log10\n",
74
+ " | > min_level_db:-100\n",
75
+ " | > frame_shift_ms:None\n",
76
+ " | > frame_length_ms:None\n",
77
+ " | > ref_level_db:20\n",
78
+ " | > fft_size:1024\n",
79
+ " | > power:1.5\n",
80
+ " | > preemphasis:0.0\n",
81
+ " | > griffin_lim_iters:60\n",
82
+ " | > signal_norm:True\n",
83
+ " | > symmetric_norm:True\n",
84
+ " | > mel_fmin:50.0\n",
85
+ " | > mel_fmax:7600.0\n",
86
+ " | > pitch_fmin:0.0\n",
87
+ " | > pitch_fmax:640.0\n",
88
+ " | > spec_gain:1.0\n",
89
+ " | > stft_pad_mode:reflect\n",
90
+ " | > max_norm:4.0\n",
91
+ " | > clip_norm:True\n",
92
+ " | > do_trim_silence:True\n",
93
+ " | > trim_db:60\n",
94
+ " | > do_sound_norm:False\n",
95
+ " | > do_amp_to_db_linear:True\n",
96
+ " | > do_amp_to_db_mel:True\n",
97
+ " | > do_rms_norm:False\n",
98
+ " | > db_level:None\n",
99
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
100
+ " | > base:10\n",
101
+ " | > hop_length:256\n",
102
+ " | > win_length:1024\n",
103
+ " > Model's reduction rate `r` is set to: 2\n",
104
+ " > Vocoder Model: univnet\n",
105
+ " > Setting up Audio Processor...\n",
106
+ " | > sample_rate:22050\n",
107
+ " | > resample:False\n",
108
+ " | > num_mels:80\n",
109
+ " | > log_func:np.log10\n",
110
+ " | > min_level_db:-100\n",
111
+ " | > frame_shift_ms:None\n",
112
+ " | > frame_length_ms:None\n",
113
+ " | > ref_level_db:20\n",
114
+ " | > fft_size:1024\n",
115
+ " | > power:1.5\n",
116
+ " | > preemphasis:0.0\n",
117
+ " | > griffin_lim_iters:60\n",
118
+ " | > signal_norm:True\n",
119
+ " | > symmetric_norm:True\n",
120
+ " | > mel_fmin:50.0\n",
121
+ " | > mel_fmax:7600.0\n",
122
+ " | > pitch_fmin:1.0\n",
123
+ " | > pitch_fmax:640.0\n",
124
+ " | > spec_gain:1.0\n",
125
+ " | > stft_pad_mode:reflect\n",
126
+ " | > max_norm:4.0\n",
127
+ " | > clip_norm:True\n",
128
+ " | > do_trim_silence:True\n",
129
+ " | > trim_db:60\n",
130
+ " | > do_sound_norm:False\n",
131
+ " | > do_amp_to_db_linear:True\n",
132
+ " | > do_amp_to_db_mel:True\n",
133
+ " | > do_rms_norm:False\n",
134
+ " | > db_level:None\n",
135
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
136
+ " | > base:10\n",
137
+ " | > hop_length:256\n",
138
+ " | > win_length:1024\n",
139
+ " > Generator Model: univnet_generator\n",
140
+ " > Discriminator Model: univnet_discriminator\n",
141
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
142
+ "language: \n",
143
+ "speaker: \n",
144
+ "voice cloning with the voice conversion model\n",
145
+ " > Text splitted to sentences.\n",
146
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
147
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
148
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
149
+ " > Processing time: 3.4810001850128174\n",
150
+ " > Real-time factor: 0.400706095887971\n",
151
+ " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n",
152
+ " > Model's license - MIT\n",
153
+ " > Check https://choosealicense.com/licenses/mit/ for more info.\n",
154
+ " > Using model: freevc\n",
155
+ " > Loading pretrained speaker encoder model ...\n",
156
+ "Loaded the voice encoder model on cpu in 0.09 seconds.\n",
157
+ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n",
158
+ " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n",
159
+ " > Model's license - apache 2.0\n",
160
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
161
+ " > vocoder_models/en/ljspeech/univnet is already downloaded.\n",
162
+ " > Model's license - apache 2.0\n",
163
+ " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n",
164
+ " > Using model: Tacotron2\n",
165
+ " > Setting up Audio Processor...\n",
166
+ " | > sample_rate:22050\n",
167
+ " | > resample:False\n",
168
+ " | > num_mels:80\n",
169
+ " | > log_func:np.log10\n",
170
+ " | > min_level_db:-100\n",
171
+ " | > frame_shift_ms:None\n",
172
+ " | > frame_length_ms:None\n",
173
+ " | > ref_level_db:20\n",
174
+ " | > fft_size:1024\n",
175
+ " | > power:1.5\n",
176
+ " | > preemphasis:0.0\n",
177
+ " | > griffin_lim_iters:60\n",
178
+ " | > signal_norm:True\n",
179
+ " | > symmetric_norm:True\n",
180
+ " | > mel_fmin:50.0\n",
181
+ " | > mel_fmax:7600.0\n",
182
+ " | > pitch_fmin:0.0\n",
183
+ " | > pitch_fmax:640.0\n",
184
+ " | > spec_gain:1.0\n",
185
+ " | > stft_pad_mode:reflect\n",
186
+ " | > max_norm:4.0\n",
187
+ " | > clip_norm:True\n",
188
+ " | > do_trim_silence:True\n",
189
+ " | > trim_db:60\n",
190
+ " | > do_sound_norm:False\n",
191
+ " | > do_amp_to_db_linear:True\n",
192
+ " | > do_amp_to_db_mel:True\n",
193
+ " | > do_rms_norm:False\n",
194
+ " | > db_level:None\n",
195
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n",
196
+ " | > base:10\n",
197
+ " | > hop_length:256\n",
198
+ " | > win_length:1024\n",
199
+ " > Model's reduction rate `r` is set to: 2\n",
200
+ " > Vocoder Model: univnet\n",
201
+ " > Setting up Audio Processor...\n",
202
+ " | > sample_rate:22050\n",
203
+ " | > resample:False\n",
204
+ " | > num_mels:80\n",
205
+ " | > log_func:np.log10\n",
206
+ " | > min_level_db:-100\n",
207
+ " | > frame_shift_ms:None\n",
208
+ " | > frame_length_ms:None\n",
209
+ " | > ref_level_db:20\n",
210
+ " | > fft_size:1024\n",
211
+ " | > power:1.5\n",
212
+ " | > preemphasis:0.0\n",
213
+ " | > griffin_lim_iters:60\n",
214
+ " | > signal_norm:True\n",
215
+ " | > symmetric_norm:True\n",
216
+ " | > mel_fmin:50.0\n",
217
+ " | > mel_fmax:7600.0\n",
218
+ " | > pitch_fmin:1.0\n",
219
+ " | > pitch_fmax:640.0\n",
220
+ " | > spec_gain:1.0\n",
221
+ " | > stft_pad_mode:reflect\n",
222
+ " | > max_norm:4.0\n",
223
+ " | > clip_norm:True\n",
224
+ " | > do_trim_silence:True\n",
225
+ " | > trim_db:60\n",
226
+ " | > do_sound_norm:False\n",
227
+ " | > do_amp_to_db_linear:True\n",
228
+ " | > do_amp_to_db_mel:True\n",
229
+ " | > do_rms_norm:False\n",
230
+ " | > db_level:None\n",
231
+ " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n",
232
+ " | > base:10\n",
233
+ " | > hop_length:256\n",
234
+ " | > win_length:1024\n",
235
+ " > Generator Model: univnet_generator\n",
236
+ " > Discriminator Model: univnet_discriminator\n",
237
+ "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n",
238
+ "language: \n",
239
+ "speaker: \n",
240
+ "Using original voice\n",
241
+ " > Text splitted to sentences.\n",
242
+ "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n",
243
+ "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n",
244
+ " [!] Character '͡' not found in the vocabulary. Discarding it.\n",
245
+ " > Processing time: 2.931999921798706\n",
246
+ " > Real-time factor: 0.3375093879242267\n"
247
+ ]
248
+ }
249
+ ],
250
+ "source": [
251
+ "title = \"\"\n",
252
+ "description = \"\"\"\"\"\"\n",
253
+ "article = \"\"\"\"\"\"\n",
254
+ "\n",
255
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
256
+ "GPU = device == \"cuda\"\n",
257
+ "INT16MAX = np.iinfo(np.int16).max\n",
258
+ "\n",
259
+ "model_ids = ModelManager(verbose=False).list_models()\n",
260
+ "model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
261
+ "model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]\n",
262
+ "model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]\n",
263
+ "examples_pt = 'examples'\n",
264
+ "allowed_extentions = ['.mp3', '.wav']\n",
265
+ "examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}\n",
266
+ "verse = \"\"\"Mary had a little lamb,\n",
267
+ "Its fleece was white as snow.\n",
268
+ "Everywhere the child went,\n",
269
+ "The little lamb was sure to go.\"\"\"\n",
270
+ "\n",
271
+ "\n",
272
+ "\n",
273
+ "def on_model_tts_select(model_name, tts_var):\n",
274
+ " if tts_var is None or tts_var.model_name != model_name:\n",
275
+ " print(f'Loading TTS model from {model_name}')\n",
276
+ " tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
277
+ " else:\n",
278
+ " print(f'Passing through TTS model {tts_var.model_name}')\n",
279
+ " languages = tts_var.languages if tts_var.is_multi_lingual else ['']\n",
280
+ " speakers = [s.replace('\\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting\n",
281
+ " language = languages[0]\n",
282
+ " speaker = speakers[0]\n",
283
+ " return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\\\n",
284
+ " gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)\n",
285
+ "\n",
286
+ "\n",
287
+ "def on_model_vc_select(model_name, vc_var):\n",
288
+ " if vc_var is None or vc_var.model_name != model_name:\n",
289
+ " print(f'Loading voice conversion model from {model_name}')\n",
290
+ " vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)\n",
291
+ " else:\n",
292
+ " print(f'Passing through voice conversion model {vc_var.model_name}')\n",
293
+ " return vc_var\n",
294
+ "\n",
295
+ "\n",
296
+ "def on_voicedropdown(x):\n",
297
+ " return examples[x]\n",
298
+ "\n",
299
+ "\n",
300
+ "def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):\n",
301
+ " if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):\n",
302
+ " return (16000, np.zeros(0).astype(np.int16))\n",
303
+ " \n",
304
+ " sample_rate = tts_model.synthesizer.output_sample_rate\n",
305
+ " if tts_model.is_multi_speaker:\n",
306
+ " speaker = {s.replace('\\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting\n",
307
+ " print(f'model: {tts_model.model_name}\\nlanguage: {language}\\nspeaker: {speaker}')\n",
308
+ " \n",
309
+ " language = None if language == '' else language\n",
310
+ " speaker = None if speaker == '' else speaker\n",
311
+ " if use_original_voice:\n",
312
+ " print('Using original voice')\n",
313
+ " speech = tts_model.tts(text, language=language, speaker=speaker) \n",
314
+ " elif tts_model.synthesizer.tts_model.speaker_manager:\n",
315
+ " print('voice cloning with the tts')\n",
316
+ " speech = tts_model.tts(text, language=language, speaker_wav=target_wav)\n",
317
+ " else:\n",
318
+ " print('voice cloning with the voice conversion model')\n",
319
+ " speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)\n",
320
+ "\n",
321
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
322
+ " return (sample_rate, speech)\n",
323
+ "\n",
324
+ "\n",
325
+ "def voice_clone(vc_model, source_wav, target_wav):\n",
326
+ " print(f'model: {vc_model.model_name}\\nsource_wav: {source_wav}\\ntarget_wav: {target_wav}')\n",
327
+ " sample_rate = vc_model.voice_converter.output_sample_rate\n",
328
+ " if vc_model is None or source_wav is None or target_wav is None:\n",
329
+ " return (sample_rate, np.zeros(0).astype(np.int16))\n",
330
+ "\n",
331
+ " speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)\n",
332
+ " speech = (np.array(speech) * INT16MAX).astype(np.int16)\n",
333
+ " return (sample_rate, speech)\n",
334
+ "\n",
335
+ "\n",
336
+ "with gr.Blocks() as demo:\n",
337
+ " tts_model = gr.State(None)\n",
338
+ " vc_model = gr.State(None)\n",
339
+ " def activate(*args):\n",
340
+ " return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)\n",
341
+ " def deactivate(*args):\n",
342
+ " return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)\n",
343
+ "\n",
344
+ " gr.Markdown(description)\n",
345
+ "\n",
346
+ " with gr.Row(equal_height=True):\n",
347
+ " with gr.Column(scale=5, min_width=50):\n",
348
+ " model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)\n",
349
+ " with gr.Column(scale=1, min_width=10):\n",
350
+ " language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)\n",
351
+ " with gr.Column(scale=1, min_width=10):\n",
352
+ " speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)\n",
353
+ " with gr.Column(scale=5, min_width=50):\n",
354
+ " with gr.Row(equal_height=True):\n",
355
+ "# model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)\n",
356
+ " model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)\n",
357
+ " \n",
358
+ " with gr.Accordion(\"Target voice\", open=False) as accordion:\n",
359
+ " gr.Markdown(\"Upload target voice...\")\n",
360
+ " with gr.Row(equal_height=True):\n",
361
+ " voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')\n",
362
+ " voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)\n",
363
+ "\n",
364
+ " with gr.Row(equal_height=True):\n",
365
+ " with gr.Column(scale=2):\n",
366
+ " with gr.Row(equal_height=True):\n",
367
+ " with gr.Column():\n",
368
+ " text_to_convert = gr.Textbox(verse)\n",
369
+ " orig_voice = gr.Checkbox(label='Use original voice')\n",
370
+ " voice_to_convert = gr.Audio(label=\"Upload voice to convert\", source='upload', type='filepath')\n",
371
+ " with gr.Row(equal_height=True):\n",
372
+ " button_text = gr.Button('Text to speech', interactive=True)\n",
373
+ " button_audio = gr.Button('Convert audio', interactive=True)\n",
374
+ " with gr.Row(equal_height=True):\n",
375
+ " speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) \n",
376
+ " \n",
377
+ " # actions\n",
378
+ " model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
379
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
380
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
381
+ " model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
382
+ " then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
383
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
384
+ " voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
385
+ " then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\\\n",
386
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
387
+ " \n",
388
+ " button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
389
+ " then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\\\n",
390
+ " then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], \n",
391
+ " outputs=speech).\\\n",
392
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
393
+ "\n",
394
+ " button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\\\n",
395
+ " then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\\\n",
396
+ " then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\\\n",
397
+ " then(activate, [button_text, button_audio], [button_text, button_audio])\n",
398
+ " \n",
399
+ " gr.HTML(article)\n",
400
+ "demo.launch(share=False)"
401
+ ]
402
+ }
403
+ ],
404
+ "metadata": {
405
+ "kernelspec": {
406
+ "display_name": "Python 3",
407
+ "language": "python",
408
+ "name": "python3"
409
+ },
410
+ "language_info": {
411
+ "codemirror_mode": {
412
+ "name": "ipython",
413
+ "version": 3
414
+ },
415
+ "file_extension": ".py",
416
+ "mimetype": "text/x-python",
417
+ "name": "python",
418
+ "nbconvert_exporter": "python",
419
+ "pygments_lexer": "ipython3",
420
+ "version": "3.7.9"
421
+ }
422
+ },
423
+ "nbformat": 4,
424
+ "nbformat_minor": 5
425
+ }
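
Both notebooks hand audio to gr.Audio(type='numpy') as a (sample rate, int16 array) pair by scaling the float waveform with np.iinfo(np.int16).max. A self-contained sketch of that conversion; the clipping step is an addition here (samples slightly outside [-1, 1] would otherwise wrap on the cast), not part of the committed code:

import numpy as np

INT16MAX = np.iinfo(np.int16).max  # 32767

def float_to_int16(speech):
    # Scale a float waveform in [-1, 1] to int16, as the notebooks do,
    # but clip first so out-of-range samples cannot overflow.
    speech = np.clip(np.asarray(speech, dtype=np.float32), -1.0, 1.0)
    return (speech * INT16MAX).astype(np.int16)

# Example: a 440 Hz tone at 22050 Hz, shaped like the notebooks' return value
sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
audio = (sr, float_to_int16(0.5 * np.sin(2 * np.pi * 440 * t)))
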
app.bak.py ADDED
@@ -0,0 +1,160 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from pathlib import Path
6
+
7
+ from TTS.api import TTS
8
+ from TTS.utils.manage import ModelManager
9
+
10
+
11
+ title = ""
12
+ description = """"""
13
+ article = """"""
14
+
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ GPU = device == "cuda"
17
+ INT16MAX = np.iinfo(np.int16).max
18
+
19
+ model_ids = ModelManager(verbose=False).list_models()
20
+ model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
21
+ model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
22
+ model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
23
+ examples_pt = 'examples'
24
+ allowed_extentions = ['.mp3', '.wav']
25
+ examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
26
+ verse = """Mary had a little lamb,
27
+ Its fleece was white as snow.
28
+ Everywhere the child went,
29
+ The little lamb was sure to go."""
30
+
31
+
32
+
33
+ def on_model_tts_select(model_name, tts_var):
34
+ if tts_var is None or tts_var.model_name != model_name:
35
+ print(f'Loading TTS model from {model_name}')
36
+ tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
37
+ else:
38
+ print(f'Passing through TTS model {tts_var.model_name}')
39
+ languages = tts_var.languages if tts_var.is_multi_lingual else ['']
40
+ speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else [''] # there's weird speaker formatting
41
+ language = languages[0]
42
+ speaker = speakers[0]
43
+ return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
44
+ gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)
45
+
46
+
47
+ def on_model_vc_select(model_name, vc_var):
48
+ if vc_var is None or vc_var.model_name != model_name:
49
+ print(f'Loading voice conversion model from {model_name}')
50
+ vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
51
+ else:
52
+ print(f'Passing through voice conversion model {vc_var.model_name}')
53
+ return vc_var
54
+
55
+
56
+ def on_voicedropdown(x):
57
+ return examples[x]
58
+
59
+
60
+ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
61
+ if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
62
+ return (16000, np.zeros(0).astype(np.int16))
63
+
64
+ sample_rate = tts_model.synthesizer.output_sample_rate
65
+ if tts_model.is_multi_speaker:
66
+ speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker] # there's weird speaker formatting
67
+ print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')
68
+
69
+ language = None if language == '' else language
70
+ speaker = None if speaker == '' else speaker
71
+ if use_original_voice:
72
+ print('Using original voice')
73
+ speech = tts_model.tts(text, language=language, speaker=speaker)
74
+ elif tts_model.synthesizer.tts_model.speaker_manager:
75
+ print('voice cloning with the tts')
76
+ speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
77
+ else:
78
+ print('voice cloning with the voice conversion model')
79
+ speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
80
+
81
+ speech = (np.array(speech) * INT16MAX).astype(np.int16)
82
+ return (sample_rate, speech)
83
+
84
+
85
+ def voice_clone(vc_model, source_wav, target_wav):
86
+ print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
87
+ sample_rate = vc_model.voice_converter.output_sample_rate
88
+ if vc_model is None or source_wav is None or target_wav is None:
89
+ return (sample_rate, np.zeros(0).astype(np.int16))
90
+
91
+ speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
92
+ speech = (np.array(speech) * INT16MAX).astype(np.int16)
93
+ return (sample_rate, speech)
94
+
95
+
96
+ with gr.Blocks() as demo:
97
+ tts_model = gr.State(None)
98
+ vc_model = gr.State(None)
99
+ def activate(*args):
100
+ return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
101
+ def deactivate(*args):
102
+ return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)
103
+
104
+ gr.Markdown(description)
105
+
106
+ with gr.Row(equal_height=True):
107
+ with gr.Column(scale=5, min_width=50):
108
+ model_tts_dropdown = gr.Dropdown(model_tts_ids, value=model_tts_ids[3], label='Text-to-speech model', interactive=True)
109
+ with gr.Column(scale=1, min_width=10):
110
+ language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
111
+ with gr.Column(scale=1, min_width=10):
112
+ speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
113
+ with gr.Column(scale=5, min_width=50):
114
+ with gr.Row(equal_height=True):
115
+ # model_vocoder_dropdown = gr.Dropdown(model_voc_ids, label='Select vocoder model', interactive=True)
116
+ model_vc_dropdown = gr.Dropdown(model_vc_ids, value=model_vc_ids[0], label='Voice conversion model', interactive=True)
117
+
118
+ with gr.Accordion("Target voice", open=False) as accordion:
119
+ gr.Markdown("Upload target voice...")
120
+ with gr.Row(equal_height=True):
121
+ voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
122
+ voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
123
+
124
+ with gr.Row(equal_height=True):
125
+ with gr.Column(scale=2):
126
+ with gr.Row(equal_height=True):
127
+ with gr.Column():
128
+ text_to_convert = gr.Textbox(verse)
129
+ orig_voice = gr.Checkbox(label='Use original voice')
130
+ voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
131
+ with gr.Row(equal_height=True):
132
+ button_text = gr.Button('Text to speech', interactive=True)
133
+ button_audio = gr.Button('Convert audio', interactive=True)
134
+ with gr.Row(equal_height=True):
135
+ speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)
136
+
137
+ # actions
138
+ model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
139
+ then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
140
+ then(activate, [button_text, button_audio], [button_text, button_audio])
141
+ model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
142
+ then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
143
+ then(activate, [button_text, button_audio], [button_text, button_audio])
144
+ voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
145
+ then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
146
+ then(activate, [button_text, button_audio], [button_text, button_audio])
147
+
148
+ button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
149
+ then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
150
+ then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
151
+ outputs=speech).\
152
+ then(activate, [button_text, button_audio], [button_text, button_audio])
153
+
154
+ button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
155
+ then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
156
+ then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
157
+ then(activate, [button_text, button_audio], [button_text, button_audio])
158
+
159
+ gr.HTML(article)
160
+ demo.launch(share=False)
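
One ordering bug is worth flagging in voice_clone (it appears in both Coqui.ai.ipynb and app.bak.py above): vc_model.model_name and vc_model.voice_converter are dereferenced before the `vc_model is None` guard, so a missing model raises AttributeError instead of returning the empty clip. A reordered sketch, assuming a placeholder 16 kHz rate for the empty return since no model is available at that point:

import numpy as np

INT16MAX = np.iinfo(np.int16).max

def voice_clone_fixed(vc_model, source_wav, target_wav):
    # Guard first; 16000 is an arbitrary placeholder rate for the empty
    # clip (an assumption, not taken from the committed code).
    if vc_model is None or source_wav is None or target_wav is None:
        return (16000, np.zeros(0, dtype=np.int16))
    sample_rate = vc_model.voice_converter.output_sample_rate
    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    return (sample_rate, (np.array(speech) * INT16MAX).astype(np.int16))
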
app.py CHANGED
@@ -13,16 +13,40 @@ title = ""
 description = """"""
 article = """"""
 
+class TTS_local(TTS):
+    def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False):
+        super().__init__(
+            model_name=None,
+            model_path=None,
+            config_path=None,
+            vocoder_path=None,
+            vocoder_config_path=None,
+            progress_bar=progress_bar,
+            gpu=False,
+        )
+        self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False)
+        if model_name is not None:
+            if "tts_models" in model_name or "coqui_studio" in model_name:
+                self.load_tts_model_by_name(model_name, gpu)
+            elif "voice_conversion_models" in model_name:
+                self.load_vc_model_by_name(model_name, gpu)
+
+
 device = "cuda" if torch.cuda.is_available() else "cpu"
 GPU = device == "cuda"
 INT16MAX = np.iinfo(np.int16).max
-VC_MODEL = TTS(model_name='voice_conversion_models/multilingual/vctk/freevc24', progress_bar=False, gpu=GPU)
+MODEL_DIR = 'C:/Users/Torch/AppData/Local'
+MANAGER = ModelManager(verbose=False)
+
+model_ids = MANAGER.list_models()
+local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)]
+model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
+model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
+model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
 
+VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24',
+                     output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU)
 
-model_ids = ModelManager(verbose=False).list_models()
-model_tts_ids = [model for model in model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)]
-model_voc_ids = [model for model in model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)]
-model_vc_ids = [model for model in model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)]
 examples_pt = 'examples'
 allowed_extentions = ['.mp3', '.wav']
 examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions}
@@ -81,6 +105,7 @@ def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_
         # Lazy code... save it to a temp file to resample it while reading it for VC
         tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)
         speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)
+        sample_rate = VC_MODEL.voice_converter.output_sample_rate
 
 
     speech = (np.array(speech) * INT16MAX).astype(np.int16)
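
The new local_model_ids line above recovers model ids from ModelManager's on-disk layout: each model is cached under <output_prefix>/tts/ with '/' replaced by '--' in the directory name (compare the tts/voice_conversion_models--multilingual--vctk--freevc24 folder below). A standalone sketch of that mapping, using the same MODEL_DIR as app.py:

from pathlib import Path

MODEL_DIR = 'C:/Users/Torch/AppData/Local'

def local_model_ids(model_dir=MODEL_DIR):
    # 'voice_conversion_models--multilingual--vctk--freevc24'
    #   -> 'voice_conversion_models/multilingual/vctk/freevc24'
    return [p.name.replace('--', '/')
            for p in (Path(model_dir) / 'tts').glob('*') if p.is_dir()]

app.py additionally filters this list against MANAGER.list_models(), so only directories that correspond to known model ids survive.
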
tts/voice_conversion_models--multilingual--vctk--freevc24/._config.json ADDED
Binary file (386 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/._model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa468ed77a9726751b4d321242e069c77dbcd8ecb2e30a212dc0f38f69b852a
3
+ size 230
tts/voice_conversion_models--multilingual--vctk--freevc24/._voice_conversion_models--multilingual--vctk--freevc24 ADDED
Binary file (330 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/._voice_conversion_models--multilingual--vctk--freevc24 ADDED
Binary file (330 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/voice_conversion_models--multilingual--vctk--freevc24/._config.json ADDED
Binary file (386 Bytes).
 
tts/voice_conversion_models--multilingual--vctk--freevc24/__MACOSX/voice_conversion_models--multilingual--vctk--freevc24/._model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa468ed77a9726751b4d321242e069c77dbcd8ecb2e30a212dc0f38f69b852a
3
+ size 230
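
The `._*` files and the `__MACOSX/` directory committed alongside the model are macOS zip-extraction artifacts (AppleDouble resource forks), not model data. A purely illustrative sweep that would list such cruft before committing; nothing like it exists in this repo:

from pathlib import Path

def macos_cruft(root='tts'):
    # AppleDouble companions start with '._'; '__MACOSX' holds zip copies of them.
    return [p for p in Path(root).rglob('*')
            if p.name.startswith('._') or '__MACOSX' in p.parts]
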
tts/voice_conversion_models--multilingual--vctk--freevc24/config.json ADDED
@@ -0,0 +1,204 @@
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 32,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": [
30
+ 1000,
31
+ 1000
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": null,
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "freevc",
52
+ "num_loader_workers": 0,
53
+ "num_eval_loader_workers": 0,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "max_wav_value": 32768.0,
57
+ "input_sample_rate": 16000,
58
+ "output_sample_rate": 24000,
59
+ "filter_length": 1280,
60
+ "hop_length": 320,
61
+ "win_length": 1280,
62
+ "n_mel_channels": 80,
63
+ "mel_fmin": 0.0,
64
+ "mel_fmax": null
65
+ },
66
+ "batch_group_size": 0,
67
+ "loss_masking": null,
68
+ "min_audio_len": 1,
69
+ "max_audio_len": Infinity,
70
+ "min_text_len": 1,
71
+ "max_text_len": Infinity,
72
+ "compute_f0": false,
73
+ "compute_energy": false,
74
+ "compute_linear_spec": true,
75
+ "precompute_num_workers": 0,
76
+ "start_by_longest": false,
77
+ "shuffle": false,
78
+ "drop_last": false,
79
+ "datasets": [
80
+ {
81
+ "formatter": "",
82
+ "dataset_name": "",
83
+ "path": "",
84
+ "meta_file_train": "",
85
+ "ignored_speakers": null,
86
+ "language": "",
87
+ "phonemizer": "",
88
+ "meta_file_val": "",
89
+ "meta_file_attn_mask": ""
90
+ }
91
+ ],
92
+ "test_sentences": [
93
+ [
94
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
95
+ ],
96
+ [
97
+ "Be a voice, not an echo."
98
+ ],
99
+ [
100
+ "I'm sorry Dave. I'm afraid I can't do that."
101
+ ],
102
+ [
103
+ "This cake is great. It's so delicious and moist."
104
+ ],
105
+ [
106
+ "Prior to November 22, 1963."
107
+ ]
108
+ ],
109
+ "eval_split_max_size": null,
110
+ "eval_split_size": 0.01,
111
+ "use_speaker_weighted_sampler": false,
112
+ "speaker_weighted_sampler_alpha": 1.0,
113
+ "use_language_weighted_sampler": false,
114
+ "language_weighted_sampler_alpha": 1.0,
115
+ "use_length_weighted_sampler": false,
116
+ "length_weighted_sampler_alpha": 1.0,
117
+ "model_args": {
118
+ "spec_channels": 641,
119
+ "inter_channels": 192,
120
+ "hidden_channels": 192,
121
+ "filter_channels": 768,
122
+ "n_heads": 2,
123
+ "n_layers": 6,
124
+ "kernel_size": 3,
125
+ "p_dropout": 0.1,
126
+ "resblock": "1",
127
+ "resblock_kernel_sizes": [
128
+ 3,
129
+ 7,
130
+ 11
131
+ ],
132
+ "resblock_dilation_sizes": [
133
+ [
134
+ 1,
135
+ 3,
136
+ 5
137
+ ],
138
+ [
139
+ 1,
140
+ 3,
141
+ 5
142
+ ],
143
+ [
144
+ 1,
145
+ 3,
146
+ 5
147
+ ]
148
+ ],
149
+ "upsample_rates": [
150
+ 10,
151
+ 6,
152
+ 4,
153
+ 2
154
+ ],
155
+ "upsample_initial_channel": 512,
156
+ "upsample_kernel_sizes": [
157
+ 16,
158
+ 16,
159
+ 4,
160
+ 4
161
+ ],
162
+ "n_layers_q": 3,
163
+ "use_spectral_norm": false,
164
+ "gin_channels": 256,
165
+ "ssl_dim": 1024,
166
+ "use_spk": true,
167
+ "num_spks": 0,
168
+ "segment_size": 8960
169
+ },
170
+ "lr_gen": 0.0002,
171
+ "lr_disc": 0.0002,
172
+ "lr_scheduler_gen": "ExponentialLR",
173
+ "lr_scheduler_gen_params": {
174
+ "gamma": 0.999875,
175
+ "last_epoch": -1
176
+ },
177
+ "lr_scheduler_disc": "ExponentialLR",
178
+ "lr_scheduler_disc_params": {
179
+ "gamma": 0.999875,
180
+ "last_epoch": -1
181
+ },
182
+ "kl_loss_alpha": 1.0,
183
+ "disc_loss_alpha": 1.0,
184
+ "gen_loss_alpha": 1.0,
185
+ "feat_loss_alpha": 1.0,
186
+ "mel_loss_alpha": 45.0,
187
+ "dur_loss_alpha": 1.0,
188
+ "speaker_encoder_loss_alpha": 1.0,
189
+ "return_wav": true,
190
+ "use_weighted_sampler": false,
191
+ "weighted_sampler_attrs": {},
192
+ "weighted_sampler_multipliers": {},
193
+ "r": 1,
194
+ "add_blank": true,
195
+ "num_speakers": 0,
196
+ "use_speaker_embedding": false,
197
+ "speakers_file": null,
198
+ "speaker_embedding_channels": 256,
199
+ "language_ids_file": null,
200
+ "use_language_embedding": false,
201
+ "use_d_vector_file": false,
202
+ "d_vector_file": null,
203
+ "d_vector_dim": null
204
+ }
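
Note the "audio" block above: FreeVC consumes 16 kHz input but emits 24 kHz audio, which is why app.py reads VC_MODEL.voice_converter.output_sample_rate instead of hard-coding a rate. A sketch that pulls both values from this config; the path assumes the layout committed here, and Python's json module happens to accept the bare Infinity values:

import json

cfg_path = 'tts/voice_conversion_models--multilingual--vctk--freevc24/config.json'
with open(cfg_path) as f:
    audio_cfg = json.load(f)['audio']
print(audio_cfg['input_sample_rate'], audio_cfg['output_sample_rate'])  # 16000 24000
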
tts/voice_conversion_models--multilingual--vctk--freevc24/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d4ce44e7c803d675be1984b174e0f7bf05ce937419f19a818877e83f197007
3
+ size 1425242419
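
model.pth above is stored as a Git LFS pointer: three plain-text key/value lines (spec version, sha256 oid, byte size) stand in for the roughly 1.4 GB payload, which lives in LFS storage. A tiny parser sketch for that pointer format:

def parse_lfs_pointer(text):
    # Each pointer line is '<key> <value>'.
    fields = dict(line.split(' ', 1) for line in text.strip().splitlines())
    return fields['version'], fields['oid'], int(fields['size'])

ptr = """version https://git-lfs.github.com/spec/v1
oid sha256:18d4ce44e7c803d675be1984b174e0f7bf05ce937419f19a818877e83f197007
size 1425242419"""
assert parse_lfs_pointer(ptr)[2] == 1425242419
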