{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e65fcd73", "metadata": {}, "outputs": [], "source": [ "import gradio as gr\n", "import numpy as np\n", "import torch\n", "import torch.nn.functional as F\n", "from pathlib import Path\n", "\n", "from TTS.api import TTS\n", "from TTS.utils.manage import ModelManager" ] }, { "cell_type": "code", "execution_count": 2, "id": "f902a92c", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n", " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n", " > Model's license - apache 2.0\n", " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n", " > vocoder_models/en/ljspeech/univnet is already downloaded.\n", " > Model's license - apache 2.0\n", " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n", " > Using model: Tacotron2\n", " > Setting up Audio Processor...\n", " | > sample_rate:22050\n", " | > resample:False\n", " | > num_mels:80\n", " | > log_func:np.log10\n", " | > min_level_db:-100\n", " | > frame_shift_ms:None\n", " | > frame_length_ms:None\n", " | > ref_level_db:20\n", " | > fft_size:1024\n", " | > power:1.5\n", " | > preemphasis:0.0\n", " | > griffin_lim_iters:60\n", " | > signal_norm:True\n", " | > symmetric_norm:True\n", " | > mel_fmin:50.0\n", " | > mel_fmax:7600.0\n", " | > pitch_fmin:0.0\n", " | > pitch_fmax:640.0\n", " | > spec_gain:1.0\n", " | > stft_pad_mode:reflect\n", " | > max_norm:4.0\n", " | > clip_norm:True\n", " | > do_trim_silence:True\n", " | > trim_db:60\n", " | > do_sound_norm:False\n", " | > do_amp_to_db_linear:True\n", " | > do_amp_to_db_mel:True\n", " | > do_rms_norm:False\n", " | > db_level:None\n", " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n", " | > base:10\n", " | > hop_length:256\n", " | > win_length:1024\n", " > Model's reduction rate `r` is set to: 2\n", " > Vocoder Model: univnet\n", " > Setting up Audio Processor...\n", " | > sample_rate:22050\n", " | > resample:False\n", " | > num_mels:80\n", " | > log_func:np.log10\n", " | > min_level_db:-100\n", " | > frame_shift_ms:None\n", " | > 
frame_length_ms:None\n", " | > ref_level_db:20\n", " | > fft_size:1024\n", " | > power:1.5\n", " | > preemphasis:0.0\n", " | > griffin_lim_iters:60\n", " | > signal_norm:True\n", " | > symmetric_norm:True\n", " | > mel_fmin:50.0\n", " | > mel_fmax:7600.0\n", " | > pitch_fmin:1.0\n", " | > pitch_fmax:640.0\n", " | > spec_gain:1.0\n", " | > stft_pad_mode:reflect\n", " | > max_norm:4.0\n", " | > clip_norm:True\n", " | > do_trim_silence:True\n", " | > trim_db:60\n", " | > do_sound_norm:False\n", " | > do_amp_to_db_linear:True\n", " | > do_amp_to_db_mel:True\n", " | > do_rms_norm:False\n", " | > db_level:None\n", " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n", " | > base:10\n", " | > hop_length:256\n", " | > win_length:1024\n", " > Generator Model: univnet_generator\n", " > Discriminator Model: univnet_discriminator\n", "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n", "language: \n", "speaker: \n", "voice cloning with the voice conversion model\n", " > Text splitted to sentences.\n", "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n", "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n", " [!] Character '͡' not found in the vocabulary. 
Discarding it.\n", " > Processing time: 3.4810001850128174\n", " > Real-time factor: 0.400706095887971\n", " > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.\n", " > Model's license - MIT\n", " > Check https://choosealicense.com/licenses/mit/ for more info.\n", " > Using model: freevc\n", " > Loading pretrained speaker encoder model ...\n", "Loaded the voice encoder model on cpu in 0.09 seconds.\n", "Loading TTS model from tts_models/en/ljspeech/tacotron2-DDC_ph\n", " > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.\n", " > Model's license - apache 2.0\n", " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n", " > vocoder_models/en/ljspeech/univnet is already downloaded.\n", " > Model's license - apache 2.0\n", " > Check https://choosealicense.com/licenses/apache-2.0/ for more info.\n", " > Using model: Tacotron2\n", " > Setting up Audio Processor...\n", " | > sample_rate:22050\n", " | > resample:False\n", " | > num_mels:80\n", " | > log_func:np.log10\n", " | > min_level_db:-100\n", " | > frame_shift_ms:None\n", " | > frame_length_ms:None\n", " | > ref_level_db:20\n", " | > fft_size:1024\n", " | > power:1.5\n", " | > preemphasis:0.0\n", " | > griffin_lim_iters:60\n", " | > signal_norm:True\n", " | > symmetric_norm:True\n", " | > mel_fmin:50.0\n", " | > mel_fmax:7600.0\n", " | > pitch_fmin:0.0\n", " | > pitch_fmax:640.0\n", " | > spec_gain:1.0\n", " | > stft_pad_mode:reflect\n", " | > max_norm:4.0\n", " | > clip_norm:True\n", " | > do_trim_silence:True\n", " | > trim_db:60\n", " | > do_sound_norm:False\n", " | > do_amp_to_db_linear:True\n", " | > do_amp_to_db_mel:True\n", " | > do_rms_norm:False\n", " | > db_level:None\n", " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\tts_models--en--ljspeech--tacotron2-DDC_ph\\scale_stats.npy\n", " | > base:10\n", " | > hop_length:256\n", " | > win_length:1024\n", " > Model's reduction rate `r` is set to: 2\n", " > Vocoder Model: univnet\n", " > Setting up 
Audio Processor...\n", " | > sample_rate:22050\n", " | > resample:False\n", " | > num_mels:80\n", " | > log_func:np.log10\n", " | > min_level_db:-100\n", " | > frame_shift_ms:None\n", " | > frame_length_ms:None\n", " | > ref_level_db:20\n", " | > fft_size:1024\n", " | > power:1.5\n", " | > preemphasis:0.0\n", " | > griffin_lim_iters:60\n", " | > signal_norm:True\n", " | > symmetric_norm:True\n", " | > mel_fmin:50.0\n", " | > mel_fmax:7600.0\n", " | > pitch_fmin:1.0\n", " | > pitch_fmax:640.0\n", " | > spec_gain:1.0\n", " | > stft_pad_mode:reflect\n", " | > max_norm:4.0\n", " | > clip_norm:True\n", " | > do_trim_silence:True\n", " | > trim_db:60\n", " | > do_sound_norm:False\n", " | > do_amp_to_db_linear:True\n", " | > do_amp_to_db_mel:True\n", " | > do_rms_norm:False\n", " | > db_level:None\n", " | > stats_path:C:\\Users\\Torch\\AppData\\Local\\tts\\vocoder_models--en--ljspeech--univnet\\scale_stats.npy\n", " | > base:10\n", " | > hop_length:256\n", " | > win_length:1024\n", " > Generator Model: univnet_generator\n", " > Discriminator Model: univnet_discriminator\n", "model: tts_models/en/ljspeech/tacotron2-DDC_ph\n", "language: \n", "speaker: \n", "Using original voice\n", " > Text splitted to sentences.\n", "['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']\n", "ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,\n", " [!] Character '͡' not found in the vocabulary. 
title = ""
description = """"""
article = """"""

# Prefer GPU when available; Coqui TTS takes a boolean `gpu` flag at load time.
device = "cuda" if torch.cuda.is_available() else "cpu"
GPU = device == "cuda"
# Scale factor for converting float audio in [-1, 1] to int16 PCM.
INT16MAX = np.iinfo(np.int16).max

# Restrict the Coqui model catalogue to English / multilingual entries.
model_ids = ModelManager(verbose=False).list_models()
model_tts_ids = [m for m in model_ids
                 if 'tts_models' in m and ('/multilingual/' in m or '/en/' in m)]
model_voc_ids = [m for m in model_ids
                 if 'vocoder_models' in m and ('/universal/' in m or '/en/' in m)]
model_vc_ids = [m for m in model_ids
                if 'voice_conversion_models' in m and ('/multilingual/' in m or '/en/' in m)]

examples_pt = 'examples'
# Fixed typo: was `allowed_extentions`. Lower-cased comparison also accepts ".WAV" etc.
allowed_extensions = ['.mp3', '.wav']
examples = {f.name: f for f in Path(examples_pt).glob('*')
            if f.suffix.lower() in allowed_extensions}
verse = """Mary had a little lamb,
Its fleece was white as snow.
Everywhere the child went,
The little lamb was sure to go."""


def on_model_tts_select(model_name, tts_var):
    """Load (or reuse) a TTS model and refresh the language/speaker dropdowns.

    Args:
        model_name: Coqui model id chosen in the dropdown.
        tts_var: currently cached TTS instance held in gr.State (or None).

    Returns:
        (tts_var, language_dropdown_update, speaker_dropdown_update)
    """
    if tts_var is None or tts_var.model_name != model_name:
        print(f'Loading TTS model from {model_name}')
        tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
    else:
        print(f'Passing through TTS model {tts_var.model_name}')
    languages = tts_var.languages if tts_var.is_multi_lingual else ['']
    # Some models embed newlines in speaker names; normalize for dropdown display.
    speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']
    language = languages[0]
    speaker = speakers[0]
    return tts_var, gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual),\
        gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)


def on_model_vc_select(model_name, vc_var):
    """Load (or reuse) a voice-conversion model; mirrors on_model_tts_select."""
    if vc_var is None or vc_var.model_name != model_name:
        print(f'Loading voice conversion model from {model_name}')
        vc_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
    else:
        print(f'Passing through voice conversion model {vc_var.model_name}')
    return vc_var


def on_voicedropdown(x):
    """Map an example-voice dropdown choice to its audio file path (None if unknown)."""
    return examples.get(x)


def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
    """Synthesize `text`, optionally cloning the voice in `target_wav`.

    Args:
        text: input text to synthesize.
        tts_model: loaded TTS instance (gr.State), or None.
        language/speaker: dropdown values; '' means "not applicable".
        target_wav: path to the reference voice, or None.
        use_original_voice: if True, skip cloning and use the model's own voice.

    Returns:
        (sample_rate, np.int16 waveform); an empty 16 kHz signal when inputs are unusable.
    """
    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
        return (16000, np.zeros(0).astype(np.int16))

    sample_rate = tts_model.synthesizer.output_sample_rate
    if tts_model.is_multi_speaker:
        # Reverse the display normalization applied in on_model_tts_select.
        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

    language = None if language == '' else language
    speaker = None if speaker == '' else speaker
    if use_original_voice:
        print('Using original voice')
        speech = tts_model.tts(text, language=language, speaker=speaker)
    elif tts_model.synthesizer.tts_model.speaker_manager:
        print('voice cloning with the tts')
        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
    else:
        print('voice cloning with the voice conversion model')
        speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)

    # Clip before scaling: samples outside [-1, 1] would otherwise integer-wrap
    # in the int16 cast and produce loud pops.
    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)


def voice_clone(vc_model, source_wav, target_wav):
    """Convert the voice in `source_wav` to sound like `target_wav`.

    Returns (sample_rate, np.int16 waveform); empty 16 kHz signal when any input
    is missing.
    """
    # Bug fix: the original read vc_model.voice_converter.output_sample_rate
    # BEFORE the None check, raising AttributeError when no model was loaded.
    if vc_model is None or source_wav is None or target_wav is None:
        return (16000, np.zeros(0).astype(np.int16))

    print(f'model: {vc_model.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
    sample_rate = vc_model.voice_converter.output_sample_rate
    speech = vc_model.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    # Same clip-then-scale conversion as text_to_speech.
    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
with gr.Blocks() as demo:
    # Model instances live in per-session state so each visitor loads lazily.
    tts_model = gr.State(None)
    vc_model = gr.State(None)

    def activate(*args):
        """Re-enable the given components (single update or a matching list)."""
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        """Disable the given components while a long-running step executes."""
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    gr.Markdown(description)

    # Bug fix: the original hard-coded model_tts_ids[3] / model_vc_ids[0],
    # which raises IndexError if the upstream model catalogue changes size.
    default_tts = model_tts_ids[3] if len(model_tts_ids) > 3 else (model_tts_ids[0] if model_tts_ids else None)
    default_vc = model_vc_ids[0] if model_vc_ids else None

    with gr.Row(equal_height=True):
        with gr.Column(scale=5, min_width=50):
            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=default_tts, label='Text-to-speech model', interactive=True)
        with gr.Column(scale=1, min_width=10):
            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
        with gr.Column(scale=1, min_width=10):
            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
        with gr.Column(scale=5, min_width=50):
            with gr.Row(equal_height=True):
                model_vc_dropdown = gr.Dropdown(model_vc_ids, value=default_vc, label='Voice conversion model', interactive=True)

    with gr.Accordion("Target voice", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                with gr.Column():
                    text_to_convert = gr.Textbox(verse)
                    orig_voice = gr.Checkbox(label='Use original voice')
                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
            with gr.Row(equal_height=True):
                button_text = gr.Button('Text to speech', interactive=True)
                button_audio = gr.Button('Convert audio', interactive=True)
            with gr.Row(equal_height=True):
                speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)

    # Event wiring: every action disables both buttons, runs, then re-enables
    # them so users cannot queue overlapping model loads.
    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    model_vc_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown, tts_model], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
             outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_vc_select, inputs=[model_vc_dropdown, vc_model], outputs=vc_model).\
        then(fn=voice_clone, inputs=[vc_model, voice_to_convert, voice_upload], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    gr.HTML(article)
demo.launch(share=False)