In [4]:
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path
import tempfile

from TTS.api import TTS
from TTS.utils.manage import ModelManager

In [6]:
# Static page texts for the demo UI.
title = ''
description = ''
article = ''

# Prefer the GPU whenever CUDA is usable; `GPU` is the boolean form handed to TTS().
GPU = torch.cuda.is_available()
device = 'cuda' if GPU else 'cpu'

# Scale factor used to turn float audio into 16-bit PCM samples.
INT16MAX = np.iinfo(np.int16).max

# Voice-conversion model shared by all requests (loaded once at start-up).
VC_MODEL = TTS(
    model_name='voice_conversion_models/multilingual/vctk/freevc24',
    progress_bar=False,
    gpu=GPU,
)


# Every model id known to the model manager, then split by model family and
# restricted to English / multilingual (or universal, for vocoders) entries.
model_ids = ModelManager(verbose=False).list_models()
model_tts_ids = [
    name for name in model_ids
    if 'tts_models' in name and any(scope in name for scope in ('/multilingual/', '/en/'))
]
model_voc_ids = [
    name for name in model_ids
    if 'vocoder_models' in name and any(scope in name for scope in ('/universal/', '/en/'))
]
model_vc_ids = [
    name for name in model_ids
    if 'voice_conversion_models' in name and any(scope in name for scope in ('/multilingual/', '/en/'))
]

# Example target voices: file name -> path, for the audio files found directly
# inside the examples directory.
examples_pt = 'examples'
allowed_extentions = ['.mp3', '.wav']
examples = {
    audio_file.name: audio_file
    for audio_file in Path(examples_pt).glob('*')
    if audio_file.suffix in allowed_extentions
}

# Default text offered in the text-to-speech box.
verse = (
    "Mary had a little lamb,\n"
    "Its fleece was white as snow.\n"
    "Everywhere the child went,\n"
    "The little lamb was sure to go."
)


def on_model_tts_select(model_name):
    """Load the chosen TTS model and build updates for the two dependent dropdowns.

    Args:
        model_name: Model id passed straight to ``TTS(model_name=...)``.

    Returns:
        A 3-tuple ``(model, language_update, speaker_update)``: the loaded TTS
        object followed by ``gr.update`` payloads for the language and speaker
        dropdowns. A dropdown is made interactive only when the model is
        multi-lingual / multi-speaker; otherwise it gets the single choice ``''``.
    """
    model = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
    multilingual = model.is_multi_lingual
    multispeaker = model.is_multi_speaker

    if multilingual:
        language_choices = model.languages
    else:
        language_choices = ['']

    if multispeaker:
        # Some speaker names contain a newline; show it as '-n' in the dropdown.
        speaker_choices = [name.replace('\n', '-n') for name in model.speakers]
    else:
        speaker_choices = ['']

    language_update = gr.update(
        choices=language_choices,
        value=language_choices[0],
        interactive=multilingual,
    )
    speaker_update = gr.update(
        choices=speaker_choices,
        value=speaker_choices[0],
        interactive=multispeaker,
    )
    return model, language_update, speaker_update


def on_voicedropdown(x):
    """Resolve an example name picked in the dropdown to its audio file.

    Args:
        x: A key of the module-level ``examples`` mapping (file name -> path).

    Returns:
        The path stored in ``examples`` under ``x``.

    Raises:
        KeyError: If ``x`` is not a known example name.
    """
    example_file = examples[x]
    return example_file


def voice_clone(source_wav, target_wav):
    """Re-voice ``source_wav`` so that it sounds like the speaker in ``target_wav``.

    Args:
        source_wav: Audio whose speech content is kept (the UI supplies a file
            path), or ``None`` when nothing was uploaded.
        target_wav: Audio providing the target voice (the UI supplies a file
            path), or ``None`` when nothing was uploaded.

    Returns:
        ``(sample_rate, samples)`` where ``sample_rate`` is the voice-conversion
        model's output rate and ``samples`` is an ``int16`` NumPy array. The
        array is empty when either input is missing.
    """
    print(f'model: {VC_MODEL.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
    sample_rate = VC_MODEL.voice_converter.output_sample_rate
    if source_wav is None or target_wav is None:
        return (sample_rate, np.zeros(0).astype(np.int16))

    speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    # Clamp to [-1, 1] before scaling: a sample outside that range would
    # overflow the int16 cast and come out as a loud click instead of
    # saturating cleanly.
    speech = np.clip(np.array(speech), -1.0, 1.0)
    speech = (speech * INT16MAX).astype(np.int16)
    return (sample_rate, speech)


def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
    """Synthesize ``text`` and optionally render it in a target voice.

    One of three paths is taken:

    * ``use_original_voice`` is true: plain synthesis with the selected speaker.
    * the TTS model has a speaker manager: the TTS model clones the voice
      itself from ``target_wav``.
    * otherwise: the text is synthesized to a temporary wav file which is then
      passed through the shared voice-conversion model.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text yields
            empty audio.
        tts_model: Loaded TTS object, or ``None`` when no model was selected.
        language: Language choice; ``''`` means "not applicable".
        speaker: Speaker name as displayed in the dropdown (newlines shown as
            ``'-n'``); ``''`` means "not applicable".
        target_wav: Audio with the voice to imitate (the UI supplies a file
            path), or ``None``.
        use_original_voice: When true, no voice cloning is performed.

    Returns:
        ``(sample_rate, samples)`` with ``samples`` an ``int16`` NumPy array.
        When required inputs are missing, ``(16000, <empty array>)``.

    Raises:
        KeyError: If the model is multi-speaker and ``speaker`` is not one of
            its (display-formatted) speaker names.
    """
    nothing_to_say = not text or not text.strip()
    if nothing_to_say or tts_model is None or (target_wav is None and not use_original_voice):
        return (16000, np.zeros(0).astype(np.int16))

    sample_rate = tts_model.synthesizer.output_sample_rate
    if tts_model.is_multi_speaker:
        # The dropdown shows speaker names with '\n' replaced by '-n';
        # map the displayed name back to the model's real speaker name.
        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

    language = None if language == '' else language
    speaker = None if speaker == '' else speaker
    if use_original_voice:
        print('Using original voice')
        speech = tts_model.tts(text, language=language, speaker=speaker)
    elif tts_model.synthesizer.tts_model.speaker_manager:
        print('voice cloning with the tts')
        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
    else:
        print('voice cloning with the voice conversion model')
        # Go through a wav file so the voice-conversion model resamples the
        # audio while reading it. The temporary directory (and the file in it)
        # is removed as soon as the conversion is done.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_wav = str(Path(tmp_dir) / 'tts.wav')
            tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=tmp_wav)
            speech = VC_MODEL.voice_conversion(source_wav=tmp_wav, target_wav=target_wav)
        # The audio now comes from the voice-conversion model, so it must be
        # labelled with that model's output rate, not the TTS model's.
        sample_rate = VC_MODEL.voice_converter.output_sample_rate

    # Clamp to [-1, 1] before scaling so out-of-range samples saturate
    # instead of overflowing the int16 cast.
    speech = np.clip(np.array(speech), -1.0, 1.0)
    speech = (speech * INT16MAX).astype(np.int16)
    return (sample_rate, speech)


# Build the Gradio UI: model/language/speaker pickers, a target-voice section,
# the text and audio inputs with their action buttons, and one audio output.
with gr.Blocks() as demo:
    # Per-session state holding the loaded TTS object; it is written by
    # on_model_tts_select and read by text_to_speech.
    tts_model = gr.State(None)
#     vc_model = gr.State(None)
    def activate(*args):
        """Return interactive=True updates: a single update for one argument, else a list with one per argument."""
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
    def deactivate(*args):
        """Return interactive=False updates: a single update for one argument, else a list with one per argument."""
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    
    gr.Markdown(description)

    # Model selection row. The language and speaker dropdowns start empty and
    # non-interactive; on_model_tts_select fills them once a model is chosen.
    with gr.Row(equal_height=True):
        with gr.Column(scale=5, min_width=50):
            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)
        with gr.Column(scale=1, min_width=10):
                language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
        with gr.Column(scale=1, min_width=10):
                speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)
                
    # Target voice: upload a file, or pick one of the bundled examples
    # (picking an example sets the upload component, see the wiring below).
    with gr.Accordion("Target voice", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)

    # Inputs for the two actions: text (plus the "original voice" switch) for
    # text-to-speech, and an uploaded recording for audio conversion.
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                with gr.Column():
                    text_to_convert = gr.Textbox(verse)
                    orig_voice = gr.Checkbox(label='Use original voice')
                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
            with gr.Row(equal_height=True):
                button_text = gr.Button('Text to speech', interactive=True)
                button_audio = gr.Button('Convert audio', interactive=True)
    # Output shared by both actions.
    with gr.Row(equal_height=True):
        speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False) 
        
    # actions
    # Every chain below follows the same pattern: disable both buttons, run
    # the handler, then enable both buttons again.

    # Choosing a TTS model loads it and refreshes the language/speaker dropdowns.
    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    # Choosing an example voice puts its file into the target-voice upload.
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    # "Text to speech": synthesize the text and send the result to the output.
    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice], 
             outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    # "Convert audio": convert the uploaded recording to the target voice.
    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    
    gr.HTML(article)
# Start the app without requesting a public share link.
demo.launch(share=False)

 > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.
 > Model's license - MIT
 > Check https://choosealicense.com/licenses/mit/ for more info.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cpu in 0.01 seconds.
Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




 > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > vocoder_models/en/ljspeech/univnet is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rm

Removing weight norm...
model: tts_models/en/ljspeech/tacotron2-DDC
language: 
speaker: 
voice cloning with the voice conversion model
 > Text splitted to sentences.
['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']
 > Processing time: 4.28600001335144
 > Real-time factor: 0.42371906516498953
 > tts_models/en/ek1/tacotron2 is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > vocoder_models/en/ek1/wavegrad is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-10
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.8
 | > preemphasis:0.99
 | > griffin_lim_ite

model: tts_models/en/ljspeech/speedy-speech
language: 
speaker: 
Using original voice
 > Text splitted to sentences.
['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']
ɛvɹiwɛɹ ðə t͡ʃaɪld wɛnt,
 [!] Character '͡' not found in the vocabulary. Discarding it.
 > Processing time: 0.9679999351501465
 > Real-time factor: 0.11673301633083617
model: tts_models/en/ljspeech/speedy-speech
language: 
speaker: 
voice cloning with the voice conversion model
 > Text splitted to sentences.
['Mary had a little lamb,', 'Its fleece was white as snow.', 'Everywhere the child went,', 'The little lamb was sure to go.']
 > Processing time: 0.9630000591278076
 > Real-time factor: 0.11613007144605443
 > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.
 > Model's license - MPL
 > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.
 > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.
 > Model's license -

 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator
Removing weight norm...
model: tts_models/en/ljspeech/overflow
language: 
speaker: 
Using original voice
 > Text splitted to sentenc