|
import gradio as gr |
|
import numpy as np |
|
import torch |
|
import torch.nn.functional as F |
|
from pathlib import Path |
|
import tempfile |
|
|
|
from TTS.api import TTS |
|
from TTS.utils.manage import ModelManager |
|
|
|
|
|
# Page texts for the demo. `description` is rendered as Markdown at the top of
# the page and `article` as HTML at the bottom; all three are currently empty.
title = ""
description = ""
article = ""
|
|
|
# Use the GPU whenever CUDA is available, otherwise fall back to the CPU.
GPU = torch.cuda.is_available()
device = "cuda" if GPU else "cpu"

# Largest int16 value; model output is multiplied by it before the int16 cast.
INT16MAX = np.iinfo(np.int16).max

# Voice-conversion model, loaded once at import time and shared by all requests.
VC_MODEL = TTS(
    model_name='voice_conversion_models/multilingual/vctk/freevc24',
    progress_bar=False,
    gpu=GPU,
)
|
|
|
|
|
def _ids_matching(all_ids, family, scopes):
    """Return the ids that contain *family* and at least one entry of *scopes*."""
    return [mid for mid in all_ids if family in mid and any(scope in mid for scope in scopes)]


# Every model id known to the model manager, then the subsets offered by the
# demo: multilingual/English TTS models, universal/English vocoders and
# multilingual/English voice-conversion models.
model_ids = ModelManager(verbose=False).list_models()
model_tts_ids = _ids_matching(model_ids, 'tts_models', ('/multilingual/', '/en/'))
model_voc_ids = _ids_matching(model_ids, 'vocoder_models', ('/universal/', '/en/'))
model_vc_ids = _ids_matching(model_ids, 'voice_conversion_models', ('/multilingual/', '/en/'))
|
# Directory scanned for example target voices and the audio suffixes accepted.
# (The spelling of `allowed_extentions` is kept so existing references still work.)
examples_pt = 'examples'
allowed_extentions = ['.mp3', '.wav']


def find_example_files(directory, extensions):
    """Map file name -> path for the audio files directly inside *directory*.

    Args:
        directory: Folder to scan (not recursive). A missing folder yields an
            empty mapping.
        extensions: Accepted suffixes including the dot, e.g. ``['.wav']``.
            Matching is case-insensitive, so ``voice.WAV`` is accepted too.

    Returns:
        A dict keyed by file name, in sorted path order so the resulting
        dropdown order does not depend on filesystem enumeration order.
        Directories that happen to carry an audio suffix are skipped.
    """
    wanted = {ext.lower() for ext in extensions}
    return {
        path.name: path
        for path in sorted(Path(directory).glob('*'))
        if path.is_file() and path.suffix.lower() in wanted
    }


examples = find_example_files(examples_pt, allowed_extentions)
|
# Default text placed in the text box of the demo.
verse = "\n".join((
    "Mary had a little lamb,",
    "Its fleece was white as snow.",
    "Everywhere the child went,",
    "The little lamb was sure to go.",
))
|
|
|
|
|
def on_model_tts_select(model_name):
    """Load the chosen TTS model and refresh the language and speaker dropdowns.

    Args:
        model_name: Model id selected in the text-to-speech dropdown.

    Returns:
        A 3-tuple: the loaded ``TTS`` object, an update for the language
        dropdown and an update for the speaker dropdown. A dropdown is given
        the single empty choice ``''`` and made non-interactive when the model
        is not multi-lingual / multi-speaker; otherwise its first entry is
        preselected.
    """
    model = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
    multi_lingual = model.is_multi_lingual
    multi_speaker = model.is_multi_speaker

    if multi_lingual:
        language_choices = model.languages
    else:
        language_choices = ['']

    if multi_speaker:
        # Line breaks inside speaker ids are shown as "-n" in the dropdown;
        # text_to_speech maps the displayed name back to the real id.
        speaker_choices = [name.replace('\n', '-n') for name in model.speakers]
    else:
        speaker_choices = ['']

    language_update = gr.update(
        choices=language_choices, value=language_choices[0], interactive=multi_lingual
    )
    speaker_update = gr.update(
        choices=speaker_choices, value=speaker_choices[0], interactive=multi_speaker
    )
    return model, language_update, speaker_update
|
|
|
|
|
def on_voicedropdown(x):
    """Return the file path of the example voice selected in the dropdown.

    Args:
        x: Name picked in the examples dropdown (a key of ``examples``), or
            ``None`` when nothing is selected.

    Returns:
        The example's path as a ``str`` (the target-voice Audio component is
        declared with ``type='filepath'``), or ``None`` when the selection is
        empty or unknown, instead of raising ``KeyError``.
    """
    path = examples.get(x)
    return None if path is None else str(path)
|
|
|
|
|
def voice_clone(source_wav, target_wav):
    """Convert the voice in *source_wav* to sound like the one in *target_wav*.

    Args:
        source_wav: Path of the recording whose speech is kept, or ``None``.
        target_wav: Path of the recording providing the target voice, or ``None``.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is an int16 NumPy array and
        ``sample_rate`` is the voice-conversion model's output rate. When either
        input is missing an empty array is returned.
    """
    print(f'model: {VC_MODEL.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
    sample_rate = VC_MODEL.voice_converter.output_sample_rate
    if source_wav is None or target_wav is None:
        return (sample_rate, np.zeros(0).astype(np.int16))

    speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    # Clip before scaling: a sample outside [-1, 1] would exceed the int16
    # range and overflow in the cast, which is audible as loud clicks.
    # NOTE(review): assumes the model returns float samples nominally in [-1, 1].
    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
|
|
|
|
|
def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
    """Synthesize *text* and return it as ``(sample_rate, int16 samples)``.

    One of three paths is taken:

    * ``use_original_voice``: the TTS model speaks with the selected
      language/speaker.
    * otherwise, if the TTS model has a speaker manager: the TTS model is
      given ``target_wav`` as ``speaker_wav``.
    * otherwise: the text is synthesized to a temporary wav file which is then
      converted to the target voice with ``VC_MODEL``.

    Args:
        text: Text to speak.
        tts_model: Loaded ``TTS`` object, or ``None`` if no model was selected.
        language: Selected language; ``''`` means "none".
        speaker: Selected speaker as displayed in the dropdown (line breaks in
            speaker ids are shown as ``-n``); ``''`` means "none".
        target_wav: Path of the target-voice recording, or ``None``.
        use_original_voice: Skip voice cloning and keep the model's own voice.

    Returns:
        ``(sample_rate, samples)``. An empty array at 16000 Hz is returned when
        there is nothing to synthesize (blank text, no model, or cloning
        requested without a target voice).
    """
    nothing_to_say = text is None or not text.strip()
    if nothing_to_say or tts_model is None or (target_wav is None and not use_original_voice):
        return (16000, np.zeros(0).astype(np.int16))

    sample_rate = tts_model.synthesizer.output_sample_rate
    if tts_model.is_multi_speaker:
        # Map the displayed name back to the real speaker id.
        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

    language = None if language == '' else language
    speaker = None if speaker == '' else speaker

    if use_original_voice:
        print('Using original voice')
        speech = tts_model.tts(text, language=language, speaker=speaker)
    elif tts_model.synthesizer.tts_model.speaker_manager:
        print('voice cloning with the tts')
        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
    else:
        print('voice cloning with the voice conversion model')
        # A temporary directory removes the intermediate file when done (the
        # previous NamedTemporaryFile(delete=False) was never deleted) and
        # avoids writing by path to a file that is still held open.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tts_wav = str(Path(tmp_dir) / 'tts.wav')
            tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=tts_wav)
            speech = VC_MODEL.voice_conversion(source_wav=tts_wav, target_wav=target_wav)
        # The audio now comes from the voice-conversion model, so report its
        # output rate (as voice_clone does) rather than the TTS model's.
        sample_rate = VC_MODEL.voice_converter.output_sample_rate

    # Clip before scaling: a sample outside [-1, 1] would exceed the int16
    # range and overflow in the cast.
    # NOTE(review): assumes the models return float samples nominally in [-1, 1].
    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
|
|
|
|
|
with gr.Blocks() as demo:
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from context; confirm it against the intended page layout.

    # Holds the currently loaded TTS object between events (None until a
    # model has been selected).
    tts_model = gr.State(None)

    def activate(*args):
        """Return an ``interactive=True`` update per argument (a bare update for one)."""
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        """Return an ``interactive=False`` update per argument (a bare update for one)."""
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    gr.Markdown(description)

    # Model selection: TTS model plus its language and speaker.
    with gr.Row(equal_height=True):
        with gr.Column(scale=5, min_width=50):
            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)
        with gr.Column(scale=1, min_width=10):
            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
        with gr.Column(scale=1, min_width=10):
            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)

    # Target voice: either an uploaded recording or one of the bundled examples.
    with gr.Accordion("Target voice", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
            # `choices` is a list of names; pass the keys explicitly instead
            # of the name -> path dict itself.
            voice_dropdown = gr.Dropdown(list(examples), label='Examples', interactive=True)

    # Inputs to convert, action buttons and the resulting audio.
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                with gr.Column():
                    text_to_convert = gr.Textbox(verse)
                    orig_voice = gr.Checkbox(label='Use original voice')
                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
            with gr.Row(equal_height=True):
                button_text = gr.Button('Text to speech', interactive=True)
                button_audio = gr.Button('Convert audio', interactive=True)
            with gr.Row(equal_height=True):
                speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)

    # Every event follows the same pattern: disable both buttons, do the
    # work, then enable the buttons again.
    buttons = [button_text, button_audio]

    (
        model_tts_dropdown.change(deactivate, buttons, buttons)
        .then(fn=on_model_tts_select, inputs=[model_tts_dropdown],
              outputs=[tts_model, language_dropdown, speaker_dropdown])
        .then(activate, buttons, buttons)
    )
    (
        voice_dropdown.change(deactivate, buttons, buttons)
        .then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload)
        .then(activate, buttons, buttons)
    )

    (
        button_text.click(deactivate, buttons, buttons)
        .then(fn=text_to_speech,
              inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
              outputs=speech)
        .then(activate, buttons, buttons)
    )

    (
        button_audio.click(deactivate, buttons, buttons)
        .then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech)
        .then(activate, buttons, buttons)
    )

    gr.HTML(article)

demo.launch(share=False)