Spaces:

rakhlin
/

Coqui.ai

Running

App Files Files Community

Coqui.ai / app.py

rakhlin

Upload folder using huggingface_hub

64a316c about 2 years ago

raw

history blame

7.1 kB

	import gradio as gr
	import numpy as np
	import torch
	import torch.nn.functional as F
	from pathlib import Path
	import tempfile

	from TTS.api import TTS
	from TTS.utils.manage import ModelManager


	# Texts shown in the UI: page title, intro markdown and trailing HTML (all empty here).
	title = ""
	description = ""
	article = ""

	# Run the models on the GPU when CUDA is available, otherwise on the CPU.
	GPU = torch.cuda.is_available()
	device = "cuda" if GPU else "cpu"
	# Largest int16 value; float waveforms are multiplied by it before the int16 cast.
	INT16MAX = np.iinfo(np.int16).max
	# Voice-conversion model, loaded once at import time and used by
	# voice_clone() and text_to_speech().
	VC_MODEL = TTS(
	    model_name='voice_conversion_models/multilingual/vctk/freevc24',
	    progress_bar=False,
	    gpu=GPU,
	)


	def _select_models(all_ids, family, scopes):
	    """Return, in order, the ids containing *family* and at least one of *scopes*."""
	    return [
	        model_id
	        for model_id in all_ids
	        if family in model_id and any(scope in model_id for scope in scopes)
	    ]


	# Every model id known to the model manager, then the subsets by model family.
	model_ids = ModelManager(verbose=False).list_models()
	model_tts_ids = _select_models(model_ids, 'tts_models', ('/multilingual/', '/en/'))
	model_voc_ids = _select_models(model_ids, 'vocoder_models', ('/universal/', '/en/'))
	model_vc_ids = _select_models(model_ids, 'voice_conversion_models', ('/multilingual/', '/en/'))

	# Example target voices: file name -> path, for the audio files found in the examples folder.
	examples_pt = 'examples'
	allowed_extentions = ['.mp3', '.wav']
	examples = {
	    audio_file.name: audio_file
	    for audio_file in Path(examples_pt).glob('*')
	    if audio_file.suffix in allowed_extentions
	}
	# Default text placed in the text box.
	verse = """Mary had a little lamb,
	Its fleece was white as snow.
	Everywhere the child went,
	The little lamb was sure to go."""


	def on_model_tts_select(model_name):
	    """Load the chosen TTS model and refresh the language and speaker dropdowns.

	    Args:
	        model_name: Id of the TTS model picked in the model dropdown.

	    Returns:
	        A 3-tuple: the loaded model, a ``gr.update`` for the language
	        dropdown and a ``gr.update`` for the speaker dropdown. Each dropdown
	        is interactive only when the model supports that choice; otherwise
	        it holds a single empty entry.
	    """
	    model = TTS(model_name=model_name, progress_bar=False, gpu=GPU)

	    if model.is_multi_lingual:
	        language_choices = model.languages
	    else:
	        language_choices = ['']

	    if model.is_multi_speaker:
	        # there's weird speaker formatting: newlines in names are shown as '-n'
	        speaker_choices = [name.replace('\n', '-n') for name in model.speakers]
	    else:
	        speaker_choices = ['']

	    default_language = language_choices[0]
	    default_speaker = speaker_choices[0]

	    language_update = gr.update(
	        choices=language_choices,
	        value=default_language,
	        interactive=model.is_multi_lingual,
	    )
	    speaker_update = gr.update(
	        choices=speaker_choices,
	        value=default_speaker,
	        interactive=model.is_multi_speaker,
	    )
	    return model, language_update, speaker_update


	def on_voicedropdown(x):
	    """Look up the example audio file registered under the selected name *x*."""
	    selected_example = examples[x]
	    return selected_example


	def voice_clone(source_wav, target_wav):
	    """Convert the voice in *source_wav* so it sounds like *target_wav*.

	    Args:
	        source_wav: File path of the recording to convert, or None.
	        target_wav: File path of the target-voice recording, or None.

	    Returns:
	        ``(sample_rate, samples)`` where ``samples`` is an int16 array. The
	        array is empty when either input is missing.
	    """
	    print(f'model: {VC_MODEL.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
	    sample_rate = VC_MODEL.voice_converter.output_sample_rate
	    if source_wav is None or target_wav is None:
	        return (sample_rate, np.zeros(0, dtype=np.int16))

	    speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)
	    # Clip to [-1, 1] before scaling: samples outside that range would
	    # overflow when cast to int16.
	    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
	    return (sample_rate, speech)


	def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
	    """Synthesize *text* and, unless the original voice is requested, clone *target_wav*.

	    Args:
	        text: Text to speak. None or whitespace-only text yields empty audio.
	        tts_model: Loaded TTS model, or None when no model has been selected.
	        language: Language chosen in the UI; '' means "not applicable".
	        speaker: Speaker as displayed in the UI ('\\n' shown as '-n'); '' means
	            "not applicable".
	        target_wav: File path of the voice to imitate, or None.
	        use_original_voice: When true, speak with the model's own voice and
	            ignore *target_wav*.

	    Returns:
	        ``(sample_rate, samples)`` where ``samples`` is an int16 array. An
	        empty array at 16000 Hz is returned when there is nothing to do.
	    """
	    nothing_to_say = text is None or len(text.strip()) == 0
	    if nothing_to_say or tts_model is None or (target_wav is None and not use_original_voice):
	        return (16000, np.zeros(0, dtype=np.int16))

	    sample_rate = tts_model.synthesizer.output_sample_rate
	    if tts_model.is_multi_speaker:
	        # there's weird speaker formatting: map the displayed name back to the model's name
	        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
	    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

	    language = None if language == '' else language
	    speaker = None if speaker == '' else speaker
	    if use_original_voice:
	        print('Using original voice')
	        speech = tts_model.tts(text, language=language, speaker=speaker)
	    elif tts_model.synthesizer.tts_model.speaker_manager:
	        print('voice cloning with the tts')
	        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
	    else:
	        print('voice cloning with the voice conversion model')
	        # Synthesize to a wav file so the voice-conversion model resamples it
	        # while reading. The temporary directory (and the file in it) is
	        # removed on exit, so no files pile up across requests.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            tmp_wav = str(Path(tmp_dir) / 'tts_output.wav')
	            tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=tmp_wav)
	            speech = VC_MODEL.voice_conversion(source_wav=tmp_wav, target_wav=target_wav)
	        # The audio now comes from the voice-conversion model, so report its
	        # sample rate (same source as voice_clone) rather than the TTS model's.
	        sample_rate = VC_MODEL.voice_converter.output_sample_rate

	    # Clip to [-1, 1] before scaling: samples outside that range would
	    # overflow when cast to int16.
	    speech = (np.clip(np.array(speech), -1.0, 1.0) * INT16MAX).astype(np.int16)
	    return (sample_rate, speech)


	# Build the Gradio UI, wire its events and start the app.
	# NOTE(review): indentation is flattened in this copy of the file, so the
	# nesting of the Row/Column/Accordion contexts below cannot be read off the
	# text; confirm it against the deployed app.py before editing the layout.
	with gr.Blocks() as demo:
	# State slot for the loaded TTS model; starts as None and is filled by on_model_tts_select.
	tts_model = gr.State(None)
	# Return one gr.update (or a list with one per argument) that makes components interactive.
	def activate(*args):
	return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)
	# Same as activate, but makes the components non-interactive.
	def deactivate(*args):
	return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)


	gr.Markdown(description)

	# Model, language and speaker selectors. Language and speaker start
	# non-interactive and are updated by on_model_tts_select.
	with gr.Row(equal_height=True):
	with gr.Column(scale=5, min_width=50):
	model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)
	with gr.Column(scale=1, min_width=10):
	language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
	with gr.Column(scale=1, min_width=10):
	speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)

	# Target voice: upload a file or pick one of the files found in the examples folder.
	with gr.Accordion("Target voice", open=False) as accordion:
	gr.Markdown("Upload target voice...")
	with gr.Row(equal_height=True):
	voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
	# NOTE(review): `examples` is a dict; presumably its keys (file names) become the choices - confirm.
	voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)

	# Inputs (text, original-voice checkbox, audio to convert), the two action
	# buttons and the output audio player.
	with gr.Row(equal_height=True):
	with gr.Column(scale=2):
	with gr.Row(equal_height=True):
	with gr.Column():
	text_to_convert = gr.Textbox(verse)
	orig_voice = gr.Checkbox(label='Use original voice')
	voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
	with gr.Row(equal_height=True):
	button_text = gr.Button('Text to speech', interactive=True)
	button_audio = gr.Button('Convert audio', interactive=True)
	with gr.Row(equal_height=True):
	speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)

	# actions
	# Every chain below has the same shape: disable both buttons, run the
	# handler, then enable both buttons again.
	# Selecting a model loads it into tts_model and refreshes the language/speaker dropdowns.
	model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
	then(fn=on_model_tts_select, inputs=[model_tts_dropdown], outputs=[tts_model, language_dropdown, speaker_dropdown]).\
	then(activate, [button_text, button_audio], [button_text, button_audio])
	# Picking an example writes that example's file into the target-voice audio input.
	voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
	then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
	then(activate, [button_text, button_audio], [button_text, button_audio])

	# "Text to speech": synthesize the text box content into the output player.
	button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
	then(fn=text_to_speech, inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
	outputs=speech).\
	then(activate, [button_text, button_audio], [button_text, button_audio])

	# "Convert audio": convert the uploaded recording to the target voice.
	button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
	then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\
	then(activate, [button_text, button_audio], [button_text, button_audio])

	gr.HTML(article)
	# Start the app without requesting a public share link.
	demo.launch(share=False)