Spaces:

Bmo411
/

Text-To-Speech

Sleeping

App Files Community

Text-To-Speech / app.py

Bmo411

Update app.py

e5ddb46 verified 4 months ago

raw

history blame

3.57 kB

	import gradio as gr
	import numpy as np
	import tensorflow as tf
	from tensorflow import keras
	import torch
	from huggingface_hub import hf_hub_download
	from speechbrain.inference.TTS import Tacotron2

	# Load the Tacotron2 model from the SpeechBrain LJSpeech source, on the CPU.
	tacotron2 = Tacotron2.from_hparams(
	    source="speechbrain/tts-tacotron2-ljspeech",
	    savedir="tmpdir_tts",
	    run_opts=dict(device="cpu"),
	)

	# Download the trained generator checkpoint, then load it without compiling
	# (compile=False: no optimizer/loss is restored).
	model_path = hf_hub_download(repo_id="Bmo411/WGAN", filename="generator_epoch_1000.keras")
	generator = keras.models.load_model(model_path, compile=False)

	def _mel_to_generator_input(mel):
	    """Return *mel* with a leading batch axis and a trailing channel axis.

	    The generator input is described by the original author as
	    ``(batch, 80, frames, 1)``. A 2-D ``(80, frames)`` array gets a batch
	    axis prepended; an array that already has three axes is assumed to be
	    batched and is left as is.
	    NOTE(review): SpeechBrain's ``encode_text`` appears to return an
	    already-batched ``(1, n_mels, frames)`` tensor -- confirm against the
	    installed version.
	    """
	    if mel.ndim == 2:
	        mel = mel[np.newaxis, ...]
	    return mel[..., np.newaxis]


	def _fit_to_length(audio, target_length):
	    """Trim *audio* to *target_length* samples, or zero-pad it at the end."""
	    current_length = len(audio)
	    if current_length > target_length:
	        print(f"Recortando audio de {current_length} a {target_length} muestras")
	        return audio[:target_length]

	    print(f"Rellenando audio de {current_length} a {target_length} muestras")
	    padded = np.zeros(target_length, dtype=np.float32)
	    padded[:current_length] = audio
	    return padded


	def text_to_audio(text):
	    """Convert *text* into a fixed-length audio clip.

	    The text is turned into a mel-spectrogram with the module-level
	    ``tacotron2`` model, the spectrogram is fed to the module-level
	    ``generator`` model, and the result is peak-normalised and then trimmed
	    or zero-padded to exactly one second (8000 samples at 8000 Hz).

	    Args:
	        text: The text to synthesise. ``None``, empty, or whitespace-only
	            input yields silence.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``samples`` is a float32
	        NumPy array of ``sample_rate`` samples. If anything fails during
	        synthesis the error is printed and a silent clip is returned, so
	        this function does not raise for synthesis errors.
	    """
	    sample_rate = 8000  # sampling rate in Hz
	    duration_seconds = 1
	    target_length = duration_seconds * sample_rate  # 1 s at 8000 Hz = 8000 samples
	    default_audio = np.zeros(target_length, dtype=np.float32)

	    if not text or not text.strip():
	        return (sample_rate, default_audio)

	    try:
	        # Text -> mel-spectrogram with Tacotron2.
	        mel_output, _, _ = tacotron2.encode_text(text)
	        mel = mel_output.detach().cpu().numpy().astype(np.float32)
	        print(f"Forma original del mel: {mel.shape}")

	        mel_input = _mel_to_generator_input(mel)
	        print(f"Forma del mel preparado: {mel_input.shape}")

	        # Mel-spectrogram -> waveform; squeeze drops every size-1 axis.
	        generated_audio = generator(mel_input, training=False)
	        generated_audio = tf.squeeze(generated_audio).numpy()
	        print(f"Forma del audio generado: {generated_audio.shape}")

	        # Peak-normalise, skipping all-zero output to avoid dividing by zero.
	        peak = np.max(np.abs(generated_audio))
	        if peak > 0:
	            generated_audio = generated_audio / peak

	        # Force the clip to exactly target_length samples, as float32.
	        final_audio = _fit_to_length(generated_audio, target_length)
	        final_audio = final_audio.astype(np.float32)
	        print(f"Forma final del audio: {final_audio.shape}")

	        return (sample_rate, final_audio)

	    except Exception as e:
	        # UI boundary: report the failure and fall back to silence rather
	        # than surfacing an exception to the caller.
	        import traceback

	        print(f"Error en la generación de audio: {e}")
	        traceback.print_exc()
	        return (sample_rate, default_audio)

	# Build the Gradio interface: one text box in, one audio player out.
	# text_to_audio always returns 8000 samples at 8000 Hz, i.e. a 1-second
	# clip, so the label and description state 1 second.
	interface = gr.Interface(
	    fn=text_to_audio,
	    inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
	    outputs=gr.Audio(label="Audio generado (1 segundo)"),
	    title="Demo de TTS con Tacotron2 + Generador",
	    description="Convierte texto en audio de 1 segundo usando Tacotron2 + modelo Generator.",
	    examples=[["Hello"], ["Hi there"]],
	)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    interface.launch(debug=True)