import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2
# Load the Tacotron2 model
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
    run_opts={"device": "cpu"}
)
# Download and load the trained Generator model
model_path = hf_hub_download(
    repo_id="Bmo411/WGAN",
    filename="generator_epoch_1000.keras"
)
generator = keras.models.load_model(model_path, compile=False)
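# Optional sanity check (an assumption: the saved .keras model exposes a fully
# defined input signature); uncomment to confirm the generator expects mels
# shaped (batch, 80, frames, 1) as prepared below:
# print(generator.input_shape)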
# Convert text to audio
def text_to_audio(text):
    # Default empty array returned in case of error
    default_audio = np.zeros(8000, dtype=np.float32)
    sample_rate = 8000  # adjust to match your model's configuration

    if not text or not text.strip():
        return (sample_rate, default_audio)
    try:
        # Convert the text to a mel spectrogram with Tacotron2
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)

        # Print the original mel shape for debugging
        print(f"Original mel shape: {mel.shape}")

        # Rearrange the mel to match the expected shape (batch, 80, frames, 1)
        if len(mel.shape) == 2:
            # Shape (80, frames) - the most likely case
            mel_input = np.expand_dims(mel, axis=0)         # (1, 80, frames)
            mel_input = np.expand_dims(mel_input, axis=-1)  # (1, 80, frames, 1)
        elif len(mel.shape) == 3 and mel.shape[0] == 1:
            # Either (1, 80, frames) or (1, frames, 80); adapt accordingly
            if mel.shape[1] == 80:
                mel_input = np.expand_dims(mel, axis=-1)  # (1, 80, frames, 1)
            else:
                mel_input = np.expand_dims(np.transpose(mel, (0, 2, 1)), axis=-1)  # (1, 80, frames, 1)
        else:
            # Last-resort reshaping attempt
            mel_input = np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)

        print(f"Prepared mel shape: {mel_input.shape}")
        # Generate audio
        generated_audio = generator(mel_input, training=False)

        # Post-process the generated audio
        generated_audio = tf.squeeze(generated_audio).numpy()

        # Make sure there are non-zero values before normalizing
        if np.max(np.abs(generated_audio)) > 0:
            generated_audio = generated_audio / np.max(np.abs(generated_audio))

        # Convert to float32 for Gradio
        generated_audio = generated_audio.astype(np.float32)
        print(f"Generated audio shape: {generated_audio.shape}")

        current_length = len(generated_audio)
        if current_length > 8000:
            # Trim if longer than 8000 samples
            print(f"Trimming audio from {current_length} to {8000} samples")
            final_audio = generated_audio[:8000]
        else:
            # Zero-pad if shorter than 8000 samples
            print(f"Padding audio from {current_length} to {8000} samples")
            final_audio = np.zeros(8000, dtype=np.float32)
            final_audio[:current_length] = generated_audio

        return (sample_rate, final_audio)
    except Exception as e:
        print(f"Error during audio generation: {e}")
        # On error, print a full traceback for easier diagnosis
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)
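# Quick local sanity check (a sketch, assuming the models above loaded
# correctly); uncomment to exercise the pipeline outside of Gradio:
# sr, audio = text_to_audio("nine")
# print(sr, audio.shape, audio.dtype)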
# Build the Gradio interface
interface = gr.Interface(
    fn=text_to_audio,
    inputs=gr.Textbox(lines=2, placeholder="Type nine"),
    outputs=gr.Audio(label="Generated audio"),
    title="TTS demo with Tacotron2 + Generator",
    description="Converts text to audio using Tacotron2 plus a trained Generator model.",
    examples=[["nine"], ["nine"]]
)
# Launch the application
if __name__ == "__main__":
    interface.launch(debug=True)