Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow import keras | |
import torch | |
from huggingface_hub import hf_hub_download | |
from speechbrain.inference.TTS import Tacotron2 | |
# Load the Tacotron2 model from the SpeechBrain hub entry below; files are
# stored under "tmpdir_tts" and the model is requested on the CPU device.
tacotron2 = Tacotron2.from_hparams(
    run_opts=dict(device="cpu"),
    savedir="tmpdir_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)
# Download the trained Generator checkpoint from the Hugging Face Hub.
# hf_hub_download returns the local path of the downloaded file.
model_path = hf_hub_download(
    filename="generator_epoch_1000.keras",
    repo_id="Bmo411/WGAN",
)

# Load the Keras model without compiling it (compile=False).
generator = keras.models.load_model(model_path, compile=False)
# Sample rate reported to Gradio alongside the waveform.
# Adjust to match the configuration of the generator model.
_SAMPLE_RATE = 8000

# Fixed length (in samples) of every clip returned by text_to_audio.
# 8000 samples at 8000 Hz is ONE second of audio.
# NOTE(review): earlier comments described this as "2 seconds" -- confirm
# which duration is actually intended before changing either constant.
_CLIP_SAMPLES = 8000

# Number of mel bins expected on axis 1 of the generator input.
_N_MELS = 80


def _prepare_mel_input(mel):
    """Reshape a mel-spectrogram array to the 4-D layout fed to the generator.

    Target layout is (batch, 80, frames, 1).
    """
    if mel.ndim == 2:
        # (80, frames) -> (1, 80, frames, 1)
        return mel[np.newaxis, :, :, np.newaxis]
    if mel.ndim == 3 and mel.shape[0] == 1:
        if mel.shape[1] != _N_MELS:
            # Presumably (1, frames, 80): move the mel axis to position 1.
            mel = np.transpose(mel, (0, 2, 1))
        # (1, 80, frames) -> (1, 80, frames, 1)
        return mel[..., np.newaxis]
    # Unknown layout: last-resort attempt, add a leading and a trailing axis.
    return mel[np.newaxis, ..., np.newaxis]


def _fit_to_length(audio, num_samples):
    """Return 1-D ``audio`` trimmed or zero-padded to ``num_samples`` float32 samples."""
    fitted = np.zeros(num_samples, dtype=np.float32)
    kept = min(audio.shape[0], num_samples)
    fitted[:kept] = audio[:kept]
    return fitted


def text_to_audio(text):
    """Convert text to a fixed-length audio clip for a Gradio audio output.

    The text is encoded to a mel-spectrogram with the module-level
    ``tacotron2`` model, the spectrogram is passed through the module-level
    ``generator`` model, and the result is peak-normalised and trimmed or
    zero-padded to ``_CLIP_SAMPLES`` samples.

    Args:
        text: Input text. ``None``, empty, or whitespace-only text yields silence.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D float32
        array of ``_CLIP_SAMPLES`` values. Silence (all zeros) is returned for
        blank input and whenever generation fails; the failure is printed
        together with its traceback instead of being raised.
    """
    # Fallback clip returned for blank input or on any generation error.
    silence = np.zeros(_CLIP_SAMPLES, dtype=np.float32)

    if not text or not text.strip():
        return (_SAMPLE_RATE, silence)

    try:
        # Text -> mel-spectrogram; only the first of the three outputs is used.
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Forma original del mel: {mel.shape}")

        mel_input = _prepare_mel_input(mel)
        print(f"Forma del mel preparado: {mel_input.shape}")

        generated = generator(mel_input, training=False)

        # squeeze() can leave a 0-d or a multi-dimensional array; flatten so
        # the length check and the padding below always see a 1-D signal.
        audio = tf.squeeze(generated).numpy().reshape(-1)

        # Peak-normalise, skipping empty or all-zero output to avoid
        # reducing an empty array or dividing by zero.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak

        # Gradio is handed float32 samples.
        audio = audio.astype(np.float32)
        print(f"Forma del audio generado: {audio.shape}")

        current_length = audio.shape[0]
        if current_length > _CLIP_SAMPLES:
            print(f"Recortando audio de {current_length} a {_CLIP_SAMPLES} muestras")
        else:
            print(f"Rellenando audio de {current_length} a {_CLIP_SAMPLES} muestras")

        return (_SAMPLE_RATE, _fit_to_length(audio, _CLIP_SAMPLES))
    except Exception as e:
        # Deliberate best-effort boundary for the UI callback: report the
        # failure with a full traceback and fall back to silence.
        print(f"Error en la generación de audio: {e}")
        import traceback

        traceback.print_exc()
        return (_SAMPLE_RATE, silence)
# Build the Gradio UI: a two-line text box as input and an audio player as output.
_text_box = gr.Textbox(lines=2, placeholder="Escribe nine")
_audio_player = gr.Audio(label="Audio generado")

interface = gr.Interface(
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
    fn=text_to_audio,
    inputs=_text_box,
    outputs=_audio_player,
    examples=[["nine"], ["nine"]],
)

# Launch the app (debug mode) only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch(debug=True)