import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
import huggingface_hub
from speechbrain.inference.TTS import Tacotron2
# Load Tacotron2 (text -> mel-spectrogram)
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
    run_opts={"device": "cpu"}
)
# Download your generator.keras checkpoint from the Hugging Face Hub
model_path = huggingface_hub.hf_hub_download(
    repo_id="Bmo411/WGAN",                     # <<-- put your exact repo path here
    filename="generator_epoch_3500.keras"      # or the exact filename in the repo
)
generator = keras.models.load_model(model_path, compile=False)
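# Optional sanity check (assumption: the generator expects mel input shaped (batch, 80, frames, 1)
# and emits a raw waveform); uncomment to inspect the loaded architecture before wiring up the UI.
# generator.summary()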
# Generation function: text -> mel-spectrogram (Tacotron2) -> waveform (generator)
def text_to_audio(text):
    # 1. Convert the text to a mel-spectrogram
    mel_output, _, _ = tacotron2.encode_text(text)
    mel = mel_output.squeeze(0).detach().cpu().numpy().astype(np.float32)  # (80, frames)
    # 2. Reshape for the generator
    mel_input = mel[np.newaxis, ..., np.newaxis]  # (1, 80, frames, 1)
    mel_input = tf.convert_to_tensor(mel_input)
    # 3. Run the generator to synthesize audio
    fake_audio = generator(mel_input, training=False)
    fake_audio = tf.squeeze(fake_audio, axis=0).numpy()  # (samples,)
    # 4. Keep the waveform in [-1, 1]
    fake_audio = np.clip(fake_audio, -1.0, 1.0)
    # 5. Return (sample_rate, numpy_array), the order gr.Audio(type="numpy") expects
    return 8000, fake_audio  # your model is trained at 8 kHz, right?
# Gradio interface
interface = gr.Interface(
    fn=text_to_audio,
    inputs=gr.Textbox(lines=1, placeholder="Type a number (e.g. nine)"),
    outputs=gr.Audio(type="numpy", label="Generated audio"),
    title="TTS demo with Tacotron2 + Generator",
    description="Converts text to audio using Tacotron2 plus your generator model."
)
# Launch the app
if __name__ == "__main__":
    interface.launch()
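# Minimal local-test sketch (assumption: the `soundfile` package is installed);
# uncomment to render one sample to disk instead of opening the Gradio UI.
#
# import soundfile as sf
# sr, audio = text_to_audio("nine")
# sf.write("sample.wav", audio, sr)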