Spaces:
Sleeping
Sleeping
File size: 3,575 Bytes
3e69bb2 82a3b65 3e69bb2 89cbeba 82a3b65 2690696 82a3b65 2690696 7cbab11 d22c009 5712a10 82a3b65 2690696 82a3b65 bd0c722 82a3b65 2690696 82a3b65 8156da9 bd0c722 2690696 bd0c722 8156da9 82a3b65 2690696 3e69bb2 bd0c722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2
# Load the pretrained SpeechBrain Tacotron2 model (text -> mel-spectrogram).
# The checkpoint is fetched from "speechbrain/tts-tacotron2-ljspeech", cached
# under "tmpdir_tts", and run on the CPU.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
    run_opts={"device": "cpu"}
)
# Download the trained generator checkpoint from the Hugging Face Hub and load
# it with Keras. compile=False because the model is only called for inference
# in this file, so no optimizer/loss state is needed.
model_path = hf_hub_download(
    repo_id="Bmo411/WGAN",
    filename="generator_epoch_1000.keras"
)
generator = keras.models.load_model(model_path, compile=False)
def _prepare_mel_input(mel):
    """Add batch and channel axes so the mel array can be fed to the generator.

    Args:
        mel: NumPy mel-spectrogram array obtained from Tacotron2.

    Returns:
        For a 3-D array with a leading axis of size 1, a 4-D array shaped
        (1, 80, frames, 1); the last two axes are swapped first when axis 1
        is not 80. Any other input gets a new leading axis and a new trailing
        axis (so a 2-D (80, frames) array becomes (1, 80, frames, 1)).
    """
    if mel.ndim == 3 and mel.shape[0] == 1:
        if mel.shape[1] != 80:
            # Presumably (1, frames, 80): move the mel bins to axis 1.
            mel = np.transpose(mel, (0, 2, 1))
        return np.expand_dims(mel, axis=-1)
    # 2-D input and the best-effort fallback for unexpected ranks share the
    # same treatment: prepend a batch axis and append a channel axis.
    return np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)


def _peak_normalize(audio):
    """Scale *audio* by its absolute peak and return it as float32.

    All-zero input is returned unscaled to avoid dividing by zero.
    """
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    return audio.astype(np.float32)


# Convert text to audio (Gradio callback).
def text_to_audio(text):
    """Synthesize audio for *text* with Tacotron2 followed by the generator.

    Args:
        text: Input string from the Gradio textbox.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a float32
        NumPy array. For empty or whitespace-only text, or when any step of
        the pipeline raises, ``samples`` is 8000 zeros so the caller always
        receives a valid audio tuple.
    """
    # Sample rate reported alongside the samples; adjust it to match the
    # configuration of the generator model.
    sample_rate = 22050
    # Silent fallback returned for empty input or on failure.
    default_audio = np.zeros(8000, dtype=np.float32)

    if not text or not text.strip():
        return (sample_rate, default_audio)

    try:
        # Text -> mel-spectrogram; only the first of the three returned
        # values is used.
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Forma original del mel: {mel.shape}")

        mel_input = _prepare_mel_input(mel)
        print(f"Forma del mel preparado: {mel_input.shape}")

        # Mel-spectrogram -> waveform, then drop all size-1 axes.
        raw_audio = tf.squeeze(generator(mel_input, training=False)).numpy()

        generated_audio = _peak_normalize(raw_audio)
        print(f"Forma del audio generado: {generated_audio.shape}")
        return (sample_rate, generated_audio)
    except Exception as e:
        # UI boundary: report the failure with a full traceback for
        # diagnosis, but hand silence back instead of propagating.
        print(f"Error en la generación de audio: {e}")
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)
# Build the Gradio interface: a two-line textbox feeds text_to_audio and the
# returned (sample_rate, samples) tuple is rendered by an audio component.
interface = gr.Interface(
    fn=text_to_audio,
    inputs=gr.Textbox(lines=2, placeholder="Escribe algun numero del 0-9, en letra"),
    outputs=gr.Audio(label="Audio generado"),
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
    examples=[["nine"], ["cero"]]
)
# Launch the application only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(debug=True)