Spaces:
Sleeping
Sleeping
File size: 4,090 Bytes
3e69bb2 82a3b65 3e69bb2 89cbeba 82a3b65 2690696 82a3b65 2690696 7cbab11 d22c009 5712a10 82a3b65 2690696 82a3b65 b2de89e bd0c722 b2de89e bd0c722 b2de89e bd0c722 b2de89e bd0c722 b2de89e 3d92837 bd0c722 3d92837 cd231b1 bd0c722 b2de89e bd0c722 82a3b65 2690696 82a3b65 3d92837 b2de89e 2690696 b2de89e 3d92837 82a3b65 2690696 3d92837 bd0c722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2
# Load the pretrained SpeechBrain Tacotron2 model (used below to turn text
# into a mel-spectrogram), pinned to the CPU.
tacotron2 = Tacotron2.from_hparams(
    run_opts=dict(device="cpu"),
    savedir="tmpdir_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)

# Download the trained Generator checkpoint from the Hugging Face Hub and
# load it without compiling (it is only called for inference in this file).
model_path = hf_hub_download(
    filename="generator_epoch_1000.keras",
    repo_id="Bmo411/WGAN",
)
generator = keras.models.load_model(model_path, compile=False)
# Output format of the demo. Adjust _SAMPLE_RATE to match the configuration
# the generator was trained with. Every clip is forced to _CLIP_SAMPLES
# samples, i.e. exactly one second at 8 kHz.
_SAMPLE_RATE = 8000
_CLIP_SAMPLES = 8000
# Number of mel bins the generator input is expected to have on axis 1.
_N_MELS = 80


def _silence():
    """Return a fresh all-zero float32 clip used as the fallback output."""
    return np.zeros(_CLIP_SAMPLES, dtype=np.float32)


def _prepare_mel_input(mel):
    """Arrange a mel-spectrogram as a (1, 80, frames, 1) generator input."""
    if mel.ndim == 2:
        # (80, frames) is the expected layout; a (frames, 80) array is
        # transposed so that it agrees with the 3-D handling below.
        if mel.shape[0] != _N_MELS and mel.shape[1] == _N_MELS:
            mel = mel.T
        return mel[np.newaxis, :, :, np.newaxis]
    if mel.ndim == 3 and mel.shape[0] == 1:
        # (1, 80, frames) is used as is; otherwise assume (1, frames, 80).
        if mel.shape[1] != _N_MELS:
            mel = np.transpose(mel, (0, 2, 1))
        return mel[..., np.newaxis]
    # Unexpected rank: last-resort attempt, add batch and channel axes.
    return mel[np.newaxis, ..., np.newaxis]


def _peak_normalize(audio):
    """Scale audio to a peak magnitude of 1 (unless silent) as float32."""
    # An empty array has no peak; np.max would raise on it.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak
    return audio.astype(np.float32)


def _fit_length(audio, num_samples):
    """Truncate or zero-pad a 1-D clip to exactly num_samples samples."""
    current_length = len(audio)
    if current_length > num_samples:
        print(f"Recortando audio de {current_length} a {num_samples} muestras")
        return audio[:num_samples]
    print(f"Rellenando audio de {current_length} a {num_samples} muestras")
    fitted = np.zeros(num_samples, dtype=np.float32)
    fitted[:current_length] = audio
    return fitted


def text_to_audio(text):
    """Synthesize a one-second audio clip from text.

    The text is encoded to a mel-spectrogram with the module-level
    ``tacotron2`` model, which is then passed to the module-level
    ``generator`` model. The result is peak-normalized and cut or zero-padded
    to a fixed length.

    Args:
        text: The text to synthesize. Anything that is not a non-blank
            string yields silence.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 8000 and
        ``samples`` is a 1-D float32 NumPy array of 8000 values. This
        function does not raise for synthesis failures: any exception is
        printed together with its traceback and silence is returned instead.
    """
    # Validate before the try block using a check that cannot raise, so that
    # non-string input gets the same silent fallback as empty input.
    if not isinstance(text, str) or not text.strip():
        return (_SAMPLE_RATE, _silence())

    try:
        # Text -> mel-spectrogram with Tacotron2.
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Forma original del mel: {mel.shape}")

        mel_input = _prepare_mel_input(mel)
        print(f"Forma del mel preparado: {mel_input.shape}")

        # Mel-spectrogram -> waveform with the trained generator.
        generated_audio = generator(mel_input, training=False)
        # Flatten after squeezing so the length handling below always sees a
        # 1-D array, whatever the rank of the generator output.
        generated_audio = tf.squeeze(generated_audio).numpy().reshape(-1)

        generated_audio = _peak_normalize(generated_audio)
        print(f"Forma del audio generado: {generated_audio.shape}")

        return (_SAMPLE_RATE, _fit_length(generated_audio, _CLIP_SAMPLES))
    except Exception as e:
        # UI boundary: report the failure and fall back to silence rather
        # than surfacing an exception to the Gradio front end.
        print(f"Error en la generación de audio: {e}")
        import traceback
        traceback.print_exc()
        return (_SAMPLE_RATE, _silence())
# Build the Gradio interface: a two-line text box as input and an audio
# player as output, both wired to text_to_audio.
interface = gr.Interface(
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
    fn=text_to_audio,
    inputs=gr.Textbox(placeholder="Escribe nine", lines=2),
    outputs=gr.Audio(label="Audio generado"),
    examples=[
        ["nine"],
        ["nine"],
    ],
)
# Launch the application only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(debug=True)