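"""Gradio demo: text-to-speech built from a SpeechBrain Tacotron2 front end and a
custom Keras generator (loaded from the Bmo411/WGAN repo) that turns mel
spectrograms into short audio clips."""
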
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2

# Load the Tacotron2 model
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
    run_opts={"device": "cpu"}
)

# Download and load the trained Generator model
model_path = hf_hub_download(
    repo_id="Bmo411/WGAN",
    filename="generator_epoch_1000.keras"
)
generator = keras.models.load_model(model_path, compile=False)

# Function that converts text to audio
def text_to_audio(text):
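    """Synthesize a short audio clip from `text`.

    Returns a `(sample_rate, waveform)` tuple, the output format expected by
    `gr.Audio`. On empty input or any error, a silent clip of the target
    length is returned instead.
    """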
    # Settings
    sample_rate = 8000  # Sampling rate
    target_length = 2 * sample_rate  # 2 seconds at 8000 Hz = 16000 samples
    default_audio = np.zeros(target_length, dtype=np.float32)

    if not text or not text.strip():
        return (sample_rate, default_audio)

    try:
        # Convert the text to a mel spectrogram with Tacotron2
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Original mel shape: {mel.shape}")

        # Rearrange the mel to match the generator's expected input (batch, 80, frames, 1);
        # squeeze first in case encode_text already returns a batch dimension
        mel = np.squeeze(mel)  # (80, frames)
        mel_input = np.expand_dims(mel, axis=0)  # (1, 80, frames)
        mel_input = np.expand_dims(mel_input, axis=-1)  # (1, 80, frames, 1)
        print(f"Prepared mel shape: {mel_input.shape}")

        # Generate the audio with the Generator model
        generated_audio = generator(mel_input, training=False)

        # Post-process the generated audio
        generated_audio = tf.squeeze(generated_audio).numpy()
        print(f"Generated audio shape: {generated_audio.shape}")

        # Make sure there are non-zero values before normalizing
        if np.max(np.abs(generated_audio)) > 0:
            generated_audio = generated_audio / np.max(np.abs(generated_audio))

        # Trim or pad the audio to 2 seconds (16000 samples)
        current_length = len(generated_audio)
        if current_length > target_length:
            # Trim if it is longer than 2 seconds
            print(f"Trimming audio from {current_length} to {target_length} samples")
            final_audio = generated_audio[:target_length]
        else:
            # Pad with zeros if it is shorter than 2 seconds
            print(f"Padding audio from {current_length} to {target_length} samples")
            final_audio = np.zeros(target_length, dtype=np.float32)
            final_audio[:current_length] = generated_audio

        # Convert to float32 for Gradio
        final_audio = final_audio.astype(np.float32)
        print(f"Final audio shape: {final_audio.shape}")

        return (sample_rate, final_audio)

    except Exception as e:
        print(f"Error during audio generation: {e}")
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)

# Build the Gradio interface
interface = gr.Interface(
    fn=text_to_audio,
    inputs=gr.Textbox(lines=2, placeholder="Type something (e.g. 'Hello world')"),
    outputs=gr.Audio(label="Generated audio (2 seconds)"),
    title="TTS demo with Tacotron2 + Generator",
    description="Converts text into 2 seconds of audio using Tacotron2 + the Generator model.",
    examples=[["Hello"], ["Hi there"]]
)

# Launch the application
if __name__ == "__main__":
    interface.launch(debug=True)