Spaces:

Bmo411
/

Text-To-Speech

Sleeping

File size: 3,565 Bytes

3e69bb2
 
82a3b65
 
3e69bb2
89cbeba
82a3b65
 
2690696
82a3b65
 
 
 
 
 
2690696
7cbab11
 
d22c009
5712a10
 
82a3b65
2690696
82a3b65
725b8b5
 
e5ddb46
725b8b5
bd0c722
 
 
 
 
 
 
 
 
 
 
 
725b8b5
 
bd0c722
 
 
 
 
 
 
 
 
725b8b5
 
bd0c722
 
 
 
725b8b5
 
 
 
 
 
 
 
 
 
 
 
 
bd0c722
725b8b5
bd0c722
725b8b5
bd0c722
725b8b5
bd0c722
 
 
 
 
 
82a3b65
2690696
82a3b65
 
2b83dc2
725b8b5
2690696
725b8b5
2b83dc2
82a3b65
 
2690696
3e69bb2
bd0c722

import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2

# Load the pretrained SpeechBrain Tacotron2 text-to-mel model on CPU;
# "tmpdir_tts" is passed as the local save directory for its files.
tacotron2 = Tacotron2.from_hparams(
    run_opts=dict(device="cpu"),
    savedir="tmpdir_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)

# Download the trained Generator checkpoint from the Hugging Face Hub and
# load it without compiling (no optimizer/loss is configured at load time).
model_path = hf_hub_download(
    filename="generator_epoch_1000.keras",
    repo_id="Bmo411/WGAN",
)
generator = keras.models.load_model(model_path, compile=False)

# Convert text to a fixed-length audio clip for the Gradio audio output
def text_to_audio(text):
    """Synthesize a fixed-length audio clip from ``text``.

    The text is converted to a mel-spectrogram with the module-level
    ``tacotron2`` model, the spectrogram is fed to the module-level
    ``generator``, and the generated waveform is peak-normalized and then
    trimmed or zero-padded to exactly one second.

    Args:
        text: Text to synthesize. ``None``, an empty string, or a
            whitespace-only string yields silence.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 8000 and
        ``samples`` is a 1-D float32 NumPy array of 8000 samples. If any
        step of the synthesis raises, the error and traceback are printed
        and a silent clip of the same length is returned instead.
    """
    # Settings
    sample_rate = 8000  # Sampling rate in Hz
    target_length = 1 * sample_rate  # 1 second at 8000 Hz = 8000 samples
    default_audio = np.zeros(target_length, dtype=np.float32)

    if not text or not text.strip():
        return (sample_rate, default_audio)

    try:
        # Convert text to a mel-spectrogram with Tacotron2
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)

        print(f"Forma original del mel: {mel.shape}")

        # Bring the mel to the (batch, 80, frames, 1) layout fed to the
        # generator. Tacotron2.encode_text runs batched inference on a
        # single text, so the mel normally already carries a leading batch
        # axis; add one only when it is missing, so the input never ends up
        # with an extra dimension.
        if mel.ndim == 2:
            mel = mel[np.newaxis, ...]  # (1, 80, frames)
        if mel.ndim == 3:
            mel = mel[..., np.newaxis]  # (1, 80, frames, 1)
        mel_input = mel

        print(f"Forma del mel preparado: {mel_input.shape}")

        # Generate audio
        generated_audio = generator(mel_input, training=False)

        # Drop singleton axes and flatten, so the length handling below
        # always works on a 1-D signal (even for a 0-d or 2-D result).
        generated_audio = tf.squeeze(generated_audio).numpy().reshape(-1)

        print(f"Forma del audio generado: {generated_audio.shape}")

        # Peak-normalize, skipping empty or all-zero output to avoid a
        # reduction over nothing or a division by zero.
        peak = np.max(np.abs(generated_audio)) if generated_audio.size else 0.0
        if peak > 0:
            generated_audio = generated_audio / peak

        # Trim or zero-pad the audio to exactly target_length samples
        current_length = len(generated_audio)

        if current_length > target_length:
            # Longer than the target duration: keep only the beginning
            print(f"Recortando audio de {current_length} a {target_length} muestras")
            final_audio = generated_audio[:target_length]
        else:
            # Shorter than (or equal to) the target: pad the tail with zeros
            print(f"Rellenando audio de {current_length} a {target_length} muestras")
            final_audio = np.zeros(target_length, dtype=np.float32)
            final_audio[:current_length] = generated_audio

        # Gradio receives float32 samples
        final_audio = final_audio.astype(np.float32)

        print(f"Forma final del audio: {final_audio.shape}")

        return (sample_rate, final_audio)

    except Exception as e:
        # Deliberate best-effort boundary: log the failure and return
        # silence so the UI keeps responding instead of surfacing an error.
        print(f"Error en la generación de audio: {e}")
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)

# Build the Gradio interface: one text box in, one audio player out.
# The user-facing strings state a 1-second clip because text_to_audio always
# returns exactly 1 * 8000 samples at 8000 Hz.
interface = gr.Interface(
    fn=text_to_audio,
    inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
    outputs=gr.Audio(label="Audio generado (1 segundo)"),
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio de 1 segundo usando Tacotron2 + modelo Generator.",
    examples=[["Hello"], ["Hi there"]]
)

# Launch the app only when this file is run as a script
if __name__ == "__main__":
    interface.launch(debug=True)