File size: 3,573 Bytes
3e69bb2
 
82a3b65
 
3e69bb2
89cbeba
82a3b65
 
2690696
82a3b65
 
 
 
 
 
2690696
7cbab11
 
d22c009
5712a10
 
82a3b65
2690696
82a3b65
bd0c722
 
d857d8c
bd0c722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82a3b65
2690696
82a3b65
 
2b83dc2
bd0c722
2690696
bd0c722
2b83dc2
82a3b65
 
2690696
3e69bb2
bd0c722
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
from huggingface_hub import hf_hub_download
from speechbrain.inference.TTS import Tacotron2

# Text-to-mel front end: the pretrained SpeechBrain Tacotron2 checkpoint named
# in `source`, run on CPU, with its files stored under `savedir`.
tacotron2 = Tacotron2.from_hparams(
    run_opts={"device": "cpu"},
    savedir="tmpdir_tts",
    source="speechbrain/tts-tacotron2-ljspeech",
)

# Mel-to-audio back end: download the trained generator checkpoint from the
# Hugging Face Hub, then load it without compiling (compile=False), since it
# is only called for inference in this script.
model_path = hf_hub_download(
    filename="generator_epoch_1000.keras",
    repo_id="Bmo411/WGAN",
)
generator = keras.models.load_model(model_path, compile=False)

# Text-to-speech pipeline: text -> mel-spectrogram -> generated waveform.
def _prepare_mel_input(mel):
    """Arrange a mel-spectrogram in the generator's input layout.

    The target layout is ``(1, 80, frames, 1)``; that expectation comes from
    the original author's note, not from anything this file can check
    (TODO confirm against the generator's input signature).

    Args:
        mel: Array-like mel-spectrogram, either 2-D or 3-D with batch size 1.

    Returns:
        A float32 NumPy array with a leading batch axis and a trailing
        channel axis added.
    """
    mel = np.asarray(mel, dtype=np.float32)

    # Promote an unbatched spectrogram first so that 2-D and 3-D inputs go
    # through the same layout correction below.
    if mel.ndim == 2:
        mel = mel[np.newaxis, ...]

    if mel.ndim == 3 and mel.shape[0] == 1:
        # The 80 mel bins must sit on axis 1; otherwise assume the input is
        # (1, frames, 80) and swap the last two axes.
        if mel.shape[1] != 80:
            mel = np.transpose(mel, (0, 2, 1))
        return mel[..., np.newaxis]

    # Unrecognized layout: best-effort wrapping only. If the generator
    # rejects the shape, the caller's error handling takes over.
    return mel[np.newaxis, ..., np.newaxis]


def _normalize_audio(audio):
    """Return *audio* as a float32 array peak-normalized to [-1, 1].

    Args:
        audio: Array-like waveform produced by the generator.

    Returns:
        A float32 NumPy array with at least one dimension. Non-finite
        samples are replaced by 0; an all-zero signal is returned unchanged.

    Raises:
        ValueError: If *audio* contains no samples.
    """
    # atleast_1d: squeezing a single-sample output yields a 0-d array, which
    # is not a usable waveform.
    audio = np.atleast_1d(np.asarray(audio, dtype=np.float32))
    if audio.size == 0:
        raise ValueError("el generador no produjo muestras de audio")

    # NaN/Inf would make the peak NaN/Inf, so the signal would either skip
    # normalization (NaN > 0 is False) or collapse to zeros/NaN.
    if not np.isfinite(audio).all():
        print("Aviso: el audio generado contiene valores no finitos; se reemplazan por 0")
        audio = np.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)

    peak = float(np.max(np.abs(audio)))
    if peak > 0:
        audio = audio / peak
    return audio.astype(np.float32)


def text_to_audio(text):
    """Synthesize speech for *text* and return it as a Gradio audio tuple.

    The text is encoded by the module-level ``tacotron2`` model; the first of
    its three return values is used as the mel-spectrogram. That spectrogram
    is fed to the module-level ``generator`` and the result is
    peak-normalized.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only values
            produce silence without running either model.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 8000 and
        ``samples`` is a float32 NumPy array. If anything fails during
        synthesis, the error and its traceback are printed and one second of
        silence is returned instead of raising, so the UI keeps responding.
    """
    # NOTE(review): presumably this must match the rate the generator was
    # trained at; the original comment only says to adjust it to the model.
    sample_rate = 8000
    # One second of silence, used for blank input and as the error fallback.
    default_audio = np.zeros(sample_rate, dtype=np.float32)

    if not text or not text.strip():
        return (sample_rate, default_audio)

    try:
        mel_output, _, _ = tacotron2.encode_text(text)
        mel = mel_output.detach().cpu().numpy().astype(np.float32)
        print(f"Forma original del mel: {mel.shape}")

        mel_input = _prepare_mel_input(mel)
        print(f"Forma del mel preparado: {mel_input.shape}")

        raw_audio = generator(mel_input, training=False)
        generated_audio = _normalize_audio(tf.squeeze(raw_audio).numpy())
        print(f"Forma del audio generado: {generated_audio.shape}")

        return (sample_rate, generated_audio)

    except Exception as e:
        # Deliberate catch-all at the UI boundary: log the full diagnosis and
        # fall back to silence rather than crash the request.
        print(f"Error en la generación de audio: {e}")
        import traceback
        traceback.print_exc()
        return (sample_rate, default_audio)

# Gradio UI: a two-line text box feeds text_to_audio and the returned
# (sample_rate, samples) tuple is rendered by an audio component.
interface = gr.Interface(
    title="Demo de TTS con Tacotron2 + Generador",
    description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
    fn=text_to_audio,
    inputs=gr.Textbox(placeholder="Escribe algo (ej. 'Hello world')", lines=2),
    outputs=gr.Audio(label="Audio generado"),
    examples=[["Hello"], ["Hi there"]],
)

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch(debug=True)