Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,10 +22,9 @@ generator = keras.models.load_model(model_path, compile=False)
|
|
22 |
|
23 |
# Función para convertir texto a audio
|
24 |
def text_to_audio(text):
|
25 |
-
#
|
26 |
-
|
27 |
-
|
28 |
-
default_audio = np.zeros(target_length, dtype=np.float32)
|
29 |
|
30 |
if not text or not text.strip():
|
31 |
return (sample_rate, default_audio)
|
@@ -35,11 +34,24 @@ def text_to_audio(text):
|
|
35 |
mel_output, _, _ = tacotron2.encode_text(text)
|
36 |
mel = mel_output.detach().cpu().numpy().astype(np.float32)
|
37 |
|
|
|
38 |
print(f"Forma original del mel: {mel.shape}")
|
39 |
|
40 |
# Reorganizar el mel para que coincida con la forma esperada (batch, 80, frames, 1)
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
print(f"Forma del mel preparado: {mel_input.shape}")
|
45 |
|
@@ -49,34 +61,20 @@ def text_to_audio(text):
|
|
49 |
# Procesar el audio generado
|
50 |
generated_audio = tf.squeeze(generated_audio).numpy()
|
51 |
|
52 |
-
print(f"Forma del audio generado: {generated_audio.shape}")
|
53 |
-
|
54 |
# Asegurarse de que hay valores no cero antes de normalizar
|
55 |
if np.max(np.abs(generated_audio)) > 0:
|
56 |
generated_audio = generated_audio / np.max(np.abs(generated_audio))
|
57 |
|
58 |
-
# RECORTAR O RELLENAR EL AUDIO A 2 SEGUNDOS (16000 muestras)
|
59 |
-
current_length = len(generated_audio)
|
60 |
-
|
61 |
-
if current_length > target_length:
|
62 |
-
# Recortar si es más largo de 2 segundos
|
63 |
-
print(f"Recortando audio de {current_length} a {target_length} muestras")
|
64 |
-
final_audio = generated_audio[:target_length]
|
65 |
-
else:
|
66 |
-
# Rellenar con ceros si es más corto de 2 segundos
|
67 |
-
print(f"Rellenando audio de {current_length} a {target_length} muestras")
|
68 |
-
final_audio = np.zeros(target_length, dtype=np.float32)
|
69 |
-
final_audio[:current_length] = generated_audio
|
70 |
-
|
71 |
# Convertir a float32 para gradio
|
72 |
-
|
73 |
|
74 |
-
print(f"Forma
|
75 |
|
76 |
-
return (sample_rate,
|
77 |
|
78 |
except Exception as e:
|
79 |
print(f"Error en la generación de audio: {e}")
|
|
|
80 |
import traceback
|
81 |
traceback.print_exc()
|
82 |
return (sample_rate, default_audio)
|
@@ -85,12 +83,12 @@ def text_to_audio(text):
|
|
85 |
interface = gr.Interface(
|
86 |
fn=text_to_audio,
|
87 |
inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
|
88 |
-
outputs=gr.Audio(label="Audio generado
|
89 |
title="Demo de TTS con Tacotron2 + Generador",
|
90 |
-
description="Convierte texto en audio
|
91 |
examples=[["Hello"], ["Hi there"]]
|
92 |
)
|
93 |
|
94 |
# Lanzar aplicación
|
95 |
-
if
|
96 |
interface.launch(debug=True)
|
|
|
22 |
|
23 |
# Función para convertir texto a audio
|
24 |
def text_to_audio(text):
|
25 |
+
# Crear un array vacío por defecto en caso de error
|
26 |
+
default_audio = np.zeros(8000, dtype=np.float32)
|
27 |
+
sample_rate = 8000 # Ajusta según la configuración de tu modelo
|
|
|
28 |
|
29 |
if not text or not text.strip():
|
30 |
return (sample_rate, default_audio)
|
|
|
34 |
mel_output, _, _ = tacotron2.encode_text(text)
|
35 |
mel = mel_output.detach().cpu().numpy().astype(np.float32)
|
36 |
|
37 |
+
# Imprimir forma original del mel para debugging
|
38 |
print(f"Forma original del mel: {mel.shape}")
|
39 |
|
40 |
# Reorganizar el mel para que coincida con la forma esperada (batch, 80, frames, 1)
|
41 |
+
# Si mel tiene forma (80, frames) - lo más probable
|
42 |
+
if len(mel.shape) == 2:
|
43 |
+
mel_input = np.expand_dims(mel, axis=0) # (1, 80, frames)
|
44 |
+
mel_input = np.expand_dims(mel_input, axis=-1) # (1, 80, frames, 1)
|
45 |
+
# Si viene con otra forma, intentamos adaptarla
|
46 |
+
elif len(mel.shape) == 3 and mel.shape[0] == 1:
|
47 |
+
# Si es (1, 80, frames) o (1, frames, 80)
|
48 |
+
if mel.shape[1] == 80:
|
49 |
+
mel_input = np.expand_dims(mel, axis=-1) # (1, 80, frames, 1)
|
50 |
+
else:
|
51 |
+
mel_input = np.expand_dims(np.transpose(mel, (0, 2, 1)), axis=-1) # (1, 80, frames, 1)
|
52 |
+
else:
|
53 |
+
# Intento final de reorganización
|
54 |
+
mel_input = np.expand_dims(np.expand_dims(mel, axis=0), axis=-1)
|
55 |
|
56 |
print(f"Forma del mel preparado: {mel_input.shape}")
|
57 |
|
|
|
61 |
# Procesar el audio generado
|
62 |
generated_audio = tf.squeeze(generated_audio).numpy()
|
63 |
|
|
|
|
|
64 |
# Asegurarse de que hay valores no cero antes de normalizar
|
65 |
if np.max(np.abs(generated_audio)) > 0:
|
66 |
generated_audio = generated_audio / np.max(np.abs(generated_audio))
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
# Convertir a float32 para gradio
|
69 |
+
generated_audio = generated_audio.astype(np.float32)
|
70 |
|
71 |
+
print(f"Forma del audio generado: {generated_audio.shape}")
|
72 |
|
73 |
+
return (sample_rate, generated_audio)
|
74 |
|
75 |
except Exception as e:
|
76 |
print(f"Error en la generación de audio: {e}")
|
77 |
+
# Si hay error, imprimir un traceback completo para mejor diagnóstico
|
78 |
import traceback
|
79 |
traceback.print_exc()
|
80 |
return (sample_rate, default_audio)
|
|
|
83 |
interface = gr.Interface(
|
84 |
fn=text_to_audio,
|
85 |
inputs=gr.Textbox(lines=2, placeholder="Escribe algo (ej. 'Hello world')"),
|
86 |
+
outputs=gr.Audio(label="Audio generado"),
|
87 |
title="Demo de TTS con Tacotron2 + Generador",
|
88 |
+
description="Convierte texto en audio usando Tacotron2 + modelo Generator entrenado.",
|
89 |
examples=[["Hello"], ["Hi there"]]
|
90 |
)
|
91 |
|
92 |
# Lanzar aplicación
|
93 |
+
# Start the Gradio server only when this file is executed as a script.
# The guard must use the double-underscore names: `_name_` is an undefined
# identifier (NameError) and "_main_" would never equal the module name.
if __name__ == "__main__":
    interface.launch(debug=True)
|