seayala committed on
Commit
0e42d56
verified
1 Parent(s): 23441c8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ import soundfile as sf
4
+ import tempfile
5
+
6
+ from transformers import (
7
+ SpeechT5Processor,
8
+ SpeechT5ForSpeechToSpeech,
9
+ SpeechT5HifiGan
10
+ )
11
+ from datasets import load_dataset
12
+ import librosa
13
+
# Load the pretrained models and supporting resources once, at import time.
_VC_CHECKPOINT = "microsoft/speecht5_vc"
processor = SpeechT5Processor.from_pretrained(_VC_CHECKPOINT)
model = SpeechT5ForSpeechToSpeech.from_pretrained(_VC_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Fixed speaker embeddings (this could be extended to a selection in the UI).
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
_xvector = embeddings_dataset[7306]["xvector"]
# unsqueeze(0) prepends a dimension of size 1 to the x-vector tensor.
speaker_embeddings = torch.tensor(_xvector).unsqueeze(0)
22
+
23
+ # Función principal
def voice_conversion(audio_file):
    """Convert the speech in an audio file using the fixed speaker embeddings.

    Args:
        audio_file: Path to the input audio file. Empty or ``None`` when no
            file was supplied.

    Returns:
        Path of a temporary ``.wav`` file containing the generated speech.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If ``audio_file`` is empty, so the UI shows a readable
            message instead of an opaque loader failure.
    """
    if not audio_file:
        raise gr.Error("Sube un archivo de audio antes de convertir.")

    # One rate is used for loading, for the processor and for the output file.
    sample_rate = 16000

    # Load the audio with librosa, requesting the 16 kHz rate used below.
    audio, _ = librosa.load(audio_file, sr=sample_rate)
    inputs = processor(audio=audio, sampling_rate=sample_rate, return_tensors="pt")

    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_values"],
            speaker_embeddings,
            vocoder=vocoder,
        )

    # Reserve a temporary path and close the handle *before* writing to it:
    # reopening a NamedTemporaryFile by name while it is still open is not
    # possible on every platform (notably Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, speech.cpu().numpy(), samplerate=sample_rate)
    return output_path
36
+
# Gradio interface
def _upload_audio_input(label):
    """Build an upload-only ``gr.Audio`` input that yields a file path.

    Gradio 4 replaced the ``source`` keyword of ``gr.Audio`` with ``sources``
    (a list). The new keyword is tried first; the legacy one is used only if
    the installed Gradio rejects it with a ``TypeError``.
    """
    try:
        return gr.Audio(sources=["upload"], type="filepath", label=label)
    except TypeError:
        return gr.Audio(source="upload", type="filepath", label=label)


interface = gr.Interface(
    fn=voice_conversion,
    inputs=_upload_audio_input("Sube un audio (voz hablada)"),
    outputs=gr.Audio(type="filepath", label="Voz convertida"),
    title="SpeechT5 Voice Conversion",
    description="Convierte una voz hablada en otra con SpeechT5 de Microsoft",
)

interface.launch()