Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -5,11 +5,12 @@ import random
 import argparse
 import os
 import torch
+import torch.nn.functional as F
 import librosa
 from tqdm import tqdm
 from diffusers import DDIMScheduler
 from solospeech.model.solospeech.conditioners import SoloSpeech_TSE
-from solospeech.model.solospeech.conditioners import SoloSpeech_TSR
+# from solospeech.model.solospeech.conditioners import SoloSpeech_TSR
 from solospeech.scripts.solospeech.utils import save_audio
 import shutil
 from solospeech.vae_modules.autoencoder_wrapper import Autoencoder
@@ -61,6 +62,7 @@ parser = argparse.ArgumentParser()
 # pre-trained model path
 parser.add_argument('--eta', type=int, default=0)
 parser.add_argument("--num_infer_steps", type=int, default=200)
+parser.add_argument("--num_candidates", type=int, default=4)
 parser.add_argument('--sample-rate', type=int, default=16000)
 # random seed
 parser.add_argument('--random-seed', type=int, default=42, help="Fixed seed")
@@ -71,11 +73,11 @@ local_dir = snapshot_download(
     repo_id="OpenSound/SoloSpeech-models"
 )
 args.tse_config = os.path.join(local_dir, "config_extractor.yaml")
-args.tsr_config = os.path.join(local_dir, "config_tsr.yaml")
+# args.tsr_config = os.path.join(local_dir, "config_tsr.yaml")
 args.vae_config = os.path.join(local_dir, "config_compressor.json")
 args.autoencoder_path = os.path.join(local_dir, "compressor.ckpt")
 args.tse_ckpt = os.path.join(local_dir, "extractor.pt")
-args.tsr_ckpt = os.path.join(local_dir, "tsr.pt")
+# args.tsr_ckpt = os.path.join(local_dir, "tsr.pt")
 args.geco_ckpt = os.path.join(local_dir, "corrector.ckpt")
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -84,8 +86,8 @@ print(f"Device: {device}")
 print("Loading models...")
 with open(args.tse_config, 'r') as fp:
     args.tse_config = yaml.safe_load(fp)
-with open(args.tsr_config, 'r') as fp:
-    args.tsr_config = yaml.safe_load(fp)
+# with open(args.tsr_config, 'r') as fp:
+#     args.tsr_config = yaml.safe_load(fp)
 args.v_prediction = args.tse_config["ddim"]["v_prediction"]
 # load compressor
 autoencoder = Autoencoder(args.autoencoder_path, args.vae_config, 'stft_vae', quantization_first=True)
@@ -98,12 +100,12 @@ tse_model = SoloSpeech_TSE(
 ).to(device)
 tse_model.load_state_dict(torch.load(args.tse_ckpt)['model'])
 tse_model.eval()
-# load tsr model
-tsr_model = SoloSpeech_TSR(
-    args.tsr_config['diffwrap']['UDiT']
-).to(device)
-tsr_model.load_state_dict(torch.load(args.tsr_ckpt)['model'])
-tsr_model.eval()
+# # load tsr model
+# tsr_model = SoloSpeech_TSR(
+#     args.tsr_config['diffwrap']['UDiT']
+# ).to(device)
+# tsr_model.load_state_dict(torch.load(args.tsr_ckpt)['model'])
+# tsr_model.eval()
 # load corrector
 geco_model = ScoreModel.load_from_checkpoint(
     args.geco_ckpt,
@@ -113,7 +115,7 @@ geco_model.eval(no_ema=False)
 geco_model.cuda()
 # load sid model
 ecapatdnn_model = Encoder.from_hparams(source="yangwang825/ecapa-tdnn-vox2")
-cosine_sim = torch.nn.CosineSimilarity(dim=-1)
+# cosine_sim = torch.nn.CosineSimilarity(dim=-1)
 # load diffusion tools
 noise_scheduler = DDIMScheduler(**args.tse_config["ddim"]['diffusers'])
 # these steps reset dtype of noise_scheduler params
@@ -128,7 +130,7 @@ _ = noise_scheduler.add_noise(latents, noise, timesteps)
 
 
 @spaces.GPU
-def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
+def sample_diffusion(tse_model, autoencoder, std, scheduler, device,
                      mixture=None, reference=None, lengths=None, reference_lengths=None,
                      ddim_steps=50, eta=0, seed=2025
                      ):
@@ -136,7 +138,7 @@ def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
     generator = torch.Generator(device=device).manual_seed(seed)
     scheduler.set_timesteps(ddim_steps)
     tse_pred = torch.randn(mixture.shape, generator=generator, device=device)
-    tsr_pred = torch.randn(mixture.shape, generator=generator, device=device)
+    # tsr_pred = torch.randn(mixture.shape, generator=generator, device=device)
 
     for t in scheduler.timesteps:
         tse_pred = scheduler.scale_model_input(tse_pred, t)
@@ -151,22 +153,22 @@ def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
         tse_pred = scheduler.step(model_output=model_output, timestep=t, sample=tse_pred,
                                   eta=eta, generator=generator).prev_sample
 
-    for t in scheduler.timesteps:
-        tsr_pred = scheduler.scale_model_input(tsr_pred, t)
-        model_output, _ = tsr_model(
-            x=tsr_pred,
-            timesteps=t,
-            mixture=mixture,
-            reference=tse_pred,
-            x_len=lengths,
-        )
-        tsr_pred = scheduler.step(model_output=model_output, timestep=t, sample=tsr_pred,
-                                  eta=eta, generator=generator).prev_sample
+    # for t in scheduler.timesteps:
+    #     tsr_pred = scheduler.scale_model_input(tsr_pred, t)
+    #     model_output, _ = tsr_model(
+    #         x=tsr_pred,
+    #         timesteps=t,
+    #         mixture=mixture,
+    #         reference=tse_pred,
+    #         x_len=lengths,
+    #     )
+    #     tsr_pred = scheduler.step(model_output=model_output, timestep=t, sample=tsr_pred,
+    #                               eta=eta, generator=generator).prev_sample
 
     tse_pred = autoencoder(embedding=tse_pred.transpose(2,1), std=std).squeeze(1)
-    tsr_pred = autoencoder(embedding=tsr_pred.transpose(2,1), std=std).squeeze(1)
+    # tsr_pred = autoencoder(embedding=tsr_pred.transpose(2,1), std=std).squeeze(1)
 
-    return tse_pred, tsr_pred
+    return tse_pred
 
 @spaces.GPU
 def tse(test_wav, enroll_wav):
@@ -179,19 +181,21 @@ def tse(test_wav, enroll_wav):
     with torch.no_grad():
         # compressor
         reference, _ = autoencoder(audio=reference.unsqueeze(1))
-        reference_lengths = torch.LongTensor([reference.shape[-1]]).to(device)
+        reference_lengths = torch.LongTensor([reference.shape[-1]] * args.num_candidates).to(device)
         mixture_input = torch.tensor(mixture).unsqueeze(0).to(device)
         mixture_wav = mixture_input
         mixture_input, std = autoencoder(audio=mixture_input.unsqueeze(1))
-        lengths = torch.LongTensor([mixture_input.shape[-1]]).to(device)
+        lengths = torch.LongTensor([mixture_input.shape[-1]] * args.num_candidates).to(device)
         # extractor
-
-
-
-
-
-
-
+        mixture_input = mixture_input.repeat(args.num_candidates, 1, 1)
+        reference = reference.repeat(args.num_candidates, 1, 1)
+        tse_pred = sample_diffusion(tse_model, autoencoder, std, noise_scheduler, device, mixture_input.transpose(2,1), reference.transpose(2,1), lengths, reference_lengths, ddim_steps=args.num_infer_steps, eta=args.eta, seed=args.random_seed)
+        tse_pred = sample_diffusion(tse_model, autoencoder, std, noise_scheduler, device, mixture_input.transpose(2,1), reference.transpose(2,1), lengths, reference_lengths, ddim_steps=args.num_infer_steps, eta=args.eta, seed=args.random_seed)
+        ecapatdnn_embedding_pred = ecapatdnn_model.encode_batch(tse_pred).squeeze()
+        ecapatdnn_embedding_ref = ecapatdnn_model.encode_batch(torch.tensor(reference_wav)).squeeze()
+        cos_sims = F.cosine_similarity(ecapatdnn_embedding_pred, ecapatdnn_embedding_ref.unsqueeze(0), dim=1)
+        _, max_idx = torch.max(cos_sims, dim=0)
+        pred = tse_pred[max_idx].unsqueeze(0)
         # corrector
         min_leng = min(pred.shape[-1], mixture_wav.shape[-1])
        x = pred[...,:min_leng]
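In short, this change disables the second-stage TSR refinement model and instead draws args.num_candidates extractions from the TSE diffusion model in one batched call, then keeps the candidate whose ECAPA-TDNN embedding is most cosine-similar to the enrollment recording before handing it to the corrector. The sketch below shows that best-of-N selection pattern in isolation; it is a toy illustration, not code from the Space, and toy_sampler / toy_speaker_encoder are hypothetical stand-ins for sample_diffusion and ecapatdnn_model.encode_batch.

import torch
import torch.nn.functional as F

def toy_sampler(batched_mixture):
    # Hypothetical stand-in for sample_diffusion(): one stochastic
    # extraction per batch row (the batch axis carries the candidates).
    return batched_mixture + 0.01 * torch.randn_like(batched_mixture)

def toy_speaker_encoder(wav):
    # Hypothetical stand-in for ecapatdnn_model.encode_batch():
    # maps each waveform to a fixed-size, L2-normalized embedding.
    return F.normalize(wav[:, :192], dim=-1)

num_candidates = 4
mixture = torch.randn(1, 16000)   # mixture waveform, shape (1, T)
enroll = torch.randn(1, 16000)    # enrollment audio of the target speaker

# Repeat the mixture along the batch axis so a single sampler call yields
# num_candidates candidates, mirroring mixture_input.repeat(args.num_candidates, 1, 1).
candidates = toy_sampler(mixture.repeat(num_candidates, 1))    # (N, T)

# Score each candidate against the enrollment embedding and keep the best,
# mirroring the F.cosine_similarity / torch.max lines in the diff.
emb_pred = toy_speaker_encoder(candidates)                     # (N, D)
emb_ref = toy_speaker_encoder(enroll)                          # (1, D)
cos_sims = F.cosine_similarity(emb_pred, emb_ref, dim=1)       # (N,)
pred = candidates[torch.argmax(cos_sims)].unsqueeze(0)         # (1, T)

Note that sample_diffusion re-seeds its torch.Generator with the same fixed seed on every call, so candidate diversity comes from the batch dimension of the initial noise rather than from per-candidate seeds.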