OpenSound committed
Commit f5e60bb · verified · 1 Parent(s): 336c98d

Update app.py

Files changed (1): app.py (+40 −36)
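In short: the SoloSpeech_TSR refinement stage is retired (its import, config, checkpoint, and sampling loop are commented out), and the extractor now draws --num_candidates TSE samples in a single batch, keeping the one whose ECAPA-TDNN speaker embedding is closest, by cosine similarity, to the enrollment recording.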
app.py CHANGED
@@ -5,11 +5,12 @@ import random
 import argparse
 import os
 import torch
+import torch.nn.functional as F
 import librosa
 from tqdm import tqdm
 from diffusers import DDIMScheduler
 from solospeech.model.solospeech.conditioners import SoloSpeech_TSE
-from solospeech.model.solospeech.conditioners import SoloSpeech_TSR
+# from solospeech.model.solospeech.conditioners import SoloSpeech_TSR
 from solospeech.scripts.solospeech.utils import save_audio
 import shutil
 from solospeech.vae_modules.autoencoder_wrapper import Autoencoder
@@ -61,6 +62,7 @@ parser = argparse.ArgumentParser()
 # pre-trained model path
 parser.add_argument('--eta', type=int, default=0)
 parser.add_argument("--num_infer_steps", type=int, default=200)
+parser.add_argument("--num_candidates", type=int, default=4)
 parser.add_argument('--sample-rate', type=int, default=16000)
 # random seed
 parser.add_argument('--random-seed', type=int, default=42, help="Fixed seed")
@@ -71,11 +73,11 @@ local_dir = snapshot_download(
     repo_id="OpenSound/SoloSpeech-models"
 )
 args.tse_config = os.path.join(local_dir, "config_extractor.yaml")
-args.tsr_config = os.path.join(local_dir, "config_tsr.yaml")
+# args.tsr_config = os.path.join(local_dir, "config_tsr.yaml")
 args.vae_config = os.path.join(local_dir, "config_compressor.json")
 args.autoencoder_path = os.path.join(local_dir, "compressor.ckpt")
 args.tse_ckpt = os.path.join(local_dir, "extractor.pt")
-args.tsr_ckpt = os.path.join(local_dir, "tsr.pt")
+# args.tsr_ckpt = os.path.join(local_dir, "tsr.pt")
 args.geco_ckpt = os.path.join(local_dir, "corrector.ckpt")
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -84,8 +86,8 @@ print(f"Device: {device}")
 print("Loading models...")
 with open(args.tse_config, 'r') as fp:
     args.tse_config = yaml.safe_load(fp)
-with open(args.tsr_config, 'r') as fp:
-    args.tsr_config = yaml.safe_load(fp)
+# with open(args.tsr_config, 'r') as fp:
+#     args.tsr_config = yaml.safe_load(fp)
 args.v_prediction = args.tse_config["ddim"]["v_prediction"]
 # load compressor
 autoencoder = Autoencoder(args.autoencoder_path, args.vae_config, 'stft_vae', quantization_first=True)
@@ -98,12 +100,12 @@ tse_model = SoloSpeech_TSE(
 ).to(device)
 tse_model.load_state_dict(torch.load(args.tse_ckpt)['model'])
 tse_model.eval()
-# load tsr model
-tsr_model = SoloSpeech_TSR(
-    args.tsr_config['diffwrap']['UDiT']
-).to(device)
-tsr_model.load_state_dict(torch.load(args.tsr_ckpt)['model'])
-tsr_model.eval()
+# # load tsr model
+# tsr_model = SoloSpeech_TSR(
+#     args.tsr_config['diffwrap']['UDiT']
+# ).to(device)
+# tsr_model.load_state_dict(torch.load(args.tsr_ckpt)['model'])
+# tsr_model.eval()
 # load corrector
 geco_model = ScoreModel.load_from_checkpoint(
     args.geco_ckpt,
@@ -113,7 +115,7 @@ geco_model.eval(no_ema=False)
 geco_model.cuda()
 # load sid model
 ecapatdnn_model = Encoder.from_hparams(source="yangwang825/ecapa-tdnn-vox2")
-cosine_sim = torch.nn.CosineSimilarity(dim=-1)
+# cosine_sim = torch.nn.CosineSimilarity(dim=-1)
 # load diffusion tools
 noise_scheduler = DDIMScheduler(**args.tse_config["ddim"]['diffusers'])
 # these steps reset dtype of noise_scheduler params
@@ -128,7 +130,7 @@ _ = noise_scheduler.add_noise(latents, noise, timesteps)
 
 
 @spaces.GPU
-def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
+def sample_diffusion(tse_model, autoencoder, std, scheduler, device,
                      mixture=None, reference=None, lengths=None, reference_lengths=None,
                      ddim_steps=50, eta=0, seed=2025
                      ):
@@ -136,7 +138,7 @@ def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
     generator = torch.Generator(device=device).manual_seed(seed)
     scheduler.set_timesteps(ddim_steps)
     tse_pred = torch.randn(mixture.shape, generator=generator, device=device)
-    tsr_pred = torch.randn(mixture.shape, generator=generator, device=device)
+    # tsr_pred = torch.randn(mixture.shape, generator=generator, device=device)
 
     for t in scheduler.timesteps:
         tse_pred = scheduler.scale_model_input(tse_pred, t)
@@ -151,22 +153,22 @@ def sample_diffusion(tse_model, tsr_model, autoencoder, std, scheduler, device,
         tse_pred = scheduler.step(model_output=model_output, timestep=t, sample=tse_pred,
                                   eta=eta, generator=generator).prev_sample
 
-    for t in scheduler.timesteps:
-        tsr_pred = scheduler.scale_model_input(tsr_pred, t)
-        model_output, _ = tsr_model(
-            x=tsr_pred,
-            timesteps=t,
-            mixture=mixture,
-            reference=tse_pred,
-            x_len=lengths,
-        )
-        tsr_pred = scheduler.step(model_output=model_output, timestep=t, sample=tsr_pred,
-                                  eta=eta, generator=generator).prev_sample
+    # for t in scheduler.timesteps:
+    #     tsr_pred = scheduler.scale_model_input(tsr_pred, t)
+    #     model_output, _ = tsr_model(
+    #         x=tsr_pred,
+    #         timesteps=t,
+    #         mixture=mixture,
+    #         reference=tse_pred,
+    #         x_len=lengths,
+    #     )
+    #     tsr_pred = scheduler.step(model_output=model_output, timestep=t, sample=tsr_pred,
+    #                               eta=eta, generator=generator).prev_sample
 
     tse_pred = autoencoder(embedding=tse_pred.transpose(2,1), std=std).squeeze(1)
-    tsr_pred = autoencoder(embedding=tsr_pred.transpose(2,1), std=std).squeeze(1)
+    # tsr_pred = autoencoder(embedding=tsr_pred.transpose(2,1), std=std).squeeze(1)
 
-    return tse_pred, tsr_pred
+    return tse_pred
 
 @spaces.GPU
 def tse(test_wav, enroll_wav):
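For orientation: the surviving extractor loop above is a standard diffusers DDIM sampler. Below is a minimal, self-contained sketch of that pattern; toy_model is a hypothetical stand-in for the TSE network, whose real forward pass also takes the mixture, reference, and length tensors.

import torch
from diffusers import DDIMScheduler

def toy_model(x, t):
    # hypothetical denoiser: the real tse_model predicts from (x, t, mixture, reference, lengths)
    return torch.zeros_like(x)

scheduler = DDIMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)                        # ddim_steps
generator = torch.Generator().manual_seed(42)
x = torch.randn(1, 128, 64, generator=generator)   # start from pure noise, (batch, frames, dim)

for t in scheduler.timesteps:
    x = scheduler.scale_model_input(x, t)
    model_output = toy_model(x, t)
    # eta=0 is deterministic DDIM; eta>0 injects fresh noise at each step
    x = scheduler.step(model_output=model_output, timestep=t,
                       sample=x, eta=0.0, generator=generator).prev_sample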
@@ -179,19 +181,20 @@ def tse(test_wav, enroll_wav):
     with torch.no_grad():
         # compressor
         reference, _ = autoencoder(audio=reference.unsqueeze(1))
-        reference_lengths = torch.LongTensor([reference.shape[-1]]).to(device)
+        reference_lengths = torch.LongTensor([reference.shape[-1]] * args.num_candidates).to(device)
         mixture_input = torch.tensor(mixture).unsqueeze(0).to(device)
         mixture_wav = mixture_input
         mixture_input, std = autoencoder(audio=mixture_input.unsqueeze(1))
-        lengths = torch.LongTensor([mixture_input.shape[-1]]).to(device)
+        lengths = torch.LongTensor([mixture_input.shape[-1]] * args.num_candidates).to(device)
         # extractor
-        tse_pred, tsr_pred = sample_diffusion(tse_model, tsr_model, autoencoder, std, noise_scheduler, device, mixture_input.transpose(2,1), reference.transpose(2,1), lengths, reference_lengths, ddim_steps=args.num_infer_steps, eta=args.eta, seed=args.random_seed)
-        ecapatdnn_embedding1 = ecapatdnn_model.encode_batch(tse_pred.squeeze()).squeeze()
-        ecapatdnn_embedding2 = ecapatdnn_model.encode_batch(tsr_pred.squeeze()).squeeze()
-        ecapatdnn_embedding3 = ecapatdnn_model.encode_batch(torch.tensor(reference_wav)).squeeze()
-        sim1 = cosine_sim(ecapatdnn_embedding1, ecapatdnn_embedding3).item()
-        sim2 = cosine_sim(ecapatdnn_embedding2, ecapatdnn_embedding3).item()
-        pred = tse_pred if sim1 > sim2 else tsr_pred
+        mixture_input = mixture_input.repeat(args.num_candidates, 1, 1)
+        reference = reference.repeat(args.num_candidates, 1, 1)
+        tse_pred = sample_diffusion(tse_model, autoencoder, std, noise_scheduler, device, mixture_input.transpose(2,1), reference.transpose(2,1), lengths, reference_lengths, ddim_steps=args.num_infer_steps, eta=args.eta, seed=args.random_seed)
+        ecapatdnn_embedding_pred = ecapatdnn_model.encode_batch(tse_pred).squeeze()
+        ecapatdnn_embedding_ref = ecapatdnn_model.encode_batch(torch.tensor(reference_wav)).squeeze()
+        cos_sims = F.cosine_similarity(ecapatdnn_embedding_pred, ecapatdnn_embedding_ref.unsqueeze(0), dim=1)
+        _, max_idx = torch.max(cos_sims, dim=0)
+        pred = tse_pred[max_idx].unsqueeze(0)
         # corrector
         min_leng = min(pred.shape[-1], mixture_wav.shape[-1])
         x = pred[...,:min_leng]
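The candidate selection added in the last hunk is easy to isolate. A minimal sketch, where pick_best_candidate and embed are hypothetical names (embed stands in for ecapatdnn_model.encode_batch, returning one speaker embedding per waveform):

import torch
import torch.nn.functional as F

def pick_best_candidate(candidates, reference_wav, embed):
    # candidates: (N, T) waveforms; reference_wav: (T_ref,) enrollment audio
    cand_emb = embed(candidates).squeeze()                  # (N, D) speaker embeddings
    ref_emb = embed(reference_wav.unsqueeze(0)).squeeze()   # (D,)
    cos_sims = F.cosine_similarity(cand_emb, ref_emb.unsqueeze(0), dim=1)  # (N,)
    _, max_idx = torch.max(cos_sims, dim=0)
    return candidates[max_idx].unsqueeze(0)                 # keep a batch dim, as app.py does

Since all candidates share one seed, they differ only through the per-row noise drawn by the batched torch.randn (plus per-step noise when eta > 0); that spread is what the similarity test selects over.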
 