werning committed
Commit cbdb41f · 1 Parent(s): 948a63d

Adapt code for gradio

app.py CHANGED
@@ -1,28 +1,177 @@
-import numpy as np
+import numpy as np
+from pathlib import Path
+import padertorch as pt
+import paderbox as pb
+import time
+import torch
+import torchaudio
+from onnxruntime import InferenceSession
+from pvq_manipulation.models.vits import Vits_NT
+from pvq_manipulation.models.ffjord import FFJORD
+from IPython.display import display, Audio, clear_output
+from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
+import librosa
+from pvq_manipulation.helper.vad import EnergyVAD
 import gradio as gr
 
-# gradio generate tone example
-
-notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
-
-def generate_tone(note, octave, duration):
-    sr = 48000
-    a4_freq, tones_from_a4 = 440, 12 * (octave - 4) + (note - 9)
-    frequency = a4_freq * 2 ** (tones_from_a4 / 12)
-    duration = int(duration)
-    audio = np.linspace(0, duration, duration * sr)
-    audio = (20000 * np.sin(audio * (2 * np.pi * frequency))).astype(np.int16)
-    return sr, audio
-
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+# load tts model
+storage_dir_tts = Path("./models/tts_model/")
+tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
+
+# load normalizing flow
+storage_dir_normalizing_flow = Path("./models/norm_flow")
+speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json")
+
+normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
+
+# load hubert features model
+hubert_model = HubertExtractor(
+    layer=SID_LARGE_LAYER,
+    model_name="HUBERT_LARGE",
+    backend="torchaudio",
+    device=device,
+    # storage_dir=  # target storage dir hubert model
+)
+
+# example synthesis
+# speaker_id = 1034
+# example_id = "1034_121119_000028_000001"
+
+# wav_1 = tts_model.synthesize_from_example({
+#     'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+#     'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth"
+# })
+# display(Audio(wav_1, rate=24_000, normalize=True))
+
+# manipulation block
+def get_manipulation(
+    d_vector,
+    labels,
+    flow,
+    tts_model,
+    manipulation_idx=0,
+    manipulation_fkt=1,
+):
+    labels_manipulated = labels.clone()
+    labels_manipulated[:, manipulation_idx] += manipulation_fkt
+
+    output_forward = flow.forward((d_vector.float(), labels))[0]
+    sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
+
+    wav = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+        'd_vector_man': sampled_class_manipulated.detach().numpy(),
+    })
+    return wav
+
+def extract_speaker_embedding(example):
+    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
+    observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)
+
+    vad = EnergyVAD(sample_rate=16_000)
+    if observation.ndim == 1:
+        observation = observation[None, :]
+
+    observation = vad({'audio_data': observation})['audio_data']
+
+    with torch.no_grad():
+        example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example})
+        example = pt.data.utils.collate_fn([example])
+        example['features'] = torch.tensor(np.array(example['features']))
+        d_vector = tts_model.speaker_manager.forward(example)[0]
+    return d_vector
+
+# load speaker labels
+def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')):
+    audio, _ = torchaudio.load(example['audio_path']['observation'])
+    audio = audio.to(device)
+    num_samples = torch.tensor([audio.shape[-1]], device=device)
+
+    providers = ["CPUExecutionProvider"]
+
+    with torch.no_grad():
+        features, seq_len = hubert_model(
+            audio,
+            24_000,
+            sequence_lengths=num_samples,
+        )
+        features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
+
+    pvqd_predictions = {}
+    for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']:
+        with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
+            onnx = fid.read()
+        sess = InferenceSession(onnx, providers=providers)
+        pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
+        pvqd_predictions[pvq] = pred.tolist()[0]
+    labels = []
+    for key in speaker_conditioning:
+        labels.append(pvqd_predictions[key] / 100)
+    return torch.tensor(labels)
+
+
+example = {
+    'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"},
+    'speaker_id': 1034,
+    'example_id': "1034_121119_000028_000001",
+}
+
+labels = load_speaker_labels(example, speaker_conditioning)
+label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
+
+# print('Estimated PVQ strengths of input speaker:')
+# max_len = max(len(name) for name in label_options)
+# for label_name, pvq in zip(label_options, labels):
+#     print(f'{label_name:<{max_len}} : {pvq:6.2f}')
+
+
+def update_manipulation(manipulation_idx, manipulation_fkt):
+
+    d_vector = extract_speaker_embedding(example)
+    labels = load_speaker_labels(example, speaker_conditioning)
+
+    wav_manipulated = get_manipulation(
+        # example=example,
+        d_vector=d_vector,
+        labels=labels[None, :],
+        flow=normalizing_flow,
+        tts_model=tts_model,
+        manipulation_idx=manipulation_idx,
+        manipulation_fkt=manipulation_fkt,
+    )
+
+    wav_unmanipulated = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+    })
+    sr = 24_000
+    return (sr, wav_unmanipulated), (sr, wav_manipulated)
+
+# with audio_output:
+#     clear_output(wait=True)
+#     print('Manipulated Speaker')
+#     display(Audio(wav_manipulated, rate=24_000, normalize=True))
+#     print('Unmanipulated Synthesis')
+#     display(Audio(wav_unmanipulated, rate=24_000, normalize=True))
+#     print('Original Speaker')
+#     display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))
+
+# print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}")
+
+
+dropdown_options = [(label, i) for i, label in enumerate(label_options)]
 demo = gr.Interface(
+    update_manipulation,
+    inputs=[
+        gr.Dropdown(dropdown_options, value=2, type="index"),
+        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
     ],
+    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
 )
+
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
 
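Note: the new `update_manipulation` endpoint follows Gradio's convention of returning one `(sample_rate, numpy_array)` tuple per `gr.Audio` output. A minimal, self-contained sketch of the same wiring, with a hypothetical `synthesize_stub` (and placeholder tone signals) standing in for the TTS pipeline:

import numpy as np
import gradio as gr

label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']

def synthesize_stub(manipulation_idx, manipulation_fkt):
    # Stand-in for update_manipulation: returns two (sample_rate, waveform)
    # tuples, one per gr.Audio output. Tones replace the real TTS output.
    sr = 24_000
    t = np.linspace(0, 1, sr)
    original = 0.5 * np.sin(2 * np.pi * 220 * t)
    shifted = 220 * (1 + 0.05 * (manipulation_idx + 1) * manipulation_fkt)
    manipulated = 0.5 * np.sin(2 * np.pi * shifted * t)
    return (sr, original.astype(np.float32)), (sr, manipulated.astype(np.float32))

demo = gr.Interface(
    synthesize_stub,
    inputs=[
        # (label, value) tuples; type="index" passes the selected index to the fn
        gr.Dropdown([(label, i) for i, label in enumerate(label_options)], value=2, type="index"),
        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
    ],
    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
)

if __name__ == "__main__":
    demo.launch()
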
pvq_manipulation/models/ffjord.py CHANGED
@@ -93,9 +93,10 @@ class FFJORD(Model):
     This class is an implementation of the FFJORD model as proposed in
     https://arxiv.org/pdf/1810.01367
     """
-    def __init__(self, ode_function, normalize=True):
+    def __init__(self, ode_function, normalize=True, device=None):
         super().__init__()
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.input_dim = ode_function.input_dim
         self.time_deriv_func = ODEBlock(ode_function=ode_function)
         self.latent_dist = torch.distributions.MultivariateNormal(
@@ -108,15 +109,18 @@
         self.output_norm = MovingBatchNorm1d(self.input_dim, bn_lag=0)
 
     @staticmethod
-    def load_model(model_path, checkpoint):
-        model_dict = pb.io.load_yaml(model_path / "config.yaml")
-        model = Model.from_config(model_dict['model'])
+    def load_model(model_path, checkpoint, device=None):
+        if device is None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        model_dict = pb.io.load(model_path / "config.json")
+        model = Model.from_config(model_dict)
         cp = torch.load(
             model_path / checkpoint,
-            map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            map_location=device,
+            weights_only=True
         )
         model_weights = cp.copy()
-        model.load_state_dict(model_weights['model'])
+        model.load_state_dict(model_weights)
         model.eval()
         return model
 
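Note: `FFJORD.load_model` now takes an injectable `device` and loads with `weights_only=True`, which requires the checkpoint to be a plain state dict (hence `load_state_dict(model_weights)` instead of the old `model_weights['model']`). A caller-side sketch, assuming the repo's `./models/norm_flow` layout with a `config.json` and `model.pt`:

from pathlib import Path
import torch
from pvq_manipulation.models.ffjord import FFJORD

# Pick the device once and pass it down, mirroring the new app.py usage.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
flow = FFJORD.load_model(Path("./models/norm_flow"), checkpoint="model.pt", device=device)
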
pvq_manipulation/models/vits.py CHANGED
@@ -154,7 +154,8 @@ class Vits_NT(Vits):
     def init_from_config(
         config: "VitsConfig",
         samples= None,
-        verbose=True
+        verbose=True,
+        device=None
     ):
         """
         Initiate model from config
@@ -165,17 +166,20 @@
         Returns:
             model (Vits): Initialized model.
         """
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
         upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (upsample_rate == config.audio.hop_length), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
         ap = AudioProcessor.init_from_config(config, verbose=verbose)
         tokenizer, new_config = TTSTokenizer.init_from_config(config)
         language_manager = LanguageManager.init_from_config(config)
-        speaker_manager = pt.Module.from_storage_dir(
-            config['d_vector_model_file'],
-            checkpoint_name='ckpt_latest.pth',
-            consider_mpi=False,
-            config_name='config.json',
-        )
+        speaker_manager_config = pb.io.load(Path(config['d_vector_model_file']) / 'config.json')
+
+        speaker_manager = pt.Configurable.from_config(speaker_manager_config)
+
+        speaker_manager.load_state_dict(torch.load(Path(config['d_vector_model_file']) / "model.pt", weights_only=True, map_location=device))
+
         speaker_manager.num_speakers = config['num_speakers']
         for param in speaker_manager.parameters():
             param.requires_grad = False
@@ -343,7 +347,7 @@
         return outputs
 
     @staticmethod
-    def load_model(model_path, checkpoint):
+    def load_model(model_path, checkpoint, device='cpu'):
         """
         Load model from checkpoint
 
@@ -369,11 +373,10 @@
             **config,
         )
         model = Vits_NT.init_from_config(config)
-        cp = torch.load(
+        model_weights = torch.load(
             model_path / checkpoint,
-            map_location=torch.device('cpu')
+            map_location=torch.device(device)
         )
-        model_weights = cp['model'].copy()
         model.load_state_dict(model_weights, strict=False)
         model.eval()
         return model
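Note: the speaker manager is no longer restored via `pt.Module.from_storage_dir` (which expected a `ckpt_latest.pth`) but through an explicit config-plus-state-dict load. A generic sketch of that pattern, with a hypothetical model directory in place of `config['d_vector_model_file']`:

from pathlib import Path
import paderbox as pb
import padertorch as pt
import torch

model_dir = Path("./models/d_vector_model")  # hypothetical stand-in for config['d_vector_model_file']

# Rebuild the module from its serialized config, then restore the weights.
speaker_manager_config = pb.io.load(model_dir / "config.json")
speaker_manager = pt.Configurable.from_config(speaker_manager_config)
speaker_manager.load_state_dict(
    torch.load(model_dir / "model.pt", weights_only=True, map_location="cpu")
)
speaker_manager.eval()
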
requirements.txt CHANGED
@@ -1,4 +1,6 @@
 wheel
+gradio
+pydantic==2.10.6  # gradio errors with pydantic 2.11
 paderbox
 padertorch
 onnxruntime
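Note: the `pydantic` pin works around a gradio incompatibility with pydantic 2.11. An optional startup check (not part of this commit) that fails fast if the pin is not honored could look like this:

from importlib.metadata import version

# Hypothetical guard: requirements.txt pins pydantic because gradio is
# reported to error with pydantic 2.11.
assert version("pydantic") == "2.10.6", (
    f"Expected pydantic 2.10.6 (see requirements.txt), got {version('pydantic')}"
)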