Adapt code for gradio
- app.py +167 -18
- pvq_manipulation/models/ffjord.py +11 -7
- pvq_manipulation/models/vits.py +14 -11
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,28 +1,177 @@
-import numpy as np
 import gradio as gr
 
-
 
-
 
-
-
-
-    frequency = a4_freq * 2 ** (tones_from_a4 / 12)
-    duration = int(duration)
-    audio = np.linspace(0, duration, duration * sr)
-    audio = (20000 * np.sin(audio * (2 * np.pi * frequency))).astype(np.int16)
-    return sr, audio
 
 demo = gr.Interface(
-
-    [
-        gr.Dropdown(
-        gr.Slider(
-        gr.Textbox(value="1", label="Duration in seconds"),
     ],
-    "
 )
 if __name__ == "__main__":
-    demo.launch()
 
+import numpy as np
+from pathlib import Path
+import padertorch as pt
+import paderbox as pb
+import time
+import torch
+import torchaudio
+from onnxruntime import InferenceSession
+from pvq_manipulation.models.vits import Vits_NT
+from pvq_manipulation.models.ffjord import FFJORD
+from IPython.display import display, Audio, clear_output
+from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
+import librosa
+from pvq_manipulation.helper.vad import EnergyVAD
 import gradio as gr
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
+# load tts model
+storage_dir_tts = Path("./models/tts_model/")
+tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
 
+# load normalizing flow
+storage_dir_normalizing_flow = Path("./models/norm_flow")
+speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json")
 
+normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
+
+# load hubert features model
+hubert_model = HubertExtractor(
+    layer=SID_LARGE_LAYER,
+    model_name="HUBERT_LARGE",
+    backend="torchaudio",
+    device=device,
+    # storage_dir= # target storage dir hubert model
+)
+
+# example synthesis
+# speaker_id = 1034
+# example_id = "1034_121119_000028_000001"
+
+# wav_1 = tts_model.synthesize_from_example({
+#     'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+#     'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth"
+# })
+# display(Audio(wav_1, rate=24_000, normalize=True))
+
+# manipulation block
+def get_manipulation(
+    d_vector,
+    labels,
+    flow,
+    tts_model,
+    manipulation_idx=0,
+    manipulation_fkt=1,
+):
+    labels_manipulated = labels.clone()
+    labels_manipulated[:, manipulation_idx] += manipulation_fkt
+
+    output_forward = flow.forward((d_vector.float(), labels))[0]
+    sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
+
+    wav = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+        'd_vector_man': sampled_class_manipulated.detach().numpy(),
+    })
+    return wav
+
+def extract_speaker_embedding(example):
+    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
+    observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)
+
+    vad = EnergyVAD(sample_rate=16_000)
+    if observation.ndim == 1:
+        observation = observation[None, :]
+
+    observation = vad({'audio_data': observation})['audio_data']
+
+    with torch.no_grad():
+        example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example})
+        example = pt.data.utils.collate_fn([example])
+        example['features'] = torch.tensor(np.array(example['features']))
+        d_vector = tts_model.speaker_manager.forward(example)[0]
+    return d_vector
+
+# load speaker labels
+def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')):
+    audio, _ = torchaudio.load(example['audio_path']['observation'])
+    audio = audio.to(device)
+    num_samples = torch.tensor([audio.shape[-1]], device=device)
+
+    providers = ["CPUExecutionProvider"]
+
+    with torch.no_grad():
+        features, seq_len = hubert_model(
+            audio,
+            24_000,
+            sequence_lengths=num_samples,
+        )
+    features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
+
+    pvqd_predictions = {}
+    for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']:
+        with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
+            onnx = fid.read()
+        sess = InferenceSession(onnx, providers=providers)
+        pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
+        pvqd_predictions[pvq] = pred.tolist()[0]
+    labels = []
+    for key in speaker_conditioning:
+        labels.append(pvqd_predictions[key]/100)
+    return torch.tensor(labels)
+
+
+example = {
+    'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"},
+    'speaker_id': 1034,
+    'example_id': "1034_121119_000028_000001",
+}
+
+labels = load_speaker_labels(example, speaker_conditioning)
+label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
+
+# print('Estimated PVQ strengths of input speaker:')
+# max_len = max(len(name) for name in label_options)
+# for label_name, pvq in zip(label_options, labels):
+#     print(f'{label_name:<{max_len}} : {pvq:6.2f}')
+
+
+def update_manipulation(manipulation_idx, manipulation_fkt):
+
+    d_vector = extract_speaker_embedding(example)
+    labels = load_speaker_labels(example, speaker_conditioning)
+
+    wav_manipulated = get_manipulation(
+        # example=example,
+        d_vector=d_vector,
+        labels=labels[None, :],
+        flow=normalizing_flow,
+        tts_model=tts_model,
+        manipulation_idx=manipulation_idx,
+        manipulation_fkt=manipulation_fkt,
+    )
+
+    wav_unmanipulated = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+    })
+    sr = 24_000
+    return (sr, wav_unmanipulated), (sr, wav_manipulated)
+
+    # with audio_output:
+    #     clear_output(wait=True)
+    #     print('Manipulated Speaker')
+    #     display(Audio(wav_manipulated, rate=24_000, normalize=True))
+    #     print('Unmanipulated Synthesis')
+    #     display(Audio(wav_unmanipulated, rate=24_000, normalize=True))
+    #     print('Original Speaker')
+    #     display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))
+
+    # print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}")
+
+
+dropdown_options = [(label, i) for i, label in enumerate(label_options)]
 demo = gr.Interface(
+    update_manipulation,
+    inputs=[
+        gr.Dropdown(dropdown_options, value=2, type="index"),
+        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
     ],
+    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
 )
+
 if __name__ == "__main__":
+    demo.launch(share=True)
 
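The Gradio wiring above can be tried without the TTS, flow, and HuBERT checkpoints. The sketch below is an illustration only, not part of this commit: it keeps the same Interface layout but swaps update_manipulation for a sine-tone stub (stub_manipulation is a hypothetical name), to show how the Dropdown's type="index" selection and the Slider value reach the callback and how the two (sr, wav) tuples map onto the two gr.Audio outputs.

# Hedged sketch, not part of the commit: the same gr.Interface wiring as app.py with a
# stub callback, so it runs with only numpy and gradio installed.
import numpy as np
import gradio as gr

label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']

def stub_manipulation(manipulation_idx, manipulation_fkt):
    # manipulation_idx arrives as an int because the Dropdown uses type="index";
    # manipulation_fkt is the float value of the Slider.
    sr = 24_000
    t = np.linspace(0, 1, sr, endpoint=False)
    wav = np.sin(2 * np.pi * 440 * t)  # stands in for the unmanipulated synthesis
    wav_man = np.sin(2 * np.pi * 440 * (1 + 0.05 * manipulation_fkt) * t)  # stands in for the manipulated one
    return (sr, wav), (sr, wav_man)

demo = gr.Interface(
    stub_manipulation,
    inputs=[
        gr.Dropdown([(label, i) for i, label in enumerate(label_options)], value=2, type="index"),
        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
    ],
    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
)

if __name__ == "__main__":
    demo.launch()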
pvq_manipulation/models/ffjord.py
CHANGED
@@ -93,9 +93,10 @@ class FFJORD(Model):
     This class is an implementation of the FFJORD model as proposed in
     https://arxiv.org/pdf/1810.01367
     """
-    def __init__(self, ode_function, normalize=True):
+    def __init__(self, ode_function, normalize=True, device=None):
         super().__init__()
-
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.input_dim = ode_function.input_dim
         self.time_deriv_func = ODEBlock(ode_function=ode_function)
         self.latent_dist = torch.distributions.MultivariateNormal(

@@ -108,15 +109,18 @@ class FFJORD(Model):
         self.output_norm = MovingBatchNorm1d(self.input_dim, bn_lag=0)

     @staticmethod
-    def load_model(model_path, checkpoint):
-
-
+    def load_model(model_path, checkpoint, device=None):
+        if device is None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        model_dict = pb.io.load(model_path / "config.json")
+        model = Model.from_config(model_dict)
         cp = torch.load(
             model_path / checkpoint,
-            map_location=
+            map_location=device,
+            weights_only=True
         )
         model_weights = cp.copy()
-        model.load_state_dict(model_weights
+        model.load_state_dict(model_weights)
         model.eval()
         return model

CHANGED
@@ -154,7 +154,8 @@ class Vits_NT(Vits):
|
     def init_from_config(
         config: "VitsConfig",
         samples= None,
-        verbose=True
+        verbose=True,
+        device=None
     ):
         """
         Initiate model from config

@@ -165,17 +166,20 @@ class Vits_NT(Vits):
         Returns:
             model (Vits): Initialized model.
         """
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
         upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (upsample_rate == config.audio.hop_length), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
         ap = AudioProcessor.init_from_config(config, verbose=verbose)
         tokenizer, new_config = TTSTokenizer.init_from_config(config)
         language_manager = LanguageManager.init_from_config(config)
-
-
-
-
-
-
+        speaker_manager_config = pb.io.load(Path(config['d_vector_model_file'])/'config.json')
+
+        speaker_manager = pt.Configurable.from_config(speaker_manager_config)
+
+        speaker_manager.load_state_dict(torch.load(Path(config['d_vector_model_file'])/"model.pt", weights_only=True, map_location=device))
+
         speaker_manager.num_speakers = config['num_speakers']
         for param in speaker_manager.parameters():
             param.requires_grad = False

@@ -343,7 +347,7 @@ class Vits_NT(Vits):
         return outputs

     @staticmethod
-    def load_model(model_path, checkpoint):
+    def load_model(model_path, checkpoint, device='cpu'):
         """
         Load model from checkpoint

@@ -369,11 +373,10 @@ class Vits_NT(Vits):
             **config,
         )
         model = Vits_NT.init_from_config(config)
-
+        model_weights = torch.load(
             model_path / checkpoint,
-            map_location=torch.device(
+            map_location=torch.device(device)
         )
-        model_weights = cp['model'].copy()
         model.load_state_dict(model_weights, strict=False)
         model.eval()
         return model

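Likewise, the TTS model can now be pinned to a device at load time; app.py keeps the default device='cpu', but an explicit call would look like this sketch:

# Usage sketch: loading the VITS checkpoint with an explicit device via the new argument.
from pathlib import Path
from pvq_manipulation.models.vits import Vits_NT

tts_model = Vits_NT.load_model(Path("./models/tts_model/"), "model.pt", device='cpu')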
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
 wheel
+gradio
+pydantic==2.10.6  # errors with gradio for 2.11
 paderbox
 padertorch
 onnxruntime