Adapt code for gradio
- app.py +167 -18
- pvq_manipulation/models/ffjord.py +11 -7
- pvq_manipulation/models/vits.py +14 -11
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,28 +1,177 @@
-import numpy as np
 import gradio as gr
 
-
 
-
 
-
-
-
-    frequency = a4_freq * 2 ** (tones_from_a4 / 12)
-    duration = int(duration)
-    audio = np.linspace(0, duration, duration * sr)
-    audio = (20000 * np.sin(audio * (2 * np.pi * frequency))).astype(np.int16)
-    return sr, audio
 
 demo = gr.Interface(
-
-    [
-        gr.Dropdown(
-        gr.Slider(
-        gr.Textbox(value="1", label="Duration in seconds"),
     ],
-    "
 )
 if __name__ == "__main__":
-    demo.launch()
 
+import numpy as np
+from pathlib import Path
+import padertorch as pt
+import paderbox as pb
+import time
+import torch
+import torchaudio
+from onnxruntime import InferenceSession
+from pvq_manipulation.models.vits import Vits_NT
+from pvq_manipulation.models.ffjord import FFJORD
+from IPython.display import display, Audio, clear_output
+from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
+import librosa
+from pvq_manipulation.helper.vad import EnergyVAD
 import gradio as gr
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
+# load tts model
+storage_dir_tts = Path("./models/tts_model/")
+tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
 
+# load normalizing flow
+storage_dir_normalizing_flow = Path("./models/norm_flow")
+speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json")
 
+normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
+
+# load hubert features model
+hubert_model = HubertExtractor(
+    layer=SID_LARGE_LAYER,
+    model_name="HUBERT_LARGE",
+    backend="torchaudio",
+    device=device,
+    # storage_dir= # target storage dir hubert model
+)
+
+# example synthesis
+# speaker_id = 1034
+# example_id = "1034_121119_000028_000001"
+
+# wav_1 = tts_model.synthesize_from_example({
+#     'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+#     'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth"
+# })
+# display(Audio(wav_1, rate=24_000, normalize=True))
+
+# manipulation block
+def get_manipulation(
+    d_vector,
+    labels,
+    flow,
+    tts_model,
+    manipulation_idx=0,
+    manipulation_fkt=1,
+):
+    labels_manipulated = labels.clone()
+    labels_manipulated[:, manipulation_idx] += manipulation_fkt
+
+    output_forward = flow.forward((d_vector.float(), labels))[0]
+    sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
+
+    wav = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+        'd_vector_man': sampled_class_manipulated.detach().numpy(),
+    })
+    return wav
+
+def extract_speaker_embedding(example):
+    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
+    observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)
+
+    vad = EnergyVAD(sample_rate=16_000)
+    if observation.ndim == 1:
+        observation = observation[None, :]
+
+    observation = vad({'audio_data': observation})['audio_data']
+
+    with torch.no_grad():
+        example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example})
+        example = pt.data.utils.collate_fn([example])
+        example['features'] = torch.tensor(np.array(example['features']))
+        d_vector = tts_model.speaker_manager.forward(example)[0]
+    return d_vector
+
+# load speaker labels
+def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')):
+    audio, _ = torchaudio.load(example['audio_path']['observation'])
+    audio = audio.to(device)
+    num_samples = torch.tensor([audio.shape[-1]], device=device)
+
+    providers = ["CPUExecutionProvider"]
+
+    with torch.no_grad():
+        features, seq_len = hubert_model(
+            audio,
+            24_000,
+            sequence_lengths=num_samples,
+        )
+    features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
+
+    pvqd_predictions = {}
+    for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']:
+        with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
+            onnx = fid.read()
+        sess = InferenceSession(onnx, providers=providers)
+        pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
+        pvqd_predictions[pvq] = pred.tolist()[0]
+    labels = []
+    for key in speaker_conditioning:
+        labels.append(pvqd_predictions[key]/100)
+    return torch.tensor(labels)
+
+
+example = {
+    'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"},
+    'speaker_id': 1034,
+    'example_id': "1034_121119_000028_000001",
+}
+
+labels = load_speaker_labels(example, speaker_conditioning)
+label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
+
+# print('Estimated PVQ strengths of input speaker:')
+# max_len = max(len(name) for name in label_options)
+# for label_name, pvq in zip(label_options, labels):
+#     print(f'{label_name:<{max_len}} : {pvq:6.2f}')
+
+
+def update_manipulation(manipulation_idx, manipulation_fkt):
+
+    d_vector = extract_speaker_embedding(example)
+    labels = load_speaker_labels(example, speaker_conditioning)
+
+    wav_manipulated = get_manipulation(
+        # example=example,
+        d_vector=d_vector,
+        labels=labels[None, :],
+        flow=normalizing_flow,
+        tts_model=tts_model,
+        manipulation_idx=manipulation_idx,
+        manipulation_fkt=manipulation_fkt,
+    )
+
+    wav_unmanipulated = tts_model.synthesize_from_example({
+        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        'd_vector': d_vector.detach().numpy(),
+    })
+    sr = 24_000
+    return (sr, wav_unmanipulated), (sr, wav_manipulated)
+
+    # with audio_output:
+    #     clear_output(wait=True)
+    #     print('Manipulated Speaker')
+    #     display(Audio(wav_manipulated, rate=24_000, normalize=True))
+    #     print('Unmanipulated Synthesis')
+    #     display(Audio(wav_unmanipulated, rate=24_000, normalize=True))
+    #     print('Original Speaker')
+    #     display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))
+
+    # print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}")
+
+
+dropdown_options = [(label, i) for i, label in enumerate(label_options)]
 demo = gr.Interface(
+    update_manipulation,
+    inputs=[
+        gr.Dropdown(dropdown_options, value=2, type="index"),
+        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
     ],
+    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
 )
+
 if __name__ == "__main__":
+    demo.launch(share=True)
 
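The Gradio wiring above can be tried without the TTS, flow, and HuBERT checkpoints. The sketch below is an illustration only, not part of this commit: it keeps the same Interface layout but swaps update_manipulation for a sine-tone stub (stub_manipulation is a hypothetical name), to show how the Dropdown's type="index" selection and the Slider value reach the callback and how the two (sr, wav) tuples map onto the two gr.Audio outputs.

# Hedged sketch, not part of the commit: the same gr.Interface wiring as app.py with a
# stub callback, so it runs with only numpy and gradio installed.
import numpy as np
import gradio as gr

label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']

def stub_manipulation(manipulation_idx, manipulation_fkt):
    # manipulation_idx arrives as an int because the Dropdown uses type="index";
    # manipulation_fkt is the float value of the Slider.
    sr = 24_000
    t = np.linspace(0, 1, sr, endpoint=False)
    wav = np.sin(2 * np.pi * 440 * t)  # stands in for the unmanipulated synthesis
    wav_man = np.sin(2 * np.pi * 440 * (1 + 0.05 * manipulation_fkt) * t)  # stands in for the manipulated one
    return (sr, wav), (sr, wav_man)

demo = gr.Interface(
    stub_manipulation,
    inputs=[
        gr.Dropdown([(label, i) for i, label in enumerate(label_options)], value=2, type="index"),
        gr.Slider(minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
    ],
    outputs=[gr.Audio(label="original"), gr.Audio(label="manipulated")],
)

if __name__ == "__main__":
    demo.launch()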
pvq_manipulation/models/ffjord.py
CHANGED
@@ -93,9 +93,10 @@ class FFJORD(Model):
     This class is an implementation of the FFJORD model as proposed in
     https://arxiv.org/pdf/1810.01367
     """
-    def __init__(self, ode_function, normalize=True):
+    def __init__(self, ode_function, normalize=True, device=None):
         super().__init__()
-
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.input_dim = ode_function.input_dim
         self.time_deriv_func = ODEBlock(ode_function=ode_function)
         self.latent_dist = torch.distributions.MultivariateNormal(

@@ -108,15 +109,18 @@ class FFJORD(Model):
         self.output_norm = MovingBatchNorm1d(self.input_dim, bn_lag=0)

     @staticmethod
-    def load_model(model_path, checkpoint):
-
-
+    def load_model(model_path, checkpoint, device=None):
+        if device is None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        model_dict = pb.io.load(model_path / "config.json")
+        model = Model.from_config(model_dict)
         cp = torch.load(
             model_path / checkpoint,
-            map_location=
+            map_location=device,
+            weights_only=True
         )
         model_weights = cp.copy()
-        model.load_state_dict(model_weights
+        model.load_state_dict(model_weights)
         model.eval()
         return model

CHANGED
@@ -154,7 +154,8 @@ class Vits_NT(Vits):
|
     def init_from_config(
         config: "VitsConfig",
         samples= None,
-        verbose=True
+        verbose=True,
+        device=None
     ):
         """
         Initiate model from config

@@ -165,17 +166,20 @@ class Vits_NT(Vits):
         Returns:
             model (Vits): Initialized model.
         """
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
         upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (upsample_rate == config.audio.hop_length), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
         ap = AudioProcessor.init_from_config(config, verbose=verbose)
         tokenizer, new_config = TTSTokenizer.init_from_config(config)
         language_manager = LanguageManager.init_from_config(config)
-
-
-
-
-
-
+        speaker_manager_config = pb.io.load(Path(config['d_vector_model_file'])/'config.json')
+
+        speaker_manager = pt.Configurable.from_config(speaker_manager_config)
+
+        speaker_manager.load_state_dict(torch.load(Path(config['d_vector_model_file'])/"model.pt", weights_only=True, map_location=device))
+
         speaker_manager.num_speakers = config['num_speakers']
         for param in speaker_manager.parameters():
             param.requires_grad = False

@@ -343,7 +347,7 @@ class Vits_NT(Vits):
         return outputs

     @staticmethod
-    def load_model(model_path, checkpoint):
+    def load_model(model_path, checkpoint, device='cpu'):
         """
         Load model from checkpoint

@@ -369,11 +373,10 @@ class Vits_NT(Vits):
             **config,
         )
         model = Vits_NT.init_from_config(config)
-
+        model_weights = torch.load(
             model_path / checkpoint,
-            map_location=torch.device(
+            map_location=torch.device(device)
         )
-        model_weights = cp['model'].copy()
         model.load_state_dict(model_weights, strict=False)
         model.eval()
         return model

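Likewise, the TTS model can now be pinned to a device at load time; app.py keeps the default device='cpu', but an explicit call would look like this sketch:

# Usage sketch: loading the VITS checkpoint with an explicit device via the new argument.
from pathlib import Path
from pvq_manipulation.models.vits import Vits_NT

tts_model = Vits_NT.load_model(Path("./models/tts_model/"), "model.pt", device='cpu')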
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
 wheel
+gradio
+pydantic==2.10.6  # errors with gradio for 2.11
 paderbox
 padertorch
 onnxruntime