Commit 7efb86f · debug threads
1 parent: 091b3ba

app.py CHANGED
```diff
@@ -52,6 +52,16 @@ hubert_model = HubertExtractor(
     # storage_dir= # target storage dir hubert model
 )
 
+# load pvq models
+reg_stor_dir = Path('./models/pvq_extractor/')
+onnx_sessions = {}
+for pvq in pvq_labels:
+    onnx_path = reg_stor_dir / f"{pvq}.onnx"
+    onnx_sessions[pvq] = InferenceSession(
+        str(onnx_path),
+        providers=["CPUExecutionProvider"]
+    )
+
 
 def get_manipulation(
     example,
```
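This hunk moves ONNX model loading out of the request path: each `InferenceSession` is now built once at import time and reused. A minimal sketch of the pattern follows; the model path, the feature shape, and the PVQ label name are assumptions taken from the surrounding diff, not verified against the repo. `InferenceSession.run()` is safe to call concurrently from multiple threads, so a module-level session also holds up under a threaded web UI, which may be what the "debug threads" message refers to.

```python
# Sketch (not the commit's code): per-call session creation vs. a one-time
# module-level session, mirroring the pattern this hunk adopts.
import numpy as np
from onnxruntime import InferenceSession

MODEL_PATH = "models/pvq_extractor/breathiness.onnx"    # hypothetical PVQ model
features = np.random.rand(1, 1024).astype(np.float32)   # assumed feature shape

# Slow path: every call re-reads the file and re-runs graph optimization.
def predict_per_call(x: np.ndarray) -> np.ndarray:
    sess = InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
    return sess.run(None, {"X": x})[0]

# Fast path: construct once, call many times; run() is thread-safe.
SESSION = InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])

def predict_cached(x: np.ndarray) -> np.ndarray:
    return SESSION.run(None, {"X": x})[0]
```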
```diff
@@ -98,14 +108,13 @@ def get_creak_label(example):
     return mean_creak * 100
 
 
-def load_speaker_labels(example, reg_stor_dir=Path('./models/pvq_extractor/')):
+def load_speaker_labels(example):
    audio_data = torch.tensor(example['loaded_audio_data']['16_000'], dtype=torch.float)[None, :]
     num_samples = torch.tensor([audio_data.shape[-1]])
 
     if torch.cuda.is_available():
         audio_data = audio_data.cuda()
         num_samples = num_samples.cuda()
-    providers = ["CPUExecutionProvider"]
 
     with torch.no_grad():
         features, seq_len = hubert_model(
```
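Apart from dropping the now-unused `reg_stor_dir` and `providers`, the function is unchanged: it shapes a mono 16 kHz clip into a batch of one and moves both tensors to the GPU when available. A self-contained sketch of that pattern, with made-up audio values:

```python
# Sketch (illustrative values): batching with [None, :] and keeping the
# length tensor on the same device as the audio, so downstream model code
# never sees a CPU/GPU device mismatch.
import torch

clip = torch.tensor([0.01, -0.02, 0.03], dtype=torch.float)  # stand-in audio
audio_data = clip[None, :]                  # (T,) -> (1, T) batch of one
num_samples = torch.tensor([audio_data.shape[-1]])

if torch.cuda.is_available():               # opportunistic GPU placement
    audio_data = audio_data.cuda()
    num_samples = num_samples.cuda()

print(audio_data.shape)                     # torch.Size([1, 3])
```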
```diff
@@ -116,9 +125,7 @@ def load_speaker_labels(example, reg_stor_dir=Path('./models/pvq_extractor/')):
     features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
     pvqd_predictions = {}
     for pvq in pvq_labels:
-        with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
-            onnx = fid.read()
-        sess = InferenceSession(onnx, providers=providers)
+        sess = onnx_sessions[pvq]
         pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
         pvqd_predictions[pvq] = pred.tolist()[0]
 
```
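The prediction loop still hard-codes the input name `"X"`. That is fine while the export is stable, but onnxruntime can report the graph's input and output names at runtime, which keeps the loop working if the models are ever re-exported. A sketch, with the file path and feature shape assumed:

```python
# Sketch (assumed paths/shapes): query I/O names instead of hard-coding "X".
import numpy as np
from onnxruntime import InferenceSession

sess = InferenceSession("models/pvq_extractor/breathiness.onnx",  # hypothetical
                        providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name      # "X" for this app's exports
output_name = sess.get_outputs()[0].name
x = np.random.rand(1, 1024).astype(np.float32)   # assumed feature shape
pred = sess.run([output_name], {input_name: x})[0]
```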
```diff
@@ -149,6 +156,15 @@ def load_audio_files(example):
     return example
 
 
+def delete_cache():
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, cached_unmanipulated
+    del cached_example_id
+    del cached_loaded_example
+    del cached_labels
+    del cached_d_vector
+    del cached_unmanipulated
+
+
 def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
     global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
 
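One caveat with the new `delete_cache()`: `del` on a global that was never assigned raises `NameError`, so the first call fails unless all five names are bound at module import. A hedged alternative (my sketch, not the commit's code) that tolerates unbound names:

```python
# Sketch: pop the cache names from globals() instead of del-ing them, which
# is a no-op for names that were never set. The dropped references let the
# GC reclaim the cached tensors just as del would.
_CACHE_NAMES = ("cached_example_id", "cached_loaded_example",
                "cached_labels", "cached_d_vector", "cached_unmanipulated")

def delete_cache_safe():
    for name in _CACHE_NAMES:
        globals().pop(name, None)   # no-op if the name was never bound
```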
```diff
@@ -163,25 +179,28 @@ def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
     }
 
     if cached_example_id != example_id:
+        delete_cache()
         cached_loaded_example = load_audio_files(example)
         cached_d_vector = torch.load(f"./Dataset/Embeddings/{speaker_id}/{example_id}.pth")
         cached_labels = load_speaker_labels(example)
         cached_example_id = example_id
-        cached_unmanipulated = tts_model.synthesize_from_example({
-            'text': transcription,
-            'd_vector': cached_d_vector.detach().numpy(),
-        })
-
-    wav_manipulated = get_manipulation(
-        example=example,
-        d_vector=cached_d_vector,
-        labels=cached_labels[None, :],
-        flow=normalizing_flow,
-        tts_model=tts_model,
-        manipulation_idx=manipulation_idx,
-        manipulation_fkt=manipulation_fkt,
-        config_norm_flow=config_norm_flow,
-    )
+        with torch.no_grad():
+            cached_unmanipulated = tts_model.synthesize_from_example({
+                'text': transcription,
+                'd_vector': cached_d_vector.detach().numpy(),
+            })
+
+    with torch.no_grad():
+        wav_manipulated = get_manipulation(
+            example=example,
+            d_vector=cached_d_vector,
+            labels=cached_labels[None, :],
+            flow=normalizing_flow,
+            tts_model=tts_model,
+            manipulation_idx=manipulation_idx,
+            manipulation_fkt=manipulation_fkt,
+            config_norm_flow=config_norm_flow,
+        )
     return (24_000, cached_unmanipulated), (24_000, wav_manipulated)
 
 
```
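Wrapping each inference call site in its own `with torch.no_grad():` matters in a threaded server because PyTorch's grad mode is thread-local: a `no_grad()` entered in one thread does not carry over to worker threads. Connecting this to the "debug threads" commit message is an inference on my part, but the behavior itself is easy to demonstrate:

```python
# Sketch: grad mode is thread-local, so a worker thread runs with autograd
# ENABLED even while the main thread sits inside no_grad(). Each call site
# has to wrap itself, as this commit does.
import threading
import torch

x = torch.ones(1, requires_grad=True)
results = {}

def worker():
    results["worker"] = (x * 2).requires_grad   # True: fresh thread, grad on

with torch.no_grad():
    results["main"] = (x * 2).requires_grad     # False inside the context
    t = threading.Thread(target=worker)
    t.start()
    t.join()

print(results)   # {'main': False, 'worker': True}
```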