FrederikRautenberg committed on
Commit
2a24191
·
1 Parent(s): af4c524

Update Notebook, change resampling

Browse files
pvq_manipulation/Example_Notebook.ipynb CHANGED
@@ -20,8 +20,9 @@
20
  "from pvq_manipulation.models.ffjord import FFJORD\n",
21
  "from IPython.display import display, Audio, clear_output\n",
22
  "from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER\n",
23
- "from paderbox.transform.module_resample import resample_sox\n",
24
- "from pvq_manipulation.helper.vad import EnergyVAD\n"
 
25
  ]
26
  },
27
  {
@@ -83,7 +84,7 @@
83
  " model_name=\"HUBERT_LARGE\",\n",
84
  " backend=\"torchaudio\",\n",
85
  " device='cpu', \n",
86
- " storage_dir=# target storage dir hubert model\n",
87
  ")"
88
  ]
89
  },
@@ -151,7 +152,8 @@
151
  "\n",
152
  "def extract_speaker_embedding(example):\n",
153
  " observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)\n",
154
- " observation = resample_sox(observation, in_rate=sr, out_rate=16_000)\n",
 
155
  " \n",
156
  " vad = EnergyVAD(sample_rate=16_000)\n",
157
  " if observation.ndim == 1:\n",
@@ -206,16 +208,16 @@
206
  },
207
  {
208
  "cell_type": "markdown",
209
- "id": "008035ba-6054-4e6e-ab16-1aaaf68f584a",
210
  "metadata": {},
211
  "source": [
212
- "# Get example manipulation"
213
  ]
214
  },
215
  {
216
  "cell_type": "code",
217
  "execution_count": null,
218
- "id": "e921a3cd-1699-495c-b825-519fb706d89d",
219
  "metadata": {},
220
  "outputs": [],
221
  "source": [
@@ -239,6 +241,15 @@
239
  ")"
240
  ]
241
  },
 
 
 
 
 
 
 
 
 
242
  {
243
  "cell_type": "code",
244
  "execution_count": null,
@@ -252,11 +263,18 @@
252
  " 'example_id': \"1034_121119_000028_000001\",\n",
253
  "}\n",
254
  "\n",
 
255
  "label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']\n",
256
  "\n",
 
 
 
 
 
 
257
  "manipulation_idx_widget = widgets.Dropdown(\n",
258
  " options=[(label, i) for i, label in enumerate(label_options)],\n",
259
- " value=2, # Standardwert: Breathiness\n",
260
  " description='Type:',\n",
261
  " style={'description_width': 'initial'}\n",
262
  ")\n",
@@ -294,9 +312,18 @@
294
  " manipulation_fkt=manipulation_fkt,\n",
295
  " )\n",
296
  " \n",
 
 
 
 
 
297
  " with audio_output:\n",
298
  " clear_output(wait=True) \n",
 
299
  " display(Audio(wav_manipulated, rate=24_000, normalize=True))\n",
 
 
 
300
  " display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))\n",
301
  "\n",
302
  " print(f\"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}\")\n",
@@ -304,6 +331,22 @@
304
  "run_button.on_click(update_manipulation)\n",
305
  "display(manipulation_idx_widget, manipulation_fkt_widget, run_button, audio_output)"
306
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  }
308
  ],
309
  "metadata": {
 
20
  "from pvq_manipulation.models.ffjord import FFJORD\n",
21
  "from IPython.display import display, Audio, clear_output\n",
22
  "from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER\n",
23
+ "#from paderbox.transform.module_resample import resample_sox # use resample_sox or librosa for resampling \n",
24
+ "import librosa\n",
25
+ "from pvq_manipulation.helper.vad import EnergyVAD"
26
  ]
27
  },
28
  {
 
84
  " model_name=\"HUBERT_LARGE\",\n",
85
  " backend=\"torchaudio\",\n",
86
  " device='cpu', \n",
87
+ " storage_dir= # target storage dir hubert model\n",
88
  ")"
89
  ]
90
  },
 
152
  "\n",
153
  "def extract_speaker_embedding(example):\n",
154
  " observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)\n",
155
+ " #observation = resample_sox(observation, in_rate=sr, out_rate=16_000)\n",
156
+ " observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)\n",
157
  " \n",
158
  " vad = EnergyVAD(sample_rate=16_000)\n",
159
  " if observation.ndim == 1:\n",
 
208
  },
209
  {
210
  "cell_type": "markdown",
211
+ "id": "941389a1-abd9-4cc6-80b3-5bbc7818e037",
212
  "metadata": {},
213
  "source": [
214
+ "# The following cell is intended for debugging purposes. If everything is functioning correctly, you can ignore it."
215
  ]
216
  },
217
  {
218
  "cell_type": "code",
219
  "execution_count": null,
220
+ "id": "14c279b3-6511-4a48-89c2-4efc613a08d3",
221
  "metadata": {},
222
  "outputs": [],
223
  "source": [
 
241
  ")"
242
  ]
243
  },
244
+ {
245
+ "cell_type": "markdown",
246
+ "id": "008035ba-6054-4e6e-ab16-1aaaf68f584a",
247
+ "metadata": {},
248
+ "source": [
249
+ "# Get example manipulation\n",
250
+ "### Manipulating a PVQ value: a_man = a + strength (where 'a' is the original PVQ value and 'a_man' is the manipulated value)"
251
+ ]
252
+ },
253
  {
254
  "cell_type": "code",
255
  "execution_count": null,
 
263
  " 'example_id': \"1034_121119_000028_000001\",\n",
264
  "}\n",
265
  "\n",
266
+ "labels = load_speaker_labels(example, config_norm_flow)\n",
267
  "label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']\n",
268
  "\n",
269
+ "print('Estimated PVQ strengths of input speaker:')\n",
270
+ "max_len = max(len(name) for name in label_options) \n",
271
+ "for label_name, pvq in zip(label_options, labels):\n",
272
+ " print(f'{label_name:<{max_len}} : {pvq:6.2f}')\n",
273
+ "\n",
274
+ "\n",
275
  "manipulation_idx_widget = widgets.Dropdown(\n",
276
  " options=[(label, i) for i, label in enumerate(label_options)],\n",
277
+ " value=2, \n",
278
  " description='Type:',\n",
279
  " style={'description_width': 'initial'}\n",
280
  ")\n",
 
312
  " manipulation_fkt=manipulation_fkt,\n",
313
  " )\n",
314
  " \n",
315
+ " wav_unmanipulated = tts_model.synthesize_from_example({\n",
316
+ " 'text': \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\", \n",
317
+ " 'd_vector': d_vector.detach().numpy(),\n",
318
+ " })\n",
319
+ " \n",
320
  " with audio_output:\n",
321
  " clear_output(wait=True) \n",
322
+ " print('Manipulated Speaker')\n",
323
  " display(Audio(wav_manipulated, rate=24_000, normalize=True))\n",
324
+ " print('Unmanipulated Synthese')\n",
325
+ " display(Audio(wav_unmanipulated, rate=24_000, normalize=True))\n",
326
+ " print('Original Speaker')\n",
327
  " display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))\n",
328
  "\n",
329
  " print(f\"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}\")\n",
 
331
  "run_button.on_click(update_manipulation)\n",
332
  "display(manipulation_idx_widget, manipulation_fkt_widget, run_button, audio_output)"
333
  ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": null,
338
+ "id": "f551258d-df75-4d7d-a2bd-b9c52bbdf22b",
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": []
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "d185a73f-d066-465c-bcea-20ff6f279362",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": []
350
  }
351
  ],
352
  "metadata": {