Spaces:

fgnt-upb
/

pvq_manipulation

Sleeping

App Files Files Community

FrederikRautenberg committed on Apr 19

Commit

2a24191

1 Parent(s): af4c524

Update Notebook, change resampling

Browse files

Files changed (1) hide show

pvq_manipulation/Example_Notebook.ipynb +51 -8

pvq_manipulation/Example_Notebook.ipynb CHANGED Viewed

@@ -20,8 +20,9 @@
     "from pvq_manipulation.models.ffjord import FFJORD\n",
     "from IPython.display import display, Audio, clear_output\n",
     "from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER\n",
-    "from paderbox.transform.module_resample import resample_sox\n",
-    "from pvq_manipulation.helper.vad import EnergyVAD\n"
    ]
   },
   {
@@ -83,7 +84,7 @@
     "    model_name=\"HUBERT_LARGE\",\n",
     "    backend=\"torchaudio\",\n",
     "    device='cpu', \n",
-    "    storage_dir=# target storage dir hubert model\n",
     ")"
    ]
   },
@@ -151,7 +152,8 @@
     "\n",
     "def extract_speaker_embedding(example):\n",
     "    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)\n",
-    "    observation = resample_sox(observation, in_rate=sr, out_rate=16_000)\n",
     "    \n",
     "    vad = EnergyVAD(sample_rate=16_000)\n",
     "    if observation.ndim == 1:\n",
@@ -206,16 +208,16 @@
   },
   {
    "cell_type": "markdown",
-   "id": "008035ba-6054-4e6e-ab16-1aaaf68f584a",
    "metadata": {},
    "source": [
-    "# Get example manipulation"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e921a3cd-1699-495c-b825-519fb706d89d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -239,6 +241,15 @@
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -252,11 +263,18 @@
     "    'example_id': \"1034_121119_000028_000001\",\n",
     "}\n",
     "\n",
     "label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']\n",
     "\n",
     "manipulation_idx_widget = widgets.Dropdown(\n",
     "    options=[(label, i) for i, label in enumerate(label_options)],\n",
-    "    value=2,  # Standardwert: Breathiness\n",
     "    description='Type:',\n",
     "    style={'description_width': 'initial'}\n",
     ")\n",
@@ -294,9 +312,18 @@
     "        manipulation_fkt=manipulation_fkt,\n",
     "    )\n",
     "    \n",
     "    with audio_output:\n",
     "        clear_output(wait=True) \n",
     "        display(Audio(wav_manipulated, rate=24_000, normalize=True))\n",
     "        display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))\n",
     "\n",
     "    print(f\"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}\")\n",
@@ -304,6 +331,22 @@
     "run_button.on_click(update_manipulation)\n",
     "display(manipulation_idx_widget, manipulation_fkt_widget, run_button, audio_output)"
    ]
   }
  ],
  "metadata": {

     "from pvq_manipulation.models.ffjord import FFJORD\n",
     "from IPython.display import display, Audio, clear_output\n",
     "from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER\n",
+    "#from paderbox.transform.module_resample import resample_sox # use resample_sox or librosa for resampling \n",
+    "import librosa\n",
+    "from pvq_manipulation.helper.vad import EnergyVAD"
    ]
   },
   {
     "    model_name=\"HUBERT_LARGE\",\n",
     "    backend=\"torchaudio\",\n",
     "    device='cpu', \n",
+    "    storage_dir= # target storage dir hubert model\n",
     ")"
    ]
   },
     "\n",
     "def extract_speaker_embedding(example):\n",
     "    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)\n",
+    "    #observation = resample_sox(observation, in_rate=sr, out_rate=16_000)\n",
+    "    observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)\n",
     "    \n",
     "    vad = EnergyVAD(sample_rate=16_000)\n",
     "    if observation.ndim == 1:\n",
   },
   {
    "cell_type": "markdown",
+   "id": "941389a1-abd9-4cc6-80b3-5bbc7818e037",
    "metadata": {},
    "source": [
+    "# The following cell is intended for debugging purposes. If everything is functioning correctly, you can ignore it."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "14c279b3-6511-4a48-89c2-4efc613a08d3",
    "metadata": {},
    "outputs": [],
    "source": [
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "008035ba-6054-4e6e-ab16-1aaaf68f584a",
+   "metadata": {},
+   "source": [
+    "# Get example manipulation\n",
+    "### Manipulating a PVQ value: a_man = a + strength (where 'a' is the original)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
     "    'example_id': \"1034_121119_000028_000001\",\n",
     "}\n",
     "\n",
+    "labels = load_speaker_labels(example, config_norm_flow)\n",
     "label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']\n",
     "\n",
+    "print('Estimated PVQ strengths of input speaker:')\n",
+    "max_len = max(len(name) for name in label_options)  \n",
+    "for label_name, pvq in zip(label_options, labels):\n",
+    "    print(f'{label_name:<{max_len}} : {pvq:6.2f}')\n",
+    "\n",
+    "\n",
     "manipulation_idx_widget = widgets.Dropdown(\n",
     "    options=[(label, i) for i, label in enumerate(label_options)],\n",
+    "    value=2, \n",
     "    description='Type:',\n",
     "    style={'description_width': 'initial'}\n",
     ")\n",
     "        manipulation_fkt=manipulation_fkt,\n",
     "    )\n",
     "    \n",
+    "    wav_unmanipulated = tts_model.synthesize_from_example({\n",
+    "        'text': \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\", \n",
+    "        'd_vector': d_vector.detach().numpy(),\n",
+    "    })\n",
+    "    \n",
     "    with audio_output:\n",
     "        clear_output(wait=True) \n",
+    "        print('Manipulated Speaker')\n",
     "        display(Audio(wav_manipulated, rate=24_000, normalize=True))\n",
+    "        print('Unmanipulated Synthese')\n",
+    "        display(Audio(wav_unmanipulated, rate=24_000, normalize=True))\n",
+    "        print('Original Speaker')\n",
     "        display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))\n",
     "\n",
     "    print(f\"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}\")\n",
     "run_button.on_click(update_manipulation)\n",
     "display(manipulation_idx_widget, manipulation_fkt_widget, run_button, audio_output)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f551258d-df75-4d7d-a2bd-b9c52bbdf22b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d185a73f-d066-465c-bcea-20ff6f279362",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {