import torchaudio
import gradio as gr
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook
import scipy.io.wavfile
import os
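
# The gated pipeline below needs an access token. Assumption: the token is
# provided as a Space secret / environment variable named
# HUGGINGFACE_ACCESS_TOKEN; adjust the name to match your own setup.
HUGGINGFACE_ACCESS_TOKEN = os.environ.get("HUGGINGFACE_ACCESS_TOKEN")
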
def perform_separation(audio_file_path: str):
    # Instantiate the pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speech-separation-ami-1.0",
        use_auth_token=HUGGINGFACE_ACCESS_TOKEN,
    )
    waveform, sample_rate = torchaudio.load(audio_file_path)
    # Run the pipeline
    with ProgressHook() as hook:
        diarization, sources = pipeline(
            {"waveform": waveform, "sample_rate": sample_rate}, hook=hook
        )
    # Save separated sources to disk as SPEAKER_XX.wav files
    output_file_paths = []
    for s, speaker in enumerate(diarization.labels()):
        number_of_separated_sources = sources.data.shape[1]
        if s >= number_of_separated_sources:
            break
        output_file_path = f"{speaker}.wav"
        scipy.io.wavfile.write(
            output_file_path, sample_rate, sources.data[:, s].numpy()
        )
        output_file_paths.append(output_file_path)
    # Generate RTTM content
    rttm_content = diarization.to_rttm()
    return output_file_paths, rttm_content
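
# The RTTM content returned above is plain text with one diarization segment
# per line, following the standard RTTM field layout, e.g.
#   SPEAKER <file-id> 1 <onset> <duration> <NA> <NA> SPEAKER_00 <NA> <NA>
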
def gradio_wrapper(audio_file_path: str):
    output_file_paths, rttm_content = perform_separation(audio_file_path)
    # Gradio expects one return value per output component, so pad the list of
    # per-speaker files with None up to max_speakers (unused slots stay empty)
    padding = [None] * (max_speakers - len(output_file_paths))
    return output_file_paths + padding + [rttm_content]
inputs = gr.Audio(label="Input Audio", type="filepath")
# Dynamic output for audio files
outputs = []
max_speakers = 10  # Set a reasonable maximum number of speakers
for i in range(max_speakers):
    outputs.append(gr.Audio(label=f"Speaker {i+1}", type="filepath"))
# Add RTTM output
outputs.append(gr.Textbox(label="RTTM Output"))
title = "Speech Separation and Diarization"
description = "Gradio demo for Speech Separation and Diarization using Pyannote's pyannote/speech-separation-ami-1.0. To use it, simply upload your audio, or click one of the examples to load it. The app will output separated audio for each speaker and the RTTM file content."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2403.02288' target='_blank'>PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings</a> | <a href='https://huggingface.co/pyannote/speech-separation-ami-1.0' target='_blank'>HuggingFace Pipeline</a></p>"
examples = [["samples_audio_samples_test_mixture.wav"]]
gr.Interface(
    gradio_wrapper,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
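
# Local usage sketch (assumptions: the example audio file listed above sits
# next to this script, and the script is saved as app.py):
#   export HUGGINGFACE_ACCESS_TOKEN=hf_...   # token with access to the gated pipeline
#   python app.py                            # then open the printed local Gradio URL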