import torchaudio
import gradio as gr
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook
import scipy.io.wavfile
import os
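
# The gated pipeline below needs an access token. Assumption: the token is
# provided as a Space secret / environment variable named
# HUGGINGFACE_ACCESS_TOKEN; adjust the name to match your own setup.
HUGGINGFACE_ACCESS_TOKEN = os.environ.get("HUGGINGFACE_ACCESS_TOKEN")
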
def perform_separation(audio_file_path: str):
    # Instantiate the pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speech-separation-ami-1.0",
        use_auth_token=HUGGINGFACE_ACCESS_TOKEN,
    )
    waveform, sample_rate = torchaudio.load(audio_file_path)
    # Run the pipeline
    with ProgressHook() as hook:
        diarization, sources = pipeline(
            {"waveform": waveform, "sample_rate": sample_rate}, hook=hook
        )
    # Save separated sources to disk as SPEAKER_XX.wav files
    output_file_paths = []
    for s, speaker in enumerate(diarization.labels()):
        number_of_separated_sources = sources.data.shape[1]
        if s >= number_of_separated_sources:
            break
        output_file_path = f"{speaker}.wav"
        scipy.io.wavfile.write(
            output_file_path, sample_rate, sources.data[:, s].numpy()
        )
        output_file_paths.append(output_file_path)
    # Generate RTTM content
    rttm_content = diarization.to_rttm()
    return output_file_paths, rttm_content
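
# The RTTM content returned above is plain text with one diarization segment
# per line, following the standard RTTM field layout, e.g.
#   SPEAKER <file-id> 1 <onset> <duration> <NA> <NA> SPEAKER_00 <NA> <NA>
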
def gradio_wrapper(audio_file_path: str):
    output_file_paths, rttm_content = perform_separation(audio_file_path)
    # Gradio expects one return value per output component, so pad the list of
    # per-speaker files with None up to max_speakers (unused slots stay empty)
    padding = [None] * (max_speakers - len(output_file_paths))
    return output_file_paths + padding + [rttm_content]
inputs = gr.Audio(label="Input Audio", type="filepath")
# Dynamic output for audio files
outputs = []
max_speakers = 10  # Set a reasonable maximum number of speakers
for i in range(max_speakers):
    outputs.append(gr.Audio(label=f"Speaker {i+1}", type="filepath"))
# Add RTTM output
outputs.append(gr.Textbox(label="RTTM Output"))
title = "Speech Separation and Diarization"
description = "Gradio demo for Speech Separation and Diarization using Pyannote's pyannote/speech-separation-ami-1.0. To use it, simply upload your audio, or click one of the examples to load it. The app will output separated audio for each speaker and the RTTM file content."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2403.02288' target='_blank'>PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings</a> | <a href='https://huggingface.co/pyannote/speech-separation-ami-1.0' target='_blank'>HuggingFace Pipeline</a></p>"
examples = [["samples_audio_samples_test_mixture.wav"]]
gr.Interface(
    gradio_wrapper,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
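
# Local usage sketch (assumptions: the example audio file listed above sits
# next to this script, and the script is saved as app.py):
#   export HUGGINGFACE_ACCESS_TOKEN=hf_...   # token with access to the gated pipeline
#   python app.py                            # then open the printed local Gradio URL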