"""Separate speech sources from a mixture with the Look2Hear TIGER model."""

import argparse
import os
import sys

import torch
import torchaudio
import torchaudio.transforms as T

import look2hear.models

# --- Command-line arguments ---
parser = argparse.ArgumentParser(description="Separate speech sources using the Look2Hear TIGER model.")
parser.add_argument("--audio_path", default="test/mix.wav", help="Path to the input audio file (mixture).")
parser.add_argument("--output_dir", default="separated_audio", help="Directory to save the separated audio files.")
parser.add_argument("--model_cache_dir", default="cache", help="Directory to cache the downloaded model.")
args = parser.parse_args()

audio_path = args.audio_path
output_dir = args.output_dir
cache_dir = args.model_cache_dir

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- Load the pretrained model ---
print("Loading TIGER model...")
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir=cache_dir)
model.to(device)
model.eval()
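# Hub note: from_pretrained() fetches the checkpoint from the Hugging Face Hub
# repo "JusperLee/TIGER-speech" on first run; with cache_dir set, subsequent
# runs reuse the local copy instead of re-downloading.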

# --- Load the input audio and resample if needed ---
target_sr = 16000  # sample rate the model expects; the mixture is resampled to this
print(f"Loading audio from: {audio_path}")
try:
    waveform, original_sr = torchaudio.load(audio_path)
except Exception as e:
    print(f"Error loading audio file {audio_path}: {e}")
    sys.exit(1)
print(f"Original sample rate: {original_sr} Hz, target sample rate: {target_sr} Hz")

if original_sr != target_sr:
    print(f"Resampling audio from {original_sr} Hz to {target_sr} Hz...")
    resampler = T.Resample(orig_freq=original_sr, new_freq=target_sr)
    waveform = resampler(waveform)
    print("Resampling complete.")

audio = waveform.to(device)

# torchaudio.load() returns a (channels, samples) tensor; guard against 1-D input anyway.
if audio.dim() == 1:
    audio = audio.unsqueeze(0)

# Add a batch dimension: (channels, samples) -> (1, channels, samples).
audio_input = audio.unsqueeze(0)
print(f"Audio tensor prepared with shape: {audio_input.shape}")

# --- Run separation ---
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")
print("Performing separation...")

with torch.no_grad():
    ests_speech = model(audio_input)

# Drop the batch dimension: (1, num_speakers, samples) -> (num_speakers, samples).
ests_speech = ests_speech.squeeze(0)
num_speakers = ests_speech.shape[0]
print(f"Separation complete. Detected {num_speakers} potential speakers.")

# --- Save each separated track ---
for i in range(num_speakers):
    output_filename = os.path.join(output_dir, f"spk{i + 1}.wav")
    # torchaudio.save() expects a 2-D (channels, samples) tensor, so restore a channel dim.
    speaker_track = ests_speech[i].unsqueeze(0).cpu()
    print(f"Saving speaker {i + 1} to {output_filename}")
    try:
        torchaudio.save(output_filename, speaker_track, target_sr)
    except Exception as e:
        print(f"Error saving file {output_filename}: {e}")
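
# Example invocation (the script filename is illustrative):
#   python separate_tiger.py --audio_path test/mix.wav --output_dir separated_audio --model_cache_dir cache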