from typing import Union import librosa import numpy as np import parselmouth import torch from modules.nsf_hifigan.nvSTFT import STFT from utils.decomposed_waveform import DecomposedWaveform from utils.pitch_utils import interp_f0 def get_mel_torch( waveform, samplerate, *, num_mel_bins=128, hop_size=512, win_size=2048, fft_size=2048, fmin=40, fmax=16000, keyshift=0, speed=1, device=None ): if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu' stft = STFT(samplerate, num_mel_bins, fft_size, win_size, hop_size, fmin, fmax, device=device) with torch.no_grad(): wav_torch = torch.from_numpy(waveform).to(device) mel_torch = stft.get_mel(wav_torch.unsqueeze(0), keyshift=keyshift, speed=speed).squeeze(0).T return mel_torch.cpu().numpy() @torch.no_grad() def get_mel2ph_torch(lr, durs, length, timestep, device='cpu'): ph_acc = torch.round(torch.cumsum(durs.to(device), dim=0) / timestep + 0.5).long() ph_dur = torch.diff(ph_acc, dim=0, prepend=torch.LongTensor([0]).to(device)) mel2ph = lr(ph_dur[None])[0] num_frames = mel2ph.shape[0] if num_frames < length: mel2ph = torch.cat((mel2ph, torch.full((length - num_frames,), fill_value=mel2ph[-1], device=device)), dim=0) elif num_frames > length: mel2ph = mel2ph[:length] return mel2ph def get_pitch_parselmouth( waveform, samplerate, length, *, hop_size, f0_min=65, f0_max=1100, speed=1, interp_uv=False ): """ :param waveform: [T] :param samplerate: sampling rate :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param f0_min: Minimum f0 in Hz :param f0_max: Maximum f0 in Hz :param speed: Change the speed :param interp_uv: Interpolate unvoiced parts :return: f0, uv """ hop_size = int(np.round(hop_size * speed)) time_step = hop_size / samplerate l_pad = int(np.ceil(1.5 / f0_min * samplerate)) r_pad = hop_size * ((len(waveform) - 1) // hop_size + 1) - len(waveform) + l_pad + 1 waveform = np.pad(waveform, (l_pad, r_pad)) # noinspection PyArgumentList s = parselmouth.Sound(waveform, sampling_frequency=samplerate).to_pitch_ac( time_step=time_step, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max ) assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 f0 = s.selected_array['frequency'].astype(np.float32) if len(f0) < length: f0 = np.pad(f0, (0, length - len(f0))) f0 = f0[: length] uv = f0 == 0 if interp_uv: f0, uv = interp_f0(f0, uv) return f0, uv def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'): """ Definition of energy: RMS of the waveform, in dB representation :param waveform: [T] :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param win_size: Window size, in number of samples :param domain: db or amplitude :return: energy """ energy = librosa.feature.rms(y=waveform, frame_length=win_size, hop_length=hop_size)[0] if len(energy) < length: energy = np.pad(energy, (0, length - len(energy))) energy = energy[: length] if domain == 'db': energy = librosa.amplitude_to_db(energy) elif domain == 'amplitude': pass else: raise ValueError(f'Invalid domain: {domain}') return energy def get_breathiness( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, *, hop_size=None, fft_size=None, win_size=None ): """ Definition of breathiness: RMS of the aperiodic part, in dB representation :param waveform: All other analysis parameters will not take effect if a DeconstructedWaveform is given :param samplerate: sampling rate :param f0: reference f0 :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples :return: breathiness """ if not isinstance(waveform, DecomposedWaveform): waveform = DecomposedWaveform( waveform=waveform, samplerate=samplerate, f0=f0, hop_size=hop_size, fft_size=fft_size, win_size=win_size ) waveform_ap = waveform.aperiodic() breathiness = get_energy_librosa( waveform_ap, length=length, hop_size=waveform.hop_size, win_size=waveform.win_size ) return breathiness def get_voicing( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, *, hop_size=None, fft_size=None, win_size=None ): """ Definition of voicing: RMS of the harmonic part, in dB representation :param waveform: All other analysis parameters will not take effect if a DeconstructedWaveform is given :param samplerate: sampling rate :param f0: reference f0 :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples :return: voicing """ if not isinstance(waveform, DecomposedWaveform): waveform = DecomposedWaveform( waveform=waveform, samplerate=samplerate, f0=f0, hop_size=hop_size, fft_size=fft_size, win_size=win_size ) waveform_sp = waveform.harmonic() voicing = get_energy_librosa( waveform_sp, length=length, hop_size=waveform.hop_size, win_size=waveform.win_size ) return voicing def get_tension_base_harmonic( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, *, hop_size=None, fft_size=None, win_size=None, domain='logit' ): """ Definition of tension: radio of the real harmonic part (harmonic part except the base harmonic) to the full harmonic part. :param waveform: All other analysis parameters will not take effect if a DeconstructedWaveform is given :param samplerate: sampling rate :param f0: reference f0 :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples :param domain: The domain of the final ratio representation. Can be 'ratio' (the raw ratio), 'db' (log decibel) or 'logit' (the reverse function of sigmoid) :return: tension """ if not isinstance(waveform, DecomposedWaveform): waveform = DecomposedWaveform( waveform=waveform, samplerate=samplerate, f0=f0, hop_size=hop_size, fft_size=fft_size, win_size=win_size ) waveform_h = waveform.harmonic() waveform_base_h = waveform.harmonic(0) energy_base_h = get_energy_librosa( waveform_base_h, length, hop_size=waveform.hop_size, win_size=waveform.win_size, domain='amplitude' ) energy_h = get_energy_librosa( waveform_h, length, hop_size=waveform.hop_size, win_size=waveform.win_size, domain='amplitude' ) tension = np.sqrt(np.clip(energy_h ** 2 - energy_base_h ** 2, a_min=0, a_max=None)) / (energy_h + 1e-5) if domain == 'ratio': tension = np.clip(tension, a_min=0, a_max=1) elif domain == 'db': tension = np.clip(tension, a_min=1e-5, a_max=1) tension = librosa.amplitude_to_db(tension) elif domain == 'logit': tension = np.clip(tension, a_min=1e-4, a_max=1 - 1e-4) tension = np.log(tension / (1 - tension)) return tension class SinusoidalSmoothingConv1d(torch.nn.Conv1d): def __init__(self, kernel_size): super().__init__( in_channels=1, out_channels=1, kernel_size=kernel_size, bias=False, padding='same', padding_mode='replicate' ) smooth_kernel = torch.sin(torch.from_numpy( np.linspace(0, 1, kernel_size).astype(np.float32) * np.pi )) smooth_kernel /= smooth_kernel.sum() self.weight.data = smooth_kernel[None, None]