|
"""Library implementing normalization. |
|
|
|
Authors |
|
* Mirco Ravanelli 2020 |
|
* Guillermo Cámbara 2021 |
|
* Sarthak Yadav 2022 |
|
""" |
|
|
|
import torch |
|
import torch.nn as nn |
|
|
|
|
|
class BatchNorm1d(nn.Module): |
|
"""Applies 1d batch normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_shape : tuple |
|
The expected shape of the input. Alternatively, use ``input_size``. |
|
input_size : int |
|
The expected size of the input. Alternatively, use ``input_shape``. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
momentum : float |
|
It is a value used for the running_mean and running_var computation. |
|
affine : bool |
|
When set to True, the affine parameters are learned. |
|
track_running_stats : bool |
|
When set to True, this module tracks the running mean and variance, |
|
and when set to False, this module does not track such statistics. |
|
combine_batch_time : bool |
|
        When True, it combines the batch and time axes.
|
skip_transpose : bool |
|
        Whether to skip the transposition; if True, the input is expected
        in (batch, channels, time) format.
|
|
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 10) |
|
>>> norm = BatchNorm1d(input_shape=input.shape) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 10]) |
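
    A (batch, channels, time) input can also be normalized without the internal
    transposition by setting ``skip_transpose=True`` (minimal illustration):

    >>> input = torch.randn(100, 40, 50)
    >>> norm = BatchNorm1d(input_shape=input.shape, skip_transpose=True)
    >>> norm(input).shape
    torch.Size([100, 40, 50])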
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_shape=None, |
|
input_size=None, |
|
eps=1e-05, |
|
momentum=0.1, |
|
affine=True, |
|
track_running_stats=True, |
|
combine_batch_time=False, |
|
skip_transpose=False, |
|
): |
|
super().__init__() |
|
self.combine_batch_time = combine_batch_time |
|
self.skip_transpose = skip_transpose |
|
|
|
if input_size is None and skip_transpose: |
|
input_size = input_shape[1] |
|
elif input_size is None: |
|
input_size = input_shape[-1] |
|
|
|
self.norm = nn.BatchNorm1d( |
|
input_size, |
|
eps=eps, |
|
momentum=momentum, |
|
affine=affine, |
|
track_running_stats=track_running_stats, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, [channels]) |
|
            input to normalize. 2d or 3d tensors are expected in input.
            4d tensors can be used when ``combine_batch_time=True``.
|
|
|
Returns |
|
------- |
|
x_n : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
shape_or = x.shape |
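        # With combine_batch_time, batch and time are flattened into a single
        # dimension so that normalization statistics are computed per channel.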
|
if self.combine_batch_time: |
|
if x.ndim == 3: |
|
x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) |
|
else: |
|
x = x.reshape( |
|
shape_or[0] * shape_or[1], shape_or[3], shape_or[2] |
|
) |
|
|
|
elif not self.skip_transpose: |
|
x = x.transpose(-1, 1) |
|
|
|
x_n = self.norm(x) |
|
|
|
if self.combine_batch_time: |
|
x_n = x_n.reshape(shape_or) |
|
elif not self.skip_transpose: |
|
x_n = x_n.transpose(1, -1) |
|
|
|
return x_n |
|
|
|
|
|
class BatchNorm2d(nn.Module): |
|
"""Applies 2d batch normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_shape : tuple |
|
The expected shape of the input. Alternatively, use ``input_size``. |
|
input_size : int |
|
The expected size of the input. Alternatively, use ``input_shape``. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
momentum : float |
|
It is a value used for the running_mean and running_var computation. |
|
affine : bool |
|
When set to True, the affine parameters are learned. |
|
track_running_stats : bool |
|
When set to True, this module tracks the running mean and variance, |
|
and when set to False, this module does not track such statistics. |
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 10, 5, 20) |
|
>>> norm = BatchNorm2d(input_shape=input.shape) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 10, 5, 20]) |
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_shape=None, |
|
input_size=None, |
|
eps=1e-05, |
|
momentum=0.1, |
|
affine=True, |
|
track_running_stats=True, |
|
): |
|
super().__init__() |
|
|
|
if input_shape is None and input_size is None: |
|
raise ValueError("Expected input_shape or input_size as input") |
|
|
|
if input_size is None: |
|
input_size = input_shape[-1] |
|
|
|
self.norm = nn.BatchNorm2d( |
|
input_size, |
|
eps=eps, |
|
momentum=momentum, |
|
affine=affine, |
|
track_running_stats=track_running_stats, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channel1, channel2) |
|
input to normalize. 4d tensors are expected. |
|
|
|
Returns |
|
------- |
|
x_n : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
x = x.transpose(-1, 1) |
|
x_n = self.norm(x) |
|
x_n = x_n.transpose(1, -1) |
|
|
|
return x_n |
|
|
|
|
|
class LayerNorm(nn.Module): |
|
"""Applies layer normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_size : int |
|
The expected size of the dimension to be normalized. |
|
input_shape : tuple |
|
The expected shape of the input. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
elementwise_affine : bool |
|
If True, this module has learnable per-element affine parameters |
|
initialized to ones (for weights) and zeros (for biases). |
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 101, 128) |
|
>>> norm = LayerNorm(input_shape=input.shape) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 101, 128]) |
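
    The size of the normalized dimension can also be passed directly:

    >>> norm = LayerNorm(input_size=128)
    >>> norm(input).shape
    torch.Size([100, 101, 128])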
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_size=None, |
|
input_shape=None, |
|
eps=1e-05, |
|
elementwise_affine=True, |
|
): |
|
super().__init__() |
|
self.eps = eps |
|
self.elementwise_affine = elementwise_affine |
|
|
|
if input_shape is not None: |
|
input_size = input_shape[2:] |
|
|
|
self.norm = torch.nn.LayerNorm( |
|
input_size, |
|
eps=self.eps, |
|
elementwise_affine=self.elementwise_affine, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channels) |
|
input to normalize. 3d or 4d tensors are expected. |
|
|
|
Returns |
|
------- |
|
The normalized outputs. |
|
""" |
|
return self.norm(x) |
|
|
|
|
|
class InstanceNorm1d(nn.Module): |
|
"""Applies 1d instance normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_shape : tuple |
|
The expected shape of the input. Alternatively, use ``input_size``. |
|
input_size : int |
|
The expected size of the input. Alternatively, use ``input_shape``. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
momentum : float |
|
It is a value used for the running_mean and running_var computation. |
|
track_running_stats : bool |
|
When set to True, this module tracks the running mean and variance, |
|
and when set to False, this module does not track such statistics. |
|
affine : bool |
|
        When set to True, this module has learnable affine parameters,
        initialized the same way as for batch normalization. Default: False.
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 10, 20) |
|
>>> norm = InstanceNorm1d(input_shape=input.shape) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 10, 20]) |
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_shape=None, |
|
input_size=None, |
|
eps=1e-05, |
|
momentum=0.1, |
|
track_running_stats=True, |
|
affine=False, |
|
): |
|
super().__init__() |
|
|
|
if input_shape is None and input_size is None: |
|
raise ValueError("Expected input_shape or input_size as input") |
|
|
|
if input_size is None: |
|
input_size = input_shape[-1] |
|
|
|
self.norm = nn.InstanceNorm1d( |
|
input_size, |
|
eps=eps, |
|
momentum=momentum, |
|
track_running_stats=track_running_stats, |
|
affine=affine, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channels) |
|
input to normalize. 3d tensors are expected. |
|
|
|
Returns |
|
------- |
|
x_n : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
x = x.transpose(-1, 1) |
|
x_n = self.norm(x) |
|
x_n = x_n.transpose(1, -1) |
|
|
|
return x_n |
|
|
|
|
|
class InstanceNorm2d(nn.Module): |
|
"""Applies 2d instance normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_shape : tuple |
|
The expected shape of the input. Alternatively, use ``input_size``. |
|
input_size : int |
|
The expected size of the input. Alternatively, use ``input_shape``. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
momentum : float |
|
It is a value used for the running_mean and running_var computation. |
|
track_running_stats : bool |
|
When set to True, this module tracks the running mean and variance, |
|
and when set to False, this module does not track such statistics. |
|
affine : bool |
|
        When set to True, this module has learnable affine parameters,
        initialized the same way as for batch normalization. Default: False.
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 10, 20, 2) |
|
>>> norm = InstanceNorm2d(input_shape=input.shape) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 10, 20, 2]) |
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_shape=None, |
|
input_size=None, |
|
eps=1e-05, |
|
momentum=0.1, |
|
track_running_stats=True, |
|
affine=False, |
|
): |
|
super().__init__() |
|
|
|
if input_shape is None and input_size is None: |
|
raise ValueError("Expected input_shape or input_size as input") |
|
|
|
if input_size is None: |
|
input_size = input_shape[-1] |
|
|
|
self.norm = nn.InstanceNorm2d( |
|
input_size, |
|
eps=eps, |
|
momentum=momentum, |
|
track_running_stats=track_running_stats, |
|
affine=affine, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channel1, channel2) |
|
input to normalize. 4d tensors are expected. |
|
|
|
Returns |
|
------- |
|
x_n : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
x = x.transpose(-1, 1) |
|
x_n = self.norm(x) |
|
x_n = x_n.transpose(1, -1) |
|
|
|
return x_n |
|
|
|
|
|
class GroupNorm(nn.Module): |
|
"""Applies group normalization to the input tensor. |
|
|
|
Arguments |
|
--------- |
|
input_shape : tuple |
|
The expected shape of the input. Alternatively, use ``input_size``. |
|
input_size : int |
|
The expected size of the input. Alternatively, use ``input_shape``. |
|
num_groups : int |
|
Number of groups to separate the channels into. |
|
eps : float |
|
        This value is added to the standard deviation estimation to improve
        numerical stability.
|
affine : bool |
|
        When set to True, this module has learnable per-channel affine
        parameters initialized to ones (for weights) and zeros (for biases).
|
|
|
Example |
|
------- |
|
>>> input = torch.randn(100, 101, 128) |
|
>>> norm = GroupNorm(input_size=128, num_groups=128) |
|
>>> output = norm(input) |
|
>>> output.shape |
|
torch.Size([100, 101, 128]) |
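
    Channels can also be grouped (here 4 groups of 32 channels):

    >>> norm = GroupNorm(input_size=128, num_groups=4)
    >>> norm(input).shape
    torch.Size([100, 101, 128])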
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_shape=None, |
|
input_size=None, |
|
num_groups=None, |
|
eps=1e-05, |
|
affine=True, |
|
): |
|
super().__init__() |
|
self.eps = eps |
|
self.affine = affine |
|
|
|
if input_shape is None and input_size is None: |
|
raise ValueError("Expected input_shape or input_size as input") |
|
|
|
if num_groups is None: |
|
raise ValueError("Expected num_groups as input") |
|
|
|
if input_shape is not None: |
|
input_size = input_shape[-1] |
|
|
|
self.norm = torch.nn.GroupNorm( |
|
num_groups, |
|
input_size, |
|
eps=self.eps, |
|
affine=self.affine, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channels) |
|
input to normalize. 3d or 4d tensors are expected. |
|
|
|
Returns |
|
------- |
|
x_n : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
x = x.transpose(-1, 1) |
|
x_n = self.norm(x) |
|
x_n = x_n.transpose(1, -1) |
|
|
|
return x_n |
|
|
|
|
|
class ExponentialMovingAverage(nn.Module): |
|
""" |
|
    Applies a learnable exponential moving average, as required by the learnable PCEN layer.
|
|
|
Arguments |
|
--------- |
|
input_size : int |
|
The expected size of the input. |
|
coeff_init: float |
|
Initial smoothing coefficient value |
|
per_channel: bool |
|
        Controls whether the smoothing coefficients are learned
        independently for every input channel.
|
trainable: bool |
|
        whether to learn the smoothing coefficient or keep it fixed
|
skip_transpose : bool |
|
        If False, uses the batch x time x channel convention of SpeechBrain.
|
If True, uses batch x channel x time convention. |
|
|
|
Example |
|
------- |
|
>>> inp_tensor = torch.rand([10, 50, 40]) |
|
>>> pcen = ExponentialMovingAverage(40) |
|
>>> out_tensor = pcen(inp_tensor) |
|
>>> out_tensor.shape |
|
torch.Size([10, 50, 40]) |
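
    A separate coefficient can be learned for every channel with ``per_channel=True``:

    >>> ema = ExponentialMovingAverage(40, per_channel=True)
    >>> ema(inp_tensor).shape
    torch.Size([10, 50, 40])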
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_size: int, |
|
coeff_init: float = 0.04, |
|
per_channel: bool = False, |
|
trainable: bool = True, |
|
skip_transpose: bool = False, |
|
): |
|
super().__init__() |
|
self._coeff_init = coeff_init |
|
self._per_channel = per_channel |
|
self.skip_transpose = skip_transpose |
|
self.trainable = trainable |
|
weights = ( |
|
torch.ones( |
|
input_size, |
|
) |
|
if self._per_channel |
|
else torch.ones( |
|
1, |
|
) |
|
) |
|
self._weights = nn.Parameter( |
|
weights * self._coeff_init, requires_grad=trainable |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channels) |
|
input to normalize. |
|
""" |
|
if not self.skip_transpose: |
|
x = x.transpose(1, -1) |
|
w = torch.clamp(self._weights, min=0.0, max=1.0) |
|
initial_state = x[:, :, 0] |
|
|
|
def scan(init_state, x, w): |
|
"""Loops and accumulates.""" |
|
x = x.permute(2, 0, 1) |
|
acc = init_state |
|
results = [] |
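            # EMA recurrence over time: acc_t = w * x_t + (1 - w) * acc_{t-1}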
|
for ix in range(x.shape[0]): |
|
acc = (w * x[ix]) + ((1.0 - w) * acc) |
|
results.append(acc.unsqueeze(0)) |
|
results = torch.cat(results, dim=0) |
|
results = results.permute(1, 2, 0) |
|
return results |
|
|
|
output = scan(initial_state, x, w) |
|
if not self.skip_transpose: |
|
output = output.transpose(1, -1) |
|
return output |
|
|
|
|
|
class PCEN(nn.Module): |
|
""" |
|
    This class implements a learnable Per-channel energy normalization (PCEN) layer,
    supporting both the original PCEN as specified in [1] and sPCEN as specified in [2].
|
|
|
[1] Yuxuan Wang, Pascal Getreuer, Thad Hughes, Richard F. Lyon, Rif A. Saurous, "Trainable Frontend For |
|
Robust and Far-Field Keyword Spotting", in Proc of ICASSP 2017 (https://arxiv.org/abs/1607.05666) |
|
|
|
    [2] Neil Zeghidour, Olivier Teboul, Félix de Chaumont Quitry & Marco Tagliasacchi, "LEAF: A Learnable Frontend
    for Audio Classification", in Proc of ICLR 2021 (https://arxiv.org/abs/2101.08596)
|
|
|
The default argument values correspond with those used by [2]. |
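
    Given the input energies ``x`` and their exponential moving average ``M``
    (obtained with ``smooth_coef``), the layer computes
    ``(x / (floor + M) ** alpha + delta) ** (1 / root) - delta ** (1 / root)``.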
|
|
|
Arguments |
|
--------- |
|
input_size : int |
|
The expected size of the input. |
|
alpha: float |
|
specifies alpha coefficient for PCEN |
|
smooth_coef: float |
|
        specifies smooth coefficient for PCEN
|
delta: float |
|
specifies delta coefficient for PCEN |
|
root: float |
|
specifies root coefficient for PCEN |
|
floor: float |
|
specifies floor coefficient for PCEN |
|
trainable: bool |
|
        whether to learn the PCEN parameters or keep them fixed
|
per_channel_smooth_coef: bool |
|
        whether to learn independent smooth coefficients for every channel.
        When True, this is essentially the sPCEN from [2].
|
skip_transpose : bool |
|
        If False, uses the batch x time x channel convention of SpeechBrain.
|
If True, uses batch x channel x time convention. |
|
|
|
Example |
|
------- |
|
>>> inp_tensor = torch.rand([10, 50, 40]) |
|
>>> pcen = PCEN(40, alpha=0.96) # sPCEN |
|
>>> out_tensor = pcen(inp_tensor) |
|
>>> out_tensor.shape |
|
torch.Size([10, 50, 40]) |
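
    A single smoothing coefficient shared across channels (closer to the original
    PCEN of [1]) can be obtained with ``per_channel_smooth_coef=False``:

    >>> pcen = PCEN(40, per_channel_smooth_coef=False)
    >>> pcen(inp_tensor).shape
    torch.Size([10, 50, 40])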
|
""" |
|
|
|
def __init__( |
|
self, |
|
input_size, |
|
alpha: float = 0.96, |
|
smooth_coef: float = 0.04, |
|
delta: float = 2.0, |
|
root: float = 2.0, |
|
floor: float = 1e-12, |
|
trainable: bool = True, |
|
per_channel_smooth_coef: bool = True, |
|
skip_transpose: bool = False, |
|
): |
|
super().__init__() |
|
self._smooth_coef = smooth_coef |
|
self._floor = floor |
|
self._per_channel_smooth_coef = per_channel_smooth_coef |
|
self.skip_transpose = skip_transpose |
|
self.alpha = nn.Parameter( |
|
torch.ones(input_size) * alpha, requires_grad=trainable |
|
) |
|
self.delta = nn.Parameter( |
|
torch.ones(input_size) * delta, requires_grad=trainable |
|
) |
|
self.root = nn.Parameter( |
|
torch.ones(input_size) * root, requires_grad=trainable |
|
) |
|
|
|
self.ema = ExponentialMovingAverage( |
|
input_size, |
|
coeff_init=self._smooth_coef, |
|
per_channel=self._per_channel_smooth_coef, |
|
skip_transpose=True, |
|
trainable=trainable, |
|
) |
|
|
|
def forward(self, x): |
|
"""Returns the normalized input tensor. |
|
|
|
Arguments |
|
--------- |
|
x : torch.Tensor (batch, time, channels) |
|
input to normalize. |
|
|
|
Returns |
|
------- |
|
output : torch.Tensor |
|
The normalized outputs. |
|
""" |
|
if not self.skip_transpose: |
|
x = x.transpose(1, -1) |
|
alpha = torch.min( |
|
self.alpha, torch.tensor(1.0, dtype=x.dtype, device=x.device) |
|
) |
|
root = torch.max( |
|
self.root, torch.tensor(1.0, dtype=x.dtype, device=x.device) |
|
) |
|
ema_smoother = self.ema(x) |
|
one_over_root = 1.0 / root |
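        # PCEN: (x / (floor + M) ** alpha + delta) ** (1 / root) - delta ** (1 / root)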
|
output = ( |
|
x / (self._floor + ema_smoother) ** alpha.view(1, -1, 1) |
|
+ self.delta.view(1, -1, 1) |
|
) ** one_over_root.view(1, -1, 1) - self.delta.view( |
|
1, -1, 1 |
|
) ** one_over_root.view( |
|
1, -1, 1 |
|
) |
|
if not self.skip_transpose: |
|
output = output.transpose(1, -1) |
|
return output |
|
|