Source code for hybra.isac_mfcc

from typing import Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.functional import create_dct
from torchaudio.transforms import AmplitudeToDB

from hybra.utils import audfilters, circ_conv
from hybra.utils import plot_response as plot_response_
from hybra.utils import ISACgram as ISACgram_
from hybra import ISACSpec




[docs]
class ISACCC(nn.Module):
    """ISAC Cepstral Coefficients (ISACCC) extractor for speech features.
    
    ISACCC computes cepstral coefficients from ISAC (Invertible and Stable Auditory 
    filterbank with Customizable kernels) spectrograms using the Discrete Cosine 
    Transform (DCT). This provides compact features suitable for speech recognition 
    and audio classification tasks.

    The module wraps an ``ISACSpec`` instance (stored as ``self.isac``), applies a
    log or dB compression to its output, and projects the channel axis onto
    ``num_cc`` DCT basis vectors.

    Args:
        kernel_size (int, optional): Size of the filter kernels. If None, computed automatically. Default: None
        num_channels (int): Number of frequency channels. Default: 40
        stride (int, optional): Stride of the filterbank. If None, uses 25% overlap. Default: None
        num_cc (int): Number of cepstral coefficients to extract. Default: 13
        fc_max (float, optional): Maximum frequency on the auditory scale in Hz. 
            If None, uses fs//2. Default: None
        fmax (float, optional): Upper frequency limit in Hz for the channels that enter
            the DCT. If given, the number of entries of ``self.isac.fc`` that are
            ``<= fmax`` is counted and only that many leading channels of the
            spectrogram are kept. If None, all channels are used. Default: None
        fs (int): Sampling frequency in Hz. Default: 16000
        L (int): Signal length in samples. Default: 16000
        supp_mult (float): Support multiplier for kernel sizing. Default: 1.0
        power (float): Power applied to ISACSpec coefficients. Default: 2.0
        scale (str): Auditory scale type. One of {'mel', 'erb', 'bark', 'log10', 'elelog'}. 
            'elelog' is adapted for elephant hearing. Default: 'mel'
        is_log (bool): If True, the spectrogram is compressed with
            ``torch.log(x + 1e-10)``; if False, with torchaudio's ``AmplitudeToDB``
            (power input, ``top_db=80``). Default: False
        verbose (bool): Whether to print filterbank information during initialization. Default: True
        
    Raises:
        ValueError: If num_cc > num_channels
        
    Note:
        The DCT is applied with orthonormal basis functions for energy preservation.
        The number of cepstral coefficients should typically be much smaller than
        the number of frequency channels for dimensionality reduction.
        
    Example:
        >>> mfcc_extractor = ISACCC(num_channels=40, num_cc=13, fs=16000, L=16000)
        >>> x = torch.randn(1, 16000)
        >>> cepstral_coeffs = mfcc_extractor(x)
    """

[docs]
    def __init__(self,
                 kernel_size:Union[int,None]=None,
                 num_channels:int=40,
                 stride:Union[int,None]=None,
                 num_cc:int=13,
                 fc_max:Union[float,int,None]=None,
                 fmax:Union[float,int,None]=None,
                 fs:int=16000, 
                 L:int=16000,
                 supp_mult:float=1,
                 power:float=2.0,
                 scale:str='mel',
                 is_log:bool=False,
                 verbose:bool=True):
        """Build the ISAC spectrogram front end and the DCT projection matrix.

        Args:
            kernel_size: Forwarded to ``ISACSpec``.
            num_channels: Number of filterbank channels requested from ``ISACSpec``.
            stride: Forwarded to ``ISACSpec``.
            num_cc: Number of cepstral coefficients (columns of the DCT matrix).
            fc_max: Forwarded to ``ISACSpec``.
            fmax: Optional upper frequency limit. When set, only the channels
                counted by ``isac.fc <= fmax`` are used for the DCT.
            fs: Sampling frequency, forwarded to ``ISACSpec`` and kept as ``self.fs``.
            L: Signal length, forwarded to ``ISACSpec``.
            supp_mult: Forwarded to ``ISACSpec``.
            power: Forwarded to ``ISACSpec``.
            scale: Forwarded to ``ISACSpec``.
            is_log: Selects natural-log compression instead of dB in ``forward``.
            verbose: Forwarded to ``ISACSpec``.

        Raises:
            ValueError: If ``num_cc`` exceeds the number of channels that are
                actually fed to the DCT (before or after the ``fmax`` cut), or
                if ``fmax`` leaves no channel at all.
        """
        super().__init__()

        # Fail fast: no point in building (and printing) a filterbank that
        # cannot provide enough channels for the requested coefficients.
        if num_cc > num_channels:
            raise ValueError("Cannot select more cepstrum coefficients than # channels")

        # The compression (log or dB) is applied in forward(), so the inner
        # spectrogram is always requested with is_log=False.
        self.isac = ISACSpec(
            kernel_size=kernel_size,
            num_channels=num_channels,
            stride=stride,
            fc_max=fc_max,
            fs=fs,
            L=L,
            supp_mult=supp_mult,
            power=power,
            scale=scale,
            is_log=False,
            verbose=verbose
        )

        # Mirror the filterbank attributes that callers and plotting helpers read.
        self.fc_min = self.isac.fc_min
        self.fc_max = self.isac.fc_max
        self.kernel_min = self.isac.kernel_min
        self.fs = fs
        self.Ls = self.isac.Ls
        self.num_channels = num_channels
        self.num_cc = num_cc
        self.fmax = fmax
        self.is_log = is_log

        if self.fmax is not None:
            # torch.sum returns a 0-dim tensor; keep num_channels a plain int so
            # it behaves like the constructor argument everywhere it is used.
            # NOTE(review): using the count as a prefix length assumes isac.fc
            # is sorted in ascending order — confirm against ISACSpec.
            kept_channels = int(torch.sum(self.isac.fc <= self.fmax).item())
            if kept_channels < 1:
                raise ValueError(
                    "fmax excludes every channel; no channel is left for the DCT"
                )
            # Re-check the documented constraint on the reduced channel count.
            if self.num_cc > kept_channels:
                raise ValueError("Cannot select more cepstrum coefficients than # channels")
            self.num_channels = kept_channels

        # Orthonormal DCT matrix, registered as a buffer so it follows the
        # module across .to()/.cuda() calls and is saved in the state dict.
        dct_mat = create_dct(self.num_cc, self.num_channels, norm='ortho').to(self.isac.kernels.device)
        self.register_buffer("dct_mat", dct_mat)

        # Power spectrogram -> dB, with the dynamic range clamped to 80 dB.
        self.amplitude_to_DB = AmplitudeToDB("power", 80.0)



[docs]
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass to compute ISAC cepstral coefficients.
        
        Args:
            x (torch.Tensor): Input signal of shape (batch_size, signal_length) or (signal_length,)
            
        Returns:
            torch.Tensor: Cepstral coefficients of shape (batch_size, num_cc, num_frames)
            
        Note:
            The process involves: ISAC spectrogram -> log/dB conversion -> DCT transform.
        """
        coeff = self.isac(x)
        if self.fmax is not None:
            coeff = coeff[:, :self.num_channels, :]
        if self.is_log:
            coeff = torch.log(coeff + 1e-10)
        else:
            coeff = self.amplitude_to_DB(coeff)
        return torch.matmul(coeff.transpose(-1, -2), self.dct_mat).transpose(-1, -2)



[docs]
    def ISACgram(self, x: torch.Tensor) -> None:
        """Visualize the cepstral coefficients of a signal.

        The coefficients are computed with ``forward`` while gradient tracking
        is switched off, then handed to the ``ISACgram`` plotting helper from
        ``hybra.utils`` together with ``self.Ls`` and ``self.fs``.

        Args:
            x (torch.Tensor): Input signal to visualize

        Note:
            This method displays a plot of the cepstral coefficients and does not return values.
        """
        # Plotting never needs a backward pass, so skip building the graph.
        with torch.set_grad_enabled(False):
            cepstra = self.forward(x)
        # The second positional argument is deliberately left as None.
        plot_args = (cepstra, None, self.Ls, self.fs)
        ISACgram_(*plot_args)



[docs]
    def plot_response(self) -> None:
        """Plot frequency response of the underlying ISAC filters.

        Only the first ``self.num_channels`` kernels are plotted, i.e. the same
        channels that ``forward`` feeds into the DCT.
        
        Note:
            This method displays a plot and does not return values.
        """
        # Tensor.numpy() is only defined for CPU tensors, so bring the kernels
        # to host memory first; this is a no-op when they already live there.
        kernels = self.isac.kernels[:self.num_channels, :].detach().cpu().numpy()
        plot_response_(
            g=kernels,
            fs=self.isac.fs,
            scale=True,
            fc_min=self.isac.fc_min,
            fc_max=self.isac.fc_max,
            kernel_min=self.isac.kernel_min,
        )
Source code for hybra.isac_mfcc

HybrA-Filterbanks

Navigation

Related Topics

Versions