Upload model

Browse files

Files changed (7) hide show

README.md +199 -0
biome_model.py +35 -0
biome_modules.py +246 -0
config.json +36 -0
configuration_biome.py +62 -0
model.safetensors +3 -0
modeling_biome.py +245 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

biome_model.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+from transformers import PreTrainedModel
+from .modeling_biome import BioME
+from .configuration_biome import BioMEConfig
+class BioMEModel(PreTrainedModel):
+    """``PreTrainedModel`` wrapper that exposes the ``BioME`` encoder.
+    The encoder lives under ``self.model``. ``forward`` hands its arguments
+    straight to it and keeps only the first two of the six values it returns.
+    """
+    config_class = BioMEConfig
+    def __init__(self, config: BioMEConfig):
+        super().__init__(config)
+        self.model = BioME(config)
+        self.post_init()
+    def forward(
+        self,
+        wavs: torch.Tensor,
+        start_pos: int = 0,
+        padding_mask: torch.Tensor = None,
+        fbank_mean: float = 15.41663,
+        fbank_std: float = 6.55582,
+    ):
+        """
+        Args:
+            wavs (torch.Tensor): Waveforms, forwarded unchanged to the encoder
+            start_pos (int): Position offset forwarded to the encoder
+            padding_mask (torch.Tensor, optional): Forwarded to the encoder
+            fbank_mean (float): Mean used to normalise the filterbank features
+            fbank_std (float): Std used to normalise the filterbank features
+        Returns:
+            dict: ``last_hidden_state`` (final encoder output) and
+            ``hidden_states`` (the per-layer outputs collected by the encoder)
+        """
+        encoder_outputs = self.model(
+            wavs,
+            start_pos=start_pos,
+            padding_mask=padding_mask,
+            fbank_mean=fbank_mean,
+            fbank_std=fbank_std,
+        )
+        # The encoder returns a 6-tuple; only the first two entries are exposed.
+        last_hidden_state, per_layer_states, _, _, _, _ = encoder_outputs
+        result = {}
+        result["last_hidden_state"] = last_hidden_state
+        result["hidden_states"] = per_layer_states
+        return result

biome_modules.py ADDED Viewed

	@@ -0,0 +1,246 @@

+"""
+Our Transformer-based model for the AudioSet dataset.
+The model is heavily inspired by the Llama-3 model:
+    reference: https://github.com/meta-llama/llama3/blob/main/llama/model.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+from .configuration_biome import BioMEConfig
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+    """Build the table of complex rotary factors used by the attention layers.
+    Args:
+        dim (int): Per-head feature dimension; ``dim // 2`` frequencies are produced
+        end (int): Number of positions to tabulate
+        theta (float): Base of the geometric frequency progression
+    Returns:
+        torch.Tensor: complex64 tensor of shape (end, dim // 2) holding
+        unit-magnitude values ``exp(i * position * frequency)``
+    """
+    half = dim // 2
+    exponents = torch.arange(0, dim, 2)[:half].float() / dim
+    inv_freq = 1.0 / (theta ** exponents)
+    positions = torch.arange(end, device=inv_freq.device, dtype=torch.float32)
+    # Outer product gives the rotation angle for every (position, frequency) pair.
+    angles = torch.outer(positions, inv_freq)
+    return torch.polar(torch.ones_like(angles), angles)  # complex64
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    """View ``freqs_cis`` so that it broadcasts against ``x``.
+    ``freqs_cis`` must have shape ``(x.shape[1], x.shape[-1])``. The result
+    keeps those two sizes on axes 1 and -1 and has size 1 on every other axis.
+    """
+    rank = x.ndim
+    assert 0 <= 1 < rank
+    seq_len, last_dim = x.shape[1], x.shape[-1]
+    assert freqs_cis.shape == (seq_len, last_dim)
+    target_shape = [1] * rank
+    target_shape[1] = seq_len
+    target_shape[-1] = last_dim
+    return freqs_cis.view(*target_shape)
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Rotate queries and keys by the given complex rotary factors.
+    The last axis of each input is read as consecutive (real, imag) pairs,
+    multiplied by ``freqs_cis`` in the complex domain, and flattened back.
+    The computation runs in float32 and each result is cast back to the
+    dtype of its own input.
+    """
+    q_pairs = xq.float().reshape(*xq.shape[:-1], -1, 2)
+    k_pairs = xk.float().reshape(*xk.shape[:-1], -1, 2)
+    q_complex = torch.view_as_complex(q_pairs)
+    k_complex = torch.view_as_complex(k_pairs)
+    rotation = reshape_for_broadcast(freqs_cis, q_complex)
+    q_rotated = torch.view_as_real(q_complex * rotation).flatten(3)
+    k_rotated = torch.view_as_real(k_complex * rotation).flatten(3)
+    return q_rotated.type_as(xq), k_rotated.type_as(xk)
+class GroupedQueryAttention(nn.Module):
+    """
+    Self-attention with grouped query heads and rotary position embeddings.
+    Paper: 'GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints' (https://arxiv.org/pdf/2305.13245)
+    Code heavily inspired by:
+    - https://github.com/meta-llama/llama3/blob/main/llama/model.py
+    - https://docs.pytorch.org/torchtune/0.4/_modules/torchtune/modules/attention.html
+    Args:
+        dim (int): Input and output feature dimension. Must be divisible by
+            num_q_heads. Default: 512
+        num_q_heads (int): Number of query heads. Must be a multiple of
+            num_kv_heads. Default: 16
+        num_kv_heads (int): Number of key/value heads shared by the query
+            heads. Default: 4
+        dropout (float): Attention dropout probability, applied only in
+            training mode. Default: 0
+        bias (bool): Use bias in projections. Default: True
+        device (torch.device, optional): Device for parameters
+        dtype (torch.dtype, optional): Data type for parameters
+    Shape:
+        - Input: (B, L, dim)
+        - Output: (B, L, dim)
+        where B is batch size and L is sequence length
+    """
+    def __init__(
+        self,
+        dim: int = 512,
+        num_q_heads: int = 16,
+        num_kv_heads: int = 4,
+        dropout: float = 0.0,
+        bias: bool = True,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        super().__init__()
+        self.dim = dim
+        self.num_q_heads = num_q_heads
+        self.num_kv_heads = num_kv_heads
+        self.dropout = dropout
+        self.bias = bias
+        factory_kwargs = {"device": device, "dtype": dtype}
+        assert dim % num_q_heads == 0, "Embedding dim is not divisible by nheads"
+        # Grouped attention shares each key/value head across an equal number of
+        # query heads, so fail here rather than deep inside the attention kernel.
+        assert (
+            num_q_heads % num_kv_heads == 0
+        ), "num_q_heads is not divisible by num_kv_heads"
+        self.dim_per_head = dim // num_q_heads
+        self.q_proj = nn.Linear(self.dim, num_q_heads * self.dim_per_head, bias=bias, **factory_kwargs)
+        self.k_proj = nn.Linear(self.dim, num_kv_heads * self.dim_per_head, bias=bias, **factory_kwargs)
+        self.v_proj = nn.Linear(self.dim, num_kv_heads * self.dim_per_head, bias=bias, **factory_kwargs)
+        self.out_proj = nn.Linear(num_q_heads * self.dim_per_head, self.dim, bias=bias, **factory_kwargs)
+    def forward(
+        self,
+        x: torch.Tensor,
+        start_pos: int,
+        freqs_cis: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        is_causal: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim)
+            start_pos (int): Accepted for interface compatibility; this method
+                does not read it (the caller is expected to pass rotary factors
+                that are already sliced to the right positions)
+            freqs_cis (torch.Tensor): Complex rotary factors of shape
+                (seq_len, dim_per_head // 2)
+            attn_mask (torch.Tensor): Attention mask, forwarded to
+                ``scaled_dot_product_attention``
+            is_causal (bool): If True, applies a causal mask to prevent attending to future positions.
+        Returns:
+            torch.Tensor: Output tensor of shape (batch_size, seq_len, dim)
+        """
+        bsz, seqlen, _ = x.shape
+        # Step 1: Apply projections
+        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+        # Step 2: Split the heads before the scaled dot-product attention
+        xq = xq.view(bsz, seqlen, self.num_q_heads, self.dim_per_head)
+        xk = xk.view(bsz, seqlen, self.num_kv_heads, self.dim_per_head)
+        xv = xv.view(bsz, seqlen, self.num_kv_heads, self.dim_per_head)
+        # Step 3: Apply rotary embeddings
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+        # Step 4: Apply scaled dot-product attention
+        # Note: torch sdpa expects (batch_size, num_heads, seq_len, dim_per_head)
+        attn_output = (
+            F.scaled_dot_product_attention(
+                xq.transpose(1, 2),
+                xk.transpose(1, 2),
+                xv.transpose(1, 2),
+                attn_mask=attn_mask,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=is_causal,
+                enable_gqa=True,
+            )
+            .transpose(1, 2)
+            .flatten(-2)  # (B, nheads, L, dim_per_head) -> (B, L, nheads * dim_per_head)
+        )
+        return self.out_proj(attn_output)
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis with a learned scale.
+    The statistics are computed in float32; the normalised values are cast
+    back to the input dtype before the per-feature ``weight`` is applied.
+    """
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x):
+        # eps is added to the mean square, inside the reciprocal square root.
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+    def forward(self, x):
+        normalised = self._norm(x.float())
+        return normalised.type_as(x) * self.weight
+class FiLM(nn.Module):
+    """
+    A Feature-wise Linear Modulation Layer from
+    'FiLM: Visual Reasoning with a General Conditioning Layer'
+    A single linear layer maps the context to a scale (gamma) and a shift
+    (beta) per feature; the output is ``gamma * x + beta``.
+    """
+    def __init__(self, d_model: int, context_dim: int):
+        super().__init__()
+        self.d_model = d_model
+        self.context_dim = context_dim
+        # One projection produces both gamma and beta (hence 2 * d_model).
+        self.shared_modulator = nn.Linear(context_dim, 2 * d_model)
+    def forward(self, x, ctx):
+        """
+        Arguments
+        ----------
+        x: torch.Tensor
+            Activations / Tensor in the Transformer of shape (B, T, d_model)
+        ctx: torch.Tensor
+            Side channel information. It can be (B, F) or (B, T, F).
+            If 2-dimensional, the same modulation is applied to every time step.
+            If 3-dimensional, note that the sequence-dimension, T, must match
+            the input tensor where you are going to combine the FiLM'ed result.
+        Returns
+        ----------
+        torch.Tensor
+            Modulated activations with the same shape as ``x``
+        """
+        params = self.shared_modulator(ctx)
+        if params.dim() == 2:
+            # (B, 2 * d_model) -> (B, 1, 2 * d_model) so it broadcasts over T.
+            # A 3-D context already carries its own T axis and is used as-is;
+            # flattening it (as a blanket view would) breaks the broadcast.
+            params = params.unsqueeze(1)
+        gammas, betas = params.chunk(2, dim=-1)
+        return (gammas * x) + betas
+class TransformerFFN(nn.Module):
+    """Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))``.
+    ``w1`` and ``w3`` project ``dim -> hidden_dim``; ``w2`` projects back to ``dim``.
+    """
+    def __init__(self, dim, hidden_dim, bias: bool = False):
+        super().__init__()
+        # Creation order (w1, w2, w3) is kept: it fixes the parameter order
+        # and the order in which the random initialisers are drawn.
+        self.w1 = nn.Linear(dim, hidden_dim, bias=bias)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=bias)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=bias)
+    def forward(self, x):
+        gate = F.silu(self.w1(x))
+        value = self.w3(x)
+        return self.w2(gate * value)
+class TransformerEncoderLayer(nn.Module):
+    """One pre-norm encoder block: attention, optional FiLM modulation, FFN.
+    When ``config.use_context`` is set, the attention output is modulated by a
+    FiLM layer driven by the side-channel context before the feed-forward
+    block. The FiLM output replaces the hidden state (no residual is added).
+    """
+    def __init__(self, config: BioMEConfig):
+        super().__init__()
+        self.use_context = config.use_context
+        if self.use_context:
+            self.film = FiLM(
+                d_model=config.hidden_size, context_dim=config.ctx_hidden_size
+            )
+            self.film_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+            self.film_norm_ctx = RMSNorm(config.ctx_hidden_size, eps=config.norm_eps)
+        self.attention = GroupedQueryAttention(
+            dim=config.hidden_size,
+            num_q_heads=config.num_query_heads,
+            num_kv_heads=config.num_kv_heads,
+            dropout=config.dropout,
+            bias=config.bias,
+        )
+        self.feed_forward = TransformerFFN(
+            dim=config.hidden_size,
+            hidden_dim=config.ffn_hidden_size,
+        )
+        self.attention_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+    def forward(
+        self,
+        x: torch.Tensor,
+        start_pos: int,
+        freqs_cis: torch.Tensor,
+        ctx: torch.Tensor = None,
+        padding_mask: torch.Tensor = None,
+    ):
+        """
+        Args:
+            x (torch.Tensor): Hidden states; indexed as ``x[padding_mask]``
+                when a mask is given
+            start_pos (int): Forwarded to the attention module
+            freqs_cis (torch.Tensor): Rotary factors forwarded to the attention module
+            ctx (torch.Tensor, optional): Side-channel context for FiLM; only
+                read when the layer was built with ``use_context=True``
+            padding_mask (torch.Tensor, optional): Positions selected by this
+                mask are zeroed before attention. The mask is not forwarded to
+                the attention call itself.
+        Returns:
+            torch.Tensor: Output hidden states
+        """
+        if padding_mask is not None:
+            # Zero the padded positions on a copy. Writing into ``x`` directly
+            # would also change the caller's tensor, e.g. a previous layer's
+            # output that the caller has already stored.
+            x = x.clone()
+            x[padding_mask] = 0
+        h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis)
+        if self.use_context:
+            h = self.film(self.film_norm(h), self.film_norm_ctx(ctx))
+        out = h + self.feed_forward(self.ffn_norm(h))
+        return out

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "architectures": [
+    "BioMEModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_biome.BioMEConfig",
+    "AutoModel": "biome_model.BioMEModel"
+  },
+  "bias": false,
+  "context_type": "mss",
+  "ctx_hidden_size": 258,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "embed_dim": 384,
+  "ffn_hidden_size": 1344,
+  "frame_length": 25,
+  "frame_shift": 10,
+  "hidden_size": 384,
+  "input_patch_size": 16,
+  "max_cache_size": 10,
+  "max_seq_len": 1024,
+  "model_type": "biome",
+  "mss_n_fft1": 256,
+  "mss_n_fft2": 256,
+  "mss_win_shift": 128,
+  "mss_win_size": 256,
+  "n_mels": 128,
+  "norm_eps": 1e-05,
+  "num_kv_heads": 4,
+  "num_layers": 12,
+  "num_query_heads": 8,
+  "rope_theta": 10000.0,
+  "sample_rate": 16000,
+  "transformers_version": "5.0.0",
+  "use_context": true
+}

configuration_biome.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from transformers import PreTrainedConfig
+class BioMEConfig(PreTrainedConfig):
+    """Configuration container for the BioME model.
+    Every named constructor argument is stored unchanged as an instance
+    attribute of the same name. Any additional keyword arguments are forwarded
+    to ``PreTrainedConfig.__init__``.
+    """
+    model_type = "biome"
+    def __init__(
+        self,
+        num_layers: int = 12,
+        num_query_heads: int = 12,
+        num_kv_heads: int = 4,
+        embed_dim: int = 512,
+        hidden_size: int = 384,
+        ffn_hidden_size: int = 1344,
+        dropout: float = 0.1,
+        sample_rate: int = 16000,
+        frame_length: int = 25,
+        frame_shift: int = 10,
+        n_mels: int = 128,
+        input_patch_size: int = 16,
+        norm_eps: float = 1e-5,
+        max_seq_len: int = 1024,
+        rope_theta: float = 10000.0,
+        bias: bool = False,
+        use_context: bool = True,
+        context_type: str = "mss",
+        max_cache_size: int = 10,
+        ctx_hidden_size: int = 258,
+        mss_n_fft1: int = 256,
+        mss_n_fft2: int = 256,
+        mss_win_size: int = 256,
+        mss_win_shift: int = 128,
+        **kwargs,
+    ):
+        # Base-class initialisation runs first, then the model-specific fields are set.
+        super().__init__(**kwargs)
+        # Transformer Parameters
+        self.num_layers = num_layers
+        self.num_query_heads = num_query_heads
+        self.num_kv_heads = num_kv_heads
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.dropout = dropout
+        # NOTE(review): the next five fields look like audio front-end settings
+        # (filterbank extraction and patching) rather than transformer ones — confirm.
+        self.sample_rate = sample_rate
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.n_mels = n_mels
+        self.input_patch_size = input_patch_size
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.rope_theta = rope_theta
+        self.bias = bias
+        # Context Parameters
+        self.use_context = use_context
+        self.context_type = context_type
+        self.max_cache_size = max_cache_size
+        self.ctx_hidden_size = ctx_hidden_size
+        # "mss" presumably stands for the modulation-spectrum side channel — TODO confirm.
+        self.mss_n_fft1 = mss_n_fft1
+        self.mss_n_fft2 = mss_n_fft2
+        self.mss_win_size = mss_win_size
+        self.mss_win_shift = mss_win_shift

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3586650791ff61bdfe48be1f7c7564c97bb052dbc53b3275d12135fb576146a
+size 105578728

modeling_biome.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import math
+import torch
+import torch.nn as nn
+import torchaudio.compliance.kaldi as ta_kaldi
+from .biome_modules import RMSNorm
+from .configuration_biome import BioMEConfig
+from .biome_modules import precompute_freqs_cis
+from .biome_modules import TransformerEncoderLayer
+class BioME(nn.Module):
+    """Audio encoder: Kaldi filterbank -> patch embedding -> transformer stack.
+    Waveforms are converted to filterbank features, cut into square patches by
+    a strided convolution, normalised, optionally projected to ``hidden_size``
+    and passed through ``num_layers`` encoder layers with rotary position
+    factors. When ``cfg.use_context`` is set, a modulation-spectrum descriptor
+    of the waveform is computed and handed to every layer as context.
+    """
+    def __init__(self, cfg: BioMEConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.n_layers = cfg.num_layers
+        # Non-overlapping square patches over the (time, mel) plane.
+        self.patch_embedding = nn.Conv2d(
+            1,
+            cfg.embed_dim,
+            kernel_size=cfg.input_patch_size,
+            stride=cfg.input_patch_size,
+            bias=False,
+        )
+        self.dropout_input = nn.Dropout(cfg.dropout)
+        self.post_extract_proj = (
+            nn.Linear(cfg.embed_dim, cfg.hidden_size)
+            if cfg.embed_dim != cfg.hidden_size
+            else nn.Identity()
+        )
+        self.layers = torch.nn.ModuleList(
+            TransformerEncoderLayer(cfg) for _ in range(cfg.num_layers)
+        )
+        self.feature_norm = RMSNorm(cfg.embed_dim, eps=cfg.norm_eps)
+        # Plain attribute (not a parameter or buffer); moved to the input's
+        # device lazily in ``forward``.
+        self.freqs_cis = self._get_freqs_cis()
+        self.modulation_cache = {}
+        # Weights initialization
+        deep_norm_beta = math.pow(8 * cfg.num_layers, -1 / 4)
+        for layer in self.layers:
+            attention, feed_forward = layer.attention, layer.feed_forward
+            nn.init.xavier_normal_(attention.k_proj.weight, gain=1)
+            nn.init.xavier_normal_(attention.v_proj.weight, gain=deep_norm_beta)
+            nn.init.xavier_normal_(attention.q_proj.weight, gain=1)
+            nn.init.xavier_normal_(attention.out_proj.weight, gain=deep_norm_beta)
+            nn.init.xavier_normal_(feed_forward.w1.weight, gain=deep_norm_beta)
+            nn.init.xavier_normal_(feed_forward.w2.weight, gain=deep_norm_beta)
+            nn.init.xavier_normal_(feed_forward.w3.weight, gain=deep_norm_beta)
+    def forward_padding_mask(
+        self,
+        features: torch.Tensor,
+        padding_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Reduce an input-resolution padding mask to one flag per feature step.
+        Trailing mask entries that do not fill a whole group are dropped. A
+        step is flagged as padding only if every entry in its group is True.
+        """
+        extra = padding_mask.size(1) % features.size(1)
+        if extra > 0:
+            padding_mask = padding_mask[:, :-extra]
+        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
+        padding_mask = padding_mask.all(-1)
+        return padding_mask
+    def forward(
+        self,
+        wavs: torch.Tensor,
+        start_pos: int,
+        padding_mask: torch.Tensor = None,
+        fbank_mean: float = 15.41663,
+        fbank_std: float = 6.55582,
+        apply_mask: bool = False,
+    ):
+        """
+        Args:
+            wavs (torch.Tensor): Batch of waveforms of shape (B, n_samples)
+            start_pos (int): Offset into the rotary table for the first patch
+            padding_mask (torch.Tensor, optional): Input-resolution padding
+                mask; reduced to patch resolution before use
+            fbank_mean (float): Mean used to normalise the filterbank features
+            fbank_std (float): Std used to normalise the filterbank features
+            apply_mask (bool): If True, zero a random subset of patches with
+                probability ``cfg.mlm_mask_prob``
+        Returns:
+            tuple: ``(x, layer_results, padding_mask, ids_restore, kept_mask,
+            patch_padding_mask)``. ``ids_restore`` is always None;
+            ``kept_mask`` is None unless ``apply_mask`` is True.
+        Raises:
+            AttributeError: If ``apply_mask`` is True but the config does not
+                define ``mlm_mask_prob``.
+        """
+        # 1. Get input features
+        fbank = self.wav_to_fbank(wavs, fbank_mean=fbank_mean, fbank_std=fbank_std)
+        # Side-channel (MSAB) features; the layers only read them when
+        # use_context is on, so skip the two FFT passes otherwise.
+        ctx = self.get_modulation_spectrum(wavs) if self.cfg.use_context else None
+        # 2. Patchify the input
+        features = self.feature_patchfy(fbank)
+        patch_padding_mask = None
+        if padding_mask is not None:
+            padding_mask = self.forward_padding_mask(features, padding_mask)
+            patch_padding_mask = padding_mask.clone()
+        ids_restore, kept_mask = None, None
+        if apply_mask:
+            mask_prob = getattr(self.cfg, "mlm_mask_prob", None)
+            if mask_prob is None:
+                raise AttributeError(
+                    "apply_mask=True requires `mlm_mask_prob` to be set on the config"
+                )
+            bsz, n_patches, _ = features.shape
+            u = torch.rand(bsz, n_patches, device=features.device)
+            to_mask = u < mask_prob
+            kept_mask = ~to_mask
+            features = features.masked_fill(to_mask.unsqueeze(-1), 0.0)
+        features = self.post_extract_proj(features)
+        _, seqlen, _ = features.shape
+        # 3. Apply positional encoding
+        if self.freqs_cis.device.type == "meta":
+            # The table holds no data when the module was built on the meta
+            # device, so rebuild it.
+            self.freqs_cis = self._get_freqs_cis()
+        self.freqs_cis = self.freqs_cis.to(features.device)
+        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
+        # 4. Apply transformer layers
+        x = self.dropout_input(features)
+        layer_results = []
+        for layer in self.layers:
+            # The position offset is already baked into the sliced freqs_cis.
+            x = layer(
+                x, start_pos=0, freqs_cis=freqs_cis, ctx=ctx, padding_mask=padding_mask
+            )
+            layer_results.append(x)
+        # 5. Apply post-processing
+        return x, layer_results, padding_mask, ids_restore, kept_mask, patch_padding_mask
+    def wav_to_fbank(
+        self,
+        source: torch.Tensor,
+        fbank_mean: float = -4.268,
+        fbank_std: float = 4.569,
+    ):
+        """Compute normalised Kaldi filterbank features for each waveform.
+        Each waveform is scaled by 2**15 before extraction. The stacked result
+        is normalised as ``(fbank - fbank_mean) / (2 * fbank_std)``.
+        """
+        fbanks = []
+        for waveform in source:
+            # presumably rescales [-1, 1] audio to the 16-bit range Kaldi expects — confirm
+            waveform = waveform.unsqueeze(0) * 2**15
+            fbank = ta_kaldi.fbank(
+                waveform,
+                num_mel_bins=self.cfg.n_mels,
+                sample_frequency=self.cfg.sample_rate,
+                frame_length=self.cfg.frame_length,
+                frame_shift=self.cfg.frame_shift,
+                use_energy=False,
+                window_type="hanning",
+                dither=0.0,
+            )
+            fbanks.append(fbank)
+        fbank = torch.stack(fbanks, dim=0)
+        fbank = (fbank - fbank_mean) / (2 * fbank_std)
+        return fbank
+    def feature_patchfy(self, rep: torch.Tensor) -> torch.Tensor:
+        """
+        Patchify the feature representation.
+        Adds a channel axis, applies the strided patch convolution, flattens
+        the patch grid into one sequence axis and normalises each patch
+        embedding. Returns a tensor of shape (B, n_patches, embed_dim).
+        """
+        rep = rep.unsqueeze(1)
+        features = self.patch_embedding(rep)
+        features = features.reshape(features.shape[0], features.shape[1], -1)
+        features = features.transpose(1, 2)
+        features = self.feature_norm(features)
+        return features
+    def _get_freqs_cis(self):
+        """Build the rotary table for ``2 * max_seq_len`` positions."""
+        return precompute_freqs_cis(
+            self.cfg.hidden_size // self.cfg.num_query_heads,
+            self.cfg.max_seq_len * 2,
+            self.cfg.rope_theta,
+        )
+    @torch.no_grad()
+    def normalize_fft(self, spec_data, window, n_samples, n_fft, fs):
+        """Turn a one-sided spectrum into a normalised power spectral density.
+        The input is divided by the window RMS, squared in magnitude, scaled
+        by ``1 / n_fft**2``, doubled on every bin that has a mirrored twin,
+        and divided by the bin width ``fs / n_fft``. The input tensor is not
+        modified.
+        Returns:
+            tuple: ``(f_delta, spec_data)`` — the bin width and the scaled PSD
+        """
+        # Normalizations
+        win_rms = torch.sqrt(window.pow(2.0).sum() / n_samples)
+        # Compute the power spectrogram (out of place: leave the caller's tensor intact)
+        spec_data = spec_data / win_rms
+        spec_data = spec_data.abs().pow(
+            2.0
+        )  # same as X_pwr = abs(np.multiply(Xt, np.conj(Xt)))
+        spec_data *= 1.0 / n_fft**2  # make it orthonormal
+        if n_fft % 2 != 0:
+            spec_data[
+                :, 1:, :
+            ] *= 2  # double all frequency components except DC component
+        else:
+            spec_data[
+                :, 1:-1, :
+            ] *= 2  # double all frequency components except DC and fs/2 components
+        f_delta = fs / n_fft
+        spec_data = torch.divide(spec_data, f_delta)  # scale by frequency delta
+        return f_delta, spec_data
+    @torch.no_grad()
+    def get_modulation_spectrum(self, wavs: torch.Tensor):
+        """Compute a fixed-size modulation-spectrum descriptor per waveform.
+        An STFT power spectrogram is computed first; a second FFT along the
+        time axis of that spectrogram yields the modulation spectrum. The
+        result concatenates its mean over axis 1 and its mean over axis 2,
+        giving one vector per waveform.
+        """
+        # number of samples per waveform
+        _, n_samples = wavs.shape
+        # Step 1: compute STFT spectrogram
+        window = torch.hamming_window(
+            self.cfg.mss_win_size, periodic=True, device=wavs.device
+        )
+        spec_data = torch.stft(
+            wavs,
+            n_fft=self.cfg.mss_n_fft1,
+            win_length=self.cfg.mss_win_size,
+            hop_length=self.cfg.mss_win_shift,
+            window=window,
+            return_complex=True,
+            onesided=True,
+        )  # torch.stft pads the signal, whereas the old code removed the last window if necessary
+        _, _, n_windows = spec_data.shape
+        # Normalizations
+        _, spec_data = self.normalize_fft(
+            spec_data, window, n_samples, self.cfg.mss_n_fft1, self.cfg.sample_rate
+        )
+        # Step 2: Modulation Features
+        # modulation sampling frequency
+        fs_mod = 1 / (self.cfg.mss_win_shift / self.cfg.sample_rate)
+        n_fft2 = self.cfg.mss_n_fft2
+        if n_fft2 is None:
+            n_fft2 = n_windows
+        # the AM analysis is made in the Amplitude derived from the Power Spectrogram
+        window = torch.hamming_window(n_windows, periodic=True, device=wavs.device)
+        spec_data = torch.multiply(spec_data, window)
+        mod_psd = torch.fft.rfft(spec_data, n=n_fft2, dim=2)
+        # NOTE(review): this reuses the waveform's n_samples although the window
+        # here has n_windows entries — confirm this normalisation is intended.
+        _, mod_psd = self.normalize_fft(mod_psd, window, n_samples, n_fft2, fs_mod)
+        return torch.cat([mod_psd.mean(dim=1), mod_psd.mean(dim=2)], dim=1)