fixie-ai
/

ultravox-v0_7-glm-4_6

Audio-Text-to-Text

image-feature-extraction

Model card | Files and versions

zqhuang committed on Dec 2, 2025

Commit

3836184

·

verified ·

1 Parent(s): aec6738

Upload UltravoxPipeline

Files changed (1) hide show

ultravox_model.py +0 -53

ultravox_model.py CHANGED Viewed

@@ -174,59 +174,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                 yield i_b, audio_index
                 audio_index += 1
-    def _select_embedings(
-        self,
-        inputs_embeds: torch.Tensor,
-        start_idx: torch.Tensor,
-        lengths: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Select a contiguous slice per batch starting at `start_idx[b]` with
-        length `lengths[b]`, returned in a compact, front-aligned tensor.
-        Any positions in the output that correspond to padding are zeroed out.
-        Supports both 3D tensors (B, T, D) and 2D tensors (B, T).
-        """
-        B = inputs_embeds.size(0)
-        T = inputs_embeds.size(1)
-        max_length = int(lengths.max().item())
-        if max_length == 0:
-            # Return an empty slice with correct rank
-            if inputs_embeds.dim() == 3:
-                return inputs_embeds.new_zeros((B, 0, inputs_embeds.size(2)))
-            else:
-                return inputs_embeds.new_zeros((B, 0), dtype=inputs_embeds.dtype)
-        # --- Create indices to gather ---
-        idx = torch.arange(
-            max_length, device=inputs_embeds.device, dtype=start_idx.dtype
-        )  # (Lmax,)
-        pos = start_idx.unsqueeze(1) + idx.unsqueeze(0)  # (B, Lmax)
-        # Clamp to prevent out-of-bounds gather, we will mask the invalid values later
-        pos = pos.clamp_(0, T - 1)
-        # --- Create mask for valid output positions ---
-        mask = idx.unsqueeze(0) < lengths.unsqueeze(1)  # (B, Lmax)
-        # --- Gather and mask ---
-        if inputs_embeds.dim() == 3:
-            D = inputs_embeds.size(2)
-            gathered = inputs_embeds.gather(
-                1, pos.unsqueeze(-1).expand(B, max_length, D)
-            )
-            # Zero out the padded values
-            gathered = gathered * mask.unsqueeze(-1)
-            return gathered
-        elif inputs_embeds.dim() == 2:
-            gathered = inputs_embeds.gather(1, pos)
-            # Zero out the padded values
-            gathered = gathered * mask
-            return gathered
-        else:
-            raise ValueError(
-                f"_select_embedings expects 2D or 3D tensors, got {inputs_embeds.dim()}D"
-            )
     def _decoder_layers(self):
         """Return decoder blocks across architectures (LLaMA/GLM/etc.)."""
         lm = self.language_model

                 yield i_b, audio_index
                 audio_index += 1
     def _decoder_layers(self):
         """Return decoder blocks across architectures (LLaMA/GLM/etc.)."""
         lm = self.language_model