Phi-4-multimodal-instruct

@@ -1037,6 +1037,24 @@ class Phi4MMMLP(nn.Module):
         return self.down_proj(up_states)
 # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
@@ -1134,7 +1152,7 @@ class Phi4MMAttention(nn.Module):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
@@ -1229,7 +1247,7 @@ class Phi4MMFlashAttention2(Phi4MMAttention):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = (
@@ -1351,7 +1369,7 @@ class Phi4MMSdpaAttention(Phi4MMAttention):
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

         return self.down_proj(up_states)
+def _get_usable_past_kv_length(cache: Cache, new_seq_length: int, layer_idx: int = 0) -> int:
+    """Return how many cached positions remain usable once `new_seq_length` tokens are appended.
+
+    Stand-in for the legacy `cache.get_usable_length(new_seq_length, layer_idx)` call that the
+    attention classes in this file used to make, expressed through `get_seq_length` and
+    `get_max_cache_shape` instead.
+
+    Args:
+        cache: Key/value cache object. Only `get_seq_length` and `get_max_cache_shape` are used.
+        new_seq_length: Number of positions about to be added to the cache.
+        layer_idx: Index of the layer whose cache state is queried.
+
+    Returns:
+        The cached length reported for `layer_idx`, or `max_length - new_seq_length` when the cache
+        reports a fixed maximum that the new tokens would overflow. Returns 0 when the cache exposes
+        no `get_seq_length` method.
+    """
+    get_seq_length = getattr(cache, "get_seq_length", None)
+    if get_seq_length is None:
+        # Nothing to measure with: behave as if no past tokens are cached.
+        return 0
+    # Queried exactly once; a failure here is a genuine error and is left to propagate.
+    previous_length = get_seq_length(layer_idx)
+
+    max_length = None
+    get_max_cache_shape = getattr(cache, "get_max_cache_shape", None)
+    if get_max_cache_shape is not None:
+        # Try the per-layer signature first, then the argument-less one, so the size bound is still
+        # applied if the installed cache class does not accept a layer index.
+        for args in ((layer_idx,), ()):
+            try:
+                max_length = get_max_cache_shape(*args)
+            except TypeError:
+                # Most likely a signature mismatch: fall through to the next call form.
+                continue
+            except (AttributeError, IndexError, NotImplementedError):
+                # The bound is optional: a cache that cannot report it is treated as unbounded.
+                break
+            else:
+                break
+
+    # `None` or -1 means "no fixed maximum" (dynamic layers return -1, static layers return an int).
+    if max_length is None or max_length == -1:
+        return previous_length
+    if previous_length + new_seq_length > max_length:
+        return max_length - new_seq_length
+    return previous_length
 # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
+            kv_seq_len += _get_usable_past_kv_length(past_key_value, kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
+            kv_seq_len += _get_usable_past_kv_length(past_key_value, kv_seq_len, self.layer_idx)
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = (
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
+            kv_seq_len += _get_usable_past_kv_length(past_key_value, kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)