Instructions for using nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", trust_remote_code=True, dtype="auto")

Inference
Local Apps Settings

vLLM

How to use nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5

SGLang

How to use nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 with Docker Model Runner:
```
docker model run hf.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
```

variable_cache.py compatibility for transformers v4.57.2 / Python 3.12

#12

by NePe - opened Nov 24, 2025

base: refs/heads/main ← from: refs/pr/12

Discussion Files changed

+24

-14

Files changed (2) hide show

modeling_decilm.py +7 -3
variable_cache.py +17 -11

modeling_decilm.py CHANGED Viewed

@@ -27,7 +27,7 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers import GenerationConfig
-from transformers.generation.utils import NEED_SETUP_CACHE_CLASSES_MAPPING, GenerationMixin, GenerateOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.utils import (
@@ -809,8 +809,9 @@ class DeciLMPreTrainedModel(PreTrainedModel):
     ) -> tuple[GenerationConfig, dict]:
         # DeciLM-specific code
         generation_config, model_kwargs = super()._prepare_generation_config(generation_config, *args, **kwargs)
-        generation_config.cache_implementation = "variable"
-        NEED_SETUP_CACHE_CLASSES_MAPPING["variable"] = VariableCache
         return generation_config, model_kwargs
@@ -1133,6 +1134,9 @@ class DeciLMForCausalLM(DeciLMPreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.model
     @add_start_docstrings_to_model_forward(DECILM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(

 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers import GenerationConfig
+from transformers.generation.utils import GenerationMixin, GenerateOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.utils import (
     ) -> tuple[GenerationConfig, dict]:
         # DeciLM-specific code
         generation_config, model_kwargs = super()._prepare_generation_config(generation_config, *args, **kwargs)
+        generation_config.disable_compile = True
+        #generation_config.cache_implementation = "variable"
+        #NEED_SETUP_CACHE_CLASSES_MAPPING["variable"] = VariableCache
         return generation_config, model_kwargs
     def get_decoder(self):
         return self.model
+    def getVariableCache(self, batch_size=1, max_cache_len=4096, dtype=torch.bfloat16):
+        return VariableCache(config=self.config, batch_size=batch_size, max_batch_size=batch_size, max_cache_len=max_cache_len, dtype=dtype)
     @add_start_docstrings_to_model_forward(DECILM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(

variable_cache.py CHANGED Viewed

@@ -31,6 +31,9 @@ class VariableCache(Cache_4_44_2, Cache):
     The default implementation for the layer caches is StaticCache.
     The cache of each layer is allocated to the same gpu as the layer itself.
     """
     def __init__(
             self,
@@ -50,7 +53,7 @@ class VariableCache(Cache_4_44_2, Cache):
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype
-        self.layer_caches: list[Cache_4_44_2 | None] = [None] * config.num_hidden_layers
         self.layer_devices: list[torch.device | None] = [None] * config.num_hidden_layers
     def update(
@@ -60,11 +63,11 @@ class VariableCache(Cache_4_44_2, Cache):
             layer_idx: int,
             cache_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if self.layer_caches[layer_idx] is None:
             self.layer_devices[layer_idx] = key_states.device
             self._init_layer_cache(layer_idx)
-        layer_cache = self.layer_caches[layer_idx]
         assert layer_cache is not None, f"Trying to update the cache of a cache-less layer: {layer_idx=}"
         k_out, v_out = layer_cache.update(key_states=key_states,
@@ -93,37 +96,37 @@ class VariableCache(Cache_4_44_2, Cache):
         if attention_config.window_length is not None:
             if not attention_config.is_sink:
                 config.sliding_window = attention_config.window_length
-                self.layer_caches[layer_idx] = SlidingWindowCache(config=config,
                                                                   max_batch_size=self.max_batch_size,
                                                                   max_cache_len=self.max_cache_len,
                                                                   device=device,
                                                                   dtype=self.dtype)
                 return
             elif not attention_config.unshifted_sink:
-                self.layer_caches[layer_idx] = SinkCache(window_length=attention_config.window_length,
                                                          num_sink_tokens=attention_config.num_sink_tokens)
                 return
-        self.layer_caches[layer_idx] = StaticCache(config=config,
                                                    max_batch_size=self.max_batch_size,
                                                    max_cache_len=self.max_cache_len,
                                                    device=device,
                                                    dtype=self.dtype)
     def _get_first_real_cache(self) -> Cache:
-        for layer_cache in self.layer_caches:
             if layer_cache is not None:
                 return layer_cache
         raise ValueError(f"No real cache found, all layer caches are None.")
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        if layer_idx == 0 and self.layer_caches[0] is None:
             try:
                 layer_cache = self._get_first_real_cache()
             except ValueError:
                 return 0
         else:
-            layer_cache = self.layer_caches[layer_idx]
         return layer_cache.get_seq_length()
     def get_max_length(self) -> Optional[int]:
@@ -131,9 +134,12 @@ class VariableCache(Cache_4_44_2, Cache):
         return self.max_cache_len
     def reset(self):
-        for layer_idx in range(len(self.layer_caches)):
-            layer_cache = self.layer_caches[layer_idx]
             if hasattr(layer_cache, "reset"):
                 layer_cache.reset()
             else:
                 self._init_layer_cache(layer_idx)

     The default implementation for the layer caches is StaticCache.
     The cache of each layer is allocated to the same gpu as the layer itself.
     """
+    max_batch_size = None
+    max_cache_len = None
     def __init__(
             self,
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype
+        self.layers: list[Cache_4_44_2 | None] = [None] * config.num_hidden_layers
         self.layer_devices: list[torch.device | None] = [None] * config.num_hidden_layers
     def update(
             layer_idx: int,
             cache_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.layers[layer_idx] is None:
             self.layer_devices[layer_idx] = key_states.device
             self._init_layer_cache(layer_idx)
+        layer_cache = self.layers[layer_idx]
         assert layer_cache is not None, f"Trying to update the cache of a cache-less layer: {layer_idx=}"
         k_out, v_out = layer_cache.update(key_states=key_states,
         if attention_config.window_length is not None:
             if not attention_config.is_sink:
                 config.sliding_window = attention_config.window_length
+                self.layers[layer_idx] = SlidingWindowCache(config=config,
                                                                   max_batch_size=self.max_batch_size,
                                                                   max_cache_len=self.max_cache_len,
                                                                   device=device,
                                                                   dtype=self.dtype)
                 return
             elif not attention_config.unshifted_sink:
+                self.layers[layer_idx] = SinkCache(window_length=attention_config.window_length,
                                                          num_sink_tokens=attention_config.num_sink_tokens)
                 return
+        self.layers[layer_idx] = StaticCache(config=config,
                                                    max_batch_size=self.max_batch_size,
                                                    max_cache_len=self.max_cache_len,
                                                    device=device,
                                                    dtype=self.dtype)
     def _get_first_real_cache(self) -> Cache:
+        for layer_cache in self.layers:
             if layer_cache is not None:
                 return layer_cache
         raise ValueError(f"No real cache found, all layer caches are None.")
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        if layer_idx == 0 and self.layers[0] is None:
             try:
                 layer_cache = self._get_first_real_cache()
             except ValueError:
                 return 0
         else:
+            layer_cache = self.layers[layer_idx]
         return layer_cache.get_seq_length()
     def get_max_length(self) -> Optional[int]:
         return self.max_cache_len
     def reset(self):
+        for layer_idx in range(len(self.layers)):
+            layer_cache = self.layers[layer_idx]
             if hasattr(layer_cache, "reset"):
                 layer_cache.reset()
             else:
                 self._init_layer_cache(layer_idx)
+    def is_compileable(self) -> bool:
+        return False