Text Generation
Transformers
Safetensors
PyTorch
nemotron_h
nvidia
conversational
custom_code
Eval Results
suhara committed on
Commit
5a48de7
·
1 Parent(s): 378df16

Upload modeling_nemotron_h.py (#53)

Browse files

- Upload modeling_nemotron_h.py (d4560445935c0bdae7f00faac5c9bb4c34e6b8dd)

Files changed (1) hide show
  1. modeling_nemotron_h.py +4 -0
modeling_nemotron_h.py CHANGED
@@ -1219,6 +1219,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1219
  def _init_weights(self, module):
1220
  """Initialize the weights."""
1221
  if isinstance(module, NemotronHMamba2Mixer):
 
 
1222
  module.A_log._no_weight_decay = True
1223
  module.D._no_weight_decay = True
1224
 
@@ -1250,6 +1252,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1250
  #
1251
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1252
  for name, p in module.named_parameters():
 
 
1253
  if name in ["out_proj.weight"]:
1254
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1255
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
 
1219
  def _init_weights(self, module):
1220
  """Initialize the weights."""
1221
  if isinstance(module, NemotronHMamba2Mixer):
1222
+ if getattr(module.dt_bias, "_is_hf_initialized", False):
1223
+ return
1224
  module.A_log._no_weight_decay = True
1225
  module.D._no_weight_decay = True
1226
 
 
1252
  #
1253
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1254
  for name, p in module.named_parameters():
1255
+ if getattr(p, "_is_hf_initialized", False):
1256
+ continue
1257
  if name in ["out_proj.weight"]:
1258
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1259
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)