Vinpolar committed on
Commit
f122742
·
verified ·
1 Parent(s): 2cdbd5d

superres config

Browse files
Files changed (1) hide show
  1. superres_megatron_args.json +602 -0
superres_megatron_args.json ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_layers": 24,
3
+ "encoder_num_layers": 24,
4
+ "decoder_num_layers": null,
5
+ "hidden_size": 2048,
6
+ "ffn_hidden_size": 5632,
7
+ "num_attention_heads": 32,
8
+ "attention_backend": "<stub>",
9
+ "kv_channels": 64,
10
+ "group_query_attention": true,
11
+ "num_query_groups": 8,
12
+ "softmax_type": "vanilla",
13
+ "window_size": null,
14
+ "window_attn_skip_freq": null,
15
+ "max_position_embeddings": 8192,
16
+ "position_embedding_type": "rope",
17
+ "relative_attention_num_buckets": 32,
18
+ "relative_attention_max_distance": 128,
19
+ "use_rotary_position_embeddings": false,
20
+ "rotary_base": 500000,
21
+ "rotary_percent": 1.0,
22
+ "rotary_interleaved": false,
23
+ "rotary_seq_len_interpolation_factor": null,
24
+ "use_rope_scaling": false,
25
+ "rope_scaling_factor": 8.0,
26
+ "no_rope_freq": null,
27
+ "add_position_embedding": true,
28
+ "mrope_section": null,
29
+ "make_vocab_size_divisible_by": 128,
30
+ "normalization": "RMSNorm",
31
+ "norm_epsilon": 1e-06,
32
+ "apply_layernorm_1p": false,
33
+ "apply_residual_connection_post_layernorm": false,
34
+ "openai_gelu": false,
35
+ "squared_relu": false,
36
+ "swiglu": true,
37
+ "quick_geglu": false,
38
+ "activation_func_clamp_value": null,
39
+ "glu_linear_offset": 0.0,
40
+ "onnx_safe": null,
41
+ "bert_binary_head": true,
42
+ "untie_embeddings_and_output_weights": true,
43
+ "multi_latent_attention": false,
44
+ "mtp_num_layers": null,
45
+ "mtp_loss_scaling_factor": 0.1,
46
+ "attention_dropout": 0.0,
47
+ "hidden_dropout": 0.0,
48
+ "weight_decay": 0.1,
49
+ "start_weight_decay": 0.1,
50
+ "end_weight_decay": 0.1,
51
+ "weight_decay_incr_style": "constant",
52
+ "clip_grad": 1.0,
53
+ "adam_beta1": 0.9,
54
+ "adam_beta2": 0.95,
55
+ "adam_eps": 1e-08,
56
+ "sgd_momentum": 0.9,
57
+ "micro_batch_size": 1,
58
+ "global_batch_size": 480,
59
+ "rampup_batch_size": null,
60
+ "decrease_batch_size_if_needed": false,
61
+ "recompute_granularity": null,
62
+ "check_for_nan_in_loss_and_grad": true,
63
+ "check_for_spiky_loss": false,
64
+ "check_for_large_grads": false,
65
+ "distribute_saved_activations": false,
66
+ "recompute_method": null,
67
+ "recompute_num_layers": null,
68
+ "recompute_modules": null,
69
+ "clone_scatter_output_in_embedding": true,
70
+ "profile": false,
71
+ "profile_step_start": 10,
72
+ "profile_step_end": 12,
73
+ "iterations_to_skip": [],
74
+ "result_rejected_tracker_filename": null,
75
+ "enable_gloo_process_groups": true,
76
+ "use_pytorch_profiler": false,
77
+ "profile_ranks": [
78
+ 0
79
+ ],
80
+ "record_memory_history": false,
81
+ "memory_snapshot_path": "snapshot.pickle",
82
+ "tp_comm_overlap": false,
83
+ "tp_comm_overlap_cfg": null,
84
+ "tp_comm_overlap_ag": true,
85
+ "tp_comm_overlap_rs": true,
86
+ "tp_comm_overlap_rs_dgrad": false,
87
+ "tp_comm_bulk_dgrad": true,
88
+ "tp_comm_bulk_wgrad": true,
89
+ "tp_comm_bootstrap_backend": "nccl",
90
+ "use_cpu_initialization": null,
91
+ "empty_unused_memory_level": 0,
92
+ "deterministic_mode": false,
93
+ "check_weight_hash_across_dp_replicas_interval": null,
94
+ "calculate_per_token_loss": false,
95
+ "train_sync_interval": null,
96
+ "train_iters": 10000,
97
+ "train_samples": null,
98
+ "log_interval": 50,
99
+ "exit_interval": null,
100
+ "exit_duration_in_mins": null,
101
+ "exit_signal_handler": false,
102
+ "tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/tensorboard/",
103
+ "masked_softmax_fusion": true,
104
+ "bias_gelu_fusion": true,
105
+ "bias_swiglu_fusion": true,
106
+ "use_fused_weighted_squared_relu": false,
107
+ "bias_dropout_fusion": true,
108
+ "apply_rope_fusion": true,
109
+ "rope_type": null,
110
+ "cross_entropy_loss_fusion": true,
111
+ "cross_entropy_fusion_impl": "native",
112
+ "use_flash_attn": false,
113
+ "add_bias_linear": true,
114
+ "add_qkv_bias": true,
115
+ "optimizer": "adam",
116
+ "optimizer_cpu_offload": false,
117
+ "optimizer_offload_fraction": 1.0,
118
+ "use_torch_optimizer_for_cpu_offload": false,
119
+ "overlap_cpu_optimizer_d2h_h2d": false,
120
+ "pin_cpu_grads": true,
121
+ "pin_cpu_params": true,
122
+ "dataloader_type": "cyclic",
123
+ "async_tensor_model_parallel_allreduce": true,
124
+ "no_persist_layer_norm": false,
125
+ "sequence_parallel": false,
126
+ "gradient_accumulation_fusion": true,
127
+ "deprecated_use_mcore_models": false,
128
+ "use_legacy_models": false,
129
+ "manual_gc": false,
130
+ "manual_gc_interval": 0,
131
+ "manual_gc_eval": true,
132
+ "tp_comm_split_ag": true,
133
+ "tp_comm_split_rs": true,
134
+ "pipeline_model_parallel_comm_backend": null,
135
+ "high_priority_stream_groups": [],
136
+ "use_te_activation_func": false,
137
+ "perform_rl_step": false,
138
+ "rl_prompts_per_eval": 32,
139
+ "grpo_prompts_per_step": 32,
140
+ "grpo_group_size": 2,
141
+ "grpo_iterations": 2,
142
+ "grpo_clamp_eps_lower": 0.01,
143
+ "grpo_clamp_eps_upper": 0.01,
144
+ "grpo_kl_beta": 0.001,
145
+ "grpo_entropy_term_weight": 0.0,
146
+ "grpo_filter_groups_with_same_reward": false,
147
+ "grpo_default_temperature": 1.0,
148
+ "grpo_default_top_p": 0,
149
+ "langrl_inference_server_type": "inplace_megatron",
150
+ "langrl_inference_server_conversation_template": null,
151
+ "langrl_env_config": null,
152
+ "rl_offload_optimizer_during_inference": false,
153
+ "rl_offload_kv_cache_during_training": false,
154
+ "rl_remove_kv_cache_during_training": false,
155
+ "rl_reset_cuda_graphs": false,
156
+ "rl_partial_rollouts": false,
157
+ "rl_inference_logprobs_is_correction": false,
158
+ "rl_importance_sampling_truncation_coef": null,
159
+ "rl_calculate_intra_group_similarity": false,
160
+ "seed": 2026,
161
+ "data_parallel_random_init": false,
162
+ "init_method_std": 0.02,
163
+ "embedding_init_method_std": null,
164
+ "init_method_xavier_uniform": false,
165
+ "lr": 5e-05,
166
+ "lr_decay_style": "cosine",
167
+ "lr_wsd_decay_style": "exponential",
168
+ "lr_decay_iters": 10000,
169
+ "lr_decay_samples": null,
170
+ "lr_wsd_decay_samples": null,
171
+ "lr_wsd_decay_iters": null,
172
+ "lr_warmup_fraction": null,
173
+ "lr_warmup_iters": 200,
174
+ "lr_warmup_samples": 0,
175
+ "lr_warmup_init": 0.0,
176
+ "min_lr": 5e-06,
177
+ "override_opt_param_scheduler": false,
178
+ "use_checkpoint_opt_param_scheduler": false,
179
+ "decoupled_lr": null,
180
+ "decoupled_min_lr": null,
181
+ "save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/",
182
+ "save_interval": 200,
183
+ "save_retain_interval": null,
184
+ "no_save_optim": null,
185
+ "no_save_rng": null,
186
+ "load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/",
187
+ "no_load_optim": null,
188
+ "load_main_params_from_ckpt": null,
189
+ "no_load_rng": null,
190
+ "strict_fsdp_dtensor_load": true,
191
+ "non_persistent_save_interval": null,
192
+ "non_persistent_ckpt_type": null,
193
+ "non_persistent_global_ckpt_dir": null,
194
+ "non_persistent_local_ckpt_dir": null,
195
+ "non_persistent_local_ckpt_algo": "fully_parallel",
196
+ "finetune": false,
197
+ "pretrained_checkpoint": null,
198
+ "ckpt_step": null,
199
+ "perform_initialization": true,
200
+ "use_checkpoint_args": false,
201
+ "use_mp_args_from_checkpoint_args": false,
202
+ "use_tokenizer_model_from_checkpoint_args": true,
203
+ "exit_on_missing_checkpoint": false,
204
+ "use_dist_ckpt_deprecated": false,
205
+ "use_persistent_ckpt_worker": false,
206
+ "auto_detect_ckpt_format": false,
207
+ "dist_ckpt_format_deprecated": null,
208
+ "ckpt_format": "torch_dist",
209
+ "ckpt_convert_format": null,
210
+ "ckpt_convert_save": null,
211
+ "ckpt_convert_update_legacy_dist_opt_format": false,
212
+ "ckpt_fully_parallel_save_deprecated": false,
213
+ "ckpt_fully_parallel_save": true,
214
+ "async_save": null,
215
+ "ckpt_fully_parallel_load": false,
216
+ "ckpt_assume_constant_structure": false,
217
+ "dist_ckpt_strictness": "assume_ok_unexpected",
218
+ "dist_ckpt_save_pre_mcore_014": false,
219
+ "dist_ckpt_optim_fully_reshardable": false,
220
+ "distrib_optim_fully_reshardable_mem_efficient": false,
221
+ "load_model_opt_format": false,
222
+ "fp16": false,
223
+ "bf16": true,
224
+ "grad_reduce_in_bf16": false,
225
+ "loss_scale": null,
226
+ "initial_loss_scale": 4294967296,
227
+ "min_loss_scale": 1.0,
228
+ "loss_scale_window": 1000,
229
+ "hysteresis": 2,
230
+ "fp32_residual_connection": false,
231
+ "apply_query_key_layer_scaling": false,
232
+ "attention_softmax_in_fp32": false,
233
+ "accumulate_allreduce_grads_in_fp32": true,
234
+ "fp16_lm_cross_entropy": false,
235
+ "disable_bf16_reduced_precision_matmul": false,
236
+ "reuse_grad_buf_for_mxfp8_param_ag": false,
237
+ "tensor_model_parallel_size": 1,
238
+ "pipeline_model_parallel_size": 1,
239
+ "decoder_first_pipeline_num_layers": null,
240
+ "decoder_last_pipeline_num_layers": null,
241
+ "pipeline_model_parallel_layout": null,
242
+ "num_layers_per_virtual_pipeline_stage": null,
243
+ "num_virtual_stages_per_pipeline_rank": null,
244
+ "microbatch_group_size_per_vp_stage": null,
245
+ "overlap_p2p_comm": false,
246
+ "overlap_p2p_comm_warmup_flush": false,
247
+ "distributed_backend": "nccl",
248
+ "distributed_timeout_minutes": 10,
249
+ "overlap_grad_reduce": false,
250
+ "defer_embedding_wgrad_compute": false,
251
+ "wgrad_deferral_limit": 0,
252
+ "align_grad_reduce": true,
253
+ "ddp_num_buckets": null,
254
+ "ddp_bucket_size": null,
255
+ "ddp_pad_buckets_for_high_nccl_busbw": false,
256
+ "ddp_average_in_collective": false,
257
+ "overlap_param_gather": false,
258
+ "overlap_param_gather_with_optimizer_step": false,
259
+ "align_param_gather": false,
260
+ "scatter_gather_tensors_in_pipeline": true,
261
+ "use_ring_exchange_p2p": false,
262
+ "local_rank": 0,
263
+ "lazy_mpu_init": null,
264
+ "account_for_embedding_in_pipeline_split": false,
265
+ "account_for_loss_in_pipeline_split": false,
266
+ "use_distributed_optimizer": true,
267
+ "nccl_ub": false,
268
+ "disable_symmetric_registration": false,
269
+ "use_sharp": false,
270
+ "sharp_enabled_group": null,
271
+ "use_megatron_fsdp": false,
272
+ "init_model_with_meta_device": false,
273
+ "data_parallel_sharding_strategy": "no_shard",
274
+ "gradient_reduce_div_fusion": true,
275
+ "fsdp_double_buffer": false,
276
+ "suggested_communication_unit_size": null,
277
+ "keep_fp8_transpose_cache": false,
278
+ "enable_full_sharding_in_hsdp": false,
279
+ "num_distributed_optimizer_instances": 1,
280
+ "use_torch_fsdp2": false,
281
+ "torch_fsdp2_reshard_after_forward": true,
282
+ "context_parallel_size": 1,
283
+ "cp_comm_type": [
284
+ "p2p"
285
+ ],
286
+ "hierarchical_context_parallel_sizes": null,
287
+ "nccl_communicator_config_path": null,
288
+ "use_tp_pp_dp_mapping": false,
289
+ "replication": false,
290
+ "replication_jump": null,
291
+ "replication_factor": 2,
292
+ "full_validation": false,
293
+ "multiple_validation_sets": false,
294
+ "eval_iters": 100,
295
+ "eval_interval": 200,
296
+ "test_mode": false,
297
+ "skip_train": false,
298
+ "data_path": null,
299
+ "split": null,
300
+ "train_data_path": [
301
+ "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter0_full",
302
+ "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter1_full",
303
+ "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter2_full",
304
+ "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter3_full"
305
+ ],
306
+ "valid_data_path": [
307
+ "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter0_full",
308
+ "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter1_full",
309
+ "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter2_full",
310
+ "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter3_full"
311
+ ],
312
+ "test_data_path": null,
313
+ "data_args_path": null,
314
+ "per_split_data_args_path": null,
315
+ "data_cache_path": null,
316
+ "mmap_bin_files": true,
317
+ "mock_data": false,
318
+ "seq_length": 8192,
319
+ "encoder_seq_length": 8192,
320
+ "decoder_seq_length": null,
321
+ "retriever_seq_length": 256,
322
+ "sample_rate": 1.0,
323
+ "mask_prob": 0.15,
324
+ "short_seq_prob": 0.1,
325
+ "num_workers": 12,
326
+ "reset_position_ids": false,
327
+ "reset_attention_mask": false,
328
+ "eod_mask_loss": true,
329
+ "create_attention_mask_in_dataloader": true,
330
+ "num_dataset_builder_threads": 1,
331
+ "object_storage_cache_path": null,
332
+ "mid_level_dataset_surplus": 0.005,
333
+ "vocab_size": 193792,
334
+ "padded_vocab_size": 193920,
335
+ "vocab_file": null,
336
+ "merge_file": null,
337
+ "vocab_extra_ids": 0,
338
+ "tokenizer_type": "NullTokenizer",
339
+ "tokenizer_model": null,
340
+ "tokenizer_metadata": null,
341
+ "tiktoken_pattern": null,
342
+ "tiktoken_num_special_tokens": 1000,
343
+ "tiktoken_special_tokens": null,
344
+ "legacy_tokenizer": false,
345
+ "trust_remote_code": false,
346
+ "adlr_autoresume": false,
347
+ "adlr_autoresume_interval": 1000,
348
+ "ict_head_size": null,
349
+ "biencoder_projection_dim": 0,
350
+ "biencoder_shared_query_context_model": false,
351
+ "ict_load": null,
352
+ "bert_load": null,
353
+ "titles_data_path": null,
354
+ "query_in_block_prob": 0.1,
355
+ "use_one_sent_docs": false,
356
+ "evidence_data_path": null,
357
+ "retriever_report_topk_accuracies": [],
358
+ "retriever_score_scaling": false,
359
+ "block_data_path": null,
360
+ "embedding_path": null,
361
+ "indexer_batch_size": 128,
362
+ "indexer_log_interval": 1000,
363
+ "num_classes": 1000,
364
+ "img_h": 224,
365
+ "img_w": 224,
366
+ "num_channels": 3,
367
+ "patch_dim": 16,
368
+ "classes_fraction": 1.0,
369
+ "data_per_class_fraction": 1.0,
370
+ "data_sharding": true,
371
+ "head_lr_mult": 1.0,
372
+ "vision_pretraining": false,
373
+ "vision_pretraining_type": "classify",
374
+ "vision_backbone_type": "vit",
375
+ "swin_backbone_type": "tiny",
376
+ "mask_type": "random",
377
+ "mask_factor": 1.0,
378
+ "iter_per_epoch": 1250,
379
+ "dino_local_img_size": 96,
380
+ "dino_local_crops_number": 10,
381
+ "dino_head_hidden_size": 2048,
382
+ "dino_bottleneck_size": 256,
383
+ "dino_freeze_last_layer": 1,
384
+ "dino_norm_last_layer": false,
385
+ "dino_warmup_teacher_temp": 0.04,
386
+ "dino_teacher_temp": 0.07,
387
+ "dino_warmup_teacher_temp_epochs": 30,
388
+ "qk_layernorm": false,
389
+ "qk_l2_norm": false,
390
+ "expert_model_parallel_size": 1,
391
+ "expert_tensor_parallel_size": 1,
392
+ "num_experts": null,
393
+ "moe_layer_freq": 1,
394
+ "moe_ffn_hidden_size": null,
395
+ "moe_shared_expert_intermediate_size": null,
396
+ "moe_shared_expert_overlap": false,
397
+ "moe_grouped_gemm": false,
398
+ "moe_use_legacy_grouped_gemm": false,
399
+ "moe_layer_recompute": false,
400
+ "moe_extended_tp": false,
401
+ "moe_use_upcycling": false,
402
+ "moe_router_load_balancing_type": "aux_loss",
403
+ "moe_router_dtype": null,
404
+ "moe_router_fusion": false,
405
+ "moe_router_score_function": "softmax",
406
+ "moe_router_topk": 2,
407
+ "moe_router_pre_softmax": false,
408
+ "moe_router_num_groups": null,
409
+ "moe_router_group_topk": null,
410
+ "moe_router_topk_scaling_factor": null,
411
+ "moe_router_enable_expert_bias": false,
412
+ "moe_router_bias_update_rate": 0.001,
413
+ "moe_router_force_load_balancing": false,
414
+ "moe_router_padding_for_fp8": false,
415
+ "moe_aux_loss_coeff": 0.0,
416
+ "moe_z_loss_coeff": null,
417
+ "moe_input_jitter_eps": null,
418
+ "moe_per_layer_logging": false,
419
+ "moe_token_dispatcher_type": "allgather",
420
+ "moe_enable_deepep": false,
421
+ "moe_deepep_num_sms": 20,
422
+ "moe_permute_fusion": false,
423
+ "moe_expert_capacity_factor": null,
424
+ "moe_pad_expert_input_to_capacity": false,
425
+ "moe_token_drop_policy": "probs",
426
+ "moe_apply_probs_on_input": false,
427
+ "overlap_moe_expert_parallel_comm": false,
428
+ "delay_wgrad_compute": false,
429
+ "moe_upcycling_granularity": 1,
430
+ "q_lora_rank": null,
431
+ "kv_lora_rank": 32,
432
+ "qk_head_dim": 128,
433
+ "qk_pos_emb_head_dim": 64,
434
+ "v_head_dim": 128,
435
+ "rotary_scaling_factor": 1.0,
436
+ "mscale": 1.0,
437
+ "mscale_all_dim": 0.0,
438
+ "cache_mla_latents": false,
439
+ "heterogeneous_layers_config_path": null,
440
+ "heterogeneous_layers_config_encoded_json": null,
441
+ "log_params_norm": true,
442
+ "log_num_zeros_in_grad": true,
443
+ "log_throughput": true,
444
+ "log_progress": true,
445
+ "timing_log_level": 0,
446
+ "log_energy": false,
447
+ "barrier_with_L1_time": true,
448
+ "timing_log_option": "minmax",
449
+ "tensorboard_log_interval": 1,
450
+ "tensorboard_queue_size": 1000,
451
+ "log_timers_to_tensorboard": true,
452
+ "log_loss_scale_to_tensorboard": true,
453
+ "log_validation_ppl_to_tensorboard": true,
454
+ "log_memory_to_tensorboard": true,
455
+ "log_world_size_to_tensorboard": true,
456
+ "wandb_project": "Megatron_Stage2",
457
+ "wandb_entity": "",
458
+ "wandb_exp_name": "1B_nl24_hs2048_gb480_seed2026_20260204_q01_ft5k_super_v2",
459
+ "wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/wandb/",
460
+ "logging_level": null,
461
+ "log_straggler": false,
462
+ "disable_straggler_on_startup": false,
463
+ "straggler_ctrlr_port": 65535,
464
+ "straggler_minmax_count": 1,
465
+ "run_workload_inspector_server": false,
466
+ "inference_batch_times_seqlen_threshold": -1,
467
+ "max_tokens_to_oom": 12000,
468
+ "output_bert_embeddings": false,
469
+ "bert_embedder_type": "megatron",
470
+ "flash_decode": false,
471
+ "enable_cuda_graph": false,
472
+ "cuda_graph_warmup_steps": 3,
473
+ "external_cuda_graph": false,
474
+ "cuda_graph_scope": "full",
475
+ "inference_max_batch_size": 8,
476
+ "inference_max_seq_length": 2560,
477
+ "inference_dynamic_batching": false,
478
+ "inference_dynamic_batching_buffer_size_gb": 40.0,
479
+ "inference_dynamic_batching_chunk_size": 256,
480
+ "inference_dynamic_batching_buffer_guaranteed_fraction": 0.2,
481
+ "inference_dynamic_batching_buffer_overflow_factor": null,
482
+ "inference_dynamic_batching_max_requests_override": null,
483
+ "inference_dynamic_batching_max_tokens_override": null,
484
+ "inference_dynamic_batching_num_cuda_graphs": 16,
485
+ "inference_dynamic_batching_track_paused_request_events": false,
486
+ "symmetric_ar_type": null,
487
+ "nccl_all_reduce_for_prefill": false,
488
+ "mlp_chunks_for_prefill": 1,
489
+ "initialize_socket_comms": false,
490
+ "fp8": null,
491
+ "fp8_recipe": "delayed",
492
+ "fp8_margin": 0,
493
+ "fp8_interval": 1,
494
+ "fp8_amax_history_len": 1,
495
+ "fp8_amax_compute_algo": "most_recent",
496
+ "fp8_wgrad": true,
497
+ "transformer_impl": "transformer_engine",
498
+ "fp8_param_gather": false,
499
+ "first_last_layers_bf16": false,
500
+ "num_layers_at_start_in_bf16": 1,
501
+ "num_layers_at_end_in_bf16": 1,
502
+ "fp4": null,
503
+ "fp4_recipe": "nvfp4",
504
+ "fp4_param": false,
505
+ "te_rng_tracker": false,
506
+ "inference_rng_tracker": false,
507
+ "retro_project_dir": null,
508
+ "retro_add_retriever": false,
509
+ "retro_cyclic_train_iters": null,
510
+ "retro_encoder_layers": 2,
511
+ "retro_encoder_hidden_dropout": 0.1,
512
+ "retro_encoder_attention_dropout": 0.1,
513
+ "retro_num_neighbors": 2,
514
+ "retro_num_retrieved_chunks": 2,
515
+ "retro_attention_gate": 1,
516
+ "retro_verify_neighbor_count": true,
517
+ "enable_experimental": false,
518
+ "spec": null,
519
+ "hybrid_attention_ratio": 0.0,
520
+ "hybrid_mlp_ratio": 0.0,
521
+ "hybrid_override_pattern": null,
522
+ "mamba_state_dim": 128,
523
+ "mamba_head_dim": 64,
524
+ "mamba_num_groups": 8,
525
+ "mamba_num_heads": null,
526
+ "is_hybrid_model": false,
527
+ "disable_mamba_mem_eff_path": false,
528
+ "yaml_cfg": null,
529
+ "use_precision_aware_optimizer": false,
530
+ "main_grads_dtype": "torch.float32",
531
+ "main_params_dtype": "torch.float32",
532
+ "exp_avg_dtype": "torch.float32",
533
+ "exp_avg_sq_dtype": "torch.float32",
534
+ "enable_one_logger": true,
535
+ "one_logger_project": "megatron-lm",
536
+ "one_logger_run_name": null,
537
+ "one_logger_async": false,
538
+ "app_tag_run_name": null,
539
+ "app_tag_run_version": "0.0.0",
540
+ "inprocess_restart": false,
541
+ "inprocess_max_iterations": null,
542
+ "inprocess_monitor_thread_interval": 1.0,
543
+ "inprocess_monitor_process_interval": 1.0,
544
+ "inprocess_progress_watchdog_interval": 1.0,
545
+ "inprocess_heartbeat_interval": 30,
546
+ "inprocess_soft_timeout": 60,
547
+ "inprocess_hard_timeout": 90,
548
+ "inprocess_heartbeat_timeout": 60,
549
+ "inprocess_barrier_timeout": 120,
550
+ "inprocess_completion_timeout": 120,
551
+ "inprocess_last_call_wait": 1,
552
+ "inprocess_termination_grace_time": 1,
553
+ "inprocess_granularity": "node",
554
+ "inprocess_active_world_size": 40,
555
+ "inprocess_empty_cuda_cache": false,
556
+ "enable_ft_package": false,
557
+ "calc_ft_timeouts": false,
558
+ "config_logger_dir": "",
559
+ "error_injection_rate": 0,
560
+ "error_injection_type": "transient_error",
561
+ "rerun_mode": "validate_results",
562
+ "enable_msc": true,
563
+ "kitchen_config_file": null,
564
+ "kitchen_recipe_number": null,
565
+ "sft": false,
566
+ "sft_tokenizer_prompt_format": "nemotron-h-aligned",
567
+ "num_quantizers": 64,
568
+ "export_model_type": "GPTModel",
569
+ "export_legacy_megatron": false,
570
+ "export_te_mcore_model": false,
571
+ "export_force_local_attention": false,
572
+ "export_kv_cache_quant": false,
573
+ "export_real_quant_cfg": "None",
574
+ "export_quant_cfg": null,
575
+ "export_kd_cfg": null,
576
+ "teacher_model_config": null,
577
+ "export_kd_teacher_load": null,
578
+ "export_kd_teacher_ckpt_format": null,
579
+ "finetune_hf_dataset": null,
580
+ "finetune_data_split": "train",
581
+ "export_qk_l2_norm": false,
582
+ "export_moe_apply_probs_on_input": false,
583
+ "export_offline_model": false,
584
+ "rank": 0,
585
+ "world_size": 40,
586
+ "use_dist_ckpt": true,
587
+ "transformer_pipeline_model_parallel_size": 1,
588
+ "data_parallel_size": 40,
589
+ "virtual_pipeline_model_parallel_size": null,
590
+ "params_dtype": "torch.bfloat16",
591
+ "consumed_train_samples": 4800000,
592
+ "skipped_train_samples": 0,
593
+ "consumed_valid_samples": 2400000,
594
+ "variable_seq_lengths": false,
595
+ "model_type": "<stub>",
596
+ "iteration": 8000,
597
+ "num_floating_point_operations_so_far": 2.926014640120922e+22,
598
+ "do_train": 1,
599
+ "do_valid": 1,
600
+ "do_test": 0,
601
+ "curr_iteration": 9999
602
+ }