{ "num_layers": 24, "encoder_num_layers": 24, "decoder_num_layers": null, "hidden_size": 2048, "ffn_hidden_size": 5632, "num_attention_heads": 32, "attention_backend": "", "kv_channels": 64, "group_query_attention": true, "num_query_groups": 8, "softmax_type": "vanilla", "window_size": null, "window_attn_skip_freq": null, "max_position_embeddings": 8192, "position_embedding_type": "rope", "relative_attention_num_buckets": 32, "relative_attention_max_distance": 128, "use_rotary_position_embeddings": false, "rotary_base": 500000, "rotary_percent": 1.0, "rotary_interleaved": false, "rotary_seq_len_interpolation_factor": null, "use_rope_scaling": false, "rope_scaling_factor": 8.0, "no_rope_freq": null, "add_position_embedding": true, "mrope_section": null, "make_vocab_size_divisible_by": 128, "normalization": "RMSNorm", "norm_epsilon": 1e-06, "apply_layernorm_1p": false, "apply_residual_connection_post_layernorm": false, "openai_gelu": false, "squared_relu": false, "swiglu": true, "quick_geglu": false, "activation_func_clamp_value": null, "glu_linear_offset": 0.0, "onnx_safe": null, "bert_binary_head": true, "untie_embeddings_and_output_weights": true, "multi_latent_attention": false, "mtp_num_layers": null, "mtp_loss_scaling_factor": 0.1, "attention_dropout": 0.0, "hidden_dropout": 0.0, "weight_decay": 0.1, "start_weight_decay": 0.1, "end_weight_decay": 0.1, "weight_decay_incr_style": "constant", "clip_grad": 1.0, "adam_beta1": 0.9, "adam_beta2": 0.95, "adam_eps": 1e-08, "sgd_momentum": 0.9, "micro_batch_size": 1, "global_batch_size": 480, "rampup_batch_size": null, "decrease_batch_size_if_needed": false, "recompute_granularity": null, "check_for_nan_in_loss_and_grad": true, "check_for_spiky_loss": false, "check_for_large_grads": false, "distribute_saved_activations": false, "recompute_method": null, "recompute_num_layers": null, "recompute_modules": null, "clone_scatter_output_in_embedding": true, "profile": false, "profile_step_start": 10, "profile_step_end": 12, "iterations_to_skip": [], "result_rejected_tracker_filename": null, "enable_gloo_process_groups": true, "use_pytorch_profiler": false, "profile_ranks": [ 0 ], "record_memory_history": false, "memory_snapshot_path": "snapshot.pickle", "tp_comm_overlap": false, "tp_comm_overlap_cfg": null, "tp_comm_overlap_ag": true, "tp_comm_overlap_rs": true, "tp_comm_overlap_rs_dgrad": false, "tp_comm_bulk_dgrad": true, "tp_comm_bulk_wgrad": true, "tp_comm_bootstrap_backend": "nccl", "use_cpu_initialization": null, "empty_unused_memory_level": 0, "deterministic_mode": false, "check_weight_hash_across_dp_replicas_interval": null, "calculate_per_token_loss": false, "train_sync_interval": null, "train_iters": 10000, "train_samples": null, "log_interval": 50, "exit_interval": null, "exit_duration_in_mins": null, "exit_signal_handler": false, "tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/tensorboard/", "masked_softmax_fusion": true, "bias_gelu_fusion": true, "bias_swiglu_fusion": true, "use_fused_weighted_squared_relu": false, "bias_dropout_fusion": true, "apply_rope_fusion": true, "rope_type": null, "cross_entropy_loss_fusion": true, "cross_entropy_fusion_impl": "native", "use_flash_attn": false, "add_bias_linear": true, "add_qkv_bias": true, "optimizer": "adam", "optimizer_cpu_offload": false, "optimizer_offload_fraction": 1.0, "use_torch_optimizer_for_cpu_offload": false, "overlap_cpu_optimizer_d2h_h2d": false, "pin_cpu_grads": true, "pin_cpu_params": true, "dataloader_type": "cyclic", "async_tensor_model_parallel_allreduce": true, "no_persist_layer_norm": false, "sequence_parallel": false, "gradient_accumulation_fusion": true, "deprecated_use_mcore_models": false, "use_legacy_models": false, "manual_gc": false, "manual_gc_interval": 0, "manual_gc_eval": true, "tp_comm_split_ag": true, "tp_comm_split_rs": true, "pipeline_model_parallel_comm_backend": null, "high_priority_stream_groups": [], "use_te_activation_func": false, "perform_rl_step": false, "rl_prompts_per_eval": 32, "grpo_prompts_per_step": 32, "grpo_group_size": 2, "grpo_iterations": 2, "grpo_clamp_eps_lower": 0.01, "grpo_clamp_eps_upper": 0.01, "grpo_kl_beta": 0.001, "grpo_entropy_term_weight": 0.0, "grpo_filter_groups_with_same_reward": false, "grpo_default_temperature": 1.0, "grpo_default_top_p": 0, "langrl_inference_server_type": "inplace_megatron", "langrl_inference_server_conversation_template": null, "langrl_env_config": null, "rl_offload_optimizer_during_inference": false, "rl_offload_kv_cache_during_training": false, "rl_remove_kv_cache_during_training": false, "rl_reset_cuda_graphs": false, "rl_partial_rollouts": false, "rl_inference_logprobs_is_correction": false, "rl_importance_sampling_truncation_coef": null, "rl_calculate_intra_group_similarity": false, "seed": 2026, "data_parallel_random_init": false, "init_method_std": 0.02, "embedding_init_method_std": null, "init_method_xavier_uniform": false, "lr": 5e-05, "lr_decay_style": "cosine", "lr_wsd_decay_style": "exponential", "lr_decay_iters": 10000, "lr_decay_samples": null, "lr_wsd_decay_samples": null, "lr_wsd_decay_iters": null, "lr_warmup_fraction": null, "lr_warmup_iters": 200, "lr_warmup_samples": 0, "lr_warmup_init": 0.0, "min_lr": 5e-06, "override_opt_param_scheduler": false, "use_checkpoint_opt_param_scheduler": false, "decoupled_lr": null, "decoupled_min_lr": null, "save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/", "save_interval": 200, "save_retain_interval": null, "no_save_optim": null, "no_save_rng": null, "load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/", "no_load_optim": null, "load_main_params_from_ckpt": null, "no_load_rng": null, "strict_fsdp_dtensor_load": true, "non_persistent_save_interval": null, "non_persistent_ckpt_type": null, "non_persistent_global_ckpt_dir": null, "non_persistent_local_ckpt_dir": null, "non_persistent_local_ckpt_algo": "fully_parallel", "finetune": false, "pretrained_checkpoint": null, "ckpt_step": null, "perform_initialization": true, "use_checkpoint_args": false, "use_mp_args_from_checkpoint_args": false, "use_tokenizer_model_from_checkpoint_args": true, "exit_on_missing_checkpoint": false, "use_dist_ckpt_deprecated": false, "use_persistent_ckpt_worker": false, "auto_detect_ckpt_format": false, "dist_ckpt_format_deprecated": null, "ckpt_format": "torch_dist", "ckpt_convert_format": null, "ckpt_convert_save": null, "ckpt_convert_update_legacy_dist_opt_format": false, "ckpt_fully_parallel_save_deprecated": false, "ckpt_fully_parallel_save": true, "async_save": null, "ckpt_fully_parallel_load": false, "ckpt_assume_constant_structure": false, "dist_ckpt_strictness": "assume_ok_unexpected", "dist_ckpt_save_pre_mcore_014": false, "dist_ckpt_optim_fully_reshardable": false, "distrib_optim_fully_reshardable_mem_efficient": false, "load_model_opt_format": false, "fp16": false, "bf16": true, "grad_reduce_in_bf16": false, "loss_scale": null, "initial_loss_scale": 4294967296, "min_loss_scale": 1.0, "loss_scale_window": 1000, "hysteresis": 2, "fp32_residual_connection": false, "apply_query_key_layer_scaling": false, "attention_softmax_in_fp32": false, "accumulate_allreduce_grads_in_fp32": true, "fp16_lm_cross_entropy": false, "disable_bf16_reduced_precision_matmul": false, "reuse_grad_buf_for_mxfp8_param_ag": false, "tensor_model_parallel_size": 1, "pipeline_model_parallel_size": 1, "decoder_first_pipeline_num_layers": null, "decoder_last_pipeline_num_layers": null, "pipeline_model_parallel_layout": null, "num_layers_per_virtual_pipeline_stage": null, "num_virtual_stages_per_pipeline_rank": null, "microbatch_group_size_per_vp_stage": null, "overlap_p2p_comm": false, "overlap_p2p_comm_warmup_flush": false, "distributed_backend": "nccl", "distributed_timeout_minutes": 10, "overlap_grad_reduce": false, "defer_embedding_wgrad_compute": false, "wgrad_deferral_limit": 0, "align_grad_reduce": true, "ddp_num_buckets": null, "ddp_bucket_size": null, "ddp_pad_buckets_for_high_nccl_busbw": false, "ddp_average_in_collective": false, "overlap_param_gather": false, "overlap_param_gather_with_optimizer_step": false, "align_param_gather": false, "scatter_gather_tensors_in_pipeline": true, "use_ring_exchange_p2p": false, "local_rank": 0, "lazy_mpu_init": null, "account_for_embedding_in_pipeline_split": false, "account_for_loss_in_pipeline_split": false, "use_distributed_optimizer": true, "nccl_ub": false, "disable_symmetric_registration": false, "use_sharp": false, "sharp_enabled_group": null, "use_megatron_fsdp": false, "init_model_with_meta_device": false, "data_parallel_sharding_strategy": "no_shard", "gradient_reduce_div_fusion": true, "fsdp_double_buffer": false, "suggested_communication_unit_size": null, "keep_fp8_transpose_cache": false, "enable_full_sharding_in_hsdp": false, "num_distributed_optimizer_instances": 1, "use_torch_fsdp2": false, "torch_fsdp2_reshard_after_forward": true, "context_parallel_size": 1, "cp_comm_type": [ "p2p" ], "hierarchical_context_parallel_sizes": null, "nccl_communicator_config_path": null, "use_tp_pp_dp_mapping": false, "replication": false, "replication_jump": null, "replication_factor": 2, "full_validation": false, "multiple_validation_sets": false, "eval_iters": 100, "eval_interval": 200, "test_mode": false, "skip_train": false, "data_path": null, "split": null, "train_data_path": [ "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter0_full", "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter1_full", "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter2_full", "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter3_full" ], "valid_data_path": [ "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter0_full", "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter1_full", "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter2_full", "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter3_full" ], "test_data_path": null, "data_args_path": null, "per_split_data_args_path": null, "data_cache_path": null, "mmap_bin_files": true, "mock_data": false, "seq_length": 8192, "encoder_seq_length": 8192, "decoder_seq_length": null, "retriever_seq_length": 256, "sample_rate": 1.0, "mask_prob": 0.15, "short_seq_prob": 0.1, "num_workers": 12, "reset_position_ids": false, "reset_attention_mask": false, "eod_mask_loss": true, "create_attention_mask_in_dataloader": true, "num_dataset_builder_threads": 1, "object_storage_cache_path": null, "mid_level_dataset_surplus": 0.005, "vocab_size": 193792, "padded_vocab_size": 193920, "vocab_file": null, "merge_file": null, "vocab_extra_ids": 0, "tokenizer_type": "NullTokenizer", "tokenizer_model": null, "tokenizer_metadata": null, "tiktoken_pattern": null, "tiktoken_num_special_tokens": 1000, "tiktoken_special_tokens": null, "legacy_tokenizer": false, "trust_remote_code": false, "adlr_autoresume": false, "adlr_autoresume_interval": 1000, "ict_head_size": null, "biencoder_projection_dim": 0, "biencoder_shared_query_context_model": false, "ict_load": null, "bert_load": null, "titles_data_path": null, "query_in_block_prob": 0.1, "use_one_sent_docs": false, "evidence_data_path": null, "retriever_report_topk_accuracies": [], "retriever_score_scaling": false, "block_data_path": null, "embedding_path": null, "indexer_batch_size": 128, "indexer_log_interval": 1000, "num_classes": 1000, "img_h": 224, "img_w": 224, "num_channels": 3, "patch_dim": 16, "classes_fraction": 1.0, "data_per_class_fraction": 1.0, "data_sharding": true, "head_lr_mult": 1.0, "vision_pretraining": false, "vision_pretraining_type": "classify", "vision_backbone_type": "vit", "swin_backbone_type": "tiny", "mask_type": "random", "mask_factor": 1.0, "iter_per_epoch": 1250, "dino_local_img_size": 96, "dino_local_crops_number": 10, "dino_head_hidden_size": 2048, "dino_bottleneck_size": 256, "dino_freeze_last_layer": 1, "dino_norm_last_layer": false, "dino_warmup_teacher_temp": 0.04, "dino_teacher_temp": 0.07, "dino_warmup_teacher_temp_epochs": 30, "qk_layernorm": false, "qk_l2_norm": false, "expert_model_parallel_size": 1, "expert_tensor_parallel_size": 1, "num_experts": null, "moe_layer_freq": 1, "moe_ffn_hidden_size": null, "moe_shared_expert_intermediate_size": null, "moe_shared_expert_overlap": false, "moe_grouped_gemm": false, "moe_use_legacy_grouped_gemm": false, "moe_layer_recompute": false, "moe_extended_tp": false, "moe_use_upcycling": false, "moe_router_load_balancing_type": "aux_loss", "moe_router_dtype": null, "moe_router_fusion": false, "moe_router_score_function": "softmax", "moe_router_topk": 2, "moe_router_pre_softmax": false, "moe_router_num_groups": null, "moe_router_group_topk": null, "moe_router_topk_scaling_factor": null, "moe_router_enable_expert_bias": false, "moe_router_bias_update_rate": 0.001, "moe_router_force_load_balancing": false, "moe_router_padding_for_fp8": false, "moe_aux_loss_coeff": 0.0, "moe_z_loss_coeff": null, "moe_input_jitter_eps": null, "moe_per_layer_logging": false, "moe_token_dispatcher_type": "allgather", "moe_enable_deepep": false, "moe_deepep_num_sms": 20, "moe_permute_fusion": false, "moe_expert_capacity_factor": null, "moe_pad_expert_input_to_capacity": false, "moe_token_drop_policy": "probs", "moe_apply_probs_on_input": false, "overlap_moe_expert_parallel_comm": false, "delay_wgrad_compute": false, "moe_upcycling_granularity": 1, "q_lora_rank": null, "kv_lora_rank": 32, "qk_head_dim": 128, "qk_pos_emb_head_dim": 64, "v_head_dim": 128, "rotary_scaling_factor": 1.0, "mscale": 1.0, "mscale_all_dim": 0.0, "cache_mla_latents": false, "heterogeneous_layers_config_path": null, "heterogeneous_layers_config_encoded_json": null, "log_params_norm": true, "log_num_zeros_in_grad": true, "log_throughput": true, "log_progress": true, "timing_log_level": 0, "log_energy": false, "barrier_with_L1_time": true, "timing_log_option": "minmax", "tensorboard_log_interval": 1, "tensorboard_queue_size": 1000, "log_timers_to_tensorboard": true, "log_loss_scale_to_tensorboard": true, "log_validation_ppl_to_tensorboard": true, "log_memory_to_tensorboard": true, "log_world_size_to_tensorboard": true, "wandb_project": "Megatron_Stage2", "wandb_entity": "", "wandb_exp_name": "1B_nl24_hs2048_gb480_seed2026_20260204_q01_ft5k_super_v2", "wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/wandb/", "logging_level": null, "log_straggler": false, "disable_straggler_on_startup": false, "straggler_ctrlr_port": 65535, "straggler_minmax_count": 1, "run_workload_inspector_server": false, "inference_batch_times_seqlen_threshold": -1, "max_tokens_to_oom": 12000, "output_bert_embeddings": false, "bert_embedder_type": "megatron", "flash_decode": false, "enable_cuda_graph": false, "cuda_graph_warmup_steps": 3, "external_cuda_graph": false, "cuda_graph_scope": "full", "inference_max_batch_size": 8, "inference_max_seq_length": 2560, "inference_dynamic_batching": false, "inference_dynamic_batching_buffer_size_gb": 40.0, "inference_dynamic_batching_chunk_size": 256, "inference_dynamic_batching_buffer_guaranteed_fraction": 0.2, "inference_dynamic_batching_buffer_overflow_factor": null, "inference_dynamic_batching_max_requests_override": null, "inference_dynamic_batching_max_tokens_override": null, "inference_dynamic_batching_num_cuda_graphs": 16, "inference_dynamic_batching_track_paused_request_events": false, "symmetric_ar_type": null, "nccl_all_reduce_for_prefill": false, "mlp_chunks_for_prefill": 1, "initialize_socket_comms": false, "fp8": null, "fp8_recipe": "delayed", "fp8_margin": 0, "fp8_interval": 1, "fp8_amax_history_len": 1, "fp8_amax_compute_algo": "most_recent", "fp8_wgrad": true, "transformer_impl": "transformer_engine", "fp8_param_gather": false, "first_last_layers_bf16": false, "num_layers_at_start_in_bf16": 1, "num_layers_at_end_in_bf16": 1, "fp4": null, "fp4_recipe": "nvfp4", "fp4_param": false, "te_rng_tracker": false, "inference_rng_tracker": false, "retro_project_dir": null, "retro_add_retriever": false, "retro_cyclic_train_iters": null, "retro_encoder_layers": 2, "retro_encoder_hidden_dropout": 0.1, "retro_encoder_attention_dropout": 0.1, "retro_num_neighbors": 2, "retro_num_retrieved_chunks": 2, "retro_attention_gate": 1, "retro_verify_neighbor_count": true, "enable_experimental": false, "spec": null, "hybrid_attention_ratio": 0.0, "hybrid_mlp_ratio": 0.0, "hybrid_override_pattern": null, "mamba_state_dim": 128, "mamba_head_dim": 64, "mamba_num_groups": 8, "mamba_num_heads": null, "is_hybrid_model": false, "disable_mamba_mem_eff_path": false, "yaml_cfg": null, "use_precision_aware_optimizer": false, "main_grads_dtype": "torch.float32", "main_params_dtype": "torch.float32", "exp_avg_dtype": "torch.float32", "exp_avg_sq_dtype": "torch.float32", "enable_one_logger": true, "one_logger_project": "megatron-lm", "one_logger_run_name": null, "one_logger_async": false, "app_tag_run_name": null, "app_tag_run_version": "0.0.0", "inprocess_restart": false, "inprocess_max_iterations": null, "inprocess_monitor_thread_interval": 1.0, "inprocess_monitor_process_interval": 1.0, "inprocess_progress_watchdog_interval": 1.0, "inprocess_heartbeat_interval": 30, "inprocess_soft_timeout": 60, "inprocess_hard_timeout": 90, "inprocess_heartbeat_timeout": 60, "inprocess_barrier_timeout": 120, "inprocess_completion_timeout": 120, "inprocess_last_call_wait": 1, "inprocess_termination_grace_time": 1, "inprocess_granularity": "node", "inprocess_active_world_size": 40, "inprocess_empty_cuda_cache": false, "enable_ft_package": false, "calc_ft_timeouts": false, "config_logger_dir": "", "error_injection_rate": 0, "error_injection_type": "transient_error", "rerun_mode": "validate_results", "enable_msc": true, "kitchen_config_file": null, "kitchen_recipe_number": null, "sft": false, "sft_tokenizer_prompt_format": "nemotron-h-aligned", "num_quantizers": 64, "export_model_type": "GPTModel", "export_legacy_megatron": false, "export_te_mcore_model": false, "export_force_local_attention": false, "export_kv_cache_quant": false, "export_real_quant_cfg": "None", "export_quant_cfg": null, "export_kd_cfg": null, "teacher_model_config": null, "export_kd_teacher_load": null, "export_kd_teacher_ckpt_format": null, "finetune_hf_dataset": null, "finetune_data_split": "train", "export_qk_l2_norm": false, "export_moe_apply_probs_on_input": false, "export_offline_model": false, "rank": 0, "world_size": 40, "use_dist_ckpt": true, "transformer_pipeline_model_parallel_size": 1, "data_parallel_size": 40, "virtual_pipeline_model_parallel_size": null, "params_dtype": "torch.bfloat16", "consumed_train_samples": 4800000, "skipped_train_samples": 0, "consumed_valid_samples": 2400000, "variable_seq_lengths": false, "model_type": "", "iteration": 8000, "num_floating_point_operations_so_far": 2.926014640120922e+22, "do_train": 1, "do_valid": 1, "do_test": 0, "curr_iteration": 9999 }