{
  "num_layers": 24,
  "encoder_num_layers": 24,
  "decoder_num_layers": null,
  "hidden_size": 2048,
  "ffn_hidden_size": 5632,
  "num_attention_heads": 32,
  "attention_backend": "<stub>",
  "kv_channels": 64,
  "group_query_attention": true,
  "num_query_groups": 8,
  "softmax_type": "vanilla",
  "window_size": null,
  "window_attn_skip_freq": null,
  "max_position_embeddings": 8192,
  "position_embedding_type": "rope",
  "relative_attention_num_buckets": 32,
  "relative_attention_max_distance": 128,
  "use_rotary_position_embeddings": false,
  "rotary_base": 500000,
  "rotary_percent": 1.0,
  "rotary_interleaved": false,
  "rotary_seq_len_interpolation_factor": null,
  "use_rope_scaling": false,
  "rope_scaling_factor": 8.0,
  "no_rope_freq": null,
  "add_position_embedding": true,
  "mrope_section": null,
  "make_vocab_size_divisible_by": 128,
  "normalization": "RMSNorm",
  "norm_epsilon": 1e-06,
  "apply_layernorm_1p": false,
  "apply_residual_connection_post_layernorm": false,
  "openai_gelu": false,
  "squared_relu": false,
  "swiglu": true,
  "quick_geglu": false,
  "activation_func_clamp_value": null,
  "glu_linear_offset": 0.0,
  "onnx_safe": null,
  "bert_binary_head": true,
  "untie_embeddings_and_output_weights": true,
  "multi_latent_attention": false,
  "mtp_num_layers": null,
  "mtp_loss_scaling_factor": 0.1,
  "attention_dropout": 0.0,
  "hidden_dropout": 0.0,
  "weight_decay": 0.1,
  "start_weight_decay": 0.1,
  "end_weight_decay": 0.1,
  "weight_decay_incr_style": "constant",
  "clip_grad": 1.0,
  "adam_beta1": 0.9,
  "adam_beta2": 0.95,
  "adam_eps": 1e-08,
  "sgd_momentum": 0.9,
  "micro_batch_size": 1,
  "global_batch_size": 480,
  "rampup_batch_size": null,
  "decrease_batch_size_if_needed": false,
  "recompute_granularity": null,
  "check_for_nan_in_loss_and_grad": true,
  "check_for_spiky_loss": false,
  "check_for_large_grads": false,
  "distribute_saved_activations": false,
  "recompute_method": null,
  "recompute_num_layers": null,
  "recompute_modules": null,
  "clone_scatter_output_in_embedding": true,
  "profile": false,
  "profile_step_start": 10,
  "profile_step_end": 12,
  "iterations_to_skip": [],
  "result_rejected_tracker_filename": null,
  "enable_gloo_process_groups": true,
  "use_pytorch_profiler": false,
  "profile_ranks": [
    0
  ],
  "record_memory_history": false,
  "memory_snapshot_path": "snapshot.pickle",
  "tp_comm_overlap": false,
  "tp_comm_overlap_cfg": null,
  "tp_comm_overlap_ag": true,
  "tp_comm_overlap_rs": true,
  "tp_comm_overlap_rs_dgrad": false,
  "tp_comm_bulk_dgrad": true,
  "tp_comm_bulk_wgrad": true,
  "tp_comm_bootstrap_backend": "nccl",
  "use_cpu_initialization": null,
  "empty_unused_memory_level": 0,
  "deterministic_mode": false,
  "check_weight_hash_across_dp_replicas_interval": null,
  "calculate_per_token_loss": false,
  "train_sync_interval": null,
  "train_iters": 10000,
  "train_samples": null,
  "log_interval": 50,
  "exit_interval": null,
  "exit_duration_in_mins": null,
  "exit_signal_handler": false,
  "tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/tensorboard/",
  "masked_softmax_fusion": true,
  "bias_gelu_fusion": true,
  "bias_swiglu_fusion": true,
  "use_fused_weighted_squared_relu": false,
  "bias_dropout_fusion": true,
  "apply_rope_fusion": true,
  "rope_type": null,
  "cross_entropy_loss_fusion": true,
  "cross_entropy_fusion_impl": "native",
  "use_flash_attn": false,
  "add_bias_linear": true,
  "add_qkv_bias": true,
  "optimizer": "adam",
  "optimizer_cpu_offload": false,
  "optimizer_offload_fraction": 1.0,
  "use_torch_optimizer_for_cpu_offload": false,
  "overlap_cpu_optimizer_d2h_h2d": false,
  "pin_cpu_grads": true,
  "pin_cpu_params": true,
  "dataloader_type": "cyclic",
  "async_tensor_model_parallel_allreduce": true,
  "no_persist_layer_norm": false,
  "sequence_parallel": false,
  "gradient_accumulation_fusion": true,
  "deprecated_use_mcore_models": false,
  "use_legacy_models": false,
  "manual_gc": false,
  "manual_gc_interval": 0,
  "manual_gc_eval": true,
  "tp_comm_split_ag": true,
  "tp_comm_split_rs": true,
  "pipeline_model_parallel_comm_backend": null,
  "high_priority_stream_groups": [],
  "use_te_activation_func": false,
  "perform_rl_step": false,
  "rl_prompts_per_eval": 32,
  "grpo_prompts_per_step": 32,
  "grpo_group_size": 2,
  "grpo_iterations": 2,
  "grpo_clamp_eps_lower": 0.01,
  "grpo_clamp_eps_upper": 0.01,
  "grpo_kl_beta": 0.001,
  "grpo_entropy_term_weight": 0.0,
  "grpo_filter_groups_with_same_reward": false,
  "grpo_default_temperature": 1.0,
  "grpo_default_top_p": 0,
  "langrl_inference_server_type": "inplace_megatron",
  "langrl_inference_server_conversation_template": null,
  "langrl_env_config": null,
  "rl_offload_optimizer_during_inference": false,
  "rl_offload_kv_cache_during_training": false,
  "rl_remove_kv_cache_during_training": false,
  "rl_reset_cuda_graphs": false,
  "rl_partial_rollouts": false,
  "rl_inference_logprobs_is_correction": false,
  "rl_importance_sampling_truncation_coef": null,
  "rl_calculate_intra_group_similarity": false,
  "seed": 2026,
  "data_parallel_random_init": false,
  "init_method_std": 0.02,
  "embedding_init_method_std": null,
  "init_method_xavier_uniform": false,
  "lr": 5e-05,
  "lr_decay_style": "cosine",
  "lr_wsd_decay_style": "exponential",
  "lr_decay_iters": 10000,
  "lr_decay_samples": null,
  "lr_wsd_decay_samples": null,
  "lr_wsd_decay_iters": null,
  "lr_warmup_fraction": null,
  "lr_warmup_iters": 200,
  "lr_warmup_samples": 0,
  "lr_warmup_init": 0.0,
  "min_lr": 5e-06,
  "override_opt_param_scheduler": false,
  "use_checkpoint_opt_param_scheduler": false,
  "decoupled_lr": null,
  "decoupled_min_lr": null,
  "save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/",
  "save_interval": 200,
  "save_retain_interval": null,
  "no_save_optim": null,
  "no_save_rng": null,
  "load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/checkpoint/",
  "no_load_optim": null,
  "load_main_params_from_ckpt": null,
  "no_load_rng": null,
  "strict_fsdp_dtensor_load": true,
  "non_persistent_save_interval": null,
  "non_persistent_ckpt_type": null,
  "non_persistent_global_ckpt_dir": null,
  "non_persistent_local_ckpt_dir": null,
  "non_persistent_local_ckpt_algo": "fully_parallel",
  "finetune": false,
  "pretrained_checkpoint": null,
  "ckpt_step": null,
  "perform_initialization": true,
  "use_checkpoint_args": false,
  "use_mp_args_from_checkpoint_args": false,
  "use_tokenizer_model_from_checkpoint_args": true,
  "exit_on_missing_checkpoint": false,
  "use_dist_ckpt_deprecated": false,
  "use_persistent_ckpt_worker": false,
  "auto_detect_ckpt_format": false,
  "dist_ckpt_format_deprecated": null,
  "ckpt_format": "torch_dist",
  "ckpt_convert_format": null,
  "ckpt_convert_save": null,
  "ckpt_convert_update_legacy_dist_opt_format": false,
  "ckpt_fully_parallel_save_deprecated": false,
  "ckpt_fully_parallel_save": true,
  "async_save": null,
  "ckpt_fully_parallel_load": false,
  "ckpt_assume_constant_structure": false,
  "dist_ckpt_strictness": "assume_ok_unexpected",
  "dist_ckpt_save_pre_mcore_014": false,
  "dist_ckpt_optim_fully_reshardable": false,
  "distrib_optim_fully_reshardable_mem_efficient": false,
  "load_model_opt_format": false,
  "fp16": false,
  "bf16": true,
  "grad_reduce_in_bf16": false,
  "loss_scale": null,
  "initial_loss_scale": 4294967296,
  "min_loss_scale": 1.0,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "fp32_residual_connection": false,
  "apply_query_key_layer_scaling": false,
  "attention_softmax_in_fp32": false,
  "accumulate_allreduce_grads_in_fp32": true,
  "fp16_lm_cross_entropy": false,
  "disable_bf16_reduced_precision_matmul": false,
  "reuse_grad_buf_for_mxfp8_param_ag": false,
  "tensor_model_parallel_size": 1,
  "pipeline_model_parallel_size": 1,
  "decoder_first_pipeline_num_layers": null,
  "decoder_last_pipeline_num_layers": null,
  "pipeline_model_parallel_layout": null,
  "num_layers_per_virtual_pipeline_stage": null,
  "num_virtual_stages_per_pipeline_rank": null,
  "microbatch_group_size_per_vp_stage": null,
  "overlap_p2p_comm": false,
  "overlap_p2p_comm_warmup_flush": false,
  "distributed_backend": "nccl",
  "distributed_timeout_minutes": 10,
  "overlap_grad_reduce": false,
  "defer_embedding_wgrad_compute": false,
  "wgrad_deferral_limit": 0,
  "align_grad_reduce": true,
  "ddp_num_buckets": null,
  "ddp_bucket_size": null,
  "ddp_pad_buckets_for_high_nccl_busbw": false,
  "ddp_average_in_collective": false,
  "overlap_param_gather": false,
  "overlap_param_gather_with_optimizer_step": false,
  "align_param_gather": false,
  "scatter_gather_tensors_in_pipeline": true,
  "use_ring_exchange_p2p": false,
  "local_rank": 0,
  "lazy_mpu_init": null,
  "account_for_embedding_in_pipeline_split": false,
  "account_for_loss_in_pipeline_split": false,
  "use_distributed_optimizer": true,
  "nccl_ub": false,
  "disable_symmetric_registration": false,
  "use_sharp": false,
  "sharp_enabled_group": null,
  "use_megatron_fsdp": false,
  "init_model_with_meta_device": false,
  "data_parallel_sharding_strategy": "no_shard",
  "gradient_reduce_div_fusion": true,
  "fsdp_double_buffer": false,
  "suggested_communication_unit_size": null,
  "keep_fp8_transpose_cache": false,
  "enable_full_sharding_in_hsdp": false,
  "num_distributed_optimizer_instances": 1,
  "use_torch_fsdp2": false,
  "torch_fsdp2_reshard_after_forward": true,
  "context_parallel_size": 1,
  "cp_comm_type": [
    "p2p"
  ],
  "hierarchical_context_parallel_sizes": null,
  "nccl_communicator_config_path": null,
  "use_tp_pp_dp_mapping": false,
  "replication": false,
  "replication_jump": null,
  "replication_factor": 2,
  "full_validation": false,
  "multiple_validation_sets": false,
  "eval_iters": 100,
  "eval_interval": 200,
  "test_mode": false,
  "skip_train": false,
  "data_path": null,
  "split": null,
  "train_data_path": [
    "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter0_full",
    "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter1_full",
    "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter2_full",
    "/2214/dongyuanliang/torchtitan/washed_top20w_latest5w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/train_filter3_full"
  ],
  "valid_data_path": [
    "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter0_full",
    "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter1_full",
    "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter2_full",
    "/2214/dongyuanliang/torchtitan/washed_1800w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_trainval_split4_merged/valid_filter3_full"
  ],
  "test_data_path": null,
  "data_args_path": null,
  "per_split_data_args_path": null,
  "data_cache_path": null,
  "mmap_bin_files": true,
  "mock_data": false,
  "seq_length": 8192,
  "encoder_seq_length": 8192,
  "decoder_seq_length": null,
  "retriever_seq_length": 256,
  "sample_rate": 1.0,
  "mask_prob": 0.15,
  "short_seq_prob": 0.1,
  "num_workers": 12,
  "reset_position_ids": false,
  "reset_attention_mask": false,
  "eod_mask_loss": true,
  "create_attention_mask_in_dataloader": true,
  "num_dataset_builder_threads": 1,
  "object_storage_cache_path": null,
  "mid_level_dataset_surplus": 0.005,
  "vocab_size": 193792,
  "padded_vocab_size": 193920,
  "vocab_file": null,
  "merge_file": null,
  "vocab_extra_ids": 0,
  "tokenizer_type": "NullTokenizer",
  "tokenizer_model": null,
  "tokenizer_metadata": null,
  "tiktoken_pattern": null,
  "tiktoken_num_special_tokens": 1000,
  "tiktoken_special_tokens": null,
  "legacy_tokenizer": false,
  "trust_remote_code": false,
  "adlr_autoresume": false,
  "adlr_autoresume_interval": 1000,
  "ict_head_size": null,
  "biencoder_projection_dim": 0,
  "biencoder_shared_query_context_model": false,
  "ict_load": null,
  "bert_load": null,
  "titles_data_path": null,
  "query_in_block_prob": 0.1,
  "use_one_sent_docs": false,
  "evidence_data_path": null,
  "retriever_report_topk_accuracies": [],
  "retriever_score_scaling": false,
  "block_data_path": null,
  "embedding_path": null,
  "indexer_batch_size": 128,
  "indexer_log_interval": 1000,
  "num_classes": 1000,
  "img_h": 224,
  "img_w": 224,
  "num_channels": 3,
  "patch_dim": 16,
  "classes_fraction": 1.0,
  "data_per_class_fraction": 1.0,
  "data_sharding": true,
  "head_lr_mult": 1.0,
  "vision_pretraining": false,
  "vision_pretraining_type": "classify",
  "vision_backbone_type": "vit",
  "swin_backbone_type": "tiny",
  "mask_type": "random",
  "mask_factor": 1.0,
  "iter_per_epoch": 1250,
  "dino_local_img_size": 96,
  "dino_local_crops_number": 10,
  "dino_head_hidden_size": 2048,
  "dino_bottleneck_size": 256,
  "dino_freeze_last_layer": 1,
  "dino_norm_last_layer": false,
  "dino_warmup_teacher_temp": 0.04,
  "dino_teacher_temp": 0.07,
  "dino_warmup_teacher_temp_epochs": 30,
  "qk_layernorm": false,
  "qk_l2_norm": false,
  "expert_model_parallel_size": 1,
  "expert_tensor_parallel_size": 1,
  "num_experts": null,
  "moe_layer_freq": 1,
  "moe_ffn_hidden_size": null,
  "moe_shared_expert_intermediate_size": null,
  "moe_shared_expert_overlap": false,
  "moe_grouped_gemm": false,
  "moe_use_legacy_grouped_gemm": false,
  "moe_layer_recompute": false,
  "moe_extended_tp": false,
  "moe_use_upcycling": false,
  "moe_router_load_balancing_type": "aux_loss",
  "moe_router_dtype": null,
  "moe_router_fusion": false,
  "moe_router_score_function": "softmax",
  "moe_router_topk": 2,
  "moe_router_pre_softmax": false,
  "moe_router_num_groups": null,
  "moe_router_group_topk": null,
  "moe_router_topk_scaling_factor": null,
  "moe_router_enable_expert_bias": false,
  "moe_router_bias_update_rate": 0.001,
  "moe_router_force_load_balancing": false,
  "moe_router_padding_for_fp8": false,
  "moe_aux_loss_coeff": 0.0,
  "moe_z_loss_coeff": null,
  "moe_input_jitter_eps": null,
  "moe_per_layer_logging": false,
  "moe_token_dispatcher_type": "allgather",
  "moe_enable_deepep": false,
  "moe_deepep_num_sms": 20,
  "moe_permute_fusion": false,
  "moe_expert_capacity_factor": null,
  "moe_pad_expert_input_to_capacity": false,
  "moe_token_drop_policy": "probs",
  "moe_apply_probs_on_input": false,
  "overlap_moe_expert_parallel_comm": false,
  "delay_wgrad_compute": false,
  "moe_upcycling_granularity": 1,
  "q_lora_rank": null,
  "kv_lora_rank": 32,
  "qk_head_dim": 128,
  "qk_pos_emb_head_dim": 64,
  "v_head_dim": 128,
  "rotary_scaling_factor": 1.0,
  "mscale": 1.0,
  "mscale_all_dim": 0.0,
  "cache_mla_latents": false,
  "heterogeneous_layers_config_path": null,
  "heterogeneous_layers_config_encoded_json": null,
  "log_params_norm": true,
  "log_num_zeros_in_grad": true,
  "log_throughput": true,
  "log_progress": true,
  "timing_log_level": 0,
  "log_energy": false,
  "barrier_with_L1_time": true,
  "timing_log_option": "minmax",
  "tensorboard_log_interval": 1,
  "tensorboard_queue_size": 1000,
  "log_timers_to_tensorboard": true,
  "log_loss_scale_to_tensorboard": true,
  "log_validation_ppl_to_tensorboard": true,
  "log_memory_to_tensorboard": true,
  "log_world_size_to_tensorboard": true,
  "wandb_project": "Megatron_Stage2",
  "wandb_entity": "",
  "wandb_exp_name": "1B_nl24_hs2048_gb480_seed2026_20260204_q01_ft5k_super_v2",
  "wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb480_seed2026/20260204_q01_ft5k_super_v2/wandb/",
  "logging_level": null,
  "log_straggler": false,
  "disable_straggler_on_startup": false,
  "straggler_ctrlr_port": 65535,
  "straggler_minmax_count": 1,
  "run_workload_inspector_server": false,
  "inference_batch_times_seqlen_threshold": -1,
  "max_tokens_to_oom": 12000,
  "output_bert_embeddings": false,
  "bert_embedder_type": "megatron",
  "flash_decode": false,
  "enable_cuda_graph": false,
  "cuda_graph_warmup_steps": 3,
  "external_cuda_graph": false,
  "cuda_graph_scope": "full",
  "inference_max_batch_size": 8,
  "inference_max_seq_length": 2560,
  "inference_dynamic_batching": false,
  "inference_dynamic_batching_buffer_size_gb": 40.0,
  "inference_dynamic_batching_chunk_size": 256,
  "inference_dynamic_batching_buffer_guaranteed_fraction": 0.2,
  "inference_dynamic_batching_buffer_overflow_factor": null,
  "inference_dynamic_batching_max_requests_override": null,
  "inference_dynamic_batching_max_tokens_override": null,
  "inference_dynamic_batching_num_cuda_graphs": 16,
  "inference_dynamic_batching_track_paused_request_events": false,
  "symmetric_ar_type": null,
  "nccl_all_reduce_for_prefill": false,
  "mlp_chunks_for_prefill": 1,
  "initialize_socket_comms": false,
  "fp8": null,
  "fp8_recipe": "delayed",
  "fp8_margin": 0,
  "fp8_interval": 1,
  "fp8_amax_history_len": 1,
  "fp8_amax_compute_algo": "most_recent",
  "fp8_wgrad": true,
  "transformer_impl": "transformer_engine",
  "fp8_param_gather": false,
  "first_last_layers_bf16": false,
  "num_layers_at_start_in_bf16": 1,
  "num_layers_at_end_in_bf16": 1,
  "fp4": null,
  "fp4_recipe": "nvfp4",
  "fp4_param": false,
  "te_rng_tracker": false,
  "inference_rng_tracker": false,
  "retro_project_dir": null,
  "retro_add_retriever": false,
  "retro_cyclic_train_iters": null,
  "retro_encoder_layers": 2,
  "retro_encoder_hidden_dropout": 0.1,
  "retro_encoder_attention_dropout": 0.1,
  "retro_num_neighbors": 2,
  "retro_num_retrieved_chunks": 2,
  "retro_attention_gate": 1,
  "retro_verify_neighbor_count": true,
  "enable_experimental": false,
  "spec": null,
  "hybrid_attention_ratio": 0.0,
  "hybrid_mlp_ratio": 0.0,
  "hybrid_override_pattern": null,
  "mamba_state_dim": 128,
  "mamba_head_dim": 64,
  "mamba_num_groups": 8,
  "mamba_num_heads": null,
  "is_hybrid_model": false,
  "disable_mamba_mem_eff_path": false,
  "yaml_cfg": null,
  "use_precision_aware_optimizer": false,
  "main_grads_dtype": "torch.float32",
  "main_params_dtype": "torch.float32",
  "exp_avg_dtype": "torch.float32",
  "exp_avg_sq_dtype": "torch.float32",
  "enable_one_logger": true,
  "one_logger_project": "megatron-lm",
  "one_logger_run_name": null,
  "one_logger_async": false,
  "app_tag_run_name": null,
  "app_tag_run_version": "0.0.0",
  "inprocess_restart": false,
  "inprocess_max_iterations": null,
  "inprocess_monitor_thread_interval": 1.0,
  "inprocess_monitor_process_interval": 1.0,
  "inprocess_progress_watchdog_interval": 1.0,
  "inprocess_heartbeat_interval": 30,
  "inprocess_soft_timeout": 60,
  "inprocess_hard_timeout": 90,
  "inprocess_heartbeat_timeout": 60,
  "inprocess_barrier_timeout": 120,
  "inprocess_completion_timeout": 120,
  "inprocess_last_call_wait": 1,
  "inprocess_termination_grace_time": 1,
  "inprocess_granularity": "node",
  "inprocess_active_world_size": 40,
  "inprocess_empty_cuda_cache": false,
  "enable_ft_package": false,
  "calc_ft_timeouts": false,
  "config_logger_dir": "",
  "error_injection_rate": 0,
  "error_injection_type": "transient_error",
  "rerun_mode": "validate_results",
  "enable_msc": true,
  "kitchen_config_file": null,
  "kitchen_recipe_number": null,
  "sft": false,
  "sft_tokenizer_prompt_format": "nemotron-h-aligned",
  "num_quantizers": 64,
  "export_model_type": "GPTModel",
  "export_legacy_megatron": false,
  "export_te_mcore_model": false,
  "export_force_local_attention": false,
  "export_kv_cache_quant": false,
  "export_real_quant_cfg": "None",
  "export_quant_cfg": null,
  "export_kd_cfg": null,
  "teacher_model_config": null,
  "export_kd_teacher_load": null,
  "export_kd_teacher_ckpt_format": null,
  "finetune_hf_dataset": null,
  "finetune_data_split": "train",
  "export_qk_l2_norm": false,
  "export_moe_apply_probs_on_input": false,
  "export_offline_model": false,
  "rank": 0,
  "world_size": 40,
  "use_dist_ckpt": true,
  "transformer_pipeline_model_parallel_size": 1,
  "data_parallel_size": 40,
  "virtual_pipeline_model_parallel_size": null,
  "params_dtype": "torch.bfloat16",
  "consumed_train_samples": 4800000,
  "skipped_train_samples": 0,
  "consumed_valid_samples": 2400000,
  "variable_seq_lengths": false,
  "model_type": "<stub>",
  "iteration": 8000,
  "num_floating_point_operations_so_far": 2.926014640120922e+22,
  "do_train": 1,
  "do_valid": 1,
  "do_test": 0,
  "curr_iteration": 9999
}