Khala-MusicGeneration-v1.0-MPS / backbone_megatron_args.json

backbone config

2cdbd5d verified 24 days ago

22.4 kB

	{
	"num_layers": 24,
	"encoder_num_layers": 24,
	"decoder_num_layers": null,
	"hidden_size": 2048,
	"ffn_hidden_size": 5632,
	"num_attention_heads": 32,
	"attention_backend": "<stub>",
	"kv_channels": 64,
	"group_query_attention": true,
	"num_query_groups": 8,
	"softmax_type": "vanilla",
	"window_size": null,
	"window_attn_skip_freq": null,
	"max_position_embeddings": 16384,
	"position_embedding_type": "rope",
	"relative_attention_num_buckets": 32,
	"relative_attention_max_distance": 128,
	"use_rotary_position_embeddings": false,
	"rotary_base": 500000,
	"rotary_percent": 1.0,
	"rotary_interleaved": false,
	"rotary_seq_len_interpolation_factor": null,
	"use_rope_scaling": false,
	"rope_scaling_factor": 8.0,
	"no_rope_freq": null,
	"add_position_embedding": true,
	"mrope_section": null,
	"make_vocab_size_divisible_by": 128,
	"normalization": "RMSNorm",
	"norm_epsilon": 1e-06,
	"apply_layernorm_1p": false,
	"apply_residual_connection_post_layernorm": false,
	"openai_gelu": false,
	"squared_relu": false,
	"swiglu": true,
	"quick_geglu": false,
	"activation_func_clamp_value": null,
	"glu_linear_offset": 0.0,
	"onnx_safe": null,
	"bert_binary_head": true,
	"untie_embeddings_and_output_weights": true,
	"multi_latent_attention": false,
	"mtp_num_layers": null,
	"mtp_loss_scaling_factor": 0.1,
	"attention_dropout": 0.0,
	"hidden_dropout": 0.0,
	"weight_decay": 0.1,
	"start_weight_decay": 0.1,
	"end_weight_decay": 0.1,
	"weight_decay_incr_style": "constant",
	"clip_grad": 1.0,
	"adam_beta1": 0.9,
	"adam_beta2": 0.95,
	"adam_eps": 1e-08,
	"sgd_momentum": 0.9,
	"micro_batch_size": 1,
	"global_batch_size": 240,
	"rampup_batch_size": null,
	"decrease_batch_size_if_needed": false,
	"recompute_granularity": null,
	"check_for_nan_in_loss_and_grad": true,
	"check_for_spiky_loss": false,
	"check_for_large_grads": false,
	"distribute_saved_activations": false,
	"recompute_method": null,
	"recompute_num_layers": null,
	"recompute_modules": null,
	"clone_scatter_output_in_embedding": true,
	"profile": false,
	"profile_step_start": 10,
	"profile_step_end": 12,
	"iterations_to_skip": [],
	"result_rejected_tracker_filename": null,
	"enable_gloo_process_groups": true,
	"use_pytorch_profiler": false,
	"profile_ranks": [
	0
	],
	"record_memory_history": false,
	"memory_snapshot_path": "snapshot.pickle",
	"tp_comm_overlap": false,
	"tp_comm_overlap_cfg": null,
	"tp_comm_overlap_ag": true,
	"tp_comm_overlap_rs": true,
	"tp_comm_overlap_rs_dgrad": false,
	"tp_comm_bulk_dgrad": true,
	"tp_comm_bulk_wgrad": true,
	"tp_comm_bootstrap_backend": "nccl",
	"use_cpu_initialization": null,
	"empty_unused_memory_level": 0,
	"deterministic_mode": false,
	"check_weight_hash_across_dp_replicas_interval": null,
	"calculate_per_token_loss": false,
	"train_sync_interval": null,
	"train_iters": 36000,
	"train_samples": null,
	"log_interval": 50,
	"exit_interval": null,
	"exit_duration_in_mins": null,
	"exit_signal_handler": false,
	"tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/tensorboard/",
	"masked_softmax_fusion": true,
	"bias_gelu_fusion": true,
	"bias_swiglu_fusion": true,
	"use_fused_weighted_squared_relu": false,
	"bias_dropout_fusion": true,
	"apply_rope_fusion": true,
	"rope_type": null,
	"cross_entropy_loss_fusion": true,
	"cross_entropy_fusion_impl": "native",
	"use_flash_attn": false,
	"add_bias_linear": true,
	"add_qkv_bias": true,
	"optimizer": "adam",
	"optimizer_cpu_offload": false,
	"optimizer_offload_fraction": 1.0,
	"use_torch_optimizer_for_cpu_offload": false,
	"overlap_cpu_optimizer_d2h_h2d": false,
	"pin_cpu_grads": true,
	"pin_cpu_params": true,
	"dataloader_type": "cyclic",
	"async_tensor_model_parallel_allreduce": true,
	"no_persist_layer_norm": false,
	"sequence_parallel": false,
	"gradient_accumulation_fusion": true,
	"deprecated_use_mcore_models": false,
	"use_legacy_models": false,
	"manual_gc": false,
	"manual_gc_interval": 0,
	"manual_gc_eval": true,
	"tp_comm_split_ag": true,
	"tp_comm_split_rs": true,
	"pipeline_model_parallel_comm_backend": null,
	"high_priority_stream_groups": [],
	"use_te_activation_func": false,
	"perform_rl_step": false,
	"rl_prompts_per_eval": 32,
	"grpo_prompts_per_step": 32,
	"grpo_group_size": 2,
	"grpo_iterations": 2,
	"grpo_clamp_eps_lower": 0.01,
	"grpo_clamp_eps_upper": 0.01,
	"grpo_kl_beta": 0.001,
	"grpo_entropy_term_weight": 0.0,
	"grpo_filter_groups_with_same_reward": false,
	"grpo_default_temperature": 1.0,
	"grpo_default_top_p": 0,
	"langrl_inference_server_type": "inplace_megatron",
	"langrl_inference_server_conversation_template": null,
	"langrl_env_config": null,
	"rl_offload_optimizer_during_inference": false,
	"rl_offload_kv_cache_during_training": false,
	"rl_remove_kv_cache_during_training": false,
	"rl_reset_cuda_graphs": false,
	"rl_partial_rollouts": false,
	"rl_inference_logprobs_is_correction": false,
	"rl_importance_sampling_truncation_coef": null,
	"rl_calculate_intra_group_similarity": false,
	"seed": 2026,
	"data_parallel_random_init": false,
	"init_method_std": 0.02,
	"embedding_init_method_std": null,
	"init_method_xavier_uniform": false,
	"lr": 0.0001,
	"lr_decay_style": "cosine",
	"lr_wsd_decay_style": "exponential",
	"lr_decay_iters": 36000,
	"lr_decay_samples": null,
	"lr_wsd_decay_samples": null,
	"lr_wsd_decay_iters": null,
	"lr_warmup_fraction": null,
	"lr_warmup_iters": 5000,
	"lr_warmup_samples": 0,
	"lr_warmup_init": 0.0,
	"min_lr": 1e-05,
	"override_opt_param_scheduler": false,
	"use_checkpoint_opt_param_scheduler": false,
	"decoupled_lr": null,
	"decoupled_min_lr": null,
	"save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/checkpoint/",
	"save_interval": 2000,
	"save_retain_interval": null,
	"no_save_optim": null,
	"no_save_rng": null,
	"load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260132_q01_v0/checkpoint",
	"no_load_optim": null,
	"load_main_params_from_ckpt": null,
	"no_load_rng": null,
	"strict_fsdp_dtensor_load": true,
	"non_persistent_save_interval": null,
	"non_persistent_ckpt_type": null,
	"non_persistent_global_ckpt_dir": null,
	"non_persistent_local_ckpt_dir": null,
	"non_persistent_local_ckpt_algo": "fully_parallel",
	"finetune": true,
	"pretrained_checkpoint": null,
	"ckpt_step": null,
	"perform_initialization": true,
	"use_checkpoint_args": false,
	"use_mp_args_from_checkpoint_args": false,
	"use_tokenizer_model_from_checkpoint_args": true,
	"exit_on_missing_checkpoint": false,
	"use_dist_ckpt_deprecated": false,
	"use_persistent_ckpt_worker": false,
	"auto_detect_ckpt_format": false,
	"dist_ckpt_format_deprecated": null,
	"ckpt_format": "torch_dist",
	"ckpt_convert_format": null,
	"ckpt_convert_save": null,
	"ckpt_convert_update_legacy_dist_opt_format": false,
	"ckpt_fully_parallel_save_deprecated": false,
	"ckpt_fully_parallel_save": true,
	"async_save": null,
	"ckpt_fully_parallel_load": false,
	"ckpt_assume_constant_structure": false,
	"dist_ckpt_strictness": "assume_ok_unexpected",
	"dist_ckpt_save_pre_mcore_014": false,
	"dist_ckpt_optim_fully_reshardable": false,
	"distrib_optim_fully_reshardable_mem_efficient": false,
	"load_model_opt_format": false,
	"fp16": false,
	"bf16": true,
	"grad_reduce_in_bf16": false,
	"loss_scale": null,
	"initial_loss_scale": 4294967296,
	"min_loss_scale": 1.0,
	"loss_scale_window": 1000,
	"hysteresis": 2,
	"fp32_residual_connection": false,
	"apply_query_key_layer_scaling": false,
	"attention_softmax_in_fp32": false,
	"accumulate_allreduce_grads_in_fp32": true,
	"fp16_lm_cross_entropy": false,
	"disable_bf16_reduced_precision_matmul": false,
	"reuse_grad_buf_for_mxfp8_param_ag": false,
	"tensor_model_parallel_size": 1,
	"pipeline_model_parallel_size": 1,
	"decoder_first_pipeline_num_layers": null,
	"decoder_last_pipeline_num_layers": null,
	"pipeline_model_parallel_layout": null,
	"num_layers_per_virtual_pipeline_stage": null,
	"num_virtual_stages_per_pipeline_rank": null,
	"microbatch_group_size_per_vp_stage": null,
	"overlap_p2p_comm": false,
	"overlap_p2p_comm_warmup_flush": false,
	"distributed_backend": "nccl",
	"distributed_timeout_minutes": 10,
	"overlap_grad_reduce": false,
	"defer_embedding_wgrad_compute": false,
	"wgrad_deferral_limit": 0,
	"align_grad_reduce": true,
	"ddp_num_buckets": null,
	"ddp_bucket_size": null,
	"ddp_pad_buckets_for_high_nccl_busbw": false,
	"ddp_average_in_collective": false,
	"overlap_param_gather": false,
	"overlap_param_gather_with_optimizer_step": false,
	"align_param_gather": false,
	"scatter_gather_tensors_in_pipeline": true,
	"use_ring_exchange_p2p": false,
	"local_rank": 0,
	"lazy_mpu_init": null,
	"account_for_embedding_in_pipeline_split": false,
	"account_for_loss_in_pipeline_split": false,
	"use_distributed_optimizer": true,
	"nccl_ub": false,
	"disable_symmetric_registration": false,
	"use_sharp": false,
	"sharp_enabled_group": null,
	"use_megatron_fsdp": false,
	"init_model_with_meta_device": false,
	"data_parallel_sharding_strategy": "no_shard",
	"gradient_reduce_div_fusion": true,
	"fsdp_double_buffer": false,
	"suggested_communication_unit_size": null,
	"keep_fp8_transpose_cache": false,
	"enable_full_sharding_in_hsdp": false,
	"num_distributed_optimizer_instances": 1,
	"use_torch_fsdp2": false,
	"torch_fsdp2_reshard_after_forward": true,
	"context_parallel_size": 1,
	"cp_comm_type": [
	"p2p"
	],
	"hierarchical_context_parallel_sizes": null,
	"nccl_communicator_config_path": null,
	"use_tp_pp_dp_mapping": false,
	"replication": false,
	"replication_jump": null,
	"replication_factor": 2,
	"full_validation": false,
	"multiple_validation_sets": false,
	"eval_iters": 100,
	"eval_interval": 500,
	"test_mode": false,
	"skip_train": false,
	"data_path": null,
	"split": null,
	"train_data_path": [
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter0_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter1_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter2_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter3_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter0_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter1_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter2_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter3_full"
	],
	"valid_data_path": [
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter0_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter1_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter2_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter3_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter0_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter1_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter2_full",
	"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter3_full"
	],
	"test_data_path": null,
	"data_args_path": null,
	"per_split_data_args_path": null,
	"data_cache_path": null,
	"mmap_bin_files": true,
	"mock_data": false,
	"seq_length": 16384,
	"encoder_seq_length": 16384,
	"decoder_seq_length": null,
	"retriever_seq_length": 256,
	"sample_rate": 1.0,
	"mask_prob": 0.15,
	"short_seq_prob": 0.1,
	"num_workers": 12,
	"reset_position_ids": false,
	"reset_attention_mask": false,
	"eod_mask_loss": true,
	"create_attention_mask_in_dataloader": true,
	"num_dataset_builder_threads": 1,
	"object_storage_cache_path": null,
	"mid_level_dataset_surplus": 0.005,
	"vocab_size": 130304,
	"padded_vocab_size": 130432,
	"vocab_file": null,
	"merge_file": null,
	"vocab_extra_ids": 0,
	"tokenizer_type": "NullTokenizer",
	"tokenizer_model": null,
	"tokenizer_metadata": null,
	"tiktoken_pattern": null,
	"tiktoken_num_special_tokens": 1000,
	"tiktoken_special_tokens": null,
	"legacy_tokenizer": false,
	"trust_remote_code": false,
	"adlr_autoresume": false,
	"adlr_autoresume_interval": 1000,
	"ict_head_size": null,
	"biencoder_projection_dim": 0,
	"biencoder_shared_query_context_model": false,
	"ict_load": null,
	"bert_load": null,
	"titles_data_path": null,
	"query_in_block_prob": 0.1,
	"use_one_sent_docs": false,
	"evidence_data_path": null,
	"retriever_report_topk_accuracies": [],
	"retriever_score_scaling": false,
	"block_data_path": null,
	"embedding_path": null,
	"indexer_batch_size": 128,
	"indexer_log_interval": 1000,
	"num_classes": 1000,
	"img_h": 224,
	"img_w": 224,
	"num_channels": 3,
	"patch_dim": 16,
	"classes_fraction": 1.0,
	"data_per_class_fraction": 1.0,
	"data_sharding": true,
	"head_lr_mult": 1.0,
	"vision_pretraining": false,
	"vision_pretraining_type": "classify",
	"vision_backbone_type": "vit",
	"swin_backbone_type": "tiny",
	"mask_type": "random",
	"mask_factor": 1.0,
	"iter_per_epoch": 1250,
	"dino_local_img_size": 96,
	"dino_local_crops_number": 10,
	"dino_head_hidden_size": 2048,
	"dino_bottleneck_size": 256,
	"dino_freeze_last_layer": 1,
	"dino_norm_last_layer": false,
	"dino_warmup_teacher_temp": 0.04,
	"dino_teacher_temp": 0.07,
	"dino_warmup_teacher_temp_epochs": 30,
	"qk_layernorm": false,
	"qk_l2_norm": false,
	"expert_model_parallel_size": 1,
	"expert_tensor_parallel_size": 1,
	"num_experts": null,
	"moe_layer_freq": 1,
	"moe_ffn_hidden_size": null,
	"moe_shared_expert_intermediate_size": null,
	"moe_shared_expert_overlap": false,
	"moe_grouped_gemm": false,
	"moe_use_legacy_grouped_gemm": false,
	"moe_layer_recompute": false,
	"moe_extended_tp": false,
	"moe_use_upcycling": false,
	"moe_router_load_balancing_type": "aux_loss",
	"moe_router_dtype": null,
	"moe_router_fusion": false,
	"moe_router_score_function": "softmax",
	"moe_router_topk": 2,
	"moe_router_pre_softmax": false,
	"moe_router_num_groups": null,
	"moe_router_group_topk": null,
	"moe_router_topk_scaling_factor": null,
	"moe_router_enable_expert_bias": false,
	"moe_router_bias_update_rate": 0.001,
	"moe_router_force_load_balancing": false,
	"moe_router_padding_for_fp8": false,
	"moe_aux_loss_coeff": 0.0,
	"moe_z_loss_coeff": null,
	"moe_input_jitter_eps": null,
	"moe_per_layer_logging": false,
	"moe_token_dispatcher_type": "allgather",
	"moe_enable_deepep": false,
	"moe_deepep_num_sms": 20,
	"moe_permute_fusion": false,
	"moe_expert_capacity_factor": null,
	"moe_pad_expert_input_to_capacity": false,
	"moe_token_drop_policy": "probs",
	"moe_apply_probs_on_input": false,
	"overlap_moe_expert_parallel_comm": false,
	"delay_wgrad_compute": false,
	"moe_upcycling_granularity": 1,
	"q_lora_rank": null,
	"kv_lora_rank": 32,
	"qk_head_dim": 128,
	"qk_pos_emb_head_dim": 64,
	"v_head_dim": 128,
	"rotary_scaling_factor": 1.0,
	"mscale": 1.0,
	"mscale_all_dim": 0.0,
	"cache_mla_latents": false,
	"heterogeneous_layers_config_path": null,
	"heterogeneous_layers_config_encoded_json": null,
	"log_params_norm": true,
	"log_num_zeros_in_grad": true,
	"log_throughput": true,
	"log_progress": true,
	"timing_log_level": 0,
	"log_energy": false,
	"barrier_with_L1_time": true,
	"timing_log_option": "minmax",
	"tensorboard_log_interval": 1,
	"tensorboard_queue_size": 1000,
	"log_timers_to_tensorboard": true,
	"log_loss_scale_to_tensorboard": true,
	"log_validation_ppl_to_tensorboard": true,
	"log_memory_to_tensorboard": true,
	"log_world_size_to_tensorboard": true,
	"wandb_project": "Megatron_Stage1",
	"wandb_entity": "",
	"wandb_exp_name": "1B_nl24_hs2048_gb240_seed2026_20260217_q01_354k_tag_desc_v0",
	"wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/wandb/",
	"logging_level": null,
	"log_straggler": false,
	"disable_straggler_on_startup": false,
	"straggler_ctrlr_port": 65535,
	"straggler_minmax_count": 1,
	"run_workload_inspector_server": false,
	"inference_batch_times_seqlen_threshold": -1,
	"max_tokens_to_oom": 12000,
	"output_bert_embeddings": false,
	"bert_embedder_type": "megatron",
	"flash_decode": false,
	"enable_cuda_graph": false,
	"cuda_graph_warmup_steps": 3,
	"external_cuda_graph": false,
	"cuda_graph_scope": "full",
	"inference_max_batch_size": 8,
	"inference_max_seq_length": 2560,
	"inference_dynamic_batching": false,
	"inference_dynamic_batching_buffer_size_gb": 40.0,
	"inference_dynamic_batching_chunk_size": 256,
	"inference_dynamic_batching_buffer_guaranteed_fraction": 0.2,
	"inference_dynamic_batching_buffer_overflow_factor": null,
	"inference_dynamic_batching_max_requests_override": null,
	"inference_dynamic_batching_max_tokens_override": null,
	"inference_dynamic_batching_num_cuda_graphs": 16,
	"inference_dynamic_batching_track_paused_request_events": false,
	"symmetric_ar_type": null,
	"nccl_all_reduce_for_prefill": false,
	"mlp_chunks_for_prefill": 1,
	"initialize_socket_comms": false,
	"fp8": null,
	"fp8_recipe": "delayed",
	"fp8_margin": 0,
	"fp8_interval": 1,
	"fp8_amax_history_len": 1,
	"fp8_amax_compute_algo": "most_recent",
	"fp8_wgrad": true,
	"transformer_impl": "transformer_engine",
	"fp8_param_gather": false,
	"first_last_layers_bf16": false,
	"num_layers_at_start_in_bf16": 1,
	"num_layers_at_end_in_bf16": 1,
	"fp4": null,
	"fp4_recipe": "nvfp4",
	"fp4_param": false,
	"te_rng_tracker": false,
	"inference_rng_tracker": false,
	"retro_project_dir": null,
	"retro_add_retriever": false,
	"retro_cyclic_train_iters": null,
	"retro_encoder_layers": 2,
	"retro_encoder_hidden_dropout": 0.1,
	"retro_encoder_attention_dropout": 0.1,
	"retro_num_neighbors": 2,
	"retro_num_retrieved_chunks": 2,
	"retro_attention_gate": 1,
	"retro_verify_neighbor_count": true,
	"enable_experimental": false,
	"spec": null,
	"hybrid_attention_ratio": 0.0,
	"hybrid_mlp_ratio": 0.0,
	"hybrid_override_pattern": null,
	"mamba_state_dim": 128,
	"mamba_head_dim": 64,
	"mamba_num_groups": 8,
	"mamba_num_heads": null,
	"is_hybrid_model": false,
	"disable_mamba_mem_eff_path": false,
	"yaml_cfg": null,
	"use_precision_aware_optimizer": false,
	"main_grads_dtype": "torch.float32",
	"main_params_dtype": "torch.float32",
	"exp_avg_dtype": "torch.float32",
	"exp_avg_sq_dtype": "torch.float32",
	"enable_one_logger": true,
	"one_logger_project": "megatron-lm",
	"one_logger_run_name": null,
	"one_logger_async": false,
	"app_tag_run_name": null,
	"app_tag_run_version": "0.0.0",
	"inprocess_restart": false,
	"inprocess_max_iterations": null,
	"inprocess_monitor_thread_interval": 1.0,
	"inprocess_monitor_process_interval": 1.0,
	"inprocess_progress_watchdog_interval": 1.0,
	"inprocess_heartbeat_interval": 30,
	"inprocess_soft_timeout": 60,
	"inprocess_hard_timeout": 90,
	"inprocess_heartbeat_timeout": 60,
	"inprocess_barrier_timeout": 120,
	"inprocess_completion_timeout": 120,
	"inprocess_last_call_wait": 1,
	"inprocess_termination_grace_time": 1,
	"inprocess_granularity": "node",
	"inprocess_active_world_size": 48,
	"inprocess_empty_cuda_cache": false,
	"enable_ft_package": false,
	"calc_ft_timeouts": false,
	"config_logger_dir": "",
	"error_injection_rate": 0,
	"error_injection_type": "transient_error",
	"rerun_mode": "validate_results",
	"enable_msc": true,
	"kitchen_config_file": null,
	"kitchen_recipe_number": null,
	"sft": false,
	"sft_tokenizer_prompt_format": "nemotron-h-aligned",
	"num_quantizers": 64,
	"export_model_type": "GPTModel",
	"export_legacy_megatron": false,
	"export_te_mcore_model": false,
	"export_force_local_attention": false,
	"export_kv_cache_quant": false,
	"export_real_quant_cfg": "None",
	"export_quant_cfg": null,
	"export_kd_cfg": null,
	"teacher_model_config": null,
	"export_kd_teacher_load": null,
	"export_kd_teacher_ckpt_format": null,
	"finetune_hf_dataset": null,
	"finetune_data_split": "train",
	"export_qk_l2_norm": false,
	"export_moe_apply_probs_on_input": false,
	"export_offline_model": false,
	"rank": 0,
	"world_size": 48,
	"use_dist_ckpt": true,
	"transformer_pipeline_model_parallel_size": 1,
	"data_parallel_size": 48,
	"virtual_pipeline_model_parallel_size": null,
	"params_dtype": "torch.bfloat16",
	"consumed_train_samples": 8640000,
	"skipped_train_samples": 0,
	"consumed_valid_samples": 1728000,
	"variable_seq_lengths": false,
	"model_type": "<stub>",
	"iteration": 0,
	"num_floating_point_operations_so_far": 1.7994699384280842e+22,
	"do_train": 1,
	"do_valid": 1,
	"do_test": 0,
	"curr_iteration": 35999
	}