Khala-MusicGeneration-v1.0-MPS / backbone_megatron_args.json
Vinpolar's picture
backbone config
2cdbd5d verified
Raw
History Blame Contribute Delete
22.4 kB
{
"num_layers": 24,
"encoder_num_layers": 24,
"decoder_num_layers": null,
"hidden_size": 2048,
"ffn_hidden_size": 5632,
"num_attention_heads": 32,
"attention_backend": "<stub>",
"kv_channels": 64,
"group_query_attention": true,
"num_query_groups": 8,
"softmax_type": "vanilla",
"window_size": null,
"window_attn_skip_freq": null,
"max_position_embeddings": 16384,
"position_embedding_type": "rope",
"relative_attention_num_buckets": 32,
"relative_attention_max_distance": 128,
"use_rotary_position_embeddings": false,
"rotary_base": 500000,
"rotary_percent": 1.0,
"rotary_interleaved": false,
"rotary_seq_len_interpolation_factor": null,
"use_rope_scaling": false,
"rope_scaling_factor": 8.0,
"no_rope_freq": null,
"add_position_embedding": true,
"mrope_section": null,
"make_vocab_size_divisible_by": 128,
"normalization": "RMSNorm",
"norm_epsilon": 1e-06,
"apply_layernorm_1p": false,
"apply_residual_connection_post_layernorm": false,
"openai_gelu": false,
"squared_relu": false,
"swiglu": true,
"quick_geglu": false,
"activation_func_clamp_value": null,
"glu_linear_offset": 0.0,
"onnx_safe": null,
"bert_binary_head": true,
"untie_embeddings_and_output_weights": true,
"multi_latent_attention": false,
"mtp_num_layers": null,
"mtp_loss_scaling_factor": 0.1,
"attention_dropout": 0.0,
"hidden_dropout": 0.0,
"weight_decay": 0.1,
"start_weight_decay": 0.1,
"end_weight_decay": 0.1,
"weight_decay_incr_style": "constant",
"clip_grad": 1.0,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"adam_eps": 1e-08,
"sgd_momentum": 0.9,
"micro_batch_size": 1,
"global_batch_size": 240,
"rampup_batch_size": null,
"decrease_batch_size_if_needed": false,
"recompute_granularity": null,
"check_for_nan_in_loss_and_grad": true,
"check_for_spiky_loss": false,
"check_for_large_grads": false,
"distribute_saved_activations": false,
"recompute_method": null,
"recompute_num_layers": null,
"recompute_modules": null,
"clone_scatter_output_in_embedding": true,
"profile": false,
"profile_step_start": 10,
"profile_step_end": 12,
"iterations_to_skip": [],
"result_rejected_tracker_filename": null,
"enable_gloo_process_groups": true,
"use_pytorch_profiler": false,
"profile_ranks": [
0
],
"record_memory_history": false,
"memory_snapshot_path": "snapshot.pickle",
"tp_comm_overlap": false,
"tp_comm_overlap_cfg": null,
"tp_comm_overlap_ag": true,
"tp_comm_overlap_rs": true,
"tp_comm_overlap_rs_dgrad": false,
"tp_comm_bulk_dgrad": true,
"tp_comm_bulk_wgrad": true,
"tp_comm_bootstrap_backend": "nccl",
"use_cpu_initialization": null,
"empty_unused_memory_level": 0,
"deterministic_mode": false,
"check_weight_hash_across_dp_replicas_interval": null,
"calculate_per_token_loss": false,
"train_sync_interval": null,
"train_iters": 36000,
"train_samples": null,
"log_interval": 50,
"exit_interval": null,
"exit_duration_in_mins": null,
"exit_signal_handler": false,
"tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/tensorboard/",
"masked_softmax_fusion": true,
"bias_gelu_fusion": true,
"bias_swiglu_fusion": true,
"use_fused_weighted_squared_relu": false,
"bias_dropout_fusion": true,
"apply_rope_fusion": true,
"rope_type": null,
"cross_entropy_loss_fusion": true,
"cross_entropy_fusion_impl": "native",
"use_flash_attn": false,
"add_bias_linear": true,
"add_qkv_bias": true,
"optimizer": "adam",
"optimizer_cpu_offload": false,
"optimizer_offload_fraction": 1.0,
"use_torch_optimizer_for_cpu_offload": false,
"overlap_cpu_optimizer_d2h_h2d": false,
"pin_cpu_grads": true,
"pin_cpu_params": true,
"dataloader_type": "cyclic",
"async_tensor_model_parallel_allreduce": true,
"no_persist_layer_norm": false,
"sequence_parallel": false,
"gradient_accumulation_fusion": true,
"deprecated_use_mcore_models": false,
"use_legacy_models": false,
"manual_gc": false,
"manual_gc_interval": 0,
"manual_gc_eval": true,
"tp_comm_split_ag": true,
"tp_comm_split_rs": true,
"pipeline_model_parallel_comm_backend": null,
"high_priority_stream_groups": [],
"use_te_activation_func": false,
"perform_rl_step": false,
"rl_prompts_per_eval": 32,
"grpo_prompts_per_step": 32,
"grpo_group_size": 2,
"grpo_iterations": 2,
"grpo_clamp_eps_lower": 0.01,
"grpo_clamp_eps_upper": 0.01,
"grpo_kl_beta": 0.001,
"grpo_entropy_term_weight": 0.0,
"grpo_filter_groups_with_same_reward": false,
"grpo_default_temperature": 1.0,
"grpo_default_top_p": 0,
"langrl_inference_server_type": "inplace_megatron",
"langrl_inference_server_conversation_template": null,
"langrl_env_config": null,
"rl_offload_optimizer_during_inference": false,
"rl_offload_kv_cache_during_training": false,
"rl_remove_kv_cache_during_training": false,
"rl_reset_cuda_graphs": false,
"rl_partial_rollouts": false,
"rl_inference_logprobs_is_correction": false,
"rl_importance_sampling_truncation_coef": null,
"rl_calculate_intra_group_similarity": false,
"seed": 2026,
"data_parallel_random_init": false,
"init_method_std": 0.02,
"embedding_init_method_std": null,
"init_method_xavier_uniform": false,
"lr": 0.0001,
"lr_decay_style": "cosine",
"lr_wsd_decay_style": "exponential",
"lr_decay_iters": 36000,
"lr_decay_samples": null,
"lr_wsd_decay_samples": null,
"lr_wsd_decay_iters": null,
"lr_warmup_fraction": null,
"lr_warmup_iters": 5000,
"lr_warmup_samples": 0,
"lr_warmup_init": 0.0,
"min_lr": 1e-05,
"override_opt_param_scheduler": false,
"use_checkpoint_opt_param_scheduler": false,
"decoupled_lr": null,
"decoupled_min_lr": null,
"save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/checkpoint/",
"save_interval": 2000,
"save_retain_interval": null,
"no_save_optim": null,
"no_save_rng": null,
"load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260132_q01_v0/checkpoint",
"no_load_optim": null,
"load_main_params_from_ckpt": null,
"no_load_rng": null,
"strict_fsdp_dtensor_load": true,
"non_persistent_save_interval": null,
"non_persistent_ckpt_type": null,
"non_persistent_global_ckpt_dir": null,
"non_persistent_local_ckpt_dir": null,
"non_persistent_local_ckpt_algo": "fully_parallel",
"finetune": true,
"pretrained_checkpoint": null,
"ckpt_step": null,
"perform_initialization": true,
"use_checkpoint_args": false,
"use_mp_args_from_checkpoint_args": false,
"use_tokenizer_model_from_checkpoint_args": true,
"exit_on_missing_checkpoint": false,
"use_dist_ckpt_deprecated": false,
"use_persistent_ckpt_worker": false,
"auto_detect_ckpt_format": false,
"dist_ckpt_format_deprecated": null,
"ckpt_format": "torch_dist",
"ckpt_convert_format": null,
"ckpt_convert_save": null,
"ckpt_convert_update_legacy_dist_opt_format": false,
"ckpt_fully_parallel_save_deprecated": false,
"ckpt_fully_parallel_save": true,
"async_save": null,
"ckpt_fully_parallel_load": false,
"ckpt_assume_constant_structure": false,
"dist_ckpt_strictness": "assume_ok_unexpected",
"dist_ckpt_save_pre_mcore_014": false,
"dist_ckpt_optim_fully_reshardable": false,
"distrib_optim_fully_reshardable_mem_efficient": false,
"load_model_opt_format": false,
"fp16": false,
"bf16": true,
"grad_reduce_in_bf16": false,
"loss_scale": null,
"initial_loss_scale": 4294967296,
"min_loss_scale": 1.0,
"loss_scale_window": 1000,
"hysteresis": 2,
"fp32_residual_connection": false,
"apply_query_key_layer_scaling": false,
"attention_softmax_in_fp32": false,
"accumulate_allreduce_grads_in_fp32": true,
"fp16_lm_cross_entropy": false,
"disable_bf16_reduced_precision_matmul": false,
"reuse_grad_buf_for_mxfp8_param_ag": false,
"tensor_model_parallel_size": 1,
"pipeline_model_parallel_size": 1,
"decoder_first_pipeline_num_layers": null,
"decoder_last_pipeline_num_layers": null,
"pipeline_model_parallel_layout": null,
"num_layers_per_virtual_pipeline_stage": null,
"num_virtual_stages_per_pipeline_rank": null,
"microbatch_group_size_per_vp_stage": null,
"overlap_p2p_comm": false,
"overlap_p2p_comm_warmup_flush": false,
"distributed_backend": "nccl",
"distributed_timeout_minutes": 10,
"overlap_grad_reduce": false,
"defer_embedding_wgrad_compute": false,
"wgrad_deferral_limit": 0,
"align_grad_reduce": true,
"ddp_num_buckets": null,
"ddp_bucket_size": null,
"ddp_pad_buckets_for_high_nccl_busbw": false,
"ddp_average_in_collective": false,
"overlap_param_gather": false,
"overlap_param_gather_with_optimizer_step": false,
"align_param_gather": false,
"scatter_gather_tensors_in_pipeline": true,
"use_ring_exchange_p2p": false,
"local_rank": 0,
"lazy_mpu_init": null,
"account_for_embedding_in_pipeline_split": false,
"account_for_loss_in_pipeline_split": false,
"use_distributed_optimizer": true,
"nccl_ub": false,
"disable_symmetric_registration": false,
"use_sharp": false,
"sharp_enabled_group": null,
"use_megatron_fsdp": false,
"init_model_with_meta_device": false,
"data_parallel_sharding_strategy": "no_shard",
"gradient_reduce_div_fusion": true,
"fsdp_double_buffer": false,
"suggested_communication_unit_size": null,
"keep_fp8_transpose_cache": false,
"enable_full_sharding_in_hsdp": false,
"num_distributed_optimizer_instances": 1,
"use_torch_fsdp2": false,
"torch_fsdp2_reshard_after_forward": true,
"context_parallel_size": 1,
"cp_comm_type": [
"p2p"
],
"hierarchical_context_parallel_sizes": null,
"nccl_communicator_config_path": null,
"use_tp_pp_dp_mapping": false,
"replication": false,
"replication_jump": null,
"replication_factor": 2,
"full_validation": false,
"multiple_validation_sets": false,
"eval_iters": 100,
"eval_interval": 500,
"test_mode": false,
"skip_train": false,
"data_path": null,
"split": null,
"train_data_path": [
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter0_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter1_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter2_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter3_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter0_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter1_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter2_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter3_full"
],
"valid_data_path": [
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter0_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter1_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter2_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter3_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter0_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter1_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter2_full",
"/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter3_full"
],
"test_data_path": null,
"data_args_path": null,
"per_split_data_args_path": null,
"data_cache_path": null,
"mmap_bin_files": true,
"mock_data": false,
"seq_length": 16384,
"encoder_seq_length": 16384,
"decoder_seq_length": null,
"retriever_seq_length": 256,
"sample_rate": 1.0,
"mask_prob": 0.15,
"short_seq_prob": 0.1,
"num_workers": 12,
"reset_position_ids": false,
"reset_attention_mask": false,
"eod_mask_loss": true,
"create_attention_mask_in_dataloader": true,
"num_dataset_builder_threads": 1,
"object_storage_cache_path": null,
"mid_level_dataset_surplus": 0.005,
"vocab_size": 130304,
"padded_vocab_size": 130432,
"vocab_file": null,
"merge_file": null,
"vocab_extra_ids": 0,
"tokenizer_type": "NullTokenizer",
"tokenizer_model": null,
"tokenizer_metadata": null,
"tiktoken_pattern": null,
"tiktoken_num_special_tokens": 1000,
"tiktoken_special_tokens": null,
"legacy_tokenizer": false,
"trust_remote_code": false,
"adlr_autoresume": false,
"adlr_autoresume_interval": 1000,
"ict_head_size": null,
"biencoder_projection_dim": 0,
"biencoder_shared_query_context_model": false,
"ict_load": null,
"bert_load": null,
"titles_data_path": null,
"query_in_block_prob": 0.1,
"use_one_sent_docs": false,
"evidence_data_path": null,
"retriever_report_topk_accuracies": [],
"retriever_score_scaling": false,
"block_data_path": null,
"embedding_path": null,
"indexer_batch_size": 128,
"indexer_log_interval": 1000,
"num_classes": 1000,
"img_h": 224,
"img_w": 224,
"num_channels": 3,
"patch_dim": 16,
"classes_fraction": 1.0,
"data_per_class_fraction": 1.0,
"data_sharding": true,
"head_lr_mult": 1.0,
"vision_pretraining": false,
"vision_pretraining_type": "classify",
"vision_backbone_type": "vit",
"swin_backbone_type": "tiny",
"mask_type": "random",
"mask_factor": 1.0,
"iter_per_epoch": 1250,
"dino_local_img_size": 96,
"dino_local_crops_number": 10,
"dino_head_hidden_size": 2048,
"dino_bottleneck_size": 256,
"dino_freeze_last_layer": 1,
"dino_norm_last_layer": false,
"dino_warmup_teacher_temp": 0.04,
"dino_teacher_temp": 0.07,
"dino_warmup_teacher_temp_epochs": 30,
"qk_layernorm": false,
"qk_l2_norm": false,
"expert_model_parallel_size": 1,
"expert_tensor_parallel_size": 1,
"num_experts": null,
"moe_layer_freq": 1,
"moe_ffn_hidden_size": null,
"moe_shared_expert_intermediate_size": null,
"moe_shared_expert_overlap": false,
"moe_grouped_gemm": false,
"moe_use_legacy_grouped_gemm": false,
"moe_layer_recompute": false,
"moe_extended_tp": false,
"moe_use_upcycling": false,
"moe_router_load_balancing_type": "aux_loss",
"moe_router_dtype": null,
"moe_router_fusion": false,
"moe_router_score_function": "softmax",
"moe_router_topk": 2,
"moe_router_pre_softmax": false,
"moe_router_num_groups": null,
"moe_router_group_topk": null,
"moe_router_topk_scaling_factor": null,
"moe_router_enable_expert_bias": false,
"moe_router_bias_update_rate": 0.001,
"moe_router_force_load_balancing": false,
"moe_router_padding_for_fp8": false,
"moe_aux_loss_coeff": 0.0,
"moe_z_loss_coeff": null,
"moe_input_jitter_eps": null,
"moe_per_layer_logging": false,
"moe_token_dispatcher_type": "allgather",
"moe_enable_deepep": false,
"moe_deepep_num_sms": 20,
"moe_permute_fusion": false,
"moe_expert_capacity_factor": null,
"moe_pad_expert_input_to_capacity": false,
"moe_token_drop_policy": "probs",
"moe_apply_probs_on_input": false,
"overlap_moe_expert_parallel_comm": false,
"delay_wgrad_compute": false,
"moe_upcycling_granularity": 1,
"q_lora_rank": null,
"kv_lora_rank": 32,
"qk_head_dim": 128,
"qk_pos_emb_head_dim": 64,
"v_head_dim": 128,
"rotary_scaling_factor": 1.0,
"mscale": 1.0,
"mscale_all_dim": 0.0,
"cache_mla_latents": false,
"heterogeneous_layers_config_path": null,
"heterogeneous_layers_config_encoded_json": null,
"log_params_norm": true,
"log_num_zeros_in_grad": true,
"log_throughput": true,
"log_progress": true,
"timing_log_level": 0,
"log_energy": false,
"barrier_with_L1_time": true,
"timing_log_option": "minmax",
"tensorboard_log_interval": 1,
"tensorboard_queue_size": 1000,
"log_timers_to_tensorboard": true,
"log_loss_scale_to_tensorboard": true,
"log_validation_ppl_to_tensorboard": true,
"log_memory_to_tensorboard": true,
"log_world_size_to_tensorboard": true,
"wandb_project": "Megatron_Stage1",
"wandb_entity": "",
"wandb_exp_name": "1B_nl24_hs2048_gb240_seed2026_20260217_q01_354k_tag_desc_v0",
"wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/wandb/",
"logging_level": null,
"log_straggler": false,
"disable_straggler_on_startup": false,
"straggler_ctrlr_port": 65535,
"straggler_minmax_count": 1,
"run_workload_inspector_server": false,
"inference_batch_times_seqlen_threshold": -1,
"max_tokens_to_oom": 12000,
"output_bert_embeddings": false,
"bert_embedder_type": "megatron",
"flash_decode": false,
"enable_cuda_graph": false,
"cuda_graph_warmup_steps": 3,
"external_cuda_graph": false,
"cuda_graph_scope": "full",
"inference_max_batch_size": 8,
"inference_max_seq_length": 2560,
"inference_dynamic_batching": false,
"inference_dynamic_batching_buffer_size_gb": 40.0,
"inference_dynamic_batching_chunk_size": 256,
"inference_dynamic_batching_buffer_guaranteed_fraction": 0.2,
"inference_dynamic_batching_buffer_overflow_factor": null,
"inference_dynamic_batching_max_requests_override": null,
"inference_dynamic_batching_max_tokens_override": null,
"inference_dynamic_batching_num_cuda_graphs": 16,
"inference_dynamic_batching_track_paused_request_events": false,
"symmetric_ar_type": null,
"nccl_all_reduce_for_prefill": false,
"mlp_chunks_for_prefill": 1,
"initialize_socket_comms": false,
"fp8": null,
"fp8_recipe": "delayed",
"fp8_margin": 0,
"fp8_interval": 1,
"fp8_amax_history_len": 1,
"fp8_amax_compute_algo": "most_recent",
"fp8_wgrad": true,
"transformer_impl": "transformer_engine",
"fp8_param_gather": false,
"first_last_layers_bf16": false,
"num_layers_at_start_in_bf16": 1,
"num_layers_at_end_in_bf16": 1,
"fp4": null,
"fp4_recipe": "nvfp4",
"fp4_param": false,
"te_rng_tracker": false,
"inference_rng_tracker": false,
"retro_project_dir": null,
"retro_add_retriever": false,
"retro_cyclic_train_iters": null,
"retro_encoder_layers": 2,
"retro_encoder_hidden_dropout": 0.1,
"retro_encoder_attention_dropout": 0.1,
"retro_num_neighbors": 2,
"retro_num_retrieved_chunks": 2,
"retro_attention_gate": 1,
"retro_verify_neighbor_count": true,
"enable_experimental": false,
"spec": null,
"hybrid_attention_ratio": 0.0,
"hybrid_mlp_ratio": 0.0,
"hybrid_override_pattern": null,
"mamba_state_dim": 128,
"mamba_head_dim": 64,
"mamba_num_groups": 8,
"mamba_num_heads": null,
"is_hybrid_model": false,
"disable_mamba_mem_eff_path": false,
"yaml_cfg": null,
"use_precision_aware_optimizer": false,
"main_grads_dtype": "torch.float32",
"main_params_dtype": "torch.float32",
"exp_avg_dtype": "torch.float32",
"exp_avg_sq_dtype": "torch.float32",
"enable_one_logger": true,
"one_logger_project": "megatron-lm",
"one_logger_run_name": null,
"one_logger_async": false,
"app_tag_run_name": null,
"app_tag_run_version": "0.0.0",
"inprocess_restart": false,
"inprocess_max_iterations": null,
"inprocess_monitor_thread_interval": 1.0,
"inprocess_monitor_process_interval": 1.0,
"inprocess_progress_watchdog_interval": 1.0,
"inprocess_heartbeat_interval": 30,
"inprocess_soft_timeout": 60,
"inprocess_hard_timeout": 90,
"inprocess_heartbeat_timeout": 60,
"inprocess_barrier_timeout": 120,
"inprocess_completion_timeout": 120,
"inprocess_last_call_wait": 1,
"inprocess_termination_grace_time": 1,
"inprocess_granularity": "node",
"inprocess_active_world_size": 48,
"inprocess_empty_cuda_cache": false,
"enable_ft_package": false,
"calc_ft_timeouts": false,
"config_logger_dir": "",
"error_injection_rate": 0,
"error_injection_type": "transient_error",
"rerun_mode": "validate_results",
"enable_msc": true,
"kitchen_config_file": null,
"kitchen_recipe_number": null,
"sft": false,
"sft_tokenizer_prompt_format": "nemotron-h-aligned",
"num_quantizers": 64,
"export_model_type": "GPTModel",
"export_legacy_megatron": false,
"export_te_mcore_model": false,
"export_force_local_attention": false,
"export_kv_cache_quant": false,
"export_real_quant_cfg": "None",
"export_quant_cfg": null,
"export_kd_cfg": null,
"teacher_model_config": null,
"export_kd_teacher_load": null,
"export_kd_teacher_ckpt_format": null,
"finetune_hf_dataset": null,
"finetune_data_split": "train",
"export_qk_l2_norm": false,
"export_moe_apply_probs_on_input": false,
"export_offline_model": false,
"rank": 0,
"world_size": 48,
"use_dist_ckpt": true,
"transformer_pipeline_model_parallel_size": 1,
"data_parallel_size": 48,
"virtual_pipeline_model_parallel_size": null,
"params_dtype": "torch.bfloat16",
"consumed_train_samples": 8640000,
"skipped_train_samples": 0,
"consumed_valid_samples": 1728000,
"variable_seq_lengths": false,
"model_type": "<stub>",
"iteration": 0,
"num_floating_point_operations_so_far": 1.7994699384280842e+22,
"do_train": 1,
"do_valid": 1,
"do_test": 0,
"curr_iteration": 35999
}