| { |
| "num_layers": 24, |
| "encoder_num_layers": 24, |
| "decoder_num_layers": null, |
| "hidden_size": 2048, |
| "ffn_hidden_size": 5632, |
| "num_attention_heads": 32, |
| "attention_backend": "<stub>", |
| "kv_channels": 64, |
| "group_query_attention": true, |
| "num_query_groups": 8, |
| "softmax_type": "vanilla", |
| "window_size": null, |
| "window_attn_skip_freq": null, |
| "max_position_embeddings": 16384, |
| "position_embedding_type": "rope", |
| "relative_attention_num_buckets": 32, |
| "relative_attention_max_distance": 128, |
| "use_rotary_position_embeddings": false, |
| "rotary_base": 500000, |
| "rotary_percent": 1.0, |
| "rotary_interleaved": false, |
| "rotary_seq_len_interpolation_factor": null, |
| "use_rope_scaling": false, |
| "rope_scaling_factor": 8.0, |
| "no_rope_freq": null, |
| "add_position_embedding": true, |
| "mrope_section": null, |
| "make_vocab_size_divisible_by": 128, |
| "normalization": "RMSNorm", |
| "norm_epsilon": 1e-06, |
| "apply_layernorm_1p": false, |
| "apply_residual_connection_post_layernorm": false, |
| "openai_gelu": false, |
| "squared_relu": false, |
| "swiglu": true, |
| "quick_geglu": false, |
| "activation_func_clamp_value": null, |
| "glu_linear_offset": 0.0, |
| "onnx_safe": null, |
| "bert_binary_head": true, |
| "untie_embeddings_and_output_weights": true, |
| "multi_latent_attention": false, |
| "mtp_num_layers": null, |
| "mtp_loss_scaling_factor": 0.1, |
| "attention_dropout": 0.0, |
| "hidden_dropout": 0.0, |
| "weight_decay": 0.1, |
| "start_weight_decay": 0.1, |
| "end_weight_decay": 0.1, |
| "weight_decay_incr_style": "constant", |
| "clip_grad": 1.0, |
| "adam_beta1": 0.9, |
| "adam_beta2": 0.95, |
| "adam_eps": 1e-08, |
| "sgd_momentum": 0.9, |
| "micro_batch_size": 1, |
| "global_batch_size": 240, |
| "rampup_batch_size": null, |
| "decrease_batch_size_if_needed": false, |
| "recompute_granularity": null, |
| "check_for_nan_in_loss_and_grad": true, |
| "check_for_spiky_loss": false, |
| "check_for_large_grads": false, |
| "distribute_saved_activations": false, |
| "recompute_method": null, |
| "recompute_num_layers": null, |
| "recompute_modules": null, |
| "clone_scatter_output_in_embedding": true, |
| "profile": false, |
| "profile_step_start": 10, |
| "profile_step_end": 12, |
| "iterations_to_skip": [], |
| "result_rejected_tracker_filename": null, |
| "enable_gloo_process_groups": true, |
| "use_pytorch_profiler": false, |
| "profile_ranks": [ |
| 0 |
| ], |
| "record_memory_history": false, |
| "memory_snapshot_path": "snapshot.pickle", |
| "tp_comm_overlap": false, |
| "tp_comm_overlap_cfg": null, |
| "tp_comm_overlap_ag": true, |
| "tp_comm_overlap_rs": true, |
| "tp_comm_overlap_rs_dgrad": false, |
| "tp_comm_bulk_dgrad": true, |
| "tp_comm_bulk_wgrad": true, |
| "tp_comm_bootstrap_backend": "nccl", |
| "use_cpu_initialization": null, |
| "empty_unused_memory_level": 0, |
| "deterministic_mode": false, |
| "check_weight_hash_across_dp_replicas_interval": null, |
| "calculate_per_token_loss": false, |
| "train_sync_interval": null, |
| "train_iters": 36000, |
| "train_samples": null, |
| "log_interval": 50, |
| "exit_interval": null, |
| "exit_duration_in_mins": null, |
| "exit_signal_handler": false, |
| "tensorboard_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/tensorboard/", |
| "masked_softmax_fusion": true, |
| "bias_gelu_fusion": true, |
| "bias_swiglu_fusion": true, |
| "use_fused_weighted_squared_relu": false, |
| "bias_dropout_fusion": true, |
| "apply_rope_fusion": true, |
| "rope_type": null, |
| "cross_entropy_loss_fusion": true, |
| "cross_entropy_fusion_impl": "native", |
| "use_flash_attn": false, |
| "add_bias_linear": true, |
| "add_qkv_bias": true, |
| "optimizer": "adam", |
| "optimizer_cpu_offload": false, |
| "optimizer_offload_fraction": 1.0, |
| "use_torch_optimizer_for_cpu_offload": false, |
| "overlap_cpu_optimizer_d2h_h2d": false, |
| "pin_cpu_grads": true, |
| "pin_cpu_params": true, |
| "dataloader_type": "cyclic", |
| "async_tensor_model_parallel_allreduce": true, |
| "no_persist_layer_norm": false, |
| "sequence_parallel": false, |
| "gradient_accumulation_fusion": true, |
| "deprecated_use_mcore_models": false, |
| "use_legacy_models": false, |
| "manual_gc": false, |
| "manual_gc_interval": 0, |
| "manual_gc_eval": true, |
| "tp_comm_split_ag": true, |
| "tp_comm_split_rs": true, |
| "pipeline_model_parallel_comm_backend": null, |
| "high_priority_stream_groups": [], |
| "use_te_activation_func": false, |
| "perform_rl_step": false, |
| "rl_prompts_per_eval": 32, |
| "grpo_prompts_per_step": 32, |
| "grpo_group_size": 2, |
| "grpo_iterations": 2, |
| "grpo_clamp_eps_lower": 0.01, |
| "grpo_clamp_eps_upper": 0.01, |
| "grpo_kl_beta": 0.001, |
| "grpo_entropy_term_weight": 0.0, |
| "grpo_filter_groups_with_same_reward": false, |
| "grpo_default_temperature": 1.0, |
| "grpo_default_top_p": 0, |
| "langrl_inference_server_type": "inplace_megatron", |
| "langrl_inference_server_conversation_template": null, |
| "langrl_env_config": null, |
| "rl_offload_optimizer_during_inference": false, |
| "rl_offload_kv_cache_during_training": false, |
| "rl_remove_kv_cache_during_training": false, |
| "rl_reset_cuda_graphs": false, |
| "rl_partial_rollouts": false, |
| "rl_inference_logprobs_is_correction": false, |
| "rl_importance_sampling_truncation_coef": null, |
| "rl_calculate_intra_group_similarity": false, |
| "seed": 2026, |
| "data_parallel_random_init": false, |
| "init_method_std": 0.02, |
| "embedding_init_method_std": null, |
| "init_method_xavier_uniform": false, |
| "lr": 0.0001, |
| "lr_decay_style": "cosine", |
| "lr_wsd_decay_style": "exponential", |
| "lr_decay_iters": 36000, |
| "lr_decay_samples": null, |
| "lr_wsd_decay_samples": null, |
| "lr_wsd_decay_iters": null, |
| "lr_warmup_fraction": null, |
| "lr_warmup_iters": 5000, |
| "lr_warmup_samples": 0, |
| "lr_warmup_init": 0.0, |
| "min_lr": 1e-05, |
| "override_opt_param_scheduler": false, |
| "use_checkpoint_opt_param_scheduler": false, |
| "decoupled_lr": null, |
| "decoupled_min_lr": null, |
| "save": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/checkpoint/", |
| "save_interval": 2000, |
| "save_retain_interval": null, |
| "no_save_optim": null, |
| "no_save_rng": null, |
| "load": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260132_q01_v0/checkpoint", |
| "no_load_optim": null, |
| "load_main_params_from_ckpt": null, |
| "no_load_rng": null, |
| "strict_fsdp_dtensor_load": true, |
| "non_persistent_save_interval": null, |
| "non_persistent_ckpt_type": null, |
| "non_persistent_global_ckpt_dir": null, |
| "non_persistent_local_ckpt_dir": null, |
| "non_persistent_local_ckpt_algo": "fully_parallel", |
| "finetune": true, |
| "pretrained_checkpoint": null, |
| "ckpt_step": null, |
| "perform_initialization": true, |
| "use_checkpoint_args": false, |
| "use_mp_args_from_checkpoint_args": false, |
| "use_tokenizer_model_from_checkpoint_args": true, |
| "exit_on_missing_checkpoint": false, |
| "use_dist_ckpt_deprecated": false, |
| "use_persistent_ckpt_worker": false, |
| "auto_detect_ckpt_format": false, |
| "dist_ckpt_format_deprecated": null, |
| "ckpt_format": "torch_dist", |
| "ckpt_convert_format": null, |
| "ckpt_convert_save": null, |
| "ckpt_convert_update_legacy_dist_opt_format": false, |
| "ckpt_fully_parallel_save_deprecated": false, |
| "ckpt_fully_parallel_save": true, |
| "async_save": null, |
| "ckpt_fully_parallel_load": false, |
| "ckpt_assume_constant_structure": false, |
| "dist_ckpt_strictness": "assume_ok_unexpected", |
| "dist_ckpt_save_pre_mcore_014": false, |
| "dist_ckpt_optim_fully_reshardable": false, |
| "distrib_optim_fully_reshardable_mem_efficient": false, |
| "load_model_opt_format": false, |
| "fp16": false, |
| "bf16": true, |
| "grad_reduce_in_bf16": false, |
| "loss_scale": null, |
| "initial_loss_scale": 4294967296, |
| "min_loss_scale": 1.0, |
| "loss_scale_window": 1000, |
| "hysteresis": 2, |
| "fp32_residual_connection": false, |
| "apply_query_key_layer_scaling": false, |
| "attention_softmax_in_fp32": false, |
| "accumulate_allreduce_grads_in_fp32": true, |
| "fp16_lm_cross_entropy": false, |
| "disable_bf16_reduced_precision_matmul": false, |
| "reuse_grad_buf_for_mxfp8_param_ag": false, |
| "tensor_model_parallel_size": 1, |
| "pipeline_model_parallel_size": 1, |
| "decoder_first_pipeline_num_layers": null, |
| "decoder_last_pipeline_num_layers": null, |
| "pipeline_model_parallel_layout": null, |
| "num_layers_per_virtual_pipeline_stage": null, |
| "num_virtual_stages_per_pipeline_rank": null, |
| "microbatch_group_size_per_vp_stage": null, |
| "overlap_p2p_comm": false, |
| "overlap_p2p_comm_warmup_flush": false, |
| "distributed_backend": "nccl", |
| "distributed_timeout_minutes": 10, |
| "overlap_grad_reduce": false, |
| "defer_embedding_wgrad_compute": false, |
| "wgrad_deferral_limit": 0, |
| "align_grad_reduce": true, |
| "ddp_num_buckets": null, |
| "ddp_bucket_size": null, |
| "ddp_pad_buckets_for_high_nccl_busbw": false, |
| "ddp_average_in_collective": false, |
| "overlap_param_gather": false, |
| "overlap_param_gather_with_optimizer_step": false, |
| "align_param_gather": false, |
| "scatter_gather_tensors_in_pipeline": true, |
| "use_ring_exchange_p2p": false, |
| "local_rank": 0, |
| "lazy_mpu_init": null, |
| "account_for_embedding_in_pipeline_split": false, |
| "account_for_loss_in_pipeline_split": false, |
| "use_distributed_optimizer": true, |
| "nccl_ub": false, |
| "disable_symmetric_registration": false, |
| "use_sharp": false, |
| "sharp_enabled_group": null, |
| "use_megatron_fsdp": false, |
| "init_model_with_meta_device": false, |
| "data_parallel_sharding_strategy": "no_shard", |
| "gradient_reduce_div_fusion": true, |
| "fsdp_double_buffer": false, |
| "suggested_communication_unit_size": null, |
| "keep_fp8_transpose_cache": false, |
| "enable_full_sharding_in_hsdp": false, |
| "num_distributed_optimizer_instances": 1, |
| "use_torch_fsdp2": false, |
| "torch_fsdp2_reshard_after_forward": true, |
| "context_parallel_size": 1, |
| "cp_comm_type": [ |
| "p2p" |
| ], |
| "hierarchical_context_parallel_sizes": null, |
| "nccl_communicator_config_path": null, |
| "use_tp_pp_dp_mapping": false, |
| "replication": false, |
| "replication_jump": null, |
| "replication_factor": 2, |
| "full_validation": false, |
| "multiple_validation_sets": false, |
| "eval_iters": 100, |
| "eval_interval": 500, |
| "test_mode": false, |
| "skip_train": false, |
| "data_path": null, |
| "split": null, |
| "train_data_path": [ |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter0_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter1_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter2_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/train_filter3_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter0_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter1_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter2_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/train_filter3_full" |
| ], |
| "valid_data_path": [ |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter0_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter1_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter2_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withtags_trainval_split4_merged/valid_filter3_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter0_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter1_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter2_full", |
| "/2214/dongyuanliang/torchtitan/washed_100w_tokens_randomized_seed1998_valid_seed4619_jfrvq64_ver6_textaudio_withdescription_trainval_split4_merged/valid_filter3_full" |
| ], |
| "test_data_path": null, |
| "data_args_path": null, |
| "per_split_data_args_path": null, |
| "data_cache_path": null, |
| "mmap_bin_files": true, |
| "mock_data": false, |
| "seq_length": 16384, |
| "encoder_seq_length": 16384, |
| "decoder_seq_length": null, |
| "retriever_seq_length": 256, |
| "sample_rate": 1.0, |
| "mask_prob": 0.15, |
| "short_seq_prob": 0.1, |
| "num_workers": 12, |
| "reset_position_ids": false, |
| "reset_attention_mask": false, |
| "eod_mask_loss": true, |
| "create_attention_mask_in_dataloader": true, |
| "num_dataset_builder_threads": 1, |
| "object_storage_cache_path": null, |
| "mid_level_dataset_surplus": 0.005, |
| "vocab_size": 130304, |
| "padded_vocab_size": 130432, |
| "vocab_file": null, |
| "merge_file": null, |
| "vocab_extra_ids": 0, |
| "tokenizer_type": "NullTokenizer", |
| "tokenizer_model": null, |
| "tokenizer_metadata": null, |
| "tiktoken_pattern": null, |
| "tiktoken_num_special_tokens": 1000, |
| "tiktoken_special_tokens": null, |
| "legacy_tokenizer": false, |
| "trust_remote_code": false, |
| "adlr_autoresume": false, |
| "adlr_autoresume_interval": 1000, |
| "ict_head_size": null, |
| "biencoder_projection_dim": 0, |
| "biencoder_shared_query_context_model": false, |
| "ict_load": null, |
| "bert_load": null, |
| "titles_data_path": null, |
| "query_in_block_prob": 0.1, |
| "use_one_sent_docs": false, |
| "evidence_data_path": null, |
| "retriever_report_topk_accuracies": [], |
| "retriever_score_scaling": false, |
| "block_data_path": null, |
| "embedding_path": null, |
| "indexer_batch_size": 128, |
| "indexer_log_interval": 1000, |
| "num_classes": 1000, |
| "img_h": 224, |
| "img_w": 224, |
| "num_channels": 3, |
| "patch_dim": 16, |
| "classes_fraction": 1.0, |
| "data_per_class_fraction": 1.0, |
| "data_sharding": true, |
| "head_lr_mult": 1.0, |
| "vision_pretraining": false, |
| "vision_pretraining_type": "classify", |
| "vision_backbone_type": "vit", |
| "swin_backbone_type": "tiny", |
| "mask_type": "random", |
| "mask_factor": 1.0, |
| "iter_per_epoch": 1250, |
| "dino_local_img_size": 96, |
| "dino_local_crops_number": 10, |
| "dino_head_hidden_size": 2048, |
| "dino_bottleneck_size": 256, |
| "dino_freeze_last_layer": 1, |
| "dino_norm_last_layer": false, |
| "dino_warmup_teacher_temp": 0.04, |
| "dino_teacher_temp": 0.07, |
| "dino_warmup_teacher_temp_epochs": 30, |
| "qk_layernorm": false, |
| "qk_l2_norm": false, |
| "expert_model_parallel_size": 1, |
| "expert_tensor_parallel_size": 1, |
| "num_experts": null, |
| "moe_layer_freq": 1, |
| "moe_ffn_hidden_size": null, |
| "moe_shared_expert_intermediate_size": null, |
| "moe_shared_expert_overlap": false, |
| "moe_grouped_gemm": false, |
| "moe_use_legacy_grouped_gemm": false, |
| "moe_layer_recompute": false, |
| "moe_extended_tp": false, |
| "moe_use_upcycling": false, |
| "moe_router_load_balancing_type": "aux_loss", |
| "moe_router_dtype": null, |
| "moe_router_fusion": false, |
| "moe_router_score_function": "softmax", |
| "moe_router_topk": 2, |
| "moe_router_pre_softmax": false, |
| "moe_router_num_groups": null, |
| "moe_router_group_topk": null, |
| "moe_router_topk_scaling_factor": null, |
| "moe_router_enable_expert_bias": false, |
| "moe_router_bias_update_rate": 0.001, |
| "moe_router_force_load_balancing": false, |
| "moe_router_padding_for_fp8": false, |
| "moe_aux_loss_coeff": 0.0, |
| "moe_z_loss_coeff": null, |
| "moe_input_jitter_eps": null, |
| "moe_per_layer_logging": false, |
| "moe_token_dispatcher_type": "allgather", |
| "moe_enable_deepep": false, |
| "moe_deepep_num_sms": 20, |
| "moe_permute_fusion": false, |
| "moe_expert_capacity_factor": null, |
| "moe_pad_expert_input_to_capacity": false, |
| "moe_token_drop_policy": "probs", |
| "moe_apply_probs_on_input": false, |
| "overlap_moe_expert_parallel_comm": false, |
| "delay_wgrad_compute": false, |
| "moe_upcycling_granularity": 1, |
| "q_lora_rank": null, |
| "kv_lora_rank": 32, |
| "qk_head_dim": 128, |
| "qk_pos_emb_head_dim": 64, |
| "v_head_dim": 128, |
| "rotary_scaling_factor": 1.0, |
| "mscale": 1.0, |
| "mscale_all_dim": 0.0, |
| "cache_mla_latents": false, |
| "heterogeneous_layers_config_path": null, |
| "heterogeneous_layers_config_encoded_json": null, |
| "log_params_norm": true, |
| "log_num_zeros_in_grad": true, |
| "log_throughput": true, |
| "log_progress": true, |
| "timing_log_level": 0, |
| "log_energy": false, |
| "barrier_with_L1_time": true, |
| "timing_log_option": "minmax", |
| "tensorboard_log_interval": 1, |
| "tensorboard_queue_size": 1000, |
| "log_timers_to_tensorboard": true, |
| "log_loss_scale_to_tensorboard": true, |
| "log_validation_ppl_to_tensorboard": true, |
| "log_memory_to_tensorboard": true, |
| "log_world_size_to_tensorboard": true, |
| "wandb_project": "Megatron_Stage1", |
| "wandb_entity": "", |
| "wandb_exp_name": "1B_nl24_hs2048_gb240_seed2026_20260217_q01_354k_tag_desc_v0", |
| "wandb_save_dir": "/2214/ljf/Megatron-LM/train_logs/1B_nl24_hs2048_gb240_seed2026/20260217_q01_354k_tag_desc_v0/wandb/", |
| "logging_level": null, |
| "log_straggler": false, |
| "disable_straggler_on_startup": false, |
| "straggler_ctrlr_port": 65535, |
| "straggler_minmax_count": 1, |
| "run_workload_inspector_server": false, |
| "inference_batch_times_seqlen_threshold": -1, |
| "max_tokens_to_oom": 12000, |
| "output_bert_embeddings": false, |
| "bert_embedder_type": "megatron", |
| "flash_decode": false, |
| "enable_cuda_graph": false, |
| "cuda_graph_warmup_steps": 3, |
| "external_cuda_graph": false, |
| "cuda_graph_scope": "full", |
| "inference_max_batch_size": 8, |
| "inference_max_seq_length": 2560, |
| "inference_dynamic_batching": false, |
| "inference_dynamic_batching_buffer_size_gb": 40.0, |
| "inference_dynamic_batching_chunk_size": 256, |
| "inference_dynamic_batching_buffer_guaranteed_fraction": 0.2, |
| "inference_dynamic_batching_buffer_overflow_factor": null, |
| "inference_dynamic_batching_max_requests_override": null, |
| "inference_dynamic_batching_max_tokens_override": null, |
| "inference_dynamic_batching_num_cuda_graphs": 16, |
| "inference_dynamic_batching_track_paused_request_events": false, |
| "symmetric_ar_type": null, |
| "nccl_all_reduce_for_prefill": false, |
| "mlp_chunks_for_prefill": 1, |
| "initialize_socket_comms": false, |
| "fp8": null, |
| "fp8_recipe": "delayed", |
| "fp8_margin": 0, |
| "fp8_interval": 1, |
| "fp8_amax_history_len": 1, |
| "fp8_amax_compute_algo": "most_recent", |
| "fp8_wgrad": true, |
| "transformer_impl": "transformer_engine", |
| "fp8_param_gather": false, |
| "first_last_layers_bf16": false, |
| "num_layers_at_start_in_bf16": 1, |
| "num_layers_at_end_in_bf16": 1, |
| "fp4": null, |
| "fp4_recipe": "nvfp4", |
| "fp4_param": false, |
| "te_rng_tracker": false, |
| "inference_rng_tracker": false, |
| "retro_project_dir": null, |
| "retro_add_retriever": false, |
| "retro_cyclic_train_iters": null, |
| "retro_encoder_layers": 2, |
| "retro_encoder_hidden_dropout": 0.1, |
| "retro_encoder_attention_dropout": 0.1, |
| "retro_num_neighbors": 2, |
| "retro_num_retrieved_chunks": 2, |
| "retro_attention_gate": 1, |
| "retro_verify_neighbor_count": true, |
| "enable_experimental": false, |
| "spec": null, |
| "hybrid_attention_ratio": 0.0, |
| "hybrid_mlp_ratio": 0.0, |
| "hybrid_override_pattern": null, |
| "mamba_state_dim": 128, |
| "mamba_head_dim": 64, |
| "mamba_num_groups": 8, |
| "mamba_num_heads": null, |
| "is_hybrid_model": false, |
| "disable_mamba_mem_eff_path": false, |
| "yaml_cfg": null, |
| "use_precision_aware_optimizer": false, |
| "main_grads_dtype": "torch.float32", |
| "main_params_dtype": "torch.float32", |
| "exp_avg_dtype": "torch.float32", |
| "exp_avg_sq_dtype": "torch.float32", |
| "enable_one_logger": true, |
| "one_logger_project": "megatron-lm", |
| "one_logger_run_name": null, |
| "one_logger_async": false, |
| "app_tag_run_name": null, |
| "app_tag_run_version": "0.0.0", |
| "inprocess_restart": false, |
| "inprocess_max_iterations": null, |
| "inprocess_monitor_thread_interval": 1.0, |
| "inprocess_monitor_process_interval": 1.0, |
| "inprocess_progress_watchdog_interval": 1.0, |
| "inprocess_heartbeat_interval": 30, |
| "inprocess_soft_timeout": 60, |
| "inprocess_hard_timeout": 90, |
| "inprocess_heartbeat_timeout": 60, |
| "inprocess_barrier_timeout": 120, |
| "inprocess_completion_timeout": 120, |
| "inprocess_last_call_wait": 1, |
| "inprocess_termination_grace_time": 1, |
| "inprocess_granularity": "node", |
| "inprocess_active_world_size": 48, |
| "inprocess_empty_cuda_cache": false, |
| "enable_ft_package": false, |
| "calc_ft_timeouts": false, |
| "config_logger_dir": "", |
| "error_injection_rate": 0, |
| "error_injection_type": "transient_error", |
| "rerun_mode": "validate_results", |
| "enable_msc": true, |
| "kitchen_config_file": null, |
| "kitchen_recipe_number": null, |
| "sft": false, |
| "sft_tokenizer_prompt_format": "nemotron-h-aligned", |
| "num_quantizers": 64, |
| "export_model_type": "GPTModel", |
| "export_legacy_megatron": false, |
| "export_te_mcore_model": false, |
| "export_force_local_attention": false, |
| "export_kv_cache_quant": false, |
| "export_real_quant_cfg": "None", |
| "export_quant_cfg": null, |
| "export_kd_cfg": null, |
| "teacher_model_config": null, |
| "export_kd_teacher_load": null, |
| "export_kd_teacher_ckpt_format": null, |
| "finetune_hf_dataset": null, |
| "finetune_data_split": "train", |
| "export_qk_l2_norm": false, |
| "export_moe_apply_probs_on_input": false, |
| "export_offline_model": false, |
| "rank": 0, |
| "world_size": 48, |
| "use_dist_ckpt": true, |
| "transformer_pipeline_model_parallel_size": 1, |
| "data_parallel_size": 48, |
| "virtual_pipeline_model_parallel_size": null, |
| "params_dtype": "torch.bfloat16", |
| "consumed_train_samples": 8640000, |
| "skipped_train_samples": 0, |
| "consumed_valid_samples": 1728000, |
| "variable_seq_lengths": false, |
| "model_type": "<stub>", |
| "iteration": 0, |
| "num_floating_point_operations_so_far": 1.7994699384280842e+22, |
| "do_train": 1, |
| "do_valid": 1, |
| "do_test": 0, |
| "curr_iteration": 35999 |
| } |