KBLab
/

megatron.bert-base.wordpiece-32k-pretok.25k-steps

+cfg:
+  micro_batch_size: 62
+  global_batch_size: 7936
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  encoder_seq_length: 512
+  max_position_embeddings: 512
+  num_layers: 12
+  hidden_size: 768
+  ffn_hidden_size: 3072
+  num_attention_heads: 12
+  init_method_std: 0.02
+  hidden_dropout: 0.1
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  layernorm_epsilon: 1.0e-05
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  bert_binary_head: true
+  tokenizer:
+    library: huggingface
+    type: KBLab/wordpiece-32k-pretok-small_data-tokenizer
+    model: null
+    vocab_file: null
+    merge_file: null
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: false
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: false
+  seed: 666
+  use_cpu_initialization: false
+  onnx_safe: false
+  gradient_as_bucket_view: true
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: false
+  data:
+    data_prefix:
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/wikipedia-wordpiece-32k-pretok-small_data_text_sentence
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/edepos_html-wordpiece-32k-pretok-small_data_text_sentence
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/oscar-wordpiece-32k-pretok-small_data_text_sentence
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/kw3-2017-wordpiece-32k-pretok-small_data_text_sentence
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/issues-wordpiece-32k-pretok-small_data_text_sentence
+    - 1
+    - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/mc4-wordpiece-32k-pretok-small_data_text_sentence
+    index_mapping_dir: /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/npy_files/
+    data_impl: mmap
+    splits_string: 980,10,10
+    seq_length: 512
+    skip_warmup: true
+    num_workers: 32
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    masked_lm_prob: 0.15
+    short_seq_prob: 0.1
+  optim:
+    name: fused_adam
+    lr: 0.0006
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 500
+      min_lr: 2.0e-05
+  precision: 16