# Checkpointing, evaluation/logging cadence, early stopping, and experiment
# tracking settings for this run.
checkpoint:
  # Unset (null) in this config.
  checkpoint_path: null
  # Early stopping is configured on eval_loss with patience 6 and
  # threshold 0.005.
  # NOTE(review): presumably patience counts consecutive evaluations (one
  # every eval_steps) without an improvement of at least the threshold --
  # confirm against the consuming trainer.
  early_stopping_metric: eval_loss
  early_stopping_patience: 6
  early_stopping_threshold: 0.005
  # Step intervals. save_steps (1000) is an exact multiple of eval_steps
  # (500); keep it that way if the trainer pairs saved checkpoints with
  # evaluation results -- TODO confirm whether that is required.
  eval_steps: 500
  logging_steps: 10
  resume_from_checkpoint: false
  run_name: slavic-expert-gemma4b
  save_steps: 1000
  # Presumably the maximum number of checkpoints kept on disk -- confirm.
  save_total_limit: 5
  # Absolute, machine-specific output path; adjust when running elsewhere.
  serialization_dir: /work/nvme/bfzp/checkpoints/slavic_gemma_4b_expert
  wandb_entity: expert-multilingual-pretraining
  wandb_project: gemma-runs
# Dataset location, language-family selection, and sequence construction.
data:
  # 1.0 presumably means the full dataset is used -- confirm.
  data_fraction: 1.0
  # Absolute, machine-specific path; adjust when running elsewhere.
  data_prefix: /work/nvme/bfzp/madlad-tokenized-5B
  # A list with a single entry. The item is written flush with its parent
  # key, which is valid YAML; keep any added entries in the same style.
  families:
  - Slavic
  max_eval_samples: 10000
  # Presumably the sequence length in tokens -- confirm.
  max_length: 2048
  num_proc: 4
  # Packing is enabled with the "wrapped" strategy.
  # NOTE(review): the set of valid strategy names is defined by the consumer
  # and is not visible here.
  packing: true
  packing_strategy: wrapped
# Evaluation mode switches; both are disabled in this config.
eval:
  # Presumably true would skip training and only evaluate -- confirm.
  eval_only: false
  per_language_eval: false
# Base model selection and memory/compile switches.
model:
  # Unset (null) in this config.
  # NOTE(review): presumably the weights then come from model_name alone --
  # confirm.
  initialization_dir: null
  # Looks like a Hugging Face Hub model identifier -- confirm how the
  # consumer resolves it.
  model_name: google/gemma-3-4b-pt
  torch_compile: false
  use_gradient_checkpointing: true
# Regularization settings: layer-group sizes, per-group lambda weights, and
# name-pattern filters.
regularizer:
  attention_only: false
  exclude_patterns: null
  # Layer-group sizes: 6 "first" layers here and 4 "last" layers below.
  # NOTE(review): presumably the remaining layers form the "middle" group
  # weighted by lambda_middle_layers -- confirm.
  first_layers: 6
  freeze_middle: false
  include_patterns: null
  lambda_first_layers: 0.1
  lambda_l2: 0.01
  lambda_last_layers: 0.1
  lambda_middle_layers: 0.001
  last_layers: 4
  # Unset (null) in this config.
  # NOTE(review): presumably the layer count is then taken from the loaded
  # model -- confirm.
  num_layers: null
  # `none` is a plain YAML string, not a null: YAML null is spelled `null`
  # or `~`, and the other unset keys in this file use `null`.
  # NOTE(review): presumably this is a named mode that disables
  # regularization, which would leave the lambda_* values above unused --
  # confirm against the consumer.
  regularization_type: none
  revert_middle: false
# Settings for a "revert" operation; every path is unset (null) and the
# revert_checkpoint_only flag is false in this config.
revert:
  base_model_path: null
  # A separate key from checkpoint.checkpoint_path: same name, different
  # mapping, so this is not a duplicate key.
  checkpoint_path: null
  revert_checkpoint_only: false
  revert_output_path: null
# Optimization hyperparameters and the step budget.
training:
  # NOTE(review): if train_bsz is a per-device batch size, each optimizer
  # step covers 4 x 12 = 48 samples per device -- confirm.
  grad_accum: 12
  # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
  # PyYAML only resolve this spelling as a float and would load a form like
  # `2e-5` as a string.
  lr: 2.0e-05
  max_grad_norm: 1.0
  max_steps: 16954
  seed: 42
  train_bsz: 4
  valid_bsz: 4
  # 200 of the 16954 steps, roughly 1.2% of the run.
  warmup_steps: 200
  weight_decay: 0.01