---
# Run configuration for the run named "slavic-expert-gemma4b".
# Layout: seven top-level sections, each a flat mapping of scalars
# (plus one list, data.families).
# NOTE(review): keys inside each section are alphabetically sorted, which is
# how the section boundaries were determined; `training` is assumed to be a
# top-level section like the others - confirm against the config loader.

# Checkpoint saving, evaluation/logging cadence, early stopping and
# experiment-tracker settings.
checkpoint:
  checkpoint_path: null
  early_stopping_metric: eval_loss
  early_stopping_patience: 6
  early_stopping_threshold: 0.005
  eval_steps: 500
  logging_steps: 10
  resume_from_checkpoint: false
  run_name: slavic-expert-gemma4b
  save_steps: 1000
  save_total_limit: 5
  serialization_dir: /work/nvme/bfzp/checkpoints/slavic_gemma_4b_expert
  wandb_entity: expert-multilingual-pretraining
  wandb_project: gemma-runs

# Dataset location and preprocessing options.
data:
  data_fraction: 1.0
  data_prefix: /work/nvme/bfzp/madlad-tokenized-5B
  # List of language families; this run selects a single one.
  families:
    - Slavic
  max_eval_samples: 10000
  max_length: 2048
  num_proc: 4
  packing: true
  packing_strategy: wrapped

# Evaluation mode switches (both off for this run).
eval:
  eval_only: false
  per_language_eval: false

# Base model and memory/compile options.
model:
  initialization_dir: null
  model_name: google/gemma-3-4b-pt
  torch_compile: false
  use_gradient_checkpointing: true

# Regularizer settings.
regularizer:
  attention_only: false
  exclude_patterns: null
  first_layers: 6
  freeze_middle: false
  include_patterns: null
  lambda_first_layers: 0.1
  lambda_l2: 0.01
  lambda_last_layers: 0.1
  lambda_middle_layers: 0.001
  last_layers: 4
  num_layers: null
  # This is the plain string "none", not YAML null.
  # NOTE(review): presumably this disables the regularizer, leaving the
  # lambda_* values above unused - confirm against the consumer.
  regularization_type: none
  revert_middle: false

# Revert settings; every path is unset (null) in this run.
revert:
  base_model_path: null
  checkpoint_path: null
  revert_checkpoint_only: false
  revert_output_path: null

# Optimizer and schedule hyperparameters.
training:
  grad_accum: 12
  # Written with a decimal point and a signed exponent so that YAML 1.1
  # loaders (e.g. PyYAML) resolve it as a float rather than a string.
  lr: 2.0e-05
  max_grad_norm: 1.0
  max_steps: 16954
  seed: 42
  train_bsz: 4
  valid_bsz: 4
  warmup_steps: 200
  weight_decay: 0.01