---
# Colloquial SFT: google/gemma-4-E4B-it, 50k pairs (data/russian_colloquial_sft.jsonl)
# GPU 24GB+: batch 1 + grad accum 8. RunPod: scripts/run_train_colloquial_e4b_gpu.sh
#
# Flat mapping of training hyperparameters. How each key is interpreted is up to
# the training script that loads this file (not visible here); notes marked
# "presumably" are assumptions to confirm against that script.

# Model and data
model_id: google/gemma-4-E4B-it
dataset_path: data/russian_colloquial_sft.jsonl
dataset_repeats: 1
max_seq_length: 512

# Batching: 1 sample per device x 8 accumulation steps = 8 samples per
# optimizer step per device.
per_device_train_batch_size: 1
gradient_accumulation_steps: 8

# Optimization schedule
num_train_epochs: 2
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML load a bare "1e-4" as a string rather than a float.
learning_rate: 1.0e-4
warmup_ratio: 0.03
lr_scheduler_type: cosine

# LoRA adapter (alpha is 2 x r here)
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
# Presumably the attention and MLP projection layers that receive adapters.
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# Memory and precision
gradient_checkpointing: true
# NOTE(review): presumably disabled because the KV cache is not usable
# together with gradient checkpointing during training; confirm in the trainer.
use_cache: false
bf16: true
fp16: false
device: auto

# Logging and checkpoints
logging_steps: 25
save_strategy: steps
save_steps: 1000

# Trainer behaviour and output
packing: false
output_dir: outputs/gemma4-e4b-lora-colloquial-ru
# Presumably restricts the loss to assistant turns; verify against the trainer.
assistant_only_loss: true