# Colloquial Russian SFT config: google/gemma-4-E4B-it fine-tuned on 50k pairs
# (data/russian_colloquial_sft.jsonl).
# Sized for a 24 GB+ GPU: per-device batch 1 with gradient accumulation 8.
# RunPod run script: scripts/run_train_colloquial_e4b_gpu.sh
# Base checkpoint to fine-tune, given as an "org/name" style identifier.
model_id: google/gemma-4-E4B-it

# Training data file (relative path). The .jsonl extension suggests one JSON
# record per line; the record schema is defined by the training script, which
# is not part of this file.
dataset_path: data/russian_colloquial_sft.jsonl
# NOTE(review): presumably how many times the dataset is repeated when it is
# loaded (1 = no oversampling) - confirm against the data loader.
dataset_repeats: 1

# Sequence length and optimization schedule.
# NOTE(review): the key names match Hugging Face TrainingArguments / TRL SFT
# options; presumably they are forwarded unchanged - confirm in the training
# script.
max_seq_length: 512
# Batch 1 with 8 accumulation steps: 1 x 8 = 8 sequences per optimizer step on
# each device.
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 2
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML only recognise this spelling as a float, while a bare `1e-4` is
# loaded as a string.
learning_rate: 1.0e-4
# NOTE(review): presumably 3% of the total optimizer steps are used for
# warmup before the cosine schedule takes over - confirm against the trainer.
warmup_ratio: 0.03
lr_scheduler_type: cosine

# LoRA adapter settings.
# alpha / r = 64 / 32 = 2. Under the standard LoRA scaling rule that ratio is
# the multiplier applied to the adapter update.
# NOTE(review): assumes the trainer uses the default alpha / r scaling rather
# than a variant such as rank-stabilized LoRA - confirm.
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
# Module names that receive adapters: four attention projections (q, k, v, o)
# and three MLP projections (gate, up, down).
# NOTE(review): assumes these names match the layer names of the loaded model -
# verify against the model's module list.
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# Memory and precision settings.
# NOTE(review): gradient checkpointing presumably trades extra recomputation
# for lower activation memory, and use_cache is turned off alongside it because
# a generation KV cache is not used while training - confirm in the trainer.
gradient_checkpointing: true
use_cache: false
# Exactly one reduced-precision mode is enabled: bf16 on, fp16 off.
bf16: true
fp16: false

# Runtime, logging and checkpoint cadence.
# NOTE(review): "auto" presumably lets the training script choose the device -
# confirm how the script resolves this value.
device: auto
# Step-based cadence: a log entry every 25 steps and a checkpoint every 1000
# steps.
# NOTE(review): no limit on the number of kept checkpoints is set here; check
# whether the training script applies one.
logging_steps: 25
save_strategy: steps
save_steps: 1000
# Sequence packing is disabled.
packing: false

# Directory for training outputs (relative path).
output_dir: outputs/gemma4-e4b-lora-colloquial-ru
# NOTE(review): presumably restricts the loss to assistant-turn tokens (TRL's
# SFTConfig has an option with this name) - confirm that the dataset format
# and chat template provide the assistant masks this needs.
assistant_only_loss: true