---
# HybriKo Default Configuration
# ~117.8M parameters, optimized for Colab T4 (16GB VRAM)
#
# Four top-level sections:
#   model      - network architecture hyperparameters
#   training   - optimizer, schedule, and batching settings
#   data       - corpus sampling and filtering settings
#   tokenizer  - tokenizer training settings

model:
  d_model: 768        # Hidden dimension
  n_layers: 12        # Number of hybrid layers
  # NOTE(review): presumably must equal tokenizer.vocab_size below — confirm
  # against the consumer and keep the two in sync.
  vocab_size: 32000   # Vocabulary size
  n_heads: 12         # Attention heads
  n_kv_heads: 3       # KV heads for GQA (1:4 ratio, i.e. 3 KV : 12 query heads)
  ff_mult: 3          # Feed-forward multiplier
  max_seq_len: 1024   # Maximum sequence length

training:
  # Written with a decimal point and a signed exponent so that YAML 1.1
  # loaders also resolve it as a float rather than a string.
  learning_rate: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 20
  max_steps: 1000
  grad_accum_steps: 1
  save_steps: 500
  batch_size: 16
  # NOTE(review): presumably must not exceed model.max_seq_len (1024) — confirm.
  max_length: 512     # Training sequence length

data:
  num_samples: 30000
  min_length: 50
  tokenizer_samples: 100000

tokenizer:
  vocab_size: 32000
  model_type: unigram
  character_coverage: 0.9995