| { |
| "_name_or_path": "/idiap/home/mhe/repo/flame/exp/hybrid_gated_deltanet-nsa-1.3B-30B/batch1M.seqlen4096.warmup512.steps30720.lr3e-4/config.json", |
| "architectures": [ |
| "HybridGatedDeltaNetForCausalLM" |
| ], |
| "attn": { |
| "attn_type": "nsa", |
| "block_counts": 16, |
| "block_size": 64, |
| "head_dim": 64, |
| "layers": [ |
| 1, |
| 3, |
| 5, |
| 7, |
| 9, |
| 11, |
| 13, |
| 15, |
| 17, |
| 19, |
| 21, |
| 23 |
| ], |
| "num_heads": 32, |
| "num_kv_heads": 2, |
| "qkv_bias": false, |
| "rope_theta": 10000.0, |
| "window_size": 512 |
| }, |
| "attn_mode": "chunk", |
| "bos_token_id": 1, |
| "conv_size": 4, |
| "eos_token_id": 2, |
| "expand_v": 1, |
| "fuse_cross_entropy": true, |
| "fuse_norm": true, |
| "fuse_swiglu": true, |
| "head_dim": 256, |
| "hidden_act": "swish", |
| "hidden_ratio": 4, |
| "hidden_size": 2048, |
| "initializer_range": 0.006, |
| "intermediate_size": null, |
| "max_position_embeddings": 4096, |
| "model_type": "hybrid_gated_deltanet", |
| "norm_eps": 1e-06, |
| "num_heads": 8, |
| "num_hidden_layers": 24, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.49.0", |
| "use_beta": true, |
| "use_cache": true, |
| "use_gate": true, |
| "use_output_norm": true, |
| "use_short_conv": true, |
| "vocab_size": 32000 |
| } |
|
|