{
  "architectures": [
    "HybridGatedDeltaNetForCausalLM"
  ],
  "attn": {
    "attn_type": "lte",
    "buffer_size": 512,
    "conv_dilation": [
      2,
      2,
      2
    ],
    "conv_size": [
      3,
      3,
      3
    ],
    "dropout_prob": 0.1,
    "l1_alpha": 1.5,
    "l1_alpha_neg": 2,
    "l1_lambda_init": 1e-09,
    "l1_lambda_initial_steps": 0,
    "l1_lambda_max": 1.0,
    "l1_lambda_min": 1e-09,
    "l1_lambda_shared": "head",
    "l1_lambda_tolerate": [
      0.95,
      1.0
    ],
    "l1_lambda_update_steps": 32,
    "l1_warmup_steps": 0,
    "layers": [
      1,
      3,
      5,
      7,
      9,
      11,
      13,
      15,
      17,
      19,
      21,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 8,
    "qkv_bias": false,
    "rope_theta": 10000.0,
    "routing_freeze_steps": 0,
    "routing_grad": true,
    "routing_grad_mode": "mask_sum",
    "routing_method": "sigmoid",
    "routing_noise_steps": 0,
    "routing_shared": "head",
    "routing_with_rope": false,
    "sink_size": 4,
    "window_size": 767
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 2048,
  "initializer_range": 0.006,
  "intermediate_size": null,
  "max_position_embeddings": 4096,
  "model_type": "hybrid_gated_deltanet",
  "norm_eps": 1e-06,
  "num_heads": 8,
  "num_hidden_layers": 24,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_gate": true,
  "use_short_conv": true,
  "vocab_size": 32000
}