Upload 10 files

Browse files

Files changed (10) hide show

LICENSE.txt +21 -0
README.md +54 -3
config.json +59 -0
generation_config.json +6 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +346 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +44 -0

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Idiap Research Institute
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,3 +1,54 @@
----
-license: mit
----

+---
+license: mit
+---
+# gated-deltanet-nsa-1.4B-30B
+Gated DeltaNet + native sparse attention (1.4B params, 30B tokens)
+## Overview
+* **Training**: gated-deltanet-nsa-1.4B-30B was trained on [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu), which is released under [ODC-By v1.0](https://opendatacommons.org/licenses/by/1-0/)
+* **Parameters**: 1.4B
+* **Task**: Language modeling
+* **Framework**: HuggingFace, [flash-linear-attention](https://github.com/fla-org/flash-linear-attention)
+* **Output structure**: [batch_size, sequence_length, num_logits]
+## Performance
+Results on various benchmarks are reported in the [paper](http://arxiv.org/abs/2510.20787).
+## Running Code
+* Minimal code to instantiate the model and perform inference:
+```python
+# Requires flash-linear-attention (https://github.com/fla-org/flash-linear-attention)
+import fla
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model = AutoModelForCausalLM.from_pretrained(path_to_model).cuda()
+tokenizer = AutoTokenizer.from_pretrained(path_to_model)
+input_ids = tokenizer("All human beings are", return_tensors="pt").input_ids.cuda()
+outputs = model.generate(input_ids, max_length=15)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+## License
+This model is released under the [MIT License](LICENSE.txt)
+## Citation
+If you find our work useful, please cite the following publication:
+```bibtex
+@misc{he_alleviating_2025,
+    title = {Alleviating {Forgetfulness} of {Linear} {Attention} by {Hybrid} {Sparse} {Attention} and {Contextualized} {Learnable} {Token} {Eviction}},
+    url = {http://arxiv.org/abs/2510.20787},
+    doi = {10.48550/arXiv.2510.20787},
+    publisher = {arXiv},
+    author = {He, Mutian and Garner, Philip N.},
+    month = oct,
+    year = {2025},
+    note = {arXiv:2510.20787 [cs]},
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "_name_or_path": "/idiap/home/mhe/repo/flame/exp/hybrid_gated_deltanet-nsa-1.3B-30B/batch1M.seqlen4096.warmup512.steps30720.lr3e-4/config.json",
+  "architectures": [
+    "HybridGatedDeltaNetForCausalLM"
+  ],
+  "attn": {
+    "attn_type": "nsa",
+    "block_counts": 16,
+    "block_size": 64,
+    "head_dim": 64,
+    "layers": [
+      1,
+      3,
+      5,
+      7,
+      9,
+      11,
+      13,
+      15,
+      17,
+      19,
+      21,
+      23
+    ],
+    "num_heads": 32,
+    "num_kv_heads": 2,
+    "qkv_bias": false,
+    "rope_theta": 10000.0,
+    "window_size": 512
+  },
+  "attn_mode": "chunk",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_v": 1,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "fuse_swiglu": true,
+  "head_dim": 256,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 2048,
+  "initializer_range": 0.006,
+  "intermediate_size": null,
+  "max_position_embeddings": 4096,
+  "model_type": "hybrid_gated_deltanet",
+  "norm_eps": 1e-06,
+  "num_heads": 8,
+  "num_hidden_layers": 24,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_beta": true,
+  "use_cache": true,
+  "use_gate": true,
+  "use_output_norm": true,
+  "use_short_conv": true,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.49.0"
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9c00242508270fc36c50e84b92ca8063ef5d3c0490c6d8a3b2e0ffc0e6141c
+size 4984979984

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c93bc730f1cc6f01784e40a8a9d625b16a21173a43f22e3fb5f423b0f494dc5
+size 308289864

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,346 @@

+{
+  "metadata": {
+    "total_size": 5293232896
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embeddings.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.A_log": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.a_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.b_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.dt_bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.o_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.g_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.attn_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp_norm.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}