Initial upload

Browse files

Files changed (11) hide show

.gitattributes +1 -0
README.md +156 -0
config.json +9 -0
merges.txt +0 -0
model.py +211 -0
model.safetensors +3 -0
special_tokens_map.json +5 -0
test.py +27 -0
tokenizer.json +0 -0
tokenizer_config.json +20 -0
vocab.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.safetensors filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,156 @@

+# Custom GPT Language Model
+A custom GPT-style autoregressive transformer language model implemented from scratch in PyTorch.
+This project contains:
+- custom multi-head self-attention
+- transformer blocks
+- causal masking
+- autoregressive text generation
+- mixed precision training
+- top-k sampling with temperature scaling
+- safetensors model weights
+The model was trained on a subset of FineWeb-Edu using a GPT-2 tokenizer.
+---
+# Architecture
+Model configuration:
+```python
+{
+    "vocab_size": 50257,
+    "context_length": 256,
+    "emb_dim": 768,
+    "n_heads": 12,
+    "n_layers": 12,
+    "drop_rate": 0.1,
+    "qkv_bias": False
+}
+```
+Approximate parameter count:
+- ~162M parameters (~124M excluding the untied ~38.6M-parameter output head)
+Architecture components:
+- token embeddings
+- positional embeddings
+- masked multi-head self-attention
+- feed-forward MLP blocks
+- pre-layer normalization
+- residual connections
+- causal language modeling head
+---
+# Training
+Training setup:
+- PyTorch
+- AdamW optimizer
+- Automatic Mixed Precision (AMP)
+- Gradient clipping
+- Top-k text generation with temperature scaling
+Hardware used:
+- RTX 3060 Ti 8GB
+Dataset:
+- FineWeb-Edu subset (10M tokens)
+Tokenizer:
+- GPT-2 tokenizer
+---
+# Installation
+Install dependencies:
+```bash
+pip install torch transformers safetensors
+```
+---
+# Loading The Model
+```python
+import json
+import torch
+from safetensors.torch import load_file
+from transformers import AutoTokenizer
+from model import GPTModel
+# load config
+with open("config.json") as f:
+    cfg = json.load(f)
+# create model
+model = GPTModel(cfg)
+# load weights
+state_dict = load_file("model.safetensors")
+model.load_state_dict(state_dict)
+model.eval()
+# tokenizer
+tokenizer = AutoTokenizer.from_pretrained(".")
+```
+---
+# Text Generation Example
+```python
+from model import generate_and_print_sample
+generate_and_print_sample(model.to("cuda"), tokenizer, "cuda", "The world is big")  # prints the sample itself; returns None
+```
+---
+# Sample Generations
+Example generations from early-stage training:
+> "The world is big and is a whole for children. The best part of which has been made in the lives, and the state is an ideal man, but also the same one is in the world. “The only one has been created by people,” said the new study of the journal In the past, it is the best “s not people who have no longer to have not been seen in a few years.” “The only one who have one, the most famous in the country has no one at least three years. “If you’re very low, it is not a big or less than one’s risk.” The study is a study of people who have already been reported that the risk of people who are diagnosed with HIV-S"
+The model currently demonstrates:
+- syntactic coherence
+- topic persistence
+- autoregressive language modeling
+- early semantic structure
+---
+# Files
+```text
+model.py              # GPT architecture
+model.safetensors     # trained weights
+config.json           # model configuration
+tokenizer files       # GPT2 tokenizer assets
+README.md             # project documentation
+```
+---
+# Notes
+This is a custom PyTorch implementation and is not directly compatible with Hugging Face `AutoModelForCausalLM`.
+Users should load the model using the provided `model.py` architecture.
+---
+# License
+MIT License.

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "vocab_size": 50257,
+    "context_length": 256,
+    "emb_dim": 768,
+    "n_heads": 12,
+    "n_layers": 12,
+    "drop_rate": 0.1,
+    "qkv_bias": false
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import torch
+from torch.utils.data import Dataset, DataLoader
+import torch.nn as nn
+class MultiHeadAttention(nn.Module):
+    def __init__(self,d_in,d_out,context_length,dropout,qkv_bias,n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+        self.head_dim = d_out // n_heads
+        self.d_out = d_out
+        self.W_key = nn.Linear(d_in,d_out,bias=qkv_bias)
+        self.W_query = nn.Linear(d_in,d_out,bias=qkv_bias)
+        self.W_value = nn.Linear(d_in,d_out,bias=qkv_bias)
+        self.dropout = nn.Dropout(dropout)
+        self.proj = nn.Linear(d_out,d_out)
+        self.register_buffer(
+           'mask',
+           torch.triu(torch.ones(context_length, context_length),
+           diagonal=1)
+        )
+    def forward(self,x):
+        b,n_tokens,d_out = x.shape
+        keys = self.W_key(x).view(b,n_tokens,self.n_heads,self.head_dim)
+        queries = self.W_query(x).view(b,n_tokens,self.n_heads,self.head_dim)
+        values = self.W_value(x).view(b,n_tokens,self.n_heads,self.head_dim)
+        keys = keys.transpose(1,2)
+        queries = queries.transpose(1,2)
+        values = values.transpose(1,2)
+        attn_scores = queries @ keys.transpose(2,3)
+        attn_scores = attn_scores.masked_fill_(self.mask.bool()[:n_tokens,:n_tokens],-torch.inf)
+        attn_weights = torch.softmax(attn_scores/ keys.shape[-1]**0.5, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+        cntx_vec = (attn_weights @ values).transpose(1,2)
+        cntx_vec = cntx_vec.contiguous().view(b,n_tokens,self.d_out)
+        return self.proj(cntx_vec)
+class NormLayer(nn.Module):
+    def __init__(self,emb_dim):
+        super().__init__()
+        self.eps = 1e-5
+        self.scale = nn.Parameter(torch.ones(emb_dim))
+        self.shift = nn.Parameter(torch.zeros(emb_dim))
+    def forward(self,x):
+        mean = x.mean(dim=-1,keepdim=True)
+        var = x.var(dim=-1,keepdim=True,unbiased=False)
+        return self.scale * ((x-mean)/torch.sqrt(var+self.eps)) + self.shift
+class GELU(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return 0.5 * x * (1 + torch.tanh(
+            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
+            (x + 0.044715 * torch.pow(x, 3))
+        ))
+class FeedForward(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
+            GELU(),
+            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
+        )
+    def forward(self, x):
+        return self.layers(x)
+class TransformerBlock(nn.Module):
+    def __init__(self,cfg):
+        super().__init__()
+        self.attn = MultiHeadAttention(d_in=cfg["emb_dim"],d_out=cfg["emb_dim"],context_length=cfg["context_length"],dropout=cfg["drop_rate"],qkv_bias=cfg["qkv_bias"],n_heads=cfg["n_heads"])
+        self.ff = FeedForward(cfg)
+        self.norm1 = NormLayer(cfg["emb_dim"])
+        self.norm2 = NormLayer(cfg["emb_dim"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
+    def forward(self,x):
+        shortcut = x
+        x = self.norm1(x)
+        x = self.attn(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+        shortcut = x
+        x = self.norm2(x)
+        x = self.ff(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+        return x
+vocab_size=50257
+class GPTModel(nn.Module):
+    def __init__(self,cfg):
+        super().__init__()
+        self.tok_emb = nn.Embedding(vocab_size,cfg["emb_dim"])
+        self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])
+        self.drop_emb = nn.Dropout(cfg["drop_rate"])
+        self.tranf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
+        self.out_head = nn.Linear(cfg["emb_dim"],vocab_size)
+        self.final_norm = NormLayer(cfg["emb_dim"])
+    def forward(self,x):
+        b,n_inp = x.shape
+        tok_emb = self.tok_emb(x)
+        pos_emb = self.pos_emb(torch.arange(n_inp,device=x.device))
+        x = tok_emb + pos_emb
+        x= self.drop_emb(x)
+        x = self.tranf_blocks(x)
+        x = self.final_norm(x)
+        x = self.out_head(x)
+        return x
+def generate_text(
+    model,
+    idx,
+    max_new_tokens,
+    context_size,
+    temperature=0.7,
+    top_k=40
+):
+    model.eval()
+    for _ in range(max_new_tokens):
+        idx_cond = idx[:, -context_size:]
+        with torch.no_grad():
+            with torch.amp.autocast("cuda"):
+                logits = model(idx_cond)
+        logits = logits[:, -1, :]
+        # temperature scaling
+        logits = logits / temperature
+        # top-k filtering
+        top_logits, top_indices = torch.topk(
+            logits,
+            top_k
+        )
+        # probabilities only over top-k
+        top_probas = torch.softmax(
+            top_logits,
+            dim=-1
+        )
+        # sample from top-k
+        idx_next = top_indices.gather(
+            -1,
+            torch.multinomial(top_probas, 1)
+        )
+        idx = torch.cat((idx, idx_next), dim=1)
+    return idx
+def text_to_token_ids(text, tokenizer):
+    encoded = tokenizer.encode(text)
+    encoded_tensor = torch.tensor(encoded,device="cuda").unsqueeze(0)    #1
+    return encoded_tensor
+def token_ids_to_text(token_ids, tokenizer):
+    flat = token_ids.squeeze(0)                #2
+    return tokenizer.decode(flat.tolist())
+def generate_and_print_sample(model, tokenizer, device, start_context):
+    model.eval()
+    context_size = model.pos_emb.weight.shape[0]
+    encoded = text_to_token_ids(start_context, tokenizer).to("cuda")
+    with torch.no_grad():
+        token_ids = generate_text(
+            model=model, idx=encoded,
+            max_new_tokens=200, context_size=context_size,temperature=0.85,top_k=40
+        )
+    decoded_text = token_ids_to_text(token_ids, tokenizer)
+    print(decoded_text.replace("\n", " "))      #1
+    model.train()

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec8dfec355348b16ee3c12dee51e77fb6b0d1c38057ce888c0375a5c619ab29f
+size 653043260

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

test.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import json
+import torch
+from safetensors.torch import load_file
+from transformers import AutoTokenizer
+from model import GPTModel,generate_and_print_sample
+# load config
+with open("config.json") as f:
+    cfg = json.load(f)
+# create model
+model = GPTModel(cfg)
+model.to("cuda")
+# load weights
+state_dict = load_file("model.safetensors")
+model.load_state_dict(state_dict)
+model.eval()
+# tokenizer
+tokenizer = AutoTokenizer.from_pretrained(".")
+print(generate_and_print_sample(model, tokenizer, "cuda", "The world is big"))

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff