{ "architecture": "nanoGPT (custom, trained from scratch)", "model_type": "3-stage pipeline: pretrained -> SFT -> RLVR", "primary_checkpoint": "nanogpt_slm_rlvr_final.pth", "model_config": { "block_size": 512, "vocab_size": 50257, "n_layer": 12, "n_head": 12, "n_embd": 768, "dropout": 0.0, "bias": true }, "total_parameters_millions": 124.0, "tokenizer": "tiktoken gpt2 (50,257 BPE tokens)", "framework": "PyTorch", "checkpoints": { "nanogpt_slm_tinystories_best.pth": "Stage 1 -- pretrained base", "nanogpt_slm_sft_best.pth": "Stage 2 -- SFT (positive-sentiment subset)", "nanogpt_slm_rlvr_final.pth": "Stage 3 -- RLVR (primary)" }, "dataset": { "name": "TinyStories (roneneldan/TinyStories)", "description": "2.1M synthetic short stories written for 3-to-5-year-old children" }, "training": { "stage1_pretraining": { "iterations": 70000, "learning_rate": "6e-4 -> 1e-5 cosine", "batch": "32 x 512, grad-accum 4", "precision": "bfloat16" }, "stage2_sft": { "data": "positive-sentiment subset (VADER compound > 0.05): 1.91M stories", "iterations": 12952, "learning_rate": "5e-5 -> 5e-6 cosine", "best_val_loss": 1.2037 }, "stage3_rlvr": { "algorithm": "vanilla policy gradient", "reward": "VADER compound sentiment (verifiable)", "kl_penalty": "beta=0.1 vs frozen SFT reference", "iterations": 200, "generation_batch": 16, "trajectory_len": 200, "learning_rate": "5e-6", "mean_reward": "+0.6485 -> +0.8652" } }, "sentiment_comparison": { "pretrained": { "mean": 0.8428, "std": 0.3907 }, "sft": { "mean": 0.8703, "std": 0.2853 }, "rlvr": { "mean": 0.9001, "std": 0.3371 }, "metric": "VADER compound sentiment of generated stories" } }