AlexWortega
/

moe100m-physics-tinybpe

+"""~100M-active Qwen3-MoE config for the tiny-vocab physics LM.
+Imports MoEModelConfig from the scaffold model.py and tunes the transformer
+to land ~90-120M ACTIVE params. With vocab~=512 + tied embeddings, the
+embedding table is ~0.33M params (512 x 640 d_model; negligible), so the
+whole budget is in the attention + MoE stack.
+Run `python config_100m.py` to print the active/total breakdown.
+"""
+from __future__ import annotations
+import os, sys
+# scaffold is the sibling dir ../scaffold
+_HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, os.path.join(_HERE, "..", "scaffold"))
+from model import MoEModelConfig  # noqa: E402
def make_config(vocab_size: int, max_seq_len: int = 1024) -> MoEModelConfig:
    """Build the ``MoEModelConfig`` for the ~100M-active tiny-vocab model.

    Only the vocabulary size and the maximum sequence length are
    caller-controlled; every other hyperparameter is pinned in this function.

    Args:
        vocab_size: Forwarded unchanged as the ``vocab_size`` field.
        max_seq_len: Forwarded unchanged as the ``max_seq_len`` field
            (defaults to 1024).

    Returns:
        A ``MoEModelConfig`` constructed from the keyword groups below.
    """
    # The groups are merged in the order written, so the constructor receives
    # the keywords in one fixed, readable sequence.
    backbone = {
        "vocab_size": vocab_size,
        "d_model": 640,
        "n_layers": 14,
    }
    # 10 query heads x head_dim 64 == 640, i.e. equal to d_model.
    attention = {
        "n_q_heads": 10,
        "n_kv_heads": 2,
        "head_dim": 64,
        "rope_partial": 32,
        "rope_theta": 10000.0,
    }
    experts = {
        "d_ff": 1024,
        "n_routed_experts": 8,
        "n_shared_experts": 1,
        "top_k": 2,
        "moe_first_layer": 1,
    }
    sequence = {
        "max_seq_len": max_seq_len,
        "tie_embeddings": True,
    }
    # fp16/V100 hardening — keep router math in fp32, moderate coeffs.
    router = {
        "router_z_coef": 1e-3,
        "router_aux_coef": 1e-3,
        "bias_update_rate": 1e-3,
        "router_noise_std": 0.0,
    }
    moe_runtime = {
        "moe_backend": "grouped",
        "moe_capacity_factor": 1.5,
        "smear_gate": True,
    }
    cross_entropy = {
        "use_chunked_ce": True,
        "ce_chunk_tokens": 512,
        "ce_checkpoint_chunks": True,
        "use_liger_ce": True,
    }
    init_and_kernels = {
        "mup_base_d": 512,
        "init_std": 0.02,
        "attn_backend": "sdpa",
    }

    settings = {
        **backbone,
        **attention,
        **experts,
        **sequence,
        **router,
        **moe_runtime,
        **cross_entropy,
        **init_and_kernels,
    }
    return MoEModelConfig(**settings)
def _print_param_report() -> None:
    """Print the active/total parameter breakdown for the configured model.

    Reads an optional ``--vocab`` integer (default 512) from the command line,
    builds the config via ``make_config`` and the corresponding ``MoEModel``,
    then prints one summary line followed by one line per entry of the model's
    ``param_breakdown()``. All counts are printed in millions.
    """
    # Imported here so they are only loaded when the file runs as a script.
    import argparse

    from model import MoEModel

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab", type=int, default=512)
    cli = parser.parse_args()

    cfg = make_config(cli.vocab)
    model = MoEModel(cfg)

    # Scale raw counts to millions before formatting.
    act = model.num_parameters(only_active=True) / 1e6
    tot = model.num_parameters() / 1e6
    print(f"VOCAB={cfg.vocab_size}  ACTIVE={act:.2f}M  TOTAL={tot:.2f}M")

    breakdown = model.param_breakdown()
    for k, v in breakdown.items():
        print(f"  {k:16s} {v/1e6:8.3f} M")


if __name__ == "__main__":
    _print_param_report()