AlexWortega committed on
Commit
9f577b0
·
verified ·
1 Parent(s): cd5ff3a

Upload config_100m.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. config_100m.py +67 -0
config_100m.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """~100M-active Qwen3-MoE config for the tiny-vocab physics LM.
2
+
3
+ Imports MoEModelConfig from the scaffold model.py and tunes the transformer
4
+ to land ~90-120M ACTIVE params. With vocab~=512 + tied embeddings, the
5
+ embedding table is ~0.26M params (negligible), so the whole budget is in the
6
+ attention + MoE stack.
7
+
8
+ Run `python config_100m.py` to print the active/total breakdown.
9
+ """
10
+ from __future__ import annotations
11
+ import os, sys
12
+
13
+ # scaffold is the sibling dir ../scaffold
14
+ _HERE = os.path.dirname(os.path.abspath(__file__))
15
+ sys.path.insert(0, os.path.join(_HERE, "..", "scaffold"))
16
+
17
+ from model import MoEModelConfig # noqa: E402
18
+
19
+
20
def make_config(vocab_size: int, max_seq_len: int = 1024) -> MoEModelConfig:
    """Return the ``MoEModelConfig`` preset defined by this module.

    Only ``vocab_size`` and ``max_seq_len`` are taken from the caller; every
    other field is a fixed value belonging to this preset.

    Args:
        vocab_size: Forwarded unchanged as ``vocab_size``.
        max_seq_len: Forwarded unchanged as ``max_seq_len`` (default 1024).

    Returns:
        A ``MoEModelConfig`` built from the keyword groups below.
    """
    # Fields are grouped only for readability; they are all passed as plain
    # keyword arguments to a single MoEModelConfig(...) call.
    widths = dict(
        d_model=640,
        n_layers=14,
        d_ff=1024,
        tie_embeddings=True,
    )
    attention = dict(
        n_q_heads=10,
        n_kv_heads=2,
        head_dim=64,
        rope_partial=32,
        rope_theta=10000.0,
        attn_backend="sdpa",
    )
    experts = dict(
        n_routed_experts=8,
        n_shared_experts=1,
        top_k=2,
        moe_first_layer=1,
        moe_backend="grouped",
        moe_capacity_factor=1.5,
    )
    # fp16/V100 hardening — keep router math in fp32, moderate coeffs.
    router = dict(
        router_z_coef=1e-3,
        router_aux_coef=1e-3,
        bias_update_rate=1e-3,
        router_noise_std=0.0,
    )
    cross_entropy = dict(
        use_chunked_ce=True,
        ce_chunk_tokens=512,
        ce_checkpoint_chunks=True,
        use_liger_ce=True,
    )
    misc = dict(
        smear_gate=True,
        mup_base_d=512,
        init_std=0.02,
    )
    return MoEModelConfig(
        vocab_size=vocab_size,
        max_seq_len=max_seq_len,
        **widths,
        **attention,
        **experts,
        **router,
        **cross_entropy,
        **misc,
    )
53
+
54
+
55
if __name__ == "__main__":
    # Script entry point: build the preset, instantiate the model, and print
    # the active/total parameter counts followed by the per-group breakdown.
    import argparse

    from model import MoEModel

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab", type=int, default=512)
    cfg = make_config(parser.parse_args().vocab)
    model = MoEModel(cfg)

    # All counts are divided by 1e6 so they are reported in millions.
    active_m = model.num_parameters(only_active=True) / 1e6
    total_m = model.num_parameters() / 1e6
    print(f"VOCAB={cfg.vocab_size} ACTIVE={active_m:.2f}M TOTAL={total_m:.2f}M")

    for group, count in model.param_breakdown().items():
        print(f" {group:16s} {count/1e6:8.3f} M")