"""~100M-active Qwen3-MoE config for the tiny-vocab physics LM. Imports MoEModelConfig from the scaffold model.py and tunes the transformer to land ~90-120M ACTIVE params. With vocab~=512 + tied embeddings, the embedding table is ~0.26M params (negligible), so the whole budget is in the attention + MoE stack. Run `python config_100m.py` to print the active/total breakdown. """ from __future__ import annotations import os, sys # scaffold is the sibling dir ../scaffold _HERE = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(_HERE, "..", "scaffold")) from model import MoEModelConfig # noqa: E402 def make_config(vocab_size: int, max_seq_len: int = 1024) -> MoEModelConfig: return MoEModelConfig( vocab_size=vocab_size, d_model=640, n_layers=14, n_q_heads=10, n_kv_heads=2, head_dim=64, rope_partial=32, rope_theta=10000.0, d_ff=1024, n_routed_experts=8, n_shared_experts=1, top_k=2, moe_first_layer=1, max_seq_len=max_seq_len, tie_embeddings=True, # fp16/V100 hardening — keep router math in fp32, moderate coeffs. router_z_coef=1e-3, router_aux_coef=1e-3, bias_update_rate=1e-3, router_noise_std=0.0, moe_backend="grouped", moe_capacity_factor=1.5, smear_gate=True, use_chunked_ce=True, ce_chunk_tokens=512, ce_checkpoint_chunks=True, use_liger_ce=True, mup_base_d=512, init_std=0.02, attn_backend="sdpa", ) if __name__ == "__main__": import argparse from model import MoEModel ap = argparse.ArgumentParser() ap.add_argument("--vocab", type=int, default=512) args = ap.parse_args() cfg = make_config(args.vocab) m = MoEModel(cfg) act = m.num_parameters(only_active=True) / 1e6 tot = m.num_parameters() / 1e6 print(f"VOCAB={cfg.vocab_size} ACTIVE={act:.2f}M TOTAL={tot:.2f}M") for k, v in m.param_breakdown().items(): print(f" {k:16s} {v/1e6:8.3f} M")