"""~100M-active Qwen3-MoE config for the tiny-vocab physics LM.

Imports MoEModelConfig from the scaffold model.py and tunes the transformer
to land ~90-120M ACTIVE params. With vocab~=512 + tied embeddings, the
embedding table is ~0.33M params (512 x 640; negligible), so the whole budget
is in the attention + MoE stack.

Run `python config_100m.py` to print the active/total breakdown.
"""
from __future__ import annotations
import os, sys

# scaffold is the sibling dir ../scaffold
_HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_HERE, "..", "scaffold"))

from model import MoEModelConfig  # noqa: E402


def make_config(vocab_size: int, max_seq_len: int = 1024) -> MoEModelConfig:
    """Return the MoEModelConfig with this module's pinned hyperparameters.

    Only ``vocab_size`` and ``max_seq_len`` are caller-controlled; they are
    forwarded unchanged. Every other field is fixed below.

    Args:
        vocab_size: Forwarded as ``MoEModelConfig.vocab_size``.
        max_seq_len: Forwarded as ``MoEModelConfig.max_seq_len``
            (defaults to 1024).

    Returns:
        A freshly constructed ``MoEModelConfig``.
    """
    # Width/depth and attention settings.
    # Note: n_q_heads * head_dim == 10 * 64 == 640 == d_model.
    backbone = dict(
        d_model=640,
        n_layers=14,
        d_ff=1024,
        n_q_heads=10,
        n_kv_heads=2,
        head_dim=64,
        rope_partial=32,
        rope_theta=10000.0,
        attn_backend="sdpa",
        tie_embeddings=True,
        smear_gate=True,
    )

    # Expert layout and dispatch settings.
    experts = dict(
        n_routed_experts=8,
        n_shared_experts=1,
        top_k=2,
        moe_first_layer=1,
        moe_backend="grouped",
        moe_capacity_factor=1.5,
    )

    # Router settings. Original note: fp16/V100 hardening — keep router math
    # in fp32, moderate coeffs.
    router = dict(
        router_z_coef=1e-3,
        router_aux_coef=1e-3,
        bias_update_rate=1e-3,
        router_noise_std=0.0,
    )

    # Cross-entropy and initialisation settings.
    loss_and_init = dict(
        use_chunked_ce=True,
        ce_chunk_tokens=512,
        ce_checkpoint_chunks=True,
        use_liger_ce=True,
        mup_base_d=512,
        init_std=0.02,
    )

    # The groups have disjoint keys, so unpacking them together is equivalent
    # to spelling every keyword out in a single call.
    return MoEModelConfig(
        vocab_size=vocab_size,
        max_seq_len=max_seq_len,
        **backbone,
        **experts,
        **router,
        **loss_and_init,
    )


if __name__ == "__main__":
    # Script entry point: build the model for the requested vocab size and
    # print its active/total parameter counts (in millions), followed by the
    # per-component breakdown reported by the model.
    import argparse
    from model import MoEModel

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab", type=int, default=512)
    cfg = make_config(parser.parse_args().vocab)

    model = MoEModel(cfg)
    # Raw counts are scaled by 1e6 so the printout is in millions.
    act = model.num_parameters(only_active=True) / 1e6
    tot = model.num_parameters() / 1e6
    print(f"VOCAB={cfg.vocab_size}  ACTIVE={act:.2f}M  TOTAL={tot:.2f}M")

    # One line per entry of the model's own parameter breakdown.
    for k, v in model.param_breakdown().items():
        print(f"  {k:16s} {v/1e6:8.3f} M")