# sakamakismile's picture
# BLS-Mini-Code-1.0 NVFP4 (W4A4) for vLLM: 17.97GB, TP=2 ~104 t/s, model card w/ flags+TPS+benchmarks
# f6c7144 verified
# raw
# history blame contribute delete
# 366 Bytes
# Quantization recipe: a single stage holding one modifier group, which in
# turn configures one QuantizationModifier.
default_stage:
  default_modifiers:
    QuantizationModifier:
      # Module selector to quantize.
      # NOTE(review): presumably matched against the module class name
      # (e.g. torch.nn.Linear) -- confirm against the consuming tool.
      targets: [Linear]
      # Modules excluded from quantization. Plain entries are literal names;
      # entries prefixed with 're:' look like regular expressions matched
      # against module paths -- TODO confirm the prefix semantics.
      # Single quotes keep the backslashes literal.
      ignore:
        - lm_head
        - 're:.*mlp\.gate$'
        - 're:.*mlp\.shared_expert_gate$'
        - 're:.*\.input_layernorm$'
        - model.norm
        - 're:.*\.norm$'
        - 're:.*\.q_norm$'
        - 're:.*\.k_norm$'
        - 're:.*\.rotary_emb.*'
        - embed_tokens
      # Name of the quantization scheme preset to apply.
      scheme: NVFP4
      # NOTE(review): by its name this leaves divisibility checks enabled;
      # the exact check is defined by the consumer -- confirm.
      bypass_divisibility_checks: false