NVFP4 self-quant (llm-compressor): FP8 attn/GDN + NVFP4-W4A16 experts; beats redhat/unsloth on quality+speed+size

894cdfa verified 5 days ago

2.19 kB

	quant_stage:
	quant_modifiers:
	QuantizationModifier:
	config_groups:
	group_0:
	targets: ['re:.self_attn.q_proj.', 're:.self_attn.k_proj.', 're:.self_attn.v_proj.',
	're:.self_attn.o_proj.', 're:.linear_attn.in_proj_qkv.', 're:.linear_attn.in_proj_z.',
	're:.linear_attn.out_proj.', 're:.shared_expert.gate_proj.', 're:.shared_expert.up_proj.',
	're:.shared_expert.down_proj.']
	weights:
	num_bits: 8
	type: float
	symmetric: true
	group_size: null
	strategy: block
	block_structure: [128, 128]
	dynamic: false
	actorder: null
	scale_dtype: null
	zp_dtype: null
	observer: memoryless_minmax
	observer_kwargs: {}
	input_activations:
	num_bits: 8
	type: float
	symmetric: true
	group_size: 128
	strategy: group
	block_structure: null
	dynamic: true
	actorder: null
	scale_dtype: null
	zp_dtype: null
	observer: null
	observer_kwargs: {}
	output_activations: null
	format: null
	group_1:
	targets: ['re:.mlp.experts.gate_proj.', 're:.mlp.experts.up_proj.', 're:.mlp.experts.down_proj.*']
	weights:
	num_bits: 4
	type: float
	symmetric: true
	group_size: 16
	strategy: tensor_group
	block_structure: null
	dynamic: false
	actorder: null
	scale_dtype: torch.float8_e4m3fn
	zp_dtype: null
	observer: memoryless_minmax
	observer_kwargs: {}
	input_activations: null
	output_activations: null
	format: null
	targets: [Linear]
	ignore: ['re:.lm_head', 're:.embed_tokens', 're:visual.', 're:model.visual.', 're:.*mlp.gate$',
	're:.shared_expert_gate$', 're:.linear_attn.in_proj_a', 're:.*linear_attn.in_proj_b',
	're:.linear_attn.conv1d', 're:^mtp\..']
	bypass_divisibility_checks: false