# Quantization recipe: a single stage (`quant_stage`) holding one
# QuantizationModifier with two config groups.
#   group_0 - 8-bit float weights (static, 128x128 blocks) plus dynamic 8-bit
#             float input activations (group size 128) for the attention,
#             linear-attention and shared-expert projections.
#   group_1 - 4-bit float weights only (group size 16, float8_e4m3fn scales)
#             for the routed-expert projections under `mlp.experts`.
# NOTE(review): the structure below was rebuilt from a source in which the
# whole document sat on one line; nesting was inferred from the key order -
# confirm against the consumer's schema.
# NOTE(review): entries prefixed `re:` are presumably regular expressions;
# their dots are unescaped, so each `.` matches any character, not only a
# literal dot - confirm this is intended.
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      config_groups:
        group_0:
          targets:
            - 're:.*self_attn.q_proj.*'
            - 're:.*self_attn.k_proj.*'
            - 're:.*self_attn.v_proj.*'
            - 're:.*self_attn.o_proj.*'
            - 're:.*linear_attn.in_proj_qkv.*'
            - 're:.*linear_attn.in_proj_z.*'
            - 're:.*linear_attn.out_proj.*'
            - 're:.*shared_expert.gate_proj.*'
            - 're:.*shared_expert.up_proj.*'
            - 're:.*shared_expert.down_proj.*'
          weights:
            num_bits: 8
            type: float
            symmetric: true
            # group_size is unset here; the block strategy uses
            # block_structure instead.
            group_size: null
            strategy: block
            block_structure: [128, 128]
            dynamic: false
            actorder: null
            scale_dtype: null
            zp_dtype: null
            observer: memoryless_minmax
            observer_kwargs: {}
          input_activations:
            num_bits: 8
            type: float
            symmetric: true
            group_size: 128
            strategy: group
            block_structure: null
            # Dynamic activations: no observer is configured for them.
            dynamic: true
            actorder: null
            scale_dtype: null
            zp_dtype: null
            observer: null
            observer_kwargs: {}
          output_activations: null
          format: null
        group_1:
          targets:
            - 're:.*mlp.experts.*gate_proj.*'
            - 're:.*mlp.experts.*up_proj.*'
            - 're:.*mlp.experts.*down_proj.*'
          weights:
            num_bits: 4
            type: float
            symmetric: true
            group_size: 16
            strategy: tensor_group
            block_structure: null
            dynamic: false
            actorder: null
            scale_dtype: torch.float8_e4m3fn
            zp_dtype: null
            observer: memoryless_minmax
            observer_kwargs: {}
          # Weight-only group: no activation quantization is configured.
          input_activations: null
          output_activations: null
          format: null
      # Modifier-level settings (siblings of config_groups). `targets` cannot
      # belong to group_1, which already defines its own `targets` key.
      targets: [Linear]
      # Module-name patterns listed under `ignore`; presumably these modules
      # are left unquantized - verify against the consumer.
      ignore:
        - 're:.*lm_head'
        - 're:.*embed_tokens'
        - 're:visual.*'
        - 're:model.visual.*'
        - 're:.*mlp.gate$'
        - 're:.*shared_expert_gate$'
        - 're:.*linear_attn.in_proj_a'
        - 're:.*linear_attn.in_proj_b'
        - 're:.*linear_attn.conv1d'
        - 're:^mtp\..*'
      bypass_divisibility_checks: false