# Compression recipe: one stage (`default_stage`) holding two modifiers,
# AWQModifier (smoothing mappings) and QuantizationModifier (W4A16 scheme).
default_stage:
  default_modifiers:
    AWQModifier:
      # Each mapping pairs one `smooth_layer` pattern with the
      # `balance_layers` patterns listed beneath it.
      # NOTE(review): the `re:` prefix presumably marks the rest of the
      # string as a regular expression — confirm against the consumer.
      mappings:
        # Layer indices 3, 7, ..., 39 (every fourth layer): input_layernorm
        # paired with the self_attn q/k/v projections.
        - smooth_layer: 're:.*layers\.(3|7|11|15|19|23|27|31|35|39)\.input_layernorm$'
          balance_layers:
            - 're:.*self_attn.q_proj$'
            - 're:.*self_attn.k_proj$'
            - 're:.*self_attn.v_proj$'
          activation_hook_target: null

        # The remaining layer indices in 0-38: input_layernorm paired with
        # the linear_attn input projections.
        # NOTE(review): these balance layers also match the
        # `re:.*\.linear_attn\..*` entry in QuantizationModifier.ignore
        # below — confirm that smoothing non-quantized layers is intended.
        - smooth_layer: 're:.*layers\.(0|1|2|4|5|6|8|9|10|12|13|14|16|17|18|20|21|22|24|25|26|28|29|30|32|33|34|36|37|38)\.input_layernorm$'
          balance_layers:
            - 're:.*linear_attn.in_proj_qkv$'
            - 're:.*linear_attn.in_proj_z$'
            - 're:.*linear_attn.in_proj_b$'
            - 're:.*linear_attn.in_proj_a$'
          activation_hook_target: null

        # post_attention_layernorm paired with the expert and shared-expert
        # gate/up projections.
        # NOTE(review): `shared_expert_gate` is also matched by the
        # QuantizationModifier.ignore list below — confirm this is intended.
        - smooth_layer: 're:.*post_attention_layernorm$'
          balance_layers:
            - 're:.*mlp.experts.*.gate_proj$'
            - 're:.*mlp.experts.*.up_proj$'
            - 're:.*mlp.shared_expert_gate$'
            - 're:.*mlp.shared_expert.gate_proj$'
            - 're:.*mlp.shared_expert.up_proj$'
          activation_hook_target: null

        # Any up_proj paired with down_proj.
        - smooth_layer: 're:.*up_proj$'
          balance_layers:
            - 're:.*down_proj$'
          activation_hook_target: null

      # Python-specific tag: constructs torch.device("cpu") at load time.
      # Safe YAML loaders reject this tag, so the file must be read by a
      # loader that supports `python/object/apply`; only load trusted
      # recipes that way.
      offload_device: !!python/object/apply:torch.device [cpu]
      duo_scaling: true
      n_grid: 20

    QuantizationModifier:
      targets:
        - Linear
      # Module-name patterns excluded from quantization.
      ignore:
        - lm_head
        - 're:.*embed_tokens.*'
        - 're:.*\.linear_attn\..*'
        - 're:.*mlp\.gate$'
        - 're:.*shared_expert_gate.*'
        - 're:.*visual.*'
        - 're:.*\.mtp\..*'
        - 're:^mtp\..*'
        - 're:.*norm.*'
      scheme: W4A16
      bypass_divisibility_checks: false