default_stage:
  default_modifiers:
    # Settings for the AWQModifier entry of this stage.
    AWQModifier:
      # Each mapping pairs one `smooth_layer` pattern with the list of
      # `balance_layers` patterns associated with it. The `re:` prefix
      # presumably marks the remainder as a regular expression - confirm
      # against the consumer. NOTE(review): most `.` characters in these
      # patterns are unescaped, so they match any character, not only a dot.
      mappings:
      # input_layernorm of layer indices 3, 7, 11, ..., 39 (every fourth
      # index), paired with the self_attn q/k/v projections.
      - smooth_layer: 're:.*layers\.(3|7|11|15|19|23|27|31|35|39)\.input_layernorm$'
        balance_layers:
        - 're:.*self_attn.q_proj$'
        - 're:.*self_attn.k_proj$'
        - 're:.*self_attn.v_proj$'
        activation_hook_target: null
      # input_layernorm of every index in 0-38 that is not listed in the
      # mapping above, paired with the linear_attn input projections.
      - smooth_layer: 're:.*layers\.(0|1|2|4|5|6|8|9|10|12|13|14|16|17|18|20|21|22|24|25|26|28|29|30|32|33|34|36|37|38)\.input_layernorm$'
        balance_layers:
        - 're:.*linear_attn.in_proj_qkv$'
        - 're:.*linear_attn.in_proj_z$'
        - 're:.*linear_attn.in_proj_b$'
        - 're:.*linear_attn.in_proj_a$'
        activation_hook_target: null
      # post_attention_layernorm (any layer index), paired with the expert
      # and shared-expert gate/up projections and the shared_expert_gate.
      - smooth_layer: 're:.*post_attention_layernorm$'
        balance_layers:
        - 're:.*mlp.experts.*.gate_proj$'
        - 're:.*mlp.experts.*.up_proj$'
        - 're:.*mlp.shared_expert_gate$'
        - 're:.*mlp.shared_expert.gate_proj$'
        - 're:.*mlp.shared_expert.up_proj$'
        activation_hook_target: null
      # Any module whose name ends in up_proj, paired with down_proj.
      - smooth_layer: 're:.*up_proj$'
        balance_layers:
        - 're:.*down_proj$'
        activation_hook_target: null
      # NOTE(review): this Python-specific tag cannot be constructed by a
      # safe YAML loader (e.g. yaml.safe_load); confirm how the consumer
      # loads this file before relying on it.
      offload_device: !!python/object/apply:torch.device [cpu]
      duo_scaling: true
      n_grid: 20
    # Settings for the QuantizationModifier entry of this stage.
    QuantizationModifier:
      # Module selector the scheme is applied to.
      targets:
      - Linear
      # Names / `re:`-prefixed patterns excluded from `targets`.
      # NOTE(review): the linear_attn and shared_expert_gate patterns below
      # also appear among the AWQModifier balance_layers in this file -
      # confirm that overlap is intended.
      ignore:
      - lm_head
      - 're:.*embed_tokens.*'
      - 're:.*\.linear_attn\..*'
      - 're:.*mlp\.gate$'
      - 're:.*shared_expert_gate.*'
      - 're:.*visual.*'
      - 're:.*\.mtp\..*'
      - 're:^mtp\..*'
      - 're:.*norm.*'
      # Preset name; presumably 4-bit weights with 16-bit activations -
      # confirm against the consumer's scheme table.
      scheme: W4A16
      bypass_divisibility_checks: false