smoothquant: honor --alpha (work around neural-compressor dropping SmoothQuant* extra_options)

onnx-neural-compressor 1.0 silently drops the SmoothQuant* knobs set in
StaticQuantConfig.extra_options: smooth_quant_entry calls
Smoother.transform(**config.to_dict()), but StaticQuantConfig.to_dict()
buries SmoothQuantAlpha/AutoAlphaArgs/SmoothQuantFolding inside a nested
extra_options dict and never emits the top-level alpha/auto_alpha_args/folding
kwargs that transform() reads, so smoothing always ran at the default alpha=0.5 regardless
of --alpha. Wrap Smoother.transform to inject those values from extra_options,
armed by main() only after a runtime guard confirms the bug is still present
(cfg.to_dict() lacks a top-level 'alpha'), so a fixed future library disarms it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (1) hide show

scripts/quantize-int8-smoothquant.py +44 -0

scripts/quantize-int8-smoothquant.py CHANGED Viewed

@@ -160,6 +160,34 @@ def _safe_get_smooth_scales(self, alpha, target_list=[]):
 _sq_core.Smoother._get_smooth_scale = _safe_get_smooth_scale
 _sq_core.Smoother._get_smooth_scales = _safe_get_smooth_scales
 # Audio file extensions recognised when an --audio entry is a folder (or the
 # default ./calibration_audio folder is scanned). SmoothQuant calibrates on
 # activations (not labels), so any speech clip works.
@@ -467,6 +495,22 @@ def main():
         },
     )
     logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
                 f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
     logger.info(f"[sq]   {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")

 _sq_core.Smoother._get_smooth_scale = _safe_get_smooth_scale
 _sq_core.Smoother._get_smooth_scales = _safe_get_smooth_scales
+# --- alpha (and auto-alpha / folding) pass-through fix ------------------------
+# onnx-neural-compressor 1.0 SILENTLY DROPS the SmoothQuant* knobs you set in
+# StaticQuantConfig.extra_options. quantize() hands the StaticQuantConfig to
+# smooth_quant_entry(), which smooths via `Smoother.transform(**config.to_dict())`
+# -- but StaticQuantConfig.to_dict() buries SmoothQuantAlpha / AutoAlphaArgs /
+# SmoothQuantFolding inside a nested "extra_options" dict and never emits the
+# top-level `alpha` / `auto_alpha_args` / `folding` kwargs that transform() reads.
+# So transform() ALWAYS falls back to its hard-coded defaults (alpha=0.5, the
+# [0.3,0.7] auto grid) regardless of --alpha. The intended path is the dedicated
+# SmoothQuantConfig (whose to_dict() does surface those names), but quantize()
+# never builds one. We wrap transform() to inject the values from extra_options
+# under the names transform() expects. main() only ARMS this (populates
+# _SMOOTH_OVERRIDE) after a runtime guard confirms the bug is still present, so a
+# future fixed library that forwards alpha itself transparently disables the shim.
+# Like the layout shim above, this reaches into library internals and may need
+# revisiting on a neural-compressor upgrade; contained to this export script.
+_SMOOTH_OVERRIDE = {}  # transform() kwargs to force; empty means the shim is disarmed (main() fills it)
+_orig_transform = _sq_core.Smoother.transform  # keep the library's transform so the wrapper can delegate to it
+def _forced_transform(self, *args, **kwargs):  # drop-in wrapper installed over Smoother.transform below
+    if _SMOOTH_OVERRIDE:  # armed only once the override dict has been populated
+        kwargs.update(_SMOOTH_OVERRIDE)  # override values replace same-named kwargs passed by the caller
+    return _orig_transform(self, *args, **kwargs)  # positional args and all other kwargs pass through unchanged
+_sq_core.Smoother.transform = _forced_transform  # monkeypatch: Smoother.transform calls now go through the wrapper
 # Audio file extensions recognised when an --audio entry is a folder (or the
 # default ./calibration_audio folder is scanned). SmoothQuant calibrates on
 # activations (not labels), so any speech clip works.
         },
     )
+    # Arm the alpha pass-through shim (see _forced_transform): translate the
+    # SmoothQuant* keys we set in extra_options into the top-level transform()
+    # kwargs the library forgets to forward. Guard: if a future neural-compressor
+    # fixes this, cfg.to_dict() will surface a top-level "alpha" on its own, so we
+    # leave the shim disarmed (and say so) rather than double-driving transform().
+    eo = cfg.extra_options
+    if "alpha" in cfg.to_dict():
+        logger.info("[sq] note: onnx-neural-compressor now forwards SmoothQuantAlpha "
+                    "itself; the alpha pass-through shim is obsolete and stays disarmed.")
+    else:
+        _SMOOTH_OVERRIDE.update({
+            "alpha": eo["SmoothQuantAlpha"],
+            "folding": eo["SmoothQuantFolding"],
+            "auto_alpha_args": eo["AutoAlphaArgs"],
+        })
     logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
                 f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
     logger.info(f"[sq]   {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")