smoothquant: fix auto-alpha KeyError when quantizing convs
Browse files
Running with --op-types MatMul,Conv (and the default --alpha auto) crashed in
the neural-compressor smoother:
    _reshape_scale_for_input -> KeyError: '/pre_encode/conv/conv.0/Conv'
Convs can be statically quantized but cannot be SMOOTHED by this library: the
existing layout shim always returns None for a conv weight (an (out,in,kh,kw)
tensor has no per-input-channel max matching the activation), so the auto-alpha
search iterates over the conv, finds no scale for it in tensor_scales_info, and
raises a KeyError. With a fixed --alpha the auto-tune path is skipped, which is
why the crash never surfaced there.
Decouple the smoother's op set from the quantizer's: smooth only the
matmul-family ops (smooth_op_types), but still hand Conv to the static
quantizer. The smoother's op set is passed in two ways: via
extra_options['SmoothQuantOpTypes'] and, because transform() drops that knob
just as it drops alpha, via the existing _SMOOTH_OVERRIDE shim (key
'op_types'). With this change, --op-types MatMul,Conv runs under auto-alpha.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
|
@@ -468,6 +468,14 @@ def main():
|
|
| 468 |
"entropy": CalibrationMethod.Entropy,
|
| 469 |
"percentile": CalibrationMethod.Percentile}[args.calibrate_method]
|
| 470 |
op_types = [t.strip() for t in args.op_types.split(",") if t.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
cfg = config.StaticQuantConfig(
|
| 472 |
calibration_data_reader=FeatureReader(feats),
|
| 473 |
quant_format=fmt,
|
|
@@ -490,6 +498,10 @@ def main():
|
|
| 490 |
execution_provider="CPUExecutionProvider",
|
| 491 |
extra_options={
|
| 492 |
"SmoothQuant": True,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
# alpha="auto" makes the smoother search a per-layer optimal alpha
|
| 494 |
# (minimising each layer's QDQ output error vs fp32) instead of forcing
|
| 495 |
# one global value onto FastConformer's very uneven outlier profile.
|
|
@@ -519,10 +531,16 @@ def main():
|
|
| 519 |
"alpha": eo["SmoothQuantAlpha"],
|
| 520 |
"folding": eo["SmoothQuantFolding"],
|
| 521 |
"auto_alpha_args": eo["AutoAlphaArgs"],
|
|
|
|
|
|
|
|
|
|
| 522 |
})
|
| 523 |
|
|
|
|
|
|
|
| 524 |
logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
|
| 525 |
-
f"calib={args.calibrate_method}, ops={
|
|
|
|
| 526 |
logger.info(f"[sq] {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
|
| 527 |
t0 = time.time()
|
| 528 |
# ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
|
|
|
|
| 468 |
"entropy": CalibrationMethod.Entropy,
|
| 469 |
"percentile": CalibrationMethod.Percentile}[args.calibrate_method]
|
| 470 |
op_types = [t.strip() for t in args.op_types.split(",") if t.strip()]
|
| 471 |
+
# Convs can be statically QUANTIZED but never SMOOTHED by this library: the
|
| 472 |
+
# layout shim above always returns None for a conv (its (out,in,kh,kw) weight
|
| 473 |
+
# has no per-input-channel max that matches the activation), so the auto-alpha
|
| 474 |
+
# search (_auto_tune_alpha -> _reshape_scale_for_input) KeyErrors on the conv's
|
| 475 |
+
# missing scale. So smooth only the matmul-family ops, but still hand the full
|
| 476 |
+
# requested set (incl. Conv) to the static quantizer below. This is also how
|
| 477 |
+
# istupakov ends up smaller: convs become plain static int8, no SmoothQuant.
|
| 478 |
+
smooth_op_types = [t for t in op_types if t not in ("Conv", "FusedConv")]
|
| 479 |
cfg = config.StaticQuantConfig(
|
| 480 |
calibration_data_reader=FeatureReader(feats),
|
| 481 |
quant_format=fmt,
|
|
|
|
| 498 |
execution_provider="CPUExecutionProvider",
|
| 499 |
extra_options={
|
| 500 |
"SmoothQuant": True,
|
| 501 |
+
# Smoother op set != quantizer op set: never send Conv to the smoother
|
| 502 |
+
# (see smooth_op_types above). Forwarded for a fixed/future library;
|
| 503 |
+
# the armed shim below also injects it into transform() directly.
|
| 504 |
+
"SmoothQuantOpTypes": smooth_op_types,
|
| 505 |
# alpha="auto" makes the smoother search a per-layer optimal alpha
|
| 506 |
# (minimising each layer's QDQ output error vs fp32) instead of forcing
|
| 507 |
# one global value onto FastConformer's very uneven outlier profile.
|
|
|
|
| 531 |
"alpha": eo["SmoothQuantAlpha"],
|
| 532 |
"folding": eo["SmoothQuantFolding"],
|
| 533 |
"auto_alpha_args": eo["AutoAlphaArgs"],
|
| 534 |
+
# transform() also forgets SmoothQuantOpTypes; inject it so Conv never
|
| 535 |
+
# reaches the smoother (auto-alpha would KeyError on its skipped scale).
|
| 536 |
+
"op_types": eo["SmoothQuantOpTypes"],
|
| 537 |
})
|
| 538 |
|
| 539 |
+
quant_only = [t for t in op_types if t not in smooth_op_types]
|
| 540 |
+
quant_only_note = f", quantize-only(no smooth)={quant_only}" if quant_only else ""
|
| 541 |
logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
|
| 542 |
+
f"calib={args.calibrate_method}, smooth-ops={smooth_op_types}"
|
| 543 |
+
f"{quant_only_note}, format={args.quant_format} ...")
|
| 544 |
logger.info(f"[sq] {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
|
| 545 |
t0 = time.time()
|
| 546 |
# ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
|