thiswillbeyourgithub Claude Opus 4.8 committed on
Commit ·
fa7c830
1
Parent(s): 7d8c10c
quantize-int8-smoothquant: route output through loguru + log to file
Browse files
Replace all print() with loguru logger calls and add a file sink so every
run's logs (calibration windows, alpha, fidelity cosine, the wer-quants
commands) are kept in a local quantize-int8-smoothquant-<timestamp>.log in
addition to stderr. Adds a --log-file override and the loguru dependency.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
scripts/quantize-int8-smoothquant.py
CHANGED
|
@@ -10,6 +10,7 @@
|
|
| 10 |
# "prettytable",
|
| 11 |
# "psutil",
|
| 12 |
# "scipy",
|
|
|
|
| 13 |
# ]
|
| 14 |
# ///
|
| 15 |
"""Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
|
|
@@ -85,17 +86,41 @@ import shutil
|
|
| 85 |
import subprocess
|
| 86 |
import sys
|
| 87 |
import time
|
|
|
|
| 88 |
from pathlib import Path
|
| 89 |
|
| 90 |
import numpy as np
|
| 91 |
import onnx
|
| 92 |
import onnxruntime as ort
|
|
|
|
| 93 |
from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
|
| 94 |
from onnx_neural_compressor import data_reader
|
| 95 |
from onnx_neural_compressor.quantization import config, quantize
|
| 96 |
from onnx_neural_compressor.algorithms.smoother import core as _sq_core
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
# --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
|
| 100 |
# The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
|
| 101 |
# with the in-channel LAST (there is a literal TODO admitting this in
|
|
@@ -174,7 +199,8 @@ def human(n):
|
|
| 174 |
def find_ffmpeg(explicit=None):
|
| 175 |
cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
|
| 176 |
if not cand or not shutil.which(cand) and not os.path.exists(cand):
|
| 177 |
-
|
|
|
|
| 178 |
return cand
|
| 179 |
|
| 180 |
|
|
@@ -210,11 +236,12 @@ def collect_windows(ffmpeg, audio_paths, window_sec, num_windows):
|
|
| 210 |
windows.append(pcm[start:start + win])
|
| 211 |
start += win
|
| 212 |
count += 1
|
| 213 |
-
|
| 214 |
if not windows:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
| 218 |
if len(windows) > num_windows:
|
| 219 |
# Even stride across the whole pool for speaker/content diversity.
|
| 220 |
idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
|
|
@@ -364,15 +391,22 @@ def main():
|
|
| 364 |
"measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
|
| 365 |
"floor sits well below that. A true collapse lands far lower.")
|
| 366 |
ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
|
|
|
|
|
|
|
|
|
|
| 367 |
args = ap.parse_args()
|
| 368 |
|
|
|
|
|
|
|
|
|
|
| 369 |
model_dir = Path(args.model_dir)
|
| 370 |
in_encoder = model_dir / "encoder-model.onnx"
|
| 371 |
pre_path = model_dir / "nemo128.onnx"
|
| 372 |
out_encoder = model_dir / args.out_name
|
| 373 |
for p in (in_encoder, pre_path):
|
| 374 |
if not p.exists():
|
| 375 |
-
|
|
|
|
| 376 |
|
| 377 |
# alpha is either the literal "auto" (per-layer search) or a fixed float.
|
| 378 |
if str(args.alpha).strip().lower() == "auto":
|
|
@@ -381,14 +415,15 @@ def main():
|
|
| 381 |
try:
|
| 382 |
alpha = float(args.alpha)
|
| 383 |
except ValueError:
|
| 384 |
-
|
|
|
|
| 385 |
|
| 386 |
ffmpeg = find_ffmpeg(args.ffmpeg)
|
| 387 |
audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
|
| 388 |
|
| 389 |
-
|
| 390 |
windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
|
| 391 |
-
|
| 392 |
feats = build_features(pre_path, windows)
|
| 393 |
|
| 394 |
fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
|
|
@@ -433,9 +468,9 @@ def main():
|
|
| 433 |
},
|
| 434 |
)
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
t0 = time.time()
|
| 440 |
# ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
|
| 441 |
# (which has a `provides=` kwarg typo that crashes on this version) and avoids
|
|
@@ -444,8 +479,8 @@ def main():
|
|
| 444 |
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
|
| 445 |
dt = time.time() - t0
|
| 446 |
if _SKIPPED["count"]:
|
| 447 |
-
|
| 448 |
-
|
| 449 |
|
| 450 |
# neural-compressor always writes the quantized weights to an external
|
| 451 |
# `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
|
|
@@ -462,12 +497,12 @@ def main():
|
|
| 462 |
if os.path.exists(sidecar):
|
| 463 |
os.remove(sidecar)
|
| 464 |
if pruned:
|
| 465 |
-
|
| 466 |
|
| 467 |
out_size = os.path.getsize(out_encoder)
|
| 468 |
baseline = model_dir / "encoder-model.int8.onnx"
|
| 469 |
base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
|
| 470 |
-
|
| 471 |
|
| 472 |
# Fidelity smoke test (NOT just shape): run one calibration window through both
|
| 473 |
# the fp32 reference and the new int8 encoder and compare the encoder outputs by
|
|
@@ -483,27 +518,26 @@ def main():
|
|
| 483 |
denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
|
| 484 |
cos = float(np.dot(out_q, out_f) / denom)
|
| 485 |
if cos < args.fidelity_warn:
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
file=sys.stderr)
|
| 490 |
else:
|
| 491 |
-
|
| 492 |
-
|
| 493 |
except Exception as e:
|
| 494 |
-
|
| 495 |
|
| 496 |
if not args.no_candidate:
|
| 497 |
cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
|
| 498 |
-
|
| 499 |
|
| 500 |
rel_cand = os.path.relpath(args.candidate_dir)
|
| 501 |
rel_model = os.path.relpath(model_dir)
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
|
| 508 |
|
| 509 |
if __name__ == "__main__":
|
|
|
|
| 10 |
# "prettytable",
|
| 11 |
# "psutil",
|
| 12 |
# "scipy",
|
| 13 |
+
# "loguru",
|
| 14 |
# ]
|
| 15 |
# ///
|
| 16 |
"""Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
|
|
|
|
| 86 |
import subprocess
|
| 87 |
import sys
|
| 88 |
import time
|
| 89 |
+
from datetime import datetime
|
| 90 |
from pathlib import Path
|
| 91 |
|
| 92 |
import numpy as np
|
| 93 |
import onnx
|
| 94 |
import onnxruntime as ort
|
| 95 |
+
from loguru import logger
|
| 96 |
from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
|
| 97 |
from onnx_neural_compressor import data_reader
|
| 98 |
from onnx_neural_compressor.quantization import config, quantize
|
| 99 |
from onnx_neural_compressor.algorithms.smoother import core as _sq_core
|
| 100 |
|
| 101 |
|
| 102 |
+
def configure_logging(log_file):
|
| 103 |
+
"""Send logs to stderr (human, colored) AND a local file (full, timestamped).
|
| 104 |
+
|
| 105 |
+
Called once at the start of main() so every message after arg-parsing lands
|
| 106 |
+
in both places. The file sink keeps a permanent record of an export run
|
| 107 |
+
(calibration windows, alpha, fidelity cosine, the wer-quants commands) so a
|
| 108 |
+
later A/B can be traced without re-running."""
|
| 109 |
+
logger.remove()
|
| 110 |
+
logger.add(
|
| 111 |
+
sys.stderr,
|
| 112 |
+
level="INFO",
|
| 113 |
+
format="<green>{time:HH:mm:ss}</green> | <level>{level: <7}</level> | <level>{message}</level>",
|
| 114 |
+
)
|
| 115 |
+
logger.add(
|
| 116 |
+
str(log_file),
|
| 117 |
+
level="DEBUG",
|
| 118 |
+
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <7} | {message}",
|
| 119 |
+
enqueue=True,
|
| 120 |
+
)
|
| 121 |
+
logger.info(f"[sq] logging to {log_file}")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
# --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
|
| 125 |
# The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
|
| 126 |
# with the in-channel LAST (there is a literal TODO admitting this in
|
|
|
|
| 199 |
def find_ffmpeg(explicit=None):
|
| 200 |
cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
|
| 201 |
if not cand or not shutil.which(cand) and not os.path.exists(cand):
|
| 202 |
+
logger.error("ffmpeg not found (set $FFMPEG or pass --ffmpeg).")
|
| 203 |
+
sys.exit(1)
|
| 204 |
return cand
|
| 205 |
|
| 206 |
|
|
|
|
| 236 |
windows.append(pcm[start:start + win])
|
| 237 |
start += win
|
| 238 |
count += 1
|
| 239 |
+
logger.info(f" [calib] {Path(p).name}: {n / SAMPLE_RATE:.0f}s -> {count} full window(s)")
|
| 240 |
if not windows:
|
| 241 |
+
logger.error(f"No calibration audio yielded a full {window_sec:g}s window. "
|
| 242 |
+
f"Drop clips in ./{DEFAULT_CALIB_DIR}/, pass --audio <file-or-folder>, "
|
| 243 |
+
"or lower --window-sec.")
|
| 244 |
+
sys.exit(1)
|
| 245 |
if len(windows) > num_windows:
|
| 246 |
# Even stride across the whole pool for speaker/content diversity.
|
| 247 |
idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
|
|
|
|
| 391 |
"measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
|
| 392 |
"floor sits well below that. A true collapse lands far lower.")
|
| 393 |
ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
|
| 394 |
+
ap.add_argument("--log-file", default=None,
|
| 395 |
+
help="path for the run log file (logs always also go to stderr). "
|
| 396 |
+
"Default: ./quantize-int8-smoothquant-<timestamp>.log")
|
| 397 |
args = ap.parse_args()
|
| 398 |
|
| 399 |
+
log_file = args.log_file or f"quantize-int8-smoothquant-{datetime.now():%Y%m%d-%H%M%S}.log"
|
| 400 |
+
configure_logging(log_file)
|
| 401 |
+
|
| 402 |
model_dir = Path(args.model_dir)
|
| 403 |
in_encoder = model_dir / "encoder-model.onnx"
|
| 404 |
pre_path = model_dir / "nemo128.onnx"
|
| 405 |
out_encoder = model_dir / args.out_name
|
| 406 |
for p in (in_encoder, pre_path):
|
| 407 |
if not p.exists():
|
| 408 |
+
logger.error(f"missing required file: {p}")
|
| 409 |
+
sys.exit(1)
|
| 410 |
|
| 411 |
# alpha is either the literal "auto" (per-layer search) or a fixed float.
|
| 412 |
if str(args.alpha).strip().lower() == "auto":
|
|
|
|
| 415 |
try:
|
| 416 |
alpha = float(args.alpha)
|
| 417 |
except ValueError:
|
| 418 |
+
logger.error(f"--alpha must be a float in 0..1 or 'auto', got {args.alpha!r}")
|
| 419 |
+
sys.exit(1)
|
| 420 |
|
| 421 |
ffmpeg = find_ffmpeg(args.ffmpeg)
|
| 422 |
audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
|
| 423 |
|
| 424 |
+
logger.info(f"[sq] calibration: up to {args.num_windows} x {args.window_sec:g}s windows")
|
| 425 |
windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
|
| 426 |
+
logger.info(f"[sq] using {len(windows)} calibration window(s); extracting mel features...")
|
| 427 |
feats = build_features(pre_path, windows)
|
| 428 |
|
| 429 |
fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
|
|
|
|
| 468 |
},
|
| 469 |
)
|
| 470 |
|
| 471 |
+
logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
|
| 472 |
+
f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
|
| 473 |
+
logger.info(f"[sq] {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
|
| 474 |
t0 = time.time()
|
| 475 |
# ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
|
| 476 |
# (which has a `provides=` kwarg typo that crashes on this version) and avoids
|
|
|
|
| 479 |
optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
|
| 480 |
dt = time.time() - t0
|
| 481 |
if _SKIPPED["count"]:
|
| 482 |
+
logger.info(f"[sq] note: {_SKIPPED['count']} node(s) had a layout SmoothQuant could not "
|
| 483 |
+
f"resolve and were left as plain static int8 (everything else was smoothed)")
|
| 484 |
|
| 485 |
# neural-compressor always writes the quantized weights to an external
|
| 486 |
# `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
|
|
|
|
| 497 |
if os.path.exists(sidecar):
|
| 498 |
os.remove(sidecar)
|
| 499 |
if pruned:
|
| 500 |
+
logger.info(f"[sq] pruned {pruned} orphaned initializer(s) (folded smooth scales)")
|
| 501 |
|
| 502 |
out_size = os.path.getsize(out_encoder)
|
| 503 |
baseline = model_dir / "encoder-model.int8.onnx"
|
| 504 |
base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
|
| 505 |
+
logger.info(f"[sq] done in {dt:.0f}s -> {out_encoder.name} {human(out_size)}{base_note}")
|
| 506 |
|
| 507 |
# Fidelity smoke test (NOT just shape): run one calibration window through both
|
| 508 |
# the fp32 reference and the new int8 encoder and compare the encoder outputs by
|
|
|
|
| 518 |
denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
|
| 519 |
cos = float(np.dot(out_q, out_f) / denom)
|
| 520 |
if cos < args.fidelity_warn:
|
| 521 |
+
logger.warning(f"[sq] encoder-output cosine vs fp32 is {cos:.4f} "
|
| 522 |
+
f"(< {args.fidelity_warn}). This export likely COLLAPSED; expect a near-100% "
|
| 523 |
+
f"WER. Try a different --calibrate-method/--alpha or keep more ops fp32.")
|
|
|
|
| 524 |
else:
|
| 525 |
+
logger.info(f"[sq] fidelity: encoder-output cosine vs fp32 = {cos:.4f} (>= "
|
| 526 |
+
f"{args.fidelity_warn}). Looks healthy.")
|
| 527 |
except Exception as e:
|
| 528 |
+
logger.warning(f"[sq] exported encoder failed the fidelity smoke test: {e}")
|
| 529 |
|
| 530 |
if not args.no_candidate:
|
| 531 |
cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
|
| 532 |
+
logger.info(f"[sq] candidate model dir (for wer-quants): {cand}")
|
| 533 |
|
| 534 |
rel_cand = os.path.relpath(args.candidate_dir)
|
| 535 |
rel_model = os.path.relpath(model_dir)
|
| 536 |
+
logger.info("Compare per-section degradation vs fp16 (wer-quants.py lives in the "
|
| 537 |
+
"parakeet_web repo: https://github.com/thiswillbeyourgithub/parakeet_web):")
|
| 538 |
+
logger.info(f" uv run scripts/wer-quants.py --model-dir {rel_cand} --quants int8")
|
| 539 |
+
logger.info(f" uv run scripts/wer-quants.py --model-dir {rel_model} --quants int8,fp16")
|
| 540 |
+
logger.info("A new-int8 per-section WER that tracks fp16 (instead of climbing) is the win.")
|
| 541 |
|
| 542 |
|
| 543 |
if __name__ == "__main__":
|