quantize-int8-smoothquant: route output through loguru + log to file

Replace all print() with loguru logger calls and add a file sink so every
run's logs (calibration windows, alpha, fidelity cosine, the wer-quants
commands) are kept in a local quantize-int8-smoothquant-<timestamp>.log in
addition to stderr. Adds a --log-file override and the loguru dependency.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (1) hide show

scripts/quantize-int8-smoothquant.py +63 -29

scripts/quantize-int8-smoothquant.py CHANGED Viewed

@@ -10,6 +10,7 @@
 #     "prettytable",
 #     "psutil",
 #     "scipy",
 # ]
 # ///
 """Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
@@ -85,17 +86,41 @@ import shutil
 import subprocess
 import sys
 import time
 from pathlib import Path
 import numpy as np
 import onnx
 import onnxruntime as ort
 from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
 from onnx_neural_compressor import data_reader
 from onnx_neural_compressor.quantization import config, quantize
 from onnx_neural_compressor.algorithms.smoother import core as _sq_core
 # --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
 # The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
 # with the in-channel LAST (there is a literal TODO admitting this in
@@ -174,7 +199,8 @@ def human(n):
 def find_ffmpeg(explicit=None):
     """Pick the ffmpeg executable to use, or abort the run if none is usable.
     The first truthy value wins: the `explicit` argument, then the $FFMPEG
     environment variable, then a PATH lookup of "ffmpeg". The chosen string
     is returned as-is. When no usable candidate exists the process is
     terminated through sys.exit() with an explanatory message."""
     cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
     # `and` binds tighter than `or`: reject when there is no candidate at all,
     # or when it is neither resolvable by shutil.which nor an existing path.
     if not cand or not shutil.which(cand) and not os.path.exists(cand):
-        sys.exit("ffmpeg not found (set $FFMPEG or pass --ffmpeg).")
     return cand
@@ -210,11 +236,12 @@ def collect_windows(ffmpeg, audio_paths, window_sec, num_windows):
             windows.append(pcm[start:start + win])
             start += win
             count += 1
-        print(f"  [calib] {Path(p).name}: {n / SAMPLE_RATE:.0f}s -> {count} full window(s)")
     if not windows:
-        sys.exit(f"No calibration audio yielded a full {window_sec:g}s window. "
-                 f"Drop clips in ./{DEFAULT_CALIB_DIR}/, pass --audio <file-or-folder>, "
-                 "or lower --window-sec.")
     if len(windows) > num_windows:
         # Even stride across the whole pool for speaker/content diversity.
         idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
@@ -364,15 +391,22 @@ def main():
                          "measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
                          "floor sits well below that. A true collapse lands far lower.")
     ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
     args = ap.parse_args()
     model_dir = Path(args.model_dir)
     in_encoder = model_dir / "encoder-model.onnx"
     pre_path = model_dir / "nemo128.onnx"
     out_encoder = model_dir / args.out_name
     for p in (in_encoder, pre_path):
         if not p.exists():
-            sys.exit(f"missing required file: {p}")
     # alpha is either the literal "auto" (per-layer search) or a fixed float.
     if str(args.alpha).strip().lower() == "auto":
@@ -381,14 +415,15 @@ def main():
         try:
             alpha = float(args.alpha)
         except ValueError:
-            sys.exit(f"--alpha must be a float in 0..1 or 'auto', got {args.alpha!r}")
     ffmpeg = find_ffmpeg(args.ffmpeg)
     audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
-    print(f"[sq] calibration: up to {args.num_windows} x {args.window_sec:g}s windows")
     windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
-    print(f"[sq] using {len(windows)} calibration window(s); extracting mel features...")
     feats = build_features(pre_path, windows)
     fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
@@ -433,9 +468,9 @@ def main():
         },
     )
-    print(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
-          f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
-    print(f"[sq]   {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
     t0 = time.time()
     # ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
     # (which has a `provides=` kwarg typo that crashes on this version) and avoids
@@ -444,8 +479,8 @@ def main():
              optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
     dt = time.time() - t0
     if _SKIPPED["count"]:
-        print(f"[sq] note: {_SKIPPED['count']} node(s) had a layout SmoothQuant could not "
-              f"resolve and were left as plain static int8 (everything else was smoothed)")
     # neural-compressor always writes the quantized weights to an external
     # `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
@@ -462,12 +497,12 @@ def main():
     if os.path.exists(sidecar):
         os.remove(sidecar)
     if pruned:
-        print(f"[sq] pruned {pruned} orphaned initializer(s) (folded smooth scales)")
     out_size = os.path.getsize(out_encoder)
     baseline = model_dir / "encoder-model.int8.onnx"
     base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
-    print(f"[sq] done in {dt:.0f}s -> {out_encoder.name} {human(out_size)}{base_note}")
     # Fidelity smoke test (NOT just shape): run one calibration window through both
     # the fp32 reference and the new int8 encoder and compare the encoder outputs by
@@ -483,27 +518,26 @@ def main():
         denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
         cos = float(np.dot(out_q, out_f) / denom)
         if cos < args.fidelity_warn:
-            print(f"[sq] WARNING: encoder-output cosine vs fp32 is {cos:.4f} "
-                  f"(< {args.fidelity_warn}). This export likely COLLAPSED; expect a near-100% "
-                  f"WER. Try a different --calibrate-method/--alpha or keep more ops fp32.",
-                  file=sys.stderr)
         else:
-            print(f"[sq] fidelity: encoder-output cosine vs fp32 = {cos:.4f} (>= "
-                  f"{args.fidelity_warn}). Looks healthy.")
     except Exception as e:
-        print(f"[sq] WARNING: exported encoder failed the fidelity smoke test: {e}", file=sys.stderr)
     if not args.no_candidate:
         cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
-        print(f"[sq] candidate model dir (for wer-quants): {cand}")
     rel_cand = os.path.relpath(args.candidate_dir)
     rel_model = os.path.relpath(model_dir)
-    print("\nCompare per-section degradation vs fp16 (wer-quants.py lives in the "
-          "parakeet_web repo: https://github.com/thiswillbeyourgithub/parakeet_web):")
-    print(f"  uv run scripts/wer-quants.py --model-dir {rel_cand} --quants int8")
-    print(f"  uv run scripts/wer-quants.py --model-dir {rel_model} --quants int8,fp16")
-    print("A new-int8 per-section WER that tracks fp16 (instead of climbing) is the win.")
 if __name__ == "__main__":

 #     "prettytable",
 #     "psutil",
 #     "scipy",
+#     "loguru",
 # ]
 # ///
 """Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
 import subprocess
 import sys
 import time
+from datetime import datetime
 from pathlib import Path
 import numpy as np
 import onnx
 import onnxruntime as ort
+from loguru import logger
 from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
 from onnx_neural_compressor import data_reader
 from onnx_neural_compressor.quantization import config, quantize
 from onnx_neural_compressor.algorithms.smoother import core as _sq_core
+def configure_logging(log_file):
+    """Send logs to stderr (human, colored) AND a local file (full, timestamped).
+    Called once at the start of main() so every message after arg-parsing lands
+    in both places. The file sink keeps a permanent record of an export run
+    (calibration windows, alpha, fidelity cosine, the wer-quants commands) so a
+    later A/B can be traced without re-running."""
+    logger.remove()
+    logger.add(
+        sys.stderr,
+        level="INFO",
+        format="<green>{time:HH:mm:ss}</green> | <level>{level: <7}</level> | <level>{message}</level>",
+    )
+    logger.add(
+        str(log_file),
+        level="DEBUG",
+        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <7} | {message}",
+        enqueue=True,
+    )
+    logger.info(f"[sq] logging to {log_file}")
 # --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
 # The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
 # with the in-channel LAST (there is a literal TODO admitting this in
 def find_ffmpeg(explicit=None):
     """Pick the ffmpeg executable to use, or abort the run if none is usable.
     The first truthy value wins: the `explicit` argument, then the $FFMPEG
     environment variable, then a PATH lookup of "ffmpeg". The chosen string
     is returned as-is. When no usable candidate exists an error is logged
     and the process exits with status 1."""
     cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
     # `and` binds tighter than `or`: reject when there is no candidate at all,
     # or when it is neither resolvable by shutil.which nor an existing path.
     if not cand or not shutil.which(cand) and not os.path.exists(cand):
+        logger.error("ffmpeg not found (set $FFMPEG or pass --ffmpeg).")
+        sys.exit(1)
     return cand
             windows.append(pcm[start:start + win])
             start += win
             count += 1
+        logger.info(f"  [calib] {Path(p).name}: {n / SAMPLE_RATE:.0f}s -> {count} full window(s)")
     if not windows:
+        logger.error(f"No calibration audio yielded a full {window_sec:g}s window. "
+                     f"Drop clips in ./{DEFAULT_CALIB_DIR}/, pass --audio <file-or-folder>, "
+                     "or lower --window-sec.")
+        sys.exit(1)
     if len(windows) > num_windows:
         # Even stride across the whole pool for speaker/content diversity.
         idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
                          "measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
                          "floor sits well below that. A true collapse lands far lower.")
     ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
+    ap.add_argument("--log-file", default=None,
+                    help="path for the run log file (logs always also go to stderr). "
+                         "Default: ./quantize-int8-smoothquant-<timestamp>.log")
     args = ap.parse_args()
+    log_file = args.log_file or f"quantize-int8-smoothquant-{datetime.now():%Y%m%d-%H%M%S}.log"
+    configure_logging(log_file)
     model_dir = Path(args.model_dir)
     in_encoder = model_dir / "encoder-model.onnx"
     pre_path = model_dir / "nemo128.onnx"
     out_encoder = model_dir / args.out_name
     for p in (in_encoder, pre_path):
         if not p.exists():
+            logger.error(f"missing required file: {p}")
+            sys.exit(1)
     # alpha is either the literal "auto" (per-layer search) or a fixed float.
     if str(args.alpha).strip().lower() == "auto":
         try:
             alpha = float(args.alpha)
         except ValueError:
+            logger.error(f"--alpha must be a float in 0..1 or 'auto', got {args.alpha!r}")
+            sys.exit(1)
     ffmpeg = find_ffmpeg(args.ffmpeg)
     audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
+    logger.info(f"[sq] calibration: up to {args.num_windows} x {args.window_sec:g}s windows")
     windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
+    logger.info(f"[sq] using {len(windows)} calibration window(s); extracting mel features...")
     feats = build_features(pre_path, windows)
     fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
         },
     )
+    logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
+                f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
+    logger.info(f"[sq]   {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
     t0 = time.time()
     # ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
     # (which has a `provides=` kwarg typo that crashes on this version) and avoids
              optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
     dt = time.time() - t0
     if _SKIPPED["count"]:
+        logger.info(f"[sq] note: {_SKIPPED['count']} node(s) had a layout SmoothQuant could not "
+                    f"resolve and were left as plain static int8 (everything else was smoothed)")
     # neural-compressor always writes the quantized weights to an external
     # `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
     if os.path.exists(sidecar):
         os.remove(sidecar)
     if pruned:
+        logger.info(f"[sq] pruned {pruned} orphaned initializer(s) (folded smooth scales)")
     out_size = os.path.getsize(out_encoder)
     baseline = model_dir / "encoder-model.int8.onnx"
     base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
+    logger.info(f"[sq] done in {dt:.0f}s -> {out_encoder.name} {human(out_size)}{base_note}")
     # Fidelity smoke test (NOT just shape): run one calibration window through both
     # the fp32 reference and the new int8 encoder and compare the encoder outputs by
         denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
         cos = float(np.dot(out_q, out_f) / denom)
         if cos < args.fidelity_warn:
+            logger.warning(f"[sq] encoder-output cosine vs fp32 is {cos:.4f} "
+                           f"(< {args.fidelity_warn}). This export likely COLLAPSED; expect a near-100% "
+                           f"WER. Try a different --calibrate-method/--alpha or keep more ops fp32.")
         else:
+            logger.info(f"[sq] fidelity: encoder-output cosine vs fp32 = {cos:.4f} (>= "
+                        f"{args.fidelity_warn}). Looks healthy.")
     except Exception as e:
+        logger.warning(f"[sq] exported encoder failed the fidelity smoke test: {e}")
     if not args.no_candidate:
         cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
+        logger.info(f"[sq] candidate model dir (for wer-quants): {cand}")
     rel_cand = os.path.relpath(args.candidate_dir)
     rel_model = os.path.relpath(model_dir)
+    logger.info("Compare per-section degradation vs fp16 (wer-quants.py lives in the "
+                "parakeet_web repo: https://github.com/thiswillbeyourgithub/parakeet_web):")
+    logger.info(f"  uv run scripts/wer-quants.py --model-dir {rel_cand} --quants int8")
+    logger.info(f"  uv run scripts/wer-quants.py --model-dir {rel_model} --quants int8,fp16")
+    logger.info("A new-int8 per-section WER that tracks fp16 (instead of climbing) is the win.")
 if __name__ == "__main__":