thiswillbeyourgithub Claude Opus 4.8 committed on
Commit
fa7c830
·
1 Parent(s): 7d8c10c

quantize-int8-smoothquant: route output through loguru + log to file

Browse files

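Replace all print() calls with loguru logger calls and add a file sink so every
run's logs (calibration windows, alpha, fidelity cosine, the wer-quants
commands) are kept in a local quantize-int8-smoothquant-<timestamp>.log in
addition to stderr. Fatal sys.exit(<message>) paths now log the message at
ERROR level and then exit with status 1, so failures are recorded in the log
file too. Adds a --log-file override and the loguru dependency.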

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (1) hide show
  1. scripts/quantize-int8-smoothquant.py +63 -29
scripts/quantize-int8-smoothquant.py CHANGED
@@ -10,6 +10,7 @@
10
  # "prettytable",
11
  # "psutil",
12
  # "scipy",
 
13
  # ]
14
  # ///
15
  """Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
@@ -85,17 +86,41 @@ import shutil
85
  import subprocess
86
  import sys
87
  import time
 
88
  from pathlib import Path
89
 
90
  import numpy as np
91
  import onnx
92
  import onnxruntime as ort
 
93
  from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
94
  from onnx_neural_compressor import data_reader
95
  from onnx_neural_compressor.quantization import config, quantize
96
  from onnx_neural_compressor.algorithms.smoother import core as _sq_core
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  # --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
100
  # The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
101
  # with the in-channel LAST (there is a literal TODO admitting this in
@@ -174,7 +199,8 @@ def human(n):
174
  def find_ffmpeg(explicit=None):
175
  cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
176
  if not cand or not shutil.which(cand) and not os.path.exists(cand):
177
- sys.exit("ffmpeg not found (set $FFMPEG or pass --ffmpeg).")
 
178
  return cand
179
 
180
 
@@ -210,11 +236,12 @@ def collect_windows(ffmpeg, audio_paths, window_sec, num_windows):
210
  windows.append(pcm[start:start + win])
211
  start += win
212
  count += 1
213
- print(f" [calib] {Path(p).name}: {n / SAMPLE_RATE:.0f}s -> {count} full window(s)")
214
  if not windows:
215
- sys.exit(f"No calibration audio yielded a full {window_sec:g}s window. "
216
- f"Drop clips in ./{DEFAULT_CALIB_DIR}/, pass --audio <file-or-folder>, "
217
- "or lower --window-sec.")
 
218
  if len(windows) > num_windows:
219
  # Even stride across the whole pool for speaker/content diversity.
220
  idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
@@ -364,15 +391,22 @@ def main():
364
  "measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
365
  "floor sits well below that. A true collapse lands far lower.")
366
  ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
 
 
 
367
  args = ap.parse_args()
368
 
 
 
 
369
  model_dir = Path(args.model_dir)
370
  in_encoder = model_dir / "encoder-model.onnx"
371
  pre_path = model_dir / "nemo128.onnx"
372
  out_encoder = model_dir / args.out_name
373
  for p in (in_encoder, pre_path):
374
  if not p.exists():
375
- sys.exit(f"missing required file: {p}")
 
376
 
377
  # alpha is either the literal "auto" (per-layer search) or a fixed float.
378
  if str(args.alpha).strip().lower() == "auto":
@@ -381,14 +415,15 @@ def main():
381
  try:
382
  alpha = float(args.alpha)
383
  except ValueError:
384
- sys.exit(f"--alpha must be a float in 0..1 or 'auto', got {args.alpha!r}")
 
385
 
386
  ffmpeg = find_ffmpeg(args.ffmpeg)
387
  audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
388
 
389
- print(f"[sq] calibration: up to {args.num_windows} x {args.window_sec:g}s windows")
390
  windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
391
- print(f"[sq] using {len(windows)} calibration window(s); extracting mel features...")
392
  feats = build_features(pre_path, windows)
393
 
394
  fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
@@ -433,9 +468,9 @@ def main():
433
  },
434
  )
435
 
436
- print(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
437
- f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
438
- print(f"[sq] {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
439
  t0 = time.time()
440
  # ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
441
  # (which has a `provides=` kwarg typo that crashes on this version) and avoids
@@ -444,8 +479,8 @@ def main():
444
  optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
445
  dt = time.time() - t0
446
  if _SKIPPED["count"]:
447
- print(f"[sq] note: {_SKIPPED['count']} node(s) had a layout SmoothQuant could not "
448
- f"resolve and were left as plain static int8 (everything else was smoothed)")
449
 
450
  # neural-compressor always writes the quantized weights to an external
451
  # `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
@@ -462,12 +497,12 @@ def main():
462
  if os.path.exists(sidecar):
463
  os.remove(sidecar)
464
  if pruned:
465
- print(f"[sq] pruned {pruned} orphaned initializer(s) (folded smooth scales)")
466
 
467
  out_size = os.path.getsize(out_encoder)
468
  baseline = model_dir / "encoder-model.int8.onnx"
469
  base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
470
- print(f"[sq] done in {dt:.0f}s -> {out_encoder.name} {human(out_size)}{base_note}")
471
 
472
  # Fidelity smoke test (NOT just shape): run one calibration window through both
473
  # the fp32 reference and the new int8 encoder and compare the encoder outputs by
@@ -483,27 +518,26 @@ def main():
483
  denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
484
  cos = float(np.dot(out_q, out_f) / denom)
485
  if cos < args.fidelity_warn:
486
- print(f"[sq] WARNING: encoder-output cosine vs fp32 is {cos:.4f} "
487
- f"(< {args.fidelity_warn}). This export likely COLLAPSED; expect a near-100% "
488
- f"WER. Try a different --calibrate-method/--alpha or keep more ops fp32.",
489
- file=sys.stderr)
490
  else:
491
- print(f"[sq] fidelity: encoder-output cosine vs fp32 = {cos:.4f} (>= "
492
- f"{args.fidelity_warn}). Looks healthy.")
493
  except Exception as e:
494
- print(f"[sq] WARNING: exported encoder failed the fidelity smoke test: {e}", file=sys.stderr)
495
 
496
  if not args.no_candidate:
497
  cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
498
- print(f"[sq] candidate model dir (for wer-quants): {cand}")
499
 
500
  rel_cand = os.path.relpath(args.candidate_dir)
501
  rel_model = os.path.relpath(model_dir)
502
- print("\nCompare per-section degradation vs fp16 (wer-quants.py lives in the "
503
- "parakeet_web repo: https://github.com/thiswillbeyourgithub/parakeet_web):")
504
- print(f" uv run scripts/wer-quants.py --model-dir {rel_cand} --quants int8")
505
- print(f" uv run scripts/wer-quants.py --model-dir {rel_model} --quants int8,fp16")
506
- print("A new-int8 per-section WER that tracks fp16 (instead of climbing) is the win.")
507
 
508
 
509
  if __name__ == "__main__":
 
10
  # "prettytable",
11
  # "psutil",
12
  # "scipy",
13
+ # "loguru",
14
  # ]
15
  # ///
16
  """Export a *better* int8 Parakeet encoder using SmoothQuant static quantization.
 
86
  import subprocess
87
  import sys
88
  import time
89
+ from datetime import datetime
90
  from pathlib import Path
91
 
92
  import numpy as np
93
  import onnx
94
  import onnxruntime as ort
95
+ from loguru import logger
96
  from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType
97
  from onnx_neural_compressor import data_reader
98
  from onnx_neural_compressor.quantization import config, quantize
99
  from onnx_neural_compressor.algorithms.smoother import core as _sq_core
100
 
101
 
102
+ def configure_logging(log_file):
103
+ """Send logs to stderr (human, colored) AND a local file (full, timestamped).
104
+
105
+ Called once at the start of main() so every message after arg-parsing lands
106
+ in both places. The file sink keeps a permanent record of an export run
107
+ (calibration windows, alpha, fidelity cosine, the wer-quants commands) so a
108
+ later A/B can be traced without re-running."""
109
+ logger.remove()
110
+ logger.add(
111
+ sys.stderr,
112
+ level="INFO",
113
+ format="<green>{time:HH:mm:ss}</green> | <level>{level: <7}</level> | <level>{message}</level>",
114
+ )
115
+ logger.add(
116
+ str(log_file),
117
+ level="DEBUG",
118
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level: <7} | {message}",
119
+ enqueue=True,
120
+ )
121
+ logger.info(f"[sq] logging to {log_file}")
122
+
123
+
124
  # --- FastConformer compatibility shim for onnx-neural-compressor's SmoothQuant -
125
  # The library's smoother hard-assumes a 3D activation is (batch, seq, in_channel)
126
  # with the in-channel LAST (there is a literal TODO admitting this in
 
199
  def find_ffmpeg(explicit=None):
200
  cand = explicit or os.environ.get("FFMPEG") or shutil.which("ffmpeg")
201
  if not cand or not shutil.which(cand) and not os.path.exists(cand):
202
+ logger.error("ffmpeg not found (set $FFMPEG or pass --ffmpeg).")
203
+ sys.exit(1)
204
  return cand
205
 
206
 
 
236
  windows.append(pcm[start:start + win])
237
  start += win
238
  count += 1
239
+ logger.info(f" [calib] {Path(p).name}: {n / SAMPLE_RATE:.0f}s -> {count} full window(s)")
240
  if not windows:
241
+ logger.error(f"No calibration audio yielded a full {window_sec:g}s window. "
242
+ f"Drop clips in ./{DEFAULT_CALIB_DIR}/, pass --audio <file-or-folder>, "
243
+ "or lower --window-sec.")
244
+ sys.exit(1)
245
  if len(windows) > num_windows:
246
  # Even stride across the whole pool for speaker/content diversity.
247
  idx = np.linspace(0, len(windows) - 1, num_windows).round().astype(int)
 
391
  "measured ~0.96 cosine yet tracked fp16 WER (10.9%% vs 10.2%%), so the "
392
  "floor sits well below that. A true collapse lands far lower.")
393
  ap.add_argument("--ffmpeg", default=None, help="ffmpeg binary (else $FFMPEG / PATH)")
394
+ ap.add_argument("--log-file", default=None,
395
+ help="path for the run log file (logs always also go to stderr). "
396
+ "Default: ./quantize-int8-smoothquant-<timestamp>.log")
397
  args = ap.parse_args()
398
 
399
+ log_file = args.log_file or f"quantize-int8-smoothquant-{datetime.now():%Y%m%d-%H%M%S}.log"
400
+ configure_logging(log_file)
401
+
402
  model_dir = Path(args.model_dir)
403
  in_encoder = model_dir / "encoder-model.onnx"
404
  pre_path = model_dir / "nemo128.onnx"
405
  out_encoder = model_dir / args.out_name
406
  for p in (in_encoder, pre_path):
407
  if not p.exists():
408
+ logger.error(f"missing required file: {p}")
409
+ sys.exit(1)
410
 
411
  # alpha is either the literal "auto" (per-layer search) or a fixed float.
412
  if str(args.alpha).strip().lower() == "auto":
 
415
  try:
416
  alpha = float(args.alpha)
417
  except ValueError:
418
+ logger.error(f"--alpha must be a float in 0..1 or 'auto', got {args.alpha!r}")
419
+ sys.exit(1)
420
 
421
  ffmpeg = find_ffmpeg(args.ffmpeg)
422
  audio = expand_audio(args.audio or [DEFAULT_CALIB_DIR])
423
 
424
+ logger.info(f"[sq] calibration: up to {args.num_windows} x {args.window_sec:g}s windows")
425
  windows = collect_windows(ffmpeg, audio, args.window_sec, args.num_windows)
426
+ logger.info(f"[sq] using {len(windows)} calibration window(s); extracting mel features...")
427
  feats = build_features(pre_path, windows)
428
 
429
  fmt = QuantFormat.QOperator if args.quant_format == "qoperator" else QuantFormat.QDQ
 
468
  },
469
  )
470
 
471
+ logger.info(f"[sq] SmoothQuant(alpha={alpha}) static int8, per-channel, "
472
+ f"calib={args.calibrate_method}, ops={op_types}, format={args.quant_format} ...")
473
+ logger.info(f"[sq] {human(os.path.getsize(in_encoder) + os.path.getsize(str(in_encoder) + '.data'))} fp32 encoder")
474
  t0 = time.time()
475
  # ORT_DISABLE_ALL skips neural-compressor's pre-optimization InferenceSession
476
  # (which has a `provides=` kwarg typo that crashes on this version) and avoids
 
479
  optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
480
  dt = time.time() - t0
481
  if _SKIPPED["count"]:
482
+ logger.info(f"[sq] note: {_SKIPPED['count']} node(s) had a layout SmoothQuant could not "
483
+ f"resolve and were left as plain static int8 (everything else was smoothed)")
484
 
485
  # neural-compressor always writes the quantized weights to an external
486
  # `<name>_data` sidecar for a model this size, ignoring use_external_data_format.
 
497
  if os.path.exists(sidecar):
498
  os.remove(sidecar)
499
  if pruned:
500
+ logger.info(f"[sq] pruned {pruned} orphaned initializer(s) (folded smooth scales)")
501
 
502
  out_size = os.path.getsize(out_encoder)
503
  baseline = model_dir / "encoder-model.int8.onnx"
504
  base_note = f" (istupakov int8 is {human(os.path.getsize(baseline))})" if baseline.exists() else ""
505
+ logger.info(f"[sq] done in {dt:.0f}s -> {out_encoder.name} {human(out_size)}{base_note}")
506
 
507
  # Fidelity smoke test (NOT just shape): run one calibration window through both
508
  # the fp32 reference and the new int8 encoder and compare the encoder outputs by
 
518
  denom = (np.linalg.norm(out_q) * np.linalg.norm(out_f)) or 1.0
519
  cos = float(np.dot(out_q, out_f) / denom)
520
  if cos < args.fidelity_warn:
521
+ logger.warning(f"[sq] encoder-output cosine vs fp32 is {cos:.4f} "
522
+ f"(< {args.fidelity_warn}). This export likely COLLAPSED; expect a near-100% "
523
+ f"WER. Try a different --calibrate-method/--alpha or keep more ops fp32.")
 
524
  else:
525
+ logger.info(f"[sq] fidelity: encoder-output cosine vs fp32 = {cos:.4f} (>= "
526
+ f"{args.fidelity_warn}). Looks healthy.")
527
  except Exception as e:
528
+ logger.warning(f"[sq] exported encoder failed the fidelity smoke test: {e}")
529
 
530
  if not args.no_candidate:
531
  cand = build_candidate_dir(model_dir, out_encoder, args.candidate_dir)
532
+ logger.info(f"[sq] candidate model dir (for wer-quants): {cand}")
533
 
534
  rel_cand = os.path.relpath(args.candidate_dir)
535
  rel_model = os.path.relpath(model_dir)
536
+ logger.info("Compare per-section degradation vs fp16 (wer-quants.py lives in the "
537
+ "parakeet_web repo: https://github.com/thiswillbeyourgithub/parakeet_web):")
538
+ logger.info(f" uv run scripts/wer-quants.py --model-dir {rel_cand} --quants int8")
539
+ logger.info(f" uv run scripts/wer-quants.py --model-dir {rel_model} --quants int8,fp16")
540
+ logger.info("A new-int8 per-section WER that tracks fp16 (instead of climbing) is the win.")
541
 
542
 
543
  if __name__ == "__main__":