thiswillbeyourgithub
scripts: pin dependency versions in PEP 723 headers for reproducibility
501cdef
Raw
History Blame Contribute Delete
8.17 kB
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "onnx==1.21.0",
# "onnxruntime==1.26.0",
# "sympy==1.14.0", # onnxruntime.transformers.onnx_model imports it at module load
# ]
# ///
"""Convert the fp32 Parakeet ONNX pieces to float16, to land under the WASM /
Chromium ~2 GB blob limits without the heavy accuracy loss of int8.
Why fp16 (see CLAUDE.md for the full reasoning): the fp32 encoder is ~2.44 GB
of external weights, which cannot load on the WASM backend (32-bit WASM caps a
single ArrayBuffer at ~2 GB and Chromium's blob-URL fetch caps around 2 GB
too). int8 (~600 MB) fits but degrades quality. fp16 halves the fp32 weights to
~1.2 GB: under both caps, and near-lossless versus fp32. This script produces
that fp16 variant from the local fp32 files so it can be benchmarked before
deciding whether to ship it.
It converts the two pieces that matter:
- encoder-model.onnx (+ encoder-model.onnx.data) -> encoder-model.fp16.onnx
- decoder_joint-model.onnx -> decoder_joint-model.fp16.onnx
nemo128.onnx (the ONNX preprocessor) is intentionally skipped: the web app and
scripts/transcribe.mjs use the pure-JS mel preprocessor (mel.js), so the ONNX
preprocessor is never loaded.
Useful reference: https://huggingface.co/grikdotnet/parakeet-tdt-0.6b-fp16
documents the same conversion (same pieces, same keep_io_types=True /
disable_shape_infer=True settings). It uses onnxconverter_common.float16 plus a
separate post-processing pass to rewrite leftover internal Cast(to=FLOAT) nodes
to Cast(to=FLOAT16). We instead use onnxruntime.transformers.float16, the
evolved fork of that same converter, which handles those internal casts itself,
so no separate cast-fixing pass is needed (a topological_sort below is enough).
keep_io_types=True is deliberate and load-bearing: the encoder/decoder graphs
take and return float32 tensors (audio_signal, outputs, encoder_outputs, and
the decoder's LSTM input_states_*/output_states_*). Keeping the I/O boundary at
float32 means the JS pipeline (parakeet.js) feeds and reads exactly the same
dtypes as for the fp32/int8 models, so NOTHING in the JS side needs to change;
only the weights and internal compute become fp16.
Usage (run from this model-repo root; the .onnx files are read from . by default):
    uv run scripts/quantize-fp16.py                                # write the fp16 files next to the fp32 ones
    uv run scripts/quantize-fp16.py --model-dir DIR --out-dir DIR  # read from / write to other directories
    uv run scripts/quantize-fp16.py --external-data                # force .onnx.data sidecar
Self-contained via the PEP 723 header above (onnxruntime provides
onnxruntime.transformers.float16), so `uv run` installs the deps on the fly.
Built with Claude Code.
"""
import argparse
import os
import sys
import time
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16
from onnxruntime.transformers.onnx_model import OnnxModel
# Model pieces to convert, as (input fp32 file, output fp16 file) pairs; the
# output name is the input stem with ".fp16" inserted before ".onnx". Only the
# encoder carries external weights.
PIECES = [
    (f"{stem}.onnx", f"{stem}.fp16.onnx")
    for stem in ("encoder-model", "decoder_joint-model")
]
# A single serialized protobuf cannot exceed 2 GB. The fp16 encoder (~1.2 GB)
# normally fits inline, but convert_one() keeps a safety margin below this
# limit and switches to an external-data sidecar when the model gets close
# (scripts/transcribe.mjs createSession() already resolves such a sidecar via
# its "<model>.data" probe).
TWO_GB = 2 * (1 << 30)
def human(n):
    """Format the byte count ``n`` with one decimal and a B/KB/MB/GB unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (GB) is reached, so anything above GB is still reported in GB.
    """
    units = ("B", "KB", "MB", "GB")
    last = len(units) - 1
    idx = 0
    # Step up one unit at a time; stop at GB even if the value is still large.
    while idx < last and not n < 1024:
        n /= 1024
        idx += 1
    return f"{n:.1f} {units[idx]}"
def file_size(path):
    """Return the size in bytes of ``path`` plus its ``.data`` sidecar, if any.

    The sidecar is looked up at ``path + ".data"``; when it does not exist only
    the size of ``path`` itself is returned.
    """
    sizes = [os.path.getsize(path)]
    sidecar = path + ".data"
    if os.path.exists(sidecar):
        sizes.append(os.path.getsize(sidecar))
    return sum(sizes)
def convert_one(in_path, out_path, force_external, op_block_list):
    """Convert one fp32 ONNX model file to float16 and write it to ``out_path``.

    Args:
        in_path: path of the fp32 ``.onnx`` file; its external weights are
            loaded into memory together with the graph.
        out_path: path of the fp16 ``.onnx`` file to write. When external data
            is used, the weights go to ``out_path + ".data"`` next to it.
        force_external: when true, always write the weights to the sidecar
            instead of inlining them in the ``.onnx`` file.
        op_block_list: ONNX op types to keep in fp32; an empty value lets the
            converter use its built-in list.

    Returns:
        ``(in_size, out_size)``: on-disk sizes in bytes of the input and
        output models, each including its ``.data`` sidecar when present.

    Raises:
        FileNotFoundError: if ``in_path`` does not exist.
    """
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"missing input model: {in_path}")

    in_size = file_size(in_path)
    print(f"[fp16] {os.path.basename(in_path)} ({human(in_size)}) -> "
          f"{os.path.basename(out_path)}")

    # load_external_data=True (default) pulls the sibling .onnx.data into memory
    # so the converter sees real tensors. This needs ~the fp32 model's size in
    # RAM for the encoder (~2.4 GB); that is the price of an in-memory convert.
    t0 = time.time()
    model = onnx.load(in_path, load_external_data=True)

    # disable_shape_infer=True: onnx shape inference serialises the model to run,
    # which would hit the 2 GB protobuf limit on the fp32 encoder. keep_io_types
    # pins the float32 boundary so the converter still inserts the right casts.
    fp16_model = convert_float_to_float16(
        model,
        keep_io_types=True,
        disable_shape_infer=True,
        op_block_list=op_block_list if op_block_list else None,
    )

    # keep_io_types=True prepends graph_input_cast_* / appends graph_output_cast_*
    # nodes but does NOT re-sort the graph, leaving it not topologically sorted.
    # onnx.checker rejects that and ORT-web fails to build the session (it
    # surfaced as a std::bad_alloc). A topological sort fixes the node order.
    OnnxModel(fp16_model).topological_sort()
    convert_s = time.time() - t0

    # Estimate serialized size to choose inline vs external. ByteSize() is exact
    # but can itself overflow near 2 GB, so guard it.
    try:
        approx = fp16_model.ByteSize()
        big = approx >= TWO_GB - (64 * 1024 ** 2)  # 64 MB safety margin
    except (ValueError, OverflowError):
        big = True
    use_external = force_external or big

    # Always drop a sidecar left over from a previous run before saving:
    # - inline save: a stale sidecar would be silently reused by ORT;
    # - external save: onnx appends tensors to an existing external-data file
    #   instead of truncating it, so the sidecar would grow on every re-run
    #   and inflate the size reported (and checked against 2 GB) afterwards.
    sidecar = out_path + ".data"
    try:
        os.remove(sidecar)
    except FileNotFoundError:
        pass

    if use_external:
        onnx.save(
            fp16_model, out_path,
            save_as_external_data=True,
            all_tensors_to_one_file=True,
            location=os.path.basename(sidecar),
            convert_attribute=False,
        )
    else:
        onnx.save(fp16_model, out_path)

    out_size = file_size(out_path)
    print(f" converted in {convert_s:.1f}s, "
          f"{'external' if use_external else 'inline'} -> {human(out_size)} "
          f"({100 * out_size / in_size:.0f}% of fp32)")
    return in_size, out_size
def main():
    """Command-line entry point: parse the options and convert every piece.

    Runs convert_one() on each (input, output) pair in PIECES, prints the
    combined fp32 -> fp16 size, and warns on stderr when the fp16 encoder
    (including its sidecar) is still at or above 2 GB.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--model-dir",
        default=".",
        help="directory holding the fp32 .onnx files (default: current directory)",
    )
    parser.add_argument(
        "--out-dir",
        default=None,
        help="output directory (default: same as --model-dir)",
    )
    parser.add_argument(
        "--external-data",
        action="store_true",
        help="always write weights to a .onnx.data sidecar (default: inline when it fits under 2 GB)",
    )
    parser.add_argument(
        "--op-block-list",
        default="",
        help="comma-separated ONNX op types to keep in fp32 (default: the converter's built-in list)",
    )
    opts = parser.parse_args()

    src_dir = opts.model_dir
    dst_dir = opts.out_dir or src_dir
    os.makedirs(dst_dir, exist_ok=True)

    # Split the comma-separated list, dropping blanks and surrounding spaces.
    blocked_ops = []
    for token in opts.op_block_list.split(","):
        token = token.strip()
        if token:
            blocked_ops.append(token)

    sizes = []
    for src_name, dst_name in PIECES:
        src_path = os.path.join(src_dir, src_name)
        dst_path = os.path.join(dst_dir, dst_name)
        in_size, out_size = convert_one(
            src_path, dst_path, opts.external_data, blocked_ops
        )
        sizes.append((in_size, out_size))

    total_in = sum(pair[0] for pair in sizes)
    total_out = sum(pair[1] for pair in sizes)
    percent = 100 * total_out / total_in
    print(f"[fp16] done: {human(total_in)} fp32 -> {human(total_out)} fp16 "
          f"({percent:.0f}%). vocab.txt is reused as-is.")

    # Only the encoder is checked against the 2 GB cap.
    enc_out = file_size(os.path.join(dst_dir, "encoder-model.fp16.onnx"))
    if enc_out < TWO_GB:
        return
    print(f"[fp16] WARNING: fp16 encoder is {human(enc_out)}, still >= 2 GB; "
          f"it will NOT load on the WASM backend.", file=sys.stderr)
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()