| |
| |
| |
| |
| |
| """Shard the fp32 Parakeet encoder's external weights into <2 GB pieces so the |
| fp32 encoder can load on the WASM backend / in-browser. |
| |
| This script lives in the model repo's scripts/ folder |
| (parakeet-tdt-0.6b-v3-smoothquant-onnx/scripts) alongside |
| quantize-int8-smoothquant.py and quantize-fp16.py, and reads the fp32 encoder |
| from the current directory by default, so run it from the model-repo root. It is |
| self-contained (PEP 723 header above), so `uv run scripts/shard-fp32.py` installs |
| onnx on the fly. The model repo ships pre-built shards under sharded/ for browsers; |
| this script is included for provenance and to regenerate them. |
| |
| Why (see CLAUDE.md for the full reasoning): the fp32 encoder is ~2.4 GB held in |
| ONE encoder-model.onnx.data sidecar. That single file trips two *ingest* walls |
| that block WASM, and neither is a total-memory limit: |
| 1. a 32-bit WASM ArrayBuffer caps at ~2 GB (2^31-1), and |
| 2. Chromium's blob-URL fetch caps near 2 GB. |
| The wasm32 heap ceiling itself is ~4 GB, and fp32 (unlike fp16, which the CPU/WASM |
| EP upcasts to fp32 at session build) stays ~2.4 GB resident, so it *should* fit |
| once no single buffer exceeds 2 GB. This script rewrites the encoder's per-tensor |
| external_data locations to spread the initializers across N shard files, each under |
| a configurable byte budget (default 1.5 GB), producing: |
| |
| encoder-model.onnx (graph; tensors now point at the shards) |
| encoder-model.onnx.data.000 |
| encoder-model.onnx.data.001 |
| ... |
| |
| onnxruntime-node (native) resolves these from disk by the graph's location fields; |
| the WASM / browser loader mounts each shard as a separate externalData entry (each |
| < 2 GB), sidestepping both caps. No weights are altered: this is a pure repack, so |
| WER must be identical to the single-file fp32. That equality is the whole point of |
| the experiment (does fp32 hold up on a long chunk where int8 drops content), so the |
| script never touches tensor values, only where their bytes live. |
| |
| Usage (run from this model-repo root; the onnx files default to .): |
| uv run scripts/shard-fp32.py # . -> ./sharded |
| uv run scripts/shard-fp32.py --model-dir DIR --out-dir DIR |
| uv run scripts/shard-fp32.py --max-shard-bytes 1000000000 # smaller shards (lower transient load peak) |
| uv run scripts/shard-fp32.py --encoder encoder-model.onnx # non-default encoder name |
| |
| Built with Claude Code. |
| """ |
|
|
| import argparse |
| import os |
| import sys |
|
|
| import onnx |
| from onnx import TensorProto |
| from onnx.external_data_helper import set_external_data |
|
|
| |
| |
| |
| |
| |
# Default per-shard byte budget (1.5 GB). It sits below the ~2 GB single-buffer
# caps described in the module docstring; override with --max-shard-bytes.
DEFAULT_MAX_SHARD_BYTES = 1_500_000_000


# Initializers whose raw payload is smaller than this many bytes are left
# inline in the graph file instead of being moved into a shard.
INLINE_THRESHOLD_BYTES = 1024
|
|
|
|
def human(n):
    """Format a byte count as a short human-readable string.

    Uses 1024-based steps through B, KB, MB and GB. Plain bytes are printed
    without decimals; every larger unit gets one decimal place. Values beyond
    the GB range are still reported in GB.
    """
    value = float(n)
    if value < 1024:
        return f"{value:.0f} B"
    for unit in ("KB", "MB"):
        value /= 1024
        if value < 1024:
            return f"{value:.1f} {unit}"
    # GB is the largest unit, so whatever remains is expressed in it.
    return f"{value / 1024:.1f} GB"
|
|
|
|
def tensor_nbytes(t):
    """Return the size in bytes of a tensor's ``raw_data`` payload.

    A tensor whose ``raw_data`` field is not set reports 0, so the caller
    treats it as below the inline threshold and leaves it in the graph.
    """
    if not t.HasField("raw_data"):
        return 0
    return len(t.raw_data)
|
|
|
|
def shard_model(in_path, out_path, max_shard_bytes):
    """Repack a model's initializers into size-capped external shard files.

    The model at ``in_path`` is loaded together with its external data. Every
    initializer whose raw payload is at least ``INLINE_THRESHOLD_BYTES`` is
    appended to a shard file named ``<basename(out_path)>.data.NNN`` next to
    ``out_path``, and its external-data entry is rewritten to point at that
    shard (location, offset, length). Smaller initializers stay inline. The
    graph is then saved to ``out_path``. Tensor bytes are written verbatim.

    Args:
        in_path: Path of the source ``.onnx`` graph.
        out_path: Path of the graph to write; shards are created beside it.
        max_shard_bytes: Byte budget per shard. A tensor larger than the
            budget still gets a shard of its own, since tensors are never
            split.

    Returns:
        List of shard file paths, in the order they were created. The list is
        empty when no initializer was large enough to externalise.

    Raises:
        ValueError: ``max_shard_bytes`` is not positive.
        FileNotFoundError: ``in_path`` does not exist.
    """
    if max_shard_bytes <= 0:
        raise ValueError(f"max_shard_bytes must be positive, got {max_shard_bytes}")
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"missing input model: {in_path}")

    # Load the graph with all external bytes in memory so every initializer
    # carries its payload in raw_data before it is re-homed.
    print(f"[shard] loading {in_path} (+ external data) ...")
    model = onnx.load(in_path, load_external_data=True)

    out_dir = os.path.dirname(out_path) or "."
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.basename(out_path)

    shard_paths = []
    shard_file = None
    shard_loc = None
    shard_offset = 0
    inline_count = 0
    externalised = 0

    def open_next_shard():
        # Shards are numbered in creation order; the location stored in the
        # graph is the bare file name, relative to the graph's directory.
        loc = f"{base}.data.{len(shard_paths):03d}"
        path = os.path.join(out_dir, loc)
        handle = open(path, "wb")
        shard_paths.append(path)
        return handle, loc

    try:
        for t in model.graph.initializer:
            nbytes = tensor_nbytes(t)
            if nbytes < INLINE_THRESHOLD_BYTES:
                inline_count += 1
                continue

            # Open the first shard lazily (no empty shard when nothing is
            # externalised) and roll over once the current shard would exceed
            # the budget. An empty shard always accepts the tensor, so an
            # oversized tensor lands in a shard of its own.
            needs_new = shard_file is None or (
                shard_offset > 0 and shard_offset + nbytes > max_shard_bytes
            )
            if needs_new:
                if shard_file is not None:
                    shard_file.close()
                shard_file, shard_loc = open_next_shard()
                shard_offset = 0

            shard_file.write(t.raw_data)
            set_external_data(t, location=shard_loc, offset=shard_offset, length=nbytes)
            # Drop the in-memory payload so the saved graph carries only the
            # reference, and mark the tensor as externally stored.
            t.ClearField("raw_data")
            t.data_location = TensorProto.EXTERNAL
            shard_offset += nbytes
            externalised += 1
    finally:
        if shard_file is not None:
            shard_file.close()

    # The shards are already on disk; save only the graph itself.
    onnx.save(model, out_path, save_as_external_data=False)

    limit = 2 ** 31
    sizes = [os.path.getsize(p) for p in shard_paths]
    print(f"[shard] wrote {os.path.basename(out_path)} + {len(shard_paths)} shard(s) "
          f"({externalised} external tensors, {inline_count} kept inline):")
    over = 0
    for p, s in zip(shard_paths, sizes):
        too_big = s >= limit
        over += too_big
        flag = " <-- OVER 2 GB!" if too_big else ""
        print(f" {os.path.basename(p)} {human(s)}{flag}")
    total = sum(sizes)
    print(f"[shard] total external: {human(total)} across {len(shard_paths)} shard(s)")
    if over:
        print(f"[shard] WARNING: {over} shard(s) still exceed 2 GB; lower --max-shard-bytes",
              file=sys.stderr)
    return shard_paths
|
|
|
|
def link_sibling(src_dir, out_dir, name):
    """Make `name` available in out_dir (symlink, falling back to copy) so the
    output is a complete model dir for wer-bench/transcribe without duplicating
    multi-hundred-MB files. Skips silently when the source is absent or when
    the destination already is the source file (same dir, an out_dir that
    aliases src_dir through a symlink, or an existing link to the source).

    Args:
        src_dir: Directory holding the original file.
        out_dir: Directory that should expose the file.
        name: File name, relative to both directories.
    """
    import shutil

    src = os.path.join(src_dir, name)
    dst = os.path.join(out_dir, name)
    if not os.path.exists(src):
        return
    # Compare the files themselves rather than path strings: if out_dir merely
    # aliases src_dir, removing dst below would delete the source file.
    if os.path.exists(dst) and os.path.samefile(src, dst):
        return
    if os.path.lexists(dst):
        # Stale copy or dangling link from an earlier run.
        os.remove(dst)
    try:
        os.symlink(os.path.relpath(src, out_dir), dst)
    except (OSError, ValueError):
        # OSError: symlinks unsupported or not permitted here.
        # ValueError: relpath across Windows drives. Either way, copy instead.
        shutil.copy2(src, dst)
|
|
|
|
def main(argv=None):
    """Command-line entry point: shard the encoder and complete the output dir.

    Parses the options, writes the sharded encoder via ``shard_model`` and,
    when the output directory differs from the model directory, links the
    sibling model files into it so the result is a usable model dir.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--model-dir", default=".",
                    help="dir holding encoder-model.onnx (+ .onnx.data). Default: current directory")
    ap.add_argument("--out-dir", default=None,
                    help="where to write the sharded encoder + a complete model dir "
                         "(default: <model-dir>/sharded). Pass the same value as --model-dir to shard in place.")
    ap.add_argument("--encoder", default="encoder-model.onnx",
                    help="encoder graph filename within --model-dir (default encoder-model.onnx)")
    ap.add_argument("--max-shard-bytes", type=int, default=DEFAULT_MAX_SHARD_BYTES,
                    help=f"max bytes per shard (default {DEFAULT_MAX_SHARD_BYTES}, i.e. 1.5 GB)")
    args = ap.parse_args(argv)

    # A zero or negative budget would put every tensor in its own shard;
    # reject it up front with a normal usage error.
    if args.max_shard_bytes <= 0:
        ap.error(f"--max-shard-bytes must be a positive integer, got {args.max_shard_bytes}")

    out_dir = args.out_dir or os.path.join(args.model_dir, "sharded")
    in_path = os.path.join(args.model_dir, args.encoder)
    out_path = os.path.join(out_dir, args.encoder)

    if args.max_shard_bytes >= 2 ** 31:
        print("[shard] WARNING: --max-shard-bytes >= 2 GB defeats the purpose "
              "(shards must stay under the WASM/blob 2 GB cap)", file=sys.stderr)

    shard_model(in_path, out_path, args.max_shard_bytes)

    # When sharding into a separate directory, expose the other model files
    # there too so it can be used as a model dir on its own.
    if os.path.abspath(out_dir) != os.path.abspath(args.model_dir):
        for name in ("decoder_joint-model.onnx", "vocab.txt", "nemo128.onnx", "config.json"):
            link_sibling(args.model_dir, out_dir, name)
        print(f"[shard] linked decoder/vocab/preproc into {out_dir}")

    print(f"[shard] done. Use: node scripts/wer-bench.mjs --model-dir {out_dir} --configs fp32@60 --ort wasm")


if __name__ == "__main__":
    main()
|
|