#!/usr/bin/env python
# /// script
# requires-python = ">=3.9"
# dependencies = ["onnx"]
# ///
"""Shard the fp32 Parakeet encoder's external weights into <2 GB pieces so the
fp32 encoder can load on the WASM backend / in-browser.

This script lives in the model repo's scripts/ folder
(parakeet-tdt-0.6b-v3-smoothquant-onnx/scripts) alongside
quantize-int8-smoothquant.py and quantize-fp16.py, and reads the fp32 encoder
from the current directory by default, so run it from the model-repo root. It is
self-contained (PEP 723 header above), so `uv run scripts/shard-fp32.py` installs
onnx on the fly. The model repo ships pre-built shards under sharded/ for browsers;
this script is included for provenance and to regenerate them.

Why (see CLAUDE.md for the full reasoning): the fp32 encoder is ~2.4 GB held in
ONE encoder-model.onnx.data sidecar. That single file trips two *ingest* walls
that block WASM, and neither is a total-memory limit:
  1. a 32-bit WASM ArrayBuffer caps at ~2 GB (2^31-1), and
  2. Chromium's blob-URL fetch caps near 2 GB.
The wasm32 heap ceiling itself is ~4 GB, and fp32 (unlike fp16, which the CPU/WASM
EP upcasts to fp32 at session build) stays ~2.4 GB resident, so it *should* fit
once no single buffer exceeds 2 GB. This script rewrites the encoder's per-tensor
external_data locations to spread the initializers across N shard files, each under
a configurable byte budget (default 1.5 GB), producing:

  encoder-model.onnx              (graph; tensors now point at the shards)
  encoder-model.onnx.data.000
  encoder-model.onnx.data.001
  ...

onnxruntime-node (native) resolves these from disk by the graph's location fields;
the WASM / browser loader mounts each shard as a separate externalData entry (each
< 2 GB), sidestepping both caps. No weights are altered: this is a pure repack, so
WER must be identical to the single-file fp32. That equality is the whole point of
the experiment (does fp32 hold up on a long chunk where int8 drops content), so the
script never touches tensor values, only where their bytes live.

Usage (run from this model-repo root; the onnx files default to .):
  uv run scripts/shard-fp32.py                       # . -> ./sharded
  uv run scripts/shard-fp32.py --model-dir DIR --out-dir DIR
  uv run scripts/shard-fp32.py --max-shard-bytes 1000000000   # smaller shards (lower transient load peak)
  uv run scripts/shard-fp32.py --encoder encoder-model.onnx   # non-default encoder name

Built with Claude Code.
"""

import argparse
import os
import sys

import onnx
from onnx import TensorProto
from onnx.external_data_helper import set_external_data

# Default shard budget. 1.5 GB leaves comfortable headroom under the 2 GB
# ArrayBuffer / blob caps even after a tensor that would straddle a boundary is
# pushed whole into the next shard. Smaller shards lower the transient load peak
# (ORT holds a shard's bytes in the heap while deserialising it), at the cost of
# more files; 1.5 GB is a sane default for a ~2.4 GB encoder (-> 2 shards).
# The value is decimal (1.5e9 bytes) whereas human() reports 1024-based units,
# so a completely full shard shows up as roughly "1.4 GB" in the summary.
DEFAULT_MAX_SHARD_BYTES = 1_500_000_000

# Tensors below this many bytes stay inline in the graph proto (mirrors onnx's
# own default size_threshold): sharding tiny scalars/biases is pointless and just
# inflates the file count. The comparison in shard_model() is strict (<), so a
# tensor of exactly this many raw bytes is externalised.
INLINE_THRESHOLD_BYTES = 1024


def human(n):
    """Format a byte count using 1024-based units, topping out at GB.

    Plain byte values are printed without decimals ("512 B"); KB, MB and GB
    values get one decimal ("1.5 KB"). Anything beyond the GB range is still
    expressed in GB ("5120.0 GB").
    """
    size = float(n)
    units = ("B", "KB", "MB", "GB")
    step = 0
    # Scale down until the value drops under 1024 or the largest unit is hit.
    # `not size < 1024` (rather than `size >= 1024`) keeps NaN walking all the
    # way to the last unit.
    while step < len(units) - 1 and not size < 1024:
        size /= 1024
        step += 1
    if step == 0:
        return f"{size:.0f} B"
    return f"{size:.1f} {units[step]}"


def tensor_nbytes(t):
    """Return the length of the tensor's raw_data payload, or 0 if it has none.

    Once external data has been loaded, the big fp32 initializers carry their
    bytes in raw_data. A tensor without that field reports 0, which keeps it
    inline in the graph.
    """
    if not t.HasField("raw_data"):
        return 0
    return len(t.raw_data)


def shard_model(in_path, out_path, max_shard_bytes):
    """Repack the model's large initializers into numbered external-data shards.

    Loads ``in_path`` together with its external data, writes every graph
    initializer whose raw_data is at least INLINE_THRESHOLD_BYTES long into
    ``<out_path>.data.NNN`` files (greedy, in graph order, never splitting a
    tensor), points each such tensor's external_data at its shard, and saves
    the rewritten graph to ``out_path``. Tensor bytes are copied verbatim.

    Args:
        in_path: path of the source .onnx graph.
        out_path: path of the graph to write; shard files are created next to
            it and named after its basename.
        max_shard_bytes: positive byte budget per shard. A single tensor
            larger than the budget is still written whole, alone in its own
            shard.

    Returns:
        The shard file paths that were written, in shard order. Empty when no
        initializer was large enough to externalise.

    Raises:
        ValueError: if ``max_shard_bytes`` is not positive.
        FileNotFoundError: if ``in_path`` does not exist.
    """
    # A non-positive budget would silently degenerate into one file per tensor.
    if max_shard_bytes <= 0:
        raise ValueError(f"max_shard_bytes must be positive, got {max_shard_bytes}")
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"missing input model: {in_path}")

    # Pull the sibling .onnx.data into raw_data so we see real bytes to repack.
    # Needs ~the encoder's size in RAM (~2.4 GB); cheap given the repack savings.
    print(f"[shard] loading {in_path} (+ external data) ...")
    model = onnx.load(in_path, load_external_data=True)

    out_dir = os.path.dirname(out_path) or "."
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.basename(out_path)  # e.g. encoder-model.onnx

    # Greedy bin-pack: walk initializers, open a new shard whenever adding the
    # next tensor whole would exceed the budget. A single tensor larger than the
    # budget gets its own shard (we never split a tensor across files, so each
    # tensor's external_data stays a simple (location, offset, length)).
    # Shards are opened lazily, so a model with nothing to externalise does not
    # leave a zero-byte .data.000 behind.
    shard_paths = []
    shard_file = None
    shard_loc = None
    shard_offset = 0
    inline_count = 0
    externalised = 0

    try:
        for t in model.graph.initializer:
            nbytes = tensor_nbytes(t)
            if nbytes < INLINE_THRESHOLD_BYTES:
                inline_count += 1
                continue  # leave small tensors inline in the graph

            # Start a shard for the first external tensor, or roll to the next
            # one if this tensor would push a non-empty shard over budget.
            over_budget = shard_offset > 0 and shard_offset + nbytes > max_shard_bytes
            if shard_file is None or over_budget:
                if shard_file is not None:
                    shard_file.close()
                shard_loc = f"{base}.data.{len(shard_paths):03d}"
                shard_path = os.path.join(out_dir, shard_loc)
                shard_file = open(shard_path, "wb")
                shard_paths.append(shard_path)
                shard_offset = 0

            shard_file.write(t.raw_data)
            # set_external_data must run while raw_data is still present; the
            # payload is dropped from the proto only afterwards.
            set_external_data(t, location=shard_loc, offset=shard_offset, length=nbytes)
            t.ClearField("raw_data")
            t.data_location = TensorProto.EXTERNAL
            shard_offset += nbytes
            externalised += 1
    finally:
        if shard_file is not None:
            shard_file.close()

    # The initializers now reference the shard files; save the graph as-is (the
    # external_data is already set, so save_as_external_data=False is correct and
    # must stay False or onnx would try to re-pack into a single file).
    onnx.save(model, out_path, save_as_external_data=False)

    sizes = [os.path.getsize(p) for p in shard_paths]
    print(f"[shard] wrote {os.path.basename(out_path)} + {len(shard_paths)} shard(s) "
          f"({externalised} external tensors, {inline_count} kept inline):")
    over = []
    for p, s in zip(shard_paths, sizes):
        too_big = s >= 2 ** 31
        if too_big:
            over.append(p)
        flag = "  <-- OVER 2 GB!" if too_big else ""
        print(f"         {os.path.basename(p)}  {human(s)}{flag}")
    total = sum(sizes)
    print(f"[shard] total external: {human(total)} across {len(shard_paths)} shard(s)")
    if over:
        print(f"[shard] WARNING: {len(over)} shard(s) still exceed 2 GB; lower --max-shard-bytes",
              file=sys.stderr)

    # Shards left over from an earlier run that produced more pieces are no
    # longer referenced by the graph just written. Report them (without
    # deleting anything) so they are not shipped by accident.
    written = {os.path.basename(p) for p in shard_paths}
    prefix = f"{base}.data."
    stale = sorted(
        entry for entry in os.listdir(out_dir)
        if entry.startswith(prefix)
        and entry[len(prefix):].isdigit()
        and entry not in written
    )
    if stale:
        print(f"[shard] WARNING: {len(stale)} stale shard file(s) in {out_dir} are not "
              f"referenced by the new graph: {', '.join(stale)}", file=sys.stderr)
    return shard_paths


def link_sibling(src_dir, out_dir, name):
    """Make `name` available in out_dir (symlink, falling back to copy) so the
    output is a complete model dir for wer-bench/transcribe without duplicating
    multi-hundred-MB files.

    Skips silently when the source is absent or when source and destination
    resolve to the same file. That covers src_dir and out_dir being the same
    directory, out_dir being a symlink alias of src_dir, and a destination
    that already links to the source. Any other existing destination entry is
    replaced.
    """
    src = os.path.join(src_dir, name)
    dst = os.path.join(out_dir, name)
    if not os.path.exists(src):
        return
    # Compare fully resolved paths: a plain abspath comparison misses the case
    # where out_dir is reached through a symlink to src_dir, and removing dst
    # there would delete the source file itself.
    if os.path.realpath(src) == os.path.realpath(dst):
        return
    if os.path.lexists(dst):
        os.remove(dst)
    try:
        os.symlink(os.path.relpath(src, out_dir), dst)
    except (OSError, ValueError):
        # OSError: symlinks unsupported or not permitted. ValueError: relpath
        # cannot relate paths on different Windows drives. Copy in both cases.
        import shutil
        shutil.copy2(src, dst)


def main():
    """CLI entry point: parse arguments, shard the encoder, complete the output dir.

    Exits through argparse's error path (usage message, status 2) when
    --max-shard-bytes is not positive or the encoder graph is missing, rather
    than failing later with a traceback. When the output directory differs
    from --model-dir, the decoder/vocab/preprocessor/config files are linked
    (or copied) next to the sharded encoder.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--model-dir", default=".",
                    help="dir holding encoder-model.onnx (+ .onnx.data). Default: current directory")
    ap.add_argument("--out-dir", default=None,
                    help="where to write the sharded encoder + a complete model dir "
                         "(default: <model-dir>/sharded). Pass the same value as --model-dir to shard in place.")
    ap.add_argument("--encoder", default="encoder-model.onnx",
                    help="encoder graph filename within --model-dir (default encoder-model.onnx)")
    ap.add_argument("--max-shard-bytes", type=int, default=DEFAULT_MAX_SHARD_BYTES,
                    help=f"max bytes per shard (default {DEFAULT_MAX_SHARD_BYTES}, i.e. 1.5 GB)")
    args = ap.parse_args()

    # A zero/negative budget would put every tensor in its own shard file.
    if args.max_shard_bytes <= 0:
        ap.error(f"--max-shard-bytes must be a positive byte count, got {args.max_shard_bytes}")

    out_dir = args.out_dir or os.path.join(args.model_dir, "sharded")
    in_path = os.path.join(args.model_dir, args.encoder)
    out_path = os.path.join(out_dir, args.encoder)

    # Most likely cause: the script was not run from the model-repo root.
    if not os.path.exists(in_path):
        ap.error(f"missing input model: {in_path} "
                 "(run from the model-repo root or pass --model-dir / --encoder)")

    if args.max_shard_bytes >= 2 ** 31:
        print("[shard] WARNING: --max-shard-bytes >= 2 GB defeats the purpose "
              "(shards must stay under the WASM/blob 2 GB cap)", file=sys.stderr)

    shard_model(in_path, out_path, args.max_shard_bytes)

    # Round out the output into a self-contained model dir so wer-bench can point
    # --model-dir straight at it. The fp32 decoder/vocab/preproc are reused as-is.
    if os.path.abspath(out_dir) != os.path.abspath(args.model_dir):
        for name in ("decoder_joint-model.onnx", "vocab.txt", "nemo128.onnx", "config.json"):
            link_sibling(args.model_dir, out_dir, name)
        print(f"[shard] linked decoder/vocab/preproc into {out_dir}")

    print(f"[shard] done. Use: node scripts/wer-bench.mjs --model-dir {out_dir} --configs fp32@60 --ort wasm")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()