smcleod committed on May 8

Commit

6a1108c

verified ·

1 Parent(s): b688ed5

Add files using upload-large-folder tool

Browse files

Files changed (24) hide show

.gitattributes +6 -0
LICENSE +202 -0
README.md +109 -3
chat_template.jinja +2 -0
decode_step.onnx +3 -0
decode_step.onnx_data +3 -0
decode_step_int8.onnx +3 -0
decode_step_int8.onnx_data +3 -0
encoder.onnx +3 -0
encoder.onnx_data +3 -0
encoder_int8.onnx +3 -0
encoder_int8.onnx_data +3 -0
export_speech_2b_ar.py +1239 -0
granite_export_metadata.json +454 -0
preprocessor_config.json +14 -0
processor_config.json +4 -0
prompt_encode.onnx +3 -0
prompt_encode.onnx_data +3 -0
prompt_encode_int8.onnx +3 -0
prompt_encode_int8.onnx_data +3 -0
quantise.py +299 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +792 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+decode_step_int8.onnx_data filter=lfs diff=lfs merge=lfs -text
+prompt_encode.onnx_data filter=lfs diff=lfs merge=lfs -text
+encoder_int8.onnx_data filter=lfs diff=lfs merge=lfs -text
+prompt_encode_int8.onnx_data filter=lfs diff=lfs merge=lfs -text
+decode_step.onnx_data filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,3 +1,109 @@
----
-license: apache-2.0
----

+# IBM Granite Speech 4.1 2b - ONNX export
+ONNX export of [`ibm-granite/granite-speech-4.1-2b`](https://huggingface.co/ibm-granite/granite-speech-4.1-2b) produced by Sam McLeod
+(<https://smcleod.net>). Repository: `smcleod/ibm-granite-speech-4.1-2b-onnx`. Both FP32 and INT8 weight-only
+graphs are included. The graphs target opset 20, IR 10, `ai.onnx` operators
+only - no `com.microsoft` ops - so they load under the `ort` 2.0-rc.x Rust
+crate as well as standard `onnxruntime` 1.17 - 1.25.
+> **Additional precision tiers in progress.** A statically-calibrated INT8 variant (better quality vs the dynamic INT8 already in this repo) and a half-precision encoder are in active development. The repo will be updated when those graphs pass the multi-clip parity gate.
+Three graphs cooperate: `encoder.onnx` projects mel features to audio embeddings; `prompt_encode.onnx` runs the LLM forward over the full prompt (text tokens + projected audio embeds) and returns the first-token logits plus a 40-layer KV cache; `decode_step.onnx` consumes one token at a time plus the past KV cache and emits the next logits.
+The audio placeholder token id is `100352`. Replace those positions in the prompt with the projector outputs from `encoder.onnx` before running `prompt_encode.onnx`.
+## Files
+- `encoder.onnx` + `encoder.onnx_data` (FP32) and `encoder_int8.onnx` + `encoder_int8.onnx_data` (INT8 weight-only quantisation)
+- `prompt_encode.onnx` + `prompt_encode.onnx_data` (FP32) and `prompt_encode_int8.onnx` + `prompt_encode_int8.onnx_data` (INT8 weight-only quantisation)
+- `decode_step.onnx` + `decode_step.onnx_data` (FP32) and `decode_step_int8.onnx` + `decode_step_int8.onnx_data` (INT8 weight-only quantisation)
+- Tokeniser / processor: `tokenizer.json`, `tokenizer_config.json`, `processor_config.json`, `chat_template.jinja`, `special_tokens_map.json`, `preprocessor_config.json`
+- Export scripts: `export_speech_2b_ar.py`, `quantise.py`
+- `granite_export_metadata.json` (graph IO, parity numbers, toolchain)
+- `LICENSE` (Apache 2.0)
+## Parity
+Parity is taken against the upstream PyTorch reference on a single LibriSpeech
+clip (`10226_10111_000000.wav`, 8.43 seconds, 844 mel frames). FP32 graphs
+match the reference within numeric tolerance; INT8 graphs are validated in
+argmax-only mode (logit values shift but token argmax is preserved, so the
+decoded transcript is unchanged).
+Encoder (numeric output, no argmax decoding):
+| precision | max-abs-err | mean-abs-err | p99-abs-err |
+| --- | --- | --- | --- |
+| FP32 | 4.48e-06 | 1.24e-07 | 6.46e-07 |
+| INT8 | 0.169 | 0.0109 | 0.0447 |
+LLM stages (argmax decoding; INT8 logit max-abs deltas are large and `prompt_encode` INT8 shows 58/190 per-position argmax mismatches, but the decoded transcript is unchanged):
+| graph | precision | max-abs-err | argmax mismatches | transcript match |
+| --- | --- | --- | --- | --- |
+| prompt_encode | FP32 | 0.000364 | 0/190 | Y |
+| prompt_encode | INT8 | 10.1 | 58/190 | Y |
+| decode_step | FP32 | n/a | 0/51 | Y |
+| decode_step | INT8 | 5.76 | 0/51 | Y |
+### Multi-clip transcript parity
+Three additional 16 kHz mono clips cover longer utterances (39 to 94 seconds) with single-speaker and two-speaker conversational content. Word error rate (WER) and Levenshtein edit distance are computed against the upstream PyTorch reference. Numbers are measured end-to-end through the full ONNX pipeline (no PyTorch encoder fallback).
+| Clip | Duration | FP32 byte-exact vs PT | INT8 byte-exact vs PT | INT8 WER vs PT | INT8 vs FP32 Lev |
+| --- | ---: | :---: | :---: | ---: | ---: |
+| is-it-more-wood | 46.9 s | Y | N | 1.4% | 2 |
+| two-speakers-1 | 93.8 s | Y | N | 1.0% | 12 |
+| two-speakers-2 | 38.8 s | Y | N | 23.5% | 26 |
+Raw multi-clip data including full transcripts: see `granite_export_metadata.json` `multi_clip_parity` block.
+Reference transcript for the single-clip parity test (`10226_10111_000000.wav`):
+> After his nap, Timothy lazily stretched, first one gray velvet foot, then another, strolled indolently to his plate, turning over the food, carefully selecting choice bits, nosing out that which he scorned upon the clean hearth
+Both FP32 and INT8 paths reproduce this transcript exactly on the test clip.
+## Toolchain
+- transformers 5.8.0
+- torch 2.11.0
+- onnx 1.21.0
+- onnxruntime 1.25.1
+- exporter: torch.onnx.export TorchScript path (dynamo=False)
+- opset: 20 (`ai.onnx` only)
+- IR version: 10
+- external data layout: single `<stem>.onnx_data` sidecar per graph
+## Compatibility
+Targeted at the [`ort`](https://crates.io/crates/ort) 2.0-rc.x Rust crate.
+Compatible with `onnxruntime` Python 1.17 through 1.25. No `com.microsoft`
+ops are used. Graphs were emitted via the TorchScript path
+(`torch.onnx.export(..., dynamo=False)`); the dynamo exporter was deliberately
+avoided because it injects `aten::*` ops `ort` does not understand.
+## Reproducing the export
+The included `export_speech_2b_ar.py` and `quantise.py` scripts regenerate every
+artefact in this bundle. From a checkout of <https://github.com/sammcj/granite-speech-4.1-onnx>:
+```bash
+python export_speech_2b_ar.py \
+    --model-dir <path-to-ibm-granite/granite-speech-4.1-2b> \
+    --out-dir exports/granite-speech-4.1-2b
+python quantise.py --input exports/granite-speech-4.1-2b/encoder.onnx       --output exports/granite-speech-4.1-2b/encoder_int8.onnx
+python quantise.py --input exports/granite-speech-4.1-2b/prompt_encode.onnx --output exports/granite-speech-4.1-2b/prompt_encode_int8.onnx
+python quantise.py --input exports/granite-speech-4.1-2b/decode_step.onnx   --output exports/granite-speech-4.1-2b/decode_step_int8.onnx
+```
+Sandboxed environments may need:
+```bash
+HF_HOME=$TMPDIR/hf_home HF_MODULES_CACHE=$TMPDIR/hf_modules <command above>
+```
+## Licence
+Apache 2.0 for both the upstream IBM model and this ONNX export. See
+[`LICENSE`](LICENSE) for the full text.

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }}
2	+ ASSISTANT:{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% endif %}{% endfor %}

decode_step.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bafe9470dc446bafb9499566ca22328251d352b222f290e039a3bbc54aa1baf7
+size 1849786

decode_step.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87d924ecd71746694f43e653c9366827a9444ab9407e976f5cd9cc9dbde97608
+size 6527008768

decode_step_int8.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6ddf694deba562408d2ed39d9a1744c8015c5610d5e0afce63908293a1eac45
+size 6426226

decode_step_int8.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70652d6a31cbae2d57c7e8cefb665f6c1ee503e495d191b951fff09ddb7f8608
+size 1632249856

encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:efe873c6d19468eda93d1751ba14615508e763312cac6112029914acec0f33a9
+size 912937

encoder.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a39db4121859fade3cc6fef05dc9f7abc0af068d389ef8af7b57997eca0d2f43
+size 1903334768

encoder_int8.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1167991ae3b81a0aa0a0c35f0bed619dc8c3d52b5da72de0564fb708b0547070
+size 2608070

encoder_int8.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd1f606ff9b9145636849b7f2dbbc972e3f4964852cdbf950254da4fc45132d5
+size 787117424

export_speech_2b_ar.py ADDED Viewed

	@@ -0,0 +1,1239 @@

+# Copyright 2026 Sam McLeod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Export Granite Speech 4.1 2b (autoregressive variants) to three ONNX graphs.
+Covers both `granite-speech-4.1-2b` (base) and `granite-speech-4.1-2b-plus`.
+The two share architecture - Conformer encoder + Blip2 Q-Former projector +
+Granite-4.0 1B causal LM with `logits_scaling=8` - and only differ in weights
+and chat template. Pass `--model-dir` and `--baseline` to select the variant.
+The NAR variant has a different topology and is exported by the
+`export_nar_*.py` scripts instead.
+Produces, under the configured `--out-dir`:
+  - encoder.onnx       : Conformer CTC encoder + Blip2 Q-Former projector.
+                         Input:  input_features   float32 [B, T, 160]
+                         Output: audio_embeds     float32 [B, T_audio, 2048]
+                                 audio_embed_sizes int64  [B] (per-sample valid lengths)
+  - prompt_encode.onnx : LLM prefill over a fully spliced inputs_embeds.
+                         Inputs : inputs_embeds   float32 [B, N, 2048]
+                                  position_ids    int64   [B, N]
+                                  attention_mask  float32 [B, 1, N, N] (additive)
+                         Outputs: logits          float32 [B, N, V] (divided by 8)
+                                  present.<L>.{key,value} for L in 0..39
+  - decode_step.onnx   : Single-token decode with KV cache.
+                         Inputs : inputs_embeds   float32 [B, 1, 2048]
+                                  position_ids    int64   [B, 1]
+                                  attention_mask  float32 [B, 1, 1, T_total] (additive)
+                                  past_key_values.<L>.{key,value} for L in 0..39
+                         Outputs: logits          float32 [B, 1, V] (divided by 8)
+                                  present.<L>.{key,value}
+The base/plus projector is `Blip2QFormerModel`, not the NAR custom projector.
+Q-Former self-attention is plain matmul-softmax already (Bert-style); only the
+Conformer encoder's SDPA + `if remainder > 0` guard need rewriting for clean
+tracing.
+Both LLM graphs apply `logits / config.text_config.logits_scaling` (=8). This
+matches `GraniteForCausalLM.forward`, which the reference autoregressive path
+goes through. Without it, ONNX logits are 8x the PyTorch reference even though
+argmax is preserved, which trips strict numeric parity bars.
+Usage:
+    # Base 2b (defaults):
+    HF_HOME=$TMPDIR/hf_home HF_MODULES_CACHE=$TMPDIR/hf_modules \\
+        uv run python src/export_speech_2b_ar.py
+    # Plus 2b:
+    HF_HOME=$TMPDIR/hf_home HF_MODULES_CACHE=$TMPDIR/hf_modules \\
+        uv run python src/export_speech_2b_ar.py \\
+            --model-dir models/granite-speech-4.1-2b-plus \\
+            --baseline test_data/baselines/plus.json \\
+            --out-dir exports/granite-speech-4.1-2b-plus
+    # Just one stage:
+    uv run python src/export_speech_2b_ar.py --stages encoder
+    uv run python src/export_speech_2b_ar.py --stages prompt,decode --skip-export
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+import numpy as np
+import soundfile as sf
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Resolve roots so the script works whether it lives at <repo>/src/<name>.py
+# (project layout) or <bundle>/<name>.py (HF bundle layout). Defaults exist for
+# the project layout; bundle users should pass explicit --audio / --baseline /
+# --model-dir / --out-dir.
+# Directory containing this script, with symlinks resolved.
+SCRIPT_DIR = Path(__file__).resolve().parent
+# One level up when the script sits in a directory named "src"; otherwise the
+# script's own directory is treated as the root.
+REPO_ROOT = SCRIPT_DIR.parent if SCRIPT_DIR.name == "src" else SCRIPT_DIR
+# Default input/output locations, all anchored at REPO_ROOT.
+DEFAULT_AUDIO = REPO_ROOT / "test_data" / "10226_10111_000000.wav"
+DEFAULT_BASELINE = REPO_ROOT / "test_data" / "baselines" / "base.json"
+DEFAULT_MODEL_DIR = REPO_ROOT / "models" / "granite-speech-4.1-2b"
+DEFAULT_OUT_DIR = REPO_ROOT / "exports" / "granite-speech-4.1-2b"
+# Transcription instruction text. NOTE(review): the leading `<|audio|>` tag is
+# presumably the audio placeholder the processor expands - confirm against the
+# processor / chat template.
+USER_PROMPT_TRANSCRIBE = (
+    "<|audio|>transcribe the speech with proper punctuation and capitalization."
+)
+# ---------------------------------------------------------------------------
+# Utilities.
+# ---------------------------------------------------------------------------
+def load_audio(path: Path) -> np.ndarray:
+    waveform, sr = sf.read(str(path), dtype="float32")
+    if waveform.ndim > 1:
+        waveform = waveform.mean(axis=1)
+    assert sr == 16000, f"expected 16 kHz, got {sr}"
+    return waveform
+def tensor_stats(t: torch.Tensor | np.ndarray | None) -> dict[str, Any] | None:
+    # Summarise a tensor/array as a dict of plain Python values: shape, dtype
+    # name, mean, std, min, max and the first ten flattened elements.
+    # Returns None when `t` is None.
+    if t is None:
+        return None
+    if isinstance(t, torch.Tensor):
+        # Statistics are computed on a detached float32 CPU copy; the reported
+        # dtype is the tensor's original dtype with the "torch." prefix removed.
+        x = t.detach().float().cpu().numpy()
+        dtype_str = str(t.dtype).replace("torch.", "")
+    else:
+        # Non-tensor input is viewed as float32 (no copy if it already is) for
+        # the statistics; the reported dtype is read from `t.dtype`, so `t`
+        # must expose a `.dtype` attribute.
+        x = np.asarray(t).astype(np.float32, copy=False)
+        dtype_str = str(t.dtype)
+    flat = x.flatten()
+    # For empty input the scalar statistics are None (instead of reducing over
+    # zero elements) and "first10" is an empty list.
+    return {
+        "shape": list(x.shape),
+        "dtype": dtype_str,
+        "mean": float(flat.mean()) if flat.size else None,
+        "std": float(flat.std()) if flat.size else None,
+        "min": float(flat.min()) if flat.size else None,
+        "max": float(flat.max()) if flat.size else None,
+        "first10": [float(v) for v in flat[:10]],
+    }
+def _resave_single_sidecar(scratch_path: Path, out_path: Path, ir_version: int) -> None:
+    """Stage 2 of every export: re-save with one external-data sidecar in the
+    final location so we end up with exactly two artefacts on disk."""
+    # Parameters:
+    #   scratch_path: model written by the first export stage (read only here).
+    #   out_path:     final .onnx path; the sidecar is "<out_path.name>_data"
+    #                 in the same directory.
+    #   ir_version:   minimum IR version to stamp on the saved model.
+    # onnx is imported lazily so it is only needed when this function runs.
+    import onnx
+    print("  stage-2: re-saving with single .onnx_data sidecar + ir bump")
+    model_proto = onnx.load(str(scratch_path), load_external_data=True)
+    # Only ever raise the IR version; a model already at or above the target
+    # is left untouched.
+    if model_proto.ir_version < ir_version:
+        model_proto.ir_version = ir_version
+    # Drop the external-data bookkeeping inherited from the scratch export on
+    # every initializer - presumably so the save below assigns all of them to
+    # the new sidecar from a clean state.
+    for tensor in model_proto.graph.initializer:
+        tensor.ClearField("data_location")
+        tensor.ClearField("external_data")
+    sidecar_name = out_path.name + "_data"
+    # Remove any previous sidecar / model file at the destination before
+    # writing. NOTE(review): presumably guards against the writer reusing or
+    # appending to a stale sidecar - confirm against onnx's external-data writer.
+    if (out_path.parent / sidecar_name).exists():
+        (out_path.parent / sidecar_name).unlink()
+    if out_path.exists():
+        out_path.unlink()
+    # all_tensors_to_one_file + location => a single sidecar file named
+    # `sidecar_name`; size_threshold=1024 and convert_attribute=False are passed
+    # through to onnx unchanged (see onnx.save_model for their semantics).
+    onnx.save_model(
+        model_proto,
+        str(out_path),
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location=sidecar_name,
+        size_threshold=1024,
+        convert_attribute=False,
+    )
+    # The checker is given the on-disk path (not the in-memory proto) and runs
+    # without the full check.
+    onnx.checker.check_model(str(out_path), full_check=False)
+    # Report the distinct operator domains used by top-level graph nodes
+    # (nodes inside subgraphs are not visited here).
+    domains = sorted({n.domain for n in model_proto.graph.node})
+    print(f"  saved {out_path} (+ {sidecar_name})  node-domains={domains}")
+# ---------------------------------------------------------------------------
+# Model loading (mirrors capture_baselines.py::capture_base_or_plus).
+# ---------------------------------------------------------------------------
+def load_base_model(model_dir: Path) -> tuple[nn.Module, Any]:
+    # Load the processor and the speech model from `model_dir` and return them
+    # as `(model, processor)`. The model is requested in float32 with eager
+    # attention, put in eval mode, and cast to float32 again after loading.
+    # transformers is imported lazily so it is only needed when loading a model.
+    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+    print(f"  loading processor from {model_dir}")
+    processor = AutoProcessor.from_pretrained(str(model_dir))
+    print(f"  loading model from {model_dir} (eager, fp32)")
+    # Wall-clock timer covering model load, eval() and the fp32 cast.
+    t0 = time.time()
+    # Blip2QFormerModel does not support SDPA in transformers 5.8; eager is mandatory.
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        str(model_dir), torch_dtype=torch.float32, attn_implementation="eager",
+    )
+    model.eval()
+    # The nested text_config / encoder_config / projector_config can carry
+    # `dtype: bfloat16`; force fp32 across the whole module tree.
+    model = model.to(torch.float32)
+    print(f"  loaded in {time.time() - t0:.1f}s")
+    return model, processor
+# ---------------------------------------------------------------------------
+# Trace-friendly monkey-patches for the Conformer encoder.
+# ---------------------------------------------------------------------------
+def patch_conformer_for_tracing(model: nn.Module) -> None:
+    """Rewrite the in-tree GraniteSpeechConformerAttention.forward so it traces:
+    - SDPA -> plain matmul/softmax.
+    - `if remainder > 0` guard -> always-pad by `(-num_features) % context_size`.
+    Blip2QFormerMultiHeadAttention is already plain matmul-softmax (Bert-style),
+    so no rewrite is needed for the projector's self-attention path. The
+    projector's outer reshape/pad math is handled separately by
+    patch_projector_for_tracing because it bakes T_audio into the graph if
+    left as upstream's `math.ceil(seq_len / window_size)` pattern.
+    """
+    # The attention class is discovered from the first encoder layer and
+    # patched at class level below, so every instance of that class (not only
+    # the ones owned by `model`) picks up the replacement forward.
+    encoder = model.encoder
+    attn0 = encoder.layers[0].attn
+    attn_cls = type(attn0)
+    def attn_forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor:
+        # Block-wise self-attention: the time axis is right-padded to a multiple
+        # of `context_size`, split into blocks, attended within each block, and
+        # trimmed back to the original `num_features` length.
+        hidden_states = self.pre_norm(hidden_states)
+        bsz, num_features, _ = hidden_states.shape
+        # Always-pad: pad amount may be zero. Use modulo so the graph is valid
+        # for any T at runtime.
+        pad_amount = (-num_features) % self.context_size
+        # Ceiling division: number of context_size-wide blocks after padding.
+        num_blocks = (num_features + self.context_size - 1) // self.context_size
+        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_amount))
+        query_states = self.to_q(hidden_states)
+        # `to_kv` yields keys and values concatenated on the last axis.
+        key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1)
+        # [B, T_pad, H*D] -> [B, M, C, H, D] -> [B, M, H, C, D] for q, k and v.
+        query_states = query_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
+        key_states = key_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
+        value_states = value_states.reshape(
+            bsz, num_blocks, self.context_size, self.num_heads, -1
+        ).transpose(2, 3)
+        # Shaw's relative positional embedding.
+        rel_pos_emb = self.rel_pos_emb(attention_dists)
+        # query_states: [B, M, H, C, D]; rel_pos_emb: [C, R, D]
+        # Output: [B, M, H, C, R]
+        pos_attn = torch.einsum(
+            "b m h c d, c r d -> b m h c r", query_states, rel_pos_emb
+        ) * self.scale
+        # Plain matmul attention with the additive `pos_attn` bias inside the
+        # softmax (matches the MATH SDPA backend numerically).
+        # NOTE(review): keys/values derived from the rows added by F.pad are
+        # not masked out before the softmax, so in the final block they still
+        # receive attention weight. Confirm the upstream forward does not mask
+        # the padded region; if it does, parity may differ whenever
+        # num_features is not a multiple of context_size.
+        attn_logits = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale
+        attn_logits = attn_logits + pos_attn
+        attn_weights = torch.softmax(attn_logits, dim=-1)
+        out = torch.matmul(attn_weights, value_states)  # [B, M, H, C, D]
+        # Merge blocks and heads back to [B, T_pad, H*D], then drop the padded
+        # tail before the output projection.
+        out = out.transpose(2, 3).reshape(bsz, hidden_states.shape[1], -1)
+        out = self.to_out(out[:, :num_features, :])
+        return self.dropout(out)
+    attn_cls.forward = attn_forward
+def patch_projector_for_tracing(model: nn.Module) -> None:
+    """Rewrite GraniteSpeechEncoderProjector.forward so the output time
+    dimension (T_audio = nblocks * num_queries) stays dynamic in the exported
+    graph.
+    The upstream forward bakes T_audio because:
+      1. `seq_len = hidden_states.size(1)` is a Python int under TorchScript trace
+      2. `math.ceil(seq_len / self.window_size)` is Python int math, baked
+      3. The intermediate `.view(batch * nblocks, window_size, dim)` and final
+         `.view(batch, nblocks * window_size // downsample_rate, -1)` both
+         emit Reshape ops with a constant shape vector
+    The rewrite uses `torch._shape_as_tensor` for dynamic shape access, an
+    over-pad-then-tensor-slice idiom for the F.pad step, and `-1` for the
+    intermediate batch*nblocks dim and the final T_audio dim. Batch is still
+    baked at trace value (1) because reshape's target shape is a constant
+    vector and we don't support multi-batch inference; T_audio is the
+    audio-length-dependent dim that needs to be dynamic.
+    Args:
+        model: module exposing a `.projector` submodule. The replacement
+            forward is assigned on `type(model.projector)`, i.e. on the class,
+            so every instance of that projector class picks it up.
+    Returns:
+        None. The patch is applied in place as a side effect.
+    """
+    projector = model.projector
+    projector_cls = type(projector)
+    def projector_forward_traceable(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Plain `.shape[...]` reads become Python ints, so these two values
+        # are frozen into the traced graph on purpose.
+        batch_size = hidden_states.shape[0]  # static (B=1 at trace)
+        dim = hidden_states.shape[2]  # static encoder hidden_dim
+        window_size = self.window_size
+        # Dynamic seq_len via Shape op (emitted by torch._shape_as_tensor):
+        shape_t = torch._shape_as_tensor(hidden_states)
+        seq_len_t = shape_t[1]  # 0-d int64 Tensor
+        # nblocks * window_size = the padded length we want.
+        # Ceil-division written with tensor ops so it stays in the graph.
+        nblocks_t = (seq_len_t + window_size - 1) // window_size
+        final_len_t = nblocks_t * window_size  # 0-d Tensor
+        # Statically pad by (window_size - 1), the maximum pad ever needed,
+        # then dynamically slice down to final_len_t. Avoids needing F.pad
+        # with a tensor pad amount (which doesn't trace cleanly).
+        hidden_states = nn.functional.pad(
+            hidden_states, (0, 0, 0, window_size - 1), "constant", 0.0
+        )
+        hidden_states = hidden_states[:, :final_len_t, :]
+        # [B, nblocks*window_size, dim] -> [B*nblocks, window_size, dim].
+        # `-1` lets ONNX infer batch*nblocks from numel at runtime.
+        hidden_states = hidden_states.reshape(-1, window_size, dim)
+        # Build an explicit all-ones encoder_attention_mask. Without this, the
+        # QFormer auto-creates one via `torch.ones(encoder_hidden_states.size())`
+        # which under tracing bakes batch*nblocks at the trace input's value.
+        # `torch.ones_like` on a slice that drops the hidden dim keeps the
+        # mask shape dynamic ([batch*nblocks, window_size]).
+        encoder_attention_mask = torch.ones_like(hidden_states[..., 0])
+        query_output = self.qformer(
+            query_embeds=self.query,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            return_dict=True,
+        )
+        # qf_out: [B*nblocks, num_queries, qf_hidden]
+        qf_out = query_output.last_hidden_state
+        qf_hidden = qf_out.shape[-1]  # static qformer hidden
+        # [B*nblocks, num_queries, qf_hidden] -> [B, T_audio, qf_hidden].
+        # B is baked at trace (1); T_audio (= nblocks*num_queries) is inferred.
+        qf_out = qf_out.reshape(batch_size, -1, qf_hidden)
+        return self.linear(qf_out)
+    # Class-level monkey-patch: replaces forward for the whole projector class.
+    projector_cls.forward = projector_forward_traceable
+# ---------------------------------------------------------------------------
+# Encoder + projector wrapper.
+# ---------------------------------------------------------------------------
+class EncoderProjectorWrapper(nn.Module):
+    """Wrap encoder + projector into one ONNX graph.
+    Inputs:
+        input_features:  float32 [B, T, 160]
+    Outputs:
+        audio_embeds:    float32 [B, T_audio, 2048] = projector(encoder(input_features))
+        audio_embed_sizes: int64 [1] - count of audio tokens, replicating the
+                          feature_extractor's projection-length math. It is
+                          computed in-graph from the runtime time dimension T
+                          of `input_features`, so it is a single value (the
+                          graph is traced with B=1), not one entry per sample.
+    Notes:
+      - The Conformer encoder itself does not consume an attention mask; the
+        feature extractor supplies a Python-int per-sample length, which is what
+        `audio_embed_sizes` reproduces here from the input's time dimension T.
+        Downstream Rust glue should compute the same size from the raw audio
+        length and slice `audio_embeds[:, :size, :]` for the splice.
+      - The projector output size is `nblocks * (window_size / downsample_rate)`.
+        With T=844 (the reference clip), this gives `ceil(844/15) * 3 = 171`,
+        which matches the captured PyTorch reference.
+    """
+    def __init__(self, encoder: nn.Module, projector: nn.Module, window_size: int, downsample_rate: int):
+        super().__init__()
+        self.encoder = encoder
+        self.projector = projector
+        # Coerced to plain ints; forward() uses them as constants in the
+        # block-count arithmetic.
+        self.window_size = int(window_size)
+        self.downsample_rate = int(downsample_rate)
+    def forward(self, input_features: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        enc_out = self.encoder(input_features, return_dict=True)
+        audio_embeds = self.projector(enc_out.last_hidden_state)
+        # Compute audio_embed_sizes dynamically so the value tracks T at runtime.
+        # `torch._shape_as_tensor` emits an ONNX Shape op so seq_len_t is a 0-d
+        # int64 Tensor rather than a baked Python int. The result is an int64
+        # tensor of shape [1] (B baked at 1, the only mode we trace; T_audio
+        # tracks runtime input length).
+        shape_t = torch._shape_as_tensor(input_features)
+        seq_len_t = shape_t[1]
+        # Python-int constant: queries emitted per window.
+        num_queries = self.window_size // self.downsample_rate
+        # Tensor ceil-division: number of windows needed to cover T.
+        nblocks_t = (seq_len_t + self.window_size - 1) // self.window_size
+        size_per_t = nblocks_t * num_queries  # 0-d int64 Tensor
+        audio_embed_sizes = size_per_t.unsqueeze(0)  # [1] tensor
+        return audio_embeds, audio_embed_sizes
+# ---------------------------------------------------------------------------
+# LLM wrappers (prompt_encode + decode_step). Adapted from
+# src/export_granite_llm_kv.py to take inputs_embeds instead of input_ids.
+# ---------------------------------------------------------------------------
+def _build_causal_mask_4d(
+    attention_mask_2d: torch.Tensor,
+    T_past: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Build a 4-D additive attention mask `[B, 1, T_q, T_k]` from a 2-D padding
+    mask `[B, T_k]`. Padding columns are -inf, and the trailing T_q query rows
+    have an upper-triangular causal mask added.
+    The Granite eager-mask path early-exits when handed a 4-D mask, so this
+    short-circuits the v5 mask-helper crash under TorchScript trace.
+    """
+    B, T_k = attention_mask_2d.shape
+    T_q = T_k - T_past
+    neg_inf = torch.finfo(dtype).min
+    pad = (attention_mask_2d == 0).to(dtype) * neg_inf  # [B, T_k]
+    pad = pad.view(B, 1, 1, T_k).expand(B, 1, T_q, T_k)
+    q_idx = torch.arange(T_q, device=attention_mask_2d.device).view(1, 1, T_q, 1)
+    k_idx = torch.arange(T_k, device=attention_mask_2d.device).view(1, 1, 1, T_k)
+    allowed = k_idx <= (q_idx + T_past)
+    causal = torch.where(
+        allowed,
+        torch.zeros((), dtype=dtype, device=attention_mask_2d.device),
+        torch.full((), neg_inf, dtype=dtype, device=attention_mask_2d.device),
+    )
+    return pad + causal
+class PromptEncodeWrapper(nn.Module):
+    """Prefill graph; consumes pre-spliced inputs_embeds.
+    Forward signature (positional):
+        inputs_embeds:  float32 [B, N, H]
+        position_ids:   int64   [B, N]
+        attention_mask: float32 [B, 1, N, N] additive 4-D causal+padding mask
+    Outputs:
+        logits:         float32 [B, N, V]   (divided by logits_scaling)
+        present.<L>.key, present.<L>.value for L in 0..n_layers-1
+    The outputs come back as one flat tuple: logits first, then each cache
+    layer's keys followed by its values, in the order the cache lists them.
+    """
+    def __init__(
+        self, llm_model: nn.Module, lm_head: nn.Module, num_layers: int, logits_scaling: float
+    ) -> None:
+        super().__init__()
+        self.llm_model = llm_model
+        self.lm_head = lm_head
+        # Stored on the module; forward() below does not read it.
+        self.num_layers = num_layers
+        self.logits_scaling = logits_scaling
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        # Imported at call time rather than at module import.
+        from transformers import DynamicCache
+        # A fresh, empty cache is handed to the model as past_key_values.
+        cache = DynamicCache()
+        out = self.llm_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=True,
+            past_key_values=cache,
+        )
+        # The LM head is applied here, then scaled down by logits_scaling.
+        logits = self.lm_head(out.last_hidden_state) / self.logits_scaling
+        present = out.past_key_values
+        # Flatten to (logits, k0, v0, k1, v1, ...) so the graph has plain
+        # tensor outputs.
+        flat: list[torch.Tensor] = [logits]
+        for layer in present.layers:
+            flat.append(layer.keys)
+            flat.append(layer.values)
+        return tuple(flat)
+class DecodeStepWrapper(nn.Module):
+    """Single-token decode graph.
+    Forward signature (positional):
+        inputs_embeds:  float32 [B, 1, H]
+        position_ids:   int64   [B, 1]
+        attention_mask: float32 [B, 1, 1, T_total] additive 4-D mask
+        past_kv_flat:   2*n_layers tensors, each float32
+                        [B, num_kv_heads, T_past, head_dim], in the order
+                        (past.0.key, past.0.value, past.1.key, ..., past.<L-1>.value)
+    Outputs (one flat tuple):
+        logits, divided by logits_scaling, followed by each cache layer's
+        keys and values in the same key/value-per-layer order as the inputs.
+    Raises:
+        ValueError: if the number of past tensors is not 2 * num_layers.
+    """
+    def __init__(
+        self, llm_model: nn.Module, lm_head: nn.Module, num_layers: int, logits_scaling: float
+    ) -> None:
+        super().__init__()
+        self.llm_model = llm_model
+        self.lm_head = lm_head
+        # Used by forward() to validate and regroup the flat past tensors.
+        self.num_layers = num_layers
+        self.logits_scaling = logits_scaling
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        *past_kv_flat: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        # Imported at call time rather than at module import.
+        from transformers import DynamicCache
+        if len(past_kv_flat) != 2 * self.num_layers:
+            raise ValueError(
+                f"expected {2 * self.num_layers} past_kv tensors, got {len(past_kv_flat)}"
+            )
+        # Regroup the flat (k0, v0, k1, v1, ...) inputs into per-layer pairs.
+        layer_pairs = [
+            (past_kv_flat[2 * i], past_kv_flat[2 * i + 1])
+            for i in range(self.num_layers)
+        ]
+        # Seed the cache with the supplied per-layer (key, value) pairs.
+        cache = DynamicCache(ddp_cache_data=layer_pairs)
+        out = self.llm_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=True,
+            past_key_values=cache,
+        )
+        logits = self.lm_head(out.last_hidden_state) / self.logits_scaling
+        present = out.past_key_values
+        # Flatten back to (logits, k0, v0, k1, v1, ...) for the graph outputs.
+        flat: list[torch.Tensor] = [logits]
+        for layer in present.layers:
+            flat.append(layer.keys)
+            flat.append(layer.values)
+        return tuple(flat)
+# ---------------------------------------------------------------------------
+# Export functions.
+# ---------------------------------------------------------------------------
+def export_encoder(
+    wrapper: EncoderProjectorWrapper,
+    sample_input_features: torch.Tensor,
+    out_path: Path,
+    opset: int = 20,
+    ir_version: int = 10,
+) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"  exporting encoder to {out_path} (opset={opset}, ir_version={ir_version})")
+    dynamic_axes = {
+        "input_features": {0: "B", 1: "T"},
+        "audio_embeds": {0: "B", 1: "T_audio"},
+        "audio_embed_sizes": {0: "B"},
+    }
+    with tempfile.TemporaryDirectory(prefix="speech2b_ar_encoder_onnx_") as scratch_dir:
+        scratch_path = Path(scratch_dir) / "encoder.onnx"
+        t0 = time.time()
+        torch.onnx.export(
+            wrapper,
+            (sample_input_features,),
+            str(scratch_path),
+            input_names=["input_features"],
+            output_names=["audio_embeds", "audio_embed_sizes"],
+            dynamic_axes=dynamic_axes,
+            opset_version=opset,
+            do_constant_folding=True,
+            export_params=True,
+            dynamo=False,
+        )
+        print(f"  stage-1 torch.onnx.export done in {time.time() - t0:.1f}s")
+        _resave_single_sidecar(scratch_path, out_path, ir_version)
+def export_prompt_encode(
+    wrapper: PromptEncodeWrapper,
+    sample_inputs_embeds: torch.Tensor,
+    sample_position_ids: torch.Tensor,
+    sample_attention_mask: torch.Tensor,
+    out_path: Path,
+    num_layers: int,
+    opset: int = 20,
+    ir_version: int = 10,
+) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"  exporting prompt_encode to {out_path} (opset={opset}, ir_version={ir_version})")
+    output_names: list[str] = ["logits"]
+    for i in range(num_layers):
+        output_names.append(f"present.{i}.key")
+        output_names.append(f"present.{i}.value")
+    dynamic_axes: dict[str, dict[int, str]] = {
+        "inputs_embeds": {0: "B", 1: "N"},
+        "position_ids": {0: "B", 1: "N"},
+        "attention_mask": {0: "B", 2: "N", 3: "N"},
+        "logits": {0: "B", 1: "N"},
+    }
+    for i in range(num_layers):
+        dynamic_axes[f"present.{i}.key"] = {0: "B", 2: "N"}
+        dynamic_axes[f"present.{i}.value"] = {0: "B", 2: "N"}
+    with tempfile.TemporaryDirectory(prefix="speech2b_ar_prompt_onnx_") as scratch_dir:
+        scratch_path = Path(scratch_dir) / "prompt_encode.onnx"
+        t0 = time.time()
+        torch.onnx.export(
+            wrapper,
+            (sample_inputs_embeds, sample_position_ids, sample_attention_mask),
+            str(scratch_path),
+            input_names=["inputs_embeds", "position_ids", "attention_mask"],
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            opset_version=opset,
+            do_constant_folding=True,
+            export_params=True,
+            dynamo=False,
+        )
+        print(f"  stage-1 torch.onnx.export done in {time.time() - t0:.1f}s")
+        _resave_single_sidecar(scratch_path, out_path, ir_version)
+def export_decode_step(
+    wrapper: DecodeStepWrapper,
+    sample_inputs_embeds: torch.Tensor,
+    sample_position_ids: torch.Tensor,
+    sample_attention_mask: torch.Tensor,
+    sample_past_kv_flat: tuple[torch.Tensor, ...],
+    out_path: Path,
+    num_layers: int,
+    opset: int = 20,
+    ir_version: int = 10,
+) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"  exporting decode_step to {out_path} (opset={opset}, ir_version={ir_version})")
+    input_names: list[str] = ["inputs_embeds", "position_ids", "attention_mask"]
+    for i in range(num_layers):
+        input_names.append(f"past_key_values.{i}.key")
+        input_names.append(f"past_key_values.{i}.value")
+    output_names: list[str] = ["logits"]
+    for i in range(num_layers):
+        output_names.append(f"present.{i}.key")
+        output_names.append(f"present.{i}.value")
+    dynamic_axes: dict[str, dict[int, str]] = {
+        "inputs_embeds": {0: "B"},
+        "position_ids": {0: "B"},
+        "attention_mask": {0: "B", 3: "T_total"},
+        "logits": {0: "B"},
+    }
+    for i in range(num_layers):
+        dynamic_axes[f"past_key_values.{i}.key"] = {0: "B", 2: "T_past"}
+        dynamic_axes[f"past_key_values.{i}.value"] = {0: "B", 2: "T_past"}
+        dynamic_axes[f"present.{i}.key"] = {0: "B", 2: "T_total"}
+        dynamic_axes[f"present.{i}.value"] = {0: "B", 2: "T_total"}
+    with tempfile.TemporaryDirectory(prefix="speech2b_ar_decode_onnx_") as scratch_dir:
+        scratch_path = Path(scratch_dir) / "decode_step.onnx"
+        args = (sample_inputs_embeds, sample_position_ids, sample_attention_mask, *sample_past_kv_flat)
+        t0 = time.time()
+        torch.onnx.export(
+            wrapper,
+            args,
+            str(scratch_path),
+            input_names=input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            opset_version=opset,
+            do_constant_folding=True,
+            export_params=True,
+            dynamo=False,
+        )
+        print(f"  stage-1 torch.onnx.export done in {time.time() - t0:.1f}s")
+        _resave_single_sidecar(scratch_path, out_path, ir_version)
+# ---------------------------------------------------------------------------
+# Parity helpers.
+# ---------------------------------------------------------------------------
+def encoder_parity(
+    wrapper: EncoderProjectorWrapper,
+    processor: Any,
+    waveform: np.ndarray,
+    onnx_path: Path,
+    abs_tol: float,
+    argmax_only: bool = False,
+) -> dict[str, Any]:
+    import onnxruntime as ort
+    print("\n=== encoder parity ===")
+    inputs = processor(USER_PROMPT_TRANSCRIBE, [waveform], sampling_rate=16000, return_tensors="pt")
+    input_features = inputs["input_features"].to(torch.float32)
+    print(f"  input_features: {tuple(input_features.shape)}")
+    print("  PyTorch wrapper forward")
+    t0 = time.time()
+    with torch.inference_mode():
+        audio_pt, sizes_pt = wrapper(input_features)
+    print(f"    pt: {time.time() - t0:.2f}s   audio_embeds={tuple(audio_pt.shape)}")
+    print(f"  ONNX inference: {onnx_path}")
+    sess = ort.InferenceSession(str(onnx_path), providers=["CPUExecutionProvider"])
+    t0 = time.time()
+    audio_ort, sizes_ort = sess.run(
+        ["audio_embeds", "audio_embed_sizes"],
+        {"input_features": input_features.numpy().astype(np.float32)},
+    )
+    print(f"    ort: {time.time() - t0:.2f}s   audio_embeds={tuple(audio_ort.shape)}")
+    pt_np = audio_pt.detach().float().cpu().numpy()
+    abs_err = np.abs(pt_np - audio_ort)
+    max_err = float(abs_err.max())
+    mean_err = float(abs_err.mean())
+    p99 = float(np.percentile(abs_err, 99))
+    sizes_pt_np = sizes_pt.detach().cpu().numpy().astype(np.int64)
+    sizes_ok = bool(np.array_equal(sizes_pt_np, sizes_ort.astype(np.int64)))
+    if argmax_only:
+        # The encoder's audio_embeds feed into the LLM, where the actual ship
+        # gate (transcript byte-exact, argmax stable) lives. The continuous
+        # audio_embeds delta is informational only in INT8 mode.
+        ok = sizes_ok
+    else:
+        ok = max_err <= abs_tol and sizes_ok
+    print(f"  max_abs_err={max_err:.3e}  mean={mean_err:.3e}  p99={p99:.3e}")
+    print(f"  audio_embed_sizes pt={sizes_pt_np.tolist()}  ort={sizes_ort.tolist()}  match={sizes_ok}")
+    print(f"  encoder parity: {'PASS' if ok else 'FAIL'}{' (argmax-only)' if argmax_only else ''}")
+    return {
+        "ok": ok,
+        "abs_tol": abs_tol,
+        "argmax_only": argmax_only,
+        "max_abs_err": max_err,
+        "mean_abs_err": mean_err,
+        "p99_abs_err": p99,
+        "audio_embeds_shape_pt": list(pt_np.shape),
+        "audio_embeds_shape_ort": list(audio_ort.shape),
+        "audio_embed_sizes_pt": sizes_pt_np.tolist(),
+        "audio_embed_sizes_ort": sizes_ort.tolist(),
+        "audio_embed_sizes_match": sizes_ok,
+        "audio_embeds_stats_pt": tensor_stats(audio_pt),
+        "audio_embeds_stats_ort": tensor_stats(audio_ort),
+    }
+def build_inputs_embeds(model: nn.Module, processor: Any, waveform: np.ndarray) -> tuple[torch.Tensor, torch.Tensor, dict]:
+    """Build the post-splice `inputs_embeds [1, N, 2048]` and `position_ids` for
+    parity, exactly mirroring the PyTorch path:
+      1. Render the chat prompt with `<|audio|>` -> repeated audio token.
+      2. Run encoder + projector to get audio embeds.
+      3. masked_scatter audio embeds into the text embeddings at audio-token positions.
+    """
+    chat = [{"role": "user", "content": USER_PROMPT_TRANSCRIBE}]
+    rendered = processor.tokenizer.apply_chat_template(
+        chat, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(rendered, [waveform], sampling_rate=16000, return_tensors="pt")
+    input_ids = inputs["input_ids"].to(torch.long)
+    input_features = inputs["input_features"].to(torch.float32)
+    input_features_mask = inputs["input_features_mask"]
+    print(f"  prompt token ids shape={tuple(input_ids.shape)}")
+    print(f"  input_features shape={tuple(input_features.shape)}  input_features_mask shape={tuple(input_features_mask.shape)}")
+    with torch.inference_mode():
+        audio_outputs = model.get_audio_features(input_features, return_dict=True)
+        audio_embeds = audio_outputs.pooler_output
+        # The reference uses model.dtype; we forced fp32 at load.
+        inputs_embeds = model.get_merged_audio_embeddings(
+            input_ids=input_ids,
+            audio_features=audio_embeds,
+            input_features_mask=input_features_mask,
+        )
+    inputs_embeds = inputs_embeds.to(torch.float32)
+    N = inputs_embeds.shape[1]
+    position_ids = torch.arange(N, dtype=torch.long).unsqueeze(0).expand(1, N).contiguous()
+    info = {
+        "input_ids_shape": list(input_ids.shape),
+        "audio_embeds_shape": list(audio_embeds.shape),
+        "input_features_mask_shape": list(input_features_mask.shape),
+        "inputs_embeds_shape": list(inputs_embeds.shape),
+        "n_audio_tokens": int((input_ids == model.config.audio_token_id).sum().item()),
+        "input_features_mask_sum": int(input_features_mask.sum().item()),
+    }
+    print(f"  inputs_embeds shape={tuple(inputs_embeds.shape)}  audio_tokens={info['n_audio_tokens']}")
+    return inputs_embeds, position_ids, info
+def llm_parity_e2e(
+    model: nn.Module,
+    processor: Any,
+    waveform: np.ndarray,
+    prompt_onnx: Path,
+    decode_onnx: Path,
+    baseline_json: Path,
+    max_new_tokens: int,
+    abs_tol: float,
+    argmax_only: bool = False,
+) -> dict[str, Any]:
+    """Greedy-decode end-to-end through the ONNX graphs and compare against
+    the captured PyTorch baseline transcript token-for-token.
+    """
+    import onnxruntime as ort
+    print("\n=== prompt_encode + decode_step end-to-end parity ===")
+    inputs_embeds, position_ids, embed_info = build_inputs_embeds(model, processor, waveform)
+    N = inputs_embeds.shape[1]
+    # Build the 4-D additive causal+pad mask Python-side.
+    attn_2d = torch.ones((1, N), dtype=torch.long)
+    attn_4d_prompt = _build_causal_mask_4d(attn_2d, T_past=0, dtype=torch.float32)
+    # Reference: PyTorch logits at the prompt's last position; expected first
+    # generated token == baseline new_token_ids[0].
+    print("  loading PyTorch reference path (lm_head / logits_scaling)")
+    with torch.inference_mode():
+        out = model.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attn_4d_prompt,
+            position_ids=position_ids,
+            use_cache=True,
+            past_key_values=None,
+        )
+        # The language_model is a GraniteForCausalLM; out.logits is already
+        # divided by logits_scaling. Use it as the strict-parity reference.
+        pt_logits = out.logits.detach().float().cpu().numpy()
+        pt_past = out.past_key_values
+    print(f"    pt prompt logits shape={pt_logits.shape}  argmax_last={int(pt_logits[0, -1].argmax())}")
+    # ---- ONNX: prompt_encode ----
+    print(f"  loading ONNX sessions")
+    so = ort.SessionOptions()
+    sess_prompt = ort.InferenceSession(str(prompt_onnx), so, providers=["CPUExecutionProvider"])
+    sess_decode = ort.InferenceSession(str(decode_onnx), so, providers=["CPUExecutionProvider"])
+    num_layers = len(model.language_model.model.layers)
+    feeds_prompt = {
+        "inputs_embeds": inputs_embeds.numpy().astype(np.float32),
+        "position_ids": position_ids.numpy().astype(np.int64),
+        "attention_mask": attn_4d_prompt.numpy().astype(np.float32),
+    }
+    print("  running prompt_encode.onnx")
+    t0 = time.time()
+    prompt_outs = sess_prompt.run(None, feeds_prompt)
+    print(f"    forward: {time.time() - t0:.2f}s")
+    prompt_logits = prompt_outs[0]
+    past_kv_flat = list(prompt_outs[1:])
+    assert len(past_kv_flat) == 2 * num_layers
+    # Compare prompt-stage logits.
+    prompt_diff = np.abs(prompt_logits - pt_logits)
+    prompt_max_err = float(prompt_diff.max())
+    prompt_mean_err = float(prompt_diff.mean())
+    pt_argmax = pt_logits.argmax(-1)
+    ort_argmax = prompt_logits.argmax(-1)
+    prompt_argmax_mismatches = int((pt_argmax != ort_argmax).sum())
+    print(f"    prompt logits max_abs_err={prompt_max_err:.3e}  mean={prompt_mean_err:.3e}  "
+          f"argmax_mismatches={prompt_argmax_mismatches}/{pt_argmax.size}")
+    # First generated token: argmax at the last prompt position (this is what
+    # GenerationMixin's greedy path does).
+    embed_tokens = model.language_model.model.embed_tokens
+    eos_id = int(model.config.text_config.eos_token_id)
+    onnx_new_tokens: list[int] = [int(prompt_logits[0, -1].argmax())]
+    onnx_step_logits: list[np.ndarray] = [prompt_logits[0, -1].astype(np.float32)]
+    print(f"  greedy-decoding up to {max_new_tokens} new tokens through decode_step.onnx")
+    t0 = time.time()
+    for step in range(1, max_new_tokens):
+        prev_tok = onnx_new_tokens[-1]
+        if prev_tok == eos_id:
+            break
+        T_past = N + step - 1
+        T_total = T_past + 1
+        # Build the next inputs_embeds via the model's embed_tokens.
+        prev_id_tensor = torch.tensor([[prev_tok]], dtype=torch.long)
+        with torch.inference_mode():
+            next_embed = embed_tokens(prev_id_tensor).to(torch.float32)
+        # 4-D additive mask of zeros for unmasked positions; padding is irrelevant
+        # because attention_mask_2d is all-ones throughout the decode loop.
+        attn_2d_step = torch.ones((1, T_total), dtype=torch.long)
+        attn_4d_step = _build_causal_mask_4d(attn_2d_step, T_past=T_past, dtype=torch.float32)
+        feeds: dict[str, np.ndarray] = {
+            "inputs_embeds": next_embed.numpy().astype(np.float32),
+            "position_ids": np.array([[T_past]], dtype=np.int64),
+            "attention_mask": attn_4d_step.numpy().astype(np.float32),
+        }
+        for i in range(num_layers):
+            feeds[f"past_key_values.{i}.key"] = past_kv_flat[2 * i]
+            feeds[f"past_key_values.{i}.value"] = past_kv_flat[2 * i + 1]
+        outs = sess_decode.run(None, feeds)
+        step_logits = outs[0]
+        new_past = list(outs[1:])
+        assert len(new_past) == 2 * num_layers
+        past_kv_flat = new_past
+        nt = int(step_logits[0, 0].argmax())
+        onnx_step_logits.append(step_logits[0, 0].astype(np.float32))
+        onnx_new_tokens.append(nt)
+    print(f"    {len(onnx_new_tokens) - 1} decode_step forwards: {time.time() - t0:.2f}s")
+    onnx_transcript = processor.tokenizer.decode(
+        [t for t in onnx_new_tokens if t != eos_id], skip_special_tokens=True
+    )
+    print(f"  onnx new tokens: {onnx_new_tokens}")
+    print(f"  onnx transcript: {onnx_transcript!r}")
+    baseline = json.loads(baseline_json.read_text())
+    baseline_tokens = baseline["new_token_ids"]
+    baseline_transcript = baseline["transcript"]
+    tokens_match = onnx_new_tokens == baseline_tokens
+    transcript_match = onnx_transcript == baseline_transcript
+    print(f"  baseline transcript: {baseline_transcript!r}")
+    print(f"  tokens match: {tokens_match}   transcript match: {transcript_match}")
+    # Per-step parity vs PyTorch reference for the first 5 steps.
+    per_step_compare: list[dict[str, Any]] = []
+    pt_step_logits = None
+    if max_new_tokens >= 1:
+        # Recompute PyTorch reference logits per step via model.generate, to
+        # avoid having to maintain an alternate decode loop here.
+        with torch.inference_mode():
+            chat = [{"role": "user", "content": USER_PROMPT_TRANSCRIBE}]
+            rendered = processor.tokenizer.apply_chat_template(
+                chat, tokenize=False, add_generation_prompt=True
+            )
+            ref_inputs = processor(rendered, [waveform], sampling_rate=16000, return_tensors="pt")
+            gen = model.generate(
+                **ref_inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=False,
+                num_beams=1,
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+            pt_step_logits = [s[0].detach().float().cpu().numpy() for s in gen.scores]
+        n_compare = min(len(pt_step_logits), len(onnx_step_logits))
+        for i in range(n_compare):
+            ref = pt_step_logits[i].astype(np.float32)
+            ours = onnx_step_logits[i].astype(np.float32)
+            d = np.abs(ref - ours)
+            per_step_compare.append({
+                "step": i,
+                "ref_token": int(ref.argmax()),
+                "onnx_token": int(ours.argmax()),
+                "argmax_match": int(ref.argmax()) == int(ours.argmax()),
+                "max_abs_err": float(d.max()),
+                "mean_abs_err": float(d.mean()),
+            })
+    overall_max = max((s["max_abs_err"] for s in per_step_compare), default=0.0)
+    overall_argmax_mm = sum(0 if s["argmax_match"] else 1 for s in per_step_compare)
+    if argmax_only:
+        # INT8 ship gate: end-to-end transcript + decoded token IDs match the
+        # baseline exactly. Prompt-stage and per-step max-abs deltas vs FP32
+        # are recorded for reporting but not blocking.
+        ok = tokens_match and transcript_match
+    else:
+        ok = (
+            tokens_match
+            and transcript_match
+            and prompt_argmax_mismatches == 0
+            and overall_argmax_mm == 0
+        )
+    return {
+        "ok": ok,
+        "abs_tol": abs_tol,
+        "argmax_only": argmax_only,
+        "embed_info": embed_info,
+        "N_prompt": N,
+        "prompt_logits_max_abs_err": prompt_max_err,
+        "prompt_logits_mean_abs_err": prompt_mean_err,
+        "prompt_argmax_mismatches": prompt_argmax_mismatches,
+        "prompt_argmax_total": int(pt_argmax.size),
+        "onnx_new_tokens": onnx_new_tokens,
+        "baseline_new_tokens": baseline_tokens,
+        "tokens_match": tokens_match,
+        "onnx_transcript": onnx_transcript,
+        "baseline_transcript": baseline_transcript,
+        "transcript_match": transcript_match,
+        "per_step_compare": per_step_compare,
+        "overall_max_abs_err_step": overall_max,
+        "overall_argmax_mismatches_step": overall_argmax_mm,
+    }
+# ---------------------------------------------------------------------------
+# Main.
+# ---------------------------------------------------------------------------
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--audio", default=str(DEFAULT_AUDIO))
+    p.add_argument("--baseline", default=str(DEFAULT_BASELINE))
+    p.add_argument("--model-dir", default=str(DEFAULT_MODEL_DIR))
+    p.add_argument("--out-dir", default=str(DEFAULT_OUT_DIR))
+    p.add_argument("--abs-tol", type=float, default=1e-3)
+    p.add_argument(
+        "--stages",
+        default="encoder,prompt,decode",
+        help="comma-separated subset of {encoder, prompt, decode}",
+    )
+    p.add_argument(
+        "--skip-export", action="store_true", help="skip export, run parity on existing files"
+    )
+    p.add_argument("--max-new-tokens", type=int, default=80)
+    p.add_argument(
+        "--graph-suffix",
+        default="",
+        help="suffix appended to graph stems (e.g. '_int8') so parity runs against "
+        "encoder<suffix>.onnx etc. Parity output goes to parity<suffix>.json. "
+        "When set, --skip-export is implied.",
+    )
+    args = p.parse_args()
+    stages = {s.strip() for s in args.stages.split(",") if s.strip()}
+    valid = {"encoder", "prompt", "decode"}
+    bad = stages - valid
+    if bad:
+        raise SystemExit(f"unknown stage(s): {bad}; valid: {sorted(valid)}")
+    out_dir = Path(args.out_dir)
+    suffix = args.graph_suffix
+    if suffix and not args.skip_export:
+        print(f"  --graph-suffix={suffix!r} set; implying --skip-export")
+        args.skip_export = True
+    encoder_path = out_dir / f"encoder{suffix}.onnx"
+    prompt_path = out_dir / f"prompt_encode{suffix}.onnx"
+    decode_path = out_dir / f"decode_step{suffix}.onnx"
+    parity_json = out_dir / f"parity{suffix}.json"
+    print(f"audio:     {args.audio}")
+    print(f"model_dir: {args.model_dir}")
+    print(f"out_dir:   {out_dir}")
+    print(f"stages:    {sorted(stages)}")
+    waveform = load_audio(Path(args.audio))
+    print(f"  duration={waveform.shape[0] / 16000:.2f}s")
+    print("loading model...")
+    model, processor = load_base_model(Path(args.model_dir))
+    print("patching conformer attention for tracing...")
+    patch_conformer_for_tracing(model)
+    print("patching projector for dynamic T_audio tracing...")
+    patch_projector_for_tracing(model)
+    # Useful constants from the loaded config.
+    text_cfg = model.config.text_config
+    num_layers = int(text_cfg.num_hidden_layers)
+    logits_scaling = float(text_cfg.logits_scaling)
+    print(f"  num_layers={num_layers}  logits_scaling={logits_scaling}")
+    print(f"  audio_token_id={model.config.audio_token_id}  hidden_size={text_cfg.hidden_size}")
+    # Sample inputs for tracing.
+    sample_inputs = processor(
+        USER_PROMPT_TRANSCRIBE, [waveform], sampling_rate=16000, return_tensors="pt"
+    )
+    sample_features = sample_inputs["input_features"].to(torch.float32)
+    parity_payload: dict[str, Any] = {
+        "abs_tol": args.abs_tol,
+        "stages_run": sorted(stages),
+        "input_features_shape": list(sample_features.shape),
+    }
+    # ----- Encoder export + parity -----
+    if "encoder" in stages:
+        wrapper = EncoderProjectorWrapper(
+            encoder=model.encoder,
+            projector=model.projector,
+            window_size=int(model.config.window_size),
+            downsample_rate=int(model.config.downsample_rate),
+        ).eval()
+        if not args.skip_export:
+            with torch.inference_mode():
+                export_encoder(
+                    wrapper=wrapper,
+                    sample_input_features=sample_features,
+                    out_path=encoder_path,
+                    opset=20,
+                    ir_version=10,
+                )
+        parity_payload["encoder"] = encoder_parity(
+            wrapper=wrapper,
+            processor=processor,
+            waveform=waveform,
+            onnx_path=encoder_path,
+            abs_tol=args.abs_tol,
+            argmax_only=bool(suffix),
+        )
+    # ----- LLM (prompt + decode) export -----
+    if {"prompt", "decode"} & stages and not args.skip_export:
+        # Build a sample inputs_embeds + position_ids by running encoder + splice.
+        print("\nbuilding sample inputs_embeds for LLM export trace...")
+        sample_embeds, sample_pos_ids, _info = build_inputs_embeds(model, processor, waveform)
+        N = sample_embeds.shape[1]
+        sample_attn_4d = _build_causal_mask_4d(
+            torch.ones((1, N), dtype=torch.long), T_past=0, dtype=torch.float32
+        )
+        if "prompt" in stages:
+            prompt_wrapper = PromptEncodeWrapper(
+                llm_model=model.language_model.model,
+                lm_head=model.language_model.lm_head,
+                num_layers=num_layers,
+                logits_scaling=logits_scaling,
+            ).eval()
+            with torch.inference_mode():
+                export_prompt_encode(
+                    wrapper=prompt_wrapper,
+                    sample_inputs_embeds=sample_embeds,
+                    sample_position_ids=sample_pos_ids,
+                    sample_attention_mask=sample_attn_4d,
+                    out_path=prompt_path,
+                    num_layers=num_layers,
+                    opset=20,
+                    ir_version=10,
+                )
+        if "decode" in stages:
+            # We need a sample past_kv set for the decode_step trace; harvest by
+            # running the prompt wrapper once.
+            prompt_wrapper = PromptEncodeWrapper(
+                llm_model=model.language_model.model,
+                lm_head=model.language_model.lm_head,
+                num_layers=num_layers,
+                logits_scaling=logits_scaling,
+            ).eval()
+            with torch.inference_mode():
+                p_outs = prompt_wrapper(sample_embeds, sample_pos_ids, sample_attn_4d)
+            sample_past_kv_flat = tuple(t.detach().clone() for t in p_outs[1:])
+            assert len(sample_past_kv_flat) == 2 * num_layers
+            embed_tokens = model.language_model.model.embed_tokens
+            with torch.inference_mode():
+                sample_step_embed = (
+                    embed_tokens(torch.tensor([[0]], dtype=torch.long)).to(torch.float32)
+                )
+            sample_step_pos = torch.tensor([[N]], dtype=torch.long)
+            sample_step_attn_2d = torch.ones((1, N + 1), dtype=torch.long)
+            sample_step_attn_4d = _build_causal_mask_4d(
+                sample_step_attn_2d, T_past=N, dtype=torch.float32
+            )
+            decode_wrapper = DecodeStepWrapper(
+                llm_model=model.language_model.model,
+                lm_head=model.language_model.lm_head,
+                num_layers=num_layers,
+                logits_scaling=logits_scaling,
+            ).eval()
+            with torch.inference_mode():
+                export_decode_step(
+                    wrapper=decode_wrapper,
+                    sample_inputs_embeds=sample_step_embed,
+                    sample_position_ids=sample_step_pos,
+                    sample_attention_mask=sample_step_attn_4d,
+                    sample_past_kv_flat=sample_past_kv_flat,
+                    out_path=decode_path,
+                    num_layers=num_layers,
+                    opset=20,
+                    ir_version=10,
+                )
+    # ----- end-to-end LLM parity -----
+    if {"prompt", "decode"} <= stages and prompt_path.exists() and decode_path.exists():
+        parity_payload["llm_e2e"] = llm_parity_e2e(
+            model=model,
+            processor=processor,
+            waveform=waveform,
+            prompt_onnx=prompt_path,
+            decode_onnx=decode_path,
+            baseline_json=Path(args.baseline),
+            max_new_tokens=args.max_new_tokens,
+            abs_tol=args.abs_tol,
+            argmax_only=bool(suffix),
+        )
+    # ----- Per-graph size + int8-vs-fp32 deltas (only when graph-suffix set) -----
+    if suffix:
+        parity_payload["graph_suffix"] = suffix
+        parity_payload["graphs"] = {}
+        for label, p in (
+            ("encoder", encoder_path),
+            ("prompt_encode", prompt_path),
+            ("decode_step", decode_path),
+        ):
+            if not p.exists():
+                continue
+            data = p.with_name(p.name + "_data")
+            entry = {
+                "graph_path": str(p),
+                "graph_size_bytes": int(p.stat().st_size),
+                "sidecar_path": str(data) if data.exists() else None,
+                "int8_size_bytes": int(data.stat().st_size) if data.exists() else None,
+            }
+            fp32 = p.with_name(p.name.replace(suffix, ""))
+            fp32_data = fp32.with_name(fp32.name + "_data")
+            if fp32.exists() and fp32_data.exists():
+                entry["fp32_sidecar_path"] = str(fp32_data)
+                entry["fp32_size_bytes"] = int(fp32_data.stat().st_size)
+                if entry["int8_size_bytes"]:
+                    entry["size_ratio"] = entry["int8_size_bytes"] / entry["fp32_size_bytes"]
+            parity_payload["graphs"][label] = entry
+    # ----- Write parity report -----
+    parity_json.parent.mkdir(parents=True, exist_ok=True)
+    parity_json.write_text(json.dumps(parity_payload, indent=2))
+    print(f"\nwrote parity report -> {parity_json}")
+    # ----- Final summary -----
+    failures = []
+    print("\n--- summary ---")
+    if "encoder" in parity_payload:
+        e = parity_payload["encoder"]
+        print(f"  encoder: {'PASS' if e['ok'] else 'FAIL'}  max_abs_err={e['max_abs_err']:.3e}")
+        if not e["ok"]:
+            failures.append("encoder")
+    if "llm_e2e" in parity_payload:
+        l = parity_payload["llm_e2e"]
+        print(
+            f"  llm_e2e: {'PASS' if l['ok'] else 'FAIL'}  "
+            f"prompt_argmax_mm={l['prompt_argmax_mismatches']}  "
+            f"step_argmax_mm={l['overall_argmax_mismatches_step']}  "
+            f"transcript_match={l['transcript_match']}"
+        )
+        if not l["ok"]:
+            failures.append("llm_e2e")
+    if failures:
+        raise SystemExit(f"failed: {failures}")
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

granite_export_metadata.json ADDED Viewed

	@@ -0,0 +1,454 @@

+{
+  "variant": "base",
+  "upstream": {
+    "repo": "ibm-granite/granite-speech-4.1-2b",
+    "url": "https://huggingface.co/ibm-granite/granite-speech-4.1-2b",
+    "license": "Apache-2.0"
+  },
+  "topology": "encoder + prompt_encode + decode_step (autoregressive)",
+  "graphs": [
+    {
+      "name": "encoder.onnx",
+      "sidecar": "encoder.onnx_data",
+      "precision": "fp32",
+      "size_bytes": 912937,
+      "sidecar_size_bytes": 1903334768,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "input_features",
+          "shape": [
+            "B",
+            "T",
+            160
+          ],
+          "dtype": "float32"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "audio_embeds",
+          "shape": [
+            "B",
+            "T_audio",
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "audio_embed_sizes",
+          "shape": [
+            "B"
+          ],
+          "dtype": "int64"
+        }
+      ]
+    },
+    {
+      "name": "encoder_int8.onnx",
+      "sidecar": "encoder_int8.onnx_data",
+      "precision": "int8-weights-only",
+      "size_bytes": 2608070,
+      "sidecar_size_bytes": 787117424,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "input_features",
+          "shape": [
+            "B",
+            "T",
+            160
+          ],
+          "dtype": "float32"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "audio_embeds",
+          "shape": [
+            "B",
+            "T_audio",
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "audio_embed_sizes",
+          "shape": [
+            "B"
+          ],
+          "dtype": "int64"
+        }
+      ]
+    },
+    {
+      "name": "prompt_encode.onnx",
+      "sidecar": "prompt_encode.onnx_data",
+      "precision": "fp32",
+      "size_bytes": 1844341,
+      "sidecar_size_bytes": 6527008768,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "inputs_embeds",
+          "shape": [
+            "B",
+            "N",
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "position_ids",
+          "shape": [
+            "B",
+            "N"
+          ],
+          "dtype": "int64"
+        },
+        {
+          "name": "attention_mask",
+          "shape": [
+            "B",
+            1,
+            "N",
+            "N"
+          ],
+          "dtype": "float32"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "logits",
+          "shape": [
+            "B",
+            "N",
+            100353
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "present.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "N",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 (key, value) = 80 KV-cache outputs"
+        }
+      ]
+    },
+    {
+      "name": "prompt_encode_int8.onnx",
+      "sidecar": "prompt_encode_int8.onnx_data",
+      "precision": "int8-weights-only",
+      "size_bytes": 6419491,
+      "sidecar_size_bytes": 1632249856,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "inputs_embeds",
+          "shape": [
+            "B",
+            "N",
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "position_ids",
+          "shape": [
+            "B",
+            "N"
+          ],
+          "dtype": "int64"
+        },
+        {
+          "name": "attention_mask",
+          "shape": [
+            "B",
+            1,
+            "N",
+            "N"
+          ],
+          "dtype": "float32"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "logits",
+          "shape": [
+            "B",
+            "N",
+            100353
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "present.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "N",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 (key, value) = 80 KV-cache outputs"
+        }
+      ]
+    },
+    {
+      "name": "decode_step.onnx",
+      "sidecar": "decode_step.onnx_data",
+      "precision": "fp32",
+      "size_bytes": 1849786,
+      "sidecar_size_bytes": 6527008768,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "inputs_embeds",
+          "shape": [
+            "B",
+            1,
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "position_ids",
+          "shape": [
+            "B",
+            1
+          ],
+          "dtype": "int64"
+        },
+        {
+          "name": "attention_mask",
+          "shape": [
+            "B",
+            1,
+            1,
+            "T_total"
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "past_key_values.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "T_past",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 = 80 KV-cache inputs"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "logits",
+          "shape": [
+            "B",
+            1,
+            100353
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "present.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "T_total",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 = 80 KV-cache outputs"
+        }
+      ]
+    },
+    {
+      "name": "decode_step_int8.onnx",
+      "sidecar": "decode_step_int8.onnx_data",
+      "precision": "int8-weights-only",
+      "size_bytes": 6426226,
+      "sidecar_size_bytes": 1632249856,
+      "opset": 20,
+      "ir_version": 10,
+      "ai_onnx_only": true,
+      "inputs": [
+        {
+          "name": "inputs_embeds",
+          "shape": [
+            "B",
+            1,
+            2048
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "position_ids",
+          "shape": [
+            "B",
+            1
+          ],
+          "dtype": "int64"
+        },
+        {
+          "name": "attention_mask",
+          "shape": [
+            "B",
+            1,
+            1,
+            "T_total"
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "past_key_values.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "T_past",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 = 80 KV-cache inputs"
+        }
+      ],
+      "outputs": [
+        {
+          "name": "logits",
+          "shape": [
+            "B",
+            1,
+            100353
+          ],
+          "dtype": "float32"
+        },
+        {
+          "name": "present.{i}.{key,value}",
+          "shape": [
+            "B",
+            4,
+            "T_total",
+            128
+          ],
+          "dtype": "float32",
+          "note": "40 layers x 2 = 80 KV-cache outputs"
+        }
+      ]
+    }
+  ],
+  "parity": {
+    "fp32": {
+      "encoder": {
+        "argmax_only": false,
+        "max_abs_err": 4.481524229049683e-06,
+        "mean_abs_err": 1.243776637238625e-07,
+        "p99_abs_err": 6.463378667831421e-07,
+        "audio_embed_sizes_match": true,
+        "input_features_shape": [
+          1,
+          844,
+          160
+        ],
+        "audio_embeds_shape": [
+          1,
+          171,
+          2048
+        ]
+      },
+      "llm_e2e": {
+        "argmax_only": false,
+        "prompt_argmax_mismatches": 0,
+        "prompt_argmax_total": 190,
+        "prompt_logits_max_abs_err": 0.000364,
+        "decode_steps": 51,
+        "decode_argmax_mismatches": 0,
+        "decode_max_abs_err_step": null,
+        "tokens_match": true,
+        "transcript_match": true,
+        "source_note": "from task-11 record (parity.json was overwritten by a later encoder-only re-run; see dev-plan.md)"
+      }
+    },
+    "int8": {
+      "encoder": {
+        "argmax_only": true,
+        "max_abs_err": 0.16911625862121582,
+        "mean_abs_err": 0.010853650979697704,
+        "p99_abs_err": 0.044730618596076965
+      },
+      "llm_e2e": {
+        "argmax_only": true,
+        "prompt_argmax_mismatches": 58,
+        "prompt_argmax_total": 190,
+        "prompt_logits_max_abs_err": 10.136197090148926,
+        "prompt_logits_mean_abs_err": 0.778969943523407,
+        "decode_steps": 51,
+        "decode_argmax_mismatches": 0,
+        "decode_max_abs_err_step": 5.762608528137207,
+        "tokens_match": true,
+        "transcript_match": true
+      }
+    }
+  },
+  "multi_clip_parity": {
+    "rows": [
+      {
+        "name": "is-it-more-wood",
+        "duration_s": 46.9,
+        "fp32_byte_exact_vs_pt": true,
+        "int8_byte_exact_vs_pt": false,
+        "int8_wer_vs_pt": 0.0144,
+        "int8_vs_fp32_lev": 2,
+        "fp32_transcript": "Well, hello, Sam. Guess who? Yeah, it's Robert Clotworthy, the narrator of your favorite television show, \"The Curse of Oak Island.\" Yes, I'm the. Is it possible? Could it be? And what else do we say in Oak Island? A couple of words. They're not coming to me. Oh, yeah. More wood. But let's not forget. It is an island named after a tree. Well, here's the question. Why am I reaching out to you? Is it possible that I'm reaching out to you because it's your birthday? Could it be that Emma let the cat out of the bag? Well, the answer to those questions is yes. And she said, well, she contacted me. She said, Robert, you know, Sam is an amazing boyfriend. In fact, she used the word great. She said he is a great boyfriend.",
+        "int8_transcript": "Well, hello, Sam. Guess who? Yeah, it's Robert Clotworthy, the narrator of your favorite television show, \"The Curse of Oak Island.\" Yes, I'm the. Is it possible? Could it be? And what else do we say in Oak Island? A couple of words. They're not coming to me. Oh, yeah. More wood. But let's not forget. It is an island named after a tree. Well, here's the question. Why am I reaching out to you? Is it possible that I'm reaching out to you because it's your birthday? Could it be that Emma let the cat out of the bag? Well, the answer to those questions is yes. And she said, well, she contacted me. She said, Robert. You know, Sam is an amazing boyfriend. In fact, she used the word great. She said he is a great boyfriend.",
+        "pt_transcript": "Well, hello, Sam. Guess who? Yeah, it's Robert Clotworthy, the narrator of your favorite television show, \"The Curse of Oak Island.\" Yes, I'm the. Is it possible? Could it be? And what else do we say in Oak Island? A couple of words. They're not coming to me. Oh, yeah. More wood. But let's not forget. It is an island named after a tree. Well, here's the question. Why am I reaching out to you? Is it possible that I'm reaching out to you because it's your birthday? Could it be that Emma let the cat out of the bag? Well, the answer to those questions is yes. And she said, well, she contacted me. She said, Robert, you know, Sam is an amazing boyfriend. In fact, she used the word great. She said he is a great boyfriend."
+      },
+      {
+        "name": "two-speakers-1",
+        "duration_s": 93.8,
+        "fp32_byte_exact_vs_pt": true,
+        "int8_byte_exact_vs_pt": false,
+        "int8_wer_vs_pt": 0.0104,
+        "int8_vs_fp32_lev": 12,
+        "fp32_transcript": "Today it is a true honor to speak with Demis Asavis, who is the CEO of DeepMind. Demis, welcome to the podcast. Thanks for having me. First question, given your neuroscience background, how do you think about intelligence? Specifically, do you think it's like one higher level general reasoning circuit, or do you think it's thousands of independent subskills and heuristics? Well, it's interesting because intelligence is so broad and, you know, what we use it for is so sort of generally applicable. I think that suggests that, you know, there must be some sort of high-level common things in, you know, common kind of algorithmic themes, I think, around how the brain processes the world around us. So, of course, then there are specialized parts of the brain that do specific things, but I think there are probably some underlying principles that underpin all of that. Yeah. How do you make sense of the fact that in these LLMs, though, when you give them a lot of data in any specific domain, they tend to get asymmetrically better in that domain? Wouldn't we expect a sort of like general improvement across all the different areas? Well, I think you, first of all, I think you do actually sometimes get surprising improvement in other domains when you improve in a specific domain. So, for example, when these large models sort of improve at coding, that can actually improve their general reasoning. So there is some evidence of some transfer, although I think we would like a lot more evidence of that. But also, you know, that's how the human brain learns, too, is if we experience and practice a lot of things like chess or, you know, writing.",
+        "int8_transcript": "Today it is a true honor to speak with Demis Savas, who is the CEO of DeepMind. Demis, welcome to the podcast. Thanks for having me. First question, given your neuroscience background, how do you think about intelligence? Specifically, do you think it's like one higher level general reasoning circuit, or do you think it's thousands of independent subskills and heuristics? Well, it's interesting because intelligence is so broad and, you know, what we use it for is so sort of generally applicable. I think that suggests that, you know, there must be some sort of high-level common things in, you know, common kind of algorithmic themes, I think, around how the brain processes the world around us. So, of course, then there are specialized parts of the brain that do specific things, but I think there are probably some underlying principles that underpin all of that. Yeah. How do you make sense of the fact that in these LLMs, though, when you give them a lot of data in any specific domain, they tend to get asymmetrically better in that domain? Wouldn't we expect a sort of like general improvement across all the, all the different areas? Well, I think you, first of all, I think you do actually sometimes get surprising improvement in other domains when you improve in a specific domain. So, for example, when these large models sort of improve at coding, that can actually improve their general reasoning. So there is some evidence of some transfer, although I think we would like a lot more evidence of that. But also, you know, that's how the human brain learns, too, is if we experience and practice a lot of things like chess or, you know, writing.",
+        "pt_transcript": "Today it is a true honor to speak with Demis Asavis, who is the CEO of DeepMind. Demis, welcome to the podcast. Thanks for having me. First question, given your neuroscience background, how do you think about intelligence? Specifically, do you think it's like one higher level general reasoning circuit, or do you think it's thousands of independent subskills and heuristics? Well, it's interesting because intelligence is so broad and, you know, what we use it for is so sort of generally applicable. I think that suggests that, you know, there must be some sort of high-level common things in, you know, common kind of algorithmic themes, I think, around how the brain processes the world around us. So, of course, then there are specialized parts of the brain that do specific things, but I think there are probably some underlying principles that underpin all of that. Yeah. How do you make sense of the fact that in these LLMs, though, when you give them a lot of data in any specific domain, they tend to get asymmetrically better in that domain? Wouldn't we expect a sort of like general improvement across all the different areas? Well, I think you, first of all, I think you do actually sometimes get surprising improvement in other domains when you improve in a specific domain. So, for example, when these large models sort of improve at coding, that can actually improve their general reasoning. So there is some evidence of some transfer, although I think we would like a lot more evidence of that. But also, you know, that's how the human brain learns, too, is if we experience and practice a lot of things like chess or, you know, writing."
+      },
+      {
+        "name": "two-speakers-2",
+        "duration_s": 38.8,
+        "fp32_byte_exact_vs_pt": true,
+        "int8_byte_exact_vs_pt": false,
+        "int8_wer_vs_pt": 0.2347,
+        "int8_vs_fp32_lev": 26,
+        "fp32_transcript": "For the first time ever, we may have things more intelligent than us. You believe they can understand. Yes. You believe they are intelligent. Yes. You believe these systems have experiences of their own and can make decisions based on those experiences. In the same sense as people do, yes. Are they conscious? I think they probably don't have much self-awareness at present. So in that sense, I don't think they're conscious. Will they have self-awareness? Oh, yes. I think they will in time. And so human beings will be the second most intelligent beings on the planet.",
+        "int8_transcript": "for the first time ever we may have things more intelligent than us. You believe they can understand yes you believe they are intelligent yes you believe these systems have experiences of their own and can make decisions based on those experiences in the same sense as people do yes are they conscious I think they probably don't have much self-awareness at present so in that sense I don't think they're conscious. will they have self-awareness oh yes I think they will in time and so human beings will be the second most intelligent beings on the planet.",
+        "pt_transcript": "For the first time ever, we may have things more intelligent than us. You believe they can understand. Yes. You believe they are intelligent. Yes. You believe these systems have experiences of their own and can make decisions based on those experiences. In the same sense as people do, yes. Are they conscious? I think they probably don't have much self-awareness at present. So in that sense, I don't think they're conscious. Will they have self-awareness? Oh, yes. I think they will in time. And so human beings will be the second most intelligent beings on the planet."
+      }
+    ]
+  },
+  "toolchain": {
+    "transformers": "5.8.0",
+    "torch": "2.11.0",
+    "onnx": "1.21.0",
+    "onnxruntime": "1.25.1",
+    "exporter": "torch.onnx.export TorchScript path (dynamo=False)"
+  },
+  "ort_compatibility": "ort 2.0-rc.x (Rust crate); validated against onnxruntime 1.17 - 1.25",
+  "audio_token_id": 100352
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "feature_extractor_type": "GraniteSpeechFeatureExtractor",
+  "melspec_kwargs": {
+    "hop_length": 160,
+    "n_fft": 512,
+    "n_mels": 80,
+    "sample_rate": 16000,
+    "win_length": 400
+  },
+  "processor_class": "GraniteSpeechProcessor",
+  "projector_downsample_rate": 5,
+  "projector_window_size": 15,
+  "sampling_rate": 16000
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "audio_token": "<|audio|>",
+  "processor_class": "GraniteSpeechProcessor"
+}

prompt_encode.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59adb1f8dca67a1e910a74adf94852cb4cd85fded0e0ca65d522d97779073b07
+size 1844341

prompt_encode.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87d924ecd71746694f43e653c9366827a9444ab9407e976f5cd9cc9dbde97608
+size 6527008768

prompt_encode_int8.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3942d5a402302e5505dee6d4c79cd80f1487546656f70106f4384dbc8cd82982
+size 6419491

prompt_encode_int8.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70652d6a31cbae2d57c7e8cefb665f6c1ee503e495d191b951fff09ddb7f8608
+size 1632249856

quantise.py ADDED Viewed

	@@ -0,0 +1,299 @@

+# Copyright 2026 Sam McLeod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dynamic INT8 (weights-only) quantiser for the Granite Speech 4.1 ONNX
+exports.
+Wraps `onnxruntime.quantization.quantize_dynamic` with the conventions used
+by the Granite Speech ONNX bundles:
+  - Single external-data sidecar per graph (mirrors the FP32 export layout).
+  - Pure `ai.onnx` opset 20 / IR 10. The default operator set is restricted
+    to `MatMul` so the dynamic quantiser emits `MatMulInteger` (standard
+    `ai.onnx`) rather than the `com.microsoft.Attention` /
+    `com.microsoft.EmbedLayerNormalization` quantised variants. Override at
+    your own risk - those domain ops are forbidden by the parakeet-rs
+    consumer contract.
+  - `per_channel=True` and `weight_type=QInt8` by default (better accuracy
+    on the LLM weight tensors with no measurable speed cost on
+    arm64 / x86 CPU EP).
+The script is self-contained (no project-internal imports) so it ships
+inside each Hugging Face bundle alongside the export script.
+Usage:
+    python quantise.py --input PATH --output PATH \\
+        [--per-channel | --no-per-channel] \\
+        [--reduce-range] \\
+        [--weight-type qint8|quint8] \\
+        [--op-types MatMul,Gemm] \\
+        [--exclude-pattern REGEX] \\
+        [--exclude-nodes NODE1,NODE2]
+Examples:
+    # Quantise the NAR editor with defaults.
+    python quantise.py \\
+        --input  exports/granite-speech-4.1-2b-nar/editor.onnx \\
+        --output exports/granite-speech-4.1-2b-nar/editor_int8.onnx
+    # Skip the lm_head MatMul if it hurts parity.
+    python quantise.py \\
+        --input  exports/granite-speech-4.1-2b-nar/editor.onnx \\
+        --output exports/granite-speech-4.1-2b-nar/editor_int8.onnx \\
+        --exclude-nodes /lm_head/MatMul
+"""
+from __future__ import annotations
+import argparse
+import re
+import sys
+import tempfile
+import time
+from pathlib import Path
+import onnx
+from onnxruntime.quantization import QuantType, quantize_dynamic
+# Maps each accepted --weight-type CLI value to the onnxruntime QuantType
+# member that is passed to quantize_dynamic as `weight_type`.
+WEIGHT_TYPE_MAP = {
+    "qint8": QuantType.QInt8,
+    "quint8": QuantType.QUInt8,
+}
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Dynamic INT8 (weights-only) ONNX quantiser for Granite Speech 4.1 graphs.",
+    )
+    p.add_argument(
+        "--input",
+        required=True,
+        type=Path,
+        help="Path to the FP32 .onnx graph (external sidecar must sit alongside it).",
+    )
+    p.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Destination .onnx path. A single sidecar named <output>_data is written next to it.",
+    )
+    p.add_argument(
+        "--per-channel",
+        dest="per_channel",
+        action="store_true",
+        default=True,
+        help="Quantise weights per output channel (default: on).",
+    )
+    p.add_argument(
+        "--no-per-channel",
+        dest="per_channel",
+        action="store_false",
+        help="Disable per-channel quantisation.",
+    )
+    p.add_argument(
+        "--reduce-range",
+        action="store_true",
+        default=False,
+        help="Quantise to 7 bits instead of 8. Improves accuracy on non-VNNI hardware "
+        "but reduces the quantisation gain. Off by default.",
+    )
+    p.add_argument(
+        "--weight-type",
+        choices=sorted(WEIGHT_TYPE_MAP.keys()),
+        default="qint8",
+        help="Weight quantisation dtype (default: qint8).",
+    )
+    p.add_argument(
+        "--op-types",
+        default="MatMul",
+        help=(
+            "Comma-separated op types to quantise. Default: 'MatMul' (emits "
+            "MatMulInteger only, all ai.onnx). Adding 'Conv' enables ConvInteger "
+            "for the Conformer encoder's depthwise convolutions; this shrinks the "
+            "encoder INT8 sidecar by ~40 percent but on this model family feeds "
+            "enough weight-quant noise into the LLM head that it flips "
+            "capitalisation and drops sentence-final punctuation on short clips - "
+            "see task 17 in dev-plan.md. MatMul-only is the validated default. "
+            "Adding 'Attention' or 'EmbedLayerNormalization' would introduce "
+            "com.microsoft domain ops, which are forbidden by the parakeet-rs "
+            "contract."
+        ),
+    )
+    p.add_argument(
+        "--exclude-pattern",
+        default=None,
+        help="Regex applied to ONNX node names. Matching nodes are excluded from "
+        "quantisation. Useful for skipping e.g. lm_head if its quantisation "
+        "breaks parity.",
+    )
+    p.add_argument(
+        "--exclude-nodes",
+        default="",
+        help="Explicit comma-separated list of node names to exclude from quantisation.",
+    )
+    p.add_argument(
+        "--ir-version",
+        type=int,
+        default=10,
+        help="ONNX IR version to write (default: 10, matches the FP32 exports).",
+    )
+    return p.parse_args(argv)
+def collect_excluded_nodes(
+    input_path: Path,
+    exclude_pattern: str | None,
+    exclude_nodes: list[str],
+) -> list[str]:
+    """Resolve --exclude-pattern against the FP32 graph's node names and merge
+    with the explicit --exclude-nodes list. Loaded without external data so we
+    only touch the small graph proto.
+    """
+    excluded = set(n for n in exclude_nodes if n)
+    if exclude_pattern:
+        rx = re.compile(exclude_pattern)
+        proto = onnx.load(str(input_path), load_external_data=False)
+        for node in proto.graph.node:
+            if node.name and rx.search(node.name):
+                excluded.add(node.name)
+    return sorted(excluded)
+def assert_pure_ai_onnx(model_path: Path) -> list[str]:
+    """Reload the produced graph and verify only `ai.onnx` nodes are present.
+    Returns the sorted list of domains for reporting.
+    """
+    proto = onnx.load(str(model_path), load_external_data=False)
+    domains = sorted({(n.domain or "ai.onnx") for n in proto.graph.node})
+    forbidden = [d for d in domains if d not in ("ai.onnx", "")]
+    if forbidden:
+        raise RuntimeError(
+            f"Quantised graph contains forbidden op domains {forbidden}. "
+            "Re-run with a narrower --op-types list."
+        )
+    return domains
+def consolidate_single_sidecar(
+    quantised_in: Path,
+    final_out: Path,
+    ir_version: int,
+) -> None:
+    """The dynamic quantiser may scatter weights across multiple external-data
+    files. Reload + resave through a tempdir to land on the single-sidecar
+    layout that matches the FP32 exports.
+    """
+    print("  consolidating to single .onnx_data sidecar")
+    proto = onnx.load(str(quantised_in), load_external_data=True)
+    if proto.ir_version < ir_version:
+        proto.ir_version = ir_version
+    for tensor in proto.graph.initializer:
+        tensor.ClearField("data_location")
+        tensor.ClearField("external_data")
+    sidecar_name = final_out.name + "_data"
+    if (final_out.parent / sidecar_name).exists():
+        (final_out.parent / sidecar_name).unlink()
+    if final_out.exists():
+        final_out.unlink()
+    final_out.parent.mkdir(parents=True, exist_ok=True)
+    onnx.save_model(
+        proto,
+        str(final_out),
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location=sidecar_name,
+        size_threshold=1024,
+        convert_attribute=False,
+    )
+    onnx.checker.check_model(str(final_out), full_check=False)
+def quantise_graph(args: argparse.Namespace) -> None:
+    input_path: Path = args.input.resolve()
+    output_path: Path = args.output.resolve()
+    if not input_path.exists():
+        raise SystemExit(f"input not found: {input_path}")
+    op_types = [s.strip() for s in args.op_types.split(",") if s.strip()]
+    explicit_excludes = [s.strip() for s in args.exclude_nodes.split(",") if s.strip()]
+    excluded = collect_excluded_nodes(input_path, args.exclude_pattern, explicit_excludes)
+    weight_type = WEIGHT_TYPE_MAP[args.weight_type]
+    print(f"input:        {input_path}")
+    print(f"output:       {output_path}")
+    print(f"op_types:     {op_types}")
+    print(f"per_channel:  {args.per_channel}")
+    print(f"reduce_range: {args.reduce_range}")
+    print(f"weight_type:  {args.weight_type}")
+    if excluded:
+        print(f"excluded nodes ({len(excluded)}): {excluded}")
+    else:
+        print("excluded nodes: (none)")
+    fp32_size = input_path.stat().st_size
+    sidecar = input_path.with_name(input_path.name + "_data")
+    fp32_data_size = sidecar.stat().st_size if sidecar.exists() else 0
+    print(
+        f"  fp32 graph={fp32_size / 1e6:.2f} MB  "
+        f"sidecar={fp32_data_size / 1e9:.2f} GB"
+    )
+    with tempfile.TemporaryDirectory(prefix="quantise_int8_") as scratch_dir:
+        scratch_path = Path(scratch_dir) / output_path.name
+        t0 = time.time()
+        quantize_dynamic(
+            model_input=input_path,
+            model_output=scratch_path,
+            op_types_to_quantize=op_types,
+            per_channel=args.per_channel,
+            reduce_range=args.reduce_range,
+            weight_type=weight_type,
+            nodes_to_exclude=excluded or None,
+            use_external_data_format=True,
+        )
+        print(f"  quantize_dynamic done in {time.time() - t0:.1f}s")
+        # Stage 2: consolidate any scattered external-data files into a single
+        # sidecar at the final destination.
+        consolidate_single_sidecar(scratch_path, output_path, args.ir_version)
+    # Verify pure ai.onnx after the move.
+    domains = assert_pure_ai_onnx(output_path)
+    int8_size = output_path.stat().st_size
+    int8_data = output_path.with_name(output_path.name + "_data")
+    int8_data_size = int8_data.stat().st_size if int8_data.exists() else 0
+    print(
+        f"  saved {output_path} (+ {int8_data.name})  "
+        f"graph={int8_size / 1e6:.2f} MB  sidecar={int8_data_size / 1e9:.2f} GB"
+    )
+    print(f"  node-domains={domains}")
+    if fp32_data_size > 0:
+        ratio = int8_data_size / fp32_data_size
+        print(f"  sidecar size ratio (int8 / fp32) = {ratio:.3f}")
+def main(argv: list[str] | None = None) -> None:
+    args = parse_args(argv)
+    try:
+        quantise_graph(args)
+    except RuntimeError as exc:
+        print(f"FAIL: {exc}", file=sys.stderr)
+        raise SystemExit(2) from exc
+if __name__ == "__main__":
+    main()

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|unk|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,792 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "100256": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100257": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100258": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100259": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100260": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100261": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100262": {
+      "content": "<|filename|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100263": {
+      "content": "<|reponame|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100264": {
+      "content": "<|start_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100265": {
+      "content": "<|end_of_role|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100266": {
+      "content": "<|unused_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100267": {
+      "content": "<|start_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100268": {
+      "content": "<|end_of_plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100269": {
+      "content": "<|unk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100270": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100271": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100272": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100273": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100274": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100275": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "100276": {
+      "content": "<think_on>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100277": {
+      "content": "<think_off>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100278": {
+      "content": "<schema>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100279": {
+      "content": "</schema>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100280": {
+      "content": "<tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100281": {
+      "content": "</tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100282": {
+      "content": "<documents>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100283": {
+      "content": "</documents>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100284": {
+      "content": "<|unused_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100285": {
+      "content": "<|unused_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100286": {
+      "content": "<|unused_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100287": {
+      "content": "<|unused_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100288": {
+      "content": "<|unused_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100289": {
+      "content": "<|unused_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100290": {
+      "content": "<|unused_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100291": {
+      "content": "<|unused_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100292": {
+      "content": "<|unused_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100293": {
+      "content": "<|unused_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100294": {
+      "content": "<|unused_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100295": {
+      "content": "<|unused_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100296": {
+      "content": "<|unused_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100297": {
+      "content": "<|unused_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100298": {
+      "content": "<|unused_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100299": {
+      "content": "<|unused_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100300": {
+      "content": "<|unused_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100301": {
+      "content": "<|unused_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100302": {
+      "content": "<|unused_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100303": {
+      "content": "<|unused_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100304": {
+      "content": "<|unused_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100305": {
+      "content": "<|unused_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100306": {
+      "content": "<|unused_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100307": {
+      "content": "<|unused_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100308": {
+      "content": "<|unused_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100309": {
+      "content": "<|unused_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100310": {
+      "content": "<|unused_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100311": {
+      "content": "<|unused_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100312": {
+      "content": "<|unused_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100313": {
+      "content": "<|unused_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100314": {
+      "content": "<|unused_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100315": {
+      "content": "<|unused_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100316": {
+      "content": "<|unused_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100317": {
+      "content": "<|unused_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100318": {
+      "content": "<|unused_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100319": {
+      "content": "<|unused_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100320": {
+      "content": "<|unused_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100321": {
+      "content": "<|unused_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100322": {
+      "content": "<|unused_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100323": {
+      "content": "<|unused_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100324": {
+      "content": "<|unused_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100325": {
+      "content": "<|unused_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100326": {
+      "content": "<|unused_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100327": {
+      "content": "<|unused_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100328": {
+      "content": "<|unused_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100329": {
+      "content": "<|unused_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100330": {
+      "content": "<|unused_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100331": {
+      "content": "<|unused_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100332": {
+      "content": "<|unused_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100333": {
+      "content": "<|unused_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100334": {
+      "content": "<|unused_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100335": {
+      "content": "<|unused_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100336": {
+      "content": "<|unused_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100337": {
+      "content": "<|unused_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100338": {
+      "content": "<|unused_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100339": {
+      "content": "<|unused_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100340": {
+      "content": "<|unused_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100341": {
+      "content": "<|unused_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100342": {
+      "content": "<|unused_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100343": {
+      "content": "<|unused_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100344": {
+      "content": "<|unused_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100345": {
+      "content": "<|unused_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100346": {
+      "content": "<|unused_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100347": {
+      "content": "<|unused_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100348": {
+      "content": "<|unused_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100349": {
+      "content": "<|unused_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100350": {
+      "content": "<|unused_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100351": {
+      "content": "<|unused_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100352": {
+      "content": "<|audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|end_of_text|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|end_of_text|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "padding_side": "left",
+  "processor_class": "GraniteSpeechProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|unk|>"
+}