"""End-to-end ONNX optimization pipeline for Prince-1/OmniVoice. OmniVoice backbone consists of three ONNX sub-models: audio_embeddings_encoder — text + audio embedding fusion llm_decoder — Qwen3-28 layer backbone (inputs_embeds → hidden_states) audio_heads_decoder — linear projection to per-codebook audio logits The Higgs Audio V2 Tokenizer (acoustic/semantic encoder, quantizer, decoder) requires the `boson-multimodal` package. Export it separately: cd python convert_omnivoice_to_onnx.py --only higgs --out-dir ./higgs_onnx Pipeline dataflow (32 iterative unmasking steps): (input_ids, audio_mask) → audio_embeddings_encoder → inputs_embeds (inputs_embeds, mask) → llm_decoder → hidden_states hidden_states → audio_heads_decoder → logits (B,8,S,1025) Usage: python optimize.py --device cpu # CPU INT4 backbone (cpu_and_mobile/) python optimize.py --device cpu_fp16 # CPU FP16 backbone (cpu_fp16/) python optimize.py --device gpu # CUDA FP16 backbone (cuda/) python optimize.py --include-higgs # also export Higgs Audio Tokenizer python optimize.py --higgs-only # export only Higgs tokenizer python optimize.py --skip-export # regenerate configs only python optimize.py --skip-llm # skip ModelBuilder step (slow) Profiles: cpu → INT4 weights for all sub-models. Smallest footprint. cpu_fp16 → FP16 weights for all sub-models. Better accuracy than INT4 on CPUs with AVX-512 FP16 support (Intel Sapphire Rapids+, AMD Zen5+). LLM uses ModelBuilder fp16; audio sub-models use OnnxFloatToFloat16. gpu → FP16 audio sub-models + INT4 LLM via CUDAExecutionProvider. """ import argparse import json import logging import sys from pathlib import Path logging.getLogger("onnxscript").setLevel(logging.WARNING) logging.getLogger("onnx_ir").setLevel(logging.WARNING) MODEL_NAME = "Prince-1/OmniVoice" MODELS_DIR = "models" HIDDEN_SIZE = 1024 NUM_CODEBOOKS = 8 AUDIO_VOCAB = 1025 # ============================================================================= # Step 0: Save Qwen3 standalone (required before ModelBuilder runs) # ============================================================================= def prepare_qwen3_standalone(model_path: str, work_dir: str) -> str: """Save OmniVoice's internal Qwen3 LLM as a standalone HF directory. ModelBuilder (onnxruntime-genai create_model) needs a standard HF model directory with architectures=["Qwen3ForCausalLM"] to export the LLM. Returns the path to the saved directory. """ qwen3_dir = str(Path(work_dir) / "qwen3_standalone") if Path(qwen3_dir).exists() and (Path(qwen3_dir) / "model.safetensors").exists(): print(f" Reusing existing {qwen3_dir}") return qwen3_dir print(f" Saving Qwen3 standalone to {qwen3_dir} ...") sys.path.insert(0, str(Path(__file__).parent)) from user_script import save_qwen3_standalone return save_qwen3_standalone(model_path, work_dir) # ============================================================================= # Step 1: Olive Export + Optimization # ============================================================================= def export_models(config_dir: str, llm_config_path: str = None): """Run Olive on all three backbone sub-models.""" from olive import run config_path = Path(config_dir) print(f"=== Running Olive pipelines (configs from {config_path}) ===") # audio_embeddings_encoder + audio_heads_decoder (PyTorchModel → ONNX → quantize) for config in ("audio_embeddings_encoder.json", "audio_heads_decoder.json"): print(f" Running {config}...") run(str(config_path / config)) # llm_decoder (HfModel → ModelBuilder with exclude_embeds + exclude_lm_head) llm_cfg = llm_config_path or str(config_path / "llm_decoder.json") print(f" Running {Path(llm_cfg).name}...") run(llm_cfg) print() # ============================================================================= # Step 1b: Higgs Audio Tokenizer export # ============================================================================= def export_higgs(higgs_config_dir: str = "higgs"): """Run Olive on all 4 Higgs Audio V2 Tokenizer sub-models. Configs are in higgs/ and all use FP16 (best balance of quality + speed for audio codec operations — INT4 is too lossy for DAC encoder/decoder). Both DAC models (acoustic_encoder, higgs_decoder) are pre-traced inside user_script.py before Olive sees them, resolving Python control-flow branches that would otherwise crash torch.onnx.export. """ from olive import run config_path = Path(higgs_config_dir) print(f"=== Running Higgs Audio Tokenizer pipelines (configs from {config_path}) ===") for config in ( "acoustic_encoder.json", "semantic_encoder.json", "quantizer_encoder.json", "higgs_decoder.json", ): print(f" Running {config}...") run(str(config_path / config)) print() # ============================================================================= # Step 2: Generate inference manifest # ============================================================================= def write_inference_manifest(output_dir: str, device: str = "cpu"): """Write omnivoice_manifest.json describing all ONNX sub-model paths. The inference script reads this to locate each sub-model at runtime. This is analogous to the genai_config.json used in VL pipelines. """ manifest_path = Path(output_dir) / "omnivoice_manifest.json" if device == "gpu": provider = "CUDAExecutionProvider" else: provider = "CPUExecutionProvider" # both cpu and cpu_fp16 use CPU EP precision_map = {"cpu": "int4", "cpu_fp16": "fp16", "gpu": "fp16+int4_llm"} manifest = { "model_id": MODEL_NAME, "execution_provider": provider, "precision": precision_map.get(device, "int4"), "backbone": { "audio_embeddings_encoder": { "filename": "audio_embeddings_encoder.onnx", "description": "Fuses text + audio codec token embeddings → inputs_embeds", "inputs": { "input_ids": "int64 (batch, num_codebooks=8, seq)", "audio_mask": "bool (batch, seq)" }, "outputs": { "inputs_embeds": "float32 (batch, seq, hidden=1024)" }, }, "llm_decoder": { "filename": "llm_decoder.onnx", "description": "Qwen3 28-layer backbone (inputs_embeds → hidden_states)", "note": "Exported with exclude_embeds=True, exclude_lm_head=True", "inputs": { "inputs_embeds": "float32 (batch, seq, hidden=1024)", "attention_mask": "int64 (batch, seq)", "position_ids": "int64 (batch, seq)", "past_key_values": "float32 per layer — pass empty (shape [B,heads,0,head_dim]) for full-sequence forward" }, "outputs": { "hidden_states": "float32 (batch, seq, hidden=1024)" }, "hidden_size": 1024, "num_layers": 28, "num_attn_heads": 16, "num_kv_heads": 8, "head_dim": 128, }, "audio_heads_decoder": { "filename": "audio_heads_decoder.onnx", "description": "Projects hidden_states to per-codebook audio-token logits", "inputs": { "hidden_states": "float32 (batch, seq, hidden=1024)" }, "outputs": { "logits": "float32 (batch, num_codebooks=8, seq, audio_vocab=1025)" }, "num_codebooks": 8, "audio_vocab_size": 1025, "audio_mask_id": 1024, }, }, "higgs_tokenizer": { "note": "Exported via optimize.py --include-higgs (or --higgs-only).", "models_dir": "higgs/models/", "sub_models": { "acoustic_encoder": "acoustic_encoder.onnx", "semantic_encoder": "semantic_encoder.onnx", "quantizer_encoder": "quantizer_encoder.onnx", "higgs_decoder": "higgs_decoder.onnx", }, "sample_rate_acoustic": 24000, "sample_rate_semantic": 16000, "downsample_factor": 320, "num_codebooks": 8, "codebook_size": 1024, "pipeline": [ "acoustic_encoder(waveform_24k) → acoustic_features", "semantic_encoder(waveform_16k) → semantic_features", "quantizer_encoder(acoustic_features, semantic_features) → codes", " --- TTS inference: codes → audio_embeddings_encoder → ... → audio_codes ---", "higgs_decoder(audio_codes) → waveform_24k", ], "loading_priority": [ "transformers.HiggsAudioV2TokenizerModel (transformers>=5.3.0)", "boson_multimodal.load_higgs_audio_tokenizer (pip install boson-multimodal @ git+...)", ], }, "iterative_decoding": { "note": "OmniVoice uses 32-step iterative unmasking (non-autoregressive).", "steps": 32, "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2], "per_step_pipeline": [ "audio_embeddings_encoder(input_ids, audio_mask) → inputs_embeds", "llm_decoder(inputs_embeds, attention_mask, position_ids) → hidden_states", "audio_heads_decoder(hidden_states) → logits", "sample audio tokens from logits using codebook weights", "unmask predicted positions in input_ids", ], }, } with open(manifest_path, "w") as f: json.dump(manifest, f, indent=2) print(f" Wrote {manifest_path}") # ============================================================================= # Main # ============================================================================= def main(): parser = argparse.ArgumentParser( description="Optimize Prince-1/OmniVoice backbone to ONNX" ) parser.add_argument("--device", choices=["cpu", "cpu_fp16", "gpu"], default="cpu", help="Target device/precision (default: cpu)\n" " cpu → INT4 weights, CPUExecutionProvider (smallest, slowest)\n" " cpu_fp16 → FP16 weights, CPUExecutionProvider (balanced; best on AVX-512 FP16 CPUs)\n" " gpu → FP16 audio + INT4 LLM, CUDAExecutionProvider (fastest)") parser.add_argument("--config-dir", default=None, help="Directory with Olive JSON configs (default: auto from --device)") parser.add_argument("--skip-export", action="store_true", help="Skip Olive export (models already exist)") parser.add_argument("--skip-llm", action="store_true", help="Skip LLM (ModelBuilder) export — it takes several minutes") parser.add_argument("--include-higgs", action="store_true", help="Also export Higgs Audio V2 Tokenizer (4 sub-models in higgs/)") parser.add_argument("--higgs-only", action="store_true", help="Export ONLY the Higgs tokenizer; skip backbone") parser.add_argument("--models-dir", default=None, help="Models output directory") parser.add_argument("--model-path", default=MODEL_NAME, help=f"HF model ID or local path (default: {MODEL_NAME})") args = parser.parse_args() _default_dirs = {"cpu": "cpu_and_mobile", "cpu_fp16": "cpu_fp16", "gpu": "cuda"} config_dir = args.config_dir or _default_dirs[args.device] models_dir = args.models_dir or str(Path(config_dir) / MODELS_DIR) Path(models_dir).mkdir(parents=True, exist_ok=True) print(f"Target device : {args.device}") print(f"Config dir : {config_dir}") print(f"Models dir : {models_dir}") print(f"Model : {args.model_path}") print() run_backbone = not args.higgs_only run_higgs = args.include_higgs or args.higgs_only if run_backbone and not args.skip_export: # Step 0: Save Qwen3 standalone so ModelBuilder can find it print("=== Step 0: Preparing Qwen3 standalone ===") qwen3_dir = prepare_qwen3_standalone(args.model_path, config_dir) print(f" Qwen3 standalone: {qwen3_dir}\n") # Patch llm_decoder.json to point to the saved standalone llm_json = Path(config_dir) / "llm_decoder.json" if llm_json.exists(): with open(llm_json) as f: llm_cfg = json.load(f) llm_cfg["input_model"]["model_path"] = qwen3_dir with open(llm_json, "w") as f: json.dump(llm_cfg, f, indent=4) print(f" Patched {llm_json} → model_path={qwen3_dir}") # Step 1: Backbone sub-models print("=== Step 1: Backbone Olive export + optimization ===") if args.skip_llm: print(" Skipping LLM (--skip-llm).") from olive import run for config in ("audio_embeddings_encoder.json", "audio_heads_decoder.json"): print(f" Running {config}...") run(str(Path(config_dir) / config)) else: export_models(config_dir) if run_higgs and not args.skip_export: # Step 1b: Higgs Audio V2 Tokenizer print("=== Step 1b: Higgs Audio Tokenizer export ===") export_higgs("higgs") # Step 2: Write inference manifest print("=== Step 2: Writing inference manifest ===") write_inference_manifest(output_dir=models_dir, device=args.device) print() print("Done.") if not run_higgs: print() print("TIP: To also export the Higgs Audio Tokenizer (needed for voice cloning):") print(" python optimize.py --include-higgs") if __name__ == "__main__": main()