# OmniVoice-Onnx / optimize.py
# (File-viewer residue: uploaded by Prince-1 with the upload-large-folder tool,
# commit 2ad50ac, 14.5 kB.)
"""End-to-end ONNX optimization pipeline for Prince-1/OmniVoice.
The OmniVoice backbone consists of three ONNX sub-models:
  audio_embeddings_encoder — text + audio embedding fusion
  llm_decoder              — Qwen3 28-layer backbone (inputs_embeds → hidden_states)
  audio_heads_decoder      — linear projection to per-codebook audio logits
The Higgs Audio V2 Tokenizer (acoustic/semantic encoder, quantizer, decoder)
requires the `boson-multimodal` package. Export it separately:
cd <omnivoice_model_dir>
python convert_omnivoice_to_onnx.py --only higgs --out-dir ./higgs_onnx
Pipeline dataflow (32 iterative unmasking steps):
(input_ids, audio_mask) → audio_embeddings_encoder → inputs_embeds
(inputs_embeds, mask) → llm_decoder → hidden_states
hidden_states → audio_heads_decoder → logits (B,8,S,1025)
Usage:
python optimize.py --device cpu # CPU INT4 backbone (cpu_and_mobile/)
python optimize.py --device cpu_fp16 # CPU FP16 backbone (cpu_fp16/)
python optimize.py --device gpu # CUDA FP16 backbone (cuda/)
python optimize.py --include-higgs # also export Higgs Audio Tokenizer
python optimize.py --higgs-only # export only Higgs tokenizer
python optimize.py --skip-export # regenerate configs only
python optimize.py --skip-llm # skip ModelBuilder step (slow)
Profiles:
cpu → INT4 weights for all sub-models. Smallest footprint.
cpu_fp16 → FP16 weights for all sub-models. Better accuracy than INT4 on CPUs
with AVX-512 FP16 support (Intel Sapphire Rapids+, AMD Zen5+).
LLM uses ModelBuilder fp16; audio sub-models use OnnxFloatToFloat16.
gpu → FP16 audio sub-models + INT4 LLM via CUDAExecutionProvider.
"""
import argparse
import json
import logging
import sys
from pathlib import Path
from typing import Optional, Sequence
# Quieten the exporter libraries: only warnings and above get through.
for _noisy_logger in ("onnxscript", "onnx_ir"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Model ID used as the default --model-path and recorded in the manifest.
MODEL_NAME = "Prince-1/OmniVoice"
# Sub-directory of the config directory that receives the exported models.
MODELS_DIR = "models"
# Backbone dimensions; they mirror the shapes described in the inference manifest.
HIDDEN_SIZE = 1024
NUM_CODEBOOKS = 8
AUDIO_VOCAB = 1025
# =============================================================================
# Step 0: Save Qwen3 standalone (required before ModelBuilder runs)
# =============================================================================
def prepare_qwen3_standalone(model_path: str, work_dir: str) -> str:
    """Save OmniVoice's internal Qwen3 LLM as a standalone HF directory.

    ModelBuilder (onnxruntime-genai create_model) needs a standard HF model
    directory with architectures=["Qwen3ForCausalLM"] to export the LLM.

    Args:
        model_path: HF model ID or local path of the OmniVoice checkpoint;
            forwarded unchanged to ``user_script.save_qwen3_standalone``.
        work_dir: Directory under which ``qwen3_standalone/`` is looked up;
            also forwarded to ``save_qwen3_standalone``.

    Returns:
        ``<work_dir>/qwen3_standalone`` when an existing export is reused,
        otherwise whatever ``save_qwen3_standalone`` returns.
    """
    qwen3_dir = Path(work_dir) / "qwen3_standalone"

    # The weights file marks a finished export; its existence implies the
    # directory exists too, so a single check is enough.
    if (qwen3_dir / "model.safetensors").exists():
        print(f" Reusing existing {qwen3_dir}")
        return str(qwen3_dir)

    print(f" Saving Qwen3 standalone to {qwen3_dir} ...")

    # user_script.py lives next to this file. Resolve to an absolute path so the
    # import does not depend on the current working directory, and avoid piling
    # up duplicate sys.path entries when this function is called repeatedly.
    script_dir = str(Path(__file__).resolve().parent)
    if script_dir not in sys.path:
        sys.path.insert(0, script_dir)
    from user_script import save_qwen3_standalone

    return save_qwen3_standalone(model_path, work_dir)
# =============================================================================
# Step 1: Olive Export + Optimization
# =============================================================================
def export_models(config_dir: str, llm_config_path: Optional[str] = None):
    """Run Olive on all three backbone sub-models.

    Args:
        config_dir: Directory holding ``audio_embeddings_encoder.json``,
            ``audio_heads_decoder.json`` and (by default) ``llm_decoder.json``.
        llm_config_path: Optional override for the LLM decoder config; when
            omitted, ``<config_dir>/llm_decoder.json`` is used.

    Raises:
        FileNotFoundError: If any required config file is missing. Raised
            before Olive is imported or any pipeline starts, so a typo does
            not surface only after earlier exports have already run.
    """
    config_path = Path(config_dir)

    # audio_embeddings_encoder + audio_heads_decoder (PyTorchModel → ONNX → quantize)
    audio_cfgs = [
        config_path / name
        for name in ("audio_embeddings_encoder.json", "audio_heads_decoder.json")
    ]
    # llm_decoder (HfModel → ModelBuilder with exclude_embeds + exclude_lm_head)
    llm_cfg = Path(llm_config_path) if llm_config_path else config_path / "llm_decoder.json"

    missing = [str(cfg) for cfg in (*audio_cfgs, llm_cfg) if not cfg.is_file()]
    if missing:
        raise FileNotFoundError("Olive config(s) not found: " + ", ".join(missing))

    # Imported lazily: Olive is heavy and only needed once inputs are validated.
    from olive import run

    print(f"=== Running Olive pipelines (configs from {config_path}) ===")
    for cfg in audio_cfgs:
        print(f" Running {cfg.name}...")
        run(str(cfg))

    print(f" Running {llm_cfg.name}...")
    run(str(llm_cfg))
    print()
# =============================================================================
# Step 1b: Higgs Audio Tokenizer export
# =============================================================================
def export_higgs(higgs_config_dir: str = "higgs"):
    """Run Olive on all 4 Higgs Audio V2 Tokenizer sub-models.

    Configs are in higgs/ and all use FP16 (best balance of quality + speed
    for audio codec operations — INT4 is too lossy for DAC encoder/decoder).
    Both DAC models (acoustic_encoder, higgs_decoder) are pre-traced inside
    user_script.py before Olive sees them, resolving Python control-flow branches
    that would otherwise crash torch.onnx.export.

    Args:
        higgs_config_dir: Directory containing the four Olive JSON configs.

    Raises:
        FileNotFoundError: If any of the four configs is missing. Raised before
            Olive is imported or any pipeline starts, so a missing file does
            not surface only after earlier exports have already run.
    """
    config_path = Path(higgs_config_dir)
    configs = [
        config_path / name
        for name in (
            "acoustic_encoder.json",
            "semantic_encoder.json",
            "quantizer_encoder.json",
            "higgs_decoder.json",
        )
    ]

    missing = [str(cfg) for cfg in configs if not cfg.is_file()]
    if missing:
        raise FileNotFoundError("Higgs Olive config(s) not found: " + ", ".join(missing))

    # Imported lazily: Olive is heavy and only needed once inputs are validated.
    from olive import run

    print(f"=== Running Higgs Audio Tokenizer pipelines (configs from {config_path}) ===")
    for cfg in configs:
        print(f" Running {cfg.name}...")
        run(str(cfg))
    print()
# =============================================================================
# Step 2: Generate inference manifest
# =============================================================================
def write_inference_manifest(output_dir: str, device: str = "cpu"):
    """Write omnivoice_manifest.json describing all ONNX sub-model paths.

    The inference script reads this to locate each sub-model at runtime.
    This is analogous to the genai_config.json used in VL pipelines.

    Args:
        output_dir: Directory that receives ``omnivoice_manifest.json``; it is
            created (including parents) if it does not exist yet.
        device: Export profile. ``"gpu"`` selects CUDAExecutionProvider; every
            other value selects CPUExecutionProvider. Profiles missing from
            the precision table are recorded as ``"int4"``.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = out_dir / "omnivoice_manifest.json"

    if device == "gpu":
        provider = "CUDAExecutionProvider"
    else:
        provider = "CPUExecutionProvider"  # both cpu and cpu_fp16 use CPU EP
    precision_map = {"cpu": "int4", "cpu_fp16": "fp16", "gpu": "fp16+int4_llm"}

    # Shapes are derived from the module-level constants so the manifest cannot
    # drift from HIDDEN_SIZE / NUM_CODEBOOKS / AUDIO_VOCAB.
    hidden_desc = f"float32 (batch, seq, hidden={HIDDEN_SIZE})"

    manifest = {
        "model_id": MODEL_NAME,
        "execution_provider": provider,
        "precision": precision_map.get(device, "int4"),
        "backbone": {
            "audio_embeddings_encoder": {
                "filename": "audio_embeddings_encoder.onnx",
                "description": "Fuses text + audio codec token embeddings → inputs_embeds",
                "inputs": {
                    "input_ids": f"int64 (batch, num_codebooks={NUM_CODEBOOKS}, seq)",
                    "audio_mask": "bool (batch, seq)",
                },
                "outputs": {
                    "inputs_embeds": hidden_desc,
                },
            },
            "llm_decoder": {
                "filename": "llm_decoder.onnx",
                "description": "Qwen3 28-layer backbone (inputs_embeds → hidden_states)",
                "note": "Exported with exclude_embeds=True, exclude_lm_head=True",
                "inputs": {
                    "inputs_embeds": hidden_desc,
                    "attention_mask": "int64 (batch, seq)",
                    "position_ids": "int64 (batch, seq)",
                    "past_key_values": "float32 per layer — pass empty (shape [B,heads,0,head_dim]) for full-sequence forward",
                },
                "outputs": {
                    "hidden_states": hidden_desc,
                },
                "hidden_size": HIDDEN_SIZE,
                "num_layers": 28,
                "num_attn_heads": 16,
                "num_kv_heads": 8,
                "head_dim": 128,
            },
            "audio_heads_decoder": {
                "filename": "audio_heads_decoder.onnx",
                "description": "Projects hidden_states to per-codebook audio-token logits",
                "inputs": {
                    "hidden_states": hidden_desc,
                },
                "outputs": {
                    "logits": f"float32 (batch, num_codebooks={NUM_CODEBOOKS}, seq, audio_vocab={AUDIO_VOCAB})",
                },
                "num_codebooks": NUM_CODEBOOKS,
                "audio_vocab_size": AUDIO_VOCAB,
                "audio_mask_id": 1024,
            },
        },
        # Tokenizer figures are kept as literals: they describe the separately
        # exported Higgs models rather than the backbone constants above.
        "higgs_tokenizer": {
            "note": "Exported via optimize.py --include-higgs (or --higgs-only).",
            "models_dir": "higgs/models/",
            "sub_models": {
                "acoustic_encoder": "acoustic_encoder.onnx",
                "semantic_encoder": "semantic_encoder.onnx",
                "quantizer_encoder": "quantizer_encoder.onnx",
                "higgs_decoder": "higgs_decoder.onnx",
            },
            "sample_rate_acoustic": 24000,
            "sample_rate_semantic": 16000,
            "downsample_factor": 320,
            "num_codebooks": 8,
            "codebook_size": 1024,
            "pipeline": [
                "acoustic_encoder(waveform_24k) → acoustic_features",
                "semantic_encoder(waveform_16k) → semantic_features",
                "quantizer_encoder(acoustic_features, semantic_features) → codes",
                " --- TTS inference: codes → audio_embeddings_encoder → ... → audio_codes ---",
                "higgs_decoder(audio_codes) → waveform_24k",
            ],
            "loading_priority": [
                "transformers.HiggsAudioV2TokenizerModel (transformers>=5.3.0)",
                "boson_multimodal.load_higgs_audio_tokenizer (pip install boson-multimodal @ git+...)",
            ],
        },
        "iterative_decoding": {
            "note": "OmniVoice uses 32-step iterative unmasking (non-autoregressive).",
            "steps": 32,
            "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
            "per_step_pipeline": [
                "audio_embeddings_encoder(input_ids, audio_mask) → inputs_embeds",
                "llm_decoder(inputs_embeds, attention_mask, position_ids) → hidden_states",
                "audio_heads_decoder(hidden_states) → logits",
                "sample audio tokens from logits using codebook weights",
                "unmask predicted positions in input_ids",
            ],
        },
    }

    # json.dump escapes non-ASCII by default; the explicit encoding keeps the
    # file independent of the platform's locale either way.
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f" Wrote {manifest_path}")
# =============================================================================
# Main
# =============================================================================
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the optimization pipeline."""
    parser = argparse.ArgumentParser(
        description="Optimize Prince-1/OmniVoice backbone to ONNX",
        # The --device help is a hand-formatted multi-line table; the default
        # formatter would re-flow it into a single paragraph.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("--device", choices=["cpu", "cpu_fp16", "gpu"], default="cpu",
                        help="Target device/precision (default: cpu)\n"
                             " cpu → INT4 weights, CPUExecutionProvider (smallest, slowest)\n"
                             " cpu_fp16 → FP16 weights, CPUExecutionProvider (balanced; best on AVX-512 FP16 CPUs)\n"
                             " gpu → FP16 audio + INT4 LLM, CUDAExecutionProvider (fastest)")
    parser.add_argument("--config-dir", default=None,
                        help="Directory with Olive JSON configs (default: auto from --device)")
    parser.add_argument("--skip-export", action="store_true",
                        help="Skip Olive export (models already exist)")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM (ModelBuilder) export — it takes several minutes")
    parser.add_argument("--include-higgs", action="store_true",
                        help="Also export Higgs Audio V2 Tokenizer (4 sub-models in higgs/)")
    parser.add_argument("--higgs-only", action="store_true",
                        help="Export ONLY the Higgs tokenizer; skip backbone")
    parser.add_argument("--models-dir", default=None,
                        help="Models output directory")
    parser.add_argument("--model-path", default=MODEL_NAME,
                        help=f"HF model ID or local path (default: {MODEL_NAME})")
    return parser


def _patch_llm_config(config_dir: str, qwen3_dir: str) -> None:
    """Point llm_decoder.json in *config_dir* at the Qwen3 standalone directory.

    Does nothing when the config file does not exist.
    """
    llm_json = Path(config_dir) / "llm_decoder.json"
    if not llm_json.exists():
        return
    with open(llm_json, encoding="utf-8") as f:
        llm_cfg = json.load(f)
    llm_cfg["input_model"]["model_path"] = qwen3_dir
    with open(llm_json, "w", encoding="utf-8") as f:
        json.dump(llm_cfg, f, indent=4)
    print(f" Patched {llm_json} → model_path={qwen3_dir}")


def main(argv: Optional[Sequence[str]] = None):
    """Command-line entry point: export sub-models, then write the manifest.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) keeps the normal command-line behaviour.

    Steps, each gated by the flags:
        0.  Save the Qwen3 standalone directory and patch ``llm_decoder.json``
            (skipped with ``--skip-llm``, since only the LLM export needs it).
        1.  Backbone Olive export (audio sub-models, plus the LLM unless
            ``--skip-llm``).
        1b. Higgs tokenizer export (``--include-higgs`` / ``--higgs-only``).
        2.  Write ``omnivoice_manifest.json`` into the models directory.
    ``--skip-export`` bypasses steps 0–1b and only regenerates the manifest.
    """
    args = _build_arg_parser().parse_args(argv)

    default_dirs = {"cpu": "cpu_and_mobile", "cpu_fp16": "cpu_fp16", "gpu": "cuda"}
    config_dir = args.config_dir or default_dirs[args.device]
    models_dir = args.models_dir or str(Path(config_dir) / MODELS_DIR)
    Path(models_dir).mkdir(parents=True, exist_ok=True)

    print(f"Target device : {args.device}")
    print(f"Config dir : {config_dir}")
    print(f"Models dir : {models_dir}")
    print(f"Model : {args.model_path}")
    print()

    run_backbone = not args.higgs_only
    run_higgs = args.include_higgs or args.higgs_only

    if run_backbone and not args.skip_export:
        if args.skip_llm:
            # Step 0 exists only to feed ModelBuilder, so it is skipped together
            # with the LLM export instead of saving the standalone model for nothing.
            print("=== Step 1: Backbone Olive export + optimization ===")
            print(" Skipping LLM (--skip-llm).")
            from olive import run
            for config in ("audio_embeddings_encoder.json", "audio_heads_decoder.json"):
                print(f" Running {config}...")
                run(str(Path(config_dir) / config))
        else:
            # Step 0: Save Qwen3 standalone so ModelBuilder can find it
            print("=== Step 0: Preparing Qwen3 standalone ===")
            qwen3_dir = prepare_qwen3_standalone(args.model_path, config_dir)
            print(f" Qwen3 standalone: {qwen3_dir}\n")
            # Patch llm_decoder.json to point to the saved standalone
            _patch_llm_config(config_dir, qwen3_dir)

            # Step 1: Backbone sub-models
            print("=== Step 1: Backbone Olive export + optimization ===")
            export_models(config_dir)

    if run_higgs and not args.skip_export:
        # Step 1b: Higgs Audio V2 Tokenizer
        print("=== Step 1b: Higgs Audio Tokenizer export ===")
        export_higgs("higgs")

    # Step 2: Write inference manifest
    print("=== Step 2: Writing inference manifest ===")
    write_inference_manifest(output_dir=models_dir, device=args.device)
    print()
    print("Done.")

    if not run_higgs:
        print()
        print("TIP: To also export the Higgs Audio Tokenizer (needed for voice cloning):")
        print(" python optimize.py --include-higgs")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()