Add files using upload-large-folder tool

2ad50ac verified 15 days ago

14.5 kB

	"""End-to-end ONNX optimization pipeline for Prince-1/OmniVoice.

	OmniVoice backbone consists of three ONNX sub-models:

	audio_embeddings_encoder — text + audio embedding fusion
	llm_decoder — Qwen3 28-layer backbone (inputs_embeds → hidden_states)
	audio_heads_decoder — linear projection to per-codebook audio logits

	The Higgs Audio V2 Tokenizer (acoustic/semantic encoder, quantizer, decoder)
	requires the `boson-multimodal` package. Export it separately:
	cd <omnivoice_model_dir>
	python convert_omnivoice_to_onnx.py --only higgs --out-dir ./higgs_onnx

	Pipeline dataflow (32 iterative unmasking steps):
	(input_ids, audio_mask) → audio_embeddings_encoder → inputs_embeds
	(inputs_embeds, mask) → llm_decoder → hidden_states
	hidden_states → audio_heads_decoder → logits (B,8,S,1025)

	Usage:
	python optimize.py --device cpu # CPU INT4 backbone (cpu_and_mobile/)
	python optimize.py --device cpu_fp16 # CPU FP16 backbone (cpu_fp16/)
	python optimize.py --device gpu # CUDA FP16 backbone (cuda/)
	python optimize.py --include-higgs # also export Higgs Audio Tokenizer
	python optimize.py --higgs-only # export only Higgs tokenizer
	python optimize.py --skip-export # regenerate configs only
	python optimize.py --skip-llm # skip ModelBuilder step (slow)

	Profiles:
	cpu → INT4 weights for all sub-models. Smallest footprint.
	cpu_fp16 → FP16 weights for all sub-models. Better accuracy than INT4 on CPUs
	with AVX-512 FP16 support (Intel Sapphire Rapids+, AMD Zen5+).
	LLM uses ModelBuilder fp16; audio sub-models use OnnxFloatToFloat16.
	gpu → FP16 audio sub-models + INT4 LLM via CUDAExecutionProvider.
	"""
	import argparse
	import json
	import logging
	import sys
	from pathlib import Path

	# Quiet the chatty exporter loggers: only warnings and above are shown.
	for _noisy_logger in ("onnxscript", "onnx_ir"):
	    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

	# Identity of the model being optimized and the name of the per-profile
	# output sub-directory.
	MODEL_NAME = "Prince-1/OmniVoice"
	MODELS_DIR = "models"

	# Model dimensions (same values as written into the inference manifest).
	HIDDEN_SIZE = 1024
	NUM_CODEBOOKS = 8
	AUDIO_VOCAB = 1025


	# =============================================================================
	# Step 0: Save Qwen3 standalone (required before ModelBuilder runs)
	# =============================================================================

	def prepare_qwen3_standalone(model_path: str, work_dir: str) -> str:
	    """Materialize OmniVoice's inner Qwen3 LLM as a standalone HF directory.

	    ModelBuilder (onnxruntime-genai create_model) expects a regular HF model
	    directory with architectures=["Qwen3ForCausalLM"], so the LLM has to be
	    saved on its own before the export can run.

	    The directory ``<work_dir>/qwen3_standalone`` is reused when it already
	    contains ``model.safetensors``; otherwise the work is delegated to
	    ``user_script.save_qwen3_standalone``, imported lazily from the directory
	    this script lives in.

	    Args:
	        model_path: Model identifier forwarded to ``save_qwen3_standalone``.
	        work_dir: Directory under which ``qwen3_standalone`` is looked up.

	    Returns:
	        The cached directory path on reuse, otherwise whatever
	        ``save_qwen3_standalone`` returns.
	    """
	    standalone = Path(work_dir) / "qwen3_standalone"
	    weights = standalone / "model.safetensors"

	    if not (standalone.exists() and weights.exists()):
	        print(f" Saving Qwen3 standalone to {standalone} ...")
	        # user_script.py sits next to this file; make it importable first.
	        script_dir = Path(__file__).parent
	        sys.path.insert(0, str(script_dir))
	        from user_script import save_qwen3_standalone
	        return save_qwen3_standalone(model_path, work_dir)

	    print(f" Reusing existing {standalone}")
	    return str(standalone)


	# =============================================================================
	# Step 1: Olive Export + Optimization
	# =============================================================================

	def export_models(config_dir: str, llm_config_path: str = None):
	    """Run the Olive workflows for the three backbone sub-models.

	    The two audio configs are executed first, followed by the LLM decoder
	    config.

	    Args:
	        config_dir: Directory holding the Olive JSON configs.
	        llm_config_path: Optional explicit path to the LLM config; when falsy,
	            ``<config_dir>/llm_decoder.json`` is used.
	    """
	    from olive import run as olive_run

	    cfg_root = Path(config_dir)
	    print(f"=== Running Olive pipelines (configs from {cfg_root}) ===")

	    # audio_embeddings_encoder + audio_heads_decoder (PyTorchModel → ONNX → quantize)
	    audio_configs = ["audio_embeddings_encoder.json", "audio_heads_decoder.json"]
	    for cfg_name in audio_configs:
	        print(f" Running {cfg_name}...")
	        olive_run(str(cfg_root / cfg_name))

	    # llm_decoder (HfModel → ModelBuilder with exclude_embeds + exclude_lm_head)
	    if llm_config_path:
	        llm_cfg = llm_config_path
	    else:
	        llm_cfg = str(cfg_root / "llm_decoder.json")
	    print(f" Running {Path(llm_cfg).name}...")
	    olive_run(llm_cfg)
	    print()


	# =============================================================================
	# Step 1b: Higgs Audio Tokenizer export
	# =============================================================================

	def export_higgs(higgs_config_dir: str = "higgs"):
	    """Run the Olive workflows for the four Higgs Audio V2 Tokenizer sub-models.

	    The configs live in ``higgs/`` and all target FP16 (best balance of
	    quality and speed for audio codec operations — INT4 is too lossy for the
	    DAC encoder/decoder). Both DAC models (acoustic_encoder, higgs_decoder)
	    are pre-traced inside user_script.py before Olive sees them, which
	    resolves Python control-flow branches that would otherwise crash
	    torch.onnx.export.

	    Args:
	        higgs_config_dir: Directory containing the four Higgs Olive configs.
	    """
	    from olive import run as olive_run

	    cfg_root = Path(higgs_config_dir)
	    print(f"=== Running Higgs Audio Tokenizer pipelines (configs from {cfg_root}) ===")

	    # Executed in this fixed order, one Olive workflow per sub-model.
	    sub_model_stems = ["acoustic_encoder", "semantic_encoder", "quantizer_encoder", "higgs_decoder"]
	    for cfg_name in [stem + ".json" for stem in sub_model_stems]:
	        print(f" Running {cfg_name}...")
	        olive_run(str(cfg_root / cfg_name))
	    print()


	# =============================================================================
	# Step 2: Generate inference manifest
	# =============================================================================

	def write_inference_manifest(output_dir: str, device: str = "cpu"):
	    """Write omnivoice_manifest.json describing all ONNX sub-model paths.

	    The inference script reads this to locate each sub-model at runtime.
	    This is analogous to the genai_config.json used in VL pipelines.

	    Args:
	        output_dir: Directory that receives ``omnivoice_manifest.json``. It is
	            created (including parents) when it does not exist yet.
	        device: Target profile. ``"gpu"`` selects CUDAExecutionProvider; any
	            other value selects CPUExecutionProvider. The recorded precision
	            comes from a per-profile table and falls back to ``"int4"`` for
	            unknown profiles.
	    """
	    out_dir = Path(output_dir)
	    # Do not rely on the caller having created the directory beforehand.
	    out_dir.mkdir(parents=True, exist_ok=True)
	    manifest_path = out_dir / "omnivoice_manifest.json"

	    if device == "gpu":
	        provider = "CUDAExecutionProvider"
	    else:
	        provider = "CPUExecutionProvider"  # both cpu and cpu_fp16 use CPU EP

	    precision_map = {"cpu": "int4", "cpu_fp16": "fp16", "gpu": "fp16+int4_llm"}

	    manifest = {
	        "model_id": MODEL_NAME,
	        "execution_provider": provider,
	        "precision": precision_map.get(device, "int4"),
	        "backbone": {
	            "audio_embeddings_encoder": {
	                "filename": "audio_embeddings_encoder.onnx",
	                "description": "Fuses text + audio codec token embeddings → inputs_embeds",
	                "inputs": {
	                    "input_ids": "int64 (batch, num_codebooks=8, seq)",
	                    "audio_mask": "bool (batch, seq)",
	                },
	                "outputs": {
	                    "inputs_embeds": "float32 (batch, seq, hidden=1024)",
	                },
	            },
	            "llm_decoder": {
	                "filename": "llm_decoder.onnx",
	                "description": "Qwen3 28-layer backbone (inputs_embeds → hidden_states)",
	                "note": "Exported with exclude_embeds=True, exclude_lm_head=True",
	                "inputs": {
	                    "inputs_embeds": "float32 (batch, seq, hidden=1024)",
	                    "attention_mask": "int64 (batch, seq)",
	                    "position_ids": "int64 (batch, seq)",
	                    "past_key_values": "float32 per layer — pass empty (shape [B,heads,0,head_dim]) for full-sequence forward",
	                },
	                "outputs": {
	                    "hidden_states": "float32 (batch, seq, hidden=1024)",
	                },
	                "hidden_size": 1024,
	                "num_layers": 28,
	                "num_attn_heads": 16,
	                "num_kv_heads": 8,
	                "head_dim": 128,
	            },
	            "audio_heads_decoder": {
	                "filename": "audio_heads_decoder.onnx",
	                "description": "Projects hidden_states to per-codebook audio-token logits",
	                "inputs": {
	                    "hidden_states": "float32 (batch, seq, hidden=1024)",
	                },
	                "outputs": {
	                    "logits": "float32 (batch, num_codebooks=8, seq, audio_vocab=1025)",
	                },
	                "num_codebooks": 8,
	                "audio_vocab_size": 1025,
	                "audio_mask_id": 1024,
	            },
	        },
	        "higgs_tokenizer": {
	            "note": "Exported via optimize.py --include-higgs (or --higgs-only).",
	            "models_dir": "higgs/models/",
	            "sub_models": {
	                "acoustic_encoder": "acoustic_encoder.onnx",
	                "semantic_encoder": "semantic_encoder.onnx",
	                "quantizer_encoder": "quantizer_encoder.onnx",
	                "higgs_decoder": "higgs_decoder.onnx",
	            },
	            "sample_rate_acoustic": 24000,
	            "sample_rate_semantic": 16000,
	            "downsample_factor": 320,
	            "num_codebooks": 8,
	            "codebook_size": 1024,
	            "pipeline": [
	                "acoustic_encoder(waveform_24k) → acoustic_features",
	                "semantic_encoder(waveform_16k) → semantic_features",
	                "quantizer_encoder(acoustic_features, semantic_features) → codes",
	                " --- TTS inference: codes → audio_embeddings_encoder → ... → audio_codes ---",
	                "higgs_decoder(audio_codes) → waveform_24k",
	            ],
	            "loading_priority": [
	                "transformers.HiggsAudioV2TokenizerModel (transformers>=5.3.0)",
	                "boson_multimodal.load_higgs_audio_tokenizer (pip install boson-multimodal @ git+...)",
	            ],
	        },
	        "iterative_decoding": {
	            "note": "OmniVoice uses 32-step iterative unmasking (non-autoregressive).",
	            "steps": 32,
	            "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
	            "per_step_pipeline": [
	                "audio_embeddings_encoder(input_ids, audio_mask) → inputs_embeds",
	                "llm_decoder(inputs_embeds, attention_mask, position_ids) → hidden_states",
	                "audio_heads_decoder(hidden_states) → logits",
	                "sample audio tokens from logits using codebook weights",
	                "unmask predicted positions in input_ids",
	            ],
	        },
	    }

	    # json.dump escapes non-ASCII by default; the explicit encoding just keeps
	    # the file independent of the platform's locale settings.
	    with open(manifest_path, "w", encoding="utf-8") as f:
	        json.dump(manifest, f, indent=2)
	    print(f" Wrote {manifest_path}")


	# =============================================================================
	# Main
	# =============================================================================

	def main(argv=None):
	    """Command-line entry point for the OmniVoice ONNX optimization pipeline.

	    Depending on the flags, the following steps run in order:

	      0.  Save the Qwen3 LLM as a standalone HF directory and patch
	          ``llm_decoder.json`` in the config directory to point at it.
	      1.  Run the Olive workflows for the backbone sub-models
	          (the LLM workflow is left out with ``--skip-llm``).
	      1b. Run the Olive workflows for the Higgs Audio Tokenizer
	          (``--include-higgs`` / ``--higgs-only``).
	      2.  Write ``omnivoice_manifest.json`` into the models directory
	          (always, even with ``--skip-export``).

	    Args:
	        argv: Optional list of command-line arguments. ``None`` (the default)
	            lets argparse read ``sys.argv[1:]``, so existing ``main()`` callers
	            behave as before.
	    """
	    parser = argparse.ArgumentParser(
	        description="Optimize Prince-1/OmniVoice backbone to ONNX",
	        # The --device help contains explicit line breaks; the default
	        # formatter would re-wrap them into a single paragraph.
	        formatter_class=argparse.RawTextHelpFormatter,
	    )
	    parser.add_argument("--device", choices=["cpu", "cpu_fp16", "gpu"], default="cpu",
	                        help="Target device/precision (default: cpu)\n"
	                             " cpu → INT4 weights, CPUExecutionProvider (smallest, slowest)\n"
	                             " cpu_fp16 → FP16 weights, CPUExecutionProvider (balanced; best on AVX-512 FP16 CPUs)\n"
	                             " gpu → FP16 audio + INT4 LLM, CUDAExecutionProvider (fastest)")
	    parser.add_argument("--config-dir", default=None,
	                        help="Directory with Olive JSON configs (default: auto from --device)")
	    parser.add_argument("--skip-export", action="store_true",
	                        help="Skip Olive export (models already exist)")
	    parser.add_argument("--skip-llm", action="store_true",
	                        help="Skip LLM (ModelBuilder) export — it takes several minutes")
	    parser.add_argument("--include-higgs", action="store_true",
	                        help="Also export Higgs Audio V2 Tokenizer (4 sub-models in higgs/)")
	    parser.add_argument("--higgs-only", action="store_true",
	                        help="Export ONLY the Higgs tokenizer; skip backbone")
	    parser.add_argument("--models-dir", default=None,
	                        help="Models output directory")
	    parser.add_argument("--model-path", default=MODEL_NAME,
	                        help=f"HF model ID or local path (default: {MODEL_NAME})")
	    args = parser.parse_args(argv)

	    # Each device profile has its own directory of Olive configs.
	    _default_dirs = {"cpu": "cpu_and_mobile", "cpu_fp16": "cpu_fp16", "gpu": "cuda"}
	    config_dir = args.config_dir or _default_dirs[args.device]
	    models_dir = args.models_dir or str(Path(config_dir) / MODELS_DIR)
	    Path(models_dir).mkdir(parents=True, exist_ok=True)

	    print(f"Target device : {args.device}")
	    print(f"Config dir : {config_dir}")
	    print(f"Models dir : {models_dir}")
	    print(f"Model : {args.model_path}")
	    print()

	    run_backbone = not args.higgs_only
	    run_higgs = args.include_higgs or args.higgs_only

	    if run_backbone and not args.skip_export:
	        # Step 0: Save Qwen3 standalone so ModelBuilder can find it
	        # NOTE(review): this also runs with --skip-llm; if only the LLM
	        # workflow consumes the standalone directory it could be skipped
	        # there — confirm against the audio sub-model configs.
	        print("=== Step 0: Preparing Qwen3 standalone ===")
	        qwen3_dir = prepare_qwen3_standalone(args.model_path, config_dir)
	        print(f" Qwen3 standalone: {qwen3_dir}\n")

	        # Patch llm_decoder.json to point to the saved standalone
	        llm_json = Path(config_dir) / "llm_decoder.json"
	        if llm_json.exists():
	            # Read and write as UTF-8 explicitly so the config round-trips
	            # the same way regardless of the platform's default encoding.
	            with open(llm_json, encoding="utf-8") as f:
	                llm_cfg = json.load(f)
	            llm_cfg["input_model"]["model_path"] = qwen3_dir
	            with open(llm_json, "w", encoding="utf-8") as f:
	                json.dump(llm_cfg, f, indent=4)
	            print(f" Patched {llm_json} → model_path={qwen3_dir}")

	        # Step 1: Backbone sub-models
	        print("=== Step 1: Backbone Olive export + optimization ===")
	        if args.skip_llm:
	            print(" Skipping LLM (--skip-llm).")
	            from olive import run
	            for config in ("audio_embeddings_encoder.json", "audio_heads_decoder.json"):
	                print(f" Running {config}...")
	                run(str(Path(config_dir) / config))
	        else:
	            export_models(config_dir)

	    if run_higgs and not args.skip_export:
	        # Step 1b: Higgs Audio V2 Tokenizer
	        print("=== Step 1b: Higgs Audio Tokenizer export ===")
	        export_higgs("higgs")

	    # Step 2: Write inference manifest
	    print("=== Step 2: Writing inference manifest ===")
	    write_inference_manifest(output_dir=models_dir, device=args.device)
	    print()
	    print("Done.")
	    if not run_higgs:
	        print()
	        print("TIP: To also export the Higgs Audio Tokenizer (needed for voice cloning):")
	        print(" python optimize.py --include-higgs")


	# Script entry point: run the pipeline only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()