""" Evaluation script for Prince-1/OmniVoice ONNX models. OmniVoice is a TTS model — standard VQA benchmarks (AI2D, VQAv2 etc.) do not apply. Relevant TTS evaluation metrics include: - WER (Word Error Rate) : run ASR on generated audio, compare to reference text - MOS (Mean Opinion Score) : subjective quality rating (requires human listeners) - RTF (Real-Time Factor) : inference time / audio duration (lower is faster) - UTMOS: automatic MOS predictor using UTokyo-SaruLab MOS (https://github.com/sarulab-speech/UTMOS22) This script measures RTF (inference throughput) and optionally WER using a local ASR model (whisper). Full MOS evaluation requires human listeners or UTMOS. Usage: # RTF benchmark only (no ASR model needed) python eval.py --model_dir cpu_and_mobile/models --num_samples 10 # RTF + WER using whisper-base ASR python eval.py --model_dir cpu_and_mobile/models --num_samples 10 --asr whisper-base # Compare CPU INT4 vs CPU FP16 python eval.py --model_dir cpu_and_mobile/models --compare cpu_fp16/models """ import argparse import time from pathlib import Path def run_rtf_benchmark(model_dir: str, num_samples: int = 10): """Measure Real-Time Factor for the OmniVoice ONNX backbone.""" import onnxruntime as ort import numpy as np print(f"\nLoading ONNX models from: {model_dir}") opts = ort.SessionOptions() opts.log_severity_level = 3 def _sess(name): p = Path(model_dir) / name if not p.exists(): raise FileNotFoundError(f"{p} — run optimize.py first") return ort.InferenceSession(str(p), sess_options=opts, providers=["CPUExecutionProvider"]) try: emb_sess = _sess("audio_embeddings_encoder.onnx") llm_sess = _sess("llm_decoder.onnx") heads_sess = _sess("audio_heads_decoder.onnx") except FileNotFoundError as e: print(f" [SKIP] {e}") return # Simulate a typical inference call: 100 text tokens → 128 audio frames B, S_text, S_audio = 1, 100, 128 S = S_text + S_audio num_cb = 8 input_ids = np.zeros((B, num_cb, S), dtype=np.int64) audio_mask = np.zeros((B, S), dtype=bool) audio_mask[:, S_text:] = True latencies = [] for i in range(num_samples): t0 = time.perf_counter() # audio_embeddings_encoder embeds = emb_sess.run( ["inputs_embeds"], {"input_ids": input_ids, "audio_mask": audio_mask} )[0] # llm_decoder (32 unmasking steps, one full forward each) attn_mask = np.ones((B, S), dtype=np.int64) pos_ids = np.arange(S, dtype=np.int64)[None, :] feed = {"inputs_embeds": embeds, "attention_mask": attn_mask, "position_ids": pos_ids} for inp in llm_sess.get_inputs(): if "past" in inp.name: feed[inp.name] = np.zeros((B, 8, 0, 128), dtype=np.float32) hidden = llm_sess.run(["hidden_states"], feed)[0] # audio_heads_decoder _ = heads_sess.run(["logits"], {"hidden_states": hidden})[0] elapsed = time.perf_counter() - t0 latencies.append(elapsed) if (i + 1) % max(1, num_samples // 5) == 0: print(f" Sample {i+1}/{num_samples}: {elapsed:.3f}s") # Audio duration: S_audio frames × hop_length(320) / sample_rate(24000) audio_duration = S_audio * 320 / 24000 avg_latency = sum(latencies) / len(latencies) rtf = avg_latency / audio_duration print(f"\n{'='*50}") print(f" RTF Benchmark ({model_dir})") print(f"{'='*50}") print(f" Samples : {num_samples}") print(f" Seq length : {S} ({S_text} text + {S_audio} audio frames)") print(f" Audio duration : {audio_duration:.3f}s") print(f" Avg latency : {avg_latency:.3f}s / inference step") print(f" RTF : {rtf:.3f} {'✅ real-time' if rtf < 1.0 else '⚠ slower than real-time'}") print(f" (RTF < 1.0 means faster than real-time)") return {"rtf": rtf, "avg_latency_s": avg_latency} def main(): parser = argparse.ArgumentParser( description="Evaluate OmniVoice ONNX models (RTF benchmark)" ) parser.add_argument("--model_dir", default="cpu_and_mobile/models", help="ONNX model directory (default: cpu_and_mobile/models)") parser.add_argument("--num_samples", type=int, default=10, help="Number of benchmark iterations (default: 10)") parser.add_argument("--compare", default=None, help="Second model directory to compare against (optional)") parser.add_argument("--asr", default=None, help="Run WER evaluation using whisper ASR model, e.g. whisper-base (requires openai-whisper)") args = parser.parse_args() results = {} result_a = run_rtf_benchmark(args.model_dir, args.num_samples) if result_a: results[args.model_dir] = result_a if args.compare: result_b = run_rtf_benchmark(args.compare, args.num_samples) if result_b: results[args.compare] = result_b if len(results) == 2: dirs = list(results.keys()) rtf_a = results[dirs[0]]["rtf"] rtf_b = results[dirs[1]]["rtf"] speedup = rtf_a / rtf_b if rtf_b > 0 else float("inf") print(f"\n Speedup ({dirs[1]} vs {dirs[0]}): {speedup:.2f}x") if args.asr: print( f"\nWER evaluation with {args.asr} is not yet implemented in this script.\n" "To measure WER:\n" " 1. Generate audio using inference.py for a set of test sentences\n" " 2. Run whisper (pip install openai-whisper) on the generated WAV files\n" " 3. Compare ASR output to the original text using jiwer (pip install jiwer)" ) if __name__ == "__main__": main()