---
license: apache-2.0
base_model:
- Qwen/Qwen3-0.6B
pipeline_tag: text-to-speech
language:
- "aae"
- "aal"
- "aao"
- "ab"
- "abb"
- "abn"
- "abr"
- "abs"
- "abv"
- "acm"
- "acw"
- "acx"
- "adf"
- "adx"
- "ady"
- "aeb"
- "aec"
- "af"
- "afb"
- "afo"
- "ahl"
- "ahs"
- "ajg"
- "aju"
- "ala"
- "aln"
- "alo"
- "am"
- "amu"
- "an"
- "anc"
- "ank"
- "anp"
- "anw"
- "aom"
- "apc"
- "apd"
- "arb"
- "arq"
- "ars"
- "ary"
- "arz"
- "as"
- "ast"
- "avl"
- "awo"
- "ayl"
- "ayp"
- "az"
- "ba"
- "bag"
- "bas"
- "bax"
- "bba"
- "bbj"
- "bbl"
- "bbu"
- "bce"
- "bci"
- "bcs"
- "bcy"
- "bda"
- "bde"
- "bdm"
- "be"
- "beb"
- "bew"
- "bfd"
- "bft"
- "bg"
- "bgp"
- "bhb"
- "bhh"
- "bho"
- "bhp"
- "bhr"
- "bjj"
- "bjk"
- "bjn"
- "bjt"
- "bkh"
- "bkm"
- "bky"
- "bmm"
- "bmq"
- "bn"
- "bnm"
- "bnn"
- "bns"
- "bo"
- "bou"
- "bqg"
- "br"
- "bra"
- "brh"
- "bri"
- "brx"
- "bs"
- "bsh"
- "bsj"
- "bsk"
- "btm"
- "btv"
- "bug"
- "bum"
- "buo"
- "bux"
- "bwr"
- "bxf"
- "byc"
- "bys"
- "byv"
- "byx"
- "bzc"
- "bzw"
- "ca"
- "ccg"
- "ceb"
- "cen"
- "cfa"
- "cgg"
- "chq"
- "cjk"
- "ckb"
- "ckl"
- "ckr"
- "cky"
- "cnh"
- "cpy"
- "cs"
- "cte"
- "ctl"
- "cut"
- "cux"
- "cv"
- "cy"
- "da"
- "dag"
- "dar"
- "dav"
- "dbd"
- "dcc"
- "de"
- "deg"
- "dgh"
- "dgo"
- "dje"
- "dmk"
- "dml"
- "dru"
- "dty"
- "dua"
- "dv"
- "dyu"
- "dzg"
- "ebr"
- "ebu"
- "ego"
- "eiv"
- "eko"
- "ekr"
- "el"
- "elm"
- "en"
- "eo"
- "es"
- "esu"
- "et"
- "eto"
- "ets"
- "etu"
- "eu"
- "ewo"
- "ext"
- "eyo"
- "fa"
- "fan"
- "fat"
- "ff"
- "ffm"
- "fi"
- "fia"
- "fil"
- "fip"
- "fkk"
- "fmp"
- "fr"
- "fub"
- "fuc"
- "fue"
- "fuf"
- "fuh"
- "fui"
- "fuq"
- "fuv"
- "fy"
- "ga"
- "gbm"
- "gbr"
- "gby"
- "gcc"
- "gdf"
- "gej"
- "ges"
- "ggg"
- "gid"
- "gig"
- "giz"
- "gjk"
- "gju"
- "gl"
- "glw"
- "gn"
- "gol"
- "gom"
- "gsl"
- "gu"
- "gui"
- "gur"
- "guz"
- "gv"
- "gwc"
- "gwe"
- "gwt"
- "gya"
- "gyz"
- "ha"
- "hah"
- "hao"
- "haw"
- "haz"
- "hbb"
- "he"
- "hem"
- "hi"
- "hia"
- "hkk"
- "hla"
- "hno"
- "hoj"
- "hr"
- "hsb"
- "ht"
- "hu"
- "hue"
- "hul"
- "hux"
- "hwo"
- "hy"
- "hz"
- "ia"
- "ibb"
- "id"
- "ida"
- "idu"
- "ig"
- "ijc"
- "ijn"
- "ik"
- "ikw"
- "is"
- "ish"
- "iso"
- "it"
- "its"
- "itw"
- "itz"
- "ja"
- "jal"
- "jax"
- "jgo"
- "jmx"
- "jns"
- "jqr"
- "juk"
- "juo"
- "jv"
- "ka"
- "kab"
- "kai"
- "kaj"
- "kam"
- "kbd"
- "kbl"
- "kbt"
- "kcq"
- "kdh"
- "kea"
- "keu"
- "kfe"
- "kfk"
- "kfp"
- "khg"
- "khw"
- "kj"
- "kjc"
- "kjk"
- "kk"
- "kln"
- "kls"
- "km"
- "kmr"
- "kmy"
- "kn"
- "kna"
- "knn"
- "ko"
- "kol"
- "koo"
- "kpo"
- "kqo"
- "ks"
- "ksd"
- "ksf"
- "kto"
- "kuh"
- "kvx"
- "kw"
- "kwm"
- "kxp"
- "ky"
- "kyx"
- "lag"
- "lb"
- "lcm"
- "ldb"
- "lg"
- "lij"
- "lir"
- "lkb"
- "lla"
- "ln"
- "lnu"
- "lo"
- "loa"
- "lrk"
- "lss"
- "lt"
- "ltg"
- "lto"
- "lua"
- "luo"
- "lus"
- "lv"
- "lwg"
- "mab"
- "maf"
- "mai"
- "mau"
- "max"
- "mbo"
- "mcf"
- "mcn"
- "mcx"
- "mdd"
- "mde"
- "mdf"
- "mek"
- "mer"
- "meu"
- "mfm"
- "mfn"
- "mfo"
- "mfv"
- "mgg"
- "mgi"
- "mhk"
- "mhr"
- "mi"
- "mig"
- "miu"
- "mk"
- "mkf"
- "mki"
- "ml"
- "mlq"
- "mn"
- "mne"
- "mni"
- "mqy"
- "mr"
- "mrj"
- "mrr"
- "mrt"
- "ms"
- "mse"
- "msh"
- "msw"
- "mt"
- "mtr"
- "mtu"
- "mtx"
- "mua"
- "mug"
- "mui"
- "mve"
- "mvy"
- "mxs"
- "mxu"
- "mxy"
- "my"
- "myv"
- "mzl"
- "nal"
- "nan"
- "nap"
- "nb"
- "nbh"
- "ncf"
- "nco"
- "ncx"
- "ndi"
- "ng"
- "ngi"
- "nhg"
- "nhi"
- "nhn"
- "nhq"
- "nja"
- "nl"
- "nla"
- "nlv"
- "nmg"
- "nmz"
- "nn"
- "nnh"
- "no"
- "noe"
- "npi"
- "nso"
- "ny"
- "nyu"
- "oc"
- "odk"
- "odu"
- "ogo"
- "om"
- "orc"
- "oru"
- "ory"
- "os"
- "pa"
- "pbs"
- "pbt"
- "pbu"
- "pcm"
- "pex"
- "phl"
- "phr"
- "pip"
- "piy"
- "pko"
- "pl"
- "plk"
- "plt"
- "pmq"
- "pms"
- "pmy"
- "pnb"
- "poc"
- "poe"
- "pow"
- "prq"
- "ps"
- "pst"
- "pt"
- "pua"
- "pwn"
- "qug"
- "qum"
- "qup"
- "qur"
- "qus"
- "quv"
- "qux"
- "quy"
- "qva"
- "qvi"
- "qvj"
- "qvl"
- "qwa"
- "qws"
- "qxa"
- "qxp"
- "qxt"
- "qxu"
- "qxw"
- "rag"
- "rm"
- "ro"
- "rob"
- "rof"
- "roo"
- "rth"
- "ru"
- "rup"
- "rw"
- "sa"
- "sah"
- "sat"
- "sau"
- "say"
- "sbn"
- "sc"
- "scl"
- "scn"
- "sd"
- "sei"
- "shu"
- "si"
- "sip"
- "siw"
- "sjr"
- "sk"
- "skg"
- "skr"
- "sl"
- "sn"
- "snc"
- "snk"
- "so"
- "sol"
- "sps"
- "sq"
- "sr"
- "src"
- "sro"
- "ssi"
- "ste"
- "sua"
- "sv"
- "sva"
- "sw"
- "szy"
- "ta"
- "tan"
- "tar"
- "tay"
- "tbf"
- "tcf"
- "tcy"
- "tdn"
- "tdx"
- "te"
- "tg"
- "tgc"
- "th"
- "the"
- "thq"
- "thr"
- "thv"
- "ti"
- "tig"
- "tio"
- "tk"
- "tkg"
- "tkt"
- "tli"
- "tlp"
- "tn"
- "tok"
- "tpl"
- "tpz"
- "tqp"
- "tr"
- "trp"
- "trq"
- "trv"
- "trw"
- "tt"
- "ttj"
- "ttr"
- "ttu"
- "tui"
- "tul"
- "tuq"
- "tuv"
- "tuy"
- "tvo"
- "tvu"
- "tw"
- "twu"
- "txs"
- "txy"
- "udl"
- "ug"
- "uk"
- "uki"
- "umb"
- "ur"
- "ush"
- "uz"
- "uzn"
- "vai"
- "var"
- "ver"
- "vi"
- "vmc"
- "vmj"
- "vmm"
- "vmp"
- "vmz"
- "vot"
- "vro"
- "wbl"
- "wci"
- "weo"
- "wes"
- "wja"
- "wji"
- "wo"
- "wof"
- "xh"
- "xhe"
- "xka"
- "xmf"
- "xmv"
- "xmw"
- "xpe"
- "xti"
- "xtu"
- "yaq"
- "yav"
- "yay"
- "ydd"
- "ydg"
- "yer"
- "yes"
- "yi"
- "yo"
- "yue"
- "zga"
- "zgh"
- "zh"
- "zoc"
- "zoh"
- "zor"
- "zpv"
- "zpy"
- "ztg"
- "ztn"
- "ztp"
- "zts"
- "ztu"
- "zu"
- "zza"
---

# OmniVoice 🌍

<div align="center">
    <img src="https://zhu-han.github.io/omnivoice/pics/omnivoice.jpg" height="200" width="200" style="object-fit: contain;">
</div>

<p align="center">
  <a href="https://huggingface.co/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-FFD21E" alt="Hugging Face Model"></a>
  &nbsp;
  <a href="https://huggingface.co/spaces/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue" alt="Hugging Face Space"></a>
  &nbsp;
  <a href="https://arxiv.org/abs/2604.00688"><img src="https://img.shields.io/badge/arXiv-Paper-B31B1B.svg" alt="arXiv Paper"></a>
  &nbsp;
  <a href="https://github.com/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/GitHub-Code-181717?logo=GitHub" alt="GitHub Code"></a>
  &nbsp;
  <a href="https://zhu-han.github.io/omnivoice"><img src="https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=GitHub&style=flat-square" alt="Demo Page"></a>
</p>

OmniVoice is a state-of-the-art zero-shot multilingual TTS model supporting more than 600 languages. Built on a novel diffusion language model architecture, it generates high-quality speech with superior inference speed, supporting voice cloning and voice design.

**Contents**: [Key Features](#key-features) | [Installation](#installation) | [Quick Start](#quick-start) | [Python API](#python-api) | [Command-Line Tools](#command-line-tools) | [Training & Evaluation](#training--evaluation) | [Discussion](#discussion--communication) | [Citation](#citation)

## Key Features

- **600+ Languages Supported**: The broadest language coverage among zero-shot TTS models ([full list](docs/languages.md)).
- **Voice Cloning**: State-of-the-art voice cloning quality.
- **Voice Design**: Control voices via assigned speaker attributes (gender, age, pitch, dialect/accent, whisper, etc.).
- **Fast Inference**: RTF as low as 0.025 (40x faster than real-time).
- **Diffusion Language Model Architecture**: A clean, streamlined, and scalable design that delivers both quality and speed.

---

## Installation

Choose **one** of the following methods: **pip** or **uv**.

### pip

> We recommend using a fresh virtual environment (e.g., `conda`, `venv`, etc.) to avoid conflicts.

**Step 1**: Install PyTorch

<details>
<summary>NVIDIA GPU</summary>

```bash
# Install pytorch with your CUDA version, e.g.
pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128
```
> See the [official PyTorch site](https://pytorch.org/get-started/locally/) for instructions on installing other versions.

</details>

<details>
<summary>Apple Silicon</summary>

```bash
pip install torch==2.8.0 torchaudio==2.8.0
```

</details>

**Step 2**: Install OmniVoice (choose one)

```bash
# From PyPI (stable release)
pip install omnivoice

# From the latest source on GitHub (no need to clone)
pip install git+https://github.com/k2-fsa/OmniVoice.git

# For development (clone first, editable install)
git clone https://github.com/k2-fsa/OmniVoice.git
cd OmniVoice
pip install -e .
```

### uv

Clone the repository and sync dependencies:

```bash
git clone https://github.com/k2-fsa/OmniVoice.git
cd OmniVoice
uv sync
```

> **Tip**: You can use a PyPI mirror with `uv sync --default-index "https://mirrors.aliyun.com/pypi/simple"`.

---

## Quick Start

Try OmniVoice without coding:

- Launch the local web UI: `omnivoice-demo --ip 0.0.0.0 --port 8001`


- Or try it directly on [HuggingFace Space](https://huggingface.co/spaces/k2-fsa/OmniVoice)

> If you have trouble connecting to HuggingFace when downloading the pre-trained models, set `export HF_ENDPOINT="https://hf-mirror.com"` before running.

For full usage, see the [Python API](#python-api) and [Command-Line Tools](#command-line-tools) sections below.

---

## Python API

The OmniVoice model supports three generation modes. All features in this section are also available via [command-line tools](#command-line-tools).

### Voice Cloning

Clone a voice from a short reference audio. Provide `ref_audio` and `ref_text`:

```python
from omnivoice import OmniVoice
import torch
import torchaudio

model = OmniVoice.from_pretrained(
    "k2-fsa/OmniVoice",
    device_map="cuda:0",
    dtype=torch.float16
)
# Apple Silicon users: use device_map="mps" instead

audio = model.generate(
    text="Hello, this is a test of zero-shot voice cloning.",
    ref_audio="ref.wav",
    ref_text="Transcription of the reference audio.",
) # audio is a list of `torch.Tensor` with shape (1, T) at 24 kHz.

# If you don't want to input `ref_text` manually, you can directly omit the `ref_text`.
# The model will use Whisper ASR to auto-transcribe it.

torchaudio.save("out.wav", audio[0], 24000)
```

### Voice Design

Describe the desired voice with speaker attributes — no reference audio needed.
Supported attributes: **gender** (male/female), **age** (child to elderly),
**pitch** (very low to very high), **style** (whisper), **English accent**
(American, British, etc.), and **Chinese dialect** (四川话, 陕西话, etc.).
Attributes are comma-separated and freely combinable across categories.

```python
audio = model.generate(
    text="Hello, this is a test of zero-shot voice design.",
    instruct="female, low pitch, british accent",
)
```

See [docs/voice-design.md](docs/voice-design.md) for the full attribute
reference, Chinese equivalents, and usage tips.

### Auto Voice

Let the model choose a voice automatically:

```python
audio = model.generate(text="This is a sentence without any voice prompt.")
```

### Generation Parameters

All three modes above share the same `model.generate()` API. You can further control the generation behavior via keyword arguments:

```python
audio = model.generate(
    text="...",
    num_step=32,  # diffusion steps (or 16 for faster inference)
    speed=1.0,     # speed factor (>1.0 faster, <1.0 slower)
    duration=10.0, # fixed output duration in seconds (overrides speed)
    # ... more options
)
```
See more detailed control in [docs/generation-parameters.md](docs/generation-parameters.md).

### Non-Verbal & Pronunciation Control

OmniVoice supports inline **non-verbal symbols** and **pronunciation hints** within the input text.

**Non-verbal symbols**: Insert tags like `[laughter]` directly in the text to add expressive non-verbal sounds.

```python
audio = model.generate(text="[laughter] You really got me. I didn't see that coming at all.")
```

Supported tags: `[laughter]`, `[confirmation-en]`, `[question-en]`, `[question-ah]`, `[question-oh]`, `[question-ei]`, `[question-yi]`, `[surprise-ah]`, `[surprise-oh]`, `[surprise-wa]`, `[surprise-yo]`, `[dissatisfaction-hnn]`, `[sniff]`, `[sigh]`

**Pronunciation control (Chinese)**: Use pinyin with tone numbers to correct specific character pronunciations.

```python
audio = model.generate(text="这批货物打ZHE2出售后他严重SHE2本了，再也经不起ZHE1腾了。")
```

**Pronunciation control (English)**: Use phonemes from the [CMU Pronouncing Dictionary](https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a) (uppercase, in brackets) to override default English pronunciations.

```python
audio = model.generate(text="You could probably still make [IH1 T] look good.")
```

---

## Command-Line Tools

Three CLI entry points are provided. The CLI tools support all features available in the Python API (voice cloning, voice design, auto voice, generation parameters, etc.) — all controlled via command-line arguments.

| Command | Description | Source |
|---|---|---|
| `omnivoice-demo` | Interactive Gradio web demo | [omnivoice/cli/demo.py](omnivoice/cli/demo.py) |
| `omnivoice-infer` | Single-item inference | [omnivoice/cli/infer.py](omnivoice/cli/infer.py) |
| `omnivoice-infer-batch` | Batch inference across multiple GPUs | [omnivoice/cli/infer_batch.py](omnivoice/cli/infer_batch.py) |

### Demo

```bash
omnivoice-demo --ip 0.0.0.0 --port 8001
```

Provides a web UI for voice cloning and voice design. See `omnivoice-demo --help` for all options.

### Single Inference

```bash
# Voice Cloning
# ref_text can be omitted (Whisper will auto-transcribe ref_audio to get it).
omnivoice-infer \
    --model k2-fsa/OmniVoice \
    --text "This is a test for text to speech." \
    --ref_audio ref.wav \
    --ref_text "Transcription of the reference audio." \
    --output hello.wav

# Voice Design
omnivoice-infer --model k2-fsa/OmniVoice \
    --text "This is a test for text to speech." \
    --instruct "male, British accent" \
    --output hello.wav

# Auto Voice
omnivoice-infer \
    --model k2-fsa/OmniVoice \
    --text "This is a test for text to speech."\
    --output hello.wav
```

### Batch Inference

`omnivoice-infer-batch` distributes batch inference across multiple GPUs and is designed for large-scale TTS tasks.

```bash
omnivoice-infer-batch \
    --model k2-fsa/OmniVoice \
    --test_list test.jsonl \
    --res_dir results/
```

The test list is a JSONL file where each line is a JSON object:
```json
{"id": "sample_001", "text": "Hello world", "ref_audio": "/path/to/ref.wav", "ref_text": "Reference transcript", "instruct": "female, british accent", "language_id": "en", "language_name": "English", "duration": 10.0, "speed": 1.0}
```
Only `id` and `text` are mandatory fields. `ref_audio` and `ref_text` are used in voice cloning mode. `instruct` is used in voice design mode. If neither `ref_audio` nor `instruct` is provided, the model will generate speech in a random voice.

`language_id`, `language_name`, `duration`, and `speed` are optional. `duration` (in seconds) fixes the output length; `speed` controls the speaking rate. If `duration` and `speed` are both provided, `speed` will be ignored.

---

## Training & Evaluation

See [examples/](examples/) for the complete pipeline — from data preparation to training, evaluation, and finetuning.

---

## Discussion & Communication

You can start a discussion directly on [GitHub Issues](https://github.com/k2-fsa/OmniVoice/issues).

You can also scan the QR codes below to join our WeChat group or follow our WeChat official account.

| WeChat Group | WeChat Official Account |
| ------------ | ----------------------- |
|![wechat](https://k2-fsa.org/zh-CN/assets/pic/wechat_group.jpg) |![wechat](https://k2-fsa.org/zh-CN/assets/pic/wechat_account.jpg) |

---

## Citation

```bibtex
@article{zhu2026omnivoice,
      title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models},
      author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel},
      journal={arXiv preprint arXiv:2604.00688},
      year={2026}
}
```