"""Pocket-TTS FastAPI server.""" import io import os import wave import subprocess import traceback from pathlib import Path import numpy as np from fastapi import FastAPI, Query, HTTPException, Form, Body from fastapi.responses import Response, HTMLResponse from pydantic import BaseModel class TTSRequest(BaseModel): text: str voice: str = "af_alloy" temperature: float = 0.7 format: str = "ogg" HF_TOKEN = os.environ.get("HF_TOKEN", "") if HF_TOKEN: os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN try: from huggingface_hub import login login(token=HF_TOKEN, add_to_git_credential=False) print("Logged in to HuggingFace Hub") except Exception as e: print(f"HF login warning: {e}") try: import torch from pocket_tts import TTSModel except ImportError: torch = None TTSModel = None app = FastAPI(title="Pocket-TTS API") _state = { "initialized": False, "model": None, "sample_rate": 24000, "voice_cache": {}, } # Voice sources: (repo_id, path, repo_type) VOICE_SOURCES = { # === Nymbo/Pocket-TTS (standard voices) === "af_alloy": ("Nymbo/Pocket-TTS", "voices/af_alloy.wav", "space"), "af_aoede": ("Nymbo/Pocket-TTS", "voices/af_aoede.wav", "space"), "af_bella": ("Nymbo/Pocket-TTS", "voices/af_bella.wav", "space"), "af_heart": ("Nymbo/Pocket-TTS", "voices/af_heart.wav", "space"), "af_jessica": ("Nymbo/Pocket-TTS", "voices/af_jessica.wav", "space"), "af_kore": ("Nymbo/Pocket-TTS", "voices/af_kore.wav", "space"), "af_nicole": ("Nymbo/Pocket-TTS", "voices/af_nicole.wav", "space"), "af_nova": ("Nymbo/Pocket-TTS", "voices/af_nova.wav", "space"), "af_river": ("Nymbo/Pocket-TTS", "voices/af_river.wav", "space"), "af_sarah": ("Nymbo/Pocket-TTS", "voices/af_sarah.wav", "space"), "af_sky": ("Nymbo/Pocket-TTS", "voices/af_sky.wav", "space"), "am_adam": ("Nymbo/Pocket-TTS", "voices/am_adam.wav", "space"), "am_echo": ("Nymbo/Pocket-TTS", "voices/am_echo.wav", "space"), "am_eric": ("Nymbo/Pocket-TTS", "voices/am_eric.wav", "space"), "am_fenrir": ("Nymbo/Pocket-TTS", "voices/am_fenrir.wav", "space"), "am_liam": ("Nymbo/Pocket-TTS", "voices/am_liam.wav", "space"), "am_michael": ("Nymbo/Pocket-TTS", "voices/am_michael.wav", "space"), "am_onyx": ("Nymbo/Pocket-TTS", "voices/am_onyx.wav", "space"), "am_puck": ("Nymbo/Pocket-TTS", "voices/am_puck.wav", "space"), "am_santa": ("Nymbo/Pocket-TTS", "voices/am_santa.wav", "space"), "bf_alice": ("Nymbo/Pocket-TTS", "voices/bf_alice.wav", "space"), "bf_emma": ("Nymbo/Pocket-TTS", "voices/bf_emma.wav", "space"), "bf_isabella": ("Nymbo/Pocket-TTS", "voices/bf_isabella.wav", "space"), "bf_lily": ("Nymbo/Pocket-TTS", "voices/bf_lily.wav", "space"), "bm_daniel": ("Nymbo/Pocket-TTS", "voices/bm_daniel.wav", "space"), "bm_fable": ("Nymbo/Pocket-TTS", "voices/bm_fable.wav", "space"), "bm_george": ("Nymbo/Pocket-TTS", "voices/bm_george.wav", "space"), "bm_lewis": ("Nymbo/Pocket-TTS", "voices/bm_lewis.wav", "space"), "ef_dora": ("Nymbo/Pocket-TTS", "voices/ef_dora.wav", "space"), "em_alex": ("Nymbo/Pocket-TTS", "voices/em_alex.wav", "space"), "em_santa": ("Nymbo/Pocket-TTS", "voices/em_santa.wav", "space"), "ff_siwis": ("Nymbo/Pocket-TTS", "voices/ff_siwis.wav", "space"), "hf_alpha": ("Nymbo/Pocket-TTS", "voices/hf_alpha.wav", "space"), "hf_beta": ("Nymbo/Pocket-TTS", "voices/hf_beta.wav", "space"), "hm_omega": ("Nymbo/Pocket-TTS", "voices/hm_omega.wav", "space"), "hm_psi": ("Nymbo/Pocket-TTS", "voices/hm_psi.wav", "space"), "if_sara": ("Nymbo/Pocket-TTS", "voices/if_sara.wav", "space"), "im_nicola": ("Nymbo/Pocket-TTS", "voices/im_nicola.wav", "space"), "jf_alpha": ("Nymbo/Pocket-TTS", "voices/jf_alpha.wav", "space"), "jf_gongitsune": ("Nymbo/Pocket-TTS", "voices/jf_gongitsune.wav", "space"), "jf_nezumi": ("Nymbo/Pocket-TTS", "voices/jf_nezumi.wav", "space"), "jf_tebukuro": ("Nymbo/Pocket-TTS", "voices/jf_tebukuro.wav", "space"), "jm_kumo": ("Nymbo/Pocket-TTS", "voices/jm_kumo.wav", "space"), "pf_dora": ("Nymbo/Pocket-TTS", "voices/pf_dora.wav", "space"), "pm_alex": ("Nymbo/Pocket-TTS", "voices/pm_alex.wav", "space"), "pm_santa": ("Nymbo/Pocket-TTS", "voices/pm_santa.wav", "space"), "zf_xiaobei": ("Nymbo/Pocket-TTS", "voices/zf_xiaobei.wav", "space"), "zf_xiaoni": ("Nymbo/Pocket-TTS", "voices/zf_xiaoni.wav", "space"), "zf_xiaoxiao": ("Nymbo/Pocket-TTS", "voices/zf_xiaoxiao.wav", "space"), "zf_xiaoyi": ("Nymbo/Pocket-TTS", "voices/zf_xiaoyi.wav", "space"), "zm_yunjian": ("Nymbo/Pocket-TTS", "voices/zm_yunjian.wav", "space"), "zm_yunxi": ("Nymbo/Pocket-TTS", "voices/zm_yunxi.wav", "space"), "zm_yunxia": ("Nymbo/Pocket-TTS", "voices/zm_yunxia.wav", "space"), "zm_yunyang": ("Nymbo/Pocket-TTS", "voices/zm_yunyang.wav", "space"), # === chandypants character voices === "benji": ("chandypants/ollie-pocket-tts", "voices/benji.wav", "space"), "bertha": ("chandypants/ollie-pocket-tts", "voices/bertha.wav", "space"), "damian": ("chandypants/ollie-pocket-tts", "voices/damian.wav", "space"), "f01_young_bright": ("chandypants/ollie-pocket-tts", "voices/f01_young_bright.wav", "space"), "f02_texas_gal": ("chandypants/ollie-pocket-tts", "voices/f02_texas_gal.wav", "space"), "f03_sharp_pro": ("chandypants/ollie-pocket-tts", "voices/f03_sharp_pro.wav", "space"), "f04_warm_mom": ("chandypants/ollie-pocket-tts", "voices/f04_warm_mom.wav", "space"), "f05_husky_mature": ("chandypants/ollie-pocket-tts", "voices/f05_husky_mature.wav", "space"), "f06_perky_young": ("chandypants/ollie-pocket-tts", "voices/f06_perky_young.wav", "space"), "f07_southern_belle": ("chandypants/ollie-pocket-tts", "voices/f07_southern_belle.wav", "space"), "f08_tough_cop": ("chandypants/ollie-pocket-tts", "voices/f08_tough_cop.wav", "space"), "f09_elderly_sweet": ("chandypants/ollie-pocket-tts", "voices/f09_elderly_sweet.wav", "space"), "f10_theater_kid": ("chandypants/ollie-pocket-tts", "voices/f10_theater_kid.wav", "space"), "m01_deep_south": ("chandypants/ollie-pocket-tts", "voices/m01_deep_south.wav", "space"), "m02_smooth_tenor": ("chandypants/ollie-pocket-tts", "voices/m02_smooth_tenor.wav", "space"), "m03_gruff_ny": ("chandypants/ollie-pocket-tts", "voices/m03_gruff_ny.wav", "space"), "m04_warm_dad": ("chandypants/ollie-pocket-tts", "voices/m04_warm_dad.wav", "space"), "m05_distinguished": ("chandypants/ollie-pocket-tts", "voices/m05_distinguished.wav", "space"), "m06_young_rough": ("chandypants/ollie-pocket-tts", "voices/m06_young_rough.wav", "space"), "m07_cowboy": ("chandypants/ollie-pocket-tts", "voices/m07_cowboy.wav", "space"), "m08_fast_talker": ("chandypants/ollie-pocket-tts", "voices/m08_fast_talker.wav", "space"), "m09_gentle_giant": ("chandypants/ollie-pocket-tts", "voices/m09_gentle_giant.wav", "space"), "m10_slick": ("chandypants/ollie-pocket-tts", "voices/m10_slick.wav", "space"), } # === celebrity voices (voice clone samples) === VOICE_SOURCES.update({ "leonardo_dicaprio": ("hf4uwho/Pocket-TTS", "voices/celeb/leonardo_dicaprio.wav", "space"), "jack_nicholson": ("hf4uwho/Pocket-TTS", "voices/celeb/jack_nicholson.wav", "space"), "joe_pesci": ("hf4uwho/Pocket-TTS", "voices/celeb/joe_pesci.wav", "space"), "robert_de_niro": ("hf4uwho/Pocket-TTS", "voices/celeb/robert_de_niro.wav", "space"), "al_pacino": ("hf4uwho/Pocket-TTS", "voices/celeb/al_pacino.wav", "space"), "jake_gyllenhaal": ("hf4uwho/Pocket-TTS", "voices/celeb/jake_gyllenhaal.wav", "space"), "scarlett_johansson": ("hf4uwho/Pocket-TTS", "voices/celeb/scarlett_johansson.wav", "space"), }) # === kyutai/tts-voices (official voice catalog) === _KYUTAI_VOICES = { "alba_a_moment_by": ("kyutai/tts-voices", "alba-mackenna/a-moment-by.wav", "model"), "alba_announcer": ("kyutai/tts-voices", "alba-mackenna/announcer.wav", "model"), "alba_casual": ("kyutai/tts-voices", "alba-mackenna/casual.wav", "model"), "alba_merchant": ("kyutai/tts-voices", "alba-mackenna/merchant.wav", "model"), "cml_10087_11650_000028_0002": ("kyutai/tts-voices", "cml-tts/fr/10087_11650_000028-0002.wav", "model"), "cml_10177_10625_000134_0003": ("kyutai/tts-voices", "cml-tts/fr/10177_10625_000134-0003.wav", "model"), "cml_10179_11051_000005_0001": ("kyutai/tts-voices", "cml-tts/fr/10179_11051_000005-0001.wav", "model"), "cml_12080_11650_000047_0001": ("kyutai/tts-voices", "cml-tts/fr/12080_11650_000047-0001.wav", "model"), "cml_12205_11650_000004_0002": ("kyutai/tts-voices", "cml-tts/fr/12205_11650_000004-0002.wav", "model"), "cml_12977_10625_000037_0001": ("kyutai/tts-voices", "cml-tts/fr/12977_10625_000037-0001.wav", "model"), "cml_1406_1028_000009_0003": ("kyutai/tts-voices", "cml-tts/fr/1406_1028_000009-0003.wav", "model"), "cml_1591_1028_000108_0004": ("kyutai/tts-voices", "cml-tts/fr/1591_1028_000108-0004.wav", "model"), "cml_1770_1028_000036_0002": ("kyutai/tts-voices", "cml-tts/fr/1770_1028_000036-0002.wav", "model"), "cml_2114_1656_000053_0001": ("kyutai/tts-voices", "cml-tts/fr/2114_1656_000053-0001.wav", "model"), "cml_2154_2576_000020_0003": ("kyutai/tts-voices", "cml-tts/fr/2154_2576_000020-0003.wav", "model"), "cml_2216_1745_000007_0001": ("kyutai/tts-voices", "cml-tts/fr/2216_1745_000007-0001.wav", "model"), "cml_2223_1745_000009_0002": ("kyutai/tts-voices", "cml-tts/fr/2223_1745_000009-0002.wav", "model"), "cml_2465_1943_000152_0002": ("kyutai/tts-voices", "cml-tts/fr/2465_1943_000152-0002.wav", "model"), "cml_296_1028_000022_0001": ("kyutai/tts-voices", "cml-tts/fr/296_1028_000022-0001.wav", "model"), "cml_3267_1902_000075_0001": ("kyutai/tts-voices", "cml-tts/fr/3267_1902_000075-0001.wav", "model"), "cml_4193_3103_000004_0001": ("kyutai/tts-voices", "cml-tts/fr/4193_3103_000004-0001.wav", "model"), "cml_4482_3103_000063_0001": ("kyutai/tts-voices", "cml-tts/fr/4482_3103_000063-0001.wav", "model"), "cml_4724_3731_000031_0001": ("kyutai/tts-voices", "cml-tts/fr/4724_3731_000031-0001.wav", "model"), "cml_4937_3731_000004_0001": ("kyutai/tts-voices", "cml-tts/fr/4937_3731_000004-0001.wav", "model"), "cml_5207_3078_000031_0002": ("kyutai/tts-voices", "cml-tts/fr/5207_3078_000031-0002.wav", "model"), "cml_5476_3103_000072_0001": ("kyutai/tts-voices", "cml-tts/fr/5476_3103_000072-0001.wav", "model"), "cml_577_394_000070_0001": ("kyutai/tts-voices", "cml-tts/fr/577_394_000070-0001.wav", "model"), "cml_5790_4893_000052_0001": ("kyutai/tts-voices", "cml-tts/fr/5790_4893_000052-0001.wav", "model"), "cml_579_2548_000015_0001": ("kyutai/tts-voices", "cml-tts/fr/579_2548_000015-0001.wav", "model"), "cml_5830_4703_000037_0001": ("kyutai/tts-voices", "cml-tts/fr/5830_4703_000037-0001.wav", "model"), "cml_6318_7016_000027_0002": ("kyutai/tts-voices", "cml-tts/fr/6318_7016_000027-0002.wav", "model"), "cml_7142_2432_000124_0003": ("kyutai/tts-voices", "cml-tts/fr/7142_2432_000124-0003.wav", "model"), "cml_7400_2928_000100_0001": ("kyutai/tts-voices", "cml-tts/fr/7400_2928_000100-0001.wav", "model"), "cml_7591_6742_000149_0002": ("kyutai/tts-voices", "cml-tts/fr/7591_6742_000149-0002.wav", "model"), "cml_7601_7727_000062_0001": ("kyutai/tts-voices", "cml-tts/fr/7601_7727_000062-0001.wav", "model"), "cml_7762_8734_000048_0002": ("kyutai/tts-voices", "cml-tts/fr/7762_8734_000048-0002.wav", "model"), "cml_8128_7016_000047_0002": ("kyutai/tts-voices", "cml-tts/fr/8128_7016_000047-0002.wav", "model"), "cml_928_486_000075_0001": ("kyutai/tts-voices", "cml-tts/fr/928_486_000075-0001.wav", "model"), "cml_9834_9697_000150_0003": ("kyutai/tts-voices", "cml-tts/fr/9834_9697_000150-0003.wav", "model"), "ears_p001": ("kyutai/tts-voices", "ears/p001/freeform_speech_01.wav", "model"), "ears_p002": ("kyutai/tts-voices", "ears/p002/freeform_speech_01.wav", "model"), "ears_p003": ("kyutai/tts-voices", "ears/p003/freeform_speech_01.wav", "model"), "ears_p003_adoration": ("kyutai/tts-voices", "ears/p003/emo_adoration_freeform.wav", "model"), "ears_p003_amazement": ("kyutai/tts-voices", "ears/p003/emo_amazement_freeform.wav", "model"), "ears_p003_amusement": ("kyutai/tts-voices", "ears/p003/emo_amusement_freeform.wav", "model"), "ears_p003_anger": ("kyutai/tts-voices", "ears/p003/emo_anger_freeform.wav", "model"), "ears_p003_confusion": ("kyutai/tts-voices", "ears/p003/emo_confusion_freeform.wav", "model"), "ears_p003_contentment": ("kyutai/tts-voices", "ears/p003/emo_contentment_freeform.wav", "model"), "ears_p003_cuteness": ("kyutai/tts-voices", "ears/p003/emo_cuteness_freeform.wav", "model"), "ears_p003_desire": ("kyutai/tts-voices", "ears/p003/emo_desire_freeform.wav", "model"), "ears_p003_disappointment": ("kyutai/tts-voices", "ears/p003/emo_disappointment_freeform.wav", "model"), "ears_p003_disgust": ("kyutai/tts-voices", "ears/p003/emo_disgust_freeform.wav", "model"), "ears_p003_distress": ("kyutai/tts-voices", "ears/p003/emo_distress_freeform.wav", "model"), "ears_p003_embarassment": ("kyutai/tts-voices", "ears/p003/emo_embarassment_freeform.wav", "model"), "ears_p003_extasy": ("kyutai/tts-voices", "ears/p003/emo_extasy_freeform.wav", "model"), "ears_p003_fear": ("kyutai/tts-voices", "ears/p003/emo_fear_freeform.wav", "model"), "ears_p003_guilt": ("kyutai/tts-voices", "ears/p003/emo_guilt_freeform.wav", "model"), "ears_p003_interest": ("kyutai/tts-voices", "ears/p003/emo_interest_freeform.wav", "model"), "ears_p003_neutral": ("kyutai/tts-voices", "ears/p003/emo_neutral_freeform.wav", "model"), "ears_p003_pain": ("kyutai/tts-voices", "ears/p003/emo_pain_freeform.wav", "model"), "ears_p003_pride": ("kyutai/tts-voices", "ears/p003/emo_pride_freeform.wav", "model"), "ears_p003_realization": ("kyutai/tts-voices", "ears/p003/emo_realization_freeform.wav", "model"), "ears_p003_relief": ("kyutai/tts-voices", "ears/p003/emo_relief_freeform.wav", "model"), "ears_p003_sadness": ("kyutai/tts-voices", "ears/p003/emo_sadness_freeform.wav", "model"), "ears_p003_serenity": ("kyutai/tts-voices", "ears/p003/emo_serenity_freeform.wav", "model"), "ears_p004": ("kyutai/tts-voices", "ears/p004/freeform_speech_01.wav", "model"), "ears_p005": ("kyutai/tts-voices", "ears/p005/freeform_speech_01.wav", "model"), "ears_p006": ("kyutai/tts-voices", "ears/p006/freeform_speech_01.wav", "model"), "ears_p007": ("kyutai/tts-voices", "ears/p007/freeform_speech_01.wav", "model"), "ears_p008": ("kyutai/tts-voices", "ears/p008/freeform_speech_01.wav", "model"), "ears_p009": ("kyutai/tts-voices", "ears/p009/freeform_speech_01.wav", "model"), "ears_p010": ("kyutai/tts-voices", "ears/p010/freeform_speech_01.wav", "model"), "ears_p011": ("kyutai/tts-voices", "ears/p011/freeform_speech_01.wav", "model"), "ears_p012": ("kyutai/tts-voices", "ears/p012/freeform_speech_01.wav", "model"), "ears_p013": ("kyutai/tts-voices", "ears/p013/freeform_speech_01.wav", "model"), "ears_p014": ("kyutai/tts-voices", "ears/p014/freeform_speech_01.wav", "model"), "ears_p015": ("kyutai/tts-voices", "ears/p015/freeform_speech_01.wav", "model"), "ears_p016": ("kyutai/tts-voices", "ears/p016/freeform_speech_01.wav", "model"), "ears_p017": ("kyutai/tts-voices", "ears/p017/freeform_speech_01.wav", "model"), "ears_p018": ("kyutai/tts-voices", "ears/p018/freeform_speech_01.wav", "model"), "ears_p019": ("kyutai/tts-voices", "ears/p019/freeform_speech_01.wav", "model"), "ears_p020": ("kyutai/tts-voices", "ears/p020/freeform_speech_01.wav", "model"), "ears_p021": ("kyutai/tts-voices", "ears/p021/freeform_speech_01.wav", "model"), "ears_p022": ("kyutai/tts-voices", "ears/p022/freeform_speech_01.wav", "model"), "ears_p023": ("kyutai/tts-voices", "ears/p023/freeform_speech_01.wav", "model"), "ears_p024": ("kyutai/tts-voices", "ears/p024/freeform_speech_01.wav", "model"), "ears_p025": ("kyutai/tts-voices", "ears/p025/freeform_speech_01.wav", "model"), "ears_p026": ("kyutai/tts-voices", "ears/p026/freeform_speech_01.wav", "model"), "ears_p027": ("kyutai/tts-voices", "ears/p027/freeform_speech_01.wav", "model"), "ears_p028": ("kyutai/tts-voices", "ears/p028/freeform_speech_01.wav", "model"), "ears_p029": ("kyutai/tts-voices", "ears/p029/freeform_speech_01.wav", "model"), "ears_p030": ("kyutai/tts-voices", "ears/p030/freeform_speech_01.wav", "model"), "ears_p031": ("kyutai/tts-voices", "ears/p031/freeform_speech_01.wav", "model"), "ears_p031_adoration": ("kyutai/tts-voices", "ears/p031/emo_adoration_freeform.wav", "model"), "ears_p031_amazement": ("kyutai/tts-voices", "ears/p031/emo_amazement_freeform.wav", "model"), "ears_p031_amusement": ("kyutai/tts-voices", "ears/p031/emo_amusement_freeform.wav", "model"), "ears_p031_anger": ("kyutai/tts-voices", "ears/p031/emo_anger_freeform.wav", "model"), "ears_p031_confusion": ("kyutai/tts-voices", "ears/p031/emo_confusion_freeform.wav", "model"), "ears_p031_contentment": ("kyutai/tts-voices", "ears/p031/emo_contentment_freeform.wav", "model"), "ears_p031_cuteness": ("kyutai/tts-voices", "ears/p031/emo_cuteness_freeform.wav", "model"), "ears_p031_desire": ("kyutai/tts-voices", "ears/p031/emo_desire_freeform.wav", "model"), "ears_p031_disappointment": ("kyutai/tts-voices", "ears/p031/emo_disappointment_freeform.wav", "model"), "ears_p031_disgust": ("kyutai/tts-voices", "ears/p031/emo_disgust_freeform.wav", "model"), "ears_p031_distress": ("kyutai/tts-voices", "ears/p031/emo_distress_freeform.wav", "model"), "ears_p031_embarassment": ("kyutai/tts-voices", "ears/p031/emo_embarassment_freeform.wav", "model"), "ears_p031_extasy": ("kyutai/tts-voices", "ears/p031/emo_extasy_freeform.wav", "model"), "ears_p031_fear": ("kyutai/tts-voices", "ears/p031/emo_fear_freeform.wav", "model"), "ears_p031_guilt": ("kyutai/tts-voices", "ears/p031/emo_guilt_freeform.wav", "model"), "ears_p031_interest": ("kyutai/tts-voices", "ears/p031/emo_interest_freeform.wav", "model"), "ears_p031_neutral": ("kyutai/tts-voices", "ears/p031/emo_neutral_freeform.wav", "model"), "ears_p031_pain": ("kyutai/tts-voices", "ears/p031/emo_pain_freeform.wav", "model"), "ears_p031_pride": ("kyutai/tts-voices", "ears/p031/emo_pride_freeform.wav", "model"), "ears_p031_realization": ("kyutai/tts-voices", "ears/p031/emo_realization_freeform.wav", "model"), "ears_p031_relief": ("kyutai/tts-voices", "ears/p031/emo_relief_freeform.wav", "model"), "ears_p031_sadness": ("kyutai/tts-voices", "ears/p031/emo_sadness_freeform.wav", "model"), "ears_p031_serenity": ("kyutai/tts-voices", "ears/p031/emo_serenity_freeform.wav", "model"), "ears_p032": ("kyutai/tts-voices", "ears/p032/freeform_speech_01.wav", "model"), "ears_p033": ("kyutai/tts-voices", "ears/p033/freeform_speech_01.wav", "model"), "ears_p034": ("kyutai/tts-voices", "ears/p034/freeform_speech_01.wav", "model"), "ears_p035": ("kyutai/tts-voices", "ears/p035/freeform_speech_01.wav", "model"), "ears_p036": ("kyutai/tts-voices", "ears/p036/freeform_speech_01.wav", "model"), "ears_p037": ("kyutai/tts-voices", "ears/p037/freeform_speech_01.wav", "model"), "ears_p038": ("kyutai/tts-voices", "ears/p038/freeform_speech_01.wav", "model"), "ears_p039": ("kyutai/tts-voices", "ears/p039/freeform_speech_01.wav", "model"), "ears_p040": ("kyutai/tts-voices", "ears/p040/freeform_speech_01.wav", "model"), "ears_p041": ("kyutai/tts-voices", "ears/p041/freeform_speech_01.wav", "model"), "ears_p042": ("kyutai/tts-voices", "ears/p042/freeform_speech_01.wav", "model"), "ears_p043": ("kyutai/tts-voices", "ears/p043/freeform_speech_01.wav", "model"), "ears_p044": ("kyutai/tts-voices", "ears/p044/freeform_speech_01.wav", "model"), "ears_p045": ("kyutai/tts-voices", "ears/p045/freeform_speech_01.wav", "model"), "ears_p046": ("kyutai/tts-voices", "ears/p046/freeform_speech_01.wav", "model"), "ears_p047": ("kyutai/tts-voices", "ears/p047/freeform_speech_01.wav", "model"), "ears_p048": ("kyutai/tts-voices", "ears/p048/freeform_speech_01.wav", "model"), "ears_p049": ("kyutai/tts-voices", "ears/p049/freeform_speech_01.wav", "model"), "ears_p050": ("kyutai/tts-voices", "ears/p050/freeform_speech_01.wav", "model"), "ears_p051": ("kyutai/tts-voices", "ears/p051/freeform_speech_01.wav", "model"), "ears_p052": ("kyutai/tts-voices", "ears/p052/freeform_speech_01.wav", "model"), "ears_p053": ("kyutai/tts-voices", "ears/p053/freeform_speech_01.wav", "model"), "ears_p054": ("kyutai/tts-voices", "ears/p054/freeform_speech_01.wav", "model"), "ears_p055": ("kyutai/tts-voices", "ears/p055/freeform_speech_01.wav", "model"), "ears_p056": ("kyutai/tts-voices", "ears/p056/freeform_speech_01.wav", "model"), "ears_p057": ("kyutai/tts-voices", "ears/p057/freeform_speech_01.wav", "model"), "ears_p058": ("kyutai/tts-voices", "ears/p058/freeform_speech_01.wav", "model"), "ears_p059": ("kyutai/tts-voices", "ears/p059/freeform_speech_01.wav", "model"), "ears_p060": ("kyutai/tts-voices", "ears/p060/freeform_speech_01.wav", "model"), "ears_p061": ("kyutai/tts-voices", "ears/p061/freeform_speech_01.wav", "model"), "ears_p062": ("kyutai/tts-voices", "ears/p062/freeform_speech_01.wav", "model"), "ears_p063": ("kyutai/tts-voices", "ears/p063/freeform_speech_01.wav", "model"), "ears_p064": ("kyutai/tts-voices", "ears/p064/freeform_speech_01.wav", "model"), "ears_p065": ("kyutai/tts-voices", "ears/p065/freeform_speech_01.wav", "model"), "ears_p066": ("kyutai/tts-voices", "ears/p066/freeform_speech_01.wav", "model"), "ears_p067": ("kyutai/tts-voices", "ears/p067/freeform_speech_01.wav", "model"), "ears_p068": ("kyutai/tts-voices", "ears/p068/freeform_speech_01.wav", "model"), "ears_p069": ("kyutai/tts-voices", "ears/p069/freeform_speech_01.wav", "model"), "ears_p070": ("kyutai/tts-voices", "ears/p070/freeform_speech_01.wav", "model"), "ears_p071": ("kyutai/tts-voices", "ears/p071/freeform_speech_01.wav", "model"), "ears_p072": ("kyutai/tts-voices", "ears/p072/freeform_speech_01.wav", "model"), "ears_p073": ("kyutai/tts-voices", "ears/p073/freeform_speech_01.wav", "model"), "ears_p074": ("kyutai/tts-voices", "ears/p074/freeform_speech_01.wav", "model"), "ears_p075": ("kyutai/tts-voices", "ears/p075/freeform_speech_01.wav", "model"), "ears_p076": ("kyutai/tts-voices", "ears/p076/freeform_speech_01.wav", "model"), "ears_p077": ("kyutai/tts-voices", "ears/p077/freeform_speech_01.wav", "model"), "ears_p078": ("kyutai/tts-voices", "ears/p078/freeform_speech_01.wav", "model"), "ears_p079": ("kyutai/tts-voices", "ears/p079/freeform_speech_01.wav", "model"), "ears_p080": ("kyutai/tts-voices", "ears/p080/freeform_speech_01.wav", "model"), "ears_p081": ("kyutai/tts-voices", "ears/p081/freeform_speech_01.wav", "model"), "ears_p082": ("kyutai/tts-voices", "ears/p082/freeform_speech_01.wav", "model"), "ears_p083": ("kyutai/tts-voices", "ears/p083/freeform_speech_01.wav", "model"), "ears_p084": ("kyutai/tts-voices", "ears/p084/freeform_speech_01.wav", "model"), "ears_p085": ("kyutai/tts-voices", "ears/p085/freeform_speech_01.wav", "model"), "ears_p086": ("kyutai/tts-voices", "ears/p086/freeform_speech_01.wav", "model"), "ears_p087": ("kyutai/tts-voices", "ears/p087/freeform_speech_01.wav", "model"), "ears_p088": ("kyutai/tts-voices", "ears/p088/freeform_speech_01.wav", "model"), "ears_p089": ("kyutai/tts-voices", "ears/p089/freeform_speech_01.wav", "model"), "ears_p090": ("kyutai/tts-voices", "ears/p090/freeform_speech_01.wav", "model"), "ears_p091": ("kyutai/tts-voices", "ears/p091/freeform_speech_01.wav", "model"), "ears_p092": ("kyutai/tts-voices", "ears/p092/freeform_speech_01.wav", "model"), "ears_p093": ("kyutai/tts-voices", "ears/p093/freeform_speech_01.wav", "model"), "ears_p094": ("kyutai/tts-voices", "ears/p094/freeform_speech_01.wav", "model"), "ears_p095": ("kyutai/tts-voices", "ears/p095/freeform_speech_01.wav", "model"), "ears_p096": ("kyutai/tts-voices", "ears/p096/freeform_speech_01.wav", "model"), "ears_p097": ("kyutai/tts-voices", "ears/p097/freeform_speech_01.wav", "model"), "ears_p098": ("kyutai/tts-voices", "ears/p098/freeform_speech_01.wav", "model"), "ears_p099": ("kyutai/tts-voices", "ears/p099/freeform_speech_01.wav", "model"), "ears_p100": ("kyutai/tts-voices", "ears/p100/freeform_speech_01.wav", "model"), "ears_p101": ("kyutai/tts-voices", "ears/p101/freeform_speech_01.wav", "model"), "ears_p102": ("kyutai/tts-voices", "ears/p102/freeform_speech_01.wav", "model"), "ears_p103": ("kyutai/tts-voices", "ears/p103/freeform_speech_01.wav", "model"), "ears_p104": ("kyutai/tts-voices", "ears/p104/freeform_speech_01.wav", "model"), "ears_p105": ("kyutai/tts-voices", "ears/p105/freeform_speech_01.wav", "model"), "ears_p106": ("kyutai/tts-voices", "ears/p106/freeform_speech_01.wav", "model"), "ears_p107": ("kyutai/tts-voices", "ears/p107/freeform_speech_01.wav", "model"), "ex_duo_a_default": ("kyutai/tts-voices", "expresso/ex01-ex02_default_001_channel1_168s.wav", "model"), "ex_duo_a_enunciated": ("kyutai/tts-voices", "expresso/ex01-ex02_enunciated_001_channel1_432s.wav", "model"), "ex_duo_a_fast": ("kyutai/tts-voices", "expresso/ex01-ex02_fast_001_channel1_104s.wav", "model"), "ex_duo_a_projected": ("kyutai/tts-voices", "expresso/ex01-ex02_projected_001_channel1_46s.wav", "model"), "ex_duo_a_whisper": ("kyutai/tts-voices", "expresso/ex01-ex02_whisper_001_channel1_579s.wav", "model"), "ex_duo_b_default": ("kyutai/tts-voices", "expresso/ex04-ex03_default_001_channel1_3s.wav", "model"), "ex_duo_b_enunciated": ("kyutai/tts-voices", "expresso/ex04-ex03_enunciated_001_channel1_86s.wav", "model"), "ex_duo_b_fast": ("kyutai/tts-voices", "expresso/ex04-ex03_fast_001_channel1_208s.wav", "model"), "ex_duo_b_projected": ("kyutai/tts-voices", "expresso/ex04-ex03_projected_001_channel1_192s.wav", "model"), "ex_duo_b_whisper": ("kyutai/tts-voices", "expresso/ex04-ex03_whisper_001_channel1_198s.wav", "model"), "ex_fem_emote_angry": ("kyutai/tts-voices", "expresso/ex03-ex01_angry_001_channel1_201s.wav", "model"), "ex_fem_emote_awe": ("kyutai/tts-voices", "expresso/ex03-ex01_awe_001_channel1_1323s.wav", "model"), "ex_fem_emote_calm": ("kyutai/tts-voices", "expresso/ex03-ex01_calm_001_channel1_1143s.wav", "model"), "ex_fem_emote_confused": ("kyutai/tts-voices", "expresso/ex03-ex01_confused_001_channel1_909s.wav", "model"), "ex_fem_emote_desire": ("kyutai/tts-voices", "expresso/ex03-ex01_desire_004_channel1_545s.wav", "model"), "ex_fem_emote_disgusted": ("kyutai/tts-voices", "expresso/ex03-ex01_disgusted_004_channel1_170s.wav", "model"), "ex_fem_emote_enunciated": ("kyutai/tts-voices", "expresso/ex03-ex01_enunciated_001_channel1_388s.wav", "model"), "ex_fem_emote_happy": ("kyutai/tts-voices", "expresso/ex03-ex01_happy_001_channel1_334s.wav", "model"), "ex_fem_emote_laughing": ("kyutai/tts-voices", "expresso/ex03-ex01_laughing_001_channel1_188s.wav", "model"), "ex_fem_emote_nonverbal": ("kyutai/tts-voices", "expresso/ex03-ex01_nonverbal_006_channel1_62s.wav", "model"), "ex_fem_emote_sarcastic": ("kyutai/tts-voices", "expresso/ex03-ex01_sarcastic_001_channel1_435s.wav", "model"), "ex_fem_emote_sleepy": ("kyutai/tts-voices", "expresso/ex03-ex01_sleepy_001_channel1_619s.wav", "model"), "ex_fem_narr_animal_animaldir": ("kyutai/tts-voices", "expresso/ex03-ex02_animal-animaldir_003_channel1_32s.wav", "model"), "ex_fem_narr_animaldir_animal": ("kyutai/tts-voices", "expresso/ex03-ex02_animaldir-animal_008_channel1_147s.wav", "model"), "ex_fem_narr_child_childdir": ("kyutai/tts-voices", "expresso/ex03-ex02_child-childdir_001_channel1_291s.wav", "model"), "ex_fem_narr_childdir_child": ("kyutai/tts-voices", "expresso/ex03-ex02_childdir-child_004_channel1_308s.wav", "model"), "ex_fem_narr_laughing": ("kyutai/tts-voices", "expresso/ex03-ex02_laughing_001_channel1_248s.wav", "model"), "ex_fem_narr_narration": ("kyutai/tts-voices", "expresso/ex03-ex02_narration_001_channel1_674s.wav", "model"), "ex_fem_narr_sad_sympathetic": ("kyutai/tts-voices", "expresso/ex03-ex02_sad-sympathetic_001_channel1_454s.wav", "model"), "ex_fem_narr_sympathetic_sad": ("kyutai/tts-voices", "expresso/ex03-ex02_sympathetic-sad_008_channel1_215s.wav", "model"), "ex_mal_emote_angry": ("kyutai/tts-voices", "expresso/ex04-ex02_angry_001_channel1_119s.wav", "model"), "ex_mal_emote_awe": ("kyutai/tts-voices", "expresso/ex04-ex02_awe_001_channel1_982s.wav", "model"), "ex_mal_emote_bored": ("kyutai/tts-voices", "expresso/ex04-ex02_bored_001_channel1_254s.wav", "model"), "ex_mal_emote_calm": ("kyutai/tts-voices", "expresso/ex04-ex02_calm_002_channel1_480s.wav", "model"), "ex_mal_emote_confused": ("kyutai/tts-voices", "expresso/ex04-ex02_confused_001_channel1_499s.wav", "model"), "ex_mal_emote_desire": ("kyutai/tts-voices", "expresso/ex04-ex02_desire_001_channel1_657s.wav", "model"), "ex_mal_emote_disgusted": ("kyutai/tts-voices", "expresso/ex04-ex02_disgusted_004_channel1_169s.wav", "model"), "ex_mal_emote_enunciated": ("kyutai/tts-voices", "expresso/ex04-ex02_enunciated_001_channel1_496s.wav", "model"), "ex_mal_emote_fearful": ("kyutai/tts-voices", "expresso/ex04-ex02_fearful_001_channel1_316s.wav", "model"), "ex_mal_emote_happy": ("kyutai/tts-voices", "expresso/ex04-ex02_happy_001_channel1_118s.wav", "model"), "ex_mal_emote_laughing": ("kyutai/tts-voices", "expresso/ex04-ex02_laughing_001_channel1_147s.wav", "model"), "ex_mal_emote_nonverbal": ("kyutai/tts-voices", "expresso/ex04-ex02_nonverbal_004_channel1_18s.wav", "model"), "ex_mal_emote_sarcastic": ("kyutai/tts-voices", "expresso/ex04-ex02_sarcastic_001_channel1_519s.wav", "model"), "ex_mal_narr_animal_animaldir": ("kyutai/tts-voices", "expresso/ex04-ex01_animal-animaldir_006_channel1_196s.wav", "model"), "ex_mal_narr_animaldir_animal": ("kyutai/tts-voices", "expresso/ex04-ex01_animaldir-animal_001_channel1_118s.wav", "model"), "ex_mal_narr_child_childdir": ("kyutai/tts-voices", "expresso/ex04-ex01_child-childdir_004_channel1_118s.wav", "model"), "ex_mal_narr_childdir_child": ("kyutai/tts-voices", "expresso/ex04-ex01_childdir-child_001_channel1_228s.wav", "model"), "ex_mal_narr_disgusted": ("kyutai/tts-voices", "expresso/ex04-ex01_disgusted_001_channel1_130s.wav", "model"), "ex_mal_narr_laughing": ("kyutai/tts-voices", "expresso/ex04-ex01_laughing_001_channel1_306s.wav", "model"), "ex_mal_narr_narration": ("kyutai/tts-voices", "expresso/ex04-ex01_narration_001_channel1_605s.wav", "model"), "ex_mal_narr_sad_sympathetic": ("kyutai/tts-voices", "expresso/ex04-ex01_sad-sympathetic_001_channel1_267s.wav", "model"), "ex_mal_narr_sympathetic_sad": ("kyutai/tts-voices", "expresso/ex04-ex01_sympathetic-sad_008_channel1_415s.wav", "model"), "unmute_default_voice": ("kyutai/tts-voices", "unmute-prod-website/default_voice.wav", "model"), "unmute_degaulle_2": ("kyutai/tts-voices", "unmute-prod-website/degaulle-2.wav", "model"), "unmute_developpeuse_3": ("kyutai/tts-voices", "unmute-prod-website/developpeuse-3.wav", "model"), "unmute_ex04_narration_longform_00001": ("kyutai/tts-voices", "unmute-prod-website/ex04_narration_longform_00001.wav", "model"), "unmute_fabieng_enhanced_v2": ("kyutai/tts-voices", "unmute-prod-website/fabieng-enhanced-v2.wav", "model"), "unmute_p329_022": ("kyutai/tts-voices", "unmute-prod-website/p329_022.wav", "model"), "vctk_p225_023": ("kyutai/tts-voices", "vctk/p225_023.wav", "model"), "vctk_p226_023": ("kyutai/tts-voices", "vctk/p226_023.wav", "model"), "vctk_p227_023": ("kyutai/tts-voices", "vctk/p227_023.wav", "model"), "vctk_p228_023": ("kyutai/tts-voices", "vctk/p228_023.wav", "model"), "vctk_p229_023": ("kyutai/tts-voices", "vctk/p229_023.wav", "model"), "vctk_p230_023": ("kyutai/tts-voices", "vctk/p230_023.wav", "model"), "vctk_p231_023": ("kyutai/tts-voices", "vctk/p231_023.wav", "model"), "vctk_p232_023": ("kyutai/tts-voices", "vctk/p232_023.wav", "model"), "vctk_p233_023": ("kyutai/tts-voices", "vctk/p233_023.wav", "model"), "vctk_p234_023": ("kyutai/tts-voices", "vctk/p234_023.wav", "model"), "vctk_p236_023": ("kyutai/tts-voices", "vctk/p236_023.wav", "model"), "vctk_p237_023": ("kyutai/tts-voices", "vctk/p237_023.wav", "model"), "vctk_p238_023": ("kyutai/tts-voices", "vctk/p238_023.wav", "model"), "vctk_p239_023": ("kyutai/tts-voices", "vctk/p239_023.wav", "model"), "vctk_p240_023": ("kyutai/tts-voices", "vctk/p240_023.wav", "model"), "vctk_p241_023": ("kyutai/tts-voices", "vctk/p241_023.wav", "model"), "vctk_p243_023": ("kyutai/tts-voices", "vctk/p243_023.wav", "model"), "vctk_p244_023": ("kyutai/tts-voices", "vctk/p244_023.wav", "model"), "vctk_p245_023": ("kyutai/tts-voices", "vctk/p245_023.wav", "model"), "vctk_p246_023": ("kyutai/tts-voices", "vctk/p246_023.wav", "model"), "vctk_p247_023": ("kyutai/tts-voices", "vctk/p247_023.wav", "model"), "vctk_p248_023": ("kyutai/tts-voices", "vctk/p248_023.wav", "model"), "vctk_p249_023": ("kyutai/tts-voices", "vctk/p249_023.wav", "model"), "vctk_p250_023": ("kyutai/tts-voices", "vctk/p250_023.wav", "model"), "vctk_p251_023": ("kyutai/tts-voices", "vctk/p251_023.wav", "model"), "vctk_p252_023": ("kyutai/tts-voices", "vctk/p252_023.wav", "model"), "vctk_p253_023": ("kyutai/tts-voices", "vctk/p253_023.wav", "model"), "vctk_p254_023": ("kyutai/tts-voices", "vctk/p254_023.wav", "model"), "vctk_p255_023": ("kyutai/tts-voices", "vctk/p255_023.wav", "model"), "vctk_p256_023": ("kyutai/tts-voices", "vctk/p256_023.wav", "model"), "vctk_p257_023": ("kyutai/tts-voices", "vctk/p257_023.wav", "model"), "vctk_p258_023": ("kyutai/tts-voices", "vctk/p258_023.wav", "model"), "vctk_p259_023": ("kyutai/tts-voices", "vctk/p259_023.wav", "model"), "vctk_p260_023": ("kyutai/tts-voices", "vctk/p260_023.wav", "model"), "vctk_p261_023": ("kyutai/tts-voices", "vctk/p261_023.wav", "model"), "vctk_p262_023": ("kyutai/tts-voices", "vctk/p262_023.wav", "model"), "vctk_p263_023": ("kyutai/tts-voices", "vctk/p263_023.wav", "model"), "vctk_p264_023": ("kyutai/tts-voices", "vctk/p264_023.wav", "model"), "vctk_p265_023": ("kyutai/tts-voices", "vctk/p265_023.wav", "model"), "vctk_p266_023": ("kyutai/tts-voices", "vctk/p266_023.wav", "model"), "vctk_p267_023": ("kyutai/tts-voices", "vctk/p267_023.wav", "model"), "vctk_p269_023": ("kyutai/tts-voices", "vctk/p269_023.wav", "model"), "vctk_p270_023": ("kyutai/tts-voices", "vctk/p270_023.wav", "model"), "vctk_p271_023": ("kyutai/tts-voices", "vctk/p271_023.wav", "model"), "vctk_p272_023": ("kyutai/tts-voices", "vctk/p272_023.wav", "model"), "vctk_p273_023": ("kyutai/tts-voices", "vctk/p273_023.wav", "model"), "vctk_p274_023": ("kyutai/tts-voices", "vctk/p274_023.wav", "model"), "vctk_p275_023": ("kyutai/tts-voices", "vctk/p275_023.wav", "model"), "vctk_p276_023": ("kyutai/tts-voices", "vctk/p276_023.wav", "model"), "vctk_p277_023": ("kyutai/tts-voices", "vctk/p277_023.wav", "model"), "vctk_p278_023": ("kyutai/tts-voices", "vctk/p278_023.wav", "model"), "vctk_p279_023": ("kyutai/tts-voices", "vctk/p279_023.wav", "model"), "vctk_p280_023": ("kyutai/tts-voices", "vctk/p280_023.wav", "model"), "vctk_p281_023": ("kyutai/tts-voices", "vctk/p281_023.wav", "model"), "vctk_p282_023": ("kyutai/tts-voices", "vctk/p282_023.wav", "model"), "vctk_p283_023": ("kyutai/tts-voices", "vctk/p283_023.wav", "model"), "vctk_p284_023": ("kyutai/tts-voices", "vctk/p284_023.wav", "model"), "vctk_p285_023": ("kyutai/tts-voices", "vctk/p285_023.wav", "model"), "vctk_p286_023": ("kyutai/tts-voices", "vctk/p286_023.wav", "model"), "vctk_p287_023": ("kyutai/tts-voices", "vctk/p287_023.wav", "model"), "vctk_p288_023": ("kyutai/tts-voices", "vctk/p288_023.wav", "model"), "vctk_p292_023": ("kyutai/tts-voices", "vctk/p292_023.wav", "model"), "vctk_p293_023": ("kyutai/tts-voices", "vctk/p293_023.wav", "model"), "vctk_p294_023": ("kyutai/tts-voices", "vctk/p294_023.wav", "model"), "vctk_p297_023": ("kyutai/tts-voices", "vctk/p297_023.wav", "model"), "vctk_p298_023": ("kyutai/tts-voices", "vctk/p298_023.wav", "model"), "vctk_p299_023": ("kyutai/tts-voices", "vctk/p299_023.wav", "model"), "vctk_p300_023": ("kyutai/tts-voices", "vctk/p300_023.wav", "model"), "vctk_p301_023": ("kyutai/tts-voices", "vctk/p301_023.wav", "model"), "vctk_p302_023": ("kyutai/tts-voices", "vctk/p302_023.wav", "model"), "vctk_p303_023": ("kyutai/tts-voices", "vctk/p303_023.wav", "model"), "vctk_p304_023": ("kyutai/tts-voices", "vctk/p304_023.wav", "model"), "vctk_p305_023": ("kyutai/tts-voices", "vctk/p305_023.wav", "model"), "vctk_p306_023": ("kyutai/tts-voices", "vctk/p306_023.wav", "model"), "vctk_p307_023": ("kyutai/tts-voices", "vctk/p307_023.wav", "model"), "vctk_p308_023": ("kyutai/tts-voices", "vctk/p308_023.wav", "model"), "vctk_p310_023": ("kyutai/tts-voices", "vctk/p310_023.wav", "model"), "vctk_p311_023": ("kyutai/tts-voices", "vctk/p311_023.wav", "model"), "vctk_p312_023": ("kyutai/tts-voices", "vctk/p312_023.wav", "model"), "vctk_p313_023": ("kyutai/tts-voices", "vctk/p313_023.wav", "model"), "vctk_p314_023": ("kyutai/tts-voices", "vctk/p314_023.wav", "model"), "vctk_p315_023": ("kyutai/tts-voices", "vctk/p315_023.wav", "model"), "vctk_p316_023": ("kyutai/tts-voices", "vctk/p316_023.wav", "model"), "vctk_p317_023": ("kyutai/tts-voices", "vctk/p317_023.wav", "model"), "vctk_p318_023": ("kyutai/tts-voices", "vctk/p318_023.wav", "model"), "vctk_p323_023": ("kyutai/tts-voices", "vctk/p323_023.wav", "model"), "vctk_p326_023": ("kyutai/tts-voices", "vctk/p326_023.wav", "model"), "vctk_p329_023": ("kyutai/tts-voices", "vctk/p329_023.wav", "model"), "vctk_p330_023": ("kyutai/tts-voices", "vctk/p330_023.wav", "model"), "vctk_p333_023": ("kyutai/tts-voices", "vctk/p333_023.wav", "model"), "vctk_p334_023": ("kyutai/tts-voices", "vctk/p334_023.wav", "model"), "vctk_p335_023": ("kyutai/tts-voices", "vctk/p335_023.wav", "model"), "vctk_p336_023": ("kyutai/tts-voices", "vctk/p336_023.wav", "model"), "vctk_p339_023": ("kyutai/tts-voices", "vctk/p339_023.wav", "model"), "vctk_p341_023": ("kyutai/tts-voices", "vctk/p341_023.wav", "model"), "vctk_p343_023": ("kyutai/tts-voices", "vctk/p343_023.wav", "model"), "vctk_p345_023": ("kyutai/tts-voices", "vctk/p345_023.wav", "model"), "vctk_p347_023": ("kyutai/tts-voices", "vctk/p347_023.wav", "model"), "vctk_p351_023": ("kyutai/tts-voices", "vctk/p351_023.wav", "model"), "vctk_p360_023": ("kyutai/tts-voices", "vctk/p360_023.wav", "model"), "vctk_p361_023": ("kyutai/tts-voices", "vctk/p361_023.wav", "model"), "vctk_p363_023": ("kyutai/tts-voices", "vctk/p363_023.wav", "model"), "vctk_p364_023": ("kyutai/tts-voices", "vctk/p364_023.wav", "model"), "vctk_p374_023": ("kyutai/tts-voices", "vctk/p374_023.wav", "model"), "vctk_p376_023": ("kyutai/tts-voices", "vctk/p376_023.wav", "model"), "vctk_s5_023": ("kyutai/tts-voices", "vctk/s5_023.wav", "model"), "vd_0a67": ("kyutai/tts-voices", "voice-donations/0a67.wav", "model"), "vd_1410": ("kyutai/tts-voices", "voice-donations/1410.wav", "model"), "vd_1dd0": ("kyutai/tts-voices", "voice-donations/1dd0.wav", "model"), "vd_2181": ("kyutai/tts-voices", "voice-donations/2181.wav", "model"), "vd_245e": ("kyutai/tts-voices", "voice-donations/245e.wav", "model"), "vd_29da": ("kyutai/tts-voices", "voice-donations/29da.wav", "model"), "vd_30c5": ("kyutai/tts-voices", "voice-donations/30c5.wav", "model"), "vd_3973": ("kyutai/tts-voices", "voice-donations/3973.wav", "model"), "vd_4189": ("kyutai/tts-voices", "voice-donations/4189.wav", "model"), "vd_468c": ("kyutai/tts-voices", "voice-donations/468c.wav", "model"), "vd_4b13": ("kyutai/tts-voices", "voice-donations/4b13.wav", "model"), "vd_4b70": ("kyutai/tts-voices", "voice-donations/4b70.wav", "model"), "vd_5b55": ("kyutai/tts-voices", "voice-donations/5b55.wav", "model"), "vd_6148": ("kyutai/tts-voices", "voice-donations/6148.wav", "model"), "vd_617b": ("kyutai/tts-voices", "voice-donations/617b.wav", "model"), "vd_7020": ("kyutai/tts-voices", "voice-donations/7020.wav", "model"), "vd_7909": ("kyutai/tts-voices", "voice-donations/7909.wav", "model"), "vd_7b2b": ("kyutai/tts-voices", "voice-donations/7b2b.wav", "model"), "vd_8935": ("kyutai/tts-voices", "voice-donations/8935.wav", "model"), "vd_8dc9": ("kyutai/tts-voices", "voice-donations/8dc9.wav", "model"), "vd_8f15": ("kyutai/tts-voices", "voice-donations/8f15.wav", "model"), "vd_92f0": ("kyutai/tts-voices", "voice-donations/92f0.wav", "model"), "vd_9a2e": ("kyutai/tts-voices", "voice-donations/9a2e.wav", "model"), "vd_9a66": ("kyutai/tts-voices", "voice-donations/9a66.wav", "model"), "vd_AHmad": ("kyutai/tts-voices", "voice-donations/AHmad.wav", "model"), "vd_ASEN": ("kyutai/tts-voices", "voice-donations/ASEN.wav", "model"), "vd_Aadi": ("kyutai/tts-voices", "voice-donations/Aadi.wav", "model"), "vd_AbD": ("kyutai/tts-voices", "voice-donations/AbD.wav", "model"), "vd_Abhinox": ("kyutai/tts-voices", "voice-donations/Abhinox.wav", "model"), "vd_Abo_Ayman": ("kyutai/tts-voices", "voice-donations/Abo_Ayman.wav", "model"), "vd_Abob_Malay": ("kyutai/tts-voices", "voice-donations/Abob_Malay.wav", "model"), "vd_Adarsh_Bulla": ("kyutai/tts-voices", "voice-donations/Adarsh_Bulla.wav", "model"), "vd_AgentCobra": ("kyutai/tts-voices", "voice-donations/AgentCobra.wav", "model"), "vd_Ajith": ("kyutai/tts-voices", "voice-donations/Ajith.wav", "model"), "vd_Alejandro_espanol_latino": ("kyutai/tts-voices", "voice-donations/Alejandro_espanol_latino.wav", "model"), "vd_Allen": ("kyutai/tts-voices", "voice-donations/Allen.wav", "model"), "vd_AmitNag": ("kyutai/tts-voices", "voice-donations/AmitNag.wav", "model"), "vd_Andrea": ("kyutai/tts-voices", "voice-donations/Andrea.wav", "model"), "vd_Andrea_Spanish": ("kyutai/tts-voices", "voice-donations/Andrea_(Spanish).wav", "model"), "vd_Antoine_Vala": ("kyutai/tts-voices", "voice-donations/Antoine_Vala.wav", "model"), "vd_Antoni": ("kyutai/tts-voices", "voice-donations/Antoni.wav", "model"), "vd_Aon": ("kyutai/tts-voices", "voice-donations/Aon.wav", "model"), "vd_Arjun_Z": ("kyutai/tts-voices", "voice-donations/Arjun_Z.wav", "model"), "vd_Aryobe": ("kyutai/tts-voices", "voice-donations/Aryobe.wav", "model"), "vd_BLUE": ("kyutai/tts-voices", "voice-donations/BLUE.wav", "model"), "vd_Bijay": ("kyutai/tts-voices", "voice-donations/Bijay.wav", "model"), "vd_Blake": ("kyutai/tts-voices", "voice-donations/Blake.wav", "model"), "vd_Bobby_McFern": ("kyutai/tts-voices", "voice-donations/Bobby_McFern.wav", "model"), "vd_Breaking_1": ("kyutai/tts-voices", "voice-donations/Breaking_1.wav", "model"), "vd_BrokenHypocrite": ("kyutai/tts-voices", "voice-donations/BrokenHypocrite.wav", "model"), "vd_Butter": ("kyutai/tts-voices", "voice-donations/Butter.wav", "model"), "vd_CPS_001": ("kyutai/tts-voices", "voice-donations/CPS_001.wav", "model"), "vd_Chujus": ("kyutai/tts-voices", "voice-donations/Chujus.wav", "model"), "vd_Cicada": ("kyutai/tts-voices", "voice-donations/Cicada.wav", "model"), "vd_ClassicWizard": ("kyutai/tts-voices", "voice-donations/ClassicWizard.wav", "model"), "vd_Curlinvictus": ("kyutai/tts-voices", "voice-donations/Curlinvictus.wav", "model"), "vd_Darius": ("kyutai/tts-voices", "voice-donations/Darius.wav", "model"), "vd_Darya_khan": ("kyutai/tts-voices", "voice-donations/Darya_khan.wav", "model"), "vd_Deepak": ("kyutai/tts-voices", "voice-donations/Deepak.wav", "model"), "vd_Dhruv_Rao": ("kyutai/tts-voices", "voice-donations/Dhruv_Rao.wav", "model"), "vd_Dil": ("kyutai/tts-voices", "voice-donations/Dil.wav", "model"), "vd_Enrique": ("kyutai/tts-voices", "voice-donations/Enrique.wav", "model"), "vd_Enrique_Spanish": ("kyutai/tts-voices", "voice-donations/Enrique_(Spanish).wav", "model"), "vd_Erick": ("kyutai/tts-voices", "voice-donations/Erick.wav", "model"), "vd_Ernesto_Y": ("kyutai/tts-voices", "voice-donations/Ernesto_Y.wav", "model"), "vd_Eshan": ("kyutai/tts-voices", "voice-donations/Eshan.wav", "model"), "vd_Esteban_Aguirre_Arias": ("kyutai/tts-voices", "voice-donations/Esteban_Aguirre_Arias.wav", "model"), "vd_Ferdinand": ("kyutai/tts-voices", "voice-donations/Ferdinand.wav", "model"), "vd_FlorDaddy": ("kyutai/tts-voices", "voice-donations/FlorDaddy.wav", "model"), "vd_Fred_Mara": ("kyutai/tts-voices", "voice-donations/Fred_Mara.wav", "model"), "vd_Giovanne": ("kyutai/tts-voices", "voice-donations/Giovanne.wav", "model"), "vd_Glenn": ("kyutai/tts-voices", "voice-donations/Glenn.wav", "model"), "vd_Goku": ("kyutai/tts-voices", "voice-donations/Goku.wav", "model"), "vd_Gonzalo": ("kyutai/tts-voices", "voice-donations/Gonzalo.wav", "model"), "vd_Gonzalo_1": ("kyutai/tts-voices", "voice-donations/Gonzalo-1.wav", "model"), "vd_Greggy": ("kyutai/tts-voices", "voice-donations/Greggy.wav", "model"), "vd_Haku": ("kyutai/tts-voices", "voice-donations/Haku.wav", "model"), "vd_Hannah": ("kyutai/tts-voices", "voice-donations/Hannah.wav", "model"), "vd_Hardik_Clone": ("kyutai/tts-voices", "voice-donations/Hardik_Clone.wav", "model"), "vd_Hillbilly_Jim": ("kyutai/tts-voices", "voice-donations/Hillbilly_Jim.wav", "model"), "vd_Hkl": ("kyutai/tts-voices", "voice-donations/Hkl.wav", "model"), "vd_Hugo_the_frenchie": ("kyutai/tts-voices", "voice-donations/Hugo_the_frenchie.wav", "model"), "vd_Ilyass_yea": ("kyutai/tts-voices", "voice-donations/Ilyass_yea.wav", "model"), "vd_Imran_475": ("kyutai/tts-voices", "voice-donations/Imran_475.wav", "model"), "vd_Imran_from_I_India": ("kyutai/tts-voices", "voice-donations/Imran_from_I_India.wav", "model"), "vd_Indian_guy": ("kyutai/tts-voices", "voice-donations/Indian_guy.wav", "model"), "vd_Ineedthisnow": ("kyutai/tts-voices", "voice-donations/Ineedthisnow.wav", "model"), "vd_JJis2123": ("kyutai/tts-voices", "voice-donations/JJis2123.wav", "model"), "vd_JOSHE": ("kyutai/tts-voices", "voice-donations/JOSHE.wav", "model"), "vd_James": ("kyutai/tts-voices", "voice-donations/James.wav", "model"), "vd_Jaspino": ("kyutai/tts-voices", "voice-donations/Jaspino.wav", "model"), "vd_Jaw": ("kyutai/tts-voices", "voice-donations/Jaw.wav", "model"), "vd_Jay": ("kyutai/tts-voices", "voice-donations/Jay.wav", "model"), "vd_Jeff_Andrew": ("kyutai/tts-voices", "voice-donations/Jeff_Andrew.wav", "model"), "vd_Jeffrey": ("kyutai/tts-voices", "voice-donations/Jeffrey.wav", "model"), "vd_Jeremy_Q": ("kyutai/tts-voices", "voice-donations/Jeremy_Q.wav", "model"), "vd_Jimmy": ("kyutai/tts-voices", "voice-donations/Jimmy.wav", "model"), "vd_Joaopedrobil1": ("kyutai/tts-voices", "voice-donations/Joaopedrobil1.wav", "model"), "vd_John_Triguero": ("kyutai/tts-voices", "voice-donations/John_Triguero.wav", "model"), "vd_Juanrestrepo177777": ("kyutai/tts-voices", "voice-donations/Juanrestrepo177777.wav", "model"), "vd_Karti": ("kyutai/tts-voices", "voice-donations/Karti.wav", "model"), "vd_Kditz": ("kyutai/tts-voices", "voice-donations/Kditz.wav", "model"), "vd_Koorosh": ("kyutai/tts-voices", "voice-donations/Koorosh.wav", "model"), "vd_LC": ("kyutai/tts-voices", "voice-donations/LC.wav", "model"), "vd_L_Roy": ("kyutai/tts-voices", "voice-donations/L_Roy.wav", "model"), "vd_Lake": ("kyutai/tts-voices", "voice-donations/Lake.wav", "model"), "vd_Lammy": ("kyutai/tts-voices", "voice-donations/Lammy.wav", "model"), "vd_Lara": ("kyutai/tts-voices", "voice-donations/Lara.wav", "model"), "vd_Latin_Accent": ("kyutai/tts-voices", "voice-donations/Latin_Accent.wav", "model"), "vd_Liquescent": ("kyutai/tts-voices", "voice-donations/Liquescent.wav", "model"), "vd_Louis": ("kyutai/tts-voices", "voice-donations/Louis.wav", "model"), "vd_Lucas": ("kyutai/tts-voices", "voice-donations/Lucas.wav", "model"), "vd_MJDePedro": ("kyutai/tts-voices", "voice-donations/MJDePedro.wav", "model"), "vd_Maisako": ("kyutai/tts-voices", "voice-donations/Maisako.wav", "model"), "vd_Manahen": ("kyutai/tts-voices", "voice-donations/Manahen.wav", "model"), "vd_Marshal_Indian": ("kyutai/tts-voices", "voice-donations/Marshal_Indian.wav", "model"), "vd_Midlands_Bedfordshire_Dialect": ("kyutai/tts-voices", "voice-donations/Midlands_Bedfordshire_Dialect.wav", "model"), "vd_Moses": ("kyutai/tts-voices", "voice-donations/Moses.wav", "model"), "vd_MrHat": ("kyutai/tts-voices", "voice-donations/MrHat.wav", "model"), "vd_Mr_captain": ("kyutai/tts-voices", "voice-donations/Mr_captain.wav", "model"), "vd_Muhtasims_Voice": ("kyutai/tts-voices", "voice-donations/Muhtasim's_Voice.wav", "model"), "vd_Mystery_Sir": ("kyutai/tts-voices", "voice-donations/Mystery_Sir.wav", "model"), "vd_Narrum": ("kyutai/tts-voices", "voice-donations/Narrum.wav", "model"), "vd_Nick": ("kyutai/tts-voices", "voice-donations/Nick.wav", "model"), "vd_P0LFR": ("kyutai/tts-voices", "voice-donations/P0LFR.wav", "model"), "vd_Pai_ve": ("kyutai/tts-voices", "voice-donations/Pai_ve.wav", "model"), "vd_Parthiban": ("kyutai/tts-voices", "voice-donations/Parthiban.wav", "model"), "vd_Prakash369": ("kyutai/tts-voices", "voice-donations/Prakash369.wav", "model"), "vd_Puzzle": ("kyutai/tts-voices", "voice-donations/Puzzle.wav", "model"), "vd_Qasim_Wali_Khan": ("kyutai/tts-voices", "voice-donations/Qasim_Wali_Khan.wav", "model"), "vd_RAJ": ("kyutai/tts-voices", "voice-donations/RAJ.wav", "model"), "vd_Rafaelpazv": ("kyutai/tts-voices", "voice-donations/Rafaelpazv.wav", "model"), "vd_Rahul": ("kyutai/tts-voices", "voice-donations/Rahul.wav", "model"), "vd_Raj25": ("kyutai/tts-voices", "voice-donations/Raj25.wav", "model"), "vd_Ramu": ("kyutai/tts-voices", "voice-donations/Ramu.wav", "model"), "vd_Ranjith": ("kyutai/tts-voices", "voice-donations/Ranjith.wav", "model"), "vd_Richard_cuban": ("kyutai/tts-voices", "voice-donations/Richard_cuban.wav", "model"), "vd_Rony": ("kyutai/tts-voices", "voice-donations/Rony.wav", "model"), "vd_Roscoe": ("kyutai/tts-voices", "voice-donations/Roscoe.wav", "model"), "vd_Rs": ("kyutai/tts-voices", "voice-donations/Rs.wav", "model"), "vd_Rup": ("kyutai/tts-voices", "voice-donations/Rup.wav", "model"), "vd_SSA150803": ("kyutai/tts-voices", "voice-donations/SSA150803.wav", "model"), "vd_SS_1684": ("kyutai/tts-voices", "voice-donations/SS_1684.wav", "model"), "vd_STONE": ("kyutai/tts-voices", "voice-donations/STONE.wav", "model"), "vd_Samsewak": ("kyutai/tts-voices", "voice-donations/Samsewak.wav", "model"), "vd_Selfie": ("kyutai/tts-voices", "voice-donations/Selfie.wav", "model"), "vd_Sheddy": ("kyutai/tts-voices", "voice-donations/Sheddy.wav", "model"), "vd_Siddh_Indian": ("kyutai/tts-voices", "voice-donations/Siddh_Indian.wav", "model"), "vd_Sir_TJ": ("kyutai/tts-voices", "voice-donations/Sir_TJ.wav", "model"), "vd_Sirajo_x": ("kyutai/tts-voices", "voice-donations/Sirajo_x.wav", "model"), "vd_Sp46": ("kyutai/tts-voices", "voice-donations/Sp46.wav", "model"), "vd_Sr_Erick": ("kyutai/tts-voices", "voice-donations/Sr_Erick.wav", "model"), "vd_Standollars": ("kyutai/tts-voices", "voice-donations/Standollars.wav", "model"), "vd_TESLLA": ("kyutai/tts-voices", "voice-donations/TESLLA.wav", "model"), "vd_Tahii": ("kyutai/tts-voices", "voice-donations/Tahii.wav", "model"), "vd_TheFin": ("kyutai/tts-voices", "voice-donations/TheFin.wav", "model"), "vd_The_Sustainabler": ("kyutai/tts-voices", "voice-donations/The_Sustainabler.wav", "model"), "vd_The_other_brother": ("kyutai/tts-voices", "voice-donations/The_other_brother.wav", "model"), "vd_Titorium": ("kyutai/tts-voices", "voice-donations/Titorium.wav", "model"), "vd_Tonmoy": ("kyutai/tts-voices", "voice-donations/Tonmoy.wav", "model"), "vd_Umair": ("kyutai/tts-voices", "voice-donations/Umair.wav", "model"), "vd_Vexat": ("kyutai/tts-voices", "voice-donations/Vexat.wav", "model"), "vd_Victor_Garcia": ("kyutai/tts-voices", "voice-donations/Victor_Garcia.wav", "model"), "vd_Vinith___English_India": ("kyutai/tts-voices", "voice-donations/Vinith___English_India.wav", "model"), "vd_Vitch": ("kyutai/tts-voices", "voice-donations/Vitch.wav", "model"), "vd_Vivaldi": ("kyutai/tts-voices", "voice-donations/Vivaldi.wav", "model"), "vd_W_A_H": ("kyutai/tts-voices", "voice-donations/W_A_H.wav", "model"), "vd_Wealthiest": ("kyutai/tts-voices", "voice-donations/Wealthiest.wav", "model"), "vd_WhisperInEar": ("kyutai/tts-voices", "voice-donations/WhisperInEar.wav", "model"), "vd_Yesid": ("kyutai/tts-voices", "voice-donations/Yesid.wav", "model"), "vd_Youfied": ("kyutai/tts-voices", "voice-donations/Youfied.wav", "model"), "vd_Yuush": ("kyutai/tts-voices", "voice-donations/Yuush.wav", "model"), "vd_a59a": ("kyutai/tts-voices", "voice-donations/a59a.wav", "model"), "vd_a6f9": ("kyutai/tts-voices", "voice-donations/a6f9.wav", "model"), "vd_a96a": ("kyutai/tts-voices", "voice-donations/a96a.wav", "model"), "vd_aepeak": ("kyutai/tts-voices", "voice-donations/aepeak.wav", "model"), "vd_albertoforofo007": ("kyutai/tts-voices", "voice-donations/albertoforofo007.wav", "model"), "vd_amazon_box": ("kyutai/tts-voices", "voice-donations/amazon_box.wav", "model"), "vd_awais_shah": ("kyutai/tts-voices", "voice-donations/awais_shah.wav", "model"), "vd_bathri": ("kyutai/tts-voices", "voice-donations/bathri.wav", "model"), "vd_bbe4": ("kyutai/tts-voices", "voice-donations/bbe4.wav", "model"), "vd_bc98": ("kyutai/tts-voices", "voice-donations/bc98.wav", "model"), "vd_bevi": ("kyutai/tts-voices", "voice-donations/bevi.wav", "model"), "vd_boom": ("kyutai/tts-voices", "voice-donations/boom.wav", "model"), "vd_c0a0": ("kyutai/tts-voices", "voice-donations/c0a0.wav", "model"), "vd_cybina": ("kyutai/tts-voices", "voice-donations/cybina.wav", "model"), "vd_d4a9": ("kyutai/tts-voices", "voice-donations/d4a9.wav", "model"), "vd_dce6": ("kyutai/tts-voices", "voice-donations/dce6.wav", "model"), "vd_dwp": ("kyutai/tts-voices", "voice-donations/dwp.wav", "model"), "vd_e819": ("kyutai/tts-voices", "voice-donations/e819.wav", "model"), "vd_edd4": ("kyutai/tts-voices", "voice-donations/edd4.wav", "model"), "vd_efeb": ("kyutai/tts-voices", "voice-donations/efeb.wav", "model"), "vd_english_with_german_accent": ("kyutai/tts-voices", "voice-donations/english_with_german_accent.wav", "model"), "vd_erihppas": ("kyutai/tts-voices", "voice-donations/erihppas.wav", "model"), "vd_f179": ("kyutai/tts-voices", "voice-donations/f179.wav", "model"), "vd_f9cf": ("kyutai/tts-voices", "voice-donations/f9cf.wav", "model"), "vd_fa52": ("kyutai/tts-voices", "voice-donations/fa52.wav", "model"), "vd_fc96": ("kyutai/tts-voices", "voice-donations/fc96.wav", "model"), "vd_floyd2026": ("kyutai/tts-voices", "voice-donations/floyd2026.wav", "model"), "vd_gmaskell92": ("kyutai/tts-voices", "voice-donations/gmaskell92.wav", "model"), "vd_gyroo": ("kyutai/tts-voices", "voice-donations/gyroo.wav", "model"), "vd_hielos_1": ("kyutai/tts-voices", "voice-donations/hielos_1.wav", "model"), "vd_hielos_2": ("kyutai/tts-voices", "voice-donations/hielos_2.wav", "model"), "vd_injul": ("kyutai/tts-voices", "voice-donations/injul.wav", "model"), "vd_kbrn1": ("kyutai/tts-voices", "voice-donations/kbrn1.wav", "model"), "vd_oldNerd": ("kyutai/tts-voices", "voice-donations/oldNerd.wav", "model"), "vd_oldNerd2": ("kyutai/tts-voices", "voice-donations/oldNerd2.wav", "model"), "vd_oldNerd3": ("kyutai/tts-voices", "voice-donations/oldNerd3.wav", "model"), "vd_ra_XOr": ("kyutai/tts-voices", "voice-donations/ra_XOr.wav", "model"), "vd_rewi": ("kyutai/tts-voices", "voice-donations/rewi.wav", "model"), "vd_robert": ("kyutai/tts-voices", "voice-donations/robert.wav", "model"), "vd_rshah_1_0": ("kyutai/tts-voices", "voice-donations/rshah_1_0.wav", "model"), "vd_sanjay": ("kyutai/tts-voices", "voice-donations/sanjay.wav", "model"), "vd_siddharth_khanna": ("kyutai/tts-voices", "voice-donations/siddharth_khanna.wav", "model"), "vd_solace": ("kyutai/tts-voices", "voice-donations/solace.wav", "model"), "vd_spanish_limaperu": ("kyutai/tts-voices", "voice-donations/spanish-limaperu.wav", "model"), "vd_stein": ("kyutai/tts-voices", "voice-donations/stein.wav", "model"), "vd_sujan_daikoawaj": ("kyutai/tts-voices", "voice-donations/sujan_daikoawaj.wav", "model"), "vd_surazy": ("kyutai/tts-voices", "voice-donations/surazy.wav", "model"), "vd_taiyo": ("kyutai/tts-voices", "voice-donations/taiyo.wav", "model"), "vd_temp_007": ("kyutai/tts-voices", "voice-donations/temp-007.wav", "model"), "vd_thepolishdane": ("kyutai/tts-voices", "voice-donations/thepolishdane.wav", "model"), "vd_utk": ("kyutai/tts-voices", "voice-donations/utk.wav", "model"), "vd_vinayak": ("kyutai/tts-voices", "voice-donations/vinayak.wav", "model"), "vd_virtu": ("kyutai/tts-voices", "voice-donations/virtu.wav", "model"), "vd_willbas": ("kyutai/tts-voices", "voice-donations/willbas.wav", "model"), "vd_yaemdluffy": ("kyutai/tts-voices", "voice-donations/yaemdluffy.wav", "model"), "vd_zerocool": ("kyutai/tts-voices", "voice-donations/zerocool.wav", "model"), "zero_bill_boerst": ("kyutai/tts-voices", "voice-zero/bill_boerst.wav", "model"), "zero_caro_davy": ("kyutai/tts-voices", "voice-zero/caro_davy.wav", "model"), "zero_peter_yearsley": ("kyutai/tts-voices", "voice-zero/peter_yearsley.wav", "model"), "zero_stuart_bell": ("kyutai/tts-voices", "voice-zero/stuart_bell.wav", "model"), } VOICE_SOURCES.update(_KYUTAI_VOICES) BUILTIN_VOICES = sorted(VOICE_SOURCES.keys()) def _init_model(): if _state["initialized"]: return if TTSModel is None: raise RuntimeError("pocket-tts not installed") print("Initializing Pocket TTS model (english_2026-04 with voice cloning)...") model = TTSModel.load_model(language="english_2026-04") _state["model"] = model _state["sample_rate"] = getattr(model, "sample_rate", 24000) _state["initialized"] = True print(f"Pocket TTS initialized. Sample rate: {_state['sample_rate']} Hz, voice_cloning: {model.has_voice_cloning}, voices: {len(BUILTIN_VOICES)}") def _get_voice_state(voice: str): model = _state["model"] if voice in _state["voice_cache"]: return _state["voice_cache"][voice] if voice not in VOICE_SOURCES: raise ValueError(f"Voice '{voice}' not found. Available: {BUILTIN_VOICES}") source = VOICE_SOURCES[voice] repo_id = source[0] voice_path_hf = source[1] repo_type = source[2] if len(source) > 2 else "space" from huggingface_hub import hf_hub_download try: voice_path = hf_hub_download( repo_id, voice_path_hf, repo_type=repo_type, token=HF_TOKEN or None, ) print(f"Downloaded voice '{voice}' from {repo_id} ({repo_type})") except Exception as e: raise ValueError(f"Failed to download voice '{voice}': {e}") voice_state = model.get_state_for_audio_prompt(voice_path) def detach_all(obj): if isinstance(obj, torch.Tensor): return obj.detach().clone() elif isinstance(obj, dict): return {k: detach_all(v) for k, v in obj.items()} else: return obj voice_state = detach_all(voice_state) _state["voice_cache"][voice] = voice_state print(f"Voice state loaded for '{voice}'") return voice_state def _generate_audio(text: str, voice: str, temperature: float = 0.7) -> tuple: _init_model() model = _state["model"] sample_rate = _state["sample_rate"] voice_state = _get_voice_state(voice) audio = model.generate_audio( voice_state, text, frames_after_eos=2, copy_state=True, ) audio_np = audio.cpu().numpy() if hasattr(audio, 'cpu') else audio max_val = np.max(np.abs(audio_np)) if max_val > 0: audio_np = audio_np / max_val * 0.95 audio_int16 = np.clip(audio_np * 32767, -32767, 32767).astype(np.int16) return audio_int16, sample_rate def _wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes: buf = io.BytesIO() with wave.open(buf, "wb") as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(sample_rate) wf.writeframes(audio_int16.tobytes()) return buf.getvalue() def _ogg_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes: wav_data = _wav_bytes(audio_int16, sample_rate) proc = subprocess.run( ["ffmpeg", "-y", "-f", "wav", "-i", "pipe:0", "-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1", "-f", "ogg", "pipe:1"], input=wav_data, capture_output=True, timeout=30, ) if proc.returncode != 0: raise RuntimeError(f"ffmpeg failed: {proc.stderr.decode()[:200]}") return proc.stdout @app.post("/tts") async def tts_post(req: TTSRequest): """POST endpoint — send full text in request body (no URL length limits).""" try: audio_int16, sample_rate = _generate_audio(req.text, req.voice, req.temperature) except ValueError as e: raise HTTPException(400, str(e)) except Exception as e: traceback.print_exc() raise HTTPException(500, str(e)[:300]) if req.format == "ogg": try: data = _ogg_bytes(audio_int16, sample_rate) return Response(content=data, media_type="audio/ogg", headers={"Content-Disposition": "attachment; filename=tts.ogg"}) except Exception as e: raise HTTPException(500, f"OGG encoding failed: {str(e)[:200]}") data = _wav_bytes(audio_int16, sample_rate) return Response(content=data, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=tts.wav"}) @app.get("/tts") async def tts_get( text: str = Query(..., description="Text to synthesize"), voice: str = Query("af_alloy", description="Voice name"), temperature: float = Query(0.7, ge=0.1, le=1.5), format: str = Query("ogg", description="Output format: wav or ogg"), ): try: audio_int16, sample_rate = _generate_audio(text, voice, temperature) except ValueError as e: raise HTTPException(400, str(e)) except Exception as e: traceback.print_exc() raise HTTPException(500, str(e)[:300]) if format == "ogg": try: data = _ogg_bytes(audio_int16, sample_rate) return Response(content=data, media_type="audio/ogg", headers={"Content-Disposition": "attachment; filename=tts.ogg"}) except Exception as e: raise HTTPException(500, f"OGG encoding failed: {str(e)[:200]}") data = _wav_bytes(audio_int16, sample_rate) return Response(content=data, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=tts.wav"}) @app.get("/voices") async def voices(): return {"voices": BUILTIN_VOICES, "count": len(BUILTIN_VOICES)} @app.get("/health") async def health(): return {"status": "ok", "initialized": _state["initialized"]} @app.get("/", response_class=HTMLResponse) async def index(): return """
FastAPI server running kyutai/pocket-tts
GET /tts?text=Hello&voice=af_alloy&format=oggGET /voicesGET /health