Spaces:
Running
Running
| """Pocket-TTS FastAPI server.""" | |
| import io | |
| import os | |
| import wave | |
| import subprocess | |
| import traceback | |
| from pathlib import Path | |
| import numpy as np | |
| from fastapi import FastAPI, Query, HTTPException, Form, Body | |
| from fastapi.responses import Response, HTMLResponse | |
| from pydantic import BaseModel | |
class TTSRequest(BaseModel):
    """Request payload describing one text-to-speech job.

    Only ``text`` is mandatory; every other field has a default.
    """

    # Text to synthesize (required: no default value).
    text: str
    # Voice identifier; the default "af_alloy" is one of the VOICE_SOURCES keys.
    voice: str = "af_alloy"
    # Presumably the sampling temperature forwarded to the model -- TODO confirm
    # against the handler that consumes this request.
    temperature: float = 0.7
    # Requested audio format. NOTE(review): the accepted values are not visible
    # in this part of the file.
    format: str = "ogg"
# Optional Hugging Face authentication driven by the HF_TOKEN environment
# variable. When the variable is unset the value is "" and login is skipped.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if HF_TOKEN:
    # Expose the same token under the second environment variable name too.
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
    try:
        # Imported lazily so a missing huggingface_hub package is handled by
        # the except clause below instead of aborting module import.
        from huggingface_hub import login

        login(token=HF_TOKEN, add_to_git_credential=False)
        print("Logged in to HuggingFace Hub")
    except Exception as e:
        # Deliberate best effort: a failed import or login is only reported.
        print(f"HF login warning: {e}")
# Optional heavy dependencies. If either import fails, BOTH names are reset to
# None (even when torch itself imported fine), so the module still loads and
# later code can test for None.
try:
    import torch
    from pocket_tts import TTSModel
except ImportError:
    torch = None
    TTSModel = None
# FastAPI application instance for this module.
app = FastAPI(title="Pocket-TTS API")
| _state = { | |
| "initialized": False, | |
| "model": None, | |
| "sample_rate": 24000, | |
| "voice_cache": {}, | |
| } | |
# Voice sources: name -> (repo_id, path, repo_type)
#
# For both repositories below the file path is always "voices/<name>.wav" and
# the repo type is "space", so the table is derived from the voice names
# instead of repeating each name twice by hand.

# === Nymbo/Pocket-TTS (standard voices) ===
_NYMBO_VOICE_NAMES = (
    "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore",
    "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
    "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael",
    "am_onyx", "am_puck", "am_santa",
    "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
    "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
    "ef_dora", "em_alex", "em_santa",
    "ff_siwis",
    "hf_alpha", "hf_beta", "hm_omega", "hm_psi",
    "if_sara", "im_nicola",
    "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo",
    "pf_dora", "pm_alex", "pm_santa",
    "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
    "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
)

# === chandypants character voices ===
_CHANDYPANTS_VOICE_NAMES = (
    "benji", "bertha", "damian",
    "f01_young_bright", "f02_texas_gal", "f03_sharp_pro", "f04_warm_mom",
    "f05_husky_mature", "f06_perky_young", "f07_southern_belle",
    "f08_tough_cop", "f09_elderly_sweet", "f10_theater_kid",
    "m01_deep_south", "m02_smooth_tenor", "m03_gruff_ny", "m04_warm_dad",
    "m05_distinguished", "m06_young_rough", "m07_cowboy", "m08_fast_talker",
    "m09_gentle_giant", "m10_slick",
)

# Insertion order matches the name tuples: Nymbo voices first, then chandypants.
VOICE_SOURCES = {
    **{
        name: ("Nymbo/Pocket-TTS", f"voices/{name}.wav", "space")
        for name in _NYMBO_VOICE_NAMES
    },
    **{
        name: ("chandypants/ollie-pocket-tts", f"voices/{name}.wav", "space")
        for name in _CHANDYPANTS_VOICE_NAMES
    },
}
# === celebrity voices (voice clone samples) ===
# Each sample lives at "voices/celeb/<name>.wav" in the hf4uwho/Pocket-TTS
# space, so the entries are derived from the names rather than typed twice.
# NOTE(review): confirm that distributing these voice-clone samples is permitted.
VOICE_SOURCES.update(
    (name, ("hf4uwho/Pocket-TTS", f"voices/celeb/{name}.wav", "space"))
    for name in (
        "leonardo_dicaprio",
        "jack_nicholson",
        "joe_pesci",
        "robert_de_niro",
        "al_pacino",
        "jake_gyllenhaal",
        "scarlett_johansson",
    )
)
| # === kyutai/tts-voices (official voice catalog) === | |
| _KYUTAI_VOICES = { | |
| "alba_a_moment_by": ("kyutai/tts-voices", "alba-mackenna/a-moment-by.wav", "model"), | |
| "alba_announcer": ("kyutai/tts-voices", "alba-mackenna/announcer.wav", "model"), | |
| "alba_casual": ("kyutai/tts-voices", "alba-mackenna/casual.wav", "model"), | |
| "alba_merchant": ("kyutai/tts-voices", "alba-mackenna/merchant.wav", "model"), | |
| "cml_10087_11650_000028_0002": ("kyutai/tts-voices", "cml-tts/fr/10087_11650_000028-0002.wav", "model"), | |
| "cml_10177_10625_000134_0003": ("kyutai/tts-voices", "cml-tts/fr/10177_10625_000134-0003.wav", "model"), | |
| "cml_10179_11051_000005_0001": ("kyutai/tts-voices", "cml-tts/fr/10179_11051_000005-0001.wav", "model"), | |
| "cml_12080_11650_000047_0001": ("kyutai/tts-voices", "cml-tts/fr/12080_11650_000047-0001.wav", "model"), | |
| "cml_12205_11650_000004_0002": ("kyutai/tts-voices", "cml-tts/fr/12205_11650_000004-0002.wav", "model"), | |
| "cml_12977_10625_000037_0001": ("kyutai/tts-voices", "cml-tts/fr/12977_10625_000037-0001.wav", "model"), | |
| "cml_1406_1028_000009_0003": ("kyutai/tts-voices", "cml-tts/fr/1406_1028_000009-0003.wav", "model"), | |
| "cml_1591_1028_000108_0004": ("kyutai/tts-voices", "cml-tts/fr/1591_1028_000108-0004.wav", "model"), | |
| "cml_1770_1028_000036_0002": ("kyutai/tts-voices", "cml-tts/fr/1770_1028_000036-0002.wav", "model"), | |
| "cml_2114_1656_000053_0001": ("kyutai/tts-voices", "cml-tts/fr/2114_1656_000053-0001.wav", "model"), | |
| "cml_2154_2576_000020_0003": ("kyutai/tts-voices", "cml-tts/fr/2154_2576_000020-0003.wav", "model"), | |
| "cml_2216_1745_000007_0001": ("kyutai/tts-voices", "cml-tts/fr/2216_1745_000007-0001.wav", "model"), | |
| "cml_2223_1745_000009_0002": ("kyutai/tts-voices", "cml-tts/fr/2223_1745_000009-0002.wav", "model"), | |
| "cml_2465_1943_000152_0002": ("kyutai/tts-voices", "cml-tts/fr/2465_1943_000152-0002.wav", "model"), | |
| "cml_296_1028_000022_0001": ("kyutai/tts-voices", "cml-tts/fr/296_1028_000022-0001.wav", "model"), | |
| "cml_3267_1902_000075_0001": ("kyutai/tts-voices", "cml-tts/fr/3267_1902_000075-0001.wav", "model"), | |
| "cml_4193_3103_000004_0001": ("kyutai/tts-voices", "cml-tts/fr/4193_3103_000004-0001.wav", "model"), | |
| "cml_4482_3103_000063_0001": ("kyutai/tts-voices", "cml-tts/fr/4482_3103_000063-0001.wav", "model"), | |
| "cml_4724_3731_000031_0001": ("kyutai/tts-voices", "cml-tts/fr/4724_3731_000031-0001.wav", "model"), | |
| "cml_4937_3731_000004_0001": ("kyutai/tts-voices", "cml-tts/fr/4937_3731_000004-0001.wav", "model"), | |
| "cml_5207_3078_000031_0002": ("kyutai/tts-voices", "cml-tts/fr/5207_3078_000031-0002.wav", "model"), | |
| "cml_5476_3103_000072_0001": ("kyutai/tts-voices", "cml-tts/fr/5476_3103_000072-0001.wav", "model"), | |
| "cml_577_394_000070_0001": ("kyutai/tts-voices", "cml-tts/fr/577_394_000070-0001.wav", "model"), | |
| "cml_5790_4893_000052_0001": ("kyutai/tts-voices", "cml-tts/fr/5790_4893_000052-0001.wav", "model"), | |
| "cml_579_2548_000015_0001": ("kyutai/tts-voices", "cml-tts/fr/579_2548_000015-0001.wav", "model"), | |
| "cml_5830_4703_000037_0001": ("kyutai/tts-voices", "cml-tts/fr/5830_4703_000037-0001.wav", "model"), | |
| "cml_6318_7016_000027_0002": ("kyutai/tts-voices", "cml-tts/fr/6318_7016_000027-0002.wav", "model"), | |
| "cml_7142_2432_000124_0003": ("kyutai/tts-voices", "cml-tts/fr/7142_2432_000124-0003.wav", "model"), | |
| "cml_7400_2928_000100_0001": ("kyutai/tts-voices", "cml-tts/fr/7400_2928_000100-0001.wav", "model"), | |
| "cml_7591_6742_000149_0002": ("kyutai/tts-voices", "cml-tts/fr/7591_6742_000149-0002.wav", "model"), | |
| "cml_7601_7727_000062_0001": ("kyutai/tts-voices", "cml-tts/fr/7601_7727_000062-0001.wav", "model"), | |
| "cml_7762_8734_000048_0002": ("kyutai/tts-voices", "cml-tts/fr/7762_8734_000048-0002.wav", "model"), | |
| "cml_8128_7016_000047_0002": ("kyutai/tts-voices", "cml-tts/fr/8128_7016_000047-0002.wav", "model"), | |
| "cml_928_486_000075_0001": ("kyutai/tts-voices", "cml-tts/fr/928_486_000075-0001.wav", "model"), | |
| "cml_9834_9697_000150_0003": ("kyutai/tts-voices", "cml-tts/fr/9834_9697_000150-0003.wav", "model"), | |
| "ears_p001": ("kyutai/tts-voices", "ears/p001/freeform_speech_01.wav", "model"), | |
| "ears_p002": ("kyutai/tts-voices", "ears/p002/freeform_speech_01.wav", "model"), | |
| "ears_p003": ("kyutai/tts-voices", "ears/p003/freeform_speech_01.wav", "model"), | |
| "ears_p003_adoration": ("kyutai/tts-voices", "ears/p003/emo_adoration_freeform.wav", "model"), | |
| "ears_p003_amazement": ("kyutai/tts-voices", "ears/p003/emo_amazement_freeform.wav", "model"), | |
| "ears_p003_amusement": ("kyutai/tts-voices", "ears/p003/emo_amusement_freeform.wav", "model"), | |
| "ears_p003_anger": ("kyutai/tts-voices", "ears/p003/emo_anger_freeform.wav", "model"), | |
| "ears_p003_confusion": ("kyutai/tts-voices", "ears/p003/emo_confusion_freeform.wav", "model"), | |
| "ears_p003_contentment": ("kyutai/tts-voices", "ears/p003/emo_contentment_freeform.wav", "model"), | |
| "ears_p003_cuteness": ("kyutai/tts-voices", "ears/p003/emo_cuteness_freeform.wav", "model"), | |
| "ears_p003_desire": ("kyutai/tts-voices", "ears/p003/emo_desire_freeform.wav", "model"), | |
| "ears_p003_disappointment": ("kyutai/tts-voices", "ears/p003/emo_disappointment_freeform.wav", "model"), | |
| "ears_p003_disgust": ("kyutai/tts-voices", "ears/p003/emo_disgust_freeform.wav", "model"), | |
| "ears_p003_distress": ("kyutai/tts-voices", "ears/p003/emo_distress_freeform.wav", "model"), | |
| "ears_p003_embarassment": ("kyutai/tts-voices", "ears/p003/emo_embarassment_freeform.wav", "model"), | |
| "ears_p003_extasy": ("kyutai/tts-voices", "ears/p003/emo_extasy_freeform.wav", "model"), | |
| "ears_p003_fear": ("kyutai/tts-voices", "ears/p003/emo_fear_freeform.wav", "model"), | |
| "ears_p003_guilt": ("kyutai/tts-voices", "ears/p003/emo_guilt_freeform.wav", "model"), | |
| "ears_p003_interest": ("kyutai/tts-voices", "ears/p003/emo_interest_freeform.wav", "model"), | |
| "ears_p003_neutral": ("kyutai/tts-voices", "ears/p003/emo_neutral_freeform.wav", "model"), | |
| "ears_p003_pain": ("kyutai/tts-voices", "ears/p003/emo_pain_freeform.wav", "model"), | |
| "ears_p003_pride": ("kyutai/tts-voices", "ears/p003/emo_pride_freeform.wav", "model"), | |
| "ears_p003_realization": ("kyutai/tts-voices", "ears/p003/emo_realization_freeform.wav", "model"), | |
| "ears_p003_relief": ("kyutai/tts-voices", "ears/p003/emo_relief_freeform.wav", "model"), | |
| "ears_p003_sadness": ("kyutai/tts-voices", "ears/p003/emo_sadness_freeform.wav", "model"), | |
| "ears_p003_serenity": ("kyutai/tts-voices", "ears/p003/emo_serenity_freeform.wav", "model"), | |
| "ears_p004": ("kyutai/tts-voices", "ears/p004/freeform_speech_01.wav", "model"), | |
| "ears_p005": ("kyutai/tts-voices", "ears/p005/freeform_speech_01.wav", "model"), | |
| "ears_p006": ("kyutai/tts-voices", "ears/p006/freeform_speech_01.wav", "model"), | |
| "ears_p007": ("kyutai/tts-voices", "ears/p007/freeform_speech_01.wav", "model"), | |
| "ears_p008": ("kyutai/tts-voices", "ears/p008/freeform_speech_01.wav", "model"), | |
| "ears_p009": ("kyutai/tts-voices", "ears/p009/freeform_speech_01.wav", "model"), | |
| "ears_p010": ("kyutai/tts-voices", "ears/p010/freeform_speech_01.wav", "model"), | |
| "ears_p011": ("kyutai/tts-voices", "ears/p011/freeform_speech_01.wav", "model"), | |
| "ears_p012": ("kyutai/tts-voices", "ears/p012/freeform_speech_01.wav", "model"), | |
| "ears_p013": ("kyutai/tts-voices", "ears/p013/freeform_speech_01.wav", "model"), | |
| "ears_p014": ("kyutai/tts-voices", "ears/p014/freeform_speech_01.wav", "model"), | |
| "ears_p015": ("kyutai/tts-voices", "ears/p015/freeform_speech_01.wav", "model"), | |
| "ears_p016": ("kyutai/tts-voices", "ears/p016/freeform_speech_01.wav", "model"), | |
| "ears_p017": ("kyutai/tts-voices", "ears/p017/freeform_speech_01.wav", "model"), | |
| "ears_p018": ("kyutai/tts-voices", "ears/p018/freeform_speech_01.wav", "model"), | |
| "ears_p019": ("kyutai/tts-voices", "ears/p019/freeform_speech_01.wav", "model"), | |
| "ears_p020": ("kyutai/tts-voices", "ears/p020/freeform_speech_01.wav", "model"), | |
| "ears_p021": ("kyutai/tts-voices", "ears/p021/freeform_speech_01.wav", "model"), | |
| "ears_p022": ("kyutai/tts-voices", "ears/p022/freeform_speech_01.wav", "model"), | |
| "ears_p023": ("kyutai/tts-voices", "ears/p023/freeform_speech_01.wav", "model"), | |
| "ears_p024": ("kyutai/tts-voices", "ears/p024/freeform_speech_01.wav", "model"), | |
| "ears_p025": ("kyutai/tts-voices", "ears/p025/freeform_speech_01.wav", "model"), | |
| "ears_p026": ("kyutai/tts-voices", "ears/p026/freeform_speech_01.wav", "model"), | |
| "ears_p027": ("kyutai/tts-voices", "ears/p027/freeform_speech_01.wav", "model"), | |
| "ears_p028": ("kyutai/tts-voices", "ears/p028/freeform_speech_01.wav", "model"), | |
| "ears_p029": ("kyutai/tts-voices", "ears/p029/freeform_speech_01.wav", "model"), | |
| "ears_p030": ("kyutai/tts-voices", "ears/p030/freeform_speech_01.wav", "model"), | |
| "ears_p031": ("kyutai/tts-voices", "ears/p031/freeform_speech_01.wav", "model"), | |
| "ears_p031_adoration": ("kyutai/tts-voices", "ears/p031/emo_adoration_freeform.wav", "model"), | |
| "ears_p031_amazement": ("kyutai/tts-voices", "ears/p031/emo_amazement_freeform.wav", "model"), | |
| "ears_p031_amusement": ("kyutai/tts-voices", "ears/p031/emo_amusement_freeform.wav", "model"), | |
| "ears_p031_anger": ("kyutai/tts-voices", "ears/p031/emo_anger_freeform.wav", "model"), | |
| "ears_p031_confusion": ("kyutai/tts-voices", "ears/p031/emo_confusion_freeform.wav", "model"), | |
| "ears_p031_contentment": ("kyutai/tts-voices", "ears/p031/emo_contentment_freeform.wav", "model"), | |
| "ears_p031_cuteness": ("kyutai/tts-voices", "ears/p031/emo_cuteness_freeform.wav", "model"), | |
| "ears_p031_desire": ("kyutai/tts-voices", "ears/p031/emo_desire_freeform.wav", "model"), | |
| "ears_p031_disappointment": ("kyutai/tts-voices", "ears/p031/emo_disappointment_freeform.wav", "model"), | |
| "ears_p031_disgust": ("kyutai/tts-voices", "ears/p031/emo_disgust_freeform.wav", "model"), | |
| "ears_p031_distress": ("kyutai/tts-voices", "ears/p031/emo_distress_freeform.wav", "model"), | |
| "ears_p031_embarassment": ("kyutai/tts-voices", "ears/p031/emo_embarassment_freeform.wav", "model"), | |
| "ears_p031_extasy": ("kyutai/tts-voices", "ears/p031/emo_extasy_freeform.wav", "model"), | |
| "ears_p031_fear": ("kyutai/tts-voices", "ears/p031/emo_fear_freeform.wav", "model"), | |
| "ears_p031_guilt": ("kyutai/tts-voices", "ears/p031/emo_guilt_freeform.wav", "model"), | |
| "ears_p031_interest": ("kyutai/tts-voices", "ears/p031/emo_interest_freeform.wav", "model"), | |
| "ears_p031_neutral": ("kyutai/tts-voices", "ears/p031/emo_neutral_freeform.wav", "model"), | |
| "ears_p031_pain": ("kyutai/tts-voices", "ears/p031/emo_pain_freeform.wav", "model"), | |
| "ears_p031_pride": ("kyutai/tts-voices", "ears/p031/emo_pride_freeform.wav", "model"), | |
| "ears_p031_realization": ("kyutai/tts-voices", "ears/p031/emo_realization_freeform.wav", "model"), | |
| "ears_p031_relief": ("kyutai/tts-voices", "ears/p031/emo_relief_freeform.wav", "model"), | |
| "ears_p031_sadness": ("kyutai/tts-voices", "ears/p031/emo_sadness_freeform.wav", "model"), | |
| "ears_p031_serenity": ("kyutai/tts-voices", "ears/p031/emo_serenity_freeform.wav", "model"), | |
| "ears_p032": ("kyutai/tts-voices", "ears/p032/freeform_speech_01.wav", "model"), | |
| "ears_p033": ("kyutai/tts-voices", "ears/p033/freeform_speech_01.wav", "model"), | |
| "ears_p034": ("kyutai/tts-voices", "ears/p034/freeform_speech_01.wav", "model"), | |
| "ears_p035": ("kyutai/tts-voices", "ears/p035/freeform_speech_01.wav", "model"), | |
| "ears_p036": ("kyutai/tts-voices", "ears/p036/freeform_speech_01.wav", "model"), | |
| "ears_p037": ("kyutai/tts-voices", "ears/p037/freeform_speech_01.wav", "model"), | |
| "ears_p038": ("kyutai/tts-voices", "ears/p038/freeform_speech_01.wav", "model"), | |
| "ears_p039": ("kyutai/tts-voices", "ears/p039/freeform_speech_01.wav", "model"), | |
| "ears_p040": ("kyutai/tts-voices", "ears/p040/freeform_speech_01.wav", "model"), | |
| "ears_p041": ("kyutai/tts-voices", "ears/p041/freeform_speech_01.wav", "model"), | |
| "ears_p042": ("kyutai/tts-voices", "ears/p042/freeform_speech_01.wav", "model"), | |
| "ears_p043": ("kyutai/tts-voices", "ears/p043/freeform_speech_01.wav", "model"), | |
| "ears_p044": ("kyutai/tts-voices", "ears/p044/freeform_speech_01.wav", "model"), | |
| "ears_p045": ("kyutai/tts-voices", "ears/p045/freeform_speech_01.wav", "model"), | |
| "ears_p046": ("kyutai/tts-voices", "ears/p046/freeform_speech_01.wav", "model"), | |
| "ears_p047": ("kyutai/tts-voices", "ears/p047/freeform_speech_01.wav", "model"), | |
| "ears_p048": ("kyutai/tts-voices", "ears/p048/freeform_speech_01.wav", "model"), | |
| "ears_p049": ("kyutai/tts-voices", "ears/p049/freeform_speech_01.wav", "model"), | |
| "ears_p050": ("kyutai/tts-voices", "ears/p050/freeform_speech_01.wav", "model"), | |
| "ears_p051": ("kyutai/tts-voices", "ears/p051/freeform_speech_01.wav", "model"), | |
| "ears_p052": ("kyutai/tts-voices", "ears/p052/freeform_speech_01.wav", "model"), | |
| "ears_p053": ("kyutai/tts-voices", "ears/p053/freeform_speech_01.wav", "model"), | |
| "ears_p054": ("kyutai/tts-voices", "ears/p054/freeform_speech_01.wav", "model"), | |
| "ears_p055": ("kyutai/tts-voices", "ears/p055/freeform_speech_01.wav", "model"), | |
| "ears_p056": ("kyutai/tts-voices", "ears/p056/freeform_speech_01.wav", "model"), | |
| "ears_p057": ("kyutai/tts-voices", "ears/p057/freeform_speech_01.wav", "model"), | |
| "ears_p058": ("kyutai/tts-voices", "ears/p058/freeform_speech_01.wav", "model"), | |
| "ears_p059": ("kyutai/tts-voices", "ears/p059/freeform_speech_01.wav", "model"), | |
| "ears_p060": ("kyutai/tts-voices", "ears/p060/freeform_speech_01.wav", "model"), | |
| "ears_p061": ("kyutai/tts-voices", "ears/p061/freeform_speech_01.wav", "model"), | |
| "ears_p062": ("kyutai/tts-voices", "ears/p062/freeform_speech_01.wav", "model"), | |
| "ears_p063": ("kyutai/tts-voices", "ears/p063/freeform_speech_01.wav", "model"), | |
| "ears_p064": ("kyutai/tts-voices", "ears/p064/freeform_speech_01.wav", "model"), | |
| "ears_p065": ("kyutai/tts-voices", "ears/p065/freeform_speech_01.wav", "model"), | |
| "ears_p066": ("kyutai/tts-voices", "ears/p066/freeform_speech_01.wav", "model"), | |
| "ears_p067": ("kyutai/tts-voices", "ears/p067/freeform_speech_01.wav", "model"), | |
| "ears_p068": ("kyutai/tts-voices", "ears/p068/freeform_speech_01.wav", "model"), | |
| "ears_p069": ("kyutai/tts-voices", "ears/p069/freeform_speech_01.wav", "model"), | |
| "ears_p070": ("kyutai/tts-voices", "ears/p070/freeform_speech_01.wav", "model"), | |
| "ears_p071": ("kyutai/tts-voices", "ears/p071/freeform_speech_01.wav", "model"), | |
| "ears_p072": ("kyutai/tts-voices", "ears/p072/freeform_speech_01.wav", "model"), | |
| "ears_p073": ("kyutai/tts-voices", "ears/p073/freeform_speech_01.wav", "model"), | |
| "ears_p074": ("kyutai/tts-voices", "ears/p074/freeform_speech_01.wav", "model"), | |
| "ears_p075": ("kyutai/tts-voices", "ears/p075/freeform_speech_01.wav", "model"), | |
| "ears_p076": ("kyutai/tts-voices", "ears/p076/freeform_speech_01.wav", "model"), | |
| "ears_p077": ("kyutai/tts-voices", "ears/p077/freeform_speech_01.wav", "model"), | |
| "ears_p078": ("kyutai/tts-voices", "ears/p078/freeform_speech_01.wav", "model"), | |
| "ears_p079": ("kyutai/tts-voices", "ears/p079/freeform_speech_01.wav", "model"), | |
| "ears_p080": ("kyutai/tts-voices", "ears/p080/freeform_speech_01.wav", "model"), | |
| "ears_p081": ("kyutai/tts-voices", "ears/p081/freeform_speech_01.wav", "model"), | |
| "ears_p082": ("kyutai/tts-voices", "ears/p082/freeform_speech_01.wav", "model"), | |
| "ears_p083": ("kyutai/tts-voices", "ears/p083/freeform_speech_01.wav", "model"), | |
| "ears_p084": ("kyutai/tts-voices", "ears/p084/freeform_speech_01.wav", "model"), | |
| "ears_p085": ("kyutai/tts-voices", "ears/p085/freeform_speech_01.wav", "model"), | |
| "ears_p086": ("kyutai/tts-voices", "ears/p086/freeform_speech_01.wav", "model"), | |
| "ears_p087": ("kyutai/tts-voices", "ears/p087/freeform_speech_01.wav", "model"), | |
| "ears_p088": ("kyutai/tts-voices", "ears/p088/freeform_speech_01.wav", "model"), | |
| "ears_p089": ("kyutai/tts-voices", "ears/p089/freeform_speech_01.wav", "model"), | |
| "ears_p090": ("kyutai/tts-voices", "ears/p090/freeform_speech_01.wav", "model"), | |
| "ears_p091": ("kyutai/tts-voices", "ears/p091/freeform_speech_01.wav", "model"), | |
| "ears_p092": ("kyutai/tts-voices", "ears/p092/freeform_speech_01.wav", "model"), | |
| "ears_p093": ("kyutai/tts-voices", "ears/p093/freeform_speech_01.wav", "model"), | |
| "ears_p094": ("kyutai/tts-voices", "ears/p094/freeform_speech_01.wav", "model"), | |
| "ears_p095": ("kyutai/tts-voices", "ears/p095/freeform_speech_01.wav", "model"), | |
| "ears_p096": ("kyutai/tts-voices", "ears/p096/freeform_speech_01.wav", "model"), | |
| "ears_p097": ("kyutai/tts-voices", "ears/p097/freeform_speech_01.wav", "model"), | |
| "ears_p098": ("kyutai/tts-voices", "ears/p098/freeform_speech_01.wav", "model"), | |
| "ears_p099": ("kyutai/tts-voices", "ears/p099/freeform_speech_01.wav", "model"), | |
| "ears_p100": ("kyutai/tts-voices", "ears/p100/freeform_speech_01.wav", "model"), | |
| "ears_p101": ("kyutai/tts-voices", "ears/p101/freeform_speech_01.wav", "model"), | |
| "ears_p102": ("kyutai/tts-voices", "ears/p102/freeform_speech_01.wav", "model"), | |
| "ears_p103": ("kyutai/tts-voices", "ears/p103/freeform_speech_01.wav", "model"), | |
| "ears_p104": ("kyutai/tts-voices", "ears/p104/freeform_speech_01.wav", "model"), | |
| "ears_p105": ("kyutai/tts-voices", "ears/p105/freeform_speech_01.wav", "model"), | |
| "ears_p106": ("kyutai/tts-voices", "ears/p106/freeform_speech_01.wav", "model"), | |
| "ears_p107": ("kyutai/tts-voices", "ears/p107/freeform_speech_01.wav", "model"), | |
| "ex_duo_a_default": ("kyutai/tts-voices", "expresso/ex01-ex02_default_001_channel1_168s.wav", "model"), | |
| "ex_duo_a_enunciated": ("kyutai/tts-voices", "expresso/ex01-ex02_enunciated_001_channel1_432s.wav", "model"), | |
| "ex_duo_a_fast": ("kyutai/tts-voices", "expresso/ex01-ex02_fast_001_channel1_104s.wav", "model"), | |
| "ex_duo_a_projected": ("kyutai/tts-voices", "expresso/ex01-ex02_projected_001_channel1_46s.wav", "model"), | |
| "ex_duo_a_whisper": ("kyutai/tts-voices", "expresso/ex01-ex02_whisper_001_channel1_579s.wav", "model"), | |
| "ex_duo_b_default": ("kyutai/tts-voices", "expresso/ex04-ex03_default_001_channel1_3s.wav", "model"), | |
| "ex_duo_b_enunciated": ("kyutai/tts-voices", "expresso/ex04-ex03_enunciated_001_channel1_86s.wav", "model"), | |
| "ex_duo_b_fast": ("kyutai/tts-voices", "expresso/ex04-ex03_fast_001_channel1_208s.wav", "model"), | |
| "ex_duo_b_projected": ("kyutai/tts-voices", "expresso/ex04-ex03_projected_001_channel1_192s.wav", "model"), | |
| "ex_duo_b_whisper": ("kyutai/tts-voices", "expresso/ex04-ex03_whisper_001_channel1_198s.wav", "model"), | |
| "ex_fem_emote_angry": ("kyutai/tts-voices", "expresso/ex03-ex01_angry_001_channel1_201s.wav", "model"), | |
| "ex_fem_emote_awe": ("kyutai/tts-voices", "expresso/ex03-ex01_awe_001_channel1_1323s.wav", "model"), | |
| "ex_fem_emote_calm": ("kyutai/tts-voices", "expresso/ex03-ex01_calm_001_channel1_1143s.wav", "model"), | |
| "ex_fem_emote_confused": ("kyutai/tts-voices", "expresso/ex03-ex01_confused_001_channel1_909s.wav", "model"), | |
| "ex_fem_emote_desire": ("kyutai/tts-voices", "expresso/ex03-ex01_desire_004_channel1_545s.wav", "model"), | |
| "ex_fem_emote_disgusted": ("kyutai/tts-voices", "expresso/ex03-ex01_disgusted_004_channel1_170s.wav", "model"), | |
| "ex_fem_emote_enunciated": ("kyutai/tts-voices", "expresso/ex03-ex01_enunciated_001_channel1_388s.wav", "model"), | |
| "ex_fem_emote_happy": ("kyutai/tts-voices", "expresso/ex03-ex01_happy_001_channel1_334s.wav", "model"), | |
| "ex_fem_emote_laughing": ("kyutai/tts-voices", "expresso/ex03-ex01_laughing_001_channel1_188s.wav", "model"), | |
| "ex_fem_emote_nonverbal": ("kyutai/tts-voices", "expresso/ex03-ex01_nonverbal_006_channel1_62s.wav", "model"), | |
| "ex_fem_emote_sarcastic": ("kyutai/tts-voices", "expresso/ex03-ex01_sarcastic_001_channel1_435s.wav", "model"), | |
| "ex_fem_emote_sleepy": ("kyutai/tts-voices", "expresso/ex03-ex01_sleepy_001_channel1_619s.wav", "model"), | |
| "ex_fem_narr_animal_animaldir": ("kyutai/tts-voices", "expresso/ex03-ex02_animal-animaldir_003_channel1_32s.wav", "model"), | |
| "ex_fem_narr_animaldir_animal": ("kyutai/tts-voices", "expresso/ex03-ex02_animaldir-animal_008_channel1_147s.wav", "model"), | |
| "ex_fem_narr_child_childdir": ("kyutai/tts-voices", "expresso/ex03-ex02_child-childdir_001_channel1_291s.wav", "model"), | |
| "ex_fem_narr_childdir_child": ("kyutai/tts-voices", "expresso/ex03-ex02_childdir-child_004_channel1_308s.wav", "model"), | |
| "ex_fem_narr_laughing": ("kyutai/tts-voices", "expresso/ex03-ex02_laughing_001_channel1_248s.wav", "model"), | |
| "ex_fem_narr_narration": ("kyutai/tts-voices", "expresso/ex03-ex02_narration_001_channel1_674s.wav", "model"), | |
| "ex_fem_narr_sad_sympathetic": ("kyutai/tts-voices", "expresso/ex03-ex02_sad-sympathetic_001_channel1_454s.wav", "model"), | |
| "ex_fem_narr_sympathetic_sad": ("kyutai/tts-voices", "expresso/ex03-ex02_sympathetic-sad_008_channel1_215s.wav", "model"), | |
| "ex_mal_emote_angry": ("kyutai/tts-voices", "expresso/ex04-ex02_angry_001_channel1_119s.wav", "model"), | |
| "ex_mal_emote_awe": ("kyutai/tts-voices", "expresso/ex04-ex02_awe_001_channel1_982s.wav", "model"), | |
| "ex_mal_emote_bored": ("kyutai/tts-voices", "expresso/ex04-ex02_bored_001_channel1_254s.wav", "model"), | |
| "ex_mal_emote_calm": ("kyutai/tts-voices", "expresso/ex04-ex02_calm_002_channel1_480s.wav", "model"), | |
| "ex_mal_emote_confused": ("kyutai/tts-voices", "expresso/ex04-ex02_confused_001_channel1_499s.wav", "model"), | |
| "ex_mal_emote_desire": ("kyutai/tts-voices", "expresso/ex04-ex02_desire_001_channel1_657s.wav", "model"), | |
| "ex_mal_emote_disgusted": ("kyutai/tts-voices", "expresso/ex04-ex02_disgusted_004_channel1_169s.wav", "model"), | |
| "ex_mal_emote_enunciated": ("kyutai/tts-voices", "expresso/ex04-ex02_enunciated_001_channel1_496s.wav", "model"), | |
| "ex_mal_emote_fearful": ("kyutai/tts-voices", "expresso/ex04-ex02_fearful_001_channel1_316s.wav", "model"), | |
| "ex_mal_emote_happy": ("kyutai/tts-voices", "expresso/ex04-ex02_happy_001_channel1_118s.wav", "model"), | |
| "ex_mal_emote_laughing": ("kyutai/tts-voices", "expresso/ex04-ex02_laughing_001_channel1_147s.wav", "model"), | |
| "ex_mal_emote_nonverbal": ("kyutai/tts-voices", "expresso/ex04-ex02_nonverbal_004_channel1_18s.wav", "model"), | |
| "ex_mal_emote_sarcastic": ("kyutai/tts-voices", "expresso/ex04-ex02_sarcastic_001_channel1_519s.wav", "model"), | |
| "ex_mal_narr_animal_animaldir": ("kyutai/tts-voices", "expresso/ex04-ex01_animal-animaldir_006_channel1_196s.wav", "model"), | |
| "ex_mal_narr_animaldir_animal": ("kyutai/tts-voices", "expresso/ex04-ex01_animaldir-animal_001_channel1_118s.wav", "model"), | |
| "ex_mal_narr_child_childdir": ("kyutai/tts-voices", "expresso/ex04-ex01_child-childdir_004_channel1_118s.wav", "model"), | |
| "ex_mal_narr_childdir_child": ("kyutai/tts-voices", "expresso/ex04-ex01_childdir-child_001_channel1_228s.wav", "model"), | |
| "ex_mal_narr_disgusted": ("kyutai/tts-voices", "expresso/ex04-ex01_disgusted_001_channel1_130s.wav", "model"), | |
| "ex_mal_narr_laughing": ("kyutai/tts-voices", "expresso/ex04-ex01_laughing_001_channel1_306s.wav", "model"), | |
| "ex_mal_narr_narration": ("kyutai/tts-voices", "expresso/ex04-ex01_narration_001_channel1_605s.wav", "model"), | |
| "ex_mal_narr_sad_sympathetic": ("kyutai/tts-voices", "expresso/ex04-ex01_sad-sympathetic_001_channel1_267s.wav", "model"), | |
| "ex_mal_narr_sympathetic_sad": ("kyutai/tts-voices", "expresso/ex04-ex01_sympathetic-sad_008_channel1_415s.wav", "model"), | |
| "unmute_default_voice": ("kyutai/tts-voices", "unmute-prod-website/default_voice.wav", "model"), | |
| "unmute_degaulle_2": ("kyutai/tts-voices", "unmute-prod-website/degaulle-2.wav", "model"), | |
| "unmute_developpeuse_3": ("kyutai/tts-voices", "unmute-prod-website/developpeuse-3.wav", "model"), | |
| "unmute_ex04_narration_longform_00001": ("kyutai/tts-voices", "unmute-prod-website/ex04_narration_longform_00001.wav", "model"), | |
| "unmute_fabieng_enhanced_v2": ("kyutai/tts-voices", "unmute-prod-website/fabieng-enhanced-v2.wav", "model"), | |
| "unmute_p329_022": ("kyutai/tts-voices", "unmute-prod-website/p329_022.wav", "model"), | |
| "vctk_p225_023": ("kyutai/tts-voices", "vctk/p225_023.wav", "model"), | |
| "vctk_p226_023": ("kyutai/tts-voices", "vctk/p226_023.wav", "model"), | |
| "vctk_p227_023": ("kyutai/tts-voices", "vctk/p227_023.wav", "model"), | |
| "vctk_p228_023": ("kyutai/tts-voices", "vctk/p228_023.wav", "model"), | |
| "vctk_p229_023": ("kyutai/tts-voices", "vctk/p229_023.wav", "model"), | |
| "vctk_p230_023": ("kyutai/tts-voices", "vctk/p230_023.wav", "model"), | |
| "vctk_p231_023": ("kyutai/tts-voices", "vctk/p231_023.wav", "model"), | |
| "vctk_p232_023": ("kyutai/tts-voices", "vctk/p232_023.wav", "model"), | |
| "vctk_p233_023": ("kyutai/tts-voices", "vctk/p233_023.wav", "model"), | |
| "vctk_p234_023": ("kyutai/tts-voices", "vctk/p234_023.wav", "model"), | |
| "vctk_p236_023": ("kyutai/tts-voices", "vctk/p236_023.wav", "model"), | |
| "vctk_p237_023": ("kyutai/tts-voices", "vctk/p237_023.wav", "model"), | |
| "vctk_p238_023": ("kyutai/tts-voices", "vctk/p238_023.wav", "model"), | |
| "vctk_p239_023": ("kyutai/tts-voices", "vctk/p239_023.wav", "model"), | |
| "vctk_p240_023": ("kyutai/tts-voices", "vctk/p240_023.wav", "model"), | |
| "vctk_p241_023": ("kyutai/tts-voices", "vctk/p241_023.wav", "model"), | |
| "vctk_p243_023": ("kyutai/tts-voices", "vctk/p243_023.wav", "model"), | |
| "vctk_p244_023": ("kyutai/tts-voices", "vctk/p244_023.wav", "model"), | |
| "vctk_p245_023": ("kyutai/tts-voices", "vctk/p245_023.wav", "model"), | |
| "vctk_p246_023": ("kyutai/tts-voices", "vctk/p246_023.wav", "model"), | |
| "vctk_p247_023": ("kyutai/tts-voices", "vctk/p247_023.wav", "model"), | |
| "vctk_p248_023": ("kyutai/tts-voices", "vctk/p248_023.wav", "model"), | |
| "vctk_p249_023": ("kyutai/tts-voices", "vctk/p249_023.wav", "model"), | |
| "vctk_p250_023": ("kyutai/tts-voices", "vctk/p250_023.wav", "model"), | |
| "vctk_p251_023": ("kyutai/tts-voices", "vctk/p251_023.wav", "model"), | |
| "vctk_p252_023": ("kyutai/tts-voices", "vctk/p252_023.wav", "model"), | |
| "vctk_p253_023": ("kyutai/tts-voices", "vctk/p253_023.wav", "model"), | |
| "vctk_p254_023": ("kyutai/tts-voices", "vctk/p254_023.wav", "model"), | |
| "vctk_p255_023": ("kyutai/tts-voices", "vctk/p255_023.wav", "model"), | |
| "vctk_p256_023": ("kyutai/tts-voices", "vctk/p256_023.wav", "model"), | |
| "vctk_p257_023": ("kyutai/tts-voices", "vctk/p257_023.wav", "model"), | |
| "vctk_p258_023": ("kyutai/tts-voices", "vctk/p258_023.wav", "model"), | |
| "vctk_p259_023": ("kyutai/tts-voices", "vctk/p259_023.wav", "model"), | |
| "vctk_p260_023": ("kyutai/tts-voices", "vctk/p260_023.wav", "model"), | |
| "vctk_p261_023": ("kyutai/tts-voices", "vctk/p261_023.wav", "model"), | |
| "vctk_p262_023": ("kyutai/tts-voices", "vctk/p262_023.wav", "model"), | |
| "vctk_p263_023": ("kyutai/tts-voices", "vctk/p263_023.wav", "model"), | |
| "vctk_p264_023": ("kyutai/tts-voices", "vctk/p264_023.wav", "model"), | |
| "vctk_p265_023": ("kyutai/tts-voices", "vctk/p265_023.wav", "model"), | |
| "vctk_p266_023": ("kyutai/tts-voices", "vctk/p266_023.wav", "model"), | |
| "vctk_p267_023": ("kyutai/tts-voices", "vctk/p267_023.wav", "model"), | |
| "vctk_p269_023": ("kyutai/tts-voices", "vctk/p269_023.wav", "model"), | |
| "vctk_p270_023": ("kyutai/tts-voices", "vctk/p270_023.wav", "model"), | |
| "vctk_p271_023": ("kyutai/tts-voices", "vctk/p271_023.wav", "model"), | |
| "vctk_p272_023": ("kyutai/tts-voices", "vctk/p272_023.wav", "model"), | |
| "vctk_p273_023": ("kyutai/tts-voices", "vctk/p273_023.wav", "model"), | |
| "vctk_p274_023": ("kyutai/tts-voices", "vctk/p274_023.wav", "model"), | |
| "vctk_p275_023": ("kyutai/tts-voices", "vctk/p275_023.wav", "model"), | |
| "vctk_p276_023": ("kyutai/tts-voices", "vctk/p276_023.wav", "model"), | |
| "vctk_p277_023": ("kyutai/tts-voices", "vctk/p277_023.wav", "model"), | |
| "vctk_p278_023": ("kyutai/tts-voices", "vctk/p278_023.wav", "model"), | |
| "vctk_p279_023": ("kyutai/tts-voices", "vctk/p279_023.wav", "model"), | |
| "vctk_p280_023": ("kyutai/tts-voices", "vctk/p280_023.wav", "model"), | |
| "vctk_p281_023": ("kyutai/tts-voices", "vctk/p281_023.wav", "model"), | |
| "vctk_p282_023": ("kyutai/tts-voices", "vctk/p282_023.wav", "model"), | |
| "vctk_p283_023": ("kyutai/tts-voices", "vctk/p283_023.wav", "model"), | |
| "vctk_p284_023": ("kyutai/tts-voices", "vctk/p284_023.wav", "model"), | |
| "vctk_p285_023": ("kyutai/tts-voices", "vctk/p285_023.wav", "model"), | |
| "vctk_p286_023": ("kyutai/tts-voices", "vctk/p286_023.wav", "model"), | |
| "vctk_p287_023": ("kyutai/tts-voices", "vctk/p287_023.wav", "model"), | |
| "vctk_p288_023": ("kyutai/tts-voices", "vctk/p288_023.wav", "model"), | |
| "vctk_p292_023": ("kyutai/tts-voices", "vctk/p292_023.wav", "model"), | |
| "vctk_p293_023": ("kyutai/tts-voices", "vctk/p293_023.wav", "model"), | |
| "vctk_p294_023": ("kyutai/tts-voices", "vctk/p294_023.wav", "model"), | |
| "vctk_p297_023": ("kyutai/tts-voices", "vctk/p297_023.wav", "model"), | |
| "vctk_p298_023": ("kyutai/tts-voices", "vctk/p298_023.wav", "model"), | |
| "vctk_p299_023": ("kyutai/tts-voices", "vctk/p299_023.wav", "model"), | |
| "vctk_p300_023": ("kyutai/tts-voices", "vctk/p300_023.wav", "model"), | |
| "vctk_p301_023": ("kyutai/tts-voices", "vctk/p301_023.wav", "model"), | |
| "vctk_p302_023": ("kyutai/tts-voices", "vctk/p302_023.wav", "model"), | |
| "vctk_p303_023": ("kyutai/tts-voices", "vctk/p303_023.wav", "model"), | |
| "vctk_p304_023": ("kyutai/tts-voices", "vctk/p304_023.wav", "model"), | |
| "vctk_p305_023": ("kyutai/tts-voices", "vctk/p305_023.wav", "model"), | |
| "vctk_p306_023": ("kyutai/tts-voices", "vctk/p306_023.wav", "model"), | |
| "vctk_p307_023": ("kyutai/tts-voices", "vctk/p307_023.wav", "model"), | |
| "vctk_p308_023": ("kyutai/tts-voices", "vctk/p308_023.wav", "model"), | |
| "vctk_p310_023": ("kyutai/tts-voices", "vctk/p310_023.wav", "model"), | |
| "vctk_p311_023": ("kyutai/tts-voices", "vctk/p311_023.wav", "model"), | |
| "vctk_p312_023": ("kyutai/tts-voices", "vctk/p312_023.wav", "model"), | |
| "vctk_p313_023": ("kyutai/tts-voices", "vctk/p313_023.wav", "model"), | |
| "vctk_p314_023": ("kyutai/tts-voices", "vctk/p314_023.wav", "model"), | |
| "vctk_p315_023": ("kyutai/tts-voices", "vctk/p315_023.wav", "model"), | |
| "vctk_p316_023": ("kyutai/tts-voices", "vctk/p316_023.wav", "model"), | |
| "vctk_p317_023": ("kyutai/tts-voices", "vctk/p317_023.wav", "model"), | |
| "vctk_p318_023": ("kyutai/tts-voices", "vctk/p318_023.wav", "model"), | |
| "vctk_p323_023": ("kyutai/tts-voices", "vctk/p323_023.wav", "model"), | |
| "vctk_p326_023": ("kyutai/tts-voices", "vctk/p326_023.wav", "model"), | |
| "vctk_p329_023": ("kyutai/tts-voices", "vctk/p329_023.wav", "model"), | |
| "vctk_p330_023": ("kyutai/tts-voices", "vctk/p330_023.wav", "model"), | |
| "vctk_p333_023": ("kyutai/tts-voices", "vctk/p333_023.wav", "model"), | |
| "vctk_p334_023": ("kyutai/tts-voices", "vctk/p334_023.wav", "model"), | |
| "vctk_p335_023": ("kyutai/tts-voices", "vctk/p335_023.wav", "model"), | |
| "vctk_p336_023": ("kyutai/tts-voices", "vctk/p336_023.wav", "model"), | |
| "vctk_p339_023": ("kyutai/tts-voices", "vctk/p339_023.wav", "model"), | |
| "vctk_p341_023": ("kyutai/tts-voices", "vctk/p341_023.wav", "model"), | |
| "vctk_p343_023": ("kyutai/tts-voices", "vctk/p343_023.wav", "model"), | |
| "vctk_p345_023": ("kyutai/tts-voices", "vctk/p345_023.wav", "model"), | |
| "vctk_p347_023": ("kyutai/tts-voices", "vctk/p347_023.wav", "model"), | |
| "vctk_p351_023": ("kyutai/tts-voices", "vctk/p351_023.wav", "model"), | |
| "vctk_p360_023": ("kyutai/tts-voices", "vctk/p360_023.wav", "model"), | |
| "vctk_p361_023": ("kyutai/tts-voices", "vctk/p361_023.wav", "model"), | |
| "vctk_p363_023": ("kyutai/tts-voices", "vctk/p363_023.wav", "model"), | |
| "vctk_p364_023": ("kyutai/tts-voices", "vctk/p364_023.wav", "model"), | |
| "vctk_p374_023": ("kyutai/tts-voices", "vctk/p374_023.wav", "model"), | |
| "vctk_p376_023": ("kyutai/tts-voices", "vctk/p376_023.wav", "model"), | |
| "vctk_s5_023": ("kyutai/tts-voices", "vctk/s5_023.wav", "model"), | |
| "vd_0a67": ("kyutai/tts-voices", "voice-donations/0a67.wav", "model"), | |
| "vd_1410": ("kyutai/tts-voices", "voice-donations/1410.wav", "model"), | |
| "vd_1dd0": ("kyutai/tts-voices", "voice-donations/1dd0.wav", "model"), | |
| "vd_2181": ("kyutai/tts-voices", "voice-donations/2181.wav", "model"), | |
| "vd_245e": ("kyutai/tts-voices", "voice-donations/245e.wav", "model"), | |
| "vd_29da": ("kyutai/tts-voices", "voice-donations/29da.wav", "model"), | |
| "vd_30c5": ("kyutai/tts-voices", "voice-donations/30c5.wav", "model"), | |
| "vd_3973": ("kyutai/tts-voices", "voice-donations/3973.wav", "model"), | |
| "vd_4189": ("kyutai/tts-voices", "voice-donations/4189.wav", "model"), | |
| "vd_468c": ("kyutai/tts-voices", "voice-donations/468c.wav", "model"), | |
| "vd_4b13": ("kyutai/tts-voices", "voice-donations/4b13.wav", "model"), | |
| "vd_4b70": ("kyutai/tts-voices", "voice-donations/4b70.wav", "model"), | |
| "vd_5b55": ("kyutai/tts-voices", "voice-donations/5b55.wav", "model"), | |
| "vd_6148": ("kyutai/tts-voices", "voice-donations/6148.wav", "model"), | |
| "vd_617b": ("kyutai/tts-voices", "voice-donations/617b.wav", "model"), | |
| "vd_7020": ("kyutai/tts-voices", "voice-donations/7020.wav", "model"), | |
| "vd_7909": ("kyutai/tts-voices", "voice-donations/7909.wav", "model"), | |
| "vd_7b2b": ("kyutai/tts-voices", "voice-donations/7b2b.wav", "model"), | |
| "vd_8935": ("kyutai/tts-voices", "voice-donations/8935.wav", "model"), | |
| "vd_8dc9": ("kyutai/tts-voices", "voice-donations/8dc9.wav", "model"), | |
| "vd_8f15": ("kyutai/tts-voices", "voice-donations/8f15.wav", "model"), | |
| "vd_92f0": ("kyutai/tts-voices", "voice-donations/92f0.wav", "model"), | |
| "vd_9a2e": ("kyutai/tts-voices", "voice-donations/9a2e.wav", "model"), | |
| "vd_9a66": ("kyutai/tts-voices", "voice-donations/9a66.wav", "model"), | |
| "vd_AHmad": ("kyutai/tts-voices", "voice-donations/AHmad.wav", "model"), | |
| "vd_ASEN": ("kyutai/tts-voices", "voice-donations/ASEN.wav", "model"), | |
| "vd_Aadi": ("kyutai/tts-voices", "voice-donations/Aadi.wav", "model"), | |
| "vd_AbD": ("kyutai/tts-voices", "voice-donations/AbD.wav", "model"), | |
| "vd_Abhinox": ("kyutai/tts-voices", "voice-donations/Abhinox.wav", "model"), | |
| "vd_Abo_Ayman": ("kyutai/tts-voices", "voice-donations/Abo_Ayman.wav", "model"), | |
| "vd_Abob_Malay": ("kyutai/tts-voices", "voice-donations/Abob_Malay.wav", "model"), | |
| "vd_Adarsh_Bulla": ("kyutai/tts-voices", "voice-donations/Adarsh_Bulla.wav", "model"), | |
| "vd_AgentCobra": ("kyutai/tts-voices", "voice-donations/AgentCobra.wav", "model"), | |
| "vd_Ajith": ("kyutai/tts-voices", "voice-donations/Ajith.wav", "model"), | |
| "vd_Alejandro_espanol_latino": ("kyutai/tts-voices", "voice-donations/Alejandro_espanol_latino.wav", "model"), | |
| "vd_Allen": ("kyutai/tts-voices", "voice-donations/Allen.wav", "model"), | |
| "vd_AmitNag": ("kyutai/tts-voices", "voice-donations/AmitNag.wav", "model"), | |
| "vd_Andrea": ("kyutai/tts-voices", "voice-donations/Andrea.wav", "model"), | |
| "vd_Andrea_Spanish": ("kyutai/tts-voices", "voice-donations/Andrea_(Spanish).wav", "model"), | |
| "vd_Antoine_Vala": ("kyutai/tts-voices", "voice-donations/Antoine_Vala.wav", "model"), | |
| "vd_Antoni": ("kyutai/tts-voices", "voice-donations/Antoni.wav", "model"), | |
| "vd_Aon": ("kyutai/tts-voices", "voice-donations/Aon.wav", "model"), | |
| "vd_Arjun_Z": ("kyutai/tts-voices", "voice-donations/Arjun_Z.wav", "model"), | |
| "vd_Aryobe": ("kyutai/tts-voices", "voice-donations/Aryobe.wav", "model"), | |
| "vd_BLUE": ("kyutai/tts-voices", "voice-donations/BLUE.wav", "model"), | |
| "vd_Bijay": ("kyutai/tts-voices", "voice-donations/Bijay.wav", "model"), | |
| "vd_Blake": ("kyutai/tts-voices", "voice-donations/Blake.wav", "model"), | |
| "vd_Bobby_McFern": ("kyutai/tts-voices", "voice-donations/Bobby_McFern.wav", "model"), | |
| "vd_Breaking_1": ("kyutai/tts-voices", "voice-donations/Breaking_1.wav", "model"), | |
| "vd_BrokenHypocrite": ("kyutai/tts-voices", "voice-donations/BrokenHypocrite.wav", "model"), | |
| "vd_Butter": ("kyutai/tts-voices", "voice-donations/Butter.wav", "model"), | |
| "vd_CPS_001": ("kyutai/tts-voices", "voice-donations/CPS_001.wav", "model"), | |
| "vd_Chujus": ("kyutai/tts-voices", "voice-donations/Chujus.wav", "model"), | |
| "vd_Cicada": ("kyutai/tts-voices", "voice-donations/Cicada.wav", "model"), | |
| "vd_ClassicWizard": ("kyutai/tts-voices", "voice-donations/ClassicWizard.wav", "model"), | |
| "vd_Curlinvictus": ("kyutai/tts-voices", "voice-donations/Curlinvictus.wav", "model"), | |
| "vd_Darius": ("kyutai/tts-voices", "voice-donations/Darius.wav", "model"), | |
| "vd_Darya_khan": ("kyutai/tts-voices", "voice-donations/Darya_khan.wav", "model"), | |
| "vd_Deepak": ("kyutai/tts-voices", "voice-donations/Deepak.wav", "model"), | |
| "vd_Dhruv_Rao": ("kyutai/tts-voices", "voice-donations/Dhruv_Rao.wav", "model"), | |
| "vd_Dil": ("kyutai/tts-voices", "voice-donations/Dil.wav", "model"), | |
| "vd_Enrique": ("kyutai/tts-voices", "voice-donations/Enrique.wav", "model"), | |
| "vd_Enrique_Spanish": ("kyutai/tts-voices", "voice-donations/Enrique_(Spanish).wav", "model"), | |
| "vd_Erick": ("kyutai/tts-voices", "voice-donations/Erick.wav", "model"), | |
| "vd_Ernesto_Y": ("kyutai/tts-voices", "voice-donations/Ernesto_Y.wav", "model"), | |
| "vd_Eshan": ("kyutai/tts-voices", "voice-donations/Eshan.wav", "model"), | |
| "vd_Esteban_Aguirre_Arias": ("kyutai/tts-voices", "voice-donations/Esteban_Aguirre_Arias.wav", "model"), | |
| "vd_Ferdinand": ("kyutai/tts-voices", "voice-donations/Ferdinand.wav", "model"), | |
| "vd_FlorDaddy": ("kyutai/tts-voices", "voice-donations/FlorDaddy.wav", "model"), | |
| "vd_Fred_Mara": ("kyutai/tts-voices", "voice-donations/Fred_Mara.wav", "model"), | |
| "vd_Giovanne": ("kyutai/tts-voices", "voice-donations/Giovanne.wav", "model"), | |
| "vd_Glenn": ("kyutai/tts-voices", "voice-donations/Glenn.wav", "model"), | |
| "vd_Goku": ("kyutai/tts-voices", "voice-donations/Goku.wav", "model"), | |
| "vd_Gonzalo": ("kyutai/tts-voices", "voice-donations/Gonzalo.wav", "model"), | |
| "vd_Gonzalo_1": ("kyutai/tts-voices", "voice-donations/Gonzalo-1.wav", "model"), | |
| "vd_Greggy": ("kyutai/tts-voices", "voice-donations/Greggy.wav", "model"), | |
| "vd_Haku": ("kyutai/tts-voices", "voice-donations/Haku.wav", "model"), | |
| "vd_Hannah": ("kyutai/tts-voices", "voice-donations/Hannah.wav", "model"), | |
| "vd_Hardik_Clone": ("kyutai/tts-voices", "voice-donations/Hardik_Clone.wav", "model"), | |
| "vd_Hillbilly_Jim": ("kyutai/tts-voices", "voice-donations/Hillbilly_Jim.wav", "model"), | |
| "vd_Hkl": ("kyutai/tts-voices", "voice-donations/Hkl.wav", "model"), | |
| "vd_Hugo_the_frenchie": ("kyutai/tts-voices", "voice-donations/Hugo_the_frenchie.wav", "model"), | |
| "vd_Ilyass_yea": ("kyutai/tts-voices", "voice-donations/Ilyass_yea.wav", "model"), | |
| "vd_Imran_475": ("kyutai/tts-voices", "voice-donations/Imran_475.wav", "model"), | |
| "vd_Imran_from_I_India": ("kyutai/tts-voices", "voice-donations/Imran_from_I_India.wav", "model"), | |
| "vd_Indian_guy": ("kyutai/tts-voices", "voice-donations/Indian_guy.wav", "model"), | |
| "vd_Ineedthisnow": ("kyutai/tts-voices", "voice-donations/Ineedthisnow.wav", "model"), | |
| "vd_JJis2123": ("kyutai/tts-voices", "voice-donations/JJis2123.wav", "model"), | |
| "vd_JOSHE": ("kyutai/tts-voices", "voice-donations/JOSHE.wav", "model"), | |
| "vd_James": ("kyutai/tts-voices", "voice-donations/James.wav", "model"), | |
| "vd_Jaspino": ("kyutai/tts-voices", "voice-donations/Jaspino.wav", "model"), | |
| "vd_Jaw": ("kyutai/tts-voices", "voice-donations/Jaw.wav", "model"), | |
| "vd_Jay": ("kyutai/tts-voices", "voice-donations/Jay.wav", "model"), | |
| "vd_Jeff_Andrew": ("kyutai/tts-voices", "voice-donations/Jeff_Andrew.wav", "model"), | |
| "vd_Jeffrey": ("kyutai/tts-voices", "voice-donations/Jeffrey.wav", "model"), | |
| "vd_Jeremy_Q": ("kyutai/tts-voices", "voice-donations/Jeremy_Q.wav", "model"), | |
| "vd_Jimmy": ("kyutai/tts-voices", "voice-donations/Jimmy.wav", "model"), | |
| "vd_Joaopedrobil1": ("kyutai/tts-voices", "voice-donations/Joaopedrobil1.wav", "model"), | |
| "vd_John_Triguero": ("kyutai/tts-voices", "voice-donations/John_Triguero.wav", "model"), | |
| "vd_Juanrestrepo177777": ("kyutai/tts-voices", "voice-donations/Juanrestrepo177777.wav", "model"), | |
| "vd_Karti": ("kyutai/tts-voices", "voice-donations/Karti.wav", "model"), | |
| "vd_Kditz": ("kyutai/tts-voices", "voice-donations/Kditz.wav", "model"), | |
| "vd_Koorosh": ("kyutai/tts-voices", "voice-donations/Koorosh.wav", "model"), | |
| "vd_LC": ("kyutai/tts-voices", "voice-donations/LC.wav", "model"), | |
| "vd_L_Roy": ("kyutai/tts-voices", "voice-donations/L_Roy.wav", "model"), | |
| "vd_Lake": ("kyutai/tts-voices", "voice-donations/Lake.wav", "model"), | |
| "vd_Lammy": ("kyutai/tts-voices", "voice-donations/Lammy.wav", "model"), | |
| "vd_Lara": ("kyutai/tts-voices", "voice-donations/Lara.wav", "model"), | |
| "vd_Latin_Accent": ("kyutai/tts-voices", "voice-donations/Latin_Accent.wav", "model"), | |
| "vd_Liquescent": ("kyutai/tts-voices", "voice-donations/Liquescent.wav", "model"), | |
| "vd_Louis": ("kyutai/tts-voices", "voice-donations/Louis.wav", "model"), | |
| "vd_Lucas": ("kyutai/tts-voices", "voice-donations/Lucas.wav", "model"), | |
| "vd_MJDePedro": ("kyutai/tts-voices", "voice-donations/MJDePedro.wav", "model"), | |
| "vd_Maisako": ("kyutai/tts-voices", "voice-donations/Maisako.wav", "model"), | |
| "vd_Manahen": ("kyutai/tts-voices", "voice-donations/Manahen.wav", "model"), | |
| "vd_Marshal_Indian": ("kyutai/tts-voices", "voice-donations/Marshal_Indian.wav", "model"), | |
| "vd_Midlands_Bedfordshire_Dialect": ("kyutai/tts-voices", "voice-donations/Midlands_Bedfordshire_Dialect.wav", "model"), | |
| "vd_Moses": ("kyutai/tts-voices", "voice-donations/Moses.wav", "model"), | |
| "vd_MrHat": ("kyutai/tts-voices", "voice-donations/MrHat.wav", "model"), | |
| "vd_Mr_captain": ("kyutai/tts-voices", "voice-donations/Mr_captain.wav", "model"), | |
| "vd_Muhtasims_Voice": ("kyutai/tts-voices", "voice-donations/Muhtasim's_Voice.wav", "model"), | |
| "vd_Mystery_Sir": ("kyutai/tts-voices", "voice-donations/Mystery_Sir.wav", "model"), | |
| "vd_Narrum": ("kyutai/tts-voices", "voice-donations/Narrum.wav", "model"), | |
| "vd_Nick": ("kyutai/tts-voices", "voice-donations/Nick.wav", "model"), | |
| "vd_P0LFR": ("kyutai/tts-voices", "voice-donations/P0LFR.wav", "model"), | |
| "vd_Pai_ve": ("kyutai/tts-voices", "voice-donations/Pai_ve.wav", "model"), | |
| "vd_Parthiban": ("kyutai/tts-voices", "voice-donations/Parthiban.wav", "model"), | |
| "vd_Prakash369": ("kyutai/tts-voices", "voice-donations/Prakash369.wav", "model"), | |
| "vd_Puzzle": ("kyutai/tts-voices", "voice-donations/Puzzle.wav", "model"), | |
| "vd_Qasim_Wali_Khan": ("kyutai/tts-voices", "voice-donations/Qasim_Wali_Khan.wav", "model"), | |
| "vd_RAJ": ("kyutai/tts-voices", "voice-donations/RAJ.wav", "model"), | |
| "vd_Rafaelpazv": ("kyutai/tts-voices", "voice-donations/Rafaelpazv.wav", "model"), | |
| "vd_Rahul": ("kyutai/tts-voices", "voice-donations/Rahul.wav", "model"), | |
| "vd_Raj25": ("kyutai/tts-voices", "voice-donations/Raj25.wav", "model"), | |
| "vd_Ramu": ("kyutai/tts-voices", "voice-donations/Ramu.wav", "model"), | |
| "vd_Ranjith": ("kyutai/tts-voices", "voice-donations/Ranjith.wav", "model"), | |
| "vd_Richard_cuban": ("kyutai/tts-voices", "voice-donations/Richard_cuban.wav", "model"), | |
| "vd_Rony": ("kyutai/tts-voices", "voice-donations/Rony.wav", "model"), | |
| "vd_Roscoe": ("kyutai/tts-voices", "voice-donations/Roscoe.wav", "model"), | |
| "vd_Rs": ("kyutai/tts-voices", "voice-donations/Rs.wav", "model"), | |
| "vd_Rup": ("kyutai/tts-voices", "voice-donations/Rup.wav", "model"), | |
| "vd_SSA150803": ("kyutai/tts-voices", "voice-donations/SSA150803.wav", "model"), | |
| "vd_SS_1684": ("kyutai/tts-voices", "voice-donations/SS_1684.wav", "model"), | |
| "vd_STONE": ("kyutai/tts-voices", "voice-donations/STONE.wav", "model"), | |
| "vd_Samsewak": ("kyutai/tts-voices", "voice-donations/Samsewak.wav", "model"), | |
| "vd_Selfie": ("kyutai/tts-voices", "voice-donations/Selfie.wav", "model"), | |
| "vd_Sheddy": ("kyutai/tts-voices", "voice-donations/Sheddy.wav", "model"), | |
| "vd_Siddh_Indian": ("kyutai/tts-voices", "voice-donations/Siddh_Indian.wav", "model"), | |
| "vd_Sir_TJ": ("kyutai/tts-voices", "voice-donations/Sir_TJ.wav", "model"), | |
| "vd_Sirajo_x": ("kyutai/tts-voices", "voice-donations/Sirajo_x.wav", "model"), | |
| "vd_Sp46": ("kyutai/tts-voices", "voice-donations/Sp46.wav", "model"), | |
| "vd_Sr_Erick": ("kyutai/tts-voices", "voice-donations/Sr_Erick.wav", "model"), | |
| "vd_Standollars": ("kyutai/tts-voices", "voice-donations/Standollars.wav", "model"), | |
| "vd_TESLLA": ("kyutai/tts-voices", "voice-donations/TESLLA.wav", "model"), | |
| "vd_Tahii": ("kyutai/tts-voices", "voice-donations/Tahii.wav", "model"), | |
| "vd_TheFin": ("kyutai/tts-voices", "voice-donations/TheFin.wav", "model"), | |
| "vd_The_Sustainabler": ("kyutai/tts-voices", "voice-donations/The_Sustainabler.wav", "model"), | |
| "vd_The_other_brother": ("kyutai/tts-voices", "voice-donations/The_other_brother.wav", "model"), | |
| "vd_Titorium": ("kyutai/tts-voices", "voice-donations/Titorium.wav", "model"), | |
| "vd_Tonmoy": ("kyutai/tts-voices", "voice-donations/Tonmoy.wav", "model"), | |
| "vd_Umair": ("kyutai/tts-voices", "voice-donations/Umair.wav", "model"), | |
| "vd_Vexat": ("kyutai/tts-voices", "voice-donations/Vexat.wav", "model"), | |
| "vd_Victor_Garcia": ("kyutai/tts-voices", "voice-donations/Victor_Garcia.wav", "model"), | |
| "vd_Vinith___English_India": ("kyutai/tts-voices", "voice-donations/Vinith___English_India.wav", "model"), | |
| "vd_Vitch": ("kyutai/tts-voices", "voice-donations/Vitch.wav", "model"), | |
| "vd_Vivaldi": ("kyutai/tts-voices", "voice-donations/Vivaldi.wav", "model"), | |
| "vd_W_A_H": ("kyutai/tts-voices", "voice-donations/W_A_H.wav", "model"), | |
| "vd_Wealthiest": ("kyutai/tts-voices", "voice-donations/Wealthiest.wav", "model"), | |
| "vd_WhisperInEar": ("kyutai/tts-voices", "voice-donations/WhisperInEar.wav", "model"), | |
| "vd_Yesid": ("kyutai/tts-voices", "voice-donations/Yesid.wav", "model"), | |
| "vd_Youfied": ("kyutai/tts-voices", "voice-donations/Youfied.wav", "model"), | |
| "vd_Yuush": ("kyutai/tts-voices", "voice-donations/Yuush.wav", "model"), | |
| "vd_a59a": ("kyutai/tts-voices", "voice-donations/a59a.wav", "model"), | |
| "vd_a6f9": ("kyutai/tts-voices", "voice-donations/a6f9.wav", "model"), | |
| "vd_a96a": ("kyutai/tts-voices", "voice-donations/a96a.wav", "model"), | |
| "vd_aepeak": ("kyutai/tts-voices", "voice-donations/aepeak.wav", "model"), | |
| "vd_albertoforofo007": ("kyutai/tts-voices", "voice-donations/albertoforofo007.wav", "model"), | |
| "vd_amazon_box": ("kyutai/tts-voices", "voice-donations/amazon_box.wav", "model"), | |
| "vd_awais_shah": ("kyutai/tts-voices", "voice-donations/awais_shah.wav", "model"), | |
| "vd_bathri": ("kyutai/tts-voices", "voice-donations/bathri.wav", "model"), | |
| "vd_bbe4": ("kyutai/tts-voices", "voice-donations/bbe4.wav", "model"), | |
| "vd_bc98": ("kyutai/tts-voices", "voice-donations/bc98.wav", "model"), | |
| "vd_bevi": ("kyutai/tts-voices", "voice-donations/bevi.wav", "model"), | |
| "vd_boom": ("kyutai/tts-voices", "voice-donations/boom.wav", "model"), | |
| "vd_c0a0": ("kyutai/tts-voices", "voice-donations/c0a0.wav", "model"), | |
| "vd_cybina": ("kyutai/tts-voices", "voice-donations/cybina.wav", "model"), | |
| "vd_d4a9": ("kyutai/tts-voices", "voice-donations/d4a9.wav", "model"), | |
| "vd_dce6": ("kyutai/tts-voices", "voice-donations/dce6.wav", "model"), | |
| "vd_dwp": ("kyutai/tts-voices", "voice-donations/dwp.wav", "model"), | |
| "vd_e819": ("kyutai/tts-voices", "voice-donations/e819.wav", "model"), | |
| "vd_edd4": ("kyutai/tts-voices", "voice-donations/edd4.wav", "model"), | |
| "vd_efeb": ("kyutai/tts-voices", "voice-donations/efeb.wav", "model"), | |
| "vd_english_with_german_accent": ("kyutai/tts-voices", "voice-donations/english_with_german_accent.wav", "model"), | |
| "vd_erihppas": ("kyutai/tts-voices", "voice-donations/erihppas.wav", "model"), | |
| "vd_f179": ("kyutai/tts-voices", "voice-donations/f179.wav", "model"), | |
| "vd_f9cf": ("kyutai/tts-voices", "voice-donations/f9cf.wav", "model"), | |
| "vd_fa52": ("kyutai/tts-voices", "voice-donations/fa52.wav", "model"), | |
| "vd_fc96": ("kyutai/tts-voices", "voice-donations/fc96.wav", "model"), | |
| "vd_floyd2026": ("kyutai/tts-voices", "voice-donations/floyd2026.wav", "model"), | |
| "vd_gmaskell92": ("kyutai/tts-voices", "voice-donations/gmaskell92.wav", "model"), | |
| "vd_gyroo": ("kyutai/tts-voices", "voice-donations/gyroo.wav", "model"), | |
| "vd_hielos_1": ("kyutai/tts-voices", "voice-donations/hielos_1.wav", "model"), | |
| "vd_hielos_2": ("kyutai/tts-voices", "voice-donations/hielos_2.wav", "model"), | |
| "vd_injul": ("kyutai/tts-voices", "voice-donations/injul.wav", "model"), | |
| "vd_kbrn1": ("kyutai/tts-voices", "voice-donations/kbrn1.wav", "model"), | |
| "vd_oldNerd": ("kyutai/tts-voices", "voice-donations/oldNerd.wav", "model"), | |
| "vd_oldNerd2": ("kyutai/tts-voices", "voice-donations/oldNerd2.wav", "model"), | |
| "vd_oldNerd3": ("kyutai/tts-voices", "voice-donations/oldNerd3.wav", "model"), | |
| "vd_ra_XOr": ("kyutai/tts-voices", "voice-donations/ra_XOr.wav", "model"), | |
| "vd_rewi": ("kyutai/tts-voices", "voice-donations/rewi.wav", "model"), | |
| "vd_robert": ("kyutai/tts-voices", "voice-donations/robert.wav", "model"), | |
| "vd_rshah_1_0": ("kyutai/tts-voices", "voice-donations/rshah_1_0.wav", "model"), | |
| "vd_sanjay": ("kyutai/tts-voices", "voice-donations/sanjay.wav", "model"), | |
| "vd_siddharth_khanna": ("kyutai/tts-voices", "voice-donations/siddharth_khanna.wav", "model"), | |
| "vd_solace": ("kyutai/tts-voices", "voice-donations/solace.wav", "model"), | |
| "vd_spanish_limaperu": ("kyutai/tts-voices", "voice-donations/spanish-limaperu.wav", "model"), | |
| "vd_stein": ("kyutai/tts-voices", "voice-donations/stein.wav", "model"), | |
| "vd_sujan_daikoawaj": ("kyutai/tts-voices", "voice-donations/sujan_daikoawaj.wav", "model"), | |
| "vd_surazy": ("kyutai/tts-voices", "voice-donations/surazy.wav", "model"), | |
| "vd_taiyo": ("kyutai/tts-voices", "voice-donations/taiyo.wav", "model"), | |
| "vd_temp_007": ("kyutai/tts-voices", "voice-donations/temp-007.wav", "model"), | |
| "vd_thepolishdane": ("kyutai/tts-voices", "voice-donations/thepolishdane.wav", "model"), | |
| "vd_utk": ("kyutai/tts-voices", "voice-donations/utk.wav", "model"), | |
| "vd_vinayak": ("kyutai/tts-voices", "voice-donations/vinayak.wav", "model"), | |
| "vd_virtu": ("kyutai/tts-voices", "voice-donations/virtu.wav", "model"), | |
| "vd_willbas": ("kyutai/tts-voices", "voice-donations/willbas.wav", "model"), | |
| "vd_yaemdluffy": ("kyutai/tts-voices", "voice-donations/yaemdluffy.wav", "model"), | |
| "vd_zerocool": ("kyutai/tts-voices", "voice-donations/zerocool.wav", "model"), | |
| "zero_bill_boerst": ("kyutai/tts-voices", "voice-zero/bill_boerst.wav", "model"), | |
| "zero_caro_davy": ("kyutai/tts-voices", "voice-zero/caro_davy.wav", "model"), | |
| "zero_peter_yearsley": ("kyutai/tts-voices", "voice-zero/peter_yearsley.wav", "model"), | |
| "zero_stuart_bell": ("kyutai/tts-voices", "voice-zero/stuart_bell.wav", "model"), | |
| } | |
# Merge the additional voice table into the main registry, then snapshot the
# alphabetically sorted voice names that the API reports and validates against.
VOICE_SOURCES.update(_KYUTAI_VOICES)
BUILTIN_VOICES = sorted(VOICE_SOURCES)
def _init_model():
    """Load the Pocket TTS model into ``_state`` the first time it is needed.

    Subsequent calls are no-ops once ``_state["initialized"]`` is true.

    Raises:
        RuntimeError: if ``TTSModel`` is ``None`` because the optional
            ``pocket_tts`` import at module load failed.
    """
    if not _state["initialized"]:
        if TTSModel is None:
            raise RuntimeError("pocket-tts not installed")

        print("Initializing Pocket TTS model (english_2026-04 with voice cloning)...")
        model = TTSModel.load_model(language="english_2026-04")

        # Publish everything in one step; fall back to 24 kHz when the model
        # object does not expose a ``sample_rate`` attribute.
        _state.update(
            model=model,
            sample_rate=getattr(model, "sample_rate", 24000),
            initialized=True,
        )
        print(f"Pocket TTS initialized. Sample rate: {_state['sample_rate']} Hz, voice_cloning: {model.has_voice_cloning}, voices: {len(BUILTIN_VOICES)}")
def _get_voice_state(voice: str):
    """Return the model state for ``voice``, downloading and caching it on first use.

    The reference audio is fetched from the Hugging Face Hub location listed
    in ``VOICE_SOURCES``, converted with ``model.get_state_for_audio_prompt``
    and stored in ``_state["voice_cache"]`` so later requests reuse it.

    Args:
        voice: Key of ``VOICE_SOURCES`` identifying the voice.

    Returns:
        The (detached) voice state object produced by the model.

    Raises:
        ValueError: if ``voice`` is unknown or its audio cannot be downloaded.
            The download error is attached as ``__cause__``.
    """
    model = _state["model"]
    cache = _state["voice_cache"]
    if voice in cache:
        return cache[voice]
    if voice not in VOICE_SOURCES:
        raise ValueError(f"Voice '{voice}' not found. Available: {BUILTIN_VOICES}")

    source = VOICE_SOURCES[voice]
    repo_id = source[0]
    voice_path_hf = source[1]
    # Two-element entries are treated as Space repositories.
    repo_type = source[2] if len(source) > 2 else "space"

    from huggingface_hub import hf_hub_download

    try:
        voice_path = hf_hub_download(
            repo_id,
            voice_path_hf,
            repo_type=repo_type,
            token=HF_TOKEN or None,
        )
    except Exception as e:
        # Chain the original error so the traceback shows the real cause.
        raise ValueError(f"Failed to download voice '{voice}': {e}") from e
    print(f"Downloaded voice '{voice}' from {repo_id} ({repo_type})")

    voice_state = model.get_state_for_audio_prompt(voice_path)

    def detach_all(obj):
        """Recursively detach and clone tensors found in (nested) dicts."""
        if isinstance(obj, torch.Tensor):
            return obj.detach().clone()
        if isinstance(obj, dict):
            return {k: detach_all(v) for k, v in obj.items()}
        return obj

    voice_state = detach_all(voice_state)
    cache[voice] = voice_state
    print(f"Voice state loaded for '{voice}'")
    return voice_state
def _generate_audio(text: str, voice: str, temperature: float = 0.7) -> tuple:
    """Synthesize ``text`` with ``voice`` and return ``(int16_samples, sample_rate)``.

    The generated waveform is peak-normalized to 0.95 of full scale and then
    converted to 16-bit integers.

    Args:
        text: Text to synthesize.
        voice: Key of ``VOICE_SOURCES`` selecting the voice.
        temperature: Accepted for interface compatibility.
            NOTE(review): this value is not forwarded to the model call
            below, so it currently has no effect - confirm how pocket_tts
            expects the sampling temperature to be configured.

    Returns:
        Tuple of a NumPy ``int16`` array and the sample rate in Hz.

    Raises:
        RuntimeError: propagated from ``_init_model`` when pocket-tts is missing.
        ValueError: propagated from ``_get_voice_state`` for unknown voices or
            failed downloads.
    """
    _init_model()
    model = _state["model"]
    sample_rate = _state["sample_rate"]
    voice_state = _get_voice_state(voice)

    audio = model.generate_audio(
        voice_state,
        text,
        frames_after_eos=2,
        copy_state=True,
    )
    audio_np = audio.cpu().numpy() if hasattr(audio, "cpu") else audio

    # np.max raises ValueError on an empty array, which callers would report
    # as a client error; skip normalization when no samples were produced.
    if np.size(audio_np) > 0:
        max_val = np.max(np.abs(audio_np))
        if max_val > 0:
            audio_np = audio_np / max_val * 0.95

    audio_int16 = np.clip(audio_np * 32767, -32767, 32767).astype(np.int16)
    return audio_int16, sample_rate
def _wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    """Wrap 16-bit samples in an in-memory mono WAV container.

    Args:
        audio_int16: Samples whose raw bytes are written as the frame data.
        sample_rate: Frame rate stored in the WAV header, in Hz.

    Returns:
        The complete WAV file as bytes.
    """
    sink = io.BytesIO()
    writer = wave.open(sink, "wb")
    with writer:
        # (channels, sample width in bytes, frame rate, frame count,
        #  compression type, compression name); the frame count is patched
        # by the wave module once the frames have been written.
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(audio_int16.tobytes())
    return sink.getvalue()
def _ogg_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    """Encode samples as Ogg/Opus by piping a WAV through the ``ffmpeg`` CLI.

    The audio is encoded with libopus at 64 kbit/s, resampled to 48 kHz, mono.

    Args:
        audio_int16: 16-bit samples to encode.
        sample_rate: Sample rate of ``audio_int16`` in Hz.

    Returns:
        The encoded Ogg stream as bytes.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
        subprocess.TimeoutExpired: if ffmpeg does not finish within 30 seconds.
    """
    wav_data = _wav_bytes(audio_int16, sample_rate)
    proc = subprocess.run(
        # -hide_banner / -loglevel error keep stderr limited to real errors,
        # so the truncated message below is not just the version banner.
        ["ffmpeg", "-hide_banner", "-loglevel", "error",
         "-y", "-f", "wav", "-i", "pipe:0",
         "-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1",
         "-f", "ogg", "pipe:1"],
        input=wav_data,
        capture_output=True,
        timeout=30,
    )
    if proc.returncode != 0:
        # Decode leniently: undecodable bytes in stderr must not replace the
        # real failure with a UnicodeDecodeError.
        detail = proc.stderr.decode(errors="replace").strip()
        raise RuntimeError(f"ffmpeg failed: {detail[:200]}")
    return proc.stdout
async def tts_post(req: TTSRequest):
    """POST endpoint — send full text in request body (no URL length limits).

    Args:
        req: Request body carrying ``text``, ``voice``, ``temperature`` and
            ``format``.

    Returns:
        An ``audio/ogg`` response when ``req.format`` is ``"ogg"``; any other
        value yields an ``audio/wav`` response.

    Raises:
        HTTPException: 400 when generation raises ``ValueError`` (for example
            an unknown voice), 500 for any other generation or encoding error.
    """
    try:
        audio_int16, sample_rate = _generate_audio(req.text, req.voice, req.temperature)
    except ValueError as e:
        raise HTTPException(400, str(e)) from e
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(500, str(e)[:300]) from e

    if req.format == "ogg":
        # Only the encoding step is guarded; building the response cannot
        # meaningfully fail as an "OGG encoding" error.
        try:
            data = _ogg_bytes(audio_int16, sample_rate)
        except Exception as e:
            # Log the traceback here too, matching the generation error path.
            traceback.print_exc()
            raise HTTPException(500, f"OGG encoding failed: {str(e)[:200]}") from e
        return Response(content=data, media_type="audio/ogg",
                        headers={"Content-Disposition": "attachment; filename=tts.ogg"})

    data = _wav_bytes(audio_int16, sample_rate)
    return Response(content=data, media_type="audio/wav",
                    headers={"Content-Disposition": "attachment; filename=tts.wav"})
async def tts_get(
    text: str = Query(..., description="Text to synthesize"),
    voice: str = Query("af_alloy", description="Voice name"),
    temperature: float = Query(0.7, ge=0.1, le=1.5),
    format: str = Query("ogg", description="Output format: wav or ogg"),
):
    """Synthesize speech from query-string parameters.

    Returns Ogg/Opus audio when ``format`` is ``"ogg"`` (compared
    case-insensitively) and WAV audio for any other value. The parameter
    name ``format`` shadows the builtin but is kept because it is the public
    query key.

    Raises:
        HTTPException: 400 when generation raises ``ValueError``; 500 for any
            other generation failure or when OGG encoding fails.
    """
    try:
        audio_int16, sample_rate = _generate_audio(text, voice, temperature)
    except ValueError as e:
        raise HTTPException(400, str(e)) from e
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(500, str(e)[:300]) from e

    if format.lower() == "ogg":
        try:
            data = _ogg_bytes(audio_int16, sample_rate)
        except Exception as e:
            # Log encoder failures server-side as well; the client only sees
            # a truncated message.
            traceback.print_exc()
            raise HTTPException(500, f"OGG encoding failed: {str(e)[:200]}") from e
        return Response(content=data, media_type="audio/ogg",
                        headers={"Content-Disposition": "attachment; filename=tts.ogg"})

    # Any format other than "ogg" falls back to WAV.
    data = _wav_bytes(audio_int16, sample_rate)
    return Response(content=data, media_type="audio/wav",
                    headers={"Content-Disposition": "attachment; filename=tts.wav"})
async def voices():
    """Return the ``BUILTIN_VOICES`` collection together with its length."""
    available = BUILTIN_VOICES
    return dict(voices=available, count=len(available))
async def health():
    """Report an ``ok`` status plus the ``initialized`` flag from ``_state``."""
    payload = {"status": "ok"}
    payload["initialized"] = _state["initialized"]
    return payload
async def index():
    """Return the HTML landing page that lists endpoints and voice families.

    The markup is kept pure ASCII: dashes and the heading icon are written as
    HTML entities, and ``&`` inside the example URL is escaped, so the page
    renders the same regardless of how the source file or response is decoded.
    """
    # NOTE(review): the heading icon was unreadable (mis-encoded) in the
    # source; a speaker emoji entity is used as a stand-in — confirm the
    # intended glyph.
    return """
<html><body style="font-family:monospace;max-width:700px;margin:40px auto">
<h1>&#128266; Pocket-TTS API</h1>
<p>FastAPI server running <a href="https://huggingface.co/kyutai/pocket-tts">kyutai/pocket-tts</a></p>
<h3>Endpoints</h3>
<ul>
<li><code>GET /tts?text=Hello&amp;voice=af_alloy&amp;format=ogg</code></li>
<li><code>GET /voices</code></li>
<li><code>GET /health</code></li>
</ul>
<h3>Voice Categories</h3>
<ul>
<li><b>af_*/am_*/bf_*/bm_*/ef_*/em_*/ff_*/hf_*/hm_*/if_*/im_*/jf_*/jm_*/pf_*/pm_*/zf_*/zm_*</b> &mdash; Standard multilingual</li>
<li><b>f01-f10/m01-m10</b> &mdash; Character voices</li>
<li><b>alba_*</b> &mdash; Alba Mackenna characters</li>
<li><b>ex_*</b> &mdash; Expressive/emotional (angry, happy, whisper, sarcastic, etc.)</li>
<li><b>ears_*</b> &mdash; EARS speakers + emotional variants</li>
<li><b>vd_*</b> &mdash; Community voice donations (Goku, ClassicWizard, etc.)</li>
<li><b>vctk_*</b> &mdash; VCTK dataset speakers</li>
<li><b>cml_*</b> &mdash; CML-TTS French speakers</li>
<li><b>zero_*</b> &mdash; Voice-zero characters</li>
<li><b>unmute_*</b> &mdash; Unmute voices</li>
</ul>
</body></html>
"""