# File size: 11,535 Bytes | 64801d5
import os
import time
import asyncio
from pathlib import Path
from typing import Optional
import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import logging
import io
# Configure logging once at import time; everything in this module logs
# through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ── App Setup ──────────────────────────────────────────────────────────────────
app = FastAPI(
    title="Kokoro TTS API",
    description="Text-to-Speech API powered by Kokoro-82M",
    version="1.0.0",
)
# NOTE(review): wildcard CORS accepts any origin, method and header —
# confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── Directories ────────────────────────────────────────────────────────────────
# Relative to the process working directory. The directory is created before
# mounting so the static mount below has something to serve from.
STATIC_DIR = Path("static")
STATIC_DIR.mkdir(exist_ok=True)
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
# ── Model Loading ──────────────────────────────────────────────────────────────
# Device is chosen once at import time: CUDA when torch reports it, else CPU.
CUDA_AVAILABLE = torch.cuda.is_available()
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info(f"Device: {device}")
# Both globals are filled in by load_model(): `model` stays None until it is
# assigned there, and `pipelines` maps a language code to its pipeline.
model = None
pipelines = {}
def load_model():
    """Load the Kokoro model and one pipeline per supported language code.

    Populates the module-level ``model`` and ``pipelines`` globals. Nothing
    is raised to the caller: a pipeline that fails to construct is skipped
    with a warning, and any other failure is logged with its traceback,
    leaving whatever was already assigned in place.
    """
    global model, pipelines
    try:
        # Imported inside the try so a missing or broken kokoro install is
        # logged here instead of propagating to the caller.
        from kokoro import KModel, KPipeline

        logger.info("Loading Kokoro model...")
        model = KModel(repo_id="hexgrad/Kokoro-82M").to(device).eval()

        for code in ("a", "b", "e", "f", "h", "i", "j", "p", "z"):
            try:
                # Pipelines are built without their own model; synthesis
                # goes through the single shared `model` loaded above.
                pipelines[code] = KPipeline(lang_code=code, model=False)
            except Exception as e:
                # Best effort: one unavailable language must not block the rest.
                logger.warning(f"Pipeline '{code}' failed: {e}")

        # Override the lexicon entry for the word "kokoro" in the two
        # English pipelines (codes "a" and "b").
        if "a" in pipelines:
            pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
        if "b" in pipelines:
            pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

        logger.info(f"Model loaded. Pipelines: {list(pipelines.keys())}")
    except Exception as e:
        # logger.exception keeps the same message but also records the
        # traceback, which the plain error() call discarded.
        logger.exception(f"Model load failed: {e}")
# Load on startup
@app.on_event("startup")
async def startup_event():
    """Load the model when the application starts.

    ``load_model`` is a synchronous function, so it is run in the loop's
    default executor and awaited; startup finishes once it returns.
    """
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine; it never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, load_model)
# ── Voice Registry ─────────────────────────────────────────────────────────────
# Voices grouped by pipeline language code. Each group holds the language tag
# and flag shared by its voices, followed by the voice ids in registry order.
_VOICE_GROUPS = {
    # American English
    "a": ("en-US", "🇺🇸", (
        "af_heart", "af_bella", "af_nicole", "af_aoede", "af_kore",
        "af_sarah", "af_nova", "af_sky", "af_river",
        "am_michael", "am_fenrir", "am_puck", "am_echo", "am_eric",
        "am_liam", "am_adam",
    )),
    # British English
    "b": ("en-GB", "🇬🇧", (
        "bf_emma", "bf_isabella", "bf_alice", "bf_lily",
        "bm_george", "bm_fable", "bm_lewis", "bm_daniel",
    )),
    # Spanish
    "e": ("es", "🇪🇸", ("ef_dora", "em_alex")),
    # French
    "f": ("fr", "🇫🇷", ("ff_siwis",)),
    # Hindi
    "h": ("hi", "🇮🇳", ("hf_alpha", "hf_beta", "hm_omega", "hm_psi")),
    # Italian
    "i": ("it", "🇮🇹", ("if_sara", "im_nicola")),
    # Japanese
    "j": ("ja", "🇯🇵", ("jf_alpha", "jf_gongitsune", "jf_nezumi", "jm_kumo")),
    # Portuguese
    "p": ("pt", "🇧🇷", ("pf_dora", "pm_alex")),
    # Chinese
    "z": ("zh", "🇨🇳", ("zf_xiaobei", "zf_xiaoxiao", "zm_yunjian", "zm_yunxi")),
}

# Second character of a voice id marks the voice's gender.
_GENDER_BY_MARKER = {"f": "female", "m": "male"}

# Registry of every known voice id -> display metadata. The label is the part
# of the id after the underscore, capitalized.
VOICES = {
    voice_id: {
        "label": voice_id.split("_", 1)[1].capitalize(),
        "lang": lang,
        "gender": _GENDER_BY_MARKER[voice_id[1]],
        "flag": flag,
        "code": code,
    }
    for code, (lang, flag, voice_ids) in _VOICE_GROUPS.items()
    for voice_id in voice_ids
}
# ── Pydantic Models ────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    """Request body for ``POST /tts``."""

    # Text to synthesize; the route rejects blank text.
    text: str
    # Voice id; the route requires it to be a key of VOICES.
    voice: str = "af_heart"
    # Speed factor; the route accepts values from 0.5 to 2.0 inclusive.
    speed: float = 1.0
    # Requested audio container: "wav" or "mp3".
    output_format: str = "wav"
# ── Helper ─────────────────────────────────────────────────────────────────────
def _synthesize_to_bytes(text: str, voice: str, speed: float, output_format: str) -> tuple:
    """Synthesize ``text`` and return ``(audio_bytes, duration_seconds)``.

    Args:
        text: Input text; it is split into segments on runs of newlines.
        voice: Voice id; must be a key of ``VOICES``.
        speed: Speed factor forwarded to both the pipeline and the model.
        output_format: ``"wav"`` or ``"mp3"`` (case-insensitive). The audio
            is encoded in this container.

    Returns:
        A 2-tuple of the encoded audio as ``bytes`` and the audio duration
        in seconds, computed from the sample count at 24 kHz.

    Raises:
        RuntimeError: If the model is not loaded, if the installed
            libsndfile cannot encode the requested format, or if no audio
            was generated.
        ValueError: If the voice is unknown, its language pipeline is not
            loaded, or ``output_format`` is not supported.
    """
    sample_rate = 24000
    # API spelling -> soundfile container name.
    container_names = {"wav": "WAV", "mp3": "MP3"}

    if model is None:
        raise RuntimeError("Model not loaded yet")
    voice_info = VOICES.get(voice)
    if not voice_info:
        raise ValueError(f"Unknown voice: {voice}")
    pipeline = pipelines.get(voice_info["code"])
    if not pipeline:
        raise ValueError(f"No pipeline for lang code: {voice_info['code']}")

    container = container_names.get(output_format.lower())
    if container is None:
        raise ValueError(f"Unsupported output_format: {output_format}")
    # Check encoder support before synthesizing so an unsupported format
    # fails fast instead of after the expensive model run.
    if container not in sf.available_formats():
        raise RuntimeError(
            f"{container} encoding is not supported by the installed libsndfile"
        )

    voice_pack = pipeline.load_voice(voice)
    all_audio = []
    for _, ps, _ in pipeline(text, voice, speed, split_pattern=r"\n+"):
        # The voice pack is indexed by phoneme-sequence length.
        ref_s = voice_pack[len(ps) - 1].to(device)
        all_audio.append(model(ps, ref_s, speed).cpu().numpy())
    if not all_audio:
        raise RuntimeError("No audio generated")

    final_audio = np.concatenate(all_audio)
    duration = len(final_audio) / sample_rate
    buf = io.BytesIO()
    # Encode in the container the caller asked for, so the bytes match the
    # media type and file extension the response advertises.
    sf.write(buf, final_audio, sample_rate, format=container)
    return buf.getvalue(), duration
# ── Routes ─────────────────────────────────────────────────────────────────────
@app.get("/")
async def root():
    """Serve ``index.html`` from the static directory.

    Raises:
        HTTPException: 404 when ``index.html`` is not present. The static
            directory is created empty on first run, so the page may be
            missing.
    """
    # Build the path from STATIC_DIR so it stays in sync with the /static mount.
    index_path = STATIC_DIR / "index.html"
    if not index_path.is_file():
        raise HTTPException(404, "index.html not found in static directory")
    return FileResponse(str(index_path))
@app.get("/health")
async def health():
    """Report service status: model readiness, device and loaded pipelines."""
    model_ready = model is not None
    loaded_codes = list(pipelines.keys())
    return dict(
        status="ok",
        model_loaded=model_ready,
        device=device,
        cuda=CUDA_AVAILABLE,
        pipelines=loaded_codes,
    )
@app.get("/voices")
async def list_voices():
    """List the voices whose language pipeline is currently loaded.

    The response carries the matching registry entries under ``"voices"``,
    the same entries bucketed by language tag under ``"grouped"`` (each with
    its voice id added as ``"id"``), and their count under ``"total"``.
    """
    usable = {}
    by_language = {}
    for voice_id, meta in VOICES.items():
        if meta["code"] not in pipelines:
            continue
        usable[voice_id] = meta
        by_language.setdefault(meta["lang"], []).append({"id": voice_id, **meta})
    return {"voices": usable, "grouped": by_language, "total": len(usable)}
@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """Synthesize speech for the request and return it as an audio download.

    The request is validated, synthesis runs in the loop's default executor
    so the event loop is not blocked, and the encoded audio is returned with
    its duration in the ``X-Duration-Seconds`` header.

    Raises:
        HTTPException: 400 for blank text, an unknown voice, a speed outside
            0.5–2.0, or an unsupported output format; 503 while the model is
            not loaded or when the voice's language pipeline is not loaded;
            500 when synthesis itself fails.
    """
    if not request.text.strip():
        raise HTTPException(400, "text cannot be empty")
    if request.voice not in VOICES:
        raise HTTPException(400, "Unknown voice. GET /voices for list.")
    if not 0.5 <= request.speed <= 2.0:
        raise HTTPException(400, "speed must be between 0.5 and 2.0")
    if request.output_format not in ("wav", "mp3"):
        raise HTTPException(400, "output_format must be wav or mp3")
    if model is None:
        raise HTTPException(503, "Model is still loading, please retry in a moment")
    # A registered voice can still be unusable if its pipeline failed to
    # load; report that explicitly instead of a generic 500 from synthesis.
    if VOICES[request.voice]["code"] not in pipelines:
        raise HTTPException(
            503, f"Voice '{request.voice}' is currently unavailable"
        )

    loop = asyncio.get_running_loop()
    try:
        audio_bytes, duration = await loop.run_in_executor(
            None,
            lambda: _synthesize_to_bytes(
                request.text, request.voice, request.speed, request.output_format
            ),
        )
    except Exception as e:
        # Log with traceback; the client still receives the error text.
        logger.exception(f"TTS error: {e}")
        raise HTTPException(500, str(e)) from e

    fmt = request.output_format
    return StreamingResponse(
        io.BytesIO(audio_bytes),
        media_type="audio/mpeg" if fmt == "mp3" else "audio/wav",
        headers={
            "Content-Disposition": f'attachment; filename="kokoro_{request.voice}.{fmt}"',
            "X-Duration-Seconds": str(round(duration, 2)),
        },
    )
|