#!/usr/bin/env python3 """ Local Vocence wrapper for testing miner.py without Chutes. Run: python vocence_local_wrapper.py Then call: GET http://127.0.0.1:8000/health POST http://127.0.0.1:8000/speak """ from __future__ import annotations import io import wave from pathlib import Path from typing import Optional import numpy as np import uvicorn from fastapi import FastAPI, HTTPException, status from fastapi.responses import Response from pydantic import BaseModel, Field from yaml import safe_load from miner import Miner VOCENCE_MAX_AUDIO_SECONDS = 30 VOCENCE_MAX_TEXT_LEN = 2000 VOCENCE_MAX_INSTRUCTION_LEN = 600 class VocenceSpeakRequest(BaseModel): instruction: str = Field(..., min_length=1, max_length=VOCENCE_MAX_INSTRUCTION_LEN) text: str = Field(..., min_length=1, max_length=VOCENCE_MAX_TEXT_LEN) class VocenceHealthResponse(BaseModel): status: str model_loaded: bool sample_rate: Optional[int] = None adapter: Optional[str] = None repo_path: str def waveform_to_wav_bytes(waveform: np.ndarray, sample_rate: int) -> bytes: if waveform.ndim != 1: raise ValueError("waveform must be 1D mono") if waveform.dtype != np.int16: wf = np.asarray(waveform, dtype=np.float32) wf = np.clip(wf, -1.0, 1.0) wf = (wf * 32767.0).astype(np.int16) else: wf = waveform buf = io.BytesIO() with wave.open(buf, "wb") as wav: wav.setnchannels(1) wav.setsampwidth(2) wav.setframerate(sample_rate) wav.writeframes(wf.tobytes()) return buf.getvalue() repo_path = Path(__file__).resolve().parent app = FastAPI(title="Vocence Local Wrapper", version="0.1.0") @app.on_event("startup") async def startup_event() -> None: app.state.status = "unknown" app.state.sample_rate = None app.state.adapter = None app.state.tts_engine = None try: app.state.tts_engine = Miner(repo_path) app.state.tts_engine.warmup() vocence_yaml = repo_path / "vocence_config.yaml" if vocence_yaml.exists(): with vocence_yaml.open("r", encoding="utf-8") as f: cfg = safe_load(f) or {} app.state.sample_rate = int(cfg.get("generation", {}).get("sample_rate", 24000)) app.state.adapter = str(cfg.get("runtime", {}).get("adapter", "unknown")) else: app.state.sample_rate = 24000 app.state.adapter = "unknown" app.state.status = "healthy" except Exception as exc: app.state.status = f"startup_failed: {exc}" app.state.tts_engine = None @app.get("/health") async def health() -> dict: return VocenceHealthResponse( status=getattr(app.state, "status", "unknown"), model_loaded=getattr(app.state, "tts_engine", None) is not None, sample_rate=getattr(app.state, "sample_rate", None), adapter=getattr(app.state, "adapter", None), repo_path=str(repo_path), ).model_dump() @app.post("/speak", response_class=Response) async def speak(args: VocenceSpeakRequest): engine = getattr(app.state, "tts_engine", None) if engine is None: raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="TTS engine not loaded", ) waveform, sample_rate = engine.generate_wav(instruction=args.instruction, text=args.text) waveform = np.asarray(waveform) if waveform.ndim != 1 or waveform.size == 0: raise HTTPException(status_code=400, detail="invalid waveform") duration_sec = float(waveform.shape[0]) / float(sample_rate) if duration_sec <= 0 or duration_sec > VOCENCE_MAX_AUDIO_SECONDS: raise HTTPException(status_code=400, detail="invalid duration") return Response( content=waveform_to_wav_bytes(waveform, sample_rate), media_type="audio/wav", ) if __name__ == "__main__": uvicorn.run("vocence_local_wrapper:app", host="127.0.0.1", port=8000, reload=False)