Spaces:
Sleeping
Sleeping
Mouhamed Naby NDIAYE committed on
Commit ·
7b5e2ea
1
Parent(s): 254814c
Fix Hugging Face Space model loading
Browse files- Dockerfile +25 -25
- frontend/src/App.jsx +11 -10
- wolof_voice_agent/app/main.py +3 -1
- wolof_voice_agent/app/services/llm/llamacpp_llm.py +2 -2
- wolof_voice_agent/app/services/rag/admin_rag.py +1 -1
- wolof_voice_agent/app/services/tts/parler_wolof_tts.py +190 -120
- wolof_voice_agent/config/models.yaml +7 -7
- wolof_voice_agent/requirements.txt +10 -6
Dockerfile
CHANGED
|
@@ -1,64 +1,66 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
-
# System dependencies
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
-
ffmpeg git curl nodejs npm build-essential cmake \
|
| 6 |
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
|
| 8 |
WORKDIR /app
|
| 9 |
|
| 10 |
-
#
|
| 11 |
RUN pip install --no-cache-dir \
|
| 12 |
torch==2.4.1 torchaudio==2.4.1 \
|
| 13 |
--index-url https://download.pytorch.org/whl/cpu
|
| 14 |
|
| 15 |
-
# ── Python: project requirements ────────────────────────────────────────────
|
| 16 |
COPY wolof_voice_agent/requirements.txt ./requirements.txt
|
| 17 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
pip install --no-cache-dir diskcache jinja2
|
| 24 |
|
| 25 |
-
# ── Bake models into image (one Docker layer, cached independently of code) ─
|
| 26 |
-
# Models go into the same path that get_cache_dir() returns at runtime.
|
| 27 |
ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
|
| 28 |
ENV PYTHONPATH=/app/wolof_voice_agent
|
| 29 |
ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
|
| 30 |
-
ENV
|
|
|
|
|
|
|
| 31 |
|
| 32 |
RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
|
| 33 |
/app/wolof_voice_agent/models/gguf
|
| 34 |
|
| 35 |
RUN python - <<'PYEOF'
|
| 36 |
import os
|
|
|
|
| 37 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 38 |
|
| 39 |
-
cache
|
| 40 |
-
gguf_dir
|
| 41 |
gguf_repo = os.environ["GGUF_REPO"]
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
|
| 44 |
print("Downloading ASR: M9and2M/whisper-small-wolof ...")
|
| 45 |
snapshot_download(
|
| 46 |
-
"M9and2M/whisper-small-wolof",
|
|
|
|
| 47 |
ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
|
| 48 |
)
|
| 49 |
|
| 50 |
-
print("Downloading TTS:
|
| 51 |
-
snapshot_download(
|
| 52 |
|
| 53 |
-
print("Downloading
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
print("All models ready.")
|
| 59 |
PYEOF
|
| 60 |
|
| 61 |
-
# ── React frontend build ─────────────────────────────────────────────────────
|
| 62 |
COPY frontend/package.json frontend/package-lock.json \
|
| 63 |
/app/wolof_voice_agent/frontend/
|
| 64 |
RUN cd /app/wolof_voice_agent/frontend && npm ci
|
|
@@ -66,10 +68,8 @@ RUN cd /app/wolof_voice_agent/frontend && npm ci
|
|
| 66 |
COPY frontend/ /app/wolof_voice_agent/frontend/
|
| 67 |
RUN cd /app/wolof_voice_agent/frontend && npm run build
|
| 68 |
|
| 69 |
-
# ── Backend code (after models so code changes don't bust model layer) ───────
|
| 70 |
COPY wolof_voice_agent/ /app/wolof_voice_agent/
|
| 71 |
|
| 72 |
-
# ── Runtime ──────────────────────────────────────────────────────────────────
|
| 73 |
WORKDIR /app/wolof_voice_agent
|
| 74 |
|
| 75 |
ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
|
|
|
| 3 |
RUN apt-get update && apt-get install -y \
|
| 4 |
+
ffmpeg git curl nodejs npm build-essential cmake pkg-config libopenblas-dev \
|
| 5 |
&& rm -rf /var/lib/apt/lists/*
|
| 6 |
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
+
# CPU torch for Hugging Face Spaces CPU. Keep this aligned with requirements pins.
|
| 10 |
RUN pip install --no-cache-dir \
|
| 11 |
torch==2.4.1 torchaudio==2.4.1 \
|
| 12 |
--index-url https://download.pytorch.org/whl/cpu
|
| 13 |
|
|
|
|
| 14 |
COPY wolof_voice_agent/requirements.txt ./requirements.txt
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
+
# Build llama-cpp-python inside Debian. The prebuilt wheel previously loaded a
|
| 18 |
+
# musl-linked libllama.so on Spaces and failed with libc.musl-x86_64.so.1.
|
| 19 |
+
RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \
|
| 20 |
+
FORCE_CMAKE=1 \
|
| 21 |
+
pip install --no-cache-dir --no-binary llama-cpp-python llama-cpp-python==0.2.90 && \
|
| 22 |
pip install --no-cache-dir diskcache jinja2
|
| 23 |
|
|
|
|
|
|
|
| 24 |
ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
|
| 25 |
ENV PYTHONPATH=/app/wolof_voice_agent
|
| 26 |
ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
|
| 27 |
+
ENV GGUF_SOURCE_FILENAME=soynade-research.Oolel-v0.1.Q4_K_M.gguf
|
| 28 |
+
ENV GGUF_FILENAME=oolel-v0.1-q4_k_m.gguf
|
| 29 |
+
ENV TTS_REPO=CONCREE/Adia_TTS
|
| 30 |
|
| 31 |
RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
|
| 32 |
/app/wolof_voice_agent/models/gguf
|
| 33 |
|
| 34 |
RUN python - <<'PYEOF'
|
| 35 |
import os
|
| 36 |
+
import shutil
|
| 37 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 38 |
|
| 39 |
+
cache = os.environ["HF_HOME"]
|
| 40 |
+
gguf_dir = "/app/wolof_voice_agent/models/gguf"
|
| 41 |
gguf_repo = os.environ["GGUF_REPO"]
|
| 42 |
+
gguf_source_file = os.environ["GGUF_SOURCE_FILENAME"]
|
| 43 |
+
gguf_runtime_file = os.environ["GGUF_FILENAME"]
|
| 44 |
+
tts_repo = os.environ["TTS_REPO"]
|
| 45 |
|
| 46 |
print("Downloading ASR: M9and2M/whisper-small-wolof ...")
|
| 47 |
snapshot_download(
|
| 48 |
+
"M9and2M/whisper-small-wolof",
|
| 49 |
+
cache_dir=cache,
|
| 50 |
ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
|
| 51 |
)
|
| 52 |
|
| 53 |
+
print(f"Downloading TTS: {tts_repo} ...")
|
| 54 |
+
snapshot_download(tts_repo, cache_dir=cache)
|
| 55 |
|
| 56 |
+
print(f"Downloading LLM GGUF: {gguf_source_file} from {gguf_repo} ...")
|
| 57 |
+
src = hf_hub_download(repo_id=gguf_repo, filename=gguf_source_file, local_dir=gguf_dir)
|
| 58 |
+
dst = os.path.join(gguf_dir, gguf_runtime_file)
|
| 59 |
+
if src != dst:
|
| 60 |
+
shutil.copy2(src, dst)
|
| 61 |
print("All models ready.")
|
| 62 |
PYEOF
|
| 63 |
|
|
|
|
| 64 |
COPY frontend/package.json frontend/package-lock.json \
|
| 65 |
/app/wolof_voice_agent/frontend/
|
| 66 |
RUN cd /app/wolof_voice_agent/frontend && npm ci
|
|
|
|
| 68 |
COPY frontend/ /app/wolof_voice_agent/frontend/
|
| 69 |
RUN cd /app/wolof_voice_agent/frontend && npm run build
|
| 70 |
|
|
|
|
| 71 |
COPY wolof_voice_agent/ /app/wolof_voice_agent/
|
| 72 |
|
|
|
|
| 73 |
WORKDIR /app/wolof_voice_agent
|
| 74 |
|
| 75 |
ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
|
frontend/src/App.jsx
CHANGED
|
@@ -254,17 +254,18 @@ export default function App() {
|
|
| 254 |
body: JSON.stringify({ text, profile: 'general', context: {} }),
|
| 255 |
})
|
| 256 |
const responseText = res.response_text_wo || res.response_text || ''
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
const tts = await apiFetch('/voice/synthesize', {
|
| 260 |
-
method: 'POST',
|
| 261 |
-
headers: { 'Content-Type': 'application/json' },
|
| 262 |
-
body: JSON.stringify({ text_wo: responseText }),
|
| 263 |
-
})
|
| 264 |
-
if (tts.audio_url) audioUrl = apiBase + tts.audio_url
|
| 265 |
-
} catch {}
|
| 266 |
-
replaceLoading(loadId, { text: responseText, audioUrl })
|
| 267 |
setStatus('ok')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
} catch (err) {
|
| 269 |
replaceLoading(loadId, { text: `JΓ pp na soxor: ${err.message}` })
|
| 270 |
setStatus('err')
|
|
|
|
| 254 |
body: JSON.stringify({ text, profile: 'general', context: {} }),
|
| 255 |
})
|
| 256 |
const responseText = res.response_text_wo || res.response_text || ''
|
| 257 |
+
// Affiche le texte immédiatement — sans attendre le TTS
|
| 258 |
+
replaceLoading(loadId, { text: responseText, audioUrl: null })
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
setStatus('ok')
|
| 260 |
+
// TTS en arrière-plan — met à jour l'audio quand prêt
|
| 261 |
+
apiFetch('/voice/synthesize', {
|
| 262 |
+
method: 'POST',
|
| 263 |
+
headers: { 'Content-Type': 'application/json' },
|
| 264 |
+
body: JSON.stringify({ text_wo: responseText }),
|
| 265 |
+
}).then(tts => {
|
| 266 |
+
if (tts.audio_url)
|
| 267 |
+
setMessages(prev => prev.map(m => m.id === loadId ? { ...m, audioUrl: apiBase + tts.audio_url } : m))
|
| 268 |
+
}).catch(() => {})
|
| 269 |
} catch (err) {
|
| 270 |
replaceLoading(loadId, { text: `JΓ pp na soxor: ${err.message}` })
|
| 271 |
setStatus('err')
|
wolof_voice_agent/app/main.py
CHANGED
|
@@ -281,7 +281,7 @@ def text_respond(req: RespondRequest):
|
|
| 281 |
normalizer = get_normalizer()
|
| 282 |
normalized_text = normalizer.normalize(req.text)
|
| 283 |
rag_sources = []
|
| 284 |
-
_RAG_THRESHOLD = 0.
|
| 285 |
try:
|
| 286 |
# Always check RAG — use it only when relevant (score >= threshold)
|
| 287 |
rag_result = answer_administration_question(
|
|
@@ -300,6 +300,8 @@ def text_respond(req: RespondRequest):
|
|
| 300 |
logger.error(f"/text/respond error: {e}")
|
| 301 |
raise HTTPException(status_code=500, detail=str(e))
|
| 302 |
|
|
|
|
|
|
|
| 303 |
safety_flags, safe_response, requires_review = apply_safety(
|
| 304 |
text=normalized_text,
|
| 305 |
response=raw_response,
|
|
|
|
| 281 |
normalizer = get_normalizer()
|
| 282 |
normalized_text = normalizer.normalize(req.text)
|
| 283 |
rag_sources = []
|
| 284 |
+
_RAG_THRESHOLD = 0.50
|
| 285 |
try:
|
| 286 |
# Always check RAG — use it only when relevant (score >= threshold)
|
| 287 |
rag_result = answer_administration_question(
|
|
|
|
| 300 |
logger.error(f"/text/respond error: {e}")
|
| 301 |
raise HTTPException(status_code=500, detail=str(e))
|
| 302 |
|
| 303 |
+
raw_response = raw_response.replace("\\n", " ").replace("\n", " ").strip()
|
| 304 |
+
|
| 305 |
safety_flags, safe_response, requires_review = apply_safety(
|
| 306 |
text=normalized_text,
|
| 307 |
response=raw_response,
|
wolof_voice_agent/app/services/llm/llamacpp_llm.py
CHANGED
|
@@ -33,7 +33,7 @@ _SYSTEM_BY_PROFILE = {
|
|
| 33 |
"administration": (
|
| 34 |
"Tu es khAdI, une IA qui parle wolof. "
|
| 35 |
"Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
|
| 36 |
-
"
|
| 37 |
),
|
| 38 |
}
|
| 39 |
|
|
@@ -138,7 +138,7 @@ class LlamaCppLLM(LLMInterface):
|
|
| 138 |
f"Sources officielles:\n{rag_context}\n\n"
|
| 139 |
"IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
|
| 140 |
"sans liste, sans numerotation, sans tiret, sans \\n. "
|
| 141 |
-
"
|
| 142 |
)
|
| 143 |
result = self._chat(system, user_msg)
|
| 144 |
return result or "Baal ma, amuma tontu bu woor ci documents yi."
|
|
|
|
| 33 |
"administration": (
|
| 34 |
"Tu es khAdI, une IA qui parle wolof. "
|
| 35 |
"Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
|
| 36 |
+
"6 a 8 phrases enchainΓ©es naturellement. Sois factuel et complet, couvre tous les details importants."
|
| 37 |
),
|
| 38 |
}
|
| 39 |
|
|
|
|
| 138 |
f"Sources officielles:\n{rag_context}\n\n"
|
| 139 |
"IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
|
| 140 |
"sans liste, sans numerotation, sans tiret, sans \\n. "
|
| 141 |
+
"6 a 8 phrases enchainΓ©es naturellement. Couvre tous les details utiles."
|
| 142 |
)
|
| 143 |
result = self._chat(system, user_msg)
|
| 144 |
return result or "Baal ma, amuma tontu bu woor ci documents yi."
|
wolof_voice_agent/app/services/rag/admin_rag.py
CHANGED
|
@@ -214,7 +214,7 @@ def answer_administration_question(
|
|
| 214 |
prompt_context["rag_instruction"] = (
|
| 215 |
"Les sources sont en francais. Reponds en wolof simple. "
|
| 216 |
"UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
|
| 217 |
-
"Couvre en
|
| 218 |
"Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
|
| 219 |
)
|
| 220 |
try:
|
|
|
|
| 214 |
prompt_context["rag_instruction"] = (
|
| 215 |
"Les sources sont en francais. Reponds en wolof simple. "
|
| 216 |
"UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
|
| 217 |
+
"Couvre en 6-8 phrases: ce que c'est, a quoi ca sert, comment l'obtenir, documents requis, delais, precision cle. "
|
| 218 |
"Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
|
| 219 |
)
|
| 220 |
try:
|
wolof_voice_agent/app/services/tts/parler_wolof_tts.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
import io
|
| 2 |
import logging
|
|
|
|
| 3 |
import re
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Optional
|
| 6 |
|
|
|
|
| 7 |
import torch
|
| 8 |
|
| 9 |
from .mock_tts import MockTTS
|
|
@@ -12,58 +14,127 @@ from app.core.config import get_cache_dir, get_config
|
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
class ParlerWolofTTS(TTSInterface):
|
| 45 |
-
"""
|
| 46 |
|
| 47 |
def __init__(self, model_name: Optional[str] = None):
|
| 48 |
cfg = get_config()
|
| 49 |
-
self._model_id = model_name or cfg.tts.model_name or "
|
| 50 |
self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
|
| 51 |
self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
|
| 52 |
-
self._device = torch.device(
|
|
|
|
|
|
|
| 53 |
self._tokenizer = None
|
| 54 |
self._model = None
|
|
|
|
|
|
|
| 55 |
self._loaded = False
|
| 56 |
self._fallback = MockTTS()
|
| 57 |
|
| 58 |
-
def _find_snapshot(self
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
snaps = cache / f"models--{key}" / "snapshots"
|
| 61 |
-
if
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
if (snap / "config.json").exists():
|
| 66 |
-
return snap
|
| 67 |
return None
|
| 68 |
|
| 69 |
def _load(self) -> None:
|
|
@@ -73,124 +144,124 @@ class ParlerWolofTTS(TTSInterface):
|
|
| 73 |
from parler_tts import ParlerTTSForConditionalGeneration
|
| 74 |
from transformers import AutoTokenizer
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
model_ref = str(snap) if snap else self._model_id
|
| 79 |
local_only = snap is not None
|
| 80 |
-
logger.info(f"Loading
|
| 81 |
|
| 82 |
self._tokenizer = AutoTokenizer.from_pretrained(
|
| 83 |
-
|
| 84 |
-
cache_dir=str(cache),
|
| 85 |
-
local_files_only=local_only,
|
| 86 |
)
|
| 87 |
self._model = ParlerTTSForConditionalGeneration.from_pretrained(
|
| 88 |
-
|
| 89 |
-
cache_dir=str(cache),
|
| 90 |
-
local_files_only=local_only,
|
| 91 |
).to(self._device)
|
| 92 |
self._model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
self._loaded = True
|
| 94 |
-
logger.info("
|
| 95 |
except Exception:
|
| 96 |
-
logger.exception("
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
def
|
| 99 |
-
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
if audio.size == 0:
|
| 102 |
return audio.astype(np.float32)
|
| 103 |
audio = audio.astype(np.float32)
|
| 104 |
if audio.ndim > 1:
|
| 105 |
audio = audio.mean(axis=1)
|
| 106 |
-
audio =
|
| 107 |
-
|
| 108 |
-
try:
|
| 109 |
-
from scipy.signal import butter, sosfiltfilt
|
| 110 |
-
|
| 111 |
-
high = min(7800, int(sr * 0.44))
|
| 112 |
-
sos = butter(6, [80, high], btype="bandpass", fs=sr, output="sos")
|
| 113 |
-
audio = sosfiltfilt(sos, audio).astype(np.float32)
|
| 114 |
-
except Exception:
|
| 115 |
-
# Lightweight fallback: remove slow DC drift with a moving average.
|
| 116 |
-
win = max(64, min(2048, sr // 40))
|
| 117 |
-
kernel = np.ones(win, dtype=np.float32) / win
|
| 118 |
-
drift = np.convolve(audio, kernel, mode="same")
|
| 119 |
-
audio = (audio - drift).astype(np.float32)
|
| 120 |
-
|
| 121 |
-
# Noise gate: attenuate very low-level background without hard clipping words.
|
| 122 |
-
gate = 0.006
|
| 123 |
-
abs_audio = np.abs(audio)
|
| 124 |
-
mask = abs_audio < gate
|
| 125 |
-
audio[mask] *= 0.18
|
| 126 |
-
|
| 127 |
-
rms = float(np.sqrt(np.mean(np.square(audio))) + 1e-9)
|
| 128 |
audio *= self._target_rms / rms
|
| 129 |
peak = float(np.max(np.abs(audio)) + 1e-9)
|
| 130 |
if peak > 0.95:
|
| 131 |
audio *= 0.95 / peak
|
| 132 |
-
|
| 133 |
if self._playback_speed != 1.0 and audio.size > 32:
|
| 134 |
target_len = max(32, int(audio.size / self._playback_speed))
|
| 135 |
-
x_old = np.linspace(0.0, 1.0,
|
| 136 |
-
x_new = np.linspace(0.0, 1.0,
|
| 137 |
audio = np.interp(x_new, x_old, audio).astype(np.float32)
|
| 138 |
-
|
| 139 |
-
fade = min(int(sr * 0.018), audio.size // 8)
|
| 140 |
-
if fade > 1:
|
| 141 |
-
ramp = np.linspace(0.0, 1.0, fade, dtype=np.float32)
|
| 142 |
-
audio[:fade] *= ramp
|
| 143 |
-
audio[-fade:] *= ramp[::-1]
|
| 144 |
return np.clip(audio, -0.98, 0.98).astype(np.float32)
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def synthesize(self, text: str) -> TTSResult:
|
| 147 |
self._load()
|
| 148 |
if not self._loaded:
|
| 149 |
return self._fallback.synthesize(text)
|
| 150 |
|
| 151 |
try:
|
| 152 |
-
import numpy as np
|
| 153 |
import soundfile as sf
|
| 154 |
|
| 155 |
-
|
| 156 |
-
"A clear Wolof speaking voice, natural and calm, recorded in a quiet room, "
|
| 157 |
-
"with normal pauses between words, high audio quality, and no background noise."
|
| 158 |
-
)
|
| 159 |
-
desc = self._tokenizer(description, return_tensors="pt")
|
| 160 |
-
desc = {k: v.to(self._device) for k, v in desc.items()}
|
| 161 |
-
|
| 162 |
-
chunks = []
|
| 163 |
sr = int(self._model.config.sampling_rate)
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
for
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
generation = self._model.generate(
|
| 171 |
-
input_ids=desc["input_ids"],
|
| 172 |
-
attention_mask=desc.get("attention_mask"),
|
| 173 |
-
prompt_input_ids=prompt["input_ids"],
|
| 174 |
-
prompt_attention_mask=prompt.get("attention_mask"),
|
| 175 |
-
do_sample=True,
|
| 176 |
-
temperature=0.8,
|
| 177 |
-
top_k=50,
|
| 178 |
-
repetition_penalty=1.2,
|
| 179 |
-
min_new_tokens=120,
|
| 180 |
-
max_new_tokens=1000,
|
| 181 |
-
)
|
| 182 |
-
audio = generation.detach().cpu().numpy().squeeze().astype(np.float32)
|
| 183 |
-
audio = self._clean_audio(audio, sr)
|
| 184 |
if audio.size:
|
| 185 |
chunks.append(audio)
|
| 186 |
-
chunks.append(
|
| 187 |
|
| 188 |
if not chunks:
|
| 189 |
return self._fallback.synthesize(text)
|
| 190 |
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
buf = io.BytesIO()
|
| 193 |
-
sf.write(buf,
|
| 194 |
buf.seek(0)
|
| 195 |
return TTSResult(
|
| 196 |
audio_bytes=buf.read(),
|
|
@@ -200,7 +271,7 @@ class ParlerWolofTTS(TTSInterface):
|
|
| 200 |
audio_contains_speech=True,
|
| 201 |
)
|
| 202 |
except Exception:
|
| 203 |
-
logger.exception("
|
| 204 |
return self._fallback.synthesize(text)
|
| 205 |
|
| 206 |
@property
|
|
@@ -213,8 +284,7 @@ class ParlerWolofTTS(TTSInterface):
|
|
| 213 |
|
| 214 |
def unload(self) -> None:
|
| 215 |
if self._model is not None:
|
| 216 |
-
del self._model, self._tokenizer
|
| 217 |
-
self._model = None
|
| 218 |
-
self._tokenizer = None
|
| 219 |
self._loaded = False
|
| 220 |
-
logger.info("
|
|
|
|
| 1 |
import io
|
| 2 |
import logging
|
| 3 |
+
import random
|
| 4 |
import re
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Optional
|
| 7 |
|
| 8 |
+
import numpy as np
|
| 9 |
import torch
|
| 10 |
|
| 11 |
from .mock_tts import MockTTS
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
+
# ── Pauses prosodiques (ms) ───────────────────────────────────────────────────
|
| 18 |
+
_PAUSE_COMMA = (200, 340)
|
| 19 |
+
_PAUSE_PHRASE = (420, 600)
|
| 20 |
+
_PAUSE_LONG = (650, 900)
|
| 21 |
+
_PAUSE_END = (900, 1200)
|
| 22 |
+
|
| 23 |
+
_MAX_WORDS = 12
|
| 24 |
+
|
| 25 |
+
_CONNECTORS = re.compile(
|
| 26 |
+
r"\b(ak|waaye|ndax|ndaxte|te|walla|bu|su|ngir|dΓ«kΓ«k)\b",
|
| 27 |
+
re.IGNORECASE,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
_DESCRIPTION = (
|
| 31 |
+
"Une voix fΓ©minine claire et naturelle en wolof, "
|
| 32 |
+
"calme, bien articulΓ©e, avec des pauses naturelles."
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _rnd_pause(r: tuple) -> int:
|
| 37 |
+
return max(80, random.randint(*r) + random.randint(-60, 80))
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _ending_pause(text: str) -> int:
|
| 41 |
+
t = text.rstrip()
|
| 42 |
+
if not t:
|
| 43 |
+
return _rnd_pause(_PAUSE_PHRASE)
|
| 44 |
+
last = t[-1]
|
| 45 |
+
if last in "?!":
|
| 46 |
+
return _rnd_pause(_PAUSE_LONG)
|
| 47 |
+
if last == ".":
|
| 48 |
+
return _rnd_pause(_PAUSE_PHRASE) if len(t.split()) <= 8 else _rnd_pause(_PAUSE_LONG)
|
| 49 |
+
if last == ",":
|
| 50 |
+
return _rnd_pause(_PAUSE_COMMA)
|
| 51 |
+
return _rnd_pause(_PAUSE_PHRASE)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _split_on_connector(text: str) -> list[str]:
|
| 55 |
+
words = text.split()
|
| 56 |
+
if len(words) <= _MAX_WORDS:
|
| 57 |
+
return [text]
|
| 58 |
+
mid = len(words) // 2
|
| 59 |
+
best_idx, best_dist = None, len(words)
|
| 60 |
+
for i, w in enumerate(words):
|
| 61 |
+
if _CONNECTORS.fullmatch(w.strip(",.!?")):
|
| 62 |
+
d = abs(i - mid)
|
| 63 |
+
if d < best_dist:
|
| 64 |
+
best_dist, best_idx = d, i
|
| 65 |
+
if best_idx is not None and 2 <= best_idx <= len(words) - 2:
|
| 66 |
+
left = " ".join(words[:best_idx])
|
| 67 |
+
right = " ".join(words[best_idx:])
|
| 68 |
+
return _split_on_connector(left) + _split_on_connector(right)
|
| 69 |
+
return [" ".join(words[i:i + _MAX_WORDS]) for i in range(0, len(words), _MAX_WORDS)]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _clean_seg(text: str) -> str:
|
| 73 |
+
text = re.sub(r"\s+", " ", text).strip()
|
| 74 |
+
if text and text[-1] not in ".!?,;":
|
| 75 |
+
text += "."
|
| 76 |
+
return text
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def prosody_split(text: str) -> list[dict]:
|
| 80 |
+
"""Découpe le texte en segments avec pause naturelle après chaque segment."""
|
| 81 |
+
sentences = re.split(r"(?<=[.!?;])\s+", text.strip())
|
| 82 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
| 83 |
+
segments: list[dict] = []
|
| 84 |
+
|
| 85 |
+
for sent in sentences:
|
| 86 |
+
comma_parts = [p.strip() for p in sent.split(",") if p.strip()]
|
| 87 |
+
if len(sent.split()) > _MAX_WORDS and len(comma_parts) > 1:
|
| 88 |
+
sub_units = [(p, "," if i < len(comma_parts) - 1 else "") for i, p in enumerate(comma_parts)]
|
| 89 |
+
else:
|
| 90 |
+
sub_units = [(sent, "")]
|
| 91 |
+
|
| 92 |
+
for raw, delim in sub_units:
|
| 93 |
+
pieces = _split_on_connector(raw)
|
| 94 |
+
for j, piece in enumerate(pieces):
|
| 95 |
+
cleaned = _clean_seg(piece)
|
| 96 |
+
if j == len(pieces) - 1:
|
| 97 |
+
synthetic = cleaned if not delim else piece.rstrip() + ","
|
| 98 |
+
pause = _ending_pause(synthetic)
|
| 99 |
+
else:
|
| 100 |
+
pause = _rnd_pause(_PAUSE_COMMA)
|
| 101 |
+
segments.append({"text": cleaned, "pause_after_ms": pause})
|
| 102 |
+
|
| 103 |
+
if segments:
|
| 104 |
+
segments[-1]["pause_after_ms"] = _rnd_pause(_PAUSE_END)
|
| 105 |
+
return segments
|
| 106 |
|
| 107 |
|
| 108 |
class ParlerWolofTTS(TTSInterface):
|
| 109 |
+
"""Adia TTS (CONCREE/Adia_TTS) avec segmentation prosodique wolof."""
|
| 110 |
|
| 111 |
def __init__(self, model_name: Optional[str] = None):
|
| 112 |
cfg = get_config()
|
| 113 |
+
self._model_id = model_name or cfg.tts.model_name or "CONCREE/Adia_TTS"
|
| 114 |
self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
|
| 115 |
self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
|
| 116 |
+
self._device = torch.device(
|
| 117 |
+
"cuda" if cfg.runtime.prefer_gpu_for_tts and torch.cuda.is_available() else "cpu"
|
| 118 |
+
)
|
| 119 |
self._tokenizer = None
|
| 120 |
self._model = None
|
| 121 |
+
self._desc_ids = None # pre-tokenized description (reused across segments)
|
| 122 |
+
self._desc_mask = None
|
| 123 |
self._loaded = False
|
| 124 |
self._fallback = MockTTS()
|
| 125 |
|
| 126 |
+
def _find_snapshot(self) -> Optional[Path]:
|
| 127 |
+
cache = get_cache_dir()
|
| 128 |
+
key = self._model_id.replace("/", "--")
|
| 129 |
+
# prefer local/ directory (manual download)
|
| 130 |
+
local = cache / f"models--{key}" / "local"
|
| 131 |
+
if (local / "config.json").exists():
|
| 132 |
+
return local
|
| 133 |
snaps = cache / f"models--{key}" / "snapshots"
|
| 134 |
+
if snaps.exists():
|
| 135 |
+
for snap in sorted(snaps.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True):
|
| 136 |
+
if (snap / "config.json").exists():
|
| 137 |
+
return snap
|
|
|
|
|
|
|
| 138 |
return None
|
| 139 |
|
| 140 |
def _load(self) -> None:
|
|
|
|
| 144 |
from parler_tts import ParlerTTSForConditionalGeneration
|
| 145 |
from transformers import AutoTokenizer
|
| 146 |
|
| 147 |
+
snap = self._find_snapshot()
|
| 148 |
+
ref = str(snap) if snap else self._model_id
|
|
|
|
| 149 |
local_only = snap is not None
|
| 150 |
+
logger.info(f"Loading Adia TTS '{ref}' on {self._device}")
|
| 151 |
|
| 152 |
self._tokenizer = AutoTokenizer.from_pretrained(
|
| 153 |
+
ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
|
|
|
|
|
|
|
| 154 |
)
|
| 155 |
self._model = ParlerTTSForConditionalGeneration.from_pretrained(
|
| 156 |
+
ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
|
|
|
|
|
|
|
| 157 |
).to(self._device)
|
| 158 |
self._model.eval()
|
| 159 |
+
|
| 160 |
+
# Pre-tokenize description once
|
| 161 |
+
enc = self._tokenizer(_DESCRIPTION, return_tensors="pt")
|
| 162 |
+
self._desc_ids = enc["input_ids"].to(self._device)
|
| 163 |
+
self._desc_mask = enc.get("attention_mask")
|
| 164 |
+
if self._desc_mask is not None:
|
| 165 |
+
self._desc_mask = self._desc_mask.to(self._device)
|
| 166 |
+
else:
|
| 167 |
+
self._desc_mask = torch.ones_like(self._desc_ids)
|
| 168 |
+
|
| 169 |
self._loaded = True
|
| 170 |
+
logger.info("Adia TTS loaded")
|
| 171 |
except Exception:
|
| 172 |
+
logger.exception("Adia TTS load failed β using MockTTS")
|
| 173 |
+
|
| 174 |
+
# ── Audio utils ───────────────────────────────────────────────────────────
|
| 175 |
|
| 176 |
+
def _silence(self, ms: int, sr: int) -> np.ndarray:
|
| 177 |
+
return np.zeros(int(sr * ms / 1000), dtype=np.float32)
|
| 178 |
|
| 179 |
+
def _fade(self, audio: np.ndarray, fade_ms: float, sr: int) -> np.ndarray:
|
| 180 |
+
n = min(int(sr * fade_ms / 1000), audio.size // 4)
|
| 181 |
+
if n < 2:
|
| 182 |
+
return audio
|
| 183 |
+
ramp = np.linspace(0.0, 1.0, n, dtype=np.float32)
|
| 184 |
+
audio = audio.copy()
|
| 185 |
+
audio[:n] *= ramp
|
| 186 |
+
audio[-n:] *= ramp[::-1]
|
| 187 |
+
return audio
|
| 188 |
+
|
| 189 |
+
def _normalize(self, audio: np.ndarray) -> np.ndarray:
|
| 190 |
if audio.size == 0:
|
| 191 |
return audio.astype(np.float32)
|
| 192 |
audio = audio.astype(np.float32)
|
| 193 |
if audio.ndim > 1:
|
| 194 |
audio = audio.mean(axis=1)
|
| 195 |
+
audio -= float(np.mean(audio))
|
| 196 |
+
rms = float(np.sqrt(np.mean(audio ** 2)) + 1e-9)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
audio *= self._target_rms / rms
|
| 198 |
peak = float(np.max(np.abs(audio)) + 1e-9)
|
| 199 |
if peak > 0.95:
|
| 200 |
audio *= 0.95 / peak
|
|
|
|
| 201 |
if self._playback_speed != 1.0 and audio.size > 32:
|
| 202 |
target_len = max(32, int(audio.size / self._playback_speed))
|
| 203 |
+
x_old = np.linspace(0.0, 1.0, audio.size)
|
| 204 |
+
x_new = np.linspace(0.0, 1.0, target_len)
|
| 205 |
audio = np.interp(x_new, x_old, audio).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
return np.clip(audio, -0.98, 0.98).astype(np.float32)
|
| 207 |
|
| 208 |
+
def _synth_segment(self, text: str) -> np.ndarray:
|
| 209 |
+
enc = self._tokenizer(text, return_tensors="pt")
|
| 210 |
+
prompt_ids = enc["input_ids"].to(self._device)
|
| 211 |
+
prompt_mask = enc.get("attention_mask")
|
| 212 |
+
if prompt_mask is not None:
|
| 213 |
+
prompt_mask = prompt_mask.to(self._device)
|
| 214 |
+
else:
|
| 215 |
+
prompt_mask = torch.ones_like(prompt_ids)
|
| 216 |
+
|
| 217 |
+
with torch.no_grad():
|
| 218 |
+
gen = self._model.generate(
|
| 219 |
+
input_ids=self._desc_ids,
|
| 220 |
+
attention_mask=self._desc_mask,
|
| 221 |
+
prompt_input_ids=prompt_ids,
|
| 222 |
+
prompt_attention_mask=prompt_mask,
|
| 223 |
+
do_sample=True,
|
| 224 |
+
temperature=0.8,
|
| 225 |
+
top_k=50,
|
| 226 |
+
repetition_penalty=1.2,
|
| 227 |
+
min_new_tokens=60,
|
| 228 |
+
max_new_tokens=800,
|
| 229 |
+
)
|
| 230 |
+
return gen.detach().cpu().numpy().squeeze().astype(np.float32)
|
| 231 |
+
|
| 232 |
+
# ── Public API ────────────────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
def synthesize(self, text: str) -> TTSResult:
|
| 235 |
self._load()
|
| 236 |
if not self._loaded:
|
| 237 |
return self._fallback.synthesize(text)
|
| 238 |
|
| 239 |
try:
|
|
|
|
| 240 |
import soundfile as sf
|
| 241 |
|
| 242 |
+
segments = prosody_split(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
sr = int(self._model.config.sampling_rate)
|
| 244 |
+
chunks: list[np.ndarray] = []
|
| 245 |
+
|
| 246 |
+
for seg in segments:
|
| 247 |
+
audio = self._synth_segment(seg["text"])
|
| 248 |
+
audio = self._normalize(audio)
|
| 249 |
+
audio = self._fade(audio, fade_ms=18, sr=sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
if audio.size:
|
| 251 |
chunks.append(audio)
|
| 252 |
+
chunks.append(self._silence(seg["pause_after_ms"], sr))
|
| 253 |
|
| 254 |
if not chunks:
|
| 255 |
return self._fallback.synthesize(text)
|
| 256 |
|
| 257 |
+
final = np.concatenate(chunks[:-1] if len(chunks) > 1 else chunks)
|
| 258 |
+
# Final normalisation globale
|
| 259 |
+
peak = float(np.max(np.abs(final)) + 1e-9)
|
| 260 |
+
if peak > 0.95:
|
| 261 |
+
final *= 0.95 / peak
|
| 262 |
+
|
| 263 |
buf = io.BytesIO()
|
| 264 |
+
sf.write(buf, final, samplerate=sr, format="WAV")
|
| 265 |
buf.seek(0)
|
| 266 |
return TTSResult(
|
| 267 |
audio_bytes=buf.read(),
|
|
|
|
| 271 |
audio_contains_speech=True,
|
| 272 |
)
|
| 273 |
except Exception:
|
| 274 |
+
logger.exception("Adia synthesis failed β using MockTTS")
|
| 275 |
return self._fallback.synthesize(text)
|
| 276 |
|
| 277 |
@property
|
|
|
|
| 284 |
|
| 285 |
def unload(self) -> None:
|
| 286 |
if self._model is not None:
|
| 287 |
+
del self._model, self._tokenizer, self._desc_ids, self._desc_mask
|
| 288 |
+
self._model = self._tokenizer = self._desc_ids = self._desc_mask = None
|
|
|
|
| 289 |
self._loaded = False
|
| 290 |
+
logger.info("Adia TTS unloaded")
|
wolof_voice_agent/config/models.yaml
CHANGED
|
@@ -46,9 +46,9 @@ llm:
|
|
| 46 |
quantization: "4bit_if_available"
|
| 47 |
# llama.cpp runtime context. Oolel train context is 32768, but 8192 is a
|
| 48 |
# practical first production-test value for RAG without huge latency.
|
| 49 |
-
n_ctx:
|
| 50 |
-
n_threads:
|
| 51 |
-
max_new_tokens:
|
| 52 |
temperature: 0.2
|
| 53 |
allow_download: true
|
| 54 |
# Wolof-first NLU settings
|
|
@@ -57,10 +57,10 @@ llm:
|
|
| 57 |
use_french_pivot_on_uncertainty: true
|
| 58 |
|
| 59 |
tts:
|
| 60 |
-
provider: "
|
| 61 |
-
model_name: "
|
| 62 |
-
fallback_model_name: "
|
| 63 |
-
third_fallback_model_name: "
|
| 64 |
device: "cpu"
|
| 65 |
allow_download: false
|
| 66 |
playback_speed: 1.0
|
|
|
|
| 46 |
quantization: "4bit_if_available"
|
| 47 |
# llama.cpp runtime context. Oolel train context is 32768; n_ctx below is set
|
| 48 |
# far lower (2048) so RAG prompts run without huge latency.
|
| 49 |
+
n_ctx: 2048
|
| 50 |
+
n_threads: 16
|
| 51 |
+
max_new_tokens: 600
|
| 52 |
temperature: 0.2
|
| 53 |
allow_download: true
|
| 54 |
# Wolof-first NLU settings
|
|
|
|
| 57 |
use_french_pivot_on_uncertainty: true
|
| 58 |
|
| 59 |
tts:
|
| 60 |
+
provider: "parler"
|
| 61 |
+
model_name: "CONCREE/Adia_TTS"
|
| 62 |
+
fallback_model_name: "Moustapha91/TTS_WOLOF_FINAL"
|
| 63 |
+
third_fallback_model_name: "bilalfaye/speecht5_tts-wolof"
|
| 64 |
device: "cpu"
|
| 65 |
allow_download: false
|
| 66 |
playback_speed: 1.0
|
wolof_voice_agent/requirements.txt
CHANGED
|
@@ -18,11 +18,14 @@ pydantic>=2.7.0
|
|
| 18 |
pyyaml>=6.0.1
|
| 19 |
|
| 20 |
# ML / NLP
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Audio
|
| 28 |
soundfile>=0.12.1
|
|
@@ -30,7 +33,7 @@ librosa>=0.10.1
|
|
| 30 |
scipy>=1.13.0
|
| 31 |
|
| 32 |
# Numerical
|
| 33 |
-
numpy>=1.26.0
|
| 34 |
|
| 35 |
# Optional: quantization (install separately if compatible)
|
| 36 |
# bitsandbytes>=0.43.0
|
|
@@ -47,3 +50,4 @@ pytest-asyncio>=0.23.0
|
|
| 47 |
# NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
|
| 48 |
# existing GPU-enabled installation in the 'llama' conda environment.
|
| 49 |
# If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
|
|
|
|
|
|
| 18 |
pyyaml>=6.0.1
|
| 19 |
|
| 20 |
# ML / NLP
|
| 21 |
+
# Pinned for Hugging Face Spaces CPU image.
|
| 22 |
+
# Newer transformers versions can hit torch.library custom_op schema errors
|
| 23 |
+
# with the CPU torch used in the Dockerfile.
|
| 24 |
+
transformers==4.46.3
|
| 25 |
+
accelerate==0.34.2
|
| 26 |
+
huggingface_hub==0.26.5
|
| 27 |
+
sentencepiece==0.2.0
|
| 28 |
+
tokenizers==0.20.3
|
| 29 |
|
| 30 |
# Audio
|
| 31 |
soundfile>=0.12.1
|
|
|
|
| 33 |
scipy>=1.13.0
|
| 34 |
|
| 35 |
# Numerical
|
| 36 |
+
numpy>=1.26.0,<2.1
|
| 37 |
|
| 38 |
# Optional: quantization (install separately if compatible)
|
| 39 |
# bitsandbytes>=0.43.0
|
|
|
|
| 50 |
# NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
|
| 51 |
# existing GPU-enabled installation in the 'llama' conda environment.
|
| 52 |
# If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 53 |
+
parler-tts
|