Spaces:

GaindeNdiaye
/

khAdI

Sleeping

App Files Files Community

Mouhamed Naby NDIAYE committed on May 20

Commit

7b5e2ea

1 Parent(s): 254814c

Fix Hugging Face Space model loading

Browse files

Files changed (8) hide show

Dockerfile +25 -25
frontend/src/App.jsx +11 -10
wolof_voice_agent/app/main.py +3 -1
wolof_voice_agent/app/services/llm/llamacpp_llm.py +2 -2
wolof_voice_agent/app/services/rag/admin_rag.py +1 -1
wolof_voice_agent/app/services/tts/parler_wolof_tts.py +190 -120
wolof_voice_agent/config/models.yaml +7 -7
wolof_voice_agent/requirements.txt +10 -6

Dockerfile CHANGED Viewed

@@ -1,64 +1,66 @@
 FROM python:3.11-slim
-# System dependencies
 RUN apt-get update && apt-get install -y \
-    ffmpeg git curl nodejs npm build-essential cmake \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
-# ── Python: PyTorch CPU ─────────────────────────────────────────────────────
 RUN pip install --no-cache-dir \
     torch==2.4.1 torchaudio==2.4.1 \
     --index-url https://download.pytorch.org/whl/cpu
-# ── Python: project requirements ────────────────────────────────────────────
 COPY wolof_voice_agent/requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
-# ── Python: llama-cpp-python (prebuilt CPU wheel — évite la compilation C++) ─
-RUN pip install --no-cache-dir llama-cpp-python==0.2.90 \
-    --index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
-    --no-deps && \
     pip install --no-cache-dir diskcache jinja2
-# ── Bake models into image (one Docker layer, cached independently of code) ─
-# Models go into the same path that get_cache_dir() returns at runtime.
 ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
 ENV PYTHONPATH=/app/wolof_voice_agent
 ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
-ENV GGUF_FILENAME=soynade-research.Oolel-v0.1.Q5_K_M.gguf
 RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
              /app/wolof_voice_agent/models/gguf
 RUN python - <<'PYEOF'
 import os
 from huggingface_hub import snapshot_download, hf_hub_download
-cache     = os.environ["HF_HOME"]
-gguf_dir  = "/app/wolof_voice_agent/models/gguf"
 gguf_repo = os.environ["GGUF_REPO"]
-gguf_file = os.environ["GGUF_FILENAME"]
 print("Downloading ASR: M9and2M/whisper-small-wolof ...")
 snapshot_download(
-    "M9and2M/whisper-small-wolof", cache_dir=cache,
     ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
 )
-print("Downloading TTS: Moustapha91/TTS_WOLOF_FINAL ...")
-snapshot_download("Moustapha91/TTS_WOLOF_FINAL", cache_dir=cache)
-print("Downloading TTS vocoder: microsoft/speecht5_hifigan ...")
-snapshot_download("microsoft/speecht5_hifigan", cache_dir=cache)
-print(f"Downloading LLM GGUF: {gguf_file} from {gguf_repo} ...")
-hf_hub_download(repo_id=gguf_repo, filename=gguf_file, local_dir=gguf_dir)
 print("All models ready.")
 PYEOF
-# ── React frontend build ─────────────────────────────────────────────────────
 COPY frontend/package.json frontend/package-lock.json \
      /app/wolof_voice_agent/frontend/
 RUN cd /app/wolof_voice_agent/frontend && npm ci
@@ -66,10 +68,8 @@ RUN cd /app/wolof_voice_agent/frontend && npm ci
 COPY frontend/ /app/wolof_voice_agent/frontend/
 RUN cd /app/wolof_voice_agent/frontend && npm run build
-# ── Backend code (after models so code changes don't bust model layer) ───────
 COPY wolof_voice_agent/ /app/wolof_voice_agent/
-# ── Runtime ──────────────────────────────────────────────────────────────────
 WORKDIR /app/wolof_voice_agent
 ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface

 FROM python:3.11-slim
 RUN apt-get update && apt-get install -y \
+    ffmpeg git curl nodejs npm build-essential cmake pkg-config libopenblas-dev \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+# CPU-only torch build for the Hugging Face Spaces CPU runtime. Keep these versions compatible with the pins in requirements.txt.
 RUN pip install --no-cache-dir \
     torch==2.4.1 torchaudio==2.4.1 \
     --index-url https://download.pytorch.org/whl/cpu
 COPY wolof_voice_agent/requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
+# Build llama-cpp-python inside Debian. The prebuilt wheel previously loaded a
+# musl-linked libllama.so on Spaces and failed with libc.musl-x86_64.so.1.
+RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \
+    FORCE_CMAKE=1 \
+    pip install --no-cache-dir --no-binary llama-cpp-python llama-cpp-python==0.2.90 && \
     pip install --no-cache-dir diskcache jinja2
 ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
 ENV PYTHONPATH=/app/wolof_voice_agent
 ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
+ENV GGUF_SOURCE_FILENAME=soynade-research.Oolel-v0.1.Q4_K_M.gguf
+ENV GGUF_FILENAME=oolel-v0.1-q4_k_m.gguf
+ENV TTS_REPO=CONCREE/Adia_TTS
 RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
              /app/wolof_voice_agent/models/gguf
 RUN python - <<'PYEOF'
 import os
+import shutil
 from huggingface_hub import snapshot_download, hf_hub_download
+cache = os.environ["HF_HOME"]
+gguf_dir = "/app/wolof_voice_agent/models/gguf"
 gguf_repo = os.environ["GGUF_REPO"]
+gguf_source_file = os.environ["GGUF_SOURCE_FILENAME"]
+gguf_runtime_file = os.environ["GGUF_FILENAME"]
+tts_repo = os.environ["TTS_REPO"]
 print("Downloading ASR: M9and2M/whisper-small-wolof ...")
 snapshot_download(
+    "M9and2M/whisper-small-wolof",
+    cache_dir=cache,
     ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
 )
+print(f"Downloading TTS: {tts_repo} ...")
+snapshot_download(tts_repo, cache_dir=cache)
+print(f"Downloading LLM GGUF: {gguf_source_file} from {gguf_repo} ...")
+src = hf_hub_download(repo_id=gguf_repo, filename=gguf_source_file, local_dir=gguf_dir)
+dst = os.path.join(gguf_dir, gguf_runtime_file)
+if src != dst:
+    shutil.copy2(src, dst)
 print("All models ready.")
 PYEOF
 COPY frontend/package.json frontend/package-lock.json \
      /app/wolof_voice_agent/frontend/
 RUN cd /app/wolof_voice_agent/frontend && npm ci
 COPY frontend/ /app/wolof_voice_agent/frontend/
 RUN cd /app/wolof_voice_agent/frontend && npm run build
 COPY wolof_voice_agent/ /app/wolof_voice_agent/
 WORKDIR /app/wolof_voice_agent
 ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface

frontend/src/App.jsx CHANGED Viewed

@@ -254,17 +254,18 @@ export default function App() {
         body: JSON.stringify({ text, profile: 'general', context: {} }),
       })
       const responseText = res.response_text_wo || res.response_text || ''
-      let audioUrl = null
-      try {
-        const tts = await apiFetch('/voice/synthesize', {
-          method: 'POST',
-          headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({ text_wo: responseText }),
-        })
-        if (tts.audio_url) audioUrl = apiBase + tts.audio_url
-      } catch {}
-      replaceLoading(loadId, { text: responseText, audioUrl })
       setStatus('ok')
     } catch (err) {
       replaceLoading(loadId, { text: `Jàpp na soxor: ${err.message}` })
       setStatus('err')

         body: JSON.stringify({ text, profile: 'general', context: {} }),
       })
       const responseText = res.response_text_wo || res.response_text || ''
+      // Affiche le texte immédiatement — sans attendre le TTS
+      replaceLoading(loadId, { text: responseText, audioUrl: null })
       setStatus('ok')
+      // TTS en arrière-plan — met à jour l'audio quand prêt
+      apiFetch('/voice/synthesize', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ text_wo: responseText }),
+      }).then(tts => {
+        if (tts.audio_url)
+          setMessages(prev => prev.map(m => m.id === loadId ? { ...m, audioUrl: apiBase + tts.audio_url } : m))
+      }).catch(() => {})
     } catch (err) {
       replaceLoading(loadId, { text: `Jàpp na soxor: ${err.message}` })
       setStatus('err')

wolof_voice_agent/app/main.py CHANGED Viewed

@@ -281,7 +281,7 @@ def text_respond(req: RespondRequest):
     normalizer = get_normalizer()
     normalized_text = normalizer.normalize(req.text)
     rag_sources = []
-    _RAG_THRESHOLD = 0.25
     try:
         # Always check RAG — use it only when relevant (score >= threshold)
         rag_result = answer_administration_question(
@@ -300,6 +300,8 @@ def text_respond(req: RespondRequest):
         logger.error(f"/text/respond error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
     safety_flags, safe_response, requires_review = apply_safety(
         text=normalized_text,
         response=raw_response,

     normalizer = get_normalizer()
     normalized_text = normalizer.normalize(req.text)
     rag_sources = []
+    _RAG_THRESHOLD = 0.50
     try:
         # Always check RAG — use it only when relevant (score >= threshold)
         rag_result = answer_administration_question(
         logger.error(f"/text/respond error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+    raw_response = raw_response.replace("\\n", " ").replace("\n", " ").strip()
     safety_flags, safe_response, requires_review = apply_safety(
         text=normalized_text,
         response=raw_response,

wolof_voice_agent/app/services/llm/llamacpp_llm.py CHANGED Viewed

@@ -33,7 +33,7 @@ _SYSTEM_BY_PROFILE = {
     "administration": (
         "Tu es khAdI, une IA qui parle wolof. "
         "Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
-        "4 a 6 phrases enchainées naturellement. Sois factuel, arrete-toi après la 6e phrase."
     ),
 }
@@ -138,7 +138,7 @@ class LlamaCppLLM(LLMInterface):
                     f"Sources officielles:\n{rag_context}\n\n"
                     "IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
                     "sans liste, sans numerotation, sans tiret, sans \\n. "
-                    "4 a 6 phrases enchainées naturellement. Stop apres la 6e phrase."
                 )
                 result = self._chat(system, user_msg)
                 return result or "Baal ma, amuma tontu bu woor ci documents yi."

     "administration": (
         "Tu es khAdI, une IA qui parle wolof. "
         "Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
+        "6 a 8 phrases enchainées naturellement. Sois factuel et complet, couvre tous les details importants."
     ),
 }
                     f"Sources officielles:\n{rag_context}\n\n"
                     "IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
                     "sans liste, sans numerotation, sans tiret, sans \\n. "
+                    "6 a 8 phrases enchainées naturellement. Couvre tous les details utiles."
                 )
                 result = self._chat(system, user_msg)
                 return result or "Baal ma, amuma tontu bu woor ci documents yi."

wolof_voice_agent/app/services/rag/admin_rag.py CHANGED Viewed

@@ -214,7 +214,7 @@ def answer_administration_question(
         prompt_context["rag_instruction"] = (
             "Les sources sont en francais. Reponds en wolof simple. "
             "UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
-            "Couvre en 4-6 phrases: ce que c'est, a quoi ca sert, comment l'obtenir, precision cle. "
             "Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
         )
         try:

         prompt_context["rag_instruction"] = (
             "Les sources sont en francais. Reponds en wolof simple. "
             "UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
+            "Couvre en 6-8 phrases: ce que c'est, a quoi ca sert, comment l'obtenir, documents requis, delais, precision cle. "
             "Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
         )
         try:

wolof_voice_agent/app/services/tts/parler_wolof_tts.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import io
 import logging
 import re
 from pathlib import Path
 from typing import Optional
 import torch
 from .mock_tts import MockTTS
@@ -12,58 +14,127 @@ from app.core.config import get_cache_dir, get_config
 logger = logging.getLogger(__name__)
-_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
-def _split_tts_units(text: str, max_chars: int = 160) -> list[str]:
-    units: list[str] = []
-    for sentence in _SENTENCE_SPLIT_RE.split(text.strip()):
-        sentence = re.sub(r"\s+", " ", sentence).strip()
-        if not sentence:
-            continue
-        if len(sentence) <= max_chars:
-            units.append(sentence)
-            continue
-        words = sentence.split()
-        current: list[str] = []
-        current_len = 0
-        for word in words:
-            next_len = current_len + len(word) + (1 if current else 0)
-            if current and next_len > max_chars:
-                units.append(" ".join(current))
-                current = [word]
-                current_len = len(word)
-            else:
-                current.append(word)
-                current_len = next_len
-        if current:
-            units.append(" ".join(current))
-    return units or [text.strip()]
 class ParlerWolofTTS(TTSInterface):
-    """Wolof Parler-TTS backend for GalsenAI/Adia-style models."""
     def __init__(self, model_name: Optional[str] = None):
         cfg = get_config()
-        self._model_id = model_name or cfg.tts.model_name or "galsenai/parler-tts-mini-v1-wolof"
         self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
         self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
-        self._device = torch.device("cuda" if cfg.runtime.prefer_gpu_for_tts and torch.cuda.is_available() else "cpu")
         self._tokenizer = None
         self._model = None
         self._loaded = False
         self._fallback = MockTTS()
-    def _find_snapshot(self, cache: Path, model_id: str) -> Optional[Path]:
-        key = model_id.replace("/", "--")
         snaps = cache / f"models--{key}" / "snapshots"
-        if not snaps.exists():
-            return None
-        candidates = sorted(snaps.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
-        for snap in candidates:
-            if (snap / "config.json").exists():
-                return snap
         return None
     def _load(self) -> None:
@@ -73,124 +144,124 @@ class ParlerWolofTTS(TTSInterface):
             from parler_tts import ParlerTTSForConditionalGeneration
             from transformers import AutoTokenizer
-            cache = get_cache_dir()
-            snap = self._find_snapshot(cache, self._model_id)
-            model_ref = str(snap) if snap else self._model_id
             local_only = snap is not None
-            logger.info(f"Loading Parler Wolof TTS '{model_ref}' on {self._device}")
             self._tokenizer = AutoTokenizer.from_pretrained(
-                model_ref,
-                cache_dir=str(cache),
-                local_files_only=local_only,
             )
             self._model = ParlerTTSForConditionalGeneration.from_pretrained(
-                model_ref,
-                cache_dir=str(cache),
-                local_files_only=local_only,
             ).to(self._device)
             self._model.eval()
             self._loaded = True
-            logger.info("Parler Wolof TTS loaded")
         except Exception:
-            logger.exception("Parler Wolof TTS load failed - using MockTTS")
-    def _clean_audio(self, audio: "np.ndarray", sr: int) -> "np.ndarray":
-        import numpy as np
         if audio.size == 0:
             return audio.astype(np.float32)
         audio = audio.astype(np.float32)
         if audio.ndim > 1:
             audio = audio.mean(axis=1)
-        audio = audio - float(np.mean(audio))
-        try:
-            from scipy.signal import butter, sosfiltfilt
-            high = min(7800, int(sr * 0.44))
-            sos = butter(6, [80, high], btype="bandpass", fs=sr, output="sos")
-            audio = sosfiltfilt(sos, audio).astype(np.float32)
-        except Exception:
-            # Lightweight fallback: remove slow DC drift with a moving average.
-            win = max(64, min(2048, sr // 40))
-            kernel = np.ones(win, dtype=np.float32) / win
-            drift = np.convolve(audio, kernel, mode="same")
-            audio = (audio - drift).astype(np.float32)
-        # Noise gate: attenuate very low-level background without hard clipping words.
-        gate = 0.006
-        abs_audio = np.abs(audio)
-        mask = abs_audio < gate
-        audio[mask] *= 0.18
-        rms = float(np.sqrt(np.mean(np.square(audio))) + 1e-9)
         audio *= self._target_rms / rms
         peak = float(np.max(np.abs(audio)) + 1e-9)
         if peak > 0.95:
             audio *= 0.95 / peak
         if self._playback_speed != 1.0 and audio.size > 32:
             target_len = max(32, int(audio.size / self._playback_speed))
-            x_old = np.linspace(0.0, 1.0, num=audio.size, endpoint=True)
-            x_new = np.linspace(0.0, 1.0, num=target_len, endpoint=True)
             audio = np.interp(x_new, x_old, audio).astype(np.float32)
-        fade = min(int(sr * 0.018), audio.size // 8)
-        if fade > 1:
-            ramp = np.linspace(0.0, 1.0, fade, dtype=np.float32)
-            audio[:fade] *= ramp
-            audio[-fade:] *= ramp[::-1]
         return np.clip(audio, -0.98, 0.98).astype(np.float32)
     def synthesize(self, text: str) -> TTSResult:
         self._load()
         if not self._loaded:
             return self._fallback.synthesize(text)
         try:
-            import numpy as np
             import soundfile as sf
-            description = (
-                "A clear Wolof speaking voice, natural and calm, recorded in a quiet room, "
-                "with normal pauses between words, high audio quality, and no background noise."
-            )
-            desc = self._tokenizer(description, return_tensors="pt")
-            desc = {k: v.to(self._device) for k, v in desc.items()}
-            chunks = []
             sr = int(self._model.config.sampling_rate)
-            pause = np.zeros(int(sr * 0.28), dtype=np.float32)
-            for unit in _split_tts_units(text):
-                prompt = self._tokenizer(unit, return_tensors="pt")
-                prompt = {k: v.to(self._device) for k, v in prompt.items()}
-                with torch.no_grad():
-                    generation = self._model.generate(
-                        input_ids=desc["input_ids"],
-                        attention_mask=desc.get("attention_mask"),
-                        prompt_input_ids=prompt["input_ids"],
-                        prompt_attention_mask=prompt.get("attention_mask"),
-                        do_sample=True,
-                        temperature=0.8,
-                        top_k=50,
-                        repetition_penalty=1.2,
-                        min_new_tokens=120,
-                        max_new_tokens=1000,
-                    )
-                audio = generation.detach().cpu().numpy().squeeze().astype(np.float32)
-                audio = self._clean_audio(audio, sr)
                 if audio.size:
                     chunks.append(audio)
-                    chunks.append(pause)
             if not chunks:
                 return self._fallback.synthesize(text)
-            audio_np = np.concatenate(chunks[:-1] if len(chunks) > 1 else chunks)
             buf = io.BytesIO()
-            sf.write(buf, audio_np, samplerate=sr, format="WAV")
             buf.seek(0)
             return TTSResult(
                 audio_bytes=buf.read(),
@@ -200,7 +271,7 @@ class ParlerWolofTTS(TTSInterface):
                 audio_contains_speech=True,
             )
         except Exception:
-            logger.exception("Parler Wolof synthesis failed - using MockTTS")
             return self._fallback.synthesize(text)
     @property
@@ -213,8 +284,7 @@ class ParlerWolofTTS(TTSInterface):
     def unload(self) -> None:
         if self._model is not None:
-            del self._model, self._tokenizer
-            self._model = None
-            self._tokenizer = None
             self._loaded = False
-            logger.info("Parler Wolof TTS unloaded")

 import io
 import logging
+import random
 import re
 from pathlib import Path
 from typing import Optional
+import numpy as np
 import torch
 from .mock_tts import MockTTS
 logger = logging.getLogger(__name__)
+# ── Pauses prosodiques (ms) ───────────────────────────────────────────────────
+_PAUSE_COMMA  = (200, 340)
+_PAUSE_PHRASE = (420, 600)
+_PAUSE_LONG   = (650, 900)
+_PAUSE_END    = (900, 1200)
+_MAX_WORDS = 12
+_CONNECTORS = re.compile(
+    r"\b(ak|waaye|ndax|ndaxte|te|walla|bu|su|ngir|dëkëk)\b",
+    re.IGNORECASE,
+)
+_DESCRIPTION = (
+    "Une voix féminine claire et naturelle en wolof, "
+    "calme, bien articulée, avec des pauses naturelles."
+)
+def _rnd_pause(r: tuple) -> int:
+    return max(80, random.randint(*r) + random.randint(-60, 80))
+def _ending_pause(text: str) -> int:
+    t = text.rstrip()
+    if not t:
+        return _rnd_pause(_PAUSE_PHRASE)
+    last = t[-1]
+    if last in "?!":
+        return _rnd_pause(_PAUSE_LONG)
+    if last == ".":
+        return _rnd_pause(_PAUSE_PHRASE) if len(t.split()) <= 8 else _rnd_pause(_PAUSE_LONG)
+    if last == ",":
+        return _rnd_pause(_PAUSE_COMMA)
+    return _rnd_pause(_PAUSE_PHRASE)
+def _split_on_connector(text: str) -> list[str]:
+    words = text.split()
+    if len(words) <= _MAX_WORDS:
+        return [text]
+    mid = len(words) // 2
+    best_idx, best_dist = None, len(words)
+    for i, w in enumerate(words):
+        if _CONNECTORS.fullmatch(w.strip(",.!?")):
+            d = abs(i - mid)
+            if d < best_dist:
+                best_dist, best_idx = d, i
+    if best_idx is not None and 2 <= best_idx <= len(words) - 2:
+        left  = " ".join(words[:best_idx])
+        right = " ".join(words[best_idx:])
+        return _split_on_connector(left) + _split_on_connector(right)
+    return [" ".join(words[i:i + _MAX_WORDS]) for i in range(0, len(words), _MAX_WORDS)]
+def _clean_seg(text: str) -> str:
+    text = re.sub(r"\s+", " ", text).strip()
+    if text and text[-1] not in ".!?,;":
+        text += "."
+    return text
+def prosody_split(text: str) -> list[dict]:
+    """Découpe le texte en segments avec pause naturelle après chaque segment."""
+    sentences = re.split(r"(?<=[.!?;])\s+", text.strip())
+    sentences = [s.strip() for s in sentences if s.strip()]
+    segments: list[dict] = []
+    for sent in sentences:
+        comma_parts = [p.strip() for p in sent.split(",") if p.strip()]
+        if len(sent.split()) > _MAX_WORDS and len(comma_parts) > 1:
+            sub_units = [(p, "," if i < len(comma_parts) - 1 else "") for i, p in enumerate(comma_parts)]
+        else:
+            sub_units = [(sent, "")]
+        for raw, delim in sub_units:
+            pieces = _split_on_connector(raw)
+            for j, piece in enumerate(pieces):
+                cleaned = _clean_seg(piece)
+                if j == len(pieces) - 1:
+                    synthetic = cleaned if not delim else piece.rstrip() + ","
+                    pause = _ending_pause(synthetic)
+                else:
+                    pause = _rnd_pause(_PAUSE_COMMA)
+                segments.append({"text": cleaned, "pause_after_ms": pause})
+    if segments:
+        segments[-1]["pause_after_ms"] = _rnd_pause(_PAUSE_END)
+    return segments
 class ParlerWolofTTS(TTSInterface):
+    """Adia TTS (CONCREE/Adia_TTS) avec segmentation prosodique wolof."""
     def __init__(self, model_name: Optional[str] = None):
         cfg = get_config()
+        self._model_id = model_name or cfg.tts.model_name or "CONCREE/Adia_TTS"
         self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
         self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
+        self._device = torch.device(
+            "cuda" if cfg.runtime.prefer_gpu_for_tts and torch.cuda.is_available() else "cpu"
+        )
         self._tokenizer = None
         self._model = None
+        self._desc_ids = None       # pre-tokenized description (reused across segments)
+        self._desc_mask = None
         self._loaded = False
         self._fallback = MockTTS()
+    def _find_snapshot(self) -> Optional[Path]:
+        cache = get_cache_dir()
+        key = self._model_id.replace("/", "--")
+        # prefer local/ directory (manual download)
+        local = cache / f"models--{key}" / "local"
+        if (local / "config.json").exists():
+            return local
         snaps = cache / f"models--{key}" / "snapshots"
+        if snaps.exists():
+            for snap in sorted(snaps.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True):
+                if (snap / "config.json").exists():
+                    return snap
         return None
     def _load(self) -> None:
             from parler_tts import ParlerTTSForConditionalGeneration
             from transformers import AutoTokenizer
+            snap = self._find_snapshot()
+            ref = str(snap) if snap else self._model_id
             local_only = snap is not None
+            logger.info(f"Loading Adia TTS '{ref}' on {self._device}")
             self._tokenizer = AutoTokenizer.from_pretrained(
+                ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
             )
             self._model = ParlerTTSForConditionalGeneration.from_pretrained(
+                ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
             ).to(self._device)
             self._model.eval()
+            # Pre-tokenize description once
+            enc = self._tokenizer(_DESCRIPTION, return_tensors="pt")
+            self._desc_ids  = enc["input_ids"].to(self._device)
+            self._desc_mask = enc.get("attention_mask")
+            if self._desc_mask is not None:
+                self._desc_mask = self._desc_mask.to(self._device)
+            else:
+                self._desc_mask = torch.ones_like(self._desc_ids)
             self._loaded = True
+            logger.info("Adia TTS loaded")
         except Exception:
+            logger.exception("Adia TTS load failed — using MockTTS")
+    # ── Audio utils ───────────────────────────────────────────────────────────
+    def _silence(self, ms: int, sr: int) -> np.ndarray:
+        return np.zeros(int(sr * ms / 1000), dtype=np.float32)
+    def _fade(self, audio: np.ndarray, fade_ms: float, sr: int) -> np.ndarray:
+        n = min(int(sr * fade_ms / 1000), audio.size // 4)
+        if n < 2:
+            return audio
+        ramp = np.linspace(0.0, 1.0, n, dtype=np.float32)
+        audio = audio.copy()
+        audio[:n]  *= ramp
+        audio[-n:] *= ramp[::-1]
+        return audio
+    def _normalize(self, audio: np.ndarray) -> np.ndarray:
         if audio.size == 0:
             return audio.astype(np.float32)
         audio = audio.astype(np.float32)
         if audio.ndim > 1:
             audio = audio.mean(axis=1)
+        audio -= float(np.mean(audio))
+        rms = float(np.sqrt(np.mean(audio ** 2)) + 1e-9)
         audio *= self._target_rms / rms
         peak = float(np.max(np.abs(audio)) + 1e-9)
         if peak > 0.95:
             audio *= 0.95 / peak
         if self._playback_speed != 1.0 and audio.size > 32:
             target_len = max(32, int(audio.size / self._playback_speed))
+            x_old = np.linspace(0.0, 1.0, audio.size)
+            x_new = np.linspace(0.0, 1.0, target_len)
             audio = np.interp(x_new, x_old, audio).astype(np.float32)
         return np.clip(audio, -0.98, 0.98).astype(np.float32)
+    def _synth_segment(self, text: str) -> np.ndarray:
+        enc = self._tokenizer(text, return_tensors="pt")
+        prompt_ids  = enc["input_ids"].to(self._device)
+        prompt_mask = enc.get("attention_mask")
+        if prompt_mask is not None:
+            prompt_mask = prompt_mask.to(self._device)
+        else:
+            prompt_mask = torch.ones_like(prompt_ids)
+        with torch.no_grad():
+            gen = self._model.generate(
+                input_ids=self._desc_ids,
+                attention_mask=self._desc_mask,
+                prompt_input_ids=prompt_ids,
+                prompt_attention_mask=prompt_mask,
+                do_sample=True,
+                temperature=0.8,
+                top_k=50,
+                repetition_penalty=1.2,
+                min_new_tokens=60,
+                max_new_tokens=800,
+            )
+        return gen.detach().cpu().numpy().squeeze().astype(np.float32)
+    # ── Public API ────────────────────────────────────────────────────────────
     def synthesize(self, text: str) -> TTSResult:
         self._load()
         if not self._loaded:
             return self._fallback.synthesize(text)
         try:
             import soundfile as sf
+            segments = prosody_split(text)
             sr = int(self._model.config.sampling_rate)
+            chunks: list[np.ndarray] = []
+            for seg in segments:
+                audio = self._synth_segment(seg["text"])
+                audio = self._normalize(audio)
+                audio = self._fade(audio, fade_ms=18, sr=sr)
                 if audio.size:
                     chunks.append(audio)
+                    chunks.append(self._silence(seg["pause_after_ms"], sr))
             if not chunks:
                 return self._fallback.synthesize(text)
+            final = np.concatenate(chunks[:-1] if len(chunks) > 1 else chunks)
+            # Final normalisation globale
+            peak = float(np.max(np.abs(final)) + 1e-9)
+            if peak > 0.95:
+                final *= 0.95 / peak
             buf = io.BytesIO()
+            sf.write(buf, final, samplerate=sr, format="WAV")
             buf.seek(0)
             return TTSResult(
                 audio_bytes=buf.read(),
                 audio_contains_speech=True,
             )
         except Exception:
+            logger.exception("Adia synthesis failed — using MockTTS")
             return self._fallback.synthesize(text)
     @property
     def unload(self) -> None:
         if self._model is not None:
+            del self._model, self._tokenizer, self._desc_ids, self._desc_mask
+            self._model = self._tokenizer = self._desc_ids = self._desc_mask = None
             self._loaded = False
+            logger.info("Adia TTS unloaded")

wolof_voice_agent/config/models.yaml CHANGED Viewed

@@ -46,9 +46,9 @@ llm:
   quantization: "4bit_if_available"
   # llama.cpp runtime context. Oolel train context is 32768, but 8192 is a
   # practical first production-test value for RAG without huge latency.
-  n_ctx: 8192
-  n_threads: 6
-  max_new_tokens: 420
   temperature: 0.2
   allow_download: true
   # Wolof-first NLU settings
@@ -57,10 +57,10 @@ llm:
   use_french_pivot_on_uncertainty: true
 tts:
-  provider: "moustapha"
-  model_name: "Moustapha91/TTS_WOLOF_FINAL"
-  fallback_model_name: "bilalfaye/speecht5_tts-wolof"
-  third_fallback_model_name: "CONCREE/Adia_TTS"
   device: "cpu"
   allow_download: false
   playback_speed: 1.0

   quantization: "4bit_if_available"
   # llama.cpp runtime context. Oolel train context is 32768; a much smaller
   # window is configured here (reduced from 8192 to 2048 in this change).
+  n_ctx: 2048
+  n_threads: 16
+  max_new_tokens: 600
   temperature: 0.2
   allow_download: true
   # Wolof-first NLU settings
   use_french_pivot_on_uncertainty: true
 tts:
+  provider: "parler"
+  model_name: "CONCREE/Adia_TTS"
+  fallback_model_name: "Moustapha91/TTS_WOLOF_FINAL"
+  third_fallback_model_name: "bilalfaye/speecht5_tts-wolof"
   device: "cpu"
   allow_download: false
   playback_speed: 1.0

wolof_voice_agent/requirements.txt CHANGED Viewed

@@ -18,11 +18,14 @@ pydantic>=2.7.0
 pyyaml>=6.0.1
 # ML / NLP
-transformers>=4.40.0
-accelerate>=0.29.0
-huggingface_hub>=0.22.0
-sentencepiece>=0.2.0
-tokenizers>=0.19.0
 # Audio
 soundfile>=0.12.1
@@ -30,7 +33,7 @@ librosa>=0.10.1
 scipy>=1.13.0
 # Numerical
-numpy>=1.26.0
 # Optional: quantization (install separately if compatible)
 # bitsandbytes>=0.43.0
@@ -47,3 +50,4 @@ pytest-asyncio>=0.23.0
 # NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
 # existing GPU-enabled installation in the 'llama' conda environment.
 # If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121

 pyyaml>=6.0.1
 # ML / NLP
+# Pinned for Hugging Face Spaces CPU image.
+# Newer transformers versions can hit torch.library custom_op schema errors
+# with the CPU torch used in the Dockerfile.
+transformers==4.46.3
+accelerate==0.34.2
+huggingface_hub==0.26.5
+sentencepiece==0.2.0
+tokenizers==0.20.3
 # Audio
 soundfile>=0.12.1
 scipy>=1.13.0
 # Numerical
+numpy>=1.26.0,<2.1
 # Optional: quantization (install separately if compatible)
 # bitsandbytes>=0.43.0
 # NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
 # existing GPU-enabled installation in the 'llama' conda environment.
 # If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
+parler-tts