Mouhamed Naby NDIAYE committed on
Commit
7b5e2ea
·
1 Parent(s): 254814c

Fix Hugging Face Space model loading

Browse files
Dockerfile CHANGED
@@ -1,64 +1,66 @@
1
  FROM python:3.11-slim
2
 
3
- # System dependencies
4
  RUN apt-get update && apt-get install -y \
5
- ffmpeg git curl nodejs npm build-essential cmake \
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
  WORKDIR /app
9
 
10
- # ── Python: PyTorch CPU ─────────────────────────────────────────────────────
11
  RUN pip install --no-cache-dir \
12
  torch==2.4.1 torchaudio==2.4.1 \
13
  --index-url https://download.pytorch.org/whl/cpu
14
 
15
- # ── Python: project requirements ────────────────────────────────────────────
16
  COPY wolof_voice_agent/requirements.txt ./requirements.txt
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # ── Python: llama-cpp-python (prebuilt CPU wheel — évite la compilation C++) ─
20
- RUN pip install --no-cache-dir llama-cpp-python==0.2.90 \
21
- --index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
22
- --no-deps && \
 
23
  pip install --no-cache-dir diskcache jinja2
24
 
25
- # ── Bake models into image (one Docker layer, cached independently of code) ─
26
- # Models go into the same path that get_cache_dir() returns at runtime.
27
  ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
28
  ENV PYTHONPATH=/app/wolof_voice_agent
29
  ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
30
- ENV GGUF_FILENAME=soynade-research.Oolel-v0.1.Q5_K_M.gguf
 
 
31
 
32
  RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
33
  /app/wolof_voice_agent/models/gguf
34
 
35
  RUN python - <<'PYEOF'
36
  import os
 
37
  from huggingface_hub import snapshot_download, hf_hub_download
38
 
39
- cache = os.environ["HF_HOME"]
40
- gguf_dir = "/app/wolof_voice_agent/models/gguf"
41
  gguf_repo = os.environ["GGUF_REPO"]
42
- gguf_file = os.environ["GGUF_FILENAME"]
 
 
43
 
44
  print("Downloading ASR: M9and2M/whisper-small-wolof ...")
45
  snapshot_download(
46
- "M9and2M/whisper-small-wolof", cache_dir=cache,
 
47
  ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
48
  )
49
 
50
- print("Downloading TTS: Moustapha91/TTS_WOLOF_FINAL ...")
51
- snapshot_download("Moustapha91/TTS_WOLOF_FINAL", cache_dir=cache)
52
 
53
- print("Downloading TTS vocoder: microsoft/speecht5_hifigan ...")
54
- snapshot_download("microsoft/speecht5_hifigan", cache_dir=cache)
55
-
56
- print(f"Downloading LLM GGUF: {gguf_file} from {gguf_repo} ...")
57
- hf_hub_download(repo_id=gguf_repo, filename=gguf_file, local_dir=gguf_dir)
58
  print("All models ready.")
59
  PYEOF
60
 
61
- # ── React frontend build ─────────────────────────────────────────────────────
62
  COPY frontend/package.json frontend/package-lock.json \
63
  /app/wolof_voice_agent/frontend/
64
  RUN cd /app/wolof_voice_agent/frontend && npm ci
@@ -66,10 +68,8 @@ RUN cd /app/wolof_voice_agent/frontend && npm ci
66
  COPY frontend/ /app/wolof_voice_agent/frontend/
67
  RUN cd /app/wolof_voice_agent/frontend && npm run build
68
 
69
- # ── Backend code (after models so code changes don't bust model layer) ───────
70
  COPY wolof_voice_agent/ /app/wolof_voice_agent/
71
 
72
- # ── Runtime ──────────────────────────────────────────────────────────────────
73
  WORKDIR /app/wolof_voice_agent
74
 
75
  ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
 
1
  FROM python:3.11-slim
2
 
 
3
  RUN apt-get update && apt-get install -y \
4
+ ffmpeg git curl nodejs npm build-essential cmake pkg-config libopenblas-dev \
5
  && rm -rf /var/lib/apt/lists/*
6
 
7
  WORKDIR /app
8
 
9
+ # CPU torch for Hugging Face Spaces CPU. Keep this aligned with requirements pins.
10
  RUN pip install --no-cache-dir \
11
  torch==2.4.1 torchaudio==2.4.1 \
12
  --index-url https://download.pytorch.org/whl/cpu
13
 
 
14
  COPY wolof_voice_agent/requirements.txt ./requirements.txt
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
17
+ # Build llama-cpp-python inside Debian. The prebuilt wheel previously loaded a
18
+ # musl-linked libllama.so on Spaces and failed with libc.musl-x86_64.so.1.
19
+ RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \
20
+ FORCE_CMAKE=1 \
21
+ pip install --no-cache-dir --no-binary llama-cpp-python llama-cpp-python==0.2.90 && \
22
  pip install --no-cache-dir diskcache jinja2
23
 
 
 
24
  ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
25
  ENV PYTHONPATH=/app/wolof_voice_agent
26
  ENV GGUF_REPO=DevQuasar-6/soynade-research.Oolel-v0.1-GGUF
27
+ ENV GGUF_SOURCE_FILENAME=soynade-research.Oolel-v0.1.Q4_K_M.gguf
28
+ ENV GGUF_FILENAME=oolel-v0.1-q4_k_m.gguf
29
+ ENV TTS_REPO=CONCREE/Adia_TTS
30
 
31
  RUN mkdir -p /app/wolof_voice_agent/data/cache/huggingface \
32
  /app/wolof_voice_agent/models/gguf
33
 
34
  RUN python - <<'PYEOF'
35
  import os
36
+ import shutil
37
  from huggingface_hub import snapshot_download, hf_hub_download
38
 
39
+ cache = os.environ["HF_HOME"]
40
+ gguf_dir = "/app/wolof_voice_agent/models/gguf"
41
  gguf_repo = os.environ["GGUF_REPO"]
42
+ gguf_source_file = os.environ["GGUF_SOURCE_FILENAME"]
43
+ gguf_runtime_file = os.environ["GGUF_FILENAME"]
44
+ tts_repo = os.environ["TTS_REPO"]
45
 
46
  print("Downloading ASR: M9and2M/whisper-small-wolof ...")
47
  snapshot_download(
48
+ "M9and2M/whisper-small-wolof",
49
+ cache_dir=cache,
50
  ignore_patterns=["*.msgpack", "*.h5", "flax_model*", "tf_model*"],
51
  )
52
 
53
+ print(f"Downloading TTS: {tts_repo} ...")
54
+ snapshot_download(tts_repo, cache_dir=cache)
55
 
56
+ print(f"Downloading LLM GGUF: {gguf_source_file} from {gguf_repo} ...")
57
+ src = hf_hub_download(repo_id=gguf_repo, filename=gguf_source_file, local_dir=gguf_dir)
58
+ dst = os.path.join(gguf_dir, gguf_runtime_file)
59
+ if src != dst:
60
+ shutil.copy2(src, dst)
61
  print("All models ready.")
62
  PYEOF
63
 
 
64
  COPY frontend/package.json frontend/package-lock.json \
65
  /app/wolof_voice_agent/frontend/
66
  RUN cd /app/wolof_voice_agent/frontend && npm ci
 
68
  COPY frontend/ /app/wolof_voice_agent/frontend/
69
  RUN cd /app/wolof_voice_agent/frontend && npm run build
70
 
 
71
  COPY wolof_voice_agent/ /app/wolof_voice_agent/
72
 
 
73
  WORKDIR /app/wolof_voice_agent
74
 
75
  ENV HF_HOME=/app/wolof_voice_agent/data/cache/huggingface
frontend/src/App.jsx CHANGED
@@ -254,17 +254,18 @@ export default function App() {
254
  body: JSON.stringify({ text, profile: 'general', context: {} }),
255
  })
256
  const responseText = res.response_text_wo || res.response_text || ''
257
- let audioUrl = null
258
- try {
259
- const tts = await apiFetch('/voice/synthesize', {
260
- method: 'POST',
261
- headers: { 'Content-Type': 'application/json' },
262
- body: JSON.stringify({ text_wo: responseText }),
263
- })
264
- if (tts.audio_url) audioUrl = apiBase + tts.audio_url
265
- } catch {}
266
- replaceLoading(loadId, { text: responseText, audioUrl })
267
  setStatus('ok')
 
 
 
 
 
 
 
 
 
268
  } catch (err) {
269
  replaceLoading(loadId, { text: `JΓ pp na soxor: ${err.message}` })
270
  setStatus('err')
 
254
  body: JSON.stringify({ text, profile: 'general', context: {} }),
255
  })
256
  const responseText = res.response_text_wo || res.response_text || ''
257
+ // Affiche le texte immédiatement — sans attendre le TTS
258
+ replaceLoading(loadId, { text: responseText, audioUrl: null })
 
 
 
 
 
 
 
 
259
  setStatus('ok')
260
+ // TTS en arrière-plan — met à jour l'audio quand prêt
261
+ apiFetch('/voice/synthesize', {
262
+ method: 'POST',
263
+ headers: { 'Content-Type': 'application/json' },
264
+ body: JSON.stringify({ text_wo: responseText }),
265
+ }).then(tts => {
266
+ if (tts.audio_url)
267
+ setMessages(prev => prev.map(m => m.id === loadId ? { ...m, audioUrl: apiBase + tts.audio_url } : m))
268
+ }).catch(() => {})
269
  } catch (err) {
270
  replaceLoading(loadId, { text: `JΓ pp na soxor: ${err.message}` })
271
  setStatus('err')
wolof_voice_agent/app/main.py CHANGED
@@ -281,7 +281,7 @@ def text_respond(req: RespondRequest):
281
  normalizer = get_normalizer()
282
  normalized_text = normalizer.normalize(req.text)
283
  rag_sources = []
284
- _RAG_THRESHOLD = 0.25
285
  try:
286
  # Always check RAG — use it only when relevant (score >= threshold)
287
  rag_result = answer_administration_question(
@@ -300,6 +300,8 @@ def text_respond(req: RespondRequest):
300
  logger.error(f"/text/respond error: {e}")
301
  raise HTTPException(status_code=500, detail=str(e))
302
 
 
 
303
  safety_flags, safe_response, requires_review = apply_safety(
304
  text=normalized_text,
305
  response=raw_response,
 
281
  normalizer = get_normalizer()
282
  normalized_text = normalizer.normalize(req.text)
283
  rag_sources = []
284
+ _RAG_THRESHOLD = 0.50
285
  try:
286
  # Always check RAG — use it only when relevant (score >= threshold)
287
  rag_result = answer_administration_question(
 
300
  logger.error(f"/text/respond error: {e}")
301
  raise HTTPException(status_code=500, detail=str(e))
302
 
303
+ raw_response = raw_response.replace("\\n", " ").replace("\n", " ").strip()
304
+
305
  safety_flags, safe_response, requires_review = apply_safety(
306
  text=normalized_text,
307
  response=raw_response,
wolof_voice_agent/app/services/llm/llamacpp_llm.py CHANGED
@@ -33,7 +33,7 @@ _SYSTEM_BY_PROFILE = {
33
  "administration": (
34
  "Tu es khAdI, une IA qui parle wolof. "
35
  "Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
36
- "4 a 6 phrases enchainées naturellement. Sois factuel, arrete-toi après la 6e phrase."
37
  ),
38
  }
39
 
@@ -138,7 +138,7 @@ class LlamaCppLLM(LLMInterface):
138
  f"Sources officielles:\n{rag_context}\n\n"
139
  "IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
140
  "sans liste, sans numerotation, sans tiret, sans \\n. "
141
- "4 a 6 phrases enchainΓ©es naturellement. Stop apres la 6e phrase."
142
  )
143
  result = self._chat(system, user_msg)
144
  return result or "Baal ma, amuma tontu bu woor ci documents yi."
 
33
  "administration": (
34
  "Tu es khAdI, une IA qui parle wolof. "
35
  "Reponds en UN SEUL paragraphe fluide, sans liste, sans numerotation, sans tiret, sans \\n. "
36
+ "6 a 8 phrases enchainΓ©es naturellement. Sois factuel et complet, couvre tous les details importants."
37
  ),
38
  }
39
 
 
138
  f"Sources officielles:\n{rag_context}\n\n"
139
  "IMPORTANT: Reponse en wolof simple, UN SEUL paragraphe fluide, "
140
  "sans liste, sans numerotation, sans tiret, sans \\n. "
141
+ "6 a 8 phrases enchainΓ©es naturellement. Couvre tous les details utiles."
142
  )
143
  result = self._chat(system, user_msg)
144
  return result or "Baal ma, amuma tontu bu woor ci documents yi."
wolof_voice_agent/app/services/rag/admin_rag.py CHANGED
@@ -214,7 +214,7 @@ def answer_administration_question(
214
  prompt_context["rag_instruction"] = (
215
  "Les sources sont en francais. Reponds en wolof simple. "
216
  "UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
217
- "Couvre en 4-6 phrases: ce que c'est, a quoi ca sert, comment l'obtenir, precision cle. "
218
  "Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
219
  )
220
  try:
 
214
  prompt_context["rag_instruction"] = (
215
  "Les sources sont en francais. Reponds en wolof simple. "
216
  "UN SEUL paragraphe fluide, sans liste, sans numero, sans tiret, sans \\n. "
217
+ "Couvre en 6-8 phrases: ce que c'est, a quoi ca sert, comment l'obtenir, documents requis, delais, precision cle. "
218
  "Ne repete aucune information. N'invente rien qui n'est pas dans les sources."
219
  )
220
  try:
wolof_voice_agent/app/services/tts/parler_wolof_tts.py CHANGED
@@ -1,9 +1,11 @@
1
  import io
2
  import logging
 
3
  import re
4
  from pathlib import Path
5
  from typing import Optional
6
 
 
7
  import torch
8
 
9
  from .mock_tts import MockTTS
@@ -12,58 +14,127 @@ from app.core.config import get_cache_dir, get_config
12
 
13
  logger = logging.getLogger(__name__)
14
 
15
- _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
16
-
17
-
18
- def _split_tts_units(text: str, max_chars: int = 160) -> list[str]:
19
- units: list[str] = []
20
- for sentence in _SENTENCE_SPLIT_RE.split(text.strip()):
21
- sentence = re.sub(r"\s+", " ", sentence).strip()
22
- if not sentence:
23
- continue
24
- if len(sentence) <= max_chars:
25
- units.append(sentence)
26
- continue
27
- words = sentence.split()
28
- current: list[str] = []
29
- current_len = 0
30
- for word in words:
31
- next_len = current_len + len(word) + (1 if current else 0)
32
- if current and next_len > max_chars:
33
- units.append(" ".join(current))
34
- current = [word]
35
- current_len = len(word)
36
- else:
37
- current.append(word)
38
- current_len = next_len
39
- if current:
40
- units.append(" ".join(current))
41
- return units or [text.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  class ParlerWolofTTS(TTSInterface):
45
- """Wolof Parler-TTS backend for GalsenAI/Adia-style models."""
46
 
47
  def __init__(self, model_name: Optional[str] = None):
48
  cfg = get_config()
49
- self._model_id = model_name or cfg.tts.model_name or "galsenai/parler-tts-mini-v1-wolof"
50
  self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
51
  self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
52
- self._device = torch.device("cuda" if cfg.runtime.prefer_gpu_for_tts and torch.cuda.is_available() else "cpu")
 
 
53
  self._tokenizer = None
54
  self._model = None
 
 
55
  self._loaded = False
56
  self._fallback = MockTTS()
57
 
58
- def _find_snapshot(self, cache: Path, model_id: str) -> Optional[Path]:
59
- key = model_id.replace("/", "--")
 
 
 
 
 
60
  snaps = cache / f"models--{key}" / "snapshots"
61
- if not snaps.exists():
62
- return None
63
- candidates = sorted(snaps.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
64
- for snap in candidates:
65
- if (snap / "config.json").exists():
66
- return snap
67
  return None
68
 
69
  def _load(self) -> None:
@@ -73,124 +144,124 @@ class ParlerWolofTTS(TTSInterface):
73
  from parler_tts import ParlerTTSForConditionalGeneration
74
  from transformers import AutoTokenizer
75
 
76
- cache = get_cache_dir()
77
- snap = self._find_snapshot(cache, self._model_id)
78
- model_ref = str(snap) if snap else self._model_id
79
  local_only = snap is not None
80
- logger.info(f"Loading Parler Wolof TTS '{model_ref}' on {self._device}")
81
 
82
  self._tokenizer = AutoTokenizer.from_pretrained(
83
- model_ref,
84
- cache_dir=str(cache),
85
- local_files_only=local_only,
86
  )
87
  self._model = ParlerTTSForConditionalGeneration.from_pretrained(
88
- model_ref,
89
- cache_dir=str(cache),
90
- local_files_only=local_only,
91
  ).to(self._device)
92
  self._model.eval()
 
 
 
 
 
 
 
 
 
 
93
  self._loaded = True
94
- logger.info("Parler Wolof TTS loaded")
95
  except Exception:
96
- logger.exception("Parler Wolof TTS load failed - using MockTTS")
 
 
97
 
98
- def _clean_audio(self, audio: "np.ndarray", sr: int) -> "np.ndarray":
99
- import numpy as np
100
 
 
 
 
 
 
 
 
 
 
 
 
101
  if audio.size == 0:
102
  return audio.astype(np.float32)
103
  audio = audio.astype(np.float32)
104
  if audio.ndim > 1:
105
  audio = audio.mean(axis=1)
106
- audio = audio - float(np.mean(audio))
107
-
108
- try:
109
- from scipy.signal import butter, sosfiltfilt
110
-
111
- high = min(7800, int(sr * 0.44))
112
- sos = butter(6, [80, high], btype="bandpass", fs=sr, output="sos")
113
- audio = sosfiltfilt(sos, audio).astype(np.float32)
114
- except Exception:
115
- # Lightweight fallback: remove slow DC drift with a moving average.
116
- win = max(64, min(2048, sr // 40))
117
- kernel = np.ones(win, dtype=np.float32) / win
118
- drift = np.convolve(audio, kernel, mode="same")
119
- audio = (audio - drift).astype(np.float32)
120
-
121
- # Noise gate: attenuate very low-level background without hard clipping words.
122
- gate = 0.006
123
- abs_audio = np.abs(audio)
124
- mask = abs_audio < gate
125
- audio[mask] *= 0.18
126
-
127
- rms = float(np.sqrt(np.mean(np.square(audio))) + 1e-9)
128
  audio *= self._target_rms / rms
129
  peak = float(np.max(np.abs(audio)) + 1e-9)
130
  if peak > 0.95:
131
  audio *= 0.95 / peak
132
-
133
  if self._playback_speed != 1.0 and audio.size > 32:
134
  target_len = max(32, int(audio.size / self._playback_speed))
135
- x_old = np.linspace(0.0, 1.0, num=audio.size, endpoint=True)
136
- x_new = np.linspace(0.0, 1.0, num=target_len, endpoint=True)
137
  audio = np.interp(x_new, x_old, audio).astype(np.float32)
138
-
139
- fade = min(int(sr * 0.018), audio.size // 8)
140
- if fade > 1:
141
- ramp = np.linspace(0.0, 1.0, fade, dtype=np.float32)
142
- audio[:fade] *= ramp
143
- audio[-fade:] *= ramp[::-1]
144
  return np.clip(audio, -0.98, 0.98).astype(np.float32)
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  def synthesize(self, text: str) -> TTSResult:
147
  self._load()
148
  if not self._loaded:
149
  return self._fallback.synthesize(text)
150
 
151
  try:
152
- import numpy as np
153
  import soundfile as sf
154
 
155
- description = (
156
- "A clear Wolof speaking voice, natural and calm, recorded in a quiet room, "
157
- "with normal pauses between words, high audio quality, and no background noise."
158
- )
159
- desc = self._tokenizer(description, return_tensors="pt")
160
- desc = {k: v.to(self._device) for k, v in desc.items()}
161
-
162
- chunks = []
163
  sr = int(self._model.config.sampling_rate)
164
- pause = np.zeros(int(sr * 0.28), dtype=np.float32)
165
-
166
- for unit in _split_tts_units(text):
167
- prompt = self._tokenizer(unit, return_tensors="pt")
168
- prompt = {k: v.to(self._device) for k, v in prompt.items()}
169
- with torch.no_grad():
170
- generation = self._model.generate(
171
- input_ids=desc["input_ids"],
172
- attention_mask=desc.get("attention_mask"),
173
- prompt_input_ids=prompt["input_ids"],
174
- prompt_attention_mask=prompt.get("attention_mask"),
175
- do_sample=True,
176
- temperature=0.8,
177
- top_k=50,
178
- repetition_penalty=1.2,
179
- min_new_tokens=120,
180
- max_new_tokens=1000,
181
- )
182
- audio = generation.detach().cpu().numpy().squeeze().astype(np.float32)
183
- audio = self._clean_audio(audio, sr)
184
  if audio.size:
185
  chunks.append(audio)
186
- chunks.append(pause)
187
 
188
  if not chunks:
189
  return self._fallback.synthesize(text)
190
 
191
- audio_np = np.concatenate(chunks[:-1] if len(chunks) > 1 else chunks)
 
 
 
 
 
192
  buf = io.BytesIO()
193
- sf.write(buf, audio_np, samplerate=sr, format="WAV")
194
  buf.seek(0)
195
  return TTSResult(
196
  audio_bytes=buf.read(),
@@ -200,7 +271,7 @@ class ParlerWolofTTS(TTSInterface):
200
  audio_contains_speech=True,
201
  )
202
  except Exception:
203
- logger.exception("Parler Wolof synthesis failed - using MockTTS")
204
  return self._fallback.synthesize(text)
205
 
206
  @property
@@ -213,8 +284,7 @@ class ParlerWolofTTS(TTSInterface):
213
 
214
  def unload(self) -> None:
215
  if self._model is not None:
216
- del self._model, self._tokenizer
217
- self._model = None
218
- self._tokenizer = None
219
  self._loaded = False
220
- logger.info("Parler Wolof TTS unloaded")
 
1
  import io
2
  import logging
3
+ import random
4
  import re
5
  from pathlib import Path
6
  from typing import Optional
7
 
8
+ import numpy as np
9
  import torch
10
 
11
  from .mock_tts import MockTTS
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
+ # ── Pauses prosodiques (ms) ───────────────────────────────────────────────────
18
+ _PAUSE_COMMA = (200, 340)
19
+ _PAUSE_PHRASE = (420, 600)
20
+ _PAUSE_LONG = (650, 900)
21
+ _PAUSE_END = (900, 1200)
22
+
23
+ _MAX_WORDS = 12
24
+
25
+ _CONNECTORS = re.compile(
26
+ r"\b(ak|waaye|ndax|ndaxte|te|walla|bu|su|ngir|dΓ«kΓ«k)\b",
27
+ re.IGNORECASE,
28
+ )
29
+
30
+ _DESCRIPTION = (
31
+ "Une voix fΓ©minine claire et naturelle en wolof, "
32
+ "calme, bien articulΓ©e, avec des pauses naturelles."
33
+ )
34
+
35
+
36
+ def _rnd_pause(r: tuple) -> int:
37
+ return max(80, random.randint(*r) + random.randint(-60, 80))
38
+
39
+
40
+ def _ending_pause(text: str) -> int:
41
+ t = text.rstrip()
42
+ if not t:
43
+ return _rnd_pause(_PAUSE_PHRASE)
44
+ last = t[-1]
45
+ if last in "?!":
46
+ return _rnd_pause(_PAUSE_LONG)
47
+ if last == ".":
48
+ return _rnd_pause(_PAUSE_PHRASE) if len(t.split()) <= 8 else _rnd_pause(_PAUSE_LONG)
49
+ if last == ",":
50
+ return _rnd_pause(_PAUSE_COMMA)
51
+ return _rnd_pause(_PAUSE_PHRASE)
52
+
53
+
54
+ def _split_on_connector(text: str) -> list[str]:
55
+ words = text.split()
56
+ if len(words) <= _MAX_WORDS:
57
+ return [text]
58
+ mid = len(words) // 2
59
+ best_idx, best_dist = None, len(words)
60
+ for i, w in enumerate(words):
61
+ if _CONNECTORS.fullmatch(w.strip(",.!?")):
62
+ d = abs(i - mid)
63
+ if d < best_dist:
64
+ best_dist, best_idx = d, i
65
+ if best_idx is not None and 2 <= best_idx <= len(words) - 2:
66
+ left = " ".join(words[:best_idx])
67
+ right = " ".join(words[best_idx:])
68
+ return _split_on_connector(left) + _split_on_connector(right)
69
+ return [" ".join(words[i:i + _MAX_WORDS]) for i in range(0, len(words), _MAX_WORDS)]
70
+
71
+
72
+ def _clean_seg(text: str) -> str:
73
+ text = re.sub(r"\s+", " ", text).strip()
74
+ if text and text[-1] not in ".!?,;":
75
+ text += "."
76
+ return text
77
+
78
+
79
+ def prosody_split(text: str) -> list[dict]:
80
+ """Découpe le texte en segments avec pause naturelle après chaque segment."""
81
+ sentences = re.split(r"(?<=[.!?;])\s+", text.strip())
82
+ sentences = [s.strip() for s in sentences if s.strip()]
83
+ segments: list[dict] = []
84
+
85
+ for sent in sentences:
86
+ comma_parts = [p.strip() for p in sent.split(",") if p.strip()]
87
+ if len(sent.split()) > _MAX_WORDS and len(comma_parts) > 1:
88
+ sub_units = [(p, "," if i < len(comma_parts) - 1 else "") for i, p in enumerate(comma_parts)]
89
+ else:
90
+ sub_units = [(sent, "")]
91
+
92
+ for raw, delim in sub_units:
93
+ pieces = _split_on_connector(raw)
94
+ for j, piece in enumerate(pieces):
95
+ cleaned = _clean_seg(piece)
96
+ if j == len(pieces) - 1:
97
+ synthetic = cleaned if not delim else piece.rstrip() + ","
98
+ pause = _ending_pause(synthetic)
99
+ else:
100
+ pause = _rnd_pause(_PAUSE_COMMA)
101
+ segments.append({"text": cleaned, "pause_after_ms": pause})
102
+
103
+ if segments:
104
+ segments[-1]["pause_after_ms"] = _rnd_pause(_PAUSE_END)
105
+ return segments
106
 
107
 
108
  class ParlerWolofTTS(TTSInterface):
109
+ """Adia TTS (CONCREE/Adia_TTS) avec segmentation prosodique wolof."""
110
 
111
  def __init__(self, model_name: Optional[str] = None):
112
  cfg = get_config()
113
+ self._model_id = model_name or cfg.tts.model_name or "CONCREE/Adia_TTS"
114
  self._playback_speed = max(0.65, min(1.25, float(cfg.tts.playback_speed)))
115
  self._target_rms = max(0.02, min(0.2, float(cfg.tts.target_rms)))
116
+ self._device = torch.device(
117
+ "cuda" if cfg.runtime.prefer_gpu_for_tts and torch.cuda.is_available() else "cpu"
118
+ )
119
  self._tokenizer = None
120
  self._model = None
121
+ self._desc_ids = None # pre-tokenized description (reused across segments)
122
+ self._desc_mask = None
123
  self._loaded = False
124
  self._fallback = MockTTS()
125
 
126
+ def _find_snapshot(self) -> Optional[Path]:
127
+ cache = get_cache_dir()
128
+ key = self._model_id.replace("/", "--")
129
+ # prefer local/ directory (manual download)
130
+ local = cache / f"models--{key}" / "local"
131
+ if (local / "config.json").exists():
132
+ return local
133
  snaps = cache / f"models--{key}" / "snapshots"
134
+ if snaps.exists():
135
+ for snap in sorted(snaps.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True):
136
+ if (snap / "config.json").exists():
137
+ return snap
 
 
138
  return None
139
 
140
  def _load(self) -> None:
 
144
  from parler_tts import ParlerTTSForConditionalGeneration
145
  from transformers import AutoTokenizer
146
 
147
+ snap = self._find_snapshot()
148
+ ref = str(snap) if snap else self._model_id
 
149
  local_only = snap is not None
150
+ logger.info(f"Loading Adia TTS '{ref}' on {self._device}")
151
 
152
  self._tokenizer = AutoTokenizer.from_pretrained(
153
+ ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
 
 
154
  )
155
  self._model = ParlerTTSForConditionalGeneration.from_pretrained(
156
+ ref, cache_dir=str(get_cache_dir()), local_files_only=local_only,
 
 
157
  ).to(self._device)
158
  self._model.eval()
159
+
160
+ # Pre-tokenize description once
161
+ enc = self._tokenizer(_DESCRIPTION, return_tensors="pt")
162
+ self._desc_ids = enc["input_ids"].to(self._device)
163
+ self._desc_mask = enc.get("attention_mask")
164
+ if self._desc_mask is not None:
165
+ self._desc_mask = self._desc_mask.to(self._device)
166
+ else:
167
+ self._desc_mask = torch.ones_like(self._desc_ids)
168
+
169
  self._loaded = True
170
+ logger.info("Adia TTS loaded")
171
  except Exception:
172
+ logger.exception("Adia TTS load failed β€” using MockTTS")
173
+
174
+ # ── Audio utils ───────────────────────────────────────────────────────────
175
 
176
+ def _silence(self, ms: int, sr: int) -> np.ndarray:
177
+ return np.zeros(int(sr * ms / 1000), dtype=np.float32)
178
 
179
+ def _fade(self, audio: np.ndarray, fade_ms: float, sr: int) -> np.ndarray:
180
+ n = min(int(sr * fade_ms / 1000), audio.size // 4)
181
+ if n < 2:
182
+ return audio
183
+ ramp = np.linspace(0.0, 1.0, n, dtype=np.float32)
184
+ audio = audio.copy()
185
+ audio[:n] *= ramp
186
+ audio[-n:] *= ramp[::-1]
187
+ return audio
188
+
189
+ def _normalize(self, audio: np.ndarray) -> np.ndarray:
190
  if audio.size == 0:
191
  return audio.astype(np.float32)
192
  audio = audio.astype(np.float32)
193
  if audio.ndim > 1:
194
  audio = audio.mean(axis=1)
195
+ audio -= float(np.mean(audio))
196
+ rms = float(np.sqrt(np.mean(audio ** 2)) + 1e-9)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  audio *= self._target_rms / rms
198
  peak = float(np.max(np.abs(audio)) + 1e-9)
199
  if peak > 0.95:
200
  audio *= 0.95 / peak
 
201
  if self._playback_speed != 1.0 and audio.size > 32:
202
  target_len = max(32, int(audio.size / self._playback_speed))
203
+ x_old = np.linspace(0.0, 1.0, audio.size)
204
+ x_new = np.linspace(0.0, 1.0, target_len)
205
  audio = np.interp(x_new, x_old, audio).astype(np.float32)
 
 
 
 
 
 
206
  return np.clip(audio, -0.98, 0.98).astype(np.float32)
207
 
208
+ def _synth_segment(self, text: str) -> np.ndarray:
209
+ enc = self._tokenizer(text, return_tensors="pt")
210
+ prompt_ids = enc["input_ids"].to(self._device)
211
+ prompt_mask = enc.get("attention_mask")
212
+ if prompt_mask is not None:
213
+ prompt_mask = prompt_mask.to(self._device)
214
+ else:
215
+ prompt_mask = torch.ones_like(prompt_ids)
216
+
217
+ with torch.no_grad():
218
+ gen = self._model.generate(
219
+ input_ids=self._desc_ids,
220
+ attention_mask=self._desc_mask,
221
+ prompt_input_ids=prompt_ids,
222
+ prompt_attention_mask=prompt_mask,
223
+ do_sample=True,
224
+ temperature=0.8,
225
+ top_k=50,
226
+ repetition_penalty=1.2,
227
+ min_new_tokens=60,
228
+ max_new_tokens=800,
229
+ )
230
+ return gen.detach().cpu().numpy().squeeze().astype(np.float32)
231
+
232
+ # ── Public API ────────────────────────────────────────────────────────────
233
+
234
  def synthesize(self, text: str) -> TTSResult:
235
  self._load()
236
  if not self._loaded:
237
  return self._fallback.synthesize(text)
238
 
239
  try:
 
240
  import soundfile as sf
241
 
242
+ segments = prosody_split(text)
 
 
 
 
 
 
 
243
  sr = int(self._model.config.sampling_rate)
244
+ chunks: list[np.ndarray] = []
245
+
246
+ for seg in segments:
247
+ audio = self._synth_segment(seg["text"])
248
+ audio = self._normalize(audio)
249
+ audio = self._fade(audio, fade_ms=18, sr=sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  if audio.size:
251
  chunks.append(audio)
252
+ chunks.append(self._silence(seg["pause_after_ms"], sr))
253
 
254
  if not chunks:
255
  return self._fallback.synthesize(text)
256
 
257
+ final = np.concatenate(chunks[:-1] if len(chunks) > 1 else chunks)
258
+ # Final normalisation globale
259
+ peak = float(np.max(np.abs(final)) + 1e-9)
260
+ if peak > 0.95:
261
+ final *= 0.95 / peak
262
+
263
  buf = io.BytesIO()
264
+ sf.write(buf, final, samplerate=sr, format="WAV")
265
  buf.seek(0)
266
  return TTSResult(
267
  audio_bytes=buf.read(),
 
271
  audio_contains_speech=True,
272
  )
273
  except Exception:
274
+ logger.exception("Adia synthesis failed β€” using MockTTS")
275
  return self._fallback.synthesize(text)
276
 
277
  @property
 
284
 
285
  def unload(self) -> None:
286
  if self._model is not None:
287
+ del self._model, self._tokenizer, self._desc_ids, self._desc_mask
288
+ self._model = self._tokenizer = self._desc_ids = self._desc_mask = None
 
289
  self._loaded = False
290
+ logger.info("Adia TTS unloaded")
wolof_voice_agent/config/models.yaml CHANGED
@@ -46,9 +46,9 @@ llm:
46
  quantization: "4bit_if_available"
47
  # llama.cpp runtime context. Oolel train context is 32768, but 8192 is a
48
  # practical first production-test value for RAG without huge latency.
49
- n_ctx: 8192
50
- n_threads: 6
51
- max_new_tokens: 420
52
  temperature: 0.2
53
  allow_download: true
54
  # Wolof-first NLU settings
@@ -57,10 +57,10 @@ llm:
57
  use_french_pivot_on_uncertainty: true
58
 
59
  tts:
60
- provider: "moustapha"
61
- model_name: "Moustapha91/TTS_WOLOF_FINAL"
62
- fallback_model_name: "bilalfaye/speecht5_tts-wolof"
63
- third_fallback_model_name: "CONCREE/Adia_TTS"
64
  device: "cpu"
65
  allow_download: false
66
  playback_speed: 1.0
 
46
  quantization: "4bit_if_available"
47
  # llama.cpp runtime context. Oolel train context is 32768, but 8192 is a
48
  # practical first production-test value for RAG without huge latency.
49
+ n_ctx: 2048
50
+ n_threads: 16
51
+ max_new_tokens: 600
52
  temperature: 0.2
53
  allow_download: true
54
  # Wolof-first NLU settings
 
57
  use_french_pivot_on_uncertainty: true
58
 
59
  tts:
60
+ provider: "parler"
61
+ model_name: "CONCREE/Adia_TTS"
62
+ fallback_model_name: "Moustapha91/TTS_WOLOF_FINAL"
63
+ third_fallback_model_name: "bilalfaye/speecht5_tts-wolof"
64
  device: "cpu"
65
  allow_download: false
66
  playback_speed: 1.0
wolof_voice_agent/requirements.txt CHANGED
@@ -18,11 +18,14 @@ pydantic>=2.7.0
18
  pyyaml>=6.0.1
19
 
20
  # ML / NLP
21
- transformers>=4.40.0
22
- accelerate>=0.29.0
23
- huggingface_hub>=0.22.0
24
- sentencepiece>=0.2.0
25
- tokenizers>=0.19.0
 
 
 
26
 
27
  # Audio
28
  soundfile>=0.12.1
@@ -30,7 +33,7 @@ librosa>=0.10.1
30
  scipy>=1.13.0
31
 
32
  # Numerical
33
- numpy>=1.26.0
34
 
35
  # Optional: quantization (install separately if compatible)
36
  # bitsandbytes>=0.43.0
@@ -47,3 +50,4 @@ pytest-asyncio>=0.23.0
47
  # NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
48
  # existing GPU-enabled installation in the 'llama' conda environment.
49
  # If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
 
 
18
  pyyaml>=6.0.1
19
 
20
  # ML / NLP
21
+ # Pinned for Hugging Face Spaces CPU image.
22
+ # Newer transformers versions can hit torch.library custom_op schema errors
23
+ # with the CPU torch used in the Dockerfile.
24
+ transformers==4.46.3
25
+ accelerate==0.34.2
26
+ huggingface_hub==0.26.5
27
+ sentencepiece==0.2.0
28
+ tokenizers==0.20.3
29
 
30
  # Audio
31
  soundfile>=0.12.1
 
33
  scipy>=1.13.0
34
 
35
  # Numerical
36
+ numpy>=1.26.0,<2.1
37
 
38
  # Optional: quantization (install separately if compatible)
39
  # bitsandbytes>=0.43.0
 
50
  # NOTE: torch, torchaudio are NOT listed here to avoid overwriting the
51
  # existing GPU-enabled installation in the 'llama' conda environment.
52
  # If installing fresh: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
53
+ parler-tts