Upload 3 files

4f65a37 verified 4 days ago

135 kB

	#!/usr/bin/env python3
	"""
	OmniVoice Engine v2.0.0 — "Mind Through Voice"
	================================================
	A consciousness-aware real-time voice conversation engine that feels
	like a mind speaking through a voice, not a synthesizer.

	ARCHITECTURE:
	- Whisper ASR (local) for real speech-to-text + interrupt detection
	- Coqui XTTS for neural text-to-speech with voice cloning
	- Procedural numpy DSP for non-verbal expressions (laughter, sighs, gasps, etc.)
	- Smart interrupt awareness that ignores non-verbals, backchannels, and
	collaborative turn-sharing
	- Graceful interruption responses ("I'm sorry, were you saying something?")
	- NIMA-integrated adapter (reads ConsciousnessSnapshot to drive prosody)

	v2.0.0 NEW MODULES — the "mind through voice" layer:

	CONVERSATIONAL FLOW:
	- AdaptiveProsodyShaper: emotion → pitch/rhythm/timbre dynamics
	(softer when empathetic, brighter when excited)
	- MicroIntonationInjector: hesitations, breaths, emphasis shifts
	that signal thoughtfulness or uncertainty
	- TurnTakingPredictor: predicts when user will finish, smoothly
	takes the floor instead of waiting for silence

	EMOTIONAL & COGNITIVE GROUNDING:
	- AffectiveMirror: matches user's emotional tone (calm, energetic,
	concerned) with subtle vocal adjustments
	- SomaticFeedbackIntegrator: ties voice modulation to system strain
	or energy states (biological fatigue signals)
	- EmpathyPhraseGenerator: contextual empathy inserts ("That must
	feel tough") instead of generic nods

	MEMORY & CONTINUITY:
	- VoiceEventMemoryBridge: stores every utterance as an episodic
	voice event in MemPalace with affective tags
	- NarrativeContinuityEngine: references past conversations naturally
	("As you mentioned yesterday, you sounded excited about...")

	EXPRESSIVE EXTENSIONS:
	- SingingInterjectionModule: short melodic phrases (humming, tonal
	affirmations) woven into speech
	- MultimodalCueEmitter: pairs voice with haptic/visual signals
	(soft vibration or light pulse when nodding)
	- DynamicLaughterSynth: adaptive laughter (chuckle → full laugh)
	scaled by intensity instead of fixed samples

	INTERRUPT HANDLING REFINEMENT:
	- ContextAwareApologyGenerator: casual vs serious apologies
	("Sorry, please go ahead" vs "I didn't mean to cut you off")
	- NonBlockingContinuationManager: keeps voice flowing after
	acknowledging an interrupt, so it feels conversational

	Author: Norman de la Paz-Tabora
	"""

	from __future__ import annotations

	import asyncio
	import json
	import logging
	import math
	import os
	import random
	import struct
	import sys
	import threading
	import time
	import uuid
	import wave
	from collections import deque
	from dataclasses import dataclass, field
	from enum import Enum
	from typing import (
	Any, AsyncGenerator, Callable, Deque, Dict, Generator,
	List, Optional, Tuple, Union,
	)

	import numpy as np

	# ── Optional dependencies (all gracefully degrade) ──

	# ASR: try openai-whisper first, then faster-whisper
	# Probe for openai-whisper first; faster-whisper is only tried when it is absent.
	try:
	    import whisper
	    WHISPER_AVAILABLE = True
	    WHISPER_BACKEND = "openai-whisper"
	except ImportError:
	    WHISPER_AVAILABLE = False
	    WHISPER_BACKEND = None
	    whisper = None  # type: ignore[assignment]

	# Bind WhisperModel on every path so the name always exists, even when
	# openai-whisper was found and faster-whisper is never imported.
	WhisperModel = None  # type: ignore[assignment, misc]
	if not WHISPER_AVAILABLE:
	    try:
	        from faster_whisper import WhisperModel
	        WHISPER_AVAILABLE = True
	        WHISPER_BACKEND = "faster-whisper"
	    except ImportError:
	        pass  # neither backend installed; flags above already say so

	# TTS: try coqui-tts
	# (A second, byte-identical import attempt used to follow as a "fallback";
	# it could never succeed after the first failed, so it was removed.)
	try:
	    from TTS.api import TTS as CoquiTTS
	    COQUI_TTS_AVAILABLE = True
	except ImportError:
	    COQUI_TTS_AVAILABLE = False
	    CoquiTTS = None  # type: ignore[assignment, misc]

	# ── Logging ──
	# Module-wide logger, named "OmniVoice".
	logger = logging.getLogger("OmniVoice")
	# Attach a stdout handler only when the logger has none yet, so repeated
	# imports/reloads do not stack duplicate handlers.
	if not logger.handlers:
	    _h = logging.StreamHandler(sys.stdout)
	    _h.setFormatter(logging.Formatter(
	        "%(asctime)s [%(levelname)s] %(name)s :: %(message)s",
	        datefmt="%Y-%m-%d %H:%M:%S",
	    ))
	    logger.addHandler(_h)
	    # NOTE(review): the source this was recovered from had lost its
	    # indentation; setLevel is assumed to belong inside this `if` — confirm.
	    logger.setLevel(logging.INFO)

	# Version string of this engine build.
	OMNIVOICE_VERSION = "2.0.0-MIND-THROUGH-VOICE"


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 1 — Enums & Data Structures
	# ═══════════════════════════════════════════════════════════════════════════

	class NonVerbalType(Enum):
	    """Categories of non-verbal vocal expressions.

	    Each member's value is the lowercase label of the expression.
	    """
	    LAUGHTER = "laughter"
	    GIGGLE = "giggle"
	    GASp = "gasp"  # canonical member; original (mis-cased) spelling kept for compatibility
	    GASP = "gasp"  # correctly-cased alias: same value, so it resolves to the GASp member
	    GROAN = "groan"
	    MOAN = "moan"
	    SIGH = "sigh"
	    CLUCK = "cluck"
	    CLICK = "click"
	    AWW = "aww"
	    OH = "oh"
	    MM = "mm"
	    WOW = "wow"


	class ConversationPhase(Enum):
	    """Which phase of the conversation we're in.

	    Each member names who currently holds the conversational floor.
	    """
	    IDLE = "idle"  # No one speaking
	    USER_SPEAKING = "user_speaking"  # User has the floor
	    NIMA_SPEAKING = "nima_speaking"  # Nima has the floor
	    OVERLAP = "overlap"  # Both speaking (potential interrupt)
	    YIELDING = "yielding"  # Nima yielding the floor after interrupt


	class InterruptType(Enum):
	    """Classification of detected user speech during Nima's turn.

	    Only REAL_INTERRUPT means the user is taking the turn; the members
	    marked IGNORE below are categories that should not stop Nima speaking.
	    """
	    REAL_INTERRUPT = "real_interrupt"  # User is taking the turn
	    NON_VERBAL = "non_verbal"  # Laughter, sigh, etc. — IGNORE
	    BACKCHANNEL = "backchannel"  # "yeah", "mm-hmm" — IGNORE
	    COLLABORATIVE_TURN_SHARING = "collaborative"  # User finishing sentence — IGNORE
	    SILENCE = "silence"  # No speech detected


	class BackchannelTrigger(Enum):
	    """Why a backchannel was emitted.

	    Note that the string values are not all equal to the lowercase member
	    name (ON_EMOTION_SHIFT has the value "emotion").
	    """
	    PERIODIC = "periodic"  # Every N seconds of user speech
	    ON_PAUSE = "on_pause"  # User paused 0.3-0.8s mid-utterance
	    ON_EMOTION_SHIFT = "emotion"  # User's prosody shifted (arousal spike)


	class TTSMode(Enum):
	    """Which TTS backend is active."""
	    COQUI_XTTS = "coqui_xtts"  # Neural TTS via Coqui XTTS
	    PROCEDURAL = "procedural"  # Fallback


	class ASRMode(Enum):
	    """Which ASR backend is active."""
	    WHISPER = "whisper"  # A Whisper model produces transcriptions
	    VAD_ONLY = "vad_only"  # Fallback: detects speech but no transcription


	@dataclass
	class AudioFrame:
	    """A chunk of audio with metadata."""
	    samples: np.ndarray  # audio samples of this chunk
	    sample_rate: int = 16000  # samples per second
	    timestamp: float = field(default_factory=time.time)  # time.time() at construction
	    is_speech: bool = False  # speech flag; defaults to "not speech"
	    energy: float = 0.0  # energy measure of the chunk (presumably RMS — confirm with producer)

	    @property
	    def duration(self) -> float:
	        """Length of the chunk in seconds (sample count divided by sample rate)."""
	        return len(self.samples) / self.sample_rate


	@dataclass
	class TranscriptSegment:
	    """A transcribed segment of user speech."""
	    text: str  # transcribed text (may be empty)
	    start_time: float  # segment start, in time.time() seconds
	    end_time: float  # segment end, in time.time() seconds
	    confidence: float = 0.0  # transcription confidence; 0.0 when unknown
	    is_backchannel: bool = False  # set when the segment is a backchannel ("yeah", "mm-hmm")
	    is_non_verbal: bool = False  # set when the segment is a non-verbal sound


	@dataclass
	class BackchannelEvent:
	    """A backchannel emission (verbal nod or non-verbal expression)."""
	    trigger: BackchannelTrigger  # why the backchannel was emitted
	    audio: np.ndarray  # audio of the backchannel
	    sample_rate: int = 22050  # sample rate of `audio`, in Hz
	    is_verbal: bool = True  # True = "mm-hmm", False = laughter/sigh
	    label: str = ""  # "mm-hmm", "laughter", "sigh", etc.
	    timestamp: float = field(default_factory=time.time)  # time.time() at construction


	@dataclass
	class InterruptClassification:
	    """Result of classifying detected user speech during Nima's turn."""
	    interrupt_type: InterruptType  # the assigned category
	    confidence: float = 0.0  # classifier confidence; 0.0 when unknown
	    reason: str = ""  # free-text explanation of the decision
	    transcript: str = ""  # transcribed text of the detected speech, if any
	    duration_s: float = 0.0  # duration of the detected speech, in seconds
	    # Named numeric features; each instance gets its own dict (default_factory).
	    spectral_features: Dict[str, float] = field(default_factory=dict)


	@dataclass
	class ProsodyParams:
	    """Prosody parameters driven by consciousness state."""
	    base_pitch_hz: float = 180.0  # fundamental frequency, in Hz
	    speech_rate_wpm: float = 140.0  # speaking rate, in words per minute
	    energy: float = 0.8  # amplitude scale applied to synthesized audio
	    breathiness: float = 0.1  # amount of breath noise mixed into the voice
	    vibrato_depth: float = 0.0  # vibrato depth; 0.0 disables vibrato
	    warmth: float = 0.7  # 0 = cold/clinical, 1 = warm/intimate
	    pitch_variance: float = 0.15  # pitch variation amount (units not shown here — confirm)
	    emotional_tone: str = "neutral"  # label of the emotional tone


	@dataclass
	class ConversationState:
	    """Tracks the current state of the conversation."""
	    phase: ConversationPhase = ConversationPhase.IDLE  # who holds the floor right now
	    # User-side timing.
	    user_speech_start: float = 0.0
	    user_speech_duration: float = 0.0
	    # Nima-side timing.
	    nima_speech_start: float = 0.0
	    nima_speech_duration: float = 0.0
	    # Time of the most recent backchannel / user pause (0.0 = none yet).
	    last_backchannel_time: float = 0.0
	    last_user_pause_time: float = 0.0
	    # Estimated user emotion and the previously sampled arousal value.
	    user_emotion_arousal: float = 0.3
	    user_emotion_valence: float = 0.0
	    last_arousal_sample: float = 0.3
	    # Running counters.
	    interrupt_count: int = 0
	    backchannel_count: int = 0
	    # Text currently being spoken and the progress through it.
	    current_text: str = ""
	    current_text_position: float = 0.0  # 0.0 = just started, 1.0 = finished


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 2 — Procedural Non-Verbal Synthesizer (numpy DSP)
	# ═══════════════════════════════════════════════════════════════════════════

	class ProceduralNonVerbalSynth:
	    """
	    Synthesizes non-verbal vocal expressions using pure numpy DSP.
	    Each expression type has a hand-crafted signal model (default lengths):

	    LAUGHTER: 1.2 s of "ha" bursts, 100 ms cycle (70 ms voiced + 30 ms gap),
	              ~180 Hz, fading by 30% over the run
	    GIGGLE:   0.8 s of faster bursts, 70 ms cycle, ~260 Hz
	    GASp:     250 ms high-passed noise burst with exponential decay
	    GROAN:    600 ms harmonic tone gliding 90 -> 60 Hz, low-passed, with noise
	    MOAN:     800 ms tone at 140 Hz with 5.5 Hz vibrato
	    SIGH:     500 ms noise, low-pass cutoff stepping down from 2000 Hz
	    CLUCK:    80 ms: 20 ms plosive burst plus a 5 ms click 30 ms later
	    CLICK:    30 ms: single 3 ms impulse, high-passed ("tsk")
	    AWW:      400 ms "aw" vowel at 150 Hz
	    OH:       300 ms "oh" vowel at 200 Hz
	    MM:       400 ms nasalised hum at 120 Hz
	    WOW:      500 ms "aw"-formant tone rising 180 -> 260 Hz

	    Laughter, giggle and the noise-based sounds draw from the global
	    `random` / `np.random` generators, so output varies between calls.
	    """

	    SAMPLE_RATE: int = 22050

	    # ── Vowel formants for vowel-based expressions (aww, oh, mm, wow) ──
	    VOWEL_FORMANTS: Dict[str, Dict] = {
	        "aw": {"F1": 570, "F2": 840, "F3": 2410, "bw": [55, 75, 115]},
	        "oh": {"F1": 480, "F2": 760, "F3": 2300, "bw": [50, 70, 110]},
	        "mm": {"F1": 280, "F2": 900, "F3": 2200, "bw": [45, 65, 105]},
	        "ah": {"F1": 730, "F2": 1090, "F3": 2440, "bw": [60, 80, 120]},
	    }

	    def __init__(self, sample_rate: int = 22050):
	        """Create a synthesizer that renders at `sample_rate` Hz."""
	        self.sample_rate = sample_rate

	    def synth(self, expr_type: NonVerbalType, intensity: float = 0.7,
	              duration_override: Optional[float] = None) -> np.ndarray:
	        """Synthesize a non-verbal expression. Returns float32 audio.

	        `expr_type.value` selects the `_synth_<value>` method; unknown values
	        fall back to a sigh. `intensity` is clamped to [0.1, 1.0] and the
	        result is peak-normalised to 0.7 * intensity. A falsy
	        `duration_override` (None or 0) means "use the default length".
	        """
	        intensity = float(max(0.1, min(1.0, intensity)))
	        method = getattr(self, f"_synth_{expr_type.value}", None)
	        if method is None:
	            logger.warning("Unknown non-verbal type: %s, falling back to sigh", expr_type)
	            method = self._synth_sigh
	        audio = method(intensity, duration_override)
	        # Normalize to target amplitude
	        max_val = float(np.max(np.abs(audio))) if len(audio) > 0 else 0.0
	        if max_val > 0:
	            audio = audio / max_val * 0.7 * intensity
	        return audio.astype(np.float32)

	    # ── Shared building blocks ──

	    def _envelope(self, n: int, attack_s: float, release_s: float = 0.0,
	                  release_floor: float = 0.0) -> np.ndarray:
	        """Unit envelope with a linear attack and a linear release.

	        Attack ramps 0 -> 1 and release ramps 1 -> `release_floor`; each is
	        capped at a quarter of the signal so short sounds keep a sustain.
	        """
	        env = np.ones(n)
	        attack = min(int(attack_s * self.sample_rate), n // 4)
	        release = min(int(release_s * self.sample_rate), n // 4)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack)
	        if release > 0:
	            env[-release:] = np.linspace(1, release_floor, release)
	        return env

	    @staticmethod
	    def _harmonic_source(phase: np.ndarray, top_harmonic: int,
	                         weight: float, norm: float) -> np.ndarray:
	        """Fundamental plus harmonics 2..top_harmonic (amplitude weight/h), divided by norm."""
	        source = np.sin(phase)
	        for h in range(2, top_harmonic + 1):
	            source += (weight / h) * np.sin(phase * h)
	        return source / norm

	    # ── Laughter: periodic "ha-ha-ha" bursts ──
	    def _synth_laughter(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        total_dur = dur or 1.2
	        ha_period = 0.10  # 100ms per "ha"
	        n_has = int(total_dur / ha_period)
	        chunks = []
	        for i in range(n_has):
	            ha = self._gen_ha_burst(ha_period * 0.7, intensity,
	                                    pitch=180 + random.uniform(-20, 20))
	            # Silence between consecutive "ha"s
	            gap = np.zeros(int(self.sample_rate * ha_period * 0.3))
	            # Later bursts are quieter (down to 70% on the last one)
	            decay = 1.0 - 0.3 * (i / max(1, n_has - 1))
	            chunks.append(ha * decay)
	            chunks.append(gap)
	        return np.concatenate(chunks) if chunks else np.zeros(0)

	    def _gen_ha_burst(self, duration: float, intensity: float, pitch: float = 180) -> np.ndarray:
	        """Generate a single 'ha' burst — voiced segment with fast onset/offset."""
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Glottal source (sawtooth-like via harmonics)
	        source = self._harmonic_source(2.0 * np.pi * pitch * t, 4, 0.4, 3.0)
	        # Add breathy noise
	        noise = np.random.normal(0, 0.3, n)
	        mixed = source * 0.6 + noise * 0.4
	        # Bandpass filter around vowel region (rough "ah" formant)
	        mixed = self._bandpass(mixed, 400, 3000)
	        # Envelope: fast attack, fast decay (ha-ha character)
	        return mixed * self._envelope(n, 0.01, 0.04, 0.0) * intensity

	    # ── Giggle: faster, higher-pitched laughter ──
	    def _synth_giggle(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        total_dur = dur or 0.8
	        hee_period = 0.07
	        n_hees = int(total_dur / hee_period)
	        chunks = []
	        for _ in range(n_hees):
	            hee = self._gen_ha_burst(hee_period * 0.6, intensity * 0.8,
	                                     pitch=260 + random.uniform(-30, 30))
	            gap = np.zeros(int(self.sample_rate * hee_period * 0.4))
	            chunks.append(hee)
	            chunks.append(gap)
	        return np.concatenate(chunks) if chunks else np.zeros(0)

	    # ── Gasp: short sharp intake ──
	    def _synth_gasp(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.25
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Sharp onset noise burst (intake)
	        noise = np.random.normal(0, 1, n)
	        # High-pass to make it breathy/sharp
	        filtered = self._highpass(noise, 800)
	        # Quick attack, exponential decay
	        env = np.exp(-t * 15.0)
	        attack = min(int(0.005 * self.sample_rate), n // 10)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack) * env[:attack] / max(env[:attack].max(), 1e-6)
	        # Add a faint glottal pulse
	        pulse = 0.2 * np.sin(2 * np.pi * 200 * t) * np.exp(-t * 10)
	        return (filtered * 0.7 + pulse * 0.3) * env * intensity

	    # ── Groan: low-pitched descending ──
	    def _synth_groan(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.6
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Descending pitch 90 → 60 Hz; phase is the running sum of frequency
	        f0 = 90.0 - 30.0 * (t / duration)
	        phase = 2.0 * np.pi * np.cumsum(f0) / self.sample_rate
	        source = self._harmonic_source(phase, 3, 0.3, 2.0)
	        # Add low-frequency noise
	        noise = np.random.normal(0, 0.2, n)
	        mixed = source * 0.7 + noise * 0.3
	        mixed = self._lowpass(mixed, 600)
	        # Envelope: slow attack, sustain, slow release (down to 0.3)
	        return mixed * self._envelope(n, 0.08, 0.15, 0.3) * intensity

	    # ── Moan: sustained mid-pitch with vibrato ──
	    def _synth_moan(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.8
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        f0 = 140.0
	        vibrato = 4.0 * np.sin(2 * np.pi * 5.5 * t)  # 5.5 Hz vibrato
	        phase = 2.0 * np.pi * np.cumsum(f0 + vibrato) / self.sample_rate
	        source = self._harmonic_source(phase, 4, 0.4, 3.0)
	        # Add breathiness
	        noise = np.random.normal(0, 0.15, n)
	        mixed = source * 0.85 + noise * 0.15
	        mixed = self._bandpass(mixed, 200, 2000)
	        # Envelope: slow attack, sustain, slow release (down to 0.4)
	        return mixed * self._envelope(n, 0.1, 0.2, 0.4) * intensity

	    # ── Sigh: downward-filtered noise ──
	    def _synth_sigh(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.5
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        noise = np.random.normal(0, 1, n)
	        # Lowpass sweep starting at 2000 Hz and stepping downwards (exhale
	        # character), approximated by filtering ~10 chunks independently.
	        chunk_size = max(1, n // 10)
	        filtered = np.zeros(n)
	        for i in range(0, n, chunk_size):
	            end = min(i + chunk_size, n)
	            cutoff = 2000.0 - 1600.0 * (i / max(1, n))
	            filtered[i:end] = self._lowpass(noise[i:end], cutoff)
	        # Add faint glottal pulse for voicing
	        pulse = 0.15 * np.sin(2 * np.pi * 120 * t) * np.exp(-t * 2)
	        mixed = filtered * 0.8 + pulse * 0.2
	        # Envelope: medium attack combined with a gradual exponential decay.
	        # The decay multiplies the attack ramp; assigning it outright would
	        # discard the ramp and make the sigh start with a click.
	        env = self._envelope(n, 0.05) * np.exp(-t * 2.5)
	        return mixed * env * intensity

	    # ── Cluck: plosive + click (tongue sound) ──
	    def _synth_cluck(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.08
	        n = int(self.sample_rate * duration)
	        # Short plosive burst
	        burst_len = min(int(0.02 * self.sample_rate), n)
	        burst = np.zeros(n)
	        if burst_len > 0:
	            burst[:burst_len] = np.random.normal(0, 1, burst_len) * np.hanning(burst_len)
	        # Click component (shorter than total)
	        click_len = min(int(0.005 * self.sample_rate), n)
	        click = np.zeros(n)
	        if click_len > 0:
	            click[:click_len] = np.random.normal(0, 1, click_len) * 0.6
	        # Mix the click in ~30 ms after the burst onset
	        audio = burst.copy()
	        offset = min(int(0.03 * self.sample_rate), n - click_len)
	        if offset + click_len <= n:
	            audio[offset:offset + click_len] += click[:click_len] * 0.6
	        # Lowpass
	        audio = self._lowpass(audio, 3000)
	        return audio * intensity

	    # ── Click: single "tsk" impulse ──
	    def _synth_click(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.03
	        n = int(self.sample_rate * duration)
	        # Short impulse with quick decay
	        impulse = np.zeros(n)
	        impulse_len = min(int(0.003 * self.sample_rate), n)
	        if impulse_len > 0:
	            impulse[:impulse_len] = np.random.normal(0, 1, impulse_len)
	        # Quick exponential decay
	        t = np.linspace(0, duration, n)
	        audio = impulse * np.exp(-t * 100)
	        # Highpass to make it sharp
	        audio = self._highpass(audio, 1500)
	        return audio * intensity

	    # ── Aww: low-pitched warm vowel ──
	    def _synth_aww(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        return self._synth_vowel_expr("aw", 150, 0.4, intensity, dur)

	    # ── Oh: mid-pitched vowel ──
	    def _synth_oh(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        return self._synth_vowel_expr("oh", 200, 0.3, intensity, dur)

	    # ── Mm: humming ──
	    def _synth_mm(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        return self._synth_vowel_expr("mm", 120, 0.4, intensity, dur, nasal=True)

	    # ── Wow: rising pitch vowel ──
	    def _synth_wow(self, intensity: float, dur: Optional[float]) -> np.ndarray:
	        duration = dur or 0.5
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Rising pitch 180 → 260 Hz
	        f0 = 180.0 + 80.0 * (t / duration)
	        phase = 2.0 * np.pi * np.cumsum(f0) / self.sample_rate
	        audio = self._formant_filter(np.sin(phase), "aw", n)
	        return audio * self._envelope(n, 0.05, 0.1, 0.5) * intensity

	    # ── Helper: vowel-based expression with formant filtering ──
	    def _synth_vowel_expr(self, vowel: str, f0: float, duration: float,
	                          intensity: float, dur_override: Optional[float],
	                          nasal: bool = False) -> np.ndarray:
	        """Steady-pitch vowel: harmonic source -> formant filter -> envelope."""
	        duration = dur_override or duration
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Glottal source
	        source = self._harmonic_source(2.0 * np.pi * f0 * t, 5, 0.4, 3.0)
	        # Formant filter
	        audio = self._formant_filter(source, vowel, n)
	        if nasal:
	            # Nasal: reduce high frequencies, add low resonance
	            audio = self._lowpass(audio, 1500)
	            audio += 0.2 * np.sin(2 * np.pi * 250 * t)
	        return audio * self._envelope(n, 0.05, 0.1, 0.5) * intensity

	    # ── DSP helpers ──

	    def _formant_filter(self, signal: np.ndarray, vowel: str, n: int) -> np.ndarray:
	        """Apply 3-formant resonator filter for vowel synthesis.

	        Three two-pole resonators run in parallel and are summed with gains
	        1.0 / 0.6 / 0.3. Unknown vowels use the "ah" formants.
	        """
	        formants = self.VOWEL_FORMANTS.get(vowel, self.VOWEL_FORMANTS["ah"])
	        centres = (formants["F1"], formants["F2"], formants["F3"])
	        formant_gains = (1.0, 0.6, 0.3)
	        output = np.zeros(n)
	        for fn, bw, fgain in zip(centres, formants["bw"], formant_gains):
	            r = float(np.exp(-np.pi * bw / self.sample_rate))
	            cos_term = math.cos(2 * math.pi * fn / self.sample_rate)
	            a1 = -2 * r * cos_term
	            a2 = r * r
	            gain = (1 - r) * math.sqrt(max(0, 1 - 2 * r * cos_term + r * r))
	            filtered = np.zeros(n)
	            # Recursive filter: each sample depends on the previous two outputs.
	            for i in range(2, n):
	                filtered[i] = gain * signal[i] - a1 * filtered[i - 1] - a2 * filtered[i - 2]
	            output += filtered * fgain
	        return output

	    def _lowpass(self, signal: np.ndarray, cutoff_hz: float) -> np.ndarray:
	        """Simple one-pole lowpass filter."""
	        if len(signal) == 0:
	            return signal
	        rc = 1.0 / (2 * math.pi * cutoff_hz)
	        dt = 1.0 / self.sample_rate
	        alpha = dt / (rc + dt)
	        output = np.zeros_like(signal)
	        output[0] = signal[0] * alpha
	        for i in range(1, len(signal)):
	            output[i] = output[i - 1] + alpha * (signal[i] - output[i - 1])
	        return output

	    def _highpass(self, signal: np.ndarray, cutoff_hz: float) -> np.ndarray:
	        """Simple one-pole highpass filter."""
	        if len(signal) == 0:
	            return signal
	        rc = 1.0 / (2 * math.pi * cutoff_hz)
	        dt = 1.0 / self.sample_rate
	        alpha = rc / (rc + dt)
	        output = np.zeros_like(signal)
	        output[0] = signal[0]
	        for i in range(1, len(signal)):
	            output[i] = alpha * (output[i - 1] + signal[i] - signal[i - 1])
	        return output

	    def _bandpass(self, signal: np.ndarray, low_hz: float, high_hz: float) -> np.ndarray:
	        """Bandpass = lowpass + highpass in series."""
	        return self._highpass(self._lowpass(signal, high_hz), low_hz)


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 3 — ASR Layer (Whisper + Energy VAD fallback)
	# ═══════════════════════════════════════════════════════════════════════════

	class EnergyVAD:
	    """
	    Energy-based Voice Activity Detection. Detects WHEN speech occurs
	    but not WHAT is said. Used as a fallback when Whisper is unavailable,
	    and as a fast pre-filter even when Whisper is active.

	    A frame counts as speech when its RMS energy exceeds the larger of the
	    fixed `energy_threshold` and three times an adaptive noise floor.
	    """

	    def __init__(self, sample_rate: int = 16000, frame_duration_ms: int = 20,
	                 energy_threshold: float = 0.005):
	        self.sample_rate = sample_rate
	        self.frame_duration_ms = frame_duration_ms
	        # Samples per analysis frame (0 if the frame is shorter than a sample).
	        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
	        self.energy_threshold = energy_threshold
	        self._noise_floor = 0.001
	        self._adaptation_rate = 0.01

	    @staticmethod
	    def _to_float32(audio: np.ndarray) -> np.ndarray:
	        """Return `audio` as float32; integer PCM is scaled to [-1, 1]."""
	        if audio.dtype == np.int16:
	            return audio.astype(np.float32) / 32768.0
	        if audio.dtype == np.int32:
	            return audio.astype(np.float32) / 2147483648.0
	        if audio.dtype == np.uint8:
	            # Unsigned 8-bit PCM is centred on 128.
	            return (audio.astype(np.float32) - 128) / 128.0
	        if audio.dtype != np.float32:
	            return audio.astype(np.float32)
	        return audio

	    @staticmethod
	    def _rms(audio: np.ndarray) -> float:
	        """Root-mean-square energy of a non-empty float array."""
	        return float(np.sqrt(np.mean(audio ** 2)))

	    def detect_speech(self, audio: np.ndarray) -> bool:
	        """Return True if the audio frame contains speech.

	        Side effect: quiet frames (RMS below half the fixed threshold) nudge
	        the adaptive noise floor towards their energy.
	        """
	        if len(audio) == 0:
	            return False
	        rms = self._rms(self._to_float32(audio))
	        # Adapt noise floor (only for low-energy frames)
	        if rms < self.energy_threshold * 0.5:
	            self._noise_floor = ((1 - self._adaptation_rate) * self._noise_floor
	                                 + self._adaptation_rate * rms)
	        # Speech if energy exceeds max of fixed threshold or 3x noise floor
	        threshold = max(self.energy_threshold, self._noise_floor * 3)
	        return rms > threshold

	    def compute_energy(self, audio: np.ndarray) -> float:
	        """Return the RMS energy of `audio` (0.0 for empty input)."""
	        if len(audio) == 0:
	            return 0.0
	        return self._rms(self._to_float32(audio))

	    def detect_pause(self, audio: np.ndarray, min_pause_s: float = 0.3,
	                     max_pause_s: float = 0.8) -> Tuple[bool, float]:
	        """
	        Detect if the audio contains a pause (0.3-0.8s of silence by default).
	        Returns (is_pause, pause_duration), where pause_duration is the
	        longest run of consecutive non-speech frames, in seconds.

	        Returns (False, 0.0) when fewer than two whole frames are available
	        or when the configured frame size is zero.
	        """
	        if len(audio) == 0 or self.frame_size <= 0:
	            return False, 0.0
	        n_frames = len(audio) // self.frame_size
	        if n_frames < 2:
	            return False, 0.0
	        frame_dur = self.frame_duration_ms / 1000.0
	        silent_run = 0   # consecutive silent frames ending at the current frame
	        longest_run = 0
	        for i in range(n_frames):
	            frame = audio[i * self.frame_size:(i + 1) * self.frame_size]
	            if self.detect_speech(frame):
	                silent_run = 0
	            else:
	                silent_run += 1
	                longest_run = max(longest_run, silent_run)
	        # Counting frames (not summing floats) keeps the duration drift-free.
	        max_silence = longest_run * frame_dur
	        is_pause = min_pause_s <= max_silence <= max_pause_s
	        return is_pause, max_silence


	class WhisperASR:
	    """
	    Whisper ASR backend (openai-whisper or faster-whisper). Transcribes
	    user speech to text. Falls back to VAD-only mode if no Whisper package
	    is installed or the model fails to load.
	    """

	    # Short acknowledgement phrases treated as backchannels, not turns.
	    _BACKCHANNEL_VOCAB = frozenset({
	        "yeah", "yes", "yep", "yup", "mhm", "mm-hmm", "mm", "hmm",
	        "uh-huh", "right", "sure", "ok", "okay", "i see", "got it",
	        "makes sense", "true", "exactly", "wow", "oh", "ah",
	    })
	    # Whitespace and sentence punctuation trimmed before the vocabulary lookup.
	    _TRIM_CHARS = " \t\r\n.?!,"

	    def __init__(self, model_name: str = "base", device: Optional[str] = None):
	        """Load the Whisper model, or configure VAD-only mode.

	        Args:
	            model_name: Whisper model size/name passed to the backend.
	            device: "cuda", "cpu" or None. None lets openai-whisper pick
	                CUDA when torch reports it, and faster-whisper use "auto".
	        """
	        self.model_name = model_name
	        self.mode = ASRMode.WHISPER if WHISPER_AVAILABLE else ASRMode.VAD_ONLY
	        self._model = None
	        self._backend = WHISPER_BACKEND if WHISPER_AVAILABLE else None
	        self.vad = EnergyVAD()
	        if self.mode != ASRMode.WHISPER:
	            logger.warning("[WhisperASR] whisper not installed; using VAD-only mode")
	            return
	        try:
	            logger.info("[WhisperASR] loading model '%s' via %s...", model_name, self._backend)
	            if self._backend == "openai-whisper":
	                device = device or ("cuda" if _torch_cuda_available() else "cpu")
	                self._model = whisper.load_model(model_name, device=device)
	            elif self._backend == "faster-whisper":
	                # faster-whisper uses model size names like "base", "small", etc.
	                # and downloads automatically from HuggingFace
	                compute_type = "float16" if device == "cuda" else "int8"
	                # Forward the caller's device; "auto" is the library default.
	                self._model = WhisperModel(model_name, device=device or "auto",
	                                           compute_type=compute_type)
	            logger.info("[WhisperASR] model loaded (backend=%s)", self._backend)
	        except Exception as e:
	            # Best effort: any load failure degrades to VAD-only instead of crashing.
	            logger.warning("[WhisperASR] failed to load Whisper (%s); falling back to VAD-only", e)
	            self.mode = ASRMode.VAD_ONLY
	            self._model = None

	    @staticmethod
	    def _normalize_audio(audio: np.ndarray) -> np.ndarray:
	        """Return `audio` as float32; integer PCM is scaled to [-1, 1].

	        A bare dtype cast would leave int16 samples in the +/-32768 range.
	        Channel layout is left untouched — callers are assumed to pass mono.
	        """
	        audio = np.asarray(audio)
	        if audio.dtype == np.float32:
	            return audio
	        if audio.dtype == np.int16:
	            return audio.astype(np.float32) / 32768.0
	        if audio.dtype == np.int32:
	            return audio.astype(np.float32) / 2147483648.0
	        if audio.dtype == np.uint8:
	            return (audio.astype(np.float32) - 128) / 128.0
	        return audio.astype(np.float32)

	    @staticmethod
	    def _logprob_to_confidence(avg_logprob: float) -> float:
	        """Map an average log-probability to [0, 1]: -1 or lower -> 0, 0 -> 1."""
	        return max(0.0, min(1.0, avg_logprob + 1.0))

	    def _run_model(self, audio: np.ndarray) -> Tuple[str, float]:
	        """Run the loaded backend and return (text, confidence).

	        Confidence is 0.0 when the backend returns no segments. Raises
	        whatever the backend raises; `transcribe` handles the failure.
	        """
	        if self._backend == "openai-whisper":
	            result = self._model.transcribe(audio, fp16=False, language="en")
	            text = result.get("text", "").strip()
	            logprobs = [s.get("avg_logprob", -1) for s in result.get("segments", [])]
	        elif self._backend == "faster-whisper":
	            segments_iter, _info = self._model.transcribe(audio, language="en", beam_size=1)
	            segments = list(segments_iter)
	            text = " ".join(s.text.strip() for s in segments).strip()
	            # faster-whisper's Segment field is `avg_logprob` (no second underscore).
	            logprobs = [s.avg_logprob for s in segments]
	        else:
	            raise RuntimeError(f"unknown Whisper backend: {self._backend!r}")
	        if not logprobs:
	            return text, 0.0
	        return text, self._logprob_to_confidence(float(np.mean(logprobs)))

	    def transcribe(self, audio: np.ndarray, sample_rate: int = 16000) -> TranscriptSegment:
	        """
	        Transcribe audio to text. Returns a TranscriptSegment.

	        `end_time` is the wall-clock time at which the result is produced and
	        `start_time` is that minus the audio duration. In VAD-only mode, or
	        when transcription fails, `text` is "[speech detected]" if the energy
	        VAD fires and "" otherwise, with confidence 0.0.
	        """
	        audio = self._normalize_audio(audio)
	        if audio.size == 0:
	            now = time.time()
	            return TranscriptSegment(text="", start_time=now, end_time=now, confidence=0.0)
	        duration_s = len(audio) / sample_rate
	        if self.mode == ASRMode.WHISPER and self._model is not None:
	            try:
	                text, confidence = self._run_model(audio)
	            except Exception as e:
	                # Best effort: fall through to the VAD-only result below.
	                logger.warning("[WhisperASR] transcription failed: %s", e)
	            else:
	                now = time.time()
	                return TranscriptSegment(
	                    text=text,
	                    start_time=now - duration_s,
	                    end_time=now,
	                    confidence=confidence,
	                )
	        # VAD-only fallback
	        is_speech = self.vad.detect_speech(audio)
	        now = time.time()
	        return TranscriptSegment(
	            text="[speech detected]" if is_speech else "",
	            start_time=now - duration_s,
	            end_time=now,
	            confidence=0.0,
	        )

	    def is_backchannel_text(self, text: str) -> bool:
	        """Check if transcribed text is a backchannel ('yeah', 'mm-hmm', etc.).

	        Matching is case-insensitive and ignores surrounding whitespace and
	        the punctuation characters `.?!,`.
	        """
	        if not text:
	            return False
	        return text.lower().strip(self._TRIM_CHARS) in self._BACKCHANNEL_VOCAB


	def _torch_cuda_available() -> bool:
	    """Report whether PyTorch can be imported and sees a CUDA device.

	    Returns False when torch is not installed.
	    """
	    cuda_ok = False
	    try:
	        import torch
	        cuda_ok = torch.cuda.is_available()
	    except ImportError:
	        pass  # no torch -> no CUDA
	    return cuda_ok


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 4 — TTS Layer (Coqui XTTS + Procedural fallback)
	# ═══════════════════════════════════════════════════════════════════════════

	class ProceduralFormantTTS:
	"""
	Fallback TTS using formant synthesis. Produces understandable but
	robotic speech. Used when Coqui XTTS is not available.
	"""

	SAMPLE_RATE: int = 22050

	# Phoneme → (type, duration, formant_vowel_or_noise)
	PHONEME_MAP: Dict[str, Tuple[str, float, str]] = {
	"a": ("vowel", 0.10, "ah"), "e": ("vowel", 0.10, "eh"),
	"i": ("vowel", 0.10, "ee"), "o": ("vowel", 0.10, "oh"),
	"u": ("vowel", 0.10, "oo"),
	"b": ("plosive", 0.05, ""), "p": ("plosive", 0.05, ""),
	"t": ("plosive", 0.05, ""), "d": ("plosive", 0.05, ""),
	"k": ("plosive", 0.05, ""), "g": ("plosive", 0.05, ""),
	"s": ("fricative", 0.12, ""), "z": ("fricative", 0.12, ""),
	"f": ("fricative", 0.10, ""), "v": ("fricative", 0.10, ""),
	"h": ("fricative", 0.08, ""),
	"m": ("nasal", 0.08, ""), "n": ("nasal", 0.08, ""),
	"l": ("approximant", 0.07, ""), "r": ("approximant", 0.07, ""),
	"w": ("approximant", 0.07, ""), "y": ("approximant", 0.07, ""),
	}

	VOWEL_FORMANTS: Dict[str, Dict] = {
	"ah": {"F1": 730, "F2": 1090, "F3": 2440, "bw": [60, 80, 120]},
	"eh": {"F1": 530, "F2": 1840, "F3": 2480, "bw": [50, 70, 110]},
	"ee": {"F1": 270, "F2": 2290, "F3": 3010, "bw": [40, 60, 100]},
	"oh": {"F1": 570, "F2": 840, "F3": 2410, "bw": [55, 75, 115]},
	"oo": {"F1": 300, "F2": 870, "F3": 2240, "bw": [45, 65, 105]},
	}

	def __init__(self, sample_rate: int = 22050) -> None:
	    """Create the fallback TTS.

	    Args:
	        sample_rate: Output sample rate in Hz; also handed to the
	            non-verbal synthesizer created here.
	    """
	    self.sample_rate = sample_rate
	    # Companion synthesizer for laughs, sighs, etc., at the same sample rate.
	    self._nonverbal = ProceduralNonVerbalSynth(sample_rate)

	def synthesize(self, text: str, prosody: ProsodyParams) -> np.ndarray:
	    """Render `text` as float32 audio via per-phoneme formant synthesis.

	    Blank text, or text that yields no phonemes, produces an empty array.
	    Each phoneme is rendered by the `_synth_<type>` method for its type,
	    pauses become silence, and `_apply_prosody` post-processes the result.
	    """
	    if not text.strip():
	        return np.zeros(0, dtype=np.float32)
	    phonemes = self._text_to_phonemes(text)
	    if not phonemes:
	        return np.zeros(0, dtype=np.float32)

	    # Phoneme types whose renderer takes just (duration, prosody).
	    simple_kinds = ("plosive", "fricative", "nasal", "approximant")
	    pieces = []
	    for kind, seconds, vowel_name in phonemes:
	        if kind == "pause":
	            pieces.append(np.zeros(int(self.sample_rate * seconds), dtype=np.float32))
	        elif kind == "vowel":
	            pieces.append(self._synth_vowel(vowel_name, seconds, prosody))
	        elif kind in simple_kinds:
	            render = getattr(self, f"_synth_{kind}")
	            pieces.append(render(seconds, prosody))
	        # any other type is silently skipped

	    if pieces:
	        audio = np.concatenate(pieces)
	    else:
	        audio = np.zeros(0, dtype=np.float32)
	    # Apply prosody modifications
	    shaped = self._apply_prosody(audio, prosody)
	    return shaped.astype(np.float32)

	def _text_to_phonemes(self, text: str) -> List[Tuple[str, float, str]]:
	"""Simple grapheme-to-phoneme: one char → one phoneme."""
	frames = []
	text = text.lower()
	i = 0
	while i < len(text):
	char = text[i]
	if char in self.PHONEME_MAP:
	ptype, dur, vowel = self.PHONEME_MAP[char]
	frames.append((ptype, dur, vowel))
	elif char == " ":
	frames.append(("pause", 0.08, ""))
	elif char in ".,!?;:":
	frames.append(("pause", 0.20, ""))
	i += 1
	return frames

	def _synth_vowel(self, vowel: str, duration: float, prosody: ProsodyParams) -> np.ndarray:
	    """Render one vowel: harmonic source, three formant resonators, noise, envelope.

	    Args:
	        vowel: key into ``self.VOWEL_FORMANTS``; unknown keys use the "ah" entry.
	        duration: length in seconds.
	        prosody: supplies ``base_pitch_hz``, ``vibrato_depth``,
	            ``breathiness`` and ``energy``.

	    Returns:
	        float32 samples. Requests shorter than two samples return silence
	        of length ``max(2, n)``.
	    """
	    sr = self.sample_rate
	    total = int(sr * duration)
	    if total < 2:
	        # Too short for the two-tap recursion below.
	        return np.zeros(max(2, total), dtype=np.float32)
	    times = np.linspace(0, duration, total, dtype=np.float64)

	    # Source: fundamental plus harmonics 2..5; the instantaneous frequency
	    # (pitch + 5.5 Hz vibrato) is integrated to obtain the phase.
	    wobble = prosody.vibrato_depth * np.sin(2 * np.pi * 5.5 * times)
	    phase = 2.0 * np.pi * np.cumsum(prosody.base_pitch_hz + wobble) / sr
	    glottal = np.sin(phase)
	    for harmonic in range(2, 6):
	        glottal += (0.4 / harmonic) * np.sin(phase * harmonic)
	    glottal /= 3.0

	    # Formants: one two-pole resonator per (centre, bandwidth) pair, mixed
	    # with decreasing weight for F1, F2, F3.
	    spec = self.VOWEL_FORMANTS.get(vowel, self.VOWEL_FORMANTS["ah"])
	    centres = [spec["F1"], spec["F2"], spec["F3"]]
	    mix_weights = [1.0, 0.6, 0.3]
	    voiced = np.zeros(total)
	    for idx, (centre, bandwidth) in enumerate(zip(centres, spec["bw"])):
	        radius = float(np.exp(-np.pi * bandwidth / sr))
	        coef1 = -2 * radius * math.cos(2 * math.pi * centre / sr)
	        coef2 = radius * radius
	        gain = (1 - radius) * math.sqrt(
	            max(0, 1 - 2 * radius * math.cos(2 * math.pi * centre / sr) + radius * radius)
	        )
	        band = np.zeros(total)
	        for k in range(2, total):
	            band[k] = gain * glottal[k] - coef1 * band[k - 1] - coef2 * band[k - 2]
	        voiced += band * mix_weights[idx]

	    # Breathiness: Gaussian noise whose spread is the breathiness value.
	    voiced += np.random.normal(0, prosody.breathiness, total) * 0.3

	    # Linear fade-in (15 ms) and fade-out (25 ms), each capped at a quarter
	    # of the segment.
	    ramp_in = min(int(0.015 * sr), total // 4)
	    ramp_out = min(int(0.025 * sr), total // 4)
	    envelope = np.ones(total)
	    if ramp_in > 0:
	        envelope[:ramp_in] = np.linspace(0, 1, ramp_in)
	    if ramp_out > 0:
	        envelope[-ramp_out:] = np.linspace(1, 0, ramp_out)
	    return (voiced * envelope * prosody.energy).astype(np.float32)

	def _synth_plosive(self, duration: float, prosody: ProsodyParams) -> np.ndarray:
	    """Render a plosive as a short windowed noise burst followed by silence.

	    Args:
	        duration: length in seconds.
	        prosody: supplies ``energy`` (output gain).

	    Returns:
	        float32 samples of length ``int(sample_rate * duration)``. Segments
	        shorter than two samples are returned as silence: the burst needs
	        at least two samples, and writing it into a shorter buffer used to
	        raise a broadcast ``ValueError``.
	    """
	    n = int(self.sample_rate * duration)
	    audio = np.zeros(max(0, n), dtype=np.float32)
	    if n < 2:
	        return audio
	    # Burst is nominally 8 ms, never longer than the segment itself.
	    burst_len = max(2, min(int(0.008 * self.sample_rate), n))
	    audio[:burst_len] = np.random.normal(0, 1, burst_len) * np.hanning(burst_len)
	    # Explicit cast keeps the dtype stable even if energy is a NumPy scalar.
	    return (audio * prosody.energy * 0.5).astype(np.float32)

	def _synth_fricative(self, duration: float, prosody: ProsodyParams) -> np.ndarray:
	    """Render a fricative as unit-variance Gaussian noise band-limited to 3-7 kHz.

	    Args:
	        duration: length in seconds.
	        prosody: supplies ``energy`` (output gain).

	    Returns:
	        float32 samples, scaled by ``energy * 0.3``.
	    """
	    sample_count = int(self.sample_rate * duration)
	    hiss = self._bandpass_simple(np.random.normal(0, 1, sample_count), 3000, 7000)
	    scaled = hiss * prosody.energy * 0.3
	    return scaled.astype(np.float32)

	def _synth_nasal(self, duration: float, prosody: ProsodyParams) -> np.ndarray:
	    """Render a nasal as a low-passed sine at 80% of the base pitch.

	    Args:
	        duration: length in seconds.
	        prosody: supplies ``base_pitch_hz`` and ``energy``.

	    Returns:
	        float32 samples, scaled by ``energy * 0.4``.
	    """
	    count = int(self.sample_rate * duration)
	    timeline = np.linspace(0, duration, count, dtype=np.float64)
	    hum_hz = prosody.base_pitch_hz * 0.8
	    tone = np.sin(2 * np.pi * hum_hz * timeline)
	    # 1.5 kHz low-pass gives the muffled nasal colour.
	    muffled = self._lowpass_simple(tone, 1500)
	    return (muffled * prosody.energy * 0.4).astype(np.float32)

	def _synth_approximant(self, duration: float, prosody: ProsodyParams) -> np.ndarray:
	    """Render an approximant as a sine at the base pitch plus a little noise.

	    Args:
	        duration: length in seconds.
	        prosody: supplies ``base_pitch_hz`` and ``energy``.

	    Returns:
	        float32 samples, scaled by ``energy * 0.35``.
	    """
	    count = int(self.sample_rate * duration)
	    timeline = np.linspace(0, duration, count, dtype=np.float64)
	    voiced = np.sin(2 * np.pi * prosody.base_pitch_hz * timeline) * 0.7
	    hiss = np.random.normal(0, 0.2, count)
	    blend = voiced + hiss * 0.3
	    return (blend * prosody.energy * 0.35).astype(np.float32)

	def _apply_prosody(self, audio: np.ndarray, prosody: ProsodyParams) -> np.ndarray:
	    """Apply utterance-level shaping: warmth tilt, slow modulation, peak normalisation.

	    Args:
	        audio: the concatenated utterance.
	        prosody: supplies ``warmth`` and ``pitch_variance``.

	    Returns:
	        The shaped signal with its peak at 0.85, or the input unchanged
	        when it is empty. An all-zero signal is returned un-normalised.
	    """
	    if len(audio) == 0:
	        return audio
	    shaped = audio
	    # Warmth above 0.5 mixes in extra low-passed (800 Hz) signal.
	    if prosody.warmth > 0.5:
	        lows = self._lowpass_simple(shaped, 800)
	        shaped = shaped + lows * (prosody.warmth - 0.5) * 0.5
	    # "Pitch variance" is realised as a deterministic 2 Hz amplitude
	    # modulation of at most 5% per unit of pitch_variance.
	    if prosody.pitch_variance > 0:
	        sample_idx = np.arange(len(shaped))
	        tremolo = 1.0 + prosody.pitch_variance * 0.05 * np.sin(
	            2 * np.pi * 2.0 * sample_idx / self.sample_rate
	        )
	        shaped = shaped * tremolo
	    # Peak-normalise, leaving a little headroom.
	    peak = float(np.max(np.abs(shaped)))
	    if peak > 0:
	        shaped = shaped / peak * 0.85
	    return shaped

	def _lowpass_simple(self, signal: np.ndarray, cutoff_hz: float) -> np.ndarray:
	    """Run a one-pole (RC-style) low-pass over ``signal``.

	    The smoothing factor is ``dt / (rc + dt)`` with ``rc = 1 / (2*pi*cutoff_hz)``
	    and ``dt = 1 / sample_rate``. The first output sample is the first input
	    sample times that factor, i.e. the filter starts from a zero state.

	    Returns:
	        A new array shaped and typed like ``signal``; an empty input is
	        returned as-is.
	    """
	    if len(signal) == 0:
	        return signal
	    step = 1.0 / self.sample_rate
	    time_constant = 1.0 / (2 * math.pi * cutoff_hz)
	    smoothing = step / (time_constant + step)
	    smoothed = np.zeros_like(signal)
	    smoothed[0] = signal[0] * smoothing
	    # The previous value is read back from the output array on purpose so
	    # that it carries the array's own dtype rounding.
	    for idx, sample in enumerate(signal[1:], start=1):
	        smoothed[idx] = smoothed[idx - 1] + smoothing * (sample - smoothed[idx - 1])
	    return smoothed

	def _bandpass_simple(self, signal: np.ndarray, low_hz: float, high_hz: float) -> np.ndarray:
	    """Band-limit ``signal`` using two passes of ``_lowpass_simple``.

	    The signal is first low-passed at ``high_hz``. The high-pass half is
	    obtained by subtracting from that result its own low-pass at ``low_hz``.
	    """
	    upper_limited = self._lowpass_simple(signal, high_hz)
	    return upper_limited - self._lowpass_simple(upper_limited, low_hz)


	class CoquiXTTSBackend:
	    """
	    Neural TTS backend built on a Coqui model (XTTS v2 by default).

	    If the Coqui package is not available, or the model fails to load, the
	    backend switches to ``TTSMode.PROCEDURAL`` and every utterance is
	    rendered by ``ProceduralFormantTTS``. A synthesis failure on a loaded
	    model falls back to the procedural engine for that utterance only.
	    """

	    def __init__(self, model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2",
	                 speaker_wav: Optional[str] = None,
	                 language: str = "en"):
	        """
	        Args:
	            model_name: Coqui model identifier handed to ``CoquiTTS``.
	            speaker_wav: optional speaker reference passed to the model as
	                ``speaker_wav``; when falsy a named built-in speaker is
	                requested instead.
	            language: language code forwarded to the model.
	        """
	        self.model_name = model_name
	        self.speaker_wav = speaker_wav
	        self.language = language
	        self.mode = TTSMode.COQUI_XTTS if COQUI_TTS_AVAILABLE else TTSMode.PROCEDURAL
	        self._model = None
	        # Always built, so a fallback exists even when the model loads fine.
	        self._fallback = ProceduralFormantTTS()
	        if self.mode == TTSMode.COQUI_XTTS:
	            try:
	                logger.info("[CoquiXTTS] loading model '%s'...", model_name)
	                self._model = CoquiTTS(model_name)
	                logger.info("[CoquiXTTS] model loaded")
	            except Exception as e:
	                # Deliberate best-effort: any load failure degrades to procedural.
	                logger.warning("[CoquiXTTS] failed to load (%s); falling back to procedural", e)
	                self.mode = TTSMode.PROCEDURAL
	                self._model = None
	        else:
	            logger.warning("[CoquiXTTS] TTS package not installed; using procedural formant fallback")

	    def synthesize(self, text: str, prosody: ProsodyParams) -> np.ndarray:
	        """
	        Synthesize text to float32 audio.

	        Blank text yields an empty array. With a loaded model the text is
	        rendered by Coqui and post-processed by ``_apply_prosody``; on any
	        synthesis error, or in procedural mode, the procedural engine
	        renders the utterance instead.

	        NOTE(review): this docstring used to promise 22050 Hz output, but no
	        resampling happens here. XTTS v2 is documented as producing 24 kHz
	        audio, so confirm which rate downstream playback assumes.
	        """
	        if not text.strip():
	            return np.zeros(0, dtype=np.float32)
	        if self.mode == TTSMode.COQUI_XTTS and self._model is not None:
	            try:
	                kwargs = {"text": text, "language": self.language}
	                if self.speaker_wav:
	                    kwargs["speaker_wav"] = self.speaker_wav
	                else:
	                    # NOTE(review): confirm this speaker name exists in the
	                    # loaded model; an unknown name would land in the
	                    # except branch below on every call.
	                    kwargs["speaker"] = "Ana NeP"
	                wav = self._model.tts(**kwargs)
	                audio = np.array(wav, dtype=np.float32)
	                return self._apply_prosody(audio, prosody)
	            except Exception as e:
	                logger.warning("[CoquiXTTS] synthesis failed (%s); using fallback for this utterance", e)
	        return self._fallback.synthesize(text, prosody)

	    def _apply_prosody(self, audio: np.ndarray, prosody: ProsodyParams) -> np.ndarray:
	        """
	        Peak-normalise Coqui output, then apply the prosody energy as gain.

	        The gain is applied AFTER normalisation. Scaling first and
	        normalising afterwards (the previous order) cancelled the energy
	        setting entirely, so every utterance came out at the same level.
	        Energy is clamped to [0, 1] so the result never exceeds the 0.9
	        normalisation peak.

	        Returns:
	            float32 audio with peak ``0.9 * energy``; an empty input is
	            returned unchanged.
	        """
	        if len(audio) == 0:
	            return audio
	        peak = float(np.max(np.abs(audio)))
	        if peak > 0:
	            audio = audio / peak * 0.9
	        gain = min(1.0, max(0.0, float(prosody.energy)))
	        return (audio * gain).astype(np.float32)


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 5 — Interrupt Detector (smart classification)
	# ═══════════════════════════════════════════════════════════════════════════

	class InterruptDetector:
	    """
	    Classifies user audio captured during Nima's turn into one of:
	    - REAL_INTERRUPT: the user is taking the turn
	    - NON_VERBAL: laughter/sigh/gasp/etc. (not an interrupt)
	    - BACKCHANNEL: "yeah", "mm-hmm" (not an interrupt)
	    - COLLABORATIVE_TURN_SHARING: short speech right at the end of Nima's
	      utterance (not an interrupt)
	    - SILENCE: no speech detected

	    The point of the class is that not every sound the user makes should
	    stop Nima: backchannels and non-verbal expressions are a normal part of
	    conversation.
	    """

	    # Backchannel vocabulary — short utterances that signal "I'm listening"
	    BACKCHANNEL_VOCAB: Set[str] = {
	        "yeah", "yes", "yep", "yup", "mhm", "mm-hmm", "mm", "hmm",
	        "uh-huh", "right", "sure", "ok", "okay", "i see", "got it",
	        "makes sense", "true", "exactly", "wow", "oh", "ah", "ha",
	    }

	    # Duration thresholds (seconds)
	    BACKCHANNEL_MAX_DURATION: float = 0.8  # <0.8s = likely backchannel
	    NON_VERBAL_MAX_DURATION: float = 1.5  # <1.5s with spectral signature = non-verbal
	    REAL_INTERRUPT_MIN_DURATION: float = 1.0  # >1.0s = likely real interrupt

	    # Collaborative turn-sharing: user speech in the last 300ms of Nima's utterance
	    COLLABORATIVE_WINDOW_S: float = 0.3
	    # ...and only if the user's contribution is shorter than this
	    COLLABORATIVE_MAX_DURATION: float = 1.5

	    def __init__(self, asr: WhisperASR):
	        """Keep the ASR engine and reuse the VAD it exposes as ``asr.vad``."""
	        self._asr = asr
	        self._vad = asr.vad

	    def classify(self, audio: np.ndarray, sample_rate: int = 16000,
	                 nima_text_progress: float = 1.0,
	                 nima_speech_remaining_s: float = 0.0) -> InterruptClassification:
	        """
	        Classify a segment of user speech detected during Nima's turn.

	        Checks run in this order and the first match wins: empty audio,
	        VAD silence, backchannel, non-verbal expression, collaborative
	        turn-sharing, and finally real interrupt.

	        Args:
	            audio: user audio (float32, mono)
	            sample_rate: audio sample rate
	            nima_text_progress: 0.0 = Nima just started, 1.0 = Nima finished
	            nima_speech_remaining_s: seconds left in Nima's current utterance

	        Returns:
	            InterruptClassification with the verdict.
	        """
	        if len(audio) == 0:
	            return InterruptClassification(
	                interrupt_type=InterruptType.SILENCE,
	                reason="no audio",
	            )

	        duration = len(audio) / sample_rate
	        # NOTE(review): the return value is not used; the call is retained in
	        # case the VAD updates internal state here — confirm and remove.
	        self._vad.compute_energy(audio)
	        spectral = self._compute_spectral_features(audio, sample_rate)

	        # Nothing the VAD considers speech
	        if not self._vad.detect_speech(audio):
	            return InterruptClassification(
	                interrupt_type=InterruptType.SILENCE,
	                reason="below VAD threshold",
	                duration_s=duration,
	                spectral_features=spectral,
	            )

	        segment = self._asr.transcribe(audio, sample_rate)
	        transcript = segment.text.strip().lower()

	        # 1. Backchannel: short, and either the words or the spectrum say so
	        if duration < self.BACKCHANNEL_MAX_DURATION:
	            if self._asr.is_backchannel_text(transcript) or self._is_backchannel_spectral(spectral):
	                return InterruptClassification(
	                    interrupt_type=InterruptType.BACKCHANNEL,
	                    confidence=0.85,
	                    reason=f"short ({duration:.2f}s) + backchannel vocab/spectral",
	                    transcript=transcript,
	                    duration_s=duration,
	                    spectral_features=spectral,
	                )

	        # 2. Non-verbal expression (spectral signature)
	        non_verbal_match = self._classify_non_verbal(spectral, duration)
	        if non_verbal_match:
	            return InterruptClassification(
	                interrupt_type=InterruptType.NON_VERBAL,
	                confidence=non_verbal_match[1],
	                reason=f"non-verbal spectral match: {non_verbal_match[0]}",
	                transcript=transcript,
	                duration_s=duration,
	                spectral_features=spectral,
	            )

	        # 3. Collaborative turn-sharing (short speech at the end of Nima's turn)
	        if nima_text_progress > 0.7 and nima_speech_remaining_s < self.COLLABORATIVE_WINDOW_S:
	            if duration < self.COLLABORATIVE_MAX_DURATION:
	                return InterruptClassification(
	                    interrupt_type=InterruptType.COLLABORATIVE_TURN_SHARING,
	                    confidence=0.70,
	                    reason=f"speech at end of Nima's turn (progress={nima_text_progress:.2f})",
	                    transcript=transcript,
	                    duration_s=duration,
	                    spectral_features=spectral,
	                )

	        # 4. Otherwise: real interrupt (longer = more confident)
	        confidence = min(1.0, duration / 2.0)
	        return InterruptClassification(
	            interrupt_type=InterruptType.REAL_INTERRUPT,
	            confidence=confidence,
	            reason=f"real speech ({duration:.2f}s, progress={nima_text_progress:.2f})",
	            transcript=transcript,
	            duration_s=duration,
	            spectral_features=spectral,
	        )

	    def _compute_spectral_features(self, audio: np.ndarray, sr: int) -> Dict[str, float]:
	        """
	        Compute the features used by the backchannel / non-verbal heuristics.

	        Returns an empty dict for clips shorter than 256 samples. Otherwise
	        the keys are ``centroid_hz``, ``rolloff_hz``, ``zcr``, ``energy``
	        (RMS), ``low_freq_ratio`` (share of spectral magnitude below
	        500 Hz), ``periodicity``, ``duration_s`` and ``energy_cv``.
	        """
	        if len(audio) < 256:
	            return {}
	        magnitude = np.abs(np.fft.rfft(audio.astype(np.float32)))
	        freqs = np.fft.rfftfreq(len(audio), 1.0 / sr)
	        total_magnitude = np.sum(magnitude)

	        # Spectral centroid (brightness)
	        if total_magnitude > 0:
	            centroid = float(np.sum(freqs * magnitude) / total_magnitude)
	        else:
	            centroid = 0.0

	        # Spectral rolloff: frequency below which 85% of the magnitude lies
	        cumsum = np.cumsum(magnitude)
	        if cumsum[-1] > 0:
	            rolloff_idx = np.searchsorted(cumsum, 0.85 * cumsum[-1])
	            rolloff = float(freqs[min(rolloff_idx, len(freqs) - 1)])
	        else:
	            rolloff = 0.0

	        # Zero crossing rate: fraction of adjacent sample pairs that change sign
	        zcr = float(np.mean(np.abs(np.diff(np.sign(audio))) > 0))
	        # RMS energy
	        energy = float(np.sqrt(np.mean(audio ** 2)))
	        # Low-frequency magnitude ratio (voicing cue)
	        low_energy = float(np.sum(magnitude[freqs < 500]) / max(1e-10, total_magnitude))
	        periodicity = self._estimate_periodicity(audio, sr)

	        # Coefficient of variation of per-frame RMS over 20 ms frames:
	        # burst-like sounds (laughter) vary a lot, sustained speech varies
	        # less. Needs at least three full frames. The frame size is floored
	        # at one sample so a very low sample rate cannot divide by zero.
	        frame_size = max(1, int(sr * 0.02))
	        n_frames = len(audio) // frame_size
	        if n_frames >= 3:
	            frames = audio[: n_frames * frame_size].reshape(n_frames, frame_size)
	            frame_rms = np.sqrt(np.mean(frames ** 2, axis=1)).astype(np.float64)
	            energy_cv = float(np.std(frame_rms)) / max(1e-6, float(np.mean(frame_rms)))
	        else:
	            energy_cv = 0.0

	        return {
	            "centroid_hz": centroid,
	            "rolloff_hz": rolloff,
	            "zcr": zcr,
	            "energy": energy,
	            "low_freq_ratio": low_energy,
	            "periodicity": periodicity,
	            "duration_s": len(audio) / sr,
	            "energy_cv": energy_cv,  # burst-like vs sustained
	        }

	    def _estimate_periodicity(self, audio: np.ndarray, sr: int) -> float:
	        """
	        Estimate periodicity (0=aperiodic/noise, 1=strongly periodic).

	        The score is the largest normalised autocorrelation value at lags
	        between 50 ms and 200 ms (a 5-20 Hz repetition rate, i.e. laughter
	        "ha" bursts), clipped to [0, 1]. Clips that are shorter than 50 ms,
	        are near-constant, or do not extend past the 200 ms lag score 0.0.

	        The autocorrelation is computed through a zero-padded FFT, which is
	        O(n log n); it yields the same non-negative lags as
	        ``np.correlate(x, x, mode="full")`` without its O(n^2) cost.
	        """
	        if len(audio) < sr * 0.05:
	            return 0.0
	        centered = np.asarray(audio, dtype=np.float64)
	        centered = centered - np.mean(centered)
	        if np.std(centered) < 1e-6:
	            return 0.0
	        n = len(centered)
	        min_lag = int(sr * 0.05)  # 50ms
	        max_lag = int(sr * 0.20)  # 200ms
	        if max_lag >= n:
	            return 0.0
	        # Pad to at least 2n-1 points so circular correlation equals linear.
	        nfft = 1 << (2 * n - 1).bit_length()
	        spectrum = np.fft.rfft(centered, nfft)
	        autocorr = np.fft.irfft(spectrum * np.conj(spectrum), nfft)[:max_lag]
	        if autocorr[0] <= 0:
	            return 0.0
	        region = autocorr[min_lag:max_lag] / autocorr[0]
	        if len(region) == 0:
	            return 0.0
	        peak = float(np.max(region))
	        return max(0.0, min(1.0, peak))

	    def _is_backchannel_spectral(self, spectral: Dict[str, float]) -> bool:
	        """Check if spectral features match a backchannel (short, voiced, soft-ish)."""
	        if not spectral:
	            return False
	        energy = spectral.get("energy", 0.0)
	        low_ratio = spectral.get("low_freq_ratio", 0.0)
	        zcr = spectral.get("zcr", 0.5)
	        periodicity = spectral.get("periodicity", 0.0)
	        centroid = spectral.get("centroid_hz", 0.0)
	        duration = spectral.get("duration_s", 1.0)
	        # Backchannels are short and voiced
	        if duration > self.BACKCHANNEL_MAX_DURATION:
	            return False
	        is_voiced = periodicity > 0.2 or low_ratio > 0.25
	        is_smooth = zcr < 0.35
	        # "mm-hmm" pattern: voiced, low centroid (not breathy), smooth
	        is_low_centroid = centroid < 1500
	        return is_voiced and is_smooth and is_low_centroid and energy > 0.01

	    def _classify_non_verbal(self, spectral: Dict[str, float],
	                             duration: float) -> Optional[Tuple[str, float]]:
	        """
	        Classify non-verbal expression from spectral features.
	        Returns (expression_name, confidence) or None.

	        Rules are tried in order (laughter/giggle, sigh, gasp, groan, moan,
	        click) and the first match wins. Audio longer than
	        REAL_INTERRUPT_MIN_DURATION with low frame-energy variation is
	        treated as probable speech and never matches, even if periodicity
	        is high.
	        """
	        if not spectral:
	            return None
	        periodicity = spectral.get("periodicity", 0.0)
	        centroid = spectral.get("centroid_hz", 0.0)
	        energy = spectral.get("energy", 0.0)
	        zcr = spectral.get("zcr", 0.0)
	        low_ratio = spectral.get("low_freq_ratio", 0.0)
	        energy_cv = spectral.get("energy_cv", 0.0)  # burst-like vs sustained

	        # Guard: sustained, steady audio is likely real speech.
	        if duration > self.REAL_INTERRUPT_MIN_DURATION and energy_cv < 0.3:
	            return None  # let it fall through to REAL_INTERRUPT

	        # Laughter: burst-like (high energy_cv), periodic, moderate energy.
	        # The energy_cv check is key — laughter has ha-ha-ha gaps.
	        if energy_cv > 0.3 and periodicity > 0.1 and 0.3 < duration < 2.0 and energy > 0.03:
	            if centroid > 1500:
	                return ("laughter", 0.8)
	            return ("giggle", 0.7)

	        # Sigh: low periodicity, breathy (high centroid), short
	        if periodicity < 0.2 and 0.3 < duration < 1.0 and centroid > 1500 and energy > 0.02:
	            return ("sigh", 0.65)

	        # Gasp: very short, high centroid (breathy), moderate energy
	        if duration < 0.35 and centroid > 1500 and energy > 0.03:
	            return ("gasp", 0.75)

	        # Groan: low centroid, voiced, short
	        if 0.4 < duration < 0.8 and centroid < 1200 and periodicity > 0.2:
	            return ("groan", 0.65)

	        # Moan: mid centroid, strong low-frequency content, medium duration
	        if 0.5 < duration < 1.0 and 1000 < centroid < 2000 and low_ratio > 0.35:
	            return ("moan", 0.6)

	        # Cluck/click: very short, high ZCR
	        if duration < 0.12 and zcr > 0.3:
	            return ("click", 0.5)

	        return None


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 6 — Backchannel Controller
	# ═══════════════════════════════════════════════════════════════════════════

	class BackchannelController:
	    """
	    Decides when Nima should emit a backchannel while the user is speaking.

	    Two triggers exist:
	    - ON_PAUSE: the supplied audio contains a pause between PAUSE_MIN_S and
	      PAUSE_MAX_S → a soft spoken nod such as "mm-hmm"
	    - ON_EMOTION_SHIFT: the user's arousal jumps above its recent baseline
	      → a non-verbal reaction chosen from EMOTION_REACTIONS

	    At most one backchannel fires per MIN_BACKCHANNEL_INTERVAL_S.
	    """

	    MIN_BACKCHANNEL_INTERVAL_S: float = 1.5
	    PAUSE_MIN_S: float = 0.3
	    PAUSE_MAX_S: float = 0.8
	    AROUSAL_SPIKE_THRESHOLD: float = 0.3  # +0.3 arousal = spike

	    # Verbal nod options
	    VERBAL_NODS: List[str] = ["mm-hmm", "yeah", "right", "i see", "mhm", "uh-huh"]

	    # Emotion shift → non-verbal expression mapping
	    # NOTE(review): ``GASp`` is cased differently from every other member
	    # referenced here; verify it matches the NonVerbalType definition.
	    EMOTION_REACTIONS: Dict[str, NonVerbalType] = {
	        "surprise": NonVerbalType.GASp,
	        "joy": NonVerbalType.LAUGHTER,
	        "sadness": NonVerbalType.AWW,
	        "fear": NonVerbalType.GASp,
	        "anger": NonVerbalType.GROAN,
	        "neutral": NonVerbalType.MM,
	    }

	    def __init__(self, tts: CoquiXTTSBackend, nonverbal_synth: ProceduralNonVerbalSynth,
	                 sample_rate: int = 22050):
	        """Store the two synthesizers and reset throttle / arousal tracking."""
	        self._tts = tts
	        self._nonverbal = nonverbal_synth
	        self.sample_rate = sample_rate
	        self._last_backchannel_time: float = 0.0
	        self._last_arousal: float = 0.3
	        self._arousal_history: Deque[float] = deque(maxlen=10)

	    def should_backchannel(self, state: ConversationState,
	                           audio: Optional[np.ndarray] = None) -> Optional[BackchannelEvent]:
	        """
	        Decide whether a backchannel should fire right now.

	        Nothing fires while throttled or outside the USER_SPEAKING phase.
	        Otherwise the pause trigger is tried first (only when audio is
	        given), then the emotion-shift trigger.

	        Returns:
	            The BackchannelEvent to play, or None.
	        """
	        now = time.time()
	        throttled = now - self._last_backchannel_time < self.MIN_BACKCHANNEL_INTERVAL_S
	        if throttled or state.phase != ConversationPhase.USER_SPEAKING:
	            return None

	        if audio is not None and len(audio) > 0:
	            nod_event = self._pause_nod(audio, now)
	            if nod_event is not None:
	                return nod_event

	        return self._emotion_reaction(state, now)

	    def _pause_nod(self, audio: np.ndarray, now: float) -> Optional[BackchannelEvent]:
	        """ON_PAUSE trigger: synthesize a random verbal nod if a pause is detected."""
	        detector = EnergyVAD(sample_rate=16000)
	        paused, pause_len = detector.detect_pause(audio, self.PAUSE_MIN_S, self.PAUSE_MAX_S)
	        if not paused:
	            return None
	        nod = random.choice(self.VERBAL_NODS)
	        soft_voice = ProsodyParams(
	            base_pitch_hz=160, energy=0.4, warmth=0.8, breathiness=0.2,
	        )
	        rendered = self._tts.synthesize(nod, soft_voice)
	        event = BackchannelEvent(
	            trigger=BackchannelTrigger.ON_PAUSE,
	            audio=rendered,
	            is_verbal=True,
	            label=nod,
	        )
	        self._last_backchannel_time = now
	        logger.debug("[Backchannel] ON_PAUSE nod: '%s' (pause=%.2fs)", nod, pause_len)
	        return event

	    def _emotion_reaction(self, state: ConversationState, now: float) -> Optional[BackchannelEvent]:
	        """ON_EMOTION_SHIFT trigger: react when arousal spikes above its baseline."""
	        arousal_now = state.user_emotion_arousal
	        self._arousal_history.append(arousal_now)
	        if len(self._arousal_history) < 3:
	            return None
	        # Baseline excludes the two most recent readings.
	        baseline = float(np.mean(list(self._arousal_history)[:-2]))
	        spike = arousal_now - baseline
	        if not spike > self.AROUSAL_SPIKE_THRESHOLD:
	            return None
	        emotion = self._classify_emotion_shift(state.user_emotion_valence, arousal_now)
	        expr_type = self.EMOTION_REACTIONS.get(emotion, NonVerbalType.MM)
	        rendered = self._nonverbal.synth(expr_type, intensity=0.6)
	        event = BackchannelEvent(
	            trigger=BackchannelTrigger.ON_EMOTION_SHIFT,
	            audio=rendered,
	            sample_rate=self._nonverbal.sample_rate,
	            is_verbal=False,
	            label=expr_type.value,
	        )
	        self._last_backchannel_time = now
	        logger.debug("[Backchannel] ON_EMOTION_SHIFT: %s (arousal %.2f→%.2f)",
	                     expr_type.value, baseline, arousal_now)
	        return event

	    def _classify_emotion_shift(self, valence: float, arousal: float) -> str:
	        """Map a (valence, arousal) pair to one of the EMOTION_REACTIONS keys."""
	        if arousal > 0.7:
	            if valence > 0.3:
	                return "joy"
	            if valence < -0.3:
	                return "anger"
	        if arousal > 0.6:
	            return "fear" if valence < -0.2 else "surprise"
	        return "sadness" if valence < -0.3 else "neutral"


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 7 — Interruption Response
	# ═══════════════════════════════════════════════════════════════════════════

	class InterruptionResponse:
	    """
	    Picks what Nima says when the user genuinely interrupts.

	    Rather than going silent, Nima yields the floor with a short phrase:
	    - an "were you saying something?" style line when she had barely started
	    - a "please go ahead" style line when she was mid/late utterance
	    - a minimal hand-over when the user sounds urgent (high arousal)

	    A cooldown keeps her from apologising on every interrupt.
	    """

	    EARLY_RESPONSES: List[str] = [
	        "I'm sorry, were you saying something?",
	        "Oh, sorry — please, go ahead.",
	        "My apologies, you were saying?",
	    ]

	    LATE_RESPONSES: List[str] = [
	        "Sorry, please go ahead.",
	        "Go right ahead — I can wait.",
	        "Of course, after you.",
	    ]

	    URGENT_RESPONSES: List[str] = [
	        "Of course, go ahead.",
	        "Please, go on.",
	        "I'm listening — go ahead.",
	    ]

	    # Minimum spacing between two spoken responses, in seconds
	    COOLDOWN_S: float = 30.0

	    def __init__(self):
	        """Start with no response on record."""
	        self._last_response_time: float = 0.0
	        self._response_count: int = 0

	    def should_respond(self, classification: InterruptClassification) -> bool:
	        """Return True for a real interrupt that falls outside the cooldown window."""
	        if classification.interrupt_type != InterruptType.REAL_INTERRUPT:
	            return False
	        since_last = time.time() - self._last_response_time
	        return not since_last < self.COOLDOWN_S

	    def generate_response(self, classification: InterruptClassification,
	                          nima_text_progress: float,
	                          user_arousal: float = 0.3) -> str:
	        """
	        Choose a response line and record that a response was given.

	        Args:
	            classification: the interrupt classification (not inspected here)
	            nima_text_progress: 0.0 = Nima just started, 1.0 = Nima almost done
	            user_arousal: detected arousal level of the user's interrupt

	        Returns:
	            A random line from URGENT_RESPONSES when arousal exceeds 0.7,
	            else from EARLY_RESPONSES when progress is below 0.3, else from
	            LATE_RESPONSES.
	        """
	        self._last_response_time = time.time()
	        self._response_count += 1

	        if user_arousal > 0.7:
	            pool = self.URGENT_RESPONSES
	        elif nima_text_progress < 0.3:
	            pool = self.EARLY_RESPONSES
	        else:
	            pool = self.LATE_RESPONSES
	        return random.choice(pool)


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 7.5 — v2.0.0 "MIND THROUGH VOICE" MODULES
	# ═══════════════════════════════════════════════════════════════════════════
	#
	# These modules add the layers that separate a synthesizer from a voice
	# with a mind behind it: adaptive prosody, micro-intonation, affective
	# mirroring, somatic feedback, episodic memory, narrative continuity,
	# singing interjections, dynamic laughter, and refined interrupt handling.


	# ── CONVERSATIONAL FLOW ─────────────────────────────────────────────────────

	class AdaptiveProsodyShaper:
	    """
	    Dynamically adjusts pitch, rhythm, and timbre based on emotional state
	    or context. Softer tone when empathetic, brighter when excited.

	    Maps an emotional context (emotion label + valence + arousal +
	    empathy_level) to concrete modifications applied on top of a copy of
	    the base ProsodyParams.
	    """

	    # Emotion archetype → prosody deltas. ``*_mult`` entries multiply the
	    # base value, ``*_add`` entries are added to it.
	    EMOTION_PROFILES: Dict[str, Dict[str, float]] = {
	        "empathetic": {"pitch_mult": 0.92, "rate_mult": 0.88, "warmth_add": 0.20, "breathiness_add": 0.08, "energy_mult": 0.85},
	        "excited": {"pitch_mult": 1.18, "rate_mult": 1.12, "warmth_add": 0.05, "breathiness_add": -0.03, "energy_mult": 1.25},
	        "contemplative": {"pitch_mult": 0.96, "rate_mult": 0.82, "warmth_add": 0.10, "breathiness_add": 0.05, "energy_mult": 0.90},
	        "concerned": {"pitch_mult": 0.88, "rate_mult": 0.90, "warmth_add": 0.15, "breathiness_add": 0.10, "energy_mult": 0.80},
	        "joyful": {"pitch_mult": 1.10, "rate_mult": 1.08, "warmth_add": 0.12, "breathiness_add": -0.02, "energy_mult": 1.15},
	        "vulnerable": {"pitch_mult": 0.85, "rate_mult": 0.85, "warmth_add": 0.25, "breathiness_add": 0.15, "energy_mult": 0.70},
	        "assertive": {"pitch_mult": 0.98, "rate_mult": 1.05, "warmth_add": -0.05, "breathiness_add": -0.05, "energy_mult": 1.20},
	    }

	    def shape(self, base_prosody: ProsodyParams,
	              emotion: str = "neutral",
	              valence: float = 0.0,
	              arousal: float = 0.3,
	              empathy_level: float = 0.5) -> ProsodyParams:
	        """
	        Apply adaptive shaping to base prosody.

	        Args:
	            base_prosody: the starting prosody params (not modified)
	            emotion: emotion label; labels missing from EMOTION_PROFILES
	                (e.g. "neutral") skip the profile step
	            valence: [-1, 1] emotional valence
	            arousal: [0, 1] emotional arousal
	            empathy_level: [0, 1] how empathetic the response should be

	        Returns:
	            New ProsodyParams with adaptive modifications applied.
	        """
	        # Start from a field-by-field copy of the base
	        shaped = ProsodyParams(
	            base_pitch_hz=base_prosody.base_pitch_hz,
	            speech_rate_wpm=base_prosody.speech_rate_wpm,
	            energy=base_prosody.energy,
	            breathiness=base_prosody.breathiness,
	            warmth=base_prosody.warmth,
	            vibrato_depth=base_prosody.vibrato_depth,
	            pitch_variance=base_prosody.pitch_variance,
	            emotional_tone=emotion,
	        )

	        # Apply emotion profile
	        profile = self.EMOTION_PROFILES.get(emotion, {})
	        if profile:
	            shaped.base_pitch_hz *= profile.get("pitch_mult", 1.0)
	            shaped.speech_rate_wpm *= profile.get("rate_mult", 1.0)
	            shaped.energy *= profile.get("energy_mult", 1.0)
	            shaped.warmth = float(min(1.0, max(0.0, shaped.warmth + profile.get("warmth_add", 0.0))))
	            shaped.breathiness = float(min(0.5, max(0.0, shaped.breathiness + profile.get("breathiness_add", 0.0))))

	        # Valence → pitch variance (positive = more expressive). This
	        # replaces the base value rather than adjusting it.
	        shaped.pitch_variance = float(min(0.4, max(0.05, 0.15 + valence * 0.10)))

	        # Arousal → energy + rate. Both are multiplicative: the rate factor
	        # spans 0.9x (arousal 0) to 1.2x (arousal 1), so it must scale the
	        # words-per-minute value, not overwrite it.
	        shaped.energy = float(min(1.0, shaped.energy * (0.7 + arousal * 0.6)))
	        shaped.speech_rate_wpm *= (0.9 + arousal * 0.3)

	        # Empathy → warmth boost + breathiness (softer, more intimate)
	        if empathy_level > 0.5:
	            empathy_boost = (empathy_level - 0.5) * 2.0  # [0, 1]
	            shaped.warmth = float(min(1.0, shaped.warmth + 0.15 * empathy_boost))
	            shaped.breathiness = float(min(0.5, shaped.breathiness + 0.05 * empathy_boost))
	            # Up to 3% lower pitch = more intimate
	            shaped.base_pitch_hz *= (1.0 - 0.03 * empathy_boost)

	        return shaped


	class MicroIntonationInjector:
	    """
	    Sprinkles small hesitations, breaths and emphasis markers over an
	    utterance so it sounds considered rather than read out.

	    Event kinds produced by ``inject``:
	    - "hesitation": a filler prepended to the text, with a short hum
	    - "breath": a soft sigh after a word ending in a comma or period
	    - "emphasis": a pitch bump on a word from EMPHASIS_WORDS
	    - "uncertainty_drop": a falling pitch at the very end
	    """

	    # Words that tend to receive emphasis
	    EMPHASIS_WORDS: Set[str] = {
	        "really", "truly", "actually", "honestly", "important",
	        "never", "always", "exactly", "absolutely", "indeed",
	    }

	    # Hesitation markers (fillers)
	    HESITATIONS: List[str] = ["...", "um", "hmm", "well"]

	    def __init__(self, sample_rate: int = 22050):
	        """Remember the sample rate and build the synth used for breaths."""
	        self.sample_rate = sample_rate
	        self._breath_synth = ProceduralNonVerbalSynth(sample_rate)

	    def inject(self, text: str, prosody: ProsodyParams,
	               thoughtfulness: float = 0.3,
	               uncertainty: float = 0.2) -> Tuple[str, List[Dict[str, Any]]]:
	        """
	        Decorate ``text`` with micro-intonation events.

	        Args:
	            text: the input text
	            prosody: current prosody params (used for the hesitation hum)
	            thoughtfulness: [0, 1]; above 0.4 a leading filler may be
	                added, above 0.3 breaths may follow commas/periods (which
	                also collapses runs of whitespace in the returned text)
	            uncertainty: [0, 1]; above 0.5 a trailing pitch drop is added

	        Returns:
	            (modified_text, events). Each event is a dict with at least
	            "type" and "position" (0.0-1.0 through the text); hesitation
	            and breath events also carry "duration_s" and "audio".
	        """
	        modified = text
	        events: List[Dict[str, Any]] = []

	        # 1. Leading hesitation, drawn with probability == thoughtfulness
	        if thoughtfulness > 0.4 and random.random() < thoughtfulness:
	            filler = random.choice(self.HESITATIONS[:2])  # "..." or "um"
	            hum_seconds = 0.1 + thoughtfulness * 0.15
	            modified = f"{filler} {modified}"
	            events.append({
	                "type": "hesitation",
	                "position": 0.0,
	                "duration_s": hum_seconds,
	                "audio": self._gen_hesitation_audio(hum_seconds, prosody),
	            })

	        # 2. Breaths after clause/sentence-ending words
	        if thoughtfulness > 0.3:
	            tokens = modified.split()
	            breath_probability = thoughtfulness * 0.6
	            for index, token in enumerate(tokens):
	                if token.endswith((",", ".")) and random.random() < breath_probability:
	                    events.append({
	                        "type": "breath",
	                        "position": (index + 1) / len(tokens),
	                        "duration_s": 0.08,
	                        "audio": self._breath_synth.synth(NonVerbalType.SIGH, intensity=0.2),
	                    })
	            modified = " ".join(tokens)

	        # 3. Emphasis markers on key words (punctuation ignored for matching)
	        tokens = modified.split()
	        token_count = max(1, len(tokens))
	        for index, token in enumerate(tokens):
	            if token.lower().strip(".,!?;:") in self.EMPHASIS_WORDS:
	                events.append({
	                    "type": "emphasis",
	                    "position": index / token_count,
	                    "word": token,
	                    "pitch_bump": 30.0,  # Hz bump
	                })

	        # 4. Trailing pitch drop when uncertain
	        if uncertainty > 0.5:
	            events.append({
	                "type": "uncertainty_drop",
	                "position": 1.0,
	                "pitch_drop": 20.0 * uncertainty,
	            })

	        return modified, events

	    def _gen_hesitation_audio(self, duration: float, prosody: ProsodyParams) -> np.ndarray:
	        """Generate a quiet, low hum: a sine at 70% of base pitch with fades."""
	        count = int(self.sample_rate * duration)
	        timeline = np.linspace(0, duration, count, dtype=np.float64)
	        hum_hz = prosody.base_pitch_hz * 0.7
	        hum = np.sin(2 * np.pi * hum_hz * timeline) * 0.3
	        # 30 ms fade-in / 50 ms fade-out, each capped at a third of the clip
	        fade_in = min(int(0.03 * self.sample_rate), count // 3)
	        fade_out = min(int(0.05 * self.sample_rate), count // 3)
	        envelope = np.ones(count)
	        if fade_in > 0:
	            envelope[:fade_in] = np.linspace(0, 1, fade_in)
	        if fade_out > 0:
	            envelope[-fade_out:] = np.linspace(1, 0, fade_out)
	        return (hum * envelope * 0.3).astype(np.float32)


	class TurnTakingPredictor:
	    """
	    Predicts when the user is about to finish speaking, so the system
	    can smoothly take the floor instead of waiting for silence.

	    The score is a sum of independent cues:
	    - Speech rate deceleration (+0.30)
	    - Pitch declination (+0.25)
	    - A long most-recent pause (+0.20)
	    - A turn-end filler in the transcript (+0.25)
	    """

	    # Turn-end indicators (matched as whole words / whole phrases)
	    TURN_END_FILLERS: Set[str] = {
	        "you know", "so yeah", "i think", "something like that",
	        "that's about it", "yeah", "right", "anyway",
	    }

	    # Punctuation trimmed from each transcript word before filler matching
	    _WORD_PUNCTUATION: str = ".,!?;:\"()[]"

	    def __init__(self):
	        """Start with empty observation windows."""
	        self._speech_rate_history: Deque[float] = deque(maxlen=10)
	        self._pitch_history: Deque[float] = deque(maxlen=10)
	        self._pause_history: Deque[float] = deque(maxlen=5)

	    def update(self, speech_rate: float, pitch: float, pause_duration: float):
	        """Update the predictor with recent observations."""
	        self._speech_rate_history.append(speech_rate)
	        self._pitch_history.append(pitch)
	        self._pause_history.append(pause_duration)

	    def predict_turn_end_probability(self, transcript: str = "") -> float:
	        """
	        Predict the probability [0, 1] that the user is about to finish.

	        Args:
	            transcript: what the user has said so far; optional.
	        """
	        prob = 0.0
	        # 1. Speech rate deceleration: newest of the last three is 20%+ slower
	        rates = list(self._speech_rate_history)[-3:]
	        if len(rates) == 3 and rates[2] < rates[0] * 0.8:
	            prob += 0.3
	        # 2. Pitch declination: newest of the last three is 10%+ lower
	        pitches = list(self._pitch_history)[-3:]
	        if len(pitches) == 3 and pitches[2] < pitches[0] * 0.9:
	            prob += 0.25
	        # 3. Most recent pause longer than 500 ms (needs two observations)
	        if len(self._pause_history) >= 2 and self._pause_history[-1] > 0.5:
	            prob += 0.2
	        # 4. Turn-end fillers in transcript
	        if transcript and self._contains_turn_end_filler(transcript):
	            prob += 0.25
	        return float(min(1.0, prob))

	    def should_take_floor(self, transcript: str = "") -> bool:
	        """Returns True if the system should start speaking now."""
	        return self.predict_turn_end_probability(transcript) > 0.6

	    def _contains_turn_end_filler(self, transcript: str) -> bool:
	        """
	        Report whether any filler occurs as a whole word or phrase.

	        Raw substring matching would fire on words that merely contain a
	        filler ("bright" contains "right"), so the transcript is reduced to
	        space-separated, punctuation-trimmed words and each filler is
	        matched with a space on both sides.
	        """
	        words = (w.strip(self._WORD_PUNCTUATION) for w in transcript.lower().split())
	        padded = " " + " ".join(w for w in words if w) + " "
	        return any(f" {filler} " in padded for filler in self.TURN_END_FILLERS)


	# ── EMOTIONAL & COGNITIVE GROUNDING ─────────────────────────────────────────

	class AffectiveMirror:
	    """
	    Nudges Nima's voice toward the user's emotional tone without
	    imitating it outright.

	    Mapping:
	    - calm user → slightly slower, slightly warmer
	    - energetic user → slightly faster, higher pitch, a bit more energy
	    - concerned user → lower pitch, breathier, warmer
	    - joyful user → more pitch variance, slightly higher pitch
	    - anything else → base prosody unchanged ("neutral")
	    """

	    def mirror(self, user_valence: float, user_arousal: float,
	               base_prosody: ProsodyParams) -> Tuple[ProsodyParams, str]:
	        """
	        Build a mirrored copy of ``base_prosody`` for the user's state.

	        Args:
	            user_valence: the user's emotional valence
	            user_arousal: the user's emotional arousal
	            base_prosody: starting prosody (not modified)

	        Returns:
	            (mirrored_prosody, emotion_label)
	        """
	        mirrored = ProsodyParams(
	            base_pitch_hz=base_prosody.base_pitch_hz,
	            speech_rate_wpm=base_prosody.speech_rate_wpm,
	            energy=base_prosody.energy,
	            breathiness=base_prosody.breathiness,
	            warmth=base_prosody.warmth,
	            vibrato_depth=base_prosody.vibrato_depth,
	            pitch_variance=base_prosody.pitch_variance,
	        )

	        emotion = self._read_user_state(user_valence, user_arousal)
	        if emotion == "calm":
	            mirrored.speech_rate_wpm *= 0.95
	            mirrored.warmth = float(min(1.0, mirrored.warmth + 0.05))
	        elif emotion == "energetic":
	            mirrored.speech_rate_wpm *= 1.08
	            mirrored.base_pitch_hz *= 1.05
	            mirrored.energy = float(min(1.0, mirrored.energy * 1.1))
	        elif emotion == "concerned":
	            mirrored.base_pitch_hz *= 0.95
	            mirrored.breathiness = float(min(0.3, mirrored.breathiness + 0.05))
	            mirrored.warmth = float(min(1.0, mirrored.warmth + 0.10))
	        elif emotion == "joyful":
	            mirrored.pitch_variance = float(min(0.35, mirrored.pitch_variance + 0.08))
	            mirrored.base_pitch_hz *= 1.03
	        return mirrored, emotion

	    @staticmethod
	    def _read_user_state(valence: float, arousal: float) -> str:
	        """Label the user's state; the first matching rule wins."""
	        if arousal < 0.3 and abs(valence) < 0.3:
	            return "calm"
	        if arousal > 0.6 and valence > 0.3:
	            return "energetic"
	        if valence < -0.3:
	            return "concerned"
	        if valence > 0.4:
	            return "joyful"
	        return "neutral"


	class SomaticFeedbackIntegrator:
	    """
	    Ties voice modulation to system "strain" or "energy" states.
	    Like biological fatigue signals — when the system is under strain,
	    the voice becomes slightly slower, breathier, lower-pitched.

	    Reads NIMA's phenomenological_strain and allostatic_load to modulate
	    the voice. This makes the voice itself a signal of the system's
	    internal state.
	    """

	    def __init__(self):
	        """Start in a rested state: no strain, no allostatic load, full energy."""
	        self._current_strain: float = 0.0
	        self._current_energy: float = 1.0
	        self._allostatic_load: float = 0.0

	    def update_from_nima(self, strain: float, allostatic_load: float = 0.0):
	        """
	        Update the somatic state from NIMA's metrics.

	        Args:
	            strain: clamped to [0.0, 2.0] before being stored.
	            allostatic_load: clamped to [0.0, 1.0] before being stored.
	        """
	        self._current_strain = float(max(0.0, min(2.0, strain)))
	        self._allostatic_load = float(max(0.0, min(1.0, allostatic_load)))
	        # Energy falls linearly with strain and load, but never below 0.3.
	        self._current_energy = float(
	            max(0.3, 1.0 - 0.3 * self._current_strain - 0.2 * self._allostatic_load)
	        )

	    def apply_somatic_modulation(self, prosody: ProsodyParams) -> ProsodyParams:
	        """
	        Apply fatigue/strain modulation to prosody.

	        Returns the input object itself when both strain and allostatic load
	        are below 0.1; otherwise returns a modulated copy and leaves the
	        input untouched.
	        """
	        if self._current_strain < 0.1 and self._allostatic_load < 0.1:
	            return prosody  # no modulation needed
	        modulated = ProsodyParams(
	            base_pitch_hz=prosody.base_pitch_hz,
	            speech_rate_wpm=prosody.speech_rate_wpm,
	            energy=prosody.energy,
	            breathiness=prosody.breathiness,
	            warmth=prosody.warmth,
	            vibrato_depth=prosody.vibrato_depth,
	            pitch_variance=prosody.pitch_variance,
	            emotional_tone=prosody.emotional_tone,
	        )
	        # Strain → lower pitch, slower, breathier. The factors SCALE the
	        # existing values (up to -5% pitch, -10% rate); they must not replace
	        # them, so the in-place multiply is essential.
	        strain_factor = min(1.0, self._current_strain)
	        modulated.base_pitch_hz *= (1.0 - 0.05 * strain_factor)
	        modulated.speech_rate_wpm *= (1.0 - 0.10 * strain_factor)
	        modulated.breathiness = float(min(0.4, modulated.breathiness + 0.08 * strain_factor))
	        # Allostatic load → reduced energy (up to -15%), more warmth (self-soothing)
	        modulated.energy *= (1.0 - 0.15 * self._allostatic_load)
	        modulated.warmth = float(min(1.0, modulated.warmth + 0.05 * self._allostatic_load))
	        return modulated

	    @property
	    def strain(self) -> float:
	        """Current clamped strain value."""
	        return self._current_strain

	    @property
	    def energy(self) -> float:
	        """Current energy derived from strain and allostatic load."""
	        return self._current_energy


	class EmpathyPhraseGenerator:
	    """
	    Produces a short, emotion-appropriate empathy insert in place of a
	    generic nod ("mm-hmm").

	    The phrase pool is chosen by emotion label; when the label is
	    "neutral" the label is first inferred from valence and arousal.
	    """

	    # Empathy phrase templates by emotion
	    EMPATHY_PHRASES: Dict[str, List[str]] = {
	        "sadness": [
	            "That sounds really hard.",
	            "I can hear how much this weighs on you.",
	            "That must feel tough.",
	            "I'm sorry you're going through this.",
	        ],
	        "joy": [
	            "That's wonderful to hear.",
	            "I can feel your excitement.",
	            "That sounds amazing.",
	            "I love that for you.",
	        ],
	        "anger": [
	            "That sounds frustrating.",
	            "I can see why that would upset you.",
	            "That would make me angry too.",
	            "You have every right to feel that way.",
	        ],
	        "fear": [
	            "That sounds scary.",
	            "I can understand why you'd be worried.",
	            "It makes sense that you're concerned.",
	            "That's a lot to sit with.",
	        ],
	        "surprise": [
	            "Oh wow.",
	            "That's unexpected.",
	            "I didn't see that coming either.",
	            "Hmm, that's something.",
	        ],
	        "neutral": [
	            "I hear you.",
	            "I get what you mean.",
	            "That makes sense.",
	            "I'm following you.",
	            "Go on, I'm listening.",
	        ],
	    }

	    def generate(self, user_emotion: str = "neutral",
	                 user_valence: float = 0.0,
	                 user_arousal: float = 0.3) -> str:
	        """
	        Pick a random empathy phrase for the user's emotional state.

	        An explicit non-neutral label is used as given; unknown labels fall
	        back to the "neutral" pool.
	        """
	        label = user_emotion
	        if label == "neutral":
	            # No explicit emotion: infer one from the valence/arousal pair.
	            if user_valence < -0.3:
	                label = "anger" if user_arousal > 0.5 else "sadness"
	            elif user_valence > 0.4 and user_arousal > 0.6:
	                label = "joy"
	            elif user_arousal > 0.6:
	                label = "surprise"

	        pool = self.EMPATHY_PHRASES.get(label)
	        if pool is None:
	            pool = self.EMPATHY_PHRASES["neutral"]
	        return random.choice(pool)


	# ── MEMORY & CONTINUITY ─────────────────────────────────────────────────────

	@dataclass
	class VoiceEvent:
	    """
	    An episodic voice event stored in MemPalace.

	    One record per utterance: who spoke, what was said, and the affective
	    and conversational context captured at that moment. Every field has a
	    default, so an instance can be created with no arguments.
	    """
	    # Generated per instance: "ve_" followed by 12 hex characters of a UUID4.
	    event_id: str = field(default_factory=lambda: f"ve_{uuid.uuid4().hex[:12]}")
	    # Wall-clock creation time (time.time()) taken when the instance is built.
	    timestamp: float = field(default_factory=time.time)
	    speaker: str = "nima" # "nima" or "user"
	    text: str = ""
	    audio_duration_s: float = 0.0
	    # Fresh dict per instance (default_factory), so events never share state.
	    prosody_snapshot: Dict[str, float] = field(default_factory=dict)
	    # Affective tags attached to the utterance.
	    emotion: str = "neutral"
	    valence: float = 0.0
	    arousal: float = 0.3
	    strain: float = 0.0
	    # Conversational context at the time of the event.
	    # NOTE(review): presumably mirrors a ConversationPhase value — confirm.
	    conversation_phase: str = "nima_speaking"
	    interrupt_count: int = 0
	    backchannel_count: int = 0


	class VoiceEventMemoryBridge:
	    """
	    Stores every utterance as an episodic voice event with affective tags.
	    Later, the system recalls not just what was said but how it was said.

	    This bridge connects OmniVoice to NIMA's MemoryPalace. Each voice
	    event is stored as an Episode with the speaker, text, prosody, and
	    affective state — enabling later recall of vocal quality, not just
	    content.
	    """

	    def __init__(self, palace: Any = None):
	        """
	        Args:
	            palace: a NIMA MemoryPalace instance. If None, voice events
	                are stored in an in-memory list (no persistence).
	        """
	        self._palace = palace
	        # Bounded local buffer: only the 500 most recent events are kept.
	        self._local_events: Deque[VoiceEvent] = deque(maxlen=500)
	        # Lifetime counter; keeps growing after the buffer starts evicting.
	        self._event_count = 0

	    def store_voice_event(self, event: VoiceEvent) -> str:
	        """
	        Store a voice event in MemPalace (if available) + local buffer.

	        A failure while writing to the palace is logged and swallowed on
	        purpose: the event is already in the local buffer, and a memory
	        backend hiccup must not break the voice pipeline.

	        Returns:
	            The event's `event_id`.
	        """
	        self._local_events.append(event)
	        self._event_count += 1
	        # If NIMA MemoryPalace is available, store as an episode
	        if self._palace is not None:
	            try:
	                self._palace.store_episode(
	                    processor_name=f"voice_{event.speaker}",
	                    sensory_intensity=event.arousal,
	                    affective_weight=abs(event.valence) * 0.5 + event.arousal * 0.5,
	                    score=event.strain,
	                    valence=event.valence,
	                    arousal=event.arousal,
	                    novelty=0.3,  # could be computed from text novelty
	                    input_text=event.text[:500],
	                    content={
	                        "speaker": event.speaker,
	                        "audio_duration_s": event.audio_duration_s,
	                        "prosody_snapshot": event.prosody_snapshot,
	                        "emotion": event.emotion,
	                        "conversation_phase": event.conversation_phase,
	                        "interrupt_count": event.interrupt_count,
	                        "backchannel_count": event.backchannel_count,
	                        "event_type": "voice_event",
	                    },
	                )
	            except Exception as e:
	                logger.warning("[VoiceEventMemoryBridge] MemPalace store failed: %s", e)
	        return event.event_id

	    def recall_voice_events(self, speaker: Optional[str] = None,
	                            emotion: Optional[str] = None,
	                            limit: int = 5) -> List[VoiceEvent]:
	        """
	        Recall recent voice events from the local buffer, optionally filtered.

	        Args:
	            speaker: keep only events from this speaker (ignored if falsy).
	            emotion: keep only events with this emotion label (ignored if falsy).
	            limit: maximum number of events to return; zero or negative
	                yields an empty list.

	        Returns:
	            Up to `limit` matching events, oldest first, newest last.
	        """
	        # Guard explicitly: `seq[-0:]` is the WHOLE sequence, so without this
	        # a limit of 0 would return everything instead of nothing.
	        if limit <= 0:
	            return []
	        matches = [
	            e for e in self._local_events
	            if (not speaker or e.speaker == speaker)
	            and (not emotion or e.emotion == emotion)
	        ]
	        return matches[-limit:]

	    def get_stats(self) -> Dict[str, Any]:
	        """Return lifetime event count, current buffer size and palace availability."""
	        return {
	            "total_events": self._event_count,
	            "buffered_events": len(self._local_events),
	            "palace_connected": self._palace is not None,
	        }


	class NarrativeContinuityEngine:
	    """
	    References past conversations naturally. The voice stream can say
	    "As you mentioned yesterday, you sounded excited about..." because
	    it recalls the episodic voice events with their affective tags.

	    Generates narrative continuity phrases by querying VoiceEventMemoryBridge
	    for past user events.
	    """

	    # Continuity phrase templates. Not every template uses {emotion};
	    # str.format ignores the unused keyword.
	    CONTINUITY_TEMPLATES: List[str] = [
	        "Earlier you mentioned {topic}. You sounded {emotion} about it.",
	        "As you said before, {topic}. I remember how {emotion} you were.",
	        "Going back to what you said about {topic} — you seemed {emotion}.",
	        "I was thinking about what you said earlier, about {topic}.",
	        "You mentioned {topic} earlier. That stayed with me.",
	    ]

	    def __init__(self, memory_bridge: VoiceEventMemoryBridge):
	        """Keep a reference to the bridge that past user events are recalled from."""
	        self._memory = memory_bridge

	    def generate_continuity_phrase(self, current_topic: str = "",
	                                   current_emotion: str = "neutral") -> Optional[str]:
	        """
	        Generate a natural continuity phrase referencing a past voice event.

	        The newest user event whose text is longer than 10 characters is
	        chosen, excluding the most recent user event (that is what the user
	        just said). `current_topic` and `current_emotion` are accepted for
	        API compatibility but do not currently influence the selection.

	        Returns None if no suitable past event exists.
	        """
	        past_events = self._memory.recall_voice_events(
	            speaker="user", limit=10
	        )
	        if not past_events:
	            return None
	        # Walk backwards from the second-newest event and stop at the first usable one.
	        candidate = next(
	            (event for event in reversed(past_events[:-1])
	             if event.text and len(event.text) > 10),
	            None,
	        )
	        if candidate is None:
	            return None
	        topic = self._extract_topic(candidate.text)
	        emotion_word = self._emotion_to_word(candidate.emotion, candidate.valence)
	        template = random.choice(self.CONTINUITY_TEMPLATES)
	        return template.format(topic=topic, emotion=emotion_word)

	    def _extract_topic(self, text: str) -> str:
	        """
	        Extract a short topic phrase from past text.

	        Texts of up to five words are used whole; longer texts contribute a
	        fragment of up to five words taken around the middle. In both cases
	        surrounding ".,!?" is stripped, because the topic is embedded
	        mid-sentence in a template and would otherwise yield "…it!. You…".
	        """
	        words = text.split()
	        if len(words) > 5:
	            start = max(0, len(words) // 2 - 2)
	            words = words[start:start + 5]
	        return " ".join(words).strip(".,!?")

	    def _emotion_to_word(self, emotion: str, valence: float) -> str:
	        """Map an emotion label (refined by valence) to a descriptive word; unknown labels give "engaged"."""
	        mapping = {
	            "joy": "excited" if valence > 0.5 else "positive",
	            "sadness": "down" if valence < -0.3 else "thoughtful",
	            "anger": "frustrated",
	            "fear": "worried",
	            "surprise": "surprised",
	            "neutral": "engaged" if valence > 0 else "reflective",
	        }
	        return mapping.get(emotion, "engaged")


	# ── EXPRESSIVE EXTENSIONS ───────────────────────────────────────────────────

	class SingingInterjectionModule:
	    """
	    Short melodic phrases (humming, tonal affirmations) woven into speech.
	    These add a distinctive, near-human musicality to the voice.

	    Interjection types:
	    - affirmation_hum: a rising "mm-mm" confirming what was said
	    - thinking_hum: a contemplative "hmmm" while processing
	    - transition_tone: a brief melodic bridge between topics
	    - warmth_chord: a soft harmonic when expressing empathy

	    Every synth method returns mono float32 audio at `self.sample_rate`.
	    """

	    def __init__(self, sample_rate: int = 22050):
	        """Remember the sample rate and create the helper synth used for low-pass filtering."""
	        self.sample_rate = sample_rate
	        self._nonverbal = ProceduralNonVerbalSynth(sample_rate)

	    def synth_affirmation_hum(self, duration: float = 0.4) -> np.ndarray:
	        """A rising 'mm-mm' that affirms what was said."""
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Pitch glides linearly from 120 Hz up to 180 Hz over the hum.
	        f0 = 120.0 + 60.0 * (t / duration)
	        # Integrate instantaneous frequency to get a continuous phase.
	        phase = 2.0 * np.pi * np.cumsum(f0) / self.sample_rate
	        source = np.sin(phase) * 0.5
	        # Nasal filter
	        audio = self._nonverbal._lowpass(source, 1500)
	        # Envelope: fade in from 0, fade out to 0.3 (not silence)
	        env = np.ones(n)
	        attack = min(int(0.05 * self.sample_rate), n // 4)
	        release = min(int(0.08 * self.sample_rate), n // 4)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack)
	        if release > 0:
	            env[-release:] = np.linspace(1, 0.3, release)
	        return (audio * env * 0.4).astype(np.float32)

	    def synth_thinking_hum(self, duration: float = 0.6) -> np.ndarray:
	        """A contemplative 'hmmm' while processing."""
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Slightly wavering pitch: 140 Hz ± 10 Hz at 3 Hz
	        f0 = 140.0 + 10.0 * np.sin(2 * np.pi * 3.0 * t)
	        phase = 2.0 * np.pi * np.cumsum(f0) / self.sample_rate
	        source = np.sin(phase) * 0.4
	        audio = self._nonverbal._lowpass(source, 1200)
	        env = np.ones(n)
	        attack = min(int(0.08 * self.sample_rate), n // 4)
	        release = min(int(0.12 * self.sample_rate), n // 4)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack)
	        if release > 0:
	            env[-release:] = np.linspace(1, 0.4, release)
	        return (audio * env * 0.35).astype(np.float32)

	    def synth_transition_tone(self, duration: float = 0.5) -> np.ndarray:
	        """
	        A brief melodic bridge between topics.

	        Four sine notes of equal length are laid end to end, each with its
	        own short attack and release. Very short durations are handled:
	        notes too short for a ramp simply get a flat envelope.
	        """
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Pentatonic-ish rising sequence
	        notes = [220, 261, 293, 329]  # A-C-D-E
	        note_duration = duration / len(notes)
	        ramp_cap = int(0.02 * self.sample_rate)  # 20 ms attack/release ceiling
	        audio = np.zeros(n)
	        for i, freq in enumerate(notes):
	            start = int(i * note_duration * self.sample_rate)
	            end = min(n, int((i + 1) * note_duration * self.sample_rate))
	            note_len = end - start
	            # Each note restarts its phase at zero (time axis taken from t's head).
	            note_t = t[:note_len]
	            note_audio = np.sin(2 * np.pi * freq * note_t) * 0.3
	            # Soft attack/release per note, each at most a third of the note
	            ramp = min(ramp_cap, note_len // 3)
	            note_env = np.ones(note_len)
	            # Guard on ramp > 0: `note_env[-0:]` addresses the WHOLE array, so
	            # assigning an empty ramp to it would raise for tiny notes.
	            if ramp > 0:
	                note_env[:ramp] = np.linspace(0, 1, ramp)
	                note_env[-ramp:] = np.linspace(1, 0.3, ramp)
	            audio[start:end] = note_audio * note_env
	        return (audio * 0.3).astype(np.float32)

	    def synth_warmth_chord(self, duration: float = 0.8) -> np.ndarray:
	        """A soft harmonic chord when expressing empathy."""
	        n = int(self.sample_rate * duration)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Major triad: C-E-G (130, 165, 196 Hz); divide by the weight sum (2.2)
	        chord = (np.sin(2 * np.pi * 130 * t) +
	                 0.7 * np.sin(2 * np.pi * 165 * t) +
	                 0.5 * np.sin(2 * np.pi * 196 * t)) / 2.2
	        audio = self._nonverbal._lowpass(chord, 800)
	        env = np.ones(n)
	        attack = min(int(0.15 * self.sample_rate), n // 3)
	        release = min(int(0.25 * self.sample_rate), n // 3)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack)
	        if release > 0:
	            env[-release:] = np.linspace(1, 0.2, release)
	        return (audio * env * 0.25).astype(np.float32)


	@dataclass
	class MultimodalCue:
	    """
	    A non-audio cue paired with a voice event.

	    Pure metadata: it describes what an external actuator or display
	    should do; nothing in this record produces output by itself.
	    """
	    cue_type: str  # "haptic" | "visual" | "light"
	    intensity: float = 0.5
	    duration_s: float = 0.3
	    pattern: str = "pulse"  # "pulse" | "wave" | "steady"
	    # Wall-clock creation time (time.time()) taken when the cue is built.
	    timestamp: float = field(default_factory=time.time)


	class MultimodalCueEmitter:
	    """
	    Pairs voice with subtle haptic or visual signals.
	    Example: a soft vibration or light pulse when nodding.

	    This module emits cue events that an external system (robotics,
	    display, haptic actuator) can consume. It doesn't produce audio
	    itself — it produces cue metadata synchronized to voice events.
	    """

	    def __init__(self):
	        """Start with an empty, bounded cue history (latest 100) and no callback."""
	        self._cue_history: Deque[MultimodalCue] = deque(maxlen=100)
	        self._cue_callback: Optional[Callable[[MultimodalCue], None]] = None

	    def set_callback(self, callback: Callable[[MultimodalCue], None]):
	        """Set a callback to receive cues in real-time."""
	        self._cue_callback = callback

	    def emit_for_backchannel(self, is_verbal: bool, intensity: float = 0.5):
	        """
	        Emit a short haptic pulse when a backchannel is emitted.

	        `is_verbal` is accepted but does not currently change the cue.
	        """
	        cue = MultimodalCue(
	            cue_type="haptic",
	            intensity=0.3 + intensity * 0.3,
	            duration_s=0.2,
	            pattern="pulse",
	        )
	        self._emit(cue)

	    def emit_for_empathy(self, emotion: str = "neutral"):
	        """Emit a light wave when an empathy phrase is spoken (stronger for sadness/fear)."""
	        intensity = 0.4 if emotion in ("sadness", "fear") else 0.3
	        cue = MultimodalCue(
	            cue_type="light",
	            intensity=intensity,
	            duration_s=0.5,
	            pattern="wave",
	        )
	        self._emit(cue)

	    def emit_for_laughter(self, intensity: float = 0.7):
	        """Emit a haptic pulse when laughter is emitted."""
	        cue = MultimodalCue(
	            cue_type="haptic",
	            intensity=0.4 + intensity * 0.4,
	            duration_s=0.3,
	            pattern="pulse",
	        )
	        self._emit(cue)

	    def _emit(self, cue: MultimodalCue):
	        """Record the cue, then hand it to the callback if one is set."""
	        self._cue_history.append(cue)
	        if self._cue_callback:
	            # A faulty consumer must not break the voice pipeline: log and go on.
	            try:
	                self._cue_callback(cue)
	            except Exception as e:
	                logger.warning("[MultimodalCueEmitter] callback failed: %s", e)

	    def get_recent_cues(self, n: int = 10) -> List[MultimodalCue]:
	        """
	        Return up to the `n` most recent cues, oldest first.

	        Zero or negative `n` yields an empty list.
	        """
	        # Guard explicitly: `seq[-0:]` is the whole sequence and a negative n
	        # would flip the slice into "skip the first |n|".
	        if n <= 0:
	            return []
	        return list(self._cue_history)[-n:]


	class DynamicLaughterSynth:
	    """
	    Procedural laughter that adapts to intensity.
	    Chuckle (low intensity) → full laugh (high intensity).

	    Instead of fixed samples, scales:
	    - Number of "ha" bursts
	    - Pitch (higher for chuckle, lower for full laugh)
	    - Energy
	    - Breathiness
	    """

	    def __init__(self, sample_rate: int = 22050):
	        """Remember the sample rate and create the helper synth used for filtering and breaths."""
	        self.sample_rate = sample_rate
	        self._nonverbal = ProceduralNonVerbalSynth(sample_rate)

	    def synth(self, intensity: float = 0.5,
	              duration: Optional[float] = None) -> np.ndarray:
	        """
	        Synthesize adaptive laughter.

	        Args:
	            intensity: [0, 1] 0.2 = chuckle, 0.5 = normal laugh, 0.9 = full laugh.
	                Values are clamped to [0.1, 1.0].
	            duration: if given and positive, the result is fitted to exactly
	                this many seconds (trimmed with a short fade-out, or padded
	                with silence). None or 0 keeps the natural length.

	        Returns:
	            Laughter audio (float32), peak-normalized to 0.7 * intensity.
	        """
	        intensity = float(max(0.1, min(1.0, intensity)))

	        # Scale parameters by intensity
	        if intensity < 0.3:
	            # Chuckle: 2-3 "ha"s, higher pitch, quiet
	            n_has = random.randint(2, 3)
	            ha_period = 0.12
	            pitch = 240 + random.uniform(-20, 20)
	            energy = 0.4
	        elif intensity < 0.6:
	            # Normal laugh: 4-6 "ha"s
	            n_has = random.randint(4, 6)
	            ha_period = 0.10
	            pitch = 180 + random.uniform(-15, 15)
	            energy = 0.6
	        else:
	            # Full laugh: 6-9 "ha"s, lower pitch, loud
	            n_has = random.randint(6, 9)
	            ha_period = 0.09
	            pitch = 150 + random.uniform(-10, 10)
	            energy = 0.8

	        chunks = []
	        for i in range(n_has):
	            # Each period is 70% voiced burst followed by 30% silence.
	            ha = self._gen_ha(ha_period * 0.7, energy, pitch)
	            gap = np.zeros(int(self.sample_rate * ha_period * 0.3))
	            # Decay slightly across the laugh (down to 80% on the last burst)
	            decay = 1.0 - 0.2 * (i / max(1, n_has - 1))
	            chunks.append(ha * decay)
	            chunks.append(gap)
	        # Add trailing breath (at most 0.3 s of a soft sigh)
	        if intensity > 0.5:
	            breath = self._nonverbal.synth(NonVerbalType.SIGH, intensity=0.3)
	            chunks.append(breath[:int(self.sample_rate * 0.3)])
	        audio = np.concatenate(chunks) if chunks else np.zeros(0)

	        # Honour the explicit duration override before normalizing, so the
	        # peak level is computed on what is actually returned.
	        if duration is not None and duration > 0:
	            audio = self._fit_to_duration(audio, duration)

	        # Normalize
	        max_val = float(np.max(np.abs(audio))) if len(audio) > 0 else 0.0
	        if max_val > 0:
	            audio = audio / max_val * 0.7 * intensity
	        return audio.astype(np.float32)

	    def _fit_to_duration(self, audio: np.ndarray, duration: float) -> np.ndarray:
	        """Trim (with a 10 ms fade-out) or zero-pad `audio` to exactly `duration` seconds."""
	        target = int(self.sample_rate * duration)
	        if len(audio) <= target:
	            padding = np.zeros(target - len(audio), dtype=audio.dtype)
	            return np.concatenate([audio, padding])
	        fitted = audio[:target].copy()
	        # Fade the cut edge so truncating mid-burst does not click.
	        fade = min(int(0.01 * self.sample_rate), target)
	        if fade > 0:
	            fitted[-fade:] *= np.linspace(1.0, 0.0, fade)
	        return fitted

	    def _gen_ha(self, duration: float, intensity: float, pitch: float) -> np.ndarray:
	        """Generate a single 'ha' burst: harmonic tone mixed with noise, band-passed and enveloped."""
	        n = int(self.sample_rate * duration)
	        if n < 2:
	            # Too short to shape; return (at least two samples of) silence.
	            return np.zeros(max(2, n), dtype=np.float32)
	        t = np.linspace(0, duration, n, dtype=np.float64)
	        # Glottal source: fundamental plus harmonics 2-4 at falling weights
	        phase = 2.0 * np.pi * pitch * t
	        source = np.sin(phase)
	        for h in range(2, 5):
	            source += (0.4 / h) * np.sin(phase * h)
	        source /= 3.0
	        # Breathy noise
	        noise = np.random.normal(0, 0.3, n)
	        mixed = source * 0.6 + noise * 0.4
	        mixed = self._nonverbal._bandpass(mixed, 400, 3000)
	        # Envelope: fast 10 ms attack, 40 ms release down to silence
	        env = np.ones(n)
	        attack = min(int(0.01 * self.sample_rate), n // 4)
	        release = min(int(0.04 * self.sample_rate), n // 4)
	        if attack > 0:
	            env[:attack] = np.linspace(0, 1, attack)
	        if release > 0:
	            env[-release:] = np.linspace(1, 0, release)
	        return (mixed * env * intensity).astype(np.float32)


	# ── INTERRUPT HANDLING REFINEMENT ───────────────────────────────────────────

	class ContextAwareApologyGenerator:
	    """
	    Chooses an apology whose weight matches how serious the interruption was.

	    Casual: "Sorry, please go ahead"
	    Serious: "I didn't mean to cut you off, please continue"

	    Seriousness is judged from:
	    - How far into the utterance the interrupt occurred (early = more serious)
	    - User's arousal (high = more serious)
	    - Frequency of interrupts (repeated = more serious)
	    """

	    CASUAL_RESPONSES: List[str] = [
	        "Sorry, please go ahead.",
	        "Go right ahead.",
	        "After you.",
	        "Of course — go on.",
	    ]

	    SERIOUS_RESPONSES: List[str] = [
	        "I'm sorry, I didn't mean to cut you off. Please continue.",
	        "My apologies — please, go ahead, I'm listening.",
	        "I'm sorry, were you saying something? Please, continue.",
	        "Forgive me — I didn't mean to interrupt. What were you saying?",
	    ]

	    URGENT_RESPONSES: List[str] = [
	        "Of course, go ahead.",
	        "Please, go on.",
	        "I'm listening.",
	    ]

	    # Minimum number of seconds between two spoken apologies.
	    COOLDOWN_S: float = 15.0

	    def __init__(self):
	        """Start with no apology spoken yet and an empty (max 10) interrupt log."""
	        self._last_response_time: float = 0.0
	        self._interrupt_history: Deque[float] = deque(maxlen=10)

	    def generate(self, nima_text_progress: float, user_arousal: float = 0.3,
	                 interrupt_count: int = 0) -> str:
	        """
	        Log the interrupt and return a context-appropriate apology.

	        Urgency (arousal above 0.8) takes precedence over seriousness.
	        `interrupt_count` is accepted but repetition is judged from this
	        object's own log of interrupts in the last 60 seconds.
	        """
	        timestamp = time.time()
	        self._interrupt_history.append(timestamp)

	        # How many logged interrupts (including this one) fall within 60 s.
	        recent_interrupts = len(
	            [stamp for stamp in self._interrupt_history if timestamp - stamp < 60.0]
	        )

	        cut_off_early = nima_text_progress < 0.2
	        user_agitated = user_arousal > 0.7
	        interrupted_repeatedly = recent_interrupts > 2

	        # Calling generate() counts as responding, which restarts the cooldown.
	        self._last_response_time = timestamp

	        if user_arousal > 0.8:
	            pool = self.URGENT_RESPONSES
	        elif cut_off_early or user_agitated or interrupted_repeatedly:
	            pool = self.SERIOUS_RESPONSES
	        else:
	            pool = self.CASUAL_RESPONSES
	        return random.choice(pool)

	    def should_respond(self, interrupt_type: InterruptType) -> bool:
	        """Say whether to apologize: only for real interrupts, and only once the cooldown has passed."""
	        if interrupt_type != InterruptType.REAL_INTERRUPT:
	            return False
	        since_last_apology = time.time() - self._last_response_time
	        return not (since_last_apology < self.COOLDOWN_S)


	class NonBlockingContinuationManager:
	    """
	    Keeps the voice stream flowing even after acknowledging an interrupt.
	    Instead of stopping entirely, the system:
	    1. Pauses briefly (200ms)
	    2. Speaks the apology ("Sorry, please go ahead")
	    3. Yields the floor but remains ready to resume

	    This makes the interaction feel conversational rather than mechanical.
	    """

	    PAUSE_BEFORE_APOLOGY_S: float = 0.2
	    RESUME_THRESHOLD_S: float = 1.5  # if user doesn't speak for 1.5s, resume

	    def __init__(self):
	        """Start un-paused with nothing deferred."""
	        self._is_paused: bool = False
	        # Start of the current silence window: set when the floor is yielded
	        # and pushed forward every time the user is observed speaking.
	        self._pause_start: float = 0.0
	        self._deferred_text: str = ""
	        self._deferred_position: int = 0  # character position to resume from

	    def yield_floor(self, deferred_text: str, position: int):
	        """Yield the floor but remember where to resume from."""
	        self._is_paused = True
	        self._pause_start = time.time()
	        self._deferred_text = deferred_text
	        self._deferred_position = position

	    def should_resume(self, user_speaking: bool) -> bool:
	        """
	        Check if the system should resume its deferred utterance.

	        Returns True once, after the user has been silent for more than
	        RESUME_THRESHOLD_S; the paused flag is cleared at that point.
	        """
	        if not self._is_paused:
	            return False
	        now = time.time()
	        if user_speaking:
	            # The user holds the floor: restart the silence window so that a
	            # long user turn followed by a short breath does not trigger an
	            # immediate resume.
	            self._pause_start = now
	            return False
	        if now - self._pause_start > self.RESUME_THRESHOLD_S:
	            self._is_paused = False
	            return True
	        return False

	    def get_resume_text(self) -> Optional[str]:
	        """
	        Get the text to resume (from the deferred position).

	        Returns "As I was saying, …" followed by the unspoken remainder, or
	        None when nothing (or only whitespace) is left to say.
	        """
	        if not self._deferred_text:
	            return None
	        # A negative position would slice from the end; treat it as "from the start".
	        start = max(0, self._deferred_position)
	        remaining = self._deferred_text[start:].lstrip()
	        if not remaining:
	            return None
	        return f"As I was saying, {self._soften_leading_capital(remaining)}"

	    @staticmethod
	    def _soften_leading_capital(text: str) -> str:
	        """
	        Lower-case only the first letter so the text reads as a continuation.

	        The rest of the text keeps its casing (names, acronyms, "I"). The
	        first word itself is left alone when it is the pronoun "I" (also
	        "I'm", "I'll", …) or starts with two capitals (likely an acronym).
	        """
	        first_word = text.split(None, 1)[0]
	        is_pronoun_i = first_word.rstrip(".,!?;:") == "I" or first_word.startswith(("I'", "I\u2019"))
	        looks_like_acronym = len(first_word) >= 2 and first_word[:2].isupper()
	        if is_pronoun_i or looks_like_acronym:
	            return text
	        return text[0].lower() + text[1:]

	    @property
	    def is_paused(self) -> bool:
	        """True while the floor is yielded and the deferred text has not been resumed."""
	        return self._is_paused


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 8 — OmniVoice Engine (main orchestrator)
	# ═══════════════════════════════════════════════════════════════════════════

	class OmniVoiceEngine:
	    """
	    The main OmniVoice engine. Orchestrates ASR, TTS, non-verbal synthesis,
	    backchannel emission, and interrupt handling into a unified real-time
	    voice conversation system.

	    Usage:
	        engine = OmniVoiceEngine()
	        async for audio_chunk in engine.stream("Hello, how are you?"):
	            play(audio_chunk)
	    """

	    # Sample rate assumed for incoming user audio frames; used both to size
	    # the rolling buffer and when handing audio to the interrupt detector.
	    USER_AUDIO_SAMPLE_RATE: int = 16000
	    # Seconds of user audio retained in the rolling buffer.
	    USER_AUDIO_BUFFER_S: float = 2.0
	    # User audio shorter than this (seconds) is not classified at all.
	    MIN_INTERRUPT_AUDIO_S: float = 0.1

	    def __init__(self,
	                 whisper_model: str = "base",
	                 coqui_model: str = "tts_models/multilingual/multi-dataset/xtts_v2",
	                 speaker_wav: Optional[str] = None,
	                 language: str = "en",
	                 sample_rate: int = 22050,
	                 palace: Any = None):
	        """
	        Build every backend and module the engine orchestrates.

	        Args:
	            whisper_model: model name passed to WhisperASR.
	            coqui_model: model name passed to CoquiXTTSBackend.
	            speaker_wav: optional reference recording passed to the TTS backend.
	            language: language code passed to the TTS backend.
	            sample_rate: output sample rate (Hz) used for chunking and synthesis helpers.
	            palace: optional memory backend handed to VoiceEventMemoryBridge.
	        """
	        logger.info("[OmniVoice] initializing v%s...", OMNIVOICE_VERSION)

	        self.sample_rate = sample_rate

	        # Initialize backends
	        self.asr = WhisperASR(model_name=whisper_model)
	        self.tts = CoquiXTTSBackend(model_name=coqui_model, speaker_wav=speaker_wav,
	                                    language=language)
	        self.nonverbal = ProceduralNonVerbalSynth(sample_rate=sample_rate)
	        self.backchannel = BackchannelController(self.tts, self.nonverbal, sample_rate)
	        self.interrupt_detector = InterruptDetector(self.asr)
	        self.interrupt_response = InterruptionResponse()

	        # ── v2.0.0 "Mind Through Voice" modules ──
	        self.prosody_shaper = AdaptiveProsodyShaper()
	        self.micro_intonation = MicroIntonationInjector(sample_rate)
	        self.turn_predictor = TurnTakingPredictor()
	        self.affective_mirror = AffectiveMirror()
	        self.somatic_integrator = SomaticFeedbackIntegrator()
	        self.empathy_generator = EmpathyPhraseGenerator()
	        self.voice_memory = VoiceEventMemoryBridge(palace=palace)
	        self.narrative_engine = NarrativeContinuityEngine(self.voice_memory)
	        self.singing = SingingInterjectionModule(sample_rate)
	        self.multimodal = MultimodalCueEmitter()
	        self.dynamic_laughter = DynamicLaughterSynth(sample_rate)
	        self.apology_generator = ContextAwareApologyGenerator()
	        self.continuation_manager = NonBlockingContinuationManager()

	        # State
	        self.state = ConversationState()
	        self._nima_audio_queue: Deque[np.ndarray] = deque()
	        self._user_audio_buffer: List[np.ndarray] = []
	        # Guards _user_audio_buffer, which the collector task fills and
	        # _check_for_interrupt drains.
	        self._lock = threading.Lock()

	        logger.info("[OmniVoice] ready (ASR=%s, TTS=%s)",
	                    self.asr.mode.value, self.tts.mode.value)

	    def update_prosody_from_nima(self, snapshot: Any) -> ProsodyParams:
	        """
	        Update prosody parameters from a NIMA ConsciousnessSnapshot.
	        This is the NIMA integration point — when NIMA is ready, pass its
	        snapshot here to drive voice prosody in real-time.

	        Missing or falsy snapshot sections are skipped and leave the
	        corresponding defaults in place. Any mapping error is logged and
	        whatever was mapped so far is returned.
	        """
	        prosody = ProsodyParams()
	        if snapshot is None:
	            return prosody
	        try:
	            # Map NIMA phi → energy
	            if hasattr(snapshot, "phi") and snapshot.phi:
	                prosody.energy = float(max(0.3, min(1.0, 0.5 + snapshot.phi.phi_composite * 0.5)))
	            # Map NIMA rho → warmth
	            if hasattr(snapshot, "rho") and snapshot.rho:
	                prosody.warmth = float(max(0.2, min(1.0, snapshot.rho.integrity)))
	            # Map NIMA emotion → pitch + tone
	            if hasattr(snapshot, "emotion") and snapshot.emotion:
	                prosody.base_pitch_hz = 180.0 + (snapshot.emotion.arousal - 0.3) * 60.0
	                prosody.emotional_tone = getattr(snapshot.emotion, "label", "neutral")
	                if snapshot.emotion.valence < -0.3:
	                    prosody.pitch_variance = 0.08  # flat for sad
	                elif snapshot.emotion.valence > 0.3:
	                    prosody.pitch_variance = 0.25  # expressive for happy
	            # Map qualia authenticity → breathiness
	            if hasattr(snapshot, "qualia") and snapshot.qualia:
	                prosody.breathiness = float(max(0.05, 0.3 - snapshot.qualia.authenticity_index * 0.25))
	        except Exception as e:
	            logger.warning("[OmniVoice] NIMA snapshot mapping failed: %s", e)
	        return prosody

	    async def stream(self, text: str,
	                     prosody: Optional[ProsodyParams] = None,
	                     user_audio_stream: Optional[AsyncGenerator[np.ndarray, None]] = None,
	                     ) -> AsyncGenerator[np.ndarray, None]:
	        """
	        Stream synthesized speech for `text`, yielding audio chunks.
	        If `user_audio_stream` is provided, simultaneously monitors for
	        interrupts.

	        Args:
	            text: text to synthesize
	            prosody: prosody parameters (if None, uses defaults)
	            user_audio_stream: async generator of user audio frames
	                (for real-time interrupt detection)

	        Yields:
	            Audio chunks (float32 numpy arrays at self.sample_rate Hz), 50 ms each
	            (the last one may be shorter). On a real interrupt the current
	            chunk is followed by the spoken interruption response, then the
	            stream ends.
	        """
	        prosody = prosody or ProsodyParams()
	        self.state.phase = ConversationPhase.NIMA_SPEAKING
	        self.state.nima_speech_start = time.time()
	        self.state.current_text = text
	        self.state.current_text_position = 0.0

	        # Synthesize the full utterance
	        full_audio = self.tts.synthesize(text, prosody)
	        total_samples = len(full_audio)
	        if total_samples == 0:
	            self.state.phase = ConversationPhase.IDLE
	            return

	        # 50ms chunks; never below one sample so the range() step stays valid.
	        chunk_size = max(1, int(self.sample_rate * 0.05))

	        # If no user audio stream, just stream the audio
	        if user_audio_stream is None:
	            for i in range(0, total_samples, chunk_size):
	                chunk = full_audio[i:i + chunk_size]
	                self.state.current_text_position = min(1.0, (i + chunk_size) / total_samples)
	                yield chunk
	            self.state.phase = ConversationPhase.IDLE
	            return

	        # ── Real-time mode: stream audio + monitor user ──
	        user_audio_task = asyncio.create_task(self._collect_user_audio(user_audio_stream))
	        try:
	            for i in range(0, total_samples, chunk_size):
	                chunk = full_audio[i:i + chunk_size]
	                # Progress is measured in samples, exactly as in the
	                # non-real-time path, so it reaches 1.0 only on the final chunk.
	                self.state.current_text_position = min(1.0, (i + chunk_size) / total_samples)
	                self.state.nima_speech_duration = time.time() - self.state.nima_speech_start

	                # Note: backchannels are meant to be emitted DURING Nima's
	                # speech if the user is also speaking (overlap); that check
	                # is not performed here — see emit_backchannel().

	                # Check for interrupts
	                remaining_s = (total_samples - i) / self.sample_rate
	                interrupt = self._check_for_interrupt(remaining_s)
	                if interrupt and self.interrupt_response.should_respond(interrupt):
	                    # Yield the current chunk + interruption response
	                    response_text = self.interrupt_response.generate_response(
	                        interrupt, self.state.current_text_position,
	                        self.state.user_emotion_arousal,
	                    )
	                    response_audio = self.tts.synthesize(response_text, ProsodyParams(
	                        base_pitch_hz=200, energy=0.6, warmth=0.8,
	                    ))
	                    yield chunk  # yield current chunk
	                    # Yield response in smaller chunks
	                    for j in range(0, len(response_audio), chunk_size):
	                        yield response_audio[j:j + chunk_size]
	                    self.state.phase = ConversationPhase.YIELDING
	                    self.state.interrupt_count += 1
	                    logger.info("[OmniVoice] interrupted at %.0f%%: '%s'",
	                                self.state.current_text_position * 100, response_text)
	                    return  # Stop streaming Nima's audio

	                yield chunk

	            # Finished speaking without interruption
	            self.state.phase = ConversationPhase.IDLE
	        finally:
	            # Always stop the collector, including on interrupt or early close.
	            user_audio_task.cancel()
	            try:
	                await user_audio_task
	            except asyncio.CancelledError:
	                pass

	    async def _collect_user_audio(self, stream: AsyncGenerator[np.ndarray, None]):
	        """Background task: collect user audio for interrupt detection."""
	        # Keep only the most recent USER_AUDIO_BUFFER_S seconds of frames.
	        max_samples = int(self.USER_AUDIO_SAMPLE_RATE * self.USER_AUDIO_BUFFER_S)
	        try:
	            async for frame in stream:
	                with self._lock:
	                    self._user_audio_buffer.append(frame)
	                    total = sum(len(f) for f in self._user_audio_buffer)
	                    while total > max_samples and self._user_audio_buffer:
	                        removed = self._user_audio_buffer.pop(0)
	                        total -= len(removed)
	        except asyncio.CancelledError:
	            pass

	    def _check_for_interrupt(self, remaining_s: float) -> Optional[InterruptClassification]:
	        """
	        Check if there's an interrupt in the buffered user audio.

	        Takes the last five buffered frames, empties the buffer, and asks
	        the interrupt detector to classify them. Only a REAL_INTERRUPT
	        classification is returned; everything else yields None.
	        """
	        with self._lock:
	            if not self._user_audio_buffer:
	                return None
	            audio = np.concatenate(self._user_audio_buffer[-5:])  # last ~500ms
	            self._user_audio_buffer.clear()
	        if len(audio) < int(self.USER_AUDIO_SAMPLE_RATE * self.MIN_INTERRUPT_AUDIO_S):  # <100ms
	            return None
	        classification = self.interrupt_detector.classify(
	            audio, sample_rate=self.USER_AUDIO_SAMPLE_RATE,
	            nima_text_progress=self.state.current_text_position,
	            nima_speech_remaining_s=remaining_s,
	        )
	        if classification.interrupt_type == InterruptType.REAL_INTERRUPT:
	            return classification
	        # Log ignored interrupts (backchannels, non-verbals)
	        if classification.interrupt_type != InterruptType.SILENCE:
	            logger.debug("[OmniVoice] ignored %s: %s",
	                         classification.interrupt_type.value, classification.reason)
	        return None

	    def emit_backchannel(self, user_audio: np.ndarray) -> Optional[BackchannelEvent]:
	        """
	        Check if a backchannel should be emitted while the user is speaking.
	        Call this with recent user audio frames.

	        Returns a BackchannelEvent if one should fire, else None.
	        """
	        return self.backchannel.should_backchannel(self.state, user_audio)

	    def synth_non_verbal(self, expr_type: NonVerbalType, intensity: float = 0.7) -> np.ndarray:
	        """Synthesize a non-verbal expression directly."""
	        return self.nonverbal.synth(expr_type, intensity)

	    def get_stats(self) -> Dict[str, Any]:
	        """Return a snapshot of engine configuration, conversation state and v2 module status."""
	        return {
	            "version": OMNIVOICE_VERSION,
	            "asr_mode": self.asr.mode.value,
	            "tts_mode": self.tts.mode.value,
	            "sample_rate": self.sample_rate,
	            "conversation_state": {
	                "phase": self.state.phase.value,
	                "interrupt_count": self.state.interrupt_count,
	                "backchannel_count": self.state.backchannel_count,
	            },
	            # v2.0.0 module stats
	            "v2_modules": {
	                "prosody_shaper": "active",
	                "micro_intonation": "active",
	                "turn_predictor": "active",
	                "affective_mirror": "active",
	                "somatic_integrator": {
	                    "strain": self.somatic_integrator.strain,
	                    "energy": self.somatic_integrator.energy,
	                },
	                "empathy_generator": "active",
	                "voice_memory": self.voice_memory.get_stats(),
	                "narrative_engine": "active",
	                "singing_interjections": "active",
	                # Number of cues currently held in the emitter's history.
	                "multimodal_cues": len(self.multimodal.get_recent_cues(1000)),
	                "dynamic_laughter": "active",
	                "apology_generator": "active",
	                "continuation_manager": {
	                    "is_paused": self.continuation_manager.is_paused,
	                },
	            },
	        }


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 9 — NIMA Voice Adapter
	# ═══════════════════════════════════════════════════════════════════════════

	class NimaVoiceAdapter:
	    """
	    Bridges NIMA's ConsciousnessSnapshot → OmniVoice prosody params.
	    Also bridges NIMA's CTM tournament + MemoryPalace episodes → voice context.

	    v2.0.0: Now integrates ALL "mind through voice" modules:
	    - AdaptiveProsodyShaper (emotion → prosody dynamics)
	    - AffectiveMirror (mirrors user's emotional tone)
	    - SomaticFeedbackIntegrator (strain → voice fatigue)
	    - VoiceEventMemoryBridge (stores voice events in MemPalace)
	    - NarrativeContinuityEngine (references past conversations)

	    Fields that are present but ``None`` on a snapshot, phi/rho object or
	    episode dict are treated like missing fields (defaults apply) instead
	    of raising ``TypeError``.

	    Usage:
	        adapter = NimaVoiceAdapter(engine)
	        prosody = adapter.snapshot_to_prosody(nima_snapshot)
	        async for chunk in engine.stream(text, prosody=prosody):
	            ...

	    Full NIMA + CTM + MemPalace integration:
	        # After NIMA's process_stimulus():
	        adapter.update_from_snapshot(snapshot)
	        adapter.update_from_ctm_winner(ctm_winner)
	        adapter.update_somatic_from_nima(snapshot.phi, snapshot.rho)
	        prosody = adapter.get_contextual_prosody()
	        # After speaking:
	        adapter.store_voice_event(text, prosody, duration_s)
	    """

	    def __init__(self, engine: OmniVoiceEngine):
	        """Wrap ``engine`` and start from a neutral user-emotion estimate."""
	        self._engine = engine
	        self._last_snapshot: Any = None
	        self._last_ctm_winner: Optional[Dict[str, Any]] = None
	        self._last_episode_context: Optional[Dict[str, Any]] = None
	        self._user_emotion: str = "neutral"
	        self._user_valence: float = 0.0
	        self._user_arousal: float = 0.3

	    @staticmethod
	    def _as_float(value: Any, default: float) -> float:
	        """Return ``float(value)``, or ``default`` when ``value`` is None."""
	        return default if value is None else float(value)

	    def update_from_snapshot(self, snapshot: Any) -> ProsodyParams:
	        """
	        Update engine prosody from a NIMA ConsciousnessSnapshot.

	        Remembers the snapshot, refreshes the cached user valence, arousal
	        and emotion label from ``snapshot.emotion`` when it is present, and
	        returns whatever ``engine.update_prosody_from_nima`` produces.
	        """
	        self._last_snapshot = snapshot
	        # Extract user emotion from snapshot (if available). getattr with a
	        # default also covers snapshot=None.
	        emotion = getattr(snapshot, "emotion", None)
	        if emotion is not None:
	            self._user_valence = self._as_float(getattr(emotion, "valence", None), 0.0)
	            self._user_arousal = self._as_float(getattr(emotion, "arousal", None), 0.3)
	            # A None/empty label falls back to "neutral" so downstream
	            # comparisons against "neutral" keep working.
	            self._user_emotion = getattr(emotion, "label", None) or "neutral"
	        return self._engine.update_prosody_from_nima(snapshot)

	    def update_from_ctm_winner(self, ctm_winner: Optional[Dict[str, Any]]) -> None:
	        """
	        Update engine context from a CTM tournament winner.
	        The winning processor's character influences voice style:
	        - memory_palace → warmer, more nostalgic
	        - somatic_registry → more emotionally resonant
	        - wernicke → clearer, more articulate
	        - broca → faster, more fluent

	        Passing None clears the stored winner.
	        """
	        self._last_ctm_winner = ctm_winner
	        if ctm_winner is None:
	            return
	        logger.debug("[NimaVoiceAdapter] CTM winner: %s (score=%.3f)",
	                     ctm_winner.get("processor_name", "?"),
	                     ctm_winner.get("score", 0.0))

	    def update_somatic_from_nima(self, phi: Any, rho: Any) -> None:
	        """
	        Update the somatic feedback integrator from NIMA's phi + rho.
	        Ties voice modulation to system strain (biological fatigue signals).

	        Strain is read from ``phi.phenomenological_strain`` and the
	        allostatic-load approximation from ``rho.dissonance``; either one
	        defaults to 0.0 when the object or attribute is missing or None.
	        """
	        strain = self._as_float(getattr(phi, "phenomenological_strain", None), 0.0)
	        # Allostatic load approximation from rho dissonance
	        allostatic = self._as_float(getattr(rho, "dissonance", None), 0.0)
	        self._engine.somatic_integrator.update_from_nima(strain, allostatic)

	    def update_from_episode(self, episode: Optional[Dict[str, Any]]) -> None:
	        """
	        Update engine context from a MemoryPalace episode.
	        If the episode has high strain or negative valence, the voice
	        should reflect that (lower pitch, more breathiness).

	        Passing None clears the stored episode context.
	        """
	        self._last_episode_context = episode
	        if episode is None:
	            return
	        logger.debug("[NimaVoiceAdapter] episode context updated: valence=%.2f",
	                     episode.get("valence", 0.0))

	    def get_contextual_prosody(self) -> ProsodyParams:
	        """
	        Get prosody params that reflect NIMA state + CTM winner + episode
	        context + somatic feedback + affective mirroring + adaptive shaping.

	        This is the FULL v2.0.0 integration — all modules contribute.
	        """
	        engine = self._engine

	        # 1. Start with NIMA snapshot → base prosody
	        prosody = engine.update_prosody_from_nima(self._last_snapshot)

	        # 2. Apply affective mirroring (match user's emotional tone)
	        prosody, mirror_emotion = engine.affective_mirror.mirror(
	            self._user_valence, self._user_arousal, prosody
	        )

	        # 3. Apply adaptive prosody shaping (emotion → pitch/rhythm/timbre)
	        # More empathetic when the user is negative.
	        empathy_level = 0.8 if self._user_valence < -0.3 else 0.5
	        emotion_for_shaping = (
	            self._user_emotion if self._user_emotion != "neutral" else mirror_emotion
	        )
	        prosody = engine.prosody_shaper.shape(
	            prosody, emotion=emotion_for_shaping,
	            valence=self._user_valence, arousal=self._user_arousal,
	            empathy_level=empathy_level,
	        )

	        # 4. Apply somatic feedback (strain → voice fatigue)
	        prosody = engine.somatic_integrator.apply_somatic_modulation(prosody)

	        # NOTE(review): steps 5 and 6 adjust `prosody` in place. If the
	        # earlier stages hand back a shared instance, repeated calls may
	        # compound these offsets — confirm against the engine modules.

	        # 5. Apply CTM winner influence on voice character
	        if self._last_ctm_winner:
	            processor = self._last_ctm_winner.get("processor_name", "")
	            if processor == "memory_palace":
	                prosody.warmth = float(min(1.0, prosody.warmth + 0.10))
	                prosody.speech_rate_wpm *= 0.95  # more measured, nostalgic
	            elif processor == "somatic_registry":
	                prosody.breathiness = float(min(0.3, prosody.breathiness + 0.05))
	                prosody.pitch_variance = float(min(0.35, prosody.pitch_variance + 0.05))
	            elif processor == "wernicke":
	                prosody.speech_rate_wpm *= 1.05  # clearer, more articulate
	            elif processor == "broca":
	                prosody.speech_rate_wpm *= 1.08  # faster, more fluent

	        # 6. Apply episode context modifications
	        if self._last_episode_context:
	            ep = self._last_episode_context
	            # The episode's "score" entry is used as the strain measure.
	            strain = self._as_float(ep.get("score"), 0.0)
	            if strain > 0.5:
	                prosody.base_pitch_hz -= 10.0
	                prosody.breathiness = float(min(0.4, prosody.breathiness + 0.05))
	            if self._as_float(ep.get("valence"), 0.0) < -0.3:
	                prosody.warmth = float(min(1.0, prosody.warmth + 0.1))
	                prosody.speech_rate_wpm -= 10.0

	        return prosody

	    def store_voice_event(self, text: str, prosody: ProsodyParams,
	                          duration_s: float, speaker: str = "nima") -> str:
	        """
	        Store a voice event in MemPalace with full affective tags.
	        Call this after each utterance to build episodic voice memory.

	        The event records the given text, duration and speaker, a snapshot
	        of the prosody fields, the cached user valence/arousal, the current
	        somatic strain and the conversation counters. Returns the value
	        produced by ``engine.voice_memory.store_voice_event``.
	        """
	        engine = self._engine
	        event = VoiceEvent(
	            speaker=speaker,
	            text=text,
	            audio_duration_s=duration_s,
	            prosody_snapshot={
	                "pitch_hz": prosody.base_pitch_hz,
	                "rate_wpm": prosody.speech_rate_wpm,
	                "energy": prosody.energy,
	                "warmth": prosody.warmth,
	                "breathiness": prosody.breathiness,
	            },
	            emotion=prosody.emotional_tone,
	            valence=self._user_valence,
	            arousal=self._user_arousal,
	            strain=engine.somatic_integrator.strain,
	            conversation_phase=engine.state.phase.value,
	            interrupt_count=engine.state.interrupt_count,
	            backchannel_count=engine.state.backchannel_count,
	        )
	        return engine.voice_memory.store_voice_event(event)

	    def get_narrative_continuity(self, current_topic: str = "") -> Optional[str]:
	        """
	        Generate a narrative continuity phrase referencing a past voice event.
	        Returns None if no suitable past event exists.
	        """
	        return self._engine.narrative_engine.generate_continuity_phrase(current_topic)

	    def get_empathy_phrase(self) -> str:
	        """Generate a contextual empathy phrase based on current user state."""
	        return self._engine.empathy_generator.generate(
	            self._user_emotion, self._user_valence, self._user_arousal
	        )


	# ═══════════════════════════════════════════════════════════════════════════
	# SECTION 10 — Utility functions
	# ═══════════════════════════════════════════════════════════════════════════

	def save_wav(audio: np.ndarray, path: str, sample_rate: int = 22050) -> str:
	    """Save audio to a mono 16-bit PCM WAV file.

	    Args:
	        audio: Samples nominally in [-1.0, 1.0]. Any array-like is accepted
	            (including plain lists); multi-dimensional input is written in
	            flattened C order.
	        path: Destination file path.
	        sample_rate: Frame rate written to the WAV header.

	    Returns:
	        ``path``, unchanged.

	    Out-of-range values are clipped to the int16 range. NaN becomes
	    silence and +/-inf becomes +/-1.0, because casting non-finite floats
	    to int16 is undefined.
	    """
	    samples = np.asarray(audio)
	    if not np.issubdtype(samples.dtype, np.floating):
	        # Integer/bool input: promote so the scaling below cannot overflow.
	        samples = samples.astype(np.float64)
	    samples = np.nan_to_num(samples.ravel(), nan=0.0, posinf=1.0, neginf=-1.0)
	    pcm = np.clip(samples * 32767, -32768, 32767).astype(np.int16)
	    with wave.open(path, "wb") as wf:
	        wf.setnchannels(1)
	        wf.setsampwidth(2)
	        wf.setframerate(sample_rate)
	        wf.writeframes(pcm.tobytes())
	    return path


	def load_wav(path: str) -> Tuple[np.ndarray, int]:
	    """Load a PCM WAV file into a float32 numpy array.

	    Supports 8-bit unsigned and 16-, 24- and 32-bit signed PCM. Samples
	    are scaled to roughly [-1.0, 1.0). For multi-channel files only the
	    first channel is kept.

	    Args:
	        path: Path of the WAV file to read.

	    Returns:
	        ``(audio, sample_rate)`` where ``audio`` is a 1-D float32 array.

	    Raises:
	        ValueError: If the sample width is not 1, 2, 3 or 4 bytes.
	    """
	    with wave.open(path, "rb") as wf:
	        n_channels = wf.getnchannels()
	        sampwidth = wf.getsampwidth()
	        sample_rate = wf.getframerate()
	        frames = wf.readframes(wf.getnframes())

	    if sampwidth == 2:
	        audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
	    elif sampwidth == 1:
	        # 8-bit WAV is unsigned with a mid-point of 128.
	        audio = (np.frombuffer(frames, dtype=np.uint8).astype(np.float32) - 128) / 128.0
	    elif sampwidth == 3:
	        import sys  # local import: only needed for the 24-bit path

	        # NumPy has no 24-bit integer type: assemble each sample from its
	        # three bytes. A trailing partial sample (truncated file) is dropped.
	        usable = len(frames) - len(frames) % 3
	        octets = np.frombuffer(frames[:usable], dtype=np.uint8).reshape(-1, 3).astype(np.int32)
	        if sys.byteorder == "big":
	            # The wave module returns native-order samples; put LSB first.
	            octets = octets[:, ::-1]
	        ints = octets[:, 0] | (octets[:, 1] << 8) | (octets[:, 2] << 16)
	        ints = (ints ^ 0x800000) - 0x800000  # sign-extend from 24 bits
	        audio = ints.astype(np.float32) / 8388608.0
	    elif sampwidth == 4:
	        # Scale in float64 first: int32 does not fit float32's mantissa.
	        wide = np.frombuffer(frames, dtype=np.int32).astype(np.float64)
	        audio = (wide / 2147483648.0).astype(np.float32)
	    else:
	        raise ValueError(f"Unsupported sample width: {sampwidth}")

	    if n_channels > 1:
	        audio = audio[::n_channels]  # mono downmix (take first channel)
	    return audio, sample_rate


	async def demo(output_dir: str = "/home/z/my-project/download"):
	    """OmniVoice Engine demo.

	    Runs six smoke tests against a freshly constructed engine: basic
	    synthesis, non-verbal expressions, streaming, interrupt
	    classification, interruption responses and the NIMA adapter. Results
	    are printed, and three WAV files are written to ``output_dir``.

	    Args:
	        output_dir: Directory that receives the demo WAV files. It is
	            created if it does not exist.
	    """
	    import os  # local import: only the demo needs filesystem helpers

	    os.makedirs(output_dir, exist_ok=True)

	    print("\n" + "=" * 70)
	    print(f" OmniVoice Engine v{OMNIVOICE_VERSION} — Demo")
	    print("=" * 70 + "\n")

	    engine = OmniVoiceEngine()
	    print(f"ASR mode: {engine.asr.mode.value}")
	    print(f"TTS mode: {engine.tts.mode.value}")
	    print()

	    def _save(samples: np.ndarray, filename: str) -> None:
	        """Write ``samples`` into ``output_dir`` and report the file name."""
	        save_wav(samples, os.path.join(output_dir, filename), engine.sample_rate)
	        print(f" Saved: {filename}")

	    # Test 1: Basic TTS synthesis
	    print("[Test 1] Basic speech synthesis...")
	    prosody = ProsodyParams(base_pitch_hz=180, energy=0.8, warmth=0.7)
	    audio = engine.tts.synthesize("Hello, I am OmniVoice. Nice to meet you.", prosody)
	    print(f" Audio: {len(audio)} samples, {len(audio)/engine.sample_rate:.2f}s")
	    _save(audio, "omnivoice_test1_speech.wav")
	    print()

	    # Test 2: Non-verbal expressions
	    print("[Test 2] Non-verbal expressions...")
	    for expr in [NonVerbalType.LAUGHTER, NonVerbalType.SIGH, NonVerbalType.GASP,
	                 NonVerbalType.GROAN, NonVerbalType.AWW, NonVerbalType.MM]:
	        audio = engine.synth_non_verbal(expr, intensity=0.7)
	        print(f" {expr.value:12s}: {len(audio)} samples, {len(audio)/engine.sample_rate:.2f}s")
	    # Save laughter for verification
	    laugh = engine.synth_non_verbal(NonVerbalType.LAUGHTER)
	    _save(laugh, "omnivoice_test2_laughter.wav")
	    print()

	    # Test 3: Streaming
	    print("[Test 3] Streaming speech...")
	    chunks = []
	    async for chunk in engine.stream("This is a streaming test of the OmniVoice engine.", prosody=prosody):
	        chunks.append(chunk)
	    full = np.concatenate(chunks)
	    print(f" Streamed {len(chunks)} chunks, total {len(full)} samples, {len(full)/engine.sample_rate:.2f}s")
	    _save(full, "omnivoice_test3_stream.wav")
	    print()

	    # Test 4: Interrupt classification
	    print("[Test 4] Interrupt classification...")
	    # Simulate different types of user speech.
	    # NOTE(review): the clips are sliced and classified as 16 kHz audio,
	    # while durations elsewhere in this demo use engine.sample_rate —
	    # confirm the two rates agree.
	    test_cases = [
	        ("Backchannel 'yeah'", engine.synth_non_verbal(NonVerbalType.MM, 0.3)[:int(16000*0.4)]),
	        ("Laughter", engine.synth_non_verbal(NonVerbalType.LAUGHTER, 0.7)[:int(16000*0.8)]),
	        ("Sigh", engine.synth_non_verbal(NonVerbalType.SIGH, 0.6)[:int(16000*0.5)]),
	    ]
	    for name, audio in test_cases:
	        cls = engine.interrupt_detector.classify(audio, sample_rate=16000)
	        print(f" {name:25s} → {cls.interrupt_type.value} (conf={cls.confidence:.2f}, reason='{cls.reason}')")
	    print()

	    # Test 5: Interruption response
	    print("[Test 5] Interruption responses...")
	    for progress in [0.1, 0.5, 0.9]:
	        fake_interrupt = InterruptClassification(
	            interrupt_type=InterruptType.REAL_INTERRUPT,
	            confidence=0.8,
	            duration_s=1.5,
	        )
	        response = engine.interrupt_response.generate_response(
	            fake_interrupt, nima_text_progress=progress, user_arousal=0.4,
	        )
	        print(f" Progress {progress:.0%}: '{response}'")
	    print()

	    # Test 6: NIMA adapter
	    print("[Test 6] NIMA voice adapter...")
	    adapter = NimaVoiceAdapter(engine)
	    prosody = adapter.get_contextual_prosody()
	    print(f" Default prosody: pitch={prosody.base_pitch_hz:.0f}Hz, energy={prosody.energy:.2f}, warmth={prosody.warmth:.2f}")
	    # Simulate episode context
	    adapter.update_from_episode({"valence": -0.5, "score": 0.7, "processor_name": "somatic_registry"})
	    prosody2 = adapter.get_contextual_prosody()
	    print(f" With episode (val=-0.5, strain=0.7): pitch={prosody2.base_pitch_hz:.0f}Hz, "
	          f"energy={prosody2.energy:.2f}, warmth={prosody2.warmth:.2f}, breath={prosody2.breathiness:.2f}")
	    print()

	    print("=" * 70)
	    print(f" OmniVoice v{OMNIVOICE_VERSION} Demo Complete")
	    print("=" * 70)


	# Script entry point: run the async demo on a fresh event loop when this
	# file is executed directly (not when it is imported).
	if __name__ == "__main__":
	    asyncio.run(demo())