/**
 * Speech-to-text worker.
 *
 * Incoming messages (`event.data.type`):
 *   - "load":      load the VAD model and the ASR pipeline, then post "ready".
 *   - "configure": update partial-transcript and silence-duration settings.
 *   - "audio":     feed a buffer of samples (`buffer`, `sampleRate`).
 *   - "flush":     force transcription of the utterance being recorded.
 *
 * Outgoing messages (`type`): "network", "status", "ready", "speechstart",
 * "speechend", "partial", "transcript", "error".
 */
import { AutoModel, Tensor, env, pipeline } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

env.allowLocalModels = false;
env.useBrowserCache = true;
installFetchTelemetry("asr");

// Sample rate (Hz) everything downstream of resampleTo16k works at.
const SAMPLE_RATE = 16000;
// VAD probability above which a chunk counts as speech.
const SPEECH_THRESHOLD = 0.3;
// While already recording, a chunk still counts as speech down to this probability.
const EXIT_THRESHOLD = 0.1;
const DEFAULT_SILENCE_DURATION_MS = 480;
const MIN_SILENCE_DURATION_MS = 200;
const MAX_SILENCE_DURATION_MS = 800;
// 80 ms of padding, expressed in samples.
const SPEECH_PAD_SAMPLES = 80 * (SAMPLE_RATE / 1000);
// Recordings shorter than 250 ms are discarded instead of transcribed.
const MIN_SPEECH_DURATION_SAMPLES = 250 * (SAMPLE_RATE / 1000);
// Capacity of the recording buffer, in seconds.
const MAX_BUFFER_DURATION = 30;
// Number of samples handed to the VAD model per call.
const NEW_BUFFER_SIZE = 512;
// How many pre-speech chunks to retain so they cover SPEECH_PAD_SAMPLES.
const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE);
// Minimum spacing between partial transcriptions.
const PARTIAL_INTERVAL_MS = 1600;

let vadModel = null;
let transcriber = null;
let device = "wasm";
// Resampled samples not yet consumed as a full VAD chunk.
let inputQueue = new Float32Array(0);
// Serializes VAD chunk handling; it never rejects (failures are reported, not propagated).
let chunkChain = Promise.resolve();
// Serializes calls into the ASR pipeline; it never rejects (callers get failures directly).
let asrChain = Promise.resolve();
let vadState = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
const srTensor = new Tensor("int64", [SAMPLE_RATE], []);
let isRecording = false;
// Write position inside recordingBuffer.
let bufferPointer = 0;
// Non-speech samples accumulated since the last speech chunk.
let postSpeechSamples = 0;
// Most recent non-speech chunks seen while idle; prepended to the next recording.
let previousBuffers = [];
let partialEnabled = true;
let minSilenceDurationSamples = silenceDurationSamples(DEFAULT_SILENCE_DURATION_MS);
let partialBusy = false;
let lastPartialAt = 0;
let utteranceStartedAt = 0;
const recordingBuffer = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE);

/**
 * Returns a printable message for any thrown value, including `null`,
 * `undefined` and non-Error values.
 *
 * @param {unknown} error
 * @returns {string}
 */
function errorMessage(error) {
  return error?.message ?? String(error);
}

/**
 * Posts an "error" message describing `error` to the owner of this worker.
 *
 * @param {unknown} error
 */
function reportError(error) {
  self.postMessage({ type: "error", message: errorMessage(error) });
}

/**
 * Wraps `globalThis.fetch` once so every request posts a "network" message
 * with its method, URL, status and duration. Does nothing when `fetch` is
 * unavailable or the wrapper is already installed.
 *
 * @param {string} scope - Label copied into every "network" message.
 */
function installFetchTelemetry(scope) {
  const originalFetch = globalThis.fetch?.bind(globalThis);
  if (!originalFetch || globalThis.__browserSpeakFetchTelemetryInstalled) return;
  globalThis.__browserSpeakFetchTelemetryInstalled = true;

  globalThis.fetch = async (input, init) => {
    const startedAt = performance.now();
    const url = fetchUrl(input);
    const method = String(init?.method || input?.method || "GET").toUpperCase();
    try {
      const response = await originalFetch(input, init);
      self.postMessage({
        type: "network",
        scope,
        method,
        url,
        responseUrl: response.url || url,
        status: response.status,
        ok: response.ok,
        durationMs: performance.now() - startedAt,
      });
      return response;
    } catch (error) {
      self.postMessage({
        type: "network",
        scope,
        method,
        url,
        status: null,
        ok: false,
        durationMs: performance.now() - startedAt,
        error: errorMessage(error),
      });
      throw error;
    }
  };
}

/**
 * Extracts the URL string from a `fetch` input.
 *
 * @param {string | URL | { url?: string } | null | undefined} input
 * @returns {string} The URL, or "" when none can be determined.
 */
function fetchUrl(input) {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.href;
  return input?.url ?? "";
}

// Dispatches incoming messages; any failure is posted back as an "error" message.
self.onmessage = async (event) => {
  const message = event.data;
  try {
    switch (message.type) {
      case "load":
        await load(message);
        break;
      case "configure":
        configure(message);
        break;
      case "audio":
        ingestAudio(message.buffer, message.sampleRate);
        break;
      case "flush":
        await flushRecording();
        break;
      default:
        // Unknown message types are ignored.
        break;
    }
  } catch (error) {
    reportError(error);
  }
};

/**
 * Loads the VAD model and the ASR pipeline, runs one warm-up transcription
 * and posts "ready". Progress is reported through "status" messages.
 *
 * @param {object} options
 * @param {string} options.model - Model id passed to the ASR pipeline.
 * @param {string} [options.device] - Device passed to the pipeline; the
 *   current value ("wasm" initially) is kept when omitted.
 * @param {boolean} [options.partial] - Forwarded to `configure`.
 * @param {number} [options.silenceMs] - Forwarded to `configure`.
 * @throws {TypeError} When `model` is not a non-empty string.
 */
async function load({ model, device: requestedDevice, partial, silenceMs }) {
  // Fail before downloading anything rather than after the VAD model is loaded.
  if (typeof model !== "string" || model.length === 0) {
    throw new TypeError("load requires a model id string");
  }
  device = requestedDevice ?? device;
  configure({ partial, silenceMs });

  self.postMessage({ type: "status", scope: "vad", message: "Loading", mode: "warn" });
  vadModel = await AutoModel.from_pretrained("onnx-community/silero-vad", {
    config: { model_type: "custom" },
    dtype: "fp32",
    progress_callback: reportProgress("VAD"),
  });

  self.postMessage({ type: "status", message: "Loading", mode: "warn" });
  // The same quantization is used for every model and device; add per-model or
  // per-device overrides here if they are ever needed.
  const dtype = {
    encoder_model: "fp32",
    decoder_model_merged: "q4",
  };
  transcriber = await pipeline("automatic-speech-recognition", model, {
    device,
    dtype,
    progress_callback: reportProgress("STT"),
  });

  self.postMessage({ type: "status", message: "Warming", mode: "warn" });
  await transcribeBuffer(new Float32Array(SAMPLE_RATE), { warmup: true });
  self.postMessage({ type: "ready" });
}

/**
 * Builds a progress callback that posts "status" messages of the form
 * `"<label> <pct>%"` for events whose `status` is "progress".
 *
 * @param {string} label - Prefix for the status message.
 * @returns {(progress: { status: string, progress?: number }) => void}
 */
function reportProgress(label) {
  return (progress) => {
    if (progress.status !== "progress") return;
    const pct = Number.isFinite(progress.progress) ? ` ${progress.progress.toFixed(0)}%` : "";
    self.postMessage({ type: "status", message: `${label}${pct}`, mode: "warn" });
  };
}

/**
 * Resamples incoming audio, appends it to the input queue and enqueues every
 * complete chunk of NEW_BUFFER_SIZE samples for VAD processing. Leftover
 * samples stay queued for the next call.
 *
 * @param {Float32Array} buffer - Audio samples at `sourceRate`.
 * @param {number} sourceRate - Sample rate of `buffer` in Hz.
 * @throws {Error} When the models have not been loaded yet.
 * @throws {RangeError} When `sourceRate` is not a positive finite number.
 */
function ingestAudio(buffer, sourceRate) {
  if (!vadModel || !transcriber) {
    throw new Error("Audio received before the models finished loading");
  }
  const pending = concat(inputQueue, resampleTo16k(buffer, sourceRate));

  // Walk the queue by offset so the remainder is copied once, not once per chunk.
  let offset = 0;
  while (pending.length - offset >= NEW_BUFFER_SIZE) {
    // slice() copies: the chunk outlives this call.
    enqueueChunk(pending.slice(offset, offset + NEW_BUFFER_SIZE));
    offset += NEW_BUFFER_SIZE;
  }
  inputQueue = offset === 0 ? pending : pending.slice(offset);
}

/**
 * Appends a chunk to the serialized VAD processing chain. Each chunk is fully
 * handled before the next one starts, so recording state is always read in
 * arrival order. Failures are posted as "error" messages and do not stop
 * later chunks from being processed.
 *
 * @param {Float32Array} chunk - NEW_BUFFER_SIZE samples at SAMPLE_RATE.
 */
function enqueueChunk(chunk) {
  chunkChain = chunkChain.then(() => handleVadChunk(chunk)).catch(reportError);
}

/**
 * Runs VAD on one chunk and advances the recording state machine:
 * idle non-speech chunks are kept as pre-speech padding; otherwise the chunk
 * is appended to the recording buffer, and the recording is transcribed when
 * the buffer fills up or enough trailing silence has accumulated.
 *
 * Must only be invoked through `enqueueChunk`, which guarantees that calls do
 * not overlap.
 *
 * @param {Float32Array} buffer - One chunk of samples at SAMPLE_RATE.
 * @returns {Promise<void>}
 */
async function handleVadChunk(buffer) {
  const wasRecording = isRecording;
  const speech = await vad(buffer);

  if (!wasRecording && !speech) {
    // Idle: remember only the most recent chunks as padding for the next utterance.
    if (previousBuffers.length >= MAX_NUM_PREV_BUFFERS) previousBuffers.shift();
    previousBuffers.push(buffer);
    return;
  }

  const remaining = recordingBuffer.length - bufferPointer;
  if (buffer.length >= remaining) {
    // Recording buffer is full: transcribe it and carry the rest of this chunk over.
    recordingBuffer.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;
    dispatchForTranscription(buffer.subarray(remaining));
    return;
  }

  recordingBuffer.set(buffer, bufferPointer);
  bufferPointer += buffer.length;

  if (speech) {
    if (!isRecording) {
      utteranceStartedAt = performance.now();
      self.postMessage({ type: "speechstart" });
    }
    isRecording = true;
    postSpeechSamples = 0;
    maybePartial();
    return;
  }

  postSpeechSamples += buffer.length;
  if (postSpeechSamples < minSilenceDurationSamples) return;

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // Too short to be worth transcribing.
    reset();
    return;
  }

  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
  });
  dispatchForTranscription();
}

/**
 * Runs the VAD model on one chunk, stores the model's returned state for the
 * next call and classifies the chunk.
 *
 * @param {Float32Array} buffer - One chunk of samples at SAMPLE_RATE.
 * @returns {Promise<boolean>} `true` when the probability exceeds
 *   SPEECH_THRESHOLD, or when already recording and it is at least
 *   EXIT_THRESHOLD.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);
  const result = await vadModel({ input, sr: srTensor, state: vadState });
  vadState = result.stateN;
  const probability = result.output.data[0];
  return probability > SPEECH_THRESHOLD || (isRecording && probability >= EXIT_THRESHOLD);
}

/**
 * Starts a partial transcription of the current recording when partials are
 * enabled, none is in flight, at least PARTIAL_INTERVAL_MS has passed since
 * the last one and at least one second of audio has been recorded. A
 * non-blank result is posted as a "partial" message; a failure is posted as
 * an "error" message.
 */
function maybePartial() {
  if (!partialEnabled || partialBusy) return;
  const now = performance.now();
  if (now - lastPartialAt < PARTIAL_INTERVAL_MS || bufferPointer < SAMPLE_RATE) return;

  partialBusy = true;
  lastPartialAt = now;
  const buffer = paddedRecordingBuffer();
  transcribeBuffer(buffer, { partial: true })
    .then((text) => {
      if (text.trim()) self.postMessage({ type: "partial", text });
    })
    .catch(reportError)
    .finally(() => {
      partialBusy = false;
    });
}

/**
 * Queues the current recording for final transcription and resets the
 * recording state. The result is posted as a "transcript" message; a failure
 * is posted as an "error" message.
 *
 * @param {Float32Array} [overflow] - Samples that did not fit into the
 *   recording buffer; they become the start of the next recording.
 */
function dispatchForTranscription(overflow) {
  const buffer = paddedRecordingBuffer();
  // Capture now: a newer utterance may overwrite utteranceStartedAt before
  // this transcription completes.
  const startedAt = utteranceStartedAt;
  transcribeBuffer(buffer, { partial: false })
    .then((text) => {
      self.postMessage({
        type: "transcript",
        text,
        durationMs: performance.now() - startedAt,
      });
    })
    .catch(reportError);

  if (overflow?.length) {
    recordingBuffer.set(overflow, 0);
  }
  reset(overflow?.length ?? 0);
}

/**
 * Waits for every chunk enqueued so far to be processed, then transcribes the
 * recording in progress if there is one and it is long enough. Posts a
 * "speechend" message with `forced: true` before dispatching.
 *
 * @returns {Promise<void>}
 */
async function flushRecording() {
  await chunkChain;
  if (!isRecording || bufferPointer < MIN_SPEECH_DURATION_SAMPLES) return;
  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
    forced: true,
  });
  dispatchForTranscription();
}

/**
 * Returns a fresh copy of the recording: the retained pre-speech chunks
 * followed by the recorded samples plus up to SPEECH_PAD_SAMPLES of whatever
 * follows them in the recording buffer (capped at the buffer's end).
 *
 * @returns {Float32Array}
 */
function paddedRecordingBuffer() {
  const end = Math.min(bufferPointer + SPEECH_PAD_SAMPLES, recordingBuffer.length);
  const current = recordingBuffer.slice(0, end);
  const prevLength = previousBuffers.reduce((sum, item) => sum + item.length, 0);
  const padded = new Float32Array(prevLength + current.length);

  let offset = 0;
  for (const prev of previousBuffers) {
    padded.set(prev, offset);
    offset += prev.length;
  }
  padded.set(current, offset);
  return padded;
}

/**
 * Transcribes `buffer` with the ASR pipeline. Calls are queued so only one
 * runs at a time, and a failed call does not prevent later ones from running.
 *
 * @param {Float32Array} buffer - Samples at SAMPLE_RATE.
 * @param {object} [options]
 * @param {boolean} [options.warmup=false] - When true the result is discarded
 *   and "" is returned.
 * @returns {Promise<string>} The transcribed text, or "" when there is none.
 */
async function transcribeBuffer(buffer, { warmup = false } = {}) {
  const run = asrChain.then(() => transcriber(buffer));
  // Keep the queue usable after a failure; the caller still observes it via `run`.
  asrChain = run.catch(() => undefined);
  const output = await run;
  if (warmup) return "";
  return output.text ?? "";
}

/**
 * Clears the recording state.
 *
 * @param {number} [offset=0] - Number of leading samples in the recording
 *   buffer to keep; the write position is set to this value.
 */
function reset(offset = 0) {
  recordingBuffer.fill(0, offset);
  bufferPointer = offset;
  isRecording = false;
  postSpeechSamples = 0;
  previousBuffers = [];
  lastPartialAt = 0;
}

/**
 * Applies runtime settings; omitted fields keep their current value.
 *
 * @param {object} [options]
 * @param {boolean} [options.partial] - Enables or disables partial transcripts.
 * @param {number} [options.silenceMs] - Trailing silence that ends an utterance.
 */
function configure({ partial, silenceMs } = {}) {
  if (typeof partial === "boolean") partialEnabled = partial;
  if (silenceMs != null) minSilenceDurationSamples = silenceDurationSamples(silenceMs);
}

/**
 * Converts a silence duration in milliseconds into samples, clamped to
 * [MIN_SILENCE_DURATION_MS, MAX_SILENCE_DURATION_MS]. Values that do not
 * convert to a finite number fall back to DEFAULT_SILENCE_DURATION_MS.
 *
 * @param {unknown} value
 * @returns {number} Whole number of samples at SAMPLE_RATE.
 */
function silenceDurationSamples(value) {
  const numericValue = Number(value);
  const ms = Number.isFinite(numericValue) ? numericValue : DEFAULT_SILENCE_DURATION_MS;
  const clampedMs = Math.min(MAX_SILENCE_DURATION_MS, Math.max(MIN_SILENCE_DURATION_MS, ms));
  return Math.round(clampedMs * (SAMPLE_RATE / 1000));
}

/**
 * Converts a sample count at SAMPLE_RATE into milliseconds.
 *
 * @param {number} samples
 * @returns {number}
 */
function sampleDurationMs(samples) {
  return (samples / SAMPLE_RATE) * 1000;
}

/**
 * Resamples `input` to SAMPLE_RATE using linear interpolation between
 * neighbouring samples. Input already at SAMPLE_RATE is returned as is.
 *
 * @param {Float32Array} input - Samples at `sourceRate`.
 * @param {number} sourceRate - Sample rate of `input` in Hz.
 * @returns {Float32Array}
 * @throws {RangeError} When `sourceRate` is not a positive finite number.
 */
function resampleTo16k(input, sourceRate) {
  const rate = Number(sourceRate);
  if (!Number.isFinite(rate) || rate <= 0) {
    throw new RangeError(`Invalid source sample rate: ${sourceRate}`);
  }
  if (rate === SAMPLE_RATE) return input;

  const ratio = rate / SAMPLE_RATE;
  const length = Math.floor(input.length / ratio);
  const output = new Float32Array(length);
  for (let i = 0; i < length; i += 1) {
    const position = i * ratio;
    const left = Math.floor(position);
    // Clamp so the final output sample never reads past the end of the input.
    const right = Math.min(left + 1, input.length - 1);
    const weight = position - left;
    output[i] = input[left] * (1 - weight) + input[right] * weight;
  }
  return output;
}

/**
 * Concatenates two sample arrays. When either side is empty the other is
 * returned without copying.
 *
 * @param {Float32Array} left
 * @param {Float32Array} right
 * @returns {Float32Array}
 */
function concat(left, right) {
  if (left.length === 0) return right;
  if (right.length === 0) return left;
  const out = new Float32Array(left.length + right.length);
  out.set(left, 0);
  out.set(right, left.length);
  return out;
}