/** Shorthand for `document.getElementById`. */
const $ = (id) => document.getElementById(id);

/**
 * Registry of the DOM nodes this page script touches.
 *
 * Flat entries are keyed by their DOM id. `tiles` and `states` hold the
 * per-stage pipeline nodes (`<stage>Tile` / `<stage>State`).
 */
const elements = (() => {
  // Ids whose registry key is the id itself, in lookup order.
  const directIds = [
    "loadButton",
    "micButton",
    "stopButton",
    "suiteButton",
    "benchmarkButton",
    "chatBenchmarkButton",
    "ttsBenchmarkButton",
    "loopbackButton",
    "bargeInButton",
    "gpuBenchmarkButton",
    "evidenceCaptureButton",
    "micBenchmarkButton",
    "micSeriesButton",
    "copyResultsButton",
    "downloadResultsButton",
    "clearResultsButton",
    "clearLogButton",
    "deviceSelect",
    "runtimeStatus",
    "runtimeDeviceStatus",
    "runtimeDeviceDetail",
    "runtimeBuildStatus",
    "llmModelSelect",
    "asrModelSelect",
    "voiceSelect",
    "ttsSteps",
    "ttsStepsValue",
    "vadSilence",
    "vadSilenceValue",
    "partialToggle",
    "micBadge",
    "micLevelMeter",
    "micLevelBar",
    "micLevelValue",
    "audioBadge",
    "partialTranscript",
    "finalTranscript",
    "llmOutput",
    "eventLog",
    "benchmarkSummary",
    "resultsBody",
    "micValidationCard",
    "micValidationStatus",
    "micValidationDetail",
    "micValidationProgressBar",
    "gpuValidationCard",
    "gpuValidationStatus",
    "gpuValidationDetail",
    "gpuValidationProgressBar",
    "vadCloseLatency",
    "asrLatency",
    "firstTokenLatency",
    "firstTtsQueuedLatency",
    "ttsSynthLatency",
    "firstAudioLatency",
    "speechToAudioLatency",
    "decodeRate",
  ];
  const stages = ["vad", "asr", "llm", "tts"];

  const registry = {};
  for (const id of directIds) {
    registry[id] = $(id);
  }
  registry.tiles = Object.fromEntries(stages.map((stage) => [stage, $(`${stage}Tile`)]));
  registry.states = Object.fromEntries(stages.map((stage) => [stage, $(`${stage}State`)]));
  return registry;
})();
// Canonical question/answer pair describing this demo. The answer text is
// embedded in IDENTITY_SYSTEM_PROMPT and reused by IDENTITY_PRIMER_MESSAGES.
const APP_IDENTITY_PROMPT = "What app is this?";
const APP_IDENTITY_ANSWER =
"This is a client-side browser voice assistant demo with local speech recognition, an LLM, and Supertonic TTS.";
// System prompt without the identity instruction; used by SYSTEM_MESSAGES.
const GENERIC_SYSTEM_PROMPT =
"You are the assistant inside a local browser voice demo. Reply in one short sentence under 18 words by default. Do not wrap responses in quotation marks.";
// System prompt that additionally pins the exact identity answer.
const IDENTITY_SYSTEM_PROMPT = `If asked what this demo, app, or application is, reply exactly: ${APP_IDENTITY_ANSWER} Do not shorten or paraphrase that identity answer. You are the assistant inside that local browser voice demo. Reply in one short sentence under 18 words by default. Do not wrap responses in quotation marks.`;
// Chat messages carrying the generic system prompt.
const SYSTEM_MESSAGES = [
{
role: "system",
content: GENERIC_SYSTEM_PROMPT,
},
];
// Two user/assistant exchanges that both answer with APP_IDENTITY_ANSWER.
// NOTE(review): presumably prepended to the chat history as few-shot primers;
// the consumer is not visible in this part of the file.
const IDENTITY_PRIMER_MESSAGES = [
{
role: "user",
content: APP_IDENTITY_PROMPT,
},
{
role: "assistant",
content: APP_IDENTITY_ANSWER,
},
{
role: "user",
content: "Please identify this browser demo.",
},
{
role: "assistant",
content: APP_IDENTITY_ANSWER,
},
];
// Label/regex pairs for scoring an identity answer. formatQuality() uses the
// length of this list as the default total when a row carries no explicit total.
const APP_IDENTITY_QUALITY_RULES = [
{ label: "client/browser/local", pattern: /\b(client(?:-side)?|browser|local)\b/i },
{ label: "speech recognition", pattern: /\b(speech recognition|stt|transcrib(?:e|es|ing|er|ers|ed)?)\b/i },
{ label: "LLM", pattern: /\b(llm|language model)\b/i },
{ label: "TTS/Supertonic", pattern: /\b(tts|supertonic|speech synthesis|text-to-speech)\b/i },
];
// LLM model ids flagged as WebGPU-only; WEBGPU_ONLY_REASONS maps each id to a
// human-readable explanation.
const WEBGPU_ONLY_LLMS = new Set([
"onnx-community/Qwen3-0.6B-ONNX",
"onnx-community/granite-4.0-350m-ONNX-web",
"HuggingFaceTB/SmolLM2-1.7B-Instruct",
]);
const WEBGPU_ONLY_REASONS = new Map([
[
"onnx-community/granite-4.0-350m-ONNX-web",
"its q4 WASM external data is about 576 MB; keep it on WebGPU until it is measured there.",
],
[
"onnx-community/Qwen3-0.6B-ONNX",
"local testing showed its q4 WASM load can exhaust the browser.",
],
[
"HuggingFaceTB/SmolLM2-1.7B-Instruct",
"its browser assets are too large for this WASM fallback test environment.",
],
]);
// VAD silence window in milliseconds: fallback value and the clamp range
// applied by currentVadSilenceMs().
const DEFAULT_VAD_SILENCE_MS = 480;
const MIN_VAD_SILENCE_MS = 200;
const MAX_VAD_SILENCE_MS = 800;
// Loopback benchmark defaults.
// NOTE(review): consumers are outside the visible part of the file.
const DEFAULT_LOOPBACK_SPEED = 1.0;
const DEFAULT_LOOPBACK_PREROLL_MS = 180;
const DEFAULT_LOOPBACK_PROMPT = "Identify this browser demo.";
// Real-microphone validation settings.
// NOTE(review): presumably the row target and WER acceptance ceiling — confirm at the use sites.
const REAL_MIC_TARGET_RUNS = 3;
const REAL_MIC_MAX_WER = 0.25;
// Benchmark export/persistence identifiers and limits.
// NOTE(review): the export schema is "v2" while the storage key/version are v1 — confirm this is intentional.
const BENCHMARK_EXPORT_SCHEMA_VERSION = "browser-speak-benchmarks/v2";
const BENCHMARK_STORAGE_KEY = "browser-speak:benchmark-results:v1";
const BENCHMARK_STORAGE_VERSION = 1;
const BENCHMARK_STORAGE_MAX_ROWS = 80;
// Frozen stack configuration for hardware WebGPU evidence rows.
const HARDWARE_WEBGPU_EVIDENCE_STACK = Object.freeze({
device: "webgpu",
llm: "HuggingFaceTB/SmolLM2-135M-Instruct",
asr: "onnx-community/moonshine-base-ONNX",
voice: "F2",
ttsSteps: 2,
vadSilenceMs: 480,
partialAsr: true,
});
// Default timeout for getUserMediaWithTimeout().
const MIC_START_TIMEOUT_MS = 15000;
// Timeout for the adapter probe in supportsWebGPU().
const WEBGPU_ADAPTER_TIMEOUT_MS = 5000;
// Minimum gap between mic level meter updates in updateMicInputStats().
const MIC_LEVEL_UI_INTERVAL_MS = 80;
// Caps for recorded worker network events.
// NOTE(review): enforced outside the visible part of the file.
const MAX_WORKER_NETWORK_EVENTS = 300;
const MAX_ROW_WORKER_NETWORK_EVENTS = 40;
// Frozen defaults copied into state.ttsChunking.
// NOTE(review): field names suggest character thresholds for splitting LLM text into TTS chunks — confirm in the chunker.
const DEFAULT_TTS_CHUNKING = Object.freeze({
firstSentenceMinChars: 5,
sentenceMinChars: 8,
firstClauseMinChars: 5,
clauseMinChars: 24,
firstTargetChars: 5,
targetChars: 36,
firstMinSpaceChars: 3,
minSpaceChars: 20,
});
// Single mutable application state object shared by the functions in this file.
const state = {
// Worker handles: created in loadModels(), cleared by terminateWorkers().
asrWorker: null,
llmWorker: null,
ttsWorker: null,
// WebGPU probe results written by supportsWebGPU(); `webgpuAvailable` stays
// null until the first probe has finished.
webgpuAvailable: null,
webgpuAdapterInfo: null,
webgpuAdapterFeatures: [],
webgpuSoftwareAdapter: false,
// Model lifecycle flags toggled by loadModels() / unloadModels().
modelsLoaded: false,
modelsLoading: false,
// Microphone capture graph populated by startMic().
micActive: false,
mediaStream: null,
audioContext: null,
micSource: null,
workletNode: null,
micMonitorGain: null,
// PlaybackQueue instance, created lazily in loadModels().
playback: null,
// Per-turn conversation state; reset by resetTurnState().
messages: initialMessages(),
currentAssistant: "",
currentUserStartedAt: null,
lastSpeechEndAt: null,
lastVadCloseAt: null,
lastVadCloseDelayMs: null,
lastTranscriptAt: null,
awaitingFirstToken: false,
awaitingFirstAudio: false,
firstTtsChunkQueued: false,
ttsBuffer: "",
ttsSequence: 0,
loopbackRequestId: 0,
loopbackFeedTimers: new Set(),
// NOTE(review): request-id counters and pending-request maps; their users
// are outside the visible part of the file.
ttsVoiceRequestId: 0,
ttsVoiceRequests: new Map(),
automationSynthesisRequestId: 0,
automationSynthesisRequests: new Map(),
// Hosting metadata; replaced by refreshHostMetadata().
hostMetadata: {
page: location.href,
host: location.host,
hfSpaceCommit: "",
etag: "",
fetchedAt: "",
},
activeTurnId: 0,
// Load timing and the stack description recorded by loadModels().
modelLoadStartedAt: null,
loadedStack: null,
// Input audio details captured by startMic().
inputSampleRate: null,
inputTrackSettings: null,
// Running microphone statistics accumulated by updateMicInputStats().
micInputStats: {
chunks: 0,
samples: 0,
peak: 0,
sumSquares: 0,
},
micLevel: 0,
lastMicLevelUiAt: 0,
// Benchmark bookkeeping.
activeBenchmark: null,
benchmarkTimeout: null,
pendingPlaybackSchedules: 0,
suiteRunning: false,
evidenceCaptureRunning: false,
micSeries: {
active: false,
target: 3,
completed: 0,
timer: 0,
},
workerNetworkSeq: 0,
workerNetworkEvents: [],
// Mutable copy of the frozen chunking defaults.
ttsChunking: { ...DEFAULT_TTS_CHUNKING },
benchmarkResults: [],
};
// NOTE(review): not referenced in the visible part of the file; presumably
// memoizes a one-time page initialization promise — confirm at the use site.
let pageInitPromise = null;
/**
 * Gapless playback of PCM chunks through a lazily created AudioContext.
 *
 * Each chunk is scheduled to start when the previous one ends (`nextTime`),
 * and the audio badge is updated when playback starts and when the last
 * scheduled source finishes.
 */
class PlaybackQueue {
  constructor() {
    this.context = null;
    this.nextTime = 0;
    this.sources = new Set();
    this.started = false;
  }

  /** Creates the AudioContext on first use and resumes it unless it is running. */
  async ensure() {
    this.context ??= new AudioContext();
    if (this.context.state === "running") return;
    await this.context.resume();
  }

  /** Plays a one-sample buffer through a zero-gain node to unlock audio output. */
  async unlock() {
    await this.ensure();
    const ctx = this.context;
    const silentBuffer = ctx.createBuffer(1, 1, 22050);
    const node = ctx.createBufferSource();
    const mute = ctx.createGain();
    mute.gain.value = 0;
    node.buffer = silentBuffer;
    node.connect(mute);
    mute.connect(ctx.destination);
    node.start();
  }

  /**
   * Schedules one mono chunk directly after everything already queued.
   *
   * @param {Float32Array|ArrayLike<number>} samples PCM samples; copied into a
   *   Float32Array when not already one.
   * @param {number} sampleRate Sample rate passed to `createBuffer`.
   * @returns {Promise<{startDelayMs: number, durationMs: number, endDelayMs: number}>}
   *   Delay until this chunk starts, its duration, and delay until it ends,
   *   all in milliseconds relative to the context clock at scheduling time.
   */
  async play(samples, sampleRate) {
    await this.ensure();
    const ctx = this.context;
    const pcm = samples instanceof Float32Array ? samples : new Float32Array(samples);
    const audioBuffer = ctx.createBuffer(1, pcm.length, sampleRate);
    audioBuffer.copyToChannel(pcm, 0);

    const node = ctx.createBufferSource();
    node.buffer = audioBuffer;
    node.connect(ctx.destination);
    node.onended = () => {
      this.sources.delete(node);
      if (this.sources.size !== 0) return;
      this.started = false;
      setAudioState("Audio idle", false);
    };

    // Never start earlier than 15 ms from now, nor before the previous chunk ends.
    const startAt = Math.max(ctx.currentTime + 0.015, this.nextTime);
    const scheduledAt = ctx.currentTime;
    node.start(startAt);
    this.nextTime = startAt + audioBuffer.duration;
    this.sources.add(node);

    if (!this.started) {
      this.started = true;
      setAudioState("Playing", true);
    }

    const toMs = (seconds) => Math.max(0, seconds) * 1000;
    return {
      startDelayMs: toMs(startAt - scheduledAt),
      durationMs: audioBuffer.duration * 1000,
      endDelayMs: toMs(this.nextTime - scheduledAt),
    };
  }

  /** Stops every scheduled source and rewinds the schedule to the current time. */
  stop() {
    this.sources.forEach((node) => {
      try {
        node.stop();
      } catch {
        // Source may already have ended.
      }
    });
    this.sources.clear();
    if (this.context) {
      this.nextTime = this.context.currentTime;
    }
    this.started = false;
    setAudioState("Audio idle", false);
  }

  /** Stops playback, closes the context (ignoring close errors) and resets the queue. */
  async close() {
    this.stop();
    const ctx = this.context;
    if (ctx && ctx.state !== "closed") {
      await ctx.close().catch(() => {});
    }
    this.context = null;
    this.nextTime = 0;
    this.started = false;
  }
}
/**
 * Updates one pipeline tile.
 *
 * @param {string} key Key into `elements.tiles` / `elements.states`.
 * @param {string} text Status text shown on the tile.
 * @param {string} [mode="idle"] Value written to the tile's `data-state`.
 */
function setTile(key, text, mode = "idle") {
  const { tiles, states } = elements;
  tiles[key].dataset.state = mode;
  states[key].textContent = text;
}
/** Sets the audio badge text and toggles its "active" class. */
function setAudioState(text, active) {
  const badge = elements.audioBadge;
  badge.textContent = text;
  badge.classList.toggle("active", active);
}
/** Sets the microphone badge text and toggles its "active" class. */
function setMicState(text, active) {
  const badge = elements.micBadge;
  badge.textContent = text;
  badge.classList.toggle("active", active);
}
/**
 * Builds a display label from adapter info fields.
 *
 * @param {{vendor?: string, architecture?: string, device?: string, description?: string}|null} adapter
 * @returns {string} The non-empty fields joined with " / ", or "unknown adapter".
 */
function formatGpuAdapter(adapter) {
  const fallback = "unknown adapter";
  if (!adapter) return fallback;
  const { vendor, architecture, device, description } = adapter;
  const parts = [vendor, architecture, device, description].filter(Boolean);
  return parts.join(" / ") || fallback;
}
/**
 * Reports whether adapter info names a known software rasterizer.
 *
 * @param {{vendor?: string, architecture?: string, device?: string, description?: string}|null|undefined} adapter
 * @returns {boolean} True when the lower-cased fields contain one of the
 *   software-renderer keywords as a whole word.
 */
function isSoftwareWebGpuAdapter(adapter) {
  const fields = [adapter?.vendor, adapter?.architecture, adapter?.device, adapter?.description];
  const haystack = fields.filter(Boolean).join(" ").toLowerCase();
  const softwarePattern = /\b(swiftshader|llvmpipe|software rasterizer|software adapter|warp)\b/;
  return softwarePattern.test(haystack);
}
/**
 * Renders the runtime status card from the WebGPU probe result and the
 * device the user selected, then refreshes the GPU validation card.
 */
function updateRuntimeStatus() {
  const requested = elements.deviceSelect.value;
  const wantsWebGpu = requested === "webgpu";
  const wantsWasm = requested === "wasm";

  // Shown while the adapter probe has not finished.
  const view = {
    mode: "warn",
    status: "Checking runtime",
    detail: "Adapter probe pending.",
  };

  if (state.webgpuAvailable === false) {
    view.mode = wantsWebGpu ? "error" : "warn";
    view.status = wantsWebGpu ? "WebGPU unavailable" : "WASM fallback";
    view.detail = "This browser did not expose a WebGPU adapter.";
  } else if (state.webgpuAvailable === true) {
    const adapter = formatGpuAdapter(state.webgpuAdapterInfo);
    if (state.webgpuSoftwareAdapter) {
      view.mode = "warn";
      if (wantsWebGpu) {
        view.status = "Software WebGPU selected";
        view.detail = `${adapter}. Explicit WebGPU is allowed, but benchmark automation skips software adapters by default.`;
      } else if (wantsWasm) {
        view.status = "WASM selected";
        view.detail = `${adapter}. Auto also uses WASM because this is a software adapter.`;
      } else {
        view.status = "Auto: WASM fallback";
        view.detail = `${adapter}. Hardware WebGPU is not exposed.`;
      }
    } else {
      view.mode = "ready";
      if (wantsWasm) {
        view.status = "WASM selected";
      } else if (wantsWebGpu) {
        view.status = "WebGPU selected";
      } else {
        view.status = "Auto: WebGPU";
      }
      view.detail = adapter;
    }
  }

  elements.runtimeStatus.dataset.state = view.mode;
  elements.runtimeDeviceStatus.textContent = view.status;
  elements.runtimeDeviceDetail.textContent = view.detail;
  updateGpuValidationStatus();
}
/**
 * Formats a duration for display.
 *
 * @param {number} value Duration in milliseconds.
 * @returns {string} "-" for non-finite input, whole milliseconds ("42 ms")
 *   below one second, otherwise seconds with two decimals ("1.50 s").
 */
function formatMs(value) {
  if (!Number.isFinite(value)) return "-";
  // Pick the unit from the rounded value so 999.5-999.99 shows as "1.00 s"
  // instead of the out-of-range "1000 ms".
  const wholeMs = Math.round(value);
  if (wholeMs < 1000) return `${wholeMs} ms`;
  return `${(value / 1000).toFixed(2)} s`;
}
/**
 * Formats a 0..1 ratio as a whole percentage.
 *
 * @param {number} value Ratio to format.
 * @returns {string} e.g. "25%", or "-" when the value is not a finite number.
 */
function formatPercent(value) {
  return Number.isFinite(value) ? `${Math.round(value * 100)}%` : "-";
}
/**
 * Formats the LLM quality verdict of a benchmark row.
 *
 * @param {object} result Benchmark row; reads `llmQualityPass`,
 *   `llmQualityTotal`, `llmQualityScore` and `llmQualityHits`.
 * @returns {string} "-" when no verdict is recorded, otherwise "pass"/"fail",
 *   followed by " score/total" when both numbers are finite.
 */
function formatQuality(result) {
  const passed = result.llmQualityPass;
  if (passed == null) return "-";
  const verdict = passed ? "pass" : "fail";
  // The total defaults to the number of identity quality rules.
  const total = result.llmQualityTotal ?? APP_IDENTITY_QUALITY_RULES.length;
  const score = result.llmQualityScore ?? result.llmQualityHits?.length;
  if (!Number.isFinite(score) || !Number.isFinite(total)) return verdict;
  return `${verdict} ${score}/${total}`;
}
/**
 * Formats prompt token count and prompt build time of a benchmark row.
 *
 * @param {object} result Benchmark row; reads `llmPromptTokens` and `llmPromptBuildMs`.
 * @returns {string} "N tok / <duration>", whichever part is available, or "-".
 */
function formatPromptMetrics(result) {
  const tokens = Number.isFinite(result.llmPromptTokens) ? `${result.llmPromptTokens} tok` : "";
  const build = Number.isFinite(result.llmPromptBuildMs) ? formatMs(result.llmPromptBuildMs) : "";
  const parts = [tokens, build].filter(Boolean);
  return parts.join(" / ") || "-";
}
/**
 * Reads the VAD silence slider.
 *
 * @returns {number} The rounded slider value clamped to
 *   [MIN_VAD_SILENCE_MS, MAX_VAD_SILENCE_MS], or DEFAULT_VAD_SILENCE_MS when
 *   the slider value is not a finite number.
 */
function currentVadSilenceMs() {
  const raw = Number(elements.vadSilence.value);
  if (!Number.isFinite(raw)) return DEFAULT_VAD_SILENCE_MS;
  const rounded = Math.round(raw);
  if (rounded < MIN_VAD_SILENCE_MS) return MIN_VAD_SILENCE_MS;
  if (rounded > MAX_VAD_SILENCE_MS) return MAX_VAD_SILENCE_MS;
  return rounded;
}
/** Shows the effective (clamped) VAD silence window next to its slider. */
function updateVadSilenceLabel() {
  const silenceMs = currentVadSilenceMs();
  elements.vadSilenceValue.textContent = `${silenceMs}`;
}
/** Shows the numeric value of the TTS steps slider next to it. */
function updateTtsStepsLabel() {
  const steps = Number(elements.ttsSteps.value);
  elements.ttsStepsValue.textContent = `${steps}`;
}
/**
 * Sends the current partial-transcript toggle and VAD silence window to the
 * ASR worker. Does nothing when no ASR worker exists.
 */
function configureAsrWorker() {
  const worker = state.asrWorker;
  if (worker == null) return;
  const message = {
    type: "configure",
    partial: elements.partialToggle.checked,
    silenceMs: currentVadSilenceMs(),
  };
  worker.postMessage(message);
}
/**
 * Prepends a timestamped entry to the event log and trims the log to its
 * 80 newest entries.
 *
 * @param {string} message Text to log.
 */
function logEvent(message) {
  const log = elements.eventLog;
  const entry = document.createElement("li");
  const stamp = new Date().toLocaleTimeString();
  entry.textContent = `${stamp} ${message}`;
  log.prepend(entry);
  // Newest entries are on top, so trimming removes from the bottom.
  while (log.children.length > 80) {
    log.lastElementChild.remove();
  }
}
/**
 * Refreshes `state.hostMetadata` from a no-store HEAD request to the current
 * page, then re-renders the build label.
 *
 * On success the commit comes from the `x-repo-commit` header and the ETag
 * from `etag` (empty string when absent). On failure the previous metadata is
 * kept and only `fetchedAt` is updated.
 */
async function refreshHostMetadata() {
  const timestamp = () => new Date().toISOString();
  let metadata;
  try {
    const response = await fetch(location.href, { method: "HEAD", cache: "no-store" });
    const header = (name) => response.headers.get(name) ?? "";
    metadata = {
      page: location.href,
      host: location.host,
      hfSpaceCommit: header("x-repo-commit"),
      etag: header("etag"),
      fetchedAt: timestamp(),
    };
  } catch {
    metadata = { ...state.hostMetadata, fetchedAt: timestamp() };
  }
  state.hostMetadata = metadata;
  updateBuildStatus();
}
/**
 * Renders the build label: the first 8 characters of the Space commit when
 * known, otherwise the host name (or "local"). Re-renders the benchmark
 * summary when benchmark rows exist.
 */
function updateBuildStatus() {
  const { hfSpaceCommit: commit } = state.hostMetadata;
  let label;
  if (commit) {
    label = `Space build ${commit.slice(0, 8)}`;
  } else {
    label = `${location.hostname || "local"} build`;
  }
  elements.runtimeBuildStatus.textContent = label;
  if (state.benchmarkResults.length > 0) {
    renderBenchmarkSummary();
  }
}
/**
 * Renders the load/unload button.
 *
 * @param {string} mode "load", "loading" or "unload"; any other value falls
 *   back to the "load" presentation.
 */
function setLoadButton(mode) {
  const presets = {
    load: { icon: "↓", label: "Load models", disabled: false },
    loading: { icon: "…", label: "Loading", disabled: true },
    unload: { icon: "↻", label: "Unload models", disabled: false },
  };
  const { icon, label, disabled } = presets[mode] ?? presets.load;
  const button = elements.loadButton;
  button.innerHTML = `${icon} ${label}`;
  button.disabled = disabled;
}
/**
 * Enables or disables session controls for a lifecycle phase.
 *
 * @param {string} mode "loaded" or "loading"; anything else is treated as
 *   unloaded.
 */
function setSessionControls(mode) {
  const isLoaded = mode === "loaded";
  const isLoading = mode === "loading";

  let buttonMode = "load";
  if (isLoading) {
    buttonMode = "loading";
  } else if (isLoaded) {
    buttonMode = "unload";
  }
  setLoadButton(buttonMode);

  // The stack selectors are locked while loading and while a stack is loaded.
  const lockSelectors = isLoading || isLoaded;
  elements.deviceSelect.disabled = lockSelectors;
  elements.llmModelSelect.disabled = lockSelectors;
  elements.asrModelSelect.disabled = lockSelectors;

  elements.micButton.disabled = !isLoaded;
  elements.stopButton.disabled = !isLoaded;
  setBenchmarkControlsDisabled(!isLoaded || state.suiteRunning);
  if (!isLoaded) {
    elements.micButton.innerHTML = '● Start mic';
  }
  updateMicValidationStatus();
}
/**
 * Enables or disables the benchmark buttons, then refreshes the two evidence
 * buttons.
 *
 * @param {boolean} disabled Requested state; the buttons are also locked while
 *   a mic series or an evidence capture is running.
 */
function setBenchmarkControlsDisabled(disabled) {
  const locked = disabled || state.micSeries.active || state.evidenceCaptureRunning;
  const buttonKeys = [
    "suiteButton",
    "benchmarkButton",
    "chatBenchmarkButton",
    "ttsBenchmarkButton",
    "loopbackButton",
    "bargeInButton",
    "micBenchmarkButton",
    "micSeriesButton",
  ];
  for (const key of buttonKeys) {
    elements[key].disabled = locked;
  }
  updateGpuEvidenceButton();
  updateEvidenceCaptureButton();
}
/** Resets every latency/throughput readout to the "-" placeholder. */
function resetMetrics() {
  const readouts = [
    elements.vadCloseLatency,
    elements.asrLatency,
    elements.firstTokenLatency,
    elements.firstTtsQueuedLatency,
    elements.ttsSynthLatency,
    elements.firstAudioLatency,
    elements.speechToAudioLatency,
    elements.decodeRate,
  ];
  for (const readout of readouts) {
    readout.textContent = "-";
  }
}
/**
 * Stores and renders the microphone input level.
 *
 * @param {number} level Level in 0..1; non-numeric input counts as 0 and
 *   out-of-range values are clamped.
 */
function setMicInputLevel(level) {
  const numeric = Number(level) || 0;
  const clamped = Math.max(0, Math.min(1, numeric));
  const percentText = String(Math.round(clamped * 100));
  state.micLevel = clamped;
  elements.micLevelBar.style.width = `${percentText}%`;
  elements.micLevelMeter.setAttribute("aria-valuenow", percentText);
  elements.micLevelValue.textContent = `${percentText}%`;
}
/** Restores the transcript and LLM output panes to their initial placeholder text. */
function resetConversationUi() {
  const { partialTranscript, finalTranscript, llmOutput } = elements;
  partialTranscript.textContent = "Waiting for speech.";
  finalTranscript.textContent = "";
  llmOutput.textContent = "Load the models, start the microphone, and speak naturally.";
}
/**
 * Renders the real-microphone validation card (state, status line, detail
 * line, progress bar) and then refreshes the GPU validation card.
 *
 * Rows are scoped to the loaded stack when one is loaded; otherwise all saved
 * benchmark rows are counted. Only rows with `kind === "mic"` and no `error`
 * count as completed, capped at the target.
 */
function updateMicValidationStatus() {
const prompt = APP_IDENTITY_PROMPT;
const target = state.micSeries.target || 3;
// An empty key means no stack is loaded, so rows are not filtered by stack.
const activeStackKey = state.loadedStack ? stackKey(currentBenchmarkStack()) : "";
const scopedRows = activeStackKey
? state.benchmarkResults.filter((result) => stackKey(result.stack) === activeStackKey)
: state.benchmarkResults;
const completedMicRows = scopedRows.filter((result) => result.kind === "mic" && !result.error);
const completed = Math.min(target, completedMicRows.length);
const activeMic = state.activeBenchmark?.kind === "mic";
// Defaults; the branch chain below overrides only the fields it needs to.
let mode = "idle";
let status = state.modelsLoaded
? `${completed}/${target} real-mic rows collected`
: completed > 0
? `${completed}/${target} real-mic rows in saved results`
: `Load models to collect ${target} rows`;
let detail =
completed > 0 && !state.modelsLoaded
? "Load a stack to collect or compare current-stack rows."
: `Use the mic series and say: "${prompt}"`;
let progress = completed / target;
// Branch order is the precedence: loading > running series/mic benchmark >
// loaded and complete > loaded and incomplete > unloaded with saved rows.
if (state.modelsLoading) {
mode = "warn";
status = "Models loading";
detail = "Real-mic validation starts after the local stack is ready.";
} else if (state.micSeries.active || activeMic) {
const run = state.activeBenchmark?.micSeriesRun ?? state.micSeries.completed + 1;
const runTarget = state.activeBenchmark?.micSeriesTarget ?? target;
mode = "active";
status = `Listening for row ${Math.min(run, runTarget)}/${runTarget}`;
detail = `Say: "${prompt}" after the mic badge shows Listening.`;
// A mic benchmark in flight counts as 0.35 of a row toward the progress bar.
progress = Math.min(1, (state.micSeries.completed + (activeMic ? 0.35 : 0)) / runTarget);
} else if (state.modelsLoaded && completed >= target) {
const summary = benchmarkSummaryForRows(scopedRows, { scope: "current" });
mode = "ready";
status = `${completed}/${target} real-mic rows complete`;
detail = `Median WER ${formatPercent(summary.micMedianWer)}, speech end to audio ${formatMs(
summary.micMedianSpeechEndToFirstAudioMs,
)}.`;
progress = 1;
} else if (state.modelsLoaded) {
mode = "warn";
detail = `Click Run 3 real-mic series and say: "${prompt}" each time.`;
} else if (completed >= target) {
mode = "ready";
progress = 1;
} else if (completed > 0) {
mode = "warn";
}
elements.micValidationCard.dataset.state = mode;
elements.micValidationStatus.textContent = status;
elements.micValidationDetail.textContent = detail;
// Progress is clamped to 0..1 before being rendered as a percentage width.
elements.micValidationProgressBar.style.width = `${Math.round(Math.max(0, Math.min(1, progress)) * 100)}%`;
updateGpuValidationStatus();
}
/**
 * Renders the hardware WebGPU validation card and refreshes the two evidence
 * buttons. Does nothing when the card element is missing.
 *
 * Recorded hardware rows take precedence over the live adapter probe result.
 */
function updateGpuValidationStatus() {
  if (!elements.gpuValidationCard) return;
  const evidence = evidenceSummaryForRows(state.benchmarkResults).hardwareWebgpu;
  const hardwareRows = evidence.hardwareRows ?? 0;

  // Shown while the adapter probe is still pending.
  const view = {
    mode: "idle",
    status: "Checking adapter",
    detail: "Adapter probe pending.",
    progress: 0,
  };

  if (hardwareRows > 0) {
    const plural = hardwareRows === 1 ? "" : "s";
    view.mode = "ready";
    view.status = `${hardwareRows} hardware WebGPU row${plural}`;
    view.detail = evidence.adapters[0]?.label ?? "Hardware adapter recorded in benchmark rows.";
    view.progress = 1;
  } else if (state.webgpuAvailable === false) {
    view.mode = "warn";
    view.status = "No WebGPU adapter";
    view.detail = "Hardware WebGPU is not exposed in this browser.";
  } else if (state.webgpuAvailable === true) {
    view.mode = "warn";
    if (state.webgpuSoftwareAdapter) {
      view.status = "Software WebGPU only";
      view.detail = `Software adapter: ${formatGpuAdapter(state.webgpuAdapterInfo)}`;
    } else {
      view.status = "Hardware WebGPU available";
      view.detail = `${formatGpuAdapter(state.webgpuAdapterInfo)}. Run a WebGPU stack to capture a row.`;
      view.progress = 0.5;
    }
  }

  const clamped = Math.max(0, Math.min(1, view.progress));
  elements.gpuValidationCard.dataset.state = view.mode;
  elements.gpuValidationStatus.textContent = view.status;
  elements.gpuValidationDetail.textContent = view.detail;
  elements.gpuValidationProgressBar.style.width = `${Math.round(clamped * 100)}%`;
  updateGpuEvidenceButton();
  updateEvidenceCaptureButton();
}
/**
 * Updates the WebGPU evidence button: disabled while anything is busy or when
 * no hardware (non-software) adapter is available, and relabeled once at
 * least one hardware row has been recorded. Does nothing when the button is
 * missing.
 */
function updateGpuEvidenceButton() {
  const button = elements.gpuBenchmarkButton;
  if (!button) return;
  const hardwareAvailable = state.webgpuAvailable === true && state.webgpuSoftwareAdapter !== true;
  const busy = [
    state.modelsLoading,
    state.suiteRunning,
    state.evidenceCaptureRunning,
    state.micSeries.active,
    state.activeBenchmark,
  ].some(Boolean);
  const recordedRows = evidenceSummaryForRows(state.benchmarkResults).hardwareWebgpu.hardwareRows ?? 0;
  button.disabled = busy || !hardwareAvailable;
  if (recordedRows > 0) {
    button.textContent = "Run another WebGPU evidence row";
  } else {
    button.textContent = "Run WebGPU evidence row";
  }
}
/**
 * Updates the evidence capture button: disabled while anything is busy, and
 * labeled to show whether a capture is in progress. Does nothing when the
 * button is missing.
 */
function updateEvidenceCaptureButton() {
  const button = elements.evidenceCaptureButton;
  if (!button) return;
  const busy = [
    state.modelsLoading,
    state.suiteRunning,
    state.evidenceCaptureRunning,
    state.micSeries.active,
    state.activeBenchmark,
  ].some(Boolean);
  button.disabled = busy;
  if (state.evidenceCaptureRunning) {
    button.textContent = "Capturing evidence...";
  } else {
    button.textContent = "Run evidence capture";
  }
}
/**
 * Reports whether an evidence capture must not start right now.
 *
 * @returns {boolean} True while a benchmark, suite, mic series, model load or
 *   another evidence capture is in progress.
 */
function evidenceCaptureBusy() {
  // Coerce explicitly: a bare `||` chain would hand the activeBenchmark
  // object itself back to callers instead of a boolean.
  return Boolean(
    state.activeBenchmark ||
      state.suiteRunning ||
      state.micSeries.active ||
      state.modelsLoading ||
      state.evidenceCaptureRunning,
  );
}
/**
 * Clears per-turn conversation state: fresh message history, cleared timing
 * markers and TTS buffering, a bumped loopback request id, a cleared loopback
 * feed, and no active benchmark.
 */
function resetTurnState() {
  Object.assign(state, {
    messages: initialMessages(),
    currentAssistant: "",
    currentUserStartedAt: null,
    lastSpeechEndAt: null,
    lastVadCloseAt: null,
    lastVadCloseDelayMs: null,
    lastTranscriptAt: null,
    awaitingFirstToken: false,
    awaitingFirstAudio: false,
    firstTtsChunkQueued: false,
    ttsBuffer: "",
    ttsSequence: 0,
  });
  // The request id is bumped before the feed is cleared, and the active
  // benchmark is dropped only afterwards.
  state.loopbackRequestId += 1;
  clearLoopbackFeed();
  state.activeBenchmark = null;
}
/** Zeroes the running microphone statistics and the level meter. */
function resetMicInputStats() {
  const emptyStats = { chunks: 0, samples: 0, peak: 0, sumSquares: 0 };
  state.micInputStats = emptyStats;
  state.lastMicLevelUiAt = 0;
  setMicInputLevel(0);
}
/**
 * Folds one captured audio chunk into the running microphone statistics,
 * refreshes the level meter at most once per MIC_LEVEL_UI_INTERVAL_MS, and
 * mirrors the totals onto an active "mic" benchmark.
 *
 * @param {ArrayLike<number>} buffer Captured samples; empty or missing buffers are ignored.
 */
function updateMicInputStats(buffer) {
  if (!buffer?.length) return;
  const sampleCount = buffer.length;

  // Peak magnitude and energy of this chunk alone.
  let peak = 0;
  let energy = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    const value = buffer[index];
    const magnitude = Math.abs(value);
    if (magnitude > peak) peak = magnitude;
    energy += value * value;
  }

  const totals = state.micInputStats;
  totals.chunks += 1;
  totals.samples += sampleCount;
  totals.peak = Math.max(totals.peak, peak);
  totals.sumSquares += energy;

  const now = performance.now();
  const meterIsDue = now - state.lastMicLevelUiAt >= MIC_LEVEL_UI_INTERVAL_MS;
  if (meterIsDue) {
    const rms = Math.sqrt(energy / Math.max(1, sampleCount));
    // Display gain: the larger of peak and 3x RMS, boosted 4x and capped at 1.
    const meterLevel = Math.min(1, Math.max(peak, rms * 3) * 4);
    state.lastMicLevelUiAt = now;
    setMicInputLevel(meterLevel);
  }

  const benchmark = state.activeBenchmark;
  if (benchmark?.kind === "mic") {
    benchmark.micInputChunks = state.micInputStats.chunks;
    benchmark.micInputPeak = state.micInputStats.peak;
    benchmark.micInputRms = Math.sqrt(
      state.micInputStats.sumSquares / Math.max(1, state.micInputStats.samples),
    );
  }
}
/** Terminates any ASR, LLM and TTS workers, then clears their handles. */
function terminateWorkers() {
  const slots = ["asrWorker", "llmWorker", "ttsWorker"];
  // Terminate all three before clearing any handle.
  for (const slot of slots) {
    state[slot]?.terminate();
  }
  for (const slot of slots) {
    state[slot] = null;
  }
}
/**
 * Puts all pipeline tiles into "Idle", then, after the WebGPU probe, flags
 * the LLM and TTS tiles as WASM when no hardware WebGPU adapter is usable.
 */
async function resetPipelineTiles() {
  for (const stage of ["vad", "asr", "llm", "tts"]) {
    setTile(stage, "Idle", "idle");
  }
  await supportsWebGPU();
  const hardwareWebGpu = state.webgpuAvailable && !state.webgpuSoftwareAdapter;
  if (hardwareWebGpu) return;
  const label = state.webgpuSoftwareAdapter ? "WASM auto" : "WASM only";
  for (const stage of ["llm", "tts"]) {
    setTile(stage, label, "warn");
  }
}
/**
 * Probes for a WebGPU adapter once and caches the outcome in `state`.
 *
 * Later calls reuse the cached result. Every call re-renders the runtime
 * status. A missing `navigator.gpu`, a failed probe and a probe timeout are
 * all recorded as "unavailable".
 *
 * @returns {Promise<boolean>} Whether an adapter was obtained.
 */
async function supportsWebGPU() {
  const finish = () => {
    updateRuntimeStatus();
    return state.webgpuAvailable;
  };
  const recordUnavailable = () => {
    state.webgpuAvailable = false;
    state.webgpuAdapterInfo = null;
    state.webgpuAdapterFeatures = [];
    state.webgpuSoftwareAdapter = false;
  };

  // null means "not probed yet"; anything else is a cached result.
  if (state.webgpuAvailable !== null) return finish();

  if (!navigator.gpu) {
    recordUnavailable();
    return finish();
  }

  try {
    const adapter = await withTimeout(
      navigator.gpu.requestAdapter(),
      WEBGPU_ADAPTER_TIMEOUT_MS,
      "WebGPU adapter probe",
    );
    state.webgpuAvailable = Boolean(adapter);
    state.webgpuAdapterInfo = await readGpuAdapterInfo(adapter);
    state.webgpuAdapterFeatures = adapter?.features ? [...adapter.features].sort() : [];
    state.webgpuSoftwareAdapter = isSoftwareWebGpuAdapter(state.webgpuAdapterInfo);
  } catch {
    recordUnavailable();
  }
  return finish();
}
/**
 * Extracts the descriptive fields of a WebGPU adapter.
 *
 * Reads `adapter.info` when present, otherwise falls back to
 * `adapter.requestAdapterInfo()` when that method exists (a rejection is
 * treated as "no info").
 *
 * @param {object|null|undefined} adapter Adapter returned by the probe.
 * @returns {Promise<{vendor: string, architecture: string, device: string, description: string}|null>}
 *   The four fields with empty strings for missing values, or null when no
 *   info is available.
 */
async function readGpuAdapterInfo(adapter) {
  let info = adapter?.info;
  const canRequest = typeof adapter?.requestAdapterInfo === "function";
  if (!info && canRequest) {
    info = await adapter.requestAdapterInfo().catch(() => null);
  }
  if (!info) return null;
  const { vendor, architecture, device, description } = info;
  return {
    vendor: vendor || "",
    architecture: architecture || "",
    device: device || "",
    description: description || "",
  };
}
/**
 * Resolves the device the stack should run on.
 *
 * @returns {Promise<string>} The selected device as-is, except that "auto"
 *   becomes "webgpu" when a non-software adapter is available and "wasm"
 *   otherwise.
 */
async function resolveDevice() {
  const requested = elements.deviceSelect.value;
  if (requested !== "auto") return requested;
  await supportsWebGPU();
  const hardwareWebGpu = state.webgpuAvailable && !state.webgpuSoftwareAdapter;
  return hardwareWebGpu ? "webgpu" : "wasm";
}
/**
 * Starts a module worker and wires up its message and error handling.
 *
 * Messages of type "network" are routed to handleWorkerNetworkEvent(); every
 * other message goes to `handler`. Worker errors are logged and re-published
 * as a `model-error` window event.
 *
 * @param {string} path Worker script path, resolved against this module's URL.
 * @param {string} kind Worker kind ("asr", "llm" or "tts" at the call sites
 *   in loadModels), included in network and error events.
 * @param {(event: MessageEvent) => void} handler Receives non-network messages.
 * @returns {Worker} The started worker.
 */
function createWorker(path, kind, handler) {
  const scriptUrl = new URL(path, import.meta.url);
  const worker = new Worker(scriptUrl, { type: "module" });

  const onMessage = (event) => {
    const isNetworkEvent = event.data?.type === "network";
    if (!isNetworkEvent) {
      handler(event);
      return;
    }
    handleWorkerNetworkEvent(kind, event.data);
  };

  const onError = (event) => {
    const reason = event.message || "worker failed to load";
    const message = `${path} error: ${reason}`;
    logEvent(message);
    window.dispatchEvent(new CustomEvent("model-error", { detail: { kind, message } }));
  };

  worker.addEventListener("message", onMessage);
  worker.addEventListener("error", onError);
  return worker;
}
/**
 * Loads the ASR, LLM and TTS workers one after another for the selected stack.
 *
 * Returns immediately when models are already loaded or loading. Aborts back
 * to the unloaded state (with a log entry) when WebGPU was resolved but is not
 * available, or when selectedStackError() reports a problem. Any failure while
 * the workers load is logged and followed by a quiet unloadModels().
 *
 * @param {object} [options]
 * @param {boolean} [options.ttsWarmup=true] Forwarded to the TTS worker as `warmup`.
 * @returns {Promise<void>}
 */
async function loadModels({ ttsWarmup = true } = {}) {
if (state.modelsLoaded || state.modelsLoading) return;
state.modelsLoading = true;
setSessionControls("loading");
const device = await resolveDevice();
if (device === "webgpu" && !(await supportsWebGPU())) {
logEvent("WebGPU was requested but is not available in this browser.");
state.modelsLoading = false;
setSessionControls("unloaded");
await resetPipelineTiles();
return;
}
const stackError = selectedStackError(device);
if (stackError) {
logEvent(stackError);
state.modelsLoading = false;
setSessionControls("unloaded");
await resetPipelineTiles();
return;
}
// Start unlocking audio output without waiting for it; a failure is only logged.
state.playback ??= new PlaybackQueue();
const audioUnlock = state.playback.unlock().catch((error) => {
logEvent(`Audio unlock deferred: ${error.message}`);
});
setTile("vad", "Queued", "warn");
setTile("asr", "Queued", "warn");
setTile("llm", "Queued", "warn");
setTile("tts", "Queued", "warn");
logEvent(`Loading models on ${device.toUpperCase()}.`);
state.modelLoadStartedAt = performance.now();
// Description of the stack being loaded; modelLoadMs is filled in on success.
state.loadedStack = {
device,
llm: elements.llmModelSelect.value,
asr: elements.asrModelSelect.value,
vad: "onnx-community/silero-vad",
tts: "onnx-community/Supertonic-TTS-ONNX",
transformers: "@huggingface/transformers@4.2.0",
modelLoadMs: null,
};
// The unlock promise is intentionally not awaited.
void audioUnlock;
try {
setTile("vad", "Loading", "warn");
setTile("asr", "Loading", "warn");
logEvent("Loading local VAD and STT models.");
state.asrWorker = createWorker("./workers/asr-worker.js", "asr", onAsrMessage);
// The ready/error listeners are registered before the load message is posted.
const asrReady = waitForReady("asr");
state.asrWorker.postMessage({
type: "load",
model: elements.asrModelSelect.value,
device,
partial: elements.partialToggle.checked,
silenceMs: currentVadSilenceMs(),
});
await asrReady;
setTile("llm", "Loading", "warn");
logEvent("Loading local LLM.");
state.llmWorker = createWorker("./workers/llm-worker.js", "llm", onLlmMessage);
const llmReady = waitForReady("llm");
state.llmWorker.postMessage({
type: "load",
model: elements.llmModelSelect.value,
device,
});
await llmReady;
setTile("tts", "Loading", "warn");
logEvent("Loading local Supertonic TTS.");
state.ttsWorker = createWorker("./workers/tts-worker.js", "tts", onTtsMessage);
const ttsReady = waitForReady("tts");
state.ttsWorker.postMessage({
type: "load",
model: "onnx-community/Supertonic-TTS-ONNX",
device,
voice: elements.voiceSelect.value,
warmup: ttsWarmup,
});
await ttsReady;
const loadMs = performance.now() - state.modelLoadStartedAt;
state.loadedStack.modelLoadMs = loadMs;
state.modelsLoading = false;
state.modelsLoaded = true;
setSessionControls("loaded");
logEvent(`All local inference models are ready in ${formatMs(loadMs)}.`);
} catch (error) {
logEvent(`Model loading failed: ${error.message}`);
// Tears down any workers that did start and restores the unloaded UI.
await unloadModels({ quiet: true });
}
}
/**
 * Tears down the loaded stack and returns the UI to the unloaded state.
 *
 * Stops the microphone, cancels a running mic series and active benchmark,
 * rejects pending TTS voice requests, terminates the workers, clears
 * lifecycle and per-turn state, closes the playback queue and resets the
 * pipeline tiles.
 *
 * @param {object} [options]
 * @param {boolean} [options.quiet=false] Skip the "Unloaded ..." log entry.
 * @param {boolean} [options.preserveEvidenceCapture=false] Leave
 *   `state.evidenceCaptureRunning` untouched.
 * @returns {Promise<void>}
 */
async function unloadModels({ quiet = false, preserveEvidenceCapture = false } = {}) {
if (state.micActive) stopMic();
cancelMicSeries();
interruptForBargeIn();
cancelActiveBenchmark();
rejectTtsVoiceRequests(new Error("TTS worker unloaded."));
terminateWorkers();
state.modelsLoaded = false;
state.modelsLoading = false;
state.modelLoadStartedAt = null;
state.loadedStack = null;
state.suiteRunning = false;
if (!preserveEvidenceCapture) state.evidenceCaptureRunning = false;
resetTurnState();
resetMetrics();
resetConversationUi();
setMicState("Mic off", false);
setAudioState("Audio idle", false);
// A fresh PlaybackQueue is created by the next loadModels() call.
await state.playback?.close();
state.playback = null;
setSessionControls("unloaded");
await resetPipelineTiles();
if (!quiet) logEvent("Unloaded local inference models. Benchmark rows are preserved.");
}
/**
 * Waits for a worker of the given kind to report readiness.
 *
 * Listens on `window` for `model-ready` / `model-error` events whose
 * `detail.kind` matches; events for other kinds are ignored.
 *
 * @param {string} kind Worker kind to wait for.
 * @param {number} [timeoutMs=180000] How long to wait before giving up.
 * @returns {Promise<void>} Resolves on `model-ready`; rejects with the
 *   `model-error` message, or with "<kind> load timeout" after `timeoutMs`.
 */
function waitForReady(kind, timeoutMs = 180000) {
  return new Promise((resolve, reject) => {
    let timeout = 0;
    // Shared teardown so that every outcome, including the timeout, detaches
    // both listeners instead of leaving them on window.
    const detach = () => {
      window.clearTimeout(timeout);
      window.removeEventListener("model-ready", handler);
      window.removeEventListener("model-error", errorHandler);
    };
    const handler = (event) => {
      if (event.detail.kind !== kind) return;
      detach();
      resolve();
    };
    const errorHandler = (event) => {
      if (event.detail.kind !== kind) return;
      detach();
      reject(new Error(event.detail.message));
    };
    timeout = window.setTimeout(() => {
      detach();
      reject(new Error(`${kind} load timeout`));
    }, timeoutMs);
    window.addEventListener("model-ready", handler);
    window.addEventListener("model-error", errorHandler);
  });
}
/**
 * Starts microphone capture and streams PCM chunks to the ASR worker.
 *
 * Does nothing unless models are loaded and the mic is not already active.
 * If building the capture graph fails after the stream was granted, the
 * stream, audio context and nodes are released and the error is rethrown;
 * otherwise the mic would stay live while `state.micActive` is false, which
 * stopMic() treats as "nothing to stop".
 *
 * @returns {Promise<void>}
 * @throws {Error} When the microphone cannot be obtained or the capture graph
 *   cannot be built.
 */
async function startMic() {
  if (!state.modelsLoaded || state.micActive) return;
  state.playback?.unlock().catch(() => {});
  const stream = await getUserMediaWithTimeout({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  });
  // The permission prompt can stay open for a long time; re-check that the
  // stack is still loaded and that nothing else started capture meanwhile.
  if (!state.modelsLoaded || state.micActive) {
    for (const track of stream.getTracks()) track.stop();
    return;
  }
  state.mediaStream = stream;
  try {
    state.inputTrackSettings = readAudioTrackSettings(state.mediaStream);
    state.audioContext = new AudioContext();
    state.inputSampleRate = state.audioContext.sampleRate;
    if (state.activeBenchmark) {
      // Record the capture format on the benchmark row being collected.
      state.activeBenchmark.inputSampleRate = state.audioContext.sampleRate;
      state.activeBenchmark.inputTrackSettings = state.inputTrackSettings;
      if (state.activeBenchmark.stack?.environment) {
        state.activeBenchmark.stack.environment.inputSampleRate = state.audioContext.sampleRate;
        state.activeBenchmark.stack.environment.inputTrackSettings = state.inputTrackSettings;
      }
    }
    const workletUrl = URL.createObjectURL(
      new Blob([audioWorkletSource()], { type: "text/javascript" }),
    );
    try {
      await state.audioContext.audioWorklet.addModule(workletUrl);
    } finally {
      // The blob URL is only needed while the module is being loaded.
      URL.revokeObjectURL(workletUrl);
    }
    state.micSource = state.audioContext.createMediaStreamSource(state.mediaStream);
    state.workletNode = new AudioWorkletNode(state.audioContext, "pcm-capture");
    // Zero-gain sink: the worklet is connected through to the destination
    // without any microphone audio being audible.
    state.micMonitorGain = state.audioContext.createGain();
    state.micMonitorGain.gain.value = 0;
    state.workletNode.port.onmessage = (event) => {
      if (!state.asrWorker || !state.micActive) return;
      const buffer = event.data;
      updateMicInputStats(buffer);
      // Stats are taken first: the underlying buffer is transferred to the
      // worker by the post below.
      state.asrWorker.postMessage(
        {
          type: "audio",
          buffer,
          sampleRate: state.audioContext.sampleRate,
        },
        [buffer.buffer],
      );
    };
    state.micSource.connect(state.workletNode);
    state.workletNode.connect(state.micMonitorGain);
    state.micMonitorGain.connect(state.audioContext.destination);
    state.micActive = true;
  } catch (error) {
    // Release the microphone first so it is freed even if a later step fails.
    for (const track of state.mediaStream?.getTracks() ?? []) {
      track.stop();
    }
    state.workletNode?.disconnect();
    state.micMonitorGain?.disconnect();
    state.micSource?.disconnect();
    state.audioContext?.close().catch(() => {});
    state.audioContext = null;
    state.inputSampleRate = null;
    state.inputTrackSettings = null;
    state.mediaStream = null;
    state.workletNode = null;
    state.micSource = null;
    state.micMonitorGain = null;
    throw error;
  }
  elements.micButton.innerHTML = '● Stop mic';
  setMicState("Listening", true);
  setTile("vad", "Listening", "active");
  logEvent("Microphone capture started.");
}
/**
 * Requests a media stream but gives up after `timeoutMs`.
 *
 * If the stream arrives after the timeout already fired, its tracks are
 * stopped so the microphone is not left open.
 *
 * @param {MediaStreamConstraints} constraints - Passed to getUserMedia.
 * @param {number} [timeoutMs=MIC_START_TIMEOUT_MS] - Maximum wait in milliseconds.
 * @returns {Promise<MediaStream>} The granted stream.
 * @throws {Error} When getUserMedia is unavailable, when it rejects, or when
 *   the timeout elapses first.
 */
async function getUserMediaWithTimeout(constraints, timeoutMs = MIC_START_TIMEOUT_MS) {
  if (typeof navigator.mediaDevices?.getUserMedia !== "function") {
    throw new Error(
      "Microphone access is unavailable: navigator.mediaDevices.getUserMedia is not supported in this context.",
    );
  }
  let timeoutId = 0;
  let timedOut = false;
  const mediaPromise = navigator.mediaDevices.getUserMedia(constraints).then((stream) => {
    if (timedOut) {
      // Nobody is waiting for this stream any more; release the device.
      for (const track of stream.getTracks()) track.stop();
      return null;
    }
    return stream;
  });
  // A rejection that arrives after the timeout must not surface as unhandled.
  mediaPromise.catch(() => {});
  const timeoutPromise = new Promise((resolve) => {
    timeoutId = window.setTimeout(() => {
      timedOut = true;
      resolve(null);
    }, timeoutMs);
  });
  let stream = null;
  try {
    stream = await Promise.race([mediaPromise, timeoutPromise]);
  } finally {
    // Clear the timer on every path, including a getUserMedia rejection.
    window.clearTimeout(timeoutId);
  }
  if (!stream) throw new Error(`Microphone start timed out after ${(timeoutMs / 1000).toFixed(0)} seconds.`);
  return stream;
}
/**
 * Extracts a whitelisted subset of the first audio track's settings.
 *
 * @param {MediaStream|null|undefined} stream - Stream to inspect.
 * @returns {Object|null} Whitelisted keys whose value is not null/undefined,
 *   or null when no settings object is available.
 */
function readAudioTrackSettings(stream) {
  const trackSettings = stream?.getAudioTracks?.()[0]?.getSettings?.();
  if (!trackSettings) return null;
  const reportedKeys = [
    "autoGainControl",
    "channelCount",
    "echoCancellation",
    "latency",
    "noiseSuppression",
    "sampleRate",
    "sampleSize",
  ];
  const picked = {};
  for (const name of reportedKeys) {
    const settingValue = trackSettings[name];
    if (settingValue != null) picked[name] = settingValue;
  }
  return picked;
}
/**
 * Races a promise against a timer.
 *
 * @param {Promise} promise - Work to wait for.
 * @param {number} timeoutMs - Milliseconds before giving up.
 * @param {string} label - Prefix used in the timeout error message.
 * @returns {Promise} Settles like `promise`, or rejects with a timeout Error.
 */
function withTimeout(promise, timeoutMs, label) {
  let timer = 0;
  // Cancel the timer as soon as the wrapped promise settles either way.
  const guarded = promise.finally(() => window.clearTimeout(timer));
  const deadline = new Promise((_resolve, reject) => {
    const expire = () => {
      reject(new Error(`${label} timed out after ${(timeoutMs / 1000).toFixed(0)} seconds.`));
    };
    timer = window.setTimeout(expire, timeoutMs);
  });
  return Promise.race([guarded, deadline]);
}
/**
 * Stops microphone capture: disconnects the audio graph, stops the stream's
 * tracks, closes the AudioContext, clears the related `state` fields and
 * resets the mic UI. No-op when the mic is not active.
 */
function stopMic() {
  if (!state.micActive) return;
  state.micActive = false;

  const graphNodes = [state.workletNode, state.micMonitorGain, state.micSource];
  graphNodes.forEach((node) => node?.disconnect());

  const tracks = state.mediaStream?.getTracks() ?? [];
  tracks.forEach((track) => {
    track.stop();
  });

  state.audioContext?.close();

  const clearedFields = [
    "audioContext",
    "inputSampleRate",
    "inputTrackSettings",
    "mediaStream",
    "workletNode",
    "micSource",
    "micMonitorGain",
  ];
  for (const field of clearedFields) {
    state[field] = null;
  }

  resetMicInputStats();
  elements.micButton.innerHTML = '● Start mic';
  setMicState("Mic off", false);
  setTile("vad", "Ready", "ready");
  logEvent("Microphone capture stopped.");
}
/**
 * Returns the source text of the "pcm-capture" AudioWorklet processor.
 *
 * The processor copies the first channel of its first input into a
 * 512-sample Float32Array and posts (transfers) each full buffer to the port.
 *
 * @returns {string} JavaScript module source for the worklet.
 */
function audioWorkletSource() {
  const processorSource = `
class PcmCaptureProcessor extends AudioWorkletProcessor {
constructor() {
super();
this.size = 512;
this.buffer = new Float32Array(this.size);
this.offset = 0;
}
process(inputs) {
const channel = inputs[0] && inputs[0][0];
if (!channel) return true;
for (let i = 0; i < channel.length; i += 1) {
this.buffer[this.offset++] = channel[i];
if (this.offset === this.size) {
const out = this.buffer;
this.port.postMessage(out, [out.buffer]);
this.buffer = new Float32Array(this.size);
this.offset = 0;
}
}
return true;
}
}
registerProcessor("pcm-capture", PcmCaptureProcessor);
`;
  return processorSource;
}
/**
 * Tells whether speech start/end/partial events from the ASR worker should be
 * processed: only while the mic is active or a loopback benchmark is running.
 */
function shouldAcceptAsrSpeechEvent() {
  const { micActive, activeBenchmark } = state;
  return micActive || activeBenchmark?.kind === "loopback";
}
/**
 * Tells whether a final transcript should be processed.
 *
 * Accepted while the mic is active or a loopback benchmark runs, and also for
 * a mic benchmark that has not recorded a transcript yet.
 *
 * @returns {boolean}
 */
function shouldAcceptAsrTranscript() {
  const benchmark = state.activeBenchmark;
  const hasLiveInput = state.micActive || benchmark?.kind === "loopback";
  if (hasLiveInput) return true;
  return benchmark?.kind === "mic" && !benchmark.transcript;
}
/**
 * Handles messages posted by the ASR worker.
 *
 * Message types handled: "status", "ready", "error", "speechstart",
 * "speechend", "partial" and "transcript". Unknown types are ignored.
 *
 * @param {MessageEvent} event - Worker message; `event.data.type` selects the branch.
 */
function onAsrMessage(event) {
  const message = event.data;
  if (message.type === "status") {
    handleStatus("asr", message);
    // VAD-scoped status is mirrored onto the VAD tile as well.
    if (message.scope === "vad") handleStatus("vad", message);
    return;
  }
  if (message.type === "ready") {
    setTile("vad", "Ready", "ready");
    setTile("asr", "Ready", "ready");
    window.dispatchEvent(new CustomEvent("model-ready", { detail: { kind: "asr" } }));
    return;
  }
  if (message.type === "error") {
    setTile("asr", "Error", "error");
    window.dispatchEvent(
      new CustomEvent("model-error", { detail: { kind: "asr", message: message.message } }),
    );
    return;
  }
  if (message.type === "speechstart") {
    if (!shouldAcceptAsrSpeechEvent()) return;
    handleSpeechStart();
    return;
  }
  if (message.type === "speechend") {
    if (!shouldAcceptAsrSpeechEvent()) return;
    const now = performance.now();
    const closeDelay = Number.isFinite(message.trailingSilenceMs) ? message.trailingSilenceMs : 0;
    state.lastVadCloseAt = now;
    state.lastVadCloseDelayMs = closeDelay;
    // Back-date the speech end by the trailing silence the worker reported.
    state.lastSpeechEndAt = now - closeDelay;
    elements.vadCloseLatency.textContent = formatMs(closeDelay);
    if (state.activeBenchmark) state.activeBenchmark.vadCloseDelayMs = closeDelay;
    elements.partialTranscript.textContent = "Transcribing...";
    setTile("vad", "Transcribing", "warn");
    setTile("asr", "Transcribing", "active");
    return;
  }
  if (message.type === "partial") {
    if (!shouldAcceptAsrSpeechEvent()) return;
    if (message.text?.trim()) {
      elements.partialTranscript.textContent = message.text.trim();
    }
    return;
  }
  if (message.type === "transcript") {
    if (!shouldAcceptAsrTranscript()) return;
    // A transcript message without text is treated as "no speech" instead of
    // throwing, which would leave the tiles and any benchmark stuck.
    const text = String(message.text ?? "").trim();
    const shouldStopMicAfterTranscript =
      state.activeBenchmark?.kind === "mic" && state.activeBenchmark.stopMicAfterTranscript === true;
    state.lastTranscriptAt = performance.now();
    elements.asrLatency.textContent = formatMs(state.lastTranscriptAt - state.lastSpeechEndAt);
    if (state.activeBenchmark) {
      state.activeBenchmark.transcript = text;
      state.activeBenchmark.asrMs = state.lastTranscriptAt - state.lastSpeechEndAt;
      updateTranscriptQuality(state.activeBenchmark, text);
    }
    setTile("vad", state.micActive ? "Listening" : "Ready", state.micActive ? "active" : "ready");
    setTile("asr", "Ready", "ready");
    if (!text) {
      elements.partialTranscript.textContent = "No speech recognized.";
      failActiveBenchmark("No speech recognized.");
      return;
    }
    if (
      state.activeBenchmark?.kind === "mic" &&
      state.activeBenchmark.requireExactTranscript === true &&
      state.activeBenchmark.sttWer !== 0
    ) {
      // Scripted mic runs keep listening until the expected phrase is heard.
      elements.partialTranscript.textContent = `Heard: "${text}"`;
      logEvent(`Ignoring non-matching scripted transcript: "${text}"`);
      return;
    }
    elements.partialTranscript.textContent = "";
    elements.finalTranscript.textContent = text;
    logEvent(`Transcript: "${text}"`);
    generateResponse(text);
    if (shouldStopMicAfterTranscript) stopMic();
  }
}
/**
 * Handles messages posted by the LLM worker.
 *
 * "status", "ready" and "error" are always processed. "start", "prompt",
 * "token" and "complete" are ignored unless their turnId matches
 * `state.activeTurnId`. Unknown types are ignored.
 *
 * @param {MessageEvent} event - Worker message; `event.data.type` selects the branch.
 */
function onLlmMessage(event) {
  const message = event.data;
  const staleTurn = message.turnId !== state.activeTurnId;
  const benchmark = state.activeBenchmark;

  switch (message.type) {
    case "status": {
      handleStatus("llm", message);
      return;
    }
    case "ready": {
      setTile("llm", "Ready", "ready");
      window.dispatchEvent(new CustomEvent("model-ready", { detail: { kind: "llm" } }));
      return;
    }
    case "error": {
      setTile("llm", "Error", "error");
      const detail = { kind: "llm", message: message.message };
      window.dispatchEvent(new CustomEvent("model-error", { detail }));
      return;
    }
    case "start": {
      if (staleTurn) return;
      const llmStartMs = Number.isFinite(state.lastTranscriptAt)
        ? performance.now() - state.lastTranscriptAt
        : null;
      // Reset per-turn streaming state before the first token arrives.
      state.currentAssistant = "";
      state.ttsBuffer = "";
      state.awaitingFirstToken = true;
      state.awaitingFirstAudio = true;
      state.firstTtsChunkQueued = false;
      elements.llmOutput.textContent = "";
      const resetMetrics = [
        "firstTokenLatency",
        "firstTtsQueuedLatency",
        "ttsSynthLatency",
        "firstAudioLatency",
        "speechToAudioLatency",
        "decodeRate",
      ];
      for (const metric of resetMetrics) {
        elements[metric].textContent = "-";
      }
      if (benchmark) benchmark.llmStartMs = llmStartMs;
      setTile("llm", "Generating", "active");
      return;
    }
    case "prompt": {
      if (staleTurn) return;
      if (benchmark) {
        benchmark.llmPromptBuildMs = message.promptBuildMs ?? null;
        benchmark.llmPromptTokens = message.inputTokens ?? null;
      }
      return;
    }
    case "token": {
      if (staleTurn) return;
      if (state.awaitingFirstToken) {
        state.awaitingFirstToken = false;
        const firstTokenDelay = performance.now() - state.lastTranscriptAt;
        elements.firstTokenLatency.textContent = formatMs(firstTokenDelay);
        if (benchmark) benchmark.firstTokenMs = firstTokenDelay;
      }
      state.currentAssistant += message.text;
      elements.llmOutput.textContent = state.currentAssistant;
      if (message.tps) {
        elements.decodeRate.textContent = `${message.tps.toFixed(1)} tok/s`;
      }
      bufferTts(message.text);
      return;
    }
    case "complete": {
      if (staleTurn) return;
      flushTts(true);
      setTile("llm", "Ready", "ready");
      const cleanAssistant = cleanAssistantResponse(state.currentAssistant);
      if (cleanAssistant) {
        state.currentAssistant = cleanAssistant;
        elements.llmOutput.textContent = cleanAssistant;
        state.messages.push({ role: "assistant", content: cleanAssistant });
        trimConversation();
      }
      // Re-read the active benchmark: the calls above may have changed it.
      const activeRow = state.activeBenchmark;
      if (activeRow) {
        activeRow.output = cleanAssistant;
        updateOutputQuality(activeRow, cleanAssistant);
        activeRow.decodeRate = elements.decodeRate.textContent;
        activeRow.llmDoneAt = performance.now();
        activeRow.llmCompleteMs = activeRow.llmDoneAt - state.lastTranscriptAt;
        window.setTimeout(() => finalizeBenchmarkIfIdle(), 1200);
      }
      logEvent("LLM response complete.");
      return;
    }
    default:
      return;
  }
}
/**
 * Handles messages posted by the TTS worker.
 *
 * Message types handled: "status", "ready", "error", "voice-ready",
 * "voice-error", "audio", "synthetic-audio" and "done". Unknown types are
 * ignored.
 *
 * @param {MessageEvent} event - Worker message; `event.data.type` selects the branch.
 */
function onTtsMessage(event) {
  const message = event.data;
  if (message.type === "status") {
    handleStatus("tts", message);
    return;
  }
  if (message.type === "ready") {
    setTile("tts", "Ready", "ready");
    window.dispatchEvent(new CustomEvent("model-ready", { detail: { kind: "tts" } }));
    return;
  }
  if (message.type === "error") {
    setTile("tts", "Error", "error");
    // Pending voice requests are rejected with the worker's error message.
    rejectTtsVoiceRequests(new Error(message.message));
    window.dispatchEvent(
      new CustomEvent("model-error", { detail: { kind: "tts", message: message.message } }),
    );
    return;
  }
  if (message.type === "voice-ready") {
    // Settle the matching pending voice request, if it is still registered.
    const request = state.ttsVoiceRequests.get(message.requestId);
    if (request) {
      window.clearTimeout(request.timeout);
      state.ttsVoiceRequests.delete(message.requestId);
      request.resolve();
    }
    return;
  }
  if (message.type === "voice-error") {
    const request = state.ttsVoiceRequests.get(message.requestId);
    if (request) {
      window.clearTimeout(request.timeout);
      state.ttsVoiceRequests.delete(message.requestId);
      request.reject(new Error(message.message));
      return;
    }
    // No pending request matched: only log the failure.
    logEvent(`Voice ${message.voice} preload failed: ${message.message}`);
    return;
  }
  if (message.type === "audio") {
    // Audio for a turn that is no longer active is dropped.
    if (message.turnId !== state.activeTurnId) return;
    let recordedFirstAudio = false;
    if (state.awaitingFirstAudio) {
      // First chunk of the turn: record latencies relative to the transcript
      // time and to the speech end time.
      recordedFirstAudio = true;
      state.awaitingFirstAudio = false;
      const now = performance.now();
      const latency = now - state.lastTranscriptAt;
      const speechEndToAudioMs = Number.isFinite(state.lastSpeechEndAt)
        ? now - state.lastSpeechEndAt
        : null;
      elements.firstAudioLatency.textContent = formatMs(latency);
      elements.ttsSynthLatency.textContent = formatMs(message.synthesisMs);
      elements.speechToAudioLatency.textContent = formatMs(speechEndToAudioMs);
      if (state.activeBenchmark) {
        state.activeBenchmark.firstAudioMs = latency;
        // Only stored when this benchmark has an ASR latency recorded.
        state.activeBenchmark.speechEndToFirstAudioMs =
          state.activeBenchmark.asrMs != null ? speechEndToAudioMs : null;
        state.activeBenchmark.firstTtsSynthesisMs = message.synthesisMs ?? null;
        state.activeBenchmark.firstTtsRoundTripMs = Number.isFinite(message.enqueuedAt)
          ? now - message.enqueuedAt
          : null;
        state.activeBenchmark.firstTtsChars = message.text?.length ?? null;
        state.activeBenchmark.firstTtsText = message.text ?? "";
      }
    }
    if (state.activeBenchmark) {
      state.activeBenchmark.ttsChunkCount += 1;
    }
    // Counted up here and down in .finally(); finalizeBenchmarkIfIdle() waits
    // while this counter is above zero.
    state.pendingPlaybackSchedules += 1;
    state.playback
      .play(message.audio, message.sampleRate)
      .then((playback) => {
        if (message.turnId !== state.activeTurnId) return;
        if (state.activeBenchmark && playback) {
          const now = performance.now();
          const audioEndMs = now + (playback.endDelayMs ?? playback.durationMs ?? 0) - state.lastTranscriptAt;
          // Keep the latest end time seen across all chunks of the turn.
          state.activeBenchmark.audioEndMs = Math.max(
            state.activeBenchmark.audioEndMs ?? 0,
            audioEndMs,
          );
          state.activeBenchmark.speechEndToAudioEndMs = Number.isFinite(state.lastSpeechEndAt)
            ? Math.max(
                state.activeBenchmark.speechEndToAudioEndMs ?? 0,
                now + (playback.endDelayMs ?? playback.durationMs ?? 0) - state.lastSpeechEndAt,
              )
            : null;
        }
        if (recordedFirstAudio && state.activeBenchmark) {
          state.activeBenchmark.firstTtsPlaybackDelayMs = playback?.startDelayMs ?? null;
        }
        if (state.activeBenchmark?.llmDoneAt) window.setTimeout(() => finalizeBenchmarkIfIdle(), 0);
      })
      .catch((error) => logEvent(`Playback failed: ${error.message}`))
      .finally(() => {
        state.pendingPlaybackSchedules = Math.max(0, state.pendingPlaybackSchedules - 1);
        if (state.activeBenchmark?.llmDoneAt) window.setTimeout(() => finalizeBenchmarkIfIdle(), 0);
      });
    setTile("tts", "Speaking", "active");
    return;
  }
  if (message.type === "synthetic-audio") {
    // Automation requests take precedence over the loopback request.
    if (state.automationSynthesisRequests.has(message.requestId)) {
      const request = state.automationSynthesisRequests.get(message.requestId);
      // The audio is stashed on the request; it is resolved when "done" arrives.
      request.audio = new Float32Array(message.audio);
      request.sampleRate = message.sampleRate;
      request.synthesisMs = message.synthesisMs ?? null;
      return;
    }
    if (message.requestId !== state.loopbackRequestId) return;
    if (state.activeBenchmark) state.activeBenchmark.loopbackTtsSynthesisMs = message.synthesisMs ?? null;
    feedLoopbackAudio(message.audio, message.sampleRate);
    return;
  }
  if (message.type === "done") {
    if (state.automationSynthesisRequests.has(message.requestId)) {
      const request = state.automationSynthesisRequests.get(message.requestId);
      state.automationSynthesisRequests.delete(message.requestId);
      setTile("tts", "Ready", "ready");
      if (!request.audio) {
        request.reject(new Error("Automation synthesis finished without audio."));
        return;
      }
      request.resolve({
        text: request.text,
        voice: request.voice,
        steps: request.steps,
        speed: request.speed,
        sampleRate: request.sampleRate,
        synthesisMs: request.synthesisMs,
        // Converted to a plain array of numbers for the resolved payload.
        audio: Array.from(request.audio),
      });
      return;
    }
    if (message.requestId === state.loopbackRequestId) {
      setTile("tts", "Ready", "ready");
      if (state.activeBenchmark?.llmDoneAt) window.setTimeout(() => finalizeBenchmarkIfIdle(), 0);
      return;
    }
    if (message.turnId !== state.activeTurnId) return;
    setTile("tts", "Ready", "ready");
    if (state.activeBenchmark?.llmDoneAt) window.setTimeout(() => finalizeBenchmarkIfIdle(), 0);
  }
}
/**
 * Applies a worker status message to a pipeline tile.
 *
 * @param {string} kind - Tile identifier passed to setTile.
 * @param {Object} message - Status message; `message`, `mode` and `detail` are optional.
 */
function handleStatus(kind, message) {
  const label = message.message ?? "Working";
  let stateMode = message.mode;
  if (stateMode == null) {
    // Without an explicit mode, labels mentioning "load" are shown as warnings.
    const looksLikeLoading = label.toLowerCase().includes("load");
    stateMode = looksLikeLoading ? "warn" : "active";
  }
  setTile(kind, label, stateMode);
  if (message.detail) {
    logEvent(message.detail);
  }
}
/**
 * Records a network request reported by a worker.
 *
 * The normalized event is appended to `state.workerNetworkEvents` (capped at
 * MAX_WORKER_NETWORK_EVENTS, oldest dropped) and, when a benchmark is active,
 * also to that benchmark's counters and request list (capped at
 * MAX_ROW_WORKER_NETWORK_EVENTS).
 *
 * @param {string} worker - Name of the reporting worker.
 * @param {Object} message - Raw request report from the worker.
 */
function handleWorkerNetworkEvent(worker, message) {
  const finiteOrNull = (value) => (Number.isFinite(value) ? value : null);
  const dropOldest = (list, limit) => {
    if (list.length > limit) {
      list.splice(0, list.length - limit);
    }
  };

  const event = {
    id: ++state.workerNetworkSeq,
    capturedAt: new Date().toISOString(),
    elapsedMs: Math.round(performance.now()),
    phase: workerNetworkPhase(),
    worker,
    method: String(message.method || "GET").toUpperCase(),
    url: sanitizeNetworkUrl(message.responseUrl || message.url),
    requestUrl: sanitizeNetworkUrl(message.url),
    status: finiteOrNull(message.status),
    ok: typeof message.ok === "boolean" ? message.ok : null,
    durationMs: finiteOrNull(message.durationMs),
    error: message.error ? String(message.error) : "",
  };
  event.serverInferenceSuspect = isWorkerServerInferenceSuspect(event);

  state.workerNetworkEvents.push(event);
  dropOldest(state.workerNetworkEvents, MAX_WORKER_NETWORK_EVENTS);

  const benchmark = state.activeBenchmark;
  if (!benchmark) return;
  benchmark.workerNetworkRequestCount += 1;
  if (event.serverInferenceSuspect) {
    benchmark.workerNetworkServerInferenceSuspects += 1;
  }
  benchmark.workerNetworkRequests.push(event);
  dropOldest(benchmark.workerNetworkRequests, MAX_ROW_WORKER_NETWORK_EVENTS);
}
/**
 * Names the current app phase for network-event tagging.
 *
 * @returns {"load"|"benchmark"|"ready"|"idle"} First matching phase, in that priority order.
 */
function workerNetworkPhase() {
  const candidates = [
    ["load", state.modelsLoading],
    ["benchmark", state.activeBenchmark],
    ["ready", state.modelsLoaded],
  ];
  const match = candidates.find(([, isCurrent]) => isCurrent);
  return match ? match[0] : "idle";
}
/**
 * Normalizes a URL for logging/export with query string, fragment and any
 * embedded credentials removed.
 *
 * Relative URLs are resolved against `location.href`. If the value cannot be
 * parsed as a URL, the raw text up to the first "?" or "#" is returned
 * (truncated to 500 characters) so query parameters are still not leaked.
 *
 * @param {*} [value=""] - URL-like value; falsy values yield "".
 * @returns {string} Sanitized absolute URL, a truncated raw prefix, or "".
 */
function sanitizeNetworkUrl(value = "") {
  const raw = String(value || "");
  if (!raw) return "";
  try {
    const url = new URL(raw, location.href);
    url.hash = "";
    url.search = "";
    // user:password@ in a URL is as sensitive as a token in the query string.
    url.username = "";
    url.password = "";
    return url.href;
  } catch {
    return raw.split(/[?#]/, 1)[0].slice(0, 500);
  }
}
/**
 * Flags a worker network event that may indicate server-side inference.
 *
 * Requests to known model-asset hosts are never flagged. Otherwise a request
 * is flagged when its URL contains a known inference API path, or when it
 * happened during the "benchmark" phase.
 *
 * @param {{url: string, phase: string}} event - Normalized network event.
 * @returns {boolean}
 */
function isWorkerServerInferenceSuspect(event) {
  if (!event.url) return false;
  const lowerUrl = event.url.toLowerCase();
  if (isKnownModelAssetUrl(lowerUrl)) return false;
  const inferencePaths = [
    "/v1/chat/completions",
    "/v1/completions",
    "/v1/audio/",
    "/api/inference",
    "/api/pipeline",
  ];
  const hitsInferencePath = inferencePaths.some((path) => lowerUrl.includes(path));
  return hitsInferencePath || event.phase === "benchmark";
}
/**
 * Checks whether an absolute URL points at a host recognized as serving model
 * assets (jsDelivr, huggingface.co and its subdomains, *.hf.co).
 *
 * @param {string} [value=""] - Absolute URL.
 * @returns {boolean} False for unparseable values.
 */
function isKnownModelAssetUrl(value = "") {
  let host = "";
  try {
    host = new URL(value).hostname.toLowerCase();
  } catch {
    return false;
  }
  if (host === "cdn.jsdelivr.net" || host === "huggingface.co") return true;
  const trustedSuffixes = [".huggingface.co", ".hf.co", ".xethub.hf.co"];
  return trustedSuffixes.some((suffix) => host.endsWith(suffix));
}
/**
 * Builds a snapshot of the stack a benchmark runs against: the loaded stack
 * overlaid with the current UI settings and the runtime environment.
 *
 * @returns {Object} New object; `ttsChunking` is a shallow copy.
 */
function currentBenchmarkStack() {
  const uiSettings = {
    voice: elements.voiceSelect.value,
    ttsSteps: Number(elements.ttsSteps.value),
    vadSilenceMs: currentVadSilenceMs(),
    partialAsr: elements.partialToggle.checked,
    ttsChunking: { ...state.ttsChunking },
    environment: runtimeEnvironment(),
  };
  return { ...state.loadedStack, ...uiSettings };
}
/**
 * Validates the selected LLM against the chosen device.
 *
 * @param {string} device - Selected runtime device (e.g. "webgpu").
 * @returns {string} User-facing error text, or "" when the combination is allowed.
 */
function selectedStackError(device) {
  const llm = elements.llmModelSelect.value;
  const needsWebgpu = WEBGPU_ONLY_LLMS.has(llm);
  if (device === "webgpu" || !needsWebgpu) return "";
  const reason = WEBGPU_ONLY_REASONS.get(llm) ?? "it is too large for the WASM fallback path.";
  return `${shortModel(llm)} is disabled on WASM fallback because ${reason} Select WebGPU on a supported browser for this candidate.`;
}
/**
 * Stores word and character error rates of a transcript on the benchmark row.
 * Does nothing when the row has no reference text.
 *
 * @param {Object} benchmark - Row to update (`sttWer`, `sttCer`).
 * @param {string} transcript - Recognized text.
 */
function updateTranscriptQuality(benchmark, transcript) {
  const reference = benchmark.referenceText;
  if (!reference) return;
  const metrics = [
    ["sttWer", words],
    ["sttCer", chars],
  ];
  for (const [field, tokenize] of metrics) {
    benchmark[field] = errorRate(tokenize(reference), tokenize(transcript));
  }
}
/**
 * Scores an assistant answer against APP_IDENTITY_QUALITY_RULES and stores
 * the result on the benchmark row.
 *
 * Only runs when the row's prompt or transcript is an app-identity question.
 * Each rule is evaluated exactly once, so a rule is always either a hit or
 * missing, never both, even if its pattern carries the stateful `g`/`y` flag.
 *
 * @param {Object} benchmark - Row to update (`llmQuality*` fields).
 * @param {string} output - Assistant answer to score.
 */
function updateOutputQuality(benchmark, output) {
  if (!isAppIdentityQuestion(benchmark.prompt) && !isAppIdentityQuestion(benchmark.transcript)) return;
  const hits = [];
  const missing = [];
  for (const rule of APP_IDENTITY_QUALITY_RULES) {
    const { pattern } = rule;
    const stateful = Boolean(pattern.global || pattern.sticky);
    // Global/sticky regexes resume from lastIndex; always match from the start.
    if (stateful) pattern.lastIndex = 0;
    const matched = pattern.test(output);
    if (stateful) pattern.lastIndex = 0;
    (matched ? hits : missing).push(rule.label);
  }
  benchmark.llmQualityHits = hits;
  benchmark.llmQualityMissing = missing;
  benchmark.llmQualityScore = hits.length;
  benchmark.llmQualityTotal = APP_IDENTITY_QUALITY_RULES.length;
  benchmark.llmQualityPass = missing.length === 0;
}
/**
 * Detects whether a text is one of the recognized "what is this app/demo"
 * questions, after normalization by normalizeIdentityIntent.
 *
 * @param {string} text - Prompt or transcript.
 * @returns {boolean}
 */
function isAppIdentityQuestion(text) {
  const normalized = normalizeIdentityIntent(text);
  const identityPatterns = [
    /^(what's|what is|what app is|what demo is|what application is) this$/,
    /^(what's|what is) this (browser )?(app|application|demo)$/,
    /^(please )?identify (this )?(browser )?(demo|app|application)$/,
  ];
  return identityPatterns.some((pattern) => pattern.test(normalized));
}
/**
 * Splits a text into normalized, non-empty words.
 *
 * @param {string} text
 * @returns {string[]}
 */
function words(text) {
  const pieces = normalizeForQuality(text).split(" ");
  return pieces.filter((piece) => piece.length > 0);
}
/**
 * Splits a text into its normalized characters with all whitespace removed.
 *
 * @param {string} text
 * @returns {string[]}
 */
function chars(text) {
  const normalized = normalizeForQuality(text);
  const withoutSpaces = normalized.replace(/\s+/g, "");
  return withoutSpaces.split("");
}
/**
 * Normalizes text for quality comparison: lowercases, replaces every run of
 * characters other than a-z, 0-9, apostrophe and space with one space,
 * collapses whitespace and trims.
 *
 * @param {*} text - Any value; null/undefined become "".
 * @returns {string}
 */
function normalizeForQuality(text) {
  const lowered = String(text ?? "").toLowerCase();
  const lettersOnly = lowered.replace(/[^a-z0-9' ]+/g, " ");
  const singleSpaced = lettersOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Normalizes text for identity-question matching and rewrites a few specific
 * whole words (browse, dome, demos, identifies) to the forms the matcher expects.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeIdentityIntent(text) {
  const rewrites = [
    [/\bbrowse\b/g, "browser"],
    [/\bdome\b/g, "demo"],
    [/\bdemos\b/g, "demo"],
    [/\bidentifies\b/g, "identify"],
  ];
  let normalized = normalizeForQuality(text);
  for (const [pattern, replacement] of rewrites) {
    normalized = normalized.replace(pattern, replacement);
  }
  return normalized;
}
/**
 * Computes edit distance divided by reference length.
 *
 * @param {Array} reference - Expected tokens.
 * @param {Array} hypothesis - Observed tokens.
 * @returns {number} For an empty reference: 0 when the hypothesis is also empty, else 1.
 */
function errorRate(reference, hypothesis) {
  const referenceLength = reference.length;
  if (referenceLength === 0) {
    return hypothesis.length === 0 ? 0 : 1;
  }
  const distance = editDistance(reference, hypothesis);
  return distance / referenceLength;
}
/**
 * Computes the edit distance (insert/delete/substitute, each cost 1) between
 * two indexable sequences, keeping only two rows of the table.
 *
 * @param {ArrayLike} reference
 * @param {ArrayLike} hypothesis
 * @returns {number}
 */
function editDistance(reference, hypothesis) {
  const width = hypothesis.length;
  let prevRow = [];
  for (let col = 0; col <= width; col += 1) {
    prevRow.push(col);
  }
  for (let row = 1; row <= reference.length; row += 1) {
    const refItem = reference[row - 1];
    const nextRow = [row];
    for (let col = 1; col <= width; col += 1) {
      const deletion = prevRow[col] + 1;
      const insertion = nextRow[col - 1] + 1;
      const substitution = prevRow[col - 1] + (refItem === hypothesis[col - 1] ? 0 : 1);
      nextRow[col] = Math.min(deletion, insertion, substitution);
    }
    prevRow = nextRow;
  }
  return prevRow[width];
}
/**
 * Collects a snapshot of the page, browser and capture environment for
 * benchmark rows and exports.
 *
 * @returns {Object} New object; `hostMetadata` is a shallow copy.
 */
function runtimeEnvironment() {
  const { userAgent, platform, language, hardwareConcurrency, deviceMemory } = navigator;
  const {
    hostMetadata,
    webgpuAvailable,
    webgpuAdapterInfo,
    webgpuAdapterFeatures,
    webgpuSoftwareAdapter,
    inputSampleRate,
    inputTrackSettings,
  } = state;
  return {
    page: location.href,
    hostMetadata: { ...hostMetadata },
    secureContext: window.isSecureContext,
    userAgent,
    platform: platform || "",
    language: language || "",
    hardwareConcurrency: hardwareConcurrency ?? null,
    deviceMemoryGb: deviceMemory ?? null,
    webgpuAvailable,
    webgpuAdapter: webgpuAdapterInfo,
    webgpuFeatures: webgpuAdapterFeatures,
    webgpuSoftwareAdapter,
    inputSampleRate,
    inputTrackSettings,
  };
}
/**
 * Begins a new benchmark row and arms its timeout.
 *
 * Any previous benchmark timeout is cleared first. The new row becomes
 * `state.activeBenchmark`; if it is still active when the timer fires it is
 * failed with a timeout message.
 *
 * @param {string} kind - Benchmark kind (e.g. "mic", "loopback").
 * @param {string} prompt - Prompt text for the run.
 * @param {Object} [options]
 * @param {string} [options.referenceText=""] - Expected transcript; loopback runs default to the prompt.
 * @param {number} [options.timeoutMs=120000] - Milliseconds before the run is failed.
 */
function startBenchmark(kind, prompt, { referenceText = "", timeoutMs = 120000 } = {}) {
  clearBenchmarkTimeout();
  const isMic = kind === "mic";
  if (isMic) resetMicInputStats();

  const inMicSeries = isMic && state.micSeries.active;
  const fallbackReference = kind === "loopback" ? prompt : "";

  state.activeBenchmark = {
    id: state.benchmarkResults.length + 1,
    kind,
    startedAt: new Date().toISOString(),
    stack: currentBenchmarkStack(),
    prompt,
    referenceText: referenceText || fallbackReference,
    micSeriesRun: inMicSeries ? state.micSeries.completed + 1 : null,
    micSeriesTarget: inMicSeries ? state.micSeries.target : null,
    transcript: "",
    output: "",
    asrMs: null,
    sttWer: null,
    sttCer: null,
    vadCloseDelayMs: null,
    llmStartMs: null,
    llmPromptBuildMs: null,
    llmPromptTokens: null,
    firstTokenMs: null,
    firstTtsQueuedMs: null,
    firstTtsSynthesisMs: null,
    firstTtsRoundTripMs: null,
    firstTtsPlaybackDelayMs: null,
    firstTtsChars: null,
    firstTtsText: "",
    firstTtsBoundaryKind: "",
    firstTtsWordBoundarySafe: null,
    ttsChunkCount: 0,
    audioEndMs: null,
    speechEndToAudioEndMs: null,
    loopbackTtsSynthesisMs: null,
    inputSampleRate: state.inputSampleRate,
    inputTrackSettings: state.inputTrackSettings,
    micInputChunks: null,
    micInputPeak: null,
    micInputRms: null,
    loopbackSampleRate: null,
    bargeInMs: null,
    bargeInPass: null,
    firstAudioMs: null,
    speechEndToFirstAudioMs: null,
    workerNetworkRequestCount: 0,
    workerNetworkServerInferenceSuspects: 0,
    workerNetworkRequests: [],
    decodeRate: "-",
    llmCompleteMs: null,
    llmQualityPass: null,
    llmQualityScore: null,
    llmQualityTotal: APP_IDENTITY_QUALITY_RULES.length,
    llmQualityHits: [],
    llmQualityMissing: [],
    error: "",
    llmDoneAt: null,
  };

  const onTimeout = () => {
    failActiveBenchmark(`${kind} benchmark timed out after ${(timeoutMs / 1000).toFixed(0)} seconds.`);
  };
  state.benchmarkTimeout = window.setTimeout(onTimeout, timeoutMs);
  updateMicValidationStatus();
}
/**
 * Discards the active benchmark without recording a result row.
 *
 * @param {string} [message] - Optional text to log after cancelling.
 */
function cancelActiveBenchmark(message) {
  if (state.activeBenchmark) {
    clearBenchmarkTimeout();
    state.activeBenchmark = null;
    updateMicValidationStatus();
    if (message) {
      logEvent(message);
    }
  }
}
/**
 * Cancels the pending benchmark timeout, if one is armed, and forgets its id.
 */
function clearBenchmarkTimeout() {
  const pending = state.benchmarkTimeout;
  if (pending) {
    window.clearTimeout(pending);
    state.benchmarkTimeout = null;
  }
}
/**
 * Refreshes the mic-series button label (idle text or "current/target"
 * progress) and then the mic validation status.
 */
function updateMicSeriesButton() {
  let label = "Run 3 real-mic series";
  if (state.micSeries.active) {
    const { completed, target } = state.micSeries;
    // Never show a run number beyond the target.
    const currentRun = Math.min(completed + 1, target);
    label = `Real mic ${currentRun}/${target}`;
  }
  elements.micSeriesButton.textContent = label;
  updateMicValidationStatus();
}
/**
 * Stops the real-mic series: cancels the pending next-run timer, resets the
 * series counters and refreshes the related UI.
 *
 * @param {string} [message=""] - Logged only if the series was active or had a timer pending.
 */
function cancelMicSeries(message = "") {
  const series = state.micSeries;
  const pendingTimer = series.timer;
  const wasActive = series.active || Boolean(pendingTimer);
  if (pendingTimer) {
    window.clearTimeout(pendingTimer);
    series.timer = 0;
  }
  series.active = false;
  series.completed = 0;
  updateMicSeriesButton();
  const canEnableControls = state.modelsLoaded && !state.suiteRunning;
  if (canEnableControls) {
    setBenchmarkControlsDisabled(false);
  }
  updateMicValidationStatus();
  if (message && wasActive) {
    logEvent(message);
  }
}
/**
 * Ends the active benchmark as failed and records it as a result row.
 *
 * The row keeps whatever metrics were captured so far; missing first-audio
 * metrics are filled from the on-screen values, and mic runs also get the
 * accumulated input statistics. No-op when no benchmark is active.
 *
 * @param {string} message - Failure reason; stored as `error` and logged.
 */
function failActiveBenchmark(message) {
  if (!state.activeBenchmark) return;
  clearBenchmarkTimeout();
  clearLoopbackFeed();

  const benchmark = state.activeBenchmark;
  benchmark.error = message;
  benchmark.output ||= message;
  benchmark.decodeRate = elements.decodeRate.textContent;
  benchmark.firstAudioMs ??= parseMetricMs(elements.firstAudioLatency.textContent);
  benchmark.speechEndToFirstAudioMs ??= parseMetricMs(elements.speechToAudioLatency.textContent);

  if (benchmark.kind === "mic") {
    const stats = state.micInputStats;
    benchmark.micInputChunks = stats.chunks;
    benchmark.micInputPeak = stats.peak;
    // RMS is only defined once at least one sample was seen.
    benchmark.micInputRms = stats.samples > 0 ? Math.sqrt(stats.sumSquares / stats.samples) : null;
  }

  const result = { ...state.activeBenchmark };
  state.benchmarkResults.unshift(result);
  persistBenchmarkResults();
  state.activeBenchmark = null;
  renderBenchmarkResults();
  logEvent(message);
  handleBenchmarkRowAdded(result);
  updateMicValidationStatus();
}
/**
 * Ends the active benchmark successfully: stores a copy as the newest result
 * row, persists and re-renders the results, then notifies the mic-series
 * logic. No-op when no benchmark is active.
 */
function completeActiveBenchmark() {
  const finished = state.activeBenchmark;
  if (!finished) return;
  clearBenchmarkTimeout();
  const result = Object.assign({}, state.activeBenchmark);
  state.benchmarkResults.unshift(result);
  persistBenchmarkResults();
  state.activeBenchmark = null;
  renderBenchmarkResults();
  handleBenchmarkRowAdded(result);
  updateMicValidationStatus();
}
/**
 * Advances the real-mic series after a result row is recorded.
 *
 * Rows that are not mic rows, or that arrive while no series is active, are
 * ignored. A failed row stops the series; reaching the target completes it;
 * otherwise the next run is scheduled after 900 ms.
 *
 * @param {Object} result - The row that was just added.
 */
function handleBenchmarkRowAdded(result) {
  const isSeriesRow = result.kind === "mic" && state.micSeries.active;
  if (!isSeriesRow) return;

  if (result.error) {
    cancelMicSeries("Real-mic series stopped after a failed run.");
    return;
  }

  state.micSeries.completed += 1;
  updateMicSeriesButton();
  if (state.micSeries.completed >= state.micSeries.target) {
    cancelMicSeries("Real-mic series complete.");
    return;
  }

  const nextRun = state.micSeries.completed + 1;
  const prompt = APP_IDENTITY_PROMPT;
  elements.partialTranscript.textContent = `Say: "${prompt}" (${nextRun}/${state.micSeries.target})`;
  logEvent(`Ready for real-mic run ${nextRun}/${state.micSeries.target}.`);

  const onStarted = (started) => {
    if (!started) cancelMicSeries("Real-mic series stopped before the next run.");
  };
  const onFailed = (error) => {
    cancelMicSeries(`Real-mic series stopped: ${error.message}`);
  };
  const launchNextRun = () => {
    state.micSeries.timer = 0;
    // The series may have been cancelled while the timer was pending.
    if (!state.micSeries.active) return;
    runMicBenchmark({ series: true }).then(onStarted).catch(onFailed);
  };
  state.micSeries.timer = window.setTimeout(launchNextRun, 900);
}
/**
 * Completes the active benchmark once the LLM is done and audio has settled.
 *
 * Re-polls itself while playback schedules are pending (every 100 ms), or
 * while no first audio has been seen and either the TTS tile still shows
 * activity or fewer than 8 s have passed since the LLM finished (every 1 s).
 *
 * The measured `firstAudioMs` is kept when present; the on-screen value is
 * only a fallback, matching failActiveBenchmark(), so the recorded latency is
 * not replaced by a value re-parsed from display text.
 */
function finalizeBenchmarkIfIdle() {
  if (!state.activeBenchmark) return;
  if (!Number.isFinite(state.activeBenchmark.llmDoneAt)) return;
  const pendingAudio = state.activeBenchmark.firstAudioMs == null && elements.firstAudioLatency.textContent === "-";
  const ttsBusy = elements.states.tts.textContent === "Synthesizing" || elements.states.tts.textContent === "Speaking";
  if (
    state.pendingPlaybackSchedules > 0 ||
    (pendingAudio && (ttsBusy || performance.now() - state.activeBenchmark.llmDoneAt < 8000))
  ) {
    window.setTimeout(() => finalizeBenchmarkIfIdle(), state.pendingPlaybackSchedules > 0 ? 100 : 1000);
    return;
  }
  clearBenchmarkTimeout();
  state.activeBenchmark.firstAudioMs ??= parseMetricMs(elements.firstAudioLatency.textContent);
  state.activeBenchmark.decodeRate = elements.decodeRate.textContent;
  completeActiveBenchmark();
}
/**
 * Parses a displayed latency such as "120 ms" or "1.5 s" back to milliseconds.
 *
 * @param {string} text - Displayed metric text.
 * @returns {number|null} Milliseconds, or null for empty text, "-" or an unknown unit.
 */
function parseMetricMs(text) {
  if (!text || text === "-") return null;
  const units = [
    [" ms", 1],
    [" s", 1000],
  ];
  for (const [suffix, scale] of units) {
    if (text.endsWith(suffix)) {
      return Number(text.replace(suffix, "")) * scale;
    }
  }
  return null;
}
/**
 * Rebuilds the results table body from `state.benchmarkResults`, updates the
 * enabled state of the copy/download/clear buttons and refreshes the summary.
 * An empty result list renders a single placeholder row.
 */
function renderBenchmarkResults() {
  const tableBody = elements.resultsBody;
  tableBody.replaceChildren();

  const nothingToExport = state.benchmarkResults.length === 0;
  elements.copyResultsButton.disabled = nothingToExport;
  if (elements.downloadResultsButton) elements.downloadResultsButton.disabled = nothingToExport;
  elements.clearResultsButton.disabled = nothingToExport;
  renderBenchmarkSummary();

  const appendCell = (row, text) => {
    const cell = document.createElement("td");
    cell.textContent = text;
    row.append(cell);
    return cell;
  };

  if (state.benchmarkResults.length === 0) {
    const placeholderRow = document.createElement("tr");
    const placeholderCell = document.createElement("td");
    // Span all 16 result columns.
    placeholderCell.colSpan = 16;
    placeholderCell.textContent = "No benchmark runs yet.";
    placeholderRow.append(placeholderCell);
    tableBody.append(placeholderRow);
    return;
  }

  for (const result of state.benchmarkResults) {
    const row = document.createElement("tr");
    const stack = result.stack ?? {};
    let runLabel = result.kind;
    if (result.micSeriesTarget) {
      runLabel = `${result.kind} ${result.micSeriesRun}/${result.micSeriesTarget}`;
    }
    const columns = [
      `${result.id}. ${runLabel}`,
      `${shortModel(stack.llm)} / ${stack.device?.toUpperCase() ?? "-"} / ${stack.voice ?? "-"} / ${stack.ttsSteps ?? "-"} steps`,
      formatMs(result.asrMs),
      formatPercent(result.sttWer),
      formatMs(result.vadCloseDelayMs),
      formatPromptMetrics(result),
      formatMs(result.firstTokenMs),
      formatMs(result.firstTtsQueuedMs),
      formatMs(result.firstTtsSynthesisMs),
      formatMs(result.firstAudioMs),
      formatMs(result.speechEndToFirstAudioMs),
      formatMs(result.audioEndMs),
      result.decodeRate ?? "-",
      formatQuality(result),
      result.transcript || result.prompt || "-",
      result.error || result.output || "-",
    ];
    columns.forEach((text) => {
      appendCell(row, text);
    });
    tableBody.append(row);
  }
}
/**
 * Builds the key that ties saved benchmark rows to a page and build:
 * origin + path + "#" + the host commit (or "local" when none is known).
 *
 * @returns {string}
 */
function benchmarkStorageScope() {
  const { origin, pathname } = location;
  const commit = state.hostMetadata.hfSpaceCommit || "local";
  return [origin, pathname, "#", commit].join("");
}
/**
 * Saves the newest benchmark rows (up to BENCHMARK_STORAGE_MAX_ROWS) to
 * localStorage, or removes the saved entry when there are no rows.
 * Storage errors are ignored.
 */
function persistBenchmarkResults() {
  const rows = state.benchmarkResults;
  if (rows.length === 0) {
    clearPersistedBenchmarkResults();
    return;
  }
  try {
    const serialized = JSON.stringify({
      version: BENCHMARK_STORAGE_VERSION,
      scope: benchmarkStorageScope(),
      savedAt: new Date().toISOString(),
      hostMetadata: { ...state.hostMetadata },
      results: rows.slice(0, BENCHMARK_STORAGE_MAX_ROWS),
    });
    localStorage.setItem(BENCHMARK_STORAGE_KEY, serialized);
  } catch {
    // Storage can be disabled or full; benchmark export still works in memory.
  }
}
/**
 * Loads saved benchmark rows from localStorage into `state.benchmarkResults`.
 *
 * Nothing is restored when the saved data is unreadable, has a different
 * storage version, belongs to a different scope (page/build), or holds no
 * usable rows.
 */
function restoreBenchmarkResults() {
  let saved;
  try {
    saved = JSON.parse(localStorage.getItem(BENCHMARK_STORAGE_KEY) || "null");
  } catch {
    return;
  }
  if (!saved || saved.version !== BENCHMARK_STORAGE_VERSION) return;
  if (saved.scope !== benchmarkStorageScope()) {
    logEvent("Saved benchmark rows belong to a different build and were not restored.");
    return;
  }
  if (!Array.isArray(saved.results)) return;

  const isRowObject = (result) => result && typeof result === "object";
  const rows = saved.results.filter(isRowObject).slice(0, BENCHMARK_STORAGE_MAX_ROWS);
  if (rows.length === 0) return;

  state.benchmarkResults = rows;
  renderBenchmarkResults();
  const plural = rows.length === 1 ? "" : "s";
  logEvent(`Restored ${rows.length} saved benchmark row${plural}.`);
}
/**
 * Removes the saved benchmark rows from localStorage; storage errors are ignored.
 */
function clearPersistedBenchmarkResults() {
  const storageKey = BENCHMARK_STORAGE_KEY;
  try {
    localStorage.removeItem(storageKey);
  } catch {
    // Ignore storage failures; in-memory rows are already cleared.
  }
}
/**
 * Renders the benchmark summary strip (label/value pairs) and refreshes the
 * mic validation status. Shows a placeholder text when there are no runs.
 */
function renderBenchmarkSummary() {
  const summary = displayBenchmarkSummary();
  const evidence = evidenceSummaryForRows(state.benchmarkResults);
  const container = elements.benchmarkSummary;

  if (summary.totalRuns === 0 && summary.allRuns === 0) {
    container.textContent = "No benchmark runs yet.";
    updateMicValidationStatus();
    return;
  }

  const micWerSuffix = summary.micMedianWer == null ? "" : `, WER ${formatPercent(summary.micMedianWer)}`;
  const items = [
    ["Runs", `${summary.totalRuns} ${summary.scope}`],
    ["Real mic", `${summary.micRuns}/${REAL_MIC_TARGET_RUNS}${micWerSuffix}`],
    ["Mic end -> audio", formatMs(summary.micMedianSpeechEndToFirstAudioMs)],
    ["Loopback WER", summary.loopbackMedianWer == null ? "-" : formatPercent(summary.loopbackMedianWer)],
    ["Loopback end -> audio", formatMs(summary.loopbackMedianSpeechEndToFirstAudioMs)],
    ["Hardware GPU", evidence.hardwareWebgpu.passed ? "pass" : `${evidence.hardwareWebgpu.hardwareRows} rows`],
    ["Hosted build", evidence.hosted.passed ? shortCommit(evidence.hosted.hfSpaceCommit) : "local"],
    ["Barge-in", `${summary.bargeInPasses}/${summary.bargeInRuns} pass`],
  ];

  const buildItem = ([label, value]) => {
    const item = document.createElement("div");
    item.className = "summary-item";
    const labelEl = document.createElement("span");
    labelEl.textContent = label;
    const valueEl = document.createElement("strong");
    valueEl.textContent = value;
    item.append(labelEl, valueEl);
    return item;
  };
  const nodes = items.map((entry) => buildItem(entry));
  container.replaceChildren(...nodes);
  updateMicValidationStatus();
}
/**
 * Picks the summary to display: all rows when no stack is loaded, otherwise
 * only rows recorded for the current stack plus the overall run count.
 *
 * @returns {Object} Summary from benchmarkSummaryForRows; includes `allRuns`
 *   when scoped to the current stack.
 */
function displayBenchmarkSummary() {
  const overall = benchmarkSummaryForRows(state.benchmarkResults, { scope: "all" });
  if (!state.loadedStack) return overall;

  const activeKey = stackKey(currentBenchmarkStack());
  const matchingRows = state.benchmarkResults.filter((row) => stackKey(row.stack) === activeKey);
  const scoped = benchmarkSummaryForRows(matchingRows, {
    scope: "current",
    stackKey: activeKey,
    stackLabel: stackLabel(currentBenchmarkStack()),
  });
  return { ...scoped, allRuns: overall.totalRuns };
}
/**
 * Builds the summary section of an export: totals for all rows, for the
 * current stack (null when no stack is loaded) and per stack.
 *
 * @returns {{all: Object, current: Object|null, byStack: *}}
 */
function exportBenchmarkSummary() {
  let activeKey = null;
  if (state.loadedStack) {
    activeKey = stackKey(currentBenchmarkStack());
  }
  let currentRows = [];
  if (activeKey) {
    currentRows = state.benchmarkResults.filter((row) => stackKey(row.stack) === activeKey);
  }

  const all = benchmarkSummaryForRows(state.benchmarkResults, { scope: "all" });
  let current = null;
  if (activeKey) {
    current = benchmarkSummaryForRows(currentRows, {
      scope: "current",
      stackKey: activeKey,
      stackLabel: stackLabel(currentBenchmarkStack()),
    });
  }
  const byStack = benchmarkSummariesByStack();
  return { all, current, byStack };
}
/**
 * Assembles the full object that gets serialized for a benchmark JSON export.
 *
 * `hostMetadata` and `results` are shallow copies of the corresponding state
 * fields, so later mutation of the state containers does not alter the payload.
 *
 * @returns {object} Export payload (schema version, id, timestamp, runtime,
 *   network, evidence, guide, summary and result rows).
 */
function benchmarkExportPayload() {
  const generatedAt = new Date().toISOString();
  const evidence = evidenceSummaryForRows(state.benchmarkResults);

  const exportId = createExportId();
  const hostMetadata = Object.assign({}, state.hostMetadata);
  const runtime = runtimeEnvironment();
  const network = workerNetworkSummary();
  const evidenceGuide = browserEvidenceGuide(evidence);
  const summary = exportBenchmarkSummary();
  const results = state.benchmarkResults.slice();

  return {
    schemaVersion: BENCHMARK_EXPORT_SCHEMA_VERSION,
    exportId,
    generatedAt,
    hostMetadata,
    runtime,
    network,
    evidence,
    evidenceGuide,
    summary,
    results,
  };
}
/**
 * Creates an identifier for one export.
 *
 * Uses `crypto.randomUUID()` when the runtime provides it and otherwise falls
 * back to the current time in base 36.
 *
 * @returns {string} Id prefixed with "browser-speak-".
 */
function createExportId() {
  const uuid = globalThis.crypto?.randomUUID?.();
  const suffix = uuid || Date.now().toString(36);
  return `browser-speak-${suffix}`;
}
/**
 * Builds the human-readable guide embedded in an export: how to audit the
 * file, what each evidence category requires, where the current evidence
 * stands, what to do next, and the known limitations of browser-reported data.
 *
 * @param {object} evidence - Evidence summary with `realMic`, `hardwareWebgpu`
 *   and `hosted` sections (as produced by evidenceSummaryForRows).
 * @returns {object} Guide object for the export payload.
 */
function browserEvidenceGuide(evidence) {
  const hostedSpaceMetadata = {
    hostSuffix: ".hf.space",
    commitHeader: "x-repo-commit",
  };

  const realMic = {
    prompt: APP_IDENTITY_PROMPT,
    requiredRows: REAL_MIC_TARGET_RUNS,
    maxMedianWer: REAL_MIC_MAX_WER,
    requiredSignals: [
      "completed mic rows",
      "mic input stats",
      "identity answer pass",
      "finite speech-end-to-first-audio latency",
    ],
  };

  const hardwareWebgpu = {
    requiredRows: 1,
    requiredSignals: [
      "row stack device is webgpu",
      "browser reports WebGPU available",
      "adapter is not classified as software",
    ],
  };

  // Snapshot of where the supplied evidence currently stands.
  const current = {
    realMicPassed: evidence.realMic.passed,
    realMicRows: evidence.realMic.rows,
    hardwareWebgpuPassed: evidence.hardwareWebgpu.passed,
    hardwareWebgpuRows: evidence.hardwareWebgpu.hardwareRows,
    hostedPassed: evidence.hosted.passed,
  };

  const limitations = [
    "Downloaded browser JSON proves page-reported rows and adapter metadata.",
    "Worker fetch telemetry is bounded and page-reported; hosted CDP smoke is the authoritative whole-page network check.",
    "Pair with hosted smoke for no-server network evidence.",
    "Use the real-mic harness for Chrome launch provenance when fake-capture flags must be ruled out.",
  ];

  return {
    auditCommand: "node tools/audit-browser-evidence.mjs ",
    pairedNetworkCheck: "node tools/run-hosted-smoke.mjs",
    requirements: { hostedSpaceMetadata, realMic, hardwareWebgpu },
    current,
    nextActions: browserEvidenceNextActions(evidence),
    limitations,
  };
}
/**
 * Summarizes the worker network events recorded in `state.workerNetworkEvents`.
 *
 * @returns {{capturedEvents: number, benchmarkRequests: number,
 *   benchmarkServerInferenceSuspects: number, recent: object[]}}
 *   Counts over all captured events plus the trailing
 *   MAX_WORKER_NETWORK_EVENTS entries.
 */
function workerNetworkSummary() {
  const captured = [...state.workerNetworkEvents];

  let benchmarkRequests = 0;
  let suspects = 0;
  for (const event of captured) {
    if (event.phase !== "benchmark") continue;
    benchmarkRequests += 1;
    if (event.serverInferenceSuspect) suspects += 1;
  }

  return {
    capturedEvents: captured.length,
    benchmarkRequests,
    benchmarkServerInferenceSuspects: suspects,
    recent: captured.slice(-MAX_WORKER_NETWORK_EVENTS),
  };
}
/**
 * Lists the follow-up steps still needed to complete the evidence set.
 *
 * One action is emitted per evidence category that has not passed, in the
 * order hosted, real mic, hardware WebGPU. When every category has passed the
 * single action is to audit the JSON.
 *
 * @param {object} evidence - Evidence summary with `hosted`, `realMic` and
 *   `hardwareWebgpu` sections, each exposing `passed`.
 * @returns {string[]} Non-empty list of action sentences.
 */
function browserEvidenceNextActions(evidence) {
  const checklist = [
    [evidence.hosted.passed, "Run and download evidence from the hosted static HF Space."],
    [
      evidence.realMic.passed,
      `Run evidence capture or Run 3 real-mic series, saying "${APP_IDENTITY_PROMPT}" each time.`,
    ],
    [
      evidence.hardwareWebgpu.passed,
      "Run evidence capture in a browser exposing a hardware WebGPU adapter.",
    ],
  ];

  const actions = checklist.filter(([passed]) => !passed).map(([, action]) => action);
  if (actions.length === 0) {
    actions.push("Audit this JSON with tools/audit-browser-evidence.mjs.");
  }
  return actions;
}
/**
 * Serializes the export payload and triggers a browser download of it.
 *
 * The JSON is pretty-printed with a trailing newline, exposed through an
 * object URL, and downloaded by clicking a temporary anchor element. The
 * object URL is revoked one second later.
 *
 * @param {object} [options]
 * @param {string} [options.prefix] - File name prefix placed before the timestamp.
 * @param {string} [options.message] - Text written to the event log afterwards.
 */
function downloadBenchmarkJson({
  prefix = "browser-speak-benchmarks",
  message = "Benchmark JSON download started.",
} = {}) {
  const json = JSON.stringify(benchmarkExportPayload(), null, 2);
  const objectUrl = URL.createObjectURL(new Blob([`${json}\n`], { type: "application/json" }));

  // ":" and "." from the ISO timestamp are replaced so the name is file-system friendly.
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");

  const link = document.createElement("a");
  link.href = objectUrl;
  link.download = `${prefix}-${stamp}.json`;
  document.body.append(link);
  link.click();
  link.remove();

  window.setTimeout(() => URL.revokeObjectURL(objectUrl), 1000);
  logEvent(message);
}
/**
 * Groups all recorded benchmark rows by stack key and summarizes each group.
 *
 * Groups appear in the order their key is first seen; each group's label is
 * derived from the stack of its first row.
 *
 * @returns {object[]} One benchmarkSummaryForRows result per distinct stack key.
 */
function benchmarkSummariesByStack() {
  const rowsByKey = new Map();
  for (const row of state.benchmarkResults) {
    const key = stackKey(row.stack);
    const bucket = rowsByKey.get(key);
    if (bucket) {
      bucket.push(row);
    } else {
      rowsByKey.set(key, [row]);
    }
  }

  const summaries = [];
  for (const [key, rows] of rowsByKey) {
    summaries.push(
      benchmarkSummaryForRows(rows, {
        scope: "stack",
        stackKey: key,
        stackLabel: stackLabel(rows[0]?.stack),
      }),
    );
  }
  return summaries;
}
/**
 * Aggregates a list of benchmark rows into counts and medians per benchmark kind.
 *
 * Rows with a truthy `error` are excluded from every per-kind statistic except
 * barge-in, which is counted over all rows (failed barge-in rows carry an error).
 *
 * @param {object[]} results - Benchmark rows to summarize.
 * @param {object} [options]
 * @param {string} [options.scope] - Label describing what the rows represent.
 * @param {string} [options.stackKey] - Stack key the rows belong to, if any.
 * @param {string} [options.stackLabel] - Display label for that stack, if any.
 * @returns {object} Flat summary of run counts, pass counts and medians.
 */
function benchmarkSummaryForRows(results, { scope, stackKey: key = "", stackLabel: label = "" } = {}) {
  const completed = results.filter((row) => !row.error);
  const completedOfKind = (kind) => completed.filter((row) => row.kind === kind);
  const medianOf = (rows, field) => median(rows.map((row) => row[field]));
  const countWhere = (rows, flag) => rows.filter((row) => row[flag]).length;

  const tts = completedOfKind("tts");
  const identity = completedOfKind("identity");
  const chat = completedOfKind("chat");
  const mic = completedOfKind("mic");
  const loopback = completedOfKind("loopback");
  // Barge-in deliberately looks at every row, including errored ones.
  const bargeIn = results.filter((row) => row.kind === "barge-in");

  return {
    scope,
    stackKey: key,
    stackLabel: label,
    totalRuns: results.length,
    completedRuns: completed.length,
    ttsRuns: tts.length,
    ttsMedianFirstAudioMs: medianOf(tts, "firstAudioMs"),
    ttsMedianSynthesisMs: medianOf(tts, "firstTtsSynthesisMs"),
    ttsMedianAudioEndMs: medianOf(tts, "audioEndMs"),
    identityRuns: identity.length,
    identityMedianFirstTokenMs: medianOf(identity, "firstTokenMs"),
    identityMedianFirstAudioMs: medianOf(identity, "firstAudioMs"),
    identityPasses: countWhere(identity, "llmQualityPass"),
    chatRuns: chat.length,
    chatMedianFirstTokenMs: medianOf(chat, "firstTokenMs"),
    chatMedianFirstAudioMs: medianOf(chat, "firstAudioMs"),
    micRuns: mic.length,
    micTargetRuns: REAL_MIC_TARGET_RUNS,
    micMedianWer: medianOf(mic, "sttWer"),
    micMedianSpeechEndToFirstAudioMs: medianOf(mic, "speechEndToFirstAudioMs"),
    micMedianSpeechEndToAudioEndMs: medianOf(mic, "speechEndToAudioEndMs"),
    loopbackRuns: loopback.length,
    loopbackMedianWer: medianOf(loopback, "sttWer"),
    loopbackMedianSpeechEndToFirstAudioMs: medianOf(loopback, "speechEndToFirstAudioMs"),
    loopbackMedianSpeechEndToAudioEndMs: medianOf(loopback, "speechEndToAudioEndMs"),
    bargeInRuns: bargeIn.length,
    bargeInPasses: countWhere(bargeIn, "bargeInPass"),
  };
}
/**
 * Evaluates the three evidence categories (real mic, hardware WebGPU, hosted
 * build) against a set of benchmark rows and the current host/adapter state.
 *
 * Only rows without a truthy `error` are considered.
 *
 * @param {object[]} [results] - Benchmark rows to evaluate.
 * @returns {{generatedAt: string, realMic: object, hardwareWebgpu: object, hosted: object}}
 *   Per-category counts, medians and a boolean `passed` flag.
 */
function evidenceSummaryForRows(results = []) {
  const completed = results.filter((row) => !row.error);

  // Real-mic evidence.
  const micRows = completed.filter((row) => row.kind === "mic");
  const micRowsWithInput = micRows.filter(hasRealMicInputStats);
  const micRowsWithTrackSettings = micRows.filter(hasInputTrackSettings);
  const micIdentityPasses = micRows.filter((row) => row.llmQualityPass === true);
  const micMedianWer = median(micRows.map((row) => row.sttWer));
  const micMedianSpeechEndToFirstAudioMs = median(
    micRows.map((row) => row.speechEndToFirstAudioMs),
  );

  // Hardware WebGPU evidence.
  const webgpuRows = completed.filter((row) => row.stack?.device === "webgpu");
  const hardwareRows = webgpuRows.filter(rowUsesHardwareWebGpu);

  // Hosted-build evidence.
  const host = state.hostMetadata.host ?? "";
  const hfSpaceCommit = state.hostMetadata.hfSpaceCommit ?? "";

  const generatedAt = new Date().toISOString();

  const enoughMicRows = micRows.length >= REAL_MIC_TARGET_RUNS;
  const enoughMicInput = micRowsWithInput.length >= REAL_MIC_TARGET_RUNS;
  const enoughIdentityPasses = micIdentityPasses.length >= REAL_MIC_TARGET_RUNS;
  const werWithinLimit = Number.isFinite(micMedianWer) && micMedianWer <= REAL_MIC_MAX_WER;
  const latencyMeasured = Number.isFinite(micMedianSpeechEndToFirstAudioMs);

  const realMic = {
    requiredRows: REAL_MIC_TARGET_RUNS,
    maxMedianWer: REAL_MIC_MAX_WER,
    rows: micRows.length,
    rowsWithInput: micRowsWithInput.length,
    rowsWithTrackSettings: micRowsWithTrackSettings.length,
    identityPasses: micIdentityPasses.length,
    medianWer: micMedianWer,
    medianSpeechEndToFirstAudioMs: micMedianSpeechEndToFirstAudioMs,
    passed:
      enoughMicRows && enoughMicInput && enoughIdentityPasses && werWithinLimit && latencyMeasured,
  };

  const hardwareWebgpu = {
    rows: webgpuRows.length,
    hardwareRows: hardwareRows.length,
    passed: hardwareRows.length > 0,
    adapters: uniqueRowAdapters(webgpuRows),
    currentAdapter: state.webgpuAdapterInfo,
    currentSoftwareAdapter: state.webgpuSoftwareAdapter,
  };

  const hosted = {
    host,
    hfSpaceCommit,
    etag: state.hostMetadata.etag ?? "",
    // Hosted evidence needs an .hf.space host and a commit of at least 7 characters.
    passed: host.endsWith(".hf.space") && hfSpaceCommit.length >= 7,
  };

  return { generatedAt, realMic, hardwareWebgpu, hosted };
}
/**
 * Checks that a benchmark row carries microphone input statistics: a positive
 * finite chunk count, a positive finite peak level, and a finite input sample
 * rate (taken from the row or, failing that, from its stack environment).
 *
 * @param {object} row - Benchmark row.
 * @returns {boolean} True when all three signals are present.
 */
function hasRealMicInputStats(row) {
  const { micInputChunks: chunks, micInputPeak: peak } = row;
  if (!Number.isFinite(chunks) || chunks <= 0) return false;
  if (!Number.isFinite(peak) || peak <= 0) return false;

  const sampleRate = row.inputSampleRate ?? row.stack?.environment?.inputSampleRate;
  return Number.isFinite(sampleRate);
}
/**
 * Checks whether a benchmark row recorded non-empty input track settings,
 * either directly on the row or under its stack environment.
 *
 * @param {object} row - Benchmark row.
 * @returns {boolean} True when a settings object with at least one key exists.
 */
function hasInputTrackSettings(row) {
  const settings = row.inputTrackSettings ?? row.stack?.environment?.inputTrackSettings;
  if (!settings) return false;
  return Object.keys(settings).length > 0;
}
/**
 * Decides whether a benchmark row was produced on a hardware WebGPU adapter.
 *
 * The row must target the "webgpu" device, its environment must report WebGPU
 * as available, must not flag a software adapter, and its adapter must not be
 * classified as software by isSoftwareWebGpuAdapter.
 *
 * @param {object} row - Benchmark row.
 * @returns {boolean} True when all conditions hold.
 */
function rowUsesHardwareWebGpu(row) {
  if (row.stack?.device !== "webgpu") return false;

  const env = row.stack?.environment ?? {};
  if (env.webgpuAvailable !== true) return false;
  if (env.webgpuSoftwareAdapter === true) return false;

  return !isSoftwareWebGpuAdapter(env.webgpuAdapter);
}
/**
 * Collects the distinct WebGPU adapters seen across benchmark rows.
 *
 * Adapters are de-duplicated by their formatted label; the first row seen for
 * a label supplies the entry, and first-seen order is kept.
 *
 * @param {object[]} rows - Benchmark rows.
 * @returns {{label: *, softwareAdapter: *, adapter: *}[]} One entry per label.
 */
function uniqueRowAdapters(rows) {
  const byLabel = new Map();
  for (const row of rows) {
    const env = row.stack?.environment ?? {};
    const adapter = env.webgpuAdapter ?? null;
    const label = formatGpuAdapter(adapter);
    if (!byLabel.has(label)) {
      byLabel.set(label, {
        label,
        softwareAdapter: env.webgpuSoftwareAdapter ?? null,
        adapter,
      });
    }
  }
  return [...byLabel.values()];
}
/**
 * Abbreviates a commit hash for display.
 *
 * @param {string} [commit] - Full commit identifier.
 * @returns {string} First eight characters, or "-" when the value is empty.
 */
function shortCommit(commit = "") {
  if (!commit) {
    return "-";
  }
  return commit.slice(0, 8);
}
/**
 * Builds a stable string key identifying a benchmark stack configuration.
 *
 * The key joins device, LLM, ASR, voice, TTS steps, VAD silence, the
 * partial/final ASR mode and the TTS chunking key with "|". Missing fields
 * contribute an empty segment.
 *
 * A `null` stack is treated like a missing one: the default parameter alone
 * only covers `undefined`, and rows elsewhere are read with `row.stack?.…`,
 * so a row without a usable stack must not throw here.
 *
 * @param {object|null} [stack] - Stack description taken from a row or the UI.
 * @returns {string} Pipe-delimited key.
 */
function stackKey(stack = {}) {
  const fields = stack ?? {};
  return [
    fields.device ?? "",
    fields.llm ?? "",
    fields.asr ?? "",
    fields.voice ?? "",
    fields.ttsSteps ?? "",
    fields.vadSilenceMs ?? "",
    fields.partialAsr ? "partial" : "final",
    ttsChunkingKey(fields.ttsChunking),
  ].join("|");
}
/**
 * Builds the human-readable label for a benchmark stack, e.g.
 * "<llm> / <DEVICE> / <asr> / <voice> / <n> steps", followed by a chunking
 * label when ttsChunkingLabel returns a non-empty string.
 *
 * A `null` stack, or `null` model ids inside it, are treated as missing. The
 * default parameters of this function and of shortModel only cover
 * `undefined`, so nulls would otherwise throw while rendering a label.
 *
 * @param {object|null} [stack] - Stack description taken from a row or the UI.
 * @returns {string} Label with parts joined by " / ".
 */
function stackLabel(stack = {}) {
  const fields = stack ?? {};
  const parts = [
    // `?? undefined` lets shortModel's own default apply when the id is null.
    shortModel(fields.llm ?? undefined),
    fields.device?.toUpperCase() ?? "-",
    shortModel(fields.asr ?? undefined),
    fields.voice ?? "-",
    `${fields.ttsSteps ?? "-"} steps`,
  ];
  const chunking = ttsChunkingLabel(fields.ttsChunking);
  if (chunking) parts.push(chunking);
  return parts.join(" / ");
}
/**
 * Serializes a TTS chunking configuration into a comparable string.
 *
 * The configuration is overlaid on DEFAULT_TTS_CHUNKING and only the keys
 * defined by the defaults are emitted, in the defaults' key order.
 *
 * @param {object|null} [chunking] - Chunking overrides; null means none.
 * @returns {string} Comma-separated "key:value" pairs.
 */
function ttsChunkingKey(chunking = DEFAULT_TTS_CHUNKING) {
  const merged = Object.assign({}, DEFAULT_TTS_CHUNKING, chunking ?? {});
  const pairs = [];
  for (const name of Object.keys(DEFAULT_TTS_CHUNKING)) {
    pairs.push(`${name}:${merged[name]}`);
  }
  return pairs.join(",");
}
/**
 * Produces a short display label for a non-default TTS chunking configuration.
 *
 * @param {object|null} [chunking] - Chunking overrides; null means none.
 * @returns {string} "chunk <firstTargetChars>/<targetChars>", or "" when the
 *   configuration is equivalent to DEFAULT_TTS_CHUNKING.
 */
function ttsChunkingLabel(chunking = DEFAULT_TTS_CHUNKING) {
  const merged = Object.assign({}, DEFAULT_TTS_CHUNKING, chunking ?? {});
  const matchesDefaults = ttsChunkingKey(merged) === ttsChunkingKey(DEFAULT_TTS_CHUNKING);
  if (matchesDefaults) {
    return "";
  }
  const { firstTargetChars, targetChars } = merged;
  return `chunk ${firstTargetChars}/${targetChars}`;
}
/**
 * Computes the median of the finite numbers in a list.
 *
 * Non-numbers, NaN and infinities are ignored. For an even count the result
 * is the mean of the two middle values.
 *
 * @param {Array<*>} values - Candidate values.
 * @returns {number|null} Median, or null when no finite number is present.
 */
function median(values) {
  const sorted = values.filter((value) => Number.isFinite(value));
  sorted.sort((left, right) => left - right);

  const count = sorted.length;
  if (count === 0) return null;

  const upper = Math.floor(count / 2);
  const isOdd = count % 2 === 1;
  return isOdd ? sorted[upper] : (sorted[upper - 1] + sorted[upper]) / 2;
}
/**
 * Shortens a model id for display: keeps the text after the last "/" and
 * drops a trailing "-ONNX" suffix.
 *
 * `null` is treated like a missing id and non-string ids are stringified, so
 * callers building labels from incomplete stack data do not throw. (The
 * default parameter only covers `undefined`.)
 *
 * @param {string|null} [modelId] - Model identifier, e.g. "org/name-ONNX".
 * @returns {string} Shortened name; "" for a missing id.
 */
function shortModel(modelId = "") {
  const id = String(modelId ?? "");
  const lastSegment = id.slice(id.lastIndexOf("/") + 1);
  return lastSegment.replace(/-ONNX$/, "");
}
/**
 * Cancels every pending loopback feed timer and empties the timer registry.
 */
function clearLoopbackFeed() {
  state.loopbackFeedTimers.forEach((timer) => {
    window.clearTimeout(timer);
  });
  state.loopbackFeedTimers.clear();
}
/**
 * Schedules a loopback feed callback and tracks its timer so it can be
 * cancelled by clearLoopbackFeed.
 *
 * The timer removes itself from the registry just before running the callback.
 *
 * @param {Function} callback - Work to run when the timer fires.
 * @param {number} delayMs - Delay in milliseconds.
 */
function scheduleLoopbackChunk(callback, delayMs) {
  let handle;
  const fire = () => {
    state.loopbackFeedTimers.delete(handle);
    callback();
  };
  handle = window.setTimeout(fire, delayMs);
  state.loopbackFeedTimers.add(handle);
}
/**
 * Reacts to the start of user speech: stamps the start time, updates the
 * transcript placeholder and VAD tile, and interrupts any in-flight
 * generation or playback (barge-in).
 *
 * During a loopback benchmark the scheduled loopback feed is preserved, since
 * the "speech" being detected is that feed.
 */
function handleSpeechStart() {
  state.currentUserStartedAt = performance.now();
  elements.partialTranscript.textContent = "Speech detected.";
  setTile("vad", "Speech", "active");

  const loopbackRunning = state.activeBenchmark?.kind === "loopback";
  interruptForBargeIn({ preserveLoopbackFeed: loopbackRunning, logInterruption: true });
}
/**
 * Aborts the current assistant turn: invalidates the turn id, clears pending
 * TTS text and first-token/first-audio flags, stops playback, and tells the
 * LLM (and optionally TTS) worker to stop.
 *
 * @param {object} [options]
 * @param {boolean} [options.preserveLoopbackFeed=false] - Keep scheduled
 *   loopback feed timers instead of clearing them.
 * @param {boolean} [options.cancelTts=true] - Send a cancel message to the TTS
 *   worker and reset its tile.
 * @param {boolean} [options.logInterruption=false] - Log an event when
 *   something was actually generating or speaking.
 */
function interruptForBargeIn({ preserveLoopbackFeed = false, cancelTts = true, logInterruption = false } = {}) {
  // Busy flags are read from the status tiles before anything below resets them.
  const llmBusy = elements.states.llm.textContent === "Generating";
  const ttsTileText = elements.states.tts.textContent;
  const ttsBusy = ttsTileText === "Synthesizing" || ttsTileText === "Speaking";

  // Bumping the turn id marks the interrupted turn as no longer current.
  state.activeTurnId += 1;
  Object.assign(state, {
    ttsBuffer: "",
    awaitingFirstToken: false,
    awaitingFirstAudio: false,
    firstTtsChunkQueued: false,
    pendingPlaybackSchedules: 0,
  });

  if (!preserveLoopbackFeed) {
    clearLoopbackFeed();
  }
  state.playback?.stop();
  if (cancelTts) {
    state.ttsWorker?.postMessage({ type: "cancel" });
  }
  state.llmWorker?.postMessage({ type: "interrupt" });

  if (state.modelsLoaded) {
    if (llmBusy) setTile("llm", "Ready", "ready");
    if (cancelTts && ttsBusy) setTile("tts", "Ready", "ready");
  }

  const wasBusy = llmBusy || ttsBusy;
  if (logInterruption && wasBusy) {
    logEvent("Barge-in interrupted generation or playback.");
  }
  setAudioState("Audio idle", false);
}
/**
 * Stops everything that is running: suite and evidence-capture flags are
 * cleared, the mic series and active benchmark are cancelled, generation and
 * playback are interrupted, and the LLM/TTS tiles are reset to "Ready" (models
 * loaded) or "Idle" (not loaded).
 */
function stopAll() {
  Object.assign(state, { suiteRunning: false, evidenceCaptureRunning: false });
  cancelMicSeries();
  interruptForBargeIn();
  cancelActiveBenchmark();

  for (const tile of ["llm", "tts"]) {
    const loaded = state.modelsLoaded;
    setTile(tile, loaded ? "Ready" : "Idle", loaded ? "ready" : "idle");
  }

  if (state.modelsLoaded) {
    setBenchmarkControlsDisabled(false);
  }
  logEvent("Stopped generation and playback.");
}
/**
 * Starts a new assistant turn for the given user text.
 *
 * Advances the turn id, appends the user message to the conversation, trims
 * the history, records the transcript timestamp and posts a "generate"
 * request to the LLM worker.
 *
 * @param {string} text - Final user transcript or benchmark prompt.
 */
function generateResponse(text) {
  state.activeTurnId += 1;

  const userMessage = { role: "user", content: text };
  state.messages.push(userMessage);
  trimConversation();

  const messages = promptMessagesForTurn(text);
  state.lastTranscriptAt = performance.now();

  const request = { type: "generate", turnId: state.activeTurnId, messages };
  state.llmWorker.postMessage(request);
}
/**
 * Bounds the conversation history: the leading SYSTEM_MESSAGES.length entries
 * are always kept, followed by at most the eight most recent remaining messages.
 */
function trimConversation() {
  const pinnedCount = SYSTEM_MESSAGES.length;
  const history = state.messages;
  const pinned = history.slice(0, pinnedCount);
  const recent = history.slice(pinnedCount).slice(-8);
  state.messages = pinned.concat(recent);
}
/**
 * Chooses the message list sent to the LLM for this turn.
 *
 * Questions recognized by isAppIdentityQuestion get a dedicated prompt made of
 * the identity system prompt, the primer messages and the user text; all other
 * turns use the running conversation in `state.messages`.
 *
 * @param {string} text - User text for the turn.
 * @returns {object[]} Chat messages for the LLM worker.
 */
function promptMessagesForTurn(text) {
  if (isAppIdentityQuestion(text)) {
    const systemMessage = { role: "system", content: IDENTITY_SYSTEM_PROMPT };
    const userMessage = { role: "user", content: text };
    return [systemMessage, ...IDENTITY_PRIMER_MESSAGES, userMessage];
  }
  return state.messages;
}
/**
 * Creates a fresh conversation seeded with shallow copies of SYSTEM_MESSAGES,
 * so later edits to the conversation never touch the shared constants.
 *
 * @returns {object[]} New array of copied system messages.
 */
function initialMessages() {
  const seeded = [];
  for (const message of SYSTEM_MESSAGES) {
    seeded.push({ ...message });
  }
  return seeded;
}
/**
 * Discards the running conversation and replaces it with a freshly seeded one.
 */
function resetConversationHistory() {
  const fresh = initialMessages();
  state.messages = fresh;
}
/**
 * Appends streamed assistant text to the TTS buffer and enqueues every chunk
 * that is already speakable; the unspoken remainder stays buffered.
 *
 * @param {string} text - Newly generated text fragment.
 */
function bufferTts(text) {
  state.ttsBuffer += text;
  const { ready, remainder } = extractSpeakableChunks(state.ttsBuffer, false);
  ready.forEach((chunk) => {
    enqueueTts(chunk);
  });
  state.ttsBuffer = remainder;
}
/**
 * Enqueues whatever speakable chunks the TTS buffer currently holds.
 *
 * @param {boolean} [final=false] - Passed through to extractSpeakableChunks;
 *   when true the trailing text is eligible to be emitted as a last chunk.
 */
function flushTts(final = false) {
  const { ready, remainder } = extractSpeakableChunks(state.ttsBuffer, final);
  ready.forEach((chunk) => {
    enqueueTts(chunk);
  });
  state.ttsBuffer = remainder;
}
/**
 * Splits buffered text into chunks that are ready to be spoken.
 *
 * Whitespace runs are collapsed to single spaces, then findBoundary is asked
 * repeatedly for the next cut position until it returns nothing. Chunks that
 * are empty after trimming are dropped.
 *
 * NOTE(review): findBoundary decides "first chunk" from state flags that this
 * loop does not update, so every chunk produced in one pass appears to be
 * sized with the same first-chunk setting — confirm that is intended.
 *
 * @param {string} buffer - Text accumulated so far.
 * @param {boolean} final - Whether no more text will arrive for this turn.
 * @returns {{ready: {text: string, boundaryKind: string, wordBoundarySafe: *}[], remainder: string}}
 *   Chunks to speak now and the text left in the buffer.
 */
function extractSpeakableChunks(buffer, final) {
  let remaining = buffer.replace(/\s+/g, " ");
  const ready = [];

  while (remaining.length > 0) {
    const boundary = findBoundary(remaining, final);
    if (!boundary) break;

    const { index, kind, wordBoundarySafe } = boundary;
    const spoken = remaining.slice(0, index).trim();
    remaining = remaining.slice(index).trimStart();
    if (spoken.length === 0) continue;

    ready.push({ text: spoken, boundaryKind: kind, wordBoundarySafe });
  }

  return { ready, remainder: remaining };
}
/**
 * Finds where the next TTS chunk should end within `text`.
 *
 * Preference order: the first sentence terminator followed by whitespace, the
 * first clause separator followed by whitespace (each only if far enough into
 * the text), then — once the text reaches the target length — the last space
 * at or before the target, the next space after it within twice the target,
 * or a hard cut at the target. When the text is shorter than the target, the
 * whole text is returned only if `final` is set.
 *
 * Thresholds come from `state.ttsChunking`; the "first*" variants apply while
 * first audio is awaited and no chunk has been queued yet.
 *
 * @param {string} text - Whitespace-normalized buffer contents.
 * @param {boolean} final - Whether this is the last extraction for the turn.
 * @returns {{index: number, kind: string, wordBoundarySafe: *}|null}
 *   Boundary descriptor from boundaryAt, or null when nothing should be cut yet.
 */
function findBoundary(text, final) {
  const isFirst = state.awaitingFirstAudio && !state.firstTtsChunkQueued;
  const chunking = state.ttsChunking;
  const limits = isFirst
    ? {
        sentenceMin: chunking.firstSentenceMinChars,
        clauseMin: chunking.firstClauseMinChars,
        target: chunking.firstTargetChars,
        minSpace: chunking.firstMinSpaceChars,
      }
    : {
        sentenceMin: chunking.sentenceMinChars,
        clauseMin: chunking.clauseMinChars,
        target: chunking.targetChars,
        minSpace: chunking.minSpaceChars,
      };

  const sentenceAt = text.search(/[.!?]\s/);
  if (sentenceAt >= limits.sentenceMin) {
    return boundaryAt(text, sentenceAt + 1, "sentence");
  }

  const clauseAt = text.search(/[,;:]\s/);
  if (clauseAt >= limits.clauseMin) {
    return boundaryAt(text, clauseAt + 1, "clause");
  }

  // Written as a negated ">=" so a non-numeric target behaves exactly like "not reached".
  if (!(text.length >= limits.target)) {
    const hasText = text.trim().length > 0;
    return final && hasText ? boundaryAt(text, text.length, "final") : null;
  }

  const spaceBefore = text.lastIndexOf(" ", limits.target);
  if (spaceBefore >= limits.minSpace) {
    return boundaryAt(text, spaceBefore, "space-before-target");
  }

  const forwardLimit = limits.target * 2;
  const spaceAfter = text.indexOf(" ", limits.target);
  if (spaceAfter >= limits.minSpace && spaceAfter <= forwardLimit) {
    return boundaryAt(text, spaceAfter, "space-after-target");
  }

  // For the first chunk, wait for more text rather than cutting mid-word too early.
  if (isFirst && text.length < forwardLimit) {
    return null;
  }
  return boundaryAt(text, limits.target, "hard-limit");
}
/**
 * Packages a cut position into a boundary descriptor.
 *
 * @param {string} text - Text being split.
 * @param {number} index - Position at which the chunk ends.
 * @param {string} kind - Name of the rule that chose the position.
 * @returns {{index: number, kind: string, wordBoundarySafe: *}} Descriptor,
 *   including whether the cut avoids splitting a word.
 */
function boundaryAt(text, index, kind) {
  const wordBoundarySafe = isWordBoundarySafe(text, index);
  return { index, kind, wordBoundarySafe };
}
/**
 * Tells whether cutting `text` at `index` avoids splitting a word.
 *
 * A cut is safe at or beyond the end of the text, directly before whitespace,
 * directly after sentence/clause punctuation, or whenever the characters on
 * the two sides are not both word-like.
 *
 * @param {string} text - Text being split.
 * @param {number} index - Proposed cut position.
 * @returns {boolean} True when the cut does not fall inside a word.
 */
function isWordBoundarySafe(text, index) {
  if (index >= text.length) return true;

  const previous = text[index - 1] ?? "";
  const next = text[index] ?? "";
  if (/\s/.test(next) || /[.!?,;:]/.test(previous)) return true;

  return !isWordLikeChar(previous) || !isWordLikeChar(next);
}
/**
 * Reports whether a single character can be part of a word.
 *
 * Letters, combining marks and digits from any script count, as do the
 * straight and curly apostrophes. An ASCII-only class would treat accented or
 * non-Latin letters as non-word characters, so a hard cut in the middle of a
 * word such as "café" would be reported as word-boundary safe.
 *
 * @param {string} char - One character; "" represents "no character".
 * @returns {boolean} True when the character is word-like.
 */
function isWordLikeChar(char) {
  return /[\p{L}\p{M}\p{N}'’]/u.test(char);
}
/**
 * Sends one chunk of text to the TTS worker for synthesis.
 *
 * Surrounding quote characters are stripped; a chunk that ends up empty is
 * dropped. For the first chunk queued in a turn, the time since the last
 * transcript is displayed and, when a benchmark is active, recorded on it
 * together with the chunk text and boundary details.
 *
 * @param {string|{text: string, boundaryKind: string, wordBoundarySafe: *}} chunk
 *   Plain text or a chunk descriptor from extractSpeakableChunks.
 */
function enqueueTts(chunk) {
  const voice = elements.voiceSelect.value;
  const steps = Number(elements.ttsSteps.value);
  // The sequence number is taken before the empty check, so dropped chunks still consume one.
  state.ttsSequence += 1;
  const sequence = state.ttsSequence;

  const { text, boundaryKind, wordBoundarySafe } = normalizeTtsChunk(chunk);
  const speakableText = text.trim().replace(/^["'“”]+|["'“”]+$/g, "");
  if (!speakableText) return;

  const enqueuedAt = performance.now();
  if (!state.firstTtsChunkQueued) {
    state.firstTtsChunkQueued = true;

    let queuedMs = null;
    if (Number.isFinite(state.lastTranscriptAt)) {
      queuedMs = enqueuedAt - state.lastTranscriptAt;
    }
    elements.firstTtsQueuedLatency.textContent = formatMs(queuedMs);

    if (state.activeBenchmark) {
      Object.assign(state.activeBenchmark, {
        firstTtsQueuedMs: queuedMs,
        firstTtsText: speakableText,
        firstTtsChars: speakableText.length,
        firstTtsBoundaryKind: boundaryKind,
        firstTtsWordBoundarySafe: wordBoundarySafe,
      });
    }
  }

  const request = {
    type: "speak",
    turnId: state.activeTurnId,
    sequence,
    enqueuedAt,
    text: speakableText,
    voice,
    steps,
    speed: 1.08,
  };
  state.ttsWorker.postMessage(request);
}
/**
 * Converts a TTS chunk given as plain text or as a descriptor into one
 * uniform descriptor shape.
 *
 * Plain strings are tagged as "direct" and word-boundary safe. For anything
 * else, missing fields fall back to "" (text, boundaryKind) and null
 * (wordBoundarySafe).
 *
 * @param {string|object|null|undefined} chunk - Chunk in either form.
 * @returns {{text: string, boundaryKind: string, wordBoundarySafe: *}} Descriptor.
 */
function normalizeTtsChunk(chunk) {
  const isPlainText = typeof chunk === "string";
  if (isPlainText) {
    return { text: chunk, boundaryKind: "direct", wordBoundarySafe: true };
  }

  const text = chunk?.text ?? "";
  const boundaryKind = chunk?.boundaryKind ?? "";
  const wordBoundarySafe = chunk?.wordBoundarySafe ?? null;
  return { text, boundaryKind, wordBoundarySafe };
}
/**
 * Tidies assistant output: trims whitespace, removes runs of straight or
 * curly double quotes and apostrophes from both ends, then trims again.
 *
 * @param {string} text - Raw assistant response.
 * @returns {string} Cleaned response.
 */
function cleanAssistantResponse(text) {
  const trimmed = text.trim();
  const unquoted = trimmed.replace(/^["'“”]+|["'“”]+$/g, "");
  return unquoted.trim();
}
/**
 * Starts the "identity" benchmark: a text-only turn that sends
 * APP_IDENTITY_PROMPT to the LLM with a fresh conversation.
 *
 * @returns {boolean} False when models are not loaded or another run blocks
 *   the start; true once the turn has been dispatched.
 */
function runBenchmark() {
  const mayStart = state.modelsLoaded && canStartBenchmark();
  if (!mayStart) return false;

  state.playback?.unlock().catch(() => {});
  interruptForBargeIn();
  resetConversationHistory();
  // No speech was involved, so speech-end/VAD timing from a previous turn is cleared.
  Object.assign(state, {
    lastSpeechEndAt: null,
    lastVadCloseAt: null,
    lastVadCloseDelayMs: null,
  });
  resetMetrics();

  const identityPrompt = APP_IDENTITY_PROMPT;
  startBenchmark("identity", identityPrompt);
  logEvent("Running identity benchmark.");
  elements.finalTranscript.textContent = identityPrompt;
  generateResponse(identityPrompt);
  return true;
}
/**
 * Starts the "chat" benchmark: a text-only turn asking the LLM for a
 * one-sentence greeting, with a fresh conversation.
 *
 * @returns {boolean} False when models are not loaded or another run blocks
 *   the start; true once the turn has been dispatched.
 */
function runChatBenchmark() {
  const mayStart = state.modelsLoaded && canStartBenchmark();
  if (!mayStart) return false;

  state.playback?.unlock().catch(() => {});
  interruptForBargeIn();
  resetConversationHistory();
  // Clear speech-end/VAD timing left over from any earlier spoken turn.
  for (const field of ["lastSpeechEndAt", "lastVadCloseAt", "lastVadCloseDelayMs"]) {
    state[field] = null;
  }
  resetMetrics();

  const chatPrompt = "Greet the user in one short sentence.";
  startBenchmark("chat", chatPrompt);
  logEvent("Running chat benchmark.");
  elements.finalTranscript.textContent = chatPrompt;
  generateResponse(chatPrompt);
  return true;
}
/**
 * Starts the "tts" benchmark: a fixed sentence is sent straight to the TTS
 * worker, bypassing the LLM.
 *
 * The benchmark is marked as having finished its (non-existent) LLM phase
 * immediately, and a zero-delay timer asks for finalization in case it is
 * already idle.
 *
 * @returns {boolean} False when models are not loaded or another run blocks
 *   the start; true once the sentence has been enqueued.
 */
function runTtsBenchmark() {
  const mayStart = state.modelsLoaded && canStartBenchmark();
  if (!mayStart) return false;

  state.playback?.unlock().catch(() => {});

  // Only send a TTS cancel when the TTS tile shows it is actually busy.
  const ttsTileText = elements.states.tts.textContent;
  const ttsIdle = ttsTileText === "Ready" || ttsTileText === "Idle";
  interruptForBargeIn({ cancelTts: !ttsIdle });

  Object.assign(state, {
    lastSpeechEndAt: null,
    lastVadCloseAt: null,
    lastVadCloseDelayMs: null,
  });
  resetMetrics();

  const sentence = "This is a short local speech benchmark.";
  startBenchmark("tts", sentence);

  Object.assign(state, {
    lastTranscriptAt: performance.now(),
    awaitingFirstAudio: true,
    firstTtsChunkQueued: false,
    ttsBuffer: "",
    currentAssistant: sentence,
  });
  elements.partialTranscript.textContent = "";
  elements.finalTranscript.textContent = "TTS benchmark";
  elements.llmOutput.textContent = sentence;
  state.activeBenchmark.output = sentence;

  logEvent("Running TTS benchmark.");
  enqueueTts(sentence);

  state.activeBenchmark.llmDoneAt = performance.now();
  state.activeBenchmark.llmCompleteMs = 0;
  window.setTimeout(() => finalizeBenchmarkIfIdle(), 0);
  return true;
}
/**
 * Starts the "loopback" benchmark by asking the TTS worker to synthesize a
 * prompt (the resulting audio is fed back through the speech pipeline
 * elsewhere).
 *
 * @param {object} [options]
 * @param {string} [options.text] - Prompt to synthesize; blank or non-string
 *   values fall back to DEFAULT_LOOPBACK_PROMPT.
 * @param {number} [options.speed] - Synthesis speed; non-finite values fall
 *   back to DEFAULT_LOOPBACK_SPEED.
 * @returns {boolean} False when models are not loaded or another run blocks
 *   the start; true once the synthesis request has been posted.
 */
function runLoopbackBenchmark(options = {}) {
  const mayStart = state.modelsLoaded && canStartBenchmark();
  if (!mayStart) return false;

  state.playback?.unlock().catch(() => {});
  interruptForBargeIn();
  resetConversationHistory();
  Object.assign(state, {
    lastSpeechEndAt: null,
    lastVadCloseAt: null,
    lastVadCloseDelayMs: null,
  });
  // A new request id lets stale synthesis results and feed callbacks be ignored.
  state.loopbackRequestId += 1;

  const requestedText = typeof options?.text === "string" ? options.text.trim() : "";
  const text = requestedText || DEFAULT_LOOPBACK_PROMPT;
  startBenchmark("loopback", text);

  elements.partialTranscript.textContent = "Synthesizing local loopback speech...";
  elements.finalTranscript.textContent = "";
  elements.llmOutput.textContent = "";
  resetMetrics();

  let speed = DEFAULT_LOOPBACK_SPEED;
  if (Number.isFinite(options?.speed)) {
    speed = options.speed;
  }
  state.activeBenchmark.loopbackSpeed = speed;

  logEvent(`Running voice loopback benchmark: "${text}"`);
  const request = {
    type: "synthesize",
    requestId: state.loopbackRequestId,
    text,
    voice: elements.voiceSelect.value,
    steps: 2,
    speed,
  };
  state.ttsWorker.postMessage(request);
  return true;
}
/**
 * Starts the "barge-in" check: a long sentence is queued for TTS, a synthetic
 * speech start is triggered 250 ms later, and 2200 ms after that the run is
 * judged by whether any first audio was recorded.
 *
 * @returns {boolean} False when models are not loaded or another run blocks
 *   the start; true once the check has been scheduled.
 */
function runBargeInBenchmark() {
if (!state.modelsLoaded || !canStartBenchmark()) return false;
state.playback?.unlock().catch(() => {});
interruptForBargeIn();
// Clear speech-end/VAD timing left over from any earlier spoken turn.
state.lastSpeechEndAt = null;
state.lastVadCloseAt = null;
state.lastVadCloseDelayMs = null;
resetMetrics();
const text = "This is a deliberately long local speech benchmark for interrupting audio playback.";
startBenchmark("barge-in", "Synthetic speech start during TTS");
// Prime the turn state as if the LLM had just produced `text`.
state.lastTranscriptAt = performance.now();
state.awaitingFirstAudio = true;
state.firstTtsChunkQueued = false;
state.ttsBuffer = "";
state.currentAssistant = text;
elements.partialTranscript.textContent = "";
elements.finalTranscript.textContent = "Barge-in check";
elements.llmOutput.textContent = text;
state.activeBenchmark.output = "Waiting to interrupt in-flight TTS.";
// There is no LLM phase in this check, so it is recorded as complete at once.
state.activeBenchmark.llmDoneAt = performance.now();
state.activeBenchmark.llmCompleteMs = 0;
logEvent("Running barge-in check.");
enqueueTts(text);
const startedAt = performance.now();
window.setTimeout(() => {
// Bail out if the benchmark was cancelled or replaced while waiting.
if (state.activeBenchmark?.kind !== "barge-in") return;
// Record how long after enqueueing the synthetic speech start fired.
state.activeBenchmark.bargeInMs = performance.now() - startedAt;
// Simulate the user starting to speak; this runs the barge-in interruption.
handleSpeechStart();
window.setTimeout(() => {
if (state.activeBenchmark?.kind !== "barge-in") return;
// Fall back to the displayed first-audio metric if none was recorded on the run.
state.activeBenchmark.firstAudioMs ??= parseMetricMs(elements.firstAudioLatency.textContent);
state.activeBenchmark.decodeRate = elements.decodeRate.textContent;
// The check passes only when no first audio was observed at all.
state.activeBenchmark.bargeInPass = state.activeBenchmark.firstAudioMs == null;
state.activeBenchmark.output = state.activeBenchmark.bargeInPass
? "Barge-in cancelled in-flight TTS before playback."
: "Barge-in did not prevent first audio.";
// A failed check is stored as an errored row, using the output text as the error.
if (!state.activeBenchmark.bargeInPass) state.activeBenchmark.error = state.activeBenchmark.output;
completeActiveBenchmark();
}, 2200);
}, 250);
return true;
}
/**
 * Collects one benchmark row intended as hardware WebGPU evidence.
 *
 * Refuses to start while another run or a model load is in progress, requires
 * WebGPU to be available on a non-software adapter, loads the default
 * evidence stack when no models are loaded, and finally runs the identity
 * benchmark if the current stack's device is "webgpu".
 *
 * @returns {Promise<boolean>} Result of runBenchmark, or false when any
 *   precondition fails.
 */
async function runHardwareWebGpuEvidenceBenchmark() {
  const busy =
    state.activeBenchmark || state.suiteRunning || state.micSeries.active || state.modelsLoading;
  if (busy) {
    logEvent("Finish the current benchmark before collecting WebGPU evidence.");
    return false;
  }

  await supportsWebGPU();
  const hardwareAdapterExposed = state.webgpuAvailable && !state.webgpuSoftwareAdapter;
  if (!hardwareAdapterExposed) {
    logEvent("Hardware WebGPU evidence requires a browser exposing a real GPU adapter.");
    updateGpuValidationStatus();
    return false;
  }

  if (!state.modelsLoaded) {
    applyHardwareWebGpuEvidenceStack();
    updateRuntimeStatus();
    logEvent("Loading the default hardware WebGPU evidence stack.");
    await loadModels({ ttsWarmup: false });
    if (!state.modelsLoaded) {
      logEvent("WebGPU evidence stack did not finish loading.");
      return false;
    }
  }

  const { device } = currentBenchmarkStack();
  if (device !== "webgpu") {
    logEvent("Unload the current stack, select WebGPU or Auto on a hardware adapter, then rerun WebGPU evidence.");
    return false;
  }

  logEvent("Running hardware WebGPU evidence row.");
  return runBenchmark();
}
/**
 * Runs the full evidence capture flow: one hardware WebGPU row (when a
 * hardware adapter is exposed and no such row exists yet), followed by a
 * real-mic series (when real-mic evidence has not passed yet).
 *
 * Benchmark controls are disabled for the duration and restored in `finally`
 * together with the capture flag and related status widgets.
 *
 * @param {object} [options]
 * @param {boolean} [options.autoDownload=false] - Download the evidence JSON
 *   at the end when at least one benchmark row exists.
 * @returns {Promise<boolean>} True when the sequence ran to the end (even with
 *   evidence still missing); false when it could not start or was stopped by
 *   an error.
 */
async function runEvidenceCaptureSequence({ autoDownload = false } = {}) {
if (evidenceCaptureBusy()) {
logEvent("Finish the current run before starting evidence capture.");
return false;
}
state.evidenceCaptureRunning = true;
setBenchmarkControlsDisabled(true);
logEvent("Starting evidence capture: hardware WebGPU row when available, then 3 real-mic rows.");
try {
await supportsWebGPU();
const hasHardwareWebGpu = state.webgpuAvailable === true && state.webgpuSoftwareAdapter !== true;
// Step 1: make sure some stack is loaded, preferring the WebGPU evidence stack.
if (!state.modelsLoaded) {
if (hasHardwareWebGpu) {
await loadHardwareWebGpuEvidenceStack();
} else {
logEvent("Loading the selected stack for real-mic evidence capture.");
updateRuntimeStatus();
await loadModels({ ttsWarmup: false });
}
if (!state.modelsLoaded) throw new Error("Model load did not complete.");
}
// Step 2: collect a hardware WebGPU row unless one is already recorded.
const beforeGpuEvidence = evidenceSummaryForRows(state.benchmarkResults).hardwareWebgpu;
if (hasHardwareWebGpu && beforeGpuEvidence.hardwareRows === 0) {
if (currentBenchmarkStack().device !== "webgpu") await loadHardwareWebGpuEvidenceStack();
const previousCount = state.benchmarkResults.length;
const started = await runHardwareWebGpuEvidenceBenchmark();
if (!started) throw new Error("Hardware WebGPU evidence row did not start.");
// Wait for the new row; the wait is abandoned if evidence capture is switched off.
await waitForBenchmarkRow(previousCount, "Hardware WebGPU evidence", {
timeoutMs: 150000,
isActive: () => state.evidenceCaptureRunning,
});
} else if (!hasHardwareWebGpu) {
logEvent("Hardware WebGPU is not exposed; evidence capture will collect real-mic rows only.");
}
// Step 3: run the real-mic series unless real-mic evidence already passes.
const realMicEvidence = evidenceSummaryForRows(state.benchmarkResults).realMic;
if (!realMicEvidence.passed) {
const started = await runMicSeriesBenchmark();
if (!started) throw new Error("Real-mic series did not start.");
// Wait (up to 420000 ms) until the series and its last benchmark have both ended.
await waitForAutomationCondition(
() => !state.micSeries.active && !state.activeBenchmark,
420000,
"Evidence real-mic series",
);
} else {
logEvent("Real-mic evidence is already complete.");
}
// Step 4: report what is still missing and optionally download the JSON.
const evidence = evidenceSummaryForRows(state.benchmarkResults);
const missing = [];
if (!evidence.realMic.passed) missing.push("real mic");
if (!evidence.hardwareWebgpu.passed) missing.push("hardware WebGPU");
logEvent(
missing.length === 0
? autoDownload
? "Evidence capture complete; starting JSON download."
: "Evidence capture complete; use Download JSON to save the proof."
: autoDownload
? `Evidence capture finished; still missing ${missing.join(" and ")} evidence. Starting JSON download with current rows.`
: `Evidence capture finished; still missing ${missing.join(" and ")} evidence. Use Download JSON to save current rows.`,
);
if (autoDownload && state.benchmarkResults.length > 0) {
downloadBenchmarkJson({
prefix: "browser-speak-evidence",
message: "Evidence JSON download started.",
});
}
return true;
} catch (error) {
// Any failure also stops a mic series that is still running.
if (state.micSeries.active) cancelMicSeries();
logEvent(`Evidence capture stopped: ${error.message}`);
return false;
} finally {
state.evidenceCaptureRunning = false;
// Controls stay disabled while a benchmark suite is still running.
if (state.modelsLoaded && !state.suiteRunning) setBenchmarkControlsDisabled(false);
updateEvidenceCaptureButton();
updateMicValidationStatus();
}
}
/**
 * Ensures the loaded model stack runs on the "webgpu" device.
 *
 * A loaded non-WebGPU stack is unloaded first; a loaded WebGPU stack is left
 * untouched. Otherwise the default evidence stack is applied to the controls
 * and loaded.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the load does not finish or the resulting stack's
 *   device is not "webgpu".
 */
async function loadHardwareWebGpuEvidenceStack() {
  const onWebGpu = () => currentBenchmarkStack().device === "webgpu";

  if (state.modelsLoaded && !onWebGpu()) {
    logEvent("Switching to the default hardware WebGPU evidence stack. Existing benchmark rows are preserved.");
    await unloadModels({ quiet: true, preserveEvidenceCapture: true });
  }
  if (state.modelsLoaded && onWebGpu()) {
    return;
  }

  applyHardwareWebGpuEvidenceStack();
  updateRuntimeStatus();
  logEvent("Loading the default hardware WebGPU evidence stack.");
  await loadModels({ ttsWarmup: false });

  const ready = state.modelsLoaded && onWebGpu();
  if (!ready) {
    throw new Error("WebGPU evidence stack did not finish loading.");
  }
}
/**
 * Writes the HARDWARE_WEBGPU_EVIDENCE_STACK settings into the stack selection
 * controls and refreshes the TTS-steps and VAD-silence labels.
 */
function applyHardwareWebGpuEvidenceStack() {
  const { device, llm, asr, voice, ttsSteps, vadSilenceMs, partialAsr } =
    HARDWARE_WEBGPU_EVIDENCE_STACK;

  elements.deviceSelect.value = device;
  elements.llmModelSelect.value = llm;
  elements.asrModelSelect.value = asr;
  elements.voiceSelect.value = voice;
  // Range inputs hold strings, so numeric settings are stringified explicitly.
  elements.ttsSteps.value = String(ttsSteps);
  elements.vadSilence.value = String(vadSilenceMs);
  elements.partialToggle.checked = partialAsr;

  updateTtsStepsLabel();
  updateVadSilenceLabel();
}
/**
 * Starts a real-microphone benchmark turn in which the user is asked to say
 * APP_IDENTITY_PROMPT.
 *
 * @param {object} [options]
 * @param {boolean} [options.series] - True when this run belongs to a mic
 *   series (allows starting while the series is active and labels the run).
 * @param {number} [options.timeoutMs] - Benchmark timeout; defaults to 120000.
 * @param {boolean} [options.stopMicAfterTranscript] - Stored on the benchmark.
 * @param {boolean} [options.requireExactTranscript] - Stored on the benchmark.
 * @returns {Promise<boolean>} True once the microphone is running for the
 *   benchmark; false when it could not start.
 */
async function runMicBenchmark(options = {}) {
  const partOfSeries = options?.series === true;
  if (!state.modelsLoaded) return false;
  if (!canStartBenchmark({ allowMicSeries: partOfSeries })) return false;

  state.playback?.unlock().catch(() => {});
  interruptForBargeIn();
  Object.assign(state, {
    lastSpeechEndAt: null,
    lastVadCloseAt: null,
    lastVadCloseDelayMs: null,
  });
  resetMetrics();

  const prompt = APP_IDENTITY_PROMPT;
  let runLabel = "";
  if (partOfSeries) {
    runLabel = ` (${state.micSeries.completed + 1}/${state.micSeries.target})`;
  }

  elements.finalTranscript.textContent = prompt;
  elements.llmOutput.textContent = "";
  elements.partialTranscript.textContent = `Say: "${prompt}"${runLabel}`;

  const timeoutMs = options?.timeoutMs ?? 120000;
  startBenchmark("mic", prompt, { referenceText: prompt, timeoutMs });
  state.activeBenchmark.stopMicAfterTranscript = options?.stopMicAfterTranscript === true;
  state.activeBenchmark.requireExactTranscript = options?.requireExactTranscript === true;
  updateMicValidationStatus();
  logEvent(`Starting real-mic benchmark${runLabel}: "${prompt}"`);

  try {
    if (!state.micActive) {
      await startMic();
    }
    logEvent(`Benchmarking real-mic turn: "${prompt}"`);
    return true;
  } catch (error) {
    // Without a microphone the run (and any surrounding series) cannot proceed.
    cancelActiveBenchmark();
    if (partOfSeries) cancelMicSeries();
    logEvent(`Microphone benchmark failed: ${error.message}`);
    return false;
  }
}
/**
 * Starts a three-run real-mic series and kicks off its first run.
 *
 * The series is cancelled again if that first run fails to start.
 *
 * @returns {Promise<boolean>} Whether the first run of the series started.
 */
async function runMicSeriesBenchmark() {
  const mayStart = state.modelsLoaded && canStartBenchmark();
  if (!mayStart) return false;

  Object.assign(state.micSeries, { active: true, completed: 0, target: 3 });
  updateMicSeriesButton();
  setBenchmarkControlsDisabled(false);
  logEvent("Starting 3-run real-mic series.");

  const started = await runMicBenchmark({ series: true });
  if (!started) {
    cancelMicSeries();
  }
  return started;
}
/**
 * Checks whether a new benchmark may start right now, logging the reason when
 * it may not.
 *
 * @param {object} [options]
 * @param {boolean} [options.allowMicSeries=false] - Permit starting while a
 *   real-mic series is active (used by the series' own runs).
 * @returns {boolean} True when nothing blocks a new benchmark.
 */
function canStartBenchmark({ allowMicSeries = false } = {}) {
  let blocker = null;
  if (state.activeBenchmark) {
    blocker = "A benchmark is already running.";
  } else if (state.micSeries.active && !allowMicSeries) {
    blocker = "A real-mic series is already running.";
  }

  if (blocker === null) return true;
  logEvent(blocker);
  return false;
}
/**
 * Runs the automated benchmark suite for the current stack, one step after
 * another: TTS, barge-in, identity, chat, loopback.
 *
 * The first step that fails stops the suite; the failure is logged rather
 * than thrown. Benchmark controls are disabled while the suite runs.
 *
 * @returns {Promise<void>}
 */
async function runBenchmarkSuite() {
  if (!state.modelsLoaded || state.suiteRunning) return;
  if (!canStartBenchmark()) return;

  state.suiteRunning = true;
  setBenchmarkControlsDisabled(true);
  logEvent("Running benchmark suite for the current stack.");

  const steps = [
    ["TTS", runTtsBenchmark],
    ["barge-in", runBargeInBenchmark],
    ["identity", runBenchmark],
    ["chat", runChatBenchmark],
    ["loopback", runLoopbackBenchmark],
  ];

  try {
    for (const [label, runner] of steps) {
      await runSuiteStep(label, runner);
    }
    logEvent("Benchmark suite complete.");
  } catch (error) {
    logEvent(`Benchmark suite stopped: ${error.message}`);
  } finally {
    state.suiteRunning = false;
    if (state.modelsLoaded) {
      setBenchmarkControlsDisabled(false);
    }
  }
}
/**
 * Runs one suite step and waits until it has produced a new benchmark row.
 *
 * @param {string} label - Step name used in error messages.
 * @param {Function} runner - Starts the benchmark; returning exactly `false`
 *   means it did not start.
 * @returns {Promise<void>}
 * @throws {Error} When the runner reports it did not start, or when waiting
 *   for the row fails.
 */
async function runSuiteStep(label, runner) {
  const rowsBefore = state.benchmarkResults.length;
  const started = runner();
  if (started === false) {
    throw new Error(`${label} benchmark did not start.`);
  }
  await waitForBenchmarkRow(rowsBefore, label);
}
/**
 * Waits, polling every 250 ms, until the benchmark result list has grown
 * beyond `previousCount`.
 *
 * @param {number} previousCount - Number of rows before the benchmark started.
 * @param {string} label - Benchmark name used in error messages.
 * @param {object} [options]
 * @param {number} [options.timeoutMs=135000] - Maximum time to wait.
 * @param {Function} [options.isActive] - Returns false once the surrounding
 *   run was cancelled; defaults to checking `state.suiteRunning`.
 * @returns {Promise<void>} Resolves when a new row appears; rejects when the
 *   run is no longer active or the timeout elapses.
 */
function waitForBenchmarkRow(
  previousCount,
  label,
  { timeoutMs = 135000, isActive = () => state.suiteRunning } = {},
) {
  return new Promise((resolve, reject) => {
    const startedAt = performance.now();
    let timer;
    // Every outcome stops the polling before settling the promise.
    const settle = (outcome, value) => {
      window.clearInterval(timer);
      outcome(value);
    };

    timer = window.setInterval(() => {
      if (state.benchmarkResults.length > previousCount) {
        settle(resolve);
        return;
      }
      if (!isActive()) {
        settle(reject, new Error(`${label} benchmark was cancelled.`));
        return;
      }
      const elapsedMs = performance.now() - startedAt;
      if (elapsedMs > timeoutMs) {
        settle(reject, new Error(`${label} benchmark did not finish.`));
      }
    }, 250);
  });
}
/**
 * Feeds synthesized audio into the ASR worker as if it were live microphone
 * input: silent preroll, the audio itself, trailing silence, then a flush.
 *
 * Chunks are scheduled at cumulative delays equal to the duration of the
 * audio already posted, so the worker receives them at playback pace.
 *
 * @param {Float32Array|ArrayLike<number>|ArrayBuffer} audio - Synthesized
 *   samples; anything that is not a Float32Array is wrapped in one.
 * @param {number} sampleRate - Sample rate of `audio` in Hz.
 */
function feedLoopbackAudio(audio, sampleRate) {
// Drop any feed still scheduled from a previous loopback run.
clearLoopbackFeed();
// Captured so callbacks from this feed can detect that a newer request replaced it.
const requestId = state.loopbackRequestId;
const samples = audio instanceof Float32Array ? audio : new Float32Array(audio);
// About 32 ms of audio per chunk, but never fewer than 512 samples.
const chunkSize = Math.max(512, Math.floor(sampleRate * 0.032));
if (state.activeBenchmark) state.activeBenchmark.loopbackSampleRate = sampleRate;
elements.partialTranscript.textContent = "Feeding synthesized speech through VAD...";
let delayMs = 0;
// Schedules one chunk at the current offset, then advances the offset by the chunk's duration.
const postChunk = (chunk) => {
scheduleLoopbackChunk(() => {
if (requestId !== state.loopbackRequestId || !state.asrWorker) return;
// The chunk's buffer is passed in the transfer list rather than copied.
state.asrWorker.postMessage({ type: "audio", buffer: chunk, sampleRate }, [chunk.buffer]);
}, delayMs);
delayMs += (chunk.length / sampleRate) * 1000;
};
// Leading silence of DEFAULT_LOOPBACK_PREROLL_MS before the speech begins.
const prerollSamples = Math.ceil((sampleRate * DEFAULT_LOOPBACK_PREROLL_MS) / 1000);
if (prerollSamples > 0) {
if (state.activeBenchmark) state.activeBenchmark.loopbackPrerollMs = DEFAULT_LOOPBACK_PREROLL_MS;
for (let offset = 0; offset < prerollSamples; offset += chunkSize) {
postChunk(new Float32Array(Math.min(chunkSize, prerollSamples - offset)));
}
}
// The speech itself; slice() copies, so each chunk owns a buffer that can be transferred.
for (let offset = 0; offset < samples.length; offset += chunkSize) {
const chunk = samples.slice(offset, Math.min(offset + chunkSize, samples.length));
postChunk(chunk);
}
// Trailing silence slightly longer (220 ms) than the configured VAD silence window.
const silenceSeconds = (currentVadSilenceMs() + 220) / 1000;
const silenceChunks = Math.ceil(sampleRate * silenceSeconds / chunkSize);
for (let i = 0; i < silenceChunks; i += 1) {
const silence = new Float32Array(chunkSize);
postChunk(silence);
}
// One second after the last chunk's end, ask the ASR worker to flush.
scheduleLoopbackChunk(() => {
if (requestId !== state.loopbackRequestId || !state.asrWorker) return;
state.asrWorker.postMessage({ type: "flush" });
}, delayMs + 1000);
}
/**
 * Reports whether the page is served from a host on which the automation API
 * may be installed.
 *
 * Accepted hosts are the loopback names/addresses, an empty hostname, and any
 * host ending in ".hf.space".
 *
 * @returns {boolean}
 */
function isLocalAutomationHost() {
  const { hostname } = location;
  // `location.hostname` serializes IPv6 literals with brackets, so the
  // loopback address arrives as "[::1]"; the bare form is kept for
  // backward compatibility.
  const allowedHosts = ["localhost", "127.0.0.1", "[::1]", "::1", ""];
  return allowedHosts.includes(hostname) || hostname.endsWith(".hf.space");
}
/**
 * Builds a plain-object snapshot of the page state for automation clients.
 *
 * Top-level containers (`activeBenchmark`, `micSeries`, `ttsChunking`,
 * `results`) are shallow copies, `micSeries.timer` is reduced to a boolean,
 * and `events` holds the text of every entry currently in the event log.
 *
 * @returns {object} Snapshot with model, microphone, benchmark, network and
 *   stack information.
 */
function automationSnapshot() {
  const { chunks, samples, peak, sumSquares } = state.micInputStats;
  // RMS is only defined once at least one sample has been recorded.
  const rms = samples > 0 ? Math.sqrt(sumSquares / samples) : null;
  const runningBenchmark = state.activeBenchmark;
  const seriesTimerSet = Boolean(state.micSeries.timer);

  return {
    modelsLoaded: state.modelsLoaded,
    modelsLoading: state.modelsLoading,
    micActive: state.micActive,
    micInputStats: { chunks, samples, peak, level: state.micLevel, rms },
    activeBenchmark: runningBenchmark ? { ...runningBenchmark } : null,
    suiteRunning: state.suiteRunning,
    evidenceCaptureRunning: state.evidenceCaptureRunning,
    micSeries: { ...state.micSeries, timer: seriesTimerSet },
    ttsChunking: { ...state.ttsChunking },
    network: workerNetworkSummary(),
    stack: state.loadedStack ? currentBenchmarkStack() : null,
    summary: exportBenchmarkSummary(),
    results: Array.from(state.benchmarkResults),
    events: Array.from(elements.eventLog.children, (item) => item.textContent),
  };
}
/**
 * Waits for `supportsWebGPU()` and then reports the WebGPU fields stored on
 * `state` together with basic browser and hardware details from `navigator`.
 *
 * @returns {Promise<object>} `available`, `adapter`, `features`,
 *   `softwareAdapter`, `userAgent`, `platform` (empty string when falsy),
 *   `hardwareConcurrency` and `deviceMemoryGb` (both `null` when absent).
 */
async function automationWebGpuInfo() {
  await supportsWebGPU();
  // Read state only after the probe has finished.
  const {
    webgpuAvailable,
    webgpuAdapterInfo,
    webgpuAdapterFeatures,
    webgpuSoftwareAdapter,
  } = state;
  const { userAgent, platform, hardwareConcurrency, deviceMemory } = navigator;
  return {
    available: webgpuAvailable,
    adapter: webgpuAdapterInfo,
    features: webgpuAdapterFeatures,
    softwareAdapter: webgpuSoftwareAdapter,
    userAgent,
    platform: platform || "",
    hardwareConcurrency: hardwareConcurrency ?? null,
    deviceMemoryGb: deviceMemory ?? null,
  };
}
/**
 * Applies an automation-supplied stack selection to the device, LLM and ASR
 * selects, then forwards the same options to `applyRuntimeOptions`.
 *
 * @param {{ device?: string, llm?: string, asr?: string }} [options] - Only
 *   truthy entries overwrite the corresponding select value; the object may
 *   also carry runtime options.
 * @returns {void}
 * @throws {Error} When models are loaded or currently loading.
 */
function applyAutomationStack(options = {}) {
  const busy = state.modelsLoaded || state.modelsLoading;
  if (busy) {
    throw new Error("Unload models before changing the benchmark stack.");
  }

  const selectors = [
    ["device", elements.deviceSelect],
    ["llm", elements.llmModelSelect],
    ["asr", elements.asrModelSelect],
  ];
  for (const [key, select] of selectors) {
    if (options[key]) select.value = options[key];
  }
  applyRuntimeOptions(options);
}
/**
 * Returns a copy of the current TTS chunking settings with any valid
 * overrides from `options` applied.
 *
 * Only keys present in `DEFAULT_TTS_CHUNKING` are considered. Null/undefined
 * and non-finite values are ignored; accepted values are rounded and clamped
 * to the range 1..160. `state.ttsChunking` itself is not modified.
 *
 * @param {Record<string, unknown>} [options] - Candidate overrides.
 * @returns {Record<string, number>} The merged chunking settings.
 */
function normalizedTtsChunking(options = {}) {
  const clamp = (value) => Math.max(1, Math.min(160, Math.round(value)));
  const overrides = Object.keys(DEFAULT_TTS_CHUNKING)
    .filter((key) => options[key] != null)
    .map((key) => [key, Number(options[key])])
    .filter(([, value]) => Number.isFinite(value))
    .map(([key, value]) => [key, clamp(value)]);
  return { ...state.ttsChunking, ...Object.fromEntries(overrides) };
}
/**
 * Tells whether `options` carries a non-null value for at least one key of
 * `DEFAULT_TTS_CHUNKING`.
 *
 * @param {Record<string, unknown>} [options] - Option bag to inspect.
 * @returns {boolean}
 */
function hasTtsChunkingOption(options = {}) {
  for (const key of Object.keys(DEFAULT_TTS_CHUNKING)) {
    const value = options[key];
    if (value !== undefined && value !== null) return true;
  }
  return false;
}
/**
 * Applies runtime-tunable options to the UI controls and page state.
 *
 * Handled options: `voice`, `ttsSteps`, `vadSilenceMs`, `partialAsr`, and TTS
 * chunking values given either nested under `ttsChunking` or as top-level
 * keys. While models are loaded or loading, a voice change is also sent to
 * the TTS worker as a "preload-voice" message and the ASR worker is
 * reconfigured at the end.
 *
 * @param {object} [options] - Option bag; absent entries leave the
 *   corresponding setting untouched.
 * @returns {void}
 */
function applyRuntimeOptions(options = {}) {
  const modelsBusy = () => state.modelsLoaded || state.modelsLoading;
  const { voice, ttsSteps, vadSilenceMs, partialAsr, ttsChunking } = options;

  if (voice) {
    elements.voiceSelect.value = voice;
    if (modelsBusy()) {
      // Send the value read back from the select, not the raw option.
      const selectedVoice = elements.voiceSelect.value;
      state.ttsWorker?.postMessage({ type: "preload-voice", voice: selectedVoice });
    }
  }

  if (ttsSteps != null) {
    elements.ttsSteps.value = String(ttsSteps);
    updateTtsStepsLabel();
  }

  if (vadSilenceMs != null) {
    elements.vadSilence.value = String(vadSilenceMs);
    updateVadSilenceLabel();
  }

  if (partialAsr != null) {
    elements.partialToggle.checked = Boolean(partialAsr);
  }

  if (ttsChunking || hasTtsChunkingOption(options)) {
    state.ttsChunking = normalizedTtsChunking(ttsChunking ?? options);
  }

  if (modelsBusy()) configureAsrWorker();
}
/**
 * Asks the TTS worker to preload a voice and returns a promise for the
 * outcome.
 *
 * The pending request is stored in `state.ttsVoiceRequests` under a fresh
 * numeric id as `{ resolve, reject, timeout }`. This function only rejects on
 * a missing worker or on timeout; settling on success is left to whoever
 * consumes that map entry.
 *
 * @param {string} [voice] - Voice to preload; defaults to the selected voice.
 * @param {{ timeoutMs?: number }} [options] - Maximum wait before rejecting.
 * @returns {Promise<unknown>}
 */
function preloadTtsVoice(voice = elements.voiceSelect.value, { timeoutMs = 30000 } = {}) {
  const worker = state.ttsWorker;
  if (!worker) {
    return Promise.reject(new Error("TTS worker is not loaded."));
  }

  const requestId = ++state.ttsVoiceRequestId;
  return new Promise((resolve, reject) => {
    const expire = () => {
      state.ttsVoiceRequests.delete(requestId);
      reject(new Error(`Voice ${voice} preload timed out.`));
    };
    const timeout = window.setTimeout(expire, timeoutMs);
    state.ttsVoiceRequests.set(requestId, { resolve, reject, timeout });
    worker.postMessage({ type: "preload-voice", voice, requestId });
  });
}
/**
 * Fails every pending voice-preload request with `error`, cancels each
 * request's timeout timer, and empties `state.ttsVoiceRequests`.
 *
 * @param {Error} error - Reason passed to each request's `reject`.
 * @returns {void}
 */
function rejectTtsVoiceRequests(error) {
  const pending = state.ttsVoiceRequests;
  pending.forEach((request) => {
    window.clearTimeout(request.timeout);
    request.reject(error);
  });
  pending.clear();
}
/**
 * Polls `predicate` every 250 ms and resolves as soon as it returns a truthy
 * value. Rejects once more than `timeoutMs` has elapsed without success.
 *
 * @param {() => unknown} predicate - Condition to wait for.
 * @param {number} timeoutMs - Maximum time to keep polling.
 * @param {string} label - Prefix of the timeout error message.
 * @returns {Promise<void>}
 */
function waitForAutomationCondition(predicate, timeoutMs, label) {
  return new Promise((resolve, reject) => {
    const startedAt = performance.now();
    const pollId = window.setInterval(() => {
      const satisfied = Boolean(predicate());
      if (!satisfied && performance.now() - startedAt <= timeoutMs) {
        return; // Still waiting and still within budget.
      }
      window.clearInterval(pollId);
      if (satisfied) {
        resolve();
      } else {
        const seconds = (timeoutMs / 1000).toFixed(0);
        reject(new Error(`${label} timed out after ${seconds} seconds.`));
      }
    }, 250);
  });
}
/**
 * Returns the leading entries of `state.benchmarkResults`, as many as the
 * list has grown by since `previousCount` was captured (never fewer than
 * zero).
 *
 * NOTE(review): taking rows from the front is only right if new results are
 * inserted at the start of the array — confirm against the code that records
 * results.
 *
 * @param {number} previousCount - Row count captured before the run.
 * @returns {Array<object>} A new array holding the selected rows.
 */
function benchmarkRowsAddedSince(previousCount) {
  const { benchmarkResults } = state;
  const addedCount = Math.max(0, benchmarkResults.length - previousCount);
  return benchmarkResults.slice(0, addedCount);
}
/**
 * Sends a "synthesize" request to the TTS worker on behalf of an automation
 * client.
 *
 * The request is registered in `state.automationSynthesisRequests` under an
 * id of the form `automation-<n>`. Its stored `resolve`/`reject` callbacks
 * cancel the timeout before settling the returned promise. On timeout the
 * entry is removed and the promise rejects.
 *
 * @param {object} [options]
 * @param {string} [options.text] - Text to speak (defaults to the app
 *   identity prompt).
 * @param {string} [options.voice] - Voice id (defaults to the selected voice).
 * @param {number} [options.steps=2] - Forwarded to the worker as `steps`.
 * @param {number} [options.speed=1.05] - Forwarded to the worker as `speed`.
 * @param {number} [options.timeoutMs=60000] - Maximum wait before rejecting.
 * @returns {Promise<unknown>} Settles with whatever the stored callbacks are
 *   invoked with.
 */
function synthesizeAutomationAudio({
  text = APP_IDENTITY_PROMPT,
  voice = elements.voiceSelect.value,
  steps = 2,
  speed = 1.05,
  timeoutMs = 60000,
} = {}) {
  const ready = state.modelsLoaded && state.ttsWorker;
  if (!ready) {
    return Promise.reject(new Error("Load models before synthesizing automation audio."));
  }

  const requestId = `automation-${++state.automationSynthesisRequestId}`;
  const job = { text, voice, steps, speed };

  return new Promise((resolve, reject) => {
    const timeout = window.setTimeout(() => {
      state.automationSynthesisRequests.delete(requestId);
      reject(new Error("Automation synthesis timed out."));
    }, timeoutMs);
    // Wraps a settle function so the timeout is always cancelled first.
    const cancelTimeoutThen = (settle) => (value) => {
      window.clearTimeout(timeout);
      settle(value);
    };

    state.automationSynthesisRequests.set(requestId, {
      ...job,
      resolve: cancelTimeoutThen(resolve),
      reject: cancelTimeoutThen(reject),
    });
    state.ttsWorker.postMessage({ type: "synthesize", requestId, ...job });
  });
}
/**
 * Runs one benchmark for an automation client and waits for its result row.
 *
 * The runner receives every option except that its `timeoutMs` is 5 seconds
 * shorter than the overall budget (but at least 1 second). Waiting for the
 * new row uses the full `timeoutMs`.
 *
 * @param {(options: object) => (boolean|void|Promise<boolean|void>)} runner -
 *   Benchmark starter; a resolved value of exactly `false` means it refused.
 * @param {{ timeoutMs?: number }} [options] - Overall time budget plus any
 *   extra options to pass through to the runner.
 * @returns {Promise<object>} The automation snapshot taken after the row
 *   appeared.
 * @throws {Error} When the benchmark does not start or the wait fails.
 */
async function runSingleAutomationBenchmark(runner, { timeoutMs = 150000, ...runnerOptions } = {}) {
  const previousCount = state.benchmarkResults.length;
  const runnerTimeoutMs = Math.max(1000, timeoutMs - 5000);

  const started = await runner({ ...runnerOptions, timeoutMs: runnerTimeoutMs });
  if (started === false) {
    throw new Error("Benchmark did not start.");
  }

  const rowAdded = () => state.benchmarkResults.length > previousCount;
  await waitForAutomationCondition(rowAdded, timeoutMs, "Benchmark");
  return automationSnapshot();
}
/**
 * Runs the real-microphone benchmark series for an automation client and
 * checks that it produced the expected number of successful rows.
 *
 * After the series starts, this waits until neither a mic series nor a
 * benchmark is active. The target count is read from the first "mic" row
 * returned by `benchmarkRowsAddedSince` (3 if that row has none); only mic
 * rows carrying that same target are counted.
 *
 * @param {{ timeoutMs?: number }} [options] - Maximum wait for the series.
 * @returns {Promise<object>} The automation snapshot after a complete series.
 * @throws {Error} When the series does not start, the wait fails, too few
 *   rows were produced, or any counted row has an `error`.
 */
async function runAutomationMicSeries({ timeoutMs = 420000 } = {}) {
  const previousCount = state.benchmarkResults.length;

  const started = await runMicSeriesBenchmark();
  if (started === false) {
    throw new Error("Real-mic series did not start.");
  }

  const seriesIdle = () => !state.micSeries.active && !state.activeBenchmark;
  await waitForAutomationCondition(seriesIdle, timeoutMs, "Real-mic series");

  const micRows = benchmarkRowsAddedSince(previousCount).filter((row) => row.kind === "mic");
  const target = micRows[0]?.micSeriesTarget ?? 3;
  const seriesRows = micRows.filter((row) => row.micSeriesTarget === target);
  const incomplete = seriesRows.length < target || seriesRows.some((row) => row.error);
  if (incomplete) {
    throw new Error(`Real-mic series produced ${seriesRows.length}/${target} completed rows.`);
  }
  return automationSnapshot();
}
/**
 * Publishes the automation API as `window.browserSpeakBench`, but only when
 * `isLocalAutomationHost()` allows it; otherwise nothing is installed.
 *
 * Most entries return (a promise for) `automationSnapshot()` so a driving
 * script can inspect the page state after each call.
 *
 * @returns {void}
 */
function installAutomationApi() {
  if (!isLocalAutomationHost()) return;

  window.browserSpeakBench = {
    version: 1,

    // --- Inspection -------------------------------------------------------
    state: automationSnapshot,
    exportResults() {
      return benchmarkExportPayload();
    },
    webgpuInfo: automationWebGpuInfo,

    // --- Configuration and model lifecycle --------------------------------
    setRuntimeOptions(options = {}) {
      applyRuntimeOptions(options);
      return automationSnapshot();
    },
    async preloadVoice(options = {}) {
      const voice = options.voice ?? elements.voiceSelect.value;
      const timeoutMs = options.timeoutMs ?? 30000;
      await preloadTtsVoice(voice, { timeoutMs });
      return automationSnapshot();
    },
    async loadStack(options = {}) {
      await pageInitPromise;
      applyAutomationStack(options);
      // TTS warm-up stays on unless the caller passes `ttsWarmup: false`.
      await loadModels({ ttsWarmup: options.ttsWarmup !== false });
      if (!state.modelsLoaded) throw new Error("Model load did not complete.");
      return automationSnapshot();
    },
    async unload() {
      await unloadModels();
      return automationSnapshot();
    },

    // --- Benchmarks -------------------------------------------------------
    async runSuite() {
      await runBenchmarkSuite();
      return automationSnapshot();
    },
    runIdentity(options) {
      return runSingleAutomationBenchmark(runBenchmark, options);
    },
    runChat(options) {
      return runSingleAutomationBenchmark(runChatBenchmark, options);
    },
    runTts(options) {
      return runSingleAutomationBenchmark(runTtsBenchmark, options);
    },
    runLoopback(options) {
      return runSingleAutomationBenchmark(runLoopbackBenchmark, options);
    },
    runBargeIn(options) {
      return runSingleAutomationBenchmark(runBargeInBenchmark, options);
    },
    runWebGpuEvidence(options) {
      return runSingleAutomationBenchmark(runHardwareWebGpuEvidenceBenchmark, options);
    },
    async runEvidenceCapture(options = {}) {
      // Downloads are opt-in for automation: only an explicit `true` enables them.
      const autoDownload = options.autoDownload === true;
      const started = await runEvidenceCaptureSequence({ autoDownload });
      if (!started) throw new Error("Evidence capture did not start.");
      return automationSnapshot();
    },
    downloadResults(options = {}) {
      const prefix = options.prefix ?? "browser-speak-benchmarks";
      const message = options.message ?? "Benchmark JSON download started.";
      downloadBenchmarkJson({ prefix, message });
      return automationSnapshot();
    },
    runMic(options) {
      return runSingleAutomationBenchmark(runMicBenchmark, options);
    },
    runMicSeries: runAutomationMicSeries,
    synthesizeAudio: synthesizeAutomationAudio,
    waitForRows(count, { timeoutMs = 180000 } = {}) {
      const enoughRows = () => state.benchmarkResults.length >= count;
      return waitForAutomationCondition(enoughRows, timeoutMs, `${count} benchmark rows`).then(
        automationSnapshot,
      );
    },

    // --- Reset and stop ---------------------------------------------------
    clearResults() {
      cancelMicSeries();
      clearBenchmarkTimeout();
      state.benchmarkResults = [];
      state.activeBenchmark = null;
      clearPersistedBenchmarkResults();
      renderBenchmarkResults();
      return automationSnapshot();
    },
    stop() {
      stopAll();
      return automationSnapshot();
    },
    stopMic() {
      // Inside this method `stopMic` still refers to the page-level function.
      stopMic();
      return automationSnapshot();
    },
  };
}
// ---------------------------------------------------------------------------
// UI wiring and page start-up.
// Handlers that are passed to addEventListener directly (rather than wrapped
// in an arrow function) are invoked with the click Event as their argument.
// ---------------------------------------------------------------------------

// Load button: waits for page initialisation, then unloads the models if they
// are loaded and loads them otherwise. Failures are reported in the event log.
elements.loadButton.addEventListener("click", () => {
const action = (async () => {
await pageInitPromise;
return state.modelsLoaded ? unloadModels() : loadModels();
})();
action.catch((error) => logEvent(`Model control failed: ${error.message}`));
});
// Mic button: toggles the microphone; start failures go to the event log.
elements.micButton.addEventListener("click", () => {
if (state.micActive) stopMic();
else startMic().catch((error) => logEvent(`Microphone failed: ${error.message}`));
});
elements.stopButton.addEventListener("click", stopAll);
// Benchmark buttons.
elements.suiteButton.addEventListener("click", runBenchmarkSuite);
elements.benchmarkButton.addEventListener("click", runBenchmark);
elements.chatBenchmarkButton.addEventListener("click", runChatBenchmark);
elements.ttsBenchmarkButton.addEventListener("click", runTtsBenchmark);
elements.loopbackButton.addEventListener("click", runLoopbackBenchmark);
elements.bargeInButton.addEventListener("click", runBargeInBenchmark);
elements.gpuBenchmarkButton.addEventListener("click", () => {
runHardwareWebGpuEvidenceBenchmark().catch((error) => logEvent(`WebGPU evidence failed: ${error.message}`));
});
// Evidence capture started from the UI requests an automatic download.
elements.evidenceCaptureButton.addEventListener("click", () => {
runEvidenceCaptureSequence({ autoDownload: true }).catch((error) =>
logEvent(`Evidence capture failed: ${error.message}`),
);
});
elements.micBenchmarkButton.addEventListener("click", runMicBenchmark);
elements.micSeriesButton.addEventListener("click", runMicSeriesBenchmark);
// Copy results: writes the export payload to the clipboard. When the
// clipboard API is missing or the write fails, the JSON is written to the
// event log instead.
elements.copyResultsButton.addEventListener("click", () => {
const payload = JSON.stringify(benchmarkExportPayload(), null, 2);
const write = navigator.clipboard?.writeText(payload);
if (write) {
write.then(() => logEvent("Benchmark JSON copied.")).catch(() => logEvent(payload));
} else {
logEvent(payload);
}
});
// Optional chaining: no listener is attached when the download button lookup
// returned null.
elements.downloadResultsButton?.addEventListener("click", downloadBenchmarkJson);
// Clear results: cancels any mic series and benchmark timeout, drops the
// in-memory and persisted rows, then re-renders the results table.
elements.clearResultsButton.addEventListener("click", () => {
cancelMicSeries();
clearBenchmarkTimeout();
state.benchmarkResults = [];
state.activeBenchmark = null;
clearPersistedBenchmarkResults();
renderBenchmarkResults();
});
elements.clearLogButton.addEventListener("click", () => {
elements.eventLog.replaceChildren();
});
// Device change: refresh the runtime status, and reset the pipeline tiles
// only while no models are loaded or loading.
elements.deviceSelect.addEventListener("change", () => {
updateRuntimeStatus();
if (!state.modelsLoaded && !state.modelsLoading) {
resetPipelineTiles().catch((error) => logEvent(`Runtime status failed: ${error.message}`));
}
});
elements.ttsSteps.addEventListener("input", updateTtsStepsLabel);
// Voice change: preload the newly selected voice once models are loaded.
elements.voiceSelect.addEventListener("change", () => {
if (!state.modelsLoaded) return;
preloadTtsVoice().catch((error) => logEvent(`Voice preload failed: ${error.message}`));
});
// Partial-transcript toggle and VAD silence slider both reconfigure the ASR
// worker.
elements.partialToggle.addEventListener("change", () => {
configureAsrWorker();
});
elements.vadSilence.addEventListener("input", () => {
updateVadSilenceLabel();
configureAsrWorker();
});
// Start-up: sync the labels and validation status with the initial control
// values, install the automation API, then kick off asynchronous page
// initialisation. The load handler and the automation API's loadStack await
// pageInitPromise before touching the models. initPage is a function
// declaration further down, so it is already callable here.
updateTtsStepsLabel();
updateVadSilenceLabel();
updateMicValidationStatus();
installAutomationApi();
pageInitPromise = initPage();
/**
 * One-time page initialisation: refreshes host metadata (a failure there is
 * deliberately ignored), restores benchmark results, and then initialises
 * runtime support.
 *
 * @returns {Promise<void>}
 */
async function initPage() {
  // Best effort: a failed metadata refresh must not block the other steps.
  const ignoreFailure = () => {};
  await refreshHostMetadata().catch(ignoreFailure);

  restoreBenchmarkResults();
  await initRuntimeSupport();
}
/**
 * Probes runtime capabilities and reports any limitation to the user.
 *
 * Logs a hint when the page is not in a secure context. After the WebGPU
 * probe, if WebGPU is unavailable or only a software adapter was found, the
 * LLM and TTS tiles are set to a WASM warning label (only while no models are
 * loaded or loading) and an explanatory message is logged.
 *
 * @returns {Promise<void>}
 */
async function initRuntimeSupport() {
  if (!window.isSecureContext) {
    logEvent("Use http://localhost or HTTPS so the browser will allow microphone access.");
  }

  await supportsWebGPU();

  // Hardware WebGPU is available: nothing to warn about.
  if (state.webgpuAvailable && !state.webgpuSoftwareAdapter) return;

  const modelsIdle = !state.modelsLoaded && !state.modelsLoading;
  if (modelsIdle) {
    const label = state.webgpuSoftwareAdapter ? "WASM auto" : "WASM only";
    for (const tile of ["llm", "tts"]) {
      setTile(tile, label, "warn");
    }
  }

  if (!state.webgpuSoftwareAdapter) {
    logEvent("WebGPU is unavailable; the demo can fall back to WASM but latency will be higher.");
    return;
  }
  const adapterLabel = formatGpuAdapter(state.webgpuAdapterInfo);
  logEvent(`Software WebGPU adapter detected (${adapterLabel}); Auto uses WASM fallback.`);
}