import { AutoModel, Tensor, env, pipeline } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

// Library-wide settings applied before any model is requested.
// NOTE(review): presumably these make the library fetch models remotely only and keep
// downloads in the browser cache — confirm against the transformers.js `env` docs.
env.allowLocalModels = false;
env.useBrowserCache = true;
// Wrap the global fetch so each request is reported via postMessage under the "asr" scope.
installFetchTelemetry("asr");

// Sample rate (Hz) all incoming audio is converted to before VAD and transcription.
const SAMPLE_RATE = 16000;
// VAD probability above which a chunk counts as speech.
const SPEECH_THRESHOLD = 0.3;
// While already recording, a chunk still counts as speech at or above this lower probability.
const EXIT_THRESHOLD = 0.1;
// Trailing-silence requirement (ms) used until `configure` supplies a different value.
const DEFAULT_SILENCE_DURATION_MS = 480;
// Lower and upper clamps (ms) applied to any configured silence duration.
const MIN_SILENCE_DURATION_MS = 200;
const MAX_SILENCE_DURATION_MS = 800;
// 80 ms expressed in samples: extra context kept around a recorded utterance.
const SPEECH_PAD_SAMPLES = 80 * (SAMPLE_RATE / 1000);
// 250 ms expressed in samples: recordings shorter than this are discarded, not transcribed.
const MIN_SPEECH_DURATION_SAMPLES = 250 * (SAMPLE_RATE / 1000);
// Capacity of the recording buffer, in seconds.
const MAX_BUFFER_DURATION = 30;
// Number of samples in each chunk handed to the VAD.
const NEW_BUFFER_SIZE = 512;
// How many pre-speech chunks to retain so that together they cover SPEECH_PAD_SAMPLES.
const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE);
// Minimum time (ms) between two partial transcriptions of the same utterance.
const PARTIAL_INTERVAL_MS = 1600;

// Models, populated by `load`.
let vadModel = null;
let transcriber = null;
// Execution device passed to the ASR pipeline; replaced by the value given to `load`.
let device = "wasm";
// Resampled samples that have not yet filled a complete VAD chunk.
let inputQueue = new Float32Array(0);
// Promise chains used to run VAD and ASR model calls one after another.
let vadChain = Promise.resolve();
let asrChain = Promise.resolve();
// State tensor fed to the VAD model; replaced by the model's `stateN` output after each run.
let vadState = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
// Sample-rate tensor passed to the VAD model as `sr`.
let srTensor = new Tensor("int64", [SAMPLE_RATE], []);
// True from the first speech chunk of an utterance until the buffer is reset.
let isRecording = false;
// Write position (in samples) inside `recordingBuffer`.
let bufferPointer = 0;
// Non-speech samples accumulated since the most recent speech chunk.
let postSpeechSamples = 0;
// Most recent non-speech chunks seen while idle; prepended to the recording as padding.
let previousBuffers = [];
// Whether partial transcriptions are emitted while speech is ongoing.
let partialEnabled = true;
// Trailing silence (in samples) that ends an utterance.
let minSilenceDurationSamples = silenceDurationSamples(DEFAULT_SILENCE_DURATION_MS);
// True while a partial transcription is in flight.
let partialBusy = false;
// performance.now() timestamp of the last partial request; 0 after a reset.
let lastPartialAt = 0;
// performance.now() timestamp taken when the current utterance started.
let utteranceStartedAt = 0;
// Fixed-size buffer holding the audio of the utterance being recorded.
const recordingBuffer = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE);

/**
 * Replaces `globalThis.fetch` with a wrapper that reports every request via
 * `self.postMessage` as a `{ type: "network", ... }` message.
 *
 * Installation happens at most once per global scope (guarded by
 * `globalThis.__browserSpeakFetchTelemetryInstalled`) and is skipped when no
 * `fetch` implementation is available.
 *
 * @param {string} scope - Label copied into every telemetry message.
 * @returns {void}
 */
function installFetchTelemetry(scope) {
  const originalFetch = globalThis.fetch?.bind(globalThis);
  if (!originalFetch || globalThis.__browserSpeakFetchTelemetryInstalled) return;
  globalThis.__browserSpeakFetchTelemetryInstalled = true;

  // Telemetry is best effort: a failure to report must never change the outcome of the
  // request itself, so reporting problems are logged instead of propagated.
  const report = (details) => {
    try {
      self.postMessage({ type: "network", scope, ...details });
    } catch (reportError) {
      console.warn("fetch telemetry could not be posted", reportError);
    }
  };

  globalThis.fetch = async (input, init) => {
    const startedAt = performance.now();
    const url = fetchUrl(input);
    const method = String(init?.method || input?.method || "GET").toUpperCase();

    let response;
    try {
      response = await originalFetch(input, init);
    } catch (error) {
      report({
        method,
        url,
        status: null,
        ok: false,
        durationMs: performance.now() - startedAt,
        // `error` may be any thrown value, including null or undefined.
        error: error?.message ?? String(error),
      });
      throw error;
    }

    // Reported outside the try block so a reporting problem is never mistaken for a
    // failed request.
    report({
      method,
      url,
      responseUrl: response.url || url,
      status: response.status,
      ok: response.ok,
      durationMs: performance.now() - startedAt,
    });
    return response;
  };
}

/**
 * Extracts a printable URL from any value accepted as the first argument of `fetch`.
 *
 * @param {string|URL|{url?: string}|null|undefined} input - String, URL, or request-like object.
 * @returns {string} The URL text, or an empty string when none can be determined.
 */
function fetchUrl(input) {
  if (input instanceof URL) {
    return input.href;
  }
  const requestUrl = typeof input === "string" ? input : input?.url;
  return requestUrl ?? "";
}

/**
 * Entry point for messages sent to this script. Dispatches on `message.type`:
 * "load", "configure", "audio" and "flush"; any other type is ignored.
 * Anything thrown while handling a message is sent back as `{ type: "error" }`.
 */
self.onmessage = async ({ data: message }) => {
  try {
    switch (message.type) {
      case "load":
        await load(message);
        break;
      case "configure":
        configure(message);
        break;
      case "audio":
        ingestAudio(message.buffer, message.sampleRate);
        break;
      case "flush":
        await flushRecording();
        break;
      default:
        break;
    }
  } catch (error) {
    const description = error.message ?? String(error);
    self.postMessage({ type: "error", message: description });
  }
};

/**
 * Loads the VAD model and the speech-recognition pipeline, runs one warm-up
 * transcription on a second of silence, and then posts `{ type: "ready" }`.
 * Status messages are posted before each stage.
 *
 * @param {object} options
 * @param {string} options.model - Model identifier passed to the ASR pipeline.
 * @param {string} [options.device] - Execution device; the current module default is kept when omitted.
 * @param {boolean} [options.partial] - Forwarded to `configure`.
 * @param {number} [options.silenceMs] - Forwarded to `configure`.
 * @returns {Promise<void>}
 */
async function load({ model, device: requestedDevice, partial, silenceMs }) {
  // Keep the existing default instead of overwriting it with undefined.
  device = requestedDevice ?? device;
  configure({ partial, silenceMs });

  self.postMessage({ type: "status", scope: "vad", message: "Loading", mode: "warn" });
  vadModel = await AutoModel.from_pretrained("onnx-community/silero-vad", {
    config: { model_type: "custom" },
    dtype: "fp32",
    progress_callback: reportProgress("VAD"),
  });

  self.postMessage({ type: "status", message: "Loading", mode: "warn" });
  // Every model/device combination previously resolved to this same pair of dtypes, so the
  // selection is written once. Reintroduce a lookup here if they ever need to differ.
  const dtype = {
    encoder_model: "fp32",
    decoder_model_merged: "q4",
  };

  transcriber = await pipeline("automatic-speech-recognition", model, {
    device,
    dtype,
    progress_callback: reportProgress("STT"),
  });

  self.postMessage({ type: "status", message: "Warming", mode: "warn" });
  // One second of silence, so the first real utterance does not pay the start-up cost.
  await transcribeBuffer(new Float32Array(SAMPLE_RATE), { warmup: true });
  self.postMessage({ type: "ready" });
}

/**
 * Builds a progress callback that forwards download progress as status messages.
 *
 * Only events whose `status` is "progress" are forwarded. The message text is the
 * label, followed by a rounded percentage when the event carries a finite number.
 *
 * @param {string} label - Prefix used in the status text.
 * @returns {(progress: {status: string, progress?: number}) => void}
 */
function reportProgress(label) {
  return ({ status, progress: percent }) => {
    if (status !== "progress") return;
    let text = label;
    if (Number.isFinite(percent)) {
      text = `${label} ${percent.toFixed(0)}%`;
    }
    self.postMessage({ type: "status", message: text, mode: "warn" });
  };
}

/**
 * Accepts a block of captured audio, converts it to the working sample rate, and feeds
 * it to the VAD in fixed-size chunks of NEW_BUFFER_SIZE samples. Samples that do not
 * fill a whole chunk stay queued for the next call.
 *
 * Chunks are handed off without waiting for them to finish; a failure while processing
 * a chunk is posted as `{ type: "error" }`.
 *
 * @param {Float32Array} buffer - Captured samples.
 * @param {number} sourceRate - Sample rate of `buffer` in Hz.
 * @returns {void}
 */
function ingestAudio(buffer, sourceRate) {
  const pending = concat(inputQueue, resampleTo16k(buffer, sourceRate));

  // Walk the queue by offset and trim it once at the end instead of re-slicing the
  // remainder after every chunk.
  let offset = 0;
  while (pending.length - offset >= NEW_BUFFER_SIZE) {
    const chunk = pending.slice(offset, offset + NEW_BUFFER_SIZE);
    offset += NEW_BUFFER_SIZE;
    // Fire and forget, but never leave the promise without a rejection handler:
    // otherwise a VAD failure is invisible to whoever is listening for messages.
    handleVadChunk(chunk).catch((error) => {
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });
  }
  inputQueue = pending.slice(offset);
}

/**
 * Runs the VAD on one chunk and advances the utterance state machine:
 *
 * - idle + non-speech: the chunk is kept in a short pre-roll list and nothing else happens;
 * - speech: the chunk is appended to the recording, `speechstart` is posted on the first
 *   speech chunk, and a partial transcription may be requested;
 * - non-speech while recording: the chunk is appended and, once enough trailing silence
 *   has accumulated, the recording is either discarded (too short) or sent for
 *   transcription after posting `speechend`;
 * - recording buffer full: the recording is sent for transcription and the samples that
 *   did not fit are carried into the next recording.
 *
 * @param {Float32Array} buffer - One chunk of samples at the working sample rate.
 * @returns {Promise<void>}
 */
async function handleVadChunk(buffer) {
  const speech = await vad(buffer);

  // The recording flag is read only after the VAD verdict is in. Several chunks can be
  // waiting on the VAD at once; a flag captured before the await would describe the state
  // from before the earlier chunks were handled and route this chunk to the wrong buffer.
  if (!isRecording && !speech) {
    if (previousBuffers.length >= MAX_NUM_PREV_BUFFERS) previousBuffers.shift();
    previousBuffers.push(buffer);
    return;
  }

  const remaining = recordingBuffer.length - bufferPointer;
  if (buffer.length >= remaining) {
    // Buffer is full: fill it to the end, transcribe it, and carry the rest over.
    recordingBuffer.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;
    dispatchForTranscription(buffer.subarray(remaining));
    return;
  }

  recordingBuffer.set(buffer, bufferPointer);
  bufferPointer += buffer.length;

  if (speech) {
    if (!isRecording) {
      utteranceStartedAt = performance.now();
      self.postMessage({ type: "speechstart" });
    }
    isRecording = true;
    postSpeechSamples = 0;
    maybePartial();
    return;
  }

  // Non-speech while recording: wait until the silence is long enough to end the utterance.
  postSpeechSamples += buffer.length;
  if (postSpeechSamples < minSilenceDurationSamples) return;

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // Too short to be worth transcribing.
    reset();
    return;
  }

  self.postMessage({
    type: "speechend",
    trailingSilenceMs: sampleDurationMs(postSpeechSamples),
  });
  dispatchForTranscription();
}

/**
 * Runs the VAD model on one chunk and decides whether it is speech.
 *
 * Model invocations are serialized through `vadChain`, and the model state is updated
 * inside that serialized step so each invocation sees the state left by the previous one.
 *
 * @param {Float32Array} buffer - One chunk of samples.
 * @returns {Promise<boolean>} True when the probability exceeds SPEECH_THRESHOLD, or when
 *   a recording is in progress and the probability is at least EXIT_THRESHOLD.
 * @throws Whatever the model invocation throws; the failure affects this call only.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);

  const run = vadChain.then(async () => {
    const result = await vadModel({ input, sr: srTensor, state: vadState });
    vadState = result.stateN;
    return result.output.data[0];
  });
  // The chain only provides ordering. Its tail must never stay rejected, otherwise a
  // single failed inference would make every later call fail without running.
  // The error itself still reaches this caller through `run`.
  vadChain = run.catch(() => {});

  const probability = await run;
  return probability > SPEECH_THRESHOLD || (isRecording && probability >= EXIT_THRESHOLD);
}

/**
 * Requests a partial transcription of the recording so far, if allowed.
 *
 * Nothing happens when partials are disabled, when one is already in flight, when the
 * last one was requested less than PARTIAL_INTERVAL_MS ago, or when less than one
 * second of audio has been recorded. A non-blank result is posted as
 * `{ type: "partial" }`; a failure is posted as `{ type: "error" }`.
 *
 * @returns {void}
 */
function maybePartial() {
  if (!partialEnabled || partialBusy) return;
  const now = performance.now();
  if (now - lastPartialAt < PARTIAL_INTERVAL_MS || bufferPointer < SAMPLE_RATE) return;

  partialBusy = true;
  lastPartialAt = now;
  // Snapshot the audio now; the recording buffer keeps changing while this runs.
  const buffer = paddedRecordingBuffer();
  transcribeBuffer(buffer, { partial: true })
    .then((text) => {
      if (text.trim()) self.postMessage({ type: "partial", text });
    })
    .catch((error) => {
      // Without a handler here the failure would surface only as an unhandled rejection.
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    })
    .finally(() => {
      partialBusy = false;
    });
}

/**
 * Sends the current recording for final transcription and resets the recording state.
 *
 * The result is posted as `{ type: "transcript", text, durationMs }`, where `durationMs`
 * is measured from the start of the utterance being dispatched; a failure is posted as
 * `{ type: "error" }`.
 *
 * @param {Float32Array} [overflow] - Samples that did not fit into the full recording
 *   buffer; they become the beginning of the next recording.
 * @returns {void}
 */
function dispatchForTranscription(overflow) {
  const buffer = paddedRecordingBuffer();
  // Capture the start time now: by the time the transcript arrives, a newer utterance
  // may already have overwritten `utteranceStartedAt`.
  const startedAt = utteranceStartedAt;

  transcribeBuffer(buffer, { partial: false })
    .then((text) => {
      self.postMessage({
        type: "transcript",
        text,
        durationMs: performance.now() - startedAt,
      });
    })
    .catch((error) => {
      // Report the failure instead of leaving an unhandled rejection behind.
      self.postMessage({ type: "error", message: error?.message ?? String(error) });
    });

  if (overflow?.length) {
    recordingBuffer.set(overflow, 0);
  }
  reset(overflow?.length ?? 0);
}

/**
 * Ends the current utterance on request instead of waiting for trailing silence.
 *
 * Waits for the VAD work queued so far, then, if a recording of at least
 * MIN_SPEECH_DURATION_SAMPLES is in progress, posts a `speechend` message marked
 * `forced: true` and sends the recording for transcription. Otherwise does nothing.
 *
 * @returns {Promise<void>}
 */
async function flushRecording() {
  vadChain = vadChain.then(() => Promise.resolve());
  await vadChain;

  if (!isRecording) return;
  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) return;

  const trailingSilenceMs = sampleDurationMs(postSpeechSamples);
  self.postMessage({ type: "speechend", trailingSilenceMs, forced: true });
  dispatchForTranscription();
}

/**
 * Builds a standalone copy of the audio to transcribe: the retained pre-roll chunks
 * followed by the recorded samples plus up to SPEECH_PAD_SAMPLES beyond the write
 * position (never past the end of the recording buffer).
 *
 * @returns {Float32Array} A new array; later changes to the recording do not affect it.
 */
function paddedRecordingBuffer() {
  const recordedEnd = Math.min(bufferPointer + SPEECH_PAD_SAMPLES, recordingBuffer.length);

  let totalLength = recordedEnd;
  for (const chunk of previousBuffers) {
    totalLength += chunk.length;
  }

  const result = new Float32Array(totalLength);
  let cursor = 0;
  for (const chunk of previousBuffers) {
    result.set(chunk, cursor);
    cursor += chunk.length;
  }
  result.set(recordingBuffer.subarray(0, recordedEnd), cursor);
  return result;
}

/**
 * Transcribes a buffer of samples. Calls are serialized through `asrChain`, so they run
 * one at a time in the order they were requested.
 *
 * @param {Float32Array} buffer - Samples to transcribe.
 * @param {object} [options]
 * @param {boolean} [options.warmup=false] - When true the result is discarded and an
 *   empty string is returned.
 * @returns {Promise<string>} The recognized text, or an empty string when there is none.
 * @throws Whatever the transcriber throws; the failure affects this call only.
 */
async function transcribeBuffer(buffer, { warmup = false } = {}) {
  const run = asrChain.then(() => transcriber(buffer));
  // The chain only provides ordering. Its tail must never stay rejected, otherwise one
  // failed transcription would make every later call fail without running.
  // The error itself still reaches this caller through `run`.
  asrChain = run.catch(() => {});

  const output = await run;
  if (warmup) return "";
  return output.text ?? "";
}

/**
 * Returns the recorder to its idle state.
 *
 * Samples before `offset` are kept as the start of the next recording; everything from
 * `offset` onward is zeroed. The pre-roll list, the silence counter and the partial
 * timer are cleared as well.
 *
 * @param {number} [offset=0] - Number of leading samples to keep.
 * @returns {void}
 */
function reset(offset = 0) {
  isRecording = false;
  postSpeechSamples = 0;
  lastPartialAt = 0;
  previousBuffers = [];
  bufferPointer = offset;
  recordingBuffer.fill(0, offset);
}

/**
 * Applies runtime settings. Fields that are absent leave the current value untouched.
 *
 * @param {object} [options]
 * @param {boolean} [options.partial] - Enables or disables partial transcriptions;
 *   ignored unless it is a boolean.
 * @param {number|string} [options.silenceMs] - Trailing silence that ends an utterance,
 *   in milliseconds; ignored when null or undefined.
 * @returns {void}
 */
function configure(options = {}) {
  const { partial, silenceMs } = options;

  if (typeof partial === "boolean") {
    partialEnabled = partial;
  }
  if (silenceMs !== null && silenceMs !== undefined) {
    minSilenceDurationSamples = silenceDurationSamples(silenceMs);
  }
}

/**
 * Converts a silence duration in milliseconds to a whole number of samples.
 *
 * The value is coerced with `Number`; anything that is not finite falls back to
 * DEFAULT_SILENCE_DURATION_MS. The result is limited to the range
 * MIN_SILENCE_DURATION_MS..MAX_SILENCE_DURATION_MS before conversion.
 *
 * @param {*} value - Requested duration in milliseconds.
 * @returns {number} Duration in samples at SAMPLE_RATE, rounded to the nearest integer.
 */
function silenceDurationSamples(value) {
  let ms = Number(value);
  if (!Number.isFinite(ms)) {
    ms = DEFAULT_SILENCE_DURATION_MS;
  }

  if (ms < MIN_SILENCE_DURATION_MS) {
    ms = MIN_SILENCE_DURATION_MS;
  } else if (ms > MAX_SILENCE_DURATION_MS) {
    ms = MAX_SILENCE_DURATION_MS;
  }

  const samplesPerMs = SAMPLE_RATE / 1000;
  return Math.round(ms * samplesPerMs);
}

/**
 * Converts a sample count at SAMPLE_RATE into a duration.
 *
 * @param {number} samples - Number of samples.
 * @returns {number} Duration in milliseconds.
 */
function sampleDurationMs(samples) {
  const seconds = samples / SAMPLE_RATE;
  return seconds * 1000;
}

/**
 * Converts samples to SAMPLE_RATE using linear interpolation between neighbouring
 * input samples.
 *
 * Input already at SAMPLE_RATE is returned as-is (same array, not a copy). Each call is
 * converted on its own: the output length is rounded down, and nothing is carried over
 * to the next call.
 *
 * @param {Float32Array} input - Source samples.
 * @param {number} sourceRate - Sample rate of `input` in Hz.
 * @returns {Float32Array} Samples at SAMPLE_RATE.
 */
function resampleTo16k(input, sourceRate) {
  if (sourceRate === SAMPLE_RATE) {
    return input;
  }

  const step = sourceRate / SAMPLE_RATE;
  const outputLength = Math.floor(input.length / step);
  const lastIndex = input.length - 1;
  const resampled = new Float32Array(outputLength);

  let index = 0;
  while (index < outputLength) {
    const sourcePosition = index * step;
    const lower = Math.floor(sourcePosition);
    const upper = Math.min(lower + 1, lastIndex);
    const fraction = sourcePosition - lower;
    resampled[index] = input[lower] * (1 - fraction) + input[upper] * fraction;
    index += 1;
  }
  return resampled;
}

/**
 * Joins two sample arrays.
 *
 * @param {Float32Array} left - Leading samples.
 * @param {Float32Array} right - Trailing samples.
 * @returns {Float32Array} `right` itself when `left` is empty, otherwise a new array
 *   holding `left` followed by `right`.
 */
function concat(left, right) {
  const leftLength = left.length;
  if (leftLength === 0) {
    return right;
  }

  const joined = new Float32Array(leftLength + right.length);
  joined.set(left);
  joined.set(right, leftLength);
  return joined;
}