"""Vector index persistence and retrieval."""

from __future__ import annotations

import json
from dataclasses import asdict, dataclass
from pathlib import Path

import numpy as np

from .embeddings import EmbeddingBackend
from .schemas import Citation, RetrievedChunk
from .text import TextChunk


# Names of the files that together make up one index directory.
CHUNKS_FILE = "chunks.jsonl"  # chunk records, one JSON object per line
VECTORS_FILE = "vectors.npy"  # embedding matrix written with np.save
MANIFEST_FILE = "manifest.json"  # manifest dict serialized as JSON


@dataclass(frozen=True)
class SearchResult:
    """A single search hit: the matched chunk together with its score."""

    # The chunk that matched the query.
    chunk: TextChunk
    # The score assigned to the chunk for the query.
    score: float

    def to_retrieved_chunk(self) -> RetrievedChunk:
        """Build a ``RetrievedChunk`` from the chunk's fields and this hit's score."""
        source = self.chunk
        retrieved = RetrievedChunk(
            chunk_id=source.chunk_id,
            source_id=source.source_id,
            title=source.title,
            text=source.text,
            # Coerce to a builtin float in case the score is a NumPy scalar.
            score=float(self.score),
            url=source.url,
            license=source.license,
            tags=source.tags,
        )
        return retrieved

    def to_citation(self) -> Citation:
        """Build a ``Citation`` from the chunk's source metadata."""
        source = self.chunk
        citation = Citation(
            source_id=source.source_id,
            title=source.title,
            url=source.url,
            license=source.license,
            attribution=source.attribution,
        )
        return citation


def save_index(
    index_dir: str | Path,
    *,
    chunks: list[TextChunk],
    vectors: np.ndarray,
    manifest: dict,
) -> None:
    """Persist chunks, vectors, and manifest to disk.

    The vectors are converted to a float32 matrix and validated before
    anything is written, so an inconsistent call cannot leave a
    half-written or unloadable index behind. The checks are the same shape
    checks ``VectorIndex.load`` applies when reading the index back.

    Args:
        index_dir: Directory to write into; created (with parents) if missing.
        chunks: Chunk records, written one JSON object per line.
        vectors: Embedding matrix with one row per entry in ``chunks``.
        manifest: JSON-serializable metadata stored alongside the index.

    Raises:
        ValueError: If ``vectors`` is not a 2D matrix, or its row count does
            not equal ``len(chunks)``.
    """

    # Convert and validate first: nothing touches the disk on bad input.
    matrix = np.asarray(vectors, dtype=np.float32)
    if matrix.ndim != 2:
        raise ValueError("index vectors must be a 2D matrix")
    if matrix.shape[0] != len(chunks):
        raise ValueError("index vector count does not match chunk count")

    target = Path(index_dir)
    target.mkdir(parents=True, exist_ok=True)

    with (target / CHUNKS_FILE).open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(asdict(chunk), ensure_ascii=False) + "\n" for chunk in chunks
        )

    np.save(target / VECTORS_FILE, matrix)
    # The manifest is written last, after the chunk and vector files.
    (target / MANIFEST_FILE).write_text(json.dumps(manifest, indent=2), encoding="utf-8")


class VectorIndex:
    """A small local vector index backed by NumPy arrays.

    The index lives in ``index_dir`` as three files (chunks, vectors,
    manifest). It is read into memory by ``load``, which ``search`` calls
    automatically while the index is not yet loaded.
    """

    def __init__(self, index_dir: str | Path, embedder: EmbeddingBackend) -> None:
        """Create an empty, not-yet-loaded index.

        Args:
            index_dir: Directory holding the persisted index files.
            embedder: Backend used to encode queries; its ``dimensions``
                must match the width of the stored vector matrix.
        """
        self.index_dir = Path(index_dir)
        self.embedder = embedder
        self.manifest: dict = {}
        self.chunks: list[TextChunk] = []
        self.vectors = np.zeros((0, embedder.dimensions), dtype=np.float32)
        self.loaded = False

    def load(self) -> None:
        """Read the persisted index from ``index_dir`` into memory.

        If any of the three index files is missing, the index is marked as
        not loaded and the method returns without raising. In-memory state
        (manifest, chunks, vectors) is replaced only after every validation
        check has passed, so a failed load never leaves a mix of old and
        new data behind.

        Raises:
            ValueError: If the stored vectors are not a 2D matrix, their row
                count differs from the number of chunks, or their width
                differs from ``embedder.dimensions``.
        """
        chunks_path = self.index_dir / CHUNKS_FILE
        vectors_path = self.index_dir / VECTORS_FILE
        manifest_path = self.index_dir / MANIFEST_FILE
        if not (chunks_path.exists() and vectors_path.exists() and manifest_path.exists()):
            self.loaded = False
            return

        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        with chunks_path.open("r", encoding="utf-8") as handle:
            # Blank lines are tolerated and skipped.
            chunks = [TextChunk(**json.loads(line)) for line in handle if line.strip()]

        vectors = np.load(vectors_path)
        if vectors.ndim != 2:
            raise ValueError("index vectors must be a 2D matrix")
        if vectors.shape[0] != len(chunks):
            raise ValueError("index vector count does not match chunk count")
        if vectors.shape[1] != self.embedder.dimensions:
            raise ValueError(
                f"index dimensions {vectors.shape[1]} do not match embedder dimensions {self.embedder.dimensions}"
            )

        # Commit everything together now that the on-disk data is known good.
        self.manifest = manifest
        self.chunks = chunks
        self.vectors = np.asarray(vectors, dtype=np.float32)
        self.loaded = True

    def search(self, query: str, k: int = 4, min_score: float = 0.08) -> list[SearchResult]:
        """Return up to ``k`` chunks ranked by descending score.

        A chunk's score is the dot product of its stored vector with the
        encoded query. Chunks scoring below ``min_score`` are dropped, and
        chunks whose score is NaN are ignored rather than ranked.

        Args:
            query: Text to encode with the embedder and match against the index.
            k: Maximum number of results; values <= 0 yield an empty list.
            min_score: Minimum score a chunk needs to be returned.

        Returns:
            Matching results, best first. Empty if the index cannot be
            loaded, holds no chunks, or nothing reaches ``min_score``.
        """
        if not self.loaded:
            self.load()
        if not self.loaded or not self.chunks or k <= 0:
            return []

        query_vector = self.embedder.encode([query])[0]
        scores = self.vectors @ query_vector
        if scores.size == 0:
            return []

        results: list[SearchResult] = []
        # argsort places NaN last, so after reversing NaN entries come first;
        # they are skipped so they cannot crowd valid chunks out of the top k.
        for index in np.argsort(scores)[::-1]:
            score = float(scores[index])
            if np.isnan(score):
                continue
            if score < min_score:
                # Scores are descending from here on; nothing later can qualify.
                break
            results.append(SearchResult(chunk=self.chunks[int(index)], score=score))
            if len(results) >= k:
                break
        return results


def unique_citations(results: list[SearchResult]) -> list[Citation]:
    """Return one citation per distinct source, in first-appearance order.

    Results are de-duplicated on their chunk's ``source_id``; only the first
    result seen for each source is converted to a citation.
    """

    # Dicts keep insertion order, so the values come out in result order.
    first_by_source: dict[str, Citation] = {}
    for hit in results:
        source = hit.chunk.source_id
        if source not in first_by_source:
            first_by_source[source] = hit.to_citation()
    return list(first_by_source.values())