"""Vector index persistence and retrieval.""" from __future__ import annotations import json from dataclasses import asdict, dataclass from pathlib import Path import numpy as np from .embeddings import EmbeddingBackend from .schemas import Citation, RetrievedChunk from .text import TextChunk CHUNKS_FILE = "chunks.jsonl" VECTORS_FILE = "vectors.npy" MANIFEST_FILE = "manifest.json" @dataclass(frozen=True) class SearchResult: chunk: TextChunk score: float def to_retrieved_chunk(self) -> RetrievedChunk: return RetrievedChunk( chunk_id=self.chunk.chunk_id, source_id=self.chunk.source_id, title=self.chunk.title, text=self.chunk.text, score=float(self.score), url=self.chunk.url, license=self.chunk.license, tags=self.chunk.tags, ) def to_citation(self) -> Citation: return Citation( source_id=self.chunk.source_id, title=self.chunk.title, url=self.chunk.url, license=self.chunk.license, attribution=self.chunk.attribution, ) def save_index( index_dir: str | Path, *, chunks: list[TextChunk], vectors: np.ndarray, manifest: dict, ) -> None: """Persist chunks, vectors, and manifest to disk.""" target = Path(index_dir) target.mkdir(parents=True, exist_ok=True) with (target / CHUNKS_FILE).open("w", encoding="utf-8") as handle: for chunk in chunks: handle.write(json.dumps(asdict(chunk), ensure_ascii=False) + "\n") np.save(target / VECTORS_FILE, np.asarray(vectors, dtype=np.float32)) (target / MANIFEST_FILE).write_text(json.dumps(manifest, indent=2), encoding="utf-8") class VectorIndex: """A small local vector index backed by NumPy arrays.""" def __init__(self, index_dir: str | Path, embedder: EmbeddingBackend) -> None: self.index_dir = Path(index_dir) self.embedder = embedder self.manifest: dict = {} self.chunks: list[TextChunk] = [] self.vectors = np.zeros((0, embedder.dimensions), dtype=np.float32) self.loaded = False def load(self) -> None: chunks_path = self.index_dir / CHUNKS_FILE vectors_path = self.index_dir / VECTORS_FILE manifest_path = self.index_dir / MANIFEST_FILE if not chunks_path.exists() or not vectors_path.exists() or not manifest_path.exists(): self.loaded = False return self.manifest = json.loads(manifest_path.read_text(encoding="utf-8")) chunks: list[TextChunk] = [] with chunks_path.open("r", encoding="utf-8") as handle: for line in handle: if not line.strip(): continue chunks.append(TextChunk(**json.loads(line))) vectors = np.load(vectors_path) if vectors.ndim != 2: raise ValueError("index vectors must be a 2D matrix") if vectors.shape[0] != len(chunks): raise ValueError("index vector count does not match chunk count") if vectors.shape[1] != self.embedder.dimensions: raise ValueError( f"index dimensions {vectors.shape[1]} do not match embedder dimensions {self.embedder.dimensions}" ) self.chunks = chunks self.vectors = np.asarray(vectors, dtype=np.float32) self.loaded = True def search(self, query: str, k: int = 4, min_score: float = 0.08) -> list[SearchResult]: if not self.loaded: self.load() if not self.loaded or not self.chunks or k <= 0: return [] query_vector = self.embedder.encode([query])[0] scores = self.vectors @ query_vector if scores.size == 0: return [] top_indices = np.argsort(scores)[::-1][:k] results: list[SearchResult] = [] for index in top_indices: score = float(scores[index]) if score < min_score: continue results.append(SearchResult(chunk=self.chunks[int(index)], score=score)) return results def unique_citations(results: list[SearchResult]) -> list[Citation]: """Collapse duplicate citations while preserving result order.""" seen: set[str] = set() citations: list[Citation] = [] for result in results: key = result.chunk.source_id if key in seen: continue seen.add(key) citations.append(result.to_citation()) return citations