"""Embedding backends for retrieval.""" from __future__ import annotations import hashlib import re from dataclasses import dataclass from typing import Protocol import numpy as np TOKEN_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_'/-]*") class EmbeddingBackend(Protocol): name: str dimensions: int def encode(self, texts: list[str]) -> np.ndarray: """Return normalized float32 embeddings.""" @dataclass class HashingEmbeddingBackend: """Small deterministic embedding fallback for tests and offline bootstrap.""" dimensions: int = 384 name: str = "hashing" def encode(self, texts: list[str]) -> np.ndarray: vectors = np.zeros((len(texts), self.dimensions), dtype=np.float32) for row, text in enumerate(texts): for token in TOKEN_RE.findall(text.lower()): digest = hashlib.blake2b(token.encode("utf-8"), digest_size=8).digest() index = int.from_bytes(digest[:4], "little") % self.dimensions sign = 1.0 if digest[4] % 2 == 0 else -1.0 vectors[row, index] += sign norm = np.linalg.norm(vectors[row]) if norm > 0: vectors[row] /= norm return vectors class SentenceTransformerEmbeddingBackend: """SentenceTransformers backend used for production semantic retrieval.""" def __init__(self, model_name: str, dimensions: int = 384) -> None: self.model_name = model_name self.name = model_name self.dimensions = dimensions self._model = None def _load_model(self): if self._model is None: from sentence_transformers import SentenceTransformer self._model = SentenceTransformer(self.model_name) return self._model def encode(self, texts: list[str]) -> np.ndarray: model = self._load_model() vectors = model.encode( texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False, ) return np.asarray(vectors, dtype=np.float32) def create_embedding_backend( backend: str, model_name: str, dimensions: int = 384, ) -> EmbeddingBackend: """Create an embedding backend, falling back to hashing when requested.""" normalized = backend.strip().lower() if normalized in {"hash", "hashing"}: return HashingEmbeddingBackend(dimensions=dimensions) if normalized in {"sentence-transformers", "sentence_transformers", "st"}: return SentenceTransformerEmbeddingBackend(model_name=model_name, dimensions=dimensions) if normalized == "auto": try: import sentence_transformers # noqa: F401 except Exception: return HashingEmbeddingBackend(dimensions=dimensions) return SentenceTransformerEmbeddingBackend(model_name=model_name, dimensions=dimensions) raise ValueError(f"unknown embedding backend: {backend}")