"""Text extraction and chunking helpers."""

from __future__ import annotations

import hashlib
import re
from dataclasses import dataclass

from bs4 import BeautifulSoup

# Matches any run of whitespace; used to collapse it to a single space.
WHITESPACE_RE = re.compile(r"\s+")


@dataclass(frozen=True)
class TextChunk:
    """A window of words cut from one source document, plus its metadata."""

    chunk_id: str  # chunk_text builds this as "<source_id>:<ordinal>:<digest>"
    source_id: str
    title: str
    text: str  # the chunk's words joined by single spaces (when built by chunk_text)
    ordinal: int  # 0-based position of the chunk within its source
    url: str | None
    license: str
    attribution: str | None
    # NOTE: because this field is a list, hash() on an instance raises
    # TypeError even though the dataclass is frozen.
    tags: list[str]


def html_to_text(html: str) -> str:
    """Extract readable text from HTML.

    ``script``, ``style``, ``noscript`` and ``svg`` elements are removed from
    the parsed tree, then the remaining text is passed through
    :func:`normalize_text`.
    """
    soup = BeautifulSoup(html, "html.parser")
    for element in soup(["script", "style", "noscript", "svg"]):
        element.decompose()
    # Join text nodes with newlines so they end up on separate lines.
    return normalize_text(soup.get_text("\n"))


def normalize_text(text: str) -> str:
    """Collapse whitespace and drop empty lines.

    Each line has every run of whitespace replaced by a single space and is
    stripped; lines that end up empty are discarded and the remaining lines
    are joined with a single newline.
    """
    lines = [WHITESPACE_RE.sub(" ", line).strip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line)


def chunk_text(
    *,
    source_id: str,
    title: str,
    text: str,
    url: str | None,
    license: str,
    attribution: str | None,
    tags: list[str],
    chunk_words: int = 260,
    overlap_words: int = 50,
) -> list[TextChunk]:
    """Split normalized text into overlapping word chunks.

    The text is normalized with :func:`normalize_text` and split on
    whitespace. Chunks of up to ``chunk_words`` words are emitted, each one
    starting ``chunk_words - overlap_words`` words after the previous one, so
    consecutive chunks share ``overlap_words`` words. The final chunk may be
    shorter than ``chunk_words``.

    Args:
        source_id: Identifier of the source; part of every chunk id.
        title: Copied onto every chunk.
        text: Raw text to chunk.
        url: Copied onto every chunk.
        license: Copied onto every chunk.
        attribution: Copied onto every chunk.
        tags: Copied (as a fresh list) onto every chunk.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words shared by consecutive chunks.

    Returns:
        The chunks in document order, or an empty list when the text contains
        no words (in which case the size arguments are not validated).

    Raises:
        ValueError: If ``chunk_words`` is not larger than ``overlap_words``,
            or if ``overlap_words`` is negative.
    """
    words = normalize_text(text).split()
    if not words:
        return []
    if chunk_words <= overlap_words:
        raise ValueError("chunk_words must be larger than overlap_words")
    if overlap_words < 0:
        # A negative overlap would make the stride larger than the window and
        # silently drop the words that fall between consecutive chunks.
        raise ValueError("overlap_words must not be negative")

    stride = chunk_words - overlap_words
    total = len(words)
    chunks: list[TextChunk] = []
    for ordinal, start in enumerate(range(0, total, stride)):
        end = start + chunk_words
        # Tokens from str.split() carry no whitespace, so no strip is needed.
        chunk_body = " ".join(words[start:end])
        digest = hashlib.sha256(
            f"{source_id}:{ordinal}:{chunk_body}".encode("utf-8")
        ).hexdigest()[:16]
        chunks.append(
            TextChunk(
                chunk_id=f"{source_id}:{ordinal}:{digest}",
                source_id=source_id,
                title=title,
                text=chunk_body,
                ordinal=ordinal,
                url=url,
                license=license,
                attribution=attribution,
                tags=list(tags),  # copy so chunks never alias the caller's list
            )
        )
        if end >= total:
            # This window already reached the last word; a further chunk
            # would consist solely of overlap.
            break
    return chunks