"""Text extraction and chunking helpers."""

from __future__ import annotations

import hashlib
import re
from dataclasses import dataclass

from bs4 import BeautifulSoup


# Matches a run of one or more whitespace characters. normalize_text() uses it
# to collapse each such run inside a line into a single space.
WHITESPACE_RE = re.compile(r"\s+")


@dataclass(frozen=True)
class TextChunk:
    """One word-window chunk of a source document, plus its source metadata.

    Instances are produced by ``chunk_text``. The dataclass is frozen, so
    attributes cannot be reassigned after construction; the ``tags`` list
    object itself is still mutable.

    NOTE(review): because ``tags`` is a ``list``, hashing an instance raises
    ``TypeError`` even though the class is frozen -- confirm nothing relies on
    using chunks as dict keys or set members.
    """

    # Identifier built by chunk_text as "{source_id}:{ordinal}:{digest}", where
    # digest is the first 16 hex characters of a SHA-256 over the source id,
    # ordinal and chunk text.
    chunk_id: str
    # Identifier of the source document, as supplied by the caller of chunk_text.
    source_id: str
    # Title supplied by the caller; the same value is copied onto every chunk.
    title: str
    # The chunk body: the window's words joined by single spaces.
    text: str
    # Zero-based position of this chunk within its source document.
    ordinal: int
    # Caller-supplied URL, or None; presumably the source location -- confirm.
    url: str | None
    # Caller-supplied license value, stored unchanged.
    license: str
    # Caller-supplied attribution value, or None, stored unchanged.
    attribution: str | None
    # Caller-supplied tags; chunk_text gives each chunk its own copy of the list.
    tags: list[str]


def html_to_text(html: str) -> str:
    """Return the readable text of an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, every
    ``script``, ``style``, ``noscript`` and ``svg`` element is removed from
    the tree, and the remaining text nodes are joined with newlines before
    being passed through ``normalize_text``.
    """

    document = BeautifulSoup(html, "html.parser")
    non_content_tags = ["script", "style", "noscript", "svg"]
    # Drop elements whose contents are not meant to be read as page text.
    for node in document.find_all(non_content_tags):
        node.decompose()
    raw_text = document.get_text("\n")
    return normalize_text(raw_text)


def normalize_text(text: str) -> str:
    """Collapse whitespace line by line and drop lines that end up empty.

    Within each line, every run of whitespace becomes a single space and
    leading/trailing whitespace is removed. Lines that are empty after that
    are discarded, and the surviving lines are joined with a single newline,
    so blank-line gaps between paragraphs are not kept.
    """

    kept_lines: list[str] = []
    for raw_line in text.splitlines():
        collapsed = WHITESPACE_RE.sub(" ", raw_line).strip()
        if collapsed:
            kept_lines.append(collapsed)
    return "\n".join(kept_lines)


def chunk_text(
    *,
    source_id: str,
    title: str,
    text: str,
    url: str | None,
    license: str,
    attribution: str | None,
    tags: list[str],
    chunk_words: int = 260,
    overlap_words: int = 50,
) -> list[TextChunk]:
    """Split normalized text into overlapping word chunks.

    The text is normalized with ``normalize_text`` and split on whitespace.
    Consecutive windows of ``chunk_words`` words are emitted, each starting
    ``chunk_words - overlap_words`` words after the previous one, so
    neighbouring chunks share ``overlap_words`` words. The final chunk may be
    shorter than ``chunk_words``.

    Args:
        source_id: Identifier of the source document; part of every chunk id.
        title: Title copied onto every chunk.
        text: Raw text to chunk.
        url: Optional URL copied onto every chunk.
        license: License value copied onto every chunk.
        attribution: Optional attribution copied onto every chunk.
        tags: Tags for the chunks; each chunk receives its own copy of the list.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words shared by consecutive chunks.

    Returns:
        The chunks in document order, or an empty list when ``text`` contains
        no words.

    Raises:
        ValueError: If ``overlap_words`` is negative or ``chunk_words`` is not
            larger than ``overlap_words``.
    """

    # Validate the window configuration before looking at the text, so a bad
    # configuration is reported even for empty documents.
    if overlap_words < 0:
        # A negative overlap would make the stride exceed the window and
        # silently drop the words that fall between consecutive chunks.
        raise ValueError("overlap_words must not be negative")
    if chunk_words <= overlap_words:
        raise ValueError("chunk_words must be larger than overlap_words")

    words = normalize_text(text).split()
    if not words:
        return []

    total_words = len(words)
    stride = chunk_words - overlap_words
    chunks: list[TextChunk] = []
    for ordinal, start in enumerate(range(0, total_words, stride)):
        end = start + chunk_words
        chunk_body = " ".join(words[start:end])
        # The digest covers source id, position and content, so the chunk id
        # changes whenever any of the three changes.
        digest = hashlib.sha256(
            f"{source_id}:{ordinal}:{chunk_body}".encode("utf-8")
        ).hexdigest()[:16]
        chunks.append(
            TextChunk(
                chunk_id=f"{source_id}:{ordinal}:{digest}",
                source_id=source_id,
                title=title,
                text=chunk_body,
                ordinal=ordinal,
                url=url,
                license=license,
                attribution=attribution,
                tags=list(tags),
            )
        )
        # Stop once a window reaches the end of the text; otherwise the next
        # window would contain only words already covered by this one.
        if end >= total_words:
            break
    return chunks