"""Curated legal source catalog and scraper-backed index builder."""

from __future__ import annotations

import hashlib
import json
import time
from collections import deque
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote, urljoin, urlparse, urlunparse
from urllib.robotparser import RobotFileParser

import numpy as np
import requests
from bs4 import BeautifulSoup

from .embeddings import create_embedding_backend
from .index_store import save_index
from .text import TextChunk, chunk_text, html_to_text, normalize_text


# Default User-Agent for scraping: ``scrape_catalog_entry`` uses it when the caller
# does not pass one. The same string is sent as the HTTP ``User-Agent`` header and
# is the agent name checked against robots.txt rules.
DEFAULT_USER_AGENT = "GameMasterCopilot/0.1 (+https://huggingface.co/spaces/vivekchakraverty/gamemaster-design-copilot)"
# File extensions (lower-case, with leading dot) of assets the crawler does not
# fetch. ``_skip_url`` compares them against the lower-cased URL path, so matching
# is case-insensitive.
SKIP_EXTENSIONS = {
    ".7z",
    ".avi",
    ".css",
    ".gif",
    ".gz",
    ".ico",
    ".jpeg",
    ".jpg",
    ".js",
    ".json",
    ".mp3",
    ".mp4",
    ".ogg",
    ".pdf",
    ".png",
    ".svg",
    ".tar",
    ".webm",
    ".zip",
}


@dataclass(frozen=True)
class ScrapedDocument:
    """One fetched document ready for chunking.

    Instances are created by the ``_scrape_*`` helpers in this module and
    consumed by ``build_catalog_index``, which passes every field on to
    ``chunk_text``.
    """

    # Identifier of the form "<catalog entry id>.<slug>"; the slug is derived
    # from the page URL or, for Wikipedia pages, from the page title.
    source_id: str
    # Human-readable title (page heading, post title, or the catalog label as a fallback).
    title: str
    # Plain text extracted from the fetched HTML.
    text: str
    # Public URL used when citing this document.
    url: str
    # License string copied from the originating catalog entry.
    license: str
    # Attribution string copied from the originating catalog entry.
    attribution: str
    # Topic tags copied from the originating catalog entry.
    # NOTE: the dataclass is frozen, but the list object itself remains mutable.
    tags: list[str]


@dataclass(frozen=True)
class CatalogEntry:
    """One allowlisted source collection.

    ``kind`` selects the scraper in ``scrape_catalog_entry``; which of the
    optional fields are needed depends on that scraper (see the per-field
    comments below).
    """

    # Stable identifier; ``get_catalog_entry`` looks entries up by this value.
    id: str
    # Short display name; also used as the fallback document title.
    label: str
    # One-sentence summary shown in the catalog overview.
    description: str
    # License string attached to every document scraped from this source.
    license: str
    # Coarse permission class, e.g. "permissive", "sharealike_open" or
    # "noncommercial_open" in the built-in catalog. Not interpreted by the
    # scrapers in this module.
    permission: str
    # Attribution string attached to every document scraped from this source.
    attribution: str
    # Topic tags copied onto every scraped document.
    # NOTE: the dataclass is frozen, but the list object itself remains mutable.
    tags: list[str]
    # Scraper selector: "single_pages", "crawl_prefix", "wordpress_rest" or
    # "mediawiki_pages"; any other value makes ``scrape_catalog_entry`` raise.
    kind: str
    # Page describing the source/licence; also the fallback link for
    # WordPress posts that carry no link of their own.
    source_url: str
    # Pages fetched by "single_pages" and crawl seeds for "crawl_prefix".
    start_urls: tuple[str, ...] = ()
    # "crawl_prefix" only fetches and follows URLs starting with one of these.
    allowed_prefixes: tuple[str, ...] = ()
    # API base URL; required by "wordpress_rest" and "mediawiki_pages".
    api_url: str | None = None
    # Page titles fetched by "mediawiki_pages".
    pages: tuple[str, ...] = ()
    # Free-form remarks about the source's licensing or scope.
    notes: str = ""


# Wikipedia article titles (as displayed, with spaces) used as the ``pages`` of
# the "wikipedia_game_design" catalog entry. ``_scrape_mediawiki_pages``
# replaces spaces with underscores and percent-encodes each title before fetching.
WIKIPEDIA_GAME_DESIGN_PAGES = (
    "Game design",
    "Video game design",
    "Game mechanics",
    "Gameplay",
    "Game balance",
    "Level design",
    "Playtesting",
    "Emergent gameplay",
    "Nonlinear gameplay",
    "Game studies",
    "Ludology",
    "Flow (psychology)",
    "Tutorial (video games)",
    "Boss (video games)",
    "Game artificial intelligence",
    "Serious game",
)


# The allowlist itself: every source the scraper may touch. Entries are looked
# up by ``id`` in ``get_catalog_entry`` and dispatched on ``kind`` in
# ``scrape_catalog_entry``; license and attribution strings are copied onto
# every scraped document.
CATALOG: tuple[CatalogEntry, ...] = (
    CatalogEntry(
        id="game_design_concepts",
        label="Game Design Concepts course",
        description="Ian Schreiber's free game design course posts.",
        license="CC BY 3.0 US",
        permission="permissive",
        attribution="Game Design Concepts by Ian Schreiber",
        tags=["game-design", "course", "mechanics", "iteration", "balance"],
        kind="wordpress_rest",
        source_url="https://gamedesignconcepts.wordpress.com/about/",
        api_url="https://gamedesignconcepts.wordpress.com/wp-json/wp/v2/posts",
        notes="The course about page states that course content may be shared and adapted with attribution.",
    ),
    CatalogEntry(
        id="wikipedia_game_design",
        label="Wikipedia game design topics",
        description="Selected game design, mechanics, level design, and playtesting encyclopedia topics.",
        license="CC BY-SA 4.0 / GFDL",
        permission="sharealike_open",
        attribution="Wikipedia contributors",
        tags=["game-design", "mechanics", "level-design", "encyclopedia"],
        kind="mediawiki_pages",
        source_url="https://en.wikipedia.org/wiki/Wikipedia:Copyrights",
        api_url="https://api.wikimedia.org/core/v1/wikipedia/en/page",
        pages=WIKIPEDIA_GAME_DESIGN_PAGES,
        notes="Text reuse requires attribution and share-alike compliance.",
    ),
    CatalogEntry(
        id="mit_ocw_cms608_2014",
        label="MIT OCW CMS.608 Game Design",
        description="MIT OpenCourseWare Game Design course pages from Spring 2014.",
        license="CC BY-NC-SA",
        permission="noncommercial_open",
        attribution="MIT OpenCourseWare, CMS.608 Game Design, Spring 2014",
        tags=["game-design", "course", "non-digital-games", "assignments"],
        kind="crawl_prefix",
        source_url="https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/",
        start_urls=("https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/",),
        allowed_prefixes=("https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/",),
        notes="MIT OCW materials are open, but this catalog entry is noncommercial/share-alike.",
    ),
    CatalogEntry(
        id="fate_core_srd",
        label="Fate Core SRD",
        description="Fate Core SRD pages for tabletop RPG system design examples.",
        license="CC BY 3.0 Unported",
        permission="permissive",
        attribution="Fate Core System by Evil Hat Productions and Fate SRD contributors",
        tags=["ttrpg", "srd", "narrative-design", "resolution-mechanics"],
        kind="crawl_prefix",
        source_url="https://fate-srd.com/official-licensing-fate",
        start_urls=("https://fate-srd.com/fate-core/", "https://fate-srd.com/official-licensing-fate"),
        allowed_prefixes=("https://fate-srd.com/fate-core/", "https://fate-srd.com/official-licensing-fate"),
        notes="The official licensing page identifies Fate SRD as the endorsed SRD source.",
    ),
    CatalogEntry(
        id="blades_srd",
        label="Blades in the Dark SRD",
        description="Forged in the Dark SRD pages for position/effect, clocks, crews, and action resolution.",
        license="CC BY 3.0 Unported",
        permission="permissive",
        attribution="Blades in the Dark by One Seven Design, developed and authored by John Harper",
        tags=["ttrpg", "srd", "fiction-first", "clocks", "gm-tools"],
        kind="crawl_prefix",
        source_url="https://bladesinthedark.com/licensing",
        start_urls=("https://bladesinthedark.com/basics/", "https://bladesinthedark.com/licensing"),
        allowed_prefixes=("https://bladesinthedark.com/",),
        notes="Catalog notes warn users not to use non-SRD setting, NPC, artwork, or map material.",
    ),
    CatalogEntry(
        id="dnd_srd",
        label="D&D Beyond SRD",
        description="Official D&D SRD page for rules/mechanics examples under Creative Commons.",
        license="CC BY 4.0",
        permission="permissive",
        attribution="System Reference Document by Wizards of the Coast LLC",
        tags=["ttrpg", "srd", "rules", "encounters", "mechanics"],
        kind="single_pages",
        source_url="https://www.dndbeyond.com/srd",
        start_urls=("https://www.dndbeyond.com/srd",),
        notes="Only SRD content is included; trademarks and non-SRD material remain outside scope.",
    ),
)


def get_catalog() -> list[CatalogEntry]:
    """Return every curated catalog entry as a new list.

    A fresh list is built on each call, so callers may sort or filter the
    result without affecting the module-level ``CATALOG`` tuple. The entry
    objects themselves are shared, not copied.
    """

    return [*CATALOG]


def get_catalog_entry(entry_id: str) -> CatalogEntry:
    """Look up a catalog entry by its ``id``.

    Returns the first entry in ``CATALOG`` whose ``id`` equals *entry_id*.

    Raises:
        KeyError: if no entry has that id.
    """
    found = next((candidate for candidate in CATALOG if candidate.id == entry_id), None)
    if found is None:
        raise KeyError(f"unknown catalog source: {entry_id}")
    return found


def catalog_summary_markdown() -> str:
    """Render the catalog as a Markdown overview.

    The result is a level-3 heading, a one-paragraph policy statement and one
    bullet per ``CATALOG`` entry giving its label, id, license and description.
    """
    policy = (
        "Only these allowlisted sources can be scraped by the UI. Robots.txt is checked for web pages, "
        "binary assets are skipped, and each chunk keeps license and attribution metadata."
    )
    bullets = [
        f"- **{source.label}** (`{source.id}`): {source.license}. {source.description}"
        for source in CATALOG
    ]
    return "\n".join(["### Curated Legal Source Catalog", "", policy, "", *bullets])


def _safe_slug(value: str) -> str:
    """Turn an arbitrary string (URL, page title) into an identifier-safe slug.

    Alphanumeric characters are lower-cased, every other character becomes a
    hyphen, and runs of hyphens are collapsed. The result is at most 80
    characters long.

    Slugs are used as the unique part of ``source_id`` values, so distinct
    inputs must not map to the same slug. Plain truncation would make long
    URLs that share their first 80 characters collide; when the slug is too
    long it is therefore shortened and suffixed with a 16-hex-digit SHA-256
    prefix of the *original* value. Inputs whose slug already fits are
    returned unchanged.

    Args:
        value: Text to slugify.

    Returns:
        The slug, or the 16-hex-digit hash alone when *value* contains no
        alphanumeric characters.
    """
    max_length = 80
    hash_length = 16
    digest = hashlib.sha256(value.encode("utf-8")).hexdigest()[:hash_length]

    hyphenated = "".join(char.lower() if char.isalnum() else "-" for char in value)
    slug = "-".join(part for part in hyphenated.split("-") if part)
    if not slug:
        return digest
    if len(slug) <= max_length:
        return slug

    # Reserve room for "-<digest>" and avoid a doubled hyphen at the cut point.
    head = slug[: max_length - hash_length - 1].rstrip("-")
    return f"{head}-{digest}"


def _canonical_url(url: str) -> str:
    """Return *url* reduced to scheme, host and path with exactly one trailing slash.

    Path parameters, query string and fragment are dropped, so URLs that
    differ only in those parts (or in trailing slashes) compare equal. Used
    as the de-duplication key while crawling.
    """
    parts = urlparse(url)
    normalized_path = f"{parts.path.rstrip('/')}/"
    stripped = parts._replace(path=normalized_path, params="", query="", fragment="")
    return urlunparse(stripped)


def _skip_url(url: str) -> bool:
    """Tell whether *url* points at an asset type the scraper ignores.

    Only the path component is inspected (query and fragment are not), and
    the comparison against ``SKIP_EXTENSIONS`` is case-insensitive.
    """
    lowered_path = urlparse(url).path.lower()
    # str.endswith accepts a tuple of suffixes and is True if any of them matches.
    return lowered_path.endswith(tuple(SKIP_EXTENSIONS))


# Parsed robots.txt rules keyed by (robots.txt URL, user agent); the value is
# (time.monotonic() at fetch, parser). Without this cache every page fetch
# would download the site's robots.txt again, doubling the request count.
_ROBOTS_CACHE: dict[tuple[str, str], tuple[float, RobotFileParser]] = {}
# How long cached rules are trusted before robots.txt is fetched again.
_ROBOTS_CACHE_TTL_SECONDS = 3600.0


def _robots_allowed(url: str, user_agent: str) -> bool:
    """Check the site's robots.txt to see whether *user_agent* may fetch *url*.

    The robots.txt of the URL's origin is downloaded with the same User-Agent
    and interpreted with ``urllib.robotparser``:

    * HTTP 404 (no robots.txt) allows the fetch.
    * Any other status >= 400, or a failure to download, denies the fetch;
      the scraper errs on the side of not fetching.
    * Otherwise the parsed rules decide.

    Successfully obtained rules (including the 404 "no rules" case) are cached
    per origin and user agent for ``_ROBOTS_CACHE_TTL_SECONDS``. Failures are
    not cached, so a transient error does not block a site for the lifetime
    of the process.

    Args:
        url: Absolute URL about to be fetched.
        user_agent: Agent string sent to the server and matched against the rules.

    Returns:
        True if fetching is permitted, False otherwise.
    """
    parsed = urlparse(url)
    robots_url = urljoin(f"{parsed.scheme}://{parsed.netloc}", "/robots.txt")
    cache_key = (robots_url, user_agent)

    cached = _ROBOTS_CACHE.get(cache_key)
    if cached is not None:
        fetched_at, rules = cached
        if time.monotonic() - fetched_at < _ROBOTS_CACHE_TTL_SECONDS:
            return rules.can_fetch(user_agent, url)

    try:
        response = requests.get(robots_url, headers={"User-Agent": user_agent}, timeout=10)
    except Exception:
        # Deliberately broad: any failure to obtain the rules means "do not fetch".
        return False

    rules = RobotFileParser()
    rules.set_url(robots_url)
    if response.status_code == 404:
        # No robots.txt: an empty rule set permits everything.
        rules.parse([])
    elif response.status_code >= 400:
        return False
    else:
        rules.parse(response.text.splitlines())

    _ROBOTS_CACHE[cache_key] = (time.monotonic(), rules)
    return rules.can_fetch(user_agent, url)


def _fetch(url: str, user_agent: str, timeout_seconds: int) -> requests.Response:
    """GET *url* after confirming robots.txt permits it.

    Args:
        url: Absolute URL to download.
        user_agent: Value sent as the ``User-Agent`` header and used for the robots.txt check.
        timeout_seconds: Timeout passed to ``requests.get``.

    Returns:
        The successful response.

    Raises:
        PermissionError: if ``_robots_allowed`` rejects the URL.
        requests.HTTPError: if the server answers with an error status.
    """
    if _robots_allowed(url, user_agent):
        reply = requests.get(url, headers={"User-Agent": user_agent}, timeout=timeout_seconds)
        reply.raise_for_status()
        return reply
    raise PermissionError(f"robots.txt disallows fetching {url}")


def _document_from_html(entry: CatalogEntry, url: str, html: str, fallback_title: str | None = None) -> ScrapedDocument:
    """Build a ``ScrapedDocument`` from a fetched HTML page.

    The title is the first non-empty candidate among: the page's first
    ``<h1>``, its ``<title>``, *fallback_title*, and finally ``entry.label``.
    An empty heading (for example an ``<h1>`` that only wraps an image)
    therefore no longer hides a usable ``<title>`` or fallback.

    Args:
        entry: Catalog entry the page belongs to; supplies id, license,
            attribution and tags.
        url: URL the page was fetched from; stored on the document and
            slugified into its ``source_id``.
        html: Raw HTML of the page.
        fallback_title: Title to use when the page itself provides none.

    Returns:
        The document, with ``text`` produced by ``html_to_text``.
    """
    soup = BeautifulSoup(html, "html.parser")

    title = ""
    for tag_name in ("h1", "title"):
        node = soup.find(tag_name)
        if node is None:
            continue
        title = normalize_text(node.get_text(" "))
        if title:
            break

    return ScrapedDocument(
        source_id=f"{entry.id}.{_safe_slug(url)}",
        title=title or fallback_title or entry.label,
        text=html_to_text(html),
        url=url,
        license=entry.license,
        attribution=entry.attribution,
        # Copy so the document does not share the entry's mutable list.
        tags=list(entry.tags),
    )


def _extract_links(html: str, base_url: str, allowed_prefixes: tuple[str, ...]) -> list[str]:
    """Collect crawlable links from a page.

    Every ``<a href>`` is resolved against *base_url*, stripped of path
    parameters, query and fragment, and kept only if it is http(s), is not a
    skipped asset type (``_skip_url``) and starts with one of
    *allowed_prefixes*.

    A malformed ``href`` (for example unbalanced square brackets in the host
    part, which makes ``urllib.parse`` raise ``ValueError``) is skipped, so
    one bad link does not discard every other link on the page. Each URL is
    returned at most once, in order of first appearance.

    Args:
        html: Raw HTML of the page.
        base_url: URL of the page, used to resolve relative links.
        allowed_prefixes: URL prefixes the crawl is restricted to.

    Returns:
        The accepted absolute URLs, de-duplicated, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    prefixes = tuple(allowed_prefixes)
    links: list[str] = []
    emitted: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        try:
            parsed = urlparse(urljoin(base_url, anchor["href"]))
        except ValueError:
            continue
        if parsed.scheme not in {"http", "https"}:
            continue
        candidate = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
        if candidate in emitted or _skip_url(candidate):
            continue
        if candidate.startswith(prefixes):
            emitted.add(candidate)
            links.append(candidate)
    return links


def _scrape_single_pages(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]:
    """Fetch the entry's ``start_urls`` as stand-alone pages, without following links.

    At most *max_docs* of the start URLs are attempted. A page that cannot be
    fetched or parsed is reported as a warning instead of aborting the run.

    Returns:
        ``(documents, warnings)``.
    """
    documents: list[ScrapedDocument] = []
    problems: list[str] = []
    for page_url in entry.start_urls[:max_docs]:
        try:
            page = _fetch(page_url, user_agent, timeout_seconds)
            document = _document_from_html(entry, page_url, page.text)
        except Exception as exc:
            problems.append(f"skipped {page_url}: {exc}")
        else:
            documents.append(document)
    return documents, problems


def _scrape_crawl_prefix(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]:
    """Breadth-first crawl from ``entry.start_urls``, confined to ``entry.allowed_prefixes``.

    Pages are visited in discovery order until *max_docs* HTML documents have
    been collected or no URLs remain. URLs are de-duplicated by their
    canonical form (``_canonical_url``), so variants differing only in
    trailing slash, query or fragment are visited once. Non-HTML responses
    and failed fetches are recorded as warnings and do not stop the crawl.

    Args:
        entry: Catalog entry supplying seeds, prefixes and document metadata.
        max_docs: Maximum number of documents to return.
        user_agent: User-Agent used for requests and robots.txt checks.
        timeout_seconds: Per-request timeout.

    Returns:
        ``(documents, warnings)``.
    """
    crawl_delay_seconds = 0.1
    prefixes = tuple(entry.allowed_prefixes)

    docs: list[ScrapedDocument] = []
    warnings: list[str] = []
    # deque gives O(1) pops from the front; a list would shift every element.
    queue = deque(entry.start_urls)
    # Canonical URLs already taken off the queue.
    seen: set[str] = set()
    # Canonical URLs of discovered links already queued. This replaces a linear
    # scan of the queue and stops slash/no-slash variants being queued twice.
    queued: set[str] = set()

    while queue and len(docs) < max_docs:
        url = queue.popleft()
        canonical = _canonical_url(url)
        if canonical in seen or _skip_url(url):
            continue
        seen.add(canonical)
        if not url.startswith(prefixes):
            continue

        try:
            response = _fetch(url, user_agent, timeout_seconds)
            content_type = response.headers.get("content-type", "").lower()
            if "text/html" not in content_type:
                warnings.append(f"skipped non-html URL {url}: {content_type or 'unknown content type'}")
                continue
            docs.append(_document_from_html(entry, url, response.text))
            for link in _extract_links(response.text, url, entry.allowed_prefixes):
                link_key = _canonical_url(link)
                if link_key not in seen and link_key not in queued:
                    queued.add(link_key)
                    queue.append(link)
        except Exception as exc:
            warnings.append(f"skipped {url}: {exc}")
        finally:
            # Pause after every attempt, including non-HTML responses and
            # failures, so the target host is never hit in a tight loop.
            time.sleep(crawl_delay_seconds)
    return docs, warnings


def _wp_rendered(field: object) -> str:
    """Return the ``rendered`` HTML of a WordPress REST field, or ``""`` if it is missing or malformed."""
    if isinstance(field, dict):
        rendered = field.get("rendered")
        if isinstance(rendered, str):
            return rendered
    return ""


def _scrape_wordpress_rest(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]:
    """Collect posts from a WordPress REST ``posts`` endpoint, 20 per page.

    Pages are requested in order until *max_docs* documents are collected, a
    page comes back empty, or the server answers HTTP 400, which is treated
    as the normal end of pagination and produces no warning. Any other fetch
    failure, a body that is not valid JSON, or a JSON payload that is not a
    list ends the scrape with a warning instead of raising, in line with the
    other scrapers in this module. Posts without text are skipped.

    Args:
        entry: Catalog entry; ``api_url`` must be set.
        max_docs: Maximum number of documents to return.
        user_agent: User-Agent used for requests and robots.txt checks.
        timeout_seconds: Per-request timeout.

    Returns:
        ``(documents, warnings)``.
    """
    if not entry.api_url:
        return [], [f"{entry.id} is missing api_url"]

    docs: list[ScrapedDocument] = []
    warnings: list[str] = []
    page = 1
    while len(docs) < max_docs:
        url = f"{entry.api_url}?per_page=20&page={page}&_fields=link,title,content"
        try:
            response = _fetch(url, user_agent, timeout_seconds)
        except requests.HTTPError as exc:
            # HTTP 400 marks the end of pagination; anything else is a real problem.
            if exc.response is None or exc.response.status_code != 400:
                warnings.append(f"stopped WordPress fetch at page {page}: {exc}")
            break
        except Exception as exc:
            warnings.append(f"stopped WordPress fetch at page {page}: {exc}")
            break

        try:
            posts = response.json()
        except ValueError as exc:
            # JSON decode errors are ValueError subclasses (e.g. an HTML error page).
            warnings.append(f"stopped WordPress fetch at page {page}: invalid JSON response ({exc})")
            break
        if not posts:
            break
        if not isinstance(posts, list):
            warnings.append(f"stopped WordPress fetch at page {page}: unexpected response payload")
            break

        for post in posts:
            if not isinstance(post, dict):
                continue
            text = html_to_text(_wp_rendered(post.get("content")))
            if not text:
                continue
            link = post.get("link") or entry.source_url
            docs.append(
                ScrapedDocument(
                    source_id=f"{entry.id}.{_safe_slug(link)}",
                    title=html_to_text(_wp_rendered(post.get("title"))) or entry.label,
                    text=text,
                    url=link,
                    license=entry.license,
                    attribution=entry.attribution,
                    tags=list(entry.tags),
                )
            )
            if len(docs) >= max_docs:
                break

        if len(docs) >= max_docs:
            break
        page += 1
        # Same pause between requests as the other scrapers in this module.
        time.sleep(0.1)
    return docs, warnings


def _scrape_mediawiki_pages(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]:
    """Fetch the entry's listed wiki pages as HTML through its page API.

    For each of the first *max_docs* titles in ``entry.pages`` the title is
    converted to its URL form (spaces to underscores, then percent-encoded)
    and ``<api_url>/<title>/html`` is requested. Documents cite the
    corresponding ``https://en.wikipedia.org/wiki/<title>`` URL. Empty
    extracts and failed fetches become warnings.

    Returns:
        ``(documents, warnings)``; if ``api_url`` is unset, no documents and
        a single warning.
    """
    if not entry.api_url:
        return [], [f"{entry.id} is missing api_url"]

    api_base = entry.api_url.rstrip("/")
    collected: list[ScrapedDocument] = []
    notes: list[str] = []
    for page_title in entry.pages[:max_docs]:
        url_title = quote(page_title.replace(" ", "_"), safe="")
        try:
            page_html = _fetch(f"{api_base}/{url_title}/html", user_agent, timeout_seconds).text
            body = html_to_text(page_html)
            if body:
                collected.append(
                    ScrapedDocument(
                        source_id=f"{entry.id}.{_safe_slug(page_title)}",
                        title=page_title,
                        text=body,
                        url=f"https://en.wikipedia.org/wiki/{url_title}",
                        license=entry.license,
                        attribution=entry.attribution,
                        tags=list(entry.tags),
                    )
                )
                # Brief pause between successful page fetches.
                time.sleep(0.1)
            else:
                notes.append(f"empty Wikipedia page extract: {page_title}")
        except Exception as exc:
            notes.append(f"skipped Wikipedia page {page_title}: {exc}")
    return collected, notes


def scrape_catalog_entry(
    entry: CatalogEntry,
    *,
    max_docs: int = 30,
    user_agent: str = DEFAULT_USER_AGENT,
    timeout_seconds: int = 20,
) -> tuple[list[ScrapedDocument], list[str]]:
    """Scrape one curated catalog entry with the scraper matching its ``kind``.

    Args:
        entry: The catalog entry to scrape.
        max_docs: Requested document limit; clamped to the range 1..200.
        user_agent: User-Agent used for requests and robots.txt checks.
        timeout_seconds: Per-request timeout.

    Returns:
        ``(documents, warnings)`` as produced by the selected scraper.

    Raises:
        ValueError: if ``entry.kind`` names no known scraper.
    """

    limit = max(1, min(max_docs, 200))
    scrapers = {
        "single_pages": _scrape_single_pages,
        "crawl_prefix": _scrape_crawl_prefix,
        "wordpress_rest": _scrape_wordpress_rest,
        "mediawiki_pages": _scrape_mediawiki_pages,
    }
    scraper = scrapers.get(entry.kind)
    if scraper is None:
        raise ValueError(f"unsupported catalog scraper kind: {entry.kind}")
    return scraper(entry, limit, user_agent, timeout_seconds)


def build_catalog_index(
    *,
    selected_ids: list[str],
    index_dir: str | Path,
    embedding_backend: str,
    embedding_model: str,
    embedding_dimensions: int,
    max_docs_per_source: int = 30,
    chunk_words: int = 260,
    overlap_words: int = 50,
) -> dict:
    """Scrape selected allowlisted catalog sources and persist a RAG index.

    Each selected source is scraped, its documents are split into chunks, and
    chunks with identical text (across all sources) are kept only once. The
    chunk texts are embedded, and chunks, vectors and a manifest are handed
    to ``save_index``. The manifest is additionally written to
    ``<index_dir>/catalog-manifest.json``.

    Repeated ids in *selected_ids* are collapsed (first occurrence wins), so
    a source is never scraped twice and the manifest counts reflect distinct
    sources.

    Args:
        selected_ids: Ids of the catalog entries to include.
        index_dir: Directory the index and manifest are written to.
        embedding_backend: Backend name passed to ``create_embedding_backend``.
        embedding_model: Model name passed to ``create_embedding_backend``.
        embedding_dimensions: Dimensions passed to ``create_embedding_backend``.
        max_docs_per_source: Document limit per source (further clamped by
            ``scrape_catalog_entry``).
        chunk_words: Chunk size passed to ``chunk_text``.
        overlap_words: Chunk overlap passed to ``chunk_text``.

    Returns:
        The manifest dictionary that was persisted.

    Raises:
        ValueError: if *selected_ids* is empty.
        KeyError: if an id is not in the catalog.
    """

    if not selected_ids:
        raise ValueError("select at least one catalog source")

    # dict.fromkeys de-duplicates while preserving the caller's order.
    unique_ids = list(dict.fromkeys(selected_ids))
    entries = [get_catalog_entry(entry_id) for entry_id in unique_ids]
    # Created before any scraping so a bad embedding configuration fails fast.
    embedder = create_embedding_backend(
        embedding_backend,
        model_name=embedding_model,
        dimensions=embedding_dimensions,
    )

    documents: list[ScrapedDocument] = []
    chunks: list[TextChunk] = []
    warnings: list[str] = []
    # SHA-256 digests of chunk texts already kept, for exact-duplicate removal.
    seen_hashes: set[str] = set()

    for entry in entries:
        docs, entry_warnings = scrape_catalog_entry(entry, max_docs=max_docs_per_source)
        warnings.extend(f"{entry.id}: {warning}" for warning in entry_warnings)
        documents.extend(docs)
        for doc in docs:
            for chunk in chunk_text(
                source_id=doc.source_id,
                title=doc.title,
                text=doc.text,
                url=doc.url,
                license=doc.license,
                attribution=doc.attribution,
                tags=doc.tags,
                chunk_words=chunk_words,
                overlap_words=overlap_words,
            ):
                digest = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
                if digest in seen_hashes:
                    continue
                seen_hashes.add(digest)
                chunks.append(chunk)

    if chunks:
        vectors = embedder.encode([chunk.text for chunk in chunks])
    else:
        # Nothing scraped: persist an empty matrix with the right width.
        vectors = np.zeros((0, embedder.dimensions), dtype=np.float32)

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source_registry": "curated_legal_catalog",
        "legal_policy": "Curated allowlist only; robots.txt checked for web pages; binary assets skipped; citations retain license and attribution metadata.",
        "selected_catalog_source_ids": unique_ids,
        "registered_source_count": len(entries),
        "scraped_document_count": len(documents),
        "ingested_source_count": len({doc.source_id for doc in documents}),
        "chunk_count": len(chunks),
        "embedding_backend": embedder.name,
        "embedding_dimensions": int(vectors.shape[1]) if vectors.ndim == 2 else embedder.dimensions,
        "catalog": [asdict(entry) for entry in entries],
        "warnings": warnings,
    }
    save_index(index_dir, chunks=chunks, vectors=vectors, manifest=manifest)

    manifest_path = Path(index_dir) / "catalog-manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    return manifest