"""Curated legal source catalog and scraper-backed index builder.""" from __future__ import annotations import hashlib import json import time from dataclasses import asdict, dataclass from datetime import datetime, timezone from pathlib import Path from urllib.parse import quote, urljoin, urlparse, urlunparse from urllib.robotparser import RobotFileParser import numpy as np import requests from bs4 import BeautifulSoup from .embeddings import create_embedding_backend from .index_store import save_index from .text import TextChunk, chunk_text, html_to_text, normalize_text DEFAULT_USER_AGENT = "GameMasterCopilot/0.1 (+https://huggingface.co/spaces/vivekchakraverty/gamemaster-design-copilot)" SKIP_EXTENSIONS = { ".7z", ".avi", ".css", ".gif", ".gz", ".ico", ".jpeg", ".jpg", ".js", ".json", ".mp3", ".mp4", ".ogg", ".pdf", ".png", ".svg", ".tar", ".webm", ".zip", } @dataclass(frozen=True) class ScrapedDocument: """One fetched document ready for chunking.""" source_id: str title: str text: str url: str license: str attribution: str tags: list[str] @dataclass(frozen=True) class CatalogEntry: """One allowlisted source collection.""" id: str label: str description: str license: str permission: str attribution: str tags: list[str] kind: str source_url: str start_urls: tuple[str, ...] = () allowed_prefixes: tuple[str, ...] = () api_url: str | None = None pages: tuple[str, ...] = () notes: str = "" WIKIPEDIA_GAME_DESIGN_PAGES = ( "Game design", "Video game design", "Game mechanics", "Gameplay", "Game balance", "Level design", "Playtesting", "Emergent gameplay", "Nonlinear gameplay", "Game studies", "Ludology", "Flow (psychology)", "Tutorial (video games)", "Boss (video games)", "Game artificial intelligence", "Serious game", ) CATALOG: tuple[CatalogEntry, ...] = ( CatalogEntry( id="game_design_concepts", label="Game Design Concepts course", description="Ian Schreiber's free game design course posts.", license="CC BY 3.0 US", permission="permissive", attribution="Game Design Concepts by Ian Schreiber", tags=["game-design", "course", "mechanics", "iteration", "balance"], kind="wordpress_rest", source_url="https://gamedesignconcepts.wordpress.com/about/", api_url="https://gamedesignconcepts.wordpress.com/wp-json/wp/v2/posts", notes="The course about page states that course content may be shared and adapted with attribution.", ), CatalogEntry( id="wikipedia_game_design", label="Wikipedia game design topics", description="Selected game design, mechanics, level design, and playtesting encyclopedia topics.", license="CC BY-SA 4.0 / GFDL", permission="sharealike_open", attribution="Wikipedia contributors", tags=["game-design", "mechanics", "level-design", "encyclopedia"], kind="mediawiki_pages", source_url="https://en.wikipedia.org/wiki/Wikipedia:Copyrights", api_url="https://api.wikimedia.org/core/v1/wikipedia/en/page", pages=WIKIPEDIA_GAME_DESIGN_PAGES, notes="Text reuse requires attribution and share-alike compliance.", ), CatalogEntry( id="mit_ocw_cms608_2014", label="MIT OCW CMS.608 Game Design", description="MIT OpenCourseWare Game Design course pages from Spring 2014.", license="CC BY-NC-SA", permission="noncommercial_open", attribution="MIT OpenCourseWare, CMS.608 Game Design, Spring 2014", tags=["game-design", "course", "non-digital-games", "assignments"], kind="crawl_prefix", source_url="https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/", start_urls=("https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/",), allowed_prefixes=("https://ocw.mit.edu/courses/cms-608-game-design-spring-2014/",), notes="MIT OCW materials are open, but this catalog entry is noncommercial/share-alike.", ), CatalogEntry( id="fate_core_srd", label="Fate Core SRD", description="Fate Core SRD pages for tabletop RPG system design examples.", license="CC BY 3.0 Unported", permission="permissive", attribution="Fate Core System by Evil Hat Productions and Fate SRD contributors", tags=["ttrpg", "srd", "narrative-design", "resolution-mechanics"], kind="crawl_prefix", source_url="https://fate-srd.com/official-licensing-fate", start_urls=("https://fate-srd.com/fate-core/", "https://fate-srd.com/official-licensing-fate"), allowed_prefixes=("https://fate-srd.com/fate-core/", "https://fate-srd.com/official-licensing-fate"), notes="The official licensing page identifies Fate SRD as the endorsed SRD source.", ), CatalogEntry( id="blades_srd", label="Blades in the Dark SRD", description="Forged in the Dark SRD pages for position/effect, clocks, crews, and action resolution.", license="CC BY 3.0 Unported", permission="permissive", attribution="Blades in the Dark by One Seven Design, developed and authored by John Harper", tags=["ttrpg", "srd", "fiction-first", "clocks", "gm-tools"], kind="crawl_prefix", source_url="https://bladesinthedark.com/licensing", start_urls=("https://bladesinthedark.com/basics/", "https://bladesinthedark.com/licensing"), allowed_prefixes=("https://bladesinthedark.com/",), notes="Catalog notes warn users not to use non-SRD setting, NPC, artwork, or map material.", ), CatalogEntry( id="dnd_srd", label="D&D Beyond SRD", description="Official D&D SRD page for rules/mechanics examples under Creative Commons.", license="CC BY 4.0", permission="permissive", attribution="System Reference Document by Wizards of the Coast LLC", tags=["ttrpg", "srd", "rules", "encounters", "mechanics"], kind="single_pages", source_url="https://www.dndbeyond.com/srd", start_urls=("https://www.dndbeyond.com/srd",), notes="Only SRD content is included; trademarks and non-SRD material remain outside scope.", ), ) def get_catalog() -> list[CatalogEntry]: """Return the curated legal source catalog.""" return list(CATALOG) def get_catalog_entry(entry_id: str) -> CatalogEntry: for entry in CATALOG: if entry.id == entry_id: return entry raise KeyError(f"unknown catalog source: {entry_id}") def catalog_summary_markdown() -> str: lines = [ "### Curated Legal Source Catalog", "", "Only these allowlisted sources can be scraped by the UI. Robots.txt is checked for web pages, " "binary assets are skipped, and each chunk keeps license and attribution metadata.", "", ] for entry in CATALOG: lines.append(f"- **{entry.label}** (`{entry.id}`): {entry.license}. {entry.description}") return "\n".join(lines) def _safe_slug(value: str) -> str: cleaned = "".join(char.lower() if char.isalnum() else "-" for char in value) cleaned = "-".join(part for part in cleaned.split("-") if part) return cleaned[:80] or hashlib.sha256(value.encode("utf-8")).hexdigest()[:16] def _canonical_url(url: str) -> str: parsed = urlparse(url) return urlunparse((parsed.scheme, parsed.netloc, parsed.path.rstrip("/") + "/", "", "", "")) def _skip_url(url: str) -> bool: path = urlparse(url).path.lower() return any(path.endswith(extension) for extension in SKIP_EXTENSIONS) def _robots_allowed(url: str, user_agent: str) -> bool: parsed = urlparse(url) robots_url = urljoin(f"{parsed.scheme}://{parsed.netloc}", "/robots.txt") try: response = requests.get(robots_url, headers={"User-Agent": user_agent}, timeout=10) except Exception: return False if response.status_code == 404: return True if response.status_code >= 400: return False parser = RobotFileParser() parser.set_url(robots_url) parser.parse(response.text.splitlines()) return parser.can_fetch(user_agent, url) def _fetch(url: str, user_agent: str, timeout_seconds: int) -> requests.Response: if not _robots_allowed(url, user_agent): raise PermissionError(f"robots.txt disallows fetching {url}") response = requests.get(url, headers={"User-Agent": user_agent}, timeout=timeout_seconds) response.raise_for_status() return response def _document_from_html(entry: CatalogEntry, url: str, html: str, fallback_title: str | None = None) -> ScrapedDocument: soup = BeautifulSoup(html, "html.parser") title_node = soup.find("h1") or soup.find("title") title = normalize_text(title_node.get_text(" ")) if title_node else fallback_title or entry.label text = html_to_text(html) return ScrapedDocument( source_id=f"{entry.id}.{_safe_slug(url)}", title=title or entry.label, text=text, url=url, license=entry.license, attribution=entry.attribution, tags=list(entry.tags), ) def _extract_links(html: str, base_url: str, allowed_prefixes: tuple[str, ...]) -> list[str]: soup = BeautifulSoup(html, "html.parser") links: list[str] = [] for anchor in soup.find_all("a", href=True): url = urljoin(base_url, anchor["href"]) parsed = urlparse(url) if parsed.scheme not in {"http", "https"}: continue candidate = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")) if _skip_url(candidate): continue if any(candidate.startswith(prefix) for prefix in allowed_prefixes): links.append(candidate) return links def _scrape_single_pages(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]: docs: list[ScrapedDocument] = [] warnings: list[str] = [] for url in entry.start_urls[:max_docs]: try: response = _fetch(url, user_agent, timeout_seconds) docs.append(_document_from_html(entry, url, response.text)) except Exception as exc: warnings.append(f"skipped {url}: {exc}") return docs, warnings def _scrape_crawl_prefix(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]: docs: list[ScrapedDocument] = [] warnings: list[str] = [] queue = list(entry.start_urls) seen: set[str] = set() while queue and len(docs) < max_docs: url = queue.pop(0) canonical = _canonical_url(url) if canonical in seen or _skip_url(url): continue seen.add(canonical) if not any(url.startswith(prefix) for prefix in entry.allowed_prefixes): continue try: response = _fetch(url, user_agent, timeout_seconds) content_type = response.headers.get("content-type", "").lower() if "text/html" not in content_type: warnings.append(f"skipped non-html URL {url}: {content_type or 'unknown content type'}") continue docs.append(_document_from_html(entry, url, response.text)) for link in _extract_links(response.text, url, entry.allowed_prefixes): if _canonical_url(link) not in seen and link not in queue: queue.append(link) time.sleep(0.1) except Exception as exc: warnings.append(f"skipped {url}: {exc}") return docs, warnings def _scrape_wordpress_rest(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]: if not entry.api_url: return [], [f"{entry.id} is missing api_url"] docs: list[ScrapedDocument] = [] warnings: list[str] = [] page = 1 while len(docs) < max_docs: url = f"{entry.api_url}?per_page=20&page={page}&_fields=link,title,content" try: response = _fetch(url, user_agent, timeout_seconds) except requests.HTTPError as exc: if exc.response is not None and exc.response.status_code == 400: break warnings.append(f"stopped WordPress fetch at page {page}: {exc}") break except Exception as exc: warnings.append(f"stopped WordPress fetch at page {page}: {exc}") break posts = response.json() if not posts: break for post in posts: title = html_to_text(post.get("title", {}).get("rendered", "")) or entry.label link = post.get("link") or entry.source_url content = post.get("content", {}).get("rendered", "") text = html_to_text(content) if not text: continue docs.append( ScrapedDocument( source_id=f"{entry.id}.{_safe_slug(link)}", title=title, text=text, url=link, license=entry.license, attribution=entry.attribution, tags=list(entry.tags), ) ) if len(docs) >= max_docs: break page += 1 return docs, warnings def _scrape_mediawiki_pages(entry: CatalogEntry, max_docs: int, user_agent: str, timeout_seconds: int) -> tuple[list[ScrapedDocument], list[str]]: if not entry.api_url: return [], [f"{entry.id} is missing api_url"] docs: list[ScrapedDocument] = [] warnings: list[str] = [] for title in entry.pages[:max_docs]: encoded_title = quote(title.replace(" ", "_"), safe="") fetch_url = f"{entry.api_url.rstrip('/')}/{encoded_title}/html" try: response = _fetch(fetch_url, user_agent, timeout_seconds) text = html_to_text(response.text) if not text: warnings.append(f"empty Wikipedia page extract: {title}") continue full_url = f"https://en.wikipedia.org/wiki/{encoded_title}" docs.append( ScrapedDocument( source_id=f"{entry.id}.{_safe_slug(title)}", title=title, text=text, url=full_url, license=entry.license, attribution=entry.attribution, tags=list(entry.tags), ) ) time.sleep(0.1) except Exception as exc: warnings.append(f"skipped Wikipedia page {title}: {exc}") return docs, warnings def scrape_catalog_entry( entry: CatalogEntry, *, max_docs: int = 30, user_agent: str = DEFAULT_USER_AGENT, timeout_seconds: int = 20, ) -> tuple[list[ScrapedDocument], list[str]]: """Scrape one curated catalog entry.""" bounded_max_docs = max(1, min(max_docs, 200)) if entry.kind == "single_pages": return _scrape_single_pages(entry, bounded_max_docs, user_agent, timeout_seconds) if entry.kind == "crawl_prefix": return _scrape_crawl_prefix(entry, bounded_max_docs, user_agent, timeout_seconds) if entry.kind == "wordpress_rest": return _scrape_wordpress_rest(entry, bounded_max_docs, user_agent, timeout_seconds) if entry.kind == "mediawiki_pages": return _scrape_mediawiki_pages(entry, bounded_max_docs, user_agent, timeout_seconds) raise ValueError(f"unsupported catalog scraper kind: {entry.kind}") def build_catalog_index( *, selected_ids: list[str], index_dir: str | Path, embedding_backend: str, embedding_model: str, embedding_dimensions: int, max_docs_per_source: int = 30, chunk_words: int = 260, overlap_words: int = 50, ) -> dict: """Scrape selected allowlisted catalog sources and persist a RAG index.""" if not selected_ids: raise ValueError("select at least one catalog source") entries = [get_catalog_entry(entry_id) for entry_id in selected_ids] embedder = create_embedding_backend( embedding_backend, model_name=embedding_model, dimensions=embedding_dimensions, ) documents: list[ScrapedDocument] = [] chunks: list[TextChunk] = [] warnings: list[str] = [] seen_hashes: set[str] = set() for entry in entries: docs, entry_warnings = scrape_catalog_entry(entry, max_docs=max_docs_per_source) warnings.extend(f"{entry.id}: {warning}" for warning in entry_warnings) documents.extend(docs) for doc in docs: for chunk in chunk_text( source_id=doc.source_id, title=doc.title, text=doc.text, url=doc.url, license=doc.license, attribution=doc.attribution, tags=doc.tags, chunk_words=chunk_words, overlap_words=overlap_words, ): digest = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest() if digest in seen_hashes: continue seen_hashes.add(digest) chunks.append(chunk) if chunks: vectors = embedder.encode([chunk.text for chunk in chunks]) else: vectors = np.zeros((0, embedder.dimensions), dtype=np.float32) manifest = { "created_at": datetime.now(timezone.utc).isoformat(), "source_registry": "curated_legal_catalog", "legal_policy": "Curated allowlist only; robots.txt checked for web pages; binary assets skipped; citations retain license and attribution metadata.", "selected_catalog_source_ids": selected_ids, "registered_source_count": len(entries), "scraped_document_count": len(documents), "ingested_source_count": len({doc.source_id for doc in documents}), "chunk_count": len(chunks), "embedding_backend": embedder.name, "embedding_dimensions": int(vectors.shape[1]) if vectors.ndim == 2 else embedder.dimensions, "catalog": [asdict(entry) for entry in entries], "warnings": warnings, } save_index(index_dir, chunks=chunks, vectors=vectors, manifest=manifest) manifest_path = Path(index_dir) / "catalog-manifest.json" manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8") return manifest