"""Source registry validation and loading.""" from __future__ import annotations from pathlib import Path from typing import Literal import yaml from pydantic import BaseModel, Field, HttpUrl, model_validator Permission = Literal[ "public_domain", "permissive", "sharealike_open", "noncommercial_open", "owned", "explicit_permission", ] class CrawlPolicy(BaseModel): """Fetch policy for URL-backed sources.""" enabled: bool = False respect_robots: bool = True user_agent: str = "GameMasterCopilot/0.1" timeout_seconds: int = Field(default=20, ge=1, le=120) class SourceRecord(BaseModel): """One approved source in the RAG corpus registry.""" id: str = Field(pattern=r"^[A-Za-z0-9][A-Za-z0-9_.-]{1,80}$") title: str = Field(min_length=1, max_length=240) url: HttpUrl | None = None path: str | None = None license: str = Field(min_length=1, max_length=120) permission: Permission attribution: str | None = None tags: list[str] = Field(default_factory=list) crawl: CrawlPolicy = Field(default_factory=CrawlPolicy) notes: str | None = None @model_validator(mode="after") def validate_location(self) -> "SourceRecord": if not self.url and not self.path: raise ValueError("each source must define either url or path") if self.url and self.path: raise ValueError("each source must define only one of url or path") return self @property def is_url_source(self) -> bool: return self.url is not None @property def is_ingestable(self) -> bool: return bool(self.path) or bool(self.url and self.crawl.enabled) @property def url_text(self) -> str | None: return str(self.url) if self.url else None class SourceRegistry(BaseModel): """Top-level source registry.""" sources: list[SourceRecord] = Field(default_factory=list) @model_validator(mode="after") def validate_unique_ids(self) -> "SourceRegistry": seen: set[str] = set() duplicates: set[str] = set() for source in self.sources: if source.id in seen: duplicates.add(source.id) seen.add(source.id) if duplicates: names = ", ".join(sorted(duplicates)) raise ValueError(f"duplicate source ids: {names}") return self def approved_ingestable_sources(self) -> list[SourceRecord]: return [source for source in self.sources if source.is_ingestable] def load_source_registry(path: str | Path) -> SourceRegistry: """Load and validate a source registry YAML file.""" registry_path = Path(path) if not registry_path.exists(): return SourceRegistry(sources=[]) raw = yaml.safe_load(registry_path.read_text(encoding="utf-8")) or {} if isinstance(raw, list): raw = {"sources": raw} if not isinstance(raw, dict): raise ValueError("source registry must be a YAML mapping with a sources list") return SourceRegistry.model_validate(raw)