"""
corpus.py — the data model and on-disk storage for the verse corpus.

A note on dataclasses vs. plain dicts
-------------------------------------
We could have used dicts everywhere and saved keystrokes. We don't, because
the Verse type is the contract between five different modules — parsers,
enrichment, indexing, retrieval, and the metric — and a typed contract
catches the "I thought 'sources_cited' was a list" class of mistake that
plain dicts silently let through.

The pipeline lifecycle of a verse
---------------------------------
    parsers.*        →  Verse           (no LLM-derived fields)
    enrichment.py    →  EnrichedVerse   (with LLM-derived fields)
    knowledge_base   →  reads EnrichedVerse, writes 3 embeddings per verse
    advisor.py       →  receives EnrichedVerse via retriever hits
    metrics.py       →  uses verse_id for exact citation grounding

Storage choice
--------------
JSONL on disk. Each line is a verse. Why not Parquet, sqlite, etc.?
- Easy to grep
- Easy to diff in PRs
- Easy for a human to spot-check enrichment quality (the whole point)
- We never need to scan more than a few thousand lines, so format doesn't matter
"""

from __future__ import annotations
import json
from dataclasses import dataclass, field, asdict, fields
from pathlib import Path
from typing import Iterable, Iterator


# ──────────────────────────── Verse: the raw record ────────────────────────────
@dataclass
class Verse:
    """One citable unit of scripture — a verse, a mantra, or a sūtra.

    Parsers must always supply the five identity fields. Every other field
    defaults to the empty string and is filled in only when the source
    actually provides it (sanskrit, transliteration, bhashya, ...).

    Identifiers:
        verse_id:  the global unique key. Convention:
                   '<work_slug>_<section_slug>_<verse_number>',
                   e.g. 'bhagavad_gita_02_47', 'mundaka_upanishad_2_1_3'.
        verse_ref: the human-readable citation form,
                   e.g. 'BG 2.47', 'Muṇḍaka Up. 2.1.3', 'Vivekacūḍāmaṇi 11'.
                   The advisor's response uses this exact string in citations.
    """
    # --- Identity: mandatory on every record ---
    verse_id: str
    work: str
    work_display: str
    verse_ref: str
    tier: str                         # one of: primary | shankara | supporting

    # --- Section/chapter: set when the work is divided into chapters ---
    section: str = ""                 # e.g. 'chapter_02'
    section_display: str = ""         # e.g. 'Chapter 2: Sāṅkhya Yoga'

    # --- Content: translation and/or bhashya must end up non-empty ---
    translation: str = ""             # English rendering of the verse itself
    translator: str = ""              # translator credit (for attribution)

    sanskrit: str = ""                # source text in Devanāgarī
    transliteration: str = ""         # IAST romanisation
    word_meanings: str = ""           # word-by-word gloss, when the source has one

    bhashya: str = ""                 # Śaṅkara's commentary on this verse, if any
    bhashya_translator: str = ""      # translator credit for the bhāṣya

    # --- Provenance: accountability and license display ---
    source_key: str = ""              # registry key the record was produced from
    license: str = ""                 # license tag taken from the registry

    def has_content(self) -> bool:
        """Return True when the record carries any actual text.

        A record counts as having content if its translation or its bhāṣya
        is non-blank once surrounding whitespace is stripped. Parsers and
        loaders use this to discard id-only records before they reach the
        index.
        """
        return any(text.strip() for text in (self.translation, self.bhashya))


# ──────────────────────────── EnrichedVerse: with LLM extractions ────────────────
@dataclass
class EnrichedVerse(Verse):
    """A Verse plus the structured fields produced by the offline LLM pass.

    Every enrichment field has an empty default (empty string / empty list),
    so a verse whose enrichment failed can still be stored and indexed on its
    literal text and bhāṣya alone.
    """
    # The plain-English statement of what the verse teaches. Ideally 1–2
    # sentences. This is what the synthesizer reads downstream.
    paraphrase: str = ""

    # Vedānta concepts engaged by the verse. Tradition-native vocabulary.
    # Examples: 'karma_yoga', 'vairagya', 'sakshi', 'two_truths', 'adhyasa'.
    themes: list[str] = field(default_factory=list)

    # Mundane life situations where this verse would help. User-language.
    # Examples: 'facing failure after sustained effort', 'watching a parent decline'.
    life_situations: list[str] = field(default_factory=list)

    # Emotions addressed, from a small consistent vocabulary.
    # See enrichment.py EMOTION_VOCAB for the closed set.
    emotions_addressed: list[str] = field(default_factory=list)

    # What does this verse ask the seeker to do or shift?
    practical_teaching: str = ""

    # Hypothetical questions a real person might bring to this verse.
    # These are gold for retrieval; they bridge the language gap.
    hypothetical_questions: list[str] = field(default_factory=list)

    # Quality / debugging
    enrichment_model: str = ""        # which LM produced these fields
    enrichment_version: int = 1       # bump when the prompt changes substantively

    # ---- Derived "views" used at indexing time ----
    def literal_view(self) -> str:
        """Return the English translation, followed by the transliteration.

        The translation comes first; the transliteration, when present, goes
        on its own line wrapped in parentheses. A field that is empty or
        contains only whitespace is left out entirely, so the result never
        contains a blank line or a bare "()". Returns "" when neither field
        has text.

        Best for queries that share lexical features with the text.
        """
        # Strip *before* testing so whitespace-only values count as absent
        # (the same blank test Verse.has_content applies). `or ""` keeps the
        # tolerance for a None value, which a plain truthiness test skipped.
        translation = (self.translation or "").strip()
        transliteration = (self.transliteration or "").strip()

        parts = []
        if translation:
            parts.append(translation)
        if transliteration:
            parts.append(f"({transliteration})")
        return "\n".join(parts)

    def bhashya_view(self) -> str:
        """Return Śaṅkara's commentary on this verse, whitespace-stripped.

        Best for queries about the Vedāntic explanation rather than the
        verse text itself. Returns "" when there is no bhāṣya.
        """
        return self.bhashya.strip()

    def advisor_view(self) -> str:
        """Return the composed view that bridges the language gap.

        This is what makes the user-question-→-verse mapping work. A user who
        types 'I feel hollow even though I got everything I wanted' will not
        find anything in the Sanskrit. They will find a near-neighbor in this
        view if the enrichment did its job.

        Each populated enrichment field contributes one labelled section, in
        a fixed order; empty fields are skipped. Sections are joined with
        newlines, and the result is "" for an unenriched verse.
        """
        bits = []
        if self.paraphrase:
            bits.append(f"Teaching: {self.paraphrase}")
        if self.life_situations:
            bits.append(
                "Speaks to: " + "; ".join(self.life_situations)
            )
        if self.emotions_addressed:
            bits.append(
                "Addresses: " + ", ".join(self.emotions_addressed)
            )
        if self.themes:
            bits.append(
                "Themes: " + ", ".join(self.themes)
            )
        if self.hypothetical_questions:
            # One bullet per question, each on its own indented line.
            bits.append(
                "Questions this answers:\n  - "
                + "\n  - ".join(self.hypothetical_questions)
            )
        if self.practical_teaching:
            bits.append(f"Practical shift: {self.practical_teaching}")
        return "\n".join(bits)

    def is_enriched(self) -> bool:
        """Return True when the minimum-viable enrichment fields are set.

        Requires a non-empty paraphrase, at least one life situation and at
        least one hypothetical question.
        """
        return bool(self.paraphrase) and bool(self.life_situations) and bool(self.hypothetical_questions)


# ──────────────────────────── On-disk JSONL ────────────────────────────
def write_jsonl(records: Iterable[Verse], path: Path) -> int:
    """Serialise each record as one JSON object per line and return the count.

    Missing parent directories of `path` are created first. The file is
    opened in write mode as UTF-8, so any existing content is replaced, and
    non-ASCII text is written as-is rather than as \\u escapes.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    with path.open("w", encoding="utf-8") as sink:
        # enumerate keeps the running total; `written` stays 0 for an empty stream.
        for written, record in enumerate(records, start=1):
            payload = json.dumps(asdict(record), ensure_ascii=False)
            sink.write(f"{payload}\n")
    return written


def read_jsonl_verses(path: Path) -> Iterator[Verse]:
    """Lazily yield one Verse per usable line of a JSONL file.

    A missing file yields nothing. Blank lines are ignored. A line that
    cannot be decoded or turned into a Verse is reported via print() and
    skipped, and reading carries on with the next line.
    """
    if not path.exists():
        return
    with path.open(encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, 1):
            text = raw.strip()
            if text:
                try:
                    payload = json.loads(text)
                    yield _verse_from_dict(payload, Verse)
                except Exception as err:
                    # Best-effort load: one bad line must not abort the whole file.
                    print(f"[corpus] skipping malformed line {line_no} in {path}: {err}")


def read_jsonl_enriched(path: Path) -> Iterator[EnrichedVerse]:
    """Lazily yield one EnrichedVerse per usable line of a JSONL file.

    A missing file yields nothing. Blank lines are ignored. A line that
    cannot be decoded or turned into an EnrichedVerse is reported via
    print() and skipped, and reading carries on with the next line.
    """
    if not path.exists():
        return
    with path.open(encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, 1):
            text = raw.strip()
            if text:
                try:
                    payload = json.loads(text)
                    yield _verse_from_dict(payload, EnrichedVerse)
                except Exception as err:
                    # Best-effort load: one bad line must not abort the whole file.
                    print(f"[corpus] skipping malformed line {line_no} in {path}: {err}")


def _verse_from_dict(d: dict, cls):
    """Build a `cls` instance from `d`, dropping keys `cls` has no field for.

    Only keys that match a dataclass field name of `cls` are passed to the
    constructor. This keeps loading forward-compatible: JSONL written with
    extra fields (a newer schema, or debug output from enrichment) still
    loads instead of failing on an unexpected keyword argument.
    """
    known = {spec.name for spec in fields(cls)}
    kwargs = {}
    for key, value in d.items():
        if key in known:
            kwargs[key] = value
    return cls(**kwargs)