"""Tests Sprint 45 — couche backend de stratification par script_type.

Couvre :

1. ``BenchmarkResult.doc_strata`` accepte ``None`` (rétrocompat) ou
   un dict ``{doc_id: script_type}``.
2. ``available_strata()`` retourne la liste triée des strates
   distinctes (vide si pas de doc_strata, ignore les valeurs vides).
3. ``stratified_ranking()`` :
   - Recalcule mean/median par moteur sur les docs de la strate
   - Trie par médiane (cohérent avec ``ranking()`` Sprint 44)
   - Inclut les moteurs sans aucun doc dans la strate (entrée
     dégénérée avec mean/median = None)
4. ``corpus_homogeneity()`` :
   - Retourne ``None`` quand < 2 strates
   - Calcule l'écart inter-strate du leader (en CER médian)
   - Identifie la paire de strates min/max
5. ``as_dict()`` expose ``doc_strata``, ``available_strata`` et
   ``stratified_ranking`` quand renseignés (rétrocompat sinon).
6. **Test propriété** : sur un corpus asymétrique réaliste où le
   leader global change selon la strate, ``stratified_ranking``
   doit refléter ce changement.
"""

from __future__ import annotations

import pytest

from picarones.measurements.metrics import MetricsResult
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport


# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────


def _make_dr(doc_id: str, cer: float, error: str | None = None) -> DocumentResult:
    return DocumentResult(
        doc_id=doc_id, image_path=f"/tmp/{doc_id}.png",
        ground_truth="x", hypothesis="x",
        metrics=MetricsResult(
            cer=cer, cer_nfc=cer, cer_caseless=cer,
            wer=cer, wer_normalized=cer, mer=cer, wil=cer,
            reference_length=1, hypothesis_length=1,
            error=error,
        ),
        duration_seconds=0.1,
    )


def _make_engine(name: str, cers_by_doc: dict[str, float]) -> EngineReport:
    drs = [_make_dr(d, c) for d, c in cers_by_doc.items()]
    return EngineReport(
        engine_name=name, engine_version="1", engine_config={},
        document_results=drs,
    )


def _make_benchmark(
    engines: list[EngineReport],
    doc_strata: dict[str, str] | None = None,
) -> BenchmarkResult:
    return BenchmarkResult(
        corpus_name="test",
        corpus_source=None,
        document_count=0,
        engine_reports=engines,
        doc_strata=doc_strata,
    )


# ──────────────────────────────────────────────────────────────────────────
# 1. doc_strata field
# ──────────────────────────────────────────────────────────────────────────


class TestDocStrataField:
    def test_default_is_none(self) -> None:
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.doc_strata is None

    def test_accepts_dict(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.1})],
            doc_strata={"d1": "gothic"},
        )
        assert b.doc_strata == {"d1": "gothic"}


# ──────────────────────────────────────────────────────────────────────────
# 2. available_strata
# ──────────────────────────────────────────────────────────────────────────


class TestAvailableStrata:
    def test_empty_when_no_doc_strata(self) -> None:
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.available_strata() == []

    def test_returns_sorted_unique(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3})],
            doc_strata={
                "d1": "gothic", "d2": "humanistic", "d3": "gothic",
            },
        )
        assert b.available_strata() == ["gothic", "humanistic"]

    def test_ignores_empty_strings(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.1, "d2": 0.2})],
            doc_strata={"d1": "gothic", "d2": ""},
        )
        assert b.available_strata() == ["gothic"]


# ──────────────────────────────────────────────────────────────────────────
# 3. stratified_ranking
# ──────────────────────────────────────────────────────────────────────────


class TestStratifiedRanking:
    def test_empty_when_no_strata(self) -> None:
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.stratified_ranking() == {}

    def test_one_entry_per_engine_per_stratum(self) -> None:
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3}),
                _make_engine("b", {"d1": 0.5, "d2": 0.6, "d3": 0.7}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        assert set(out.keys()) == {"S1", "S2"}
        for stratum, entries in out.items():
            assert len(entries) == 2

    def test_metrics_are_per_stratum(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5})],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s1 = out["S1"][0]
        s2 = out["S2"][0]
        assert s1["mean_cer"] == pytest.approx(0.0)
        assert s1["median_cer"] == pytest.approx(0.0)
        assert s1["documents"] == 2
        assert s2["mean_cer"] == pytest.approx(0.5)
        assert s2["documents"] == 1

    def test_sorts_by_median_within_each_stratum(self) -> None:
        # Sur S1, A médiane=0.0, B médiane=0.1 → A 1er
        # Sur S2, A médiane=0.5, B médiane=0.0 → B 1er (changement de leader)
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1, "d3": 0.0, "d4": 0.0}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"},
        )
        out = b.stratified_ranking()
        assert out["S1"][0]["engine"] == "a"
        assert out["S2"][0]["engine"] == "b"

    def test_engine_with_no_docs_in_stratum_appears_with_none(self) -> None:
        # B n'a aucun doc dans S2
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s2_b = next(e for e in out["S2"] if e["engine"] == "b")
        assert s2_b["mean_cer"] is None
        assert s2_b["median_cer"] is None
        assert s2_b["documents"] == 0


# ──────────────────────────────────────────────────────────────────────────
# 4. corpus_homogeneity
# ──────────────────────────────────────────────────────────────────────────


class TestCorpusHomogeneity:
    def test_returns_none_when_no_strata(self) -> None:
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.corpus_homogeneity() is None

    def test_returns_none_when_single_stratum(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.1, "d2": 0.2})],
            doc_strata={"d1": "S1", "d2": "S1"},
        )
        assert b.corpus_homogeneity() is None

    def test_detects_inter_stratum_gap(self) -> None:
        # Le leader A : médiane = 0.0 sur S1, 0.5 sur S2 → gap = 0.5
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5})],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"},
        )
        h = b.corpus_homogeneity()
        assert h is not None
        assert h["leader"] == "a"
        assert h["n_strata"] == 2
        assert h["max_inter_strata_gap"] == pytest.approx(0.5)
        assert set(h["leader_max_gap_strata"]) == {"S1", "S2"}
        # Min en premier (S1 = 0.0), max en deuxième (S2 = 0.5)
        assert h["leader_max_gap_strata"][0] == "S1"
        assert h["leader_max_gap_strata"][1] == "S2"


# ──────────────────────────────────────────────────────────────────────────
# 5. as_dict expose les strates
# ──────────────────────────────────────────────────────────────────────────


class TestAsDictSerialization:
    def test_no_strata_keys_when_doc_strata_is_none(self) -> None:
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        d = b.as_dict()
        assert "doc_strata" not in d
        assert "stratified_ranking" not in d
        assert "corpus_homogeneity" not in d

    def test_strata_keys_present_when_doc_strata_is_set(self) -> None:
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.0, "d2": 0.5})],
            doc_strata={"d1": "S1", "d2": "S2"},
        )
        d = b.as_dict()
        assert d["doc_strata"] == {"d1": "S1", "d2": "S2"}
        assert d["available_strata"] == ["S1", "S2"]
        assert "S1" in d["stratified_ranking"]


# ──────────────────────────────────────────────────────────────────────────
# 6. Test propriété — leader change selon la strate (cas réaliste)
# ──────────────────────────────────────────────────────────────────────────


class TestRealisticAsymmetry:
    def test_global_leader_can_lose_on_a_stratum(self) -> None:
        """Cas patrimonial typique : Tesseract domine globalement
        (médiocre sur le manuscrit, excellent sur l'imprimé), Pero
        domine spécifiquement sur le manuscrit."""
        b = _make_benchmark(
            [
                # Tesseract : 10 docs imprimés à 0.02, 5 docs manuscrit à 0.30
                _make_engine("tesseract", {
                    **{f"print_{i}": 0.02 for i in range(10)},
                    **{f"ms_{i}": 0.30 for i in range(5)},
                }),
                # Pero : 10 docs imprimés à 0.05, 5 docs manuscrit à 0.10
                _make_engine("pero", {
                    **{f"print_{i}": 0.05 for i in range(10)},
                    **{f"ms_{i}": 0.10 for i in range(5)},
                }),
            ],
            doc_strata={
                **{f"print_{i}": "imprimé" for i in range(10)},
                **{f"ms_{i}": "manuscrit" for i in range(5)},
            },
        )

        # Globalement, Tesseract gagne sur la médiane (0.02 sur la
        # majorité des docs vs 0.05 pour Pero)
        global_leader = b.ranking()[0]["engine"]
        assert global_leader == "tesseract"

        # Mais sur la strate manuscrit, Pero gagne (0.10 < 0.30)
        strat = b.stratified_ranking()
        assert strat["manuscrit"][0]["engine"] == "pero"
        assert strat["imprimé"][0]["engine"] == "tesseract"

        # Le score d'homogénéité doit refléter le fort écart de
        # Tesseract entre strates (0.02 vs 0.30 = 0.28)
        h = b.corpus_homogeneity()
        assert h["leader"] == "tesseract"
        assert h["max_inter_strata_gap"] == pytest.approx(0.28, abs=1e-9)