Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 24,144 Bytes

d28d854

"""Tests Sprint 19 — Moteur narratif complet (détecteurs + arbitre + rendu).

Sprint 4 du plan rapport. Couvre :
  1. Les 10 détecteurs implémentés (scénarios canoniques + cas vides).
  2. L'arbitre : tri par importance, non-redondance, contradiction Nemenyi/Wilcoxon.
  3. Le renderer : chargement des templates YAML, déterminisme.
  4. Le garde-fou anti-hallucination : tout nombre rendu existe dans le JSON.
  5. L'intégration au rapport HTML (section synthèse, reproductibilité).
"""

from __future__ import annotations

import hashlib
import re

import pytest

from picarones.core.narrative import (
    Fact,
    FactImportance,
    FactType,
    build_synthesis,
    extract_numbers,
    render_fact,
    render_synthesis,
    select_facts,
)
from picarones.core.narrative.detectors import (
    detect_confidence_warning,
    detect_error_profile_outlier,
    detect_global_leader_cer,
    detect_llm_hallucination_flag,
    detect_robustness_fragile,
    detect_significant_gap,
    detect_speed_winner,
    detect_statistical_tie,
    detect_stratum_collapse,
    detect_stratum_winner,
)


# ---------------------------------------------------------------------------
# Fixtures — données de benchmark minimales et contrôlées
# ---------------------------------------------------------------------------

def _minimal_data(**overrides) -> dict:
    base = {
        "meta": {"document_count": 10},
        "ranking": [
            {"engine": "A", "mean_cer": 0.05, "mean_wer": 0.15, "documents": 10, "failed": 0},
            {"engine": "B", "mean_cer": 0.12, "mean_wer": 0.25, "documents": 10, "failed": 0},
            {"engine": "C", "mean_cer": 0.30, "mean_wer": 0.50, "documents": 10, "failed": 0},
        ],
        "engines": [
            {"name": "A", "cer": 0.05, "wer": 0.15, "is_pipeline": False, "is_vlm": False},
            {"name": "B", "cer": 0.12, "wer": 0.25, "is_pipeline": False, "is_vlm": False},
            {"name": "C", "cer": 0.30, "wer": 0.50, "is_pipeline": False, "is_vlm": False},
        ],
        "documents": [],
        "statistics": {
            "pairwise_wilcoxon": [],
            "bootstrap_cis": [],
            "friedman": {},
            "nemenyi": {"tied_groups": [], "mean_ranks": {}, "critical_distance": 0.0},
        },
    }
    base.update(overrides)
    return base


# ---------------------------------------------------------------------------
# Détecteurs individuels
# ---------------------------------------------------------------------------

class TestGlobalLeaderCer:
    """Checks for the global CER leader detector."""

    def test_emits_fact_with_cer_pct_and_n_docs(self):
        """The baseline data must yield one critical fact naming A, then B."""
        facts = detect_global_leader_cer(_minimal_data())
        assert len(facts) == 1

        leader = facts[0]
        assert leader.type == FactType.GLOBAL_LEADER_CER
        assert leader.importance == FactImportance.CRITICAL

        expected_payload = {
            "engine": "A",
            "cer_pct": 5.0,
            "n_docs": 10,
            "runner_up": "B",
        }
        for key, value in expected_payload.items():
            assert leader.payload[key] == value

    def test_empty_when_no_ranking(self):
        """An empty ranking must produce no fact at all."""
        data = _minimal_data(ranking=[])
        assert detect_global_leader_cer(data) == []


class TestSignificantGap:
    """Checks for the leader vs. runner-up significance detector."""

    @staticmethod
    def _data_with_wilcoxon(p_value, significant):
        """Return baseline data whose only pairwise Wilcoxon entry is A vs. B."""
        pair = {
            "engine_a": "A",
            "engine_b": "B",
            "p_value": p_value,
            "significant": significant,
            "n_pairs": 10,
        }
        statistics = {
            "pairwise_wilcoxon": [pair],
            "bootstrap_cis": [],
            "friedman": {},
            "nemenyi": {"tied_groups": [], "mean_ranks": {}},
        }
        return _minimal_data(statistics=statistics)

    def test_emits_when_leader_vs_runnerup_is_significant(self):
        """A significant A/B pair must be reported with its p-value."""
        facts = detect_significant_gap(self._data_with_wilcoxon(0.002, True))
        assert len(facts) == 1

        payload = facts[0].payload
        assert payload["leader"] == "A"
        assert payload["runner_up"] == "B"
        assert payload["p_value"] == pytest.approx(0.002)

    def test_empty_when_not_significant(self):
        """A non-significant A/B pair must not produce any fact."""
        data = self._data_with_wilcoxon(0.4, False)
        assert detect_significant_gap(data) == []


class TestStatisticalTie:
    """Checks for the Nemenyi tied-group detector."""

    def test_emits_for_each_tied_group(self):
        """With groups [A, B] and [C], exactly one fact (for A and B) is expected."""
        nemenyi = {
            "tied_groups": [["A", "B"], ["C"]],
            "mean_ranks": {"A": 1.2, "B": 1.5, "C": 3.0},
            "critical_distance": 0.8,
            "alpha": 0.05,
            "n_blocks": 10,
        }
        statistics = {
            "pairwise_wilcoxon": [],
            "bootstrap_cis": [],
            "friedman": {},
            "nemenyi": nemenyi,
        }

        facts = detect_statistical_tie(_minimal_data(statistics=statistics))
        assert len(facts) == 1

        tie = facts[0]
        assert set(tie.engines_involved) == {"A", "B"}
        assert tie.payload["includes_leader"] is True


class TestErrorProfileOutlier:
    """Checks for the atypical error-profile detector."""

    def test_flags_engine_with_atypical_profile(self):
        """Engine A's visual_confusion share (0.50) stands out from B and C."""
        # Only the visual_confusion share varies; abbreviation_error is constant.
        visual_confusion_share = {"A": 0.50, "B": 0.20, "C": 0.15}
        engines = [
            {
                "name": name,
                "aggregated_taxonomy": {
                    "distribution": {
                        "visual_confusion": share,
                        "abbreviation_error": 0.10,
                    },
                },
            }
            for name, share in visual_confusion_share.items()
        ]

        facts = detect_error_profile_outlier(_minimal_data(engines=engines))
        about_a = [fact for fact in facts if fact.payload["engine"] == "A"]
        assert about_a
        assert about_a[0].payload["error_class"] == "visual_confusion"

    def test_empty_when_no_taxonomy(self):
        """Baseline engines carry no taxonomy, so nothing must be flagged."""
        data = _minimal_data()
        assert detect_error_profile_outlier(data) == []


class TestLlmHallucinationFlag:
    """Checks for the LLM hallucination detector."""

    def test_flags_pipeline_with_high_rate(self):
        """Only the pipeline/VLM engine with a 45 % rate must be flagged."""
        classic_ocr = {
            "name": "tesseract",
            "aggregated_hallucination": {"hallucinating_doc_rate": 0.05},
            "is_pipeline": False,
            "is_vlm": False,
        }
        llm_pipeline = {
            "name": "gpt-4o",
            "aggregated_hallucination": {
                "hallucinating_doc_rate": 0.45,
                "anchor_score_mean": 0.55,
                "length_ratio_mean": 1.4,
            },
            "is_pipeline": True,
            "is_vlm": True,
        }

        data = _minimal_data(engines=[classic_ocr, llm_pipeline])
        facts = detect_llm_hallucination_flag(data)
        assert len(facts) == 1

        payload = facts[0].payload
        assert payload["engine"] == "gpt-4o"
        assert payload["hallucinating_rate_pct"] == 45.0

    def test_ignores_non_llm_engines(self):
        """A non-pipeline, non-VLM engine is ignored even at a 90 % rate."""
        classic_ocr = {
            "name": "tesseract",
            "aggregated_hallucination": {"hallucinating_doc_rate": 0.9},
            "is_pipeline": False,
            "is_vlm": False,
        }
        data = _minimal_data(engines=[classic_ocr])
        assert detect_llm_hallucination_flag(data) == []


class TestStratumDetectors:
    """Checks for the per-stratum (script type) winner and collapse detectors."""

    def _docs_with_strata(self):
        """Return 6 documents: 3 "gothique" followed by 3 "humaniste".

        Engine A is very good on humaniste and average on gothique;
        engine B is average everywhere.
        """
        # (doc_id prefix, script type, CER of A, CER of B)
        layout = (
            ("goth", "gothique", 0.12, 0.15),
            ("hum", "humaniste", 0.02, 0.10),
        )
        return [
            {
                "doc_id": f"{prefix}{i}",
                "script_type": script,
                "engine_results": [
                    {"engine": "A", "cer": cer_a, "error": None},
                    {"engine": "B", "cer": cer_b, "error": None},
                ],
            }
            for prefix, script, cer_a, cer_b in layout
            for i in range(3)
        ]

    def test_stratum_winner_detected(self):
        """Engine A must be reported as the winner of the humaniste stratum."""
        data = _minimal_data(
            documents=self._docs_with_strata(),
            engines=[{"name": "A", "cer": 0.07}, {"name": "B", "cer": 0.12}],
        )
        facts = detect_stratum_winner(data)
        on_humanist = [fact for fact in facts if fact.stratum == "humaniste"]
        assert on_humanist
        assert on_humanist[0].payload["engine"] == "A"

    def test_stratum_collapse_detected(self):
        """A collapse on the "cursive" stratum must be reported for engine A."""
        # Engine A does well on textualis (CER 0.04) but collapses on cursive
        # (CER 0.30); its declared global CER is 0.10.

        def _single_engine_doc(doc_id, script, cer):
            return {
                "doc_id": doc_id,
                "script_type": script,
                "engine_results": [{"engine": "A", "cer": cer, "error": None}],
            }

        docs = [_single_engine_doc(f"good{i}", "textualis", 0.04) for i in range(5)]
        docs += [_single_engine_doc(f"bad{i}", "cursive", 0.30) for i in range(3)]

        data = _minimal_data(documents=docs, engines=[{"name": "A", "cer": 0.10}])
        facts = detect_stratum_collapse(data)
        assert any(fact.stratum == "cursive" for fact in facts)


class TestSpeedWinner:
    """Checks for the speed-winner detector."""

    @staticmethod
    def _two_engine_data(fast_cer, slow_cer):
        """Return data for 5 documents processed by "fast" (0.1) and "slow" (5.0).

        The durations are fixed, so "fast" is always 50x quicker; only the
        CER values vary between scenarios. "slow" is listed first in the
        ranking.
        """
        docs = [
            {
                "doc_id": f"d{i}",
                "engine_results": [
                    {"engine": "fast", "cer": fast_cer, "error": None, "duration": 0.1},
                    {"engine": "slow", "cer": slow_cer, "error": None, "duration": 5.0},
                ],
            }
            for i in range(5)
        ]
        engines = [
            {"name": "fast", "cer": fast_cer},
            {"name": "slow", "cer": slow_cer},
        ]
        ranking = [
            {"engine": "slow", "mean_cer": slow_cer, "documents": 5, "failed": 0},
            {"engine": "fast", "mean_cer": fast_cer, "documents": 5, "failed": 0},
        ]
        return _minimal_data(documents=docs, engines=engines, ranking=ranking)

    def test_detects_fast_engine_with_comparable_quality(self):
        """A 50x faster engine with near-leader CER must be reported."""
        # "fast" is 50x quicker AND only 6 % above the leader's CER
        # (expected to be within the detector's quality tolerance).
        facts = detect_speed_winner(self._two_engine_data(0.053, 0.050))
        assert facts, "speed_winner devrait détecter un moteur 50× plus rapide"

        payload = facts[0].payload
        assert payload["engine"] == "fast"
        assert payload["speedup"] >= 3.0

    def test_ignores_fast_engine_with_bad_quality(self):
        """A fast engine whose CER is 3x the leader's is not a speed winner."""
        data = self._two_engine_data(0.15, 0.05)
        assert detect_speed_winner(data) == []


class TestConfidenceWarning:
    """Checks for the wide-confidence-interval warning detector."""

    def test_wide_ci_triggers_warning(self):
        """Only engine A, whose interval spans 0.01 to 0.25, must be flagged."""
        wide_interval = {"engine": "A", "mean": 0.05, "ci_lower": 0.01, "ci_upper": 0.25}
        narrow_interval = {"engine": "B", "mean": 0.12, "ci_lower": 0.08, "ci_upper": 0.16}
        statistics = {
            "pairwise_wilcoxon": [],
            "bootstrap_cis": [wide_interval, narrow_interval],
            "friedman": {},
            "nemenyi": {"tied_groups": [], "mean_ranks": {}},
        }

        facts = detect_confidence_warning(_minimal_data(statistics=statistics))
        assert len(facts) == 1
        assert facts[0].payload["engine"] == "A"


class TestRobustnessFragile:
    """Checks for the robustness-fragility detector."""

    def test_detects_collapse_under_high_degradation(self):
        """X (CER 0.05 -> 0.40 at level 80) is fragile; Y (0.05 -> 0.08) is not."""

        def _noise_curve(engine, degraded_cer):
            # Both curves start from the same clean CER; only the degraded
            # CER at level 80 differs.
            return {
                "engine": engine,
                "degradation_type": "noise",
                "points": [
                    {"level": 0, "cer": 0.05},
                    {"level": 80, "cer": degraded_cer},
                ],
            }

        curves = [_noise_curve("X", 0.40), _noise_curve("Y", 0.08)]
        data = _minimal_data(robustness={"curves": curves})

        facts = detect_robustness_fragile(data)
        flagged_engines = {fact.payload["engine"] for fact in facts}
        assert "X" in flagged_engines
        assert "Y" not in flagged_engines


# ---------------------------------------------------------------------------
# Arbitre
# ---------------------------------------------------------------------------

class TestArbiter:
    """Checks for the fact arbiter (``select_facts``)."""

    def _fact(self, t, imp=FactImportance.HIGH, engines=("A",), stratum=None, payload=None):
        """Build a ``Fact`` with short defaults; a falsy payload becomes ``{}``."""
        return Fact(
            type=t,
            importance=imp,
            payload=payload or {},
            engines_involved=tuple(engines),
            stratum=stratum,
        )

    def test_sort_by_importance_descending(self):
        """The critical fact must come first even when supplied last."""
        medium = self._fact(FactType.SPEED_WINNER, imp=FactImportance.MEDIUM)
        critical = self._fact(
            FactType.GLOBAL_LEADER_CER,
            imp=FactImportance.CRITICAL,
            engines=("B",),
        )
        ranked = select_facts([medium, critical])
        assert ranked[0].type == FactType.GLOBAL_LEADER_CER

    def test_max_facts_limit(self):
        """``max_facts`` caps the number of facts returned."""
        candidates = [
            self._fact(FactType.ERROR_PROFILE_OUTLIER, engines=(f"E{i}",))
            for i in range(10)
        ]
        kept = select_facts(candidates, max_facts=3)
        assert len(kept) == 3

    def test_deduplicates_same_engine_same_type(self):
        """Two facts of the same type about the same engine collapse to one."""
        duplicates = [
            self._fact(FactType.ERROR_PROFILE_OUTLIER, engines=("A",), payload={"x": value})
            for value in (1, 2)
        ]
        kept = select_facts(duplicates)
        assert len(kept) == 1

    def test_keeps_complementary_facts_for_same_engine(self):
        """Leader and significant-gap facts about A must both survive."""
        leader = self._fact(
            FactType.GLOBAL_LEADER_CER, imp=FactImportance.CRITICAL, engines=("A",)
        )
        gap = self._fact(
            FactType.SIGNIFICANT_GAP, imp=FactImportance.CRITICAL, engines=("A", "B")
        )
        # Both must be kept: they form a complementary pair.
        kept_types = {fact.type for fact in select_facts([leader, gap])}
        assert FactType.GLOBAL_LEADER_CER in kept_types
        assert FactType.SIGNIFICANT_GAP in kept_types

    def test_low_importance_filtered(self):
        """No selected fact may rank below MEDIUM importance."""
        low = Fact(
            type=FactType.SPEED_WINNER,
            importance=FactImportance.LOW,
            payload={},
            engines_involved=("A",),
        )
        high = self._fact(
            FactType.GLOBAL_LEADER_CER, imp=FactImportance.CRITICAL, engines=("A",)
        )
        for fact in select_facts([low, high]):
            assert fact.importance >= FactImportance.MEDIUM

    def test_nemenyi_tie_suppresses_contradicting_wilcoxon_gap(self):
        """A tie covering A and B must suppress a SIGNIFICANT_GAP between them."""
        # When A and B belong to the same Nemenyi group, a SIGNIFICANT_GAP
        # between A and B must not be shown on top of the tie.
        tie = self._fact(
            FactType.STATISTICAL_TIE,
            imp=FactImportance.CRITICAL,
            engines=("A", "B", "C"),
        )
        gap = self._fact(
            FactType.SIGNIFICANT_GAP,
            imp=FactImportance.CRITICAL,
            engines=("A", "B"),
        )
        kept_types = {fact.type for fact in select_facts([tie, gap])}
        assert FactType.STATISTICAL_TIE in kept_types
        assert FactType.SIGNIFICANT_GAP not in kept_types


# ---------------------------------------------------------------------------
# Rendu et déterminisme
# ---------------------------------------------------------------------------

class TestRenderer:
    """Checks for template rendering (``render_fact`` / ``render_synthesis``)."""

    def test_render_fact_with_known_template(self):
        """Engine name, CER percentage and document count appear in the text."""
        payload = {
            "engine": "testseract",
            "cer_pct": 4.2,
            "n_docs": 50,
            "cer": 0.042,
            "n_engines": 3,
        }
        fact = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload=payload,
            engines_involved=("testseract",),
        )
        rendered = render_fact(fact, "fr")
        for fragment in ("testseract", "4.2", "50"):
            assert fragment in rendered

    def test_render_respects_language(self):
        """French and English renderings differ and use their own opening words."""
        fact = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={
                "engine": "X",
                "cer_pct": 1.0,
                "n_docs": 10,
                "cer": 0.01,
                "n_engines": 2,
            },
        )
        french = render_fact(fact, "fr")
        english = render_fact(fact, "en")
        assert french != english
        assert "Sur ce corpus" in french
        assert "On this corpus" in english

    def test_render_missing_key_does_not_crash(self):
        """A deliberately incomplete payload must still render the engine name."""
        incomplete = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={"engine": "only_name"},
        )
        rendered = render_fact(incomplete)
        # The engine name must be rendered even though other placeholders
        # have no value.
        assert "only_name" in rendered

    def test_render_synthesis_deterministic(self):
        """Rendering the same facts twice must give equal results."""
        leader = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={
                "engine": "A",
                "cer_pct": 3.1,
                "n_docs": 20,
                "cer": 0.031,
                "n_engines": 2,
            },
            engines_involved=("A",),
        )
        facts = [leader]
        first = render_synthesis(facts, "fr")
        second = render_synthesis(facts, "fr")
        assert first == second


class TestBuildSynthesisE2E:
    """End-to-end checks for ``build_synthesis``."""

    def test_full_pipeline_produces_sentences(self):
        """The result exposes "sentences" and "facts" and mentions the leader."""
        statistics = {
            "pairwise_wilcoxon": [
                {
                    "engine_a": "A",
                    "engine_b": "B",
                    "p_value": 0.01,
                    "significant": True,
                    "n_pairs": 10,
                },
            ],
            "bootstrap_cis": [
                {"engine": "A", "mean": 0.05, "ci_lower": 0.04, "ci_upper": 0.06},
                {"engine": "B", "mean": 0.12, "ci_lower": 0.11, "ci_upper": 0.13},
            ],
            "friedman": {},
            "nemenyi": {
                "tied_groups": [["A"], ["B"], ["C"]],
                "mean_ranks": {"A": 1.0, "B": 2.0, "C": 3.0},
                "critical_distance": 0.5,
            },
        }
        result = build_synthesis(_minimal_data(statistics=statistics), "fr")

        for key in ("sentences", "facts"):
            assert key in result
        sentences = result["sentences"]
        assert len(sentences) >= 1
        # At least one sentence must mention the leader.
        # NOTE(review): a plain "A" substring is a weak check — any capital A
        # in a sentence satisfies it.
        assert any("A" in sentence for sentence in sentences)

    def test_pipeline_deterministic_across_calls(self):
        """Two runs over the same data must return equal results."""
        data = _minimal_data()
        first = build_synthesis(data, "fr")
        second = build_synthesis(data, "fr")
        assert first == second


# ---------------------------------------------------------------------------
# Garde-fou anti-hallucination : traçabilité des nombres
# ---------------------------------------------------------------------------

def _numbers_in_payload(payload: dict) -> set[str]:
    """Collecte tous les nombres d'un payload de Fact sous formes multiples.

    Inclut les représentations usuelles produites par ``str.format`` :
    ``5``, ``5.0``, ``5.00``, ``5.000``, etc., pour tolérer les formats
    ``{x}`` et ``{x:.2f}`` dans les templates.
    """
    out: set[str] = set()

    def _add_variants(v):
        try:
            f = float(v)
        except (TypeError, ValueError):
            return
        out.add(str(v))
        out.add(str(f))
        if f == int(f):
            out.add(str(int(f)))
        for dec in (1, 2, 3, 4):
            out.add(f"{f:.{dec}f}")

    def _walk(x):
        if isinstance(x, dict):
            for v in x.values():
                _walk(v)
        elif isinstance(x, (list, tuple)):
            for v in x:
                _walk(v)
        elif isinstance(x, bool):
            return
        elif isinstance(x, (int, float)):
            _add_variants(x)
        elif isinstance(x, str):
            for n in re.findall(r"\d+(?:\.\d+)?", x):
                _add_variants(n)

    _walk(payload)
    return out


# Literal constants allowed in the templates (not traceable to the payload
# because they are typographic elements — e.g. the 95 % level that matches
# α = 0.05). Listing them here makes the rule explicit.
_TEMPLATE_CONSTANTS = {"95", "100"}


class TestAntiHallucinationTraceability:
    """Every number in the synthesis must come from the payload of a Fact
    (itself traceable to the input JSON by construction of the detectors)
    or belong to the closed list of template constants.
    """

    def test_every_number_in_synthesis_is_traceable(self):
        """No rendered sentence may contain a number absent from the payloads."""
        statistics = {
            "pairwise_wilcoxon": [
                {
                    "engine_a": "A",
                    "engine_b": "B",
                    "p_value": 0.0123,
                    "significant": True,
                    "n_pairs": 10,
                },
            ],
            "bootstrap_cis": [
                {"engine": "A", "mean": 0.05, "ci_lower": 0.01, "ci_upper": 0.25},
                {"engine": "B", "mean": 0.12, "ci_lower": 0.11, "ci_upper": 0.13},
            ],
            "friedman": {"statistic": 5.2, "p_value": 0.07, "significant": False},
            "nemenyi": {
                "tied_groups": [["A", "B"]],
                "mean_ranks": {"A": 1.3, "B": 1.7, "C": 3.0},
                "critical_distance": 0.856,
                "alpha": 0.05,
                "n_blocks": 10,
            },
        }
        result = build_synthesis(_minimal_data(statistics=statistics), "fr")

        # Allowed numbers: the template constants plus every number found in
        # the payloads of the selected facts.
        allowed = set(_TEMPLATE_CONSTANTS).union(
            *(_numbers_in_payload(fact.get("payload", {})) for fact in result["facts"])
        )

        # Decimal commas are normalised to dots before the lookup.
        unknown = [
            (num, sentence)
            for sentence in result["sentences"]
            for num in extract_numbers(sentence)
            if num.replace(",", ".") not in allowed
        ]
        assert not unknown, f"Nombres non traçables : {unknown}"


# ---------------------------------------------------------------------------
# Intégration au rapport HTML
# ---------------------------------------------------------------------------

@pytest.fixture(scope="module")
def benchmark_result():
    """Provide an 8-document sample benchmark, built once per test module."""
    # Imported lazily, so the module is only loaded when the fixture is used.
    from picarones import fixtures as sample_fixtures

    sample = sample_fixtures.generate_sample_benchmark(n_docs=8)
    return sample


class TestReportIntegration:
    """Checks that the synthesis is integrated into the generated HTML report."""

    @staticmethod
    def _render_html(benchmark, target, **generator_options):
        """Generate the report at ``target`` and return its UTF-8 content."""
        from picarones.report.generator import ReportGenerator

        ReportGenerator(benchmark, **generator_options).generate(target)
        return target.read_text(encoding="utf-8")

    def test_report_contains_synthesis_section(self, benchmark_result, tmp_path):
        """The report contains the synthesis card, its title and a list item."""
        html = self._render_html(benchmark_result, tmp_path / "report.html")
        assert 'class="synth-card"' in html
        assert 'id="synth-title"' in html
        # At least one rendered sentence.
        assert re.search(r'<ul class="synth-list">\s*<li>', html)

    def test_report_synthesis_is_deterministic(self, benchmark_result, tmp_path):
        """Two generations must yield the same synthesis section."""
        section_pattern = r'<section class="synth-card".*?</section>'
        first_html = self._render_html(benchmark_result, tmp_path / "r1.html")
        second_html = self._render_html(benchmark_result, tmp_path / "r2.html")

        # Extract the synthesis section of each report and compare digests.
        first_section = re.search(section_pattern, first_html, re.DOTALL)
        second_section = re.search(section_pattern, second_html, re.DOTALL)
        assert first_section and second_section

        first_digest = hashlib.sha256(first_section.group().encode()).hexdigest()
        second_digest = hashlib.sha256(second_section.group().encode()).hexdigest()
        assert first_digest == second_digest

    def test_default_registry_has_all_types_registered(self):
        """The default registry must expose exactly 12 registered types."""
        from picarones.core.narrative import _DEFAULT_REGISTRY

        # All 12 types must be registered (even those that are still stubs).
        registered_types = set(_DEFAULT_REGISTRY.registered_types())
        assert len(registered_types) == 12

    def test_english_locale_produces_english_sentences(self, benchmark_result, tmp_path):
        """With ``lang="en"`` the synthesis list contains an English marker."""
        html = self._render_html(
            benchmark_result, tmp_path / "report_en.html", lang="en"
        )
        match = re.search(r'<ul class="synth-list">(.*?)</ul>', html, re.DOTALL)
        assert match

        synthesis_items = match.group(1)
        # Either "On this corpus" (leader), "Engines" (tie), "The gap" or
        # "statistically".
        english_markers = ("On this corpus", "Engines ", "The gap", "statistically")
        assert any(marker in synthesis_items for marker in english_markers)