"""Tests Sprint 19 — Moteur narratif complet (détecteurs + arbitre + rendu). Sprint 4 du plan rapport. Couvre : 1. Les 9 détecteurs implémentés (scénarios canoniques + cas vides). 2. L'arbitre : tri par importance, non-redondance, contradiction Nemenyi/Wilcoxon. 3. Le renderer : chargement des templates YAML, déterminisme. 4. Le garde-fou anti-hallucination : tout nombre rendu existe dans le JSON. 5. L'intégration au rapport HTML (section synthèse, reproductibilité). """ from __future__ import annotations import hashlib import re import pytest from picarones.core.narrative import ( Fact, FactImportance, FactType, build_synthesis, extract_numbers, render_fact, render_synthesis, select_facts, ) from picarones.core.narrative.detectors import ( detect_confidence_warning, detect_error_profile_outlier, detect_global_leader_cer, detect_llm_hallucination_flag, detect_robustness_fragile, detect_significant_gap, detect_speed_winner, detect_statistical_tie, detect_stratum_collapse, detect_stratum_winner, ) # --------------------------------------------------------------------------- # Fixtures — données de benchmark minimales et contrôlées # --------------------------------------------------------------------------- def _minimal_data(**overrides) -> dict: base = { "meta": {"document_count": 10}, "ranking": [ {"engine": "A", "mean_cer": 0.05, "mean_wer": 0.15, "documents": 10, "failed": 0}, {"engine": "B", "mean_cer": 0.12, "mean_wer": 0.25, "documents": 10, "failed": 0}, {"engine": "C", "mean_cer": 0.30, "mean_wer": 0.50, "documents": 10, "failed": 0}, ], "engines": [ {"name": "A", "cer": 0.05, "wer": 0.15, "is_pipeline": False, "is_vlm": False}, {"name": "B", "cer": 0.12, "wer": 0.25, "is_pipeline": False, "is_vlm": False}, {"name": "C", "cer": 0.30, "wer": 0.50, "is_pipeline": False, "is_vlm": False}, ], "documents": [], "statistics": { "pairwise_wilcoxon": [], "bootstrap_cis": [], "friedman": {}, "nemenyi": {"tied_groups": [], "mean_ranks": {}, "critical_distance": 0.0}, }, } base.update(overrides) return base # --------------------------------------------------------------------------- # Détecteurs individuels # --------------------------------------------------------------------------- class TestGlobalLeaderCer: def test_emits_fact_with_cer_pct_and_n_docs(self): facts = detect_global_leader_cer(_minimal_data()) assert len(facts) == 1 f = facts[0] assert f.type == FactType.GLOBAL_LEADER_CER assert f.importance == FactImportance.CRITICAL assert f.payload["engine"] == "A" assert f.payload["cer_pct"] == 5.0 assert f.payload["n_docs"] == 10 assert f.payload["runner_up"] == "B" def test_empty_when_no_ranking(self): assert detect_global_leader_cer(_minimal_data(ranking=[])) == [] class TestSignificantGap: def test_emits_when_leader_vs_runnerup_is_significant(self): data = _minimal_data(statistics={ "pairwise_wilcoxon": [ {"engine_a": "A", "engine_b": "B", "p_value": 0.002, "significant": True, "n_pairs": 10}, ], "bootstrap_cis": [], "friedman": {}, "nemenyi": {"tied_groups": [], "mean_ranks": {}}, }) facts = detect_significant_gap(data) assert len(facts) == 1 assert facts[0].payload["leader"] == "A" assert facts[0].payload["runner_up"] == "B" assert facts[0].payload["p_value"] == pytest.approx(0.002) def test_empty_when_not_significant(self): data = _minimal_data(statistics={ "pairwise_wilcoxon": [ {"engine_a": "A", "engine_b": "B", "p_value": 0.4, "significant": False, "n_pairs": 10}, ], "bootstrap_cis": [], "friedman": {}, "nemenyi": {"tied_groups": [], "mean_ranks": {}}, }) assert detect_significant_gap(data) == [] class TestStatisticalTie: def test_emits_for_each_tied_group(self): data = _minimal_data(statistics={ "pairwise_wilcoxon": [], "bootstrap_cis": [], "friedman": {}, "nemenyi": { "tied_groups": [["A", "B"], ["C"]], "mean_ranks": {"A": 1.2, "B": 1.5, "C": 3.0}, "critical_distance": 0.8, "alpha": 0.05, "n_blocks": 10, }, }) facts = detect_statistical_tie(data) assert len(facts) == 1 assert set(facts[0].engines_involved) == {"A", "B"} assert facts[0].payload["includes_leader"] is True class TestErrorProfileOutlier: def test_flags_engine_with_atypical_profile(self): engines = [ {"name": "A", "aggregated_taxonomy": {"distribution": {"visual_confusion": 0.50, "abbreviation_error": 0.10}}}, {"name": "B", "aggregated_taxonomy": {"distribution": {"visual_confusion": 0.20, "abbreviation_error": 0.10}}}, {"name": "C", "aggregated_taxonomy": {"distribution": {"visual_confusion": 0.15, "abbreviation_error": 0.10}}}, ] data = _minimal_data(engines=engines) facts = detect_error_profile_outlier(data) flagged = [f for f in facts if f.payload["engine"] == "A"] assert flagged assert flagged[0].payload["error_class"] == "visual_confusion" def test_empty_when_no_taxonomy(self): assert detect_error_profile_outlier(_minimal_data()) == [] class TestLlmHallucinationFlag: def test_flags_pipeline_with_high_rate(self): engines = [ {"name": "tesseract", "aggregated_hallucination": {"hallucinating_doc_rate": 0.05}, "is_pipeline": False, "is_vlm": False}, {"name": "gpt-4o", "aggregated_hallucination": { "hallucinating_doc_rate": 0.45, "anchor_score_mean": 0.55, "length_ratio_mean": 1.4}, "is_pipeline": True, "is_vlm": True}, ] data = _minimal_data(engines=engines) facts = detect_llm_hallucination_flag(data) assert len(facts) == 1 assert facts[0].payload["engine"] == "gpt-4o" assert facts[0].payload["hallucinating_rate_pct"] == 45.0 def test_ignores_non_llm_engines(self): engines = [ {"name": "tesseract", "aggregated_hallucination": {"hallucinating_doc_rate": 0.9}, "is_pipeline": False, "is_vlm": False}, ] data = _minimal_data(engines=engines) assert detect_llm_hallucination_flag(data) == [] class TestStratumDetectors: def _docs_with_strata(self): # 6 docs — 3 en "gothique", 3 en "humaniste" # Engine A est super bon en humaniste, moyen en gothique # Engine B est moyen partout docs = [] for i in range(3): docs.append({ "doc_id": f"goth{i}", "script_type": "gothique", "engine_results": [ {"engine": "A", "cer": 0.12, "error": None}, {"engine": "B", "cer": 0.15, "error": None}, ], }) for i in range(3): docs.append({ "doc_id": f"hum{i}", "script_type": "humaniste", "engine_results": [ {"engine": "A", "cer": 0.02, "error": None}, {"engine": "B", "cer": 0.10, "error": None}, ], }) return docs def test_stratum_winner_detected(self): docs = self._docs_with_strata() engines = [{"name": "A", "cer": 0.07}, {"name": "B", "cer": 0.12}] data = _minimal_data(documents=docs, engines=engines) facts = detect_stratum_winner(data) humanist = [f for f in facts if f.stratum == "humaniste"] assert humanist assert humanist[0].payload["engine"] == "A" def test_stratum_collapse_detected(self): # Engine A globalement bon (0.05) mais s'effondre sur "cursive" (0.30) docs = [] for i in range(5): docs.append({ "doc_id": f"good{i}", "script_type": "textualis", "engine_results": [{"engine": "A", "cer": 0.04, "error": None}], }) for i in range(3): docs.append({ "doc_id": f"bad{i}", "script_type": "cursive", "engine_results": [{"engine": "A", "cer": 0.30, "error": None}], }) engines = [{"name": "A", "cer": 0.10}] data = _minimal_data(documents=docs, engines=engines) facts = detect_stratum_collapse(data) assert any(f.stratum == "cursive" for f in facts) class TestSpeedWinner: def test_detects_fast_engine_with_comparable_quality(self): # "fast" est 50× plus rapide ET n'est qu'à 6 % de CER en plus du leader # (dans la marge de tolérance de qualité du détecteur). docs = [] for i in range(5): docs.append({ "doc_id": f"d{i}", "engine_results": [ {"engine": "fast", "cer": 0.053, "error": None, "duration": 0.1}, {"engine": "slow", "cer": 0.050, "error": None, "duration": 5.0}, ], }) engines = [{"name": "fast", "cer": 0.053}, {"name": "slow", "cer": 0.050}] ranking = [ {"engine": "slow", "mean_cer": 0.050, "documents": 5, "failed": 0}, {"engine": "fast", "mean_cer": 0.053, "documents": 5, "failed": 0}, ] data = _minimal_data(documents=docs, engines=engines, ranking=ranking) facts = detect_speed_winner(data) assert facts, "speed_winner devrait détecter un moteur 50× plus rapide" assert facts[0].payload["engine"] == "fast" assert facts[0].payload["speedup"] >= 3.0 def test_ignores_fast_engine_with_bad_quality(self): # "fast" est rapide mais a un CER 3× celui du leader — pas un speed winner docs = [{ "doc_id": f"d{i}", "engine_results": [ {"engine": "fast", "cer": 0.15, "error": None, "duration": 0.1}, {"engine": "slow", "cer": 0.05, "error": None, "duration": 5.0}, ], } for i in range(5)] engines = [{"name": "fast", "cer": 0.15}, {"name": "slow", "cer": 0.05}] ranking = [ {"engine": "slow", "mean_cer": 0.05, "documents": 5, "failed": 0}, {"engine": "fast", "mean_cer": 0.15, "documents": 5, "failed": 0}, ] data = _minimal_data(documents=docs, engines=engines, ranking=ranking) assert detect_speed_winner(data) == [] class TestConfidenceWarning: def test_wide_ci_triggers_warning(self): cis = [ {"engine": "A", "mean": 0.05, "ci_lower": 0.01, "ci_upper": 0.25}, {"engine": "B", "mean": 0.12, "ci_lower": 0.08, "ci_upper": 0.16}, ] data = _minimal_data(statistics={ "pairwise_wilcoxon": [], "bootstrap_cis": cis, "friedman": {}, "nemenyi": {"tied_groups": [], "mean_ranks": {}}, }) facts = detect_confidence_warning(data) assert len(facts) == 1 assert facts[0].payload["engine"] == "A" class TestRobustnessFragile: def test_detects_collapse_under_high_degradation(self): data = _minimal_data(robustness={ "curves": [ {"engine": "X", "degradation_type": "noise", "points": [ {"level": 0, "cer": 0.05}, {"level": 80, "cer": 0.40}, ]}, {"engine": "Y", "degradation_type": "noise", "points": [ {"level": 0, "cer": 0.05}, {"level": 80, "cer": 0.08}, ]}, ], }) facts = detect_robustness_fragile(data) names = {f.payload["engine"] for f in facts} assert "X" in names assert "Y" not in names # --------------------------------------------------------------------------- # Arbitre # --------------------------------------------------------------------------- class TestArbiter: def _fact(self, t, imp=FactImportance.HIGH, engines=("A",), stratum=None, payload=None): return Fact(type=t, importance=imp, payload=payload or {}, engines_involved=tuple(engines), stratum=stratum) def test_sort_by_importance_descending(self): f1 = self._fact(FactType.SPEED_WINNER, imp=FactImportance.MEDIUM) f2 = self._fact(FactType.GLOBAL_LEADER_CER, imp=FactImportance.CRITICAL, engines=("B",)) selected = select_facts([f1, f2]) assert selected[0].type == FactType.GLOBAL_LEADER_CER def test_max_facts_limit(self): facts = [self._fact(FactType.ERROR_PROFILE_OUTLIER, engines=(f"E{i}",)) for i in range(10)] selected = select_facts(facts, max_facts=3) assert len(selected) == 3 def test_deduplicates_same_engine_same_type(self): f1 = self._fact(FactType.ERROR_PROFILE_OUTLIER, engines=("A",), payload={"x": 1}) f2 = self._fact(FactType.ERROR_PROFILE_OUTLIER, engines=("A",), payload={"x": 2}) selected = select_facts([f1, f2]) assert len(selected) == 1 def test_keeps_complementary_facts_for_same_engine(self): leader = self._fact(FactType.GLOBAL_LEADER_CER, imp=FactImportance.CRITICAL, engines=("A",)) gap = self._fact(FactType.SIGNIFICANT_GAP, imp=FactImportance.CRITICAL, engines=("A", "B")) selected = select_facts([leader, gap]) # Les deux doivent survivre (paire complémentaire) types = {f.type for f in selected} assert FactType.GLOBAL_LEADER_CER in types assert FactType.SIGNIFICANT_GAP in types def test_low_importance_filtered(self): low = Fact(type=FactType.SPEED_WINNER, importance=FactImportance.LOW, payload={}, engines_involved=("A",)) high = self._fact(FactType.GLOBAL_LEADER_CER, imp=FactImportance.CRITICAL, engines=("A",)) selected = select_facts([low, high]) assert all(f.importance >= FactImportance.MEDIUM for f in selected) def test_nemenyi_tie_suppresses_contradicting_wilcoxon_gap(self): # Si A et B sont dans le même groupe Nemenyi, on ne doit pas afficher # un SIGNIFICANT_GAP entre A et B en plus. tie = self._fact(FactType.STATISTICAL_TIE, imp=FactImportance.CRITICAL, engines=("A", "B", "C")) gap = self._fact(FactType.SIGNIFICANT_GAP, imp=FactImportance.CRITICAL, engines=("A", "B")) selected = select_facts([tie, gap]) types = {f.type for f in selected} assert FactType.STATISTICAL_TIE in types assert FactType.SIGNIFICANT_GAP not in types # --------------------------------------------------------------------------- # Rendu et déterminisme # --------------------------------------------------------------------------- class TestRenderer: def test_render_fact_with_known_template(self): f = Fact( type=FactType.GLOBAL_LEADER_CER, importance=FactImportance.CRITICAL, payload={"engine": "testseract", "cer_pct": 4.2, "n_docs": 50, "cer": 0.042, "n_engines": 3}, engines_involved=("testseract",), ) text = render_fact(f, "fr") assert "testseract" in text assert "4.2" in text assert "50" in text def test_render_respects_language(self): f = Fact( type=FactType.GLOBAL_LEADER_CER, importance=FactImportance.CRITICAL, payload={"engine": "X", "cer_pct": 1.0, "n_docs": 10, "cer": 0.01, "n_engines": 2}, ) fr = render_fact(f, "fr") en = render_fact(f, "en") assert fr != en assert "Sur ce corpus" in fr assert "On this corpus" in en def test_render_missing_key_does_not_crash(self): # Payload incomplet volontairement f = Fact( type=FactType.GLOBAL_LEADER_CER, importance=FactImportance.CRITICAL, payload={"engine": "only_name"}, ) text = render_fact(f) # Doit renvoyer une phrase non vide, même si certains placeholders sont manquants assert "only_name" in text def test_render_synthesis_deterministic(self): facts = [ Fact(type=FactType.GLOBAL_LEADER_CER, importance=FactImportance.CRITICAL, payload={"engine": "A", "cer_pct": 3.1, "n_docs": 20, "cer": 0.031, "n_engines": 2}, engines_involved=("A",)), ] s1 = render_synthesis(facts, "fr") s2 = render_synthesis(facts, "fr") assert s1 == s2 class TestBuildSynthesisE2E: def test_full_pipeline_produces_sentences(self): data = _minimal_data(statistics={ "pairwise_wilcoxon": [ {"engine_a": "A", "engine_b": "B", "p_value": 0.01, "significant": True, "n_pairs": 10}, ], "bootstrap_cis": [ {"engine": "A", "mean": 0.05, "ci_lower": 0.04, "ci_upper": 0.06}, {"engine": "B", "mean": 0.12, "ci_lower": 0.11, "ci_upper": 0.13}, ], "friedman": {}, "nemenyi": {"tied_groups": [["A"], ["B"], ["C"]], "mean_ranks": {"A": 1.0, "B": 2.0, "C": 3.0}, "critical_distance": 0.5}, }) result = build_synthesis(data, "fr") assert "sentences" in result assert "facts" in result assert len(result["sentences"]) >= 1 # Au moins la mention du leader assert any("A" in s for s in result["sentences"]) def test_pipeline_deterministic_across_calls(self): data = _minimal_data() s1 = build_synthesis(data, "fr") s2 = build_synthesis(data, "fr") assert s1 == s2 # --------------------------------------------------------------------------- # Garde-fou anti-hallucination : traçabilité des nombres # --------------------------------------------------------------------------- def _numbers_in_payload(payload: dict) -> set[str]: """Collecte tous les nombres d'un payload de Fact sous formes multiples. Inclut les représentations usuelles produites par ``str.format`` : ``5``, ``5.0``, ``5.00``, ``5.000``, etc., pour tolérer les formats ``{x}`` et ``{x:.2f}`` dans les templates. """ out: set[str] = set() def _add_variants(v): try: f = float(v) except (TypeError, ValueError): return out.add(str(v)) out.add(str(f)) if f == int(f): out.add(str(int(f))) for dec in (1, 2, 3, 4): out.add(f"{f:.{dec}f}") def _walk(x): if isinstance(x, dict): for v in x.values(): _walk(v) elif isinstance(x, (list, tuple)): for v in x: _walk(v) elif isinstance(x, bool): return elif isinstance(x, (int, float)): _add_variants(x) elif isinstance(x, str): for n in re.findall(r"\d+(?:\.\d+)?", x): _add_variants(n) _walk(payload) return out # Constantes littérales autorisées dans les templates (non traçables au # payload car ce sont des éléments typographiques — seuil 95 % correspondant # à α = 0,05, etc.). Ajouter ici rend la règle explicite. _TEMPLATE_CONSTANTS = {"95", "100"} class TestAntiHallucinationTraceability: """Chaque nombre dans la synthèse doit venir du payload d'un Fact (lui-même traçable au JSON d'entrée par construction des détecteurs) ou appartenir à la liste limitative des constantes de template. """ def test_every_number_in_synthesis_is_traceable(self): data = _minimal_data(statistics={ "pairwise_wilcoxon": [ {"engine_a": "A", "engine_b": "B", "p_value": 0.0123, "significant": True, "n_pairs": 10}, ], "bootstrap_cis": [ {"engine": "A", "mean": 0.05, "ci_lower": 0.01, "ci_upper": 0.25}, {"engine": "B", "mean": 0.12, "ci_lower": 0.11, "ci_upper": 0.13}, ], "friedman": {"statistic": 5.2, "p_value": 0.07, "significant": False}, "nemenyi": { "tied_groups": [["A", "B"]], "mean_ranks": {"A": 1.3, "B": 1.7, "C": 3.0}, "critical_distance": 0.856, "alpha": 0.05, "n_blocks": 10, }, }) result = build_synthesis(data, "fr") # Concaténer tous les payloads des Facts retenus allowed = set(_TEMPLATE_CONSTANTS) for f in result["facts"]: allowed |= _numbers_in_payload(f.get("payload", {})) unknown = [] for sentence in result["sentences"]: for num in extract_numbers(sentence): num_norm = num.replace(",", ".") if num_norm not in allowed: unknown.append((num, sentence)) assert not unknown, f"Nombres non traçables : {unknown}" # --------------------------------------------------------------------------- # Intégration au rapport HTML # --------------------------------------------------------------------------- @pytest.fixture(scope="module") def benchmark_result(): from picarones import fixtures return fixtures.generate_sample_benchmark(n_docs=8) class TestReportIntegration: def test_report_contains_synthesis_section(self, benchmark_result, tmp_path): from picarones.report.generator import ReportGenerator out = tmp_path / "report.html" ReportGenerator(benchmark_result).generate(out) html = out.read_text(encoding="utf-8") assert 'class="synth-card"' in html assert 'id="synth-title"' in html # Au moins une phrase rendue assert re.search(r'