"""Tests Sprint 90 — A.II.4 finition : détecteur narratif ``engine_unstable`` + vue HTML stabilité multi-runs. Couvre : 1. ``FactType.ENGINE_UNSTABLE`` enregistré + arbiter order. 2. ``detect_engine_unstable`` : - silence si pas de ``multirun_stability`` - silence si CV faible et identical_rate haut - HIGH si CV > 10 % - HIGH si identical_rate < 50 % 3. Templates FR/EN : rendu factuel. 4. Anti-hallucination : chaque chiffre rendu provient du payload. 5. Vue HTML : adaptive, anti-injection, FR + EN. 6. Complétude i18n FR/EN. """ from __future__ import annotations import json import re from pathlib import Path from picarones.reports.narrative import build_synthesis from picarones.reports.narrative.detectors import detect_engine_unstable from picarones.domain.facts import FactImportance, FactType from picarones.reports.html.renderers.multirun_stability import ( build_multirun_stability_html, ) def _load_labels(lang: str) -> dict: p = ( Path(__file__).parent.parent.parent / "picarones" / "reports" / "i18n" / f"{lang}.json" ) return json.loads(p.read_text(encoding="utf-8")) # ────────────────────────────────────────────────────────────────────────── # 1. Modèle / registre # ────────────────────────────────────────────────────────────────────────── class TestFactType: def test_enum_value(self) -> None: assert FactType.ENGINE_UNSTABLE.value == "engine_unstable" def test_in_arbiter_fallback_order(self) -> None: from picarones.reports.narrative.arbiter import _FALLBACK_TYPE_ORDER assert FactType.ENGINE_UNSTABLE in _FALLBACK_TYPE_ORDER # ────────────────────────────────────────────────────────────────────────── # 2. 
# ──────────────────────────────────────────────────────────────────────────
# 2. Detector
# ──────────────────────────────────────────────────────────────────────────


class TestDetector:
    """Exercises ``detect_engine_unstable`` on hand-built stability rows."""

    @staticmethod
    def _report(*rows: dict) -> dict:
        """Wrap *rows* under the ``multirun_stability`` key."""
        return {"multirun_stability": list(rows)}

    def test_silent_without_data(self) -> None:
        # Neither a missing key nor an empty list produces a fact.
        assert detect_engine_unstable({}) == []
        assert detect_engine_unstable(self._report()) == []

    def test_silent_when_stable(self) -> None:
        # Low CV + all runs identical → no Fact.
        stable_row = {
            "engine_name": "tess",
            "n_runs": 3,
            "cer_mean": 0.04,
            "cer_stdev": 0.001,
            "cer_cv": 0.025,
            "identical_run_rate": 1.0,
            "n_distinct_outputs": 1,
        }
        assert detect_engine_unstable(self._report(stable_row)) == []

    def test_emits_when_cv_high(self) -> None:
        noisy_row = {
            "engine_name": "gpt-4o",
            "n_runs": 5,
            "cer_mean": 0.10,
            "cer_stdev": 0.025,
            "cer_cv": 0.25,
            "identical_run_rate": 0.10,
            "n_distinct_outputs": 5,
        }
        emitted = detect_engine_unstable(self._report(noisy_row))
        assert len(emitted) == 1
        assert emitted[0].type == FactType.ENGINE_UNSTABLE
        assert emitted[0].importance == FactImportance.HIGH
        assert emitted[0].payload["engine"] == "gpt-4o"
        assert emitted[0].payload["cer_cv_pct"] == 25.0

    def test_emits_when_identical_rate_low(self) -> None:
        # Low CV but very different runs → divergence detected.
        divergent_row = {
            "engine_name": "vlm",
            "n_runs": 4,
            "cer_mean": 0.05,
            "cer_stdev": 0.0025,
            "cer_cv": 0.05,
            "identical_run_rate": 0.20,
            "n_distinct_outputs": 4,
        }
        emitted = detect_engine_unstable(self._report(divergent_row))
        assert len(emitted) == 1
        assert emitted[0].payload["identical_run_rate_pct"] == 20.0

    def test_silent_when_lt_two_runs(self) -> None:
        single_run_row = {
            "engine_name": "tess",
            "n_runs": 1,
            "cer_cv": 0.5,
            "identical_run_rate": 0.0,
        }
        assert detect_engine_unstable(self._report(single_run_row)) == []

    def test_silent_when_engine_missing(self) -> None:
        # Row deliberately has no ``engine_name`` key.
        anonymous_row = {
            "n_runs": 3,
            "cer_cv": 0.30,
            "identical_run_rate": 0.0,
        }
        assert detect_engine_unstable(self._report(anonymous_row)) == []

    def test_multiple_engines(self) -> None:
        stable_row = {
            "engine_name": "tess",
            "n_runs": 3,
            "cer_cv": 0.02,
            "identical_run_rate": 1.0,
        }
        unstable_row = {
            "engine_name": "gpt-4o",
            "n_runs": 3,
            "cer_cv": 0.30,
            "identical_run_rate": 0.0,
        }
        emitted = detect_engine_unstable(self._report(stable_row, unstable_row))
        # Only gpt-4o is unstable.
        assert len(emitted) == 1
        assert emitted[0].payload["engine"] == "gpt-4o"


# ──────────────────────────────────────────────────────────────────────────
# 3. Anti-hallucination: every rendered figure comes from the payload
# ──────────────────────────────────────────────────────────────────────────


def _payload_numbers(payload: dict) -> set[str]:
    """Return the string forms of every int/float value in *payload*.

    Integral floats contribute both spellings (``"25.0"`` and ``"25"``).
    """
    tokens: set[str] = set()
    for value in payload.values():
        if not isinstance(value, (int, float)):
            continue
        tokens.add(str(value))
        if isinstance(value, float) and value.is_integer():
            tokens.add(str(int(value)))
    return tokens


def _numbers_in(text: str) -> set[str]:
    """Return every digit run in *text*, with an optional ``.digits`` part."""
    number_pattern = r"\d+(?:\.\d+)?"
    return set(re.findall(number_pattern, text))


class TestAntiHallucination:
    """Checks that numbers in the rendered sentence appear in the payload."""

    def _build(self, lang: str) -> tuple[list[str], dict]:
        """Return the synthesis sentences and the first detected payload."""
        stability_row = {
            "engine_name": "gpt-4o",
            "n_runs": 4,
            "cer_mean": 0.103,
            "cer_stdev": 0.025,
            "cer_cv": 0.243,
            "identical_run_rate": 0.16,
            "n_distinct_outputs": 4,
        }
        data = {
            "ranking": [{"engine": "gpt-4o", "mean_cer": 0.10}],
            "engines": [{"name": "gpt-4o", "mean_cer": 0.10}],
            "meta": {"document_count": 5},
            "multirun_stability": [stability_row],
        }
        synthesis = build_synthesis(data, lang=lang, max_facts=10)
        first_fact = detect_engine_unstable(data)[0]
        return synthesis["sentences"], first_fact.payload

    def _find_unstable_sentence(
        self,
        sentences: list[str],
        lang: str,
    ) -> str:
        """Return the first sentence containing the language's marker word."""
        marker = {"fr": "prudence"}.get(lang, "caution")
        found = next((s for s in sentences if marker in s), None)
        if found is None:
            raise AssertionError(
                f"phrase ENGINE_UNSTABLE introuvable parmi {sentences}",
            )
        return found

    def _check_numbers_traceable(self, lang: str) -> None:
        """Assert that each number rendered for *lang* is a payload number."""
        sentences, payload = self._build(lang)
        sentence = self._find_unstable_sentence(sentences, lang)
        rendered = _numbers_in(sentence)
        allowed = _payload_numbers(payload)
        assert rendered.issubset(allowed), (
            f"non traçable : {rendered - allowed}"
        )

    def test_fr_numbers_traceable(self) -> None:
        self._check_numbers_traceable("fr")

    def test_en_numbers_traceable(self) -> None:
        self._check_numbers_traceable("en")
_numbers_in(sentence) allowed = _payload_numbers(payload) assert rendered.issubset(allowed), ( f"non traçable : {rendered - allowed}" ) # ────────────────────────────────────────────────────────────────────────── # 4. Vue HTML # ────────────────────────────────────────────────────────────────────────── class TestRender: def test_empty_returns_empty(self) -> None: assert build_multirun_stability_html(None) == "" assert build_multirun_stability_html([]) == "" def test_renders_table(self) -> None: stab = [{ "engine_name": "gpt-4o", "n_runs": 5, "cer_mean": 0.10, "cer_stdev": 0.025, "cer_cv": 0.25, "identical_run_rate": 0.10, "n_distinct_outputs": 5, }] html = build_multirun_stability_html(stab, _load_labels("fr")) assert " None: stab = [{ "engine_name": "", "n_runs": 2, "cer_cv": 0.3, "identical_run_rate": 0.0, }] html = build_multirun_stability_html(stab, _load_labels("fr")) assert "