Picarones / tests /reports /test_sprint90_engine_unstable.py
Claude
feat(sprint-S8): cohérence finale — renames test dirs, /metrics endpoint, SBOM workflow
43478ec unverified
Raw
History Blame
10.8 kB
"""Tests Sprint 90 β€” A.II.4 finition : dΓ©tecteur narratif
``engine_unstable`` + vue HTML stabilitΓ© multi-runs.
Couvre :
1. ``FactType.ENGINE_UNSTABLE`` enregistrΓ© + arbiter order.
2. ``detect_engine_unstable`` :
- silence si pas de ``multirun_stability``
- silence si CV faible et identical_rate haut
- HIGH si CV > 10 %
- HIGH si identical_rate < 50 %
3. Templates FR/EN : rendu factuel.
4. Anti-hallucination : chaque chiffre rendu provient du payload.
5. Vue HTML : adaptive, anti-injection, FR + EN.
6. ComplΓ©tude i18n FR/EN.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
from picarones.reports.narrative import build_synthesis
from picarones.reports.narrative.detectors import detect_engine_unstable
from picarones.domain.facts import FactImportance, FactType
from picarones.reports.html.renderers.multirun_stability import (
build_multirun_stability_html,
)
def _load_labels(lang: str) -> dict:
p = (
Path(__file__).parent.parent.parent
/ "picarones" / "reports" / "i18n" / f"{lang}.json"
)
return json.loads(p.read_text(encoding="utf-8"))
# ──────────────────────────────────────────────────────────────────────────
# 1. Modèle / registre
# ──────────────────────────────────────────────────────────────────────────
class TestFactType:
    """Checks the ``ENGINE_UNSTABLE`` enum value and its arbiter registration."""

    def test_enum_value(self) -> None:
        """The enum member carries the ``engine_unstable`` string value."""
        member = FactType.ENGINE_UNSTABLE
        assert member.value == "engine_unstable"

    def test_in_arbiter_fallback_order(self) -> None:
        """The fact type is listed in the arbiter's fallback type order."""
        # Private name, imported inside the test rather than at module level.
        from picarones.reports.narrative.arbiter import _FALLBACK_TYPE_ORDER

        fallback_order = _FALLBACK_TYPE_ORDER
        assert FactType.ENGINE_UNSTABLE in fallback_order
# ──────────────────────────────────────────────────────────────────────────
# 2. Détecteur
# ──────────────────────────────────────────────────────────────────────────
class TestDetector:
    """Exercises ``detect_engine_unstable`` on hand-built stability rows."""

    @staticmethod
    def _report(*rows: dict) -> dict:
        """Wrap stability rows under the ``multirun_stability`` key."""
        return {"multirun_stability": list(rows)}

    def test_silent_without_data(self) -> None:
        """No key at all, or an empty list, yields no fact."""
        for empty_input in ({}, self._report()):
            assert detect_engine_unstable(empty_input) == []

    def test_silent_when_stable(self) -> None:
        """A low CV with every run identical yields no fact."""
        stable_row = {
            "engine_name": "tess", "n_runs": 3,
            "cer_mean": 0.04, "cer_stdev": 0.001, "cer_cv": 0.025,
            "identical_run_rate": 1.0, "n_distinct_outputs": 1,
        }
        assert detect_engine_unstable(self._report(stable_row)) == []

    def test_emits_when_cv_high(self) -> None:
        """A high CV yields one HIGH fact with percentage fields in the payload."""
        noisy_row = {
            "engine_name": "gpt-4o", "n_runs": 5,
            "cer_mean": 0.10, "cer_stdev": 0.025, "cer_cv": 0.25,
            "identical_run_rate": 0.10, "n_distinct_outputs": 5,
        }
        emitted = detect_engine_unstable(self._report(noisy_row))
        assert len(emitted) == 1
        fact = emitted[0]
        assert fact.type == FactType.ENGINE_UNSTABLE
        assert fact.importance == FactImportance.HIGH
        assert fact.payload["engine"] == "gpt-4o"
        assert fact.payload["cer_cv_pct"] == 25.0

    def test_emits_when_identical_rate_low(self) -> None:
        """A low CV with mostly divergent runs is still reported."""
        divergent_row = {
            "engine_name": "vlm", "n_runs": 4,
            "cer_mean": 0.05, "cer_stdev": 0.0025, "cer_cv": 0.05,
            "identical_run_rate": 0.20, "n_distinct_outputs": 4,
        }
        emitted = detect_engine_unstable(self._report(divergent_row))
        assert len(emitted) == 1
        assert emitted[0].payload["identical_run_rate_pct"] == 20.0

    def test_silent_when_lt_two_runs(self) -> None:
        """A single run yields no fact, whatever the CV and identical rate."""
        single_run_row = {
            "engine_name": "tess", "n_runs": 1,
            "cer_cv": 0.5, "identical_run_rate": 0.0,
        }
        assert detect_engine_unstable(self._report(single_run_row)) == []

    def test_silent_when_engine_missing(self) -> None:
        """A row without ``engine_name`` yields no fact."""
        anonymous_row = {
            "n_runs": 3, "cer_cv": 0.30,
            "identical_run_rate": 0.0,
        }
        assert detect_engine_unstable(self._report(anonymous_row)) == []

    def test_multiple_engines(self) -> None:
        """With one stable and one unstable engine, only the latter is reported."""
        stable_row = {
            "engine_name": "tess", "n_runs": 3,
            "cer_cv": 0.02, "identical_run_rate": 1.0,
        }
        unstable_row = {
            "engine_name": "gpt-4o", "n_runs": 3,
            "cer_cv": 0.30, "identical_run_rate": 0.0,
        }
        emitted = detect_engine_unstable(
            self._report(stable_row, unstable_row),
        )
        # Only gpt-4o is unstable.
        assert len(emitted) == 1
        assert emitted[0].payload["engine"] == "gpt-4o"
# ──────────────────────────────────────────────────────────────────────────
# 3. Anti-hallucination : tout chiffre rendu vient du payload
# ──────────────────────────────────────────────────────────────────────────
def _payload_numbers(payload: dict) -> set[str]:
out: set[str] = set()
for v in payload.values():
if isinstance(v, (int, float)):
out.add(str(v))
if isinstance(v, float) and v.is_integer():
out.add(str(int(v)))
return out
def _numbers_in(text: str) -> set[str]:
return set(re.findall(r"\d+(?:\.\d+)?", text))
class TestAntiHallucination:
    """Every number in the rendered ENGINE_UNSTABLE sentence must come from the payload."""

    def _build(self, lang: str) -> tuple[list[str], dict]:
        """Render a synthesis for one unstable engine.

        Args:
            lang: Language code forwarded to ``build_synthesis``.

        Returns:
            The synthesis sentences and the payload of the first fact
            emitted by ``detect_engine_unstable`` for the same data.
        """
        data = {
            "ranking": [{"engine": "gpt-4o", "mean_cer": 0.10}],
            "engines": [{"name": "gpt-4o", "mean_cer": 0.10}],
            "meta": {"document_count": 5},
            "multirun_stability": [{
                "engine_name": "gpt-4o", "n_runs": 4,
                "cer_mean": 0.103, "cer_stdev": 0.025,
                "cer_cv": 0.243, "identical_run_rate": 0.16,
                "n_distinct_outputs": 4,
            }],
        }
        synthesis = build_synthesis(data, lang=lang, max_facts=10)
        facts = detect_engine_unstable(data)
        # Fail with an explicit message instead of an IndexError when the
        # detector stays silent on this fixture.
        assert facts, "aucun Fact ENGINE_UNSTABLE émis par le détecteur"
        return synthesis["sentences"], facts[0].payload

    def _find_unstable_sentence(
        self, sentences: list[str], lang: str,
    ) -> str:
        """Return the first sentence containing the language's marker word.

        The marker is ``"prudence"`` for ``"fr"`` and ``"caution"`` for
        any other language code.

        Raises:
            AssertionError: If no sentence contains the marker.
        """
        marker = "prudence" if lang == "fr" else "caution"
        for s in sentences:
            if marker in s:
                return s
        raise AssertionError(
            f"phrase ENGINE_UNSTABLE introuvable parmi {sentences}",
        )

    def _assert_numbers_traceable(self, lang: str) -> None:
        """Assert that each number in the ``lang`` sentence is a payload value."""
        sentences, payload = self._build(lang)
        sentence = self._find_unstable_sentence(sentences, lang)
        rendered = _numbers_in(sentence)
        allowed = _payload_numbers(payload)
        assert rendered.issubset(allowed), (
            f"non traçable : {rendered - allowed}"
        )

    def test_fr_numbers_traceable(self) -> None:
        """French rendering only shows numbers present in the payload."""
        self._assert_numbers_traceable("fr")

    def test_en_numbers_traceable(self) -> None:
        """English rendering only shows numbers present in the payload."""
        self._assert_numbers_traceable("en")
# ──────────────────────────────────────────────────────────────────────────
# 4. Vue HTML
# ──────────────────────────────────────────────────────────────────────────
class TestRender:
    """Checks the HTML produced by ``build_multirun_stability_html``."""

    def test_empty_returns_empty(self) -> None:
        """``None`` and an empty list both render to an empty string."""
        assert build_multirun_stability_html(None) == ""
        assert build_multirun_stability_html([]) == ""

    def test_renders_table(self) -> None:
        """A populated row renders a table with the engine name and CV."""
        stab = [{
            "engine_name": "gpt-4o", "n_runs": 5,
            "cer_mean": 0.10, "cer_stdev": 0.025, "cer_cv": 0.25,
            "identical_run_rate": 0.10, "n_distinct_outputs": 5,
        }]
        html = build_multirun_stability_html(stab, _load_labels("fr"))
        assert "<table" in html
        assert "gpt-4o" in html
        # The 0.25 CV is expected to appear formatted as 25.0.
        assert "25.0" in html

    def test_anti_injection(self) -> None:
        """Markup in the engine name is escaped, not emitted verbatim."""
        stab = [{
            "engine_name": "<script>alert(1)</script>", "n_runs": 2,
            "cer_cv": 0.3, "identical_run_rate": 0.0,
        }]
        html = build_multirun_stability_html(stab, _load_labels("fr"))
        assert "<script>alert" not in html
        assert "&lt;script&gt;" in html

    def test_handles_missing_cv(self) -> None:
        """A ``None`` CV renders a placeholder cell instead of crashing."""
        stab = [{
            "engine_name": "tess", "n_runs": 2,
            "cer_mean": 0.0, "cer_stdev": 0.0, "cer_cv": None,
            "identical_run_rate": 1.0, "n_distinct_outputs": 1,
        }]
        html = build_multirun_stability_html(stab, _load_labels("fr"))
        assert "<table" in html
        # The placeholder is an em dash; the previous literal was a
        # mis-encoded copy of it.
        # NOTE(review): confirm the renderer emits U+2014 for missing values.
        assert "—" in html

    def test_renders_in_english(self) -> None:
        """English labels produce the English section title."""
        stab = [{
            "engine_name": "tess", "n_runs": 3,
            "cer_cv": 0.05, "identical_run_rate": 0.66,
            "n_distinct_outputs": 2,
        }]
        html = build_multirun_stability_html(stab, _load_labels("en"))
        assert "Multi-run stability" in html
# ──────────────────────────────────────────────────────────────────────────
# 5. Complétude i18n
# ──────────────────────────────────────────────────────────────────────────
_KEYS = {
"stability_title", "stability_note", "stability_engine",
"stability_n_runs", "stability_cer", "stability_cv",
"stability_identical", "stability_distinct",
}
class TestI18n:
    """Checks that each language catalogue defines every key in ``_KEYS``."""

    def test_fr(self) -> None:
        """The French catalogue has no missing stability key."""
        labels = _load_labels("fr")
        missing = _KEYS - labels.keys()
        assert not missing

    def test_en(self) -> None:
        """The English catalogue has no missing stability key."""
        labels = _load_labels("en")
        missing = _KEYS - labels.keys()
        assert not missing