Picarones / tests /measurements /test_sprint44_median_default.py
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified
Raw
History Blame
12.3 kB
"""Tests Sprint 44 β€” mΓ©diane par dΓ©faut + dΓ©tecteur d'asymΓ©trie.
Covers:
1. ``EngineReport.median_cer`` reads ``aggregated_metrics["cer"]["median"]``.
2. ``BenchmarkResult.ranking()``:
   - includes ``median_cer`` in every entry
   - sorts on the median by default (no longer on the mean)
   - falls back to the mean when the median is missing
3. ``MEDIAN_MEAN_GAP_WARNING`` detector:
   - fires when the ratio ``|mean - median| / median > 30%``
   - does not fire when the distribution is symmetric
   - does not fire when the median is zero (perfect corpus)
   - importance HIGH when the relative gap is ≥ 100 %
4. Anti-hallucination: every rendered number is present in the payload.
5. Backward compatibility: consumers that read ``mean_cer`` keep
   working.
"""
from __future__ import annotations
import re
import pytest
from picarones.measurements.metrics import MetricsResult
from picarones.measurements.narrative.detectors import detect_median_mean_gap_warning
from picarones.core.facts import FactImportance, FactType
from picarones.measurements.narrative.renderer import extract_numbers, render_fact
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────
def _make_dr(cer: float, doc_id: str = "d") -> DocumentResult:
    """Build a minimal ``DocumentResult`` whose error metrics all equal ``cer``.

    Args:
        cer: Value assigned to every error-rate field of the metrics
            (``cer``, ``cer_nfc``, ``cer_caseless``, ``wer``,
            ``wer_normalized``, ``mer`` and ``wil``).
        doc_id: Identifier given to the document.

    Returns:
        A document result with placeholder text, a placeholder image path,
        reference/hypothesis lengths of 1 and a duration of 0.1 s.
    """
    uniform_metrics = MetricsResult(
        cer=cer,
        cer_nfc=cer,
        cer_caseless=cer,
        wer=cer,
        wer_normalized=cer,
        mer=cer,
        wil=cer,
        reference_length=1,
        hypothesis_length=1,
    )
    return DocumentResult(
        doc_id=doc_id,
        image_path="/tmp/x.png",
        ground_truth="x",
        hypothesis="x",
        metrics=uniform_metrics,
        duration_seconds=0.1,
    )
def _make_engine_report(name: str, cers: list[float]) -> EngineReport:
    """Build an ``EngineReport`` holding one synthetic document per CER value.

    Args:
        name: Engine name stored in the report.
        cers: CER values; the i-th value produces a document whose id is
            ``d{i}``.

    Returns:
        A report with version ``"1"``, an empty engine config and the
        generated document results, in the same order as ``cers``.
    """
    documents = []
    for index, value in enumerate(cers):
        documents.append(_make_dr(value, doc_id=f"d{index}"))
    return EngineReport(
        engine_name=name,
        engine_version="1",
        engine_config={},
        document_results=documents,
    )
# ──────────────────────────────────────────────────────────────────────────
# 1. EngineReport.median_cer
# ──────────────────────────────────────────────────────────────────────────
class TestMedianCerProperty:
    """Checks on the ``EngineReport.median_cer`` accessor."""

    def test_returns_median_from_aggregated(self) -> None:
        """A report over CERs [0, 0, 0, 1, 1] exposes a median of 0."""
        report = _make_engine_report("e", [0.0, 0.0, 0.0, 1.0, 1.0])
        expected_median = pytest.approx(0.0)
        assert report.median_cer == expected_median

    def test_returns_none_when_no_docs(self) -> None:
        """A report without any document exposes ``None`` as its median."""
        empty_report = EngineReport(
            engine_name="e",
            engine_version="1",
            engine_config={},
            document_results=[],
        )
        # With no documents there is nothing to aggregate, so no median
        # can be reported.
        assert empty_report.median_cer is None
# ──────────────────────────────────────────────────────────────────────────
# 2. ranking() — sort by median
# ──────────────────────────────────────────────────────────────────────────
class TestRankingByMedian:
    """Checks that ``BenchmarkResult.ranking()`` orders engines by median CER."""

    def test_includes_median_cer(self) -> None:
        """Each ranking entry carries a ``median_cer`` key."""
        report = _make_engine_report("a", [0.1, 0.2, 0.3])
        bench = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=3,
            engine_reports=[report],
        )
        top_entry = bench.ranking()[0]
        assert "median_cer" in top_entry
        assert top_entry["median_cer"] == pytest.approx(0.2)

    def test_sorts_by_median_not_mean(self) -> None:
        """An engine with the better median wins despite a worse mean."""
        # Engine A: 80 % of docs at 0.03 + 20 % at 0.40
        #           -> mean = 0.104, median = 0.03
        # Engine B: 100 % of docs at 0.05 -> mean = 0.05, median = 0.05
        # Sorting on the mean would put B first; sorting on the median
        # puts A first.
        asymmetric = _make_engine_report(
            "A_asymmetric",
            [0.03] * 8 + [0.40] * 2,
        )
        steady = _make_engine_report(
            "B_steady",
            [0.05] * 10,
        )
        bench = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=10,
            engine_reports=[asymmetric, steady],
        )
        ranking = bench.ranking()
        first = ranking[0]
        # A must win on the median even though its mean is worse.
        assert first["engine"] == "A_asymmetric"
        second = ranking[1]
        assert first["mean_cer"] > second["mean_cer"]
        assert first["median_cer"] < second["median_cer"]

    def test_falls_back_to_mean_when_median_missing(self) -> None:
        """When ``median_cer`` is ``None`` the sort falls back to ``mean_cer``.

        The sort key is re-implemented locally and exercised on synthetic
        entries. Per the original author's note, this situation cannot be
        produced through real ``EngineReport`` objects because a median is
        always computed as soon as there is at least one document.
        """
        entries = [
            {"engine": "x", "mean_cer": 0.10, "median_cer": None,
             "mean_wer": 0.0, "documents": 1, "failed": 0},
            {"engine": "y", "mean_cer": 0.05, "median_cer": None,
             "mean_wer": 0.0, "documents": 1, "failed": 0},
        ]

        def _sort_key(entry: dict) -> tuple:
            # Prefer the median; use the mean only when the median is absent.
            primary = entry.get("median_cer")
            if primary is None:
                primary = entry.get("mean_cer")
            # Entries with no usable value at all are pushed to the end.
            if primary is None:
                return (True, float("inf"))
            return (False, primary)

        ordered = sorted(entries, key=_sort_key)
        # y (mean = 0.05) must come before x (mean = 0.10).
        assert ordered[0]["engine"] == "y"
# ──────────────────────────────────────────────────────────────────────────
# 3. MEDIAN_MEAN_GAP_WARNING detector
# ──────────────────────────────────────────────────────────────────────────
class TestMedianMeanGapDetector:
    """Behaviour of ``detect_median_mean_gap_warning`` on synthetic rankings."""

    @staticmethod
    def _single_engine(median_cer: float, mean_cer: float) -> dict:
        """Return a detector input holding one engine named ``tess``.

        Args:
            median_cer: Median CER stored in the ranking entry.
            mean_cer: Mean CER stored in the ranking entry.

        Returns:
            A dict with a one-entry ``ranking`` list covering 100 documents.
        """
        return {"ranking": [{
            "engine": "tess",
            "median_cer": median_cer,
            "mean_cer": mean_cer,
            "documents": 100,
        }]}

    def test_no_fact_when_distribution_symmetric(self) -> None:
        """A 10 % relative gap stays below the 30 % threshold: no fact."""
        data = self._single_engine(median_cer=0.05, mean_cer=0.055)
        assert detect_median_mean_gap_warning(data) == []

    def test_emits_fact_when_asymmetric(self) -> None:
        """A 133 % relative gap yields exactly one HIGH-importance fact."""
        data = self._single_engine(median_cer=0.03, mean_cer=0.07)
        facts = detect_median_mean_gap_warning(data)
        assert len(facts) == 1
        fact = facts[0]
        assert fact.type is FactType.MEDIAN_MEAN_GAP_WARNING
        assert fact.importance is FactImportance.HIGH  # gap >= 100 %
        assert fact.payload["engine"] == "tess"

    def test_medium_importance_when_moderate_gap(self) -> None:
        """A 50 % relative gap (above threshold, < 100 %) is MEDIUM."""
        data = self._single_engine(median_cer=0.05, mean_cer=0.075)
        facts = detect_median_mean_gap_warning(data)
        # Same cardinality check as the asymmetric test: a missing fact
        # fails here with a clear message instead of an IndexError, and
        # an unexpected duplicate fact is caught too.
        assert len(facts) == 1
        assert facts[0].importance is FactImportance.MEDIUM

    def test_no_fact_when_median_zero(self) -> None:
        """A zero median makes the ratio undefined, so nothing is emitted."""
        data = self._single_engine(median_cer=0.0, mean_cer=0.05)
        assert detect_median_mean_gap_warning(data) == []

    def test_no_fact_when_no_ranking(self) -> None:
        """Missing, empty or value-less rankings never produce a fact."""
        assert detect_median_mean_gap_warning({}) == []
        assert detect_median_mean_gap_warning({"ranking": []}) == []
        assert detect_median_mean_gap_warning({"ranking": [{
            "engine": "x", "mean_cer": None, "median_cer": None,
        }]}) == []
# ──────────────────────────────────────────────────────────────────────────
# 4. Anti-hallucination traceability
# ──────────────────────────────────────────────────────────────────────────
class TestTraceability:
    """Every number shown to the user must be traceable to the fact payload."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Numbers extracted from the rendered sentence all come from the payload.

        Args:
            lang: Language code passed to ``render_fact``.
        """
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        facts = detect_median_mean_gap_warning(data)
        # Fail with an explicit message rather than an IndexError when
        # the detector emits nothing.
        assert facts, "Detector emitted no fact for an asymmetric ranking"
        fact = facts[0]
        sentence = render_fact(fact, lang)

        # Whitelist: no template constant is expected here.
        whitelist: set[str] = set()

        # Collect the textual representations of the numeric payload values.
        payload_nums: set[str] = set()
        for value in fact.payload.values():
            if not isinstance(value, (int, float)):
                continue
            payload_nums.add(str(value))
            # An integral float such as 3.0 may legitimately render as "3".
            if isinstance(value, float) and value.is_integer():
                payload_nums.add(str(int(value)))

        # Loop-invariant: build the allowed set once.
        allowed = payload_nums | whitelist
        # NOTE(review): if ``extract_numbers`` returns nothing this loop
        # passes vacuously — confirm the templates always render a number.
        for num in extract_numbers(sentence):
            # Decimal commas are normalised to dots before the comparison.
            normalized = num.replace(",", ".")
            assert normalized in allowed, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {fact.payload!r}"
            )

    def test_template_has_no_hardcoded_numbers(self) -> None:
        """The fr/en templates contain no digit outside ``{placeholders}``."""
        from picarones.measurements.narrative.renderer import _load_templates

        for lang in ("fr", "en"):
            tpl = _load_templates(lang).get("median_mean_gap_warning", "")
            assert tpl, f"Template absent pour {lang}"
            # Strip the {x} placeholders before looking for digits.
            cleaned = re.sub(r"\{[^}]+\}", "", tpl)
            digits = re.findall(r"\d", cleaned)
            assert not digits, f"Template {lang} contient des chiffres en dur : {digits}"
# ──────────────────────────────────────────────────────────────────────────
# 5. Integration via build_synthesis
# ──────────────────────────────────────────────────────────────────────────
class TestSynthesisIntegration:
    """End-to-end checks that the detector is wired into the synthesis."""

    def test_detector_registered_by_default(self) -> None:
        """``MEDIAN_MEAN_GAP_WARNING`` is among the registered detector types."""
        from picarones.measurements.narrative.registry import iter_detectors

        types = {entry.fact_type for entry in iter_detectors()}
        assert FactType.MEDIAN_MEAN_GAP_WARNING in types

    def test_synthesis_includes_warning_when_asymmetric(self) -> None:
        """The French synthesis of an asymmetric ranking mentions the asymmetry."""
        from picarones.measurements.narrative import build_synthesis

        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        out = build_synthesis(data, lang="fr", max_facts=5)
        sentences = out["sentences"]
        # The keywords must be properly encoded French: the mis-decoded
        # forms could never match a correctly rendered sentence.
        keywords = ("asymétrique", "médiane")
        # At least one sentence must mention the asymmetry.
        lowered_sentences = (s.lower() for s in sentences)
        assert any(
            any(keyword in lowered for keyword in keywords)
            for lowered in lowered_sentences
        )