Spaces:
Sleeping
Sleeping
File size: 12,342 Bytes
dbf3578 979f3c3 dbf3578 979f3c3 dbf3578 979f3c3 dbf3578 979f3c3 dbf3578 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | """Tests Sprint 44 β mΓ©diane par dΓ©faut + dΓ©tecteur d'asymΓ©trie.
Couvre :
1. ``EngineReport.median_cer`` lit ``aggregated_metrics["cer"]["median"]``.
2. ``BenchmarkResult.ranking()`` :
   - inclut ``median_cer`` dans chaque entrée
   - trie sur la médiane par défaut (et non plus la moyenne)
   - retombe sur la moyenne si la médiane est absente
3. Détecteur ``MEDIAN_MEAN_GAP_WARNING`` :
   - se déclenche quand le ratio ``|moyenne - médiane| / médiane > 30%``
   - ne se déclenche pas quand symétrique
   - ne se déclenche pas si la médiane est nulle (corpus parfait)
   - importance HIGH si gap relatif ≥ 100 %
4. Anti-hallucination : chaque nombre rendu est dans le payload.
5. Rétrocompat : les consommateurs qui lisent ``mean_cer`` continuent
   à fonctionner.
"""
from __future__ import annotations
import re
import pytest
from picarones.measurements.metrics import MetricsResult
from picarones.measurements.narrative.detectors import detect_median_mean_gap_warning
from picarones.core.facts import FactImportance, FactType
from picarones.measurements.narrative.renderer import extract_numbers, render_fact
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────
def _make_dr(cer: float, doc_id: str = "d") -> DocumentResult:
    """Build a minimal ``DocumentResult`` whose error metrics all equal *cer*.

    Args:
        cer: Value assigned to every error-rate field of the metrics
            (``cer``, ``cer_nfc``, ``cer_caseless``, ``wer``,
            ``wer_normalized``, ``mer`` and ``wil``).
        doc_id: Identifier given to the document.

    Returns:
        A ``DocumentResult`` with a placeholder image path, identical
        one-character ground truth and hypothesis, and a fixed duration.
    """
    # Every error-rate field carries the same value so that a test only has
    # to reason about a single number per document.
    uniform_metrics = MetricsResult(
        cer=cer,
        cer_nfc=cer,
        cer_caseless=cer,
        wer=cer,
        wer_normalized=cer,
        mer=cer,
        wil=cer,
        reference_length=1,
        hypothesis_length=1,
    )
    return DocumentResult(
        doc_id=doc_id,
        image_path="/tmp/x.png",
        ground_truth="x",
        hypothesis="x",
        metrics=uniform_metrics,
        duration_seconds=0.1,
    )
def _make_engine_report(name: str, cers: list[float]) -> EngineReport:
    """Build an ``EngineReport`` holding one synthetic document per CER value.

    Args:
        name: Engine name stored in the report.
        cers: One value per document; documents are identified as
            ``d0``, ``d1``, ... following their position in the list.

    Returns:
        An ``EngineReport`` with version ``"1"``, an empty configuration and
        the generated document results.
    """
    return EngineReport(
        engine_name=name,
        engine_version="1",
        engine_config={},
        document_results=[
            _make_dr(value, doc_id=f"d{index}")
            for index, value in enumerate(cers)
        ],
    )
# ──────────────────────────────────────────────────────────────────────────
# 1. EngineReport.median_cer
# ──────────────────────────────────────────────────────────────────────────
class TestMedianCerProperty:
    """Checks on the ``median_cer`` attribute of ``EngineReport``."""

    def test_returns_median_from_aggregated(self) -> None:
        """Documents with CERs 0, 0, 0, 1, 1 yield a median CER of 0."""
        report = _make_engine_report("e", [0.0, 0.0, 0.0, 1.0, 1.0])
        expected_median = pytest.approx(0.0)
        assert report.median_cer == expected_median

    def test_returns_none_when_no_docs(self) -> None:
        """A report without any document result exposes no median."""
        # Original author's note: no documents -> empty aggregated metrics
        # -> mean/median are None.
        empty_report = _make_engine_report("e", [])
        assert empty_report.median_cer is None
# ──────────────────────────────────────────────────────────────────────────
# 2. ranking() — tri par médiane
# ──────────────────────────────────────────────────────────────────────────
class TestRankingByMedian:
    """Checks that ``BenchmarkResult.ranking()`` exposes and sorts on the median."""

    def test_includes_median_cer(self) -> None:
        """Each ranking entry carries a ``median_cer`` key."""
        report = _make_engine_report("a", [0.1, 0.2, 0.3])
        bench = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=3,
            engine_reports=[report],
        )
        ranking = bench.ranking()
        assert "median_cer" in ranking[0]
        assert ranking[0]["median_cer"] == pytest.approx(0.2)

    def test_sorts_by_median_not_mean(self) -> None:
        """An engine with the better median wins despite a worse mean."""
        # Engine A: 8 docs at 0.03 and 2 docs at 0.40
        #           -> mean = 0.104, median = 0.03
        # Engine B: 10 docs at 0.05 -> mean = 0.05, median = 0.05
        # Sorting on the mean would put B (0.05) ahead of A (0.104);
        # sorting on the median puts A (0.03) ahead of B (0.05).
        asymmetric = _make_engine_report(
            "A_asymmetric",
            [0.03] * 8 + [0.40] * 2,
        )
        steady = _make_engine_report("B_steady", [0.05] * 10)
        bench = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=10,
            engine_reports=[asymmetric, steady],
        )
        ranking = bench.ranking()
        leader = ranking[0]
        # Engine A must come first on the median even though its mean is worse.
        assert leader["engine"] == "A_asymmetric"
        follower = ranking[1]
        assert leader["mean_cer"] > follower["mean_cer"]
        assert leader["median_cer"] < follower["median_cer"]

    def test_falls_back_to_mean_when_median_missing(self) -> None:
        """When ``median_cer`` is None the sort key falls back to ``mean_cer``.

        This test re-creates a sort key locally and applies it to synthetic
        entries rather than calling ``BenchmarkResult.ranking()``. Per the
        original author's note, the key mirrors the one used by
        ``ranking()``, and such entries cannot be produced through real
        ``EngineReport`` objects because ``aggregate_metrics`` always
        computes a median as soon as there is at least one document.
        """
        entries = [
            {
                "engine": "x", "mean_cer": 0.10, "median_cer": None,
                "mean_wer": 0.0, "documents": 1, "failed": 0,
            },
            {
                "engine": "y", "mean_cer": 0.05, "median_cer": None,
                "mean_wer": 0.0, "documents": 1, "failed": 0,
            },
        ]

        def _sort_key(entry: dict) -> tuple:
            # Prefer the median; use the mean only when the median is None.
            median = entry.get("median_cer")
            primary = median if median is not None else entry.get("mean_cer")
            # Entries with no usable value at all are pushed to the end.
            if primary is None:
                return (True, float("inf"))
            return (False, primary)

        ordered = sorted(entries, key=_sort_key)
        # y (mean = 0.05) must come before x (mean = 0.10).
        assert ordered[0]["engine"] == "y"
# ──────────────────────────────────────────────────────────────────────────
# 3. Détecteur MEDIAN_MEAN_GAP_WARNING
# ──────────────────────────────────────────────────────────────────────────
class TestMedianMeanGapDetector:
    """Checks on ``detect_median_mean_gap_warning``.

    The relative gap quoted in the comments below is
    ``|mean - median| / median`` for the single ranking entry under test.
    """

    def test_no_fact_when_distribution_symmetric(self) -> None:
        """A 10 % relative gap produces no fact."""
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.05, "mean_cer": 0.055,
            "documents": 100,
        }]}
        # Relative gap = 10 %, below the 30 % threshold.
        assert detect_median_mean_gap_warning(data) == []

    def test_emits_fact_when_asymmetric(self) -> None:
        """A ~133 % relative gap produces one HIGH-importance fact."""
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        # Relative gap ~ 133 %, above the threshold.
        facts = detect_median_mean_gap_warning(data)
        assert len(facts) == 1
        assert facts[0].type is FactType.MEDIAN_MEAN_GAP_WARNING
        assert facts[0].importance is FactImportance.HIGH  # gap >= 100 %
        assert facts[0].payload["engine"] == "tess"

    def test_medium_importance_when_moderate_gap(self) -> None:
        """A 50 % relative gap produces a MEDIUM-importance fact."""
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.05, "mean_cer": 0.075,
            "documents": 100,
        }]}
        # Relative gap = 50 %: above the threshold but below 100 %.
        facts = detect_median_mean_gap_warning(data)
        # Check the count first so that a detector that stops emitting the
        # fact fails with an assertion instead of an IndexError on facts[0].
        assert len(facts) == 1
        assert facts[0].importance is FactImportance.MEDIUM

    def test_no_fact_when_median_zero(self) -> None:
        """A zero median makes the ratio undefined, so no fact is expected."""
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.0, "mean_cer": 0.05,
            "documents": 100,
        }]}
        assert detect_median_mean_gap_warning(data) == []

    def test_no_fact_when_no_ranking(self) -> None:
        """Missing, empty or value-less rankings produce no fact."""
        assert detect_median_mean_gap_warning({}) == []
        assert detect_median_mean_gap_warning({"ranking": []}) == []
        assert detect_median_mean_gap_warning({"ranking": [{
            "engine": "x", "mean_cer": None, "median_cer": None,
        }]}) == []
# ──────────────────────────────────────────────────────────────────────────
# 4. Traçabilité anti-hallucination
# ──────────────────────────────────────────────────────────────────────────
class TestTraceability:
    """Checks that rendered sentences only contain numbers from the payload."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Every number extracted from the sentence matches a payload value.

        Args:
            lang: Language code passed to ``render_fact``.
        """
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        facts = detect_median_mean_gap_warning(data)
        # Fail with a clear message rather than an IndexError on facts[0].
        assert facts, "detector emitted no fact for an asymmetric distribution"
        fact = facts[0]
        sentence = render_fact(fact, lang)

        # Whitelist: no template constant is expected here.
        whitelist: set[str] = set()

        # String forms under which a payload number may appear in the text:
        # its plain ``str()`` and, for integral floats, the integer form.
        payload_nums: set[str] = set()
        for value in fact.payload.values():
            if isinstance(value, (int, float)):
                payload_nums.add(str(value))
                if isinstance(value, float) and value.is_integer():
                    payload_nums.add(str(int(value)))

        allowed = payload_nums | whitelist
        for num in extract_numbers(sentence):
            # Commas are replaced by dots before the lookup (presumably to
            # normalise a decimal comma in the rendered text).
            normalized = num.replace(",", ".")
            assert normalized in allowed, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {fact.payload!r}"
            )

    def test_template_has_no_hardcoded_numbers(self) -> None:
        """The fr/en templates contain no digit outside ``{placeholders}``."""
        from picarones.measurements.narrative.renderer import _load_templates

        for lang in ("fr", "en"):
            tpl = _load_templates(lang).get("median_mean_gap_warning", "")
            assert tpl, f"Template absent pour {lang}"
            # Strip the {x} placeholders before looking for digits.
            cleaned = re.sub(r"\{[^}]+\}", "", tpl)
            digits = re.findall(r"\d", cleaned)
            assert not digits, (
                f"Template {lang} contient des chiffres en dur : {digits}"
            )
# ──────────────────────────────────────────────────────────────────────────
# 5. Intégration via build_synthesis
# ──────────────────────────────────────────────────────────────────────────
class TestSynthesisIntegration:
    """Checks that the gap detector is wired into the synthesis pipeline."""

    def test_detector_registered_by_default(self) -> None:
        """``iter_detectors()`` lists an entry for the gap-warning fact type."""
        from picarones.measurements.narrative.registry import iter_detectors

        registered_types = {entry.fact_type for entry in iter_detectors()}
        assert FactType.MEDIAN_MEAN_GAP_WARNING in registered_types

    def test_synthesis_includes_warning_when_asymmetric(self) -> None:
        """The French synthesis of an asymmetric ranking mentions the gap."""
        from picarones.measurements.narrative import build_synthesis

        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        out = build_synthesis(data, lang="fr", max_facts=5)
        sentences = out["sentences"]
        # At least one sentence must mention the asymmetry. The keywords are
        # written with their real accented characters: the previous
        # mis-encoded literals could never match correctly encoded output.
        keywords = ("asymétrique", "médiane")
        assert any(
            keyword in sentence.lower()
            for sentence in sentences
            for keyword in keywords
        )
|