Picarones / tests /measurements /test_sprint45_stratification.py
Claude
feat(migration): Lots H + I + J — statistics, htr_united/huggingface, MetricsResult
c813aa1 unverified
Raw
History Blame
12.9 kB
"""Tests Sprint 45 β€” couche backend de stratification par script_type.
Couvre :
1. ``BenchmarkResult.doc_strata`` accepte ``None`` (rétrocompat) ou
   un dict ``{doc_id: script_type}``.
2. ``available_strata()`` retourne la liste triée des strates
   distinctes (vide si pas de doc_strata, ignore les valeurs vides).
3. ``stratified_ranking()`` :
   - Recalcule mean/median par moteur sur les docs de la strate
   - Trie par médiane (cohérent avec ``ranking()`` Sprint 44)
   - Inclut les moteurs sans aucun doc dans la strate (entrée
     dégénérée avec mean/median = None)
4. ``corpus_homogeneity()`` :
   - Retourne ``None`` quand < 2 strates
   - Calcule l'écart inter-strate du leader (en CER médian)
   - Identifie la paire de strates min/max
5. ``as_dict()`` expose ``doc_strata``, ``available_strata`` et
   ``stratified_ranking`` quand renseignés (rétrocompat sinon).
6. **Test propriété** : sur un corpus asymétrique réaliste où le
   leader global change selon la strate, ``stratified_ranking``
   doit refléter ce changement.
"""
from __future__ import annotations
import pytest
from picarones.evaluation.metric_result import MetricsResult
from picarones.evaluation.benchmark_result import BenchmarkResult, DocumentResult, EngineReport
# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────
def _make_dr(doc_id: str, cer: float, error: str | None = None) -> DocumentResult:
    """Build a synthetic ``DocumentResult`` for one document.

    All seven score fields of the embedded ``MetricsResult`` (``cer``,
    ``cer_nfc``, ``cer_caseless``, ``wer``, ``wer_normalized``, ``mer`` and
    ``wil``) receive the same value, so a test only has to choose a single
    number per document.

    Args:
        doc_id: Document identifier; also used to derive the fake image path.
        cer: Value assigned to every score field.
        error: Optional error string forwarded to the ``error`` field of the
            metrics object.

    Returns:
        A ``DocumentResult`` with placeholder texts (``"x"`` for both the
        ground truth and the hypothesis), lengths of 1 and
        ``duration_seconds`` set to 0.1.
    """
    score_fields = (
        "cer", "cer_nfc", "cer_caseless",
        "wer", "wer_normalized", "mer", "wil",
    )
    metrics = MetricsResult(
        **dict.fromkeys(score_fields, cer),
        reference_length=1,
        hypothesis_length=1,
        error=error,
    )
    return DocumentResult(
        doc_id=doc_id,
        image_path=f"/tmp/{doc_id}.png",
        ground_truth="x",
        hypothesis="x",
        metrics=metrics,
        duration_seconds=0.1,
    )
def _make_engine(name: str, cers_by_doc: dict[str, float]) -> EngineReport:
    """Build an ``EngineReport`` holding one synthetic result per document.

    Args:
        name: Engine name stored in the report.
        cers_by_doc: Mapping ``doc_id -> score``; each pair is turned into a
            document result with ``_make_dr``, in the mapping's iteration
            order.

    Returns:
        An ``EngineReport`` with version ``"1"`` and an empty configuration.
    """
    return EngineReport(
        engine_name=name,
        engine_version="1",
        engine_config={},
        document_results=[
            _make_dr(doc_id, score) for doc_id, score in cers_by_doc.items()
        ],
    )
def _make_benchmark(
    engines: list[EngineReport],
    doc_strata: dict[str, str] | None = None,
) -> BenchmarkResult:
    """Wrap engine reports into a ``BenchmarkResult`` for the tests below.

    Args:
        engines: Engine reports to attach to the benchmark.
        doc_strata: Optional mapping ``doc_id -> stratum label``; forwarded
            unchanged (``None`` when omitted).

    Returns:
        A ``BenchmarkResult`` named ``"test"`` with no corpus source.
    """
    benchmark = BenchmarkResult(
        engine_reports=engines,
        doc_strata=doc_strata,
        # Fixed placeholders: document_count is always 0, whatever the
        # number of documents carried by the engine reports.
        corpus_name="test",
        corpus_source=None,
        document_count=0,
    )
    return benchmark
# ──────────────────────────────────────────────────────────────────────────
# 1. doc_strata field
# ──────────────────────────────────────────────────────────────────────────
class TestDocStrataField:
    """Checks on the ``doc_strata`` attribute of ``BenchmarkResult``."""

    def test_default_is_none(self) -> None:
        """With no mapping given to the helper, ``doc_strata`` reads as ``None``."""
        engine = _make_engine("a", {"d1": 0.1})
        bench = _make_benchmark([engine])
        assert bench.doc_strata is None

    def test_accepts_dict(self) -> None:
        """A ``{doc_id: stratum}`` mapping is stored and read back unchanged."""
        engine = _make_engine("a", {"d1": 0.1})
        mapping = {"d1": "gothic"}
        bench = _make_benchmark([engine], doc_strata=mapping)
        assert bench.doc_strata == {"d1": "gothic"}
# ──────────────────────────────────────────────────────────────────────────
# 2. available_strata
# ──────────────────────────────────────────────────────────────────────────
class TestAvailableStrata:
    """Checks on ``BenchmarkResult.available_strata()``."""

    def test_empty_when_no_doc_strata(self) -> None:
        """Without a strata mapping the method returns an empty list."""
        engine = _make_engine("a", {"d1": 0.1})
        bench = _make_benchmark([engine])
        strata = bench.available_strata()
        assert strata == []

    def test_returns_sorted_unique(self) -> None:
        """A label used by several documents appears once, in sorted order."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3})
        mapping = {"d1": "gothic", "d2": "humanistic", "d3": "gothic"}
        bench = _make_benchmark([engine], doc_strata=mapping)
        strata = bench.available_strata()
        assert strata == ["gothic", "humanistic"]

    def test_ignores_empty_strings(self) -> None:
        """An empty-string label is not reported as a stratum."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2})
        mapping = {"d1": "gothic", "d2": ""}
        bench = _make_benchmark([engine], doc_strata=mapping)
        strata = bench.available_strata()
        assert strata == ["gothic"]
# ──────────────────────────────────────────────────────────────────────────
# 3. stratified_ranking
# ──────────────────────────────────────────────────────────────────────────
class TestStratifiedRanking:
    """Checks on ``BenchmarkResult.stratified_ranking()``.

    The tests read the result as a mapping ``stratum -> list of entries``,
    where each entry is a dict exposing at least ``engine``, ``mean_cer``,
    ``median_cer`` and ``documents``.
    """

    def test_empty_when_no_strata(self) -> None:
        """Without a strata mapping the result is an empty dict."""
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.stratified_ranking() == {}

    def test_one_entry_per_engine_per_stratum(self) -> None:
        """Every stratum lists each engine exactly once."""
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3}),
                _make_engine("b", {"d1": 0.5, "d2": 0.6, "d3": 0.7}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        assert set(out) == {"S1", "S2"}
        # Only the entry lists matter here, so iterate over the values.
        for entries in out.values():
            assert len(entries) == 2
            # A bare length check would also accept a duplicated engine;
            # pin the engine names as well.
            assert sorted(e["engine"] for e in entries) == ["a", "b"]

    def test_metrics_are_per_stratum(self) -> None:
        """Mean, median and document count are computed per stratum."""
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5})],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s1 = out["S1"][0]
        s2 = out["S2"][0]
        assert s1["mean_cer"] == pytest.approx(0.0)
        assert s1["median_cer"] == pytest.approx(0.0)
        assert s1["documents"] == 2
        assert s2["mean_cer"] == pytest.approx(0.5)
        # S2 holds a single document, so its median equals that value.
        assert s2["median_cer"] == pytest.approx(0.5)
        assert s2["documents"] == 1

    def test_sorts_by_median_within_each_stratum(self) -> None:
        """The first entry of each stratum is the engine with the lowest median."""
        # On S1: A median = 0.0, B median = 0.1 -> A ranks first.
        # On S2: A median = 0.5, B median = 0.0 -> B ranks first (leader change).
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1, "d3": 0.0, "d4": 0.0}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"},
        )
        out = b.stratified_ranking()
        assert out["S1"][0]["engine"] == "a"
        assert out["S2"][0]["engine"] == "b"

    def test_engine_with_no_docs_in_stratum_appears_with_none(self) -> None:
        """An engine without documents in a stratum still gets an entry."""
        # B has no document in S2.
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s2_b = next(e for e in out["S2"] if e["engine"] == "b")
        assert s2_b["mean_cer"] is None
        assert s2_b["median_cer"] is None
        assert s2_b["documents"] == 0
# ──────────────────────────────────────────────────────────────────────────
# 4. corpus_homogeneity
# ──────────────────────────────────────────────────────────────────────────
class TestCorpusHomogeneity:
    """Checks on ``BenchmarkResult.corpus_homogeneity()``."""

    def test_returns_none_when_no_strata(self) -> None:
        """Without a strata mapping there is nothing to compare."""
        engine = _make_engine("a", {"d1": 0.1})
        bench = _make_benchmark([engine])
        assert bench.corpus_homogeneity() is None

    def test_returns_none_when_single_stratum(self) -> None:
        """A single stratum yields ``None`` as well."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2})
        bench = _make_benchmark(
            [engine],
            doc_strata={"d1": "S1", "d2": "S1"},
        )
        assert bench.corpus_homogeneity() is None

    def test_detects_inter_stratum_gap(self) -> None:
        """The leader's median gap between strata is reported with its pair."""
        # Leader A: median = 0.0 on S1 and 0.5 on S2 -> gap = 0.5.
        engine = _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5})
        bench = _make_benchmark(
            [engine],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"},
        )
        report = bench.corpus_homogeneity()
        assert report is not None
        assert report["leader"] == "a"
        assert report["n_strata"] == 2
        assert report["max_inter_strata_gap"] == pytest.approx(0.5)
        gap_pair = report["leader_max_gap_strata"]
        assert set(gap_pair) == {"S1", "S2"}
        # Minimum first (S1 = 0.0), maximum second (S2 = 0.5).
        assert gap_pair[0] == "S1"
        assert gap_pair[1] == "S2"
# ──────────────────────────────────────────────────────────────────────────
# 5. as_dict expose les strates
# ──────────────────────────────────────────────────────────────────────────
class TestAsDictSerialization:
    """Checks on the strata-related keys of ``BenchmarkResult.as_dict()``."""

    def test_no_strata_keys_when_doc_strata_is_none(self) -> None:
        """Without a strata mapping, the strata keys are absent from the dict."""
        engine = _make_engine("a", {"d1": 0.1})
        payload = _make_benchmark([engine]).as_dict()
        # NOTE(review): "available_strata" is not covered by this negative
        # check - confirm whether it should be.
        for key in ("doc_strata", "stratified_ranking", "corpus_homogeneity"):
            assert key not in payload

    def test_strata_keys_present_when_doc_strata_is_set(self) -> None:
        """With a strata mapping, the strata keys are serialized."""
        engine = _make_engine("a", {"d1": 0.0, "d2": 0.5})
        bench = _make_benchmark(
            [engine],
            doc_strata={"d1": "S1", "d2": "S2"},
        )
        payload = bench.as_dict()
        assert payload["doc_strata"] == {"d1": "S1", "d2": "S2"}
        assert payload["available_strata"] == ["S1", "S2"]
        assert "S1" in payload["stratified_ranking"]
# ──────────────────────────────────────────────────────────────────────────
# 6. Test propriété — leader change selon la strate (cas réaliste)
# ──────────────────────────────────────────────────────────────────────────
class TestRealisticAsymmetry:
    """Property-style check on a realistic, unbalanced two-stratum corpus."""

    def test_global_leader_can_lose_on_a_stratum(self) -> None:
        """Typical heritage case: Tesseract leads overall (poor on
        manuscripts, excellent on print) while Pero leads specifically on
        manuscripts."""
        # Shared document ids, so the engines and the strata mapping cannot
        # drift apart.
        print_ids = [f"print_{i}" for i in range(10)]
        ms_ids = [f"ms_{i}" for i in range(5)]
        b = _make_benchmark(
            [
                # Tesseract: 10 printed docs at 0.02, 5 manuscript docs at 0.30.
                _make_engine("tesseract", {
                    **dict.fromkeys(print_ids, 0.02),
                    **dict.fromkeys(ms_ids, 0.30),
                }),
                # Pero: 10 printed docs at 0.05, 5 manuscript docs at 0.10.
                _make_engine("pero", {
                    **dict.fromkeys(print_ids, 0.05),
                    **dict.fromkeys(ms_ids, 0.10),
                }),
            ],
            doc_strata={
                **dict.fromkeys(print_ids, "imprimé"),
                **dict.fromkeys(ms_ids, "manuscrit"),
            },
        )
        # Overall, Tesseract wins on the median (0.02 on the majority of the
        # documents vs 0.05 for Pero).
        global_leader = b.ranking()[0]["engine"]
        assert global_leader == "tesseract"
        # On the manuscript stratum, however, Pero wins (0.10 < 0.30).
        strat = b.stratified_ranking()
        assert strat["manuscrit"][0]["engine"] == "pero"
        assert strat["imprimé"][0]["engine"] == "tesseract"
        # The homogeneity report must reflect Tesseract's large gap between
        # strata (0.02 vs 0.30 = 0.28).
        h = b.corpus_homogeneity()
        # corpus_homogeneity() may return None; fail with a clear assertion
        # instead of a TypeError on the subscripts below.
        assert h is not None
        assert h["leader"] == "tesseract"
        assert h["max_inter_strata_gap"] == pytest.approx(0.28, abs=1e-9)