Spaces:
Sleeping
Sleeping
Claude
feat(migration): Lots H + I + J β statistics, htr_united/huggingface, MetricsResult
"""Sprint 45 tests — backend layer for stratification by script_type.

Covers:
1. ``BenchmarkResult.doc_strata`` accepts ``None`` (backward compatible) or
   a ``{doc_id: script_type}`` dict.
2. ``available_strata()`` returns the sorted list of distinct strata
   (empty when there is no doc_strata; empty values are ignored).
3. ``stratified_ranking()``:
   - Recomputes mean/median per engine over the documents of the stratum
   - Sorts by median (consistent with ``ranking()`` from Sprint 44)
   - Includes engines that have no document in the stratum (degenerate
     entry with mean/median = None)
4. ``corpus_homogeneity()``:
   - Returns ``None`` when there are fewer than 2 strata
   - Computes the leader's inter-stratum gap (in median CER)
   - Identifies the min/max pair of strata
5. ``as_dict()`` exposes ``doc_strata``, ``available_strata`` and
   ``stratified_ranking`` when they are set (backward compatible otherwise).
6. **Property test**: on a realistic asymmetric corpus where the
   global leader changes depending on the stratum, ``stratified_ranking``
   must reflect that change.
"""
| from __future__ import annotations | |
| import pytest | |
| from picarones.evaluation.metric_result import MetricsResult | |
| from picarones.evaluation.benchmark_result import BenchmarkResult, DocumentResult, EngineReport | |
# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────
def _make_dr(doc_id: str, cer: float, error: str | None = None) -> DocumentResult:
    """Build a ``DocumentResult`` whose every rate metric equals *cer*.

    Ground truth and hypothesis are both the single character ``"x"``, the
    image path is ``/tmp/<doc_id>.png`` and the duration is 0.1 s. *error*
    is forwarded unchanged to ``MetricsResult``.
    """
    # A single value feeds all seven rate fields of MetricsResult.
    rate_fields = dict.fromkeys(
        ("cer", "cer_nfc", "cer_caseless", "wer", "wer_normalized", "mer", "wil"),
        cer,
    )
    metrics = MetricsResult(
        **rate_fields,
        reference_length=1,
        hypothesis_length=1,
        error=error,
    )
    return DocumentResult(
        doc_id=doc_id,
        image_path=f"/tmp/{doc_id}.png",
        ground_truth="x",
        hypothesis="x",
        metrics=metrics,
        duration_seconds=0.1,
    )
def _make_engine(name: str, cers_by_doc: dict[str, float]) -> EngineReport:
    """Build an ``EngineReport`` called *name* from a ``{doc_id: cer}`` mapping.

    One document result is created per mapping entry, in the mapping's
    iteration order. The engine version is ``"1"`` and its config is empty.
    """
    document_results = [
        _make_dr(doc_id, cer_value) for doc_id, cer_value in cers_by_doc.items()
    ]
    return EngineReport(
        engine_name=name,
        engine_version="1",
        engine_config={},
        document_results=document_results,
    )
def _make_benchmark(
    engines: list[EngineReport],
    doc_strata: dict[str, str] | None = None,
) -> BenchmarkResult:
    """Wrap *engines* in a ``BenchmarkResult`` for the corpus named ``"test"``.

    ``corpus_source`` is ``None`` and ``document_count`` is always 0,
    whatever the engines contain. *doc_strata* is forwarded unchanged.
    """
    fixed_fields = {
        "corpus_name": "test",
        "corpus_source": None,
        "document_count": 0,
    }
    return BenchmarkResult(
        **fixed_fields,
        engine_reports=engines,
        doc_strata=doc_strata,
    )
# ──────────────────────────────────────────────────────────────────────────
# 1. doc_strata field
# ──────────────────────────────────────────────────────────────────────────
class TestDocStrataField:
    """``BenchmarkResult.doc_strata`` holds ``None`` or a ``{doc_id: stratum}`` dict."""

    def test_default_is_none(self) -> None:
        """With the helper's default (``doc_strata=None``) the field is ``None``."""
        engine = _make_engine("a", {"d1": 0.1})
        benchmark = _make_benchmark([engine])
        assert benchmark.doc_strata is None

    def test_accepts_dict(self) -> None:
        """A mapping given at construction time is read back equal."""
        engine = _make_engine("a", {"d1": 0.1})
        benchmark = _make_benchmark([engine], doc_strata={"d1": "gothic"})
        assert benchmark.doc_strata == {"d1": "gothic"}
# ──────────────────────────────────────────────────────────────────────────
# 2. available_strata
# ──────────────────────────────────────────────────────────────────────────
class TestAvailableStrata:
    """``available_strata()`` lists the distinct, non-empty strata in sorted order."""

    def test_empty_when_no_doc_strata(self) -> None:
        """No ``doc_strata`` yields an empty list."""
        benchmark = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert benchmark.available_strata() == []

    def test_returns_sorted_unique(self) -> None:
        """A stratum used by two documents is reported once, in sorted order."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3})
        strata = {"d1": "gothic", "d2": "humanistic", "d3": "gothic"}
        benchmark = _make_benchmark([engine], doc_strata=strata)
        assert benchmark.available_strata() == ["gothic", "humanistic"]

    def test_ignores_empty_strings(self) -> None:
        """A document mapped to ``""`` contributes no stratum."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2})
        strata = {"d1": "gothic", "d2": ""}
        benchmark = _make_benchmark([engine], doc_strata=strata)
        assert benchmark.available_strata() == ["gothic"]
# ──────────────────────────────────────────────────────────────────────────
# 3. stratified_ranking
# ──────────────────────────────────────────────────────────────────────────
class TestStratifiedRanking:
    """``stratified_ranking()`` returns, per stratum, one ranking entry per engine."""

    def test_empty_when_no_strata(self) -> None:
        """Without ``doc_strata`` the result is an empty dict."""
        b = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert b.stratified_ranking() == {}

    def test_one_entry_per_engine_per_stratum(self) -> None:
        """Every stratum key maps to exactly one entry per engine (two here)."""
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.1, "d2": 0.2, "d3": 0.3}),
                _make_engine("b", {"d1": 0.5, "d2": 0.6, "d3": 0.7}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        assert set(out.keys()) == {"S1", "S2"}
        for stratum, entries in out.items():
            # Name the stratum so a failure points at the offending bucket.
            assert len(entries) == 2, f"wrong entry count for stratum {stratum!r}"

    def test_metrics_are_per_stratum(self) -> None:
        """Mean, median and document count are computed per stratum."""
        # S1 = {d1, d2}, both at 0.0; S2 = {d3} at 0.5.
        b = _make_benchmark(
            [_make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5})],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s1 = out["S1"][0]
        s2 = out["S2"][0]
        assert s1["mean_cer"] == pytest.approx(0.0)
        assert s1["median_cer"] == pytest.approx(0.0)
        assert s1["documents"] == 2
        assert s2["mean_cer"] == pytest.approx(0.5)
        assert s2["documents"] == 1

    def test_sorts_by_median_within_each_stratum(self) -> None:
        """The first entry of each stratum is the engine with the lowest median there."""
        # On S1: A median = 0.0, B median = 0.1 -> A ranks first.
        # On S2: A median = 0.5, B median = 0.0 -> B ranks first (leader change).
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1, "d3": 0.0, "d4": 0.0}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"},
        )
        out = b.stratified_ranking()
        assert out["S1"][0]["engine"] == "a"
        assert out["S2"][0]["engine"] == "b"

    def test_engine_with_no_docs_in_stratum_appears_with_none(self) -> None:
        """An engine with no document in a stratum is still listed there.

        Its entry has ``None`` for mean and median and a document count of 0.
        """
        # B has no document in S2.
        b = _make_benchmark(
            [
                _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5}),
                _make_engine("b", {"d1": 0.1, "d2": 0.1}),
            ],
            doc_strata={"d1": "S1", "d2": "S1", "d3": "S2"},
        )
        out = b.stratified_ranking()
        s2_b = next(e for e in out["S2"] if e["engine"] == "b")
        assert s2_b["mean_cer"] is None
        assert s2_b["median_cer"] is None
        assert s2_b["documents"] == 0
# ──────────────────────────────────────────────────────────────────────────
# 4. corpus_homogeneity
# ──────────────────────────────────────────────────────────────────────────
class TestCorpusHomogeneity:
    """``corpus_homogeneity()`` reports the leader's gap between strata, or ``None``."""

    def test_returns_none_when_no_strata(self) -> None:
        """No ``doc_strata`` gives ``None``."""
        benchmark = _make_benchmark([_make_engine("a", {"d1": 0.1})])
        assert benchmark.corpus_homogeneity() is None

    def test_returns_none_when_single_stratum(self) -> None:
        """A single stratum gives ``None``."""
        engine = _make_engine("a", {"d1": 0.1, "d2": 0.2})
        benchmark = _make_benchmark([engine], doc_strata={"d1": "S1", "d2": "S1"})
        assert benchmark.corpus_homogeneity() is None

    def test_detects_inter_stratum_gap(self) -> None:
        """The gap and the (min, max) stratum pair are reported for the leader."""
        # Leader A: median = 0.0 on S1 and 0.5 on S2, so the gap is 0.5.
        engine = _make_engine("a", {"d1": 0.0, "d2": 0.0, "d3": 0.5, "d4": 0.5})
        strata = {"d1": "S1", "d2": "S1", "d3": "S2", "d4": "S2"}
        homogeneity = _make_benchmark([engine], doc_strata=strata).corpus_homogeneity()

        assert homogeneity is not None
        assert homogeneity["leader"] == "a"
        assert homogeneity["n_strata"] == 2
        assert homogeneity["max_inter_strata_gap"] == pytest.approx(0.5)

        gap_pair = homogeneity["leader_max_gap_strata"]
        assert set(gap_pair) == {"S1", "S2"}
        # Minimum first (S1 = 0.0), maximum second (S2 = 0.5).
        assert gap_pair[0] == "S1"
        assert gap_pair[1] == "S2"
# ──────────────────────────────────────────────────────────────────────────
# 5. as_dict exposes the strata
# ──────────────────────────────────────────────────────────────────────────
class TestAsDictSerialization:
    """``as_dict()`` carries the stratification keys only when ``doc_strata`` is set."""

    def test_no_strata_keys_when_doc_strata_is_none(self) -> None:
        """Without ``doc_strata`` none of the checked keys is emitted."""
        serialized = _make_benchmark([_make_engine("a", {"d1": 0.1})]).as_dict()
        for absent_key in ("doc_strata", "stratified_ranking", "corpus_homogeneity"):
            assert absent_key not in serialized

    def test_strata_keys_present_when_doc_strata_is_set(self) -> None:
        """With ``doc_strata`` the mapping, strata list and ranking are emitted."""
        engine = _make_engine("a", {"d1": 0.0, "d2": 0.5})
        benchmark = _make_benchmark([engine], doc_strata={"d1": "S1", "d2": "S2"})
        serialized = benchmark.as_dict()
        assert serialized["doc_strata"] == {"d1": "S1", "d2": "S2"}
        assert serialized["available_strata"] == ["S1", "S2"]
        assert "S1" in serialized["stratified_ranking"]
# ──────────────────────────────────────────────────────────────────────────
# 6. Property test — the leader changes depending on the stratum (realistic case)
# ──────────────────────────────────────────────────────────────────────────
class TestRealisticAsymmetry:
    """Property-style check: the global leader may lose on an individual stratum."""

    def test_global_leader_can_lose_on_a_stratum(self) -> None:
        """Typical heritage-corpus case.

        Tesseract dominates overall (mediocre on manuscript, excellent on
        print), while Pero dominates specifically on manuscript.
        """
        # NOTE(review): the "imprimΓ©" key looks like a mis-decoded "imprimé".
        # It is kept byte-for-byte because both of its uses below agree.
        b = _make_benchmark(
            [
                # Tesseract: 10 printed docs at 0.02, 5 manuscript docs at 0.30.
                _make_engine("tesseract", {
                    **{f"print_{i}": 0.02 for i in range(10)},
                    **{f"ms_{i}": 0.30 for i in range(5)},
                }),
                # Pero: 10 printed docs at 0.05, 5 manuscript docs at 0.10.
                _make_engine("pero", {
                    **{f"print_{i}": 0.05 for i in range(10)},
                    **{f"ms_{i}": 0.10 for i in range(5)},
                }),
            ],
            doc_strata={
                **{f"print_{i}": "imprimΓ©" for i in range(10)},
                **{f"ms_{i}": "manuscrit" for i in range(5)},
            },
        )
        # Overall, Tesseract wins on the median (0.02 on the majority of
        # documents versus 0.05 for Pero).
        global_leader = b.ranking()[0]["engine"]
        assert global_leader == "tesseract"
        # On the manuscript stratum, however, Pero wins (0.10 < 0.30).
        strat = b.stratified_ranking()
        assert strat["manuscrit"][0]["engine"] == "pero"
        assert strat["imprimΓ©"][0]["engine"] == "tesseract"
        # The homogeneity score must reflect Tesseract's large gap between
        # strata (0.02 vs 0.30 = 0.28).
        h = b.corpus_homogeneity()
        # corpus_homogeneity() may return None; fail with a clear assertion
        # instead of a TypeError on the subscripts below.
        assert h is not None
        assert h["leader"] == "tesseract"
        assert h["max_inter_strata_gap"] == pytest.approx(0.28, abs=1e-9)