Spaces:
Running
Running
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified | """Tests Sprint 44 — médiane par défaut + détecteur d'asymétrie. | |
| Couvre : | |
| 1. ``EngineReport.median_cer`` lit ``aggregated_metrics["cer"]["median"]``. | |
| 2. ``BenchmarkResult.ranking()`` : | |
| - inclut ``median_cer`` dans chaque entrée | |
| - trie sur la médiane par défaut (et non plus la moyenne) | |
| - retombe sur la moyenne si la médiane est absente | |
| 3. Détecteur ``MEDIAN_MEAN_GAP_WARNING`` : | |
| - se déclenche quand le ratio ``|moyenne - médiane| / médiane > 30%`` | |
| - ne se déclenche pas quand symétrique | |
| - ne se déclenche pas si la médiane est nulle (corpus parfait) | |
| - importance HIGH si gap relatif ≥ 100 % | |
| 4. Anti-hallucination : chaque nombre rendu est dans le payload. | |
| 5. Rétrocompat : les consommateurs qui lisent ``mean_cer`` continuent | |
| à fonctionner. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import pytest | |
| from picarones.measurements.metrics import MetricsResult | |
| from picarones.measurements.narrative.detectors import detect_median_mean_gap_warning | |
| from picarones.core.facts import FactImportance, FactType | |
| from picarones.measurements.narrative.renderer import extract_numbers, render_fact | |
| from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # Helpers | |
| # ────────────────────────────────────────────────────────────────────────── | |
def _make_dr(cer: float, doc_id: str = "d") -> DocumentResult:
    """Build a minimal ``DocumentResult`` whose error rates all equal *cer*.

    Args:
        cer: Value assigned to every error-rate field of the metrics
            (``cer``, ``cer_nfc``, ``cer_caseless``, ``wer``,
            ``wer_normalized``, ``mer`` and ``wil``).
        doc_id: Identifier given to the document result.

    Returns:
        A ``DocumentResult`` with placeholder text, image path and duration.
    """
    rate_fields = (
        "cer", "cer_nfc", "cer_caseless",
        "wer", "wer_normalized", "mer", "wil",
    )
    # Every rate shares the same value; only the lengths are fixed constants.
    metrics = MetricsResult(
        **dict.fromkeys(rate_fields, cer),
        reference_length=1,
        hypothesis_length=1,
    )
    return DocumentResult(
        doc_id=doc_id,
        image_path="/tmp/x.png",
        ground_truth="x",
        hypothesis="x",
        metrics=metrics,
        duration_seconds=0.1,
    )
def _make_engine_report(name: str, cers: list[float]) -> EngineReport:
    """Build an ``EngineReport`` holding one document result per CER value.

    Args:
        name: Engine name stored in the report.
        cers: CER values; the document at position ``i`` gets id ``d{i}``.

    Returns:
        An ``EngineReport`` with version ``"1"`` and an empty configuration.
    """
    documents = [
        _make_dr(value, doc_id=f"d{index}")
        for index, value in enumerate(cers)
    ]
    return EngineReport(
        engine_name=name,
        engine_version="1",
        engine_config={},
        document_results=documents,
    )
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. EngineReport.median_cer | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestMedianCerProperty:
    """Checks on the ``EngineReport.median_cer`` property."""

    def test_returns_median_from_aggregated(self) -> None:
        """The property reports the median of the per-document CER values."""
        # Three zeros and two ones: the middle value, hence the median, is 0.
        cers = [0.0, 0.0, 0.0, 1.0, 1.0]
        report = _make_engine_report("e", cers)
        assert report.median_cer == pytest.approx(0.0)

    def test_returns_none_when_no_docs(self) -> None:
        """A report without any document result exposes no median."""
        empty_report = EngineReport(
            engine_name="e",
            engine_version="1",
            engine_config={},
            document_results=[],
        )
        # No documents: nothing to aggregate, so the property must be None.
        assert empty_report.median_cer is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. ranking() — tri par médiane | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestRankingByMedian:
    """Checks that ``BenchmarkResult.ranking()`` orders engines by median CER."""

    def test_includes_median_cer(self) -> None:
        """Each ranking entry carries a ``median_cer`` value."""
        report = _make_engine_report("a", [0.1, 0.2, 0.3])
        benchmark = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=3,
            engine_reports=[report],
        )
        ranked = benchmark.ranking()
        assert "median_cer" in ranked[0]
        assert ranked[0]["median_cer"] == pytest.approx(0.2)

    def test_sorts_by_median_not_mean(self) -> None:
        """An engine with a better median wins despite a worse mean."""
        # Engine A: 80 % of docs at 0.03 and 20 % at 0.40
        #           -> mean ~ 0.10, median = 0.03.
        # Engine B: 100 % of docs at 0.05 -> mean = median = 0.05.
        # Sorting by mean would put B first; sorting by median puts A first.
        skewed = _make_engine_report("A_asymmetric", [0.03] * 8 + [0.40] * 2)
        steady = _make_engine_report("B_steady", [0.05] * 10)
        benchmark = BenchmarkResult(
            corpus_name="c",
            corpus_source=None,
            document_count=10,
            engine_reports=[skewed, steady],
        )
        ranked = benchmark.ranking()
        # Engine A must win on the median even though its mean is worse.
        assert ranked[0]["engine"] == "A_asymmetric"
        assert ranked[0]["mean_cer"] > ranked[1]["mean_cer"]
        assert ranked[0]["median_cer"] < ranked[1]["median_cer"]

    def test_falls_back_to_mean_when_median_missing(self) -> None:
        """When ``median_cer`` is None, sorting falls back to ``mean_cer``.

        This reproduces a sort key mirroring the one described for
        ``BenchmarkResult.ranking()`` and validates its logic on synthetic
        entries. Per the original author's note, such entries cannot be
        produced through real ``EngineReport`` objects because the
        aggregation always yields a median once there is at least one
        document.
        """
        entries = [
            {"engine": "x", "mean_cer": 0.10, "median_cer": None,
             "mean_wer": 0.0, "documents": 1, "failed": 0},
            {"engine": "y", "mean_cer": 0.05, "median_cer": None,
             "mean_wer": 0.0, "documents": 1, "failed": 0},
        ]

        def _sort_key(entry: dict) -> tuple:
            # Prefer the median; use the mean only when the median is missing.
            primary = entry.get("median_cer")
            if primary is None:
                primary = entry.get("mean_cer")
            # Entries with no usable value sort last.
            if primary is None:
                return (True, float("inf"))
            return (False, primary)

        ordered = sorted(entries, key=_sort_key)
        # y (mean = 0.05) must come before x (mean = 0.10).
        assert ordered[0]["engine"] == "y"
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. Détecteur MEDIAN_MEAN_GAP_WARNING | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestMedianMeanGapDetector:
    """Checks on ``detect_median_mean_gap_warning``.

    The relative gap discussed in the comments below is
    ``|mean - median| / median``; the thresholds quoted (30 % to trigger,
    100 % for HIGH importance) are the ones stated by the original tests.
    """

    @staticmethod
    def _single_engine_data(median_cer: float, mean_cer: float) -> dict:
        """Return detector input holding one ranking entry for engine ``tess``."""
        return {"ranking": [{
            "engine": "tess", "median_cer": median_cer, "mean_cer": mean_cer,
            "documents": 100,
        }]}

    def test_no_fact_when_distribution_symmetric(self) -> None:
        """A 10 % relative gap stays below the threshold: no fact."""
        data = self._single_engine_data(median_cer=0.05, mean_cer=0.055)
        assert detect_median_mean_gap_warning(data) == []

    def test_emits_fact_when_asymmetric(self) -> None:
        """A ~133 % relative gap yields exactly one HIGH-importance fact."""
        data = self._single_engine_data(median_cer=0.03, mean_cer=0.07)
        facts = detect_median_mean_gap_warning(data)
        assert len(facts) == 1
        assert facts[0].type is FactType.MEDIAN_MEAN_GAP_WARNING
        assert facts[0].importance is FactImportance.HIGH  # gap >= 100 %
        assert facts[0].payload["engine"] == "tess"

    def test_medium_importance_when_moderate_gap(self) -> None:
        """A 50 % relative gap (above threshold, below 100 %) is MEDIUM."""
        data = self._single_engine_data(median_cer=0.05, mean_cer=0.075)
        facts = detect_median_mean_gap_warning(data)
        # Check the list first so a silent detector fails with a clear
        # assertion rather than an IndexError on facts[0].
        assert len(facts) == 1
        assert facts[0].importance is FactImportance.MEDIUM

    def test_no_fact_when_median_zero(self) -> None:
        """Zero median: the ratio cannot be computed, so nothing is emitted."""
        data = self._single_engine_data(median_cer=0.0, mean_cer=0.05)
        assert detect_median_mean_gap_warning(data) == []

    def test_no_fact_when_no_ranking(self) -> None:
        """Missing, empty or value-less rankings produce no fact."""
        assert detect_median_mean_gap_warning({}) == []
        assert detect_median_mean_gap_warning({"ranking": []}) == []
        assert detect_median_mean_gap_warning({"ranking": [{
            "engine": "x", "mean_cer": None, "median_cer": None,
        }]}) == []
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. Traçabilité anti-hallucination | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestTraceability:
    """Every number in a rendered sentence must be traceable to the payload."""

    # ``lang`` is parametrized here so the test does not depend on a fixture
    # defined elsewhere; the values match the languages checked by
    # ``test_template_has_no_hardcoded_numbers`` below.
    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Numbers extracted from the rendered fact all appear in its payload."""
        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        facts = detect_median_mean_gap_warning(data)
        # Fail with a clear assertion instead of an IndexError if the
        # detector emits nothing.
        assert facts, "detector emitted no fact for an asymmetric distribution"
        fact = facts[0]
        sentence = render_fact(fact, lang)

        # Whitelist: no template constant is expected here.
        whitelist: set[str] = set()

        # Collect the textual forms under which payload numbers may appear.
        payload_nums: set[str] = set()
        for value in fact.payload.values():
            if isinstance(value, (int, float)):
                payload_nums.add(str(value))
            # A whole-valued float may also appear without its decimal part.
            if isinstance(value, float) and value.is_integer():
                payload_nums.add(str(int(value)))

        allowed = payload_nums | whitelist
        for num in extract_numbers(sentence):
            # Accept a decimal comma by normalising it to a dot.
            normalized = num.replace(",", ".")
            assert normalized in allowed, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {fact.payload!r}"
            )

    def test_template_has_no_hardcoded_numbers(self) -> None:
        """The ``median_mean_gap_warning`` templates contain no literal digits."""
        from picarones.measurements.narrative.renderer import _load_templates

        for lang in ("fr", "en"):
            tpl = _load_templates(lang).get("median_mean_gap_warning", "")
            assert tpl, f"Template absent pour {lang}"
            # Strip ``{placeholder}`` fields before looking for digits.
            cleaned = re.sub(r"\{[^}]+\}", "", tpl)
            digits = re.findall(r"\d", cleaned)
            assert not digits, f"Template {lang} contient des chiffres en dur : {digits}"
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. Intégration via build_synthesis | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestSynthesisIntegration:
    """The gap detector is wired into the registry and ``build_synthesis``."""

    def test_detector_registered_by_default(self) -> None:
        """``MEDIAN_MEAN_GAP_WARNING`` is among the registered fact types."""
        from picarones.measurements.narrative.registry import iter_detectors

        types = {entry.fact_type for entry in iter_detectors()}
        assert FactType.MEDIAN_MEAN_GAP_WARNING in types

    def test_synthesis_includes_warning_when_asymmetric(self) -> None:
        """An asymmetric ranking produces a French sentence about the gap."""
        from picarones.measurements.narrative import build_synthesis

        data = {"ranking": [{
            "engine": "tess", "median_cer": 0.03, "mean_cer": 0.07,
            "documents": 100,
        }]}
        out = build_synthesis(data, lang="fr", max_facts=5)
        sentences = out["sentences"]
        # At least one sentence must mention the asymmetry. The keywords are
        # written as proper UTF-8 French; mis-encoded variants could never
        # match correctly encoded output.
        lowered = [s.lower() for s in sentences]
        assert any(
            "asymétrique" in s or "médiane" in s
            for s in lowered
        )