"""Tests Sprint 42 — exposition des token_confidences + câblage runner. Le runner peut maintenant calculer des métriques de calibration (ECE / MCE / reliability) dès qu'un moteur expose des ``token_confidences`` sur l'``EngineResult``. Couvre : 1. ``EngineResult.token_confidences`` accepte ``None`` (rétrocompat stricte) ou une liste de dicts. 2. ``DocumentResult.calibration_metrics`` est sérialisé via ``as_dict`` uniquement quand renseigné, libéré par ``compact()``. 3. ``EngineReport.aggregated_calibration`` apparaît dans ``as_dict`` quand renseigné. 4. ``_calibration_from_engine_result`` : - Aligne en bag-of-words avec multiplicité (proxy oracle) - Normalise les confidences en pourcentage (>1) à [0, 1] - Ignore les confidences négatives (Tesseract -1 pour non-mots) - Retourne ``None`` sur entrée vide / ``None`` 5. ``_aggregate_calibration`` : - Combine les bins de plusieurs documents en somme pondérée - Recalcule ECE/MCE micro à partir des sommes - Retourne ``None`` si aucun doc n'a de calibration 6. Rétrocompat : sans token_confidences sur l'EngineResult, aucun calcul calibration ; ``aggregated_calibration = None``. """ from __future__ import annotations import pytest from picarones.measurements.runner import ( _aggregate_calibration, _calibration_from_engine_result, ) from picarones.core.results import DocumentResult, EngineReport from picarones.engines.base import EngineResult # ────────────────────────────────────────────────────────────────────────── # 1. EngineResult.token_confidences # ────────────────────────────────────────────────────────────────────────── class TestEngineResultExtension: def test_default_is_none(self) -> None: r = EngineResult("e", "/tmp/x.png", "hello", 1.0) assert r.token_confidences is None def test_accepts_list_of_dicts(self) -> None: confs = [{"token": "hello", "confidence": 0.95}] r = EngineResult("e", "/tmp/x.png", "hello", 1.0, token_confidences=confs) assert r.token_confidences == confs # ────────────────────────────────────────────────────────────────────────── # 2-3. Modèles : sérialisation et compact # ────────────────────────────────────────────────────────────────────────── def _make_dr(calibration_metrics: dict | None = None) -> DocumentResult: from picarones.measurements.metrics import MetricsResult return DocumentResult( doc_id="d1", image_path="/tmp/x.png", ground_truth="a b c", hypothesis="a b c", metrics=MetricsResult( cer=0.0, cer_nfc=0.0, cer_caseless=0.0, wer=0.0, wer_normalized=0.0, mer=0.0, wil=0.0, reference_length=5, hypothesis_length=5, ), duration_seconds=0.1, calibration_metrics=calibration_metrics, ) class TestModelsSerialization: def test_calibration_metrics_omitted_when_none(self) -> None: d = _make_dr(None).as_dict() assert "calibration_metrics" not in d def test_calibration_metrics_present_when_set(self) -> None: d = _make_dr({"ece": 0.05, "mce": 0.1}).as_dict() assert d["calibration_metrics"] == {"ece": 0.05, "mce": 0.1} def test_compact_clears_calibration(self) -> None: dr = _make_dr({"ece": 0.05}) dr.compact() assert dr.calibration_metrics is None def test_engine_report_aggregated_calibration_omitted_when_none(self) -> None: rep = EngineReport( engine_name="t", engine_version="1", engine_config={}, document_results=[_make_dr()], ) assert "aggregated_calibration" not in rep.as_dict() def test_engine_report_aggregated_calibration_included_when_set(self) -> None: rep = EngineReport( engine_name="t", engine_version="1", engine_config={}, document_results=[_make_dr()], aggregated_calibration={"ece": 0.05, "n_predictions": 100}, ) assert rep.as_dict()["aggregated_calibration"] == { "ece": 0.05, "n_predictions": 100, } # ────────────────────────────────────────────────────────────────────────── # 4. Helper d'alignement # ────────────────────────────────────────────────────────────────────────── class TestCalibrationFromEngineResult: def test_returns_none_for_empty_inputs(self) -> None: assert _calibration_from_engine_result("text", None) is None assert _calibration_from_engine_result("text", []) is None def test_perfect_calibration_when_conf_matches_accuracy(self) -> None: gt = "a b c d e f g h i j" # 7 tokens dans la GT à conf=0.7, 3 hors de la GT à conf=0.7 → ECE = 0 tcs = ( [{"token": c, "confidence": 0.7} for c in "abcdefg"] + [{"token": c, "confidence": 0.7} for c in ["X", "Y", "Z"]] ) m = _calibration_from_engine_result(gt, tcs) assert m is not None assert m["ece"] == pytest.approx(0.0, abs=1e-9) assert m["overall_accuracy"] == pytest.approx(0.7) assert m["n_predictions"] == 10 def test_normalizes_percentage_confidences(self) -> None: """Conf > 1 est interprétée en pourcentage et divisée par 100.""" m = _calibration_from_engine_result( "hello", [{"token": "hello", "confidence": 95.0}], ) assert m is not None # 95/100 = 0.95 assert m["overall_confidence"] == 0.95 def test_skips_negative_confidences(self) -> None: """Tesseract met -1 pour les non-mots ; on les ignore.""" m = _calibration_from_engine_result( "hello", [ {"token": "hello", "confidence": 0.9}, {"token": ".", "confidence": -1.0}, ], ) assert m is not None assert m["n_predictions"] == 1 def test_bag_of_words_with_multiplicity(self) -> None: # GT contient deux 'le'. L'hypothèse en a trois → 2 corrects, 1 incorrect. gt = "le chat le chien" tcs = [ {"token": "le", "confidence": 0.9}, {"token": "le", "confidence": 0.9}, {"token": "le", "confidence": 0.9}, # 3e 'le' : pas dans la GT {"token": "chat", "confidence": 0.9}, {"token": "chien", "confidence": 0.9}, ] m = _calibration_from_engine_result(gt, tcs) # 4 corrects sur 5 assert m["overall_accuracy"] == 0.8 assert m["n_predictions"] == 5 def test_skips_invalid_entries(self) -> None: m = _calibration_from_engine_result( "hello", [ "not a dict", {"no_token": True, "confidence": 0.5}, {"token": "hello"}, # pas de confidence {"token": "hello", "confidence": "abc"}, # conf non numérique {"token": "hello", "confidence": 0.9}, # valide ], ) assert m is not None assert m["n_predictions"] == 1 # ────────────────────────────────────────────────────────────────────────── # 5. Agrégateur # ────────────────────────────────────────────────────────────────────────── class TestAggregateCalibration: def test_returns_none_when_no_doc_has_calibration(self) -> None: drs = [_make_dr(None), _make_dr(None)] assert _aggregate_calibration(drs) is None def test_combines_bins_across_docs(self) -> None: # Doc 1 : bin [0.5, 0.6) avec 10 prédictions, conf=0.55, acc=0.5 # Doc 2 : bin [0.5, 0.6) avec 20 prédictions, conf=0.55, acc=0.7 # Agrégat attendu : 30 prédictions dans ce bin, conf moy = 0.55, # acc moy pondérée = (10*0.5 + 20*0.7) / 30 = 19/30 ≈ 0.633 empty_bin = lambda lo, hi: { # noqa: E731 "bin_low": lo, "bin_high": hi, "avg_confidence": None, "accuracy": None, "count": 0, "gap": None, } bins1 = [empty_bin(k / 10, (k + 1) / 10) for k in range(10)] bins1[5] = { "bin_low": 0.5, "bin_high": 0.6, "avg_confidence": 0.55, "accuracy": 0.5, "count": 10, "gap": 0.05, } m1 = { "ece": 0.05, "mce": 0.05, "n_bins": 10, "n_predictions": 10, "overall_accuracy": 0.5, "overall_confidence": 0.55, "bins": bins1, } bins2 = [empty_bin(k / 10, (k + 1) / 10) for k in range(10)] bins2[5] = { "bin_low": 0.5, "bin_high": 0.6, "avg_confidence": 0.55, "accuracy": 0.7, "count": 20, "gap": 0.15, } m2 = { "ece": 0.15, "mce": 0.15, "n_bins": 10, "n_predictions": 20, "overall_accuracy": 0.7, "overall_confidence": 0.55, "bins": bins2, } drs = [_make_dr(m1), _make_dr(m2)] agg = _aggregate_calibration(drs) assert agg is not None assert agg["n_predictions"] == 30 assert agg["doc_count"] == 2 # Accuracy combinée = (10*0.5 + 20*0.7) / 30 assert agg["overall_accuracy"] == (10 * 0.5 + 20 * 0.7) / 30 # Confidence combinée = 0.55 (constante) assert abs(agg["overall_confidence"] - 0.55) < 1e-9 # ECE micro : seul bin non vide (bin 5), avec count=30, # avg_conf=0.55, accuracy=19/30 ≈ 0.633, gap = |0.55 - 0.633| expected_ece = abs(0.55 - 19 / 30) assert abs(agg["ece"] - expected_ece) < 1e-9 assert agg["mce"] == agg["ece"] # un seul bin non vide → MCE = ECE # ────────────────────────────────────────────────────────────────────────── # 6. Rétrocompat : sans token_confidences, rien ne change # ────────────────────────────────────────────────────────────────────────── class TestBackwardCompat: def test_engine_result_default_no_calibration(self) -> None: # Un EngineResult sans token_confidences → calibration_metrics # ne doit pas être calculée. from picarones.measurements.runner import _compute_document_result ocr = EngineResult( engine_name="e", image_path="/tmp/x.png", text="a b c", duration_seconds=0.1, token_confidences=None, ) dr = _compute_document_result( doc_id="d1", image_path="/tmp/x.png", ground_truth="a b c", ocr_result=ocr, char_exclude=None, ) assert dr.calibration_metrics is None def test_engine_result_with_confs_triggers_calibration(self) -> None: from picarones.measurements.runner import _compute_document_result ocr = EngineResult( engine_name="e", image_path="/tmp/x.png", text="a b c", duration_seconds=0.1, token_confidences=[ {"token": "a", "confidence": 0.9}, {"token": "b", "confidence": 0.9}, {"token": "c", "confidence": 0.9}, ], ) dr = _compute_document_result( doc_id="d1", image_path="/tmp/x.png", ground_truth="a b c", ocr_result=ocr, char_exclude=None, ) assert dr.calibration_metrics is not None # 3 tokens, tous corrects, conf 0.9 → accuracy = 1, conf = 0.9 assert dr.calibration_metrics["overall_accuracy"] == 1.0 assert dr.calibration_metrics["overall_confidence"] == 0.9