Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

File size: 12,919 Bytes

"""Tests Sprint 42 — exposition des token_confidences + câblage runner.

Le runner peut maintenant calculer des métriques de calibration
(ECE / MCE / reliability) dès qu'un moteur expose des
``token_confidences`` sur l'``EngineResult``.

Couvre :

1. ``EngineResult.token_confidences`` accepte ``None`` (rétrocompat
   stricte) ou une liste de dicts.
2. ``DocumentResult.calibration_metrics`` est sérialisé via ``as_dict``
   uniquement quand renseigné, libéré par ``compact(drop_analyses=True)``
   (``compact()`` étant opt-in depuis le Sprint A14-S1).
3. ``EngineReport.aggregated_calibration`` apparaît dans ``as_dict``
   quand renseigné.
4. ``_calibration_from_engine_result`` :
   - Aligne en bag-of-words avec multiplicité (proxy oracle)
   - Normalise les confidences en pourcentage (>1) à [0, 1]
   - Ignore les confidences négatives (Tesseract -1 pour non-mots)
   - Retourne ``None`` sur entrée vide / ``None``
5. ``_aggregate_calibration`` :
   - Combine les bins de plusieurs documents en somme pondérée
   - Recalcule ECE/MCE micro à partir des sommes
   - Retourne ``None`` si aucun doc n'a de calibration
6. Rétrocompat : sans token_confidences sur l'EngineResult, aucun
   calcul calibration ; ``aggregated_calibration = None``.
"""

from __future__ import annotations

import pytest

from picarones.measurements.runner import (
    _aggregate_calibration,
    _calibration_from_engine_result,
)
from picarones.core.results import DocumentResult, EngineReport
from picarones.engines.base import EngineResult


# ──────────────────────────────────────────────────────────────────────────
# 1. EngineResult.token_confidences
# ──────────────────────────────────────────────────────────────────────────


class TestEngineResultExtension:
    """``EngineResult`` carries an optional ``token_confidences`` attribute."""

    def test_default_is_none(self) -> None:
        """Omitting ``token_confidences`` leaves the attribute at ``None``."""
        result = EngineResult("e", "/tmp/x.png", "hello", 1.0)
        assert result.token_confidences is None

    def test_accepts_list_of_dicts(self) -> None:
        """A list of dicts given at construction compares equal on read-back."""
        expected = [{"token": "hello", "confidence": 0.95}]
        result = EngineResult(
            "e", "/tmp/x.png", "hello", 1.0,
            token_confidences=expected,
        )
        assert result.token_confidences == expected


# ──────────────────────────────────────────────────────────────────────────
# 2-3. Modèles : sérialisation et compact
# ──────────────────────────────────────────────────────────────────────────


def _make_dr(calibration_metrics: dict | None = None) -> DocumentResult:
    """Build a ``DocumentResult`` fixture with all-zero error metrics.

    Ground truth and hypothesis are both ``"a b c"``; the only thing that
    varies between callers is ``calibration_metrics``, which is forwarded
    unchanged to the ``DocumentResult`` constructor.
    """
    from picarones.measurements.metrics import MetricsResult

    zero_error_metrics = MetricsResult(
        cer=0.0,
        cer_nfc=0.0,
        cer_caseless=0.0,
        wer=0.0,
        wer_normalized=0.0,
        mer=0.0,
        wil=0.0,
        reference_length=5,
        hypothesis_length=5,
    )
    return DocumentResult(
        doc_id="d1",
        image_path="/tmp/x.png",
        ground_truth="a b c",
        hypothesis="a b c",
        metrics=zero_error_metrics,
        duration_seconds=0.1,
        calibration_metrics=calibration_metrics,
    )


class TestModelsSerialization:
    """Serialization of the calibration fields on the result models."""

    def test_calibration_metrics_omitted_when_none(self) -> None:
        """``as_dict`` has no ``calibration_metrics`` key when the field is ``None``."""
        serialized = _make_dr(None).as_dict()
        assert "calibration_metrics" not in serialized

    def test_calibration_metrics_present_when_set(self) -> None:
        """``as_dict`` carries the calibration dict when one was provided."""
        expected = {"ece": 0.05, "mce": 0.1}
        # Pass a copy so the comparison is against an untouched reference.
        serialized = _make_dr(dict(expected)).as_dict()
        assert serialized["calibration_metrics"] == expected

    def test_compact_clears_calibration(self) -> None:
        """``compact(drop_analyses=True)`` resets ``calibration_metrics`` to ``None``."""
        # Sprint A14-S1 — ``compact()`` is now opt-in.
        doc_result = _make_dr({"ece": 0.05})
        doc_result.compact(drop_analyses=True)
        assert doc_result.calibration_metrics is None

    def test_engine_report_aggregated_calibration_omitted_when_none(self) -> None:
        """``EngineReport.as_dict`` omits ``aggregated_calibration`` when unset."""
        report = EngineReport(
            engine_name="t",
            engine_version="1",
            engine_config={},
            document_results=[_make_dr()],
        )
        serialized = report.as_dict()
        assert "aggregated_calibration" not in serialized

    def test_engine_report_aggregated_calibration_included_when_set(self) -> None:
        """``EngineReport.as_dict`` includes ``aggregated_calibration`` when set."""
        report = EngineReport(
            engine_name="t",
            engine_version="1",
            engine_config={},
            document_results=[_make_dr()],
            aggregated_calibration={"ece": 0.05, "n_predictions": 100},
        )
        serialized = report.as_dict()
        assert serialized["aggregated_calibration"] == {
            "ece": 0.05,
            "n_predictions": 100,
        }


# ──────────────────────────────────────────────────────────────────────────
# 4. Helper d'alignement
# ──────────────────────────────────────────────────────────────────────────


class TestCalibrationFromEngineResult:
    """Behaviour of ``_calibration_from_engine_result``.

    The helper is called as ``(ground_truth, token_confidences)``.
    Float-valued metrics are compared with ``pytest.approx`` so that the
    tests pin the expected value without depending on the exact order of
    floating-point operations inside the helper.
    """

    def test_returns_none_for_empty_inputs(self) -> None:
        """``None`` or an empty list of token confidences yields ``None``."""
        assert _calibration_from_engine_result("text", None) is None
        assert _calibration_from_engine_result("text", []) is None

    def test_perfect_calibration_when_conf_matches_accuracy(self) -> None:
        """Confidence 0.7 with 7 of 10 tokens found in the GT gives ECE = 0."""
        gt = "a b c d e f g h i j"
        # 7 tokens present in the GT at conf=0.7, 3 absent from the GT at
        # conf=0.7 → ECE = 0
        tcs = (
            [{"token": c, "confidence": 0.7} for c in "abcdefg"]
            + [{"token": c, "confidence": 0.7} for c in ["X", "Y", "Z"]]
        )
        m = _calibration_from_engine_result(gt, tcs)
        assert m is not None
        assert m["ece"] == pytest.approx(0.0, abs=1e-9)
        assert m["overall_accuracy"] == pytest.approx(0.7)
        assert m["n_predictions"] == 10

    def test_normalizes_percentage_confidences(self) -> None:
        """A confidence > 1 is read as a percentage and divided by 100."""
        m = _calibration_from_engine_result(
            "hello", [{"token": "hello", "confidence": 95.0}],
        )
        assert m is not None
        # 95/100 = 0.95 — tolerance-based so the check does not depend on
        # how the normalisation is carried out in floating point.
        assert m["overall_confidence"] == pytest.approx(0.95)

    def test_skips_negative_confidences(self) -> None:
        """Tesseract reports -1 for non-words; such entries are ignored."""
        m = _calibration_from_engine_result(
            "hello", [
                {"token": "hello", "confidence": 0.9},
                {"token": ".", "confidence": -1.0},
            ],
        )
        assert m is not None
        assert m["n_predictions"] == 1

    def test_bag_of_words_with_multiplicity(self) -> None:
        """Matching against the GT respects how many times a word occurs."""
        # The GT contains two 'le'. The hypothesis has three → 2 correct,
        # 1 incorrect.
        gt = "le chat le chien"
        tcs = [
            {"token": "le", "confidence": 0.9},
            {"token": "le", "confidence": 0.9},
            {"token": "le", "confidence": 0.9},  # 3rd 'le': not in the GT
            {"token": "chat", "confidence": 0.9},
            {"token": "chien", "confidence": 0.9},
        ]
        m = _calibration_from_engine_result(gt, tcs)
        # Fail with a clear assertion rather than a TypeError on subscript
        # if the helper unexpectedly returns None.
        assert m is not None
        # 4 correct out of 5
        assert m["overall_accuracy"] == pytest.approx(0.8)
        assert m["n_predictions"] == 5

    def test_skips_invalid_entries(self) -> None:
        """Malformed entries are dropped; only the valid one is counted."""
        m = _calibration_from_engine_result(
            "hello", [
                "not a dict",
                {"no_token": True, "confidence": 0.5},
                {"token": "hello"},  # no confidence
                {"token": "hello", "confidence": "abc"},  # non-numeric conf
                {"token": "hello", "confidence": 0.9},  # valid
            ],
        )
        assert m is not None
        assert m["n_predictions"] == 1


# ──────────────────────────────────────────────────────────────────────────
# 5. Agrégateur
# ──────────────────────────────────────────────────────────────────────────


class TestAggregateCalibration:
    """Behaviour of ``_aggregate_calibration`` over ``DocumentResult`` lists."""

    @staticmethod
    def _single_bin_metrics(count: int, accuracy: float, gap: float) -> dict:
        """Build per-document metrics whose only populated bin is [0.5, 0.6).

        Ten bins cover [0, 1] in steps of 0.1. Bin 5 holds ``count``
        predictions at average confidence 0.55 with the given ``accuracy``
        and ``gap``; every other bin is empty. The fixture sets both
        ``ece`` and ``mce`` to ``gap``.
        """
        bins = [
            {
                "bin_low": k / 10, "bin_high": (k + 1) / 10,
                "avg_confidence": None, "accuracy": None,
                "count": 0, "gap": None,
            }
            for k in range(10)
        ]
        bins[5] = {
            "bin_low": 0.5, "bin_high": 0.6,
            "avg_confidence": 0.55, "accuracy": accuracy,
            "count": count, "gap": gap,
        }
        return {
            "ece": gap, "mce": gap, "n_bins": 10, "n_predictions": count,
            "overall_accuracy": accuracy, "overall_confidence": 0.55,
            "bins": bins,
        }

    def test_returns_none_when_no_doc_has_calibration(self) -> None:
        """No document carrying calibration metrics → aggregate is ``None``."""
        drs = [_make_dr(None), _make_dr(None)]
        assert _aggregate_calibration(drs) is None

    def test_combines_bins_across_docs(self) -> None:
        """Bins from several documents are merged as count-weighted sums."""
        # Doc 1: bin [0.5, 0.6) with 10 predictions, conf=0.55, acc=0.5
        # Doc 2: bin [0.5, 0.6) with 20 predictions, conf=0.55, acc=0.7
        # Expected aggregate: 30 predictions in that bin, mean conf = 0.55,
        # weighted mean acc = (10*0.5 + 20*0.7) / 30 = 19/30 ≈ 0.633
        m1 = self._single_bin_metrics(count=10, accuracy=0.5, gap=0.05)
        m2 = self._single_bin_metrics(count=20, accuracy=0.7, gap=0.15)
        agg = _aggregate_calibration([_make_dr(m1), _make_dr(m2)])
        assert agg is not None
        assert agg["n_predictions"] == 30
        assert agg["doc_count"] == 2
        # Combined accuracy = (10*0.5 + 20*0.7) / 30. Compared with a
        # tolerance: the aggregator may sum in a different order than this
        # expression, which can change the last bit of the result.
        expected_accuracy = (10 * 0.5 + 20 * 0.7) / 30
        assert agg["overall_accuracy"] == pytest.approx(
            expected_accuracy, abs=1e-9,
        )
        # Combined confidence = 0.55 (constant across both documents)
        assert agg["overall_confidence"] == pytest.approx(0.55, abs=1e-9)
        # Micro ECE: the only non-empty bin (bin 5) has count=30,
        # avg_conf=0.55, accuracy=19/30 ≈ 0.633, gap = |0.55 - 0.633|
        expected_ece = abs(0.55 - 19 / 30)
        assert agg["ece"] == pytest.approx(expected_ece, abs=1e-9)
        # A single non-empty bin → MCE = ECE
        assert agg["mce"] == pytest.approx(agg["ece"], abs=1e-9)


# ──────────────────────────────────────────────────────────────────────────
# 6. Rétrocompat : sans token_confidences, rien ne change
# ──────────────────────────────────────────────────────────────────────────


class TestBackwardCompat:
    """Calibration is computed only when the engine exposes token confidences."""

    @staticmethod
    def _document_result_for(token_confidences: list[dict] | None):
        """Run ``_compute_document_result`` on a fixed ``"a b c"`` document.

        Both the OCR text and the ground truth are ``"a b c"``; only
        ``token_confidences`` on the ``EngineResult`` varies between tests.
        """
        from picarones.measurements.runner import _compute_document_result

        ocr = EngineResult(
            engine_name="e",
            image_path="/tmp/x.png",
            text="a b c",
            duration_seconds=0.1,
            token_confidences=token_confidences,
        )
        return _compute_document_result(
            doc_id="d1", image_path="/tmp/x.png",
            ground_truth="a b c",
            ocr_result=ocr,
            char_exclude=None,
        )

    def test_engine_result_default_no_calibration(self) -> None:
        """Without token confidences, no calibration metrics are produced."""
        dr = self._document_result_for(None)
        assert dr.calibration_metrics is None

    def test_engine_result_with_confs_triggers_calibration(self) -> None:
        """With token confidences, calibration metrics are attached."""
        dr = self._document_result_for([
            {"token": "a", "confidence": 0.9},
            {"token": "b", "confidence": 0.9},
            {"token": "c", "confidence": 0.9},
        ])
        assert dr.calibration_metrics is not None
        # 3 tokens, all correct, conf 0.9 → accuracy = 1, conf = 0.9.
        # Compared with a tolerance because both values are averages
        # computed in floating point.
        assert dr.calibration_metrics["overall_accuracy"] == pytest.approx(1.0)
        assert dr.calibration_metrics["overall_confidence"] == pytest.approx(0.9)