Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Picarones / tests /measurements /test_sprint42_calibration_runner.py

Claude

fix(security,metrics): Sprint A14-S1 — boucher les 6 P0 du rewrite ciblé

a2bea75 unverified about 2 months ago

12.9 kB

	"""Tests Sprint 42 — exposition des token_confidences + câblage runner.

	Le runner peut maintenant calculer des métriques de calibration
	(ECE / MCE / reliability) dès qu'un moteur expose des
	``token_confidences`` sur l'``EngineResult``.

	Couvre :

	1. ``EngineResult.token_confidences`` accepte ``None`` (rétrocompat
	stricte) ou une liste de dicts.
	2. ``DocumentResult.calibration_metrics`` est sérialisé via ``as_dict``
	uniquement quand renseigné, libéré par ``compact()``.
	3. ``EngineReport.aggregated_calibration`` apparaît dans ``as_dict``
	quand renseigné.
	4. ``_calibration_from_engine_result`` :
	- Aligne en bag-of-words avec multiplicité (proxy oracle)
	- Normalise les confidences en pourcentage (>1) à [0, 1]
	- Ignore les confidences négatives (Tesseract -1 pour non-mots)
	- Retourne ``None`` sur entrée vide / ``None``
	5. ``_aggregate_calibration`` :
	- Combine les bins de plusieurs documents en somme pondérée
	- Recalcule ECE/MCE micro à partir des sommes
	- Retourne ``None`` si aucun doc n'a de calibration
	6. Rétrocompat : sans token_confidences sur l'EngineResult, aucun
	calcul calibration ; ``aggregated_calibration = None``.
	"""

	from __future__ import annotations

	import pytest

	from picarones.measurements.runner import (
	_aggregate_calibration,
	_calibration_from_engine_result,
	)
	from picarones.core.results import DocumentResult, EngineReport
	from picarones.engines.base import EngineResult


	# ──────────────────────────────────────────────────────────────────────────
	# 1. EngineResult.token_confidences
	# ──────────────────────────────────────────────────────────────────────────


	class TestEngineResultExtension:
	    """Checks on the optional ``token_confidences`` field of ``EngineResult``."""

	    def test_default_is_none(self) -> None:
	        """Built without the keyword, ``token_confidences`` is ``None``."""
	        result = EngineResult("e", "/tmp/x.png", "hello", 1.0)
	        assert result.token_confidences is None

	    def test_accepts_list_of_dicts(self) -> None:
	        """A list of dicts passed as ``token_confidences`` compares equal on read."""
	        expected = [{"token": "hello", "confidence": 0.95}]
	        result = EngineResult(
	            "e", "/tmp/x.png", "hello", 1.0,
	            token_confidences=expected,
	        )
	        assert result.token_confidences == expected


	# ──────────────────────────────────────────────────────────────────────────
	# 2-3. Modèles : sérialisation et compact
	# ──────────────────────────────────────────────────────────────────────────


	def _make_dr(calibration_metrics: dict | None = None) -> DocumentResult:
	    """Build a minimal ``DocumentResult`` fixture with all-zero error metrics.

	    Args:
	        calibration_metrics: Value passed straight through to the
	            ``calibration_metrics`` argument of ``DocumentResult``.
	            Defaults to ``None``.

	    Returns:
	        A ``DocumentResult`` for document ``"d1"`` whose ground truth and
	        hypothesis are both ``"a b c"``.
	    """
	    # Function-scope import: resolved only when the helper is called.
	    from picarones.measurements.metrics import MetricsResult

	    return DocumentResult(
	        doc_id="d1", image_path="/tmp/x.png",
	        ground_truth="a b c", hypothesis="a b c",
	        metrics=MetricsResult(
	            cer=0.0, cer_nfc=0.0, cer_caseless=0.0,
	            wer=0.0, wer_normalized=0.0, mer=0.0, wil=0.0,
	            reference_length=5, hypothesis_length=5,
	        ),
	        duration_seconds=0.1,
	        calibration_metrics=calibration_metrics,
	    )


	class TestModelsSerialization:
	    """``as_dict`` / ``compact`` handling of the calibration fields."""

	    def test_calibration_metrics_omitted_when_none(self) -> None:
	        """No ``calibration_metrics`` key is emitted when the field is ``None``."""
	        serialized = _make_dr(None).as_dict()
	        assert "calibration_metrics" not in serialized

	    def test_calibration_metrics_present_when_set(self) -> None:
	        """A populated ``calibration_metrics`` dict is emitted as-is."""
	        doc = _make_dr({"ece": 0.05, "mce": 0.1})
	        serialized = doc.as_dict()
	        assert serialized["calibration_metrics"] == {"ece": 0.05, "mce": 0.1}

	    def test_compact_clears_calibration(self) -> None:
	        """``compact(drop_analyses=True)`` resets ``calibration_metrics`` to ``None``."""
	        # Sprint A14-S1: ``compact()`` is now opt-in.
	        doc = _make_dr({"ece": 0.05})
	        doc.compact(drop_analyses=True)
	        assert doc.calibration_metrics is None

	    def test_engine_report_aggregated_calibration_omitted_when_none(self) -> None:
	        """``aggregated_calibration`` is absent from ``as_dict`` when not supplied."""
	        report = EngineReport(
	            engine_name="t",
	            engine_version="1",
	            engine_config={},
	            document_results=[_make_dr()],
	        )
	        serialized = report.as_dict()
	        assert "aggregated_calibration" not in serialized

	    def test_engine_report_aggregated_calibration_included_when_set(self) -> None:
	        """``aggregated_calibration`` is emitted unchanged when supplied."""
	        report = EngineReport(
	            engine_name="t",
	            engine_version="1",
	            engine_config={},
	            document_results=[_make_dr()],
	            aggregated_calibration={"ece": 0.05, "n_predictions": 100},
	        )
	        serialized = report.as_dict()
	        assert serialized["aggregated_calibration"] == {
	            "ece": 0.05,
	            "n_predictions": 100,
	        }


	# ──────────────────────────────────────────────────────────────────────────
	# 4. Helper d'alignement
	# ──────────────────────────────────────────────────────────────────────────


	class TestCalibrationFromEngineResult:
	    """Behaviour of ``_calibration_from_engine_result`` on hand-built inputs.

	    Float-valued results are compared with ``pytest.approx`` throughout so the
	    tests do not depend on the exact order of floating-point operations in
	    the helper.
	    """

	    def test_returns_none_for_empty_inputs(self) -> None:
	        """``None`` and an empty confidence list both yield ``None``."""
	        assert _calibration_from_engine_result("text", None) is None
	        assert _calibration_from_engine_result("text", []) is None

	    def test_perfect_calibration_when_conf_matches_accuracy(self) -> None:
	        """Ten tokens at confidence 0.7, seven of them in the GT, give ECE 0."""
	        gt = "a b c d e f g h i j"
	        # 7 tokens present in the GT at conf=0.7, 3 absent from the GT at
	        # conf=0.7 → ECE = 0
	        tcs = (
	            [{"token": c, "confidence": 0.7} for c in "abcdefg"]
	            + [{"token": c, "confidence": 0.7} for c in ["X", "Y", "Z"]]
	        )
	        m = _calibration_from_engine_result(gt, tcs)
	        assert m is not None
	        assert m["ece"] == pytest.approx(0.0, abs=1e-9)
	        assert m["overall_accuracy"] == pytest.approx(0.7)
	        assert m["n_predictions"] == 10

	    def test_normalizes_percentage_confidences(self) -> None:
	        """A confidence > 1 is read as a percentage and divided by 100."""
	        m = _calibration_from_engine_result(
	            "hello", [{"token": "hello", "confidence": 95.0}],
	        )
	        assert m is not None
	        # 95/100 = 0.95
	        assert m["overall_confidence"] == pytest.approx(0.95)

	    def test_skips_negative_confidences(self) -> None:
	        """Tesseract reports -1 for non-words; those entries are ignored."""
	        m = _calibration_from_engine_result(
	            "hello", [
	                {"token": "hello", "confidence": 0.9},
	                {"token": ".", "confidence": -1.0},
	            ],
	        )
	        assert m is not None
	        assert m["n_predictions"] == 1

	    def test_bag_of_words_with_multiplicity(self) -> None:
	        """Tokens match the GT as a multiset: a surplus repeat counts as wrong."""
	        # The GT holds two 'le'; the hypothesis has three → 2 correct, 1 incorrect.
	        gt = "le chat le chien"
	        tcs = [
	            {"token": "le", "confidence": 0.9},
	            {"token": "le", "confidence": 0.9},
	            {"token": "le", "confidence": 0.9},  # third 'le': not in the GT
	            {"token": "chat", "confidence": 0.9},
	            {"token": "chien", "confidence": 0.9},
	        ]
	        m = _calibration_from_engine_result(gt, tcs)
	        # Guard first so a ``None`` result fails as an assertion, not a TypeError.
	        assert m is not None
	        # 4 correct out of 5
	        assert m["overall_accuracy"] == pytest.approx(0.8)
	        assert m["n_predictions"] == 5

	    def test_skips_invalid_entries(self) -> None:
	        """Malformed entries are dropped; only the single valid one is counted."""
	        m = _calibration_from_engine_result(
	            "hello", [
	                "not a dict",
	                {"no_token": True, "confidence": 0.5},
	                {"token": "hello"},  # no confidence
	                {"token": "hello", "confidence": "abc"},  # non-numeric confidence
	                {"token": "hello", "confidence": 0.9},  # valid
	            ],
	        )
	        assert m is not None
	        assert m["n_predictions"] == 1


	# ──────────────────────────────────────────────────────────────────────────
	# 5. Agrégateur
	# ──────────────────────────────────────────────────────────────────────────


	class TestAggregateCalibration:
	    """Behaviour of ``_aggregate_calibration`` over lists of ``DocumentResult``."""

	    def test_returns_none_when_no_doc_has_calibration(self) -> None:
	        """Documents without calibration metrics aggregate to ``None``."""
	        drs = [_make_dr(None), _make_dr(None)]
	        assert _aggregate_calibration(drs) is None

	    def test_combines_bins_across_docs(self) -> None:
	        """Two documents filling the same bin are merged by prediction count."""
	        # Doc 1: bin [0.5, 0.6) with 10 predictions, conf=0.55, acc=0.5
	        # Doc 2: bin [0.5, 0.6) with 20 predictions, conf=0.55, acc=0.7
	        # Expected aggregate: 30 predictions in that bin, mean conf = 0.55,
	        # weighted mean acc = (10*0.5 + 20*0.7) / 30 = 19/30 ≈ 0.633

	        def empty_bin(lo: float, hi: float) -> dict:
	            """Return a bin record with no predictions in ``[lo, hi)``."""
	            return {
	                "bin_low": lo, "bin_high": hi,
	                "avg_confidence": None, "accuracy": None,
	                "count": 0, "gap": None,
	            }

	        bins1 = [empty_bin(k / 10, (k + 1) / 10) for k in range(10)]
	        bins1[5] = {
	            "bin_low": 0.5, "bin_high": 0.6,
	            "avg_confidence": 0.55, "accuracy": 0.5,
	            "count": 10, "gap": 0.05,
	        }
	        m1 = {
	            "ece": 0.05, "mce": 0.05, "n_bins": 10, "n_predictions": 10,
	            "overall_accuracy": 0.5, "overall_confidence": 0.55, "bins": bins1,
	        }
	        bins2 = [empty_bin(k / 10, (k + 1) / 10) for k in range(10)]
	        bins2[5] = {
	            "bin_low": 0.5, "bin_high": 0.6,
	            "avg_confidence": 0.55, "accuracy": 0.7,
	            "count": 20, "gap": 0.15,
	        }
	        m2 = {
	            "ece": 0.15, "mce": 0.15, "n_bins": 10, "n_predictions": 20,
	            "overall_accuracy": 0.7, "overall_confidence": 0.55, "bins": bins2,
	        }
	        drs = [_make_dr(m1), _make_dr(m2)]
	        agg = _aggregate_calibration(drs)
	        assert agg is not None
	        assert agg["n_predictions"] == 30
	        assert agg["doc_count"] == 2
	        # Combined accuracy = (10*0.5 + 20*0.7) / 30. Compared with the same
	        # 1e-9 tolerance as the checks below rather than exact float equality.
	        expected_accuracy = (10 * 0.5 + 20 * 0.7) / 30
	        assert abs(agg["overall_accuracy"] - expected_accuracy) < 1e-9
	        # Combined confidence = 0.55 (constant)
	        assert abs(agg["overall_confidence"] - 0.55) < 1e-9
	        # Micro ECE: the only non-empty bin (bin 5) has count=30,
	        # avg_conf=0.55, accuracy=19/30 ≈ 0.633, gap = |0.55 - 0.633|
	        expected_ece = abs(0.55 - 19 / 30)
	        assert abs(agg["ece"] - expected_ece) < 1e-9
	        # A single non-empty bin → MCE = ECE (up to rounding)
	        assert abs(agg["mce"] - agg["ece"]) < 1e-9


	# ──────────────────────────────────────────────────────────────────────────
	# 6. Rétrocompat : sans token_confidences, rien ne change
	# ──────────────────────────────────────────────────────────────────────────


	class TestBackwardCompat:
	    """``_compute_document_result`` with and without ``token_confidences``."""

	    @staticmethod
	    def _document_result(token_confidences: list[dict] | None):
	        """Run ``_compute_document_result`` on a fixed ``"a b c"`` document.

	        Args:
	            token_confidences: Value given to ``EngineResult.token_confidences``.

	        Returns:
	            Whatever ``_compute_document_result`` returns for that engine result.
	        """
	        # Function-scope import: resolved only when a test calls the helper.
	        from picarones.measurements.runner import _compute_document_result

	        ocr = EngineResult(
	            engine_name="e",
	            image_path="/tmp/x.png",
	            text="a b c",
	            duration_seconds=0.1,
	            token_confidences=token_confidences,
	        )
	        return _compute_document_result(
	            doc_id="d1", image_path="/tmp/x.png",
	            ground_truth="a b c",
	            ocr_result=ocr,
	            char_exclude=None,
	        )

	    def test_engine_result_default_no_calibration(self) -> None:
	        """Without token confidences, no calibration metrics are attached."""
	        # An EngineResult without token_confidences → calibration_metrics
	        # must not be computed.
	        dr = self._document_result(None)
	        assert dr.calibration_metrics is None

	    def test_engine_result_with_confs_triggers_calibration(self) -> None:
	        """With token confidences, calibration metrics are computed."""
	        dr = self._document_result(
	            [
	                {"token": "a", "confidence": 0.9},
	                {"token": "b", "confidence": 0.9},
	                {"token": "c", "confidence": 0.9},
	            ],
	        )
	        assert dr.calibration_metrics is not None
	        # 3 tokens, all correct, conf 0.9 → accuracy = 1, conf = 0.9.
	        # Compared with a tolerance: both values are computed floats.
	        assert abs(dr.calibration_metrics["overall_accuracy"] - 1.0) < 1e-9
	        assert abs(dr.calibration_metrics["overall_confidence"] - 0.9) < 1e-9