Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Picarones / tests /reports /test_sprint43_calibration_html.py

Claude

feat(sprint-S8): cohérence finale — renames test dirs, /metrics endpoint, SBOM workflow

43478ec unverified about 2 months ago

13.4 kB

	"""Tests Sprint 43 — section calibration dans le rapport HTML.

	Couvre :

	1. ``build_calibration_summary_html`` rend le tableau résumé (ECE/MCE
	par moteur).
	2. ``build_reliability_diagram_svg`` rend un SVG avec les barres
	d'accuracy par bin, les points (avg_conf, accuracy) et la diagonale.
	3. ``build_reliability_diagrams_grid_html`` génère un SVG par moteur
	ayant ``aggregated_calibration``, dans une grille.
	4. Masquage adaptatif : les fonctions retournent ``""`` si aucun
	moteur n'a de ``aggregated_calibration`` (cas par défaut tant que
	les engines n'exposent pas leurs confidences natives).
	5. Anti-injection : un nom de moteur avec balises HTML est
	échappé.
	6. Intégration ReportGenerator : la section apparaît quand au
	moins un moteur a ``aggregated_calibration``, est omise sinon.
	7. i18n FR/EN : les clés sont présentes et utilisées.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	import pytest

	from picarones.evaluation.synthetic import generate_sample_benchmark
	from picarones.reports.html.renderers.calibration import (
	build_calibration_summary_html,
	build_reliability_diagram_svg,
	build_reliability_diagrams_grid_html,
	)
	from picarones.reports.html.generator import ReportGenerator


	# ──────────────────────────────────────────────────────────────────────────
	# Fixtures
	# ──────────────────────────────────────────────────────────────────────────


	def _make_calibration(
	ece: float = 0.05, mce: float = 0.12,
	n_predictions: int = 1000, n_bins: int = 10,
	) -> dict:
	"""Calibration agrégée réaliste avec quelques bins peuplés."""
	bins = []
	for k in range(n_bins):
	if k >= 5:
	bins.append({
	"bin_low": k / n_bins, "bin_high": (k + 1) / n_bins,
	"avg_confidence": (k + 0.5) / n_bins,
	"accuracy": max(0, (k - 1) / n_bins),
	"count": n_predictions // 5,
	"gap": 0.1,
	})
	else:
	bins.append({
	"bin_low": k / n_bins, "bin_high": (k + 1) / n_bins,
	"avg_confidence": None, "accuracy": None,
	"count": 0, "gap": None,
	})
	return {
	"ece": ece, "mce": mce, "n_bins": n_bins,
	"n_predictions": n_predictions,
	"overall_accuracy": 0.78, "overall_confidence": 0.82,
	"doc_count": 50, "bins": bins,
	}


	def _engine_with_calibration(name: str = "tess", **kwargs) -> dict:
	    """Return an engine entry carrying a calibration payload.

	    ``kwargs`` are forwarded unchanged to ``_make_calibration``.
	    """
	    calibration = _make_calibration(**kwargs)
	    return dict(name=name, aggregated_calibration=calibration)


	def _engine_without_calibration(name: str = "no_cal") -> dict:
	return {"name": name, "aggregated_calibration": None}


	# ──────────────────────────────────────────────────────────────────────────
	# 1. Résumé
	# ──────────────────────────────────────────────────────────────────────────


	class TestSummaryTable:
	    """Checks on the HTML returned by ``build_calibration_summary_html``."""

	    def test_renders_row_per_engine(self) -> None:
	        # Distinctive names are required: a single letter such as "a" or "b"
	        # is already a substring of the markup ("calibration-summary",
	        # "table"), so asserting on it would pass even with no engine row.
	        engines = [
	            _engine_with_calibration("engine_alpha"),
	            _engine_with_calibration("engine_beta"),
	        ]
	        html = build_calibration_summary_html(engines)
	        assert "calibration-summary" in html
	        assert "engine_alpha" in html
	        assert "engine_beta" in html
	        # ECE and MCE are rendered as percentages.
	        assert "5.00 %" in html  # ECE 0.05
	        assert "12.00 %" in html  # MCE 0.12

	    def test_includes_overall_accuracy_and_confidence(self) -> None:
	        html = build_calibration_summary_html([_engine_with_calibration("x")])
	        assert "78.0 %" in html  # accuracy
	        assert "82.0 %" in html  # confidence

	    def test_n_predictions_formatted_with_thousand_sep(self) -> None:
	        html = build_calibration_summary_html(
	            [_engine_with_calibration("x", n_predictions=12345)],
	        )
	        # Accept a plain space, a NO-BREAK SPACE or a NARROW NO-BREAK SPACE as
	        # thousands separator (French convention), or no separator at all.
	        # Escapes keep the otherwise invisible characters explicit.
	        separators = (" ", "\u00a0", "\u202f", "")
	        assert any(f"12{sep}345" in html for sep in separators)

	    def test_engine_without_calibration_omitted(self) -> None:
	        engines = [
	            _engine_with_calibration("engine_alpha"),
	            _engine_without_calibration("engine_skipped"),
	        ]
	        html = build_calibration_summary_html(engines)
	        assert "<td" in html
	        assert "engine_alpha" in html
	        # The engine without calibration must not get a row. Its name is
	        # unique, so this check does not depend on the exact cell markup.
	        assert "engine_skipped" not in html


	# ──────────────────────────────────────────────────────────────────────────
	# 2. SVG reliability diagram
	# ──────────────────────────────────────────────────────────────────────────


	class TestReliabilityDiagramSvg:
	    """Checks on the SVG returned by ``build_reliability_diagram_svg``."""

	    def test_returns_svg_with_bars_and_diagonal(self) -> None:
	        rendered = build_reliability_diagram_svg(_make_calibration())
	        bar_count = rendered.count("<rect")
	        point_count = rendered.count("<circle")
	        assert "<svg" in rendered
	        # At least one bar per non-empty bin (the fixture populates 5 bins).
	        assert bar_count >= 5
	        # The dashed diagonal.
	        assert "stroke-dasharray" in rendered
	        # At least one point per non-empty bin.
	        assert point_count >= 5

	    def test_returns_empty_when_no_data(self) -> None:
	        # A single bin whose count is 0, i.e. nothing to draw.
	        empty_bin = {
	            "bin_low": 0, "bin_high": 1,
	            "avg_confidence": None, "accuracy": None,
	            "count": 0, "gap": None,
	        }
	        for payload in (None, {}, {"bins": [empty_bin]}):
	            assert build_reliability_diagram_svg(payload) == ""

	    def test_axis_labels_via_i18n(self) -> None:
	        custom = dict(
	            reliability_x_axis="CUSTOM_X",
	            reliability_y_axis="CUSTOM_Y",
	        )
	        rendered = build_reliability_diagram_svg(
	            _make_calibration(), labels=custom,
	        )
	        for expected in ("CUSTOM_X", "CUSTOM_Y"):
	            assert expected in rendered


	# ──────────────────────────────────────────────────────────────────────────
	# 3. Grille de reliability diagrams
	# ──────────────────────────────────────────────────────────────────────────


	class TestGrid:
	    """Checks on the HTML returned by ``build_reliability_diagrams_grid_html``."""

	    def test_one_svg_per_engine(self) -> None:
	        engines = [_engine_with_calibration(label) for label in ("a", "b", "c")]
	        grid = build_reliability_diagrams_grid_html(engines)
	        assert grid.count("<svg") == 3

	    def test_engine_without_calibration_omitted_from_grid(self) -> None:
	        calibrated = _engine_with_calibration("a")
	        uncalibrated = _engine_without_calibration("b")
	        grid = build_reliability_diagrams_grid_html([calibrated, uncalibrated])
	        # Exactly one SVG, the one for "a".
	        assert grid.count("<svg") == 1


	# ──────────────────────────────────────────────────────────────────────────
	# 4. Masquage adaptatif
	# ──────────────────────────────────────────────────────────────────────────


	class TestAdaptiveMasking:
	    """Both builders must return ``""`` when no engine carries calibration."""

	    def test_summary_empty_when_no_engine_has_calibration(self) -> None:
	        for engines in ([], [_engine_without_calibration("a")]):
	            assert build_calibration_summary_html(engines) == ""

	    def test_grid_empty_when_no_engine_has_calibration(self) -> None:
	        for engines in ([], [_engine_without_calibration("a")]):
	            assert build_reliability_diagrams_grid_html(engines) == ""


	# ──────────────────────────────────────────────────────────────────────────
	# 5. Anti-injection
	# ──────────────────────────────────────────────────────────────────────────


	class TestAntiInjection:
	    """Engine names containing HTML tags must be escaped in the output."""

	    def test_engine_name_escaped_in_summary(self) -> None:
	        engine = _engine_with_calibration("<script>alert(1)</script>")
	        html = build_calibration_summary_html([engine])
	        # The raw tag must never reach the output...
	        assert "<script>" not in html
	        # ...while its entity-escaped form must be present, which shows the
	        # name was rendered (escaped) rather than silently dropped.
	        assert "&lt;script&gt;" in html

	    def test_engine_name_escaped_in_grid(self) -> None:
	        engine = _engine_with_calibration("<img src=x>")
	        html = build_reliability_diagrams_grid_html([engine])
	        # Same pair of checks for the grid: no raw tag, escaped form present.
	        assert "<img src=x>" not in html
	        assert "&lt;img" in html


	# ──────────────────────────────────────────────────────────────────────────
	# 6. Intégration ReportGenerator
	# ──────────────────────────────────────────────────────────────────────────


	class TestReportIntegration:
	    """End-to-end checks: generate a report file and inspect its HTML."""

	    @staticmethod
	    def _calibrated_benchmark():
	        """Return the sample benchmark with calibration set on its first engine."""
	        bench = generate_sample_benchmark()
	        bench.engine_reports[0].aggregated_calibration = _make_calibration()
	        return bench

	    @staticmethod
	    def _render(generator, destination: Path) -> str:
	        """Write the report to ``destination`` and return its UTF-8 text."""
	        generator.generate(destination)
	        return destination.read_text(encoding="utf-8")

	    def test_section_absent_when_no_calibration(self, tmp_path: Path) -> None:
	        bench = generate_sample_benchmark()
	        # Precondition: the sample benchmark carries no calibration at all.
	        assert all(
	            report.aggregated_calibration is None
	            for report in bench.engine_reports
	        )

	        page = self._render(ReportGenerator(bench), tmp_path / "report.html")
	        for marker in ("calibration-summary", "reliability-diagrams-grid"):
	            assert marker not in page

	    def test_section_present_when_at_least_one_engine_has_calibration(
	        self, tmp_path: Path,
	    ) -> None:
	        bench = self._calibrated_benchmark()

	        page = self._render(ReportGenerator(bench), tmp_path / "report.html")
	        for marker in ("calibration-summary", "reliability-diagrams-grid"):
	            assert marker in page
	        assert "5.00 %" in page  # ECE

	    def test_french_locale_uses_french_labels(self, tmp_path: Path) -> None:
	        bench = self._calibrated_benchmark()

	        page = self._render(
	            ReportGenerator(bench, lang="fr"), tmp_path / "report_fr.html",
	        )
	        assert "Diagramme de fiabilité" in page
	        assert "Précision moyenne" in page

	    def test_english_locale_uses_english_labels(self, tmp_path: Path) -> None:
	        bench = self._calibrated_benchmark()

	        page = self._render(
	            ReportGenerator(bench, lang="en"), tmp_path / "report_en.html",
	        )
	        assert "Reliability diagram" in page
	        assert "Mean accuracy" in page


	# ──────────────────────────────────────────────────────────────────────────
	# 7. i18n FR/EN
	# ──────────────────────────────────────────────────────────────────────────


	# Translation keys that every locale file must define with a non-empty value.
	# Order is significant: it drives the order of the parametrized test cases.
	REQUIRED_KEYS = (
	    # Section heading and note.
	    "h_calibration", "calibration_note",
	    # Keys prefixed ``calibration_``: caption, then the ``*_label`` entries.
	    "calibration_summary_caption",
	    "calibration_engine_label", "calibration_ece_label",
	    "calibration_mce_label", "calibration_n_label",
	    "calibration_acc_label", "calibration_conf_label",
	    "calibration_docs_label",
	    # Keys prefixed ``reliability_``: diagram title and axis names.
	    "reliability_diagram_title",
	    "reliability_x_axis", "reliability_y_axis",
	)


	class TestI18NCompleteness:
	    """Every key in ``REQUIRED_KEYS`` must exist, non-empty, in each locale file."""

	    @pytest.mark.parametrize("lang", ["fr", "en"])
	    @pytest.mark.parametrize("key", REQUIRED_KEYS)
	    def test_key_present(self, lang: str, key: str) -> None:
	        # Three levels above this test file, then into the i18n directory.
	        base_dir = Path(__file__).parent.parent.parent
	        i18n_dir = base_dir / "picarones" / "reports" / "i18n"
	        locale_file = i18n_dir / f"{lang}.json"
	        data = json.loads(locale_file.read_text(encoding="utf-8"))
	        assert key in data, f"Clé {key!r} manquante dans {lang}.json"
	        assert data[key].strip(), f"Clé {key!r} vide dans {lang}.json"