# Picarones / tests / reports / test_sprint43_calibration_html.py
# Claude
# feat(sprint-S8): cohérence finale — renames test dirs, /metrics endpoint, SBOM workflow
# 43478ec unverified
# Raw
# History Blame
# 13.4 kB
"""Tests Sprint 43 — section calibration dans le rapport HTML.

Couvre :

1. ``build_calibration_summary_html`` rend le tableau résumé (ECE/MCE
   par moteur).
2. ``build_reliability_diagram_svg`` rend un SVG avec les barres
   d'accuracy par bin, les points (avg_conf, accuracy) et la diagonale.
3. ``build_reliability_diagrams_grid_html`` génère un SVG par moteur
   ayant ``aggregated_calibration``, dans une grille.
4. **Masquage adaptatif** : les fonctions retournent ``""`` si aucun
   moteur n'a de ``aggregated_calibration`` (cas par défaut tant que
   les engines n'exposent pas leurs confidences natives).
5. **Anti-injection** : un nom de moteur avec balises HTML est
   échappé.
6. **Intégration ReportGenerator** : la section apparaît quand au
   moins un moteur a ``aggregated_calibration``, est omise sinon.
7. **i18n FR/EN** : les clés sont présentes et utilisées.
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from picarones.evaluation.synthetic import generate_sample_benchmark
from picarones.reports.html.renderers.calibration import (
build_calibration_summary_html,
build_reliability_diagram_svg,
build_reliability_diagrams_grid_html,
)
from picarones.reports.html.generator import ReportGenerator
# ──────────────────────────────────────────────────────────────────────────
# Fixtures
# ──────────────────────────────────────────────────────────────────────────
def _make_calibration(
    ece: float = 0.05, mce: float = 0.12,
    n_predictions: int = 1000, n_bins: int = 10,
) -> dict:
    """Build a plausible aggregated-calibration payload for the tests.

    The confidence range ``[0, 1]`` is split into ``n_bins`` equal bins. The
    lower half is left empty (``count == 0`` and ``None`` statistics) and the
    upper half is populated, with ``n_predictions`` shared evenly between the
    populated bins.

    Args:
        ece: Value stored under ``"ece"``.
        mce: Value stored under ``"mce"``.
        n_predictions: Total number of predictions to spread over the
            populated bins; also stored under ``"n_predictions"``.
        n_bins: Number of bins to generate.

    Returns:
        A dict with the summary fields (``ece``, ``mce``, ``n_bins``,
        ``n_predictions``, ``overall_accuracy``, ``overall_confidence``,
        ``doc_count``) and the list of per-bin dicts under ``"bins"``.
    """
    # Derive the populated range from n_bins instead of hard-coding "k >= 5"
    # and "// 5": those constants were only coherent for n_bins == 10. The
    # default call still yields bins 5..9 with n_predictions // 5 each.
    first_populated = n_bins // 2
    n_populated = n_bins - first_populated
    per_bin_count = n_predictions // n_populated if n_populated else 0

    bins = []
    for k in range(n_bins):
        low, high = k / n_bins, (k + 1) / n_bins
        if k >= first_populated:
            bins.append({
                "bin_low": low, "bin_high": high,
                "avg_confidence": (k + 0.5) / n_bins,
                # Float floor so the value keeps a single type when clamped.
                "accuracy": max(0.0, (k - 1) / n_bins),
                "count": per_bin_count,
                "gap": 0.1,
            })
        else:
            bins.append({
                "bin_low": low, "bin_high": high,
                "avg_confidence": None, "accuracy": None,
                "count": 0, "gap": None,
            })
    return {
        "ece": ece, "mce": mce, "n_bins": n_bins,
        "n_predictions": n_predictions,
        "overall_accuracy": 0.78, "overall_confidence": 0.82,
        "doc_count": 50, "bins": bins,
    }
def _engine_with_calibration(name: str = "tess", **kwargs) -> dict:
    """Return an engine dict carrying a calibration from ``_make_calibration``.

    Extra keyword arguments are forwarded unchanged to ``_make_calibration``.
    """
    calibration = _make_calibration(**kwargs)
    return {"name": name, "aggregated_calibration": calibration}
def _engine_without_calibration(name: str = "no_cal") -> dict:
    """Return an engine dict whose ``aggregated_calibration`` is ``None``."""
    engine: dict = {"name": name}
    engine["aggregated_calibration"] = None
    return engine
# ──────────────────────────────────────────────────────────────────────────
# 1. Résumé
# ──────────────────────────────────────────────────────────────────────────
class TestSummaryTable:
    """Checks on the HTML returned by ``build_calibration_summary_html``."""

    def test_renders_row_per_engine(self) -> None:
        # Distinctive names: a one-letter name such as "a" or "b" also occurs
        # inside the "calibration-summary" marker asserted below, which would
        # make the name checks pass even if no engine name were rendered.
        engines = [
            _engine_with_calibration("engine_alpha"),
            _engine_with_calibration("engine_beta"),
        ]
        html = build_calibration_summary_html(engines)
        assert "calibration-summary" in html
        assert "engine_alpha" in html
        assert "engine_beta" in html
        # ECE and MCE are rendered as percentages.
        assert "5.00 %" in html  # ECE 0.05
        assert "12.00 %" in html  # MCE 0.12

    def test_includes_overall_accuracy_and_confidence(self) -> None:
        html = build_calibration_summary_html([_engine_with_calibration("x")])
        assert "78.0 %" in html  # accuracy
        assert "82.0 %" in html  # confidence

    def test_n_predictions_formatted_with_thousand_sep(self) -> None:
        html = build_calibration_summary_html(
            [_engine_with_calibration("x", n_predictions=12345)],
        )
        # Accept no separator, a plain space, a no-break space (U+00A0) or a
        # narrow no-break space (U+202F). The separators are spelled as
        # escapes so the expectation does not hinge on an invisible character
        # in this source file.
        separators = ("", " ", "\u00a0", "\u202f")
        assert any(f"12{sep}345" in html for sep in separators)

    def test_engine_without_calibration_omitted(self) -> None:
        engines = [
            _engine_with_calibration("engine_alpha"),
            _engine_without_calibration("engine_beta"),
        ]
        html = build_calibration_summary_html(engines)
        assert "<td" in html
        assert "engine_alpha" in html
        # With a distinctive name the omission can be checked exactly: the
        # engine without calibration must not appear anywhere in the table.
        assert "engine_beta" not in html
# ──────────────────────────────────────────────────────────────────────────
# 2. SVG reliability diagram
# ──────────────────────────────────────────────────────────────────────────
class TestReliabilityDiagramSvg:
    """Checks on the SVG returned by ``build_reliability_diagram_svg``."""

    def test_returns_svg_with_bars_and_diagonal(self) -> None:
        rendered = build_reliability_diagram_svg(_make_calibration())
        bar_count = rendered.count("<rect")
        point_count = rendered.count("<circle")
        assert "<svg" in rendered
        # The fixture populates 5 bins: expect at least one bar per such bin.
        assert bar_count >= 5
        # The diagonal is drawn dashed.
        assert "stroke-dasharray" in rendered
        # Likewise at least one point per populated bin.
        assert point_count >= 5

    def test_returns_empty_when_no_data(self) -> None:
        # A single bin whose count is 0 and whose statistics are all None.
        unpopulated_bin = {
            "bin_low": 0, "bin_high": 1,
            "avg_confidence": None, "accuracy": None,
            "count": 0, "gap": None,
        }
        for payload in (None, {}, {"bins": [unpopulated_bin]}):
            assert build_reliability_diagram_svg(payload) == ""

    def test_axis_labels_via_i18n(self) -> None:
        overrides = dict(
            reliability_x_axis="CUSTOM_X",
            reliability_y_axis="CUSTOM_Y",
        )
        rendered = build_reliability_diagram_svg(
            _make_calibration(), labels=overrides,
        )
        for expected_label in ("CUSTOM_X", "CUSTOM_Y"):
            assert expected_label in rendered
# ──────────────────────────────────────────────────────────────────────────
# 3. Grille de reliability diagrams
# ──────────────────────────────────────────────────────────────────────────
class TestGrid:
    """Checks on the grid returned by ``build_reliability_diagrams_grid_html``."""

    def test_one_svg_per_engine(self) -> None:
        calibrated = [_engine_with_calibration(label) for label in ("a", "b", "c")]
        grid = build_reliability_diagrams_grid_html(calibrated)
        assert grid.count("<svg") == 3

    def test_engine_without_calibration_omitted_from_grid(self) -> None:
        with_cal = _engine_with_calibration("a")
        without_cal = _engine_without_calibration("b")
        grid = build_reliability_diagrams_grid_html([with_cal, without_cal])
        # Exactly one SVG, the one for engine "a".
        assert grid.count("<svg") == 1
# ──────────────────────────────────────────────────────────────────────────
# 4. Masquage adaptatif
# ──────────────────────────────────────────────────────────────────────────
class TestAdaptiveMasking:
    """Both builders return ``""`` when no engine carries a calibration."""

    def test_summary_empty_when_no_engine_has_calibration(self) -> None:
        nothing = build_calibration_summary_html([])
        assert nothing == ""
        uncalibrated_only = [_engine_without_calibration("a")]
        assert build_calibration_summary_html(uncalibrated_only) == ""

    def test_grid_empty_when_no_engine_has_calibration(self) -> None:
        nothing = build_reliability_diagrams_grid_html([])
        assert nothing == ""
        uncalibrated_only = [_engine_without_calibration("a")]
        assert build_reliability_diagrams_grid_html(uncalibrated_only) == ""
# ──────────────────────────────────────────────────────────────────────────
# 5. Anti-injection
# ──────────────────────────────────────────────────────────────────────────
class TestAntiInjection:
    """Engine names containing HTML tags must come out escaped."""

    def test_engine_name_escaped_in_summary(self) -> None:
        hostile_name = "<script>alert(1)</script>"
        markup = build_calibration_summary_html(
            [_engine_with_calibration(hostile_name)],
        )
        assert "<script>" not in markup
        assert "&lt;script&gt;" in markup

    def test_engine_name_escaped_in_grid(self) -> None:
        hostile_name = "<img src=x>"
        markup = build_reliability_diagrams_grid_html(
            [_engine_with_calibration(hostile_name)],
        )
        assert "<img src=x>" not in markup
        assert "&lt;img" in markup
# ──────────────────────────────────────────────────────────────────────────
# 6. Intégration ReportGenerator
# ──────────────────────────────────────────────────────────────────────────
class TestReportIntegration:
    """End-to-end checks of the calibration section via ``ReportGenerator``."""

    @staticmethod
    def _bench_with_calibration():
        """Return a sample benchmark whose first engine report is calibrated."""
        sample = generate_sample_benchmark()
        sample.engine_reports[0].aggregated_calibration = _make_calibration()
        return sample

    @staticmethod
    def _render(bench, destination: Path, **generator_kwargs) -> str:
        """Generate the report at ``destination`` and return its UTF-8 text."""
        ReportGenerator(bench, **generator_kwargs).generate(destination)
        return destination.read_text(encoding="utf-8")

    def test_section_absent_when_no_calibration(self, tmp_path: Path) -> None:
        sample = generate_sample_benchmark()
        # Precondition: no engine report of the sample carries a calibration.
        assert all(
            report.aggregated_calibration is None
            for report in sample.engine_reports
        )
        page = self._render(sample, tmp_path / "report.html")
        for marker in ("calibration-summary", "reliability-diagrams-grid"):
            assert marker not in page

    def test_section_present_when_at_least_one_engine_has_calibration(
        self, tmp_path: Path,
    ) -> None:
        page = self._render(
            self._bench_with_calibration(), tmp_path / "report.html",
        )
        assert "calibration-summary" in page
        assert "reliability-diagrams-grid" in page
        assert "5.00 %" in page  # ECE

    def test_french_locale_uses_french_labels(self, tmp_path: Path) -> None:
        page = self._render(
            self._bench_with_calibration(), tmp_path / "report_fr.html",
            lang="fr",
        )
        assert "Diagramme de fiabilité" in page
        assert "Précision moyenne" in page

    def test_english_locale_uses_english_labels(self, tmp_path: Path) -> None:
        page = self._render(
            self._bench_with_calibration(), tmp_path / "report_en.html",
            lang="en",
        )
        assert "Reliability diagram" in page
        assert "Mean accuracy" in page
# ──────────────────────────────────────────────────────────────────────────
# 7. i18n FR/EN
# ──────────────────────────────────────────────────────────────────────────
# i18n keys that TestI18NCompleteness looks up in each language JSON file
# (one parametrized test case per key and per language).
REQUIRED_KEYS = (
    "h_calibration",
    "calibration_note",
    "calibration_summary_caption",
    "calibration_engine_label",
    "calibration_ece_label",
    "calibration_mce_label",
    "calibration_n_label",
    "calibration_acc_label",
    "calibration_conf_label",
    "calibration_docs_label",
    "reliability_diagram_title",
    "reliability_x_axis",
    "reliability_y_axis",
)
class TestI18NCompleteness:
    """Every key of ``REQUIRED_KEYS`` exists and is non-blank in each locale."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    @pytest.mark.parametrize("key", REQUIRED_KEYS)
    def test_key_present(self, lang: str, key: str) -> None:
        # Three ``.parent`` hops up from this file, then into the i18n folder.
        base_dir = Path(__file__).parent.parent.parent
        locale_file = base_dir.joinpath(
            "picarones", "reports", "i18n", f"{lang}.json",
        )
        with locale_file.open(encoding="utf-8") as handle:
            translations = json.load(handle)
        assert key in translations, f"Clé {key!r} manquante dans {lang}.json"
        assert translations[key].strip(), f"Clé {key!r} vide dans {lang}.json"