Spaces:
Running
Running
Claude
feat(sprint-S8): cohérence finale — renames test dirs, /metrics endpoint, SBOM workflow
43478ec unverified | """Tests Sprint 43 — section calibration dans le rapport HTML. | |
| Couvre : | |
| 1. ``build_calibration_summary_html`` rend le tableau résumé (ECE/MCE | |
| par moteur). | |
| 2. ``build_reliability_diagram_svg`` rend un SVG avec les barres | |
| d'accuracy par bin, les points (avg_conf, accuracy) et la diagonale. | |
| 3. ``build_reliability_diagrams_grid_html`` génère un SVG par moteur | |
| ayant ``aggregated_calibration``, dans une grille. | |
| 4. **Masquage adaptatif** : les fonctions retournent ``""`` si aucun | |
| moteur n'a de ``aggregated_calibration`` (cas par défaut tant que | |
| les engines n'exposent pas leurs confidences natives). | |
| 5. **Anti-injection** : un nom de moteur avec balises HTML est | |
| échappé. | |
| 6. **Intégration ReportGenerator** : la section apparaît quand au | |
| moins un moteur a ``aggregated_calibration``, est omise sinon. | |
| 7. **i18n FR/EN** : les clés sont présentes et utilisées. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import pytest | |
| from picarones.evaluation.synthetic import generate_sample_benchmark | |
| from picarones.reports.html.renderers.calibration import ( | |
| build_calibration_summary_html, | |
| build_reliability_diagram_svg, | |
| build_reliability_diagrams_grid_html, | |
| ) | |
| from picarones.reports.html.generator import ReportGenerator | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # Fixtures | |
| # ────────────────────────────────────────────────────────────────────────── | |
| def _make_calibration( | |
| ece: float = 0.05, mce: float = 0.12, | |
| n_predictions: int = 1000, n_bins: int = 10, | |
| ) -> dict: | |
| """Calibration agrégée réaliste avec quelques bins peuplés.""" | |
| bins = [] | |
| for k in range(n_bins): | |
| if k >= 5: | |
| bins.append({ | |
| "bin_low": k / n_bins, "bin_high": (k + 1) / n_bins, | |
| "avg_confidence": (k + 0.5) / n_bins, | |
| "accuracy": max(0, (k - 1) / n_bins), | |
| "count": n_predictions // 5, | |
| "gap": 0.1, | |
| }) | |
| else: | |
| bins.append({ | |
| "bin_low": k / n_bins, "bin_high": (k + 1) / n_bins, | |
| "avg_confidence": None, "accuracy": None, | |
| "count": 0, "gap": None, | |
| }) | |
| return { | |
| "ece": ece, "mce": mce, "n_bins": n_bins, | |
| "n_predictions": n_predictions, | |
| "overall_accuracy": 0.78, "overall_confidence": 0.82, | |
| "doc_count": 50, "bins": bins, | |
| } | |
def _engine_with_calibration(name: str = "tess", **kwargs) -> dict:
    """Return an engine dict whose calibration comes from ``_make_calibration``.

    Extra keyword arguments are forwarded unchanged to ``_make_calibration``.
    """
    calibration = _make_calibration(**kwargs)
    return {"name": name, "aggregated_calibration": calibration}
| def _engine_without_calibration(name: str = "no_cal") -> dict: | |
| return {"name": name, "aggregated_calibration": None} | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. Résumé | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestSummaryTable:
    """Checks on the per-engine calibration summary table."""

    def test_renders_row_per_engine(self) -> None:
        """Each engine with calibration data shows up, with ECE/MCE as percentages."""
        # Distinctive names: a one-letter name such as "a" is a substring of
        # the "calibration-summary" marker itself, which would make the
        # presence checks pass even if no row were rendered.
        engines = [
            _engine_with_calibration("engine_alpha"),
            _engine_with_calibration("engine_beta"),
        ]
        html = build_calibration_summary_html(engines)
        assert "calibration-summary" in html
        assert "engine_alpha" in html
        assert "engine_beta" in html
        # ECE and MCE are rendered as percentages.
        assert "5.00 %" in html  # ECE 0.05
        assert "12.00 %" in html  # MCE 0.12

    def test_includes_overall_accuracy_and_confidence(self) -> None:
        """Overall accuracy and confidence are rendered as percentages."""
        html = build_calibration_summary_html([_engine_with_calibration("x")])
        assert "78.0 %" in html  # accuracy
        assert "82.0 %" in html  # confidence

    def test_n_predictions_formatted_with_thousand_sep(self) -> None:
        """The prediction count is rendered, with or without a thousands separator."""
        html = build_calibration_summary_html(
            [_engine_with_calibration("x", n_predictions=12345)],
        )
        # French convention groups thousands with a (non-breaking) space.
        # Accept no separator, a plain space, NO-BREAK SPACE, NARROW NO-BREAK
        # SPACE, or the HTML entity form.
        separators = ("", " ", "\u00a0", "\u202f", "&nbsp;")
        assert any(f"12{sep}345" in html for sep in separators)

    def test_engine_without_calibration_omitted(self) -> None:
        """An engine lacking calibration data gets no row in the table."""
        engines = [
            _engine_with_calibration("engine_alpha"),
            _engine_without_calibration("engine_beta"),
        ]
        html = build_calibration_summary_html(engines)
        assert "engine_alpha" in html
        assert "<td" in html
        # With a distinctive name the omission can be checked directly,
        # independent of the exact cell markup.
        assert "engine_beta" not in html
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. SVG reliability diagram | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestReliabilityDiagramSvg:
    """Checks on the single-engine reliability diagram SVG."""

    def test_returns_svg_with_bars_and_diagonal(self) -> None:
        """The SVG contains bars, points and a dashed diagonal."""
        populated_bins = 5  # the fixture fills 5 of its 10 default bins
        rendered = build_reliability_diagram_svg(_make_calibration())
        assert "<svg" in rendered
        # At least one bar per non-empty bin.
        assert rendered.count("<rect") >= populated_bins
        # The diagonal is drawn dashed.
        assert "stroke-dasharray" in rendered
        # At least one point per non-empty bin.
        assert rendered.count("<circle") >= populated_bins

    def test_returns_empty_when_no_data(self) -> None:
        """Missing, empty, or all-zero-count calibration yields an empty string."""
        assert build_reliability_diagram_svg(None) == ""
        assert build_reliability_diagram_svg({}) == ""
        # Every bin has count = 0.
        unpopulated = {
            "bin_low": 0, "bin_high": 1,
            "avg_confidence": None, "accuracy": None,
            "count": 0, "gap": None,
        }
        assert build_reliability_diagram_svg({"bins": [unpopulated]}) == ""

    def test_axis_labels_via_i18n(self) -> None:
        """Axis labels passed via ``labels`` appear in the SVG."""
        custom_labels = {
            "reliability_x_axis": "CUSTOM_X",
            "reliability_y_axis": "CUSTOM_Y",
        }
        rendered = build_reliability_diagram_svg(
            _make_calibration(), labels=custom_labels,
        )
        assert "CUSTOM_X" in rendered
        assert "CUSTOM_Y" in rendered
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. Grille de reliability diagrams | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestGrid:
    """Checks on the grid of per-engine reliability diagrams."""

    def test_one_svg_per_engine(self) -> None:
        """Three calibrated engines produce three SVG elements."""
        calibrated = [_engine_with_calibration(label) for label in ("a", "b", "c")]
        grid = build_reliability_diagrams_grid_html(calibrated)
        assert grid.count("<svg") == 3

    def test_engine_without_calibration_omitted_from_grid(self) -> None:
        """An engine lacking calibration data contributes no SVG."""
        mixed = [
            _engine_with_calibration("a"),
            _engine_without_calibration("b"),
        ]
        grid = build_reliability_diagrams_grid_html(mixed)
        # Only engine "a" gets a diagram.
        assert grid.count("<svg") == 1
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. Masquage adaptatif | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestAdaptiveMasking:
    """Both builders return an empty string when no engine has calibration."""

    def test_summary_empty_when_no_engine_has_calibration(self) -> None:
        """The summary is empty for no engines and for uncalibrated engines."""
        no_engines = build_calibration_summary_html([])
        assert no_engines == ""
        uncalibrated_only = build_calibration_summary_html(
            [_engine_without_calibration("a")],
        )
        assert uncalibrated_only == ""

    def test_grid_empty_when_no_engine_has_calibration(self) -> None:
        """The grid is empty for no engines and for uncalibrated engines."""
        no_engines = build_reliability_diagrams_grid_html([])
        assert no_engines == ""
        uncalibrated_only = build_reliability_diagrams_grid_html(
            [_engine_without_calibration("a")],
        )
        assert uncalibrated_only == ""
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. Anti-injection | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestAntiInjection:
    """Engine names containing HTML markup must be escaped in the output."""

    def test_engine_name_escaped_in_summary(self) -> None:
        """A ``<script>`` engine name appears only in entity-encoded form."""
        # Local import: the expected string is built with the stdlib escaper
        # so the raw and the escaped expectations cannot be confused.
        from html import escape

        engine = _engine_with_calibration("<script>alert(1)</script>")
        rendered = build_calibration_summary_html([engine])
        # The raw tag must never reach the output...
        assert "<script>" not in rendered
        # ...but its entity-encoded form must, proving the name was rendered.
        assert escape("<script>") in rendered

    def test_engine_name_escaped_in_grid(self) -> None:
        """An ``<img>`` engine name appears only in entity-encoded form."""
        from html import escape

        engine = _engine_with_calibration("<img src=x>")
        rendered = build_reliability_diagrams_grid_html([engine])
        assert "<img src=x>" not in rendered
        # Entity-encoded opening of the tag.
        assert escape("<img") in rendered
| # ────────────────────────────────────────────────────────────────────────── | |
| # 6. Intégration ReportGenerator | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestReportIntegration:
    """End-to-end checks of the calibration section in the generated report."""

    @staticmethod
    def _render(bench, target: Path, **generator_kwargs) -> str:
        """Generate the report for ``bench`` at ``target`` and return its text."""
        ReportGenerator(bench, **generator_kwargs).generate(target)
        return target.read_text(encoding="utf-8")

    def test_section_absent_when_no_calibration(self, tmp_path: Path) -> None:
        """No calibration markup when no engine report carries calibration."""
        bench = generate_sample_benchmark()
        # Precondition: the sample benchmark ships without calibration data.
        assert all(
            report.aggregated_calibration is None
            for report in bench.engine_reports
        )
        page = self._render(bench, tmp_path / "report.html")
        assert "calibration-summary" not in page
        assert "reliability-diagrams-grid" not in page

    def test_section_present_when_at_least_one_engine_has_calibration(
        self, tmp_path: Path,
    ) -> None:
        """One calibrated engine is enough to render the section."""
        bench = generate_sample_benchmark()
        bench.engine_reports[0].aggregated_calibration = _make_calibration()
        page = self._render(bench, tmp_path / "report.html")
        assert "calibration-summary" in page
        assert "reliability-diagrams-grid" in page
        assert "5.00 %" in page  # ECE

    def test_french_locale_uses_french_labels(self, tmp_path: Path) -> None:
        """With ``lang="fr"`` the section uses the French labels."""
        bench = generate_sample_benchmark()
        bench.engine_reports[0].aggregated_calibration = _make_calibration()
        page = self._render(bench, tmp_path / "report_fr.html", lang="fr")
        assert "Diagramme de fiabilité" in page
        assert "Précision moyenne" in page

    def test_english_locale_uses_english_labels(self, tmp_path: Path) -> None:
        """With ``lang="en"`` the section uses the English labels."""
        bench = generate_sample_benchmark()
        bench.engine_reports[0].aggregated_calibration = _make_calibration()
        page = self._render(bench, tmp_path / "report_en.html", lang="en")
        assert "Reliability diagram" in page
        assert "Mean accuracy" in page
| # ────────────────────────────────────────────────────────────────────────── | |
| # 7. i18n FR/EN | |
| # ────────────────────────────────────────────────────────────────────────── | |
# i18n keys whose presence is checked by ``TestI18NCompleteness`` below.
REQUIRED_KEYS = (
    "h_calibration",
    "calibration_note",
    "calibration_summary_caption",
    "calibration_engine_label",
    "calibration_ece_label",
    "calibration_mce_label",
    "calibration_n_label",
    "calibration_acc_label",
    "calibration_conf_label",
    "calibration_docs_label",
    "reliability_diagram_title",
    "reliability_x_axis",
    "reliability_y_axis",
)
class TestI18NCompleteness:
    """Every required calibration key exists and is non-empty in each locale."""

    # Without parametrization pytest would look for fixtures named ``lang``
    # and ``key``, which this module does not define. The cross product
    # gives one test case per (locale, key) pair.
    @pytest.mark.parametrize("lang", ["fr", "en"])
    @pytest.mark.parametrize("key", REQUIRED_KEYS)
    def test_key_present(self, lang: str, key: str) -> None:
        """Check that ``key`` is present and non-blank in ``<lang>.json``."""
        # The locale file is resolved relative to this test file, three
        # directory levels up.
        path = (
            Path(__file__).parent.parent.parent
            / "picarones" / "reports" / "i18n" / f"{lang}.json"
        )
        data = json.loads(path.read_text(encoding="utf-8"))
        assert key in data, f"Clé {key!r} manquante dans {lang}.json"
        assert data[key].strip(), f"Clé {key!r} vide dans {lang}.json"