"""Sprint 68 tests — HTML view comparing N pipelines.

Covers:

1. ``RankingSpec``: automatic / explicit ``display_label``.
2. ``build_pipeline_ranking_table_html``:
   - rank / pipeline / value table, consistent ordering
   - pipelines without a value placed last, shown with dashes
   - coloured rank cell (green→red gradient)
   - empty when the comparison contains no pipeline
3. ``build_pipeline_gain_table_html``:
   - pipeline / value / absolute / relative table
   - baseline explicitly marked
   - favourable / unfavourable cell colour depending on
     ``higher_is_better``
   - unknown baseline → empty string
4. ``build_pipeline_comparison_summary_html``: corpus, n_docs,
   n_pipelines, duration, per-pipeline mini-summary.
5. ``build_pipeline_comparison_report_html``:
   - standalone HTML document (doctype, head, body, styles)
   - title, FR/EN lang attribute
   - rankings shown when ranking_specs is provided
   - gain table shown only when baseline_pipeline is provided
6. Anti-injection: pipeline name, corpus, labels.
7. i18n completeness: new ``pipeline_*`` keys present in FR and EN.
"""

from __future__ import annotations

import json
from pathlib import Path

from picarones.core.modules import ArtifactType
from picarones.measurements.pipeline_benchmark import (
    PipelineBenchmarkResult,
    StepAggregate,
)
from picarones.measurements.pipeline_comparison import PipelineComparisonResult
from picarones.report.pipeline_render import (
    RankingSpec,
    build_pipeline_comparison_report_html,
    build_pipeline_comparison_summary_html,
    build_pipeline_gain_table_html,
    build_pipeline_ranking_table_html,
)


def _make_bench(name: str, cer_mean: float, n: int = 10) -> PipelineBenchmarkResult:
    """Build a benchmark result for pipeline *name* with a single "ocr" step.

    The result is attached to the "demo" corpus, holds *n* per-document
    entries that all report success, and one step aggregate whose
    ``text``/``cer`` junction metric uses *cer_mean* as both mean and median.
    """

    class _PR:
        """Minimal per-document stand-in exposing only ``succeeded``."""

        def __init__(self, ok):
            self._ok = ok

        @property
        def succeeded(self):
            return self._ok

    cer_stats = {"mean": cer_mean, "median": cer_mean, "n": n}
    ocr_step = StepAggregate(
        step_name="ocr",
        n_docs=n,
        n_succeeded=n,
        n_failed=0,
        duration_seconds_total=0.5,
        duration_seconds_mean=0.05,
        duration_seconds_median=0.05,
        junction_metrics={"text": {"cer": cer_stats}},
    )

    result = PipelineBenchmarkResult(
        pipeline_name=name,
        corpus_name="demo",
        n_docs=n,
        total_duration_seconds=1.0,
    )
    # The same stub object is repeated n times, exactly as many entries as docs.
    result.per_doc_results = [_PR(True)] * n
    result.per_step_aggregates = [ocr_step]
    return result


def _make_comparison(
    pipelines: list[tuple[str, float]],
) -> PipelineComparisonResult:
    """Build a comparison from ``pipelines = [(name, cer_mean), ...]``.

    Each pair becomes one ``_make_bench`` result stored in ``per_pipeline``
    under its pipeline name; the comparison itself targets the "demo" corpus.
    """
    result = PipelineComparisonResult(
        corpus_name="demo",
        n_docs=10,
        total_duration_seconds=3.0,
    )
    for pipeline_name, cer_mean in pipelines:
        bench = _make_bench(pipeline_name, cer_mean)
        result.per_pipeline[pipeline_name] = bench
    return result
RankingSpec # ────────────────────────────────────────────────────────────────────────── class TestRankingSpec: def test_display_label_default(self) -> None: spec = RankingSpec(ArtifactType.TEXT, "cer") assert spec.display_label == "text.cer" def test_display_label_explicit(self) -> None: spec = RankingSpec(ArtifactType.TEXT, "cer", label="CER global") assert spec.display_label == "CER global" def test_higher_is_better_default_false(self) -> None: spec = RankingSpec(ArtifactType.TEXT, "cer") assert spec.higher_is_better is False # ────────────────────────────────────────────────────────────────────────── # 2. ranking table # ────────────────────────────────────────────────────────────────────────── class TestRankingTable: def test_orders_by_metric_ascending(self) -> None: comparison = _make_comparison([ ("alpha", 0.20), ("beta", 0.05), ("gamma", 0.10), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_ranking_table_html(comparison, spec) # beta (CER 0.05) doit apparaître avant alpha (CER 0.20) idx_beta = html.find("beta") idx_alpha = html.find("alpha") idx_gamma = html.find("gamma") assert 0 < idx_beta < idx_gamma < idx_alpha def test_higher_is_better_reverses(self) -> None: comparison = _make_comparison([ ("alpha", 0.20), ("beta", 0.80), ]) spec = RankingSpec( ArtifactType.TEXT, "cer", higher_is_better=True, ) html = build_pipeline_ranking_table_html(comparison, spec) # beta (0.80) en premier puisqu'on inverse idx_beta = html.find("beta") idx_alpha = html.find("alpha") assert idx_beta < idx_alpha def test_pipelines_without_metric_in_queue(self) -> None: # Pipeline "bad" sans CER (aucun step n'a tourné) comparison = _make_comparison([("alpha", 0.10)]) comparison.per_pipeline["bad"] = PipelineBenchmarkResult( pipeline_name="bad", corpus_name="demo", ) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_ranking_table_html(comparison, spec) idx_bad = html.find("bad") idx_alpha = html.find("alpha") assert 0 < idx_alpha < idx_bad # Le 
pipeline sans valeur affiche un tiret assert "—" in html def test_rank_cell_colored(self) -> None: comparison = _make_comparison([ ("a", 0.1), ("b", 0.2), ("c", 0.3), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_ranking_table_html(comparison, spec) assert "background:#" in html def test_empty_comparison_returns_empty(self) -> None: comparison = PipelineComparisonResult(corpus_name="empty") spec = RankingSpec(ArtifactType.TEXT, "cer") assert build_pipeline_ranking_table_html(comparison, spec) == "" def test_uses_display_label_in_title(self) -> None: comparison = _make_comparison([("alpha", 0.1)]) spec = RankingSpec( ArtifactType.TEXT, "cer", label="Mon Label", ) html = build_pipeline_ranking_table_html(comparison, spec) assert "Mon Label" in html # ────────────────────────────────────────────────────────────────────────── # 3. gain table # ────────────────────────────────────────────────────────────────────────── class TestGainTable: def test_baseline_marked(self) -> None: comparison = _make_comparison([ ("baseline", 0.20), ("better", 0.10), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_gain_table_html( comparison, spec, baseline_pipeline="baseline", ) assert "(référence)" in html # Les deux pipelines apparaissent assert "baseline" in html assert "better" in html def test_gain_absolute_and_relative(self) -> None: comparison = _make_comparison([ ("baseline", 0.20), ("better", 0.10), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_gain_table_html( comparison, spec, baseline_pipeline="baseline", ) # better : -0.1000 absolute, -50% relative assert "-0.1000" in html assert "-50.0%" in html def test_color_favorable_when_lower_better(self) -> None: # CER baisse → favorable → cellule verte (#cfe8cf) comparison = _make_comparison([ ("baseline", 0.20), ("better", 0.05), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_gain_table_html( comparison, spec, baseline_pipeline="baseline", ) 
assert "#cfe8cf" in html def test_color_unfavorable_when_lower_better(self) -> None: # CER monte → défavorable → cellule rouge (#f4cfcf) comparison = _make_comparison([ ("baseline", 0.10), ("worse", 0.30), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_gain_table_html( comparison, spec, baseline_pipeline="baseline", ) assert "#f4cfcf" in html def test_unknown_baseline_returns_empty(self) -> None: comparison = _make_comparison([("alpha", 0.1)]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_gain_table_html( comparison, spec, baseline_pipeline="nonexistent", ) assert html == "" # ────────────────────────────────────────────────────────────────────────── # 4. comparison summary # ────────────────────────────────────────────────────────────────────────── class TestComparisonSummary: def test_renders_corpus_and_counts(self) -> None: comparison = _make_comparison([ ("a", 0.1), ("b", 0.2), ]) html = build_pipeline_comparison_summary_html(comparison) assert "demo" in html assert "10" in html # n_docs # 2 pipelines assert ">2<" in html def test_per_pipeline_mini_summary(self) -> None: comparison = _make_comparison([ ("a", 0.1), ("b", 0.2), ]) html = build_pipeline_comparison_summary_html(comparison) # Mini-résumé : nom (n_succeeded/n_docs) assert "a" in html assert "b" in html assert "10/10" in html # ────────────────────────────────────────────────────────────────────────── # 5. document autonome # ────────────────────────────────────────────────────────────────────────── class TestComparisonReport: def test_doctype_and_structure(self) -> None: comparison = _make_comparison([("a", 0.1)]) html = build_pipeline_comparison_report_html(comparison) assert html.startswith("") assert "" in html assert "
" in html assert "" in html def test_lang_attribute(self) -> None: comparison = _make_comparison([("a", 0.1)]) html_fr = build_pipeline_comparison_report_html( comparison, lang="fr", ) html_en = build_pipeline_comparison_report_html( comparison, lang="en", ) assert 'lang="fr"' in html_fr assert 'lang="en"' in html_en def test_rankings_displayed_when_specs_provided(self) -> None: comparison = _make_comparison([ ("a", 0.20), ("b", 0.05), ]) specs = [RankingSpec(ArtifactType.TEXT, "cer", label="CER")] html = build_pipeline_comparison_report_html( comparison, ranking_specs=specs, ) assert "Classement par CER" in html def test_no_rankings_without_specs(self) -> None: comparison = _make_comparison([("a", 0.1)]) html = build_pipeline_comparison_report_html(comparison) # Pas de tableau de classement sans ranking_specs assert "Classement par" not in html def test_gain_table_only_with_baseline(self) -> None: comparison = _make_comparison([ ("baseline", 0.20), ("better", 0.10), ]) specs = [RankingSpec(ArtifactType.TEXT, "cer")] # Sans baseline : pas de gain table html_no_baseline = build_pipeline_comparison_report_html( comparison, ranking_specs=specs, ) assert "Gain vs" not in html_no_baseline # Avec baseline : gain table présent html_with_baseline = build_pipeline_comparison_report_html( comparison, ranking_specs=specs, baseline_pipeline="baseline", ) assert "Gain vs" in html_with_baseline # ────────────────────────────────────────────────────────────────────────── # 6. Anti-injection # ────────────────────────────────────────────────────────────────────────── class TestAntiInjection: def test_pipeline_name_escaped_in_ranking(self) -> None: comparison = _make_comparison([ ("", 0.1), ]) spec = RankingSpec(ArtifactType.TEXT, "cer") html = build_pipeline_ranking_table_html(comparison, spec) assert "