Spaces:
Sleeping
Sleeping
| """Tests Sprint 68 — vue HTML de comparaison de N pipelines. | |
| Couvre : | |
| 1. ``RankingSpec`` : ``display_label`` auto / explicite. | |
| 2. ``build_pipeline_ranking_table_html`` : | |
| - tableau rang / pipeline / valeur, ordre cohérent | |
| - pipelines sans valeur en queue avec tirets | |
| - cellule de rang colorée (gradient vert→rouge) | |
| - vide si la comparaison ne contient aucune pipeline | |
| 3. ``build_pipeline_gain_table_html`` : | |
| - tableau pipeline / valeur / absolute / relative | |
| - baseline marquée explicitement | |
| - couleur cellule favorable / défavorable selon | |
| ``higher_is_better`` | |
| - baseline inconnue → chaîne vide | |
| 4. ``build_pipeline_comparison_summary_html`` : corpus, n_docs, | |
| n_pipelines, durée, mini-résumé par pipeline. | |
| 5. ``build_pipeline_comparison_report_html`` : | |
| - document HTML autonome (doctype, head, body, styles) | |
| - titre, lang attribute FR/EN | |
| - rankings affichés si ranking_specs fourni | |
| - gain table affiché uniquement si baseline_pipeline fourni | |
| 6. Anti-injection : pipeline name, corpus, labels. | |
| 7. Complétude i18n : nouvelles clés ``pipeline_*`` présentes | |
| en FR et EN. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from picarones.core.modules import ArtifactType | |
| from picarones.measurements.pipeline_benchmark import ( | |
| PipelineBenchmarkResult, | |
| StepAggregate, | |
| ) | |
| from picarones.measurements.pipeline_comparison import PipelineComparisonResult | |
| from picarones.report.pipeline_render import ( | |
| RankingSpec, | |
| build_pipeline_comparison_report_html, | |
| build_pipeline_comparison_summary_html, | |
| build_pipeline_gain_table_html, | |
| build_pipeline_ranking_table_html, | |
| ) | |
| def _make_bench(name: str, cer_mean: float, n: int = 10) -> PipelineBenchmarkResult: | |
| bench = PipelineBenchmarkResult( | |
| pipeline_name=name, corpus_name="demo", | |
| n_docs=n, total_duration_seconds=1.0, | |
| ) | |
| class _PR: | |
| def __init__(self, ok): self._ok = ok | |
| def succeeded(self): return self._ok | |
| bench.per_doc_results = [_PR(True)] * n | |
| bench.per_step_aggregates = [ | |
| StepAggregate( | |
| step_name="ocr", n_docs=n, n_succeeded=n, n_failed=0, | |
| duration_seconds_total=0.5, duration_seconds_mean=0.05, | |
| duration_seconds_median=0.05, | |
| junction_metrics={ | |
| "text": { | |
| "cer": {"mean": cer_mean, "median": cer_mean, "n": n}, | |
| }, | |
| }, | |
| ), | |
| ] | |
| return bench | |
| def _make_comparison( | |
| pipelines: list[tuple[str, float]], | |
| ) -> PipelineComparisonResult: | |
| """Crée une comparaison avec pipelines = [(name, cer_mean), ...].""" | |
| comparison = PipelineComparisonResult( | |
| corpus_name="demo", | |
| n_docs=10, | |
| total_duration_seconds=3.0, | |
| ) | |
| for name, cer in pipelines: | |
| comparison.per_pipeline[name] = _make_bench(name, cer) | |
| return comparison | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. RankingSpec | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestRankingSpec: | |
| def test_display_label_default(self) -> None: | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| assert spec.display_label == "text.cer" | |
| def test_display_label_explicit(self) -> None: | |
| spec = RankingSpec(ArtifactType.TEXT, "cer", label="CER global") | |
| assert spec.display_label == "CER global" | |
| def test_higher_is_better_default_false(self) -> None: | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| assert spec.higher_is_better is False | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. ranking table | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestRankingTable: | |
| def test_orders_by_metric_ascending(self) -> None: | |
| comparison = _make_comparison([ | |
| ("alpha", 0.20), | |
| ("beta", 0.05), | |
| ("gamma", 0.10), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| # beta (CER 0.05) doit apparaître avant alpha (CER 0.20) | |
| idx_beta = html.find("beta") | |
| idx_alpha = html.find("alpha") | |
| idx_gamma = html.find("gamma") | |
| assert 0 < idx_beta < idx_gamma < idx_alpha | |
| def test_higher_is_better_reverses(self) -> None: | |
| comparison = _make_comparison([ | |
| ("alpha", 0.20), | |
| ("beta", 0.80), | |
| ]) | |
| spec = RankingSpec( | |
| ArtifactType.TEXT, "cer", higher_is_better=True, | |
| ) | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| # beta (0.80) en premier puisqu'on inverse | |
| idx_beta = html.find("beta") | |
| idx_alpha = html.find("alpha") | |
| assert idx_beta < idx_alpha | |
| def test_pipelines_without_metric_in_queue(self) -> None: | |
| # Pipeline "bad" sans CER (aucun step n'a tourné) | |
| comparison = _make_comparison([("alpha", 0.10)]) | |
| comparison.per_pipeline["bad"] = PipelineBenchmarkResult( | |
| pipeline_name="bad", corpus_name="demo", | |
| ) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| idx_bad = html.find("bad") | |
| idx_alpha = html.find("alpha") | |
| assert 0 < idx_alpha < idx_bad | |
| # Le pipeline sans valeur affiche un tiret | |
| assert "—" in html | |
| def test_rank_cell_colored(self) -> None: | |
| comparison = _make_comparison([ | |
| ("a", 0.1), ("b", 0.2), ("c", 0.3), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| assert "background:#" in html | |
| def test_empty_comparison_returns_empty(self) -> None: | |
| comparison = PipelineComparisonResult(corpus_name="empty") | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| assert build_pipeline_ranking_table_html(comparison, spec) == "" | |
| def test_uses_display_label_in_title(self) -> None: | |
| comparison = _make_comparison([("alpha", 0.1)]) | |
| spec = RankingSpec( | |
| ArtifactType.TEXT, "cer", label="Mon Label", | |
| ) | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| assert "Mon Label" in html | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. gain table | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestGainTable: | |
| def test_baseline_marked(self) -> None: | |
| comparison = _make_comparison([ | |
| ("baseline", 0.20), ("better", 0.10), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_gain_table_html( | |
| comparison, spec, baseline_pipeline="baseline", | |
| ) | |
| assert "(référence)" in html | |
| # Les deux pipelines apparaissent | |
| assert "baseline" in html | |
| assert "better" in html | |
| def test_gain_absolute_and_relative(self) -> None: | |
| comparison = _make_comparison([ | |
| ("baseline", 0.20), ("better", 0.10), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_gain_table_html( | |
| comparison, spec, baseline_pipeline="baseline", | |
| ) | |
| # better : -0.1000 absolute, -50% relative | |
| assert "-0.1000" in html | |
| assert "-50.0%" in html | |
| def test_color_favorable_when_lower_better(self) -> None: | |
| # CER baisse → favorable → cellule verte (#cfe8cf) | |
| comparison = _make_comparison([ | |
| ("baseline", 0.20), ("better", 0.05), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_gain_table_html( | |
| comparison, spec, baseline_pipeline="baseline", | |
| ) | |
| assert "#cfe8cf" in html | |
| def test_color_unfavorable_when_lower_better(self) -> None: | |
| # CER monte → défavorable → cellule rouge (#f4cfcf) | |
| comparison = _make_comparison([ | |
| ("baseline", 0.10), ("worse", 0.30), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_gain_table_html( | |
| comparison, spec, baseline_pipeline="baseline", | |
| ) | |
| assert "#f4cfcf" in html | |
| def test_unknown_baseline_returns_empty(self) -> None: | |
| comparison = _make_comparison([("alpha", 0.1)]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_gain_table_html( | |
| comparison, spec, baseline_pipeline="nonexistent", | |
| ) | |
| assert html == "" | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. comparison summary | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestComparisonSummary: | |
| def test_renders_corpus_and_counts(self) -> None: | |
| comparison = _make_comparison([ | |
| ("a", 0.1), ("b", 0.2), | |
| ]) | |
| html = build_pipeline_comparison_summary_html(comparison) | |
| assert "demo" in html | |
| assert "10" in html # n_docs | |
| # 2 pipelines | |
| assert ">2<" in html | |
| def test_per_pipeline_mini_summary(self) -> None: | |
| comparison = _make_comparison([ | |
| ("a", 0.1), ("b", 0.2), | |
| ]) | |
| html = build_pipeline_comparison_summary_html(comparison) | |
| # Mini-résumé : nom (n_succeeded/n_docs) | |
| assert "a" in html | |
| assert "b" in html | |
| assert "10/10" in html | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. document autonome | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestComparisonReport: | |
| def test_doctype_and_structure(self) -> None: | |
| comparison = _make_comparison([("a", 0.1)]) | |
| html = build_pipeline_comparison_report_html(comparison) | |
| assert html.startswith("<!doctype html>") | |
| assert "<html" in html | |
| assert "<head>" in html | |
| assert "<body>" in html | |
| assert "</html>" in html | |
| def test_lang_attribute(self) -> None: | |
| comparison = _make_comparison([("a", 0.1)]) | |
| html_fr = build_pipeline_comparison_report_html( | |
| comparison, lang="fr", | |
| ) | |
| html_en = build_pipeline_comparison_report_html( | |
| comparison, lang="en", | |
| ) | |
| assert 'lang="fr"' in html_fr | |
| assert 'lang="en"' in html_en | |
| def test_rankings_displayed_when_specs_provided(self) -> None: | |
| comparison = _make_comparison([ | |
| ("a", 0.20), ("b", 0.05), | |
| ]) | |
| specs = [RankingSpec(ArtifactType.TEXT, "cer", label="CER")] | |
| html = build_pipeline_comparison_report_html( | |
| comparison, ranking_specs=specs, | |
| ) | |
| assert "Classement par CER" in html | |
| def test_no_rankings_without_specs(self) -> None: | |
| comparison = _make_comparison([("a", 0.1)]) | |
| html = build_pipeline_comparison_report_html(comparison) | |
| # Pas de tableau de classement sans ranking_specs | |
| assert "Classement par" not in html | |
| def test_gain_table_only_with_baseline(self) -> None: | |
| comparison = _make_comparison([ | |
| ("baseline", 0.20), ("better", 0.10), | |
| ]) | |
| specs = [RankingSpec(ArtifactType.TEXT, "cer")] | |
| # Sans baseline : pas de gain table | |
| html_no_baseline = build_pipeline_comparison_report_html( | |
| comparison, ranking_specs=specs, | |
| ) | |
| assert "Gain vs" not in html_no_baseline | |
| # Avec baseline : gain table présent | |
| html_with_baseline = build_pipeline_comparison_report_html( | |
| comparison, ranking_specs=specs, | |
| baseline_pipeline="baseline", | |
| ) | |
| assert "Gain vs" in html_with_baseline | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 6. Anti-injection | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestAntiInjection: | |
| def test_pipeline_name_escaped_in_ranking(self) -> None: | |
| comparison = _make_comparison([ | |
| ("<script>alert(1)</script>", 0.1), | |
| ]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| html = build_pipeline_ranking_table_html(comparison, spec) | |
| assert "<script>alert" not in html | |
| assert "<script>" in html | |
| def test_corpus_name_escaped_in_summary(self) -> None: | |
| comparison = PipelineComparisonResult( | |
| corpus_name="<img src=x onerror=alert(1)>", | |
| ) | |
| html = build_pipeline_comparison_summary_html(comparison) | |
| assert "<img src=x" not in html | |
| assert "<img" in html | |
| def test_label_via_i18n_escaped(self) -> None: | |
| comparison = _make_comparison([("a", 0.1)]) | |
| spec = RankingSpec(ArtifactType.TEXT, "cer") | |
| labels = {"pipeline_ranking_title": "<b>Hack</b>"} | |
| html = build_pipeline_ranking_table_html( | |
| comparison, spec, labels=labels, | |
| ) | |
| assert "<b>Hack</b>" not in html | |
| assert "<b>Hack</b>" in html | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 7. Complétude i18n | |
| # ────────────────────────────────────────────────────────────────────────── | |
| class TestI18nCompleteness: | |
| def _load(self, lang: str) -> dict: | |
| path = ( | |
| Path(__file__).parent.parent.parent | |
| / "picarones" / "report" / "i18n" / f"{lang}.json" | |
| ) | |
| return json.loads(path.read_text(encoding="utf-8")) | |
| def test_new_keys_present_fr(self) -> None: | |
| d = self._load("fr") | |
| required = ( | |
| "pipeline_comparison_report_title", | |
| "pipeline_comparison_report_note", | |
| "pipeline_comparison_summary_title", | |
| "pipeline_n_pipelines_label", | |
| "pipeline_n_pipelines_short", | |
| "pipeline_per_pipeline_label", | |
| "pipeline_ranking_title", "pipeline_rank_label", | |
| "pipeline_value_label", | |
| "pipeline_gain_title", | |
| "pipeline_gain_absolute_label", | |
| "pipeline_gain_relative_label", | |
| "pipeline_baseline_marker", | |
| ) | |
| for key in required: | |
| assert key in d, f"manque clé FR : {key}" | |
| def test_new_keys_present_en(self) -> None: | |
| d_fr = self._load("fr") | |
| d_en = self._load("en") | |
| for key in d_fr: | |
| if key.startswith("pipeline_"): | |
| assert key in d_en, f"manque clé EN : {key}" | |