Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /report /test_sprint68_pipeline_comparison_html.py

Claude

test: corriger 4 défauts de classification du chantier B

315a6b9 unverified about 2 months ago

16.5 kB

	"""Tests Sprint 68 — vue HTML de comparaison de N pipelines.

	Couvre :

	1. ``RankingSpec`` : ``display_label`` auto / explicite.
	2. ``build_pipeline_ranking_table_html`` :
	- tableau rang / pipeline / valeur, ordre cohérent
	- pipelines sans valeur en queue avec tirets
	- cellule de rang colorée (gradient vert→rouge)
	- vide si la comparaison ne contient aucune pipeline
	3. ``build_pipeline_gain_table_html`` :
	- tableau pipeline / valeur / absolute / relative
	- baseline marquée explicitement
	- couleur cellule favorable / défavorable selon
	``higher_is_better``
	- baseline inconnue → chaîne vide
	4. ``build_pipeline_comparison_summary_html`` : corpus, n_docs,
	n_pipelines, durée, mini-résumé par pipeline.
	5. ``build_pipeline_comparison_report_html`` :
	- document HTML autonome (doctype, head, body, styles)
	- titre, lang attribute FR/EN
	- rankings affichés si ranking_specs fourni
	- gain table affiché uniquement si baseline_pipeline fourni
	6. Anti-injection : pipeline name, corpus, labels.
	7. Complétude i18n : nouvelles clés ``pipeline_*`` présentes
	en FR et EN.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	from picarones.core.modules import ArtifactType
	from picarones.measurements.pipeline_benchmark import (
	PipelineBenchmarkResult,
	StepAggregate,
	)
	from picarones.measurements.pipeline_comparison import PipelineComparisonResult
	from picarones.report.pipeline_render import (
	RankingSpec,
	build_pipeline_comparison_report_html,
	build_pipeline_comparison_summary_html,
	build_pipeline_gain_table_html,
	build_pipeline_ranking_table_html,
	)


	def _make_bench(
	    name: str, cer_mean: float, n: int = 10,
	) -> PipelineBenchmarkResult:
	    """Build a benchmark result for pipeline ``name`` on the "demo" corpus.

	    The result covers ``n`` documents, all reported as succeeded, and holds
	    a single "ocr" step aggregate whose ``text``/``cer`` junction metric has
	    both its mean and its median set to ``cer_mean``.
	    """
	    result = PipelineBenchmarkResult(
	        pipeline_name=name,
	        corpus_name="demo",
	        n_docs=n,
	        total_duration_seconds=1.0,
	    )

	    class _PR:
	        """Minimal per-document stub exposing only ``succeeded``."""

	        def __init__(self, ok):
	            self._ok = ok

	        @property
	        def succeeded(self):
	            return self._ok

	    # A single stub instance is repeated n times (shared reference).
	    result.per_doc_results = [_PR(True)] * n

	    cer_stats = {"mean": cer_mean, "median": cer_mean, "n": n}
	    ocr_step = StepAggregate(
	        step_name="ocr",
	        n_docs=n,
	        n_succeeded=n,
	        n_failed=0,
	        duration_seconds_total=0.5,
	        duration_seconds_mean=0.05,
	        duration_seconds_median=0.05,
	        junction_metrics={"text": {"cer": cer_stats}},
	    )
	    result.per_step_aggregates = [ocr_step]
	    return result


	def _make_comparison(
	    pipelines: list[tuple[str, float]],
	) -> PipelineComparisonResult:
	    """Build a "demo" comparison from ``[(name, cer_mean), ...]`` pairs.

	    Each pair becomes one entry of ``per_pipeline``, keyed by the pipeline
	    name and produced by ``_make_bench`` with its default document count.
	    """
	    result = PipelineComparisonResult(
	        corpus_name="demo", n_docs=10, total_duration_seconds=3.0,
	    )
	    for pipeline_name, cer_mean in pipelines:
	        result.per_pipeline[pipeline_name] = _make_bench(pipeline_name, cer_mean)
	    return result


	# ──────────────────────────────────────────────────────────────────────────
	# 1. RankingSpec
	# ──────────────────────────────────────────────────────────────────────────


	class TestRankingSpec:
	    """Defaults and overrides of ``RankingSpec``."""

	    def test_display_label_default(self) -> None:
	        # Without an explicit label the display label is expected to be
	        # "text.cer" for a TEXT / "cer" spec.
	        ranking = RankingSpec(ArtifactType.TEXT, "cer")
	        expected = "text.cer"
	        assert ranking.display_label == expected

	    def test_display_label_explicit(self) -> None:
	        # An explicit label is returned as-is.
	        ranking = RankingSpec(ArtifactType.TEXT, "cer", label="CER global")
	        expected = "CER global"
	        assert ranking.display_label == expected

	    def test_higher_is_better_default_false(self) -> None:
	        # Lower-is-better is the default orientation.
	        ranking = RankingSpec(ArtifactType.TEXT, "cer")
	        assert ranking.higher_is_better is False


	# ──────────────────────────────────────────────────────────────────────────
	# 2. ranking table
	# ──────────────────────────────────────────────────────────────────────────


	class TestRankingTable:
	    """Ranking table rendered by ``build_pipeline_ranking_table_html``."""

	    def test_orders_by_metric_ascending(self) -> None:
	        comparison = _make_comparison([
	            ("alpha", 0.20),
	            ("beta", 0.05),
	            ("gamma", 0.10),
	        ])
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        html = build_pipeline_ranking_table_html(comparison, spec)
	        # beta (CER 0.05) must appear before gamma (0.10) and alpha (0.20).
	        idx_beta = html.find("beta")
	        idx_alpha = html.find("alpha")
	        idx_gamma = html.find("gamma")
	        assert 0 < idx_beta < idx_gamma < idx_alpha

	    def test_higher_is_better_reverses(self) -> None:
	        comparison = _make_comparison([
	            ("alpha", 0.20),
	            ("beta", 0.80),
	        ])
	        spec = RankingSpec(
	            ArtifactType.TEXT, "cer", higher_is_better=True,
	        )
	        html = build_pipeline_ranking_table_html(comparison, spec)
	        # beta (0.80) comes first because the ordering is reversed.
	        idx_beta = html.find("beta")
	        idx_alpha = html.find("alpha")
	        # str.find returns -1 on a miss; without the lower bound the test
	        # would pass vacuously when "beta" is absent from the output.
	        assert 0 <= idx_beta < idx_alpha

	    def test_pipelines_without_metric_in_queue(self) -> None:
	        # Pipeline "bad" has no CER (it carries no step aggregates).
	        comparison = _make_comparison([("alpha", 0.10)])
	        comparison.per_pipeline["bad"] = PipelineBenchmarkResult(
	            pipeline_name="bad", corpus_name="demo",
	        )
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        html = build_pipeline_ranking_table_html(comparison, spec)
	        idx_bad = html.find("bad")
	        idx_alpha = html.find("alpha")
	        assert 0 < idx_alpha < idx_bad
	        # The pipeline without a value is shown with a dash.
	        assert "—" in html

	    def test_rank_cell_colored(self) -> None:
	        comparison = _make_comparison([
	            ("a", 0.1), ("b", 0.2), ("c", 0.3),
	        ])
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        html = build_pipeline_ranking_table_html(comparison, spec)
	        # An inline hex background colour is expected somewhere in the table.
	        assert "background:#" in html

	    def test_empty_comparison_returns_empty(self) -> None:
	        comparison = PipelineComparisonResult(corpus_name="empty")
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        assert build_pipeline_ranking_table_html(comparison, spec) == ""

	    def test_uses_display_label_in_title(self) -> None:
	        comparison = _make_comparison([("alpha", 0.1)])
	        spec = RankingSpec(
	            ArtifactType.TEXT, "cer", label="Mon Label",
	        )
	        html = build_pipeline_ranking_table_html(comparison, spec)
	        assert "Mon Label" in html


	# ──────────────────────────────────────────────────────────────────────────
	# 3. gain table
	# ──────────────────────────────────────────────────────────────────────────


	class TestGainTable:
	    """Gain table rendered by ``build_pipeline_gain_table_html``."""

	    @staticmethod
	    def _render(pipelines, baseline):
	        """Render the TEXT/"cer" gain table for ``pipelines`` vs ``baseline``."""
	        return build_pipeline_gain_table_html(
	            _make_comparison(pipelines),
	            RankingSpec(ArtifactType.TEXT, "cer"),
	            baseline_pipeline=baseline,
	        )

	    def test_baseline_marked(self) -> None:
	        table = self._render(
	            [("baseline", 0.20), ("better", 0.10)], "baseline",
	        )
	        assert "(référence)" in table
	        # Both pipelines are listed.
	        assert "baseline" in table
	        assert "better" in table

	    def test_gain_absolute_and_relative(self) -> None:
	        table = self._render(
	            [("baseline", 0.20), ("better", 0.10)], "baseline",
	        )
	        # better: -0.1000 absolute, -50% relative.
	        assert "-0.1000" in table
	        assert "-50.0%" in table

	    def test_color_favorable_when_lower_better(self) -> None:
	        # CER goes down -> favourable -> green cell (#cfe8cf).
	        table = self._render(
	            [("baseline", 0.20), ("better", 0.05)], "baseline",
	        )
	        assert "#cfe8cf" in table

	    def test_color_unfavorable_when_lower_better(self) -> None:
	        # CER goes up -> unfavourable -> red cell (#f4cfcf).
	        table = self._render(
	            [("baseline", 0.10), ("worse", 0.30)], "baseline",
	        )
	        assert "#f4cfcf" in table

	    def test_unknown_baseline_returns_empty(self) -> None:
	        # A baseline name absent from the comparison yields an empty string.
	        table = self._render([("alpha", 0.1)], "nonexistent")
	        assert table == ""


	# ──────────────────────────────────────────────────────────────────────────
	# 4. comparison summary
	# ──────────────────────────────────────────────────────────────────────────


	class TestComparisonSummary:
	    """Summary block rendered by ``build_pipeline_comparison_summary_html``."""

	    def test_renders_corpus_and_counts(self) -> None:
	        comparison = _make_comparison([
	            ("a", 0.1), ("b", 0.2),
	        ])
	        html = build_pipeline_comparison_summary_html(comparison)
	        assert "demo" in html
	        assert "10" in html  # n_docs
	        # 2 pipelines
	        assert ">2<" in html

	    def test_per_pipeline_mini_summary(self) -> None:
	        # Distinctive names are used on purpose: single letters such as "a"
	        # or "b" occur in almost any HTML fragment ("table", "<b>", ...),
	        # which would make the presence checks below tautological.
	        comparison = _make_comparison([
	            ("pipe_alpha", 0.1), ("pipe_beta", 0.2),
	        ])
	        html = build_pipeline_comparison_summary_html(comparison)
	        # Mini-summary: name (n_succeeded/n_docs)
	        assert "pipe_alpha" in html
	        assert "pipe_beta" in html
	        assert "10/10" in html


	# ──────────────────────────────────────────────────────────────────────────
	# 5. document autonome
	# ──────────────────────────────────────────────────────────────────────────


	class TestComparisonReport:
	    """Standalone document from ``build_pipeline_comparison_report_html``."""

	    def test_doctype_and_structure(self) -> None:
	        page = build_pipeline_comparison_report_html(
	            _make_comparison([("a", 0.1)]),
	        )
	        assert page.startswith("<!doctype html>")
	        # Structural markers of a complete HTML document.
	        for marker in ("<html", "<head>", "<body>", "</html>"):
	            assert marker in page

	    def test_lang_attribute(self) -> None:
	        comparison = _make_comparison([("a", 0.1)])
	        # Render both languages first, then check each lang attribute.
	        pages = {
	            code: build_pipeline_comparison_report_html(comparison, lang=code)
	            for code in ("fr", "en")
	        }
	        assert 'lang="fr"' in pages["fr"]
	        assert 'lang="en"' in pages["en"]

	    def test_rankings_displayed_when_specs_provided(self) -> None:
	        comparison = _make_comparison([("a", 0.20), ("b", 0.05)])
	        cer_spec = RankingSpec(ArtifactType.TEXT, "cer", label="CER")
	        page = build_pipeline_comparison_report_html(
	            comparison, ranking_specs=[cer_spec],
	        )
	        assert "Classement par CER" in page

	    def test_no_rankings_without_specs(self) -> None:
	        page = build_pipeline_comparison_report_html(
	            _make_comparison([("a", 0.1)]),
	        )
	        # No ranking table when ranking_specs is not given.
	        assert "Classement par" not in page

	    def test_gain_table_only_with_baseline(self) -> None:
	        comparison = _make_comparison([
	            ("baseline", 0.20), ("better", 0.10),
	        ])
	        specs = [RankingSpec(ArtifactType.TEXT, "cer")]

	        # Without a baseline: no gain table.
	        page_plain = build_pipeline_comparison_report_html(
	            comparison, ranking_specs=specs,
	        )
	        assert "Gain vs" not in page_plain

	        # With a baseline: the gain table is present.
	        page_with_gain = build_pipeline_comparison_report_html(
	            comparison,
	            ranking_specs=specs,
	            baseline_pipeline="baseline",
	        )
	        assert "Gain vs" in page_with_gain


	# ──────────────────────────────────────────────────────────────────────────
	# 6. Anti-injection
	# ──────────────────────────────────────────────────────────────────────────


	class TestAntiInjection:
	    """User-controlled strings must reach the HTML output entity-escaped.

	    Each test checks two things: the raw markup is absent, and its escaped
	    form is present. The escaped form is computed with ``html.escape``
	    rather than written as a literal, so the expectation cannot silently
	    degrade into the raw markup if this file is ever HTML-unescaped.
	    NOTE(review): assumes the renderer escapes ``<`` and ``>`` the way
	    ``html.escape`` does — confirm against ``pipeline_render``.
	    """

	    def test_pipeline_name_escaped_in_ranking(self) -> None:
	        from html import escape

	        comparison = _make_comparison([
	            ("<script>alert(1)</script>", 0.1),
	        ])
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        rendered = build_pipeline_ranking_table_html(comparison, spec)
	        assert "<script>alert" not in rendered
	        assert escape("<script>") in rendered

	    def test_corpus_name_escaped_in_summary(self) -> None:
	        from html import escape

	        comparison = PipelineComparisonResult(
	            corpus_name="<img src=x onerror=alert(1)>",
	        )
	        rendered = build_pipeline_comparison_summary_html(comparison)
	        assert "<img src=x" not in rendered
	        assert escape("<img") in rendered

	    def test_label_via_i18n_escaped(self) -> None:
	        from html import escape

	        comparison = _make_comparison([("a", 0.1)])
	        spec = RankingSpec(ArtifactType.TEXT, "cer")
	        labels = {"pipeline_ranking_title": "<b>Hack</b>"}
	        rendered = build_pipeline_ranking_table_html(
	            comparison, spec, labels=labels,
	        )
	        assert "<b>Hack</b>" not in rendered
	        assert escape("<b>Hack</b>") in rendered


	# ──────────────────────────────────────────────────────────────────────────
	# 7. Complétude i18n
	# ──────────────────────────────────────────────────────────────────────────


	class TestI18nCompleteness:
	    """The ``pipeline_*`` i18n keys exist in the FR and EN catalogues."""

	    def _load(self, lang: str) -> dict:
	        """Parse ``picarones/report/i18n/<lang>.json`` as UTF-8 JSON.

	        The catalogue is resolved relative to the directory three levels
	        above this test file.
	        """
	        base_dir = Path(__file__).parent.parent.parent
	        catalogue = base_dir / "picarones" / "report" / "i18n" / f"{lang}.json"
	        return json.loads(catalogue.read_text(encoding="utf-8"))

	    def test_new_keys_present_fr(self) -> None:
	        fr_catalogue = self._load("fr")
	        expected_keys = (
	            "pipeline_comparison_report_title",
	            "pipeline_comparison_report_note",
	            "pipeline_comparison_summary_title",
	            "pipeline_n_pipelines_label",
	            "pipeline_n_pipelines_short",
	            "pipeline_per_pipeline_label",
	            "pipeline_ranking_title",
	            "pipeline_rank_label",
	            "pipeline_value_label",
	            "pipeline_gain_title",
	            "pipeline_gain_absolute_label",
	            "pipeline_gain_relative_label",
	            "pipeline_baseline_marker",
	        )
	        for key in expected_keys:
	            assert key in fr_catalogue, f"manque clé FR : {key}"

	    def test_new_keys_present_en(self) -> None:
	        fr_catalogue = self._load("fr")
	        en_catalogue = self._load("en")
	        # Every FR key in the pipeline_ namespace must also exist in EN.
	        pipeline_keys = [k for k in fr_catalogue if k.startswith("pipeline_")]
	        for key in pipeline_keys:
	            assert key in en_catalogue, f"manque clé EN : {key}"