"""Tests Sprint 96 — B.5 : comparaison incrémentale. Couvre : 1. ``compare_isolated_effect`` : - cas standard 4×2 → effet du LLM isolé - mean_rank correct - best/worst identifiés - higher_is_better inverse l'ordre - lt 2 runs → None - varying_slot inconnu → None - schémas de slots incompatibles ignorés - acceptation de dicts compatibles 2. Vue HTML : - adaptive - tri par rang moyen - marquage best ★ / worst ▼ - anti-injection 3. Cas réaliste 5 OCR × 2 LLM. 4. Complétude i18n FR/EN. """ from __future__ import annotations import json from pathlib import Path import pytest from picarones.measurements.incremental_comparison import ( PipelineRun, compare_isolated_effect, ) from picarones.report.incremental_comparison_render import ( build_incremental_comparison_html, ) def _load_labels(lang: str) -> dict: p = ( Path(__file__).parent.parent.parent / "picarones" / "report" / "i18n" / f"{lang}.json" ) return json.loads(p.read_text(encoding="utf-8")) # ────────────────────────────────────────────────────────────────────────── # 1. compare_isolated_effect # ────────────────────────────────────────────────────────────────────────── class TestIsolatedEffect: def _runs_4x2(self) -> list[PipelineRun]: runs = [] for ocr, q in [ ("tess", 0.05), ("pero", 0.04), ("mistral", 0.03), ("vlm", 0.06), ]: for llm, boost in [("gpt", -0.01), ("claude", -0.005)]: runs.append(PipelineRun( name=f"{ocr}+{llm}", slots={"ocr": ocr, "llm": llm}, score=q + boost, )) return runs def test_basic_4x2(self) -> None: r = compare_isolated_effect(self._runs_4x2(), "llm") assert r is not None assert r["varying_slot"] == "llm" assert r["n_runs"] == 8 assert r["n_groups"] == 4 assert sorted(r["values"]) == ["claude", "gpt"] def test_mean_rank(self) -> None: # gpt domine systématiquement → rang moyen 1.0 r = compare_isolated_effect(self._runs_4x2(), "llm") assert r["per_value"]["gpt"]["mean_rank"] == 1.0 assert r["per_value"]["claude"]["mean_rank"] == 2.0 def test_best_worst(self) -> None: r = compare_isolated_effect(self._runs_4x2(), "llm") assert r["best_value"] == "gpt" assert r["worst_value"] == "claude" def test_higher_is_better_inverts(self) -> None: # Score = F1 (haut = mieux) runs = [ PipelineRun("a+x", {"a": "1", "b": "x"}, 0.95), PipelineRun("a+y", {"a": "1", "b": "y"}, 0.80), PipelineRun("c+x", {"a": "2", "b": "x"}, 0.92), PipelineRun("c+y", {"a": "2", "b": "y"}, 0.75), ] r = compare_isolated_effect(runs, "b", higher_is_better=True) assert r["best_value"] == "x" assert r["worst_value"] == "y" assert r["per_value"]["x"]["mean_rank"] == 1.0 def test_lt_two_returns_none(self) -> None: assert compare_isolated_effect([], "x") is None assert compare_isolated_effect( [PipelineRun("a", {"x": "1"}, 0.1)], "x", ) is None def test_unknown_slot_returns_none(self) -> None: runs = [ PipelineRun("a", {"ocr": "tess"}, 0.1), PipelineRun("b", {"ocr": "pero"}, 0.05), ] assert compare_isolated_effect(runs, "ghost") is None def test_incompatible_schemas_skipped(self) -> None: # 2 runs avec schéma {ocr, llm}, 1 run avec schéma {ocr} runs = [ PipelineRun("a", {"ocr": "tess", "llm": "g"}, 0.04), PipelineRun("b", {"ocr": "pero", "llm": "g"}, 0.03), PipelineRun("c", {"ocr": "mistral"}, 0.02), ] r = compare_isolated_effect(runs, "ocr") # Le 3e run a un schéma incompatible (pas de "llm") → ignoré # quand on commence avec {ocr, llm} assert r is not None assert r["n_runs"] == 3 # tous les runs avec varying_slot # Mais seuls 2 sont dans des groupes assert sum(g["n_members"] for g in r["groups"]) == 2 def test_accepts_dicts(self) -> None: runs = [ {"name": "a", "slots": {"ocr": "tess", "llm": "g"}, "score": 0.05}, {"name": "b", "slots": {"ocr": "tess", "llm": "c"}, "score": 0.04}, ] r = compare_isolated_effect(runs, "llm") assert r is not None def test_ties_handled(self) -> None: # Scores identiques → rangs moyens runs = [ PipelineRun("a", {"x": "1", "y": "p"}, 0.05), PipelineRun("b", {"x": "1", "y": "q"}, 0.05), # ex aequo ] r = compare_isolated_effect(runs, "y") # Rangs : 1.5 et 1.5 assert r["per_value"]["p"]["mean_rank"] == 1.5 assert r["per_value"]["q"]["mean_rank"] == 1.5 # ────────────────────────────────────────────────────────────────────────── # 2. Vue HTML # ────────────────────────────────────────────────────────────────────────── class TestRender: def test_empty_returns_empty(self) -> None: assert build_incremental_comparison_html(None) == "" assert build_incremental_comparison_html( {"per_value": {}}, ) == "" def test_renders_table(self) -> None: runs = [ PipelineRun("a", {"ocr": "tess", "llm": "g"}, 0.04), PipelineRun("b", {"ocr": "tess", "llm": "c"}, 0.05), PipelineRun("c", {"ocr": "pero", "llm": "g"}, 0.03), PipelineRun("d", {"ocr": "pero", "llm": "c"}, 0.04), ] analysis = compare_isolated_effect(runs, "llm") html = build_incremental_comparison_html( analysis, _load_labels("fr"), ) assert "