Picarones / tests /measurements /test_sprint65_pipeline_comparison.py
Claude
test: corriger 4 défauts de classification du chantier B
315a6b9 unverified
Raw
History Blame
14.2 kB
"""Tests Sprint 65 β€” comparaison de N pipelines sur un corpus.
Couvre :
1. ``compare_pipelines`` :
- 1 pipeline β†’ Γ©quivalent Γ  ``run_pipeline_benchmark`` mais
emballΓ© dans un ``PipelineComparisonResult``
- 2+ pipelines β†’ rΓ©sultats indexΓ©s par nom dans l'ordre
d'insertion
- Noms en double β†’ ``ValueError`` explicite
- ``factories`` par pipeline respectΓ©
- Corpus vide β†’ rΓ©sultats vides cohΓ©rents
2. ``ranking_by_final_metric`` :
- Tri ascendant pour mΓ©triques de type CER (par dΓ©faut)
- Tri descendant si ``higher_is_better=True``
- Pipelines sans mΓ©trique β†’ en queue, ordre prΓ©servΓ©
3. ``gain_table`` :
- ``baseline_pipeline`` inconnue β†’ ``KeyError``
- Baseline elle-mΓͺme : absolute=0, relative=0
- ``relative`` Γ  ``None`` si baseline = 0
- ``absolute`` et ``relative`` Γ  ``None`` si valeur absente
4. Cas rΓ©aliste : OCR fautif vs OCR+correcteur β†’ le correcteur
gagne au ranking et au gain_table.
5. Philosophie inchangΓ©e : tous les modules sont des **mocks**
dΓ©finis dans le test.
"""
from __future__ import annotations
from typing import Any
import pytest
from picarones.core.corpus import Corpus, Document, GTLevel, TextGT
from picarones.core.modules import ArtifactType, BaseModule
from picarones.measurements.pipeline_comparison import (
PipelineComparisonResult,
compare_pipelines,
)
from picarones.core.pipeline import PipelineSpec, PipelineStep
# ──────────────────────────────────────────────────────────────────────────
# Mocks
# ──────────────────────────────────────────────────────────────────────────
class MockOCR(BaseModule):
    """Mock OCR module driven by a caller-supplied callable.

    The callable receives the IMAGE artifact and whatever it returns is
    emitted as the TEXT artifact.
    """

    input_types = (ArtifactType.IMAGE,)
    output_types = (ArtifactType.TEXT,)
    execution_mode: Any = "io"

    def __init__(self, fn) -> None:
        # Callable invoked on every ``process`` call.
        self._recognize = fn

    @property
    def name(self) -> str:
        """Fixed module name used by this mock."""
        return "mock-ocr"

    def process(self, inputs):
        """Apply the wrapped callable to the IMAGE input and return TEXT."""
        image = inputs[ArtifactType.IMAGE]
        recognized = self._recognize(image)
        return {ArtifactType.TEXT: recognized}
class TextFixer(BaseModule):
    """Mock rewriter that applies a dict of string replacements.

    Replacements are applied one after another, in the iteration order
    of the mapping given at construction.
    """

    input_types = (ArtifactType.TEXT,)
    output_types = (ArtifactType.TEXT,)
    execution_mode: Any = "cpu"

    def __init__(self, replacements: dict[str, str]) -> None:
        self._replacements = replacements

    @property
    def name(self) -> str:
        """Fixed module name used by this mock."""
        return "fixer"

    def process(self, inputs):
        """Return the TEXT input with every configured replacement applied."""
        fixed = inputs[ArtifactType.TEXT]
        for wrong, right in self._replacements.items():
            fixed = fixed.replace(wrong, right)
        return {ArtifactType.TEXT: fixed}
def _make_corpus(n: int = 2, name: str = "demo") -> Corpus:
    """Build a corpus of *n* synthetic documents.

    Document ``i`` has id ``d{i}``, image path ``/tmp/d{i}.png`` and the
    ground truth ``texte {i}`` (both as the plain ``ground_truth`` field
    and as a TEXT-level ``TextGT``).
    """

    def _build_document(index: int) -> Document:
        expected = f"texte {index}"
        return Document(
            image_path=f"/tmp/d{index}.png",
            ground_truth=expected,
            doc_id=f"d{index}",
            ground_truths={GTLevel.TEXT: TextGT(text=expected)},
        )

    return Corpus(
        name=name,
        documents=[_build_document(index) for index in range(n)],
    )
def _ocr_perfect(path: str) -> str:
idx = path.replace("/tmp/d", "").replace(".png", "")
return f"texte {idx}"
def _ocr_with_typo(path: str) -> str:
idx = path.replace("/tmp/d", "").replace(".png", "")
return f"txete {idx}"
# ──────────────────────────────────────────────────────────────────────────
# 1. compare_pipelines β€” chemins nominaux
# ──────────────────────────────────────────────────────────────────────────
class TestCompareBasic:
    """Nominal paths of ``compare_pipelines``.

    Covers a single pipeline, several pipelines, duplicate names and an
    empty corpus.
    """

    @staticmethod
    def _perfect_ocr_steps() -> list[PipelineStep]:
        """Return a fresh one-step pipeline body using the error-free mock OCR."""
        return [PipelineStep("ocr", MockOCR(_ocr_perfect))]

    def test_single_pipeline(self) -> None:
        """A single spec produces a result keyed by that spec's name."""
        two_docs = _make_corpus(2)
        only_spec = PipelineSpec(
            name="ocr_only",
            steps=self._perfect_ocr_steps(),
        )
        outcome = compare_pipelines([only_spec], two_docs)
        assert outcome.corpus_name == "demo"
        assert outcome.n_docs == 2
        assert outcome.pipeline_names() == ["ocr_only"]
        assert "ocr_only" in outcome.per_pipeline

    def test_multiple_pipelines_preserved_order(self) -> None:
        """Pipeline names come back in the order the specs were given."""
        one_doc = _make_corpus(1)
        labels = ("alpha", "beta", "gamma")
        specs = [
            PipelineSpec(label, self._perfect_ocr_steps())
            for label in labels
        ]
        outcome = compare_pipelines(specs, one_doc)
        assert outcome.pipeline_names() == ["alpha", "beta", "gamma"]

    def test_duplicate_names_raises(self) -> None:
        """Two specs sharing a name are rejected with a ``ValueError``."""
        one_doc = _make_corpus(1)
        clashing = [
            PipelineSpec("dup", self._perfect_ocr_steps())
            for _ in range(2)
        ]
        with pytest.raises(ValueError, match="non uniques"):
            compare_pipelines(clashing, one_doc)

    def test_empty_corpus(self) -> None:
        """An empty corpus still yields an entry for the pipeline."""
        no_docs = Corpus(name="empty", documents=[])
        only_spec = PipelineSpec(
            name="ocr",
            steps=self._perfect_ocr_steps(),
        )
        outcome = compare_pipelines([only_spec], no_docs)
        assert outcome.n_docs == 0
        assert "ocr" in outcome.per_pipeline
# ──────────────────────────────────────────────────────────────────────────
# 2. ranking_by_final_metric
# ──────────────────────────────────────────────────────────────────────────
class TestRanking:
    """``ranking_by_final_metric``: sort direction and metric-less pipelines."""

    def test_lower_is_better_default(self) -> None:
        """With the default direction the lowest CER ranks first."""
        corpus = _make_corpus(2)
        specs = [
            # Perfect OCR → CER = 0
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            # Faulty OCR → CER > 0
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        ranked = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer",
        )
        # The perfect pipeline comes first (CER 0 < typo CER > 0).
        assert ranked[0][0] == "perfect"
        assert ranked[0][1] == 0.0
        assert ranked[1][0] == "typo"
        assert ranked[1][1] > 0.0

    def test_higher_is_better(self) -> None:
        """``higher_is_better=True`` reverses the ranking order."""
        corpus = _make_corpus(1)
        specs = [
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        # CER is ranked in both directions purely to observe the
        # inversion; no genuinely "higher is better" metric is used.
        ranked_lower = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer", higher_is_better=False,
        )
        ranked_higher = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer", higher_is_better=True,
        )
        # The two pipelines must score differently; asserting it (rather
        # than guarding with ``if``) keeps the check from passing vacuously.
        assert ranked_lower[0][1] != ranked_lower[1][1]
        assert [entry[0] for entry in ranked_lower] == ["perfect", "typo"]
        assert [entry[0] for entry in ranked_higher] == ["typo", "perfect"]

    def test_pipelines_without_metric_in_queue(self) -> None:
        """A pipeline with no metric value is ranked last with ``None``."""
        # A pipeline that never produces TEXT (here: every document
        # crashes) has no metric and must end up at the tail.
        corpus = _make_corpus(1)

        class AlwaysFails(BaseModule):
            """Mock module whose ``process`` always raises."""

            input_types = (ArtifactType.IMAGE,)
            output_types = (ArtifactType.TEXT,)
            execution_mode: Any = "io"

            @property
            def name(self) -> str:
                return "fail"

            def process(self, inputs):
                raise RuntimeError("boom")

        specs = [
            PipelineSpec("ok", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("ko", [
                PipelineStep("ocr", AlwaysFails()),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        ranked = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer",
        )
        # "ok" leads, "ko" is last with a None value.
        assert ranked[0][0] == "ok"
        assert ranked[0][1] == 0.0
        assert ranked[-1][0] == "ko"
        assert ranked[-1][1] is None
# ──────────────────────────────────────────────────────────────────────────
# 3. gain_table
# ──────────────────────────────────────────────────────────────────────────
class TestGainTable:
    """``gain_table``: unknown baseline, zero baseline and a realistic gain."""

    def test_baseline_unknown_raises(self) -> None:
        """An unknown baseline name raises ``KeyError``."""
        corpus = _make_corpus(1)
        spec = PipelineSpec("a", [PipelineStep("ocr", MockOCR(_ocr_perfect))])
        result = compare_pipelines([spec], corpus)
        with pytest.raises(KeyError, match="baseline"):
            result.gain_table(
                ArtifactType.TEXT, "cer", baseline_pipeline="inconnue",
            )

    def test_baseline_self_zero_gain(self) -> None:
        """The baseline compared with itself has an absolute gain of 0."""
        corpus = _make_corpus(1)
        spec = PipelineSpec("a", [PipelineStep("ocr", MockOCR(_ocr_perfect))])
        result = compare_pipelines([spec], corpus)
        gains = result.gain_table(ArtifactType.TEXT, "cer", "a")
        assert gains["a"]["absolute"] == 0.0
        # The single pipeline is its own baseline and its CER is 0, so
        # the relative gain is None (baseline = 0), not 0.
        assert gains["a"]["relative"] is None

    def test_relative_none_when_baseline_zero(self) -> None:
        """A zero baseline gives ``relative=None`` but a real ``absolute``."""
        corpus = _make_corpus(1)
        specs = [
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        gains = result.gain_table(ArtifactType.TEXT, "cer", "perfect")
        # baseline = 0 → relative = None
        assert gains["typo"]["relative"] is None
        assert gains["typo"]["absolute"] is not None
        assert gains["typo"]["absolute"] > 0

    def test_realistic_fixer_outperforms_baseline(self) -> None:
        """OCR + fixer beats OCR alone in the gain table and the ranking."""
        # The OCR makes a correctable mistake ('texte' → 'txete'); the
        # fixer restores the exact ground truth.
        corpus = _make_corpus(2)
        specs = [
            PipelineSpec("ocr_only", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
            PipelineSpec("ocr_with_fixer", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
                PipelineStep("fix", TextFixer({"txete": "texte"})),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        gains = result.gain_table(
            ArtifactType.TEXT, "cer", "ocr_only",
        )
        # ocr_only: CER > 0; ocr_with_fixer: CER = 0
        assert gains["ocr_only"]["value"] > 0
        assert gains["ocr_with_fixer"]["value"] == 0.0
        # Negative absolute gain (CER goes down → better).
        assert gains["ocr_with_fixer"]["absolute"] < 0
        # The fixer pipeline must also lead the default (ascending) ranking.
        ranked = result.ranking_by_final_metric(ArtifactType.TEXT, "cer")
        assert ranked[0][0] == "ocr_with_fixer"
# ──────────────────────────────────────────────────────────────────────────
# 4. factories par pipeline
# ──────────────────────────────────────────────────────────────────────────
class TestCustomFactoriesPerPipeline:
    """Input factories are looked up per pipeline name."""

    def test_factories_routed_per_pipeline(self) -> None:
        """An IMAGE-first and a TEXT-first pipeline both succeed."""
        one_doc = _make_corpus(1)

        def text_from_ground_truth(doc):
            # Seed the pipeline with the document's ground truth as TEXT.
            return {ArtifactType.TEXT: doc.ground_truth}

        # "from_image" starts from IMAGE and has no custom factory
        # (default factory); "from_text" starts from TEXT and needs one.
        image_first = PipelineSpec("from_image", [
            PipelineStep("ocr", MockOCR(_ocr_perfect)),
        ])
        text_first = PipelineSpec("from_text", [
            PipelineStep("fix", TextFixer({"texte": "TEXTE"})),
        ])
        per_pipeline_factories = {"from_text": text_from_ground_truth}

        outcome = compare_pipelines(
            [image_first, text_first], one_doc, per_pipeline_factories,
        )

        # Both pipelines ran without error.
        assert outcome.per_pipeline["from_image"].n_pipelines_succeeded == 1
        assert outcome.per_pipeline["from_text"].n_pipelines_succeeded == 1
# ──────────────────────────────────────────────────────────────────────────
# 5. Dataclass directe
# ──────────────────────────────────────────────────────────────────────────
class TestDataclass:
    """Direct construction of ``PipelineComparisonResult``."""

    def test_default(self) -> None:
        """With only ``corpus_name`` given, the other fields are empty."""
        bare = PipelineComparisonResult(corpus_name="c")
        assert bare.n_docs == 0
        assert bare.per_pipeline == {}
        assert bare.pipeline_names() == []