Picarones / tests /evaluation /test_sprint_a14_s13_view_executor.py
Claude
refactor: kill bricolage S49-S57 — fixes structurels (audit cleanup)
88add17 unverified
raw
history blame
17.6 kB
"""Sprint A14-S13 — ``DefaultEvaluationViewExecutor``.
Tests d'orchestration : la vue + ses dépendances (registries +
payload loader) sur 10+ cas couvrant les chemins critiques.
"""
from __future__ import annotations
import pytest
from picarones.domain import (
Artifact,
ArtifactType,
EvaluationView,
MetricSpec,
ProjectionError,
ProjectionSpec,
)
from picarones.evaluation.projectors import (
ProjectionReport,
ProjectorRegistry,
ProjectorRegistrationError,
ProjectorNotFoundError,
)
from picarones.evaluation.registry import MetricRegistry
from picarones.evaluation.views import (
DefaultEvaluationViewExecutor,
)
# ──────────────────────────────────────────────────────────────────────
# Stubs rΓ©utilisables
# ──────────────────────────────────────────────────────────────────────
class _StubProjector:
    """Test double projecting an ALTO artifact to plain text.

    ``project`` never inspects the source payload: it hands back a new
    ``RAW_TEXT`` artifact, the canned ``output_payload`` and a lossy
    ``ProjectionReport``.
    """

    name = "stub_alto_to_text"
    source_type = ArtifactType.ALTO_XML
    target_type = ArtifactType.RAW_TEXT

    def __init__(self, output_payload: str = "projected text") -> None:
        # Canned text returned by ``project`` as the projected payload.
        self.output_payload = output_payload

    def project(self, artifact, params):
        """Return ``(target_artifact, payload, report)`` for ``artifact``.

        ``params`` is accepted but not used by this stub.
        """
        # NOTE(review): the warning literal below looks mis-decoded
        # (probably "deviné"); it is kept byte-for-byte because a test in
        # this file asserts on this exact string.
        projection_report = ProjectionReport(
            source_artifact_id=artifact.id,
            source_type=self.source_type,
            target_type=self.target_type,
            projector_name=self.name,
            lossy=True,
            ignored_dimensions=("geometry", "blocks"),
            warnings=("ordre de lecture devinΓ©",),
        )
        projected_artifact = Artifact(
            id=f"{artifact.id}:projected",
            document_id=artifact.document_id,
            type=self.target_type,
        )
        # Sprint S25: the payload is returned directly with the artifact.
        return projected_artifact, self.output_payload, projection_report
def _build_executor(
    payloads: dict[str, object],
    *,
    register_projector: bool = True,
    extra_metrics: dict[str, object] | None = None,
) -> DefaultEvaluationViewExecutor:
    """Assemble an executor wired to stub metrics, projector and loader.

    Args:
        payloads: Mapping from artifact id to the payload the loader
            returns for that artifact.
        register_projector: When true, a ``_StubProjector`` is registered
            in the projector registry.
        extra_metrics: Optional ``name -> callable`` mapping registered
            after the built-in ``cer`` and ``wer`` stubs, with the same
            text/text input types.

    Returns:
        The executor built by ``DefaultEvaluationViewExecutor.from_registries``.
    """

    def _stub_cer(gt, hyp):
        # 0.0 on equality, 0.5 for two strings of equal length, else 1.0.
        if gt == hyp:
            return 0.0
        same_length_strings = (
            isinstance(gt, str) and isinstance(hyp, str) and len(gt) == len(hyp)
        )
        return 0.5 if same_length_strings else 1.0

    def _stub_wer(gt, hyp):
        # 0.0 on equality, 0.5 otherwise.
        return 0.0 if gt == hyp else 0.5

    # A list of pairs (not a merged dict) keeps the registration order and
    # lets a caller-supplied name collide with "cer"/"wer" as before.
    metric_entries = [("cer", _stub_cer), ("wer", _stub_wer)]
    metric_entries.extend((extra_metrics or {}).items())

    text_pair = (ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT)
    metric_registry = MetricRegistry()
    for metric_name, metric_fn in metric_entries:
        metric_registry.register(
            MetricSpec(name=metric_name, input_types=text_pair),
            metric_fn,
        )

    projector_registry = ProjectorRegistry()
    if register_projector:
        projector_registry.register(_StubProjector())

    def _load_payload(artifact: Artifact):
        # Unknown artifact ids surface as KeyError with an explicit message.
        if artifact.id in payloads:
            return payloads[artifact.id]
        raise KeyError(f"payload manquant : {artifact.id}")

    return DefaultEvaluationViewExecutor.from_registries(
        metric_registry, projector_registry, _load_payload,
    )
def _text_view(
    *,
    name: str = "text_final",
    candidate_types: frozenset = frozenset({
        ArtifactType.RAW_TEXT,
        ArtifactType.CORRECTED_TEXT,
        ArtifactType.ALTO_XML,
    }),
    projection: ProjectionSpec | None = None,
    normalization_profile: str | None = None,
    metric_names: tuple[str, ...] = ("cer",),
    ignored_dimensions: tuple[str, ...] = (),
    warnings: tuple[str, ...] = (),
) -> EvaluationView:
    """Build an ``EvaluationView`` with text-oriented defaults.

    Every keyword is forwarded unchanged to ``EvaluationView``; tests
    override only the fields they care about.
    """
    view_fields = {
        "name": name,
        "candidate_types": candidate_types,
        "projection": projection,
        "normalization_profile": normalization_profile,
        "metric_names": metric_names,
        "ignored_dimensions": ignored_dimensions,
        "warnings": warnings,
    }
    return EvaluationView(**view_fields)
# ──────────────────────────────────────────────────────────────────────
# 10 cas d'Γ©valuation
# ──────────────────────────────────────────────────────────────────────
class TestEvaluator:
    """Ten scenarios exercising ``DefaultEvaluationViewExecutor.evaluate``."""

    def test_text_direct_no_projection(self) -> None:
        """Case 1: RAW_TEXT candidate evaluated directly, no projection."""
        payloads = {"cand": "hello", "gt": "hello"}
        executor = _build_executor(payloads)
        view = _text_view(metric_names=("cer", "wer"))
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert result.metric_values["cer"] == 0.0
        assert result.metric_values["wer"] == 0.0
        assert result.projection_report is None
        assert result.failed_metrics == {}

    def test_text_direct_with_difference(self) -> None:
        """Case 2: RAW_TEXT candidate that differs from the ground truth."""
        payloads = {"cand": "world", "gt": "hello"}
        executor = _build_executor(payloads)
        view = _text_view()
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert result.metric_values["cer"] > 0

    def test_alto_to_text_via_projection(self) -> None:
        """Case 3: ALTO_XML projected to RAW_TEXT; a projection report is set."""
        payloads = {
            "alto:projected": "projected text",
            "gt": "projected text",
        }
        executor = _build_executor(payloads)
        view = _text_view(
            projection=ProjectionSpec(
                source_type=ArtifactType.ALTO_XML,
                target_type=ArtifactType.RAW_TEXT,
                projector_name="stub_alto_to_text",
            ),
        )
        cand = Artifact(id="alto", document_id="d", type=ArtifactType.ALTO_XML)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert result.projection_report is not None
        assert result.projection_report.projector_name == "stub_alto_to_text"
        assert "geometry" in result.ignored_dimensions
        # NOTE(review): this literal looks mis-decoded (probably "deviné")
        # but must stay byte-identical to the warning emitted by
        # ``_StubProjector.project``; repair both together.
        assert "ordre de lecture devinΓ©" in result.warnings
        assert result.metric_values["cer"] == 0.0

    def test_view_rejects_wrong_artifact_type(self) -> None:
        """Case 4: a view that does not accept IMAGE raises ValueError."""
        payloads: dict[str, object] = {}
        executor = _build_executor(payloads)
        view = _text_view(
            candidate_types=frozenset({ArtifactType.RAW_TEXT}),
        )
        cand = Artifact(id="x", document_id="d", type=ArtifactType.IMAGE)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        with pytest.raises(ValueError, match="n'accepte pas"):
            executor.evaluate(view, cand, gt, pipeline_name="test")

    def test_unknown_projector_raises_projection_error(self) -> None:
        """Case 5: the view references a projector that is not registered."""
        payloads = {"cand": "x", "gt": "x"}
        executor = _build_executor(payloads, register_projector=False)
        view = _text_view(
            projection=ProjectionSpec(
                source_type=ArtifactType.ALTO_XML,
                target_type=ArtifactType.RAW_TEXT,
                projector_name="nonexistent",
            ),
        )
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.ALTO_XML)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        with pytest.raises(ProjectionError, match="introuvable"):
            executor.evaluate(view, cand, gt, pipeline_name="test")

    def test_projector_that_raises_wraps_in_projection_error(self) -> None:
        """Case 6: an exception raised inside the projector surfaces as
        ``ProjectionError`` carrying the original message."""

        class _CrashingProjector:
            name = "crash"
            source_type = ArtifactType.ALTO_XML
            target_type = ArtifactType.RAW_TEXT

            def project(self, artifact, params):
                raise RuntimeError("boom interne")

        metrics = MetricRegistry()
        projectors = ProjectorRegistry()
        projectors.register(_CrashingProjector())
        executor = DefaultEvaluationViewExecutor.from_registries(
            metrics, projectors, lambda a: None,
        )
        view = _text_view(
            projection=ProjectionSpec(
                source_type=ArtifactType.ALTO_XML,
                target_type=ArtifactType.RAW_TEXT,
                projector_name="crash",
            ),
            metric_names=(),
        )
        cand = Artifact(id="c", document_id="d", type=ArtifactType.ALTO_XML)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        with pytest.raises(ProjectionError, match="boom interne"):
            executor.evaluate(view, cand, gt, pipeline_name="test")

    def test_metric_that_raises_goes_to_failed_metrics(self) -> None:
        """Case 7: a raising metric lands in ``failed_metrics``; the other
        metrics are still computed."""

        def _broken(gt, hyp):
            # Mojibake repaired; the assertion below checks the same text.
            raise ValueError("métrique cassée")

        payloads = {"cand": "x", "gt": "x"}
        executor = _build_executor(
            payloads,
            extra_metrics={"broken": _broken},
        )
        view = _text_view(metric_names=("cer", "broken", "wer"))
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert "cer" in result.metric_values
        assert "wer" in result.metric_values
        assert "broken" in result.failed_metrics
        assert "métrique cassée" in result.failed_metrics["broken"]

    def test_unknown_metric_goes_to_failed_metrics(self) -> None:
        """Case 8: an unregistered metric name lands in ``failed_metrics``."""
        payloads = {"cand": "x", "gt": "x"}
        executor = _build_executor(payloads)
        view = _text_view(metric_names=("cer", "nonexistent_metric"))
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert "cer" in result.metric_values
        assert "nonexistent_metric" in result.failed_metrics
        # Mojibake repaired so the fragment can match a correctly encoded
        # message (presumably produced by the executor; confirm wording).
        assert "non enregistrée" in result.failed_metrics["nonexistent_metric"]

    def test_normalization_profile_applied(self) -> None:
        """Case 9: a view with ``normalization_profile`` still yields the
        metric for both payloads."""
        # Per the original note, medieval_french maps ſ -> s and u -> v.
        payloads = {"cand": "afpre", "gt": "aſpre"}
        executor = _build_executor(payloads)
        view = _text_view(normalization_profile="medieval_french")
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        # The stub cer returns 0.0 on strict equality and 0.5 for strings
        # of equal length; the candidate uses a plain "f", so the exact
        # value is not pinned here. Only check the metric was computed.
        assert "cer" in result.metric_values

    def test_payload_loader_failure_blocks_all_metrics(self) -> None:
        """Case 10: a failing loader marks every metric as failed."""
        metrics = MetricRegistry()
        metrics.register(
            MetricSpec(
                name="cer",
                input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT),
            ),
            lambda r, h: 0.0,
        )
        projectors = ProjectorRegistry()

        def _bad_loader(artifact):
            # The loader fails on every call.
            raise FileNotFoundError(f"missing file for {artifact.id}")

        executor = DefaultEvaluationViewExecutor.from_registries(
            metrics, projectors, _bad_loader,
        )
        view = _text_view(metric_names=("cer",))
        cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT)
        gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT)

        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert result.metric_values == {}
        assert "cer" in result.failed_metrics
        # Mojibake repaired so the fragment can match a correctly encoded
        # message (presumably produced by the executor; confirm wording).
        assert "payload_loader a échoué" in result.failed_metrics["cer"]
# ──────────────────────────────────────────────────────────────────────
# Constructor validation
# ──────────────────────────────────────────────────────────────────────
class TestConstructor:
    """The canonical constructor (S27) expects two engines plus a loader."""

    def test_rejects_non_projection_engine(self) -> None:
        """A non-engine first argument is rejected with TypeError."""
        from picarones.evaluation.evaluation_engine import EvaluationEngine

        with pytest.raises(TypeError, match="projection_engine"):
            valid_evaluation_engine = EvaluationEngine(MetricRegistry())
            DefaultEvaluationViewExecutor(
                "not an engine",  # type: ignore[arg-type]
                valid_evaluation_engine,
                lambda _artifact: None,
            )

    def test_rejects_non_evaluation_engine(self) -> None:
        """A non-engine second argument is rejected with TypeError."""
        from picarones.evaluation.projection_engine import ProjectionEngine

        with pytest.raises(TypeError, match="evaluation_engine"):
            valid_projection_engine = ProjectionEngine(ProjectorRegistry())
            DefaultEvaluationViewExecutor(
                valid_projection_engine,
                "nope",  # type: ignore[arg-type]
                lambda _artifact: None,
            )

    def test_rejects_non_callable_loader(self) -> None:
        """A loader that is not callable is rejected with TypeError."""
        from picarones.evaluation.evaluation_engine import EvaluationEngine
        from picarones.evaluation.projection_engine import ProjectionEngine

        with pytest.raises(TypeError, match="callable"):
            valid_projection_engine = ProjectionEngine(ProjectorRegistry())
            valid_evaluation_engine = EvaluationEngine(MetricRegistry())
            DefaultEvaluationViewExecutor(
                valid_projection_engine,
                valid_evaluation_engine,
                "not_callable",  # type: ignore[arg-type]
            )

    def test_from_registries_rejects_non_metric_registry(self) -> None:
        """``from_registries`` rejects a bogus metric registry."""
        with pytest.raises(TypeError, match="metric_registry"):
            projector_registry = ProjectorRegistry()
            DefaultEvaluationViewExecutor.from_registries(
                "not a registry",  # type: ignore[arg-type]
                projector_registry,
                lambda _artifact: None,
            )

    def test_from_registries_rejects_non_projector_registry(self) -> None:
        """``from_registries`` rejects a bogus projector registry."""
        with pytest.raises(TypeError, match="projector_registry"):
            metric_registry = MetricRegistry()
            DefaultEvaluationViewExecutor.from_registries(
                metric_registry,
                "nope",  # type: ignore[arg-type]
                lambda _artifact: None,
            )

    def test_from_registries_rejects_non_callable_loader(self) -> None:
        """``from_registries`` rejects a loader that is not callable."""
        with pytest.raises(TypeError, match="callable"):
            metric_registry = MetricRegistry()
            projector_registry = ProjectorRegistry()
            DefaultEvaluationViewExecutor.from_registries(
                metric_registry,
                projector_registry,
                "not_callable",  # type: ignore[arg-type]
            )
# ──────────────────────────────────────────────────────────────────────
# ProjectorRegistry β€” tests directs
# ──────────────────────────────────────────────────────────────────────
class TestProjectorRegistry:
    """Direct tests of ``ProjectorRegistry`` without going through a view."""

    def test_register_and_get(self) -> None:
        """A registered projector is found by name and returned as-is."""
        registry = ProjectorRegistry()
        stub = _StubProjector()
        registry.register(stub)

        assert "stub_alto_to_text" in registry
        assert registry.get("stub_alto_to_text") is stub

    def test_register_non_protocol_raises(self) -> None:
        """Registering an object that is not a projector raises."""
        registry = ProjectorRegistry()

        class _NotAProjector:
            """Empty class lacking every projector attribute."""

        with pytest.raises(ProjectorRegistrationError):
            registry.register(_NotAProjector())  # type: ignore[arg-type]

    def test_idempotent_re_registration(self) -> None:
        """Registering the same instance twice keeps a single entry."""
        registry = ProjectorRegistry()
        stub = _StubProjector()
        registry.register(stub)
        registry.register(stub)  # second call must not raise

        assert len(registry) == 1

    def test_get_unknown_raises(self) -> None:
        """Looking up an unknown name raises ``ProjectorNotFoundError``."""
        registry = ProjectorRegistry()

        with pytest.raises(ProjectorNotFoundError):
            registry.get("missing")

    def test_two_registries_independent(self) -> None:
        """Registering in one registry does not leak into another."""
        first = ProjectorRegistry()
        second = ProjectorRegistry()
        first.register(_StubProjector())

        assert "stub_alto_to_text" in first
        assert "stub_alto_to_text" not in second