# Picarones/tests/pipeline/test_sprint_a14_s6_validation.py
# Claude
# feat(pipeline): Sprint A14-S6 — PipelineSpec déclaratif + validation + YAML round-trip
# b9ff8de unverified
# Raw
# History Blame
# 12.1 kB
"""Sprint A14-S6 — ``validate_spec``.
Couvre les ~12 cas typiques : chaîne valide, type manquant,
adapter inconnu, fork avec ``inputs_from``, références invalides,
DAG vide, IDs dupliqués.
Aucun ``StepExecutor`` instancié — la validation est purement
statique sur la spec.
"""
from __future__ import annotations
from picarones.domain import ArtifactType
from picarones.pipeline import (
INITIAL_STEP_ID,
PipelineSpec,
PipelineStep,
validate_spec,
)
# ──────────────────────────────────────────────────────────────────────
# Cas valides
# ──────────────────────────────────────────────────────────────────────
class TestValidSpecs:
    """Well-formed specs: ``validate_spec`` must return an empty list."""

    def test_simple_ocr_pipeline(self) -> None:
        """A lone OCR step consuming the initial IMAGE yields no error."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        pipeline = PipelineSpec(
            name="ocr_only",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        assert validate_spec(pipeline) == []

    def test_ocr_then_llm(self) -> None:
        """OCR followed by a post-correction step, without ``inputs_from``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        correction_step = PipelineStep(
            id="correct",
            kind="post_correction",
            adapter_name="openai:gpt-4o",
            input_types=(ArtifactType.RAW_TEXT,),
            output_types=(ArtifactType.CORRECTED_TEXT,),
        )
        pipeline = PipelineSpec(
            name="ocr_llm",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step, correction_step),
        )

        assert validate_spec(pipeline) == []

    def test_def_of_done_tesseract_llm_alto_remap(self) -> None:
        """Sprint S6 definition of done: validate the target BnF YAML."""
        # The OCR step emits both RAW_TEXT and ALTO_XML; the two later steps
        # each name their source step explicitly through ``inputs_from``.
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT, ArtifactType.ALTO_XML),
        )
        correction_step = PipelineStep(
            id="correction",
            kind="post_correction",
            adapter_name="openai:gpt-4o",
            input_types=(ArtifactType.RAW_TEXT,),
            output_types=(ArtifactType.CORRECTED_TEXT,),
            inputs_from={ArtifactType.RAW_TEXT: "ocr"},
        )
        remap_step = PipelineStep(
            id="alto_remap",
            kind="alto_remapping",
            adapter_name="picarones-contrib:line_remapper",
            input_types=(
                ArtifactType.CORRECTED_TEXT,
                ArtifactType.ALTO_XML,
            ),
            output_types=(ArtifactType.ALTO_XML,),
            inputs_from={
                ArtifactType.CORRECTED_TEXT: "correction",
                ArtifactType.ALTO_XML: "ocr",
            },
        )
        pipeline = PipelineSpec(
            name="tesseract_llm_alto_remap",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step, correction_step, remap_step),
        )

        assert validate_spec(pipeline) == []

    def test_inputs_from_initial_explicit(self) -> None:
        """A step may explicitly reference the initial inputs by using
        ``INITIAL_STEP_ID`` as the source in ``inputs_from``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
            inputs_from={ArtifactType.IMAGE: INITIAL_STEP_ID},
        )
        pipeline = PipelineSpec(
            name="explicit_initial",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        assert validate_spec(pipeline) == []
# ──────────────────────────────────────────────────────────────────────
# Cas invalides
# ──────────────────────────────────────────────────────────────────────
class TestInvalidSpecs:
    """Malformed specs: each test checks that the expected error code is
    present (or absent) among the errors returned by ``validate_spec``."""

    def test_empty_pipeline(self) -> None:
        """A spec built with only a name yields exactly one error,
        ``empty_pipeline``."""
        errors = validate_spec(PipelineSpec(name="empty"))

        assert len(errors) == 1
        assert errors[0].code == "empty_pipeline"

    def test_missing_input_no_initial(self) -> None:
        """A step requires IMAGE while ``initial_inputs`` is empty."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        pipeline = PipelineSpec(
            name="missing_image",
            initial_inputs=(),
            steps=(ocr_step,),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "missing_input" in reported_codes

    def test_missing_input_step_order_wrong(self) -> None:
        """The correction step is listed before the OCR step, so RAW_TEXT
        does not exist yet when it is needed."""
        correction_step = PipelineStep(
            id="correct",
            kind="post_correction",
            adapter_name="openai",
            input_types=(ArtifactType.RAW_TEXT,),
            output_types=(ArtifactType.CORRECTED_TEXT,),
        )
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tesseract",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        pipeline = PipelineSpec(
            name="wrong_order",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(correction_step, ocr_step),
        )

        errors = validate_spec(pipeline)
        reported_codes = [err.code for err in errors]

        assert "missing_input" in reported_codes
        # The first step ("correct") must be among the step ids flagged
        # with a missing input.
        flagged_step_ids = [
            err.step_id for err in errors if err.code == "missing_input"
        ]
        assert "correct" in flagged_step_ids

    def test_duplicate_step_id(self) -> None:
        """Two steps sharing the id ``"step"`` trigger ``duplicate_id``."""
        first_step = PipelineStep(
            id="step",
            kind="ocr",
            adapter_name="a",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        second_step = PipelineStep(
            id="step",
            kind="post_correction",
            adapter_name="b",
            input_types=(ArtifactType.RAW_TEXT,),
            output_types=(ArtifactType.CORRECTED_TEXT,),
        )
        pipeline = PipelineSpec(
            name="dup",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(first_step, second_step),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "duplicate_id" in reported_codes

    def test_unknown_adapter_when_registry_provided(self) -> None:
        """An adapter name absent from ``available_adapters`` is reported
        as ``unknown_adapter``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="not_in_registry",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        pipeline = PipelineSpec(
            name="unknown",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        errors = validate_spec(pipeline, available_adapters={"tesseract"})
        reported_codes = [err.code for err in errors]

        assert "unknown_adapter" in reported_codes

    def test_no_adapter_check_when_registry_none(self) -> None:
        """When ``available_adapters`` is not passed, adapter names are
        not checked."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="not_registered_anywhere",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        pipeline = PipelineSpec(
            name="x",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        # No registry is supplied on purpose.
        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "unknown_adapter" not in reported_codes

    def test_inputs_from_unused_type(self) -> None:
        """A step declares ``inputs_from[X]`` although X is not listed in
        its ``input_types``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tess",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
            inputs_from={ArtifactType.ALTO_XML: INITIAL_STEP_ID},
        )
        pipeline = PipelineSpec(
            name="x",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "inputs_from_unused" in reported_codes

    def test_unknown_input_source(self) -> None:
        """``inputs_from[type] = "ghost"`` while no step is named
        ``ghost``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tess",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
            inputs_from={ArtifactType.IMAGE: "ghost"},
        )
        pipeline = PipelineSpec(
            name="x",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step,),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "unknown_input_source" in reported_codes

    def test_source_does_not_produce_type(self) -> None:
        """``inputs_from[ALTO_XML] = "ocr"`` while ``ocr`` only outputs
        ``RAW_TEXT``."""
        ocr_step = PipelineStep(
            id="ocr",
            kind="ocr",
            adapter_name="tess",
            input_types=(ArtifactType.IMAGE,),
            output_types=(ArtifactType.RAW_TEXT,),
        )
        alto_consumer_step = PipelineStep(
            id="alto_consumer",
            kind="x",
            adapter_name="y",
            input_types=(ArtifactType.ALTO_XML,),
            output_types=(ArtifactType.ALTO_XML,),
            inputs_from={ArtifactType.ALTO_XML: "ocr"},
        )
        pipeline = PipelineSpec(
            name="x",
            initial_inputs=(ArtifactType.IMAGE,),
            steps=(ocr_step, alto_consumer_step),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "source_does_not_produce_type" in reported_codes
        # ALTO_XML is also unavailable to the consumer, so ``missing_input``
        # may be reported as well; this test deliberately does not assert it.

    def test_multiple_errors_at_once(self) -> None:
        """``validate_spec`` does not stop at the first error."""
        image_step = PipelineStep(
            id="dup",
            kind="x",
            adapter_name="a",
            input_types=(ArtifactType.IMAGE,),
            output_types=(),
        )
        text_step = PipelineStep(
            id="dup",
            kind="y",
            adapter_name="b",
            input_types=(ArtifactType.RAW_TEXT,),
            output_types=(),
        )
        pipeline = PipelineSpec(
            name="multi_errors",
            initial_inputs=(),
            steps=(image_step, text_step),
        )

        reported_codes = [err.code for err in validate_spec(pipeline)]

        assert "duplicate_id" in reported_codes
        # Neither IMAGE nor RAW_TEXT is available to the steps.
        assert "missing_input" in reported_codes