Spaces:
Sleeping
Sleeping
File size: 4,710 Bytes
"""Sprint A14-S6 — YAML round-trip of a ``PipelineSpec``.

Ensures that ``dump_spec_to_yaml(spec)`` produces YAML that loads back
into a strictly equal spec. This is the property that makes it possible
to version pipelines in git in a way that is both human-readable and
machine-actionable.
"""
from __future__ import annotations
import pytest
from picarones.domain import ArtifactType, PicaronesError
from picarones.pipeline import (
PipelineSpec,
PipelineStep,
dump_spec_to_yaml,
load_spec_from_yaml,
)
def _ocr_only_spec() -> PipelineSpec:
    """Build a minimal fixture spec made of a single OCR step.

    The step is named ``ocr``, uses the ``tesseract`` adapter, takes an
    image as input and declares raw text as its only output.
    """
    image = ArtifactType.IMAGE

    # The one and only step of this pipeline.
    tesseract_step = PipelineStep(
        id="ocr",
        kind="ocr",
        adapter_name="tesseract",
        params={"lang": "fra", "psm": 6},
        input_types=(image,),
        output_types=(ArtifactType.RAW_TEXT,),
    )

    return PipelineSpec(
        name="ocr_only",
        description="Tesseract sur image patrimoniale.",
        initial_inputs=(image,),
        steps=(tesseract_step,),
    )
def _full_pipeline_spec() -> PipelineSpec:
    """Build a three-step fixture spec: OCR, LLM correction, ALTO remapping.

    Step wiring, as declared through ``inputs_from``:

    * ``correction`` reads the raw text produced by ``ocr``;
    * ``alto_remap`` reads the corrected text from ``correction`` and the
      ALTO XML from ``ocr``.
    """
    # Short aliases keep the step declarations below readable.
    image = ArtifactType.IMAGE
    raw_text = ArtifactType.RAW_TEXT
    corrected_text = ArtifactType.CORRECTED_TEXT
    alto_xml = ArtifactType.ALTO_XML

    ocr_step = PipelineStep(
        id="ocr",
        kind="ocr",
        adapter_name="tesseract",
        params={"lang": "fra"},
        input_types=(image,),
        output_types=(raw_text, alto_xml),
    )

    correction_step = PipelineStep(
        id="correction",
        kind="post_correction",
        adapter_name="openai:gpt-4o",
        params={"temperature": 0.0, "max_tokens": 4096},
        input_types=(raw_text,),
        output_types=(corrected_text,),
        inputs_from={raw_text: "ocr"},
    )

    # No ``params`` given for this step: the PipelineStep default applies.
    remap_step = PipelineStep(
        id="alto_remap",
        kind="alto_remapping",
        adapter_name="picarones-contrib:line_remapper",
        input_types=(corrected_text, alto_xml),
        output_types=(alto_xml,),
        inputs_from={
            corrected_text: "correction",
            alto_xml: "ocr",
        },
    )

    return PipelineSpec(
        name="tesseract_llm_alto_remap",
        description="OCR + LLM + remapping ALTO (cas BnF central).",
        initial_inputs=(image,),
        steps=(ocr_step, correction_step, remap_step),
    )
class TestYAMLRoundtrip:
    """Round-trip checks for ``dump_spec_to_yaml`` / ``load_spec_from_yaml``."""

    @pytest.mark.parametrize("spec_factory", [_ocr_only_spec, _full_pipeline_spec])
    def test_roundtrip_preserves_equality(self, spec_factory) -> None:
        """Dumping a spec and loading the result yields an equal spec."""
        original = spec_factory()
        reloaded = load_spec_from_yaml(dump_spec_to_yaml(original))
        assert original == reloaded

    def test_roundtrip_is_idempotent(self) -> None:
        """Dump -> load -> dump produces the same YAML text both times."""
        first_dump = dump_spec_to_yaml(_full_pipeline_spec())
        second_dump = dump_spec_to_yaml(load_spec_from_yaml(first_dump))
        assert first_dump == second_dump

    def test_yaml_is_human_readable(self) -> None:
        """The dumped YAML should use block style (one field per line),
        not the JSON-like flow style."""
        rendered = dump_spec_to_yaml(_full_pipeline_spec())
        # There is deliberately no check for the absence of "{": ``params``
        # may still be rendered as ``{}`` when the dict is empty. We only
        # verify that the overall layout is readable.
        expected_fragments = (
            "name: tesseract_llm_alto_remap",
            "steps:",
            "- id: ocr",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_empty_yaml_raises(self) -> None:
        """Loading an empty document raises ``PicaronesError`` matching "vide"."""
        empty_document = ""
        with pytest.raises(PicaronesError, match="vide"):
            load_spec_from_yaml(empty_document)

    def test_yaml_ordered_fields(self) -> None:
        """Field order must survive the dump (``sort_keys=False`` is
        expected to be honoured)."""
        rendered = dump_spec_to_yaml(_ocr_only_spec())
        # Expected order: ``name`` before ``description``, then
        # ``initial_inputs`` before ``steps``.
        ordered_keys = ("name:", "description:", "initial_inputs:", "steps:")
        offsets = [rendered.index(key) for key in ordered_keys]
        # Each key must start strictly before the next one.
        assert all(left < right for left, right in zip(offsets, offsets[1:]))

    def test_invalid_yaml_raises(self) -> None:
        """YAML that does not satisfy the PipelineSpec schema makes loading
        raise. Any ``Exception`` is accepted here; a pydantic
        ValidationError is what is expected."""
        # NOTE(review): nested keys carry a single leading space in this
        # literal — confirm that is the intended indentation.
        bad = (
            "name: x\n"
            "steps:\n"
            " - id: ocr\n"
            " kind: ocr\n"
            " adapter_name: x\n"
            " input_types: [bogus_type]\n"
        )
        with pytest.raises(Exception):  # pydantic ValidationError
            load_spec_from_yaml(bad)
|