Picarones / tests /adapters /test_ocr_adapter.py
Claude
test(rename): dΓ©-sprintage tests/adapters (22 fichiers, git mv ; collision vlm arbitrΓ©e)
a59515a unverified
Raw
History Blame
13.9 kB
"""Sprint A14-S26 β€” ``BaseOCRAdapter`` + ``PrecomputedTextAdapter``.
Couverture :
- **Contrat** : un ``BaseOCRAdapter`` est instanciable, expose
``name`` / ``input_types`` / ``output_types`` / ``execution_mode``,
son ``execute()`` est abstrait.
- **PrecomputedTextAdapter** : validation du ``source_label``,
lecture filesystem par convention de nommage, politique
``"raise"`` vs ``"empty"`` sur fichier manquant, validation
UTF-8, isolation entre instances de sources distinctes.
- **Pipeline executor** : un ``PrecomputedTextAdapter`` est consommΓ©
directement par le ``PipelineExecutor`` (S7) β€” preuve que le
contrat ``BaseOCRAdapter`` satisfait ``StepExecutor``.
- **CLI E2E** : YAML dΓ©clarant 3 sources prΓ©-calculΓ©es diffΓ©rentes
β†’ benchmark complet avec 3 pipelines comparΓ©s sur TextView,
sans aucun OCR rΓ©el.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from picarones.adapters.ocr import (
BaseOCRAdapter,
OCRAdapterError,
PrecomputedTextAdapter,
)
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.pipeline.types import RunContext
# ──────────────────────────────────────────────────────────────────
# Fixtures
# ──────────────────────────────────────────────────────────────────
def _png_bytes() -> bytes:
return (
b"\x89PNG\r\n\x1a\n"
b"\x00\x00\x00\rIHDR"
b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00"
b"\x1f\x15\xc4\x89"
)
def _ctx(doc_id: str = "doc01") -> RunContext:
return RunContext(
document_id=doc_id,
code_version="1.0.0-s26-test",
pipeline_name="test_pipeline",
)
def _image_artifact(doc_id: str, path: Path) -> Artifact:
return Artifact(
id=f"{doc_id}:image",
document_id=doc_id,
type=ArtifactType.IMAGE,
uri=str(path),
)
# ──────────────────────────────────────────────────────────────────
# Contrat BaseOCRAdapter
# ──────────────────────────────────────────────────────────────────
class TestBaseOCRAdapterContract:
def test_cannot_instantiate_abstract_directly(self) -> None:
with pytest.raises(TypeError):
BaseOCRAdapter() # type: ignore[abstract]
def test_minimal_subclass_with_name_and_execute_works(self) -> None:
class _Minimal(BaseOCRAdapter):
@property
def name(self) -> str:
return "minimal"
def execute(self, inputs, params, context):
return {}
adapter = _Minimal()
assert adapter.name == "minimal"
assert ArtifactType.IMAGE in adapter.input_types
assert ArtifactType.RAW_TEXT in adapter.output_types
assert adapter.execution_mode == "io"
def test_subclass_can_override_io_modes(self) -> None:
class _CPUBound(BaseOCRAdapter):
execution_mode = "cpu"
input_types = frozenset({ArtifactType.IMAGE})
output_types = frozenset({
ArtifactType.RAW_TEXT, ArtifactType.ALTO_XML,
})
@property
def name(self) -> str:
return "cpu_bound"
def execute(self, inputs, params, context):
return {}
adapter = _CPUBound()
assert adapter.execution_mode == "cpu"
assert ArtifactType.ALTO_XML in adapter.output_types
# ──────────────────────────────────────────────────────────────────
# PrecomputedTextAdapter β€” validation Γ  l'init
# ──────────────────────────────────────────────────────────────────
class TestPrecomputedInitValidation:
def test_empty_source_label_rejected(self) -> None:
with pytest.raises(OCRAdapterError, match="vide"):
PrecomputedTextAdapter(source_label="")
def test_whitespace_source_label_rejected(self) -> None:
with pytest.raises(OCRAdapterError, match="vide"):
PrecomputedTextAdapter(source_label=" ")
def test_invalid_chars_in_source_label_rejected(self) -> None:
for bad in ("foo/bar", "foo bar", "foo.bar", "foo:bar"):
with pytest.raises(OCRAdapterError, match="invalide"):
PrecomputedTextAdapter(source_label=bad)
def test_valid_source_labels_accepted(self) -> None:
for good in ("tesseract", "gpt-4v", "pero_ocr", "ABC123"):
adapter = PrecomputedTextAdapter(source_label=good)
assert adapter.source_label == good
assert adapter.name == f"precomputed_{good}"
def test_invalid_missing_text_policy_rejected(self) -> None:
with pytest.raises(OCRAdapterError, match="missing_text_policy"):
PrecomputedTextAdapter(
source_label="tess",
missing_text_policy="silent", # type: ignore[arg-type]
)
def test_default_missing_text_policy_is_raise(self) -> None:
adapter = PrecomputedTextAdapter(source_label="tess")
assert adapter._missing_policy == "raise"
# ──────────────────────────────────────────────────────────────────
# PrecomputedTextAdapter β€” exΓ©cution
# ──────────────────────────────────────────────────────────────────
class TestPrecomputedExecute:
def test_reads_text_file_by_convention(self, tmp_path: Path) -> None:
# PrΓ©parer image + texte prΓ©-calculΓ©.
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
text_path = tmp_path / "doc01.tesseract.txt"
text_path.write_text("Bonjour le monde", encoding="utf-8")
adapter = PrecomputedTextAdapter(source_label="tesseract")
outputs = adapter.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
art = outputs[ArtifactType.RAW_TEXT]
assert art.type == ArtifactType.RAW_TEXT
assert art.document_id == "doc01"
assert Path(art.uri).read_text(encoding="utf-8") == "Bonjour le monde"
# Convention <doc_id>:<owner>:<role>.
assert art.id == "doc01:precomputed_tesseract:raw_text"
def test_missing_text_raises_by_default(self, tmp_path: Path) -> None:
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
# Pas de doc01.tesseract.txt.
adapter = PrecomputedTextAdapter(source_label="tesseract")
with pytest.raises(OCRAdapterError, match="introuvable"):
adapter.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
def test_missing_text_empty_policy_creates_empty_file(
self, tmp_path: Path,
) -> None:
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
adapter = PrecomputedTextAdapter(
source_label="tess",
missing_text_policy="empty",
)
outputs = adapter.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
art = outputs[ArtifactType.RAW_TEXT]
assert Path(art.uri).read_text(encoding="utf-8") == ""
def test_non_utf8_file_rejected(self, tmp_path: Path) -> None:
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
text_path = tmp_path / "doc01.tess.txt"
# Bytes invalides en UTF-8 (latin-1 avec accent).
text_path.write_bytes(b"\xe9\xe8")
adapter = PrecomputedTextAdapter(source_label="tess")
with pytest.raises(OCRAdapterError, match="UTF-8"):
adapter.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
def test_missing_image_input_rejected(self, tmp_path: Path) -> None:
adapter = PrecomputedTextAdapter(source_label="tess")
with pytest.raises(OCRAdapterError, match="IMAGE manquant"):
adapter.execute(inputs={}, params={}, context=_ctx())
def test_image_artifact_without_uri_rejected(self) -> None:
adapter = PrecomputedTextAdapter(source_label="tess")
with pytest.raises(OCRAdapterError, match="sans URI"):
adapter.execute(
inputs={
ArtifactType.IMAGE: Artifact(
id="d:image", document_id="d",
type=ArtifactType.IMAGE,
),
},
params={},
context=_ctx(),
)
def test_two_sources_isolated_in_same_dir(self, tmp_path: Path) -> None:
"""Cas BnF central : deux sources prΓ©-calculΓ©es dans le mΓͺme
rΓ©pertoire ne se piΓ©tinent pas β€” chaque adapter lit son
propre fichier."""
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
(tmp_path / "doc01.tess.txt").write_text(
"tesseract output", encoding="utf-8",
)
(tmp_path / "doc01.gpt4v.txt").write_text(
"gpt-4 vision output", encoding="utf-8",
)
a_tess = PrecomputedTextAdapter(source_label="tess")
a_gpt = PrecomputedTextAdapter(source_label="gpt4v")
out_tess = a_tess.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
out_gpt = a_gpt.execute(
inputs={ArtifactType.IMAGE: _image_artifact("doc01", image_path)},
params={},
context=_ctx("doc01"),
)
assert Path(out_tess[ArtifactType.RAW_TEXT].uri).read_text() \
== "tesseract output"
assert Path(out_gpt[ArtifactType.RAW_TEXT].uri).read_text() \
== "gpt-4 vision output"
def test_image_extension_variations_handled(
self, tmp_path: Path,
) -> None:
"""``stem`` strip toutes les extensions image courantes."""
for ext in (".png", ".jpg", ".jpeg", ".tif", ".tiff"):
image_path = tmp_path / f"folio_001{ext}"
image_path.write_bytes(_png_bytes())
text_path = tmp_path / "folio_001.src.txt"
text_path.write_text("ok", encoding="utf-8")
adapter = PrecomputedTextAdapter(source_label="src")
out = adapter.execute(
inputs={
ArtifactType.IMAGE: _image_artifact("folio_001", image_path),
},
params={},
context=_ctx("folio_001"),
)
assert Path(out[ArtifactType.RAW_TEXT].uri).read_text() == "ok"
# ──────────────────────────────────────────────────────────────────
# Smoke pipeline executor
# ──────────────────────────────────────────────────────────────────
class TestPipelineExecutorIntegration:
def test_adapter_consumed_by_pipeline_executor(
self, tmp_path: Path,
) -> None:
"""DΓ©montre que ``BaseOCRAdapter`` satisfait le contrat
``StepExecutor`` du nouveau pipeline executor β€” preuve que
le contrat propre du nouveau monde est suffisant."""
from picarones.domain.documents import DocumentRef
from picarones.pipeline import (
PipelineExecutor, PipelineSpec, PipelineStep,
)
image_path = tmp_path / "doc01.png"
image_path.write_bytes(_png_bytes())
(tmp_path / "doc01.tess.txt").write_text(
"Bonjour", encoding="utf-8",
)
adapter = PrecomputedTextAdapter(source_label="tess")
spec = PipelineSpec(
name="precomputed_smoke",
initial_inputs=(ArtifactType.IMAGE,),
steps=(PipelineStep(
id="ocr", kind="ocr",
adapter_name="precomputed",
input_types=(ArtifactType.IMAGE,),
output_types=(ArtifactType.RAW_TEXT,),
),),
)
executor = PipelineExecutor(adapter_resolver=lambda n: adapter)
result = executor.run(
spec=spec,
document=DocumentRef(id="doc01", image_uri=str(image_path)),
initial_inputs={
ArtifactType.IMAGE: _image_artifact("doc01", image_path),
},
context=_ctx("doc01"),
)
assert result.succeeded
text_arts = result.artifacts_of_type(ArtifactType.RAW_TEXT)
assert len(text_arts) == 1
assert Path(text_arts[0].uri).read_text() == "Bonjour"