Picarones / tests /adapters /ocr /test_sprint_a14_s30_tesseract_adapter.py
Claude
feat(adapters): Phase B5 — TesseractAdapter expose ALTO XML natif
1c1ad9a unverified
Raw
History Blame
15.7 kB
"""Sprint A14-S30 — ``TesseractAdapter`` natif au contrat S26.
Tests de l'adapter Tesseract migré nativement (pas de shim sur le
legacy ``picarones.engines.tesseract``).
Couvre :
1. Constructeur :
- rejet des paramètres invalides (name, psm, oem) ;
- valeurs par défaut ;
- propriétés en lecture.
2. ``execute`` :
- cas nominal (mock pytesseract) → Artifact RAW_TEXT avec URI ;
- input IMAGE absent → OCRAdapterError ;
- artefact image sans URI → OCRAdapterError ;
- image inexistante → OCRAdapterError ;
- pytesseract non installé → OCRAdapterError ;
- Tesseract lève → OCRAdapterError ;
- écriture du fichier de sortie au bon emplacement ;
- tesseract_cmd appliqué.
3. Contrat ``BaseOCRAdapter`` :
- input_types / output_types / execution_mode ;
- hérite bien de BaseOCRAdapter.
"""
from __future__ import annotations
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from picarones.adapters.ocr import (
BaseOCRAdapter,
OCRAdapterError,
TesseractAdapter,
)
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.pipeline.types import RunContext
# ──────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────
def _make_image_artifact(uri: str) -> Artifact:
    """Build the IMAGE artifact of document ``d1`` whose URI is ``uri``."""
    fields = {
        "id": "d1:initial:image",
        "document_id": "d1",
        "type": ArtifactType.IMAGE,
        "uri": uri,
    }
    return Artifact(**fields)
def _make_context() -> RunContext:
    """Return the fixed ``RunContext`` shared by the tests (document ``d1``)."""
    context = RunContext(
        document_id="d1", code_version="1.0.0", pipeline_name="test",
    )
    return context
# ──────────────────────────────────────────────────────────────────────
# Constructeur
# ──────────────────────────────────────────────────────────────────────
class TestTesseractAdapterConstructor:
    """Constructor checks: defaults, custom name and argument validation."""

    def test_defaults(self) -> None:
        """A bare adapter exposes the default name, lang, psm and oem."""
        adapter = TesseractAdapter()
        expected = {"name": "tesseract", "lang": "fra", "psm": 6, "oem": 3}
        for attribute, value in expected.items():
            assert getattr(adapter, attribute) == value

    def test_custom_name(self) -> None:
        """A valid custom name is stored and readable through ``name``."""
        custom = "my_tesseract_lat"
        adapter = TesseractAdapter(name=custom)
        assert adapter.name == custom

    def test_rejects_empty_name(self) -> None:
        """An empty name is refused with the "vide" error."""
        expect_empty_error = pytest.raises(OCRAdapterError, match="vide")
        with expect_empty_error:
            TesseractAdapter(name="")

    def test_rejects_whitespace_name(self) -> None:
        """A whitespace-only name is refused with the "vide" error."""
        expect_empty_error = pytest.raises(OCRAdapterError, match="vide")
        with expect_empty_error:
            TesseractAdapter(name=" ")

    def test_rejects_invalid_chars_in_name(self) -> None:
        """A name containing spaces is refused with the "invalide" error."""
        expect_invalid_error = pytest.raises(OCRAdapterError, match="invalide")
        with expect_invalid_error:
            TesseractAdapter(name="bad name with space")

    def test_rejects_psm_out_of_range(self) -> None:
        """``psm`` just above and just below [0, 13] is refused."""
        pattern = r"psm.*\[0, 13\]"
        for bad_psm in (14, -1):
            with pytest.raises(OCRAdapterError, match=pattern):
                TesseractAdapter(psm=bad_psm)

    def test_rejects_oem_out_of_range(self) -> None:
        """``oem`` just above and just below [0, 3] is refused."""
        pattern = r"oem.*\[0, 3\]"
        for bad_oem in (4, -1):
            with pytest.raises(OCRAdapterError, match=pattern):
                TesseractAdapter(oem=bad_oem)

    def test_accepts_psm_boundary_values(self) -> None:
        """Both ends of the ``psm`` range construct without error."""
        for edge_psm in (0, 13):
            TesseractAdapter(psm=edge_psm)

    def test_accepts_oem_boundary_values(self) -> None:
        """Both ends of the ``oem`` range construct without error."""
        for edge_oem in (0, 3):
            TesseractAdapter(oem=edge_oem)
# ──────────────────────────────────────────────────────────────────────
# Contrat BaseOCRAdapter
# ──────────────────────────────────────────────────────────────────────
class TestTesseractAdapterContract:
    """Class-level ``BaseOCRAdapter`` contract exposed by ``TesseractAdapter``."""

    def test_inherits_base_adapter(self) -> None:
        """An adapter instance is a ``BaseOCRAdapter``."""
        assert isinstance(TesseractAdapter(), BaseOCRAdapter)

    def test_input_types(self) -> None:
        """IMAGE is the only declared input type."""
        expected = frozenset({ArtifactType.IMAGE})
        assert TesseractAdapter.input_types == expected

    def test_output_types(self) -> None:
        """``output_types`` is the maximal set produced (class constant).

        When ``expose_confidences=False`` or ``expose_alto=False``,
        ``execute()`` omits the corresponding type — the ``PipelineSpec``
        YAML must then declare only the types actually consumed, for
        consistency.

        Phase B5 (May 2026) — ``ALTO_XML`` was added to the maximal set to
        allow producing a native ALTO through ``image_to_alto_xml``
        (opt-in via ``expose_alto=True``).
        """
        expected = frozenset(
            (
                ArtifactType.RAW_TEXT,
                ArtifactType.CONFIDENCES,
                ArtifactType.ALTO_XML,
            ),
        )
        assert TesseractAdapter.output_types == expected

    def test_execution_mode_is_cpu(self) -> None:
        """Tesseract is CPU-bound — the runner uses a ProcessPool for it."""
        mode = TesseractAdapter.execution_mode
        assert mode == "cpu"
# ──────────────────────────────────────────────────────────────────────
# execute() — validation des inputs
# ──────────────────────────────────────────────────────────────────────
class TestTesseractAdapterInputValidation:
    """``execute`` refuses missing or unusable IMAGE inputs."""

    def test_missing_image_input_raises(self, tmp_path: Path) -> None:
        """An ``inputs`` mapping without IMAGE raises ``OCRAdapterError``."""
        adapter = TesseractAdapter()
        context = _make_context()
        with pytest.raises(OCRAdapterError, match="IMAGE manquant"):
            adapter.execute(inputs={}, params={}, context=context)

    def test_image_artifact_without_uri_raises(self) -> None:
        """An IMAGE artifact whose ``uri`` is ``None`` is refused."""
        adapter = TesseractAdapter()
        # The URI is passed explicitly as None: that is the case under test.
        uri_less = Artifact(
            id="d1:img", document_id="d1", type=ArtifactType.IMAGE, uri=None,
        )
        inputs = {ArtifactType.IMAGE: uri_less}
        context = _make_context()
        with pytest.raises(OCRAdapterError, match="sans URI"):
            adapter.execute(inputs=inputs, params={}, context=context)

    def test_image_path_does_not_exist_raises(self) -> None:
        """An IMAGE URI pointing at a missing file is refused."""
        adapter = TesseractAdapter()
        inputs = {
            ArtifactType.IMAGE: _make_image_artifact("/nonexistent/path/img.png"),
        }
        context = _make_context()
        with pytest.raises(OCRAdapterError, match="introuvable"):
            adapter.execute(inputs=inputs, params={}, context=context)
# ──────────────────────────────────────────────────────────────────────
# execute() — chemin nominal et erreurs Tesseract
# ──────────────────────────────────────────────────────────────────────
class TestTesseractAdapterExecute:
    """``execute`` nominal path and Tesseract failure modes.

    ``pytesseract.image_to_string`` and ``PIL.Image.open`` are mocked in
    every test that reaches the OCR call, so no Tesseract binary is needed.
    """

    def _create_dummy_image(self, tmp_path: Path) -> Path:
        """Write ``page.png`` under ``tmp_path`` and return its path.

        The file holds only the PNG signature: the tests mock pytesseract,
        so its content is never analysed.
        """
        path = tmp_path / "page.png"
        path.write_bytes(b"\x89PNG\r\n\x1a\n")  # bare PNG signature
        return path

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_nominal_execution(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """Nominal case: pytesseract returns text → a RAW_TEXT artifact
        whose URI points at the produced file."""
        mock_image_to_string.return_value = "Bonjour le monde\n"
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        adapter = TesseractAdapter()
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        result = adapter.execute(
            inputs={ArtifactType.IMAGE: artifact},
            params={},
            context=_make_context(),
        )
        assert ArtifactType.RAW_TEXT in result
        produced = result[ArtifactType.RAW_TEXT]
        assert produced.type == ArtifactType.RAW_TEXT
        assert produced.uri is not None
        # The output file exists and holds the stripped text.
        out_path = Path(produced.uri)
        assert out_path.exists()
        assert out_path.read_text(encoding="utf-8") == "Bonjour le monde"
        # Convention: <stem>.<name>.txt next to the image.
        assert out_path.name == "page.tesseract.txt"
        assert out_path.parent == tmp_path

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_custom_name_changes_output_filename(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """The adapter name is the middle component of the output filename."""
        mock_image_to_string.return_value = "x"
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        adapter = TesseractAdapter(name="tess_lat_psm6")
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        result = adapter.execute(
            inputs={ArtifactType.IMAGE: artifact},
            params={},
            context=_make_context(),
        )
        out_path = Path(result[ArtifactType.RAW_TEXT].uri)
        assert out_path.name == "page.tess_lat_psm6.txt"

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_lang_psm_oem_passed_to_pytesseract(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """``lang``, ``psm`` and ``oem`` reach ``image_to_string``."""
        mock_image_to_string.return_value = "x"
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        adapter = TesseractAdapter(lang="lat", psm=4, oem=1)
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        adapter.execute(
            inputs={ArtifactType.IMAGE: artifact},
            params={},
            context=_make_context(),
        )
        # Inspect the keyword arguments of the pytesseract.image_to_string call.
        assert mock_image_to_string.called
        kwargs = mock_image_to_string.call_args.kwargs
        assert kwargs["lang"] == "lat"
        assert "--psm 4" in kwargs["config"]
        assert "--oem 1" in kwargs["config"]

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_tesseract_cmd_applied_when_set(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """A configured ``tesseract_cmd`` is written to
        ``pytesseract.pytesseract.tesseract_cmd`` by ``execute``."""
        mock_image_to_string.return_value = "x"
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        # Local import to get hold of the module whose setting is checked.
        import pytesseract  # type: ignore[import-untyped]

        adapter = TesseractAdapter(tesseract_cmd="/custom/bin/tesseract")
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        # ``tesseract_cmd`` is a module-level setting shared by the whole
        # process: put the previous value back so the custom path cannot
        # leak into tests that run afterwards.
        previous_cmd = pytesseract.pytesseract.tesseract_cmd
        try:
            adapter.execute(
                inputs={ArtifactType.IMAGE: artifact},
                params={},
                context=_make_context(),
            )
            assert (
                pytesseract.pytesseract.tesseract_cmd == "/custom/bin/tesseract"
            )
        finally:
            pytesseract.pytesseract.tesseract_cmd = previous_cmd

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_tesseract_exception_wrapped_in_ocr_error(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """An exception raised by pytesseract surfaces as ``OCRAdapterError``
        whose message names the original type and text."""
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        mock_image_to_string.side_effect = RuntimeError("Tesseract crashed")
        adapter = TesseractAdapter()
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        with pytest.raises(
            OCRAdapterError, match="RuntimeError.*Tesseract crashed",
        ):
            adapter.execute(
                inputs={ArtifactType.IMAGE: artifact},
                params={},
                context=_make_context(),
            )

    def test_pytesseract_not_installed_raises_clean_error(
        self,
        tmp_path: Path,
    ) -> None:
        """If pytesseract cannot be imported, the error is clear and
        suggests a pip command."""
        adapter = TesseractAdapter()
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        # A ``None`` entry in ``sys.modules`` makes ``import pytesseract``
        # raise ImportError, simulating an absent package.
        with patch.dict(sys.modules, {"pytesseract": None}):
            with pytest.raises(
                OCRAdapterError, match="pytesseract.*pip install",
            ):
                adapter.execute(
                    inputs={ArtifactType.IMAGE: artifact},
                    params={},
                    context=_make_context(),
                )

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_artifact_id_uses_adapter_name(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """The produced artifact id embeds the document id and adapter name."""
        mock_image_to_string.return_value = "x"
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        adapter = TesseractAdapter(name="custom_name")
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        result = adapter.execute(
            inputs={ArtifactType.IMAGE: artifact},
            params={},
            context=_make_context(),
        )
        produced = result[ArtifactType.RAW_TEXT]
        assert produced.id == "d1:custom_name:raw_text"
        assert produced.document_id == "d1"
        assert produced.produced_by_step == "ocr"

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    def test_text_is_stripped(
        self,
        mock_image_to_string: MagicMock,
        mock_image_open: MagicMock,
        tmp_path: Path,
    ) -> None:
        """Leading and trailing whitespace is stripped from the text, as in
        the legacy engine."""
        mock_image_to_string.return_value = " \n\nHello world\n\n "
        mock_image_open.return_value.__enter__.return_value = MagicMock()
        adapter = TesseractAdapter()
        image_path = self._create_dummy_image(tmp_path)
        artifact = _make_image_artifact(str(image_path))
        result = adapter.execute(
            inputs={ArtifactType.IMAGE: artifact},
            params={},
            context=_make_context(),
        )
        out_text = Path(result[ArtifactType.RAW_TEXT].uri).read_text(
            encoding="utf-8",
        )
        assert out_text == "Hello world"