"""Phase B5 — production native ALTO XML par ``TesseractAdapter``. Tesseract sait nativement produire un ALTO 4 via ``pytesseract.image_to_alto_xml``. Ce test vérifie que : 1. Le flag ``expose_alto`` (off par défaut, compat ascendante) ajoute un ``Artifact ALTO_XML`` à la sortie d'``execute()``. 2. La sortie est validée structurellement (XML bien formé) avant d'être promue en artefact. 3. Les défaillances (Tesseract qui plante, sortie vide, XML mal formé) sont absorbées en warning sans casser l'OCR ``RAW_TEXT``. 4. Un test ``@pytest.mark.live`` invoque le vrai binaire ``tesseract`` et vérifie que l'ALTO produit est valide. """ from __future__ import annotations from pathlib import Path from unittest.mock import MagicMock, patch import pytest from picarones.adapters.ocr import TesseractAdapter from picarones.domain.artifacts import Artifact, ArtifactType from picarones.pipeline.types import RunContext # ────────────────────────────────────────────────────────────────────── # Helpers # ────────────────────────────────────────────────────────────────────── _PNG_HEADER = b"\x89PNG\r\n\x1a\n" _ALTO_VALID = """ """ def _make_image_artifact(uri: str) -> Artifact: return Artifact( id="d1:initial:image", document_id="d1", type=ArtifactType.IMAGE, uri=uri, ) def _make_context() -> RunContext: return RunContext( document_id="d1", code_version="1.0.0", pipeline_name="test", ) def _create_dummy_image(tmp_path: Path) -> Path: path = tmp_path / "page.png" path.write_bytes(_PNG_HEADER) return path # ────────────────────────────────────────────────────────────────────── # Constructeur # ────────────────────────────────────────────────────────────────────── class TestExposeAltoFlag: def test_default_off(self) -> None: """Compat ascendante : ``expose_alto`` est désactivé par défaut. Les pipelines existants qui consomment ``RAW_TEXT`` / ``CONFIDENCES`` ne reçoivent aucun nouvel artefact non sollicité. """ adapter = TesseractAdapter() assert adapter.expose_alto is False def test_can_be_enabled(self) -> None: adapter = TesseractAdapter(expose_alto=True) assert adapter.expose_alto is True def test_alto_xml_in_class_output_types(self) -> None: """Phase B5 — ``ALTO_XML`` est dans le set maximal de l'adapter (le YAML ``output_types`` du step décide quels types l'aval consomme). """ assert ArtifactType.ALTO_XML in TesseractAdapter.output_types def test_default_output_still_includes_raw_text(self) -> None: """Pas de régression : ``RAW_TEXT`` et ``CONFIDENCES`` restent dans le set maximal.""" assert ArtifactType.RAW_TEXT in TesseractAdapter.output_types assert ArtifactType.CONFIDENCES in TesseractAdapter.output_types # ────────────────────────────────────────────────────────────────────── # execute() — pas de production ALTO si expose_alto=False # ────────────────────────────────────────────────────────────────────── class TestExecuteNoAlto: @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_function_not_called_by_default( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """Sans ``expose_alto``, ``pytesseract.image_to_alto_xml`` n'est jamais invoqué — pas de coût Tesseract additionnel.""" mock_image_to_string.return_value = "Bonjour le monde" mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=False, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) # ALTO absent du résultat. assert ArtifactType.ALTO_XML not in result # ``image_to_alto_xml`` jamais invoqué. mock_image_to_alto.assert_not_called() # ────────────────────────────────────────────────────────────────────── # execute() — production ALTO quand expose_alto=True # ────────────────────────────────────────────────────────────────────── class TestExecuteAltoEnabled: @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_artifact_produced( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """Avec ``expose_alto=True``, un ``Artifact ALTO_XML`` est produit en plus du ``RAW_TEXT``.""" mock_image_to_string.return_value = "Bonjour monde" mock_image_to_alto.return_value = _ALTO_VALID.encode("utf-8") mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) assert ArtifactType.ALTO_XML in result alto_artifact = result[ArtifactType.ALTO_XML] assert alto_artifact.type == ArtifactType.ALTO_XML assert alto_artifact.uri is not None # Le fichier ALTO existe et contient l'XML retourné par Tesseract. alto_path = Path(alto_artifact.uri) assert alto_path.exists() assert alto_path.suffix == ".xml" assert "alto" in alto_path.name.lower() assert "Bonjour" in alto_path.read_text(encoding="utf-8") @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_called_with_correct_lang_and_config( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """``image_to_alto_xml`` reçoit les mêmes ``lang``/``config`` que ``image_to_string`` — cohérence des paramètres OCR.""" mock_image_to_string.return_value = "x" mock_image_to_alto.return_value = _ALTO_VALID.encode("utf-8") mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( lang="lat", psm=4, oem=1, expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) # Vérification que image_to_alto_xml a été invoqué avec # la bonne langue et la bonne config. assert mock_image_to_alto.call_count == 1 kwargs = mock_image_to_alto.call_args.kwargs assert kwargs["lang"] == "lat" assert kwargs["config"] == "--oem 1 --psm 4" @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_failure_does_not_break_raw_text( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """Si ``image_to_alto_xml`` lève une exception, l'OCR ``RAW_TEXT`` reste valide — l'ALTO est juste sauté avec un warning loggé. """ mock_image_to_string.return_value = "Bonjour" mock_image_to_alto.side_effect = RuntimeError("Tesseract ALTO crash") mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) # RAW_TEXT toujours présent. assert ArtifactType.RAW_TEXT in result # ALTO absent (best-effort skip). assert ArtifactType.ALTO_XML not in result @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_empty_output_skipped( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """Un ALTO vide ou que des espaces n'est pas promu en artefact.""" mock_image_to_string.return_value = "x" mock_image_to_alto.return_value = b"" mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) assert ArtifactType.ALTO_XML not in result @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_malformed_xml_skipped( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """Un ALTO mal formé (balise non fermée, etc.) n'est pas promu en artefact — la validation ``safe_parse_xml`` rejette.""" mock_image_to_string.return_value = "x" # XML invalide : pas de balise root fermante. mock_image_to_alto.return_value = b"" mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) assert ArtifactType.ALTO_XML not in result @patch("PIL.Image.open") @patch("pytesseract.image_to_string") @patch("pytesseract.image_to_alto_xml") def test_alto_string_output_normalized( self, mock_image_to_alto: MagicMock, mock_image_to_string: MagicMock, mock_image_open: MagicMock, tmp_path: Path, ) -> None: """``pytesseract.image_to_alto_xml`` peut retourner un ``str`` au lieu de ``bytes`` selon la version — l'adapter doit gérer les deux types.""" mock_image_to_string.return_value = "x" mock_image_to_alto.return_value = _ALTO_VALID # str, pas bytes mock_image_open.return_value.__enter__.return_value = MagicMock() adapter = TesseractAdapter( expose_alto=True, expose_confidences=False, ) image_path = _create_dummy_image(tmp_path) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(image_path))}, params={}, context=_make_context(), ) assert ArtifactType.ALTO_XML in result # ────────────────────────────────────────────────────────────────────── # Test live — vraie exécution Tesseract # ────────────────────────────────────────────────────────────────────── @pytest.mark.live class TestExecuteAltoLive: """Tests qui invoquent le vrai binaire ``tesseract``. Activés uniquement avec ``pytest -m live``. Skipped sans le binaire (vérifié au fixture). """ @pytest.fixture def real_image(self, tmp_path: Path) -> Path: """Crée une image PNG avec du texte rendu via Pillow. Tesseract devrait être capable de transcrire ce texte. """ from PIL import Image, ImageDraw img = Image.new("RGB", (300, 80), color=(255, 255, 255)) d = ImageDraw.Draw(img) d.text((10, 30), "Bonjour", fill=(0, 0, 0)) path = tmp_path / "live_page.png" img.save(path) return path def test_real_tesseract_produces_valid_alto( self, real_image: Path, tmp_path: Path, ) -> None: """Vrai Tesseract → ALTO XML structurellement valide.""" from picarones.formats.alto.parser import parse_alto adapter = TesseractAdapter( lang="eng", psm=7, expose_alto=True, expose_confidences=False, ) result = adapter.execute( inputs={ArtifactType.IMAGE: _make_image_artifact(str(real_image))}, params={}, context=_make_context(), ) assert ArtifactType.ALTO_XML in result, ( "Tesseract n'a pas produit d'ALTO — vérifier l'installation " "tesseract + pytesseract." ) alto_path = Path(result[ArtifactType.ALTO_XML].uri) assert alto_path.exists() # Le parser ALTO de Picarones doit accepter la sortie Tesseract. parsed = parse_alto(alto_path.read_text(encoding="utf-8")) assert parsed is not None