File size: 15,551 Bytes
1c1ad9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""Phase B5 β€” production native ALTO XML par ``TesseractAdapter``.

Tesseract sait nativement produire un ALTO 4 via
``pytesseract.image_to_alto_xml``.  Ce test vΓ©rifie que :

1. Le flag ``expose_alto`` (off par dΓ©faut, compat ascendante) ajoute
   un ``Artifact ALTO_XML`` Γ  la sortie d'``execute()``.
2. La sortie est validΓ©e structurellement (XML bien formΓ©) avant
   d'Γͺtre promue en artefact.
3. Les dΓ©faillances (Tesseract qui plante, sortie vide, XML mal
   formΓ©) sont absorbΓ©es en warning sans casser l'OCR ``RAW_TEXT``.
4. Un test ``@pytest.mark.live`` invoque le vrai binaire
   ``tesseract`` et vΓ©rifie que l'ALTO produit est valide.
"""

from __future__ import annotations

from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from picarones.adapters.ocr import TesseractAdapter
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.pipeline.types import RunContext


# ──────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────


# Eight-byte PNG file signature.  ``_create_dummy_image`` writes nothing
# else, so the resulting file is only a placeholder on disk.
_PNG_HEADER = b"\x89PNG\r\n\x1a\n"


# Small well-formed ALTO v4 document (one page, one text line, two words:
# "Bonjour" and "monde") used as the mocked return value of
# ``pytesseract.image_to_alto_xml``.
_ALTO_VALID = """<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">
  <Layout>
    <Page ID="page_1" PHYSICAL_IMG_NR="1" WIDTH="1000" HEIGHT="1500">
      <PrintSpace ID="ps_1">
        <TextBlock ID="block_1">
          <TextLine ID="line_1">
            <String ID="word_1" CONTENT="Bonjour"
                    HPOS="100" VPOS="100" WIDTH="80" HEIGHT="20"/>
            <String ID="word_2" CONTENT="monde"
                    HPOS="200" VPOS="100" WIDTH="60" HEIGHT="20"/>
          </TextLine>
        </TextBlock>
      </PrintSpace>
    </Page>
  </Layout>
</alto>
"""


def _make_image_artifact(uri: str) -> Artifact:
    """Return the ``IMAGE`` artifact of document ``d1`` located at *uri*."""
    artifact_fields = {
        "id": "d1:initial:image",
        "document_id": "d1",
        "type": ArtifactType.IMAGE,
        "uri": uri,
    }
    return Artifact(**artifact_fields)


def _make_context() -> RunContext:
    """Return the fixed ``RunContext`` passed to ``execute()`` in the tests."""
    context_fields = dict(
        document_id="d1",
        code_version="1.0.0",
        pipeline_name="test",
    )
    return RunContext(**context_fields)


def _create_dummy_image(tmp_path: Path) -> Path:
    """Write ``page.png`` (PNG signature only) under *tmp_path*.

    Returns the path of the created file.
    """
    target = tmp_path.joinpath("page.png")
    with target.open("wb") as handle:
        handle.write(_PNG_HEADER)
    return target


# ──────────────────────────────────────────────────────────────────────
# Constructor
# ──────────────────────────────────────────────────────────────────────


class TestExposeAltoFlag:
    """Constructor flag ``expose_alto`` and class-level ``output_types``."""

    def test_default_off(self) -> None:
        """Backward compatibility: ``expose_alto`` is disabled by default.

        Existing pipelines that consume ``RAW_TEXT`` / ``CONFIDENCES``
        receive no new, unrequested artifact.
        """
        default_adapter = TesseractAdapter()
        flag = default_adapter.expose_alto
        assert flag is False

    def test_can_be_enabled(self) -> None:
        """Passing ``expose_alto=True`` switches the flag on."""
        assert TesseractAdapter(expose_alto=True).expose_alto is True

    def test_alto_xml_in_class_output_types(self) -> None:
        """Phase B5 — ``ALTO_XML`` belongs to the adapter's maximal set
        (the step's YAML ``output_types`` decides which types are
        consumed downstream).
        """
        declared_types = TesseractAdapter.output_types
        assert ArtifactType.ALTO_XML in declared_types

    def test_default_output_still_includes_raw_text(self) -> None:
        """No regression: ``RAW_TEXT`` and ``CONFIDENCES`` remain in the
        maximal set."""
        declared_types = TesseractAdapter.output_types
        for legacy_type in (ArtifactType.RAW_TEXT, ArtifactType.CONFIDENCES):
            assert legacy_type in declared_types


# ──────────────────────────────────────────────────────────────────────
# execute() — no ALTO production when expose_alto=False
# ──────────────────────────────────────────────────────────────────────


class TestExecuteNoAlto:
    """``execute()`` behaviour when ``expose_alto`` is disabled."""

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_function_not_called_by_default(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """Without ``expose_alto``, ``pytesseract.image_to_alto_xml`` is
        never invoked — no additional Tesseract cost."""
        to_string_mock.return_value = "Bonjour le monde"
        # ``Image.open`` is patched, so the placeholder file is never decoded.
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        adapter = TesseractAdapter(
            expose_alto=False, expose_confidences=False,
        )
        page = _create_dummy_image(tmp_path)
        source_image = _make_image_artifact(str(page))

        outputs = adapter.execute(
            inputs={ArtifactType.IMAGE: source_image},
            params={},
            context=_make_context(),
        )

        # No ALTO artifact in the result...
        assert ArtifactType.ALTO_XML not in outputs
        # ...and the ALTO entry point of pytesseract was left untouched.
        alto_xml_mock.assert_not_called()


# ──────────────────────────────────────────────────────────────────────
# execute() — ALTO production when expose_alto=True
# ──────────────────────────────────────────────────────────────────────


class TestExecuteAltoEnabled:
    """``execute()`` behaviour when ``expose_alto`` is enabled.

    Every test patches ``PIL.Image.open`` and the two pytesseract entry
    points, so no real OCR is performed.
    """

    @staticmethod
    def _run_adapter(adapter: TesseractAdapter, tmp_path: Path):
        """Run *adapter* on a placeholder image created under *tmp_path*.

        Returns whatever ``adapter.execute()`` returns.
        """
        page = _create_dummy_image(tmp_path)
        return adapter.execute(
            inputs={ArtifactType.IMAGE: _make_image_artifact(str(page))},
            params={},
            context=_make_context(),
        )

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_artifact_produced(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """With ``expose_alto=True``, an ``ALTO_XML`` ``Artifact`` is
        produced in addition to ``RAW_TEXT``."""
        to_string_mock.return_value = "Bonjour monde"
        alto_xml_mock.return_value = _ALTO_VALID.encode("utf-8")
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        outputs = self._run_adapter(
            TesseractAdapter(expose_alto=True, expose_confidences=False),
            tmp_path,
        )

        assert ArtifactType.ALTO_XML in outputs
        alto = outputs[ArtifactType.ALTO_XML]
        assert alto.type == ArtifactType.ALTO_XML
        assert alto.uri is not None
        # The ALTO file exists and holds the XML returned by Tesseract.
        written = Path(alto.uri)
        assert written.exists()
        assert written.suffix == ".xml"
        assert "alto" in written.name.lower()
        assert "Bonjour" in written.read_text(encoding="utf-8")

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_called_with_correct_lang_and_config(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """``image_to_alto_xml`` receives the same ``lang``/``config``
        as ``image_to_string`` — consistent OCR parameters."""
        to_string_mock.return_value = "x"
        alto_xml_mock.return_value = _ALTO_VALID.encode("utf-8")
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        self._run_adapter(
            TesseractAdapter(
                lang="lat", psm=4, oem=1,
                expose_alto=True, expose_confidences=False,
            ),
            tmp_path,
        )

        # Exactly one ALTO call, carrying the adapter's language and the
        # config string built from ``oem`` / ``psm``.
        assert alto_xml_mock.call_count == 1
        call_kwargs = alto_xml_mock.call_args.kwargs
        assert call_kwargs["lang"] == "lat"
        assert call_kwargs["config"] == "--oem 1 --psm 4"

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_failure_does_not_break_raw_text(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """If ``image_to_alto_xml`` raises, the ``RAW_TEXT`` OCR stays
        valid — the ALTO is simply skipped (with a logged warning,
        according to the adapter's contract).
        """
        to_string_mock.return_value = "Bonjour"
        alto_xml_mock.side_effect = RuntimeError("Tesseract ALTO crash")
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        outputs = self._run_adapter(
            TesseractAdapter(expose_alto=True, expose_confidences=False),
            tmp_path,
        )

        # RAW_TEXT is still there.
        assert ArtifactType.RAW_TEXT in outputs
        # ALTO is missing (best-effort skip).
        assert ArtifactType.ALTO_XML not in outputs

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_empty_output_skipped(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """An empty or whitespace-only ALTO is not promoted to an
        artifact (only the empty bytes payload is exercised here)."""
        to_string_mock.return_value = "x"
        alto_xml_mock.return_value = b""
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        outputs = self._run_adapter(
            TesseractAdapter(expose_alto=True, expose_confidences=False),
            tmp_path,
        )

        assert ArtifactType.ALTO_XML not in outputs

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_malformed_xml_skipped(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """A malformed ALTO (unclosed tag, etc.) is not promoted to an
        artifact — the ``safe_parse_xml`` validation is expected to
        reject it."""
        to_string_mock.return_value = "x"
        # Invalid XML: the ``<Page>`` element is never closed.
        alto_xml_mock.return_value = b"<alto><Page></alto>"
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        outputs = self._run_adapter(
            TesseractAdapter(expose_alto=True, expose_confidences=False),
            tmp_path,
        )

        assert ArtifactType.ALTO_XML not in outputs

    @patch("PIL.Image.open")
    @patch("pytesseract.image_to_string")
    @patch("pytesseract.image_to_alto_xml")
    def test_alto_string_output_normalized(
        self,
        alto_xml_mock: MagicMock,
        to_string_mock: MagicMock,
        image_open_mock: MagicMock,
        tmp_path: Path,
    ) -> None:
        """``pytesseract.image_to_alto_xml`` may return a ``str``
        instead of ``bytes`` depending on the version — the adapter must
        handle both types."""
        to_string_mock.return_value = "x"
        alto_xml_mock.return_value = _ALTO_VALID  # str, not bytes
        image_open_mock.return_value.__enter__.return_value = MagicMock()

        outputs = self._run_adapter(
            TesseractAdapter(expose_alto=True, expose_confidences=False),
            tmp_path,
        )

        assert ArtifactType.ALTO_XML in outputs


# ──────────────────────────────────────────────────────────────────────
# Live test — real Tesseract execution
# ──────────────────────────────────────────────────────────────────────


@pytest.mark.live
class TestExecuteAltoLive:
    """Tests that invoke the real ``tesseract`` binary.

    Meant to be selected with ``pytest -m live``.  Skipped when
    ``pytesseract`` or the binary is unavailable (checked by the autouse
    fixture ``_require_tesseract``).
    """

    @pytest.fixture(autouse=True)
    def _require_tesseract(self) -> None:
        """Skip the current test if Tesseract cannot be executed.

        ``pytesseract.get_tesseract_version`` is used rather than a plain
        ``PATH`` lookup so that a customised ``tesseract_cmd`` is honoured.
        """
        pytesseract = pytest.importorskip("pytesseract")
        try:
            pytesseract.get_tesseract_version()
        except pytesseract.TesseractNotFoundError:
            pytest.skip("tesseract binary not available")

    @pytest.fixture
    def real_image(self, tmp_path: Path) -> Path:
        """Create a PNG image with text rendered through Pillow.

        Tesseract should be able to transcribe this text.

        Returns:
            Path of the saved ``live_page.png`` under *tmp_path*.
        """
        from PIL import Image, ImageDraw

        # Black text on a white 300x80 canvas, default Pillow font.
        img = Image.new("RGB", (300, 80), color=(255, 255, 255))
        d = ImageDraw.Draw(img)
        d.text((10, 30), "Bonjour", fill=(0, 0, 0))
        path = tmp_path / "live_page.png"
        img.save(path)
        return path

    def test_real_tesseract_produces_valid_alto(
        self, real_image: Path, tmp_path: Path,
    ) -> None:
        """Real Tesseract → structurally valid ALTO XML."""
        from picarones.formats.alto.parser import parse_alto

        adapter = TesseractAdapter(
            lang="eng", psm=7,
            expose_alto=True, expose_confidences=False,
        )

        result = adapter.execute(
            inputs={ArtifactType.IMAGE: _make_image_artifact(str(real_image))},
            params={}, context=_make_context(),
        )

        assert ArtifactType.ALTO_XML in result, (
            "Tesseract n'a pas produit d'ALTO — vérifier l'installation "
            "tesseract + pytesseract."
        )
        alto_uri = result[ArtifactType.ALTO_XML].uri
        # Fail with a readable assertion instead of a ``TypeError`` from
        # ``Path(None)`` if the artifact carries no URI.
        assert alto_uri is not None
        alto_path = Path(alto_uri)
        assert alto_path.exists()
        # The Picarones ALTO parser must accept Tesseract's output.
        parsed = parse_alto(alto_path.read_text(encoding="utf-8"))
        assert parsed is not None