File size: 12,136 Bytes
cb3fbeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""Tests Sprint 31 — couverture dédiée des moteurs OCR cloud.

Avant Sprint 31, ``picarones/engines/{mistral_ocr,google_vision,
azure_doc_intel}.py`` n'étaient testés que via les fixtures du runner —
ce qui signifiait qu'on ne déclenchait jamais leurs branches d'erreur
(clé manquante, endpoint manquant, HTTP 4xx/5xx, format de réponse
inattendu). Ce fichier mocke ``urllib.request.urlopen`` pour les trois
moteurs et vérifie :

- la **création** réussie sans clef API ne plante pas (clés sont lues
  paresseusement dans ``_run_ocr``) ;
- l'**absence de clé** lève ``RuntimeError`` avec un message qui
  pointe vers la bonne variable d'environnement ;
- le **happy path REST** retourne le texte attendu d'une réponse JSON
  fictive ;
- les **erreurs HTTP** sont remontées en ``RuntimeError`` lisibles ;
- les **propriétés** ``name``, ``version`` et ``execution_mode``
  sont déclarées correctement (Sprint 31 — moteurs cloud doivent
  hériter de ``execution_mode='io'`` du parent).
"""

from __future__ import annotations

import io
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
from urllib.error import HTTPError

import pytest


# ---------------------------------------------------------------------------
# Fixture utilitaire — image PNG minimale
# ---------------------------------------------------------------------------

@pytest.fixture
def fake_image(tmp_path: Path) -> Path:
    """Write a small 10x10 uniform-colour PNG under ``tmp_path`` and return its path.

    Pillow is imported lazily, inside the fixture body, so it is only
    required by tests that actually request this fixture.
    """
    from PIL import Image

    target = tmp_path / "test.png"
    picture = Image.new("RGB", (10, 10), color=(120, 120, 120))
    picture.save(target, format="PNG")
    return target


def _mock_urlopen_response(json_body: dict, headers: dict | None = None) -> MagicMock:
    """Build a fake ``urlopen`` context manager serving ``json_body``.

    Args:
        json_body: Payload serialised with ``json.dumps`` and UTF-8 encoded;
            the bytes are what the fake response's ``read()`` returns.
        headers: Value exposed as the response's ``headers`` attribute.
            ``None`` (or any falsy value) is replaced by an empty dict.

    Returns:
        A ``MagicMock`` whose ``__enter__`` yields the fake response and
        whose ``__exit__`` returns ``False`` so exceptions are not swallowed.
    """
    response = MagicMock()
    response.read.return_value = json.dumps(json_body).encode("utf-8")
    response.headers = headers if headers else {}

    context = MagicMock()
    context.__enter__.return_value = response
    context.__exit__.return_value = False
    return context


# ---------------------------------------------------------------------------
# 1. MistralOCREngine
# ---------------------------------------------------------------------------

class TestMistralOCREngine:
    """Tests for ``MistralOCREngine`` with ``urllib.request.urlopen`` patched."""

    def test_class_metadata(self, monkeypatch):
        """The engine builds without an API key and exposes its metadata."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        assert engine.name == "mistral_ocr"
        # version() must return a truthy value (a non-empty string is expected).
        assert engine.version()
        # Sprint 24/31 — execution_mode is expected to be 'io'
        # (per the original note, inherited from the parent class).
        assert engine.execution_mode == "io"

    def test_missing_api_key_raises(self, monkeypatch, fake_image):
        """Without ``MISTRAL_API_KEY``, ``_run_ocr`` raises and names the variable."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
            engine._run_ocr(fake_image)

    def test_native_ocr_endpoint_parses_pages(self, monkeypatch, fake_image):
        """With ``mistral-ocr-latest``, the text of every returned page is kept.

        Per the original note, this model is routed to ``/v1/ocr``; the test
        itself only checks the parsed output.
        """
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

        pages = [
            {"markdown": "Page 1 — Lorem ipsum"},
            {"markdown": "Page 2 — dolor sit amet"},
        ]
        fake_response = _mock_urlopen_response({"pages": pages})
        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert "Page 1" in extracted
        assert "Page 2" in extracted
        # Pages are expected to be joined by a blank line.
        assert "\n\n" in extracted

    def test_native_endpoint_handles_empty_pages(self, monkeypatch, fake_image):
        """An empty ``pages`` list yields an empty string."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

        fake_response = _mock_urlopen_response({"pages": []})
        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == ""


# ---------------------------------------------------------------------------
# 2. GoogleVisionEngine
# ---------------------------------------------------------------------------

class TestGoogleVisionEngine:
    """Tests for ``GoogleVisionEngine`` with ``urllib.request.urlopen`` patched."""

    def test_class_metadata(self, monkeypatch):
        """The engine builds without credentials and exposes its metadata."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        engine = GoogleVisionEngine()

        assert engine.name == "google_vision"
        assert engine.version() == "v1"
        assert engine.execution_mode == "io"

    def test_missing_credentials_raises(self, monkeypatch, fake_image):
        """With neither credential variable set, ``_run_ocr`` raises."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        engine = GoogleVisionEngine()

        with pytest.raises(RuntimeError, match="(?i)Authentification"):
            engine._run_ocr(fake_image)

    def test_rest_happy_path_extracts_text(self, monkeypatch, fake_image):
        """The ``fullTextAnnotation.text`` of the first response is returned."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        engine = GoogleVisionEngine()

        annotation = {"fullTextAnnotation": {"text": "Texte reconstitué de Gallica"}}
        fake_response = _mock_urlopen_response({"responses": [annotation]})
        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == "Texte reconstitué de Gallica"

    def test_rest_response_with_error_field_raises(self, monkeypatch, fake_image):
        """An ``error`` entry in the JSON body surfaces as ``RuntimeError``."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        engine = GoogleVisionEngine()

        failure = {"error": {"message": "Quota exhausted"}}
        fake_response = _mock_urlopen_response({"responses": [failure]})
        urlopen_patch = patch("urllib.request.urlopen", return_value=fake_response)
        with urlopen_patch, pytest.raises(RuntimeError, match="(?i)Quota"):
            engine._run_ocr(fake_image)

    def test_http_error_remontes_lisible(self, monkeypatch, fake_image):
        """An ``HTTPError`` from ``urlopen`` becomes a ``RuntimeError`` citing the code."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        engine = GoogleVisionEngine()

        error_payload = io.BytesIO(b'{"error": "bad image"}')
        http_failure = HTTPError(
            url="https://vision.googleapis.com/v1/images:annotate",
            code=400,
            msg="Bad Request",
            hdrs=None,  # type: ignore[arg-type]
            fp=error_payload,
        )
        urlopen_patch = patch("urllib.request.urlopen", side_effect=http_failure)
        with urlopen_patch, pytest.raises(RuntimeError, match="(?i)400"):
            engine._run_ocr(fake_image)

    def test_text_detection_extracts_first_annotation(self, monkeypatch, fake_image):
        """In ``TEXT_DETECTION`` mode only the first annotation's text is returned."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        engine = GoogleVisionEngine(config={"feature_type": "TEXT_DETECTION"})

        annotations = [
            {"description": "Premier annot"},
            {"description": "Second annot"},
        ]
        fake_response = _mock_urlopen_response(
            {"responses": [{"textAnnotations": annotations}]}
        )
        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == "Premier annot"


# ---------------------------------------------------------------------------
# 3. AzureDocIntelEngine
# ---------------------------------------------------------------------------

class TestAzureDocIntelEngine:
    """Tests for ``AzureDocIntelEngine``: metadata, configuration errors, text extraction."""

    def test_class_metadata(self, monkeypatch):
        """The engine builds without key or endpoint and exposes its metadata."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        engine = AzureDocIntelEngine()

        assert engine.name == "azure_doc_intel"
        # version() must be truthy (a non-empty date string, per the original note).
        assert engine.version()
        assert engine.execution_mode == "io"

    def test_missing_key_raises(self, monkeypatch, fake_image):
        """With the endpoint set but no key, ``_run_ocr`` names the missing key variable."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.setenv("AZURE_DOC_INTEL_ENDPOINT", "https://x.cognitiveservices.azure.com")
        engine = AzureDocIntelEngine()

        with pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_KEY"):
            engine._run_ocr(fake_image)

    def test_missing_endpoint_raises(self, monkeypatch, fake_image):
        """With the key set but no endpoint, ``_run_ocr`` names the missing endpoint variable."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.setenv("AZURE_DOC_INTEL_KEY", "k")
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        engine = AzureDocIntelEngine()

        with pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_ENDPOINT"):
            engine._run_ocr(fake_image)

    def test_extract_text_pure_function(self):
        """Non-empty line contents from every page appear in the extracted text."""
        # Called on the class directly: needs neither network nor mocks.
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        first_page = {"lines": [
            {"content": "Première ligne"},
            {"content": "Deuxième ligne"},
            {"content": ""},  # expected to be skipped (not asserted here)
        ]}
        second_page = {"lines": [{"content": "Page 2 — texte"}]}
        analysis = {"analyzeResult": {"pages": [first_page, second_page]}}

        extracted = AzureDocIntelEngine._extract_text_from_result(analysis)

        assert "Première ligne" in extracted
        assert "Deuxième ligne" in extracted
        assert "Page 2 — texte" in extracted

    def test_extract_text_handles_empty_result(self):
        """An empty dict and an empty ``pages`` list both yield an empty string."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        no_payload = AzureDocIntelEngine._extract_text_from_result({})
        assert no_payload == ""

        no_pages = AzureDocIntelEngine._extract_text_from_result(
            {"analyzeResult": {"pages": []}}
        )
        assert no_pages == ""


# ---------------------------------------------------------------------------
# 4. Cohérence inter-moteurs cloud — Sprint 24/31
# ---------------------------------------------------------------------------

class TestCloudEngineExecutionMode:
    """Guard the invariant that every cloud engine reports ``execution_mode == 'io'``.

    Sprint 24 documents the cloud engines as IO-mode. Per the original
    note, if a later change switched one of them to 'cpu', the runner
    would stop scheduling it in the ThreadPool — a silent performance
    regression.
    """

    def test_all_cloud_engines_are_io_bound(self, monkeypatch):
        """Instantiate each cloud engine with a clean environment and check its mode."""
        # Clear credential variables first so that no engine attempts to
        # initialise a cloud client.
        credential_vars = (
            "MISTRAL_API_KEY",
            "GOOGLE_API_KEY",
            "GOOGLE_APPLICATION_CREDENTIALS",
            "AZURE_DOC_INTEL_KEY",
            "AZURE_DOC_INTEL_ENDPOINT",
        )
        for var_name in credential_vars:
            monkeypatch.delenv(var_name, raising=False)

        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        from picarones.engines.google_vision import GoogleVisionEngine
        from picarones.engines.mistral_ocr import MistralOCREngine

        cloud_engines = (MistralOCREngine, GoogleVisionEngine, AzureDocIntelEngine)
        for cls in cloud_engines:
            mode = cls().execution_mode
            assert mode == "io", (
                f"{cls.__name__} doit rester IO-bound (utilisé en ThreadPool)"
            )