"""Tests Sprint 31 — couverture dédiée des moteurs OCR cloud.
Avant Sprint 31, ``picarones/engines/{mistral_ocr,google_vision,
azure_doc_intel}.py`` n'étaient testés que via les fixtures du runner —
ce qui signifiait qu'on ne déclenchait jamais leurs branches d'erreur
(clé manquante, endpoint manquant, HTTP 4xx/5xx, format de réponse
inattendu). Ce fichier mocke ``urllib.request.urlopen`` pour les trois
moteurs et vérifie :
- la **création** réussie sans clef API ne plante pas (clés sont lues
paresseusement dans ``_run_ocr``) ;
- l'**absence de clé** lève ``RuntimeError`` avec un message qui
pointe vers la bonne variable d'environnement ;
- le **happy path REST** retourne le texte attendu d'une réponse JSON
fictive ;
- les **erreurs HTTP** sont remontées en ``RuntimeError`` lisibles ;
- les **propriétés** ``name``, ``version`` et ``execution_mode``
sont déclarées correctement (Sprint 31 — moteurs cloud doivent
hériter de ``execution_mode='io'`` du parent).
"""
from __future__ import annotations
import io
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
from urllib.error import HTTPError
import pytest
# ---------------------------------------------------------------------------
# Fixture utilitaire — image PNG minimale
# ---------------------------------------------------------------------------
@pytest.fixture
def fake_image(tmp_path: Path) -> Path:
    """Write a minimal 10x10 grey PNG into ``tmp_path`` and return its path.

    Pillow is imported lazily inside the fixture, so it is only required
    by tests that actually request an image.
    """
    from PIL import Image

    image_path = tmp_path / "test.png"
    grey_square = Image.new("RGB", (10, 10), color=(120, 120, 120))
    grey_square.save(image_path, format="PNG")
    return image_path
def _mock_urlopen_response(json_body: dict, headers: dict | None = None) -> MagicMock:
    """Build a fake ``urlopen`` return value that serves ``json_body``.

    The returned mock behaves like the object ``urllib.request.urlopen``
    hands back: it can be used directly (``urlopen(req).read()``) or as a
    context manager (``with urlopen(req) as resp: resp.read()``). Both
    forms expose the same ``read()`` payload and ``headers`` mapping.

    Args:
        json_body: JSON-serialisable dict; ``read()`` returns it encoded
            as UTF-8 JSON bytes.
        headers: Optional mapping exposed as ``.headers``; an empty dict
            is used when omitted (or when an empty mapping is given).

    Returns:
        A ``MagicMock`` suitable for ``patch(..., return_value=...)``.
    """
    payload = json.dumps(json_body).encode("utf-8")

    response = MagicMock()
    response.read.return_value = payload
    response.headers = headers or {}
    # The response is its own context manager, so callers that skip the
    # ``with`` block still get real bytes from ``read()`` instead of a
    # bare MagicMock.
    response.__enter__.return_value = response
    # A falsy ``__exit__`` result lets exceptions raised inside the
    # ``with`` body propagate instead of being swallowed by the mock.
    response.__exit__.return_value = False
    return response
# ---------------------------------------------------------------------------
# 1. MistralOCREngine
# ---------------------------------------------------------------------------
class TestMistralOCREngine:
    """Tests for ``MistralOCREngine`` with ``urllib.request.urlopen`` patched."""

    def test_class_metadata(self, monkeypatch):
        """Name, version and execution mode are readable without an API key."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        assert engine.name == "mistral_ocr"
        assert engine.version()  # truthy: a non-empty string is expected
        # Sprint 24/31: execution_mode is expected to be the parent's 'io' value.
        assert engine.execution_mode == "io"

    def test_missing_api_key_raises(self, monkeypatch, fake_image):
        """Running OCR without ``MISTRAL_API_KEY`` raises a RuntimeError naming it."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        expected_failure = pytest.raises(RuntimeError, match="MISTRAL_API_KEY")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_native_ocr_endpoint_parses_pages(self, monkeypatch, fake_image):
        """With model ``mistral-ocr-latest`` the markdown of every page is returned.

        The original note states that this model is routed to ``/v1/ocr``;
        the requested URL itself is not asserted by this test.
        """
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

        first_page = {"markdown": "Page 1 — Lorem ipsum"}
        second_page = {"markdown": "Page 2 — dolor sit amet"}
        payload = {"pages": [first_page, second_page]}
        fake_urlopen = patch(
            "urllib.request.urlopen",
            return_value=_mock_urlopen_response(payload),
        )
        with fake_urlopen:
            ocr_text = engine._run_ocr(fake_image)

        # Both pages must be present; the "\n\n" check covers the
        # double-newline separator expected between pages.
        for fragment in ("Page 1", "Page 2", "\n\n"):
            assert fragment in ocr_text

    def test_native_endpoint_handles_empty_pages(self, monkeypatch, fake_image):
        """An empty ``pages`` list in the response yields an empty string."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

        fake_urlopen = patch(
            "urllib.request.urlopen",
            return_value=_mock_urlopen_response({"pages": []}),
        )
        with fake_urlopen:
            ocr_text = engine._run_ocr(fake_image)

        assert ocr_text == ""
# ---------------------------------------------------------------------------
# 2. GoogleVisionEngine
# ---------------------------------------------------------------------------
class TestGoogleVisionEngine:
    """Tests for ``GoogleVisionEngine`` with ``urllib.request.urlopen`` patched."""

    @staticmethod
    def _engine_with_api_key(monkeypatch, config=None):
        """Return an engine built with only ``GOOGLE_API_KEY`` in the environment.

        ``GOOGLE_APPLICATION_CREDENTIALS`` is removed first. The engine is
        created without arguments unless ``config`` is given.
        """
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        if config is None:
            return GoogleVisionEngine()
        return GoogleVisionEngine(config=config)

    def test_class_metadata(self, monkeypatch):
        """Name, version and execution mode are readable without credentials."""
        from picarones.engines.google_vision import GoogleVisionEngine

        for variable in ("GOOGLE_API_KEY", "GOOGLE_APPLICATION_CREDENTIALS"):
            monkeypatch.delenv(variable, raising=False)
        engine = GoogleVisionEngine()

        assert engine.name == "google_vision"
        assert engine.version() == "v1"
        assert engine.execution_mode == "io"

    def test_missing_credentials_raises(self, monkeypatch, fake_image):
        """With neither credential variable set, running OCR raises RuntimeError."""
        from picarones.engines.google_vision import GoogleVisionEngine

        for variable in ("GOOGLE_API_KEY", "GOOGLE_APPLICATION_CREDENTIALS"):
            monkeypatch.delenv(variable, raising=False)
        engine = GoogleVisionEngine()

        expected_failure = pytest.raises(RuntimeError, match="(?i)Authentification")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_rest_happy_path_extracts_text(self, monkeypatch, fake_image):
        """The ``fullTextAnnotation.text`` of the first response is returned as is."""
        engine = self._engine_with_api_key(monkeypatch)

        annotation = {"fullTextAnnotation": {"text": "Texte reconstitué de Gallica"}}
        payload = {"responses": [annotation]}
        fake_urlopen = patch(
            "urllib.request.urlopen",
            return_value=_mock_urlopen_response(payload),
        )
        with fake_urlopen:
            ocr_text = engine._run_ocr(fake_image)

        assert ocr_text == "Texte reconstitué de Gallica"

    def test_rest_response_with_error_field_raises(self, monkeypatch, fake_image):
        """An ``error`` entry inside the response body surfaces as RuntimeError."""
        engine = self._engine_with_api_key(monkeypatch)

        payload = {"responses": [{"error": {"message": "Quota exhausted"}}]}
        fake_urlopen = patch(
            "urllib.request.urlopen",
            return_value=_mock_urlopen_response(payload),
        )
        with fake_urlopen, pytest.raises(RuntimeError, match="(?i)Quota"):
            engine._run_ocr(fake_image)

    def test_http_error_remontes_lisible(self, monkeypatch, fake_image):
        """An HTTP 400 from ``urlopen`` surfaces as a RuntimeError mentioning 400."""
        engine = self._engine_with_api_key(monkeypatch)

        http_400 = HTTPError(
            url="https://vision.googleapis.com/v1/images:annotate",
            code=400,
            msg="Bad Request",
            hdrs=None,  # type: ignore[arg-type]
            fp=io.BytesIO(b'{"error": "bad image"}'),
        )
        failing_urlopen = patch("urllib.request.urlopen", side_effect=http_400)
        with failing_urlopen, pytest.raises(RuntimeError, match="(?i)400"):
            engine._run_ocr(fake_image)

    def test_text_detection_extracts_first_annotation(self, monkeypatch, fake_image):
        """With ``TEXT_DETECTION`` only the first annotation's description is returned."""
        engine = self._engine_with_api_key(
            monkeypatch, config={"feature_type": "TEXT_DETECTION"}
        )

        annotations = [
            {"description": "Premier annot"},
            {"description": "Second annot"},
        ]
        payload = {"responses": [{"textAnnotations": annotations}]}
        fake_urlopen = patch(
            "urllib.request.urlopen",
            return_value=_mock_urlopen_response(payload),
        )
        with fake_urlopen:
            ocr_text = engine._run_ocr(fake_image)

        assert ocr_text == "Premier annot"
# ---------------------------------------------------------------------------
# 3. AzureDocIntelEngine
# ---------------------------------------------------------------------------
class TestAzureDocIntelEngine:
    """Tests for ``AzureDocIntelEngine`` that need no network access."""

    def test_class_metadata(self, monkeypatch):
        """Name, version and execution mode are readable without key or endpoint."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        for variable in ("AZURE_DOC_INTEL_KEY", "AZURE_DOC_INTEL_ENDPOINT"):
            monkeypatch.delenv(variable, raising=False)
        engine = AzureDocIntelEngine()

        assert engine.name == "azure_doc_intel"
        assert engine.version()  # truthy: a non-empty date string is expected
        assert engine.execution_mode == "io"

    def test_missing_key_raises(self, monkeypatch, fake_image):
        """With an endpoint but no key, the RuntimeError names ``AZURE_DOC_INTEL_KEY``."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.setenv(
            "AZURE_DOC_INTEL_ENDPOINT", "https://x.cognitiveservices.azure.com"
        )
        engine = AzureDocIntelEngine()

        expected_failure = pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_KEY")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_missing_endpoint_raises(self, monkeypatch, fake_image):
        """With a key but no endpoint, the RuntimeError names the endpoint variable."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.setenv("AZURE_DOC_INTEL_KEY", "k")
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        engine = AzureDocIntelEngine()

        expected_failure = pytest.raises(
            RuntimeError, match="AZURE_DOC_INTEL_ENDPOINT"
        )
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_extract_text_pure_function(self):
        """``_extract_text_from_result`` returns the line contents of every page."""
        # Called on the class itself: no instance, network or mock is needed.
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        first_page = {
            "lines": [
                {"content": "Première ligne"},
                {"content": "Deuxième ligne"},
                {"content": ""},  # empty line; the original note says it is ignored
            ],
        }
        second_page = {"lines": [{"content": "Page 2 — texte"}]}
        analysis = {"analyzeResult": {"pages": [first_page, second_page]}}

        extracted = AzureDocIntelEngine._extract_text_from_result(analysis)

        for fragment in ("Première ligne", "Deuxième ligne", "Page 2 — texte"):
            assert fragment in extracted

    def test_extract_text_handles_empty_result(self):
        """An empty dict and an empty ``pages`` list both give an empty string."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        for empty_result in ({}, {"analyzeResult": {"pages": []}}):
            assert AzureDocIntelEngine._extract_text_from_result(empty_result) == ""
# ---------------------------------------------------------------------------
# 4. Cohérence inter-moteurs cloud — Sprint 24/31
# ---------------------------------------------------------------------------
class TestCloudEngineExecutionMode:
    """Guard the invariant that every cloud engine reports ``execution_mode == 'io'``.

    Sprint 24 documents the cloud engines as IO mode. Per the original
    note, if a later sprint switched one of them to 'cpu' the runner would
    stop scheduling it in the ThreadPool, which would be a silent
    performance regression.
    """

    def test_all_cloud_engines_are_io_bound(self, monkeypatch):
        """Instantiate each cloud engine with a clean environment and check its mode."""
        credential_variables = (
            "MISTRAL_API_KEY",
            "GOOGLE_API_KEY",
            "GOOGLE_APPLICATION_CREDENTIALS",
            "AZURE_DOC_INTEL_KEY",
            "AZURE_DOC_INTEL_ENDPOINT",
        )
        # Clear credentials first so no engine tries to initialise a cloud client.
        for variable in credential_variables:
            monkeypatch.delenv(variable, raising=False)

        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        from picarones.engines.google_vision import GoogleVisionEngine
        from picarones.engines.mistral_ocr import MistralOCREngine

        cloud_engines = (MistralOCREngine, GoogleVisionEngine, AzureDocIntelEngine)
        for cls in cloud_engines:
            mode = cls().execution_mode
            assert mode == "io", (
                f"{cls.__name__} doit rester IO-bound (utilisé en ThreadPool)"
            )
|