"""Tests Sprint 31 — couverture dédiée des moteurs OCR cloud. Avant Sprint 31, ``picarones/engines/{mistral_ocr,google_vision, azure_doc_intel}.py`` n'étaient testés que via les fixtures du runner — ce qui signifiait qu'on ne déclenchait jamais leurs branches d'erreur (clé manquante, endpoint manquant, HTTP 4xx/5xx, format de réponse inattendu). Ce fichier mocke ``urllib.request.urlopen`` pour les trois moteurs et vérifie : - la **création** réussie sans clef API ne plante pas (clés sont lues paresseusement dans ``_run_ocr``) ; - l'**absence de clé** lève ``RuntimeError`` avec un message qui pointe vers la bonne variable d'environnement ; - le **happy path REST** retourne le texte attendu d'une réponse JSON fictive ; - les **erreurs HTTP** sont remontées en ``RuntimeError`` lisibles ; - les **propriétés** ``name``, ``version`` et ``execution_mode`` sont déclarées correctement (Sprint 31 — moteurs cloud doivent hériter de ``execution_mode='io'`` du parent). """ from __future__ import annotations import io import json from pathlib import Path from unittest.mock import MagicMock, patch from urllib.error import HTTPError import pytest # --------------------------------------------------------------------------- # Fixture utilitaire — image PNG minimale # --------------------------------------------------------------------------- @pytest.fixture def fake_image(tmp_path: Path) -> Path: """Crée un PNG 10x10 décodable par Pillow.""" from PIL import Image p = tmp_path / "test.png" Image.new("RGB", (10, 10), color=(120, 120, 120)).save(p, format="PNG") return p def _mock_urlopen_response(json_body: dict, headers: dict | None = None) -> MagicMock: """Construit un faux ``urlopen`` context manager qui retourne ``json_body``.""" raw = json.dumps(json_body).encode("utf-8") mock_resp = MagicMock() mock_resp.read.return_value = raw mock_resp.headers = headers or {} mock_cm = MagicMock() mock_cm.__enter__.return_value = mock_resp mock_cm.__exit__.return_value = False return mock_cm 
# ---------------------------------------------------------------------------
# 1. MistralOCREngine
# ---------------------------------------------------------------------------


class TestMistralOCREngine:
    """Tests for ``MistralOCREngine`` with ``urllib.request.urlopen`` mocked."""

    def test_class_metadata(self, monkeypatch):
        """The engine builds without an API key and exposes its metadata."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        assert engine.name == "mistral_ocr"
        # Only truthiness is checked; a non-empty version string is expected.
        assert engine.version()
        # Sprint 24/31 — execution_mode is expected to be the parent's 'io'.
        assert engine.execution_mode == "io"

    def test_missing_api_key_raises(self, monkeypatch, fake_image):
        """Running OCR without the key raises an error naming the variable."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()

        expected_failure = pytest.raises(RuntimeError, match="MISTRAL_API_KEY")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_native_ocr_endpoint_parses_pages(self, monkeypatch, fake_image):
        """``mistral-ocr-latest`` is expected to route to ``/v1/ocr`` and join pages."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})
        payload = {
            "pages": [
                {"markdown": "Page 1 — Lorem ipsum"},
                {"markdown": "Page 2 — dolor sit amet"},
            ],
        }
        fake_response = _mock_urlopen_response(payload)

        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        for marker in ("Page 1", "Page 2"):
            assert marker in extracted
        # Pages are joined with a blank line (double newline).
        assert "\n\n" in extracted

    def test_native_endpoint_handles_empty_pages(self, monkeypatch, fake_image):
        """An empty ``pages`` list yields an empty string."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})
        fake_response = _mock_urlopen_response({"pages": []})

        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == ""


# ---------------------------------------------------------------------------
# 2.
# GoogleVisionEngine
# ---------------------------------------------------------------------------


class TestGoogleVisionEngine:
    """Tests for ``GoogleVisionEngine`` with ``urllib.request.urlopen`` mocked."""

    @staticmethod
    def _engine_with_api_key(monkeypatch, **engine_kwargs):
        """Build an engine with only ``GOOGLE_API_KEY`` set in the environment.

        ``GOOGLE_APPLICATION_CREDENTIALS`` is removed first; ``engine_kwargs``
        are forwarded unchanged to the ``GoogleVisionEngine`` constructor.
        """
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
        return GoogleVisionEngine(**engine_kwargs)

    def test_class_metadata(self, monkeypatch):
        """The engine builds without credentials and exposes its metadata."""
        from picarones.engines.google_vision import GoogleVisionEngine

        for variable in ("GOOGLE_API_KEY", "GOOGLE_APPLICATION_CREDENTIALS"):
            monkeypatch.delenv(variable, raising=False)
        engine = GoogleVisionEngine()

        assert engine.name == "google_vision"
        assert engine.version() == "v1"
        assert engine.execution_mode == "io"

    def test_missing_credentials_raises(self, monkeypatch, fake_image):
        """With no credential variable set, running OCR raises ``RuntimeError``."""
        from picarones.engines.google_vision import GoogleVisionEngine

        for variable in ("GOOGLE_API_KEY", "GOOGLE_APPLICATION_CREDENTIALS"):
            monkeypatch.delenv(variable, raising=False)
        engine = GoogleVisionEngine()

        expected_failure = pytest.raises(RuntimeError, match="(?i)Authentification")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_rest_happy_path_extracts_text(self, monkeypatch, fake_image):
        """``fullTextAnnotation.text`` of the first response is returned as is."""
        engine = self._engine_with_api_key(monkeypatch)
        payload = {
            "responses": [
                {"fullTextAnnotation": {"text": "Texte reconstitué de Gallica"}},
            ],
        }
        fake_response = _mock_urlopen_response(payload)

        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == "Texte reconstitué de Gallica"

    def test_rest_response_with_error_field_raises(self, monkeypatch, fake_image):
        """An ``error`` entry inside the JSON response becomes a ``RuntimeError``."""
        engine = self._engine_with_api_key(monkeypatch)
        payload = {"responses": [{"error": {"message": "Quota exhausted"}}]}
        fake_response = _mock_urlopen_response(payload)

        with patch("urllib.request.urlopen", return_value=fake_response), \
                pytest.raises(RuntimeError, match="(?i)Quota"):
            engine._run_ocr(fake_image)

    def test_http_error_remontes_lisible(self, monkeypatch, fake_image):
        """An HTTP 400 from ``urlopen`` surfaces as a ``RuntimeError`` citing 400."""
        engine = self._engine_with_api_key(monkeypatch)
        http_400 = HTTPError(
            url="https://vision.googleapis.com/v1/images:annotate",
            code=400,
            msg="Bad Request",
            hdrs=None,  # type: ignore[arg-type]
            fp=io.BytesIO(b'{"error": "bad image"}'),
        )

        with patch("urllib.request.urlopen", side_effect=http_400), \
                pytest.raises(RuntimeError, match="(?i)400"):
            engine._run_ocr(fake_image)

    def test_text_detection_extracts_first_annotation(self, monkeypatch, fake_image):
        """In ``TEXT_DETECTION`` mode only the first annotation is returned."""
        engine = self._engine_with_api_key(
            monkeypatch, config={"feature_type": "TEXT_DETECTION"}
        )
        annotations = [
            {"description": "Premier annot"},
            {"description": "Second annot"},
        ]
        payload = {"responses": [{"textAnnotations": annotations}]}
        fake_response = _mock_urlopen_response(payload)

        with patch("urllib.request.urlopen", return_value=fake_response):
            extracted = engine._run_ocr(fake_image)

        assert extracted == "Premier annot"


# ---------------------------------------------------------------------------
# 3.
# AzureDocIntelEngine
# ---------------------------------------------------------------------------


class TestAzureDocIntelEngine:
    """Tests for ``AzureDocIntelEngine`` that need no network access."""

    def test_class_metadata(self, monkeypatch):
        """The engine builds without key/endpoint and exposes its metadata."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        for variable in ("AZURE_DOC_INTEL_KEY", "AZURE_DOC_INTEL_ENDPOINT"):
            monkeypatch.delenv(variable, raising=False)
        engine = AzureDocIntelEngine()

        assert engine.name == "azure_doc_intel"
        # Only truthiness is checked; a non-empty date string is expected.
        assert engine.version()
        assert engine.execution_mode == "io"

    def test_missing_key_raises(self, monkeypatch, fake_image):
        """With the endpoint set but no key, the error names the key variable."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.setenv(
            "AZURE_DOC_INTEL_ENDPOINT", "https://x.cognitiveservices.azure.com"
        )
        engine = AzureDocIntelEngine()

        expected_failure = pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_KEY")
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_missing_endpoint_raises(self, monkeypatch, fake_image):
        """With the key set but no endpoint, the error names the endpoint variable."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.setenv("AZURE_DOC_INTEL_KEY", "k")
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        engine = AzureDocIntelEngine()

        expected_failure = pytest.raises(
            RuntimeError, match="AZURE_DOC_INTEL_ENDPOINT"
        )
        with expected_failure:
            engine._run_ocr(fake_image)

    def test_extract_text_pure_function(self):
        """Line contents from every page appear in the extracted text."""
        # Called on the class itself — testable without network or mocks.
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        first_page = {
            "lines": [
                {"content": "Première ligne"},
                {"content": "Deuxième ligne"},
                # Empty content — expected to be skipped (not asserted here).
                {"content": ""},
            ],
        }
        second_page = {"lines": [{"content": "Page 2 — texte"}]}
        result = {"analyzeResult": {"pages": [first_page, second_page]}}

        extracted = AzureDocIntelEngine._extract_text_from_result(result)

        for expected in ("Première ligne", "Deuxième ligne", "Page 2 — texte"):
            assert expected in extracted

    def test_extract_text_handles_empty_result(self):
        """An empty dict and an empty page list both yield an empty string."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        extract = AzureDocIntelEngine._extract_text_from_result
        for empty_result in ({}, {"analyzeResult": {"pages": []}}):
            assert extract(empty_result) == ""


# ---------------------------------------------------------------------------
# 4. Cross-engine consistency for cloud engines — Sprint 24/31
# ---------------------------------------------------------------------------


class TestCloudEngineExecutionMode:
    """Sprint 24 documents that the cloud engines run in IO mode.

    The test checks that invariant — if a future sprint switched one of them
    to 'cpu', the runner would no longer place it in the ThreadPool, which
    would be a silent performance regression.
    """

    def test_all_cloud_engines_are_io_bound(self, monkeypatch):
        """Every cloud engine reports ``execution_mode == "io"``."""
        # Clear the env vars so no cloud client initialisation is attempted.
        credential_variables = (
            "MISTRAL_API_KEY",
            "GOOGLE_API_KEY",
            "GOOGLE_APPLICATION_CREDENTIALS",
            "AZURE_DOC_INTEL_KEY",
            "AZURE_DOC_INTEL_ENDPOINT",
        )
        for variable in credential_variables:
            monkeypatch.delenv(variable, raising=False)

        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        from picarones.engines.google_vision import GoogleVisionEngine
        from picarones.engines.mistral_ocr import MistralOCREngine

        cloud_engines = (MistralOCREngine, GoogleVisionEngine, AzureDocIntelEngine)
        for engine_cls in cloud_engines:
            engine = engine_cls()
            assert engine.execution_mode == "io", (
                f"{engine_cls.__name__} doit rester IO-bound (utilisé en ThreadPool)"
            )