Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Picarones / tests /test_engines_cloud.py

Claude

sprint31: couverture des modules sous-testés (char_scores, cloud, pricing)

cb3fbeb unverified 2 months ago

12.1 kB

	"""Tests Sprint 31 — couverture dédiée des moteurs OCR cloud.

	Avant Sprint 31, ``picarones/engines/{mistral_ocr,google_vision,
	azure_doc_intel}.py`` n'étaient testés que via les fixtures du runner —
	ce qui signifiait qu'on ne déclenchait jamais leurs branches d'erreur
	(clé manquante, endpoint manquant, HTTP 4xx/5xx, format de réponse
	inattendu). Ce fichier mocke ``urllib.request.urlopen`` pour les trois
	moteurs et vérifie :

	- la création d'un moteur sans clé API ne plante pas (les clés sont lues
	paresseusement dans ``_run_ocr``) ;
	- l'absence de clé lève ``RuntimeError`` avec un message qui
	pointe vers la bonne variable d'environnement ;
	- le happy path REST retourne le texte attendu d'une réponse JSON
	fictive ;
	- les erreurs HTTP sont remontées en ``RuntimeError`` lisibles ;
	- les propriétés ``name``, ``version`` et ``execution_mode``
	sont déclarées correctement (Sprint 31 — moteurs cloud doivent
	hériter de ``execution_mode='io'`` du parent).
	"""

	from __future__ import annotations

	import io
	import json
	from pathlib import Path
	from unittest.mock import MagicMock, patch
	from urllib.error import HTTPError

	import pytest


	# ---------------------------------------------------------------------------
	# Fixture utilitaire — image PNG minimale
	# ---------------------------------------------------------------------------

	@pytest.fixture
	def fake_image(tmp_path: Path) -> Path:
	    """Write a tiny 10x10 grey RGB PNG into ``tmp_path`` and return its path.

	    Pillow is imported lazily inside the fixture, as in the original code.
	    """
	    from PIL import Image

	    target = tmp_path / "test.png"
	    canvas = Image.new("RGB", (10, 10), color=(120, 120, 120))
	    canvas.save(target, format="PNG")
	    return target


	def _mock_urlopen_response(json_body: dict, headers: dict \| None = None) -> MagicMock:
	"""Construit un faux ``urlopen`` context manager qui retourne ``json_body``."""
	raw = json.dumps(json_body).encode("utf-8")
	mock_resp = MagicMock()
	mock_resp.read.return_value = raw
	mock_resp.headers = headers or {}
	mock_cm = MagicMock()
	mock_cm.__enter__.return_value = mock_resp
	mock_cm.__exit__.return_value = False
	return mock_cm


	# ---------------------------------------------------------------------------
	# 1. MistralOCREngine
	# ---------------------------------------------------------------------------

	class TestMistralOCREngine:
	    """Tests for ``MistralOCREngine`` with ``urllib.request.urlopen`` patched."""

	    def test_class_metadata(self, monkeypatch):
	        """The engine can be built without an API key and exposes its metadata."""
	        from picarones.engines.mistral_ocr import MistralOCREngine

	        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
	        engine = MistralOCREngine()
	        assert engine.name == "mistral_ocr"
	        assert engine.version()  # only checks that the value is truthy
	        # Sprint 24/31 — execution_mode is expected to be the 'io' value
	        # inherited from the parent class.
	        assert engine.execution_mode == "io"

	    def test_missing_api_key_raises(self, monkeypatch, fake_image):
	        """Without ``MISTRAL_API_KEY`` the OCR call fails and names the variable."""
	        from picarones.engines.mistral_ocr import MistralOCREngine

	        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
	        engine = MistralOCREngine()
	        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
	            engine._run_ocr(fake_image)

	    def test_native_ocr_endpoint_parses_pages(self, monkeypatch, fake_image):
	        """Page markdown from a ``mistral-ocr-latest`` response is concatenated.

	        The original note states this model routes to ``/v1/ocr``; the URL
	        itself is not asserted by this test.
	        """
	        from picarones.engines.mistral_ocr import MistralOCREngine

	        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
	        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

	        page_texts = ("Page 1 — Lorem ipsum", "Page 2 — dolor sit amet")
	        payload = {"pages": [{"markdown": markdown} for markdown in page_texts]}
	        fake_response = _mock_urlopen_response(payload)
	        with patch("urllib.request.urlopen", return_value=fake_response):
	            extracted = engine._run_ocr(fake_image)

	        assert "Page 1" in extracted
	        assert "Page 2" in extracted
	        # Pages are expected to be joined by a blank line.
	        assert "\n\n" in extracted

	    def test_native_endpoint_handles_empty_pages(self, monkeypatch, fake_image):
	        """An empty ``pages`` list yields an empty string."""
	        from picarones.engines.mistral_ocr import MistralOCREngine

	        monkeypatch.setenv("MISTRAL_API_KEY", "fake-key")
	        engine = MistralOCREngine(config={"model": "mistral-ocr-latest"})

	        fake_response = _mock_urlopen_response({"pages": []})
	        with patch("urllib.request.urlopen", return_value=fake_response):
	            extracted = engine._run_ocr(fake_image)
	        assert extracted == ""


	# ---------------------------------------------------------------------------
	# 2. GoogleVisionEngine
	# ---------------------------------------------------------------------------

	class TestGoogleVisionEngine:
	    """Tests for ``GoogleVisionEngine`` with ``urllib.request.urlopen`` patched."""

	    @staticmethod
	    def _engine_with_api_key(monkeypatch, **engine_kwargs):
	        """Build an engine with a fake ``GOOGLE_API_KEY`` set.

	        ``GOOGLE_APPLICATION_CREDENTIALS`` is unset first, then the engine is
	        constructed with ``engine_kwargs`` forwarded unchanged.
	        """
	        from picarones.engines.google_vision import GoogleVisionEngine

	        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
	        monkeypatch.setenv("GOOGLE_API_KEY", "fake-key")
	        return GoogleVisionEngine(**engine_kwargs)

	    def test_class_metadata(self, monkeypatch):
	        """The engine can be built without credentials and exposes its metadata."""
	        from picarones.engines.google_vision import GoogleVisionEngine

	        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
	        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
	        engine = GoogleVisionEngine()
	        assert engine.name == "google_vision"
	        assert engine.version() == "v1"
	        assert engine.execution_mode == "io"

	    def test_missing_credentials_raises(self, monkeypatch, fake_image):
	        """With both credential variables unset, the OCR call raises."""
	        from picarones.engines.google_vision import GoogleVisionEngine

	        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
	        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
	        engine = GoogleVisionEngine()
	        with pytest.raises(RuntimeError, match="(?i)Authentification"):
	            engine._run_ocr(fake_image)

	    def test_rest_happy_path_extracts_text(self, monkeypatch, fake_image):
	        """The ``fullTextAnnotation.text`` field of the first response is returned."""
	        engine = self._engine_with_api_key(monkeypatch)

	        annotation = {"fullTextAnnotation": {"text": "Texte reconstitué de Gallica"}}
	        fake_response = _mock_urlopen_response({"responses": [annotation]})
	        with patch("urllib.request.urlopen", return_value=fake_response):
	            extracted = engine._run_ocr(fake_image)
	        assert extracted == "Texte reconstitué de Gallica"

	    def test_rest_response_with_error_field_raises(self, monkeypatch, fake_image):
	        """An ``error`` entry in the JSON response surfaces as ``RuntimeError``."""
	        engine = self._engine_with_api_key(monkeypatch)

	        payload = {"responses": [{"error": {"message": "Quota exhausted"}}]}
	        fake_response = _mock_urlopen_response(payload)
	        with patch("urllib.request.urlopen", return_value=fake_response), \
	                pytest.raises(RuntimeError, match="(?i)Quota"):
	            engine._run_ocr(fake_image)

	    def test_http_error_remontes_lisible(self, monkeypatch, fake_image):
	        """An ``HTTPError`` from ``urlopen`` surfaces as ``RuntimeError`` mentioning the code."""
	        engine = self._engine_with_api_key(monkeypatch)

	        bad_request = HTTPError(
	            url="https://vision.googleapis.com/v1/images:annotate",
	            code=400,
	            msg="Bad Request",
	            hdrs=None,  # type: ignore[arg-type]
	            fp=io.BytesIO(b'{"error": "bad image"}'),
	        )
	        with patch("urllib.request.urlopen", side_effect=bad_request), \
	                pytest.raises(RuntimeError, match="(?i)400"):
	            engine._run_ocr(fake_image)

	    def test_text_detection_extracts_first_annotation(self, monkeypatch, fake_image):
	        """With ``TEXT_DETECTION`` only the first ``textAnnotations`` description is returned."""
	        engine = self._engine_with_api_key(
	            monkeypatch, config={"feature_type": "TEXT_DETECTION"}
	        )

	        annotations = [
	            {"description": "Premier annot"},
	            {"description": "Second annot"},
	        ]
	        payload = {"responses": [{"textAnnotations": annotations}]}
	        fake_response = _mock_urlopen_response(payload)
	        with patch("urllib.request.urlopen", return_value=fake_response):
	            extracted = engine._run_ocr(fake_image)
	        assert extracted == "Premier annot"


	# ---------------------------------------------------------------------------
	# 3. AzureDocIntelEngine
	# ---------------------------------------------------------------------------

	class TestAzureDocIntelEngine:
	    """Tests for ``AzureDocIntelEngine``: metadata, missing settings, text extraction."""

	    def test_class_metadata(self, monkeypatch):
	        """The engine can be built without key or endpoint and exposes its metadata."""
	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

	        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
	        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
	        engine = AzureDocIntelEngine()
	        assert engine.name == "azure_doc_intel"
	        assert engine.version()  # only checks that the value is truthy
	        assert engine.execution_mode == "io"

	    def test_missing_key_raises(self, monkeypatch, fake_image):
	        """With the endpoint set but no key, the error names ``AZURE_DOC_INTEL_KEY``."""
	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

	        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
	        monkeypatch.setenv("AZURE_DOC_INTEL_ENDPOINT", "https://x.cognitiveservices.azure.com")
	        engine = AzureDocIntelEngine()
	        with pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_KEY"):
	            engine._run_ocr(fake_image)

	    def test_missing_endpoint_raises(self, monkeypatch, fake_image):
	        """With the key set but no endpoint, the error names ``AZURE_DOC_INTEL_ENDPOINT``."""
	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

	        monkeypatch.setenv("AZURE_DOC_INTEL_KEY", "k")
	        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
	        engine = AzureDocIntelEngine()
	        with pytest.raises(RuntimeError, match="AZURE_DOC_INTEL_ENDPOINT"):
	            engine._run_ocr(fake_image)

	    def test_extract_text_pure_function(self):
	        """Line contents from every page appear in the extracted text."""
	        # Invoked directly on the class — no network access or mocks involved.
	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

	        first_page = {
	            "lines": [
	                {"content": "Première ligne"},
	                {"content": "Deuxième ligne"},
	                {"content": ""},  # expected to be skipped (not asserted here)
	            ],
	        }
	        second_page = {"lines": [{"content": "Page 2 — texte"}]}
	        analysis = {"analyzeResult": {"pages": [first_page, second_page]}}

	        extracted = AzureDocIntelEngine._extract_text_from_result(analysis)
	        assert "Première ligne" in extracted
	        assert "Deuxième ligne" in extracted
	        assert "Page 2 — texte" in extracted

	    def test_extract_text_handles_empty_result(self):
	        """An empty dict and an empty ``pages`` list both yield an empty string."""
	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

	        extract = AzureDocIntelEngine._extract_text_from_result
	        assert extract({}) == ""
	        no_pages = {"analyzeResult": {"pages": []}}
	        assert extract(no_pages) == ""


	# ---------------------------------------------------------------------------
	# 4. Cohérence inter-moteurs cloud — Sprint 24/31
	# ---------------------------------------------------------------------------

	class TestCloudEngineExecutionMode:
	    """Pin the ``execution_mode == "io"`` invariant across the cloud engines.

	    Per the original note: Sprint 24 documents the cloud engines as IO mode,
	    and switching one of them to 'cpu' would take it out of the runner's
	    ThreadPool — a silent performance regression. The runner itself is not
	    exercised here; only the attribute value is checked.
	    """

	    def test_all_cloud_engines_are_io_bound(self, monkeypatch):
	        """Every cloud engine, built with no credentials set, reports ``"io"``."""
	        # Clear the env vars first so that construction does not attempt to
	        # initialise any cloud client.
	        credential_vars = (
	            "MISTRAL_API_KEY",
	            "GOOGLE_API_KEY",
	            "GOOGLE_APPLICATION_CREDENTIALS",
	            "AZURE_DOC_INTEL_KEY",
	            "AZURE_DOC_INTEL_ENDPOINT",
	        )
	        for var_name in credential_vars:
	            monkeypatch.delenv(var_name, raising=False)

	        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
	        from picarones.engines.google_vision import GoogleVisionEngine
	        from picarones.engines.mistral_ocr import MistralOCREngine

	        cloud_engines = (MistralOCREngine, GoogleVisionEngine, AzureDocIntelEngine)
	        for cls in cloud_engines:
	            instance = cls()
	            assert instance.execution_mode == "io", (
	                f"{cls.__name__} doit rester IO-bound (utilisé en ThreadPool)"
	            )