Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / picarones /engines /google_vision.py

Claude

refactor(engines): unifier l'API token_confidences à un seul nom canonique

eca43d9 unverified about 2 months ago

10.8 kB

	"""Adaptateur OCR — Google Cloud Vision API.

	Utilise l'API Google Cloud Vision pour la détection de texte dans des
	documents (méthode ``DOCUMENT_TEXT_DETECTION``, optimisée pour les textes
	denses et multilingues).

	Authentification :
	- Via service account JSON : variable d'environnement
	``GOOGLE_APPLICATION_CREDENTIALS`` → chemin vers le fichier JSON
	- Via clé API simple : variable d'environnement ``GOOGLE_API_KEY``

	Le mode service account est recommandé pour la production.

	Sprint 50 — exposition des token_confidences
	---------------------------------------------
	``DOCUMENT_TEXT_DETECTION`` expose ``Word.confidence`` au niveau mot
	sur chaque ``page > block > paragraph > word``. L'adaptateur parcourt
	cette hiérarchie et émet une entrée par mot au format Sprint 42.
	Les deux chemins (SDK ``google-cloud-vision`` et REST direct via
	``urllib``) sont normalisés vers une représentation unifiée.

	Pour ``TEXT_DETECTION`` (mode "court"), aucune confidence par mot
	n'est exposée : ``token_confidences = None``.

	Refactor du chantier 1 (post-Sprint 97)
	---------------------------------------
	L'adaptateur ne surcharge plus ``run()`` — il implémente ``_run_with_native``
	et ``_extract_raw_confidences`` (les hooks factorisés dans ``BaseOCREngine``).
	Comportement externe et octets de sortie strictement identiques.
	"""

	from __future__ import annotations

	import base64
	import json
	import logging
	import os
	import urllib.error
	import urllib.request
	from pathlib import Path
	from typing import Any, Optional

	from picarones.engines.base import BaseOCREngine


	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)


	class GoogleVisionEngine(BaseOCREngine):
	    """OCR engine backed by the Google Cloud Vision API.

	    Credentials are read from the environment when the engine is built:
	    ``GOOGLE_APPLICATION_CREDENTIALS`` selects the ``google-cloud-vision``
	    SDK path, ``GOOGLE_API_KEY`` selects a direct REST call made with
	    ``urllib``. When both are set, the SDK path is used.

	    Configuration
	    -------------
	    language_hints : list[str]
	        Language hints sent with every request (default ``["fr"]``).
	    feature_type : str
	        ``"DOCUMENT_TEXT_DETECTION"`` (default) or ``"TEXT_DETECTION"``.
	        Only the former yields a full-text annotation, and therefore
	        per-word confidences.
	    expose_confidences : bool
	        ``True`` (default): ``_extract_raw_confidences`` returns one entry
	        per recognised word. ``False``: it always returns ``None``.
	    """

	    @property
	    def name(self) -> str:
	        """Identifier of this engine."""
	        return "google_vision"

	    def version(self) -> str:
	        """Return the version label reported for this engine."""
	        return "v1"

	    def __init__(self, config: Optional[dict] = None) -> None:
	        """Read credentials from the environment and options from ``config``.

	        Args:
	            config: Optional configuration mapping, forwarded to the base
	                class and then read back through ``self.config``.
	        """
	        super().__init__(config)
	        self._api_key = os.environ.get("GOOGLE_API_KEY")
	        self._credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
	        self._language_hints: list[str] = self.config.get("language_hints", ["fr"])
	        self._feature_type: str = self.config.get(
	            "feature_type", "DOCUMENT_TEXT_DETECTION"
	        )

	    def _run_ocr(self, image_path: Path) -> str:
	        """Return only the recognised text for ``image_path``."""
	        text, _native = self._run_with_native(image_path)
	        return text

	    def _run_with_native(
	        self, image_path: Path,
	    ) -> tuple[str, Optional[dict]]:
	        """Run OCR and return ``(text, full_text_annotation_dict)``.

	        The second element is:

	        - the raw ``fullTextAnnotation`` JSON object on the REST path,
	        - a dict with the same nesting rebuilt from the SDK response on
	          the SDK path,
	        - ``None`` when the feature type is not
	          ``DOCUMENT_TEXT_DETECTION``.

	        Raises:
	            RuntimeError: If neither ``GOOGLE_APPLICATION_CREDENTIALS`` nor
	                ``GOOGLE_API_KEY`` was set when the engine was created.
	        """
	        if self._credentials_path:
	            return self._run_via_sdk(image_path)
	        if self._api_key:
	            return self._run_via_rest(image_path)
	        raise RuntimeError(
	            "Authentification Google Vision manquante. Définissez "
	            "GOOGLE_APPLICATION_CREDENTIALS (service account JSON) "
	            "ou GOOGLE_API_KEY."
	        )

	    def _run_via_sdk(self, image_path: Path) -> tuple[str, Optional[dict]]:
	        """Run OCR through the ``google-cloud-vision`` client library.

	        Raises:
	            RuntimeError: If the client library is not installed, or if the
	                response carries an error status for the image.
	        """
	        try:
	            from google.cloud import vision
	        except ImportError as exc:
	            raise RuntimeError(
	                "Le package 'google-cloud-vision' n'est pas installé. "
	                "Lancez : pip install google-cloud-vision"
	            ) from exc

	        client = vision.ImageAnnotatorClient()
	        image = vision.Image(content=image_path.read_bytes())
	        context = vision.ImageContext(language_hints=self._language_hints)
	        document_mode = self._feature_type == "DOCUMENT_TEXT_DETECTION"

	        if document_mode:
	            response = client.document_text_detection(
	                image=image, image_context=context,
	            )
	        else:
	            response = client.text_detection(image=image, image_context=context)

	        # Per-image failures come back inside the response instead of being
	        # raised, so surface them the same way the REST path does. Only a
	        # non-empty string counts, which keeps attribute-less or mocked
	        # responses from being mistaken for errors.
	        error_message = getattr(getattr(response, "error", None), "message", "")
	        if isinstance(error_message, str) and error_message:
	            raise RuntimeError(f"Google Vision API erreur : {error_message}")

	        if document_mode:
	            annotation = response.full_text_annotation
	            return annotation.text, self._sdk_full_text_to_dict(annotation)

	        annotations = response.text_annotations
	        return (annotations[0].description if annotations else ""), None

	    def _run_via_rest(self, image_path: Path) -> tuple[str, Optional[dict]]:
	        """Run OCR through a direct REST call authenticated by API key.

	        Raises:
	            RuntimeError: On an HTTP error status, or when the first entry
	                of ``responses`` contains an ``error`` member.
	        """
	        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
	        payload = {
	            "requests": [
	                {
	                    "image": {"content": image_b64},
	                    "features": [{"type": self._feature_type, "maxResults": 1}],
	                    "imageContext": {"languageHints": self._language_hints},
	                }
	            ]
	        }
	        request = urllib.request.Request(
	            "https://vision.googleapis.com/v1/images:annotate",
	            data=json.dumps(payload).encode("utf-8"),
	            headers={
	                "Content-Type": "application/json",
	                "X-Goog-Api-Key": self._api_key,
	            },
	        )
	        try:
	            with urllib.request.urlopen(request, timeout=60) as resp:
	                result = json.loads(resp.read().decode("utf-8"))
	        except urllib.error.HTTPError as exc:
	            # Decode leniently: a body that is not valid UTF-8 must not hide
	            # the HTTP failure behind a UnicodeDecodeError.
	            body = exc.read().decode("utf-8", errors="replace")
	            raise RuntimeError(
	                f"Google Vision API erreur {exc.code}: {body}"
	            ) from exc

	        responses = result.get("responses", [{}])
	        if not responses:
	            return "", None
	        first = responses[0]
	        if "error" in first:
	            raise RuntimeError(f"Google Vision API erreur : {first['error']}")

	        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
	            # Empty or missing annotations are normalised to None.
	            full = first.get("fullTextAnnotation") or None
	            text = full.get("text", "") if isinstance(full, dict) else ""
	            return text, full

	        annotations = first.get("textAnnotations", [])
	        return (annotations[0]["description"] if annotations else ""), None

	    # ------------------------------------------------------------------
	    # SDK response -> plain dict (so both paths share one representation)
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _sdk_full_text_to_dict(full_text_annotation: Any) -> dict:
	        """Rebuild the REST-style nesting from an SDK full-text annotation.

	        The result has the shape ``{pages: [{blocks: [{paragraphs:
	        [{words: [{confidence, symbols: [{text}]}]}]}]}]}``. Missing or
	        empty attributes at any level are treated as empty sequences; a
	        word without a ``confidence`` attribute gets ``0.0``.
	        """
	        pages = []
	        for page in getattr(full_text_annotation, "pages", []) or []:
	            blocks = []
	            for block in getattr(page, "blocks", []) or []:
	                paragraphs = []
	                for para in getattr(block, "paragraphs", []) or []:
	                    words = [
	                        {
	                            "confidence": float(getattr(word, "confidence", 0.0)),
	                            "symbols": [
	                                {"text": getattr(symbol, "text", "")}
	                                for symbol in getattr(word, "symbols", []) or []
	                            ],
	                        }
	                        for word in getattr(para, "words", []) or []
	                    ]
	                    paragraphs.append({"words": words})
	                blocks.append({"paragraphs": paragraphs})
	            pages.append({"blocks": blocks})
	        return {"pages": pages}

	    # ------------------------------------------------------------------
	    # Per-word confidences
	    # ------------------------------------------------------------------

	    def _extract_raw_confidences(
	        self, native: Any,
	    ) -> Optional[list[dict[str, Any]]]:
	        """Emit ``{"token": word, "confidence": value}`` for every word.

	        Walks ``pages -> blocks -> paragraphs -> words`` in document order.
	        Each word is rebuilt by concatenating the ``text`` of its symbols
	        and stripping surrounding whitespace; the ``confidence`` value is
	        passed through unchanged. Entries that are not dicts are ignored
	        at every level, symbols included.

	        Args:
	            native: The second element returned by ``_run_with_native``.

	        Returns:
	            The list of word entries, or ``None`` when
	            ``expose_confidences`` is disabled, ``native`` is empty or not
	            a dict, or no word has both text and a confidence.
	        """
	        if not self.config.get("expose_confidences", True):
	            return None
	        if not native or not isinstance(native, dict):
	            return None

	        # Descend one level at a time instead of nesting four loops; the
	        # document order of the words is preserved.
	        nodes: list[dict] = [native]
	        for key in ("pages", "blocks", "paragraphs", "words"):
	            nodes = [
	                child
	                for node in nodes
	                for child in (node.get(key) or [])
	                if isinstance(child, dict)
	            ]

	        entries: list[dict[str, Any]] = []
	        for word in nodes:
	            confidence = word.get("confidence")
	            if confidence is None:
	                continue
	            token = "".join(
	                symbol["text"]
	                for symbol in (word.get("symbols") or [])
	                if isinstance(symbol, dict) and isinstance(symbol.get("text"), str)
	            ).strip()
	            if token:
	                entries.append({"token": token, "confidence": confidence})
	        return entries or None