Picarones / picarones /engines /google_vision.py
Claude
refactor(engines): unifier l'API token_confidences Γ  un seul nom canonique
eca43d9 unverified
Raw
History Blame
10.8 kB
"""Adaptateur OCR β€” Google Cloud Vision API.
Utilise l'API Google Cloud Vision pour la détection de texte dans des
documents (méthode ``DOCUMENT_TEXT_DETECTION``, optimisée pour les textes
denses et multilingues).
Authentification :
- Via service account JSON : variable d'environnement
``GOOGLE_APPLICATION_CREDENTIALS`` → chemin vers le fichier JSON
- Via clé API simple : variable d'environnement ``GOOGLE_API_KEY``
Le mode service account est recommandé pour la production.
Sprint 50 — exposition des token_confidences
---------------------------------------------
``DOCUMENT_TEXT_DETECTION`` expose ``Word.confidence`` au niveau mot
sur chaque ``page > block > paragraph > word``. L'adapter parcourt
cette hiérarchie et émet une entrée par mot au format Sprint 42.
Les deux chemins (SDK ``google-cloud-vision`` et REST direct via
``urllib``) sont normalisés vers une représentation unifiée.
Pour ``TEXT_DETECTION`` (mode "court"), aucune confidence par mot
n'est exposée : ``token_confidences = None``.
Refactor du chantier 1 (post-Sprint 97)
---------------------------------------
L'adapter ne surcharge plus ``run()`` — il implémente ``_run_with_native``
et ``_extract_raw_confidences`` (les hooks factorisés dans ``BaseOCREngine``).
Comportement externe et octets de sortie strictement identiques.
"""
from __future__ import annotations
import base64
import json
import logging
import os
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any, Optional
from picarones.engines.base import BaseOCREngine
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class GoogleVisionEngine(BaseOCREngine):
    """OCR engine backed by the Google Cloud Vision API.

    Two transports are supported; the choice is made per call from the
    environment variables captured in ``__init__``:

    - ``GOOGLE_APPLICATION_CREDENTIALS`` set: the ``google-cloud-vision``
      SDK is used;
    - otherwise, ``GOOGLE_API_KEY`` set: a direct REST call via ``urllib``.

    Configuration
    -------------
    language_hints : list[str]
        Language hints forwarded to the API (default ``["fr"]``).
    feature_type : str
        ``"DOCUMENT_TEXT_DETECTION"`` (default, dense text) or
        ``"TEXT_DETECTION"`` (short text, no per-word confidence).
    expose_confidences : bool
        ``True`` (default): per-word confidences are extracted from the
        full-text annotation. ``False``: extraction is skipped and
        ``_extract_raw_confidences`` returns ``None``.
    """

    @property
    def name(self) -> str:
        """Return the engine identifier, ``"google_vision"``."""
        return "google_vision"

    def version(self) -> str:
        """Return the version label ``"v1"`` (same as the REST endpoint path)."""
        return "v1"

    def __init__(self, config: Optional[dict] = None) -> None:
        """Capture credentials from the environment and options from *config*.

        ``self.config`` is expected to be set by ``BaseOCREngine.__init__``
        (not visible from this module — verify against the base class).
        """
        super().__init__(config)
        self._api_key = os.environ.get("GOOGLE_API_KEY")
        self._credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        self._language_hints: list[str] = self.config.get("language_hints", ["fr"])
        self._feature_type: str = self.config.get(
            "feature_type", "DOCUMENT_TEXT_DETECTION"
        )

    def _run_ocr(self, image_path: Path) -> str:
        """Run OCR on *image_path* and return only the recognized text."""
        text, _native = self._run_with_native(image_path)
        return text

    def _run_with_native(
        self, image_path: Path,
    ) -> tuple[str, Optional[dict]]:
        """Run OCR and return ``(text, full_text_annotation_dict)``.

        The second element is:

        - the raw ``fullTextAnnotation`` JSON object on the REST path,
        - a dict rebuilt from the SDK objects on the SDK path,
        - ``None`` when the feature type is not
          ``DOCUMENT_TEXT_DETECTION`` (no per-word confidence).

        Raises:
            RuntimeError: if neither ``GOOGLE_APPLICATION_CREDENTIALS`` nor
                ``GOOGLE_API_KEY`` was set when the engine was created.
        """
        # Service-account credentials take precedence over a plain API key.
        if self._credentials_path:
            return self._run_via_sdk(image_path)
        if self._api_key:
            return self._run_via_rest(image_path)
        raise RuntimeError(
            "Authentification Google Vision manquante. Définissez "
            "GOOGLE_APPLICATION_CREDENTIALS (service account JSON) "
            "ou GOOGLE_API_KEY."
        )

    @staticmethod
    def _raise_on_sdk_error(response: Any) -> None:
        """Raise ``RuntimeError`` if an SDK response carries an API error.

        Mirrors the REST path, which raises when the per-image response
        contains an ``error`` member. Only a real, non-empty string message
        triggers the error, so response objects without a populated
        ``error.message`` are left alone.
        """
        message = getattr(getattr(response, "error", None), "message", "")
        if isinstance(message, str) and message:
            raise RuntimeError(f"Google Vision API erreur : {message}")

    def _run_via_sdk(self, image_path: Path) -> tuple[str, Optional[dict]]:
        """Call the API through the ``google-cloud-vision`` SDK.

        Raises:
            RuntimeError: if the SDK is not installed, or if the response
                reports an API error.
        """
        # Imported lazily so the module loads without the optional SDK.
        try:
            from google.cloud import vision
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'google-cloud-vision' n'est pas installé. "
                "Lancez : pip install google-cloud-vision"
            ) from exc

        client = vision.ImageAnnotatorClient()
        image = vision.Image(content=image_path.read_bytes())
        image_context = vision.ImageContext(language_hints=self._language_hints)

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            response = client.document_text_detection(
                image=image, image_context=image_context,
            )
            self._raise_on_sdk_error(response)
            annotation = response.full_text_annotation
            return annotation.text, self._sdk_full_text_to_dict(annotation)

        response = client.text_detection(image=image, image_context=image_context)
        self._raise_on_sdk_error(response)
        texts = response.text_annotations
        # The first annotation's description is used as the full text.
        return (texts[0].description if texts else ""), None

    def _run_via_rest(self, image_path: Path) -> tuple[str, Optional[dict]]:
        """Call the ``images:annotate`` REST endpoint with a plain API key.

        Raises:
            RuntimeError: on an HTTP error status, or when the per-image
                response contains an ``error`` member.
        """
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
        payload = {
            "requests": [
                {
                    "image": {"content": image_b64},
                    "features": [{"type": self._feature_type, "maxResults": 1}],
                    "imageContext": {"languageHints": self._language_hints},
                }
            ]
        }
        request = urllib.request.Request(
            "https://vision.googleapis.com/v1/images:annotate",
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                "X-Goog-Api-Key": self._api_key,
            },
        )
        try:
            with urllib.request.urlopen(request, timeout=60) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            # Decode leniently: an undecodable error body must not hide the
            # HTTP failure behind a UnicodeDecodeError.
            body = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(
                f"Google Vision API erreur {exc.code}: {body}"
            ) from exc

        responses = result.get("responses", [{}])
        if not responses:
            return "", None
        first = responses[0]
        if "error" in first:
            raise RuntimeError(f"Google Vision API erreur : {first['error']}")

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            # An absent or empty annotation is normalized to None.
            full = first.get("fullTextAnnotation") or None
            text = full.get("text", "") if isinstance(full, dict) else ""
            return text, full

        texts = first.get("textAnnotations", [])
        return (texts[0]["description"] if texts else ""), None

    # ------------------------------------------------------------------
    # SDK -> normalized dict conversion (for uniform processing)
    # ------------------------------------------------------------------
    @staticmethod
    def _sdk_full_text_to_dict(full_text_annotation: Any) -> dict:
        """Rebuild an SDK full-text annotation as a plain nested dict.

        The result has the shape ``{pages: [{blocks: [{paragraphs:
        [{words: [{confidence, symbols: [{text}]}]}]}]}]}``, which is what
        ``_extract_raw_confidences`` walks. Missing or falsy attributes at
        any level are treated as empty; a word without ``confidence`` gets
        ``0.0``.
        """
        pages = []
        for page in getattr(full_text_annotation, "pages", []) or []:
            blocks = []
            for block in getattr(page, "blocks", []) or []:
                paragraphs = []
                for para in getattr(block, "paragraphs", []) or []:
                    words = [
                        {
                            "confidence": float(getattr(word, "confidence", 0.0)),
                            "symbols": [
                                {"text": getattr(symbol, "text", "")}
                                for symbol in getattr(word, "symbols", []) or []
                            ],
                        }
                        for word in getattr(para, "words", []) or []
                    ]
                    paragraphs.append({"words": words})
                blocks.append({"paragraphs": paragraphs})
            pages.append({"blocks": blocks})
        return {"pages": pages}

    # ------------------------------------------------------------------
    # Token-confidence extraction
    # ------------------------------------------------------------------
    def _extract_raw_confidences(
        self, native: Any,
    ) -> Optional[list[dict[str, Any]]]:
        """Emit one ``{"token": word, "confidence": value}`` entry per word.

        Walks ``pages -> blocks -> paragraphs -> words`` of *native*. Each
        word's text is the concatenation of its symbols' ``text`` values,
        stripped of surrounding whitespace. The ``confidence`` value is
        passed through unchanged (any normalization is presumably done by
        the base class — verify there).

        Words with empty text or no ``confidence`` are skipped, as are
        non-dict nodes at any level.

        Returns:
            The list of entries, or ``None`` when the ``expose_confidences``
            option is false, *native* is empty or not a dict, or no word
            qualified.
        """
        if not self.config.get("expose_confidences", True):
            return None
        if not native or not isinstance(native, dict):
            return None

        out: list[dict[str, Any]] = []
        for page in native.get("pages") or []:
            if not isinstance(page, dict):
                continue
            for block in page.get("blocks") or []:
                if not isinstance(block, dict):
                    continue
                for para in block.get("paragraphs") or []:
                    if not isinstance(para, dict):
                        continue
                    for word in para.get("words") or []:
                        if not isinstance(word, dict):
                            continue
                        # Malformed symbols (non-dict, or text set to None)
                        # contribute nothing instead of raising.
                        token = "".join(
                            symbol.get("text") or ""
                            for symbol in word.get("symbols") or []
                            if isinstance(symbol, dict)
                        ).strip()
                        if not token:
                            continue
                        conf = word.get("confidence")
                        if conf is None:
                            continue
                        out.append({"token": token, "confidence": conf})
        return out or None