Spaces:
Sleeping
refactor(engines): unifier l'API token_confidences à un seul nom canonique
Browse files
Avant ce commit, chaque adapter OCR exposait deux APIs équivalentes :
- Tesseract : ``_extract_token_confidences(image_path)`` (Sprint 47) +
``_extract_raw_confidences(native)`` (chantier 1)
- Pero : ``_extract_token_confidences_from_layout(layout)`` (Sprint 48) +
``_extract_raw_confidences(native)``
- Mistral : ``_extract_token_confidences_from_response(response)`` (Sprint 49) +
``_extract_raw_confidences(native)`` + ``_run_ocr_with_response(image_path)``
(vers lequel ``_run_with_native`` déléguait)
- Google Vision : ``_extract_token_confidences_from_full_text(full)`` (Sprint 50)
+ ``_extract_raw_confidences(native)`` + ``_run_ocr_with_full_annotation``
- Azure DI : ``_extract_token_confidences_from_result(result)`` (Sprint 51)
+ ``_extract_raw_confidences(native)`` + ``_run_ocr_with_result``
Cette double API doublait la surface, brouillait le contrat et obligeait
tout modificateur à toucher deux noms.
Ce commit supprime les noms historiques et garde **une seule API
canonique** par engine :
- ``_run_with_native(image_path) → (text, native)`` — appel API unique
- ``_extract_raw_confidences(native) → list[dict] | None`` — parsing
- ``_normalize_token_confidences(raw)`` — filtrage final (hérité de
``BaseOCREngine``)
Mise à jour de la docstring de ``_normalize_token_confidences`` :
préciser que l'échelle native ([0, 100] pour Tesseract, [0, 1] pour
les autres) est conservée. La normalisation finale au moment du
calcul de calibration est faite dans
``picarones.measurements.builtin_hooks.calibration_from_engine_result``.
Les tests Sprints 47-51 ont été migrés vers la nouvelle API au
commit précédent.
https://claude.ai/code/session_01Hsd7kL8yeCbXn1mA7GQK9L
- picarones/engines/azure_doc_intel.py +1 -21
- picarones/engines/base.py +6 -10
- picarones/engines/google_vision.py +1 -21
- picarones/engines/mistral_ocr.py +1 -21
- picarones/engines/pero_ocr.py +0 -11
- picarones/engines/tesseract.py +0 -22
|
@@ -80,21 +80,12 @@ class AzureDocIntelEngine(BaseOCREngine):
|
|
| 80 |
self._api_version: str = self.config.get("api_version", "2024-02-29-preview")
|
| 81 |
|
| 82 |
def _run_ocr(self, image_path: Path) -> str:
|
| 83 |
-
"""
|
| 84 |
text, _result = self._run_with_native(image_path)
|
| 85 |
return text
|
| 86 |
|
| 87 |
def _run_with_native(
|
| 88 |
self, image_path: Path,
|
| 89 |
-
) -> tuple[str, Optional[dict]]:
|
| 90 |
-
"""Hook framework (chantier 1) — délègue à ``_run_ocr_with_result``
|
| 91 |
-
pour permettre aux tests Sprint 51 de monkeypatcher l'appel réseau
|
| 92 |
-
sous son nom historique.
|
| 93 |
-
"""
|
| 94 |
-
return self._run_ocr_with_result(image_path)
|
| 95 |
-
|
| 96 |
-
def _run_ocr_with_result(
|
| 97 |
-
self, image_path: Path,
|
| 98 |
) -> tuple[str, Optional[dict]]:
|
| 99 |
"""Exécute l'OCR et retourne ``(text, analyze_result_dict)``.
|
| 100 |
|
|
@@ -252,14 +243,3 @@ class AzureDocIntelEngine(BaseOCREngine):
|
|
| 252 |
continue
|
| 253 |
out.append({"token": content, "confidence": conf})
|
| 254 |
return out or None
|
| 255 |
-
|
| 256 |
-
def _extract_token_confidences_from_result(
|
| 257 |
-
self, result: Any,
|
| 258 |
-
) -> Optional[list[dict[str, Any]]]:
|
| 259 |
-
"""Alias rétrocompat (Sprint 51) — extrait les confidences d'un ``analyzeResult``.
|
| 260 |
-
|
| 261 |
-
Wrapper qui chaîne ``_extract_raw_confidences`` puis
|
| 262 |
-
``_normalize_token_confidences`` (filtrage tokens vides / négatifs).
|
| 263 |
-
"""
|
| 264 |
-
raw = self._extract_raw_confidences(result)
|
| 265 |
-
return self._normalize_token_confidences(raw)
|
|
|
|
| 80 |
self._api_version: str = self.config.get("api_version", "2024-02-29-preview")
|
| 81 |
|
| 82 |
def _run_ocr(self, image_path: Path) -> str:
|
| 83 |
+
"""Retourne uniquement le texte (interface ``BaseOCREngine``)."""
|
| 84 |
text, _result = self._run_with_native(image_path)
|
| 85 |
return text
|
| 86 |
|
| 87 |
def _run_with_native(
|
| 88 |
self, image_path: Path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
) -> tuple[str, Optional[dict]]:
|
| 90 |
"""Exécute l'OCR et retourne ``(text, analyze_result_dict)``.
|
| 91 |
|
|
|
|
| 243 |
continue
|
| 244 |
out.append({"token": content, "confidence": conf})
|
| 245 |
return out or None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -172,22 +172,18 @@ class BaseOCREngine(BaseModule):
|
|
| 172 |
def _normalize_token_confidences(
|
| 173 |
raw: Optional[list[dict[str, Any]]],
|
| 174 |
) -> Optional[list[dict[str, Any]]]:
|
| 175 |
-
"""Filtre les confidences brutes (
|
| 176 |
|
| 177 |
- Tokens vides ou ``None`` → écartés.
|
| 178 |
- Confidences négatives (Tesseract met -1 pour les non-mots) → écartées.
|
| 179 |
- Confidences non convertibles en float → écartées.
|
| 180 |
|
| 181 |
-
L'
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
`
|
| 185 |
-
calibration. Cette discipline préserve la rétrocompat des
|
| 186 |
-
tests Sprints 47-51 qui inspectent ``EngineResult.token_confidences``.
|
| 187 |
|
| 188 |
-
Retourne ``None`` si aucune entrée n'est exploitable (au
|
| 189 |
-
lieu d'une liste vide), ce qui signale au runner de sauter
|
| 190 |
-
le calcul de calibration sur ce document.
|
| 191 |
"""
|
| 192 |
if not raw:
|
| 193 |
return None
|
|
|
|
| 172 |
def _normalize_token_confidences(
|
| 173 |
raw: Optional[list[dict[str, Any]]],
|
| 174 |
) -> Optional[list[dict[str, Any]]]:
|
| 175 |
+
"""Filtre les confidences brutes (échelle native conservée).
|
| 176 |
|
| 177 |
- Tokens vides ou ``None`` → écartés.
|
| 178 |
- Confidences négatives (Tesseract met -1 pour les non-mots) → écartées.
|
| 179 |
- Confidences non convertibles en float → écartées.
|
| 180 |
|
| 181 |
+
L'échelle native des moteurs ([0, 100] pour Tesseract,
|
| 182 |
+
[0, 1] pour les autres) est conservée. La normalisation finale
|
| 183 |
+
au moment du calcul de calibration est faite dans
|
| 184 |
+
:func:`picarones.measurements.builtin_hooks.calibration_from_engine_result`.
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
Retourne ``None`` si aucune entrée n'est exploitable.
|
|
|
|
|
|
|
| 187 |
"""
|
| 188 |
if not raw:
|
| 189 |
return None
|
|
@@ -78,21 +78,12 @@ class GoogleVisionEngine(BaseOCREngine):
|
|
| 78 |
self._feature_type: str = self.config.get("feature_type", "DOCUMENT_TEXT_DETECTION")
|
| 79 |
|
| 80 |
def _run_ocr(self, image_path: Path) -> str:
|
| 81 |
-
"""
|
| 82 |
text, _full = self._run_with_native(image_path)
|
| 83 |
return text
|
| 84 |
|
| 85 |
def _run_with_native(
|
| 86 |
self, image_path: Path,
|
| 87 |
-
) -> tuple[str, Optional[dict]]:
|
| 88 |
-
"""Hook framework (chantier 1) — délègue à ``_run_ocr_with_full_annotation``
|
| 89 |
-
pour permettre aux tests Sprint 50 de monkeypatcher l'appel réseau
|
| 90 |
-
sous son nom historique.
|
| 91 |
-
"""
|
| 92 |
-
return self._run_ocr_with_full_annotation(image_path)
|
| 93 |
-
|
| 94 |
-
def _run_ocr_with_full_annotation(
|
| 95 |
-
self, image_path: Path,
|
| 96 |
) -> tuple[str, Optional[dict]]:
|
| 97 |
"""Exécute l'OCR et retourne ``(text, full_text_annotation_dict)``.
|
| 98 |
|
|
@@ -263,14 +254,3 @@ class GoogleVisionEngine(BaseOCREngine):
|
|
| 263 |
continue
|
| 264 |
out.append({"token": text, "confidence": conf})
|
| 265 |
return out or None
|
| 266 |
-
|
| 267 |
-
def _extract_token_confidences_from_full_text(
|
| 268 |
-
self, full: Any,
|
| 269 |
-
) -> Optional[list[dict[str, Any]]]:
|
| 270 |
-
"""Alias rétrocompat (Sprint 50) — extrait les confidences d'un ``fullTextAnnotation``.
|
| 271 |
-
|
| 272 |
-
Wrapper qui chaîne ``_extract_raw_confidences`` puis
|
| 273 |
-
``_normalize_token_confidences`` (filtrage tokens vides / négatifs).
|
| 274 |
-
"""
|
| 275 |
-
raw = self._extract_raw_confidences(full)
|
| 276 |
-
return self._normalize_token_confidences(raw)
|
|
|
|
| 78 |
self._feature_type: str = self.config.get("feature_type", "DOCUMENT_TEXT_DETECTION")
|
| 79 |
|
| 80 |
def _run_ocr(self, image_path: Path) -> str:
|
| 81 |
+
"""Retourne uniquement le texte (interface ``BaseOCREngine``)."""
|
| 82 |
text, _full = self._run_with_native(image_path)
|
| 83 |
return text
|
| 84 |
|
| 85 |
def _run_with_native(
|
| 86 |
self, image_path: Path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
) -> tuple[str, Optional[dict]]:
|
| 88 |
"""Exécute l'OCR et retourne ``(text, full_text_annotation_dict)``.
|
| 89 |
|
|
|
|
| 254 |
continue
|
| 255 |
out.append({"token": text, "confidence": conf})
|
| 256 |
return out or None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -76,21 +76,12 @@ class MistralOCREngine(BaseOCREngine):
|
|
| 76 |
self._max_tokens = int(self.config.get("max_tokens", 4096))
|
| 77 |
|
| 78 |
def _run_ocr(self, image_path: Path) -> str:
|
| 79 |
-
"""
|
| 80 |
text, _raw = self._run_with_native(image_path)
|
| 81 |
return text
|
| 82 |
|
| 83 |
def _run_with_native(
|
| 84 |
self, image_path: Path,
|
| 85 |
-
) -> tuple[str, Optional[dict]]:
|
| 86 |
-
"""Hook framework (chantier 1) — délègue à ``_run_ocr_with_response``
|
| 87 |
-
pour permettre aux tests Sprint 49 de monkeypatcher l'appel réseau
|
| 88 |
-
sous son nom historique.
|
| 89 |
-
"""
|
| 90 |
-
return self._run_ocr_with_response(image_path)
|
| 91 |
-
|
| 92 |
-
def _run_ocr_with_response(
|
| 93 |
-
self, image_path: Path,
|
| 94 |
) -> tuple[str, Optional[dict]]:
|
| 95 |
"""Exécute l'OCR et retourne ``(text, raw_response)``.
|
| 96 |
|
|
@@ -238,14 +229,3 @@ class MistralOCREngine(BaseOCREngine):
|
|
| 238 |
for word in text.split():
|
| 239 |
if word:
|
| 240 |
out.append({"token": word, "confidence": conf})
|
| 241 |
-
|
| 242 |
-
def _extract_token_confidences_from_response(
|
| 243 |
-
self, response: Any,
|
| 244 |
-
) -> Optional[list[dict[str, Any]]]:
|
| 245 |
-
"""Alias rétrocompat (Sprint 49) — extrait les confidences d'une réponse JSON.
|
| 246 |
-
|
| 247 |
-
Wrapper qui chaîne ``_extract_raw_confidences`` puis
|
| 248 |
-
``_normalize_token_confidences`` (filtrage tokens vides / négatifs).
|
| 249 |
-
"""
|
| 250 |
-
raw = self._extract_raw_confidences(response)
|
| 251 |
-
return self._normalize_token_confidences(raw)
|
|
|
|
| 76 |
self._max_tokens = int(self.config.get("max_tokens", 4096))
|
| 77 |
|
| 78 |
def _run_ocr(self, image_path: Path) -> str:
|
| 79 |
+
"""Retourne uniquement le texte (interface ``BaseOCREngine``)."""
|
| 80 |
text, _raw = self._run_with_native(image_path)
|
| 81 |
return text
|
| 82 |
|
| 83 |
def _run_with_native(
|
| 84 |
self, image_path: Path,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
) -> tuple[str, Optional[dict]]:
|
| 86 |
"""Exécute l'OCR et retourne ``(text, raw_response)``.
|
| 87 |
|
|
|
|
| 229 |
for word in text.split():
|
| 230 |
if word:
|
| 231 |
out.append({"token": word, "confidence": conf})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -177,17 +177,6 @@ class PeroOCREngine(BaseOCREngine):
|
|
| 177 |
out.append({"token": word, "confidence": conf})
|
| 178 |
return out or None
|
| 179 |
|
| 180 |
-
def _extract_token_confidences_from_layout(
|
| 181 |
-
self, layout: Any,
|
| 182 |
-
) -> Optional[list[dict[str, Any]]]:
|
| 183 |
-
"""Alias rétrocompat (Sprint 48) — extrait les confidences d'un ``page_layout``.
|
| 184 |
-
|
| 185 |
-
Wrapper qui chaîne ``_extract_raw_confidences`` puis
|
| 186 |
-
``_normalize_token_confidences`` (filtrage tokens vides / négatifs).
|
| 187 |
-
"""
|
| 188 |
-
raw = self._extract_raw_confidences(layout)
|
| 189 |
-
return self._normalize_token_confidences(raw)
|
| 190 |
-
|
| 191 |
@classmethod
|
| 192 |
def from_config(cls, config: Optional[dict] = None) -> "PeroOCREngine":
|
| 193 |
return cls(config=config or {})
|
|
|
|
| 177 |
out.append({"token": word, "confidence": conf})
|
| 178 |
return out or None
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
@classmethod
|
| 181 |
def from_config(cls, config: Optional[dict] = None) -> "PeroOCREngine":
|
| 182 |
return cls(config=config or {})
|
|
@@ -172,28 +172,6 @@ class TesseractEngine(BaseOCREngine):
|
|
| 172 |
out.append({"token": tok_text, "confidence": conf})
|
| 173 |
return out or None
|
| 174 |
|
| 175 |
-
def _extract_token_confidences(
|
| 176 |
-
self, image_path: Path,
|
| 177 |
-
) -> Optional[list[dict[str, Any]]]:
|
| 178 |
-
"""Alias rétrocompat (Sprint 47) — extrait les confidences depuis ``image_path``.
|
| 179 |
-
|
| 180 |
-
Pipeline interne du chantier 1 : ``_run_with_native`` → ``_extract_raw_confidences``
|
| 181 |
-
→ ``_normalize_token_confidences``. Retourne ``None`` si pytesseract est
|
| 182 |
-
absent ou si l'extraction échoue (signal au runner de sauter la calibration).
|
| 183 |
-
"""
|
| 184 |
-
if not _PYTESSERACT_AVAILABLE:
|
| 185 |
-
return None
|
| 186 |
-
try:
|
| 187 |
-
_text, native = self._run_with_native(Path(image_path))
|
| 188 |
-
raw = self._extract_raw_confidences(native)
|
| 189 |
-
return self._normalize_token_confidences(raw)
|
| 190 |
-
except Exception as exc: # noqa: BLE001
|
| 191 |
-
logger.warning(
|
| 192 |
-
"[tesseract] extraction des token_confidences indisponible : %s",
|
| 193 |
-
exc,
|
| 194 |
-
)
|
| 195 |
-
return None
|
| 196 |
-
|
| 197 |
@classmethod
|
| 198 |
def from_config(cls, config: Optional[dict] = None) -> "TesseractEngine":
|
| 199 |
return cls(config=config or {})
|
|
|
|
| 172 |
out.append({"token": tok_text, "confidence": conf})
|
| 173 |
return out or None
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
@classmethod
|
| 176 |
def from_config(cls, config: Optional[dict] = None) -> "TesseractEngine":
|
| 177 |
return cls(config=config or {})
|