Picarones / tests /engines /test_sprint51_azure_confidences.py
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified
Raw
History Blame
11.9 kB
"""Tests Sprint 51 β€” adaptation Azure Document Intelligence pour exposer
token_confidences.
Couvre :
1. ``_extract_token_confidences_from_result`` parcourt
``pages[].words[]`` et émet ``{"token": content, "confidence": float}``
par mot.
2. Filtrage des mots sans confidence, conf négative, contenu vide.
3. ``expose_confidences=False`` désactive l'extraction.
4. ``analyze_result = None`` ou structures invalides → retourne ``None``.
5. ``_sdk_result_to_dict`` convertit un objet SDK proto en dict
normalisé compatible avec le chemin REST.
6. ``run()`` orchestre les deux chemins (SDK + REST) et expose les
confidences sur l'``EngineResult``.
7. Échec API → ``error`` renseigné, ``token_confidences = None``.
8. Intégration runner : ``calibration_metrics`` calculée bout-en-bout.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from picarones.engines.azure_doc_intel import AzureDocIntelEngine
# ──────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────
def _word(content: str, conf: float | None) -> dict:
return {"content": content, "confidence": conf}
def _result(words: list[dict]) -> dict:
return {"pages": [{"words": words}]}
# ──────────────────────────────────────────────────────────────────────────
# 1-2. Extraction depuis analyze_result
# ──────────────────────────────────────────────────────────────────────────
class TestExtractFromResult:
    """Word-level extraction from a hand-built result dict.

    Every test sends a payload through ``_extract_raw_confidences`` and then
    ``_normalize_token_confidences`` on a default-constructed engine and
    checks the token list that comes out.
    """

    @staticmethod
    def _confidences_for(payload: dict):
        """Run *payload* through both engine steps on a fresh default engine."""
        engine = AzureDocIntelEngine()
        raw = engine._extract_raw_confidences(payload)
        return engine._normalize_token_confidences(raw)

    def test_emits_one_entry_per_word(self) -> None:
        """Two valid words are expected to give two entries, in input order."""
        payload = _result([_word("Bonjour", 0.97), _word("monde", 0.93)])
        expected = [
            {"token": "Bonjour", "confidence": 0.97},
            {"token": "monde", "confidence": 0.93},
        ]
        assert self._confidences_for(payload) == expected

    def test_skips_word_without_confidence(self) -> None:
        """Words with a missing or ``None`` confidence are expected to be dropped."""
        payload = _result([
            _word("ok", 0.95),
            {"content": "no_conf"},  # no "confidence" key at all
            _word("none_conf", None),
        ])
        expected = [{"token": "ok", "confidence": 0.95}]
        assert self._confidences_for(payload) == expected

    def test_skips_negative_confidence(self) -> None:
        """A word with a negative confidence is expected to be dropped."""
        payload = _result([_word("ok", 0.9), _word("dropped", -0.1)])
        expected = [{"token": "ok", "confidence": 0.9}]
        assert self._confidences_for(payload) == expected

    def test_skips_empty_content(self) -> None:
        """A word whose content is the empty string is expected to be dropped."""
        payload = _result([_word("", 0.95), _word("ok", 0.9)])
        expected = [{"token": "ok", "confidence": 0.9}]
        assert self._confidences_for(payload) == expected

    def test_traverses_multiple_pages(self) -> None:
        """Tokens from every page are expected, in page order."""
        first_page = {"words": [_word("alpha", 0.9), _word("beta", 0.85)]}
        second_page = {"words": [_word("gamma", 0.8)]}
        out = self._confidences_for({"pages": [first_page, second_page]})
        # ``out or []`` turns a ``None`` result into a plain list mismatch
        # instead of a TypeError while iterating.
        tokens = [entry["token"] for entry in (out or [])]
        assert tokens == ["alpha", "beta", "gamma"]
# ──────────────────────────────────────────────────────────────────────────
# 3. expose_confidences=False
# ──────────────────────────────────────────────────────────────────────────
class TestExposeFlag:
    """Behaviour of the ``expose_confidences`` configuration flag."""

    def test_disabled_returns_none(self) -> None:
        """With the flag set to ``False``, a valid payload is expected to give ``None``."""
        engine = AzureDocIntelEngine(config={"expose_confidences": False})
        payload = _result([_word("ok", 0.9)])
        raw = engine._extract_raw_confidences(payload)
        assert engine._normalize_token_confidences(raw) is None
# ──────────────────────────────────────────────────────────────────────────
# 4. Cas dégénérés
# ──────────────────────────────────────────────────────────────────────────
class TestDegenerateInputs:
    """Inputs carrying no usable word data; each is expected to give ``None``."""

    def test_none(self) -> None:
        """``None`` in place of a result dict."""
        engine = AzureDocIntelEngine()
        raw = engine._extract_raw_confidences(None)
        assert engine._normalize_token_confidences(raw) is None

    def test_empty_dict(self) -> None:
        """A result dict with no keys."""
        engine = AzureDocIntelEngine()
        raw = engine._extract_raw_confidences({})
        assert engine._normalize_token_confidences(raw) is None

    def test_no_pages(self) -> None:
        """A result dict whose ``pages`` list is empty."""
        engine = AzureDocIntelEngine()
        payload = {"pages": []}
        raw = engine._extract_raw_confidences(payload)
        assert engine._normalize_token_confidences(raw) is None

    def test_pages_without_words(self) -> None:
        """A page that carries ``lines`` but no ``words`` key."""
        engine = AzureDocIntelEngine()
        page = {"lines": [{"content": "no words"}]}
        raw = engine._extract_raw_confidences({"pages": [page]})
        assert engine._normalize_token_confidences(raw) is None
# ──────────────────────────────────────────────────────────────────────────
# 5. Conversion SDK → dict
# ──────────────────────────────────────────────────────────────────────────
class TestSdkConversion:
    """Conversion of an SDK-style result object into a plain dict."""

    @staticmethod
    def _sdk_result(content: str, confidence: float | None) -> MagicMock:
        """Mock an SDK result holding one page with one word.

        Attributes are set through ``MagicMock`` keyword arguments, which
        configure them on the mock at creation time.
        """
        word = MagicMock(content=content, confidence=confidence)
        page = MagicMock(words=[word])
        return MagicMock(pages=[page])

    def test_sdk_to_dict(self) -> None:
        """Content and confidence are expected under ``pages[0].words[0]``."""
        out = AzureDocIntelEngine._sdk_result_to_dict(
            self._sdk_result("Bonjour", 0.97),
        )
        assert "pages" in out
        first_word = out["pages"][0]["words"][0]
        assert first_word["content"] == "Bonjour"
        assert first_word["confidence"] == pytest.approx(0.97)

    def test_sdk_word_with_none_confidence(self) -> None:
        """A ``None`` confidence on the SDK word is expected to stay ``None``."""
        out = AzureDocIntelEngine._sdk_result_to_dict(
            self._sdk_result("ok", None),
        )
        first_word = out["pages"][0]["words"][0]
        assert first_word["confidence"] is None
# ──────────────────────────────────────────────────────────────────────────
# 6-7. run() avec mock
# ──────────────────────────────────────────────────────────────────────────
def _patch_run_with_result(
    monkeypatch: pytest.MonkeyPatch,
    text: str,
    analyze_result: dict | None,
    *,
    raise_exc: Exception | None = None,
) -> AzureDocIntelEngine:
    """Return an engine whose ``_run_with_native`` is replaced by a stub.

    The stub is installed on the ``AzureDocIntelEngine`` class through
    ``monkeypatch``, so it applies to every instance for as long as the
    fixture keeps the patch in place.

    Args:
        monkeypatch: pytest fixture used to install the stub.
        text: Text the stub returns as the first tuple element.
        analyze_result: Value the stub returns as the second tuple element.
        raise_exc: When not ``None``, the stub raises this exception instead
            of returning.

    Returns:
        A default-constructed engine with placeholder ``_api_key`` and
        ``_endpoint`` values set.
    """
    engine = AzureDocIntelEngine()
    # Placeholder credentials — presumably needed so ``run()`` does not stop
    # on missing configuration; confirm against the engine implementation.
    engine._api_key = "test-key"
    engine._endpoint = "https://test.cognitiveservices.azure.com"

    def _stub_native(self, image_path):
        if raise_exc is None:
            return text, analyze_result
        raise raise_exc

    monkeypatch.setattr(AzureDocIntelEngine, "_run_with_native", _stub_native)
    return engine
class TestRunOverride:
    """``run()`` exercised with ``_run_with_native`` stubbed out."""

    @staticmethod
    def _dummy_image(tmp_path: Path) -> Path:
        """Create a one-byte placeholder file named ``p.png`` under *tmp_path*."""
        image = tmp_path / "p.png"
        image.write_bytes(b"x")
        return image

    def test_run_exposes_confidences(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """Text, no error, and two confidence entries are expected."""
        payload = _result([_word("Bonjour", 0.97), _word("monde", 0.93)])
        engine = _patch_run_with_result(
            monkeypatch, text="Bonjour\nmonde", analyze_result=payload,
        )
        outcome = engine.run(self._dummy_image(tmp_path))
        assert outcome.text == "Bonjour\nmonde"
        assert outcome.error is None
        assert outcome.token_confidences is not None
        assert len(outcome.token_confidences) == 2

    def test_run_no_result_no_confidences(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """With ``analyze_result=None`` the text is kept and confidences are ``None``."""
        engine = _patch_run_with_result(
            monkeypatch, text="Texte", analyze_result=None,
        )
        outcome = engine.run(self._dummy_image(tmp_path))
        assert outcome.text == "Texte"
        assert outcome.token_confidences is None

    def test_run_api_failure_keeps_error(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """An exception from the stub is expected in ``error``, with no confidences."""
        failure = RuntimeError("Azure timeout")
        engine = _patch_run_with_result(
            monkeypatch, text="", analyze_result=None, raise_exc=failure,
        )
        outcome = engine.run(self._dummy_image(tmp_path))
        assert outcome.error == "Azure timeout"
        assert outcome.token_confidences is None
# ──────────────────────────────────────────────────────────────────────────
# 8. Intégration runner
# ──────────────────────────────────────────────────────────────────────────
class TestEndToEndWithRunner:
    """Runner integration: an ``EngineResult`` with confidences yields calibration metrics."""

    def test_runner_picks_up_azure_confidences(self) -> None:
        """OCR text equal to the ground truth is expected to give accuracy 1.0.

        ``overall_confidence`` is expected to be the mean of the three token
        confidences supplied on the ``EngineResult``.
        """
        from picarones.measurements.runner import _compute_document_result
        from picarones.engines.base import EngineResult

        image = "/tmp/x.png"
        transcript = "alpha beta gamma"
        confidences = [
            {"token": "alpha", "confidence": 0.97},
            {"token": "beta", "confidence": 0.93},
            {"token": "gamma", "confidence": 0.95},
        ]
        ocr = EngineResult(
            engine_name="azure_doc_intel",
            image_path=image,
            text=transcript,
            duration_seconds=0.1,
            token_confidences=confidences,
        )
        dr = _compute_document_result(
            doc_id="d1",
            image_path=image,
            ground_truth=transcript,
            ocr_result=ocr,
            char_exclude=None,
        )
        assert dr.calibration_metrics is not None
        assert dr.calibration_metrics["overall_accuracy"] == 1.0
        expected_mean = (0.97 + 0.93 + 0.95) / 3
        assert dr.calibration_metrics["overall_confidence"] == pytest.approx(
            expected_mean,
        )