Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Picarones / tests /engines /test_sprint51_azure_confidences.py

Claude

test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural

d109222 unverified about 2 months ago

11.9 kB

	"""Tests Sprint 51 — adaptation Azure Document Intelligence pour exposer
	token_confidences.

	Covers:

	1. The extraction path exercised here (``_extract_raw_confidences``
	followed by ``_normalize_token_confidences``) walks ``pages[].words[]``
	and emits ``{"token": content, "confidence": float}`` per word.
	2. Words with no confidence, a negative confidence, or empty content
	are filtered out.
	3. ``expose_confidences=False`` disables the extraction.
	4. ``analyze_result = None`` or invalid structures → returns ``None``.
	5. ``_sdk_result_to_dict`` converts an SDK proto object into a
	normalized dict compatible with the REST path.
	6. ``run()`` orchestrates both paths (SDK + REST) and exposes the
	confidences on the ``EngineResult``.
	7. API failure → ``error`` is set, ``token_confidences = None``.
	8. Runner integration: ``calibration_metrics`` computed end to end.
	"""

	from __future__ import annotations

	from pathlib import Path
	from unittest.mock import MagicMock

	import pytest

	from picarones.engines.azure_doc_intel import AzureDocIntelEngine


	# ──────────────────────────────────────────────────────────────────────────
	# Helpers
	# ──────────────────────────────────────────────────────────────────────────


	def _word(content: str, conf: float \| None) -> dict:
	return {"content": content, "confidence": conf}


	def _result(words: list[dict]) -> dict:
	return {"pages": [{"words": words}]}


	# ──────────────────────────────────────────────────────────────────────────
	# 1-2. Extraction depuis analyze_result
	# ──────────────────────────────────────────────────────────────────────────


	class TestExtractFromResult:
	    """Word-level extraction from an ``analyze_result``-shaped dict."""

	    @staticmethod
	    def _confidences(payload):
	        """Run raw extraction then normalization on a default engine."""
	        engine = AzureDocIntelEngine()
	        raw = engine._extract_raw_confidences(payload)
	        return engine._normalize_token_confidences(raw)

	    def test_emits_one_entry_per_word(self) -> None:
	        """Each word yields one ``{"token", "confidence"}`` entry, in order."""
	        payload = _result([_word("Bonjour", 0.97), _word("monde", 0.93)])
	        expected = [
	            {"token": "Bonjour", "confidence": 0.97},
	            {"token": "monde", "confidence": 0.93},
	        ]
	        assert self._confidences(payload) == expected

	    def test_skips_word_without_confidence(self) -> None:
	        """Words with a missing or ``None`` confidence are left out."""
	        words = [
	            _word("ok", 0.95),
	            {"content": "no_conf"},  # no confidence key at all
	            _word("none_conf", None),
	        ]
	        kept = self._confidences(_result(words))
	        assert kept == [{"token": "ok", "confidence": 0.95}]

	    def test_skips_negative_confidence(self) -> None:
	        """A word with a negative confidence is left out."""
	        words = [_word("ok", 0.9), _word("dropped", -0.1)]
	        kept = self._confidences(_result(words))
	        assert kept == [{"token": "ok", "confidence": 0.9}]

	    def test_skips_empty_content(self) -> None:
	        """A word whose content is the empty string is left out."""
	        words = [_word("", 0.95), _word("ok", 0.9)]
	        kept = self._confidences(_result(words))
	        assert kept == [{"token": "ok", "confidence": 0.9}]

	    def test_traverses_multiple_pages(self) -> None:
	        """Words from every page are emitted, keeping page order."""
	        first_page = {"words": [_word("alpha", 0.9), _word("beta", 0.85)]}
	        second_page = {"words": [_word("gamma", 0.8)]}
	        out = self._confidences({"pages": [first_page, second_page]})
	        # ``or []`` turns a ``None`` result into a plain list mismatch
	        # instead of a TypeError while iterating.
	        tokens = [entry["token"] for entry in (out or [])]
	        assert tokens == ["alpha", "beta", "gamma"]


	# ──────────────────────────────────────────────────────────────────────────
	# 3. expose_confidences=False
	# ──────────────────────────────────────────────────────────────────────────


	class TestExposeFlag:
	    """Behaviour of the ``expose_confidences`` configuration switch."""

	    def test_disabled_returns_none(self) -> None:
	        """With the flag off, a valid payload still normalizes to ``None``."""
	        engine = AzureDocIntelEngine(config={"expose_confidences": False})
	        payload = _result([_word("ok", 0.9)])
	        raw = engine._extract_raw_confidences(payload)
	        normalized = engine._normalize_token_confidences(raw)
	        assert normalized is None


	# ──────────────────────────────────────────────────────────────────────────
	# 4. Cas dégénérés
	# ──────────────────────────────────────────────────────────────────────────


	class TestDegenerateInputs:
	    """Payloads carrying no usable words must normalize to ``None``."""

	    @staticmethod
	    def _pipeline(payload):
	        """Extract then normalize ``payload`` with a default engine."""
	        engine = AzureDocIntelEngine()
	        raw = engine._extract_raw_confidences(payload)
	        return engine._normalize_token_confidences(raw)

	    def test_none(self) -> None:
	        """``None`` in place of an analyze result."""
	        assert self._pipeline(None) is None

	    def test_empty_dict(self) -> None:
	        """A dict with no ``pages`` key."""
	        assert self._pipeline({}) is None

	    def test_no_pages(self) -> None:
	        """A ``pages`` key holding an empty list."""
	        payload = {"pages": []}
	        assert self._pipeline(payload) is None

	    def test_pages_without_words(self) -> None:
	        """A page that has ``lines`` but no ``words`` key."""
	        payload = {"pages": [{"lines": [{"content": "no words"}]}]}
	        assert self._pipeline(payload) is None


	# ──────────────────────────────────────────────────────────────────────────
	# 5. Conversion SDK → dict
	# ──────────────────────────────────────────────────────────────────────────


	class TestSdkConversion:
	    """``_sdk_result_to_dict`` applied to mocked SDK result objects."""

	    @staticmethod
	    def _sdk_result(content, confidence):
	        """Mock an SDK result holding one page with one word."""
	        word = MagicMock(content=content, confidence=confidence)
	        page = MagicMock(words=[word])
	        return MagicMock(pages=[page])

	    def test_sdk_to_dict(self) -> None:
	        """Content and confidence end up under ``pages[0]["words"][0]``."""
	        converted = AzureDocIntelEngine._sdk_result_to_dict(
	            self._sdk_result("Bonjour", 0.97),
	        )
	        assert "pages" in converted
	        first_word = converted["pages"][0]["words"][0]
	        assert first_word["content"] == "Bonjour"
	        assert first_word["confidence"] == pytest.approx(0.97)

	    def test_sdk_word_with_none_confidence(self) -> None:
	        """A ``None`` confidence on the SDK word stays ``None`` in the dict."""
	        converted = AzureDocIntelEngine._sdk_result_to_dict(
	            self._sdk_result("ok", None),
	        )
	        first_word = converted["pages"][0]["words"][0]
	        assert first_word["confidence"] is None


	# ──────────────────────────────────────────────────────────────────────────
	# 6-7. run() avec mock
	# ──────────────────────────────────────────────────────────────────────────


	def _patch_run_with_result(
	    monkeypatch: pytest.MonkeyPatch,
	    text: str,
	    analyze_result: dict | None,
	    *,
	    raise_exc: Exception | None = None,
	) -> AzureDocIntelEngine:
	    """Return an engine whose ``_run_with_native`` is replaced by a stub.

	    The stub is installed on the ``AzureDocIntelEngine`` class through
	    ``monkeypatch``, so it applies to every instance for as long as the
	    patch is active.

	    Args:
	        monkeypatch: pytest fixture used to install the stub.
	        text: Text the stub returns as the first tuple element.
	        analyze_result: Payload the stub returns as the second element.
	        raise_exc: When not ``None``, the stub raises this exception
	            instead of returning.

	    Returns:
	        A fresh engine with a dummy API key and endpoint set.
	    """
	    engine = AzureDocIntelEngine()
	    engine._api_key = "test-key"
	    engine._endpoint = "https://test.cognitiveservices.azure.com"

	    def _fake(self, image_path):
	        # Stand-in for the native call: fail on demand, otherwise return
	        # the canned (text, analyze_result) pair.
	        if raise_exc is not None:
	            raise raise_exc
	        return text, analyze_result

	    monkeypatch.setattr(AzureDocIntelEngine, "_run_with_native", _fake)
	    return engine


	class TestRunOverride:
	    """``run()`` driven through a stubbed ``_run_with_native``."""

	    @staticmethod
	    def _dummy_image(tmp_path: Path) -> Path:
	        """Create a one-byte placeholder file to pass to ``run()``."""
	        image = tmp_path / "p.png"
	        image.write_bytes(b"x")
	        return image

	    def test_run_exposes_confidences(
	        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
	    ) -> None:
	        """A two-word payload gives text, no error and two confidences."""
	        words = [_word("Bonjour", 0.97), _word("monde", 0.93)]
	        engine = _patch_run_with_result(
	            monkeypatch,
	            text="Bonjour\nmonde",
	            analyze_result=_result(words),
	        )
	        outcome = engine.run(self._dummy_image(tmp_path))
	        assert outcome.text == "Bonjour\nmonde"
	        assert outcome.error is None
	        assert outcome.token_confidences is not None
	        assert len(outcome.token_confidences) == 2

	    def test_run_no_result_no_confidences(
	        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
	    ) -> None:
	        """Text without an analyze result leaves ``token_confidences`` unset."""
	        engine = _patch_run_with_result(
	            monkeypatch, text="Texte", analyze_result=None,
	        )
	        outcome = engine.run(self._dummy_image(tmp_path))
	        assert outcome.text == "Texte"
	        assert outcome.token_confidences is None

	    def test_run_api_failure_keeps_error(
	        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
	    ) -> None:
	        """An exception from the native call is reported in ``error``."""
	        failure = RuntimeError("Azure timeout")
	        engine = _patch_run_with_result(
	            monkeypatch, text="", analyze_result=None, raise_exc=failure,
	        )
	        outcome = engine.run(self._dummy_image(tmp_path))
	        assert outcome.error == "Azure timeout"
	        assert outcome.token_confidences is None


	# ──────────────────────────────────────────────────────────────────────────
	# 8. Intégration runner
	# ──────────────────────────────────────────────────────────────────────────


	class TestEndToEndWithRunner:
	    """Runner-level check that engine confidences reach the metrics."""

	    def test_runner_picks_up_azure_confidences(self) -> None:
	        """A perfect OCR match yields accuracy 1.0 and the mean confidence."""
	        from picarones.measurements.runner import _compute_document_result
	        from picarones.engines.base import EngineResult

	        reference_text = "alpha beta gamma"
	        fake_image = "/tmp/x.png"
	        token_confs = [
	            {"token": "alpha", "confidence": 0.97},
	            {"token": "beta", "confidence": 0.93},
	            {"token": "gamma", "confidence": 0.95},
	        ]
	        # Computed before the runner sees the list.
	        expected_mean = (0.97 + 0.93 + 0.95) / 3

	        ocr = EngineResult(
	            engine_name="azure_doc_intel",
	            image_path=fake_image,
	            text=reference_text,
	            duration_seconds=0.1,
	            token_confidences=token_confs,
	        )
	        dr = _compute_document_result(
	            doc_id="d1",
	            image_path=fake_image,
	            ground_truth=reference_text,
	            ocr_result=ocr,
	            char_exclude=None,
	        )

	        metrics = dr.calibration_metrics
	        assert metrics is not None
	        assert metrics["overall_accuracy"] == 1.0
	        assert metrics["overall_confidence"] == pytest.approx(expected_mean)