Spaces:
Running
Running
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified | """Tests Sprint 51 — adaptation Azure Document Intelligence pour exposer | |
| token_confidences. | |
| Couvre : | |
| 1. ``_extract_token_confidences_from_result`` parcourt | |
| ``pages[].words[]`` et émet ``{"token": content, "confidence": float}`` | |
| par mot. | |
| 2. Filtrage des mots sans confidence, conf négative, contenu vide. | |
| 3. ``expose_confidences=False`` désactive l'extraction. | |
| 4. ``analyze_result = None`` ou structures invalides → retourne ``None``. | |
| 5. ``_sdk_result_to_dict`` convertit un objet SDK proto en dict | |
| normalisé compatible avec le chemin REST. | |
| 6. ``run()`` orchestre les deux chemins (SDK + REST) et expose les | |
| confidences sur l'``EngineResult``. | |
| 7. Échec API → ``error`` renseigné, ``token_confidences = None``. | |
| 8. Intégration runner : ``calibration_metrics`` calculée bout-en-bout. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from unittest.mock import MagicMock | |
| import pytest | |
| from picarones.engines.azure_doc_intel import AzureDocIntelEngine | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # Helpers | |
| # ────────────────────────────────────────────────────────────────────────── | |
| def _word(content: str, conf: float | None) -> dict: | |
| return {"content": content, "confidence": conf} | |
| def _result(words: list[dict]) -> dict: | |
| return {"pages": [{"words": words}]} | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1-2. Extraction depuis analyze_result | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestExtractFromResult:
    """Token-confidence extraction from an ``analyze_result``-shaped dict.

    Every test sends a payload through ``_extract_raw_confidences`` and then
    ``_normalize_token_confidences`` on an engine built with no arguments.
    """

    @staticmethod
    def _pipeline(payload: dict) -> list[dict] | None:
        """Run the extract + normalize chain on a fresh engine."""
        engine = AzureDocIntelEngine()
        raw = engine._extract_raw_confidences(payload)
        return engine._normalize_token_confidences(raw)

    def test_emits_one_entry_per_word(self) -> None:
        """Two scored words give two ``token``/``confidence`` entries, in order."""
        payload = _result([_word("Bonjour", 0.97), _word("monde", 0.93)])
        expected = [
            {"token": "Bonjour", "confidence": 0.97},
            {"token": "monde", "confidence": 0.93},
        ]
        assert self._pipeline(payload) == expected

    def test_skips_word_without_confidence(self) -> None:
        """Words lacking a usable confidence are left out of the output."""
        words = [
            _word("ok", 0.95),
            {"content": "no_conf"},  # "confidence" key absent altogether
            _word("none_conf", None),  # key present but explicitly None
        ]
        expected = [{"token": "ok", "confidence": 0.95}]
        assert self._pipeline(_result(words)) == expected

    def test_skips_negative_confidence(self) -> None:
        """A word with a negative confidence is dropped."""
        payload = _result([_word("ok", 0.9), _word("dropped", -0.1)])
        expected = [{"token": "ok", "confidence": 0.9}]
        assert self._pipeline(payload) == expected

    def test_skips_empty_content(self) -> None:
        """A word whose content is the empty string is dropped."""
        payload = _result([_word("", 0.95), _word("ok", 0.9)])
        expected = [{"token": "ok", "confidence": 0.9}]
        assert self._pipeline(payload) == expected

    def test_traverses_multiple_pages(self) -> None:
        """Words from every page are collected, in page order."""
        first_page = {"words": [_word("alpha", 0.9), _word("beta", 0.85)]}
        second_page = {"words": [_word("gamma", 0.8)]}
        out = self._pipeline({"pages": [first_page, second_page]})
        # ``out or []`` keeps the comparison meaningful if ``None`` comes back.
        tokens = [entry["token"] for entry in (out or [])]
        assert tokens == ["alpha", "beta", "gamma"]
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. expose_confidences=False | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestExposeFlag:
    """Behaviour of the ``expose_confidences`` configuration flag."""

    def test_disabled_returns_none(self) -> None:
        """With the flag set to ``False``, a scored word still yields ``None``."""
        engine = AzureDocIntelEngine(config={"expose_confidences": False})
        payload = _result([_word("ok", 0.9)])
        raw = engine._extract_raw_confidences(payload)
        normalized = engine._normalize_token_confidences(raw)
        assert normalized is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. Cas dégénérés | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestDegenerateInputs:
    """Missing or word-less payloads must come out of the chain as ``None``."""

    @staticmethod
    def _normalized(payload: dict | None) -> list[dict] | None:
        """Run extract + normalize on an engine built with no arguments."""
        engine = AzureDocIntelEngine()
        return engine._normalize_token_confidences(
            engine._extract_raw_confidences(payload),
        )

    def test_none(self) -> None:
        """``None`` in place of a payload."""
        assert self._normalized(None) is None

    def test_empty_dict(self) -> None:
        """A payload with no ``pages`` key at all."""
        assert self._normalized({}) is None

    def test_no_pages(self) -> None:
        """A payload whose ``pages`` list is empty."""
        payload = {"pages": []}
        assert self._normalized(payload) is None

    def test_pages_without_words(self) -> None:
        """A page that carries ``lines`` but no ``words`` key."""
        page = {"lines": [{"content": "no words"}]}
        assert self._normalized({"pages": [page]}) is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. Conversion SDK → dict | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestSdkConversion:
    """``_sdk_result_to_dict`` applied to a mocked SDK result object."""

    @staticmethod
    def _sdk_result(content: str, confidence: float | None) -> MagicMock:
        """Mock an SDK result exposing one page that holds a single word."""
        word = MagicMock(content=content, confidence=confidence)
        page = MagicMock(words=[word])
        return MagicMock(pages=[page])

    def test_sdk_to_dict(self) -> None:
        """Word content and confidence are reachable under ``pages[0].words[0]``."""
        sdk_result = self._sdk_result("Bonjour", 0.97)
        converted = AzureDocIntelEngine._sdk_result_to_dict(sdk_result)
        assert "pages" in converted
        first_word = converted["pages"][0]["words"][0]
        assert first_word["content"] == "Bonjour"
        assert first_word["confidence"] == pytest.approx(0.97)

    def test_sdk_word_with_none_confidence(self) -> None:
        """A ``None`` confidence on the SDK word stays ``None`` in the dict."""
        sdk_result = self._sdk_result("ok", None)
        converted = AzureDocIntelEngine._sdk_result_to_dict(sdk_result)
        assert converted["pages"][0]["words"][0]["confidence"] is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 6-7. run() avec mock | |
| # ────────────────────────────────────────────────────────────────────────── | |
def _patch_run_with_result(
    monkeypatch: pytest.MonkeyPatch,
    text: str,
    analyze_result: dict | None,
    *,
    raise_exc: Exception | None = None,
) -> AzureDocIntelEngine:
    """Return an engine whose ``_run_with_native`` is replaced by a stub.

    The stub is installed on the ``AzureDocIntelEngine`` class through
    *monkeypatch*. It returns the ``(text, analyze_result)`` pair, or raises
    *raise_exc* when one is supplied. The returned engine has placeholder
    ``_api_key`` and ``_endpoint`` values assigned.
    """
    stubbed_engine = AzureDocIntelEngine()
    stubbed_engine._api_key = "test-key"
    stubbed_engine._endpoint = "https://test.cognitiveservices.azure.com"

    def _stub_native(self, image_path):
        # Closure over the helper's arguments; the image path is ignored.
        if raise_exc is None:
            return text, analyze_result
        raise raise_exc

    monkeypatch.setattr(AzureDocIntelEngine, "_run_with_native", _stub_native)
    return stubbed_engine
class TestRunOverride:
    """``run()`` exercised with ``_run_with_native`` stubbed out."""

    @staticmethod
    def _placeholder_image(directory: Path) -> Path:
        """Write a one-byte ``p.png`` into *directory* and return its path."""
        image = directory / "p.png"
        image.write_bytes(b"x")
        return image

    def test_run_exposes_confidences(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """Text and both word confidences show up on the result, with no error."""
        words = [_word("Bonjour", 0.97), _word("monde", 0.93)]
        engine = _patch_run_with_result(
            monkeypatch,
            text="Bonjour\nmonde",
            analyze_result=_result(words),
        )
        outcome = engine.run(self._placeholder_image(tmp_path))
        assert outcome.text == "Bonjour\nmonde"
        assert outcome.error is None
        confidences = outcome.token_confidences
        assert confidences is not None
        assert len(confidences) == 2

    def test_run_no_result_no_confidences(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """When the native call returns no analyze result, text is kept and confidences are ``None``."""
        engine = _patch_run_with_result(
            monkeypatch,
            text="Texte",
            analyze_result=None,
        )
        outcome = engine.run(self._placeholder_image(tmp_path))
        assert outcome.text == "Texte"
        assert outcome.token_confidences is None

    def test_run_api_failure_keeps_error(
        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path,
    ) -> None:
        """An exception from the native call is reported in ``error``."""
        failure = RuntimeError("Azure timeout")
        engine = _patch_run_with_result(
            monkeypatch,
            text="",
            analyze_result=None,
            raise_exc=failure,
        )
        outcome = engine.run(self._placeholder_image(tmp_path))
        assert outcome.error == "Azure timeout"
        assert outcome.token_confidences is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 8. Intégration runner | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestEndToEndWithRunner:
    """Runner integration: token confidences end up in ``calibration_metrics``."""

    def test_runner_picks_up_azure_confidences(self) -> None:
        """An OCR text equal to the ground truth gives accuracy 1.0 and the mean confidence."""
        from picarones.measurements.runner import _compute_document_result
        from picarones.engines.base import EngineResult

        transcript = "alpha beta gamma"
        image = "/tmp/x.png"
        scored_tokens = (("alpha", 0.97), ("beta", 0.93), ("gamma", 0.95))

        ocr = EngineResult(
            engine_name="azure_doc_intel",
            image_path=image,
            text=transcript,
            duration_seconds=0.1,
            token_confidences=[
                {"token": token, "confidence": score}
                for token, score in scored_tokens
            ],
        )
        dr = _compute_document_result(
            doc_id="d1",
            image_path=image,
            ground_truth=transcript,
            ocr_result=ocr,
            char_exclude=None,
        )

        metrics = dr.calibration_metrics
        assert metrics is not None
        assert metrics["overall_accuracy"] == 1.0
        expected_mean = (0.97 + 0.93 + 0.95) / 3
        assert metrics["overall_confidence"] == pytest.approx(expected_mean)