"""Sprint 48 tests — Pero OCR adaptation exposing ``token_confidences``.

Covers:

1. Layout extraction (exercised here through ``_extract_raw_confidences``
   followed by ``_normalize_token_confidences``) walks regions/lines and
   emits one ``{"token": str, "confidence": float}`` dict per word, the
   ``line.transcription_confidence`` being propagated to every word of the
   line.
2. Lines without ``transcription`` or without ``transcription_confidence``
   are skipped.
3. A multi-word transcription produces as many entries as it has words.
4. ``expose_confidences=False`` disables the extraction.
5. ``page_layout = None`` or empty → returns ``None`` without crashing.
6. ``run()`` is meant to call ``_run_pero_pipeline`` **only once** (no
   double cost as with Tesseract) and exposes ``token_confidences`` on the
   ``EngineResult``.
7. If the pipeline raises, ``error`` is set, ``text=""`` and
   ``token_confidences = None``.
8. End-to-end integration with ``_compute_document_result``.
9. Pero not installed: ``run()`` reports the failure through ``error``.
"""

from __future__ import annotations

from pathlib import Path
from unittest.mock import MagicMock

import pytest

import picarones.engines.pero_ocr as pero_module
from picarones.engines.pero_ocr import PeroOCREngine


# ──────────────────────────────────────────────────────────────────────────
# Helpers: mock of a Pero OCR page_layout
# ──────────────────────────────────────────────────────────────────────────


def _mock_line(transcription: str, conf: float | None) -> MagicMock:
    """Return a fake line exposing ``transcription`` and its confidence."""
    return MagicMock(transcription=transcription, transcription_confidence=conf)


def _mock_region(lines: list) -> MagicMock:
    """Return a fake region whose ``lines`` attribute is *lines*."""
    return MagicMock(lines=lines)


def _mock_layout(regions: list) -> MagicMock:
    """Return a fake page layout whose ``regions`` attribute is *regions*."""
    return MagicMock(regions=regions)


# ──────────────────────────────────────────────────────────────────────────
# 1-3.
# Extraction from page_layout
# ──────────────────────────────────────────────────────────────────────────


class TestExtractFromLayout:
    """Per-word confidence extraction from a mocked Pero ``page_layout``."""

    @staticmethod
    def _tokens_for(layout):
        """Run raw extraction then normalisation on *layout*.

        A fresh default-configured engine is used for each call; the return
        value is whatever ``_normalize_token_confidences`` produces.
        """
        engine = PeroOCREngine()
        raw = engine._extract_raw_confidences(layout)
        return engine._normalize_token_confidences(raw)

    def test_one_word_per_token(self) -> None:
        line = _mock_line("Bonjour le monde", 0.92)
        out = self._tokens_for(_mock_layout([_mock_region([line])]))
        assert out is not None
        assert out == [
            {"token": "Bonjour", "confidence": 0.92},
            {"token": "le", "confidence": 0.92},
            {"token": "monde", "confidence": 0.92},
        ]

    def test_multiple_lines_concatenated(self) -> None:
        region = _mock_region([
            _mock_line("Première ligne", 0.95),
            _mock_line("Deuxième ligne", 0.80),
        ])
        out = self._tokens_for(_mock_layout([region]))
        assert out is not None
        # Every word carries the confidence of ITS OWN line — checked for the
        # first word of each line...
        assert {"token": "Première", "confidence": 0.95} in out
        assert {"token": "Deuxième", "confidence": 0.80} in out
        # ...and for the shared second word, which is the case that would
        # reveal a confidence leaking from one line to the other.
        assert {"token": "ligne", "confidence": 0.95} in out
        assert {"token": "ligne", "confidence": 0.80} in out
        # Two lines of two words each: no entry dropped, none duplicated.
        assert len(out) == 4

    def test_skips_empty_transcription(self) -> None:
        region = _mock_region([
            _mock_line("", 0.95),    # empty transcription
            _mock_line(None, 0.95),  # transcription is None
            _mock_line("ok", 0.95),  # valid line
        ])
        out = self._tokens_for(_mock_layout([region]))
        assert out == [{"token": "ok", "confidence": 0.95}]

    def test_skips_none_confidence(self) -> None:
        region = _mock_region([
            _mock_line("avec_conf", 0.85),
            _mock_line("sans_conf", None),
        ])
        out = self._tokens_for(_mock_layout([region]))
        assert out == [{"token": "avec_conf", "confidence": 0.85}]

    def test_skips_negative_confidence(self) -> None:
        region = _mock_region([
            _mock_line("ok", 0.9),
            _mock_line("dropped", -0.1),
        ])
        out = self._tokens_for(_mock_layout([region]))
        assert out == [{"token": "ok", "confidence": 0.9}]
# ──────────────────────────────────────────────────────────────────────────
# 4. expose_confidences=False
# ──────────────────────────────────────────────────────────────────────────


class TestExposeFlag:
    """Behaviour when the ``expose_confidences`` config flag is off."""

    def test_disabled_returns_none(self) -> None:
        engine = PeroOCREngine(config={"expose_confidences": False})
        layout = _mock_layout([_mock_region([_mock_line("hello", 0.9)])])
        raw = engine._extract_raw_confidences(layout)
        assert engine._normalize_token_confidences(raw) is None


# ──────────────────────────────────────────────────────────────────────────
# 5. Degenerate cases
# ──────────────────────────────────────────────────────────────────────────


class TestDegenerateLayouts:
    """Layouts that carry no usable confidence must yield ``None``."""

    def test_none_layout(self) -> None:
        engine = PeroOCREngine()
        raw = engine._extract_raw_confidences(None)
        assert engine._normalize_token_confidences(raw) is None

    def test_empty_regions(self) -> None:
        engine = PeroOCREngine()
        empty_layout = _mock_layout([])
        raw = engine._extract_raw_confidences(empty_layout)
        assert engine._normalize_token_confidences(raw) is None

    def test_only_lines_without_conf_returns_none(self) -> None:
        engine = PeroOCREngine()
        unscored_lines = [_mock_line("ok", None), _mock_line("ok2", None)]
        layout = _mock_layout([_mock_region(unscored_lines)])
        raw = engine._extract_raw_confidences(layout)
        assert engine._normalize_token_confidences(raw) is None


# ──────────────────────────────────────────────────────────────────────────
# 6-7.
# run() with a mock of the full pipeline
# ──────────────────────────────────────────────────────────────────────────


def _make_engine_with_mock_pipeline(
    monkeypatch: pytest.MonkeyPatch,
    *,
    text: str = "Bonjour le monde",
    layout_regions: list | None = None,
    raise_on_pipeline: bool = False,
    calls: list | None = None,
) -> PeroOCREngine:
    """Return an engine whose ``_run_pero_pipeline`` is replaced by a fake.

    The patch is applied on the ``PeroOCREngine`` class through
    *monkeypatch*, so the tests do not depend on pero-ocr.

    Args:
        monkeypatch: pytest fixture used to install the fake.
        text: text the fake pipeline returns; also used as the single line's
            transcription when *layout_regions* is not given.
        layout_regions: regions of the mocked layout; defaults to one region
            holding one line (``text``, confidence 0.92).
        raise_on_pipeline: when true, the fake raises ``RuntimeError``
            instead of returning.
        calls: optional list; the ``image_path`` of every invocation of the
            fake is appended to it (before any simulated failure), letting a
            test count how many times ``run()`` hit the pipeline.

    Returns:
        A default-configured ``PeroOCREngine``.
    """
    engine = PeroOCREngine()
    if layout_regions is None:
        layout_regions = [_mock_region([_mock_line(text, 0.92)])]
    layout = _mock_layout(layout_regions)

    def _fake_pipeline(self, image_path):
        if calls is not None:
            calls.append(image_path)
        if raise_on_pipeline:
            raise RuntimeError("simulated pipeline failure")
        return text, layout

    monkeypatch.setattr(PeroOCREngine, "_run_pero_pipeline", _fake_pipeline)
    return engine


def _write_dummy_image(tmp_path: Path) -> Path:
    """Create a one-byte placeholder ``p.png`` under *tmp_path* and return it."""
    img = tmp_path / "p.png"
    img.write_bytes(b"x")
    return img


class TestRunPipeline:
    """``run()`` behaviour with the Pero pipeline mocked out."""

    def test_run_exposes_confidences(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        engine = _make_engine_with_mock_pipeline(monkeypatch)
        result = engine.run(_write_dummy_image(tmp_path))
        assert result.text == "Bonjour le monde"
        assert result.error is None
        assert result.token_confidences is not None
        assert len(result.token_confidences) == 3

    def test_run_calls_pipeline_once(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        # Text and confidences must come out of a single pipeline pass.
        calls: list = []
        engine = _make_engine_with_mock_pipeline(monkeypatch, calls=calls)
        result = engine.run(_write_dummy_image(tmp_path))
        assert result.error is None
        assert len(calls) == 1

    def test_run_text_preserved_octet_for_octet(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        engine = _make_engine_with_mock_pipeline(
            monkeypatch,
            text="Texte avec\nplusieurs lignes",
        )
        result = engine.run(_write_dummy_image(tmp_path))
        assert result.text == "Texte avec\nplusieurs lignes"

    def test_pipeline_failure_keeps_error(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        engine = _make_engine_with_mock_pipeline(
            monkeypatch,
            raise_on_pipeline=True,
        )
        result = engine.run(_write_dummy_image(tmp_path))
        assert result.error == "simulated pipeline failure"
        assert result.text == ""
        assert result.token_confidences is None


# ──────────────────────────────────────────────────────────────────────────
# 8.
# End-to-end integration with the runner
# ──────────────────────────────────────────────────────────────────────────


class TestEndToEndWithRunner:
    """Feeds a Pero-style ``EngineResult`` to ``_compute_document_result``."""

    def test_runner_picks_up_confidences(self) -> None:
        from picarones.measurements.runner import _compute_document_result
        from picarones.engines.base import EngineResult

        # Confidence is in [0, 1] on the Pero side (vs [0, 100] for
        # Tesseract) — the Sprint 42 runner normalises through the
        # bag-of-words helper.
        confidences = [
            {"token": word, "confidence": 0.95}
            for word in ("alpha", "beta", "gamma")
        ]
        ocr = EngineResult(
            engine_name="pero",
            image_path="/tmp/x.png",
            text="alpha beta gamma",
            duration_seconds=0.1,
            token_confidences=confidences,
        )
        dr = _compute_document_result(
            doc_id="d1",
            image_path="/tmp/x.png",
            ground_truth="alpha beta gamma",
            ocr_result=ocr,
            char_exclude=None,
        )
        metrics = dr.calibration_metrics
        assert metrics is not None
        assert dr.calibration_metrics["overall_accuracy"] == 1.0
        assert dr.calibration_metrics["overall_confidence"] == pytest.approx(0.95)


# ──────────────────────────────────────────────────────────────────────────
# 9. Pero absent — graceful fallback on the real pipeline side
# ──────────────────────────────────────────────────────────────────────────


class TestPeroAbsent:
    """``run()`` behaviour when the module flags pero-ocr as unavailable."""

    def test_pipeline_missing_pero_raises(
        self,
        monkeypatch: pytest.MonkeyPatch,
        tmp_path: Path,
    ) -> None:
        """If pero-ocr is not installed, ``_run_pero_pipeline`` raises
        through ``_get_parser()``. ``run()`` catches the exception and
        returns ``EngineResult.error``."""
        monkeypatch.setattr(pero_module, "_PERO_AVAILABLE", False)
        engine = PeroOCREngine(config={"config": "/no/such/file.ini"})
        image = tmp_path / "p.png"
        image.write_bytes(b"x")

        result = engine.run(image)

        assert result.error is not None
        message = result.error
        assert "pero" in message.lower() or "Pillow" in message
        assert result.token_confidences is None