Spaces:
Running
Running
Claude
test: rΓ©organiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified | """Tests Sprint 47 β adaptation Tesseract pour exposer token_confidences. | |
| Couvre : | |
| 1. ``run()`` retourne ``EngineResult.token_confidences`` non-vide | |
| quand pytesseract est disponible et qu'``image_to_data`` produit | |
| des confidences. | |
| 2. Le ``text`` retournΓ© reste **strictement identique** Γ ce que | |
| produit ``image_to_string`` (rΓ©trocompat octet par octet β | |
| l'extraction des confidences n'altère jamais le texte). | |
| 3. ``expose_confidences=False`` dΓ©sactive l'extraction (Γ©conomie | |
| d'un appel Tesseract par image). | |
| 4. Si ``image_to_data`` lève, l'OCR continue : ``text`` retourné, | |
| ``token_confidences = None``, warning loggΓ©. | |
| 5. Les non-mots (conf = -1) et tokens vides sont filtrΓ©s. | |
| 6. Les confidences passent le runner Sprint 42 et alimentent | |
| ``DocumentResult.calibration_metrics``. | |
| 7. Si pytesseract n'est pas installΓ©, ``token_confidences = None`` | |
| sans crash (fallback gracieux). | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import pytest | |
| import picarones.engines.tesseract as tesseract_module | |
| from picarones.engines.tesseract import TesseractEngine | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Mocks | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class _MockPytesseract: | |
| """Mock minimal de pytesseract qui simule une rΓ©ponse rΓ©aliste.""" | |
| class Output: | |
| DICT = "DICT" | |
| class pytesseract: # noqa: N801 (imite le namespace rΓ©el) | |
| tesseract_cmd: str = "tesseract" | |
| def __init__( | |
| self, | |
| text: str = "Bonjour le monde", | |
| data: dict | None = None, | |
| raise_on_data: bool = False, | |
| raise_on_string: bool = False, | |
| ) -> None: | |
| self._text = text | |
| self._data = data or { | |
| "text": ["Bonjour", "le", "monde"], | |
| "conf": [95.5, 88.0, 91.3], | |
| } | |
| self.raise_on_data = raise_on_data | |
| self.raise_on_string = raise_on_string | |
| def image_to_string(self, image, lang=None, config=None) -> str: | |
| if self.raise_on_string: | |
| raise RuntimeError("simulated OCR failure") | |
| return self._text | |
| def image_to_data(self, image, lang=None, config=None, output_type=None) -> dict: | |
| if self.raise_on_data: | |
| raise RuntimeError("simulated image_to_data failure") | |
| return self._data | |
| def get_tesseract_version(self): | |
| class _V: | |
| vstring = "5.0.0-mock" | |
| return _V() | |
| class _MockImage: | |
| def open(path): | |
| return object() # placeholder | |
| def patched_tesseract(monkeypatch: pytest.MonkeyPatch) -> _MockPytesseract: | |
| """Patche le module pour utiliser le mock.""" | |
| mock = _MockPytesseract() | |
| monkeypatch.setattr(tesseract_module, "pytesseract", mock) | |
| monkeypatch.setattr(tesseract_module, "Image", _MockImage) | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", True) | |
| return mock | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 1-2. run() expose token_confidences sans modifier le texte | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestRunExposesConfidences: | |
| def test_run_returns_token_confidences( | |
| self, patched_tesseract: _MockPytesseract, tmp_path: Path, | |
| ) -> None: | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| result = engine.run(img) | |
| assert result.token_confidences is not None | |
| assert len(result.token_confidences) == 3 | |
| assert result.token_confidences[0] == { | |
| "token": "Bonjour", "confidence": pytest.approx(95.5), | |
| } | |
| def test_text_matches_image_to_string( | |
| self, patched_tesseract: _MockPytesseract, tmp_path: Path, | |
| ) -> None: | |
| """Le texte de l'EngineResult doit Γͺtre strictement celui de | |
| image_to_string, pas une reconstruction depuis image_to_data.""" | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| result = engine.run(img) | |
| assert result.text == "Bonjour le monde" | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 3. expose_confidences=False dΓ©sactive | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestExposeConfidencesFlag: | |
| def test_disabled_returns_no_confidences( | |
| self, patched_tesseract: _MockPytesseract, tmp_path: Path, | |
| ) -> None: | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine(config={"expose_confidences": False}) | |
| result = engine.run(img) | |
| assert result.text == "Bonjour le monde" | |
| assert result.token_confidences is None | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 4. image_to_data Γ©choue β fallback gracieux | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestExtractionFailureFallback: | |
| def test_image_to_data_failure_returns_none_confidences( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| caplog: pytest.LogCaptureFixture, | |
| ) -> None: | |
| mock = _MockPytesseract(raise_on_data=True) | |
| monkeypatch.setattr(tesseract_module, "pytesseract", mock) | |
| monkeypatch.setattr(tesseract_module, "Image", _MockImage) | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", True) | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| with caplog.at_level("WARNING", logger="picarones.engines.tesseract"): | |
| result = engine.run(img) | |
| # OCR a rΓ©ussi sur le texte | |
| assert result.text == "Bonjour le monde" | |
| assert result.error is None | |
| # Mais les confidences sont None | |
| assert result.token_confidences is None | |
| # Et un warning explicite a Γ©tΓ© Γ©mis | |
| assert any("token_confidences" in rec.message for rec in caplog.records) | |
| def test_image_to_string_failure_keeps_error( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| ) -> None: | |
| """Si l'OCR principal lΓ¨ve, on n'essaie mΓͺme pas d'extraire les | |
| confidences (cohΓ©rent avec le contrat de BaseOCREngine.run).""" | |
| mock = _MockPytesseract(raise_on_string=True) | |
| monkeypatch.setattr(tesseract_module, "pytesseract", mock) | |
| monkeypatch.setattr(tesseract_module, "Image", _MockImage) | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", True) | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| result = engine.run(img) | |
| assert result.error == "simulated OCR failure" | |
| assert result.text == "" | |
| assert result.token_confidences is None | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 5. Filtrage des non-mots et tokens vides | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestTokenFiltering: | |
| def test_negative_conf_filtered( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| ) -> None: | |
| mock = _MockPytesseract( | |
| text="Bonjour monde", | |
| data={ | |
| "text": ["Bonjour", "", "monde", "."], | |
| "conf": [95.0, -1.0, 88.0, -1.0], | |
| }, | |
| ) | |
| monkeypatch.setattr(tesseract_module, "pytesseract", mock) | |
| monkeypatch.setattr(tesseract_module, "Image", _MockImage) | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", True) | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| result = engine.run(img) | |
| assert result.token_confidences is not None | |
| # Seuls "Bonjour" et "monde" sont retenus (conf > 0 et token non vide) | |
| tokens = [tc["token"] for tc in result.token_confidences] | |
| assert tokens == ["Bonjour", "monde"] | |
| def test_mismatched_lengths_returns_none( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| ) -> None: | |
| # text et conf de longueurs diffΓ©rentes β format inattendu | |
| mock = _MockPytesseract( | |
| text="Bonjour", | |
| data={"text": ["Bonjour", "le"], "conf": [95.0]}, | |
| ) | |
| monkeypatch.setattr(tesseract_module, "pytesseract", mock) | |
| monkeypatch.setattr(tesseract_module, "Image", _MockImage) | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", True) | |
| img = tmp_path / "p.png" | |
| img.write_bytes(b"x") | |
| engine = TesseractEngine() | |
| result = engine.run(img) | |
| assert result.token_confidences is None | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 6. Bout-en-bout avec le runner : calibration_metrics calculΓ©e | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestEndToEndWithRunner: | |
| def test_runner_picks_up_confidences_and_computes_calibration( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| ) -> None: | |
| from picarones.measurements.runner import _compute_document_result | |
| from picarones.engines.base import EngineResult | |
| # Simulation : on appelle directement _compute_document_result | |
| # avec un EngineResult mockΓ© qui porte des confidences. On | |
| # vΓ©rifie que la calibration_metrics est bien attachΓ©e. | |
| ocr = EngineResult( | |
| engine_name="tess", | |
| image_path="/tmp/x.png", | |
| text="alpha beta gamma", | |
| duration_seconds=0.1, | |
| token_confidences=[ | |
| {"token": "alpha", "confidence": 95.0}, | |
| {"token": "beta", "confidence": 95.0}, | |
| {"token": "gamma", "confidence": 95.0}, | |
| ], | |
| ) | |
| dr = _compute_document_result( | |
| doc_id="d1", image_path="/tmp/x.png", | |
| ground_truth="alpha beta gamma", | |
| ocr_result=ocr, char_exclude=None, | |
| ) | |
| assert dr.calibration_metrics is not None | |
| # 3 tokens, tous corrects β accuracy = 1, conf = 0.95 | |
| assert dr.calibration_metrics["overall_accuracy"] == 1.0 | |
| assert dr.calibration_metrics["overall_confidence"] == pytest.approx(0.95) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 7. pytesseract absent β fallback gracieux | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestPytesseractAbsent: | |
| def test_extraction_returns_none_without_pytesseract( | |
| self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, | |
| ) -> None: | |
| monkeypatch.setattr(tesseract_module, "_PYTESSERACT_AVAILABLE", False) | |
| engine = TesseractEngine() | |
| result = engine.run(tmp_path / "p.png").token_confidences | |
| assert result is None | |