Spaces:
Sleeping
Sleeping
| """Tests Sprint 16 — câblage line_metrics/hallucination + fondations du moteur narratif. | |
| Couverture : | |
| 1. ``compute_document_result`` via le runner peuple bien ``line_metrics`` et | |
| ``hallucination_metrics`` sur un document réussi. | |
| 2. ``EngineReport`` expose ``aggregated_line_metrics`` et | |
| ``aggregated_hallucination`` après un benchmark. | |
| 3. Le modèle ``Fact`` et le ``DetectorRegistry`` fonctionnent. | |
| 4. Le registre par défaut est vide en Sprint 1 (les détecteurs seront activés | |
| progressivement dans les sprints suivants). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| from pathlib import Path | |
| import pytest | |
| from picarones.core.corpus import Corpus, Document | |
| from picarones.core.narrative import ( | |
| DetectorRegistry, | |
| Fact, | |
| FactImportance, | |
| FactType, | |
| detect_all, | |
| ) | |
| from picarones.core.runner import ( | |
| _aggregate_hallucination, | |
| _aggregate_line_metrics, | |
| _compute_document_result, | |
| run_benchmark, | |
| ) | |
| from picarones.engines.base import BaseOCREngine, EngineResult | |
| class _FakeEngine(BaseOCREngine): | |
| """Moteur factice — renvoie un texte configurable, utile en test.""" | |
| def __init__(self, output_text: str, name: str = "fake", config=None): | |
| super().__init__(config) | |
| self._output = output_text | |
| self._display_name = name | |
| def name(self) -> str: | |
| return self._display_name | |
| def version(self) -> str: | |
| return "test" | |
| def _run_ocr(self, image_path): | |
| return self._output, None | |
| def run(self, image_path) -> EngineResult: | |
| return EngineResult( | |
| engine_name=self.name, | |
| image_path=str(image_path), | |
| text=self._output, | |
| duration_seconds=0.01, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # 1. Câblage line_metrics et hallucination par document | |
| # --------------------------------------------------------------------------- | |
| class TestDocumentResultWiring: | |
| """Vérifie que ``_compute_document_result`` peuple les nouveaux champs.""" | |
| def test_line_metrics_populated_on_success(self, tmp_path: Path): | |
| image = tmp_path / "doc.png" | |
| image.write_bytes(b"\x89PNG\r\n\x1a\n") # stub — image_quality loggera un warning | |
| ocr = EngineResult( | |
| engine_name="fake", | |
| image_path=str(image), | |
| text="ligne une\nligne deux\nligne trois", | |
| duration_seconds=0.1, | |
| ) | |
| gt = "ligne une\nligne deux\nligne trois" | |
| result = _compute_document_result( | |
| doc_id="doc1", | |
| image_path=str(image), | |
| ground_truth=gt, | |
| ocr_result=ocr, | |
| char_exclude=None, | |
| ) | |
| assert result.line_metrics is not None, "line_metrics doit être peuplé" | |
| assert "percentiles" in result.line_metrics | |
| assert "gini" in result.line_metrics | |
| assert result.line_metrics["line_count"] == 3 | |
| def test_hallucination_metrics_populated_on_success(self, tmp_path: Path): | |
| image = tmp_path / "doc.png" | |
| image.write_bytes(b"") | |
| gt = "le chat est sur le tapis rouge et dort paisiblement" | |
| hyp = "le chat mange des bananes spatiales en orbite lunaire" | |
| ocr = EngineResult( | |
| engine_name="fake", | |
| image_path=str(image), | |
| text=hyp, | |
| duration_seconds=0.1, | |
| ) | |
| result = _compute_document_result( | |
| doc_id="doc1", | |
| image_path=str(image), | |
| ground_truth=gt, | |
| ocr_result=ocr, | |
| char_exclude=None, | |
| ) | |
| assert result.hallucination_metrics is not None | |
| assert "anchor_score" in result.hallucination_metrics | |
| assert "length_ratio" in result.hallucination_metrics | |
| assert "is_hallucinating" in result.hallucination_metrics | |
| def test_new_fields_empty_on_engine_failure(self, tmp_path: Path): | |
| """Si l'OCR échoue (success=False), pas de calcul line_metrics/hallucination.""" | |
| image = tmp_path / "doc.png" | |
| image.write_bytes(b"") | |
| ocr = EngineResult( | |
| engine_name="fake", | |
| image_path=str(image), | |
| text="", | |
| duration_seconds=0.1, | |
| error="simulated failure", | |
| ) | |
| result = _compute_document_result( | |
| doc_id="doc1", | |
| image_path=str(image), | |
| ground_truth="ground truth text", | |
| ocr_result=ocr, | |
| char_exclude=None, | |
| ) | |
| assert result.line_metrics is None | |
| assert result.hallucination_metrics is None | |
| # --------------------------------------------------------------------------- | |
| # 2. Agrégation au niveau EngineReport | |
| # --------------------------------------------------------------------------- | |
| class TestAggregationWiring: | |
| """Vérifie que le benchmark complet produit les agrégations.""" | |
| def test_aggregate_line_metrics_helper_with_empty_list(self): | |
| assert _aggregate_line_metrics([]) is None | |
| def test_aggregate_hallucination_helper_with_empty_list(self): | |
| assert _aggregate_hallucination([]) is None | |
| def test_benchmark_end_to_end_produces_aggregations(self, tmp_path: Path): | |
| img = tmp_path / "test.png" | |
| img.write_bytes(b"") | |
| corpus = Corpus( | |
| name="test", | |
| documents=[ | |
| Document( | |
| doc_id="d1", | |
| image_path=img, | |
| ground_truth="bonjour le monde\nligne deux\nfin", | |
| ), | |
| Document( | |
| doc_id="d2", | |
| image_path=img, | |
| ground_truth="autre document test\navec deux lignes", | |
| ), | |
| ], | |
| source_path=str(tmp_path), | |
| ) | |
| engine = _FakeEngine( | |
| output_text="bonjour le monde\nligne deux\nfin", | |
| name="fake_engine", | |
| ) | |
| result = run_benchmark( | |
| corpus=corpus, | |
| engines=[engine], | |
| show_progress=False, | |
| max_workers=1, | |
| partial_dir=str(tmp_path / "partial"), | |
| ) | |
| assert len(result.engine_reports) == 1 | |
| report = result.engine_reports[0] | |
| assert report.aggregated_line_metrics is not None, ( | |
| "aggregated_line_metrics doit être peuplé après benchmark" | |
| ) | |
| assert "gini_mean" in report.aggregated_line_metrics | |
| assert "document_count" in report.aggregated_line_metrics | |
| assert report.aggregated_line_metrics["document_count"] == 2 | |
| assert report.aggregated_hallucination is not None, ( | |
| "aggregated_hallucination doit être peuplé après benchmark" | |
| ) | |
| assert "anchor_score_mean" in report.aggregated_hallucination | |
| assert report.aggregated_hallucination["document_count"] == 2 | |
| def test_json_export_includes_new_aggregations(self, tmp_path: Path): | |
| img = tmp_path / "t.png" | |
| img.write_bytes(b"") | |
| corpus = Corpus( | |
| name="test", | |
| documents=[ | |
| Document(doc_id="d1", image_path=img, ground_truth="un\ndeux"), | |
| ], | |
| source_path=str(tmp_path), | |
| ) | |
| engine = _FakeEngine(output_text="un\ndeux", name="fake") | |
| out = tmp_path / "bench.json" | |
| run_benchmark( | |
| corpus=corpus, | |
| engines=[engine], | |
| output_json=out, | |
| show_progress=False, | |
| max_workers=1, | |
| partial_dir=str(tmp_path / "partial"), | |
| ) | |
| data = json.loads(out.read_text(encoding="utf-8")) | |
| report = data["engine_reports"][0] | |
| assert "aggregated_line_metrics" in report | |
| assert "aggregated_hallucination" in report | |
| # --------------------------------------------------------------------------- | |
| # 3. Modèle Fact et DetectorRegistry | |
| # --------------------------------------------------------------------------- | |
| class TestFactModel: | |
| def test_fact_is_serializable(self): | |
| fact = Fact( | |
| type=FactType.GLOBAL_LEADER_CER, | |
| importance=FactImportance.CRITICAL, | |
| payload={"engine": "tesseract", "cer": 0.042}, | |
| engines_involved=("tesseract",), | |
| ) | |
| d = fact.as_dict() | |
| assert d["type"] == "global_leader_cer" | |
| assert d["importance"] == 100 | |
| assert d["payload"]["cer"] == 0.042 | |
| assert d["engines_involved"] == ["tesseract"] | |
| def test_fact_importance_ordering(self): | |
| assert FactImportance.CRITICAL > FactImportance.HIGH | |
| assert FactImportance.HIGH > FactImportance.MEDIUM | |
| assert FactImportance.MEDIUM > FactImportance.LOW | |
| class TestDetectorRegistry: | |
| def test_registry_starts_empty(self): | |
| registry = DetectorRegistry() | |
| assert registry.registered_types() == () | |
| assert registry.run({}) == [] | |
| def test_register_and_run(self): | |
| registry = DetectorRegistry() | |
| def dummy_detector(data: dict) -> list[Fact]: | |
| return [Fact( | |
| type=FactType.GLOBAL_LEADER_CER, | |
| importance=FactImportance.CRITICAL, | |
| payload={"engine": data.get("leader", "unknown")}, | |
| )] | |
| registry.register(FactType.GLOBAL_LEADER_CER, dummy_detector) | |
| assert FactType.GLOBAL_LEADER_CER in registry.registered_types() | |
| facts = registry.run({"leader": "tesseract"}) | |
| assert len(facts) == 1 | |
| assert facts[0].payload["engine"] == "tesseract" | |
| def test_registry_swallows_detector_exceptions(self): | |
| """Un détecteur défaillant ne doit pas casser le pipeline narratif.""" | |
| registry = DetectorRegistry() | |
| def broken_detector(data: dict) -> list[Fact]: | |
| raise RuntimeError("boom") | |
| def working_detector(data: dict) -> list[Fact]: | |
| return [Fact( | |
| type=FactType.SPEED_WINNER, | |
| importance=FactImportance.HIGH, | |
| payload={}, | |
| )] | |
| registry.register(FactType.GLOBAL_LEADER_CER, broken_detector) | |
| registry.register(FactType.SPEED_WINNER, working_detector) | |
| facts = registry.run({}) | |
| assert len(facts) == 1 | |
| assert facts[0].type == FactType.SPEED_WINNER | |
| def test_default_registry_is_empty_in_sprint_1(self): | |
| """Sprint 1 = fondations uniquement. Aucun détecteur n'est activé | |
| par défaut — ils le seront au Sprint 4 avec leurs templates.""" | |
| facts = detect_all({}) | |
| assert facts == [] | |