File size: 10,523 Bytes
0aa159b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
"""Tests Sprint 16 — câblage line_metrics/hallucination + fondations du moteur narratif.

Couverture :
1. ``compute_document_result`` via le runner peuple bien ``line_metrics`` et
   ``hallucination_metrics`` sur un document réussi.
2. ``EngineReport`` expose ``aggregated_line_metrics`` et
   ``aggregated_hallucination`` après un benchmark.
3. Le modèle ``Fact`` et le ``DetectorRegistry`` fonctionnent.
4. Le registre par défaut est vide en Sprint 1 (les détecteurs seront activés
   progressivement dans les sprints suivants).
"""

from __future__ import annotations

import json
import tempfile
from pathlib import Path

import pytest

from picarones.core.corpus import Corpus, Document
from picarones.core.narrative import (
    DetectorRegistry,
    Fact,
    FactImportance,
    FactType,
    detect_all,
)
from picarones.core.runner import (
    _aggregate_hallucination,
    _aggregate_line_metrics,
    _compute_document_result,
    run_benchmark,
)
from picarones.engines.base import BaseOCREngine, EngineResult


class _FakeEngine(BaseOCREngine):
    """Stub OCR engine for tests: always reports the text it was built with.

    Args:
        output_text: Text returned as the OCR output for every image.
        name: Value exposed through the ``name`` property.
        config: Passed unchanged to ``BaseOCREngine.__init__``.
    """

    def __init__(self, output_text: str, name: str = "fake", config=None):
        super().__init__(config)
        self._display_name = name
        self._output = output_text

    @property
    def name(self) -> str:
        """Engine name supplied at construction time."""
        return self._display_name

    def version(self) -> str:
        """Return the fixed version string of this stub."""
        return "test"

    def _run_ocr(self, image_path):
        """Return the canned text paired with ``None``; *image_path* is unused."""
        canned_text = self._output
        return canned_text, None

    def run(self, image_path) -> EngineResult:
        """Build an ``EngineResult`` holding the canned text for *image_path*.

        The reported duration is a constant 0.01 seconds.
        """
        fields = {
            "engine_name": self.name,
            "image_path": str(image_path),
            "text": self._output,
            "duration_seconds": 0.01,
        }
        return EngineResult(**fields)


# ---------------------------------------------------------------------------
# 1. Per-document wiring of line_metrics and hallucination
# ---------------------------------------------------------------------------

class TestDocumentResultWiring:
    """Check that ``_compute_document_result`` fills the new per-document fields."""

    def test_line_metrics_populated_on_success(self, tmp_path: Path):
        """Identical three-line OCR text and ground truth produce line metrics."""
        three_lines = "ligne une\nligne deux\nligne trois"

        image_file = tmp_path / "doc.png"
        # Stub file: only the PNG signature is written. Per the original note,
        # image_quality is expected to log a warning for it.
        image_file.write_bytes(b"\x89PNG\r\n\x1a\n")
        image_str = str(image_file)

        result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth=three_lines,
            ocr_result=EngineResult(
                engine_name="fake",
                image_path=image_str,
                text=three_lines,
                duration_seconds=0.1,
            ),
            char_exclude=None,
        )

        assert result.line_metrics is not None, "line_metrics doit être peuplé"
        for expected_key in ("percentiles", "gini"):
            assert expected_key in result.line_metrics
        assert result.line_metrics["line_count"] == 3

    def test_hallucination_metrics_populated_on_success(self, tmp_path: Path):
        """An OCR text diverging from the ground truth yields hallucination metrics."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")
        image_str = str(image_file)

        reference = "le chat est sur le tapis rouge et dort paisiblement"
        hypothesis = "le chat mange des bananes spatiales en orbite lunaire"

        engine_output = EngineResult(
            engine_name="fake",
            image_path=image_str,
            text=hypothesis,
            duration_seconds=0.1,
        )
        result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth=reference,
            ocr_result=engine_output,
            char_exclude=None,
        )

        assert result.hallucination_metrics is not None
        for expected_key in ("anchor_score", "length_ratio", "is_hallucinating"):
            assert expected_key in result.hallucination_metrics

    def test_new_fields_empty_on_engine_failure(self, tmp_path: Path):
        """An OCR result carrying an error leaves both new fields at ``None``."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")
        image_str = str(image_file)

        failed_output = EngineResult(
            engine_name="fake",
            image_path=image_str,
            text="",
            duration_seconds=0.1,
            error="simulated failure",
        )

        result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth="ground truth text",
            ocr_result=failed_output,
            char_exclude=None,
        )

        assert result.line_metrics is None
        assert result.hallucination_metrics is None


# ---------------------------------------------------------------------------
# 2. Aggregation at the EngineReport level
# ---------------------------------------------------------------------------

class TestAggregationWiring:
    """Check the aggregation helpers and the aggregates produced by a full benchmark."""

    def test_aggregate_line_metrics_helper_with_empty_list(self):
        """``_aggregate_line_metrics`` yields ``None`` for an empty list."""
        aggregated = _aggregate_line_metrics([])
        assert aggregated is None

    def test_aggregate_hallucination_helper_with_empty_list(self):
        """``_aggregate_hallucination`` yields ``None`` for an empty list."""
        aggregated = _aggregate_hallucination([])
        assert aggregated is None

    def test_benchmark_end_to_end_produces_aggregations(self, tmp_path: Path):
        """A two-document benchmark run fills both aggregates on the engine report."""
        image_file = tmp_path / "test.png"
        image_file.write_bytes(b"")

        first_doc = Document(
            doc_id="d1",
            image_path=image_file,
            ground_truth="bonjour le monde\nligne deux\nfin",
        )
        second_doc = Document(
            doc_id="d2",
            image_path=image_file,
            ground_truth="autre document test\navec deux lignes",
        )
        corpus = Corpus(
            name="test",
            documents=[first_doc, second_doc],
            source_path=str(tmp_path),
        )

        # The engine always answers with the same three-line text.
        engine = _FakeEngine(
            output_text="bonjour le monde\nligne deux\nfin",
            name="fake_engine",
        )

        partial_dir = tmp_path / "partial"
        result = run_benchmark(
            corpus=corpus,
            engines=[engine],
            show_progress=False,
            max_workers=1,
            partial_dir=str(partial_dir),
        )

        assert len(result.engine_reports) == 1
        report = result.engine_reports[0]

        assert report.aggregated_line_metrics is not None, (
            "aggregated_line_metrics doit être peuplé après benchmark"
        )
        for expected_key in ("gini_mean", "document_count"):
            assert expected_key in report.aggregated_line_metrics
        assert report.aggregated_line_metrics["document_count"] == 2

        assert report.aggregated_hallucination is not None, (
            "aggregated_hallucination doit être peuplé après benchmark"
        )
        assert "anchor_score_mean" in report.aggregated_hallucination
        assert report.aggregated_hallucination["document_count"] == 2

    def test_json_export_includes_new_aggregations(self, tmp_path: Path):
        """The JSON written by ``run_benchmark`` carries both aggregate keys."""
        image_file = tmp_path / "t.png"
        image_file.write_bytes(b"")

        only_doc = Document(doc_id="d1", image_path=image_file, ground_truth="un\ndeux")
        corpus = Corpus(
            name="test",
            documents=[only_doc],
            source_path=str(tmp_path),
        )
        engine = _FakeEngine(output_text="un\ndeux", name="fake")

        json_path = tmp_path / "bench.json"
        run_benchmark(
            corpus=corpus,
            engines=[engine],
            output_json=json_path,
            show_progress=False,
            max_workers=1,
            partial_dir=str(tmp_path / "partial"),
        )

        with json_path.open(encoding="utf-8") as handle:
            exported = json.load(handle)

        first_report = exported["engine_reports"][0]
        for expected_key in ("aggregated_line_metrics", "aggregated_hallucination"):
            assert expected_key in first_report


# ---------------------------------------------------------------------------
# 3. Fact model and DetectorRegistry
# ---------------------------------------------------------------------------

class TestFactModel:
    """Serialization and ordering checks for ``Fact`` and ``FactImportance``."""

    def test_fact_is_serializable(self):
        """``Fact.as_dict`` returns the expected plain values for each field."""
        serialized = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={"engine": "tesseract", "cer": 0.042},
            engines_involved=("tesseract",),
        ).as_dict()

        assert serialized["type"] == "global_leader_cer"
        assert serialized["importance"] == 100
        assert serialized["payload"]["cer"] == 0.042
        assert serialized["engines_involved"] == ["tesseract"]

    def test_fact_importance_ordering(self):
        """Importance levels compare as CRITICAL > HIGH > MEDIUM > LOW."""
        assert (
            FactImportance.CRITICAL
            > FactImportance.HIGH
            > FactImportance.MEDIUM
            > FactImportance.LOW
        )


class TestDetectorRegistry:
    """Behavior checks for ``DetectorRegistry`` and the default ``detect_all``."""

    def test_registry_starts_empty(self):
        """A fresh registry has no registered types and produces no facts."""
        fresh = DetectorRegistry()
        assert fresh.registered_types() == ()
        assert fresh.run({}) == []

    def test_register_and_run(self):
        """A registered detector is listed and receives the data passed to ``run``."""

        def dummy_detector(data: dict) -> list[Fact]:
            leader = data.get("leader", "unknown")
            fact = Fact(
                type=FactType.GLOBAL_LEADER_CER,
                importance=FactImportance.CRITICAL,
                payload={"engine": leader},
            )
            return [fact]

        registry = DetectorRegistry()
        registry.register(FactType.GLOBAL_LEADER_CER, dummy_detector)
        assert FactType.GLOBAL_LEADER_CER in registry.registered_types()

        produced = registry.run({"leader": "tesseract"})
        assert len(produced) == 1
        assert produced[0].payload["engine"] == "tesseract"

    def test_registry_swallows_detector_exceptions(self):
        """A failing detector must not break the narrative pipeline."""

        def broken_detector(data: dict) -> list[Fact]:
            raise RuntimeError("boom")

        def working_detector(data: dict) -> list[Fact]:
            fact = Fact(
                type=FactType.SPEED_WINNER,
                importance=FactImportance.HIGH,
                payload={},
            )
            return [fact]

        registry = DetectorRegistry()
        # The failing detector is registered first, then the healthy one.
        registry.register(FactType.GLOBAL_LEADER_CER, broken_detector)
        registry.register(FactType.SPEED_WINNER, working_detector)

        produced = registry.run({})
        assert len(produced) == 1
        assert produced[0].type == FactType.SPEED_WINNER

    def test_default_registry_is_empty_in_sprint_1(self):
        """Sprint 1 = foundations only. No detector is enabled by default —
        they will be at Sprint 4, together with their templates."""
        produced = detect_all({})
        assert produced == []