File size: 18,208 Bytes
74646e0
 
 
 
 
 
de9192c
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff7895c
de9192c
74646e0
 
ff7895c
74646e0
 
 
 
 
5112943
74646e0
 
 
ff7895c
74646e0
 
 
ff7895c
74646e0
 
 
 
 
ff7895c
74646e0
 
ff7895c
 
74646e0
ff7895c
 
 
 
 
 
 
 
 
 
 
 
 
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
ff7895c
 
74646e0
 
 
 
 
ff7895c
 
74646e0
ff7895c
74646e0
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
ff7895c
 
74646e0
 
 
 
ff7895c
74646e0
ff7895c
74646e0
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5112943
74646e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
"""Sprint D.2.c-f โ€” features additionnelles dans
``run_benchmark_via_service``.

Couvre les paramètres legacy auparavant ignorés :

- D.2.c (``output_json``) : déjà actif depuis D.1.d, couvert par
  les tests bout-en-bout du ``benchmark_runner``.
- D.2.d (``over_normalization``) : pour les pipelines OCR+LLM avec
  étape OCR amont, ``DocumentResult.pipeline_metadata`` porte
  désormais une clé ``over_normalization``.
- D.2.e (``entity_extractor``) : pour les documents avec une GT
  ``ENTITIES``, les métriques NER sont calculées + attachées.
- D.2.f (``profile``) : un profil inconnu lève ``PicaronesError``
  au démarrage du bench.
"""

from __future__ import annotations

from pathlib import Path

import pytest

from picarones.adapters.llm.base import BaseLLMAdapter
from picarones.adapters.ocr.base import BaseOCRAdapter
from picarones.app.services.benchmark_runner import (
    _aggregate_ner_metrics,
)
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.evaluation.corpus import (
    Corpus,
    Document,
    EntitiesGT,
)
from tests._migration_helpers import run_via_orchestrator


# ──────────────────────────────────────────────────────────────────────
# Mocks (canonical)
# ──────────────────────────────────────────────────────────────────────


class _MockOCR(BaseOCRAdapter):
    """Canned OCR adapter that emits the same transcription for every document.

    ``execute`` writes the configured text to a file inside the context's
    workspace and returns it wrapped as a ``RAW_TEXT`` artifact.
    """

    def __init__(self, name: str = "mock_ocr", text: str = "ocr") -> None:
        self._text = text
        self._name = name

    @property
    def name(self) -> str:
        """Adapter name supplied at construction time."""
        return self._name

    def execute(self, inputs, params, context):
        """Persist the canned text and return its ``RAW_TEXT`` artifact.

        ``inputs`` and ``params`` are accepted but not used. The output file
        is ``<workspace_uri>/<document_id>_mock.txt``; the workspace
        directory is created when missing.
        """
        workspace = Path(context.workspace_uri)
        workspace.mkdir(parents=True, exist_ok=True)

        doc_id = context.document_id
        transcript = workspace / f"{doc_id}_mock.txt"
        transcript.write_text(self._text, encoding="utf-8")

        artifact = Artifact(
            id=f"{doc_id}:{self._name}:raw_text",
            document_id=doc_id,
            type=ArtifactType.RAW_TEXT,
            produced_by_step="ocr",
            uri=str(transcript),
        )
        return {ArtifactType.RAW_TEXT: artifact}


class _MockLLM(BaseLLMAdapter):
    """Canned LLM adapter whose ``_call`` always answers with a fixed text."""

    def __init__(self, model: str = "mock-1", text: str = "corrected") -> None:
        super().__init__(model=model, config={})
        # Reply handed back verbatim by every ``_call``.
        self._text = text

    @property
    def default_model(self) -> str:
        """Model identifier reported as this adapter's default."""
        return "mock-1"

    @property
    def name(self) -> str:
        """Fixed adapter name."""
        return "mock_llm"

    def _call(self, prompt, image_b64=None):
        """Ignore the prompt and the optional image; return the canned text."""
        canned_reply = self._text
        return canned_reply


def _make_simple_corpus(tmp_path: Path, n: int = 1) -> Corpus:
    """Create a ``cdef_test`` corpus of ``n`` one-byte placeholder images.

    Document ``k`` is stored as ``doc<k>.png`` under ``tmp_path``, has the
    id ``doc<k>`` and the ground truth ``"texte <k>"``.
    """

    def _build(index: int) -> Document:
        # Write the placeholder image first, then wrap it in a Document.
        image = tmp_path / f"doc{index}.png"
        image.write_bytes(b"x")
        return Document(
            image_path=image,
            ground_truth=f"texte {index}",
            doc_id=f"doc{index}",
        )

    return Corpus(
        name="cdef_test",
        documents=[_build(index) for index in range(n)],
    )


# ──────────────────────────────────────────────────────────────────────
# D.2.f — profile validation
# ──────────────────────────────────────────────────────────────────────


class TestProfileValidation:
    """Sprint D.2.f — ``profile`` is validated when the benchmark starts."""

    def test_unknown_profile_raises(self, tmp_path: Path) -> None:
        """An unrecognised profile name is rejected with ``ValueError``."""
        corpus = _make_simple_corpus(tmp_path)
        ocr = _MockOCR()

        with pytest.raises(ValueError, match="profil"):
            run_via_orchestrator(
                corpus, [ocr], profile="not_a_real_profile",
            )

    def test_standard_profile_accepted(self, tmp_path: Path) -> None:
        """The ``standard`` profile passes validation and yields reports."""
        corpus = _make_simple_corpus(tmp_path)
        ocr = _MockOCR()
        bm = run_via_orchestrator(corpus, [ocr], profile="standard")
        assert bm.engine_reports

    def test_default_profile_is_standard(self, tmp_path: Path) -> None:
        """No ``profile`` kwarg: the default must pass validation."""
        corpus = _make_simple_corpus(tmp_path)
        ocr = _MockOCR()
        bm = run_via_orchestrator(corpus, [ocr])
        assert bm.engine_reports

    def test_validation_happens_before_bench(self, tmp_path: Path) -> None:
        """An invalid profile raises BEFORE any OCR execution (otherwise
        compute time is wasted on a misspelled name)."""
        corpus = _make_simple_corpus(tmp_path)

        call_counter = {"n": 0}

        class _CountingOCR(_MockOCR):
            # Hook ``execute`` — the method ``_MockOCR`` actually
            # implements — so the counter really moves if the adapter is
            # run.  ``_MockOCR.execute`` never calls ``_run_ocr``, so
            # counting there would leave the assertion below unable to fail.
            def execute(self, inputs, params, context):
                call_counter["n"] += 1
                return super().execute(inputs, params, context)

        ocr = _CountingOCR()
        with pytest.raises(ValueError):
            run_via_orchestrator(
                corpus, [ocr], profile="oops",
            )
        # The OCR adapter was never executed.
        assert call_counter["n"] == 0


# ──────────────────────────────────────────────────────────────────────
# D.2.d — over_normalization
# ──────────────────────────────────────────────────────────────────────


class TestOverNormalization:
    """Sprint D.2.d — OCR+LLM pipelines with an upstream OCR step carry an
    ``over_normalization`` key in ``pipeline_metadata``."""

    @staticmethod
    def _first_result(benchmark):
        """Return the first document result of the first engine report."""
        return benchmark.engine_reports[0].document_results[0]

    def test_ocr_only_has_no_over_normalization(self, tmp_path: Path) -> None:
        """A bare OCR engine (no pipeline) has no ``over_normalization``
        because no LLM is involved."""
        engine = _MockOCR(text="texte 0")
        benchmark = run_via_orchestrator(
            _make_simple_corpus(tmp_path), [engine],
        )

        metadata = self._first_result(benchmark).pipeline_metadata
        assert "over_normalization" not in metadata

    def test_pipeline_text_only_computes_over_normalization(
        self, tmp_path: Path,
    ) -> None:
        """OCR+LLM pipeline in ``text_only`` mode: the LLM receives the OCR
        text and corrects it.  ``over_normalization`` must show up in
        ``pipeline_metadata``."""
        from picarones.pipeline.llm_pipeline_config import (
            OCRLLMPipelineConfig,
        )

        samples = _make_simple_corpus(tmp_path)
        pipeline = OCRLLMPipelineConfig(
            # Upstream OCR output contains one error ...
            ocr_adapter=_MockOCR(name="upstream_ocr", text="texto 0"),
            # ... which the LLM reply corrects.
            llm_adapter=_MockLLM(model="m1", text="texte 0"),
            mode="text_only",
        )

        benchmark = run_via_orchestrator(samples, [pipeline])
        result = self._first_result(benchmark)

        assert result.pipeline_metadata.get("is_pipeline") is True
        assert "over_normalization" in result.pipeline_metadata
        # The payload is a dict (via OverNormalizationResult.as_dict()).
        payload = result.pipeline_metadata["over_normalization"]
        assert isinstance(payload, dict)

    def test_pipeline_zero_shot_has_no_over_normalization(
        self, tmp_path: Path,
    ) -> None:
        """Zero-shot pipeline: the VLM receives the image directly, there is
        no upstream OCR, hence no ``ocr_intermediate`` and no
        ``over_normalization``."""
        from picarones.pipeline.llm_pipeline_config import (
            OCRLLMPipelineConfig,
        )

        samples = _make_simple_corpus(tmp_path)
        pipeline = OCRLLMPipelineConfig(
            llm_adapter=_MockLLM(model="vlm-1", text="texte 0"),
            mode="zero_shot",
        )

        benchmark = run_via_orchestrator(samples, [pipeline])
        metadata = self._first_result(benchmark).pipeline_metadata
        # A pipeline, but without upstream OCR -> no over_normalization.
        assert "over_normalization" not in metadata


# ──────────────────────────────────────────────────────────────────────
# D.2.e — NER attach via entity_extractor
# ──────────────────────────────────────────────────────────────────────


class TestNERAttach:
    """Sprint D.2.e — when ``entity_extractor`` is supplied, documents with
    an ``ENTITIES`` ground truth receive ``ner_metrics`` and the engine
    report gets an ``aggregated_ner``."""

    def _make_corpus_with_entities(
        self, tmp_path: Path, n: int = 2,
    ) -> Corpus:
        """Build ``n`` documents reading ``"Jean <i> habite Paris"``, each
        with one PER and one LOC entity in its ``ENTITIES`` ground truth.

        Offsets are derived from the text itself (start inclusive, end
        exclusive — the convention the extractor stubs in this class use),
        so they always line up with ``ground_truth`` whatever the number of
        digits in ``i``.
        """
        place = "Paris"
        docs = []
        for i in range(n):
            img = tmp_path / f"d{i}.png"
            img.write_bytes(b"x")

            person = f"Jean {i}"
            text = f"{person} habite {place}"
            doc = Document(
                image_path=img,
                ground_truth=text,
                doc_id=f"d{i}",
            )

            place_start = text.index(place)
            doc.ground_truths[ArtifactType.ENTITIES] = EntitiesGT(
                entities=[
                    {"label": "PER", "start": 0, "end": len(person),
                     "text": person},
                    {"label": "LOC", "start": place_start,
                     "end": place_start + len(place), "text": place},
                ],
            )
            docs.append(doc)
        return Corpus(name="ner_test", documents=docs)

    def test_no_extractor_no_ner_metrics(self, tmp_path: Path) -> None:
        """Without an extractor nothing NER-related is attached."""
        corpus = self._make_corpus_with_entities(tmp_path)
        ocr = _MockOCR(text="Jean 0 habite Paris")

        bm = run_via_orchestrator(corpus, [ocr])
        report = bm.engine_reports[0]
        for dr in report.document_results:
            assert dr.ner_metrics is None
        assert report.aggregated_ner is None

    def test_extractor_attaches_metrics_to_doc(self, tmp_path: Path) -> None:
        """When the extractor returns entities for the hypothesis,
        ``ner_metrics`` appears on the DocumentResult."""
        corpus = self._make_corpus_with_entities(tmp_path)
        ocr = _MockOCR(text="Jean 0 habite Paris")  # perfect match for d0

        def extractor(text: str) -> list[dict]:
            # Reproduce the ground-truth entities on the hypothesis.
            ents = []
            if "Jean 0" in text:
                ents.append({"label": "PER", "start": 0, "end": 6,
                             "text": "Jean 0"})
            if "Paris" in text:
                idx = text.find("Paris")
                ents.append({"label": "LOC", "start": idx,
                             "end": idx + 5, "text": "Paris"})
            return ents

        bm = run_via_orchestrator(
            corpus, [ocr], entity_extractor=extractor,
        )

        report = bm.engine_reports[0]
        d0 = next(d for d in report.document_results if d.doc_id == "d0")
        assert d0.ner_metrics is not None
        # The entities match -> tp > 0.
        assert d0.ner_metrics["true_positives"] > 0

    def test_aggregated_ner_present_when_any_doc_evaluated(
        self, tmp_path: Path,
    ) -> None:
        """The engine report exposes global precision once NER was scored."""
        corpus = self._make_corpus_with_entities(tmp_path)
        ocr = _MockOCR(text="Jean 0 habite Paris")

        def extractor(text: str) -> list[dict]:
            return [{"label": "PER", "start": 0, "end": 6, "text": "Jean 0"}]

        bm = run_via_orchestrator(
            corpus, [ocr], entity_extractor=extractor,
        )

        report = bm.engine_reports[0]
        assert report.aggregated_ner is not None
        assert "global" in report.aggregated_ner
        assert "precision" in report.aggregated_ner["global"]

    def test_doc_without_entities_gt_skipped(self, tmp_path: Path) -> None:
        """A document without an ``ENTITIES`` ground truth is not scored for
        NER — ``ner_metrics`` stays ``None`` even when an extractor is
        supplied."""
        # Mixed corpus: one document with ENTITIES, one without.
        img1 = tmp_path / "d1.png"
        img1.write_bytes(b"x")
        doc_with = Document(
            image_path=img1, ground_truth="Jean", doc_id="with_ent",
        )
        doc_with.ground_truths[ArtifactType.ENTITIES] = EntitiesGT(
            entities=[{"label": "PER", "start": 0, "end": 4, "text": "Jean"}],
        )

        img2 = tmp_path / "d2.png"
        img2.write_bytes(b"x")
        doc_without = Document(
            image_path=img2, ground_truth="rien", doc_id="without_ent",
        )

        corpus = Corpus(
            name="mixed", documents=[doc_with, doc_without],
        )
        ocr = _MockOCR(text="Jean")

        def extractor(text: str) -> list[dict]:
            return [{"label": "PER", "start": 0, "end": 4, "text": "Jean"}]

        bm = run_via_orchestrator(
            corpus, [ocr], entity_extractor=extractor,
        )

        report = bm.engine_reports[0]
        d_with = next(
            d for d in report.document_results if d.doc_id == "with_ent"
        )
        d_without = next(
            d for d in report.document_results if d.doc_id == "without_ent"
        )

        assert d_with.ner_metrics is not None
        assert d_without.ner_metrics is None

    def test_extractor_exception_does_not_crash_bench(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture,
    ) -> None:
        """An extractor that raises must not abort the benchmark."""
        corpus = self._make_corpus_with_entities(tmp_path, n=1)
        ocr = _MockOCR(text="Jean 0 habite Paris")

        def buggy_extractor(text: str) -> list[dict]:
            raise RuntimeError("NER backend down")

        # Logs are captured at WARNING level; their content is not asserted.
        with caplog.at_level("WARNING"):
            bm = run_via_orchestrator(
                corpus, [ocr], entity_extractor=buggy_extractor,
            )

        report = bm.engine_reports[0]
        # The benchmark completed — no exception propagated.
        assert len(report.document_results) == 1
        # ner_metrics was not attached because the extractor crashed.
        assert report.document_results[0].ner_metrics is None


# ──────────────────────────────────────────────────────────────────────
# D.2.e — NER aggregation (internal helper tested directly)
# ──────────────────────────────────────────────────────────────────────


class TestAggregateNERMetrics:
    """Unit tests for ``_aggregate_ner_metrics`` — the functional equivalent
    of the former ``measurements.runner.ner_attach._aggregate_ner``."""

    @staticmethod
    def _result(doc_id: str, ner_metrics: dict | None = None):
        """Build an empty ``DocumentResult``; set ``ner_metrics`` only when
        a payload is given."""
        from picarones.evaluation.benchmark_result import (
            DocumentResult,
        )
        from picarones.evaluation.metric_result import MetricsResult

        result = DocumentResult(
            doc_id=doc_id, image_path="", ground_truth="",
            hypothesis="", metrics=MetricsResult(), duration_seconds=0,
        )
        if ner_metrics is not None:
            result.ner_metrics = ner_metrics
        return result

    @staticmethod
    def _payload(tp: int, fp: int, fn: int, per_category=None) -> dict:
        """Assemble a per-document NER metrics dict from raw counts."""
        return {
            "true_positives": tp,
            "false_positives": fp,
            "false_negatives": fn,
            "per_category": {} if per_category is None else per_category,
            "hallucinated_entities": [],
            "missed_entities": [],
        }

    def test_empty_returns_none(self) -> None:
        # No document carries ner_metrics.
        unscored = [self._result("d")]
        assert _aggregate_ner_metrics(unscored) is None

    def test_aggregates_global_prf(self) -> None:
        first = self._result("d1", self._payload(5, 1, 2))
        second = self._result("d2", self._payload(3, 0, 1))

        agg = _aggregate_ner_metrics([first, second])

        assert agg is not None
        # tp=8, fp=1, fn=3 -> P=8/9, R=8/11, F1=2*P*R/(P+R)
        overall = agg["global"]
        assert overall["precision"] == pytest.approx(8 / 9, abs=1e-4)
        assert overall["recall"] == pytest.approx(8 / 11, abs=1e-4)
        assert agg["n_documents"] == 2

    def test_per_category_aggregation(self) -> None:
        categories = {
            "PER": {
                "support": 3, "recall": 1.0, "precision": 1.0,
                "f1": 1.0,
            },
            "LOC": {
                "support": 2, "recall": 0.5, "precision": 0.5,
                "f1": 0.5,
            },
        }
        scored = self._result("d", self._payload(4, 1, 1, categories))

        agg = _aggregate_ner_metrics([scored])

        for label in ("PER", "LOC"):
            assert label in agg["per_category"]
        # PER: 3/3 -> P=R=F1=1.0
        assert agg["per_category"]["PER"]["recall"] == pytest.approx(1.0)