File size: 14,229 Bytes
7c4ecda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
7c4ecda
 
 
979f3c3
7c4ecda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
"""Sprint 65 tests — comparing N pipelines on a corpus.

Covers:

1. ``compare_pipelines``:
   - 1 pipeline → equivalent to ``run_pipeline_benchmark`` but
     wrapped in a ``PipelineComparisonResult``
   - 2+ pipelines → results indexed by name, in insertion
     order
   - Duplicate names → explicit ``ValueError``
   - Per-pipeline ``factories`` are honoured
   - Empty corpus → consistent empty results
2. ``ranking_by_final_metric``:
   - Ascending sort for CER-like metrics (default)
   - Descending sort when ``higher_is_better=True``
   - Pipelines without the metric → at the tail, order preserved
3. ``gain_table``:
   - Unknown ``baseline_pipeline`` → ``KeyError``
   - The baseline itself: absolute=0, relative=0
   - ``relative`` is ``None`` when baseline = 0
   - ``absolute`` and ``relative`` are ``None`` when the value is missing
4. Realistic case: faulty OCR vs OCR+corrector → the corrector
   wins in both the ranking and the gain_table.
5. Unchanged philosophy: all modules are **mocks**
   defined in the test.
"""

from __future__ import annotations

from typing import Any

import pytest

from picarones.core.corpus import Corpus, Document, GTLevel, TextGT
from picarones.core.modules import ArtifactType, BaseModule
from picarones.measurements.pipeline_comparison import (
    PipelineComparisonResult,
    compare_pipelines,
)
from picarones.core.pipeline import PipelineSpec, PipelineStep


# ──────────────────────────────────────────────────────────────────────────
# Mocks
# ──────────────────────────────────────────────────────────────────────────


class MockOCR(BaseModule):
    """Mock OCR module: maps an IMAGE artifact to a TEXT artifact.

    The text is produced by the callable supplied at construction time,
    which receives whatever is stored under ``ArtifactType.IMAGE``.
    """

    input_types = (ArtifactType.IMAGE,)
    output_types = (ArtifactType.TEXT,)
    execution_mode: Any = "io"

    def __init__(self, fn) -> None:
        # Callable invoked with the IMAGE input; its result becomes the TEXT.
        self._fn = fn

    @property
    def name(self) -> str:
        """Fixed name reported by this mock."""
        return "mock-ocr"

    def process(self, inputs):
        """Apply the stored callable to the IMAGE input and wrap it as TEXT."""
        image = inputs[ArtifactType.IMAGE]
        recognized = self._fn(image)
        return {ArtifactType.TEXT: recognized}


class TextFixer(BaseModule):
    """Mock rewriter that applies a dict of replacements to the text."""

    input_types = (ArtifactType.TEXT,)
    output_types = (ArtifactType.TEXT,)
    execution_mode: Any = "cpu"

    def __init__(self, replacements: dict[str, str]) -> None:
        # Mapping of substring to find -> substring to put in its place.
        self._replacements = replacements

    @property
    def name(self) -> str:
        """Fixed name reported by this mock."""
        return "fixer"

    def process(self, inputs):
        """Return the TEXT input with every configured replacement applied."""
        fixed = inputs[ArtifactType.TEXT]
        # Replacements are chained in dict order: each one operates on the
        # output of the previous one.
        for needle in self._replacements:
            fixed = fixed.replace(needle, self._replacements[needle])
        return {ArtifactType.TEXT: fixed}


def _make_corpus(n: int = 2, name: str = "demo") -> Corpus:
    """Build a corpus of ``n`` synthetic documents named ``d0`` .. ``d{n-1}``.

    Document ``i`` points at ``/tmp/d{i}.png`` and carries the ground truth
    ``"texte {i}"``, both as the plain ``ground_truth`` and as the TEXT-level
    entry of ``ground_truths``.
    """

    def _document(index: int) -> Document:
        # One synthetic document; the same text feeds both GT fields.
        expected_text = f"texte {index}"
        return Document(
            image_path=f"/tmp/d{index}.png",
            ground_truth=expected_text,
            doc_id=f"d{index}",
            ground_truths={GTLevel.TEXT: TextGT(text=expected_text)},
        )

    return Corpus(name=name, documents=[_document(i) for i in range(n)])


def _ocr_perfect(path: str) -> str:
    """Fake OCR returning ``"texte <idx>"`` for an image path.

    ``<idx>`` is what remains of ``path`` once every ``"/tmp/d"`` and then
    every ``".png"`` occurrence has been removed.
    """
    without_prefix = path.replace("/tmp/d", "")
    doc_index = without_prefix.replace(".png", "")
    return f"texte {doc_index}"


def _ocr_with_typo(path: str) -> str:
    """Fake OCR returning the misspelled ``"txete <idx>"`` for an image path.

    ``<idx>`` is what remains of ``path`` once every ``"/tmp/d"`` and then
    every ``".png"`` occurrence has been removed.
    """
    without_prefix = path.replace("/tmp/d", "")
    doc_index = without_prefix.replace(".png", "")
    return f"txete {doc_index}"


# ──────────────────────────────────────────────────────────────────────────
# 1. compare_pipelines — nominal paths
# ──────────────────────────────────────────────────────────────────────────


class TestCompareBasic:
    """Nominal behaviour of ``compare_pipelines``."""

    def test_single_pipeline(self) -> None:
        """One spec yields a result keyed by that spec's name."""
        two_docs = _make_corpus(2)
        only_spec = PipelineSpec(
            name="ocr_only",
            steps=[PipelineStep("ocr", MockOCR(_ocr_perfect))],
        )
        outcome = compare_pipelines([only_spec], two_docs)
        assert outcome.corpus_name == "demo"
        assert outcome.n_docs == 2
        assert outcome.pipeline_names() == ["ocr_only"]
        assert "ocr_only" in outcome.per_pipeline

    def test_multiple_pipelines_preserved_order(self) -> None:
        """Pipeline names come back in the order the specs were given."""
        one_doc = _make_corpus(1)
        expected_names = ["alpha", "beta", "gamma"]
        specs = [
            PipelineSpec(label, [PipelineStep("ocr", MockOCR(_ocr_perfect))])
            for label in expected_names
        ]
        outcome = compare_pipelines(specs, one_doc)
        assert outcome.pipeline_names() == expected_names

    def test_duplicate_names_raises(self) -> None:
        """Two specs sharing a name are rejected with a ``ValueError``."""
        one_doc = _make_corpus(1)
        clashing_specs = [
            PipelineSpec("dup", [PipelineStep("ocr", MockOCR(_ocr_perfect))])
            for _ in range(2)
        ]
        with pytest.raises(ValueError, match="non uniques"):
            compare_pipelines(clashing_specs, one_doc)

    def test_empty_corpus(self) -> None:
        """An empty corpus still produces an entry for the pipeline."""
        no_docs = Corpus(name="empty", documents=[])
        only_spec = PipelineSpec(
            name="ocr",
            steps=[PipelineStep("ocr", MockOCR(_ocr_perfect))],
        )
        outcome = compare_pipelines([only_spec], no_docs)
        assert outcome.n_docs == 0
        assert "ocr" in outcome.per_pipeline


# ──────────────────────────────────────────────────────────────────────────
# 2. ranking_by_final_metric
# ──────────────────────────────────────────────────────────────────────────


class TestRanking:
    """Checks for ``PipelineComparisonResult.ranking_by_final_metric``."""

    def test_lower_is_better_default(self) -> None:
        """By default the pipeline with the lowest CER is ranked first."""
        corpus = _make_corpus(2)
        specs = [
            # Perfect OCR -> CER = 0
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            # Faulty OCR -> CER > 0
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        ranked = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer",
        )
        # The perfect pipeline comes first (CER 0 < typo CER > 0).
        assert ranked[0][0] == "perfect"
        assert ranked[0][1] == 0.0
        assert ranked[1][0] == "typo"
        assert ranked[1][1] > 0.0

    def test_higher_is_better(self) -> None:
        """``higher_is_better=True`` reverses the ranking order."""
        corpus = _make_corpus(1)
        specs = [
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        # The same "cer" metric is ranked twice, flipping only
        # ``higher_is_better``, to check that the sort direction is inverted.
        ranked_lower = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer", higher_is_better=False,
        )
        ranked_higher = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer", higher_is_better=True,
        )
        # Precondition: the two pipelines must score differently, otherwise
        # a reversal could not be observed and the test would be vacuous.
        assert ranked_lower[0][1] != ranked_lower[1][1]
        assert [entry[0] for entry in ranked_lower] == ["perfect", "typo"]
        assert [entry[0] for entry in ranked_higher] == ["typo", "perfect"]

    def test_pipelines_without_metric_in_queue(self) -> None:
        """A pipeline with no metric value is ranked last with ``None``."""
        # A pipeline that never produces TEXT (its module raises on every
        # document) has no metric and must end up at the tail.
        corpus = _make_corpus(1)

        class AlwaysFails(BaseModule):
            """Mock module whose ``process`` always raises."""

            input_types = (ArtifactType.IMAGE,)
            output_types = (ArtifactType.TEXT,)
            execution_mode: Any = "io"

            @property
            def name(self) -> str:
                return "fail"

            def process(self, inputs):
                raise RuntimeError("boom")

        specs = [
            PipelineSpec("ok", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("ko", [
                PipelineStep("ocr", AlwaysFails()),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        ranked = result.ranking_by_final_metric(
            ArtifactType.TEXT, "cer",
        )
        # "ok" leads, "ko" trails with a None value.
        assert ranked[0][0] == "ok"
        assert ranked[0][1] == 0.0
        assert ranked[-1][0] == "ko"
        assert ranked[-1][1] is None


# ──────────────────────────────────────────────────────────────────────────
# 3. gain_table
# ──────────────────────────────────────────────────────────────────────────


class TestGainTable:
    """Checks for ``PipelineComparisonResult.gain_table``."""

    def test_baseline_unknown_raises(self) -> None:
        """An unknown baseline name is reported with a ``KeyError``."""
        corpus = _make_corpus(1)
        spec = PipelineSpec("a", [PipelineStep("ocr", MockOCR(_ocr_perfect))])
        result = compare_pipelines([spec], corpus)
        with pytest.raises(KeyError, match="baseline"):
            result.gain_table(
                ArtifactType.TEXT, "cer", baseline_pipeline="inconnue",
            )

    def test_baseline_self_zero_gain(self) -> None:
        """The baseline compared with itself has a zero absolute gain."""
        corpus = _make_corpus(1)
        spec = PipelineSpec("a", [PipelineStep("ocr", MockOCR(_ocr_perfect))])
        result = compare_pipelines([spec], corpus)
        gains = result.gain_table(ArtifactType.TEXT, "cer", "a")
        assert gains["a"]["absolute"] == 0.0
        # The single pipeline is its own baseline and its CER is 0, so the
        # relative gain is None (baseline = 0).
        assert gains["a"]["relative"] is None

    def test_relative_none_when_baseline_zero(self) -> None:
        """With a zero baseline, ``relative`` is None but ``absolute`` is set."""
        corpus = _make_corpus(1)
        specs = [
            PipelineSpec("perfect", [
                PipelineStep("ocr", MockOCR(_ocr_perfect)),
            ]),
            PipelineSpec("typo", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        gains = result.gain_table(ArtifactType.TEXT, "cer", "perfect")
        # baseline = 0 -> relative = None
        assert gains["typo"]["relative"] is None
        assert gains["typo"]["absolute"] is not None
        assert gains["typo"]["absolute"] > 0

    def test_realistic_fixer_outperforms_baseline(self) -> None:
        """OCR followed by a fixer beats the same OCR alone."""
        # The OCR makes a correctable typo ('texte' -> 'txete'); the fixer
        # restores the exact ground truth. The shared ``_ocr_with_typo``
        # helper is reused rather than redefining the same mock locally.
        corpus = _make_corpus(2)
        specs = [
            PipelineSpec("ocr_only", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
            ]),
            PipelineSpec("ocr_with_fixer", [
                PipelineStep("ocr", MockOCR(_ocr_with_typo)),
                PipelineStep("fix", TextFixer({"txete": "texte"})),
            ]),
        ]
        result = compare_pipelines(specs, corpus)
        gains = result.gain_table(
            ArtifactType.TEXT, "cer", "ocr_only",
        )
        # ocr_only: CER > 0; ocr_with_fixer: CER = 0
        assert gains["ocr_only"]["value"] > 0
        assert gains["ocr_with_fixer"]["value"] == 0.0
        # Negative absolute gain (CER goes down -> better).
        assert gains["ocr_with_fixer"]["absolute"] < 0


# ──────────────────────────────────────────────────────────────────────────
# 4. per-pipeline factories
# ──────────────────────────────────────────────────────────────────────────


class TestCustomFactoriesPerPipeline:
    """Each pipeline can be given its own initial-artifact factory."""

    def test_factories_routed_per_pipeline(self) -> None:
        """A factory registered for one pipeline does not break the other."""

        def text_from_ground_truth(doc):
            # Seeds the pipeline with TEXT taken from the document itself.
            return {ArtifactType.TEXT: doc.ground_truth}

        one_doc = _make_corpus(1)
        # "from_image" starts from IMAGE (no custom factory registered);
        # "from_text" starts from TEXT through the custom factory.
        image_spec = PipelineSpec("from_image", [
            PipelineStep("ocr", MockOCR(_ocr_perfect)),
        ])
        text_spec = PipelineSpec("from_text", [
            PipelineStep("fix", TextFixer({"texte": "TEXTE"})),
        ])
        per_pipeline_factories = {"from_text": text_from_ground_truth}
        result = compare_pipelines(
            [image_spec, text_spec], one_doc, per_pipeline_factories,
        )
        # Both pipelines ran without error.
        for pipeline_name in ("from_image", "from_text"):
            assert result.per_pipeline[pipeline_name].n_pipelines_succeeded == 1


# ──────────────────────────────────────────────────────────────────────────
# 5. Direct dataclass construction
# ──────────────────────────────────────────────────────────────────────────


class TestDataclass:
    """Direct construction of ``PipelineComparisonResult``."""

    def test_default(self) -> None:
        """Only ``corpus_name`` given: everything else is empty."""
        bare_result = PipelineComparisonResult(corpus_name="c")
        assert bare_result.pipeline_names() == []
        assert bare_result.per_pipeline == {}
        assert bare_result.n_docs == 0