File size: 14,165 Bytes
f41e382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
"""Tests Sprint 32 β€” GT multi-niveaux (Phase 0.1 du plan d'Γ©volution).

VΓ©rifie :

1. RΓ©trocompatibilitΓ© stricte : un corpus historique (image + .gt.txt
   uniquement) se charge exactement comme avant et expose la mΓͺme API
   (``doc.ground_truth: str``).
2. DΓ©tection automatique des niveaux additionnels : ``.gt.alto.xml``,
   ``.gt.page.xml``, ``.gt.entities.json``, ``.gt.reading_order.json``.
3. Couverture partielle : un corpus mixte oΓΉ seuls certains documents
   ont l'ALTO doit reflΓ©ter cette couverture dans
   ``Corpus.gt_level_coverage()``.
4. Synchronisation TEXT entre champ ``ground_truth`` et
   ``ground_truths[GTLevel.TEXT]`` dans les deux sens.
5. Robustesse : un fichier JSON cassΓ© est dΓ©gradΓ© en warning, le
   document reste chargΓ© avec les niveaux qui ont fonctionnΓ©.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from picarones.core.corpus import (
    AltoGT,
    Document,
    EntitiesGT,
    GT_SUFFIXES,
    GTLevel,
    PageGT,
    ReadingOrderGT,
    TextGT,
    load_corpus_from_directory,
)


# Mini-PNG 1Γ—1 valide rΓ©utilisΓ© dans les tests
_TINY_PNG = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
    b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
    b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
    b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
)


def _write_pair(directory: Path, stem: str, gt_text: str) -> Path:
    """Γ‰crit une paire image + .gt.txt classique."""
    image = directory / f"{stem}.png"
    image.write_bytes(_TINY_PNG)
    (directory / f"{stem}.gt.txt").write_text(gt_text, encoding="utf-8")
    return image


# ──────────────────────────────────────────────────────────────────────────
# 1. RΓ©trocompatibilitΓ© stricte
# ──────────────────────────────────────────────────────────────────────────


class TestBackwardCompat:
    def test_text_only_corpus_loads_unchanged(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc_001", "Première page.")
        _write_pair(tmp_path, "doc_002", "Deuxième page.")

        corpus = load_corpus_from_directory(tmp_path)

        assert len(corpus) == 2
        for doc in corpus:
            # API historique : ground_truth: str
            assert isinstance(doc.ground_truth, str)
            assert doc.ground_truth  # non vide
            # Le niveau TEXT est automatiquement peuplΓ©
            assert doc.has_gt(GTLevel.TEXT)
            assert not doc.has_gt(GTLevel.ALTO)
            assert not doc.has_gt(GTLevel.PAGE)

    def test_document_dataclass_default_is_text_only(self) -> None:
        doc = Document(image_path=Path("/tmp/x.png"), ground_truth="abc")

        assert doc.ground_truth == "abc"
        assert doc.gt_levels == {GTLevel.TEXT}
        text_payload = doc.get_gt(GTLevel.TEXT)
        assert isinstance(text_payload, TextGT)
        assert text_payload.text == "abc"

    def test_document_construction_via_ground_truths_dict(self) -> None:
        """Construction par le nouveau format : le champ str est synchronisΓ©."""
        doc = Document(
            image_path=Path("/tmp/x.png"),
            ground_truths={GTLevel.TEXT: TextGT(text="hello")},
        )
        # Le post-init renseigne ground_truth depuis le dict
        assert doc.ground_truth == "hello"

    def test_no_extra_levels_means_no_change_in_api(self, tmp_path: Path) -> None:
        """Un corpus sans fichier ALTO/PAGE/JSON ne doit jamais lever."""
        _write_pair(tmp_path, "x", "y")
        corpus = load_corpus_from_directory(tmp_path)
        assert corpus.available_gt_levels == {GTLevel.TEXT}


# ──────────────────────────────────────────────────────────────────────────
# 2. DΓ©tection automatique des niveaux additionnels
# ──────────────────────────────────────────────────────────────────────────


_ALTO_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">
  <Layout><Page><PrintSpace>
    <TextBlock ID="block_1"><TextLine ID="line_1">
      <String CONTENT="Bonjour"/>
    </TextLine></TextBlock>
  </PrintSpace></Page></Layout>
</alto>
"""

_PAGE_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
  <Page><TextRegion id="r1"><TextLine id="l1">
    <TextEquiv><Unicode>Salut</Unicode></TextEquiv>
  </TextLine></TextRegion></Page>
</PcGts>
"""


class TestExtraLevelsDetection:
    def test_alto_detected(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc", "Bonjour")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ALTO]}").write_text(_ALTO_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        assert doc.has_gt(GTLevel.ALTO)
        alto = doc.get_gt(GTLevel.ALTO)
        assert isinstance(alto, AltoGT)
        assert "TextBlock" in alto.xml_content
        assert alto.source_path is not None

    def test_page_detected(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc", "Salut")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.PAGE]}").write_text(_PAGE_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        page = doc.get_gt(GTLevel.PAGE)
        assert isinstance(page, PageGT)
        assert "TextRegion" in page.xml_content

    def test_entities_detected_object_form(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc", "Marie de Bourgogne en 1477.")
        entities = {
            "entities": [
                {"label": "PER", "start": 0, "end": 17, "text": "Marie de Bourgogne"},
                {"label": "DATE", "start": 21, "end": 25, "text": "1477"},
            ]
        }
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps(entities), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        ent = doc.get_gt(GTLevel.ENTITIES)
        assert isinstance(ent, EntitiesGT)
        assert len(ent.entities) == 2
        assert ent.entities[0]["label"] == "PER"

    def test_entities_detected_array_form(self, tmp_path: Path) -> None:
        """Le loader accepte aussi un tableau JSON brut."""
        _write_pair(tmp_path, "doc", "Texte.")
        ent_data = [{"label": "MISC", "start": 0, "end": 5, "text": "Texte"}]
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps(ent_data), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)
        ent = corpus.documents[0].get_gt(GTLevel.ENTITIES)
        assert isinstance(ent, EntitiesGT)
        assert ent.entities[0]["label"] == "MISC"

    def test_reading_order_detected(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc", "Multi-colonnes.")
        ro = {"region_order": ["r_main", "r_marginalia", "r_footer"]}
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.READING_ORDER]}").write_text(
            json.dumps(ro), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)
        ro_payload = corpus.documents[0].get_gt(GTLevel.READING_ORDER)
        assert isinstance(ro_payload, ReadingOrderGT)
        assert ro_payload.region_order == ["r_main", "r_marginalia", "r_footer"]

    def test_all_four_extra_levels_simultaneously(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "doc", "Texte complet.")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ALTO]}").write_text(_ALTO_SAMPLE, encoding="utf-8")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.PAGE]}").write_text(_PAGE_SAMPLE, encoding="utf-8")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps([{"label": "X", "start": 0, "end": 1, "text": "T"}]), encoding="utf-8"
        )
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.READING_ORDER]}").write_text(
            json.dumps(["r1"]), encoding="utf-8"
        )

        doc = load_corpus_from_directory(tmp_path).documents[0]
        assert doc.gt_levels == {
            GTLevel.TEXT,
            GTLevel.ALTO,
            GTLevel.PAGE,
            GTLevel.ENTITIES,
            GTLevel.READING_ORDER,
        }


# ──────────────────────────────────────────────────────────────────────────
# 3. Couverture partielle (corpus mixte)
# ──────────────────────────────────────────────────────────────────────────


class TestPartialCoverage:
    def test_partial_alto_coverage(self, tmp_path: Path) -> None:
        """3 documents, seul le premier porte un ALTO."""
        _write_pair(tmp_path, "doc_001", "Premier")
        _write_pair(tmp_path, "doc_002", "Deuxième")
        _write_pair(tmp_path, "doc_003", "Troisième")
        (tmp_path / f"doc_001{GT_SUFFIXES[GTLevel.ALTO]}").write_text(
            _ALTO_SAMPLE, encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)

        coverage = corpus.gt_level_coverage()
        assert coverage[GTLevel.TEXT] == 3
        assert coverage[GTLevel.ALTO] == 1
        # available_gt_levels = union sur tout le corpus
        assert corpus.available_gt_levels == {GTLevel.TEXT, GTLevel.ALTO}
        # Mais seul doc_001 expose ALTO
        doc_001 = next(d for d in corpus if d.doc_id == "doc_001")
        doc_002 = next(d for d in corpus if d.doc_id == "doc_002")
        assert doc_001.has_gt(GTLevel.ALTO)
        assert not doc_002.has_gt(GTLevel.ALTO)

    def test_stats_exposes_coverage(self, tmp_path: Path) -> None:
        _write_pair(tmp_path, "a", "x")
        _write_pair(tmp_path, "b", "y")
        (tmp_path / f"a{GT_SUFFIXES[GTLevel.ALTO]}").write_text(_ALTO_SAMPLE, encoding="utf-8")

        stats = load_corpus_from_directory(tmp_path).stats
        assert stats["gt_level_coverage"]["text"] == 2
        assert stats["gt_level_coverage"]["alto"] == 1


# ──────────────────────────────────────────────────────────────────────────
# 4. Synchronisation bidirectionnelle TEXT
# ──────────────────────────────────────────────────────────────────────────


class TestTextSync:
    def test_str_to_dict_sync(self) -> None:
        doc = Document(image_path=Path("/tmp/x.png"), ground_truth="aaa")
        text_gt = doc.get_gt(GTLevel.TEXT)
        assert isinstance(text_gt, TextGT)
        assert text_gt.text == "aaa"

    def test_dict_to_str_sync(self) -> None:
        doc = Document(
            image_path=Path("/tmp/x.png"),
            ground_truths={GTLevel.TEXT: TextGT(text="bbb")},
        )
        assert doc.ground_truth == "bbb"

    def test_both_provided_keeps_str(self) -> None:
        """Si les deux sont fournis, le champ str est prΓ©servΓ© tel quel β€”
        le dict reste la source pour les autres niveaux."""
        doc = Document(
            image_path=Path("/tmp/x.png"),
            ground_truth="canon",
            ground_truths={GTLevel.TEXT: TextGT(text="autre")},
        )
        # Le champ str fourni explicitement n'est pas Γ©crasΓ©
        assert doc.ground_truth == "canon"


# ──────────────────────────────────────────────────────────────────────────
# 5. Robustesse β€” JSON cassΓ©
# ──────────────────────────────────────────────────────────────────────────


class TestRobustness:
    def test_broken_entities_json_is_warning_not_error(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        _write_pair(tmp_path, "doc", "Texte.")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            "{ ceci n'est pas du JSON", encoding="utf-8"
        )

        with caplog.at_level("WARNING", logger="picarones.core.corpus"):
            corpus = load_corpus_from_directory(tmp_path)

        # Le document reste chargΓ© avec son niveau TEXT
        doc = corpus.documents[0]
        assert doc.has_gt(GTLevel.TEXT)
        assert not doc.has_gt(GTLevel.ENTITIES)
        # Et un warning explicite a été émis (cf. règle CLAUDE.md)
        assert any("entitΓ©s" in rec.message.lower() for rec in caplog.records)

    def test_unexpected_json_format_is_warning(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        _write_pair(tmp_path, "doc", "Texte.")
        # JSON valide mais format inattendu (pas dict avec "entities", pas liste)
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps({"foo": "bar"}), encoding="utf-8"
        )

        with caplog.at_level("WARNING", logger="picarones.core.corpus"):
            corpus = load_corpus_from_directory(tmp_path)

        assert not corpus.documents[0].has_gt(GTLevel.ENTITIES)
        assert any("format" in rec.message.lower() for rec in caplog.records)