File size: 3,238 Bytes
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d00572
 
52412a3
0d00572
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Sprint A14-S4 — ``DocumentRef`` et ``GroundTruthRef`` multi-niveaux."""

from __future__ import annotations

import pytest

from picarones.domain import (
    ArtifactType,
    CorpusSpecError,
    DocumentRef,
    GroundTruthRef,
)


class TestDocumentRefBasics:
    """Construction and identifier validation of ``DocumentRef``."""

    def test_minimal_document(self) -> None:
        """A document built from an id alone has no image and no ground truth."""
        doc = DocumentRef(id="folio_001")
        assert doc.id == "folio_001"
        assert doc.image_uri is None
        assert doc.ground_truths == ()

    def test_document_with_image_and_text_gt(self) -> None:
        """An image URI and a single raw-text ground truth are both retained."""
        image_path = "/corpus/folio_001.png"
        text_gt = GroundTruthRef(
            type=ArtifactType.RAW_TEXT,
            uri="/corpus/folio_001.gt.txt",
        )
        doc = DocumentRef(
            id="folio_001",
            image_uri=image_path,
            ground_truths=(text_gt,),
        )
        assert doc.image_uri == image_path
        assert len(doc.ground_truths) == 1

    def test_id_validation_rejects_spaces(self) -> None:
        """An id containing a space raises ``CorpusSpecError``."""
        expect_invalid_id = pytest.raises(
            CorpusSpecError, match="document id invalide"
        )
        with expect_invalid_id:
            DocumentRef(id="bad id")


class TestMultiLevelGT:
    """Documents carrying several ground truths, one per artifact type."""

    def test_multi_level_gt(self) -> None:
        """Three distinct GT levels are kept and reported in declaration order."""
        levels = (
            (ArtifactType.RAW_TEXT, "/x.gt.txt"),
            (ArtifactType.ALTO_XML, "/x.gt.alto.xml"),
            (ArtifactType.READING_ORDER, "/x.ro.json"),
        )
        doc = DocumentRef(
            id="folio_001",
            ground_truths=tuple(
                GroundTruthRef(type=kind, uri=uri) for kind, uri in levels
            ),
        )
        assert len(doc.ground_truths) == 3
        # Tuple equality is order-sensitive, so this also pins the ordering.
        assert doc.available_gt_types == tuple(kind for kind, _ in levels)

    def test_gt_for_returns_matching_level(self) -> None:
        """``gt_for`` returns the ground truth whose type was requested."""
        text_ref = GroundTruthRef(type=ArtifactType.RAW_TEXT, uri="/x.txt")
        alto_ref = GroundTruthRef(type=ArtifactType.ALTO_XML, uri="/x.xml")
        doc = DocumentRef(id="x", ground_truths=(text_ref, alto_ref))

        found = doc.gt_for(ArtifactType.ALTO_XML)
        assert found is not None
        assert found.uri == "/x.xml"

    def test_gt_for_returns_none_when_absent(self) -> None:
        """``gt_for`` returns ``None`` when the document has no such level."""
        bare_doc = DocumentRef(id="x")
        lookup = bare_doc.gt_for(ArtifactType.RAW_TEXT)
        assert lookup is None

    def test_duplicate_gt_type_rejected(self) -> None:
        """Two ground truths of the same type raise ``CorpusSpecError``."""
        with pytest.raises(CorpusSpecError, match="GT dupliquée"):
            clashing = tuple(
                GroundTruthRef(type=ArtifactType.RAW_TEXT, uri=uri)
                for uri in ("/a.txt", "/b.txt")
            )
            DocumentRef(id="x", ground_truths=clashing)


class TestDocumentRefImmutability:
    """Attribute assignment is rejected and JSON serialisation round-trips."""

    def test_frozen_blocks_mutation(self) -> None:
        """Assigning to a field of an existing instance raises ``ValidationError``."""
        import pydantic

        doc = DocumentRef(id="x")
        with pytest.raises(pydantic.ValidationError):
            doc.id = "y"  # type: ignore[misc]

    def test_json_roundtrip(self) -> None:
        """Dumping to JSON and validating it back yields an equal document."""
        alto_ref = GroundTruthRef(type=ArtifactType.ALTO_XML, uri="/x.xml")
        original = DocumentRef(
            id="vol_a/folio_001",
            image_uri="/c/folio_001.png",
            ground_truths=(alto_ref,),
        )
        restored = DocumentRef.model_validate_json(original.model_dump_json())
        assert original == restored