File size: 8,129 Bytes
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d00572
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d68969
 
 
 
52412a3
 
 
 
7d68969
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d00572
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d00572
52412a3
 
 
0d00572
52412a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""Sprint A14-S4 — ``Artifact`` et ``ArtifactType``.

Vérifie les invariants des artefacts du nouveau domain : validation
des id, hash, immutabilité, sérialisation JSON déterministe.

Note : pas de test "logique métier" ici — un Artifact ne fait rien,
il décrit.  Les tests qui valident le comportement viendront avec
le pipeline executor (S7) qui produit et consomme des artefacts.
"""

from __future__ import annotations

import hashlib

import pytest
from pydantic import ValidationError

from picarones.domain import (
    Artifact,
    ArtifactType,
    ArtifactValidationError,
    ProvenanceRecord,
    compute_content_hash,
)


def _prov() -> ProvenanceRecord:
    """Build the fixed ``ProvenanceRecord`` shared by the tests below.

    Returns:
        A record with ``code_version`` ``"1.0.0"`` and a 64-character
        ``parameters_hash`` made only of the letter ``a``.
    """
    placeholder_hash = "a" * 64
    return ProvenanceRecord(
        code_version="1.0.0",
        parameters_hash=placeholder_hash,
    )


# ──────────────────────────────────────────────────────────────────────
# ArtifactType
# ──────────────────────────────────────────────────────────────────────


class TestArtifactType:
    """Checks on the ``ArtifactType`` enumeration."""

    def test_canonical_values(self) -> None:
        """The enum exposes exactly the canonical set of string values.

        Sprint A14-S4: nine values up to S49; ``confidences`` was added
        in S50 for the OCR JSON sidecar.
        """
        actual_values = set()
        for member in ArtifactType:
            actual_values.add(member.value)

        canonical_values = {
            "image",
            "raw_text",
            "corrected_text",
            "alto_xml",
            "page_xml",
            "canonical_document",
            "entities",
            "reading_order",
            "alignment",
            "confidences",
        }
        assert actual_values == canonical_values

    def test_string_enum_serializes_as_value(self) -> None:
        """Members compare equal to their raw string value.

        The original note says ``ArtifactType`` inherits from ``str`` so
        that it goes to JSON as a plain string; this test checks string
        equality and lookup by value.
        """
        raw_text_member = ArtifactType.RAW_TEXT
        assert raw_text_member == "raw_text"

        looked_up = ArtifactType("alto_xml")
        assert looked_up is ArtifactType.ALTO_XML


# ──────────────────────────────────────────────────────────────────────
# compute_content_hash
# ──────────────────────────────────────────────────────────────────────


class TestComputeContentHash:
    """Checks on the ``compute_content_hash`` helper."""

    def test_returns_64_char_hex(self) -> None:
        """The digest is 64 characters long and made only of hex digits."""
        h = compute_content_hash(b"hello")
        assert len(h) == 64
        # ``int(h, 16)`` is too permissive for this purpose: it also accepts
        # a sign, surrounding whitespace, underscores and a ``0x`` prefix.
        # Check the alphabet explicitly instead.
        assert all(ch in "0123456789abcdefABCDEF" for ch in h)

    def test_deterministic(self) -> None:
        """Hashing the same bytes twice yields the same digest."""
        assert compute_content_hash(b"abc") == compute_content_hash(b"abc")

    def test_matches_sha256(self) -> None:
        """The digest equals ``hashlib.sha256(...).hexdigest()`` of the input."""
        h = compute_content_hash(b"picarones")
        assert h == hashlib.sha256(b"picarones").hexdigest()


# ──────────────────────────────────────────────────────────────────────
# Artifact — création et validation
# ──────────────────────────────────────────────────────────────────────


class TestArtifactCreation:
    """Construction and field validation of ``Artifact``."""

    def test_minimal_artifact(self) -> None:
        """An artifact built from id, document id and type leaves the rest ``None``."""
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        assert artifact.id == "x"
        for optional_name in ("uri", "content_hash", "produced_by_step", "provenance"):
            assert getattr(artifact, optional_name) is None

    def test_full_artifact(self) -> None:
        """An artifact with every field supplied is accepted."""
        every_field = {
            "id": "d1:ocr:raw_text",
            "document_id": "d1",
            "type": ArtifactType.RAW_TEXT,
            "uri": "/tmp/x.txt",
            "content_hash": "b" * 64,
            "produced_by_step": "ocr",
            "provenance": _prov(),
        }
        artifact = Artifact(**every_field)
        assert artifact.produced_by_step == "ocr"

    def test_id_validation_rejects_spaces(self) -> None:
        """An id containing a space raises ``ArtifactValidationError``."""
        expected_failure = pytest.raises(ArtifactValidationError, match="id invalide")
        with expected_failure:
            Artifact(id="bad id", document_id="d1", type=ArtifactType.RAW_TEXT)

    def test_id_validation_rejects_null_byte(self) -> None:
        """An id containing a NUL byte raises ``ArtifactValidationError``."""
        expected_failure = pytest.raises(ArtifactValidationError)
        with expected_failure:
            Artifact(id="x\x00y", document_id="d1", type=ArtifactType.RAW_TEXT)

    def test_id_accepts_filesystem_safe_chars(self) -> None:
        """Ids made of alphanumerics plus ``_.-:/`` are kept unchanged."""
        # Character set taken from the original note about the id regex.
        safe_id = "vol_a:folio.001-r/raw_text"
        artifact = Artifact(
            id=safe_id,
            document_id="vol_a/folio.001-r",
            type=ArtifactType.RAW_TEXT,
        )
        assert artifact.id == "vol_a:folio.001-r/raw_text"

    def test_content_hash_must_be_64_hex(self) -> None:
        """A too-short hash and a 64-character non-hex hash are both rejected."""
        common = {"id": "x", "document_id": "d1", "type": ArtifactType.RAW_TEXT}

        # Too short.
        with pytest.raises(ValidationError):
            Artifact(**common, content_hash="abc")

        # Right length, but not hexadecimal.
        with pytest.raises(ArtifactValidationError, match="hex SHA-256"):
            Artifact(**common, content_hash="z" * 64)

    def test_content_hash_lowercased(self) -> None:
        """An uppercase hex hash is stored in lowercase."""
        uppercase_hash = "A" * 64
        artifact = Artifact(
            id="x",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            content_hash=uppercase_hash,
        )
        assert artifact.content_hash == uppercase_hash.lower()


# ──────────────────────────────────────────────────────────────────────
# Artifact — immutabilité
# ──────────────────────────────────────────────────────────────────────


class TestArtifactImmutability:
    """``Artifact`` rejects attribute assignment and unknown fields."""

    def test_frozen_blocks_attribute_mutation(self) -> None:
        """Assigning to a field of an existing artifact raises ``ValidationError``."""
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        mutation_rejected = pytest.raises(ValidationError)
        with mutation_rejected:
            artifact.id = "y"  # type: ignore[misc]

    def test_extra_fields_rejected(self) -> None:
        """Passing an undeclared keyword to the constructor raises ``ValidationError``."""
        payload = {
            "id": "x",
            "document_id": "d1",
            "type": ArtifactType.RAW_TEXT,
            "bogus_field": "oops",
        }
        with pytest.raises(ValidationError):
            Artifact(**payload)  # type: ignore[arg-type]


# ──────────────────────────────────────────────────────────────────────
# Artifact — sérialisation déterministe
# ──────────────────────────────────────────────────────────────────────


class TestArtifactSerialization:
    """JSON serialization and hashing behavior of ``Artifact``."""

    def test_json_roundtrip_preserves_equality(self) -> None:
        """Dumping to JSON and validating it back yields an equal artifact."""
        original = Artifact(
            id="d1:ocr:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            content_hash="c" * 64,
            produced_by_step="ocr",
            provenance=_prov(),
        )
        restored = Artifact.model_validate_json(original.model_dump_json())
        assert original == restored

    def test_json_is_byte_deterministic(self) -> None:
        """Two artifacts with the same content dump to the exact same JSON.

        The original note calls this indispensable for the artifact
        cache planned in Sprint S7.
        """

        def build() -> Artifact:
            # Fresh instance each call, always with the same field values.
            return Artifact(
                id="x",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
                content_hash="d" * 64,
            )

        first, second = build(), build()
        assert first.model_dump_json() == second.model_dump_json()

    def test_artifacts_are_hashable(self) -> None:
        """An artifact can be stored in a set and found again.

        Per the original note, frozen pydantic models are hashable, so
        they can be set members or dict keys.
        """
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        container = set()
        container.add(artifact)
        assert artifact in container