Spaces:
Sleeping
Sleeping
File size: 8,129 Bytes
"""Sprint A14-S4 — ``Artifact`` and ``ArtifactType``.

Checks the invariants of the artifacts of the new domain: validation
of ids and hashes, immutability, deterministic JSON serialization.

Note: there is no "business logic" test here — an Artifact does nothing,
it only describes. Tests that validate behavior will come with
the pipeline executor (S7), which produces and consumes artifacts.
"""
from __future__ import annotations
import hashlib
import pytest
from pydantic import ValidationError
from picarones.domain import (
Artifact,
ArtifactType,
ArtifactValidationError,
ProvenanceRecord,
compute_content_hash,
)
def _prov() -> ProvenanceRecord:
    """Build a small ``ProvenanceRecord`` fixture with fixed field values."""
    record = ProvenanceRecord(
        code_version="1.0.0",
        parameters_hash="a" * 64,
    )
    return record
# ──────────────────────────────────────────────────────────────────────
# ArtifactType
# ──────────────────────────────────────────────────────────────────────
class TestArtifactType:
    """Checks on the members of the ``ArtifactType`` enumeration."""

    def test_canonical_values(self) -> None:
        """Sprint A14-S4 — canonical values (9 up to S49;
        ``confidences`` was added in S50 for the OCR JSON sidecar).
        """
        declared = {member.value for member in ArtifactType}
        expected = {
            "image",
            "raw_text",
            "corrected_text",
            "alto_xml",
            "page_xml",
            "canonical_document",
            "entities",
            "reading_order",
            "alignment",
            "confidences",
        }
        assert declared == expected

    def test_string_enum_serializes_as_value(self) -> None:
        """Members compare equal to their raw string value, and looking a
        value up returns the matching member itself.
        """
        raw_text_member = ArtifactType.RAW_TEXT
        assert raw_text_member == "raw_text"

        looked_up = ArtifactType("alto_xml")
        assert looked_up is ArtifactType.ALTO_XML
# ──────────────────────────────────────────────────────────────────────
# compute_content_hash
# ──────────────────────────────────────────────────────────────────────
class TestComputeContentHash:
    """Checks on the ``compute_content_hash`` helper."""

    def test_returns_64_char_hex(self) -> None:
        """The digest is 64 characters long and made only of hex digits."""
        h = compute_content_hash(b"hello")
        assert len(h) == 64
        # ``int(h, 16)`` is too lenient as a hex check: it also accepts a
        # ``0x`` prefix, underscores and surrounding whitespace. Verify
        # every character instead.
        assert set(h) <= set("0123456789abcdefABCDEF")

    def test_deterministic(self) -> None:
        """Hashing the same bytes twice yields the same digest."""
        assert compute_content_hash(b"abc") == compute_content_hash(b"abc")

    def test_matches_sha256(self) -> None:
        """The digest equals the SHA-256 hex digest from ``hashlib``."""
        h = compute_content_hash(b"picarones")
        assert h == hashlib.sha256(b"picarones").hexdigest()
# ──────────────────────────────────────────────────────────────────────
# Artifact — creation and validation
# ──────────────────────────────────────────────────────────────────────
class TestArtifactCreation:
    """Construction of ``Artifact`` instances and validation of their fields."""

    def test_minimal_artifact(self) -> None:
        """Only the required fields are given; optional ones are ``None``."""
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        assert artifact.id == "x"
        optional_fields = ("uri", "content_hash", "produced_by_step", "provenance")
        for field_name in optional_fields:
            assert getattr(artifact, field_name) is None, field_name

    def test_full_artifact(self) -> None:
        """Every field can be supplied at construction time."""
        fields = {
            "id": "d1:ocr:raw_text",
            "document_id": "d1",
            "type": ArtifactType.RAW_TEXT,
            "uri": "/tmp/x.txt",
            "content_hash": "b" * 64,
            "produced_by_step": "ocr",
            "provenance": _prov(),
        }
        artifact = Artifact(**fields)
        assert artifact.produced_by_step == "ocr"

    def test_id_validation_rejects_spaces(self) -> None:
        """An id containing a space is rejected."""
        with pytest.raises(ArtifactValidationError, match="id invalide"):
            Artifact(
                id="bad id",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
            )

    def test_id_validation_rejects_null_byte(self) -> None:
        """An id containing a NUL byte is rejected."""
        with pytest.raises(ArtifactValidationError):
            Artifact(
                id="x\x00y",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
            )

    def test_id_accepts_filesystem_safe_chars(self) -> None:
        """Ids built from alphanumerics plus ``_.-:/`` are accepted."""
        # Alphanumerics plus ``_.-:/``, per the id regex.
        safe_id = "vol_a:folio.001-r/raw_text"
        artifact = Artifact(
            id=safe_id,
            document_id="vol_a/folio.001-r",
            type=ArtifactType.RAW_TEXT,
        )
        assert artifact.id == "vol_a:folio.001-r/raw_text"

    def test_content_hash_must_be_64_hex(self) -> None:
        """A content hash that is too short or not hexadecimal is rejected."""
        # Too short.
        with pytest.raises(ValidationError):
            Artifact(
                id="x",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
                content_hash="abc",
            )

        # Right length, but not hexadecimal.
        with pytest.raises(ArtifactValidationError, match="hex SHA-256"):
            Artifact(
                id="x",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
                content_hash="z" * 64,
            )

    def test_content_hash_lowercased(self) -> None:
        """An upper-case content hash is stored in lower case."""
        upper_hash = "A" * 64
        artifact = Artifact(
            id="x",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            content_hash=upper_hash,
        )
        assert artifact.content_hash == "a" * 64
# ──────────────────────────────────────────────────────────────────────
# Artifact — immutability
# ──────────────────────────────────────────────────────────────────────
class TestArtifactImmutability:
    """An ``Artifact`` rejects attribute mutation and unknown fields."""

    def test_frozen_blocks_attribute_mutation(self) -> None:
        """Assigning to a field of an existing artifact raises."""
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        with pytest.raises(ValidationError):
            artifact.id = "y"  # type: ignore[misc]

    def test_extra_fields_rejected(self) -> None:
        """Passing a keyword that is not a declared field raises."""
        with pytest.raises(ValidationError):
            Artifact(  # type: ignore[call-arg]
                id="x",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
                bogus_field="oops",
            )
# ──────────────────────────────────────────────────────────────────────
# Artifact — deterministic serialization
# ──────────────────────────────────────────────────────────────────────
class TestArtifactSerialization:
    """JSON round-trip, byte-level determinism and hashability of ``Artifact``."""

    def test_json_roundtrip_preserves_equality(self) -> None:
        """Dumping to JSON and validating it back gives an equal artifact."""
        original = Artifact(
            id="d1:ocr:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            content_hash="c" * 64,
            produced_by_step="ocr",
            provenance=_prov(),
        )
        restored = Artifact.model_validate_json(original.model_dump_json())
        assert original == restored

    def test_json_is_byte_deterministic(self) -> None:
        """Same content → exact same bytes. Essential for the artifact
        cache of Sprint S7."""
        # Two artifacts built independently from identical arguments.
        first, second = (
            Artifact(
                id="x",
                document_id="d1",
                type=ArtifactType.RAW_TEXT,
                content_hash="d" * 64,
            )
            for _ in range(2)
        )
        assert first.model_dump_json() == second.model_dump_json()

    def test_artifacts_are_hashable(self) -> None:
        """Artifacts are expected to be hashable, so they can be stored in
        a set."""
        artifact = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
        bucket = {artifact}
        assert artifact in bucket
|