"""Tests Sprint 32 — GT multi-niveaux (Phase 0.1 du plan d'évolution).
Vérifie :
1. Rétrocompatibilité stricte : un corpus historique (image + .gt.txt
uniquement) se charge exactement comme avant et expose la même API
(``doc.ground_truth: str``).
2. Détection automatique des niveaux additionnels : ``.gt.alto.xml``,
``.gt.page.xml``, ``.gt.entities.json``, ``.gt.reading_order.json``.
3. Couverture partielle : un corpus mixte où seuls certains documents
ont l'ALTO doit refléter cette couverture dans
``Corpus.gt_level_coverage()``.
4. Synchronisation TEXT entre champ ``ground_truth`` et
``ground_truths[GTLevel.TEXT]`` dans les deux sens.
5. Robustesse : un fichier JSON cassé est dégradé en warning, le
document reste chargé avec les niveaux qui ont fonctionné.
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from picarones.core.corpus import (
AltoGT,
Document,
EntitiesGT,
GT_SUFFIXES,
GTLevel,
PageGT,
ReadingOrderGT,
TextGT,
load_corpus_from_directory,
)
# Minimal 1×1 PNG byte string reused across the tests: `_write_pair` writes it
# verbatim as the image file of every fixture document.
_TINY_PNG = (
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
)
def _write_pair(directory: Path, stem: str, gt_text: str) -> Path:
    """Write a classic ``<stem>.png`` + ``<stem>.gt.txt`` pair into *directory*.

    The image receives the shared tiny PNG bytes and the ground-truth file
    receives *gt_text* encoded as UTF-8. Returns the path of the image.
    """
    image_path = directory / f"{stem}.png"
    gt_path = directory / f"{stem}.gt.txt"
    image_path.write_bytes(_TINY_PNG)
    gt_path.write_text(gt_text, encoding="utf-8")
    return image_path
# ──────────────────────────────────────────────────────────────────────────
# 1. Rétrocompatibilité stricte
# ──────────────────────────────────────────────────────────────────────────
class TestBackwardCompat:
    """Legacy corpora (image + ``.gt.txt`` only) keep the historical API."""

    def test_text_only_corpus_loads_unchanged(self, tmp_path: Path) -> None:
        """A text-only directory loads with ``ground_truth: str`` and TEXT only."""
        fixtures = (
            ("doc_001", "Première page."),
            ("doc_002", "Deuxième page."),
        )
        for stem, text in fixtures:
            _write_pair(tmp_path, stem, text)

        corpus = load_corpus_from_directory(tmp_path)

        assert len(corpus) == 2
        for document in corpus:
            # Historical API: ground_truth is a non-empty str.
            assert isinstance(document.ground_truth, str)
            assert document.ground_truth
            # The TEXT level is populated automatically; ALTO and PAGE are absent.
            assert document.has_gt(GTLevel.TEXT)
            assert not document.has_gt(GTLevel.ALTO)
            assert not document.has_gt(GTLevel.PAGE)

    def test_document_dataclass_default_is_text_only(self) -> None:
        """Building a ``Document`` from a plain string exposes only the TEXT level."""
        image = Path("/tmp/x.png")
        document = Document(image_path=image, ground_truth="abc")

        assert document.ground_truth == "abc"
        assert document.gt_levels == {GTLevel.TEXT}

        payload = document.get_gt(GTLevel.TEXT)
        assert isinstance(payload, TextGT)
        assert payload.text == "abc"

    def test_document_construction_via_ground_truths_dict(self) -> None:
        """Construction through the new dict format keeps the str field in sync."""
        levels = {GTLevel.TEXT: TextGT(text="hello")}
        document = Document(image_path=Path("/tmp/x.png"), ground_truths=levels)

        # ground_truth is filled in from the dict after construction.
        assert document.ground_truth == "hello"

    def test_no_extra_levels_means_no_change_in_api(self, tmp_path: Path) -> None:
        """A corpus without any ALTO/PAGE/JSON file must never raise."""
        _write_pair(tmp_path, "x", "y")

        loaded = load_corpus_from_directory(tmp_path)

        assert loaded.available_gt_levels == {GTLevel.TEXT}
# ──────────────────────────────────────────────────────────────────────────
# 2. Détection automatique des niveaux additionnels
# ──────────────────────────────────────────────────────────────────────────
# Minimal, well-formed ALTO document used as the ALTO side-car fixture.
# It must contain a ``TextBlock`` element: the ALTO detection test asserts that
# this tag name is present in the loaded XML content.
# The XML declaration starts right after the opening quotes because a
# declaration preceded by whitespace is not well-formed XML.
_ALTO_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">
  <Layout>
    <Page ID="p1" WIDTH="1" HEIGHT="1">
      <PrintSpace>
        <TextBlock ID="b1">
          <TextLine ID="l1">
            <String ID="s1" CONTENT="Bonjour"/>
          </TextLine>
        </TextBlock>
      </PrintSpace>
    </Page>
  </Layout>
</alto>
"""
# Minimal, well-formed PAGE XML document used as the PAGE side-car fixture.
# It must contain a ``TextRegion`` element: the PAGE detection test asserts that
# this tag name is present in the loaded XML content.
_PAGE_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
  <Page imageFilename="doc.png" imageWidth="1" imageHeight="1">
    <TextRegion id="r1">
      <TextEquiv>
        <Unicode>Salut</Unicode>
      </TextEquiv>
    </TextRegion>
  </Page>
</PcGts>
"""
class TestExtraLevelsDetection:
    """Side-car files next to the image are detected as additional GT levels.

    Every side-car file name is built as ``<stem>`` + ``GT_SUFFIXES[level]``.
    """

    def test_alto_detected(self, tmp_path: Path) -> None:
        """An ALTO side-car is loaded as an ``AltoGT`` carrying the XML and its path."""
        _write_pair(tmp_path, "doc", "Bonjour")
        alto_file = tmp_path / f"doc{GT_SUFFIXES[GTLevel.ALTO]}"
        alto_file.write_text(_ALTO_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        assert doc.has_gt(GTLevel.ALTO)
        alto = doc.get_gt(GTLevel.ALTO)
        assert isinstance(alto, AltoGT)
        assert "TextBlock" in alto.xml_content
        assert alto.source_path is not None

    def test_page_detected(self, tmp_path: Path) -> None:
        """A PAGE side-car is loaded as a ``PageGT`` carrying the XML content."""
        _write_pair(tmp_path, "doc", "Salut")
        page_file = tmp_path / f"doc{GT_SUFFIXES[GTLevel.PAGE]}"
        page_file.write_text(_PAGE_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        page = doc.get_gt(GTLevel.PAGE)
        assert isinstance(page, PageGT)
        assert "TextRegion" in page.xml_content

    def test_entities_detected_object_form(self, tmp_path: Path) -> None:
        """An ``{"entities": [...]}`` JSON object is loaded as an ``EntitiesGT``."""
        gt_text = "Marie de Bourgogne en 1477."
        _write_pair(tmp_path, "doc", gt_text)
        # Offsets are end-exclusive, matching the other entity fixtures in this
        # file ("Texte" -> 0..5, "T" -> 0..1).
        entities = {
            "entities": [
                {"label": "PER", "start": 0, "end": 18, "text": "Marie de Bourgogne"},
                {"label": "DATE", "start": 22, "end": 26, "text": "1477"},
            ]
        }
        # Guard the fixture itself: each span must slice out its own surface text.
        for item in entities["entities"]:
            assert gt_text[item["start"]:item["end"]] == item["text"]
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps(entities), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)
        doc = corpus.documents[0]

        ent = doc.get_gt(GTLevel.ENTITIES)
        assert isinstance(ent, EntitiesGT)
        assert len(ent.entities) == 2
        assert ent.entities[0]["label"] == "PER"

    def test_entities_detected_array_form(self, tmp_path: Path) -> None:
        """The loader also accepts a bare JSON array of entities."""
        _write_pair(tmp_path, "doc", "Texte.")
        ent_data = [{"label": "MISC", "start": 0, "end": 5, "text": "Texte"}]
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps(ent_data), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)

        ent = corpus.documents[0].get_gt(GTLevel.ENTITIES)
        assert isinstance(ent, EntitiesGT)
        assert ent.entities[0]["label"] == "MISC"

    def test_reading_order_detected(self, tmp_path: Path) -> None:
        """A ``{"region_order": [...]}`` JSON object is loaded as a ``ReadingOrderGT``."""
        _write_pair(tmp_path, "doc", "Multi-colonnes.")
        ro = {"region_order": ["r_main", "r_marginalia", "r_footer"]}
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.READING_ORDER]}").write_text(
            json.dumps(ro), encoding="utf-8"
        )

        corpus = load_corpus_from_directory(tmp_path)

        ro_payload = corpus.documents[0].get_gt(GTLevel.READING_ORDER)
        assert isinstance(ro_payload, ReadingOrderGT)
        assert ro_payload.region_order == ["r_main", "r_marginalia", "r_footer"]

    def test_all_four_extra_levels_simultaneously(self, tmp_path: Path) -> None:
        """With all four side-cars present, the document exposes all five levels."""
        _write_pair(tmp_path, "doc", "Texte complet.")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ALTO]}").write_text(
            _ALTO_SAMPLE, encoding="utf-8"
        )
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.PAGE]}").write_text(
            _PAGE_SAMPLE, encoding="utf-8"
        )
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps([{"label": "X", "start": 0, "end": 1, "text": "T"}]), encoding="utf-8"
        )
        # Reading order is supplied here in its bare-array JSON form.
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.READING_ORDER]}").write_text(
            json.dumps(["r1"]), encoding="utf-8"
        )

        doc = load_corpus_from_directory(tmp_path).documents[0]

        assert doc.gt_levels == {
            GTLevel.TEXT,
            GTLevel.ALTO,
            GTLevel.PAGE,
            GTLevel.ENTITIES,
            GTLevel.READING_ORDER,
        }
# ──────────────────────────────────────────────────────────────────────────
# 3. Couverture partielle (corpus mixte)
# ──────────────────────────────────────────────────────────────────────────
class TestPartialCoverage:
    """A mixed corpus reports how many documents carry each GT level."""

    def test_partial_alto_coverage(self, tmp_path: Path) -> None:
        """Three documents, only the first of which carries an ALTO file."""
        for stem, text in (
            ("doc_001", "Premier"),
            ("doc_002", "Deuxième"),
            ("doc_003", "Troisième"),
        ):
            _write_pair(tmp_path, stem, text)
        alto_file = tmp_path / f"doc_001{GT_SUFFIXES[GTLevel.ALTO]}"
        alto_file.write_text(_ALTO_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        per_level = corpus.gt_level_coverage()

        assert per_level[GTLevel.TEXT] == 3
        assert per_level[GTLevel.ALTO] == 1
        # available_gt_levels is the union over the whole corpus...
        assert corpus.available_gt_levels == {GTLevel.TEXT, GTLevel.ALTO}

        # ...but only doc_001 actually exposes ALTO.
        def find(doc_id: str):
            return next(d for d in corpus if d.doc_id == doc_id)

        first = find("doc_001")
        second = find("doc_002")
        assert first.has_gt(GTLevel.ALTO)
        assert not second.has_gt(GTLevel.ALTO)

    def test_stats_exposes_coverage(self, tmp_path: Path) -> None:
        """``Corpus.stats`` exposes the coverage under ``"gt_level_coverage"``."""
        _write_pair(tmp_path, "a", "x")
        _write_pair(tmp_path, "b", "y")
        alto_file = tmp_path / f"a{GT_SUFFIXES[GTLevel.ALTO]}"
        alto_file.write_text(_ALTO_SAMPLE, encoding="utf-8")

        corpus = load_corpus_from_directory(tmp_path)
        stats = corpus.stats

        assert stats["gt_level_coverage"]["text"] == 2
        assert stats["gt_level_coverage"]["alto"] == 1
# ──────────────────────────────────────────────────────────────────────────
# 4. Synchronisation bidirectionnelle TEXT
# ──────────────────────────────────────────────────────────────────────────
class TestTextSync:
    """``ground_truth`` (str) and ``ground_truths[GTLevel.TEXT]`` stay in sync."""

    def test_str_to_dict_sync(self) -> None:
        """A str ground truth is mirrored as a ``TextGT`` under the TEXT level."""
        document = Document(image_path=Path("/tmp/x.png"), ground_truth="aaa")

        payload = document.get_gt(GTLevel.TEXT)

        assert isinstance(payload, TextGT)
        assert payload.text == "aaa"

    def test_dict_to_str_sync(self) -> None:
        """A ``TextGT`` supplied through the dict is mirrored into the str field."""
        levels = {GTLevel.TEXT: TextGT(text="bbb")}

        document = Document(image_path=Path("/tmp/x.png"), ground_truths=levels)

        assert document.ground_truth == "bbb"

    def test_both_provided_keeps_str(self) -> None:
        """When both are supplied, the str field is preserved as-is.

        The dict remains the source for the other levels.
        """
        levels = {GTLevel.TEXT: TextGT(text="autre")}

        document = Document(
            image_path=Path("/tmp/x.png"),
            ground_truth="canon",
            ground_truths=levels,
        )

        # The explicitly supplied str field is not overwritten.
        assert document.ground_truth == "canon"
# ──────────────────────────────────────────────────────────────────────────
# 5. Robustesse — JSON cassé
# ──────────────────────────────────────────────────────────────────────────
class TestRobustness:
    """A broken or unexpected entities file degrades to a warning, not an error.

    Log assertions use ``LogRecord.getMessage()`` rather than the ``message``
    attribute: the attribute only exists once a formatter has processed the
    record, whereas ``getMessage()`` is always available.
    """

    def test_broken_entities_json_is_warning_not_error(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Invalid JSON: the document still loads with TEXT and a warning is logged."""
        _write_pair(tmp_path, "doc", "Texte.")
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            "{ ceci n'est pas du JSON", encoding="utf-8"
        )

        with caplog.at_level("WARNING", logger="picarones.core.corpus"):
            corpus = load_corpus_from_directory(tmp_path)

        # The document remains loaded with its TEXT level.
        doc = corpus.documents[0]
        assert doc.has_gt(GTLevel.TEXT)
        assert not doc.has_gt(GTLevel.ENTITIES)
        # And an explicit warning was emitted (see the CLAUDE.md rule).
        assert any("entités" in rec.getMessage().lower() for rec in caplog.records)

    def test_unexpected_json_format_is_warning(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Valid JSON of the wrong shape: ENTITIES is skipped and a warning is logged."""
        _write_pair(tmp_path, "doc", "Texte.")
        # Valid JSON but unexpected format (neither a dict with "entities" nor a list).
        (tmp_path / f"doc{GT_SUFFIXES[GTLevel.ENTITIES]}").write_text(
            json.dumps({"foo": "bar"}), encoding="utf-8"
        )

        with caplog.at_level("WARNING", logger="picarones.core.corpus"):
            corpus = load_corpus_from_directory(tmp_path)

        assert not corpus.documents[0].has_gt(GTLevel.ENTITIES)
        assert any("format" in rec.getMessage().lower() for rec in caplog.records)