File size: 3,777 Bytes
49cc409
 
 
 
 
6362212
49cc409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""Tests unitaires pour picarones.core.corpus."""

import pytest
from pathlib import Path

from picarones.core.corpus import load_corpus_from_directory, Document


@pytest.fixture
def sample_corpus_dir(tmp_path: Path) -> Path:
    """Build a temporary mini-corpus of three image / ground-truth pairs.

    For every page a placeholder ``<stem>.png`` and a UTF-8 encoded
    ``<stem>.gt.txt`` holding its transcription are written directly into
    ``tmp_path``.

    Returns:
        ``tmp_path`` itself, populated with the six generated files.
    """
    # Dummy image payload shared by all pages, meant as a minimal PNG; its
    # IHDR chunk declares a 1x1 picture.
    placeholder_png = (
        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
        b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
        b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
        b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
    )
    # Image file name -> ground-truth transcription stored next to it.
    transcriptions = {
        "page_001.png": "La première page du document médiéval.",
        "page_002.png": "Deuxième folio avec des abréviations.",
        "page_003.png": "Fin du manuscrit avec colophon.",
    }
    for image_name, transcription in transcriptions.items():
        (tmp_path / image_name).write_bytes(placeholder_png)
        # The ground-truth file reuses the image stem with a ".gt.txt" suffix.
        gt_name = f"{Path(image_name).stem}.gt.txt"
        (tmp_path / gt_name).write_text(transcription, encoding="utf-8")
    return tmp_path


class TestLoadCorpusFromDirectory:
    """Checks for ``load_corpus_from_directory`` on small on-disk corpora."""

    def test_loads_correct_count(self, sample_corpus_dir):
        """The three image/GT pairs of the fixture yield three documents."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        assert len(corpus) == 3

    def test_corpus_name_defaults_to_dir_name(self, sample_corpus_dir):
        """Without an explicit name, the corpus is named after its directory."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        assert corpus.name == sample_corpus_dir.name

    def test_corpus_name_can_be_set(self, sample_corpus_dir):
        """An explicit ``name`` argument is used as the corpus name."""
        corpus = load_corpus_from_directory(sample_corpus_dir, name="Mon corpus test")
        assert corpus.name == "Mon corpus test"

    def test_document_ids(self, sample_corpus_dir):
        """Each fixture page shows up under its file stem as ``doc_id``."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        ids = {doc.doc_id for doc in corpus}
        assert "page_001" in ids
        assert "page_002" in ids
        assert "page_003" in ids

    def test_ground_truth_content(self, sample_corpus_dir):
        """The ground truth of ``page_001`` carries the text written to disk."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        doc = next(d for d in corpus if d.doc_id == "page_001")
        assert "médiéval" in doc.ground_truth

    def test_source_path_set(self, sample_corpus_dir):
        """``source_path`` is the string form of the loaded directory."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        assert corpus.source_path == str(sample_corpus_dir)

    def test_nonexistent_directory_raises(self, tmp_path):
        """Loading a directory that does not exist raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            load_corpus_from_directory(tmp_path / "inexistant")

    def test_directory_without_gt_raises(self, tmp_path):
        """A directory holding only an image and no GT raises ValueError."""
        (tmp_path / "image.png").write_bytes(b"fake")
        with pytest.raises(ValueError):
            load_corpus_from_directory(tmp_path)

    def test_ignores_images_without_gt(self, sample_corpus_dir):
        """An image lacking a ground-truth file is skipped, not counted."""
        # The fixture directory is created afresh for each test, so the orphan
        # image can be added in place. Copying the corpus into a subdirectory
        # of itself is unnecessary and depends on the order in which
        # shutil.copytree lists the source and creates the destination.
        (sample_corpus_dir / "orphan.png").write_bytes(b"fake")
        corpus = load_corpus_from_directory(sample_corpus_dir)
        assert len(corpus) == 3  # The orphan image is ignored

    def test_stats_computed(self, sample_corpus_dir):
        """``stats`` reports the document count and a positive minimum GT length."""
        corpus = load_corpus_from_directory(sample_corpus_dir)
        stats = corpus.stats
        assert stats["document_count"] == 3
        assert stats["gt_length_min"] > 0


class TestCorpusIteration:
    """Checks the iteration protocol and ``repr`` of a loaded corpus."""

    def test_iterable(self, sample_corpus_dir):
        """Iterating the corpus yields three ``Document`` instances."""
        documents = list(load_corpus_from_directory(sample_corpus_dir))
        assert len(documents) == 3
        for document in documents:
            assert isinstance(document, Document)

    def test_repr(self, sample_corpus_dir):
        """The textual representation mentions "Corpus" and the digit 3."""
        loaded = load_corpus_from_directory(sample_corpus_dir)
        text = repr(loaded)
        for expected_fragment in ("Corpus", "3"):
            assert expected_fragment in text