File size: 12,932 Bytes
68b1472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
68b1472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
68b1472
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
"""Tests Sprint 58 — marqueurs typographiques imprimé ancien.

Couvre :

1. ``get_category`` : caractères classés correctement dans les
   5 catégories (ligatures, long_s, dotless_i, ampersand,
   nasal_tildes) ; caractères non typographiques → ``None``.
2. ``detect_markers`` :
   - reconnaissance des marqueurs pré-composés
   - reconnaissance des séquences ``voyelle + U+0303`` comme
     nasal_tildes
   - texte vide / None
3. ``compute_early_modern_metrics`` :
   - **Diplomatique** : tous marqueurs préservés → preservation = 1
   - **Modernisant** : marqueurs remplacés (ſ→s, fi→fi, ı→i, ã→a) →
     preservation faible
   - **Mixte** : breakdown per_category cohérent
   - cas dégénérés (GT sans marqueur, vide, None)
4. **Cas réaliste** : un document XVIIᵉ avec 5 marqueurs ; trois
   moteurs avec trois conventions différentes → discriminés.
5. ``missed_markers`` : liste les marqueurs ratés avec leur index
   et catégorie.
6. Intégration registre typé.
"""

from __future__ import annotations

import pytest

from picarones.measurements.early_modern_typography import (
    AMPERSAND,
    DOTLESS_I,
    LIGATURES,
    LONG_S,
    NASAL_TILDE_PRECOMPOSED,
    compute_early_modern_metrics,
    detect_markers,
    early_modern_preservation,
    get_category,
)
from picarones.core.metric_registry import compute_at_junction, select_metrics
from picarones.core.modules import ArtifactType


# ──────────────────────────────────────────────────────────────────────────
# 1. get_category
# ──────────────────────────────────────────────────────────────────────────


class TestGetCategory:
    """Expectations on ``get_category`` for single characters.

    The non-ASCII inputs are written as ``\\u`` escapes so that the exact
    code point under test survives any re-encoding or Unicode normalization
    of this source file.
    """

    @pytest.mark.parametrize(
        "char,expected",
        [
            # Typographic ligatures
            ("\ufb01", "ligatures"),  # fi ligature (U+FB01)
            ("\ufb02", "ligatures"),  # fl ligature (U+FB02)
            ("\ufb00", "ligatures"),  # ff ligature (U+FB00)
            ("\ufb03", "ligatures"),  # ffi ligature (U+FB03)
            ("\ufb06", "ligatures"),  # st ligature (U+FB06)
            # Long s
            ("\u017f", "long_s"),  # ſ
            # Dotless i
            ("\u0131", "dotless_i"),  # ı
            # Ampersand
            ("&", "ampersand"),
            # Precomposed nasal tildes
            ("\u00e3", "nasal_tildes"),  # ã
            ("\u00c3", "nasal_tildes"),  # Ã
            ("\u00f5", "nasal_tildes"),  # õ
            ("\u00f1", "nasal_tildes"),  # ñ
            ("\u0169", "nasal_tildes"),  # ũ
            # Ordinary characters -> None
            ("a", None),
            ("\u00e9", None),  # é
            ("s", None),
            ("i", None),
            (" ", None),
        ],
    )
    def test_categorize(self, char: str, expected: str | None) -> None:
        """Each character must map to its expected category (or ``None``)."""
        assert get_category(char) == expected

    def test_empty_input(self) -> None:
        """The empty string is expected to have no category."""
        assert get_category("") is None


# ──────────────────────────────────────────────────────────────────────────
# 2. detect_markers
# ──────────────────────────────────────────────────────────────────────────


class TestDetectMarkers:
    """Expectations on ``detect_markers``.

    The tests unpack each returned item as an ``(index, marker, category)``
    triple.
    """

    def test_detects_all_categories(self) -> None:
        """One marker of each of the five categories is reported."""
        text = "fi ſ ı & ã"
        markers = detect_markers(text)
        cats = sorted(cat for _i, _m, cat in markers)
        assert cats == [
            "ampersand", "dotless_i", "ligatures", "long_s", "nasal_tildes",
        ]

    def test_detects_combining_tilde_sequence(self) -> None:
        """'a' + U+0303 (combining tilde) is reported as one nasal_tildes marker."""
        # Spelled with an explicit escape: a literal decomposed sequence looks
        # identical to precomposed U+00E3 and can be silently normalized by
        # editors or tooling, which would change what this test exercises.
        text = "a\u0303"
        markers = detect_markers(text)
        assert len(markers) == 1
        idx, marker, cat = markers[0]
        assert cat == "nasal_tildes"
        assert marker == "a\u0303"
        assert idx == 0

    def test_preserves_order(self) -> None:
        """Markers come back in the order they appear in the text."""
        text = "ı puis ſ puis fi"
        markers = detect_markers(text)
        cats = [cat for _i, _m, cat in markers]
        assert cats == ["dotless_i", "long_s", "ligatures"]

    def test_empty_input(self) -> None:
        """Empty and ``None`` inputs both yield an empty list."""
        assert detect_markers("") == []
        assert detect_markers(None) == []

    def test_text_without_markers(self) -> None:
        """Plain ASCII text contains no markers."""
        assert detect_markers("hello world") == []


# ──────────────────────────────────────────────────────────────────────────
# 3. compute_early_modern_metrics — cas standards
# ──────────────────────────────────────────────────────────────────────────


class TestComputeMetrics:
    """Standard cases for ``compute_early_modern_metrics(gt, hyp)``."""

    @pytest.fixture
    def gt(self) -> str:
        """Reference text with five markers: ſ, the fi ligature, &, ã and ı."""
        return "le ſerpent finement & ã ı"

    def test_diplomatic_full_preservation(self, gt: str) -> None:
        """A hypothesis identical to the reference preserves every marker."""
        m = compute_early_modern_metrics(gt, gt)
        assert m["global_preservation"] == pytest.approx(1.0)
        assert m["n_markers_preserved"] == m["n_markers_reference"]
        assert m["missed_markers"] == []

    def test_modernizing_loses_typographic_markers(self, gt: str) -> None:
        """A modernizing transcription keeps only the ampersand."""
        # Every typographic marker is replaced by its modern equivalent;
        # & is kept (it is rarely modernized).
        hyp = "le serpent finement & a i"
        m = compute_early_modern_metrics(gt, hyp)
        # Ampersand preserved, everything else lost
        assert m["per_category"]["ampersand"]["preservation"] == 1.0
        assert m["per_category"]["long_s"]["preservation"] == 0.0
        assert m["per_category"]["ligatures"]["preservation"] == 0.0
        assert m["per_category"]["dotless_i"]["preservation"] == 0.0
        assert m["per_category"]["nasal_tildes"]["preservation"] == 0.0
        # Global: 1/5 = 0.2
        assert m["global_preservation"] == pytest.approx(0.2)

    def test_mixed_engine_per_category(self, gt: str) -> None:
        """A mixed transcription is scored category by category."""
        # Keeps the long s and the ampersand, loses the others.
        # The long s must be the real U+017F character here: a mis-encoded
        # literal would make the long_s assertion below contradict the input.
        hyp = "le ſerpent finement & a i"
        m = compute_early_modern_metrics(gt, hyp)
        assert m["per_category"]["long_s"]["preservation"] == 1.0
        assert m["per_category"]["ampersand"]["preservation"] == 1.0
        assert m["per_category"]["ligatures"]["preservation"] == 0.0
        # 2/5 = 0.4
        assert m["global_preservation"] == pytest.approx(0.4)


# ──────────────────────────────────────────────────────────────────────────
# 4. Cas dégénérés
# ──────────────────────────────────────────────────────────────────────────


class TestDegenerateCases:
    """Degenerate inputs: marker-free reference, empty text, ``None``."""

    def test_gt_without_markers(self) -> None:
        """No marker in the reference: zero count, zero score, empty breakdown."""
        metrics = compute_early_modern_metrics("hello", "hello")
        assert metrics["n_markers_reference"] == 0
        assert metrics["global_preservation"] == 0.0
        assert metrics["per_category"] == {}

    def test_empty_gt(self) -> None:
        """An empty reference gives zero markers whatever the hypothesis is."""
        metrics = compute_early_modern_metrics("", "anything")
        assert metrics["n_markers_reference"] == 0
        assert metrics["global_preservation"] == 0.0

    def test_none_inputs(self) -> None:
        """``None`` for both texts is accepted and counts zero markers."""
        metrics = compute_early_modern_metrics(None, None)
        assert metrics["n_markers_reference"] == 0

    def test_empty_hyp_with_markers_in_gt(self) -> None:
        """An empty hypothesis misses both reference markers."""
        metrics = compute_early_modern_metrics("fi ſ", "")
        assert metrics["n_markers_preserved"] == 0
        assert metrics["global_preservation"] == 0.0
        missed = metrics["missed_markers"]
        assert len(missed) == 2


# ──────────────────────────────────────────────────────────────────────────
# 5. missed_markers
# ──────────────────────────────────────────────────────────────────────────


class TestMissedMarkers:
    """Shape of the ``missed_markers`` entries in the metrics result."""

    def test_missed_markers_have_index_and_category(self) -> None:
        """Each missed marker is reported with its index, marker and category."""
        reference = "fi et ſ"
        # Long s replaced by "s", fi ligature replaced by plain "fi"
        hypothesis = "fi et s"
        report = compute_early_modern_metrics(reference, hypothesis)
        missed = report["missed_markers"]
        # Both markers are missed
        assert len(missed) == 2
        assert {entry["category"] for entry in missed} == {"ligatures", "long_s"}
        # Every entry carries an index, a marker and a category
        for entry in missed:
            for key in ("index", "marker", "category"):
                assert key in entry


# ──────────────────────────────────────────────────────────────────────────
# 6. Comptage exhaustif
# ──────────────────────────────────────────────────────────────────────────


class TestExhaustiveAccounting:
    """Every reference marker is counted exactly once: preserved or missed."""

    def test_preserved_plus_missed_equals_total(self) -> None:
        """Preserved count plus missed entries equals the reference count."""
        # Reference holds the fi ligature, long s, dotless i, ampersand,
        # a-tilde, and the fl and ffi ligatures (real code points, not
        # mis-encoded byte sequences).
        gt = "fi ſ ı & ã fl ffi"
        # Hypothesis modernizes fi, ſ, ı and ã but keeps &, fl and ffi.
        hyp = "fi s i & a fl ffi"
        m = compute_early_modern_metrics(gt, hyp)
        assert (
            m["n_markers_preserved"] + len(m["missed_markers"])
            == m["n_markers_reference"]
        )


# ──────────────────────────────────────────────────────────────────────────
# 7. Sets exposés
# ──────────────────────────────────────────────────────────────────────────


class TestExposedSets:
    """Sanity checks on the character sets exported by the module."""

    def test_ligatures_non_empty(self) -> None:
        """``LIGATURES`` holds at least five entries."""
        assert len(LIGATURES) >= 5

    def test_categories_disjoint(self) -> None:
        """No character belongs to two precomposed category sets."""
        sets = [LIGATURES, LONG_S, DOTLESS_I, AMPERSAND, NASAL_TILDE_PRECOMPOSED]
        # Compare every unordered pair exactly once.
        for i, a in enumerate(sets):
            for b in sets[i + 1:]:
                overlap = a & b
                assert not overlap, (
                    f"Chevauchement entre catégories : {overlap!r}"
                )


# ──────────────────────────────────────────────────────────────────────────
# 8. Raccourci
# ──────────────────────────────────────────────────────────────────────────


class TestShortcut:
    """``early_modern_preservation`` agrees with the full metrics call."""

    def test_shortcut_matches_full_call(self) -> None:
        """The shortcut equals ``global_preservation`` from the full result."""
        reference, hypothesis = "fi ſ &", "fi s &"
        expected = compute_early_modern_metrics(reference, hypothesis)[
            "global_preservation"
        ]
        shortcut = early_modern_preservation(reference, hypothesis)
        assert shortcut == pytest.approx(expected)


# ──────────────────────────────────────────────────────────────────────────
# 9. Intégration registre typé
# ──────────────────────────────────────────────────────────────────────────


class TestRegistryIntegration:
    """The metric is reachable through the typed metric registry."""

    def test_metric_registered(self) -> None:
        """``early_modern_preservation`` is selected for a TEXT/TEXT junction."""
        # Force the import that populates the registry
        import picarones.measurements.early_modern_typography  # noqa: F401

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered = {spec.name for spec in select_metrics(junction)}
        assert "early_modern_preservation" in registered

    def test_compute_at_junction(self) -> None:
        """Identical texts score 1.0 when computed through the junction API."""
        results = compute_at_junction(
            "fi ſ &", "fi ſ &",
            (ArtifactType.TEXT, ArtifactType.TEXT),
        )
        score = results["early_modern_preservation"]
        assert score == pytest.approx(1.0)