File size: 10,905 Bytes
ecb8713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
ecb8713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""Tests Sprint 85 β€” A.II.5b : prΓ©cision sur sΓ©quences numΓ©riques.

Couvre :

1. DΓ©tection par catΓ©gorie (year, roman, foliation, currency, regnal).
2. ``compute_numerical_sequence_metrics`` :
   - identitΓ© β†’ 1.0 sur strict et value
   - perte totale β†’ 0.0
   - GT vide β†’ scores 0.0 (pas None β€” convention float)
   - value prΓ©servΓ©e mais pas strict (XIV β†’ 14)
   - foliotation recto/verso non interchangeables
   - multiplicitΓ© respectΓ©e
3. Cas rΓ©alistes : charte XVIII, registre paroissial.
4. Enregistrement registre typΓ© : strict + value.
"""

from __future__ import annotations

from picarones.measurements.numerical_sequences import (
    CATEGORIES,
    _detect_currencies,
    _detect_foliations,
    _detect_regnal,
    _detect_romans_with_values,
    _detect_years,
    compute_numerical_sequence_metrics,
    numerical_sequence_strict_score,
    numerical_sequence_value_score,
)


# ──────────────────────────────────────────────────────────────────────────
# 1. Per-category detection
# ──────────────────────────────────────────────────────────────────────────


class TestDetectYears:
    """Checks on ``_detect_years``, which returns ``(form, value)`` pairs."""

    def test_classic_year(self) -> None:
        """Two four-digit years in running text are both returned, in order."""
        years = _detect_years("né en 1789 puis mort en 1856")
        assert years == [("1789", 1789), ("1856", 1856)]

    def test_year_with_context(self) -> None:
        """The day ordinal preceding the month is not reported as a year."""
        years = _detect_years("1ᵉʳ janvier 1789")
        assert years == [("1789", 1789)]

    def test_outside_range_ignored(self) -> None:
        """Numbers outside the accepted year range yield no detection."""
        # 999 (three digits) and 2123 (beyond 2099): neither is detected
        assert _detect_years("999 et 2123") == []

    def test_empty(self) -> None:
        """An empty string yields an empty list."""
        assert _detect_years("") == []


class TestDetectRomans:
    """Checks on ``_detect_romans_with_values``."""

    def test_classic(self) -> None:
        """A short and a long numeral are both found in the same text."""
        detected = _detect_romans_with_values("Tome IV, MDCLXVIII")
        surface_forms = [form for form, _ in detected]
        for expected in ("IV", "MDCLXVIII"):
            assert expected in surface_forms

    def test_min_length_filters_single_letter(self) -> None:
        """A lone ``I`` is not reported as a roman numeral."""
        # Isolated I, V, X are expected to be ignored (min_length=2)
        detected = _detect_romans_with_values("I prononce le V")
        surface_forms = [form for form, _ in detected]
        assert not any(form == "I" for form in surface_forms)


class TestDetectFoliations:
    """Checks on ``_detect_foliations``, which returns ``(form, key)`` pairs."""

    def test_recto_verso_preserved(self) -> None:
        """Recto and verso of the same folio give two distinct keys."""
        out = _detect_foliations("voir f. 12r et f. 12v")
        keys = [k for _, k in out]
        assert "12r" in keys
        assert "12v" in keys

    def test_page_range(self) -> None:
        """A ``pp.`` page range is kept as a single item keyed by the range."""
        out = _detect_foliations("pp. 12-15")
        assert ("pp. 12-15", "12-15") in out

    def test_n_degree(self) -> None:
        """The ``n°`` marker followed by a number is keyed by that number."""
        # The literal must contain a real degree sign (U+00B0); a mis-encoded
        # "n°" would not exercise the marker this test is named after.
        out = _detect_foliations("voir n° 42")
        assert any(k == "42" for _, k in out)


class TestDetectCurrencies:
    """Checks on ``_detect_currencies``.

    Each detected item is unpacked as ``(form, value)`` where ``value[1]``
    is the unit, as the assertions below rely on.
    """

    def test_ancien_regime(self) -> None:
        """Livres, sols and deniers are reported with singular unit names."""
        out = _detect_currencies("12 livres 5 sols 8 deniers")
        units = [v[1] for _, v in out]
        assert "livre" in units
        assert "sol" in units
        assert "denier" in units

    def test_modern_units(self) -> None:
        """Pound and euro symbols are reported as units."""
        # Real "£" (U+00A3) and "€" (U+20AC) are required here; a mis-encoded
        # pound sign would make the test look for a unit that cannot exist.
        out = _detect_currencies("100 £ et 50 €")
        units = [v[1] for _, v in out]
        assert "£" in units
        assert "€" in units


class TestDetectRegnal:
    """Checks on ``_detect_regnal``, which returns ``(form, value)`` pairs."""

    def test_simple_regnal(self) -> None:
        """A Republican-calendar year written in roman numerals has value 3."""
        out = _detect_regnal("l'an III de la République")
        # "l'an III" and "an III de la République" are expected to be merged
        # into a single occurrence by the regex, hence the `any` check.
        assert any(v == 3 for _, v in out)

    def test_an_de_grace(self) -> None:
        """The "an de grâce" formula followed by digits has that value."""
        out = _detect_regnal("écrit en l'an de grâce 1450")
        assert any(v == 1450 for _, v in out)


# ──────────────────────────────────────────────────────────────────────────
# 2. compute_numerical_sequence_metrics
# ──────────────────────────────────────────────────────────────────────────


class TestComputeMetrics:
    """Checks on ``compute_numerical_sequence_metrics(gt, hyp)``.

    The result is read as a dict with ``global_strict_score``,
    ``global_value_score``, ``n_total`` and a ``per_category`` breakdown.
    """

    def test_identity(self) -> None:
        """Identical ground truth and hypothesis score 1.0 on both axes."""
        gt = "Tome IV, an de grâce 1789, f. 12r, 5 livres"
        r = compute_numerical_sequence_metrics(gt, gt)
        assert r["global_strict_score"] == 1.0
        assert r["global_value_score"] == 1.0

    def test_total_loss(self) -> None:
        """A hypothesis with no numerical sequence scores 0.0 on both axes."""
        gt = "1789 IV f. 12r 5 livres"
        hyp = "alpha beta gamma delta"
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["global_strict_score"] == 0.0
        assert r["global_value_score"] == 0.0
        assert r["n_total"] >= 1

    def test_empty_gt_returns_zero(self) -> None:
        """An empty ground truth gives zero scores and zero items."""
        r = compute_numerical_sequence_metrics("", "anything")
        # No sequence in the ground truth -> scores are 0 (no division by 0)
        assert r["global_strict_score"] == 0.0
        assert r["global_value_score"] == 0.0
        assert r["n_total"] == 0

    def test_value_preserved_form_lost(self) -> None:
        """Same folio written with another prefix keeps value, loses strict."""
        # A roman -> arabic example ("XIV" -> "14", or "MMXX" -> "2020") is
        # not usable here: "14" is outside the year range, and when the
        # hypothesis has "2020" instead of "MMXX" the roman detector finds
        # nothing, so the roman item is lost in value too (consistent: there
        # is no cross-category matching).
        # The strict-vs-value distinction is therefore tested on foliation.
        gt = "voir f. 12r"
        hyp = "voir fol. 12r"   # different surface form, same value (12r)
        r = compute_numerical_sequence_metrics(gt, hyp)
        # "f. 12r" and "fol. 12r" share the value key "12r",
        # so value == 1 while strict == 0
        assert r["per_category"]["foliation"]["value"] == 1
        assert r["per_category"]["foliation"]["strict"] == 0

    def test_recto_verso_not_interchangeable(self) -> None:
        """Recto in the ground truth is not matched by verso in the output."""
        # f. 12r (GT) vs f. 12v (hyp): different sides of the folio,
        # so the item is lost both in value and in strict mode
        r = compute_numerical_sequence_metrics("f. 12r", "f. 12v")
        assert r["per_category"]["foliation"]["strict"] == 0
        assert r["per_category"]["foliation"]["value"] == 0

    def test_multiplicity(self) -> None:
        """Repeated items are counted individually, not as a set."""
        # 2 occurrences in the GT, 1 in the hypothesis -> 1 preserved
        gt = "1789 et 1789"
        hyp = "1789"
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["per_category"]["year"]["n_total"] == 2
        assert r["per_category"]["year"]["strict"] == 1
        assert "1789" in r["per_category"]["year"]["lost_items"]

    def test_categories_constant(self) -> None:
        """``CATEGORIES`` declares exactly the five expected categories."""
        # Sanity check: all 5 categories are declared
        assert set(CATEGORIES) == {
            "year", "roman", "foliation", "currency", "regnal",
        }

    def test_per_category_breakdown_keys(self) -> None:
        """Every category appears in the breakdown with all expected keys."""
        r = compute_numerical_sequence_metrics("1789", "1789")
        for cat in CATEGORIES:
            assert cat in r["per_category"]
            for k in (
                "n_total", "strict", "value",
                "strict_score", "value_score", "lost_items",
            ):
                assert k in r["per_category"][cat]


# ──────────────────────────────────────────────────────────────────────────
# 3. Realistic cases
# ──────────────────────────────────────────────────────────────────────────


class TestRealistic:
    """End-to-end checks on sentences resembling real archival text."""

    def test_charte_18e_strict_preserved(self) -> None:
        """A perfectly transcribed 18th-century charter scores 1.0 strict."""
        gt = (
            "Donné à Paris l'an de grâce 1789, "
            "f. 12r, contre 25 livres 4 sols et 6 deniers."
        )
        # The hypothesis is a perfect transcription: reuse the same text
        # rather than keeping a second literal that could drift out of sync.
        hyp = gt
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["global_strict_score"] == 1.0

    def test_baptismal_register_modernized(self) -> None:
        """A modernising OCR loses the roman numeral but keeps the rest."""
        # Modernising OCR: XVIII -> 18 (roman surface form lost)
        gt = "Au siècle XVIII, en l'an 1750, f. 3r"
        hyp = "Au siècle 18, en l'an 1750, f. 3r"
        r = compute_numerical_sequence_metrics(gt, hyp)
        # XVIII is lost (the hypothesis has no recognisable roman numeral)
        assert "XVIII" in r["per_category"]["roman"]["lost_items"]
        # Year and foliation are preserved
        assert r["per_category"]["year"]["strict"] == 1
        assert r["per_category"]["foliation"]["strict"] == 1


# ──────────────────────────────────────────────────────────────────────────
# 4. Typed registry
# ──────────────────────────────────────────────────────────────────────────


class TestRegistry:
    """Checks that both scores are exposed through the metric registry."""

    def test_strict_and_value_metrics_registered(self) -> None:
        """Both metric names are selected for a TEXT/TEXT junction."""
        from picarones.core.metric_registry import select_metrics
        from picarones.core.modules import ArtifactType

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered = [metric.name for metric in select_metrics(junction)]
        for expected in (
            "numerical_sequence_strict_score",
            "numerical_sequence_value_score",
        ):
            assert expected in registered

    def test_strict_score_callable(self) -> None:
        """The strict score can be called directly on two strings."""
        assert numerical_sequence_strict_score("1789", "1789") == 1.0

    def test_value_score_with_form_drift(self) -> None:
        """A changed folio prefix zeroes strict but leaves value at 1.0."""
        # "f. 12r" vs "fol. 12r": value preserved, strict form lost
        reference, hypothesis = "f. 12r", "fol. 12r"
        strict = numerical_sequence_strict_score(reference, hypothesis)
        value = numerical_sequence_value_score(reference, hypothesis)
        assert strict == 0.0
        assert value == 1.0

    def test_metric_via_compute_at_junction(self) -> None:
        """Both scores are 1.0 when computed through ``compute_at_junction``."""
        from picarones.core.metric_registry import compute_at_junction
        from picarones.core.modules import ArtifactType

        text = "1789, IV"
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        results = compute_at_junction(text, text, junction)
        assert results.get("numerical_sequence_strict_score") == 1.0
        assert results.get("numerical_sequence_value_score") == 1.0