Spaces:
Sleeping
Sleeping
File size: 10,905 Bytes
"""Tests Sprint 85 — A.II.5b : précision sur séquences numériques.

Couvre :
1. Détection par catégorie (year, roman, foliation, currency, regnal).
2. ``compute_numerical_sequence_metrics`` :
   - identité → 1.0 sur strict et value
   - perte totale → 0.0
   - GT vide → scores 0.0 (pas None — convention float)
   - value préservée mais pas strict (XIV → 14)
   - foliotation recto/verso non interchangeables
   - multiplicité respectée
3. Cas réalistes : charte XVIII, registre paroissial.
4. Enregistrement registre typé : strict + value.
"""
from __future__ import annotations
from picarones.measurements.numerical_sequences import (
CATEGORIES,
_detect_currencies,
_detect_foliations,
_detect_regnal,
_detect_romans_with_values,
_detect_years,
compute_numerical_sequence_metrics,
numerical_sequence_strict_score,
numerical_sequence_value_score,
)
# ──────────────────────────────────────────────────────────────────────────
# 1. Détection par catégorie
# ──────────────────────────────────────────────────────────────────────────
class TestDetectYears:
    """Year detection via ``_detect_years``.

    The detector is expected to return ``(surface_form, value)`` pairs in
    reading order, as the assertions below spell out.
    """

    def test_classic_year(self) -> None:
        """Two four-digit years in one sentence are both reported, in order."""
        years = _detect_years("né en 1789 puis mort en 1856")
        assert years == [("1789", 1789), ("1856", 1856)]

    def test_year_with_context(self) -> None:
        """A day ordinal with superscript letters is not mistaken for a year."""
        years = _detect_years("1ᵉʳ janvier 1789")
        assert years == [("1789", 1789)]

    def test_outside_range_ignored(self) -> None:
        """Numbers outside the accepted year range yield nothing."""
        # 999 has only three digits and 2123 is beyond 2099: neither is detected.
        assert _detect_years("999 et 2123") == []

    def test_empty(self) -> None:
        """Empty input yields an empty list."""
        assert _detect_years("") == []
class TestDetectRomans:
    """Roman-numeral detection via ``_detect_romans_with_values``.

    Only the surface form (first element of each returned pair) is checked.
    """

    def test_classic(self) -> None:
        """A short and a long numeral in the same text are both reported."""
        detected = _detect_romans_with_values("Tome IV, MDCLXVIII")
        surface_forms = [form for form, _value in detected]
        for expected in ("IV", "MDCLXVIII"):
            assert expected in surface_forms

    def test_min_length_filters_single_letter(self) -> None:
        """A lone ``I`` must not be reported as a Roman numeral."""
        # Per the original note: isolated I, V, X are ignored (min_length=2).
        detected = _detect_romans_with_values("I prononce le V")
        assert all(form != "I" for form, _value in detected)
class TestDetectFoliations:
    """Foliation / pagination detection via ``_detect_foliations``.

    Each detected item is unpacked as a ``(surface_form, value_key)`` pair.
    """

    def test_recto_verso_preserved(self) -> None:
        """Recto and verso of the same folio produce distinct value keys."""
        out = _detect_foliations("voir f. 12r et f. 12v")
        keys = [k for _, k in out]
        assert "12r" in keys
        assert "12v" in keys

    def test_page_range(self) -> None:
        """A ``pp.`` page range is kept as one item keyed by the range."""
        out = _detect_foliations("pp. 12-15")
        assert ("pp. 12-15", "12-15") in out

    def test_n_degree(self) -> None:
        """The numero sign (``n°``) introduces a detectable number."""
        out = _detect_foliations("voir n° 42")
        assert any(k == "42" for _, k in out)
class TestDetectCurrencies:
    """Currency detection via ``_detect_currencies``.

    Each detected item is a ``(surface_form, value)`` pair whose ``value``
    carries the unit at index 1, which is what the tests inspect.
    """

    def test_ancien_regime(self) -> None:
        """Livres, sols and deniers are reported under singular unit names."""
        out = _detect_currencies("12 livres 5 sols 8 deniers")
        units = [v[1] for _, v in out]
        assert "livre" in units
        assert "sol" in units
        assert "denier" in units

    def test_modern_units(self) -> None:
        """Currency symbols are reported as units unchanged."""
        out = _detect_currencies("100 £ et 50 €")
        units = [v[1] for _, v in out]
        assert "£" in units
        assert "€" in units
class TestDetectRegnal:
    """Regnal / era-year detection via ``_detect_regnal``.

    Each detected item is unpacked as a ``(surface_form, value)`` pair; only
    the numeric value is checked.
    """

    def test_simple_regnal(self) -> None:
        """A Republican-calendar year in Roman numerals resolves to its value."""
        out = _detect_regnal("l'an III de la République")
        # Per the original note: « l'an III » and « an III de la République »
        # are merged by the regex into a single occurrence.
        assert any(v == 3 for _, v in out)

    def test_an_de_grace(self) -> None:
        """The « an de grâce » formula followed by digits yields that year."""
        out = _detect_regnal("écrit en l'an de grâce 1450")
        assert any(v == 1450 for _, v in out)
# ──────────────────────────────────────────────────────────────────────────
# 2. compute_numerical_sequence_metrics
# ──────────────────────────────────────────────────────────────────────────
class TestComputeMetrics:
    """Behaviour of ``compute_numerical_sequence_metrics`` on small inputs.

    The result is read as a dict with ``global_strict_score``,
    ``global_value_score``, ``n_total`` and a ``per_category`` breakdown.
    """

    def test_identity(self) -> None:
        """Identical GT and hypothesis score 1.0 in both modes."""
        gt = "Tome IV, an de grâce 1789, f. 12r, 5 livres"
        r = compute_numerical_sequence_metrics(gt, gt)
        assert r["global_strict_score"] == 1.0
        assert r["global_value_score"] == 1.0

    def test_total_loss(self) -> None:
        """A hypothesis with no numeric sequence at all scores 0.0."""
        gt = "1789 IV f. 12r 5 livres"
        hyp = "alpha beta gamma delta"
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["global_strict_score"] == 0.0
        assert r["global_value_score"] == 0.0
        assert r["n_total"] >= 1

    def test_empty_gt_returns_zero(self) -> None:
        """An empty GT gives float zeros rather than ``None`` or an error."""
        r = compute_numerical_sequence_metrics("", "anything")
        # No sequence in the GT: scores are 0 (no division by zero).
        assert r["global_strict_score"] == 0.0
        assert r["global_value_score"] == 0.0
        assert r["n_total"] == 0

    def test_value_preserved_form_lost(self) -> None:
        """A different surface form with the same value keeps value, not strict."""
        # Original rationale: a Roman numeral such as « XIV » or « MMXX » in
        # the GT cannot be matched by « 14 » / « 2020 » in the hypothesis.
        # 14 is below 1000 and so outside the year range, and the Roman
        # detector finds nothing in an Arabic-digit hypothesis, so the GT
        # numeral is lost in value mode too (no cross-category matching).
        # The strict-vs-value distinction is therefore tested on foliation.
        gt = "voir f. 12r"
        hyp = "voir fol. 12r"  # different form, identical value (12r)
        r = compute_numerical_sequence_metrics(gt, hyp)
        # « f. 12r » and « fol. 12r » share the value key « 12r »,
        # hence value == 1 and strict == 0.
        assert r["per_category"]["foliation"]["value"] == 1
        assert r["per_category"]["foliation"]["strict"] == 0

    def test_recto_verso_not_interchangeable(self) -> None:
        """Recto and verso of the same folio never match each other."""
        # f. 12r (GT) vs f. 12v (hyp): different sides, so the item is lost
        # in both value and strict modes.
        r = compute_numerical_sequence_metrics("f. 12r", "f. 12v")
        assert r["per_category"]["foliation"]["strict"] == 0
        assert r["per_category"]["foliation"]["value"] == 0

    def test_multiplicity(self) -> None:
        """Repeated GT items are counted individually."""
        # Two occurrences in the GT, one in the hypothesis: one preserved.
        gt = "1789 et 1789"
        hyp = "1789"
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["per_category"]["year"]["n_total"] == 2
        assert r["per_category"]["year"]["strict"] == 1
        assert "1789" in r["per_category"]["year"]["lost_items"]

    def test_categories_constant(self) -> None:
        """``CATEGORIES`` declares exactly the five expected categories."""
        assert set(CATEGORIES) == {
            "year", "roman", "foliation", "currency", "regnal",
        }

    def test_per_category_breakdown_keys(self) -> None:
        """Every category appears in the breakdown with the full key set."""
        r = compute_numerical_sequence_metrics("1789", "1789")
        for cat in CATEGORIES:
            assert cat in r["per_category"]
            for k in (
                "n_total", "strict", "value",
                "strict_score", "value_score", "lost_items",
            ):
                assert k in r["per_category"][cat]
# ──────────────────────────────────────────────────────────────────────────
# 3. Cas réalistes
# ──────────────────────────────────────────────────────────────────────────
class TestRealistic:
    """End-to-end checks on sentences resembling real archival documents."""

    def test_charte_18e_strict_preserved(self) -> None:
        """A perfectly transcribed 18th-century charter scores 1.0 strict."""
        gt = (
            "Donné à Paris l'an de grâce 1789, "
            "f. 12r, contre 25 livres 4 sols et 6 deniers."
        )
        hyp = (
            "Donné à Paris l'an de grâce 1789, "
            "f. 12r, contre 25 livres 4 sols et 6 deniers."
        )
        r = compute_numerical_sequence_metrics(gt, hyp)
        assert r["global_strict_score"] == 1.0

    def test_baptismal_register_modernized(self) -> None:
        """A modernising OCR loses the Roman numeral but keeps year and folio."""
        # Modernising OCR: XVIII → 18 (the Roman form is lost).
        gt = "Au siècle XVIII, en l'an 1750, f. 3r"
        hyp = "Au siècle 18, en l'an 1750, f. 3r"
        r = compute_numerical_sequence_metrics(gt, hyp)
        # XVIII is lost: the hypothesis has no recognisable Roman numeral.
        assert "XVIII" in r["per_category"]["roman"]["lost_items"]
        # Year and foliation are preserved.
        assert r["per_category"]["year"]["strict"] == 1
        assert r["per_category"]["foliation"]["strict"] == 1
# ──────────────────────────────────────────────────────────────────────────
# 4. Registre typé
# ──────────────────────────────────────────────────────────────────────────
class TestRegistry:
    """Checks the strict and value scores through the typed metric registry.

    Registry helpers are imported inside each test, as in the original.
    """

    def test_strict_and_value_metrics_registered(self) -> None:
        """Both scores are selectable for a TEXT/TEXT junction."""
        from picarones.core.metric_registry import select_metrics
        from picarones.core.modules import ArtifactType

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered_names = [metric.name for metric in select_metrics(junction)]
        for expected_name in (
            "numerical_sequence_strict_score",
            "numerical_sequence_value_score",
        ):
            assert expected_name in registered_names

    def test_strict_score_callable(self) -> None:
        """The strict score function returns 1.0 on identical inputs."""
        assert numerical_sequence_strict_score("1789", "1789") == 1.0

    def test_value_score_with_form_drift(self) -> None:
        """« f. 12r » vs « fol. 12r »: value is preserved, strict is lost."""
        reference, hypothesis = "f. 12r", "fol. 12r"
        strict_result = numerical_sequence_strict_score(reference, hypothesis)
        value_result = numerical_sequence_value_score(reference, hypothesis)
        assert strict_result == 0.0
        assert value_result == 1.0

    def test_metric_via_compute_at_junction(self) -> None:
        """``compute_at_junction`` reports both scores as 1.0 on identical text."""
        from picarones.core.metric_registry import compute_at_junction
        from picarones.core.modules import ArtifactType

        text = "1789, IV"
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        scores = compute_at_junction(text, text, junction)
        assert scores.get("numerical_sequence_strict_score") == 1.0
        assert scores.get("numerical_sequence_value_score") == 1.0
|