"""Sprint 55 tests — per-Unicode-block accuracy.

Covers:

1. ``get_block``: characters from the main blocks are classified
   correctly; unknown characters → ``"Other"``.
2. ``compute_unicode_block_accuracy``:
   - Identical text → every accuracy at 1.0
   - Empty text → degenerate result, no crash
   - Targeted per-block substitutions (ASCII preserved, Latin
     presentation forms lost) — the realistic case from the plan
   - Insertions and deletions
3. **Realistic case from the evolution plan**: a modernizing OCR replaces
   ``ſ`` with ``s`` and ``ﬁ`` with ``fi`` → 100 % Basic Latin but
   0 % Latin presentation forms and 0 % Latin Extended-A.
4. ``unicode_block_global_accuracy``: shortcut equivalent to
   ``compute["global_accuracy"]``.
5. **Typed-registry integration**: ``unicode_block_global_accuracy`` is
   selected for the ``(TEXT, TEXT)`` junction.
"""
from __future__ import annotations
import pytest
from picarones.core.metric_registry import compute_at_junction, select_metrics
from picarones.core.modules import ArtifactType
from picarones.measurements.unicode_blocks import (
compute_unicode_block_accuracy,
get_block,
unicode_block_global_accuracy,
)
# ──────────────────────────────────────────────────────────────────────────
# 1. get_block
# ──────────────────────────────────────────────────────────────────────────
class TestGetBlock:
    """Check that ``get_block`` maps a character to its Unicode block name.

    Non-ASCII test characters are written as explicit escape sequences so
    the intended code points survive any re-encoding of this file.
    """

    @pytest.mark.parametrize(
        "char,expected_block",
        [
            ("a", "Basic Latin"),
            ("A", "Basic Latin"),
            (" ", "Basic Latin"),
            ("\u00e9", "Latin-1 Supplement"),  # e with acute
            ("\u00e7", "Latin-1 Supplement"),  # c with cedilla
            # NOTE(review): the exact code point was lost to mis-encoding;
            # it was somewhere in U+0180..U+019F. U+0192 (f with hook) is
            # assumed here -- confirm against the upstream file.
            ("\u0192", "Latin Extended-B"),
            ("\u017f", "Latin Extended-A"),  # medieval long s
            ("\ufb01", "Alphabetic Presentation Forms"),  # fi ligature
            # NOTE(review): second ligature was in U+FB00..U+FB1F; the
            # fl ligature (U+FB02) is assumed -- confirm upstream.
            ("\ufb02", "Alphabetic Presentation Forms"),
            ("\u0301", "Combining Diacritical Marks"),  # combining acute accent
        ],
    )
    def test_known_blocks(self, char: str, expected_block: str) -> None:
        """Each sample character is reported in the expected block."""
        assert get_block(char) == expected_block

    def test_empty_string_returns_other(self) -> None:
        """The empty string is expected to fall back to ``"Other"``."""
        assert get_block("") == "Other"

    def test_unknown_char_returns_other(self) -> None:
        """An emoji is expected to fall back to ``"Other"``."""
        # Emoji: not part of the heritage-oriented block table.
        # NOTE(review): the original emoji was in U+1F980..U+1F99F; the fox
        # face (U+1F98A) is assumed -- confirm upstream.
        assert get_block("\U0001f98a") == "Other"
# ──────────────────────────────────────────────────────────────────────────
# 2. compute_unicode_block_accuracy — general cases
# ──────────────────────────────────────────────────────────────────────────
class TestComputeAccuracy:
    """General-case checks for ``compute_unicode_block_accuracy``.

    Non-ASCII fixtures use explicit escape sequences so the intended code
    points survive any re-encoding of this file.
    """

    def test_identical_text_full_accuracy(self) -> None:
        """Identical reference and hypothesis score 1.0 globally and per block."""
        m = compute_unicode_block_accuracy("hello world", "hello world")
        assert m["global_accuracy"] == pytest.approx(1.0)
        assert m["per_block"]["Basic Latin"]["accuracy"] == pytest.approx(1.0)
        # "hello world" is 11 characters, the space included.
        assert m["per_block"]["Basic Latin"]["correct"] == 11
        assert m["per_block"]["Basic Latin"]["total"] == 11

    def test_empty_reference(self) -> None:
        """An empty reference gives no per-block entries and a 0.0 global score."""
        m = compute_unicode_block_accuracy("", "anything")
        assert m["per_block"] == {}
        assert m["global_accuracy"] == 0.0
        assert m["n_chars_reference"] == 0

    def test_empty_hypothesis(self) -> None:
        """An empty hypothesis counts every reference character as incorrect."""
        m = compute_unicode_block_accuracy("hello", "")
        assert m["global_accuracy"] == 0.0
        assert m["per_block"]["Basic Latin"]["correct"] == 0
        assert m["per_block"]["Basic Latin"]["total"] == 5

    def test_none_inputs(self) -> None:
        """``None`` inputs are expected to behave like an empty reference."""
        m = compute_unicode_block_accuracy(None, None)
        assert m["per_block"] == {}
        assert m["global_accuracy"] == 0.0

    def test_partial_substitution_per_block(self) -> None:
        """Substituting only the ASCII letters leaves Latin-1 accuracy intact."""
        # The e-acute characters (Latin-1 Supplement) are preserved; the
        # "a" characters (Basic Latin) are each replaced by "X".
        gt = "\u00e9a\u00e9a\u00e9a"
        hyp = "\u00e9X\u00e9X\u00e9X"
        m = compute_unicode_block_accuracy(gt, hyp)
        # Latin-1 Supplement: all 3 e-acute characters are correct.
        assert m["per_block"]["Latin-1 Supplement"]["accuracy"] == pytest.approx(1.0)
        # Basic Latin: 0/3 (every "a" was substituted).
        assert m["per_block"]["Basic Latin"]["accuracy"] == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 3. Realistic case from the evolution plan
# ──────────────────────────────────────────────────────────────────────────
class TestRealisticModernization:
    """Scenario test: an OCR engine that modernizes historical glyphs."""

    def test_modernizing_ocr_loses_presentation_forms(self) -> None:
        """Modernizing OCR keeps Basic Latin but loses the historical glyphs.

        The OCR output replaces the long s with "s" and the fi ligature
        with "fi": Basic Latin is 100 % preserved, while Alphabetic
        Presentation Forms and Latin Extended-A both drop to 0 %. This is
        the direct illustration of the plan's statement: "this engine
        restores 95 % of Basic Latin but 12 % of Latin presentation
        forms".
        """
        # Escapes keep the historical code points unambiguous:
        # U+017F is the long s, U+FB01 is the fi ligature.
        gt = "le \u017ferpent \ufb01nement"
        ocr_modern = "le serpent finement"
        m = compute_unicode_block_accuracy(gt, ocr_modern)
        # Alphabetic Presentation Forms (fi ligature replaced): 0 %.
        assert m["per_block"]["Alphabetic Presentation Forms"]["accuracy"] == 0.0
        # Latin Extended-A (long s replaced): 0 % -- a single occurrence.
        assert m["per_block"]["Latin Extended-A"]["accuracy"] == 0.0
        # Basic Latin: 100 % preserved (spaces and ASCII letters).
        assert m["per_block"]["Basic Latin"]["accuracy"] == pytest.approx(1.0)
# ──────────────────────────────────────────────────────────────────────────
# 4. Insertions / deletions
# ──────────────────────────────────────────────────────────────────────────
class TestInsertionDeletion:
    """Effect of inserted and deleted characters on per-block accuracy."""

    def test_inserted_char_does_not_count(self) -> None:
        """A spurious hypothesis character does not penalize the reference."""
        # Reference "abc" vs. hypothesis "aXbc": "X" is an insertion, so
        # every reference position is still correctly restored.
        result = compute_unicode_block_accuracy("abc", "aXbc")
        basic_latin = result["per_block"]["Basic Latin"]
        assert basic_latin["accuracy"] == pytest.approx(1.0)

    def test_deletion_lowers_accuracy(self) -> None:
        """A missing reference character lowers its block's accuracy."""
        # Reference "abc" vs. hypothesis "ac": "b" is deleted, so 2 of the
        # 3 reference characters are preserved.
        result = compute_unicode_block_accuracy("abc", "ac")
        basic_latin = result["per_block"]["Basic Latin"]
        assert basic_latin["correct"] == 2
        assert basic_latin["total"] == 3
        assert basic_latin["accuracy"] == pytest.approx(2 / 3)
# ──────────────────────────────────────────────────────────────────────────
# 5. Coverage — every ground-truth character is classified
# ──────────────────────────────────────────────────────────────────────────
class TestCoverage:
    """Every reference character must be counted in exactly one block."""

    def test_total_chars_match_reference_length(self) -> None:
        """Per-block totals add up to the length of the reference text."""
        # Sample mixing several blocks, written with escapes so the code
        # points are unambiguous: U+00E6 (ae), U+2014 (em dash),
        # U+FB01 (fi ligature), U+00E9 (e with acute).
        # NOTE(review): the dash and the ligature were reconstructed from a
        # mis-encoded source -- confirm the exact code points upstream.
        gt = "Hello, \u00e6ther \u2014 vol. \ufb01. \u00e9"
        m = compute_unicode_block_accuracy(gt, gt)
        total = sum(d["total"] for d in m["per_block"].values())
        assert total == len(gt)
        assert m["n_chars_reference"] == len(gt)
# ──────────────────────────────────────────────────────────────────────────
# 6. Global shortcut
# ──────────────────────────────────────────────────────────────────────────
class TestShortcut:
    """``unicode_block_global_accuracy`` must agree with the full computation."""

    def test_shortcut_matches_full_call(self) -> None:
        """The shortcut equals ``global_accuracy`` from the full result."""
        # Same modernization sample as the realistic scenario: U+017F is
        # the long s, U+FB01 is the fi ligature (escaped so the code points
        # survive re-encoding).
        gt = "le \u017ferpent \ufb01nement"
        ocr = "le serpent finement"
        full = compute_unicode_block_accuracy(gt, ocr)
        assert unicode_block_global_accuracy(gt, ocr) == pytest.approx(
            full["global_accuracy"],
        )
# ──────────────────────────────────────────────────────────────────────────
# 7. Typed-registry integration
# ──────────────────────────────────────────────────────────────────────────
class TestRegistryIntegration:
    """The metric must be reachable through the typed metric registry."""

    def test_metric_registered_for_text_text(self) -> None:
        """The metric name is among those selected for ``(TEXT, TEXT)``."""
        # Import for its side effect: this is what populates the registry.
        import picarones.measurements.unicode_blocks  # noqa: F401

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered_names = {metric.name for metric in select_metrics(junction)}
        assert "unicode_block_global_accuracy" in registered_names

    def test_compute_at_junction(self) -> None:
        """Identical texts score 1.0 when computed via the junction API."""
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        results = compute_at_junction("hello", "hello", junction)
        assert results["unicode_block_global_accuracy"] == pytest.approx(1.0)
|