Spaces:
Sleeping
Sleeping
File size: 10,295 Bytes
"""Sprint 57 tests — MUFI coverage (closes philological axis A.II.3).

Covers:

1. ``is_mufi_char``:
   - PUA characters (E000-F8FF) → True
   - Latin Extended-D (ꝑ, etc.) → True
   - explicit medieval letters (þ, ð, ƿ, ſ, æ, ƕ, ȝ…) → True
   - Alphabetic Presentation Forms ligatures (ﬁ, ﬂ) → True
   - everyday Latin letters (a, A, é) → False
   - empty string → False
   - ``custom_chars`` extends the recognised set
2. ``compute_mufi_coverage``:
   - diplomatic GT vs diplomatic hypothesis → coverage = 1
   - MUFI GT vs modernised hypothesis (all modern Latin) → coverage = 0
   - partial case: consistent ``per_char`` breakdown
   - exhaustive ``missed_chars`` list
3. **Degenerate cases**:
   - empty GT / GT without MUFI → coverage = 0
   - empty hypothesis → coverage = 0
   - identical GT and hypothesis containing MUFI → coverage = 1
4. ``custom_chars``: extends detection (e.g. accepting ``ñ``).
5. Exhaustive accounting: ``n_preserved + len(missed_chars) ==
   n_mufi_chars_reference`` when every position is classified.
6. Shortcut: ``mufi_coverage`` returns the same value as the
   ``coverage`` entry of ``compute_mufi_coverage``.
7. Typed registry integration: ``mufi_coverage`` is registered for
   ``(TEXT, TEXT)``.
"""
from __future__ import annotations
import pytest
from picarones.core.metric_registry import compute_at_junction, select_metrics
from picarones.core.modules import ArtifactType
from picarones.measurements.mufi import (
compute_mufi_coverage,
is_mufi_char,
mufi_coverage,
)
# ──────────────────────────────────────────────────────────────────────────
# 1. is_mufi_char
# ──────────────────────────────────────────────────────────────────────────
class TestIsMufiChar:
    """Check ``is_mufi_char`` on ordinary, medieval, PUA and custom characters.

    All non-ASCII literals are written as ``\\u`` escapes so that the test
    data survives a wrong-codec round trip of this file.
    """

    @pytest.mark.parametrize(
        "char,expected",
        [
            # Everyday Latin letters -> not MUFI
            ("a", False),
            ("Z", False),
            ("\u00e9", False),  # e with acute
            ("\u00e7", False),  # c with cedilla
            ("\u00f1", False),  # n with tilde: Spanish, not MUFI by default
            ("0", False),
            (" ", False),
            ("", False),
            # Explicit medieval letters -> MUFI
            ("\u00fe", True),  # thorn
            # NOTE(review): trailing byte was lost in the garbled source;
            # capital thorn assumed from the lower/upper pairing - confirm.
            ("\u00de", True),
            ("\u00f0", True),  # eth
            # NOTE(review): capital eth assumed (trailing byte lost) - confirm.
            ("\u00d0", True),
            ("\u01bf", True),  # wynn
            ("\u01f7", True),  # capital wynn
            ("\u017f", True),  # long s
            ("\u00e6", True),  # ash (ae ligature)
            # NOTE(review): capital ash assumed (trailing byte lost) - confirm.
            ("\u00c6", True),
            # NOTE(review): oe ligature assumed (trailing byte lost) - confirm.
            ("\u0153", True),
            ("\u00f8", True),  # o with stroke
            # NOTE(review): only the lead byte 0xC6 survived, so this is some
            # character in U+0180-U+019F; hwair assumed - confirm.
            ("\u0195", True),
            # NOTE(review): lead byte 0xC8 survived; yogh assumed - confirm.
            ("\u021d", True),
            # Latin Extended-D -> MUFI
            # NOTE(review): the three original code points lost their
            # continuation bytes; p-with-stroke, p-with-flourish and
            # q-with-stroke are plausible stand-ins - confirm.
            ("\ua751", True),
            ("\ua753", True),
            ("\ua757", True),
            # Alphabetic Presentation Forms -> MUFI
            # NOTE(review): fi and fl ligatures assumed (last byte lost).
            ("\ufb01", True),
            ("\ufb02", True),
            # Combining Diacritical Marks Supplement -> MUFI
            # (U+1DC0, combining dotted grave accent)
            ("\u1dc0", True),
        ],
    )
    def test_known_chars(self, char: str, expected: bool) -> None:
        """Each sample character is classified as expected."""
        assert is_mufi_char(char) is expected

    def test_pua_range(self) -> None:
        """Sample points across the PUA range E000-F8FF are all MUFI."""
        for cp in (0xE000, 0xE500, 0xF000, 0xF8FF):
            assert is_mufi_char(chr(cp)) is True

    def test_custom_chars_extend(self) -> None:
        """A non-MUFI character becomes MUFI once passed as a custom char."""
        n_tilde = "\u00f1"
        assert is_mufi_char(n_tilde) is False
        assert is_mufi_char(n_tilde, frozenset({n_tilde})) is True
# ──────────────────────────────────────────────────────────────────────────
# 2. compute_mufi_coverage
# ──────────────────────────────────────────────────────────────────────────
class TestComputeCoverage:
    """Check ``compute_mufi_coverage`` for full, zero and partial coverage.

    Non-ASCII test data is spelled with ``\\u`` escapes. The Latin
    Extended-D letter is assumed to be U+A751 (p with stroke), which matches
    the "per" expansion used in the modernised hypothesis.
    NOTE(review): its continuation bytes were lost in the garbled source -
    confirm the code point.
    """

    def test_diplomatic_engine_full_coverage(self) -> None:
        """Identical GT and hypothesis preserve every MUFI character."""
        # thorn ash t / wynn ash long-s / p-with-stroke / eth
        gt = "\u00fe\u00e6t \u01bf\u00e6\u017f \ua751 \u00f0"
        m = compute_mufi_coverage(gt, gt)
        assert m["coverage"] == pytest.approx(1.0)
        assert m["n_mufi_chars_preserved"] == m["n_mufi_chars_reference"]
        assert m["missed_chars"] == []

    def test_modernizing_engine_zero_coverage(self) -> None:
        """A fully modernised hypothesis preserves no MUFI character."""
        gt = "\u00fe\u00e6t \u01bf\u00e6\u017f \ua751 \u00f0"
        # Every MUFI character is replaced by a modern Latin equivalent.
        hyp = "tha waes per d"
        m = compute_mufi_coverage(gt, hyp)
        assert m["coverage"] == 0.0
        assert m["n_mufi_chars_preserved"] == 0

    def test_partial_coverage_with_per_char_breakdown(self) -> None:
        """Partial preservation is reflected in totals and ``per_char``."""
        gt = "\u00fe\u00e6t \u01bf\u00e6\u017f \ua751"
        # Partial: thorn, ash (1 of 2) and p-with-stroke are preserved;
        # wynn, long s and the second ash are missed.
        hyp = "\u00fe\u00e6t was \ua751"
        m = compute_mufi_coverage(gt, hyp)
        # MUFI total in GT: thorn + ash + ash + wynn + long s
        # + p-with-stroke = 6
        assert m["n_mufi_chars_reference"] == 6
        # Preserved: thorn, first ash, p-with-stroke -> 3
        assert m["n_mufi_chars_preserved"] == 3
        per_char = m["per_char"]
        assert per_char["\u00fe"]["coverage"] == 1.0
        assert per_char["\ua751"]["coverage"] == 1.0
        assert per_char["\u01bf"]["coverage"] == 0.0
        assert per_char["\u017f"]["coverage"] == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 3. Degenerate cases
# ──────────────────────────────────────────────────────────────────────────
class TestDegenerateCases:
    """Check ``compute_mufi_coverage`` on empty, ``None`` and MUFI-free input."""

    def test_gt_without_mufi(self) -> None:
        """A GT with no MUFI character yields zero reference and coverage."""
        m = compute_mufi_coverage("hello world", "hello world")
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0
        assert m["per_char"] == {}

    def test_empty_gt(self) -> None:
        """An empty GT yields zero reference and coverage."""
        m = compute_mufi_coverage("", "anything")
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0

    def test_none_inputs(self) -> None:
        """``None`` for both inputs yields zero reference and coverage."""
        m = compute_mufi_coverage(None, None)
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0

    def test_empty_hyp_with_mufi_gt(self) -> None:
        """An empty hypothesis misses every MUFI character of the GT."""
        # GT is thorn + ash + "t"; literals are escaped so they survive
        # a wrong-codec round trip of this file.
        m = compute_mufi_coverage("\u00fe\u00e6t", "")
        assert m["n_mufi_chars_preserved"] == 0
        assert m["coverage"] == 0.0
        # Every MUFI character ends up in the missed list.
        assert "\u00fe" in m["missed_chars"]
        assert "\u00e6" in m["missed_chars"]
# ──────────────────────────────────────────────────────────────────────────
# 4. Custom chars
# ──────────────────────────────────────────────────────────────────────────
class TestCustomChars:
    """Check that ``custom_chars`` extends what ``compute_mufi_coverage`` counts."""

    def test_custom_chars_count_in_total(self) -> None:
        """A custom character is counted in the reference total and coverage."""
        # "a" + n-with-tilde + "o"; escaped so the literal survives a
        # wrong-codec round trip of this file.
        text = "a\u00f1o"
        # Without custom chars: n-with-tilde is not MUFI, so no MUFI at all.
        assert compute_mufi_coverage(text, text)["n_mufi_chars_reference"] == 0
        # With custom chars: it becomes MUFI -> 1 in GT, 1 preserved.
        m = compute_mufi_coverage(text, text, custom_chars=["\u00f1"])
        assert m["n_mufi_chars_reference"] == 1
        assert m["coverage"] == pytest.approx(1.0)
# ──────────────────────────────────────────────────────────────────────────
# 5. Exhaustive accounting
# ──────────────────────────────────────────────────────────────────────────
class TestExhaustiveAccounting:
    """Check that preserved and missed MUFI characters add up to the total."""

    def test_preserved_plus_missed_equals_total(self) -> None:
        """``n_preserved + len(missed_chars)`` equals the reference count."""
        # thorn ash t / wynn ash long-s / p-with-stroke / eth / fi ligature.
        # NOTE(review): U+A751 and U+FB01 are inferred from the "per"-style
        # and "fi" expansions in the hypothesis; their trailing bytes were
        # lost in the garbled source - confirm.
        gt = "\u00fe\u00e6t \u01bf\u00e6\u017f \ua751 \u00f0 \ufb01"
        hyp = "\u00fe\u00e6t was \ua751 d fi"
        m = compute_mufi_coverage(gt, hyp)
        # n_preserved + len(missed_chars) == n_total
        assert (
            m["n_mufi_chars_preserved"] + len(m["missed_chars"])
            == m["n_mufi_chars_reference"]
        )
# ──────────────────────────────────────────────────────────────────────────
# 6. Shortcut
# ──────────────────────────────────────────────────────────────────────────
class TestShortcut:
    """Check that ``mufi_coverage`` agrees with ``compute_mufi_coverage``."""

    def test_shortcut_matches_full_call(self) -> None:
        """The shortcut returns the ``coverage`` value of the full result."""
        # thorn ash t / wynn ash long-s / p-with-stroke.
        # NOTE(review): U+A751 is inferred; its continuation bytes were lost
        # in the garbled source - confirm.
        gt = "\u00fe\u00e6t \u01bf\u00e6\u017f \ua751"
        hyp = "\u00fe\u00e6t was \ua751"
        full = compute_mufi_coverage(gt, hyp)
        assert mufi_coverage(gt, hyp) == pytest.approx(full["coverage"])
# ──────────────────────────────────────────────────────────────────────────
# 7. Typed registry integration
# ──────────────────────────────────────────────────────────────────────────
class TestRegistryIntegration:
    """Check that ``mufi_coverage`` is exposed through the metric registry."""

    def test_metric_registered_for_text_text(self) -> None:
        """``mufi_coverage`` is among the metrics selected for (TEXT, TEXT)."""
        # Force the import that populates the registry.
        import picarones.measurements.mufi  # noqa: F401

        selected = select_metrics(
            (ArtifactType.TEXT, ArtifactType.TEXT),
        )
        names = {spec.name for spec in selected}
        assert "mufi_coverage" in names

    def test_compute_at_junction(self) -> None:
        """Identical MUFI texts give a ``mufi_coverage`` of 1.0 at the junction."""
        # thorn + ash + "t"; escaped so the literal survives a wrong-codec
        # round trip of this file.
        text = "\u00fe\u00e6t"
        out = compute_at_junction(
            text, text,
            (ArtifactType.TEXT, ArtifactType.TEXT),
        )
        assert out["mufi_coverage"] == pytest.approx(1.0)
|