File size: 10,295 Bytes
cf392fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
cf392fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
cf392fd
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""Tests Sprint 57 — couverture MUFI (clôture axe A.II.3 philologique).

Couvre :

1. ``is_mufi_char`` :
   - caractères PUA (E000-F8FF) → True
   - Latin Extended-D (ꝑ, etc.) → True
   - lettres médiévales explicites (þ, ð, ƿ, ſ, æ, ƀ, ȝ…) → True
   - ligatures Alphabetic Presentation Forms (ﬁ, ﬂ) → True
   - lettres latines courantes (a, A, é) → False
   - chaîne vide → False
   - ``custom_chars`` étend la liste reconnue
2. ``compute_mufi_coverage`` :
   - GT diplomatique vs hyp diplomatique → coverage = 1
   - GT MUFI vs hyp modernisée (tout latin moderne) → coverage = 0
   - cas partiel : breakdown ``per_char`` cohérent
   - liste ``missed_chars`` exhaustive
3. **Cas dégénérés** :
   - GT vide / sans MUFI → coverage = 0
   - hyp vide → coverage = 0
   - GT et hyp identiques avec MUFI → coverage = 1
4. ``custom_chars`` : étend la détection (ex. accepter ``ñ``).
5. Coverage exhaustive : ``n_preserved + len(missed_chars) ==
   n_mufi_chars_reference`` quand toutes les positions sont
   classées.
6. Raccourci ``mufi_coverage`` : même valeur que
   ``compute_mufi_coverage(...)["coverage"]``.
7. Intégration registre typé : ``mufi_coverage`` enregistré pour
   ``(TEXT, TEXT)``.
"""

from __future__ import annotations

import pytest

from picarones.core.metric_registry import compute_at_junction, select_metrics
from picarones.core.modules import ArtifactType
from picarones.measurements.mufi import (
    compute_mufi_coverage,
    is_mufi_char,
    mufi_coverage,
)


# ──────────────────────────────────────────────────────────────────────────
# 1. is_mufi_char
# ──────────────────────────────────────────────────────────────────────────


class TestIsMufiChar:
    """Character-level expectations for ``is_mufi_char``."""

    @pytest.mark.parametrize(
        "char,expected",
        [
            # Common Latin letters, a digit, a space and the empty string
            # must not be flagged as MUFI.
            ("a", False), ("Z", False), ("é", False), ("ç", False),
            ("ñ", False),  # Spanish letter, not MUFI by default
            ("0", False), (" ", False), ("", False),
            # Explicit medieval letters -> MUFI
            ("þ", True), ("Þ", True), ("ð", True), ("Ð", True),
            ("ƿ", True), ("Ƿ", True), ("ſ", True),
            ("æ", True), ("Æ", True), ("œ", True), ("ø", True),
            ("ƀ", True), ("ȝ", True),
            # Latin Extended-D -> MUFI
            ("ꝑ", True), ("ꝓ", True), ("ꝗ", True),
            # Alphabetic Presentation Forms (ligatures) -> MUFI
            ("ﬁ", True), ("ﬂ", True),
            # Combining Diacritical Marks Supplement -> MUFI.
            # U+1DC0 (combining dotted grave accent) is written as an escape:
            # a bare combining mark in source is easy to corrupt or misread.
            ("\u1dc0", True),
        ],
    )
    def test_known_chars(self, char: str, expected: bool) -> None:
        """Each listed character yields exactly the expected boolean."""
        assert is_mufi_char(char) is expected

    def test_pua_range(self) -> None:
        """Sample code points across the PUA range E000-F8FF are accepted."""
        # Both bounds plus two interior points of the Private Use Area.
        for cp in (0xE000, 0xE500, 0xF000, 0xF8FF):
            assert is_mufi_char(chr(cp)) is True

    def test_custom_chars_extend(self) -> None:
        """A character passed as a custom char becomes recognised."""
        # ñ is not MUFI by default, but is accepted once supplied as custom.
        assert is_mufi_char("ñ") is False
        assert is_mufi_char("ñ", frozenset({"ñ"})) is True


# ──────────────────────────────────────────────────────────────────────────
# 2. compute_mufi_coverage
# ──────────────────────────────────────────────────────────────────────────


class TestComputeCoverage:
    """Full, zero and partial coverage scenarios for ``compute_mufi_coverage``."""

    def test_diplomatic_engine_full_coverage(self) -> None:
        """A hypothesis identical to the ground truth gives coverage 1."""
        gt = "þæt ƿæſ ꝑ ð"
        m = compute_mufi_coverage(gt, gt)
        assert m["coverage"] == pytest.approx(1.0)
        assert m["n_mufi_chars_preserved"] == m["n_mufi_chars_reference"]
        assert m["missed_chars"] == []

    def test_modernizing_engine_zero_coverage(self) -> None:
        """A fully modernised hypothesis preserves no MUFI character."""
        gt = "þæt ƿæſ ꝑ ð"
        # Every MUFI character is replaced by a modern Latin equivalent.
        hyp = "tha waes per d"
        m = compute_mufi_coverage(gt, hyp)
        assert m["coverage"] == 0.0
        assert m["n_mufi_chars_preserved"] == 0

    def test_partial_coverage_with_per_char_breakdown(self) -> None:
        """Partial preservation is reflected in totals and in ``per_char``."""
        gt = "þæt ƿæſ ꝑ"
        # Partial: þ, æ (1 of 2) and ꝑ are preserved; ƿ, ſ and æ (1 of 2)
        # are missed.
        hyp = "þæt was ꝑ"
        m = compute_mufi_coverage(gt, hyp)
        # MUFI total in GT: þ + æ + æ + ƿ + ſ + ꝑ = 6
        assert m["n_mufi_chars_reference"] == 6
        # Preserved: þ, the first æ, ꝑ -> 3
        assert m["n_mufi_chars_preserved"] == 3
        per_char = m["per_char"]
        assert per_char["þ"]["coverage"] == 1.0
        assert per_char["ꝑ"]["coverage"] == 1.0
        assert per_char["ƿ"]["coverage"] == 0.0
        assert per_char["ſ"]["coverage"] == 0.0


# ──────────────────────────────────────────────────────────────────────────
# 3. Cas dégénérés
# ──────────────────────────────────────────────────────────────────────────


class TestDegenerateCases:
    """Empty, ``None`` and MUFI-free inputs to ``compute_mufi_coverage``."""

    def test_gt_without_mufi(self) -> None:
        """A ground truth without MUFI characters reports zero everywhere."""
        m = compute_mufi_coverage("hello world", "hello world")
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0
        assert m["per_char"] == {}

    def test_empty_gt(self) -> None:
        """An empty ground truth reports zero reference chars and coverage."""
        m = compute_mufi_coverage("", "anything")
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0

    def test_none_inputs(self) -> None:
        """``None`` for both inputs is tolerated and reports zero."""
        m = compute_mufi_coverage(None, None)
        assert m["n_mufi_chars_reference"] == 0
        assert m["coverage"] == 0.0

    def test_empty_hyp_with_mufi_gt(self) -> None:
        """An empty hypothesis preserves nothing; MUFI chars are all missed."""
        m = compute_mufi_coverage("þæt", "")
        assert m["n_mufi_chars_preserved"] == 0
        assert m["coverage"] == 0.0
        # Every MUFI character of the ground truth ends up in missed_chars.
        assert "þ" in m["missed_chars"]
        assert "æ" in m["missed_chars"]


# ──────────────────────────────────────────────────────────────────────────
# 4. Custom chars
# ──────────────────────────────────────────────────────────────────────────


class TestCustomChars:
    """``custom_chars`` widens what ``compute_mufi_coverage`` counts."""

    def test_custom_chars_count_in_total(self) -> None:
        """A custom character is counted in the reference total and coverage."""
        # Without custom chars: ñ is not MUFI, so the text has no MUFI chars.
        assert compute_mufi_coverage("año", "año")["n_mufi_chars_reference"] == 0
        # With custom chars: ñ counts as MUFI -> 1 in GT, 1 preserved.
        m = compute_mufi_coverage("año", "año", custom_chars=["ñ"])
        assert m["n_mufi_chars_reference"] == 1
        assert m["coverage"] == pytest.approx(1.0)


# ──────────────────────────────────────────────────────────────────────────
# 5. Coverage exhaustive
# ──────────────────────────────────────────────────────────────────────────


class TestExhaustiveAccounting:
    """Preserved and missed counts must add up to the reference total."""

    def test_preserved_plus_missed_equals_total(self) -> None:
        """``n_preserved + len(missed_chars)`` equals the reference count."""
        gt = "þæt ƿæſ ꝑ ð ﬁ"
        hyp = "þæt was ꝑ d fi"
        m = compute_mufi_coverage(gt, hyp)
        # n_preserved + len(missed_chars) == n_total
        assert (
            m["n_mufi_chars_preserved"] + len(m["missed_chars"])
            == m["n_mufi_chars_reference"]
        )


# ──────────────────────────────────────────────────────────────────────────
# 6. Raccourci
# ──────────────────────────────────────────────────────────────────────────


class TestShortcut:
    """The ``mufi_coverage`` shortcut agrees with the detailed computation."""

    def test_shortcut_matches_full_call(self) -> None:
        """``mufi_coverage`` equals the ``coverage`` entry of the full result."""
        gt = "þæt ƿæſ ꝑ"
        hyp = "þæt was ꝑ"
        full = compute_mufi_coverage(gt, hyp)
        assert mufi_coverage(gt, hyp) == pytest.approx(full["coverage"])


# ──────────────────────────────────────────────────────────────────────────
# 7. Intégration registre typé
# ──────────────────────────────────────────────────────────────────────────


class TestRegistryIntegration:
    """``mufi_coverage`` is exposed through the typed metric registry."""

    def test_metric_registered_for_text_text(self) -> None:
        """The metric is selected for the ``(TEXT, TEXT)`` junction."""
        # Force the import that populates the registry.
        import picarones.measurements.mufi  # noqa: F401

        selected = select_metrics(
            (ArtifactType.TEXT, ArtifactType.TEXT),
        )
        names = {spec.name for spec in selected}
        assert "mufi_coverage" in names

    def test_compute_at_junction(self) -> None:
        """Identical MUFI text on both sides yields coverage 1 at the junction."""
        out = compute_at_junction(
            "þæt", "þæt",
            (ArtifactType.TEXT, ArtifactType.TEXT),
        )
        assert out["mufi_coverage"] == pytest.approx(1.0)
        assert out["mufi_coverage"] == pytest.approx(1.0)