Spaces:
Sleeping
Sleeping
"""Sprint 55 tests — accuracy per Unicode block.

Covers:

1. ``get_block``: characters from the main blocks are classified
   correctly; unknown characters → ``"Other"``.
2. ``compute_unicode_block_accuracy``:
   - Identical text → every accuracy is 1.0
   - Empty text → degenerate result, no crash
   - Targeted per-block substitutions (ASCII preserved, Latin
     presentation forms lost) — the realistic case from the plan
   - Insertions and deletions
3. **Realistic case from the evolution plan**: a modernizing OCR replaces
   ``ſ`` with ``s`` and ``ﬁ`` with ``fi`` → 100 % Basic Latin but
   0 % Latin presentation forms and 0 % Latin Extended-A.
4. ``unicode_block_global_accuracy``: shortcut equivalent to
   ``compute["global_accuracy"]``.
5. **Typed registry integration**: ``unicode_block_global_accuracy`` is
   selected for the ``(TEXT, TEXT)`` junction.
"""
| from __future__ import annotations | |
| import pytest | |
| from picarones.core.metric_registry import compute_at_junction, select_metrics | |
| from picarones.core.modules import ArtifactType | |
| from picarones.measurements.unicode_blocks import ( | |
| compute_unicode_block_accuracy, | |
| get_block, | |
| unicode_block_global_accuracy, | |
| ) | |
# ──────────────────────────────────────────────────────────────────────────
# 1. get_block
# ──────────────────────────────────────────────────────────────────────────
class TestGetBlock:
    """Check that ``get_block`` maps a character to its Unicode block name."""

    # NOTE(review): the recovered file had no parametrization for this test,
    # so pytest would try to resolve ``char`` / ``expected_block`` as fixtures
    # and error out. The cases below reuse the block names that the
    # ``compute_unicode_block_accuracy`` tests in this file assert on —
    # confirm them against the original case table.
    @pytest.mark.parametrize(
        ("char", "expected_block"),
        [
            ("a", "Basic Latin"),
            ("\u00e9", "Latin-1 Supplement"),  # e with acute accent
            ("\u017f", "Latin Extended-A"),  # long s
            ("\ufb01", "Alphabetic Presentation Forms"),  # fi ligature
        ],
    )
    def test_known_blocks(self, char: str, expected_block: str) -> None:
        assert get_block(char) == expected_block

    def test_empty_string_returns_other(self) -> None:
        assert get_block("") == "Other"

    def test_unknown_char_returns_other(self) -> None:
        # Emoji: per the original comment, not part of the heritage block
        # table. The literal was mis-encoded in the recovered file; it is
        # written as an escape so it survives re-encoding.
        # NOTE(review): the exact original emoji could not be recovered;
        # U+1F980 is a stand-in from the same range.
        assert get_block("\U0001F980") == "Other"
# ──────────────────────────────────────────────────────────────────────────
# 2. compute_unicode_block_accuracy — general cases
# ──────────────────────────────────────────────────────────────────────────
class TestComputeAccuracy:
    """General cases for ``compute_unicode_block_accuracy``."""

    def test_identical_text_full_accuracy(self) -> None:
        m = compute_unicode_block_accuracy("hello world", "hello world")
        assert m["global_accuracy"] == pytest.approx(1.0)
        assert m["per_block"]["Basic Latin"]["accuracy"] == pytest.approx(1.0)
        # "hello world" is 11 characters, the space included.
        assert m["per_block"]["Basic Latin"]["correct"] == 11
        assert m["per_block"]["Basic Latin"]["total"] == 11

    def test_empty_reference(self) -> None:
        m = compute_unicode_block_accuracy("", "anything")
        assert m["per_block"] == {}
        assert m["global_accuracy"] == 0.0
        assert m["n_chars_reference"] == 0

    def test_empty_hypothesis(self) -> None:
        m = compute_unicode_block_accuracy("hello", "")
        assert m["global_accuracy"] == 0.0
        assert m["per_block"]["Basic Latin"]["correct"] == 0
        assert m["per_block"]["Basic Latin"]["total"] == 5

    def test_none_inputs(self) -> None:
        m = compute_unicode_block_accuracy(None, None)
        assert m["per_block"] == {}
        assert m["global_accuracy"] == 0.0

    def test_partial_substitution_per_block(self) -> None:
        # The e-acute characters (Latin-1 Supplement) are preserved; the
        # "a" characters (Basic Latin) are replaced by "X".
        # The accented letter is written as an escape: the literal in the
        # recovered file was mis-encoded into a Greek letter plus "©",
        # which no longer matched the scenario described here.
        e_acute = "\u00e9"
        gt = f"{e_acute}a" * 3
        hyp = f"{e_acute}X" * 3
        m = compute_unicode_block_accuracy(gt, hyp)
        # Latin-1 Supplement: all 3 e-acute characters are correct.
        assert m["per_block"]["Latin-1 Supplement"]["accuracy"] == pytest.approx(1.0)
        # Basic Latin: 0/3 (every "a" is substituted).
        assert m["per_block"]["Basic Latin"]["accuracy"] == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 3. Realistic case from the evolution plan
# ──────────────────────────────────────────────────────────────────────────
class TestRealisticModernization:
    """Scenario from the plan: an OCR engine that modernizes old glyphs."""

    def test_modernizing_ocr_loses_presentation_forms(self) -> None:
        """A modernizing OCR scores 0 % on the historical-glyph blocks.

        The OCR replaces the long s (U+017F) with ``s`` and the fi ligature
        (U+FB01) with ``fi``: Basic Latin is 100 % preserved, but both
        Alphabetic Presentation Forms and Latin Extended-A drop to 0 %.
        This directly illustrates the plan's example: "this engine restores
        95 % of Basic Latin but 12 % of Latin presentation forms".
        """
        # Escapes keep the two historical glyphs intact whatever encoding the
        # file goes through; the recovered literals were mis-encoded into
        # Greek letters, so neither target block was present in the GT.
        long_s = "\u017f"
        fi_ligature = "\ufb01"
        gt = f"le {long_s}erpent {fi_ligature}nement"
        ocr_modern = "le serpent finement"
        m = compute_unicode_block_accuracy(gt, ocr_modern)
        # Alphabetic Presentation Forms (fi ligature replaced): 0 %.
        assert m["per_block"]["Alphabetic Presentation Forms"]["accuracy"] == 0.0
        # Latin Extended-A (long s replaced): 0 % (its single occurrence).
        assert m["per_block"]["Latin Extended-A"]["accuracy"] == 0.0
        # Basic Latin: 100 % preserved (spaces and ASCII letters).
        assert m["per_block"]["Basic Latin"]["accuracy"] == pytest.approx(1.0)
# ──────────────────────────────────────────────────────────────────────────
# 4. Insertions / deletions
# ──────────────────────────────────────────────────────────────────────────
class TestInsertionDeletion:
    """Effect of inserted and deleted hypothesis characters on the scores."""

    def test_inserted_char_does_not_count(self) -> None:
        # GT = "abc", hyp = "aXbc": "X" is an insertion, so every GT
        # position is still correctly restored.
        result = compute_unicode_block_accuracy("abc", "aXbc")
        basic_latin = result["per_block"]["Basic Latin"]
        assert basic_latin["accuracy"] == pytest.approx(1.0)

    def test_deletion_lowers_accuracy(self) -> None:
        # GT = "abc", hyp = "ac": "b" is deleted, so 2 of 3 are preserved.
        result = compute_unicode_block_accuracy("abc", "ac")
        basic_latin = result["per_block"]["Basic Latin"]
        assert basic_latin["correct"] == 2
        assert basic_latin["total"] == 3
        assert basic_latin["accuracy"] == pytest.approx(2 / 3)
# ──────────────────────────────────────────────────────────────────────────
# 5. Coverage — every GT character is classified
# ──────────────────────────────────────────────────────────────────────────
class TestCoverage:
    """Every reference character must be counted in exactly one block."""

    def test_total_chars_match_reference_length(self) -> None:
        # Mixed-block sample: ASCII, the ae ligature (U+00E6), a dash, the
        # fi ligature (U+FB01) and e-acute (U+00E9). Escapes are used because
        # the literal in the recovered file was mis-encoded.
        # NOTE(review): the dash was unrecoverable; an em dash (U+2014) is
        # assumed — confirm against the original.
        gt = "Hello, \u00e6ther \u2014 vol. \ufb01. \u00e9"
        m = compute_unicode_block_accuracy(gt, gt)
        # Per-block totals must add up to the reference length.
        total = sum(d["total"] for d in m["per_block"].values())
        assert total == len(gt)
        assert m["n_chars_reference"] == len(gt)
# ──────────────────────────────────────────────────────────────────────────
# 6. Global shortcut
# ──────────────────────────────────────────────────────────────────────────
class TestShortcut:
    """``unicode_block_global_accuracy`` must agree with the full report."""

    def test_shortcut_matches_full_call(self) -> None:
        # Same modernizing-OCR sample as the realistic scenario: long s
        # (U+017F) and fi ligature (U+FB01) in the ground truth. Escapes
        # replace the mis-encoded literal found in the recovered file.
        long_s = "\u017f"
        fi_ligature = "\ufb01"
        gt = f"le {long_s}erpent {fi_ligature}nement"
        ocr = "le serpent finement"
        full = compute_unicode_block_accuracy(gt, ocr)
        assert unicode_block_global_accuracy(gt, ocr) == pytest.approx(
            full["global_accuracy"],
        )
# ──────────────────────────────────────────────────────────────────────────
# 7. Typed registry integration
# ──────────────────────────────────────────────────────────────────────────
class TestRegistryIntegration:
    """The metric must be reachable through the typed metric registry."""

    def test_metric_registered_for_text_text(self) -> None:
        # Force the import that populates the registry.
        import picarones.measurements.unicode_blocks  # noqa: F401

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered_names = {spec.name for spec in select_metrics(junction)}
        assert "unicode_block_global_accuracy" in registered_names

    def test_compute_at_junction(self) -> None:
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        results = compute_at_junction("hello", "hello", junction)
        # Identical reference and hypothesis give a perfect global score.
        assert results["unicode_block_global_accuracy"] == pytest.approx(1.0)