Spaces:
Sleeping
Sleeping
"""Tests Sprint 56 — score d'expansion d'abréviations médiévales.

Couvre :

1. ``detect_abbreviations`` :
   - reconnaissance des caractères Unicode dédiés (ꝑ, ꝓ, ⁊, etc.)
   - reconnaissance des séquences ``lettre + U+0303`` (p̃, q̃)
   - tolérance NFC/NFD
   - texte vide / None / sans abréviation
2. ``compute_abbreviation_metrics`` :
   - **Diplomatique** : forme abrégée préservée → strict=1, expansion=1
   - **Modernisant** : forme développée → strict=0, expansion=1
     (signal clé du plan d'évolution)
   - **Mauvais OCR** : ni l'abrégé ni la développée → 0/0
   - Mixte : 1 préservée, 1 développée → strict=0.5, expansion=1
   - GT sans abréviation → tous compteurs à 0, scores à 0
3. ``per_abbreviation`` détaille par abréviation rencontrée.
4. **Cas réaliste du plan** : un GT avec ꝑ + ꝓ + ⁊ ; trois moteurs
   ayant adopté trois conventions différentes → strict/expansion
   permettent de les classer.
5. Frontière de mots pour les expansions courtes (« et », « us »).
6. Intégration registre typé : ``abbreviation_strict_score`` et
   ``abbreviation_expansion_score`` enregistrés pour ``(TEXT, TEXT)``.
"""
| from __future__ import annotations | |
| import pytest | |
| from picarones.measurements.abbreviations import ( | |
| ABBREVIATION_EXPANSIONS, | |
| abbreviation_expansion_score, | |
| abbreviation_strict_score, | |
| compute_abbreviation_metrics, | |
| detect_abbreviations, | |
| ) | |
| from picarones.core.metric_registry import compute_at_junction, select_metrics | |
| from picarones.core.modules import ArtifactType | |
# ──────────────────────────────────────────────────────────────────────────
# 1. Détection
# ──────────────────────────────────────────────────────────────────────────
class TestDetection:
    """Checks for ``detect_abbreviations``: dedicated glyphs, tilde sequences, empty input."""

    # NOTE(review): the dedicated abbreviation glyphs in the literals of the
    # first and third tests look like they went through a lossy re-encoding —
    # confirm them against the pristine file before trusting these tests.

    def test_detects_unicode_abbreviations(self) -> None:
        """Each dedicated abbreviation character is reported, in text order."""
        text = "κ κ κ κ β"
        out = detect_abbreviations(text)
        assert out == ["κ", "κ", "κ", "κ", "β"]

    def test_detects_combining_tilde_sequences(self) -> None:
        """A base letter followed by U+0303 is reported as one abbreviation."""
        # Written with explicit escapes so the combining tilde (U+0303) cannot
        # be lost or replaced by an editor or an encoding round-trip.
        p_tilde = "p\u0303"
        q_tilde = "q\u0303"
        out = detect_abbreviations(f"{p_tilde} {q_tilde}")
        assert p_tilde in out
        assert q_tilde in out

    def test_preserves_duplicates(self) -> None:
        """Repeated occurrences are not de-duplicated."""
        # Three occurrences in the text -> a list with three entries.
        out = detect_abbreviations("κκκ")
        assert out == ["κ", "κ", "κ"]

    def test_empty_text(self) -> None:
        """Both the empty string and ``None`` yield an empty list."""
        assert detect_abbreviations("") == []
        assert detect_abbreviations(None) == []

    def test_text_without_abbreviation(self) -> None:
        """Plain text without any abbreviation yields an empty list."""
        assert detect_abbreviations("Lorem ipsum dolor") == []
# ──────────────────────────────────────────────────────────────────────────
# 2. Cas standards : diplomatique / modernisant / mauvais OCR
# ──────────────────────────────────────────────────────────────────────────
class TestStandardScenarios:
    """Reference scenarios: diplomatic, modernizing, bad-OCR and mixed hypotheses."""

    # ``gt`` is requested by name in every test below, so it has to be
    # registered as a fixture; without the decorator pytest cannot inject it.
    @pytest.fixture
    def gt(self) -> str:
        """Ground truth shared by all scenarios of this class."""
        # Intended to hold four different abbreviations.
        # NOTE(review): the abbreviation glyphs in this literal look like they
        # went through a lossy re-encoding (three of them are now the same
        # character) — confirm against the pristine file.
        return "κ ad κ et β κ"

    def test_diplomatic_engine(self, gt: str) -> None:
        """Hypothesis identical to the GT: abbreviated forms are preserved."""
        m = compute_abbreviation_metrics(gt, gt)
        assert m["strict_score"] == 1.0
        assert m["expansion_score"] == 1.0

    def test_modernizing_engine(self, gt: str) -> None:
        """Hypothesis spells out every abbreviation instead of keeping it."""
        hyp = "per ad pro et et qui"
        m = compute_abbreviation_metrics(gt, hyp)
        assert m["strict_score"] == 0.0
        assert m["expansion_score"] == pytest.approx(1.0)

    def test_bad_ocr(self, gt: str) -> None:
        """Neither the abbreviated form nor its expansion shows up."""
        hyp = "x x x x x x"
        m = compute_abbreviation_metrics(gt, hyp)
        assert m["strict_score"] == 0.0
        assert m["expansion_score"] == 0.0

    def test_mixed_strategy(self, gt: str) -> None:
        """Two abbreviations kept as-is, two replaced by their expansion."""
        # Two abbreviated forms are kept; the other two are written out
        # as "pro" and "qui".
        hyp = "κ ad pro et β qui"
        m = compute_abbreviation_metrics(gt, hyp)
        # Four abbreviations overall, two of them preserved strictly.
        assert m["n_strict_preserved"] == 2
        # All four are at least preserved through their expansion.
        assert m["n_expansion_preserved"] == 4
# ──────────────────────────────────────────────────────────────────────────
# 3. per_abbreviation détaillé
# ──────────────────────────────────────────────────────────────────────────
class TestPerAbbreviationBreakdown:
    """Checks the ``per_abbreviation`` detail of ``compute_abbreviation_metrics``."""

    def test_per_abbr_records(self) -> None:
        """One record per GT abbreviation, each carrying both preservation flags."""
        # NOTE(review): the abbreviation glyphs in these literals look like
        # they went through a lossy re-encoding (both lookups below now use
        # the same character) — confirm against the pristine file.
        breakdown = compute_abbreviation_metrics("κ et κ", "per et κ")[
            "per_abbreviation"
        ]
        assert len(breakdown) == 2

        def first_entry(abbr: str):
            # First record whose ``abbr`` field equals the requested glyph.
            return next(entry for entry in breakdown if entry["abbr"] == abbr)

        # Not kept in abbreviated form, but "per" is present in the
        # hypothesis, so it counts as preserved by expansion.
        expanded = first_entry("κ")
        assert expanded["strict_preserved"] is False
        assert expanded["expansion_preserved"] is True

        # Kept in abbreviated form, hence also preserved by expansion.
        kept = first_entry("κ")
        assert kept["strict_preserved"] is True
        assert kept["expansion_preserved"] is True
# ──────────────────────────────────────────────────────────────────────────
# 4. Cas dégénérés
# ──────────────────────────────────────────────────────────────────────────
class TestDegenerateCases:
    """Edge cases: no abbreviation in the GT, empty or ``None`` inputs."""

    def test_gt_without_abbreviation(self) -> None:
        """A GT without abbreviation gives a zero count and zero scores."""
        metrics = compute_abbreviation_metrics("Lorem ipsum dolor", "Lorem ipsum")

        assert metrics["n_abbreviations_in_reference"] == 0
        assert metrics["strict_score"] == 0.0
        assert metrics["expansion_score"] == 0.0

    def test_empty_inputs(self) -> None:
        """Two empty strings give a zero count and a zero strict score."""
        metrics = compute_abbreviation_metrics("", "")

        assert metrics["n_abbreviations_in_reference"] == 0
        assert metrics["strict_score"] == 0.0

    def test_none_inputs(self) -> None:
        """``None`` for both arguments is accepted and counts zero abbreviations."""
        metrics = compute_abbreviation_metrics(None, None)

        assert metrics["n_abbreviations_in_reference"] == 0

    def test_empty_hypothesis_with_abbreviations_in_gt(self) -> None:
        """GT abbreviations are still counted when the hypothesis is empty."""
        # NOTE(review): the abbreviation glyphs in this literal look like they
        # went through a lossy re-encoding — confirm against the pristine file.
        metrics = compute_abbreviation_metrics("κ κ", "")

        assert metrics["n_abbreviations_in_reference"] == 2
        assert metrics["strict_score"] == 0.0
        assert metrics["expansion_score"] == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 5. Frontière de mot pour expansions courtes
# ──────────────────────────────────────────────────────────────────────────
class TestShortExpansionWordBoundary:
    """Short expansions such as "et" must match as whole words only."""

    # NOTE(review): the abbreviation glyph used as GT in both tests looks like
    # it went through a lossy re-encoding — confirm against the pristine file.

    def test_et_requires_word_boundary(self) -> None:
        """"et" embedded inside a longer word does not count as an expansion."""
        # The expected expansion of the GT abbreviation is "et". It is short,
        # so a word boundary is required to avoid matching trivially inside
        # "permettre" and the like. The hypothesis never contains "et" as a
        # standalone word, hence the expansion is not preserved.
        reference = "β"
        hypothesis = "permettre quelque chose"
        metrics = compute_abbreviation_metrics(reference, hypothesis)
        assert metrics["expansion_score"] == 0.0

    def test_et_matches_at_word_boundary(self) -> None:
        """"et" as a standalone word counts as the expansion."""
        reference = "β"
        hypothesis = "fer et acier"
        metrics = compute_abbreviation_metrics(reference, hypothesis)
        assert metrics["expansion_score"] == 1.0
# ──────────────────────────────────────────────────────────────────────────
# 6. Raccourcis
# ──────────────────────────────────────────────────────────────────────────
class TestShortcuts:
    """The single-score helpers must agree with ``compute_abbreviation_metrics``."""

    # NOTE(review): the abbreviation glyphs in the literals below look like
    # they went through a lossy re-encoding — confirm against the pristine file.

    def test_strict_shortcut(self) -> None:
        """``abbreviation_strict_score`` equals the ``strict_score`` entry."""
        reference, hypothesis = "κ κ", "κ pro"
        expected = compute_abbreviation_metrics(reference, hypothesis)["strict_score"]
        actual = abbreviation_strict_score(reference, hypothesis)
        assert actual == pytest.approx(expected)

    def test_expansion_shortcut(self) -> None:
        """``abbreviation_expansion_score`` equals the ``expansion_score`` entry."""
        reference, hypothesis = "κ κ", "κ pro"
        expected = compute_abbreviation_metrics(reference, hypothesis)[
            "expansion_score"
        ]
        actual = abbreviation_expansion_score(reference, hypothesis)
        assert actual == pytest.approx(expected)
# ──────────────────────────────────────────────────────────────────────────
# 7. Intégration registre typé
# ──────────────────────────────────────────────────────────────────────────
class TestRegistryIntegration:
    """Both abbreviation scores are exposed through the typed metric registry."""

    def test_metrics_registered_for_text_text(self) -> None:
        """Both metric names are selectable for the ``(TEXT, TEXT)`` junction."""
        # Import kept for its side effect: it is what populates the registry.
        import picarones.measurements.abbreviations  # noqa: F401

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered = {spec.name for spec in select_metrics(junction)}

        assert "abbreviation_strict_score" in registered
        assert "abbreviation_expansion_score" in registered

    def test_compute_at_junction_returns_both(self) -> None:
        """Identical reference and hypothesis score 1.0 on both metrics."""
        # NOTE(review): the abbreviation glyphs in this literal look like they
        # went through a lossy re-encoding — confirm against the pristine file.
        text = "κ et κ"
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)

        scores = compute_at_junction(text, text, junction)

        assert scores["abbreviation_strict_score"] == pytest.approx(1.0)
        assert scores["abbreviation_expansion_score"] == pytest.approx(1.0)
# ──────────────────────────────────────────────────────────────────────────
# 8. Sanité de la table d'expansions
# ──────────────────────────────────────────────────────────────────────────
class TestExpansionTable:
    """Sanity checks on the ``ABBREVIATION_EXPANSIONS`` table."""

    def test_table_non_empty(self) -> None:
        """The table holds at least six entries."""
        # At least the six main abbreviations (Capelli).
        assert len(ABBREVIATION_EXPANSIONS) >= 6

    def test_each_abbreviation_has_at_least_one_expansion(self) -> None:
        """No abbreviation maps to an empty collection of expansions."""
        for abbr, expansions in ABBREVIATION_EXPANSIONS.items():
            # Failure message repaired: the accented letters were mis-encoded.
            assert len(expansions) >= 1, (
                f"L'abréviation {abbr!r} doit avoir au moins une expansion."
            )

    def test_all_expansions_are_lowercase(self) -> None:
        """Every expansion is stored in lowercase."""
        for expansions in ABBREVIATION_EXPANSIONS.values():
            for exp in expansions:
                # Failure message repaired: the accented letter was mis-encoded.
                assert exp == exp.lower(), (
                    f"Expansion {exp!r} doit être en minuscules."
                )
| ) | |