Spaces:
Sleeping
Sleeping
"""Tests Sprint 59 — abréviations et marqueurs des archives
modernes XIXᵉ-XXᵉ.

Couvre :

1. ``get_category`` : marqueurs des 9 catégories ; marqueurs
   inconnus → ``None``.
2. ``detect_modern_markers`` :
   - reconnaissance par catégorie
   - greedy « plus long gagne » (S.A.R. avant S.A.)
   - frontière de mot pour les abréviations courtes
   - ordre préservé
   - texte vide / None
3. ``compute_modern_archives_metrics`` :
   - **Diplomatique** : tous marqueurs préservés → strict = expansion = 1
   - **Modernisant** : abrégés remplacés par formes développées →
     strict = 0, expansion = 1
   - **Erreur** : signaux faibles partout
   - **Mixte** : breakdown per_category cohérent
   - cas dégénérés (GT sans marqueur, vide, None)
4. ``missed_markers`` : entrées avec index, marker, category,
   expansion_preserved.
5. **Cas réalistes** par catégorie :
   - Notice biblio : « vol. II p. 45 » — modernisant le développe.
   - État civil : « ép. Martin, vve Durand » — discriminant.
   - Adresse : « bd Voltaire, arr. XIᵉ ».
   - Politesse : « S.A.R. le duc » vs « Son Altesse Royale ».
   - Monnaie : « 100 ₶ 5 s. 6 d. ».
6. Comptage exhaustif : ``n_strict_preserved + len(missed_markers
   non-expansion) + cas mixtes`` cohérent.
7. Intégration registre typé.
"""
| from __future__ import annotations | |
| import pytest | |
| from picarones.core.metric_registry import compute_at_junction, select_metrics | |
| from picarones.measurements.modern_archives import ( | |
| ADDRESS, | |
| ADMINISTRATIVE, | |
| BIBLIOGRAPHIC, | |
| CIVIL_STATUS, | |
| CIVILITY_TITLES, | |
| CURRENCY, | |
| LATIN_ABBR_MODERN, | |
| ORDINALS, | |
| TYPOGRAPHIC_PUNCTUATION, | |
| compute_modern_archives_metrics, | |
| detect_modern_markers, | |
| get_category, | |
| get_expansions, | |
| modern_archives_expansion_score, | |
| modern_archives_strict_score, | |
| ) | |
| from picarones.core.modules import ArtifactType | |
# ──────────────────────────────────────────────────────────────────────────
# 1. get_category
# ──────────────────────────────────────────────────────────────────────────
class TestGetCategory:
    """Check that ``get_category`` maps a marker to its category name."""

    # NOTE(review): the parameter table below was rebuilt from the
    # marker/category pairs that the other tests in this module assert;
    # extend it so that each of the nine categories is represented.
    @pytest.mark.parametrize(
        ("marker", "expected"),
        [
            ("Mme", "civility_titles"),
            ("Mgr", "civility_titles"),
            ("S.A.R.", "civility_titles"),
            ("bd", "address"),
            ("av.", "address"),
            ("vol.", "bibliographic"),
            ("cf.", "latin_abbr_modern"),
            ("op. cit.", "latin_abbr_modern"),
            # An unknown marker is expected to yield None, not an error.
            ("xyz", None),
        ],
    )
    def test_categorize(self, marker: str, expected: str | None) -> None:
        """``get_category(marker)`` returns ``expected`` for every pair."""
        assert get_category(marker) == expected
# ──────────────────────────────────────────────────────────────────────────
# 2. get_expansions
# ──────────────────────────────────────────────────────────────────────────
class TestGetExpansions:
    """Lookups of developed forms through ``get_expansions``."""

    def test_known_marker(self) -> None:
        """A known abbreviation lists its usual developed form."""
        expected_forms = (
            ("Mme", "Madame"),
            ("bd", "boulevard"),
            ("p.", "page"),
        )
        for abbreviation, developed in expected_forms:
            assert developed in get_expansions(abbreviation)

    def test_unknown_marker(self) -> None:
        """Unknown or empty markers give back an empty tuple."""
        for unknown in ("xyz", ""):
            assert get_expansions(unknown) == ()
# ──────────────────────────────────────────────────────────────────────────
# 3. detect_modern_markers
# ──────────────────────────────────────────────────────────────────────────
class TestDetectMarkers:
    """Behaviour of ``detect_modern_markers`` on short sample texts.

    Each detected marker is unpacked as a 3-tuple
    ``(index, marker, category)``.
    """

    def test_detects_civility(self) -> None:
        """Civility titles are found and all share one category."""
        markers = detect_modern_markers("Mme Dupont et Mgr Martin")
        cats = sorted({cat for _i, _m, cat in markers})
        assert cats == ["civility_titles"]
        names = sorted({m for _i, m, _c in markers})
        assert names == ["Mgr", "Mme"]

    def test_detects_ordinals(self) -> None:
        """Superscript ordinals (1ᵉʳ, XIXᵉ, 3ᵉ) are each detected once."""
        markers = detect_modern_markers("le 1ᵉʳ et le XIXᵉ siècle, 3ᵉ étage")
        cats = [cat for _i, _m, cat in markers]
        assert all(c == "ordinals" for c in cats)
        assert len(cats) == 3

    def test_detects_currency(self) -> None:
        """Livre tournois sign, s., d. and the pound sign give 4 markers."""
        markers = detect_modern_markers("100 ₶ 5 s. 6 d. et 50 £")
        cats = sorted({cat for _i, _m, cat in markers})
        assert cats == ["currency"]
        assert len(markers) == 4

    def test_detects_civil_status(self) -> None:
        """Birth/death signs and marital abbreviations are civil status."""
        # NOTE(review): the death sign is assumed to be U+2020 DAGGER;
        # confirm against the CIVIL_STATUS table.
        markers = detect_modern_markers("° 1850 † 1920 ép. Durand vve")
        cats = sorted({cat for _i, _m, cat in markers})
        assert cats == ["civil_status"]

    def test_detects_typographic_punctuation(self) -> None:
        """Guillemets, dash and ellipsis are typographic punctuation."""
        # NOTE(review): the dash is assumed to be U+2014 EM DASH; confirm
        # against the TYPOGRAPHIC_PUNCTUATION table.
        markers = detect_modern_markers("« voici » — pas mal…")
        cats = sorted({cat for _i, _m, cat in markers})
        assert cats == ["typographic_punctuation"]

    def test_detects_latin_abbr(self) -> None:
        """The four Latin abbreviations are detected, multi-word included."""
        markers = detect_modern_markers("cf. p. 12, etc. ; ibid., op. cit.")
        cats = {cat for _i, _m, cat in markers}
        assert "latin_abbr_modern" in cats
        # "cf.", "etc.", "ibid.", "op. cit." -> 4 Latin markers
        latin = [m for _i, m, c in markers if c == "latin_abbr_modern"]
        assert sorted(latin) == ["cf.", "etc.", "ibid.", "op. cit."]

    def test_detects_bibliographic(self) -> None:
        """A bibliographic notice yields the bibliographic category."""
        markers = detect_modern_markers("vol. II t. 3 p. 12 pp. 12 fasc. 4 n° 7")
        cats = {cat for _i, _m, cat in markers}
        assert "bibliographic" in cats

    def test_detects_address(self) -> None:
        """Street-type abbreviations are reported under ``address``."""
        markers = detect_modern_markers("bd Voltaire, av. de l'Opéra, r. de Rivoli")
        names = sorted({m for _i, m, c in markers if c == "address"})
        assert names == ["av.", "bd", "r."]

    def test_greedy_longest_wins(self) -> None:
        """The longest marker is reported, not its shorter fragments."""
        # "S.A.R." and "S.M." are two distinct markers; the greedy
        # strategy guarantees that "S." or "A.R." are not detected
        # separately inside "S.A.R.".
        markers = detect_modern_markers("S.A.R. le duc")
        names = [m for _i, m, _c in markers]
        assert names == ["S.A.R."]

    def test_word_boundary_for_short_abbr(self) -> None:
        """Short abbreviations only match at a word boundary."""
        # "M." inside "M.A.M." must NOT be detected: there is no
        # space/end/punctuation boundary after the period before the
        # next word character.
        # Positive case: "M. Dupont" -> one detection.
        markers_pos = detect_modern_markers("M. Dupont")
        m_titles = [m for _i, m, c in markers_pos if c == "civility_titles"]
        assert "M." in m_titles
        # Contentious case: "M.A.M." must not match three times.
        markers_neg = detect_modern_markers("M.A.M.")
        m_negs = [m for _i, m, c in markers_neg if c == "civility_titles"]
        # At most the trailing "M." (followed by end of text) is accepted.
        assert m_negs.count("M.") <= 1

    def test_word_boundary_blocks_false_positive(self) -> None:
        """A marker embedded in a longer word is not reported."""
        # "bd" inside "abdomen" is not at a word boundary.
        markers = detect_modern_markers("son abdomen est gonflé")
        assert all(m != "bd" for _i, m, _c in markers)

    def test_preserves_order(self) -> None:
        """Markers come back in their order of appearance in the text."""
        markers = detect_modern_markers("Mme au 3ᵉ étage du bd Voltaire")
        names = [m for _i, m, _c in markers]
        assert names == ["Mme", "3ᵉ", "bd"]

    def test_empty_input(self) -> None:
        """Empty string and ``None`` both give an empty list."""
        assert detect_modern_markers("") == []
        assert detect_modern_markers(None) == []

    def test_text_without_markers(self) -> None:
        """Plain text without any marker gives an empty list."""
        assert detect_modern_markers("hello world without abbreviations") == []
# ──────────────────────────────────────────────────────────────────────────
# 4. compute_modern_archives_metrics — scénarios standards
# ──────────────────────────────────────────────────────────────────────────
class TestComputeMetrics:
    """Standard scenarios for ``compute_modern_archives_metrics``.

    The metrics are called as ``compute_modern_archives_metrics(gt, hyp)``:
    ground-truth text first, hypothesis second.
    """

    # Registered as a fixture so that the tests below can request the
    # shared ground truth through their ``gt`` argument.
    @pytest.fixture
    def gt(self) -> str:
        """Ground-truth sentence mixing several marker categories."""
        return "Mme Dupont S.A.R. ép. au bd Voltaire vol. II p. 45"

    def test_diplomatic_full_preservation(self, gt: str) -> None:
        """Identical hypothesis: both scores are 1 and nothing is missed."""
        m = compute_modern_archives_metrics(gt, gt)
        assert m["global_strict_score"] == pytest.approx(1.0)
        assert m["global_expansion_score"] == pytest.approx(1.0)
        assert m["missed_markers"] == []

    def test_modernizing_loses_strict_keeps_expansion(self, gt: str) -> None:
        """Fully developed hypothesis: strict is 0, expansion is 1."""
        # Every abbreviation is replaced by its developed form.
        hyp = (
            "Madame Dupont Son Altesse Royale épouse au boulevard "
            "Voltaire volume II page 45"
        )
        m = compute_modern_archives_metrics(gt, hyp)
        assert m["global_strict_score"] == pytest.approx(0.0)
        assert m["global_expansion_score"] == pytest.approx(1.0)

    def test_erroneous_loses_both(self, gt: str) -> None:
        """Wrong hypothesis: strict is 0 and expansion stays below 0.5."""
        # Neither the abbreviated nor the developed form is preserved.
        hyp = "Femme Dupont l'altesse au quai Voltaire."
        m = compute_modern_archives_metrics(gt, hyp)
        assert m["global_strict_score"] == pytest.approx(0.0)
        assert m["global_expansion_score"] < 0.5

    def test_mixed_per_category(self) -> None:
        """The per-category breakdown reflects a partial modernisation."""
        gt = "Mme Dupont au bd Voltaire vol. II"
        # Keeps civility + bibliographic, loses address.
        hyp = "Mme Dupont au boulevard Voltaire vol. II"
        m = compute_modern_archives_metrics(gt, hyp)
        assert m["per_category"]["civility_titles"]["strict_score"] == 1.0
        assert m["per_category"]["bibliographic"]["strict_score"] == 1.0
        assert m["per_category"]["address"]["strict_score"] == 0.0
        # Address: "bd" became "boulevard", so the expansion is satisfied.
        assert m["per_category"]["address"]["expansion_score"] == 1.0
# ──────────────────────────────────────────────────────────────────────────
# 5. Cas dégénérés
# ──────────────────────────────────────────────────────────────────────────
class TestDegenerateCases:
    """Edge inputs: no marker, empty text, ``None``, empty hypothesis."""

    def test_gt_without_markers(self) -> None:
        """A reference without markers gives zero counts and scores."""
        metrics = compute_modern_archives_metrics("hello world", "hello world")
        assert metrics["n_markers_reference"] == 0
        assert metrics["global_strict_score"] == 0.0
        assert metrics["global_expansion_score"] == 0.0
        assert metrics["per_category"] == {}

    def test_empty_gt(self) -> None:
        """An empty reference has no marker to count."""
        metrics = compute_modern_archives_metrics("", "anything")
        assert metrics["n_markers_reference"] == 0

    def test_none_inputs(self) -> None:
        """``None`` for both texts is accepted and counts no marker."""
        metrics = compute_modern_archives_metrics(None, None)
        assert metrics["n_markers_reference"] == 0

    def test_empty_hyp_with_markers_in_gt(self) -> None:
        """With an empty hypothesis every reference marker is a pure loss."""
        metrics = compute_modern_archives_metrics("Mme bd vol.", "")
        assert metrics["n_strict_preserved"] == 0
        assert metrics["global_strict_score"] == 0.0
        missed = metrics["missed_markers"]
        assert len(missed) == 3
        assert all(item["expansion_preserved"] is False for item in missed)
# ──────────────────────────────────────────────────────────────────────────
# 6. missed_markers
# ──────────────────────────────────────────────────────────────────────────
class TestMissedMarkers:
    """Shape and content of the ``missed_markers`` entries."""

    def test_missed_markers_have_required_fields(self) -> None:
        """Each missed entry carries the four documented keys."""
        reference = "Mme bd vol."
        modernised = "Madame boulevard volume"
        metrics = compute_modern_archives_metrics(reference, modernised)
        # Modernising output: every strict match is lost, but each
        # developed form is present.
        missed = metrics["missed_markers"]
        assert len(missed) == 3
        required_keys = ("index", "marker", "category", "expansion_preserved")
        for item in missed:
            for key in required_keys:
                assert key in item
            assert item["expansion_preserved"] is True

    def test_missed_marker_distinguishes_pure_loss(self) -> None:
        """A marker neither kept nor developed is flagged as a pure loss."""
        reference = "Mme Dupont au bd Voltaire"
        # Keeps "bd Voltaire" but drops "Mme" without developing it.
        hypothesis = "Femme Dupont au bd Voltaire"
        metrics = compute_modern_archives_metrics(reference, hypothesis)
        # "Mme": neither abbreviated nor developed form is present.
        lost_mme = [
            item for item in metrics["missed_markers"] if item["marker"] == "Mme"
        ]
        assert len(lost_mme) == 1
        assert lost_mme[0]["expansion_preserved"] is False
# ──────────────────────────────────────────────────────────────────────────
# 7. Cas réalistes par catégorie
# ──────────────────────────────────────────────────────────────────────────
class TestRealisticBibliographicCitation:
    """Realistic bibliographic notice: diplomatic vs modernising output."""

    def test_diplomatic_vs_modernizing(self) -> None:
        """Developed forms lose the strict score but keep the expansion."""
        gt = "Voir vol. II t. 3 p. 45 pp. 50-60 fasc. 4 n° 7"
        hyp_modern = (
            "Voir volume II tome 3 page 45 pages 50-60 fascicule 4 numéro 7"
        )
        m_diplo = compute_modern_archives_metrics(gt, gt)
        m_mod = compute_modern_archives_metrics(gt, hyp_modern)
        assert m_diplo["per_category"]["bibliographic"]["strict_score"] == 1.0
        assert m_mod["per_category"]["bibliographic"]["strict_score"] == 0.0
        assert m_mod["per_category"]["bibliographic"]["expansion_score"] == 1.0
class TestRealisticVitalRecord:
    """Realistic civil-status record: diplomatic vs modernising output."""

    def test_vital_record_discriminates(self) -> None:
        """Civil-status markers separate the two transcription styles."""
        # NOTE(review): the death sign is assumed to be U+2020 DAGGER;
        # confirm against the CIVIL_STATUS table.
        gt = "Marie Dupont, ép. Martin, vve Durand, † 1920"
        hyp_modern = (
            "Marie Dupont, épouse Martin, veuve Durand, décédée 1920"
        )
        m_diplo = compute_modern_archives_metrics(gt, gt)
        m_mod = compute_modern_archives_metrics(gt, hyp_modern)
        assert m_diplo["per_category"]["civil_status"]["strict_score"] == 1.0
        assert m_mod["per_category"]["civil_status"]["strict_score"] == 0.0
        assert m_mod["per_category"]["civil_status"]["expansion_score"] == 1.0
class TestRealisticAddress:
    """Realistic postal address with a typical modernisation."""

    def test_address_typical_modernization(self) -> None:
        """Developed address forms all count as preserved expansions."""
        gt = "demeurant 14 bd Voltaire, arr. XIᵉ"
        hyp_modern = "demeurant 14 boulevard Voltaire, arrondissement XIe"
        m = compute_modern_archives_metrics(gt, hyp_modern)
        # "bd" -> "boulevard" and "arr." -> "arrondissement" are both
        # accepted expansions.
        assert m["per_category"]["address"]["expansion_score"] == 1.0
        assert m["per_category"]["administrative"]["expansion_score"] == 1.0
        # "XIᵉ" -> "XIe": the flat (non-superscript) form is accepted.
        assert m["per_category"]["ordinals"]["expansion_score"] == 1.0
class TestRealisticHonorific:
    """Realistic royal-protocol titles (S.A.R., S.M.)."""

    def test_royal_protocol_full_preservation(self) -> None:
        """An identical hypothesis keeps every abbreviated title."""
        gt = "S.A.R. le duc et S.M. la reine"
        m = compute_modern_archives_metrics(gt, gt)
        assert m["per_category"]["civility_titles"]["strict_score"] == 1.0

    def test_royal_protocol_modernized(self) -> None:
        """Developed titles lose strict but satisfy expansion."""
        gt = "S.A.R. le duc et S.M. la reine"
        hyp = "Son Altesse Royale le duc et Sa Majesté la reine"
        m = compute_modern_archives_metrics(gt, hyp)
        assert m["per_category"]["civility_titles"]["strict_score"] == 0.0
        assert m["per_category"]["civility_titles"]["expansion_score"] == 1.0
class TestRealisticCurrency:
    """Realistic Ancien Régime amount (livres, sous, deniers)."""

    def test_ancien_regime_currency(self) -> None:
        """Currency markers are kept diplomatically or developed."""
        gt = "100 ₶ 5 s. 6 d."
        m = compute_modern_archives_metrics(gt, gt)
        assert m["per_category"]["currency"]["strict_score"] == 1.0
        # Modernising output: every unit is fully developed.
        hyp = "100 livres tournois 5 sous 6 deniers"
        m_mod = compute_modern_archives_metrics(gt, hyp)
        assert m_mod["per_category"]["currency"]["strict_score"] == 0.0
        assert m_mod["per_category"]["currency"]["expansion_score"] == 1.0
class TestRealisticTypographicPunctuation:
    """Typographic punctuation versus its ASCII fallbacks."""

    def test_quotation_typographic_vs_ascii(self) -> None:
        """ASCII fallbacks lose strict but count as valid expansions."""
        # NOTE(review): the dash is assumed to be U+2014 EM DASH; confirm
        # against the TYPOGRAPHIC_PUNCTUATION table.
        gt = "il dit « bonjour » et — bien sûr — sortit…"
        hyp_ascii = 'il dit "bonjour" et - bien sûr - sortit...'
        m = compute_modern_archives_metrics(gt, hyp_ascii)
        # Strict: no typographic punctuation sign is preserved.
        ptyp = m["per_category"]["typographic_punctuation"]
        assert ptyp["strict_score"] == 0.0
        # Expansion: the ASCII equivalents are accepted.
        assert ptyp["expansion_score"] == 1.0
# ──────────────────────────────────────────────────────────────────────────
# 8. Comptage exhaustif
# ──────────────────────────────────────────────────────────────────────────
class TestExhaustiveAccounting:
    """Counting invariants of the metrics dictionary."""

    def test_strict_plus_strict_missed_equals_total(self) -> None:
        """Strictly preserved + missed markers equals the reference count."""
        gt = "Mme au bd Voltaire vol. II ép. Martin"
        hyp = "Mme au boulevard Voltaire vol. II épouse Martin"
        m = compute_modern_archives_metrics(gt, hyp)
        assert (
            m["n_strict_preserved"] + len(m["missed_markers"])
            == m["n_markers_reference"]
        )

    def test_per_category_counts_consistent(self) -> None:
        """Per-category counters never exceed their total."""
        gt = "Mme bd vol. II ép. Martin"
        hyp = "Mme boulevard volume II ép. Martin"
        m = compute_modern_archives_metrics(gt, hyp)
        for scores in m["per_category"].values():
            assert scores["n_strict_preserved"] <= scores["n_total"]
            assert scores["n_expansion_preserved"] <= scores["n_total"]
            # A strict match also counts as a preserved expansion, so the
            # strict counter can never exceed the expansion counter.
            assert scores["n_strict_preserved"] <= scores["n_expansion_preserved"]
# ──────────────────────────────────────────────────────────────────────────
# 9. Tables exposées
# ──────────────────────────────────────────────────────────────────────────
class TestExposedTables:
    """Sanity checks on the nine marker tables exported by the module."""

    def test_all_categories_non_empty(self) -> None:
        """Every exported table holds at least one entry."""
        exported = (
            CIVILITY_TITLES,
            ORDINALS,
            CURRENCY,
            ADMINISTRATIVE,
            CIVIL_STATUS,
            TYPOGRAPHIC_PUNCTUATION,
            LATIN_ABBR_MODERN,
            BIBLIOGRAPHIC,
            ADDRESS,
        )
        for marker_table in exported:
            assert len(marker_table) >= 1

    def test_table_entries_well_formed(self) -> None:
        """Each entry is a ``(marker, expansions)`` pair of non-empty text."""
        exported = (
            CIVILITY_TITLES,
            ORDINALS,
            CURRENCY,
            ADMINISTRATIVE,
            CIVIL_STATUS,
            TYPOGRAPHIC_PUNCTUATION,
            LATIN_ABBR_MODERN,
            BIBLIOGRAPHIC,
            ADDRESS,
        )
        for marker_table in exported:
            for row in marker_table:
                # A row is exactly (marker string, tuple of expansions).
                assert len(row) == 2
                abbreviation, developed_forms = row
                assert isinstance(abbreviation, str) and abbreviation
                assert isinstance(developed_forms, tuple)
                # No expansion may be an empty string.
                for form in developed_forms:
                    assert isinstance(form, str) and form
# ──────────────────────────────────────────────────────────────────────────
# 10. Raccourcis
# ──────────────────────────────────────────────────────────────────────────
class TestShortcuts:
    """The score shortcuts must agree with the full metrics call."""

    def test_strict_shortcut_matches_full_call(self) -> None:
        """``modern_archives_strict_score`` equals ``global_strict_score``."""
        reference, hypothesis = "Mme au bd Voltaire", "Madame au boulevard Voltaire"
        expected = compute_modern_archives_metrics(reference, hypothesis)[
            "global_strict_score"
        ]
        actual = modern_archives_strict_score(reference, hypothesis)
        assert actual == pytest.approx(expected)

    def test_expansion_shortcut_matches_full_call(self) -> None:
        """``modern_archives_expansion_score`` equals ``global_expansion_score``."""
        reference, hypothesis = "Mme au bd Voltaire", "Madame au boulevard Voltaire"
        expected = compute_modern_archives_metrics(reference, hypothesis)[
            "global_expansion_score"
        ]
        actual = modern_archives_expansion_score(reference, hypothesis)
        assert actual == pytest.approx(expected)
# ──────────────────────────────────────────────────────────────────────────
# 11. Intégration registre typé
# ──────────────────────────────────────────────────────────────────────────
class TestRegistryIntegration:
    """Both scores are reachable through the typed metric registry."""

    def test_strict_metric_registered(self) -> None:
        """Both score names appear among the TEXT/TEXT metrics."""
        # Imported for its side effect only; presumably this is what
        # registers the metrics (verify against the module).
        import picarones.measurements.modern_archives  # noqa: F401

        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        registered = {spec.name for spec in select_metrics(junction)}
        assert "modern_archives_strict_score" in registered
        assert "modern_archives_expansion_score" in registered

    def test_compute_at_junction_strict(self) -> None:
        """Identical texts score 1.0 on the strict metric."""
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        results = compute_at_junction("Mme au bd", "Mme au bd", junction)
        assert results["modern_archives_strict_score"] == pytest.approx(1.0)

    def test_compute_at_junction_expansion(self) -> None:
        """Developed forms score 0.0 strict and 1.0 expansion."""
        junction = (ArtifactType.TEXT, ArtifactType.TEXT)
        results = compute_at_junction("Mme au bd", "Madame au boulevard", junction)
        assert results["modern_archives_strict_score"] == pytest.approx(0.0)
        assert results["modern_archives_expansion_score"] == pytest.approx(1.0)