"""Sprint A14-S16 — SearchView + métriques de recherchabilité.""" from __future__ import annotations import pytest from picarones.domain import Artifact, ArtifactType, MetricSpec from picarones.evaluation.metrics.search import ( levenshtein_distance, numerical_sequence_preservation, searchability_recall, ) from picarones.evaluation.projectors import ( AltoToText, CanonicalToText, PageToText, ProjectorRegistry, ) from picarones.evaluation.registry import MetricRegistry from picarones.evaluation.views import ( DEFAULT_SEARCH_METRICS, DefaultEvaluationViewExecutor, build_search_view, ) # ────────────────────────────────────────────────────────────────── # Métriques individuelles # ────────────────────────────────────────────────────────────────── class TestLevenshtein: def test_identical(self) -> None: assert levenshtein_distance("hello", "hello") == 0 def test_empty(self) -> None: assert levenshtein_distance("", "") == 0 assert levenshtein_distance("abc", "") == 3 assert levenshtein_distance("", "abc") == 3 def test_single_substitution(self) -> None: assert levenshtein_distance("hello", "hallo") == 1 def test_kitten_sitting(self) -> None: # Cas canonique : kitten → sitting (k→s, e→i, +g) = 3 ops assert levenshtein_distance("kitten", "sitting") == 3 class TestSearchabilityRecall: def test_perfect_match(self) -> None: recall = searchability_recall("hello world", "hello world") assert recall == 1.0 def test_fuzzy_match_within_threshold(self) -> None: # "monde" vs "monds" → 1 substitution, ≤ 2 → match recall = searchability_recall("le monde", "le monds") assert recall == 1.0 def test_fuzzy_match_beyond_threshold(self) -> None: # "monde" vs "rabbit" → distance > 2 → pas de match recall = searchability_recall("le monde", "le rabbit") # "le" matche, "monde" non → 1/2 = 0.5 assert recall == 0.5 def test_empty_gt_returns_zero(self) -> None: assert searchability_recall("", "hello") == 0.0 def test_multiplicity_respected(self) -> None: # GT a "le" deux fois, hyp une seule fois → 1/2 recall = searchability_recall("le le monde", "le monde") assert abs(recall - 2 / 3) < 1e-9 # "le", "monde" matchent (1 "le" non) def test_case_insensitive_by_default(self) -> None: assert searchability_recall("Bonjour", "bonjour") == 1.0 def test_negative_max_distance_raises(self) -> None: with pytest.raises(ValueError, match="max_distance"): searchability_recall("a", "b", max_distance=-1) class TestNumericalSequencePreservation: def test_perfect_year_preservation(self) -> None: score = numerical_sequence_preservation( "fait à Paris en 1789", "fait à Paris en 1789", ) assert score == 1.0 def test_year_corrupted(self) -> None: # GT contient "1789", hyp contient "1798" (pas dans hyp_years) # Mais "1798" est aussi une année 4 chiffres valide qui matche # le regex. Vérifions la sémantique : on cherche les années # GT dans les années hyp. score = numerical_sequence_preservation( "année 1789", "année 1798", ) # 1789 (GT) n'est PAS dans hyp_years = [1798] → 0/1 = 0.0 assert score == 0.0 def test_partial_preservation(self) -> None: score = numerical_sequence_preservation( "1789, 1799, 1815", "1789 et 1815", # 1799 perdu ) # 2/3 préservés assert abs(score - 2 / 3) < 1e-9 def test_no_years_in_gt(self) -> None: score = numerical_sequence_preservation( "pas de date ici", "pas de date là", ) assert score == 0.0 # convention : pas d'années GT → 0.0 def test_year_regex_bounds(self) -> None: # Année 999 → trop court (3 chiffres) # Année 1000 → OK # Année 2099 → hors plage (regex 2[0-2][0-9]) score = numerical_sequence_preservation("an 999 et 1000", "an 999 et 1000") # Seul "1000" est détecté en GT → comparé à hyp où "1000" présent aussi assert score == 1.0 # ────────────────────────────────────────────────────────────────── # SearchView shape # ────────────────────────────────────────────────────────────────── class TestSearchViewShape: def test_default_view_accepts_5_types(self) -> None: view = build_search_view() for t in ( ArtifactType.RAW_TEXT, ArtifactType.CORRECTED_TEXT, ArtifactType.ALTO_XML, ArtifactType.PAGE_XML, ArtifactType.CANONICAL_DOCUMENT, ): assert view.accepts(t) def test_default_metrics(self) -> None: view = build_search_view() assert view.metric_names == DEFAULT_SEARCH_METRICS def test_projection_for_alto_routes_correctly(self) -> None: view = build_search_view() spec = view.projection_for(ArtifactType.ALTO_XML) assert spec is not None assert spec.projector_name == "alto_to_text" def test_warnings_signal_higher_is_better_inversion(self) -> None: view = build_search_view() text = " ".join(view.warnings) assert "higher_is_better" in text or "OPPOSÉ" in text # ────────────────────────────────────────────────────────────────── # SearchView avec executor # ────────────────────────────────────────────────────────────────── def _build_search_executor(payloads: dict[str, str]) -> DefaultEvaluationViewExecutor: metrics = MetricRegistry() metrics.register( MetricSpec( name="searchability_recall", input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT), higher_is_better=True, ), searchability_recall, ) metrics.register( MetricSpec( name="numerical_sequence_preservation", input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT), higher_is_better=True, ), numerical_sequence_preservation, ) projectors = ProjectorRegistry() projectors.register(AltoToText()) projectors.register(PageToText()) projectors.register(CanonicalToText()) def loader(art: Artifact) -> str: if art.id not in payloads: raise KeyError(art.id) return payloads[art.id] return DefaultEvaluationViewExecutor.from_registries(metrics, projectors, loader) class TestSearchViewWithExecutor: def test_perfect_text_yields_recall_1(self) -> None: payloads = { "cand": "le petit chat noir 1789", "gt": "le petit chat noir 1789", } executor = _build_search_executor(payloads) view = build_search_view() cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT) gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT) result = executor.evaluate(view, cand, gt, pipeline_name="test") assert result.metric_values["searchability_recall"] == 1.0 assert result.metric_values["numerical_sequence_preservation"] == 1.0 def test_partial_text_quality_with_year_loss(self) -> None: payloads = { "cand": "le pelit chat noir 1798", # erreur typo + année corrompue "gt": "le petit chat noir 1789", } executor = _build_search_executor(payloads) view = build_search_view() cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT) gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT) result = executor.evaluate(view, cand, gt, pipeline_name="test") # "petit"→"pelit" = 1 sub, OK ; "1789"→"1798" = 2 subs, OK pour # searchability fuzzy. Donc searchability_recall ≈ 1.0. assert result.metric_values["searchability_recall"] >= 0.8 # Mais l'année 1789 N'EST PAS dans hyp → preservation = 0. assert result.metric_values["numerical_sequence_preservation"] == 0.0