Spaces:
Running
Running
| """Sprint A14-S16 β sanity check inter-vues sur le cas BnF central. | |
| VΓ©rifie qu'un mΓͺme pipeline a une cohΓ©rence (et parfois une | |
| divergence intΓ©ressante) entre TextView, AltoView et SearchView. | |
| Cas dΓ©montrΓ©s : | |
| - Pipeline parfait β toutes vues maximisent. | |
| - Pipeline avec erreur sur une annΓ©e β SearchView baisse fortement, | |
| TextView baisse légèrement (pattern "perte de données critiques | |
| invisible au CER global"). | |
| - Pipeline sans ALTO β AltoView l'OMET, autres vues l'Γ©valuent. | |
| """ | |
| from __future__ import annotations | |
| from picarones.domain import Artifact, ArtifactType, MetricSpec | |
| from picarones.evaluation.metrics.alto_structural import ( | |
| compute_alto_validity, | |
| compute_line_count_ratio, | |
| compute_word_box_coverage, | |
| ) | |
| from picarones.evaluation.metrics.search import ( | |
| numerical_sequence_preservation, | |
| searchability_recall, | |
| ) | |
| from picarones.evaluation.projectors import ( | |
| AltoToText, | |
| CanonicalToText, | |
| PageToText, | |
| ProjectorRegistry, | |
| ) | |
| from picarones.evaluation.registry import MetricRegistry | |
| from picarones.evaluation.views import ( | |
| DefaultEvaluationViewExecutor, | |
| build_alto_view, | |
| build_search_view, | |
| build_text_view, | |
| ) | |
| from picarones.formats.alto.types import ( | |
| AltoBBox, | |
| AltoDocument, | |
| AltoLine, | |
| AltoPage, | |
| AltoString, | |
| AltoTextBlock, | |
| ) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Stubs mΓ©triques texte (cer/wer simplifiΓ©s sans jiwer) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _stub_cer(reference: str, hypothesis: str) -> float: | |
| if not reference: | |
| return 0.0 if not hypothesis else 1.0 | |
| common = sum(1 for a, b in zip(reference, hypothesis) if a == b) | |
| return 1.0 - (common / max(len(reference), len(hypothesis))) | |
| def _stub_wer(reference: str, hypothesis: str) -> float: | |
| ref_w = reference.split() | |
| hyp_w = hypothesis.split() | |
| if not ref_w: | |
| return 0.0 if not hyp_w else 1.0 | |
| common = sum(1 for a, b in zip(ref_w, hyp_w) if a == b) | |
| return 1.0 - (common / len(ref_w)) | |
| def _build_unified_executor(payloads: dict) -> DefaultEvaluationViewExecutor: | |
| """Executor configurΓ© pour TextView + AltoView + SearchView.""" | |
| metrics = MetricRegistry() | |
| # TextView metrics | |
| for name, fn in ( | |
| ("cer", _stub_cer), | |
| ("wer", _stub_wer), | |
| ("mer", _stub_cer), | |
| ("wil", _stub_wer), | |
| ): | |
| metrics.register( | |
| MetricSpec( | |
| name=name, | |
| input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT), | |
| ), | |
| fn, | |
| ) | |
| # AltoView metrics | |
| for name, fn in ( | |
| ("alto_validity", compute_alto_validity), | |
| ("alto_line_count_ratio", compute_line_count_ratio), | |
| ("alto_word_box_coverage", compute_word_box_coverage), | |
| ): | |
| metrics.register( | |
| MetricSpec( | |
| name=name, | |
| input_types=(ArtifactType.ALTO_XML, ArtifactType.ALTO_XML), | |
| higher_is_better=True, | |
| ), | |
| fn, | |
| ) | |
| # SearchView metrics | |
| metrics.register( | |
| MetricSpec( | |
| name="searchability_recall", | |
| input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT), | |
| higher_is_better=True, | |
| ), | |
| searchability_recall, | |
| ) | |
| metrics.register( | |
| MetricSpec( | |
| name="numerical_sequence_preservation", | |
| input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT), | |
| higher_is_better=True, | |
| ), | |
| numerical_sequence_preservation, | |
| ) | |
| projectors = ProjectorRegistry() | |
| projectors.register(AltoToText()) | |
| projectors.register(PageToText()) | |
| projectors.register(CanonicalToText()) | |
| def loader(art: Artifact): | |
| if art.id not in payloads: | |
| raise KeyError(art.id) | |
| return payloads[art.id] | |
| return DefaultEvaluationViewExecutor.from_registries(metrics, projectors, loader) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Cas 1 β pipeline parfait | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestPerfectPipelineAcrossViews: | |
| def test_perfect_text_pipeline_maximizes_text_and_search(self) -> None: | |
| """Un pipeline qui produit du texte parfait : | |
| - TextView : CER = 0 | |
| - SearchView : recall = 1.0, year preservation = 1.0 | |
| - AltoView : OMIS (pas d'ALTO produit). | |
| """ | |
| gt_text = "Bonjour Paris en 1789" | |
| payloads = {"cand": gt_text, "gt_text": gt_text} | |
| executor = _build_unified_executor(payloads) | |
| text_view = build_text_view() | |
| search_view = build_search_view() | |
| alto_view = build_alto_view() | |
| cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT) | |
| gt = Artifact(id="gt_text", document_id="d", type=ArtifactType.RAW_TEXT) | |
| text_result = executor.evaluate(text_view, cand, gt, pipeline_name="test") | |
| search_result = executor.evaluate(search_view, cand, gt, pipeline_name="test") | |
| assert text_result.metric_values["cer"] == 0.0 | |
| assert search_result.metric_values["searchability_recall"] == 1.0 | |
| assert search_result.metric_values["numerical_sequence_preservation"] == 1.0 | |
| # AltoView OMIS : le caller doit filtrer. | |
| assert not alto_view.accepts(cand.type) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Cas 2 β divergence TextView β SearchView | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestDivergencePattern: | |
| def test_year_corruption_invisible_to_cer_visible_to_search(self) -> None: | |
| """Pattern critique : une corruption d'année (1 caractère | |
| sur ~50) est invisible cΓ΄tΓ© CER mais catastrophique cΓ΄tΓ© | |
| recherchabilitΓ© numΓ©rique. | |
| C'est prΓ©cisΓ©ment ce que le rapport BnF doit rendre | |
| visible β les deux vues racontent des histoires | |
| complΓ©mentaires. | |
| """ | |
| gt_text = "Charte signΓ©e Γ Paris le 14 juillet 1789 en prΓ©sence du roi" | |
| # Hypothèse : le LLM a "corrigé" 1789 en 1798 (faute grossière). | |
| # Le reste du texte est identique. | |
| cand_text = "Charte signΓ©e Γ Paris le 14 juillet 1798 en prΓ©sence du roi" | |
| payloads = {"cand": cand_text, "gt": gt_text} | |
| executor = _build_unified_executor(payloads) | |
| cand = Artifact(id="cand", document_id="d", type=ArtifactType.RAW_TEXT) | |
| gt = Artifact(id="gt", document_id="d", type=ArtifactType.RAW_TEXT) | |
| text_result = executor.evaluate(build_text_view(), cand, gt, pipeline_name="test") | |
| search_result = executor.evaluate(build_search_view(), cand, gt, pipeline_name="test") | |
| # CER β 0.03 (3 chars sur ~58) | |
| assert text_result.metric_values["cer"] < 0.1, "CER doit rester faible" | |
| # WER : 1 mot changΓ© sur 11 β 1/11 β 0.09 | |
| assert text_result.metric_values["wer"] < 0.15 | |
| # Mais SearchView : 1789 (GT) n'est PAS dans hyp_years = [1798] | |
| # β preservation = 0.0 (catastrophique pour un historien). | |
| assert search_result.metric_values["numerical_sequence_preservation"] == 0.0 | |
| # Searchability : "1789" GT n'est pas matchΓ© Γ "1798" (distance 2, | |
| # MAIS la longueur est Γ©gale, fuzziness β€ 2 le matche). | |
| # On vΓ©rifie juste qu'il y a un signal mesurable. | |
| assert search_result.metric_values["searchability_recall"] >= 0.8 | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Cas 3 β pipeline ALTO Γ©valuable dans les 3 vues | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _build_simple_alto(words: list[str], n_lines: int = 1) -> AltoDocument: | |
| """Construit un AltoDocument avec ``words`` rΓ©partis sur | |
| ``n_lines`` lignes, chaque mot avec une bbox.""" | |
| chunks = [words[i::n_lines] for i in range(n_lines)] | |
| lines = tuple( | |
| AltoLine(strings=tuple( | |
| AltoString( | |
| content=w, | |
| bbox=AltoBBox(hpos=0, vpos=0, width=10, height=10), | |
| ) | |
| for w in chunk | |
| )) | |
| for chunk in chunks | |
| ) | |
| return AltoDocument(pages=(AltoPage(blocks=(AltoTextBlock(lines=lines),),),),) | |
| class TestAltoPipelineEvaluatedInThreeViews: | |
| def test_alto_pipeline_has_text_alto_search_results(self, tmp_path) -> None: | |
| """Un pipeline qui produit ALTO_XML est Γ©valuable dans les | |
| 3 vues : TextView (via projection), AltoView (direct), | |
| SearchView (via projection). | |
| """ | |
| from picarones.formats.alto import write_alto | |
| words_gt = "Charte signΓ©e Paris 14 juillet 1789".split() | |
| words_cand = "Charte signΓ©e Paris 14 juillet 1789".split() # identique | |
| # n_lines=1 pour prΓ©server l'ordre des mots dans l'extraction | |
| # (sinon ``alto_document_to_text`` produit des sauts de ligne | |
| # qui font diverger le CER d'une comparaison ligne unique). | |
| gt_alto = _build_simple_alto(words_gt, n_lines=1) | |
| cand_alto = _build_simple_alto(words_cand, n_lines=1) | |
| cand_alto_path = tmp_path / "cand.alto.xml" | |
| cand_alto_path.write_bytes(write_alto(cand_alto)) | |
| # Payloads : raw text pour les payloads projetΓ©s depuis ALTO, | |
| # AltoDocument pour la GT et le candidat ALTO direct. | |
| from picarones.evaluation.projectors import alto_document_to_text | |
| payloads = { | |
| "gt_text": " ".join(words_gt), | |
| "gt_alto": gt_alto, | |
| "cand": cand_alto, # AltoDocument pour AltoView | |
| "cand:projected_text": alto_document_to_text(cand_alto), | |
| } | |
| executor = _build_unified_executor(payloads) | |
| gt_text_art = Artifact(id="gt_text", document_id="d", type=ArtifactType.RAW_TEXT) | |
| gt_alto_art = Artifact(id="gt_alto", document_id="d", type=ArtifactType.ALTO_XML) | |
| cand_art = Artifact( | |
| id="cand", document_id="d", | |
| type=ArtifactType.ALTO_XML, uri=str(cand_alto_path), | |
| ) | |
| # TextView : projette ALTO β texte, compare au gt_text. | |
| text_result = executor.evaluate(build_text_view(), cand_art, gt_text_art, pipeline_name="test") | |
| assert text_result.metric_values["cer"] == 0.0 | |
| # SearchView : projette ALTO β texte, mesure recall + annΓ©es. | |
| search_result = executor.evaluate(build_search_view(), cand_art, gt_text_art, pipeline_name="test") | |
| assert search_result.metric_values["searchability_recall"] == 1.0 | |
| # AltoView : compare ALTO direct contre ALTO GT. | |
| alto_result = executor.evaluate(build_alto_view(), cand_art, gt_alto_art, pipeline_name="test") | |
| assert alto_result.metric_values["alto_validity"] == 1.0 | |
| assert alto_result.metric_values["alto_line_count_ratio"] == 1.0 | |
| assert alto_result.metric_values["alto_word_box_coverage"] == 1.0 | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # CohΓ©rence globale : projection report prΓ©sent ssi projection appliquΓ©e | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestProjectionReportConsistency: | |
| def test_text_search_views_share_projection_report_pattern(self) -> None: | |
| """Pour un mΓͺme candidat ALTO_XML Γ©valuΓ© dans TextView et | |
| SearchView, les deux ViewResult doivent porter un | |
| projection_report (les deux vues projettent vers texte).""" | |
| gt_text = "test" | |
| gt_alto = _build_simple_alto(["test"], n_lines=1) | |
| from picarones.evaluation.projectors import alto_document_to_text | |
| from picarones.formats.alto import write_alto | |
| # Pour ce test on n'a pas besoin du fichier rΓ©el β on simule | |
| # via le payload_loader qui retourne directement le texte | |
| # extrait pour l'id "cand:projected_text". | |
| payloads = { | |
| "gt_text": gt_text, | |
| "cand:projected_text": alto_document_to_text(gt_alto), | |
| } | |
| # Mais le projecteur a besoin d'un URI. On contourne en | |
| # crΓ©ant un fichier temporaire dans pytest fixture. | |
| # Pour ce test simple on Γ©crit dans /tmp. | |
| import tempfile | |
| with tempfile.NamedTemporaryFile(suffix=".alto.xml", delete=False) as f: | |
| f.write(write_alto(gt_alto)) | |
| cand_uri = f.name | |
| executor = _build_unified_executor(payloads) | |
| cand = Artifact( | |
| id="cand", document_id="d", | |
| type=ArtifactType.ALTO_XML, uri=cand_uri, | |
| ) | |
| gt = Artifact(id="gt_text", document_id="d", type=ArtifactType.RAW_TEXT) | |
| text_result = executor.evaluate(build_text_view(), cand, gt, pipeline_name="test") | |
| search_result = executor.evaluate(build_search_view(), cand, gt, pipeline_name="test") | |
| # Les deux doivent avoir un projection_report (mΓͺme projecteur). | |
| assert text_result.projection_report is not None | |
| assert search_result.projection_report is not None | |
| assert text_result.projection_report.projector_name == "alto_to_text" | |
| assert search_result.projection_report.projector_name == "alto_to_text" | |