"""Tests Sprint 86 — A.II.5 bout-en-bout : helpers runner + rendu HTML. Couvre : 1. ``compute_searchability_metrics`` adaptive masking. 2. ``aggregate_searchability_metrics`` micro-recall. 3. ``compute_numerical_sequence_metrics_adaptive`` masking. 4. ``aggregate_numerical_sequence_metrics`` somme par catégorie. 5. Champs ``DocumentResult.searchability_metrics`` et ``EngineReport.aggregated_searchability``. 6. Rendu HTML adaptive + anti-injection. 7. Complétude i18n FR/EN. """ from __future__ import annotations import json from pathlib import Path from picarones.evaluation.metrics.numerical_sequences_hooks import ( aggregate_numerical_sequence_metrics, compute_numerical_sequence_metrics_adaptive, ) from picarones.evaluation.metric_result import MetricsResult from picarones.evaluation.benchmark_result import DocumentResult, EngineReport def _stub_metrics() -> MetricsResult: return MetricsResult( cer=0.0, cer_nfc=0.0, cer_caseless=0.0, wer=0.0, wer_normalized=0.0, mer=0.0, wil=0.0, reference_length=0, hypothesis_length=0, ) from picarones.evaluation.metrics.searchability_hooks import ( aggregate_searchability_metrics, compute_searchability_metrics, ) from picarones.reports.html.renderers.numerical_sequences import ( build_numerical_sequences_html, ) from picarones.reports.html.renderers.searchability import ( build_searchability_summary_html, ) def _load_labels(lang: str) -> dict: p = ( Path(__file__).parent.parent.parent / "picarones" / "reports" / "i18n" / f"{lang}.json" ) return json.loads(p.read_text(encoding="utf-8")) # ────────────────────────────────────────────────────────────────────────── # 1. 
# Searchability helpers
# ──────────────────────────────────────────────────────────────────────────


class TestSearchabilityRunner:
    """Per-document searchability metrics and their aggregation."""

    def test_empty_gt_returns_none(self) -> None:
        """An empty ground truth yields ``None`` instead of a metrics dict."""
        outcome = compute_searchability_metrics("", "anything")
        assert outcome is None

    def test_normal(self) -> None:
        """A two-token ground truth reports recall 1.0 for this GT/OCR pair."""
        outcome = compute_searchability_metrics("le roi", "le roy")
        assert outcome is not None
        assert outcome["recall"] == 1.0
        assert outcome["n_gt_tokens"] == 2

    def test_aggregate_micro_recall(self) -> None:
        """Aggregation sums the token counts and derives recall from the sums."""
        per_document = [
            {"n_gt_tokens": 10, "n_searchable": 9, "missed_tokens": ["x"]},
            {"n_gt_tokens": 20, "n_searchable": 15, "missed_tokens": ["y"]},
        ]
        combined = aggregate_searchability_metrics(per_document)
        assert combined is not None
        # Checked in insertion order, one key at a time.
        expected = {
            "n_gt_tokens": 30,
            "n_searchable": 24,
            "recall": 24 / 30,
            "n_docs": 2,
        }
        for key, value in expected.items():
            assert combined[key] == value

    def test_aggregate_empty(self) -> None:
        """Only-``None`` input and empty input both aggregate to ``None``."""
        for documents in ([None, None], []):
            assert aggregate_searchability_metrics(documents) is None


# ──────────────────────────────────────────────────────────────────────────
# 2.
# Numerical sequences helpers
# ──────────────────────────────────────────────────────────────────────────


class TestNumericalSequencesRunner:
    """Per-document numerical-sequence metrics and their aggregation."""

    @staticmethod
    def _category(n_total: int, matched: int, *lost: str) -> dict:
        """Build one ``per_category`` entry for the aggregation fixtures.

        ``strict`` and ``value`` share the same count, and both scores are
        ``matched / n_total`` (0.0 when the category is empty).
        """
        score = matched / n_total if n_total else 0.0
        return {
            "n_total": n_total,
            "strict": matched,
            "value": matched,
            "strict_score": score,
            "value_score": score,
            "lost_items": list(lost),
        }

    def test_no_signal_returns_none(self) -> None:
        """A ground truth without any numerical sequence yields ``None``."""
        outcome = compute_numerical_sequence_metrics_adaptive(
            "lorem ipsum dolor",
            "sit amet",
        )
        assert outcome is None

    def test_signal_present(self) -> None:
        """A ground truth with numerical sequences yields a non-empty result."""
        outcome = compute_numerical_sequence_metrics_adaptive(
            "an III, 1789",
            "an III, 1789",
        )
        assert outcome is not None
        assert outcome["n_total"] >= 1

    def test_aggregate_sums_per_category(self) -> None:
        """Counts are summed per category and scores recomputed from the sums."""
        category = self._category
        first = {
            "n_total": 3,
            "global_strict_score": 1.0,
            "global_value_score": 1.0,
            "per_category": {
                "year": category(2, 2),
                "roman": category(1, 1),
                "foliation": category(0, 0),
                "currency": category(0, 0),
                "regnal": category(0, 0),
            },
        }
        second = {
            "n_total": 4,
            "global_strict_score": 0.5,
            "global_value_score": 0.5,
            "per_category": {
                "year": category(4, 2, "1500", "1600"),
                "roman": category(0, 0),
                "foliation": category(0, 0),
                "currency": category(0, 0),
                "regnal": category(0, 0),
            },
        }
        agg = aggregate_numerical_sequence_metrics([first, second])
        assert agg["n_total"] == 7
        year = agg["per_category"]["year"]
        assert year["n_total"] == 6
        assert year["strict"] == 4
        assert year["strict_score"] == 4 / 6
        # global = (2+1 + 2) / 7 = 5/7
        assert agg["global_strict_score"] == 5 / 7

    def test_aggregate_empty(self) -> None:
        """A list holding only ``None`` aggregates to ``None``."""
        outcome = aggregate_numerical_sequence_metrics([None])
        assert outcome is None


# ──────────────────────────────────────────────────────────────────────────
# 3. results.py fields
# ──────────────────────────────────────────────────────────────────────────


class TestResultsFields:
    """Serialization of the searchability / numerical-sequence result fields."""

    @staticmethod
    def _document_kwargs() -> dict:
        """Keyword arguments shared by every ``DocumentResult`` built here."""
        return {
            "doc_id": "doc1",
            "image_path": "x.png",
            "ground_truth": "hello",
            "hypothesis": "helo",
            "metrics": _stub_metrics(),
            "duration_seconds": 1.0,
        }

    def test_document_result_serializes_searchability(self) -> None:
        """Both optional metric dicts appear unchanged in ``as_dict()``."""
        document = DocumentResult(
            **self._document_kwargs(),
            searchability_metrics={"recall": 0.9},
            numerical_sequence_metrics={"n_total": 1},
        )
        payload = document.as_dict()
        assert payload["searchability_metrics"] == {"recall": 0.9}
        assert payload["numerical_sequence_metrics"] == {"n_total": 1}

    def test_document_result_omits_when_none(self) -> None:
        """Without the optional metrics, their keys are absent from ``as_dict()``."""
        document = DocumentResult(**self._document_kwargs())
        payload = document.as_dict()
        for field in ("searchability_metrics", "numerical_sequence_metrics"):
            assert field not in payload

    def test_compact_clears_fields(self) -> None:
        """``compact(drop_analyses=True)`` resets both optional metric fields."""
        dr = DocumentResult(
            **self._document_kwargs(),
            searchability_metrics={"recall": 0.9},
            numerical_sequence_metrics={"n_total": 1},
        )
        # Sprint A14-S1 — opt-in via drop_analyses=True.
dr.compact(drop_analyses=True) assert dr.searchability_metrics is None assert dr.numerical_sequence_metrics is None def test_engine_report_serializes_aggregates(self) -> None: er = EngineReport( engine_name="t", engine_version="0", engine_config={}, document_results=[], pipeline_info=None, aggregated_searchability={"recall": 0.85}, aggregated_numerical_sequences={"global_strict_score": 0.9}, ) d = er.as_dict() assert d["aggregated_searchability"]["recall"] == 0.85 assert d["aggregated_numerical_sequences"]["global_strict_score"] == 0.9 def test_engine_report_omits_when_none(self) -> None: er = EngineReport( engine_name="t", engine_version="0", engine_config={}, document_results=[], pipeline_info=None, ) d = er.as_dict() assert "aggregated_searchability" not in d assert "aggregated_numerical_sequences" not in d # ────────────────────────────────────────────────────────────────────────── # 4. Rendu HTML # ────────────────────────────────────────────────────────────────────────── class TestSearchabilityHtml: def test_empty_returns_empty(self) -> None: assert build_searchability_summary_html([]) == "" def test_no_signal_returns_empty(self) -> None: engines = [{"name": "t"}] # pas de aggregated_searchability assert build_searchability_summary_html(engines) == "" def test_renders_table_with_recall(self) -> None: engines = [{ "name": "tess", "aggregated_searchability": { "recall": 0.92, "n_searchable": 92, "n_gt_tokens": 100, "n_docs": 5, }, }] html = build_searchability_summary_html( engines, _load_labels("fr"), ) assert "