Spaces:
Sleeping
Sleeping
Claude
feat(sprint-S8): cohรฉrence finale โ renames test dirs, /metrics endpoint, SBOM workflow
43478ec unverified | """Tests Sprint 46 โ vue HTML stratifiรฉe + dรฉtecteur narratif. | |
| Couvre : | |
| 1. ``build_stratified_ranking_html`` rend un ``<details>`` par strate | |
| avec tableau moteur ร (mรฉdiane, moyenne, docs). | |
| 2. Bandeau d'hรฉtรฉrogรฉnรฉitรฉ affichรฉ si ``corpus_homogeneity`` fourni. | |
| 3. **Masquage adaptatif** : retourne ``""`` si pas de strates. | |
| 4. **Anti-injection** : noms de strates et de moteurs avec balises | |
| HTML sont รฉchappรฉs. | |
| 5. **Dรฉtecteur ``STRATIFICATION_RECOMMENDED``** : | |
| - se dรฉclenche au-delร de 5 points d'รฉcart inter-strate | |
| - importance HIGH au-delร de 10 points, MEDIUM sinon | |
| - ne se dรฉclenche pas sans corpus_homogeneity | |
| 6. **Anti-hallucination** : chaque nombre rendu est dans le payload. | |
| 7. **Intรฉgration ReportGenerator** : la section apparaรฎt dans | |
| ``view_ranking`` quand ``doc_strata`` est peuplรฉ. | |
| 8. **i18n FR/EN** : clรฉs prรฉsentes pour la vue + le template narratif. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
| import pytest | |
| from picarones.evaluation.metric_result import MetricsResult | |
| from picarones.reports.narrative.detectors import detect_stratification_recommended | |
| from picarones.domain.facts import FactImportance, FactType | |
| from picarones.reports.narrative.renderer import extract_numbers, render_fact | |
| from picarones.evaluation.benchmark_result import DocumentResult | |
| from picarones.reports.html.generator import ReportGenerator | |
| from picarones.reports.html.renderers.stratification import build_stratified_ranking_html | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # Helpers | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| _SAMPLE_STRAT = { | |
| "gothique": [ | |
| {"engine": "pero", "median_cer": 0.05, "mean_cer": 0.07, "documents": 10}, | |
| {"engine": "tess", "median_cer": 0.20, "mean_cer": 0.22, "documents": 10}, | |
| ], | |
| "imprimรฉ": [ | |
| {"engine": "tess", "median_cer": 0.02, "mean_cer": 0.03, "documents": 10}, | |
| {"engine": "pero", "median_cer": 0.05, "mean_cer": 0.06, "documents": 10}, | |
| ], | |
| } | |
| _SAMPLE_STRATA = ["gothique", "imprimรฉ"] | |
| _SAMPLE_HOMOG = { | |
| "leader": "tess", | |
| "n_strata": 2, | |
| "max_inter_strata_gap": 0.18, | |
| "leader_max_gap_strata": ["imprimรฉ", "gothique"], | |
| "leader_per_stratum_median": {"imprimรฉ": 0.02, "gothique": 0.20}, | |
| } | |
| def _make_dr(doc_id: str, cer: float) -> DocumentResult: | |
| return DocumentResult( | |
| doc_id=doc_id, image_path=f"/tmp/{doc_id}.png", | |
| ground_truth="x", hypothesis="x", | |
| metrics=MetricsResult( | |
| cer=cer, cer_nfc=cer, cer_caseless=cer, | |
| wer=cer, wer_normalized=cer, mer=cer, wil=cer, | |
| reference_length=1, hypothesis_length=1, | |
| ), | |
| duration_seconds=0.1, | |
| ) | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 1-2. build_stratified_ranking_html | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| class TestRendering: | |
| def test_renders_one_details_per_stratum(self) -> None: | |
| html = build_stratified_ranking_html( | |
| _SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG, | |
| ) | |
| assert html.count("<details") == 2 | |
| # Premier ouvert | |
| assert "<details" in html and " open" in html | |
| def test_includes_engine_metrics(self) -> None: | |
| html = build_stratified_ranking_html( | |
| _SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG, | |
| ) | |
| # Mรฉdianes en pourcentage | |
| assert "5.00 %" in html # pero gothique | |
| assert "20.00 %" in html # tess gothique | |
| assert "2.00 %" in html # tess imprimรฉ | |
| def test_homogeneity_banner_present(self) -> None: | |
| html = build_stratified_ranking_html( | |
| _SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG, | |
| ) | |
| # Le bandeau d'avertissement doit apparaรฎtre | |
| assert "tess" in html | |
| assert "18.0" in html | |
| def test_no_homogeneity_no_banner(self) -> None: | |
| html = build_stratified_ranking_html( | |
| _SAMPLE_STRAT, _SAMPLE_STRATA, homogeneity=None, | |
| ) | |
| # Pas de bandeau jaune | |
| assert "#fff8e1" not in html | |
| def test_uses_i18n_labels(self) -> None: | |
| labels = { | |
| "stratification_caption": "CUSTOM_CAPTION", | |
| "stratification_median_label": "MED", | |
| "stratification_mean_label": "MEAN", | |
| } | |
| html = build_stratified_ranking_html( | |
| _SAMPLE_STRAT, _SAMPLE_STRATA, None, labels=labels, | |
| ) | |
| assert "CUSTOM_CAPTION" in html | |
| assert "MED" in html | |
| assert "MEAN" in html | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 3. Masquage adaptatif | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| class TestAdaptiveMasking: | |
| def test_empty_when_no_stratified_ranking(self) -> None: | |
| assert build_stratified_ranking_html(None, ["S1"]) == "" | |
| assert build_stratified_ranking_html({}, ["S1"]) == "" | |
| def test_empty_when_no_available_strata(self) -> None: | |
| assert build_stratified_ranking_html(_SAMPLE_STRAT, None) == "" | |
| assert build_stratified_ranking_html(_SAMPLE_STRAT, []) == "" | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 4. Anti-injection | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| class TestAntiInjection: | |
| def test_engine_name_escaped(self) -> None: | |
| bad_strat = { | |
| "S1": [ | |
| {"engine": "<script>alert(1)</script>", | |
| "median_cer": 0.1, "mean_cer": 0.1, "documents": 1}, | |
| ], | |
| } | |
| html = build_stratified_ranking_html(bad_strat, ["S1"]) | |
| assert "<script>" not in html | |
| assert "<script>" in html | |
| def test_stratum_name_escaped(self) -> None: | |
| bad_strat = { | |
| "<img src=x>": [ | |
| {"engine": "a", "median_cer": 0.1, | |
| "mean_cer": 0.1, "documents": 1}, | |
| ], | |
| } | |
| html = build_stratified_ranking_html(bad_strat, ["<img src=x>"]) | |
| assert "<img src=x>" not in html | |
| assert "<img" in html | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 5. Dรฉtecteur STRATIFICATION_RECOMMENDED | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| def _data(gap: float, **overrides) -> dict: | |
| homog = { | |
| "leader": "tess", "n_strata": 2, | |
| "max_inter_strata_gap": gap, | |
| "leader_max_gap_strata": ["S1", "S2"], | |
| "leader_per_stratum_median": {"S1": 0.02, "S2": 0.02 + gap}, | |
| } | |
| homog.update(overrides) | |
| return {"corpus_homogeneity": homog} | |
| class TestStratificationDetector: | |
| def test_no_fact_below_threshold(self) -> None: | |
| # 4 points โ en dessous du seuil 5 points | |
| assert detect_stratification_recommended(_data(0.04)) == [] | |
| def test_emits_fact_above_threshold(self) -> None: | |
| facts = detect_stratification_recommended(_data(0.07)) | |
| assert len(facts) == 1 | |
| assert facts[0].type is FactType.STRATIFICATION_RECOMMENDED | |
| def test_medium_below_10pts(self) -> None: | |
| facts = detect_stratification_recommended(_data(0.07)) | |
| assert facts[0].importance is FactImportance.MEDIUM | |
| def test_high_above_10pts(self) -> None: | |
| facts = detect_stratification_recommended(_data(0.18)) | |
| assert facts[0].importance is FactImportance.HIGH | |
| def test_no_homogeneity_no_fact(self) -> None: | |
| assert detect_stratification_recommended({}) == [] | |
| assert detect_stratification_recommended({"corpus_homogeneity": None}) == [] | |
| def test_payload_carries_strata_and_cers(self) -> None: | |
| facts = detect_stratification_recommended(_data(0.18)) | |
| p = facts[0].payload | |
| assert p["leader"] == "tess" | |
| assert p["n_strata"] == 2 | |
| assert p["min_stratum"] == "S1" | |
| assert p["max_stratum"] == "S2" | |
| assert p["gap_pct"] == 18.0 | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 6. Anti-hallucination | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| class TestTraceability: | |
| def test_every_rendered_number_is_in_payload(self, lang: str) -> None: | |
| # On utilise des noms de strates sans chiffres (la traรงabilitรฉ | |
| # exige que tout chiffre rendu vienne du payload, mais les | |
| # noms de strates cรดtรฉ GT peuvent lรฉgitimement contenir des | |
| # chiffres ; pour le test on isole les nombres "mรฉtriques"). | |
| data = {"corpus_homogeneity": { | |
| "leader": "tess", "n_strata": 2, | |
| "max_inter_strata_gap": 0.18, | |
| "leader_max_gap_strata": ["impr", "goth"], | |
| "leader_per_stratum_median": {"impr": 0.02, "goth": 0.20}, | |
| }} | |
| facts = detect_stratification_recommended(data) | |
| sentence = render_fact(facts[0], lang) | |
| payload_nums: set[str] = set() | |
| for v in facts[0].payload.values(): | |
| if isinstance(v, (int, float)): | |
| payload_nums.add(str(v)) | |
| if isinstance(v, float) and v.is_integer(): | |
| payload_nums.add(str(int(v))) | |
| elif isinstance(v, str): | |
| # Capture aussi les chiffres prรฉsents dans les chaรฎnes | |
| # du payload (ex. noms de strates contenant un nombre) | |
| for match in re.findall(r"\d+(?:[.,]\d+)?", v): | |
| payload_nums.add(match.replace(",", ".")) | |
| for num in extract_numbers(sentence): | |
| normalized = num.replace(",", ".") | |
| assert normalized in payload_nums, ( | |
| f"Nombre {normalized!r} non traรงable au payload " | |
| f"{facts[0].payload!r}" | |
| ) | |
| def test_template_has_no_hardcoded_numbers(self) -> None: | |
| from picarones.reports.narrative.renderer import _load_templates | |
| for lang in ("fr", "en"): | |
| tpl = _load_templates(lang).get("stratification_recommended", "") | |
| assert tpl, f"Template absent pour {lang}" | |
| cleaned = re.sub(r"\{[^}]+\}", "", tpl) | |
| digits = re.findall(r"\d", cleaned) | |
| assert not digits, f"Template {lang} contient des chiffres en dur : {digits}" | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 7. Intรฉgration ReportGenerator | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| class TestReportIntegration: | |
| def test_section_absent_without_strata(self, tmp_path: Path) -> None: | |
| from picarones.evaluation.synthetic import generate_sample_benchmark | |
| bench = generate_sample_benchmark() | |
| bench.doc_strata = None # force absence | |
| out = tmp_path / "report.html" | |
| ReportGenerator(bench).generate(out) | |
| html = out.read_text(encoding="utf-8") | |
| assert "stratified-ranking" not in html | |
| def test_section_present_with_strata(self, tmp_path: Path) -> None: | |
| from picarones.evaluation.synthetic import generate_sample_benchmark | |
| bench = generate_sample_benchmark() | |
| # La fixture peuple image_quality.script_type ; on extrait | |
| # manuellement comme le ferait le runner. | |
| strata_map: dict[str, str] = {} | |
| for r in bench.engine_reports: | |
| for dr in r.document_results: | |
| if dr.image_quality and dr.image_quality.get("script_type"): | |
| strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"]) | |
| bench.doc_strata = strata_map | |
| out = tmp_path / "report.html" | |
| ReportGenerator(bench).generate(out) | |
| html = out.read_text(encoding="utf-8") | |
| assert "stratified-ranking" in html | |
| # Au moins un <details> rendu | |
| assert "<details" in html | |
| def test_french_locale_uses_french_labels(self, tmp_path: Path) -> None: | |
| from picarones.evaluation.synthetic import generate_sample_benchmark | |
| bench = generate_sample_benchmark() | |
| strata_map = {} | |
| for r in bench.engine_reports: | |
| for dr in r.document_results: | |
| if dr.image_quality and dr.image_quality.get("script_type"): | |
| strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"]) | |
| bench.doc_strata = strata_map | |
| out = tmp_path / "report_fr.html" | |
| ReportGenerator(bench, lang="fr").generate(out) | |
| html = out.read_text(encoding="utf-8") | |
| assert "Classement par strate" in html | |
| assert "Mรฉdiane CER" in html | |
| def test_english_locale_uses_english_labels(self, tmp_path: Path) -> None: | |
| from picarones.evaluation.synthetic import generate_sample_benchmark | |
| bench = generate_sample_benchmark() | |
| strata_map = {} | |
| for r in bench.engine_reports: | |
| for dr in r.document_results: | |
| if dr.image_quality and dr.image_quality.get("script_type"): | |
| strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"]) | |
| bench.doc_strata = strata_map | |
| out = tmp_path / "report_en.html" | |
| ReportGenerator(bench, lang="en").generate(out) | |
| html = out.read_text(encoding="utf-8") | |
| assert "Ranking by stratum" in html | |
| assert "Median CER" in html | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # 8. i18n FR/EN | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| REQUIRED_KEYS = ( | |
| "stratification_caption", | |
| "stratification_description", | |
| "stratification_median_label", | |
| "stratification_mean_label", | |
| "stratification_docs_label", | |
| "stratification_no_data_label", | |
| "stratification_n_docs_label", | |
| "stratification_gap_summary", | |
| ) | |
| class TestI18NCompleteness: | |
| def test_key_present(self, lang: str, key: str) -> None: | |
| path = ( | |
| Path(__file__).parent.parent.parent | |
| / "picarones" / "reports" / "i18n" / f"{lang}.json" | |
| ) | |
| data = json.loads(path.read_text(encoding="utf-8")) | |
| assert key in data, f"Clรฉ {key!r} manquante dans {lang}.json" | |
| assert data[key].strip(), f"Clรฉ {key!r} vide dans {lang}.json" | |