Picarones / tests /reports /test_sprint46_stratification_html.py
Claude
feat(sprint-S8): cohรฉrence finale โ€” renames test dirs, /metrics endpoint, SBOM workflow
43478ec unverified
Raw
History Blame
16.7 kB
"""Tests Sprint 46 โ€” vue HTML stratifiรฉe + dรฉtecteur narratif.
Couvre :
1. ``build_stratified_ranking_html`` rend un ``<details>`` par strate
avec tableau moteur ร— (mรฉdiane, moyenne, docs).
2. Bandeau d'hรฉtรฉrogรฉnรฉitรฉ affichรฉ si ``corpus_homogeneity`` fourni.
3. **Masquage adaptatif** : retourne ``""`` si pas de strates.
4. **Anti-injection** : noms de strates et de moteurs avec balises
HTML sont รฉchappรฉs.
5. **Dรฉtecteur ``STRATIFICATION_RECOMMENDED``** :
- se dรฉclenche au-delร  de 5 points d'รฉcart inter-strate
- importance HIGH au-delร  de 10 points, MEDIUM sinon
- ne se dรฉclenche pas sans corpus_homogeneity
6. **Anti-hallucination** : chaque nombre rendu est dans le payload.
7. **Intรฉgration ReportGenerator** : la section apparaรฎt dans
``view_ranking`` quand ``doc_strata`` est peuplรฉ.
8. **i18n FR/EN** : clรฉs prรฉsentes pour la vue + le template narratif.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
import pytest
from picarones.evaluation.metric_result import MetricsResult
from picarones.reports.narrative.detectors import detect_stratification_recommended
from picarones.domain.facts import FactImportance, FactType
from picarones.reports.narrative.renderer import extract_numbers, render_fact
from picarones.evaluation.benchmark_result import DocumentResult
from picarones.reports.html.generator import ReportGenerator
from picarones.reports.html.renderers.stratification import build_stratified_ranking_html
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Helpers
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
_SAMPLE_STRAT = {
"gothique": [
{"engine": "pero", "median_cer": 0.05, "mean_cer": 0.07, "documents": 10},
{"engine": "tess", "median_cer": 0.20, "mean_cer": 0.22, "documents": 10},
],
"imprimรฉ": [
{"engine": "tess", "median_cer": 0.02, "mean_cer": 0.03, "documents": 10},
{"engine": "pero", "median_cer": 0.05, "mean_cer": 0.06, "documents": 10},
],
}
_SAMPLE_STRATA = ["gothique", "imprimรฉ"]
_SAMPLE_HOMOG = {
"leader": "tess",
"n_strata": 2,
"max_inter_strata_gap": 0.18,
"leader_max_gap_strata": ["imprimรฉ", "gothique"],
"leader_per_stratum_median": {"imprimรฉ": 0.02, "gothique": 0.20},
}
def _make_dr(doc_id: str, cer: float) -> DocumentResult:
return DocumentResult(
doc_id=doc_id, image_path=f"/tmp/{doc_id}.png",
ground_truth="x", hypothesis="x",
metrics=MetricsResult(
cer=cer, cer_nfc=cer, cer_caseless=cer,
wer=cer, wer_normalized=cer, mer=cer, wil=cer,
reference_length=1, hypothesis_length=1,
),
duration_seconds=0.1,
)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 1-2. build_stratified_ranking_html
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class TestRendering:
def test_renders_one_details_per_stratum(self) -> None:
html = build_stratified_ranking_html(
_SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG,
)
assert html.count("<details") == 2
# Premier ouvert
assert "<details" in html and " open" in html
def test_includes_engine_metrics(self) -> None:
html = build_stratified_ranking_html(
_SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG,
)
# Mรฉdianes en pourcentage
assert "5.00 %" in html # pero gothique
assert "20.00 %" in html # tess gothique
assert "2.00 %" in html # tess imprimรฉ
def test_homogeneity_banner_present(self) -> None:
html = build_stratified_ranking_html(
_SAMPLE_STRAT, _SAMPLE_STRATA, _SAMPLE_HOMOG,
)
# Le bandeau d'avertissement doit apparaรฎtre
assert "tess" in html
assert "18.0" in html
def test_no_homogeneity_no_banner(self) -> None:
html = build_stratified_ranking_html(
_SAMPLE_STRAT, _SAMPLE_STRATA, homogeneity=None,
)
# Pas de bandeau jaune
assert "#fff8e1" not in html
def test_uses_i18n_labels(self) -> None:
labels = {
"stratification_caption": "CUSTOM_CAPTION",
"stratification_median_label": "MED",
"stratification_mean_label": "MEAN",
}
html = build_stratified_ranking_html(
_SAMPLE_STRAT, _SAMPLE_STRATA, None, labels=labels,
)
assert "CUSTOM_CAPTION" in html
assert "MED" in html
assert "MEAN" in html
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 3. Masquage adaptatif
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class TestAdaptiveMasking:
def test_empty_when_no_stratified_ranking(self) -> None:
assert build_stratified_ranking_html(None, ["S1"]) == ""
assert build_stratified_ranking_html({}, ["S1"]) == ""
def test_empty_when_no_available_strata(self) -> None:
assert build_stratified_ranking_html(_SAMPLE_STRAT, None) == ""
assert build_stratified_ranking_html(_SAMPLE_STRAT, []) == ""
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 4. Anti-injection
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class TestAntiInjection:
def test_engine_name_escaped(self) -> None:
bad_strat = {
"S1": [
{"engine": "<script>alert(1)</script>",
"median_cer": 0.1, "mean_cer": 0.1, "documents": 1},
],
}
html = build_stratified_ranking_html(bad_strat, ["S1"])
assert "<script>" not in html
assert "&lt;script&gt;" in html
def test_stratum_name_escaped(self) -> None:
bad_strat = {
"<img src=x>": [
{"engine": "a", "median_cer": 0.1,
"mean_cer": 0.1, "documents": 1},
],
}
html = build_stratified_ranking_html(bad_strat, ["<img src=x>"])
assert "<img src=x>" not in html
assert "&lt;img" in html
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 5. Dรฉtecteur STRATIFICATION_RECOMMENDED
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def _data(gap: float, **overrides) -> dict:
homog = {
"leader": "tess", "n_strata": 2,
"max_inter_strata_gap": gap,
"leader_max_gap_strata": ["S1", "S2"],
"leader_per_stratum_median": {"S1": 0.02, "S2": 0.02 + gap},
}
homog.update(overrides)
return {"corpus_homogeneity": homog}
class TestStratificationDetector:
def test_no_fact_below_threshold(self) -> None:
# 4 points โ†’ en dessous du seuil 5 points
assert detect_stratification_recommended(_data(0.04)) == []
def test_emits_fact_above_threshold(self) -> None:
facts = detect_stratification_recommended(_data(0.07))
assert len(facts) == 1
assert facts[0].type is FactType.STRATIFICATION_RECOMMENDED
def test_medium_below_10pts(self) -> None:
facts = detect_stratification_recommended(_data(0.07))
assert facts[0].importance is FactImportance.MEDIUM
def test_high_above_10pts(self) -> None:
facts = detect_stratification_recommended(_data(0.18))
assert facts[0].importance is FactImportance.HIGH
def test_no_homogeneity_no_fact(self) -> None:
assert detect_stratification_recommended({}) == []
assert detect_stratification_recommended({"corpus_homogeneity": None}) == []
def test_payload_carries_strata_and_cers(self) -> None:
facts = detect_stratification_recommended(_data(0.18))
p = facts[0].payload
assert p["leader"] == "tess"
assert p["n_strata"] == 2
assert p["min_stratum"] == "S1"
assert p["max_stratum"] == "S2"
assert p["gap_pct"] == 18.0
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 6. Anti-hallucination
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class TestTraceability:
@pytest.mark.parametrize("lang", ["fr", "en"])
def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
# On utilise des noms de strates sans chiffres (la traรงabilitรฉ
# exige que tout chiffre rendu vienne du payload, mais les
# noms de strates cรดtรฉ GT peuvent lรฉgitimement contenir des
# chiffres ; pour le test on isole les nombres "mรฉtriques").
data = {"corpus_homogeneity": {
"leader": "tess", "n_strata": 2,
"max_inter_strata_gap": 0.18,
"leader_max_gap_strata": ["impr", "goth"],
"leader_per_stratum_median": {"impr": 0.02, "goth": 0.20},
}}
facts = detect_stratification_recommended(data)
sentence = render_fact(facts[0], lang)
payload_nums: set[str] = set()
for v in facts[0].payload.values():
if isinstance(v, (int, float)):
payload_nums.add(str(v))
if isinstance(v, float) and v.is_integer():
payload_nums.add(str(int(v)))
elif isinstance(v, str):
# Capture aussi les chiffres prรฉsents dans les chaรฎnes
# du payload (ex. noms de strates contenant un nombre)
for match in re.findall(r"\d+(?:[.,]\d+)?", v):
payload_nums.add(match.replace(",", "."))
for num in extract_numbers(sentence):
normalized = num.replace(",", ".")
assert normalized in payload_nums, (
f"Nombre {normalized!r} non traรงable au payload "
f"{facts[0].payload!r}"
)
def test_template_has_no_hardcoded_numbers(self) -> None:
from picarones.reports.narrative.renderer import _load_templates
for lang in ("fr", "en"):
tpl = _load_templates(lang).get("stratification_recommended", "")
assert tpl, f"Template absent pour {lang}"
cleaned = re.sub(r"\{[^}]+\}", "", tpl)
digits = re.findall(r"\d", cleaned)
assert not digits, f"Template {lang} contient des chiffres en dur : {digits}"
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 7. Intรฉgration ReportGenerator
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
class TestReportIntegration:
def test_section_absent_without_strata(self, tmp_path: Path) -> None:
from picarones.evaluation.synthetic import generate_sample_benchmark
bench = generate_sample_benchmark()
bench.doc_strata = None # force absence
out = tmp_path / "report.html"
ReportGenerator(bench).generate(out)
html = out.read_text(encoding="utf-8")
assert "stratified-ranking" not in html
def test_section_present_with_strata(self, tmp_path: Path) -> None:
from picarones.evaluation.synthetic import generate_sample_benchmark
bench = generate_sample_benchmark()
# La fixture peuple image_quality.script_type ; on extrait
# manuellement comme le ferait le runner.
strata_map: dict[str, str] = {}
for r in bench.engine_reports:
for dr in r.document_results:
if dr.image_quality and dr.image_quality.get("script_type"):
strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"])
bench.doc_strata = strata_map
out = tmp_path / "report.html"
ReportGenerator(bench).generate(out)
html = out.read_text(encoding="utf-8")
assert "stratified-ranking" in html
# Au moins un <details> rendu
assert "<details" in html
def test_french_locale_uses_french_labels(self, tmp_path: Path) -> None:
from picarones.evaluation.synthetic import generate_sample_benchmark
bench = generate_sample_benchmark()
strata_map = {}
for r in bench.engine_reports:
for dr in r.document_results:
if dr.image_quality and dr.image_quality.get("script_type"):
strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"])
bench.doc_strata = strata_map
out = tmp_path / "report_fr.html"
ReportGenerator(bench, lang="fr").generate(out)
html = out.read_text(encoding="utf-8")
assert "Classement par strate" in html
assert "Mรฉdiane CER" in html
def test_english_locale_uses_english_labels(self, tmp_path: Path) -> None:
from picarones.evaluation.synthetic import generate_sample_benchmark
bench = generate_sample_benchmark()
strata_map = {}
for r in bench.engine_reports:
for dr in r.document_results:
if dr.image_quality and dr.image_quality.get("script_type"):
strata_map.setdefault(dr.doc_id, dr.image_quality["script_type"])
bench.doc_strata = strata_map
out = tmp_path / "report_en.html"
ReportGenerator(bench, lang="en").generate(out)
html = out.read_text(encoding="utf-8")
assert "Ranking by stratum" in html
assert "Median CER" in html
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# 8. i18n FR/EN
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
REQUIRED_KEYS = (
"stratification_caption",
"stratification_description",
"stratification_median_label",
"stratification_mean_label",
"stratification_docs_label",
"stratification_no_data_label",
"stratification_n_docs_label",
"stratification_gap_summary",
)
class TestI18NCompleteness:
@pytest.mark.parametrize("lang", ["fr", "en"])
@pytest.mark.parametrize("key", REQUIRED_KEYS)
def test_key_present(self, lang: str, key: str) -> None:
path = (
Path(__file__).parent.parent.parent
/ "picarones" / "reports" / "i18n" / f"{lang}.json"
)
data = json.loads(path.read_text(encoding="utf-8"))
assert key in data, f"Clรฉ {key!r} manquante dans {lang}.json"
assert data[key].strip(), f"Clรฉ {key!r} vide dans {lang}.json"