"""Tests Sprint 82 — A.I.9 : section « Leviers d'amélioration ». Couvre : 1. Modèle ``Lever`` + registre. 2. Les 5 détecteurs : ``dominant_recoverable_class``, ``pareto_concentration``, ``complementarity_observation``, ``lexical_modernization_observation``, ``robustness_projection_observation``. 3. Pipeline ``detect_levers`` (ordre, robustesse aux exceptions). 4. Rendu HTML : cards, anti-injection, masquage adaptatif. 5. Anti-hallucination : chaque chiffre rendu est dans le payload. 6. Complétude i18n FR/EN. """ from __future__ import annotations import json import re from pathlib import Path from picarones.evaluation.metrics.levers import ( Lever, LeverImportance, LeverType, detect_complementarity_observation, detect_dominant_recoverable_class, detect_levers, detect_lexical_modernization_observation, detect_pareto_concentration, detect_robustness_projection_observation, iter_lever_detectors, ) from picarones.reports.html.renderers.levers import build_levers_section_html # ────────────────────────────────────────────────────────────────────────── # 1. Modèle + registre # ────────────────────────────────────────────────────────────────────────── class TestModel: def test_lever_as_dict(self) -> None: lv = Lever( type=LeverType.DOMINANT_RECOVERABLE_CLASS, importance=LeverImportance.HIGH, payload={"engine": "t", "share_recoverable_pct": 65.0}, engines_involved=("t",), ) d = lv.as_dict() assert d["type"] == "dominant_recoverable_class" assert d["importance"] == 70 assert d["engines_involved"] == ["t"] def test_registry_contains_five_detectors(self) -> None: types = {e.lever_type for e in iter_lever_detectors()} assert LeverType.DOMINANT_RECOVERABLE_CLASS in types assert LeverType.PARETO_CONCENTRATION in types assert LeverType.COMPLEMENTARITY_OBSERVATION in types assert LeverType.LEXICAL_MODERNIZATION_OBSERVATION in types assert LeverType.ROBUSTNESS_PROJECTION_OBSERVATION in types def test_registry_priority_sorted(self) -> None: priorities = [e.priority for e in iter_lever_detectors()] assert priorities == sorted(priorities) # ────────────────────────────────────────────────────────────────────────── # 2. Détecteur dominant_recoverable_class # ────────────────────────────────────────────────────────────────────────── class TestDominantRecoverable: def test_emits_when_share_above_threshold(self) -> None: data = {"engines": [{ "name": "t", "aggregated_taxonomy": { "case_error": 30, "ligature_error": 10, "abbreviation_error": 25, # 65 récupérables "lacuna": 20, "diacritic_error": 15, }, }]} levers = detect_dominant_recoverable_class(data) assert len(levers) == 1 lv = levers[0] assert lv.payload["engine"] == "t" assert lv.payload["n_recoverable"] == 65 assert lv.payload["n_total_errors"] == 100 assert lv.payload["share_recoverable_pct"] == 65.0 assert lv.importance == LeverImportance.HIGH def test_silent_when_below_threshold(self) -> None: data = {"engines": [{ "name": "t", "aggregated_taxonomy": {"lacuna": 80, "case_error": 20}, }]} assert detect_dominant_recoverable_class(data) == [] def test_silent_when_no_taxonomy(self) -> None: data = {"engines": [{"name": "t"}]} assert detect_dominant_recoverable_class(data) == [] def test_top_classes_sorted_descending(self) -> None: data = {"engines": [{ "name": "t", "aggregated_taxonomy": { "case_error": 50, "ligature_error": 5, "abbreviation_error": 30, }, }]} lv = detect_dominant_recoverable_class(data)[0] names = [c["class"] for c in lv.payload["top_classes"]] assert names == ["case_error", "abbreviation_error", "ligature_error"] def test_accepts_counts_subdict(self) -> None: data = {"engines": [{ "name": "t", "aggregated_taxonomy": {"counts": {"case_error": 60, "lacuna": 40}}, }]} levers = detect_dominant_recoverable_class(data) assert len(levers) == 1 assert levers[0].payload["n_recoverable"] == 60 def test_medium_when_share_in_30_50(self) -> None: data = {"engines": [{ "name": "t", "aggregated_taxonomy": {"case_error": 35, "lacuna": 65}, }]} lv = detect_dominant_recoverable_class(data)[0] assert lv.importance == LeverImportance.MEDIUM # ────────────────────────────────────────────────────────────────────────── # 3. Détecteur pareto_concentration # ────────────────────────────────────────────────────────────────────────── class TestParetoConcentration: def test_concentrated_corpus(self) -> None: # 10 docs : 2 catastrophiques (CER 0.8), 8 OK (CER 0.05) → 80 % # du CER total est concentré sur 20 % des docs. data = { "ranking": [{"engine": "t", "mean_cer": 0.20}], "per_doc_cer": {"t": [0.8, 0.8] + [0.05] * 8}, } levers = detect_pareto_concentration(data) assert len(levers) == 1 p = levers[0].payload assert p["n_docs"] == 10 assert p["n_docs_top"] == 2 assert p["cer_share_pct"] >= 70 def test_uniform_corpus_silent(self) -> None: data = { "ranking": [{"engine": "t", "mean_cer": 0.10}], "per_doc_cer": {"t": [0.10] * 10}, } assert detect_pareto_concentration(data) == [] def test_reads_engine_per_doc(self) -> None: data = { "ranking": [{"engine": "t", "mean_cer": 0.20}], "engines": [{ "name": "t", "per_doc": [ {"cer": 0.9}, {"cer": 0.9}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, {"cer": 0.05}, ], }], } levers = detect_pareto_concentration(data) assert len(levers) == 1 def test_no_ranking_silent(self) -> None: assert detect_pareto_concentration({}) == [] def test_no_per_doc_silent(self) -> None: data = {"ranking": [{"engine": "t", "mean_cer": 0.10}]} assert detect_pareto_concentration(data) == [] # ────────────────────────────────────────────────────────────────────────── # 4. Détecteur complementarity_observation # ────────────────────────────────────────────────────────────────────────── class TestComplementarity: def test_emits_when_relative_gap_above_threshold(self) -> None: data = {"inter_engine_analysis": { "complementarity_gap": { "absolute_gap": 0.10, "relative_gap": 0.30, "best_engine": "t", "best_recall": 0.70, "oracle_recall": 0.80, }, }} levers = detect_complementarity_observation(data) assert len(levers) == 1 p = levers[0].payload assert p["best_engine"] == "t" assert p["absolute_gap_pct"] == 10.0 assert p["relative_gap_pct"] == 30.0 def test_silent_when_below_threshold(self) -> None: data = {"inter_engine_analysis": { "complementarity_gap": {"absolute_gap": 0.02, "relative_gap": 0.05}, }} assert detect_complementarity_observation(data) == [] def test_silent_when_no_data(self) -> None: assert detect_complementarity_observation({}) == [] def test_high_when_relative_gap_above_50(self) -> None: data = {"inter_engine_analysis": { "complementarity_gap": {"absolute_gap": 0.30, "relative_gap": 0.60}, }} lv = detect_complementarity_observation(data)[0] assert lv.importance == LeverImportance.HIGH # ────────────────────────────────────────────────────────────────────────── # 5. Détecteur lexical_modernization_observation # ────────────────────────────────────────────────────────────────────────── class TestLexicalModernization: def test_emits_top_three(self) -> None: data = {"engines": [{ "name": "gpt4o", "lexical_modernization": { "n_gt_tokens": 50, "tokens": { "maistre": {"n_total": 10, "n_modernized": 10, "rate_modernized": 1.0, "variants": {"maître": 10}}, "veoir": {"n_total": 5, "n_modernized": 5, "rate_modernized": 1.0, "variants": {"voir": 5}}, "nostre": {"n_total": 8, "n_modernized": 6, "rate_modernized": 0.75, "variants": {"notre": 6}}, "ami": {"n_total": 3, "n_modernized": 0, "rate_modernized": 0.0, "variants": {}}, }, }, }]} levers = detect_lexical_modernization_observation(data) assert len(levers) == 1 top = levers[0].payload["top_tokens"] gt_tokens = [t["gt_token"] for t in top] # Tri par rate desc, puis n_total desc → maistre, veoir, nostre assert gt_tokens == ["maistre", "veoir", "nostre"] assert levers[0].importance == LeverImportance.HIGH def test_silent_when_no_tokens_above_min_rate(self) -> None: data = {"engines": [{ "name": "t", "lexical_modernization": { "tokens": {"a": {"n_total": 10, "n_modernized": 1, "rate_modernized": 0.10, "variants": {}}}, }, }]} assert detect_lexical_modernization_observation(data) == [] def test_silent_when_n_total_below_min(self) -> None: data = {"engines": [{ "name": "t", "lexical_modernization": { "tokens": {"a": {"n_total": 1, "n_modernized": 1, "rate_modernized": 1.0, "variants": {}}}, }, }]} assert detect_lexical_modernization_observation(data) == [] def test_silent_when_no_lexical_field(self) -> None: data = {"engines": [{"name": "t"}]} assert detect_lexical_modernization_observation(data) == [] # ────────────────────────────────────────────────────────────────────────── # 6. Détecteur robustness_projection_observation # ────────────────────────────────────────────────────────────────────────── class TestRobustnessProjection: def test_emits_when_deficit_above_threshold(self) -> None: data = {"robustness_projection_aggregated": { "tess": { "total_expected_deficit": 0.06, "n_degradation_types": 2, "worst_degradation_type": "noise", "worst_degradation_deficit": 0.04, }, }} levers = detect_robustness_projection_observation(data) assert len(levers) == 1 p = levers[0].payload assert p["engine"] == "tess" assert p["total_expected_deficit_pct"] == 6.0 assert p["worst_degradation_type"] == "noise" assert levers[0].importance == LeverImportance.HIGH def test_silent_when_deficit_too_low(self) -> None: data = {"robustness_projection_aggregated": { "tess": {"total_expected_deficit": 0.005}, }} assert detect_robustness_projection_observation(data) == [] def test_silent_when_no_data(self) -> None: assert detect_robustness_projection_observation({}) == [] def test_sorted_by_deficit_descending(self) -> None: data = {"robustness_projection_aggregated": { "a": {"total_expected_deficit": 0.03, "n_degradation_types": 1}, "b": {"total_expected_deficit": 0.08, "n_degradation_types": 2}, }} levers = detect_robustness_projection_observation(data) assert [lv.payload["engine"] for lv in levers] == ["b", "a"] # ────────────────────────────────────────────────────────────────────────── # 7. Pipeline detect_levers # ────────────────────────────────────────────────────────────────────────── class TestDetectLevers: def test_aggregates_multiple_types(self) -> None: data = { "engines": [{ "name": "t", "aggregated_taxonomy": {"case_error": 60, "lacuna": 40}, }], "robustness_projection_aggregated": { "t": {"total_expected_deficit": 0.07, "n_degradation_types": 2}, }, } levers = detect_levers(data) types = [lv.type for lv in levers] assert LeverType.DOMINANT_RECOVERABLE_CLASS in types assert LeverType.ROBUSTNESS_PROJECTION_OBSERVATION in types def test_sorted_by_importance_desc(self) -> None: # HIGH (robustness 7%) avant MEDIUM (recoverable 35%) data = { "engines": [{ "name": "t", "aggregated_taxonomy": {"case_error": 35, "lacuna": 65}, }], "robustness_projection_aggregated": { "t": {"total_expected_deficit": 0.07, "n_degradation_types": 2}, }, } levers = detect_levers(data) importances = [int(lv.importance) for lv in levers] assert importances == sorted(importances, reverse=True) def test_empty_input_returns_empty(self) -> None: assert detect_levers({}) == [] # ────────────────────────────────────────────────────────────────────────── # 8. Rendu HTML # ────────────────────────────────────────────────────────────────────────── def _load_labels(lang: str) -> dict: p = ( Path(__file__).parent.parent.parent / "picarones" / "reports" / "i18n" / f"{lang}.json" ) return json.loads(p.read_text(encoding="utf-8")) class TestRender: def test_empty_returns_empty(self) -> None: assert build_levers_section_html([]) == "" def test_card_per_lever(self) -> None: levers = [ Lever( type=LeverType.DOMINANT_RECOVERABLE_CLASS, importance=LeverImportance.HIGH, payload={"engine": "t", "share_recoverable_pct": 65.0, "n_recoverable": 65, "n_total_errors": 100, "top_classes": [{"class": "case_error", "count": 50}]}, ), ] labels = _load_labels("fr") html = build_levers_section_html(levers, labels) assert "lever-card" in html assert "65" in html assert "case_error" in html assert "Important" in html def test_anti_injection(self) -> None: levers = [ Lever( type=LeverType.DOMINANT_RECOVERABLE_CLASS, importance=LeverImportance.HIGH, payload={"engine": "", "share_recoverable_pct": 60.0, "n_recoverable": 60, "n_total_errors": 100, "top_classes": []}, ), ] html = build_levers_section_html(levers, _load_labels("fr")) assert "