"""Rendu HTML « Stabilité multi-runs » — Sprint 90 (A.II.4). Suite directe ``picarones/core/reliability.compute_multirun_stability`` (Sprint 83). Pattern identique aux autres rendus : server-side, pas de JS, anti-injection systématique. Note d'intégration ------------------ La stabilité multi-runs n'est pas calculée automatiquement par le runner — l'utilisateur doit relancer son moteur LLM/VLM plusieurs fois (option ``--repeats N`` du runner reportée à un sprint dédié) et appeler ``compute_multirun_stability`` lui- même. Cette vue est donc un **module de rendu pur** que l'utilisateur compose : .. code-block:: python from picarones.measurements.reliability import compute_multirun_stability from picarones.report.multirun_stability_render import ( build_multirun_stability_html, ) stability = [] for engine_name, runs in per_engine_runs.items(): s = compute_multirun_stability(runs, reference=ref) if s is not None: s["engine_name"] = engine_name stability.append(s) html = build_multirun_stability_html(stability, labels) Vue --- Tableau moteur × {n_runs, CER moyen ± écart-type, CV (%), % paires identiques, n outputs distincts}. Cellule CV colorée par gradient vert (stable) → rouge (instable, CV > 20 %). Adaptive : ``""`` si la liste est vide ou que tous les ``cer_cv`` sont ``None``. """ from __future__ import annotations from html import escape as _e from typing import Optional from picarones.report.render_helpers import color_traffic_light def build_multirun_stability_html( stability: Optional[list], labels: Optional[dict[str, str]] = None, ) -> str: """Construit la vue HTML de stabilité multi-runs. Parameters ---------- stability: Liste de dicts (un par moteur) issus de ``compute_multirun_stability`` enrichis d'un ``engine_name``. Si vide ou ``None``, retourne ``""``. labels: Dict i18n. Clés sous le préfixe ``stability_*``. """ if not stability: return "" rows = [s for s in stability if isinstance(s, dict) and s.get("engine_name")] if not rows: return "" labels = labels or {} title = labels.get("stability_title", "Stabilité multi-runs") note = labels.get( "stability_note", "Quand un moteur LLM/VLM est non déterministe, la " "variance entre runs successifs sur les mêmes documents " "est un proxy de la fiabilité scientifique. Un CV élevé " "ou un faible taux de runs identiques discrédite " "l'interprétation du CER moyen.", ) h_engine = labels.get("stability_engine", "Moteur") h_n_runs = labels.get("stability_n_runs", "Runs") h_cer = labels.get("stability_cer", "CER moyen ± σ") h_cv = labels.get("stability_cv", "CV (%)") h_identical = labels.get("stability_identical", "% runs identiques") h_distinct = labels.get("stability_distinct", "Sorties distinctes") parts = [ '
', f'

{_e(title)}

', f'
' f'{_e(note)}
', '', '', ] for col in (h_engine, h_n_runs, h_cer, h_cv, h_identical, h_distinct): parts.append( f'' ) parts.append("") for stab in rows: engine = str(stab.get("engine_name") or "?") n_runs = int(stab.get("n_runs") or 0) cer_mean = stab.get("cer_mean") cer_stdev = stab.get("cer_stdev") cer_cv = stab.get("cer_cv") identical = stab.get("identical_run_rate") n_distinct = stab.get("n_distinct_outputs") if isinstance(cer_mean, (int, float)) and isinstance(cer_stdev, (int, float)): cer_str = f"{cer_mean * 100:.2f}% ± {cer_stdev * 100:.2f}%" elif isinstance(cer_mean, (int, float)): cer_str = f"{cer_mean * 100:.2f}%" else: cer_str = "—" if isinstance(cer_cv, (int, float)): cv_color = color_traffic_light(float(cer_cv), low_is_good=True, scale_max=0.25) cv_cell = ( f'' ) else: cv_cell = ( '' ) identical_str = ( f"{float(identical) * 100:.1f}" if isinstance(identical, (int, float)) else "—" ) distinct_str = str(n_distinct) if isinstance(n_distinct, int) else "—" parts.append( f'' f'' f'' f'' f'{cv_cell}' f'' f'' f'' ) parts.append("
' f'{_e(col)}
{float(cer_cv) * 100:.1f}
{_e(engine)}{n_runs}{cer_str}{identical_str}{distinct_str}
") return "".join(parts) __all__ = ["build_multirun_stability_html"]