Claude
feat(sprint-H.3)!: renommage reports_v2/ → reports/
9011070 unverified
Raw
History Blame
9.42 kB
"""Détecteurs narratifs orientés *classement* (chantier 5).
5 détecteurs déplacés depuis ``narrative/detectors.py`` :
- :func:`detect_global_leader_cer` (Sprint 4)
- :func:`detect_statistical_tie` (Sprint 18)
- :func:`detect_significant_gap` (Sprint 4)
- :func:`detect_speed_winner` (Sprint 4)
- :func:`detect_median_mean_gap_warning` (Sprint 44)
Comportement et signature inchangés. Tous restent enregistrés
automatiquement via ``@register_detector`` à l'import.
"""
from __future__ import annotations
import statistics as _stats
from picarones.domain.facts import Fact, FactImportance, FactType
from picarones.reports.narrative.registry import register_detector
from picarones.reports.narrative.detectors._helpers import (
_engine_by_name,
_mean_duration_per_engine,
_n_docs,
)
@register_detector(
    FactType.GLOBAL_LEADER_CER,
    priority=10,
    importance=FactImportance.CRITICAL,
)
def detect_global_leader_cer(benchmark_data: dict) -> list[Fact]:
    """Report the first ranked engine that has a mean CER.

    Entries of ``benchmark_data["ranking"]`` whose ``mean_cer`` is ``None``
    are ignored. The first remaining entry is treated as the leader; the
    ranking is used in the order it is given and is not re-sorted here
    (presumably the caller provides it best-first -- TODO confirm).

    A single CRITICAL fact is emitted as soon as one scored entry exists.
    When a second scored entry is available, its name and CER are added to
    the payload under the ``runner_up*`` keys.

    Args:
        benchmark_data: Benchmark result dictionary; only ``"ranking"`` is
            read directly, plus whatever ``_n_docs`` reads.

    Returns:
        A one-element list holding the leader fact, or an empty list when no
        ranking entry carries a ``mean_cer``.
    """
    scored = [
        entry
        for entry in (benchmark_data.get("ranking") or [])
        if entry.get("mean_cer") is not None
    ]
    if not scored:
        return []

    top = scored[0]
    top_name = top["engine"]
    top_cer = float(top["mean_cer"])
    payload = {
        "engine": top_name,
        "cer": top_cer,
        "cer_pct": round(top_cer * 100, 2),
        "n_engines": len(scored),
        "n_docs": _n_docs(benchmark_data),
    }

    # The runner-up is optional: it is only attached when a second scored
    # entry exists.
    if len(scored) >= 2:
        second = scored[1]
        payload["runner_up"] = second["engine"]
        second_cer = float(second["mean_cer"])
        payload["runner_up_cer"] = second_cer
        payload["runner_up_cer_pct"] = round(second_cer * 100, 2)

    leader_fact = Fact(
        type=FactType.GLOBAL_LEADER_CER,
        importance=FactImportance.CRITICAL,
        payload=payload,
        engines_involved=(top_name,),
    )
    return [leader_fact]
@register_detector(
    FactType.STATISTICAL_TIE,
    priority=20,
    importance=FactImportance.CRITICAL,
)
def detect_statistical_tie(benchmark_data: dict) -> list[Fact]:
    """Emit one fact per group of engines tied in the Nemenyi results.

    Reads ``benchmark_data["statistics"]["nemenyi"]``. Nothing is emitted
    when that section is missing, empty, ``None`` or carries a truthy
    ``"error"`` entry. Groups with fewer than two members are skipped.

    A group is flagged as including the leader when its best (lowest) mean
    rank equals the best mean rank over all engines; such a fact is
    CRITICAL, any other tie is HIGH.

    Args:
        benchmark_data: Benchmark result dictionary.

    Returns:
        A list of STATISTICAL_TIE facts, possibly empty.
    """
    # ``dict.get(key, default)`` only falls back when the key is absent, so
    # an explicit ``None`` value must be neutralised with ``or``.
    nemenyi = (benchmark_data.get("statistics") or {}).get("nemenyi") or {}
    if not nemenyi or nemenyi.get("error"):
        return []

    tied_groups = nemenyi.get("tied_groups") or []
    mean_ranks = nemenyi.get("mean_ranks") or {}
    cd = nemenyi.get("critical_distance", 0.0)
    alpha = nemenyi.get("alpha", 0.05)
    n_blocks = nemenyi.get("n_blocks", 0)

    # Loop-invariant: best mean rank across every engine. The default of 0
    # can never match the 999 placeholder used below, so an empty rank table
    # never yields a leader tie.
    best_rank = min(mean_ranks.values(), default=0)

    facts: list[Fact] = []
    for group in tied_groups:
        if len(group) < 2:
            continue
        # Engines without a recorded rank get 999 as a placeholder.
        is_leader_tie = min(mean_ranks.get(n, 999) for n in group) == best_rank
        importance = (
            FactImportance.CRITICAL if is_leader_tie else FactImportance.HIGH
        )
        facts.append(Fact(
            type=FactType.STATISTICAL_TIE,
            importance=importance,
            payload={
                "engines": list(group),
                "engines_list": ", ".join(group),
                "mean_ranks": {n: mean_ranks.get(n) for n in group},
                "critical_distance": round(cd, 3),
                "alpha": alpha,
                "n_blocks": n_blocks,
                "includes_leader": is_leader_tie,
                "n_tied": len(group),
            },
            engines_involved=tuple(group),
        ))
    return facts
@register_detector(
    FactType.SIGNIFICANT_GAP,
    priority=30,
    importance=FactImportance.HIGH,
)
def detect_significant_gap(benchmark_data: dict) -> list[Fact]:
    """Report a significant gap between the first two ranked engines.

    The first two ``ranking`` entries that carry a ``mean_cer`` are taken as
    leader and runner-up. The pairwise Wilcoxon list
    (``statistics.pairwise_wilcoxon``) is searched for the entry whose
    ``engine_a`` / ``engine_b`` pair matches them, in either order. A fact is
    emitted only when that entry exists and its ``significant`` flag is
    truthy.

    Args:
        benchmark_data: Benchmark result dictionary.

    Returns:
        A one-element list with the SIGNIFICANT_GAP fact, or an empty list
        when fewer than two engines are scored, the pair is not in the
        Wilcoxon list, or the difference is not significant.
    """
    ranking = benchmark_data.get("ranking") or []
    valid = [r for r in ranking if r.get("mean_cer") is not None]
    if len(valid) < 2:
        return []

    first, second = valid[0], valid[1]
    leader = first["engine"]
    runner_up = second["engine"]

    # ``dict.get(key, default)`` only falls back when the key is absent, so
    # a ``"statistics": None`` entry must be neutralised with ``or``.
    statistics_section = benchmark_data.get("statistics") or {}
    pairwise = statistics_section.get("pairwise_wilcoxon") or []

    # Set comparison makes the lookup independent of the a/b ordering.
    wanted = {leader, runner_up}
    match = next(
        (p for p in pairwise if {p.get("engine_a"), p.get("engine_b")} == wanted),
        None,
    )
    if match is None or not match.get("significant"):
        return []  # no significant gap: nothing to report here

    delta_cer = abs(float(first["mean_cer"]) - float(second["mean_cer"]))
    # NOTE(review): the detector is registered with importance HIGH but the
    # emitted fact is CRITICAL; kept as-is, confirm which one is intended.
    return [Fact(
        type=FactType.SIGNIFICANT_GAP,
        importance=FactImportance.CRITICAL,
        payload={
            "leader": leader,
            "runner_up": runner_up,
            "p_value": float(match.get("p_value", 0.0)),
            "delta_cer": round(delta_cer, 4),
            "delta_cer_pct": round(delta_cer * 100, 2),
            "n_pairs": int(match.get("n_pairs", 0)),
        },
        engines_involved=(leader, runner_up),
    )]
@register_detector(
    FactType.SPEED_WINNER,
    priority=100,
    importance=FactImportance.MEDIUM,
)
def detect_speed_winner(benchmark_data: dict) -> list[Fact]:
    """Report the fastest engine whose quality remains acceptable.

    An engine qualifies when both conditions hold:

    * its mean duration is at most one third of the median of all engines'
      mean durations;
    * it belongs to the same Nemenyi tied group as the ranking leader, or
      its CER is at most ``max(0.01, 1.10 * leader CER)``. The 0.01 floor
      keeps the tolerance from collapsing when the leader's CER is close
      to zero.

    Candidates are examined from fastest to slowest and only the first
    qualifying one is reported, to avoid noise.

    Args:
        benchmark_data: Benchmark result dictionary.

    Returns:
        A list with at most one SPEED_WINNER fact.
    """
    durations = _mean_duration_per_engine(benchmark_data)
    if len(durations) < 2:
        return []
    median_dur = _stats.median(durations.values())
    if median_dur <= 0:
        return []

    ranking = benchmark_data.get("ranking") or []
    valid = [r for r in ranking if r.get("mean_cer") is not None]
    if not valid:
        return []
    leader = valid[0]
    quality_ceiling = max(0.01, float(leader["mean_cer"]) * 1.10)

    # ``dict.get(key, default)`` only falls back when the key is absent, so
    # explicit ``None`` values must be neutralised with ``or``.
    statistics_section = benchmark_data.get("statistics") or {}
    tied_groups = (statistics_section.get("nemenyi") or {}).get("tied_groups") or []
    leader_group: set[str] = next(
        (set(g) for g in tied_groups if leader["engine"] in g),
        set(),
    )

    for engine, dur in sorted(durations.items(), key=lambda kv: kv[1]):
        if dur * 3.0 > median_dur:
            break  # every remaining candidate is slower still
        summary = _engine_by_name(benchmark_data, engine) or {}
        engine_cer = summary.get("cer")
        if engine_cer is None:
            continue
        cer = float(engine_cer)
        if not (engine in leader_group or cer <= quality_ceiling):
            continue
        # Fastest acceptable engine found: report it and stop.
        return [Fact(
            type=FactType.SPEED_WINNER,
            importance=FactImportance.MEDIUM,
            payload={
                "engine": engine,
                "mean_duration": round(dur, 3),
                "median_duration": round(median_dur, 3),
                # A zero duration would divide by zero; no speedup then.
                "speedup": round(median_dur / dur, 1) if dur > 0 else None,
                "cer": round(cer, 4),
                "cer_pct": round(cer * 100, 2),
            },
            engines_involved=(engine,),
        )]
    return []
@register_detector(
    FactType.MEDIAN_MEAN_GAP_WARNING,
    priority=140,
    importance=FactImportance.MEDIUM,
)
def detect_median_mean_gap_warning(benchmark_data: dict) -> list[Fact]:
    """Warn when the leader's mean CER strays far from its median CER.

    The leader is the first ``ranking`` entry that has both ``median_cer``
    and ``mean_cer``. The relative gap ``|mean - median| / median`` is
    computed for it; a gap of 30 % or more produces a warning, because such
    a skewed distribution means the mean hides the typical performance.
    The fact is HIGH when the gap reaches 100 %, MEDIUM otherwise.

    Args:
        benchmark_data: Benchmark result dictionary; only ``"ranking"`` is
            read.

    Returns:
        A one-element list with the warning fact, or an empty list when no
        entry is usable, the median is not strictly positive, or the gap is
        below 30 %.
    """

    def _has_both_metrics(entry: dict) -> bool:
        # Both figures are needed to compute the relative gap.
        if entry.get("median_cer") is None:
            return False
        return entry.get("mean_cer") is not None

    usable = list(filter(_has_both_metrics, benchmark_data.get("ranking") or []))
    if len(usable) == 0:
        return []

    top = usable[0]
    median_value = float(top["median_cer"])
    mean_value = float(top["mean_cer"])

    # A zero (or negative) median makes the relative gap meaningless, so the
    # detector stays silent in that case.
    if median_value <= 0:
        return []

    gap_ratio = abs(mean_value - median_value) / median_value
    if gap_ratio < 0.30:
        return []

    severity = FactImportance.MEDIUM
    if gap_ratio >= 1.0:
        severity = FactImportance.HIGH

    payload = {
        "engine": top["engine"],
        "median_cer_pct": round(median_value * 100, 2),
        "mean_cer_pct": round(mean_value * 100, 2),
        "relative_gap_pct": round(gap_ratio * 100, 1),
        "n_docs": int(top.get("documents") or 0),
    }
    warning = Fact(
        type=FactType.MEDIAN_MEAN_GAP_WARNING,
        importance=severity,
        payload=payload,
        engines_involved=(top["engine"],),
    )
    return [warning]