Spaces:
Sleeping
Sleeping
File size: 7,097 Bytes
80c6417 9011070 80c6417 9011070 80c6417 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | """Détecteurs narratifs liés à la *stratification corpus* (chantier 5).
3 détecteurs + 1 helper (``_stratum_cer_by_engine``) déplacés depuis
``narrative/detectors.py`` :
- :func:`detect_stratum_winner` (Sprint 4)
- :func:`detect_stratum_collapse` (Sprint 4)
- :func:`detect_stratification_recommended` (Sprint 45)
"""
from __future__ import annotations
from picarones.domain.facts import Fact, FactImportance, FactType
from picarones.reports.narrative.registry import register_detector
from picarones.reports.narrative.detectors._helpers import (
_engine_by_name,
)
def _stratum_cer_by_engine(benchmark_data: dict) -> dict[str, dict[str, list[float]]]:
"""Agrège les CER par (moteur, strate).
Strate = ``document["script_type"]`` si présent. Retourne ``{}`` si aucun
document n'expose de strate (pas d'émission possible).
"""
out: dict[str, dict[str, list[float]]] = {}
for doc in benchmark_data.get("documents") or []:
stratum = doc.get("script_type")
if not stratum:
continue
for er in doc.get("engine_results") or []:
if er.get("error"):
continue
cer = er.get("cer")
if cer is None:
continue
name = er.get("engine")
out.setdefault(name, {}).setdefault(stratum, []).append(float(cer))
return out
@register_detector(
FactType.STRATUM_WINNER,
priority=40,
importance=FactImportance.MEDIUM,
)
def detect_stratum_winner(benchmark_data: dict) -> list[Fact]:
"""Moteur qui domine nettement sur une strate (≥ 3 documents, CER
au moins 25 % plus bas que le second sur cette strate).
"""
agg = _stratum_cer_by_engine(benchmark_data)
if not agg:
return []
# Inverser : {stratum: {engine: mean_cer}}
by_stratum: dict[str, dict[str, float]] = {}
for engine, strata in agg.items():
for stratum, vals in strata.items():
if len(vals) < 3:
continue
by_stratum.setdefault(stratum, {})[engine] = sum(vals) / len(vals)
facts: list[Fact] = []
for stratum, engine_cer in by_stratum.items():
if len(engine_cer) < 2:
continue
ordered = sorted(engine_cer.items(), key=lambda kv: kv[1])
best_name, best_cer = ordered[0]
second_cer = ordered[1][1]
if second_cer == 0:
continue
if best_cer < second_cer * 0.75: # dominance ≥ 25 %
facts.append(Fact(
type=FactType.STRATUM_WINNER,
importance=FactImportance.HIGH,
payload={
"engine": best_name,
"stratum": stratum,
"cer": round(best_cer, 4),
"cer_pct": round(best_cer * 100, 2),
"second_engine": ordered[1][0],
"second_cer": round(second_cer, 4),
"second_cer_pct": round(second_cer * 100, 2),
"n_docs_stratum": len(agg[best_name][stratum]),
},
engines_involved=(best_name,),
stratum=stratum,
))
return facts
@register_detector(
FactType.STRATUM_COLLAPSE,
priority=50,
importance=FactImportance.HIGH,
)
def detect_stratum_collapse(benchmark_data: dict) -> list[Fact]:
"""Moteur globalement compétitif qui s'effondre sur une strate.
Déclenché si, pour un moteur, le CER moyen sur une strate ≥ 3 documents
est plus du double du CER global du même moteur.
"""
agg = _stratum_cer_by_engine(benchmark_data)
if not agg:
return []
facts: list[Fact] = []
for engine_name, strata in agg.items():
summary = _engine_by_name(benchmark_data, engine_name) or {}
global_cer = summary.get("cer")
if global_cer is None:
continue
global_cer = float(global_cer)
if global_cer <= 0:
continue
for stratum, vals in strata.items():
if len(vals) < 3:
continue
local_cer = sum(vals) / len(vals)
if local_cer > 2.0 * global_cer and (local_cer - global_cer) > 0.05:
facts.append(Fact(
type=FactType.STRATUM_COLLAPSE,
importance=FactImportance.HIGH,
payload={
"engine": engine_name,
"stratum": stratum,
"local_cer": round(local_cer, 4),
"local_cer_pct": round(local_cer * 100, 2),
"global_cer": round(global_cer, 4),
"global_cer_pct": round(global_cer * 100, 2),
"delta_cer_pct": round((local_cer - global_cer) * 100, 2),
"n_docs_stratum": len(vals),
},
engines_involved=(engine_name,),
stratum=stratum,
))
return facts
@register_detector(
FactType.STRATIFICATION_RECOMMENDED,
priority=45, # juste après STRATUM_WINNER (40), avant STRATUM_COLLAPSE (50)
importance=FactImportance.HIGH,
)
def detect_stratification_recommended(benchmark_data: dict) -> list[Fact]:
"""Avertit quand le corpus est hétérogène et que la vue stratifiée
apporte un éclairage qualitativement différent du classement global.
Critère : ``corpus_homogeneity.max_inter_strata_gap > 5 points`` de
CER médian sur le moteur leader. Au-delà de 10 points, importance
``HIGH`` (situation très hétérogène où le seul classement global
serait trompeur).
Lit ``benchmark_data["corpus_homogeneity"]`` exposé par
``BenchmarkResult.as_dict()`` (Sprint 45).
"""
homog = benchmark_data.get("corpus_homogeneity")
if not homog:
return []
gap = homog.get("max_inter_strata_gap")
if gap is None:
return []
gap = float(gap)
if gap < 0.05:
return [] # 5 points de CER : seuil de pertinence éditoriale
leader = str(homog.get("leader") or "")
n_strata = int(homog.get("n_strata") or 0)
pair = homog.get("leader_max_gap_strata") or ["", ""]
if len(pair) < 2:
return []
min_strat, max_strat = str(pair[0]), str(pair[1])
leader_per_stratum = homog.get("leader_per_stratum_median") or {}
min_med = float(leader_per_stratum.get(min_strat, 0.0))
max_med = float(leader_per_stratum.get(max_strat, 0.0))
importance = (
FactImportance.HIGH if gap >= 0.10 else FactImportance.MEDIUM
)
return [Fact(
type=FactType.STRATIFICATION_RECOMMENDED,
importance=importance,
payload={
"leader": leader,
"n_strata": n_strata,
"gap_pct": round(gap * 100, 1),
"min_stratum": min_strat,
"max_stratum": max_strat,
"min_stratum_cer_pct": round(min_med * 100, 2),
"max_stratum_cer_pct": round(max_med * 100, 2),
},
engines_involved=(leader,) if leader else (),
)]
|