Spaces:
Sleeping
Sleeping
| """Détecteurs narratifs liés à la *stratification corpus* (chantier 5). | |
| 3 détecteurs + 1 helper (``_stratum_cer_by_engine``) déplacés depuis | |
| ``narrative/detectors.py`` : | |
| - :func:`detect_stratum_winner` (Sprint 4) | |
| - :func:`detect_stratum_collapse` (Sprint 4) | |
| - :func:`detect_stratification_recommended` (Sprint 45) | |
| """ | |
| from __future__ import annotations | |
| from picarones.domain.facts import Fact, FactImportance, FactType | |
| from picarones.reports.narrative.registry import register_detector | |
| from picarones.reports.narrative.detectors._helpers import ( | |
| _engine_by_name, | |
| ) | |
| def _stratum_cer_by_engine(benchmark_data: dict) -> dict[str, dict[str, list[float]]]: | |
| """Agrège les CER par (moteur, strate). | |
| Strate = ``document["script_type"]`` si présent. Retourne ``{}`` si aucun | |
| document n'expose de strate (pas d'émission possible). | |
| """ | |
| out: dict[str, dict[str, list[float]]] = {} | |
| for doc in benchmark_data.get("documents") or []: | |
| stratum = doc.get("script_type") | |
| if not stratum: | |
| continue | |
| for er in doc.get("engine_results") or []: | |
| if er.get("error"): | |
| continue | |
| cer = er.get("cer") | |
| if cer is None: | |
| continue | |
| name = er.get("engine") | |
| out.setdefault(name, {}).setdefault(stratum, []).append(float(cer)) | |
| return out | |
def detect_stratum_winner(benchmark_data: dict) -> list[Fact]:
    """Report engines that clearly dominate a given stratum.

    For each stratum, only engines with at least 3 CER values in that stratum
    are ranked by mean CER.  When at least two engines qualify and the best
    mean is strictly below 75 % of the runner-up's mean, one
    ``STRATUM_WINNER`` fact of ``HIGH`` importance is emitted for the best
    engine.  Strata whose runner-up mean is exactly 0 are skipped.

    Args:
        benchmark_data: Benchmark dictionary passed to
            ``_stratum_cer_by_engine``.

    Returns:
        One fact per dominated stratum; an empty list when there is nothing
        to report.
    """
    per_engine = _stratum_cer_by_engine(benchmark_data)
    if not per_engine:
        return []

    # Pivot the aggregation to {stratum: {engine: mean CER}}, keeping only
    # (engine, stratum) pairs backed by at least 3 values.
    mean_by_stratum: dict[str, dict[str, float]] = {}
    for engine_name, strata in per_engine.items():
        for stratum_name, cers in strata.items():
            if len(cers) >= 3:
                mean_by_stratum.setdefault(stratum_name, {})[engine_name] = (
                    sum(cers) / len(cers)
                )

    winners: list[Fact] = []
    for stratum_name, means in mean_by_stratum.items():
        if len(means) < 2:
            continue
        ranking = sorted(means.items(), key=lambda item: item[1])
        (top_engine, top_cer), (runner_up, runner_up_cer) = ranking[0], ranking[1]
        if runner_up_cer == 0:
            continue
        if not top_cer < runner_up_cer * 0.75:
            continue
        payload = {
            "engine": top_engine,
            "stratum": stratum_name,
            "cer": round(top_cer, 4),
            "cer_pct": round(top_cer * 100, 2),
            "second_engine": runner_up,
            "second_cer": round(runner_up_cer, 4),
            "second_cer_pct": round(runner_up_cer * 100, 2),
            "n_docs_stratum": len(per_engine[top_engine][stratum_name]),
        }
        winners.append(
            Fact(
                type=FactType.STRATUM_WINNER,
                importance=FactImportance.HIGH,
                payload=payload,
                engines_involved=(top_engine,),
                stratum=stratum_name,
            )
        )
    return winners
def detect_stratum_collapse(benchmark_data: dict) -> list[Fact]:
    """Report engines whose CER on one stratum is far above their global CER.

    For every engine, the global CER is the ``"cer"`` entry of the summary
    returned by ``_engine_by_name``; engines with no summary, no global CER,
    or a global CER ``<= 0`` are skipped.  A ``STRATUM_COLLAPSE`` fact of
    ``HIGH`` importance is emitted for each stratum holding at least 3 values
    whose mean CER is both more than twice the global CER and more than 0.05
    above it.

    Args:
        benchmark_data: Benchmark dictionary passed to
            ``_stratum_cer_by_engine`` and ``_engine_by_name``.

    Returns:
        One fact per (engine, stratum) collapse; an empty list otherwise.
    """
    per_engine = _stratum_cer_by_engine(benchmark_data)
    if not per_engine:
        return []

    collapses: list[Fact] = []
    for engine_name, strata in per_engine.items():
        engine_summary = _engine_by_name(benchmark_data, engine_name) or {}
        raw_global = engine_summary.get("cer")
        if raw_global is None:
            continue
        overall_cer = float(raw_global)
        if overall_cer <= 0:
            continue

        for stratum_name, cers in strata.items():
            if len(cers) < 3:
                continue
            stratum_cer = sum(cers) / len(cers)
            excess = stratum_cer - overall_cer
            # Both the relative (x2) and the absolute (+0.05) conditions must
            # hold for the stratum to count as a collapse.
            if not (stratum_cer > 2.0 * overall_cer and excess > 0.05):
                continue
            payload = {
                "engine": engine_name,
                "stratum": stratum_name,
                "local_cer": round(stratum_cer, 4),
                "local_cer_pct": round(stratum_cer * 100, 2),
                "global_cer": round(overall_cer, 4),
                "global_cer_pct": round(overall_cer * 100, 2),
                "delta_cer_pct": round(excess * 100, 2),
                "n_docs_stratum": len(cers),
            }
            collapses.append(
                Fact(
                    type=FactType.STRATUM_COLLAPSE,
                    importance=FactImportance.HIGH,
                    payload=payload,
                    engines_involved=(engine_name,),
                    stratum=stratum_name,
                )
            )
    return collapses
def detect_stratification_recommended(benchmark_data: dict) -> list[Fact]:
    """Recommend the stratified view when the corpus is heterogeneous.

    Reads ``benchmark_data["corpus_homogeneity"]`` and emits a single
    ``STRATIFICATION_RECOMMENDED`` fact when its ``"max_inter_strata_gap"``
    is at least 0.05.  Importance is ``HIGH`` when the gap is at least 0.10
    and ``MEDIUM`` otherwise.

    Nothing is emitted when the homogeneity block is missing or empty, when
    the gap is ``None`` or below 0.05, or when ``"leader_max_gap_strata"``
    holds fewer than two entries.

    Args:
        benchmark_data: Benchmark dictionary; only its
            ``"corpus_homogeneity"`` entry is read.

    Returns:
        A list holding the single recommendation fact, or an empty list.
    """
    homogeneity = benchmark_data.get("corpus_homogeneity")
    if not homogeneity:
        return []

    raw_gap = homogeneity.get("max_inter_strata_gap")
    if raw_gap is None:
        return []
    gap = float(raw_gap)
    if gap < 0.05:
        # Below 0.05 the gap is not considered worth reporting.
        return []

    leader = str(homogeneity.get("leader") or "")
    n_strata = int(homogeneity.get("n_strata") or 0)

    strata_pair = homogeneity.get("leader_max_gap_strata") or ["", ""]
    if len(strata_pair) < 2:
        return []
    low_stratum = str(strata_pair[0])
    high_stratum = str(strata_pair[1])

    # Per-stratum values for the leader; a stratum absent from the mapping
    # is reported as 0.0.
    medians = homogeneity.get("leader_per_stratum_median") or {}
    low_median = float(medians.get(low_stratum, 0.0))
    high_median = float(medians.get(high_stratum, 0.0))

    if gap >= 0.10:
        importance = FactImportance.HIGH
    else:
        importance = FactImportance.MEDIUM

    payload = {
        "leader": leader,
        "n_strata": n_strata,
        "gap_pct": round(gap * 100, 1),
        "min_stratum": low_stratum,
        "max_stratum": high_stratum,
        "min_stratum_cer_pct": round(low_median * 100, 2),
        "max_stratum_cer_pct": round(high_median * 100, 2),
    }
    involved = (leader,) if leader else ()
    return [
        Fact(
            type=FactType.STRATIFICATION_RECOMMENDED,
            importance=importance,
            payload=payload,
            engines_involved=involved,
        )
    ]