"""Tests Sprint 36 — câblage inter-moteurs au runner et au moteur narratif.

Couvre :

1. ``compute_inter_engine_analysis`` — agrégation au niveau benchmark
   (corpus complet) avec vérification des invariants (oracle ≥ best
   single, structure complète, top-N per_doc trié).
2. ``BenchmarkResult.inter_engine_analysis`` — sérialisation dans
   ``as_dict()`` quand renseigné, absent quand ``None``.
3. ``detect_ensemble_opportunity`` — déclenchement au-delà du seuil
   25 %, importance HIGH au-delà de 50 %, payload traçable, fallback
   sur per_engine_recall quand la divergence taxonomique manque.
4. Intégration ``build_synthesis`` — le détecteur s'enregistre par
   défaut et la synthèse rendue contient les valeurs du payload.
5. Garde-fou anti-hallucination — chaque nombre rendu est dans le
   payload (test de traçabilité).
"""
from __future__ import annotations
import re
import pytest
from picarones.measurements.inter_engine import compute_inter_engine_analysis
from picarones.measurements.narrative.detectors import detect_ensemble_opportunity
from picarones.core.facts import FactImportance, FactType
from picarones.measurements.narrative.renderer import extract_numbers, render_fact
# ──────────────────────────────────────────────────────────────────────────
# 1. compute_inter_engine_analysis (agrégateur)
# ──────────────────────────────────────────────────────────────────────────
class TestComputeInterEngineAnalysis:
    """Benchmark-level checks on the dict returned by
    ``compute_inter_engine_analysis``."""

    def test_returns_engines_alphabetical(self) -> None:
        """Engines supplied as zebra/alpha are listed as alpha/zebra."""
        engine_outputs = {"zebra": {"d1": "x"}, "alpha": {"d1": "x"}}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths={"d1": "x"},
        )
        assert analysis["engines"] == ["alpha", "zebra"]

    def test_two_complementary_engines_oracle_is_one(self) -> None:
        """Two engines that each recover a different half of the reference
        reach an oracle recall of 1 while neither does so alone."""
        reference = "alpha beta gamma delta"
        engine_outputs = {
            "a": {"d1": "alpha beta x y", "d2": "alpha x x x"},
            "b": {"d1": "x y gamma delta", "d2": "x beta gamma delta"},
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths={"d1": reference, "d2": reference},
        )
        complementarity = analysis["complementarity"]
        assert complementarity["oracle_recall"] == pytest.approx(1.0)
        assert complementarity["best_single_recall"] < 1.0
        assert complementarity["absolute_gap"] > 0.0
        # Every ground-truth token is recoverable -> relative_gap = 1
        assert complementarity["relative_gap"] == pytest.approx(1.0)

    def test_per_doc_top_is_sorted_by_gap(self) -> None:
        """``per_doc`` rows come ordered by decreasing ``absolute_gap``."""
        engine_outputs = {
            "a": {"d1": "x", "d2": "alpha", "d3": "alpha beta"},
            "b": {"d1": "alpha", "d2": "x", "d3": "alpha beta"},
        }
        references = {"d1": "alpha", "d2": "alpha", "d3": "alpha beta"}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths=references,
        )
        per_doc_rows = analysis["complementarity"]["per_doc"]
        gaps = [row["absolute_gap"] for row in per_doc_rows]
        assert gaps == sorted(gaps, reverse=True)

    def test_taxonomy_divergence_attached_when_distributions_provided(self) -> None:
        """Passing taxonomy distributions fills ``taxonomy_divergence`` with
        the ``js`` metric and a max pair made of the two engines."""
        distributions = {
            "a": {"visual": 0.9, "casse": 0.1},
            "b": {"visual": 0.1, "casse": 0.9},
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=distributions,
        )
        divergence = analysis["taxonomy_divergence"]
        assert divergence is not None
        assert divergence["metric"] == "js"
        assert divergence["max_pair"] is not None
        # Only the first two slots are engine names; order is not asserted.
        pair_members = {divergence["max_pair"][0], divergence["max_pair"][1]}
        assert pair_members == {"a", "b"}

    def test_no_taxonomy_means_section_none(self) -> None:
        """Without distributions the ``taxonomy_divergence`` entry is None."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=None,
        )
        assert analysis["taxonomy_divergence"] is None

    def test_oracle_at_least_best_per_engine(self) -> None:
        """Core invariant: the oracle recall is >= the recall of every
        individual engine (within a 1e-9 tolerance)."""
        reference = "alpha beta gamma delta"
        engine_outputs = {
            "a": {"d1": "alpha beta x", "d2": "alpha"},
            "b": {"d1": "x x gamma", "d2": "gamma"},
            "c": {"d1": "delta x x", "d2": "delta"},
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths={"d1": reference, "d2": reference},
        )
        complementarity = analysis["complementarity"]
        oracle_recall = complementarity["oracle_recall"]
        for engine_recall in complementarity["per_engine_recall"].values():
            assert oracle_recall >= engine_recall - 1e-9

    def test_empty_inputs_returns_no_complementarity(self) -> None:
        """Empty outputs and ground truths give ``complementarity`` None."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={},
            ground_truths={},
        )
        assert analysis["complementarity"] is None
# ──────────────────────────────────────────────────────────────────────────
# 2. BenchmarkResult expose inter_engine_analysis
# ──────────────────────────────────────────────────────────────────────────
class TestBenchmarkResultExposure:
    """``BenchmarkResult.as_dict()`` exposes ``inter_engine_analysis`` only
    when the field was provided."""

    @staticmethod
    def _make_result(**extra):
        """Build an empty-corpus ``BenchmarkResult``, forwarding *extra*
        keyword arguments to the constructor."""
        # Imported lazily, as in each test of this class originally.
        from picarones.core.results import BenchmarkResult

        return BenchmarkResult(
            corpus_name="t",
            corpus_source=None,
            document_count=0,
            engine_reports=[],
            **extra,
        )

    def test_as_dict_includes_when_set(self) -> None:
        """The key is present once an analysis dict has been supplied."""
        result = self._make_result(
            inter_engine_analysis={"engines": ["a"], "complementarity": None},
        )
        assert "inter_engine_analysis" in result.as_dict()

    def test_as_dict_omits_when_none(self) -> None:
        """The key is absent when the constructor argument is left out."""
        result = self._make_result()
        assert "inter_engine_analysis" not in result.as_dict()
# ──────────────────────────────────────────────────────────────────────────
# 3. Détecteur ENSEMBLE_OPPORTUNITY
# ──────────────────────────────────────────────────────────────────────────
def _build_data(relative_gap: float, *, with_taxonomy: bool = True) -> dict:
"""Construit un benchmark_data minimaliste pour tester le dΓ©tecteur."""
base = {
"inter_engine_analysis": {
"engines": ["tess", "pero"],
"complementarity": {
"oracle_recall": 0.95,
"best_single_recall": 0.7,
"best_engine": "pero",
"absolute_gap": 0.25,
"relative_gap": relative_gap,
"doc_count": 47,
"per_engine_recall": {"pero": 0.7, "tess": 0.5},
},
"taxonomy_divergence": (
{
"metric": "js",
"matrix": {
"tess": {"tess": 0, "pero": 0.42},
"pero": {"tess": 0.42, "pero": 0},
},
"max_pair": ["tess", "pero", 0.42],
}
if with_taxonomy
else None
),
}
}
return base
class TestEnsembleOpportunityDetector:
    """Behaviour of ``detect_ensemble_opportunity`` on synthetic fixtures.

    Inputs come from ``_build_data``; only ``relative_gap`` and the presence
    of the taxonomy section vary between tests. Tests that inspect
    ``facts[0]`` first assert that a fact was emitted, so a regression shows
    up as an assertion failure rather than an ``IndexError``.
    """

    def test_below_threshold_no_fact(self) -> None:
        """A relative gap of 0.10 produces no fact."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.10))
        assert facts == []

    def test_above_threshold_emits_fact(self) -> None:
        """A relative gap of 0.30 produces exactly one ENSEMBLE_OPPORTUNITY."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.30))
        assert len(facts) == 1
        assert facts[0].type is FactType.ENSEMBLE_OPPORTUNITY

    def test_high_importance_above_50pct(self) -> None:
        """A relative gap of 0.83 is rated HIGH."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        assert facts, "no fact emitted for relative_gap=0.83"
        assert facts[0].importance is FactImportance.HIGH

    def test_medium_importance_below_50pct(self) -> None:
        """A relative gap of 0.30 is rated MEDIUM."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.30))
        assert facts, "no fact emitted for relative_gap=0.30"
        assert facts[0].importance is FactImportance.MEDIUM

    def test_payload_uses_taxonomy_pair_when_available(self) -> None:
        """With a taxonomy section, the payload carries its max pair, the
        divergence value and the metric name."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        assert facts, "no fact emitted for relative_gap=0.83"
        payload = facts[0].payload
        assert {payload["pair_a"], payload["pair_b"]} == {"tess", "pero"}
        assert payload["divergence"] == 0.42
        assert payload["divergence_metric"] == "js"

    def test_fallback_pair_when_no_taxonomy(self) -> None:
        """Without a taxonomy section the pair is still tess/pero and the
        divergence is reported as 0.0."""
        facts = detect_ensemble_opportunity(
            _build_data(relative_gap=0.83, with_taxonomy=False),
        )
        assert facts, "no fact emitted when taxonomy divergence is missing"
        # Expected fallback: the two best engines by per_engine_recall,
        # i.e. pero (0.7) and tess (0.5).
        payload = facts[0].payload
        assert {payload["pair_a"], payload["pair_b"]} == {"tess", "pero"}
        assert payload["divergence"] == 0.0  # unknown divergence -> 0

    def test_no_inter_engine_analysis_no_fact(self) -> None:
        """A missing, ``None`` or empty analysis section yields no fact."""
        assert detect_ensemble_opportunity({}) == []
        assert detect_ensemble_opportunity({"inter_engine_analysis": None}) == []
        assert detect_ensemble_opportunity({"inter_engine_analysis": {}}) == []
# ──────────────────────────────────────────────────────────────────────────
# 4. Intégration build_synthesis
# ──────────────────────────────────────────────────────────────────────────
class TestSynthesisIntegration:
    """The ensemble detector is wired into the registry and into
    ``build_synthesis``."""

    def test_detector_registered_by_default(self) -> None:
        """``iter_detectors()`` lists an ENSEMBLE_OPPORTUNITY entry."""
        from picarones.measurements.narrative.registry import iter_detectors

        registered_types = set()
        for detector_entry in iter_detectors():
            registered_types.add(detector_entry.fact_type)
        assert FactType.ENSEMBLE_OPPORTUNITY in registered_types

    def test_synthesis_includes_ensemble_phrase(self) -> None:
        """Through the full pipeline, at least one French sentence mentions
        "voting" (case-insensitive) or the engine name "tess"."""
        from picarones.measurements.narrative import build_synthesis

        # Minimal benchmark_data meant to trigger ONLY this detector
        # (no ranking, no stats), to isolate it.
        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="fr", max_facts=5)
        rendered = synthesis["sentences"]
        assert any(
            "voting" in sentence.lower() or "tess" in sentence
            for sentence in rendered
        )

    def test_synthesis_en_locale(self) -> None:
        """In English, at least one sentence contains "majority vote"
        (case-insensitive)."""
        from picarones.measurements.narrative import build_synthesis

        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="en", max_facts=5)
        rendered = synthesis["sentences"]
        assert any("majority vote" in sentence.lower() for sentence in rendered)
# ──────────────────────────────────────────────────────────────────────────
# 5. Anti-hallucination — chaque nombre rendu doit être dans le payload
# ──────────────────────────────────────────────────────────────────────────
from tests.measurements._helpers import numbers_in_payload as _numbers_in_payload # noqa: E402
class TestTraceability:
    """Anti-hallucination guard: numbers in the rendered sentence must be
    traceable to the fact payload, and the template holds no literal digits."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Each number extracted from the rendered sentence is found among
        the numbers derived from the payload (decimal comma normalised to a
        dot before comparison)."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        assert facts
        fact = facts[0]
        sentence = render_fact(fact, lang)
        traceable = _numbers_in_payload(fact.payload)
        # Limited whitelist of constants allowed in templates
        # (none for ENSEMBLE_OPPORTUNITY: everything must come from the
        # payload).
        whitelist: set[str] = set()
        # The accepted set does not change per number, so build it once.
        accepted = traceable | whitelist
        for num in extract_numbers(sentence):
            normalized = num.replace(",", ".")
            assert normalized in accepted, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {fact.payload!r}"
            )

    def test_no_extraneous_numbers_in_template(self) -> None:
        """The French template itself contains no hard-coded digits."""
        from picarones.measurements.narrative.renderer import _load_templates

        tpl = _load_templates("fr").get("ensemble_opportunity", "")
        assert tpl
        # Look for hard-coded numbers outside {placeholder} fields: strip
        # the placeholders, then search what is left for digits.
        without_placeholders = re.sub(r"\{[^}]+\}", "", tpl)
        digits = re.findall(r"\d", without_placeholders)
        assert not digits, f"Template contient des chiffres en dur : {digits}"
|