File size: 13,723 Bytes
6377044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
 
 
 
6377044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
6377044
 
 
 
 
 
 
979f3c3
6377044
 
 
 
 
 
 
 
 
979f3c3
6377044
 
 
 
 
 
 
 
 
 
 
 
d40d01e
6377044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979f3c3
6377044
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""Tests Sprint 36 — câblage inter-moteurs au runner et au moteur narratif.

Couvre :

1. ``compute_inter_engine_analysis`` — agrégation au niveau benchmark
   (corpus complet) avec vérification des invariants (oracle ≥ best
   single, structure complète, top-N per_doc trié).
2. ``BenchmarkResult.inter_engine_analysis`` — sérialisation dans
   ``as_dict()`` quand renseigné, absent quand ``None``.
3. ``detect_ensemble_opportunity`` — déclenchement au-delà du seuil
   25 %, importance HIGH au-delà de 50 %, payload traçable, fallback
   sur per_engine_recall quand la divergence taxonomique manque.
4. Intégration ``build_synthesis`` — le détecteur s'enregistre par
   défaut et la synthèse rendue contient les valeurs du payload.
5. Garde-fou anti-hallucination — chaque nombre rendu est dans le
   payload (test de traçabilité).
"""

from __future__ import annotations

import re

import pytest

from picarones.measurements.inter_engine import compute_inter_engine_analysis
from picarones.measurements.narrative.detectors import detect_ensemble_opportunity
from picarones.core.facts import FactImportance, FactType
from picarones.measurements.narrative.renderer import extract_numbers, render_fact


# ──────────────────────────────────────────────────────────────────────────
# 1. compute_inter_engine_analysis (agrΓ©gateur)
# ──────────────────────────────────────────────────────────────────────────


class TestComputeInterEngineAnalysis:
    """Benchmark-level checks on the dict returned by
    ``compute_inter_engine_analysis``."""

    def test_returns_engines_alphabetical(self) -> None:
        """The ``engines`` entry lists engine names in alphabetical order."""
        engine_outputs = {"zebra": {"d1": "x"}, "alpha": {"d1": "x"}}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths={"d1": "x"},
        )
        assert analysis["engines"] == ["alpha", "zebra"]

    def test_two_complementary_engines_oracle_is_one(self) -> None:
        """Two engines that together cover every reference token give an
        oracle recall of 1 while the best single engine stays below 1."""
        engine_outputs = {
            "a": {"d1": "alpha beta x y", "d2": "alpha x x x"},
            "b": {"d1": "x y gamma delta", "d2": "x beta gamma delta"},
        }
        references = {
            "d1": "alpha beta gamma delta",
            "d2": "alpha beta gamma delta",
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths=references,
        )
        complementarity = analysis["complementarity"]
        assert complementarity["oracle_recall"] == pytest.approx(1.0)
        assert complementarity["best_single_recall"] < 1.0
        assert complementarity["absolute_gap"] > 0.0
        # Every ground-truth token is recoverable, so relative_gap = 1.
        assert complementarity["relative_gap"] == pytest.approx(1.0)

    def test_per_doc_top_is_sorted_by_gap(self) -> None:
        """``per_doc`` rows are ordered by decreasing ``absolute_gap``."""
        engine_outputs = {
            "a": {"d1": "x", "d2": "alpha", "d3": "alpha beta"},
            "b": {"d1": "alpha", "d2": "x", "d3": "alpha beta"},
        }
        references = {"d1": "alpha", "d2": "alpha", "d3": "alpha beta"}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths=references,
        )
        per_doc_rows = analysis["complementarity"]["per_doc"]
        observed_gaps = [row["absolute_gap"] for row in per_doc_rows]
        expected_order = sorted(observed_gaps, reverse=True)
        assert observed_gaps == expected_order

    def test_taxonomy_divergence_attached_when_distributions_provided(self) -> None:
        """Passing taxonomy distributions fills ``taxonomy_divergence``."""
        distributions = {
            "a": {"visual": 0.9, "casse": 0.1},
            "b": {"visual": 0.1, "casse": 0.9},
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=distributions,
        )
        divergence = analysis["taxonomy_divergence"]
        assert divergence is not None
        assert divergence["metric"] == "js"
        assert divergence["max_pair"] is not None
        # Only the membership of the pair matters, not its order.
        first_engine = divergence["max_pair"][0]
        second_engine = divergence["max_pair"][1]
        assert {first_engine, second_engine} == {"a", "b"}

    def test_no_taxonomy_means_section_none(self) -> None:
        """Without taxonomy distributions the section is ``None``."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=None,
        )
        assert analysis["taxonomy_divergence"] is None

    def test_oracle_at_least_best_per_engine(self) -> None:
        """Core invariant: the oracle recall is >= the recall of every
        individual engine."""
        engine_outputs = {
            "a": {"d1": "alpha beta x", "d2": "alpha"},
            "b": {"d1": "x x gamma", "d2": "gamma"},
            "c": {"d1": "delta x x", "d2": "delta"},
        }
        references = {
            "d1": "alpha beta gamma delta",
            "d2": "alpha beta gamma delta",
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths=references,
        )
        complementarity = analysis["complementarity"]
        oracle_recall = complementarity["oracle_recall"]
        # The 1e-9 slack absorbs floating-point rounding in the comparison.
        assert all(
            oracle_recall >= engine_recall - 1e-9
            for engine_recall in complementarity["per_engine_recall"].values()
        )

    def test_empty_inputs_returns_no_complementarity(self) -> None:
        """Empty inputs give a ``None`` complementarity section."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={},
            ground_truths={},
        )
        assert analysis["complementarity"] is None


# ──────────────────────────────────────────────────────────────────────────
# 2. BenchmarkResult expose inter_engine_analysis
# ──────────────────────────────────────────────────────────────────────────


class TestBenchmarkResultExposure:
    """Checks when ``BenchmarkResult.as_dict()`` carries the
    ``inter_engine_analysis`` key."""

    def test_as_dict_includes_when_set(self) -> None:
        """The key is serialized when the field is given a value."""
        from picarones.core.results import BenchmarkResult

        analysis = {"engines": ["a"], "complementarity": None}
        result = BenchmarkResult(
            corpus_name="t",
            corpus_source=None,
            document_count=0,
            engine_reports=[],
            inter_engine_analysis=analysis,
        )
        serialized = result.as_dict()
        assert "inter_engine_analysis" in serialized

    def test_as_dict_omits_when_none(self) -> None:
        """The key is absent when the field is not passed."""
        from picarones.core.results import BenchmarkResult

        result = BenchmarkResult(
            corpus_name="t",
            corpus_source=None,
            document_count=0,
            engine_reports=[],
        )
        serialized = result.as_dict()
        assert "inter_engine_analysis" not in serialized


# ──────────────────────────────────────────────────────────────────────────
# 3. DΓ©tecteur ENSEMBLE_OPPORTUNITY
# ──────────────────────────────────────────────────────────────────────────


def _build_data(relative_gap: float, *, with_taxonomy: bool = True) -> dict:
    """Build a minimal ``benchmark_data`` dict for exercising the detector.

    Args:
        relative_gap: Value stored under ``complementarity["relative_gap"]``.
        with_taxonomy: When true, a fixed ``taxonomy_divergence`` section
            is included; otherwise that section is ``None``.

    Returns:
        A fresh dict with a single ``"inter_engine_analysis"`` key.
    """
    complementarity = {
        "oracle_recall": 0.95,
        "best_single_recall": 0.7,
        "best_engine": "pero",
        "absolute_gap": 0.25,
        "relative_gap": relative_gap,
        "doc_count": 47,
        "per_engine_recall": {"pero": 0.7, "tess": 0.5},
    }

    taxonomy_divergence = None
    if with_taxonomy:
        taxonomy_divergence = {
            "metric": "js",
            "matrix": {
                "tess": {"tess": 0, "pero": 0.42},
                "pero": {"tess": 0.42, "pero": 0},
            },
            "max_pair": ["tess", "pero", 0.42],
        }

    return {
        "inter_engine_analysis": {
            "engines": ["tess", "pero"],
            "complementarity": complementarity,
            "taxonomy_divergence": taxonomy_divergence,
        }
    }


class TestEnsembleOpportunityDetector:
    """Behaviour of ``detect_ensemble_opportunity`` on fixtures built by
    ``_build_data``."""

    def test_below_threshold_no_fact(self) -> None:
        """A relative gap of 0.10 produces no fact."""
        benchmark_data = _build_data(relative_gap=0.10)
        assert detect_ensemble_opportunity(benchmark_data) == []

    def test_above_threshold_emits_fact(self) -> None:
        """A relative gap of 0.30 produces exactly one fact of the
        expected type."""
        emitted = detect_ensemble_opportunity(_build_data(relative_gap=0.30))
        assert len(emitted) == 1
        first_fact = emitted[0]
        assert first_fact.type is FactType.ENSEMBLE_OPPORTUNITY

    def test_high_importance_above_50pct(self) -> None:
        """A relative gap of 0.83 is rated HIGH."""
        emitted = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        first_fact = emitted[0]
        assert first_fact.importance is FactImportance.HIGH

    def test_medium_importance_below_50pct(self) -> None:
        """A relative gap of 0.30 is rated MEDIUM."""
        emitted = detect_ensemble_opportunity(_build_data(relative_gap=0.30))
        first_fact = emitted[0]
        assert first_fact.importance is FactImportance.MEDIUM

    def test_payload_uses_taxonomy_pair_when_available(self) -> None:
        """With a taxonomy section, the payload reports its max pair,
        divergence value and metric."""
        emitted = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        payload = emitted[0].payload
        reported_pair = {payload["pair_a"], payload["pair_b"]}
        assert reported_pair == {"tess", "pero"}
        assert payload["divergence"] == 0.42
        assert payload["divergence_metric"] == "js"

    def test_fallback_pair_when_no_taxonomy(self) -> None:
        """Without a taxonomy section the pair is still reported and the
        divergence is 0."""
        benchmark_data = _build_data(relative_gap=0.83, with_taxonomy=False)
        emitted = detect_ensemble_opportunity(benchmark_data)
        # The fallback takes the two best engines by per_engine_recall:
        # pero (0.7) and tess (0.5).
        payload = emitted[0].payload
        reported_pair = {payload["pair_a"], payload["pair_b"]}
        assert reported_pair == {"tess", "pero"}
        assert payload["divergence"] == 0.0  # unknown divergence -> 0

    def test_no_inter_engine_analysis_no_fact(self) -> None:
        """A missing, ``None`` or empty analysis section produces no fact."""
        degenerate_inputs = (
            {},
            {"inter_engine_analysis": None},
            {"inter_engine_analysis": {}},
        )
        for benchmark_data in degenerate_inputs:
            assert detect_ensemble_opportunity(benchmark_data) == []


# ──────────────────────────────────────────────────────────────────────────
# 4. IntΓ©gration build_synthesis
# ──────────────────────────────────────────────────────────────────────────


class TestSynthesisIntegration:
    """End-to-end checks through the detector registry and
    ``build_synthesis``."""

    def test_detector_registered_by_default(self) -> None:
        """The registry exposes an ENSEMBLE_OPPORTUNITY detector."""
        from picarones.measurements.narrative.registry import iter_detectors

        registered_types = {detector.fact_type for detector in iter_detectors()}
        assert FactType.ENSEMBLE_OPPORTUNITY in registered_types

    def test_synthesis_includes_ensemble_phrase(self) -> None:
        """The detector fires in the full pipeline and the French output
        contains an ensemble-related sentence."""
        from picarones.measurements.narrative import build_synthesis

        def mentions_ensemble(sentence: str) -> bool:
            # Either the voting keyword (case-insensitive) or the engine
            # name "tess" (case-sensitive) counts as a match.
            return "voting" in sentence.lower() or "tess" in sentence

        # Minimal benchmark_data meant to trigger ONLY this detector
        # (no ranking, no stats), to isolate it.
        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="fr", max_facts=5)
        rendered = synthesis["sentences"]
        assert any(mentions_ensemble(sentence) for sentence in rendered)

    def test_synthesis_en_locale(self) -> None:
        """The English output contains a "majority vote" sentence."""
        from picarones.measurements.narrative import build_synthesis

        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="en", max_facts=5)
        rendered = synthesis["sentences"]
        assert any("majority vote" in sentence.lower() for sentence in rendered)

# ──────────────────────────────────────────────────────────────────────────
# 5. Anti-hallucination β€” chaque nombre rendu doit Γͺtre dans le payload
# ──────────────────────────────────────────────────────────────────────────


from tests.measurements._helpers import numbers_in_payload as _numbers_in_payload  # noqa: E402


class TestTraceability:
    """Anti-hallucination guard: numbers in rendered sentences must come
    from the fact payload, not from the templates."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Every number extracted from the rendered sentence is found
        among the numbers derived from the payload."""
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        assert facts
        sentence = render_fact(facts[0], lang)
        traceable = _numbers_in_payload(facts[0].payload)
        # Whitelist of constants allowed in the templates (none for
        # ENSEMBLE_OPPORTUNITY: everything must come from the payload).
        whitelist: set[str] = set()
        allowed = traceable | whitelist
        for num in extract_numbers(sentence):
            # Rendered numbers may use a decimal comma; compare with a dot.
            normalized = num.replace(",", ".")
            assert normalized in allowed, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {facts[0].payload!r}"
            )

    def test_no_extraneous_numbers_in_template(self) -> None:
        """The French template itself contains no hard-coded digits."""
        from picarones.measurements.narrative.renderer import _load_templates

        tpl = _load_templates("fr").get("ensemble_opportunity", "")
        assert tpl
        # Strip the {placeholder} fields, then look for any digit left in
        # the literal template text.
        without_placeholders = re.sub(r"\{[^}]+\}", "", tpl)
        digits = re.findall(r"\d", without_placeholders)
        assert not digits, f"Template contient des chiffres en dur : {digits}"