Spaces:
Running
Running
Claude
chore: supprimer unregister_job mort + factoriser _numbers_in_payload partagé
d40d01e unverified | """Tests Sprint 36 — câblage inter-moteurs au runner et au moteur narratif. | |
| Couvre : | |
| 1. ``compute_inter_engine_analysis`` — agrégation au niveau benchmark | |
| (corpus complet) avec vérification des invariants (oracle ≥ best | |
| single, structure complète, top-N per_doc trié). | |
| 2. ``BenchmarkResult.inter_engine_analysis`` — sérialisation dans | |
| ``as_dict()`` quand renseigné, absent quand ``None``. | |
| 3. ``detect_ensemble_opportunity`` — déclenchement au-delà du seuil | |
| 25 %, importance HIGH au-delà de 50 %, payload traçable, fallback | |
| sur per_engine_recall quand la divergence taxonomique manque. | |
| 4. Intégration ``build_synthesis`` — le détecteur s'enregistre par | |
| défaut et la synthèse rendue contient les valeurs du payload. | |
| 5. Garde-fou anti-hallucination — chaque nombre rendu est dans le | |
| payload (test de traçabilité). | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import pytest | |
| from picarones.measurements.inter_engine import compute_inter_engine_analysis | |
| from picarones.measurements.narrative.detectors import detect_ensemble_opportunity | |
| from picarones.core.facts import FactImportance, FactType | |
| from picarones.measurements.narrative.renderer import extract_numbers, render_fact | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. compute_inter_engine_analysis (agrégateur) | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestComputeInterEngineAnalysis:
    """Checks on the dict returned by ``compute_inter_engine_analysis``."""

    def test_returns_engines_alphabetical(self) -> None:
        """Engines supplied as zebra/alpha are reported as alpha, zebra."""
        engine_outputs = {"zebra": {"d1": "x"}, "alpha": {"d1": "x"}}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths={"d1": "x"},
        )
        assert analysis["engines"] == ["alpha", "zebra"]

    def test_two_complementary_engines_oracle_is_one(self) -> None:
        """Two engines that each recover different tokens give an oracle of 1."""
        reference = "alpha beta gamma delta"
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={
                "a": {"d1": "alpha beta x y", "d2": "alpha x x x"},
                "b": {"d1": "x y gamma delta", "d2": "x beta gamma delta"},
            },
            ground_truths={"d1": reference, "d2": reference},
        )
        complementarity = analysis["complementarity"]
        assert complementarity["oracle_recall"] == pytest.approx(1.0)
        assert complementarity["best_single_recall"] < 1.0
        assert complementarity["absolute_gap"] > 0.0
        # Every ground-truth token is recoverable -> relative_gap = 1
        assert complementarity["relative_gap"] == pytest.approx(1.0)

    def test_per_doc_top_is_sorted_by_gap(self) -> None:
        """``per_doc`` entries are ordered by decreasing ``absolute_gap``."""
        engine_outputs = {
            "a": {"d1": "x", "d2": "alpha", "d3": "alpha beta"},
            "b": {"d1": "alpha", "d2": "x", "d3": "alpha beta"},
        }
        references = {"d1": "alpha", "d2": "alpha", "d3": "alpha beta"}
        analysis = compute_inter_engine_analysis(
            per_engine_outputs=engine_outputs,
            ground_truths=references,
        )
        per_doc_rows = analysis["complementarity"]["per_doc"]
        observed = [row["absolute_gap"] for row in per_doc_rows]
        descending = list(observed)
        descending.sort(reverse=True)
        assert observed == descending

    def test_taxonomy_divergence_attached_when_distributions_provided(self) -> None:
        """Passing taxonomy distributions fills the ``taxonomy_divergence`` section."""
        distributions = {
            "a": {"visual": 0.9, "casse": 0.1},
            "b": {"visual": 0.1, "casse": 0.9},
        }
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=distributions,
        )
        divergence = analysis["taxonomy_divergence"]
        assert divergence is not None
        assert divergence["metric"] == "js"
        assert divergence["max_pair"] is not None
        first_engine = divergence["max_pair"][0]
        second_engine = divergence["max_pair"][1]
        assert {first_engine, second_engine} == {"a", "b"}

    def test_no_taxonomy_means_section_none(self) -> None:
        """Without taxonomy distributions the section is ``None``."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={"a": {"d1": "x"}, "b": {"d1": "y"}},
            ground_truths={"d1": "x"},
            taxonomy_distributions=None,
        )
        assert analysis["taxonomy_divergence"] is None

    def test_oracle_at_least_best_per_engine(self) -> None:
        """Core invariant: the oracle recall is >= every single engine's recall."""
        reference = "alpha beta gamma delta"
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={
                "a": {"d1": "alpha beta x", "d2": "alpha"},
                "b": {"d1": "x x gamma", "d2": "gamma"},
                "c": {"d1": "delta x x", "d2": "delta"},
            },
            ground_truths={"d1": reference, "d2": reference},
        )
        complementarity = analysis["complementarity"]
        oracle_recall = complementarity["oracle_recall"]
        for engine_recall in complementarity["per_engine_recall"].values():
            # 1e-9 tolerance absorbs floating-point noise in the comparison.
            assert oracle_recall >= engine_recall - 1e-9

    def test_empty_inputs_returns_no_complementarity(self) -> None:
        """Empty outputs and ground truths yield ``complementarity`` of ``None``."""
        analysis = compute_inter_engine_analysis(
            per_engine_outputs={},
            ground_truths={},
        )
        assert analysis["complementarity"] is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. BenchmarkResult expose inter_engine_analysis | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestBenchmarkResultExposure:
    """``BenchmarkResult.as_dict()`` exposes ``inter_engine_analysis`` only when set."""

    def test_as_dict_includes_when_set(self) -> None:
        """The key is present in ``as_dict()`` when an analysis is supplied."""
        from picarones.core.results import BenchmarkResult

        analysis = {"engines": ["a"], "complementarity": None}
        result = BenchmarkResult(
            corpus_name="t",
            corpus_source=None,
            document_count=0,
            engine_reports=[],
            inter_engine_analysis=analysis,
        )
        serialized = result.as_dict()
        assert "inter_engine_analysis" in serialized

    def test_as_dict_omits_when_none(self) -> None:
        """The key is absent from ``as_dict()`` when no analysis is supplied."""
        from picarones.core.results import BenchmarkResult

        result = BenchmarkResult(
            corpus_name="t",
            corpus_source=None,
            document_count=0,
            engine_reports=[],
        )
        serialized = result.as_dict()
        assert "inter_engine_analysis" not in serialized
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. Détecteur ENSEMBLE_OPPORTUNITY | |
| # ────────────────────────────────────────────────────────────────────────── | |
| def _build_data(relative_gap: float, *, with_taxonomy: bool = True) -> dict: | |
| """Construit un benchmark_data minimaliste pour tester le dΓ©tecteur.""" | |
| base = { | |
| "inter_engine_analysis": { | |
| "engines": ["tess", "pero"], | |
| "complementarity": { | |
| "oracle_recall": 0.95, | |
| "best_single_recall": 0.7, | |
| "best_engine": "pero", | |
| "absolute_gap": 0.25, | |
| "relative_gap": relative_gap, | |
| "doc_count": 47, | |
| "per_engine_recall": {"pero": 0.7, "tess": 0.5}, | |
| }, | |
| "taxonomy_divergence": ( | |
| { | |
| "metric": "js", | |
| "matrix": { | |
| "tess": {"tess": 0, "pero": 0.42}, | |
| "pero": {"tess": 0.42, "pero": 0}, | |
| }, | |
| "max_pair": ["tess", "pero", 0.42], | |
| } | |
| if with_taxonomy | |
| else None | |
| ), | |
| } | |
| } | |
| return base | |
class TestEnsembleOpportunityDetector:
    """Behaviour of ``detect_ensemble_opportunity`` on synthetic benchmark data."""

    def test_below_threshold_no_fact(self) -> None:
        """A relative gap of 0.10 produces no fact."""
        benchmark_data = _build_data(relative_gap=0.10)
        assert detect_ensemble_opportunity(benchmark_data) == []

    def test_above_threshold_emits_fact(self) -> None:
        """A relative gap of 0.30 produces exactly one ENSEMBLE_OPPORTUNITY fact."""
        benchmark_data = _build_data(relative_gap=0.30)
        detected = detect_ensemble_opportunity(benchmark_data)
        assert len(detected) == 1
        assert detected[0].type is FactType.ENSEMBLE_OPPORTUNITY

    def test_high_importance_above_50pct(self) -> None:
        """A relative gap of 0.83 is rated HIGH."""
        detected = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        first_fact = detected[0]
        assert first_fact.importance is FactImportance.HIGH

    def test_medium_importance_below_50pct(self) -> None:
        """A relative gap of 0.30 is rated MEDIUM."""
        detected = detect_ensemble_opportunity(_build_data(relative_gap=0.30))
        first_fact = detected[0]
        assert first_fact.importance is FactImportance.MEDIUM

    def test_payload_uses_taxonomy_pair_when_available(self) -> None:
        """With taxonomy data, the payload carries the max-divergence pair."""
        detected = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        payload = detected[0].payload
        engine_pair = {payload["pair_a"], payload["pair_b"]}
        assert engine_pair == {"tess", "pero"}
        assert payload["divergence"] == 0.42
        assert payload["divergence_metric"] == "js"

    def test_fallback_pair_when_no_taxonomy(self) -> None:
        """Without taxonomy data, the pair comes from ``per_engine_recall``."""
        benchmark_data = _build_data(relative_gap=0.83, with_taxonomy=False)
        detected = detect_ensemble_opportunity(benchmark_data)
        # The fallback picks the two best engines by per_engine_recall:
        # pero (0.7) and tess (0.5)
        payload = detected[0].payload
        engine_pair = {payload["pair_a"], payload["pair_b"]}
        assert engine_pair == {"tess", "pero"}
        assert payload["divergence"] == 0.0  # unknown divergence -> 0

    def test_no_inter_engine_analysis_no_fact(self) -> None:
        """Missing, ``None`` or empty analysis sections all yield no fact."""
        degenerate_inputs = (
            {},
            {"inter_engine_analysis": None},
            {"inter_engine_analysis": {}},
        )
        for benchmark_data in degenerate_inputs:
            assert detect_ensemble_opportunity(benchmark_data) == []
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. Intégration build_synthesis | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestSynthesisIntegration:
    """End-to-end checks that the detector is wired into ``build_synthesis``."""

    def test_detector_registered_by_default(self) -> None:
        """ENSEMBLE_OPPORTUNITY is among the fact types yielded by the registry."""
        from picarones.measurements.narrative.registry import iter_detectors

        registered_types = set()
        for entry in iter_detectors():
            registered_types.add(entry.fact_type)
        assert FactType.ENSEMBLE_OPPORTUNITY in registered_types

    def test_synthesis_includes_ensemble_phrase(self) -> None:
        """The detector fires in the full pipeline and its sentence is rendered.

        The check passes when any sentence mentions "voting"
        (case-insensitive) or the engine name "tess".
        """
        from picarones.measurements.narrative import build_synthesis

        def mentions_ensemble(sentence: str) -> bool:
            return "voting" in sentence.lower() or "tess" in sentence

        # Minimal benchmark_data intended to trigger ONLY this detector (no
        # ranking, no stats), to isolate it.
        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="fr", max_facts=5)
        rendered = synthesis["sentences"]
        assert any(mentions_ensemble(sentence) for sentence in rendered)

    def test_synthesis_en_locale(self) -> None:
        """The English synthesis contains the phrase "majority vote"."""
        from picarones.measurements.narrative import build_synthesis

        benchmark_data = _build_data(relative_gap=0.83)
        synthesis = build_synthesis(benchmark_data, lang="en", max_facts=5)
        rendered = synthesis["sentences"]
        assert any("majority vote" in sentence.lower() for sentence in rendered)
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. Anti-hallucination — chaque nombre rendu doit être dans le payload | |
| # ────────────────────────────────────────────────────────────────────────── | |
| from tests.measurements._helpers import numbers_in_payload as _numbers_in_payload # noqa: E402 | |
class TestTraceability:
    """Anti-hallucination guard: rendered numbers must come from the payload."""

    # ``lang`` needs a value source for pytest to collect this test;
    # parametrizing over the two locales this file already uses with
    # ``build_synthesis`` covers both.
    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_every_rendered_number_is_in_payload(self, lang: str) -> None:
        """Every number in the rendered sentence is traceable to the payload.

        Args:
            lang: Locale code passed to ``render_fact``.
        """
        facts = detect_ensemble_opportunity(_build_data(relative_gap=0.83))
        assert facts
        fact = facts[0]
        sentence = render_fact(fact, lang)
        traceable = _numbers_in_payload(fact.payload)
        # Restricted whitelist of constants allowed inside templates
        # (none for ENSEMBLE_OPPORTUNITY: everything must come from the payload).
        whitelist: set[str] = set()
        # The union does not change between iterations, so compute it once.
        allowed = traceable | whitelist
        for num in extract_numbers(sentence):
            # Normalise the decimal comma to a dot before the lookup.
            normalized = num.replace(",", ".")
            assert normalized in allowed, (
                f"Nombre {normalized!r} dans la phrase rendue n'est pas "
                f"traçable au payload {fact.payload!r}"
            )

    def test_no_extraneous_numbers_in_template(self) -> None:
        """The French template itself contains no hard-coded digits."""
        from picarones.measurements.narrative.renderer import _load_templates

        tpl = _load_templates("fr").get("ensemble_opportunity", "")
        assert tpl
        # Look for hard-coded numbers outside {placeholder} fields: strip the
        # placeholders, then search what remains for digits.
        without_placeholders = re.sub(r"\{[^}]+\}", "", tpl)
        digits = re.findall(r"\d", without_placeholders)
        assert not digits, f"Template contient des chiffres en dur : {digits}"