Spaces:
Sleeping
Sleeping
Claude
test: rΓ©organiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified | """Tests Sprint 73 β A.I.3 : dΓ©tecteur ``engine_off_baseline``. | |
| Couvre : | |
| 1. ``compute_engine_baseline`` : | |
| - Cas standard : β₯ min_runs, Γ©cart > seuil β off_baseline=True | |
| - Γcart faible β off_baseline=False | |
| - Moins de min_runs β ``None`` | |
| - Baseline = 0 β ``relative_delta = None`` (et off si CER > 0) | |
| - ``current_run_id`` exclu de la baseline | |
| - Filtre par engine + corpus respectΓ© | |
| - CER historiques None ignorΓ©s | |
| 2. ``compute_corpus_difficulty_percentile`` : | |
| - Calcul de percentile correct | |
| - ``harder_than_usual`` au-dessus de P75 | |
| - ``easier_than_usual`` en-dessous de P25 | |
| - Moins de min_runs β ``None`` | |
| 3. DΓ©tecteur ``detect_engine_off_baseline`` : | |
| - Silencieux si pas de ``baseline_comparisons`` | |
| - Γmet 1 Fact par moteur off_baseline | |
| - Importance HIGH si |delta| β₯ 50 %, MEDIUM sinon | |
| - Payload contient les nombres exacts pour traΓ§abilitΓ© | |
| 4. Rendu narratif : chaque nombre rendu est traΓ§able au payload | |
| (anti-hallucination, FR + EN). | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any, Optional | |
| import pytest | |
| from picarones.measurements.baseline_comparison import ( | |
| compute_corpus_difficulty_percentile, | |
| compute_engine_baseline, | |
| ) | |
| from picarones.measurements.narrative.detectors import detect_engine_off_baseline | |
| from picarones.core.facts import FactImportance, FactType | |
| from picarones.measurements.narrative.renderer import render_fact | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Mock BenchmarkHistory | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class _Entry: | |
| run_id: str | |
| engine_name: str | |
| corpus_name: str | |
| cer_mean: Optional[float] | |
| metadata: dict = field(default_factory=dict) | |
| class _MockHistory: | |
| def __init__(self, entries: list[_Entry]) -> None: | |
| self._entries = entries | |
| def query( | |
| self, | |
| engine: Optional[str] = None, | |
| corpus: Optional[str] = None, | |
| since: Optional[str] = None, | |
| limit: int = 100, | |
| ) -> list[Any]: | |
| out = [] | |
| for e in self._entries: | |
| if engine and e.engine_name != engine: | |
| continue | |
| if corpus and e.corpus_name != corpus: | |
| continue | |
| out.append(e) | |
| return out[:limit] | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 1. compute_engine_baseline | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestEngineBaseline: | |
| def test_off_baseline_higher(self) -> None: | |
| # 10 runs historiques Γ 4 % CER, run courant Γ 5,2 % β +30 % | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "corpus_A", 0.04) | |
| for i in range(10) | |
| ]) | |
| result = compute_engine_baseline( | |
| history, "tess", "corpus_A", current_cer=0.052, | |
| ) | |
| assert result is not None | |
| assert result["n_runs"] == 10 | |
| assert result["cer_current"] == 0.052 | |
| assert result["cer_historical_mean"] == pytest.approx(0.04) | |
| assert result["absolute_delta"] == pytest.approx(0.012) | |
| assert result["relative_delta"] == pytest.approx(0.30) | |
| assert result["off_baseline"] is True | |
| def test_within_baseline(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "c", 0.04) | |
| for i in range(10) | |
| ]) | |
| # Run courant Γ 4,1 % β Γ©cart 2,5 %, sous le seuil 20 % | |
| result = compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.041, | |
| ) | |
| assert result is not None | |
| assert result["off_baseline"] is False | |
| def test_min_runs_filter(self) -> None: | |
| # Seulement 4 runs β sous le min_runs=5 | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "c", 0.04) for i in range(4) | |
| ]) | |
| assert compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.05, | |
| ) is None | |
| def test_custom_min_runs(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "c", 0.04) for i in range(3) | |
| ]) | |
| # min_runs=2 β assez | |
| result = compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.05, min_runs=2, | |
| ) | |
| assert result is not None | |
| assert result["n_runs"] == 3 | |
| def test_current_run_excluded(self) -> None: | |
| history = _MockHistory([ | |
| _Entry("current", "tess", "c", 0.20), # run courant dΓ©jΓ loggΓ© | |
| *[_Entry(f"r{i}", "tess", "c", 0.04) for i in range(5)], | |
| ]) | |
| result = compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.05, | |
| current_run_id="current", | |
| ) | |
| assert result is not None | |
| # Le 0,20 ne doit pas tirer la moyenne historique | |
| assert result["n_runs"] == 5 | |
| assert result["cer_historical_mean"] == pytest.approx(0.04) | |
| def test_filter_by_engine_and_corpus(self) -> None: | |
| history = _MockHistory([ | |
| *[_Entry(f"r{i}", "tess", "corpus_A", 0.04) for i in range(5)], | |
| # MΓͺmes runs sur autre corpus β ne doivent pas compter | |
| *[_Entry(f"o{i}", "tess", "corpus_B", 0.20) for i in range(5)], | |
| # Autre moteur, mΓͺme corpus β ne doivent pas compter | |
| *[_Entry(f"p{i}", "pero", "corpus_A", 0.99) for i in range(5)], | |
| ]) | |
| result = compute_engine_baseline( | |
| history, "tess", "corpus_A", current_cer=0.05, | |
| ) | |
| assert result is not None | |
| assert result["n_runs"] == 5 | |
| assert result["cer_historical_mean"] == pytest.approx(0.04) | |
| def test_cer_none_ignored(self) -> None: | |
| history = _MockHistory([ | |
| _Entry("r1", "tess", "c", None), | |
| _Entry("r2", "tess", "c", -0.5), # nΓ©gatif β ignorΓ© | |
| *[_Entry(f"r{i}", "tess", "c", 0.04) for i in range(3, 8)], | |
| ]) | |
| result = compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.05, | |
| ) | |
| assert result is not None | |
| assert result["n_runs"] == 5 | |
| def test_baseline_zero_returns_none_relative(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "c", 0.0) for i in range(5) | |
| ]) | |
| result = compute_engine_baseline( | |
| history, "tess", "c", current_cer=0.05, | |
| ) | |
| assert result is not None | |
| assert result["relative_delta"] is None | |
| assert result["off_baseline"] is False # not calculable | |
| def test_invalid_current_cer(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "tess", "c", 0.04) for i in range(5) | |
| ]) | |
| assert compute_engine_baseline( | |
| history, "tess", "c", current_cer=None, # type: ignore | |
| ) is None | |
| assert compute_engine_baseline( | |
| history, "tess", "c", current_cer=-0.1, | |
| ) is None | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 2. compute_corpus_difficulty_percentile | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestCorpusDifficultyPercentile: | |
| def test_percentile_calculation(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": d}) | |
| for i, d in enumerate([0.1, 0.2, 0.3, 0.4, 0.5]) | |
| ]) | |
| result = compute_corpus_difficulty_percentile(history, 0.45) | |
| assert result is not None | |
| # 4 sur 5 valeurs β€ 0.45 β P80 | |
| assert result["percentile"] == pytest.approx(80.0) | |
| assert result["n_runs"] == 5 | |
| def test_harder_than_usual(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": 0.1 * i}) | |
| for i in range(1, 11) # 10 valeurs : 0.1 .. 1.0 | |
| ]) | |
| # 0.95 β percentile 90 β harder | |
| result = compute_corpus_difficulty_percentile(history, 0.95) | |
| assert result is not None | |
| assert result["harder_than_usual"] is True | |
| assert result["easier_than_usual"] is False | |
| def test_easier_than_usual(self) -> None: | |
| history = _MockHistory([ | |
| _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": 0.1 * i}) | |
| for i in range(1, 11) | |
| ]) | |
| result = compute_corpus_difficulty_percentile(history, 0.05) | |
| assert result is not None | |
| assert result["easier_than_usual"] is True | |
| assert result["harder_than_usual"] is False | |
| def test_min_runs_filter(self) -> None: | |
| history = _MockHistory([ | |
| _Entry("r1", "x", "c", 0.04, metadata={"difficulty": 0.5}), | |
| ]) | |
| assert compute_corpus_difficulty_percentile(history, 0.5) is None | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 3. DΓ©tecteur narratif | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestDetector: | |
| def test_silent_without_baseline_data(self) -> None: | |
| assert detect_engine_off_baseline({}) == [] | |
| assert detect_engine_off_baseline( | |
| {"baseline_comparisons": []}, | |
| ) == [] | |
| def test_silent_when_off_baseline_false(self) -> None: | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "t", "cer_current": 0.04, | |
| "cer_historical_mean": 0.04, "n_runs": 10, | |
| "relative_delta": 0.0, "off_baseline": False, | |
| }, | |
| ], | |
| }) | |
| assert facts == [] | |
| def test_silent_when_relative_delta_none(self) -> None: | |
| # Baseline = 0 β relative None β on s'abstient | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "t", "cer_current": 0.05, | |
| "cer_historical_mean": 0.0, "n_runs": 10, | |
| "relative_delta": None, "off_baseline": True, | |
| }, | |
| ], | |
| }) | |
| assert facts == [] | |
| def test_emits_fact_for_off_baseline(self) -> None: | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "tess", "cer_current": 0.052, | |
| "cer_historical_mean": 0.041, "n_runs": 12, | |
| "relative_delta": 0.268, "off_baseline": True, | |
| }, | |
| ], | |
| }) | |
| assert len(facts) == 1 | |
| f = facts[0] | |
| assert f.type == FactType.ENGINE_OFF_BASELINE | |
| assert f.importance == FactImportance.MEDIUM | |
| assert f.payload["engine"] == "tess" | |
| assert f.payload["cer_current_pct"] == 5.2 | |
| assert f.payload["cer_historical_mean_pct"] == 4.1 | |
| assert f.payload["n_runs"] == 12 | |
| assert f.payload["relative_delta_pct"] == 26.8 | |
| assert f.payload["direction"] == "higher" | |
| assert f.engines_involved == ("tess",) | |
| def test_high_importance_above_50pct(self) -> None: | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "x", "cer_current": 0.08, | |
| "cer_historical_mean": 0.04, "n_runs": 10, | |
| "relative_delta": 1.0, "off_baseline": True, | |
| }, | |
| ], | |
| }) | |
| assert facts[0].importance == FactImportance.HIGH | |
| def test_multiple_engines(self) -> None: | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "tess", "cer_current": 0.05, | |
| "cer_historical_mean": 0.04, "n_runs": 10, | |
| "relative_delta": 0.25, "off_baseline": True, | |
| }, | |
| { | |
| "engine_name": "pero", "cer_current": 0.03, | |
| "cer_historical_mean": 0.04, "n_runs": 10, | |
| "relative_delta": -0.25, "off_baseline": True, | |
| }, | |
| ], | |
| }) | |
| assert len(facts) == 2 | |
| assert facts[1].payload["direction"] == "lower" | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # 4. TraΓ§abilitΓ© anti-hallucination | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestTraceability: | |
| def test_each_number_in_rendered_text_is_in_payload( | |
| self, lang: str, | |
| ) -> None: | |
| import re | |
| facts = detect_engine_off_baseline({ | |
| "baseline_comparisons": [ | |
| { | |
| "engine_name": "tess", "cer_current": 0.052, | |
| "cer_historical_mean": 0.041, "n_runs": 12, | |
| "relative_delta": 0.268, "off_baseline": True, | |
| }, | |
| ], | |
| }) | |
| text = render_fact(facts[0], lang=lang) | |
| assert text # non vide | |
| # Chaque nombre dans le texte doit venir du payload (ou d'une | |
| # constante de template β ici aucune) | |
| payload_nums = { | |
| "5.2", "4.1", "12", "26.8", | |
| } | |
| rendered_nums = set(re.findall(r"\d+\.?\d*", text)) | |
| for num in rendered_nums: | |
| assert num in payload_nums, ( | |
| f"nombre rendu {num!r} non traΓ§able au payload" | |
| ) | |