Picarones / tests /measurements /test_sprint73_baseline_comparison.py
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified
Raw
History Blame
14.9 kB
"""Tests Sprint 73 — A.I.3 : détecteur ``engine_off_baseline``.

Couvre :

1. ``compute_engine_baseline`` :
   - Cas standard : ≥ min_runs, écart > seuil → off_baseline=True
   - Écart faible → off_baseline=False
   - Moins de min_runs → ``None``
   - Baseline = 0 → ``relative_delta = None`` (et off si CER > 0)
   - ``current_run_id`` exclu de la baseline
   - Filtre par engine + corpus respecté
   - CER historiques None ignorés
2. ``compute_corpus_difficulty_percentile`` :
   - Calcul de percentile correct
   - ``harder_than_usual`` au-dessus de P75
   - ``easier_than_usual`` en-dessous de P25
   - Moins de min_runs → ``None``
3. Détecteur ``detect_engine_off_baseline`` :
   - Silencieux si pas de ``baseline_comparisons``
   - Émet 1 Fact par moteur off_baseline
   - Importance HIGH si |delta| ≥ 50 %, MEDIUM sinon
   - Payload contient les nombres exacts pour traçabilité
4. Rendu narratif : chaque nombre rendu est traçable au payload
   (anti-hallucination, FR + EN).
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Optional
import pytest
from picarones.measurements.baseline_comparison import (
compute_corpus_difficulty_percentile,
compute_engine_baseline,
)
from picarones.measurements.narrative.detectors import detect_engine_off_baseline
from picarones.core.facts import FactImportance, FactType
from picarones.measurements.narrative.renderer import render_fact
# ──────────────────────────────────────────────────────────────────────────
# Mock BenchmarkHistory
# ──────────────────────────────────────────────────────────────────────────
@dataclass
class _Entry:
    """Minimal stand-in for one benchmark-history record used by these tests.

    Only the attributes the tests populate are modelled: the run identifier,
    the engine/corpus pair used for filtering, the mean CER and a free-form
    metadata mapping.
    """

    run_id: str
    engine_name: str
    corpus_name: str
    # ``None`` models a historical run without a recorded CER.
    cer_mean: Optional[float]
    # ``default_factory`` gives every instance its own dict (no shared state).
    metadata: dict = field(default_factory=dict)
class _MockHistory:
    """In-memory test double exposing a ``query`` method over a fixed entry list.

    Entries are kept in the order given and returned in that same order.
    """

    def __init__(self, entries: "list[_Entry]") -> None:
        """Store ``entries`` as the backing list (not copied)."""
        self._entries = entries

    def query(
        self,
        engine: Optional[str] = None,
        corpus: Optional[str] = None,
        since: Optional[str] = None,
        limit: int = 100,
    ) -> list[Any]:
        """Return at most ``limit`` entries matching ``engine`` and ``corpus``.

        Args:
            engine: Keep only entries whose ``engine_name`` equals this value;
                a falsy value (``None`` or ``""``) disables the filter.
            corpus: Keep only entries whose ``corpus_name`` equals this value;
                a falsy value disables the filter.
            since: Accepted but ignored by this mock.
            limit: Maximum number of entries returned (applied by slicing).

        Returns:
            The matching entries, in insertion order.
        """
        matches = [
            entry
            for entry in self._entries
            if not (engine and entry.engine_name != engine)
            and not (corpus and entry.corpus_name != corpus)
        ]
        return matches[:limit]
# ──────────────────────────────────────────────────────────────────────────
# 1. compute_engine_baseline
# ──────────────────────────────────────────────────────────────────────────
class TestEngineBaseline:
    """Pin the behaviour of ``compute_engine_baseline`` against a mock history."""

    def test_off_baseline_higher(self) -> None:
        """A run 30 % above the historical mean is flagged off-baseline."""
        # 10 historical runs at 4 % CER, current run at 5.2 % -> +30 %
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "corpus_A", 0.04)
            for i in range(10)
        ])
        result = compute_engine_baseline(
            history, "tess", "corpus_A", current_cer=0.052,
        )
        assert result is not None
        assert result["n_runs"] == 10
        assert result["cer_current"] == 0.052
        assert result["cer_historical_mean"] == pytest.approx(0.04)
        assert result["absolute_delta"] == pytest.approx(0.012)
        assert result["relative_delta"] == pytest.approx(0.30)
        assert result["off_baseline"] is True

    def test_within_baseline(self) -> None:
        """A small deviation from the historical mean is not flagged."""
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "c", 0.04)
            for i in range(10)
        ])
        # Current run at 4.1 % -> 2.5 % gap, below the 20 % threshold
        result = compute_engine_baseline(
            history, "tess", "c", current_cer=0.041,
        )
        assert result is not None
        assert result["off_baseline"] is False

    def test_min_runs_filter(self) -> None:
        """Too few historical runs yields ``None``."""
        # Only 4 runs -> below min_runs=5
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "c", 0.04) for i in range(4)
        ])
        assert compute_engine_baseline(
            history, "tess", "c", current_cer=0.05,
        ) is None

    def test_custom_min_runs(self) -> None:
        """An explicit ``min_runs`` lowers the history size requirement."""
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "c", 0.04) for i in range(3)
        ])
        # min_runs=2 -> 3 runs are enough
        result = compute_engine_baseline(
            history, "tess", "c", current_cer=0.05, min_runs=2,
        )
        assert result is not None
        assert result["n_runs"] == 3

    def test_current_run_excluded(self) -> None:
        """The entry matching ``current_run_id`` is left out of the baseline."""
        history = _MockHistory([
            _Entry("current", "tess", "c", 0.20),  # current run already logged
            *[_Entry(f"r{i}", "tess", "c", 0.04) for i in range(5)],
        ])
        result = compute_engine_baseline(
            history, "tess", "c", current_cer=0.05,
            current_run_id="current",
        )
        assert result is not None
        # The 0.20 entry must not pull the historical mean
        assert result["n_runs"] == 5
        assert result["cer_historical_mean"] == pytest.approx(0.04)

    def test_filter_by_engine_and_corpus(self) -> None:
        """Only runs of the requested engine on the requested corpus count."""
        history = _MockHistory([
            *[_Entry(f"r{i}", "tess", "corpus_A", 0.04) for i in range(5)],
            # Same engine on another corpus - must not count
            *[_Entry(f"o{i}", "tess", "corpus_B", 0.20) for i in range(5)],
            # Other engine, same corpus - must not count
            *[_Entry(f"p{i}", "pero", "corpus_A", 0.99) for i in range(5)],
        ])
        result = compute_engine_baseline(
            history, "tess", "corpus_A", current_cer=0.05,
        )
        assert result is not None
        assert result["n_runs"] == 5
        assert result["cer_historical_mean"] == pytest.approx(0.04)

    def test_cer_none_ignored(self) -> None:
        """Historical entries with a ``None`` or negative CER are skipped."""
        history = _MockHistory([
            _Entry("r1", "tess", "c", None),
            _Entry("r2", "tess", "c", -0.5),  # negative -> ignored
            *[_Entry(f"r{i}", "tess", "c", 0.04) for i in range(3, 8)],
        ])
        result = compute_engine_baseline(
            history, "tess", "c", current_cer=0.05,
        )
        assert result is not None
        assert result["n_runs"] == 5

    def test_baseline_zero_returns_none_relative(self) -> None:
        """A zero historical mean leaves ``relative_delta`` undefined."""
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "c", 0.0) for i in range(5)
        ])
        result = compute_engine_baseline(
            history, "tess", "c", current_cer=0.05,
        )
        assert result is not None
        assert result["relative_delta"] is None
        # NOTE(review): the module docstring says "off si CER > 0" for a zero
        # baseline, yet this pins False (not calculable) - confirm the intent.
        assert result["off_baseline"] is False  # not calculable

    def test_invalid_current_cer(self) -> None:
        """A ``None`` or negative ``current_cer`` yields ``None``."""
        history = _MockHistory([
            _Entry(f"r{i}", "tess", "c", 0.04) for i in range(5)
        ])
        assert compute_engine_baseline(
            history, "tess", "c", current_cer=None,  # type: ignore
        ) is None
        assert compute_engine_baseline(
            history, "tess", "c", current_cer=-0.1,
        ) is None
# ──────────────────────────────────────────────────────────────────────────
# 2. compute_corpus_difficulty_percentile
# ──────────────────────────────────────────────────────────────────────────
class TestCorpusDifficultyPercentile:
    """Pin the behaviour of ``compute_corpus_difficulty_percentile``.

    Historical difficulties are supplied through each entry's
    ``metadata["difficulty"]`` value.
    """

    def test_percentile_calculation(self) -> None:
        """The percentile is the share of historical values at or below the input."""
        history = _MockHistory([
            _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": d})
            for i, d in enumerate([0.1, 0.2, 0.3, 0.4, 0.5])
        ])
        result = compute_corpus_difficulty_percentile(history, 0.45)
        assert result is not None
        # 4 of 5 values <= 0.45 -> P80
        assert result["percentile"] == pytest.approx(80.0)
        assert result["n_runs"] == 5

    def test_harder_than_usual(self) -> None:
        """A difficulty near the top of the history is flagged as harder."""
        history = _MockHistory([
            _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": 0.1 * i})
            for i in range(1, 11)  # 10 values: 0.1 .. 1.0
        ])
        # 0.95 -> percentile 90 -> harder
        result = compute_corpus_difficulty_percentile(history, 0.95)
        assert result is not None
        assert result["harder_than_usual"] is True
        assert result["easier_than_usual"] is False

    def test_easier_than_usual(self) -> None:
        """A difficulty below every historical value is flagged as easier."""
        history = _MockHistory([
            _Entry(f"r{i}", "x", "c", 0.04, metadata={"difficulty": 0.1 * i})
            for i in range(1, 11)
        ])
        result = compute_corpus_difficulty_percentile(history, 0.05)
        assert result is not None
        assert result["easier_than_usual"] is True
        assert result["harder_than_usual"] is False

    def test_min_runs_filter(self) -> None:
        """A single historical run is not enough: the result is ``None``."""
        history = _MockHistory([
            _Entry("r1", "x", "c", 0.04, metadata={"difficulty": 0.5}),
        ])
        assert compute_corpus_difficulty_percentile(history, 0.5) is None
# ──────────────────────────────────────────────────────────────────────────
# 3. Détecteur narratif
# ──────────────────────────────────────────────────────────────────────────
class TestDetector:
    """Check when ``detect_engine_off_baseline`` emits facts and what they carry."""

    def test_silent_without_baseline_data(self) -> None:
        """A missing or empty ``baseline_comparisons`` list yields no facts."""
        assert detect_engine_off_baseline({}) == []
        assert detect_engine_off_baseline(
            {"baseline_comparisons": []},
        ) == []

    def test_silent_when_off_baseline_false(self) -> None:
        """A comparison with ``off_baseline`` False yields no fact."""
        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "t", "cer_current": 0.04,
                    "cer_historical_mean": 0.04, "n_runs": 10,
                    "relative_delta": 0.0, "off_baseline": False,
                },
            ],
        })
        assert facts == []

    def test_silent_when_relative_delta_none(self) -> None:
        """No fact is emitted when ``relative_delta`` is ``None``."""
        # Baseline = 0 -> relative delta is None -> the detector abstains
        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "t", "cer_current": 0.05,
                    "cer_historical_mean": 0.0, "n_runs": 10,
                    "relative_delta": None, "off_baseline": True,
                },
            ],
        })
        assert facts == []

    def test_emits_fact_for_off_baseline(self) -> None:
        """One off-baseline engine yields one MEDIUM fact with percent payload."""
        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "tess", "cer_current": 0.052,
                    "cer_historical_mean": 0.041, "n_runs": 12,
                    "relative_delta": 0.268, "off_baseline": True,
                },
            ],
        })
        assert len(facts) == 1
        f = facts[0]
        assert f.type == FactType.ENGINE_OFF_BASELINE
        assert f.importance == FactImportance.MEDIUM
        assert f.payload["engine"] == "tess"
        # Exact equality is intentional: the payload values are the numbers
        # the traceability test expects to find verbatim in the rendered text.
        assert f.payload["cer_current_pct"] == 5.2
        assert f.payload["cer_historical_mean_pct"] == 4.1
        assert f.payload["n_runs"] == 12
        assert f.payload["relative_delta_pct"] == 26.8
        assert f.payload["direction"] == "higher"
        assert f.engines_involved == ("tess",)

    def test_high_importance_above_50pct(self) -> None:
        """A +100 % relative delta is reported with HIGH importance."""
        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "x", "cer_current": 0.08,
                    "cer_historical_mean": 0.04, "n_runs": 10,
                    "relative_delta": 1.0, "off_baseline": True,
                },
            ],
        })
        # Fail with an assertion (not an IndexError) if nothing was emitted.
        assert facts
        assert facts[0].importance == FactImportance.HIGH

    def test_multiple_engines(self) -> None:
        """Each off-baseline engine gets a fact; a negative delta is "lower"."""
        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "tess", "cer_current": 0.05,
                    "cer_historical_mean": 0.04, "n_runs": 10,
                    "relative_delta": 0.25, "off_baseline": True,
                },
                {
                    "engine_name": "pero", "cer_current": 0.03,
                    "cer_historical_mean": 0.04, "n_runs": 10,
                    "relative_delta": -0.25, "off_baseline": True,
                },
            ],
        })
        assert len(facts) == 2
        assert facts[1].payload["direction"] == "lower"
# ──────────────────────────────────────────────────────────────────────────
# 4. Traçabilité anti-hallucination
# ──────────────────────────────────────────────────────────────────────────
class TestTraceability:
    """Anti-hallucination check: rendered numbers must come from the payload."""

    @pytest.mark.parametrize("lang", ["fr", "en"])
    def test_each_number_in_rendered_text_is_in_payload(
        self, lang: str,
    ) -> None:
        """Every number found in the rendered sentence is a known payload value.

        Args:
            lang: Language code passed to ``render_fact`` ("fr" or "en").
        """
        import re

        facts = detect_engine_off_baseline({
            "baseline_comparisons": [
                {
                    "engine_name": "tess", "cer_current": 0.052,
                    "cer_historical_mean": 0.041, "n_runs": 12,
                    "relative_delta": 0.268, "off_baseline": True,
                },
            ],
        })
        # Fail with an assertion (not an IndexError) if nothing was emitted.
        assert facts
        text = render_fact(facts[0], lang=lang)
        assert text  # non-empty
        # Every number in the text must come from the payload (or from a
        # template constant - none here).
        payload_nums = {
            "5.2", "4.1", "12", "26.8",
        }
        # The fractional part is optional but, when present, needs digits after
        # the dot: a number ending a sentence ("... 12.") is captured as "12",
        # not "12.", which would otherwise be reported as untraceable.
        rendered_nums = set(re.findall(r"\d+(?:\.\d+)?", text))
        for num in rendered_nums:
            assert num in payload_nums, (
                f"nombre rendu {num!r} non traçable au payload"
            )