Spaces:
Sleeping
Sleeping
Claude
feat(sprint-S8): cohérence finale — renames test dirs, /metrics endpoint, SBOM workflow
43478ec unverified | """Tests Sprint 92 — A.II.9 : métriques longitudinales. | |
| Couvre : | |
| 1. ``compute_linear_trend`` : pente, R², garde-fous. | |
| 2. ``detect_change_point`` : index correct, garde-fous. | |
| 3. ``compute_engine_longitudinal`` : intégration entries. | |
| 4. ``compute_corpus_longitudinal`` : agrégation multi-moteurs. | |
| 5. Détecteur ``regression_in_history`` : | |
| - silence sans data | |
| - silence si tendance plate | |
| - HIGH si Δ ≥ 5 pts | |
| - réagit à change-point seul | |
| - traçabilité anti-hallucination FR + EN. | |
| 6. Vue HTML : adaptive, anti-injection, FR + EN. | |
| 7. Complétude i18n. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
| import pytest | |
| from picarones.evaluation.metrics.longitudinal import ( | |
| compute_corpus_longitudinal, | |
| compute_engine_longitudinal, | |
| compute_linear_trend, | |
| detect_change_point, | |
| ) | |
| from picarones.reports.narrative import build_synthesis | |
| from picarones.reports.narrative.detectors import detect_regression_in_history | |
| from picarones.domain.facts import FactImportance, FactType | |
| from picarones.reports.html.renderers.longitudinal import build_longitudinal_html | |
def _load_labels(lang: str) -> dict:
    """Read and parse the i18n label file for ``lang``.

    The file is ``picarones/reports/i18n/<lang>.json``, resolved against
    ``Path(__file__).parent.parent.parent``, and decoded as UTF-8.
    """
    base_dir = Path(__file__).parent.parent.parent
    labels_file = base_dir / "picarones" / "reports" / "i18n" / f"{lang}.json"
    raw_json = labels_file.read_text(encoding="utf-8")
    return json.loads(raw_json)
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. compute_linear_trend | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestLinearTrend:
    """Checks for ``compute_linear_trend``: fitted values and guard rails."""

    def test_perfect_trend(self) -> None:
        """A steadily rising three-point series fits almost perfectly."""
        series = [
            ("2025-01-01", 0.04), ("2025-02-01", 0.05),
            ("2025-03-01", 0.06),
        ]
        t = compute_linear_trend(series)
        # The function can return None (see the guard-rail tests below);
        # fail with a clear assertion instead of an AttributeError.
        assert t is not None
        assert t.r_squared > 0.99
        assert t.slope > 0  # CER goes up, so the slope is positive
        assert t.n_runs == 3

    def test_flat_series(self) -> None:
        """A constant series has a (near-)zero slope and a bounded R²."""
        series = [
            ("2025-01-01", 0.05), ("2025-02-01", 0.05),
            ("2025-03-01", 0.05),
        ]
        t = compute_linear_trend(series)
        assert t is not None
        assert t.slope == pytest.approx(0.0, abs=1e-9)
        # R² is mathematically undefined when y has zero variance, so any
        # value inside [0, 1] is accepted. The former extra check
        # ``r_squared in (0.0, 1.0)`` was already implied by this range.
        assert 0.0 <= t.r_squared <= 1.0

    def test_lt_two_returns_none(self) -> None:
        """Fewer than two points cannot produce a trend."""
        assert compute_linear_trend([("2025-01-01", 0.05)]) is None
        assert compute_linear_trend([]) is None

    def test_invalid_timestamps_skipped(self) -> None:
        """Unparseable timestamps are dropped, leaving fewer than two points."""
        # Every timestamp is invalid, so fewer than 2 valid points remain.
        assert compute_linear_trend([
            ("invalid", 0.05), ("garbage", 0.06),
        ]) is None

    def test_same_timestamp_returns_none(self) -> None:
        """Identical timestamps give no trend."""
        # All t values are equal, i.e. zero variance on the time axis.
        assert compute_linear_trend([
            ("2025-01-01", 0.05), ("2025-01-01", 0.06),
            ("2025-01-01", 0.07),
        ]) is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. detect_change_point | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestChangePoint:
    """Checks for ``detect_change_point``: break location and guard rails."""

    def test_clean_break(self) -> None:
        """A step from 0.04 to 0.07 is located at the fourth point."""
        timestamps = [
            "2025-01-01", "2025-01-15", "2025-02-01",
            "2025-02-15", "2025-03-01", "2025-03-15",
        ]
        # Three points at 0.04 followed by three points at 0.07.
        cer_values = [0.04, 0.04, 0.04, 0.07, 0.07, 0.07]
        series = list(zip(timestamps, cer_values))

        change = detect_change_point(series, min_segment_size=3)

        assert change is not None
        assert change.index == 3
        assert change.delta == pytest.approx(0.03)

    def test_too_few_points(self) -> None:
        """Two points with ``min_segment_size=3`` yield no change point."""
        short_series = [("2025-01-01", 0.04), ("2025-02-01", 0.05)]
        result = detect_change_point(short_series, min_segment_size=3)
        assert result is None

    def test_uniform_series_returns_change_with_delta_zero(self) -> None:
        """A constant series still returns a change point, with delta ~ 0."""
        series = [(f"2025-0{month}-01", 0.05) for month in range(1, 7)]

        change = detect_change_point(series, min_segment_size=3)

        assert change is not None
        # Both segments have the same mean, so the delta is zero.
        assert abs(change.delta) < 1e-9
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. compute_engine_longitudinal | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestEngineLongitudinal:
    """Checks for ``compute_engine_longitudinal`` on history entries."""

    def _entries(self) -> list[dict]:
        """Return six monthly ``tess``/``bnf`` entries with a rising CER."""
        return [
            {"engine_name": "tess", "corpus_name": "bnf",
             "timestamp": ts, "cer_mean": cer}
            for ts, cer in [
                ("2025-01-01", 0.04), ("2025-02-01", 0.045),
                ("2025-03-01", 0.05), ("2025-04-01", 0.06),
                ("2025-05-01", 0.07), ("2025-06-01", 0.08),
            ]
        ]

    def test_basic(self) -> None:
        """All six runs are used; CER goes from 0.04 to 0.08 (+4 points)."""
        r = compute_engine_longitudinal(
            self._entries(), "tess", corpus_name="bnf",
        )
        assert r is not None
        assert r["n_runs"] == 6
        assert r["trend"]["slope"] > 0
        assert r["absolute_delta_pct"] == pytest.approx(4.0, abs=0.01)

    def test_filters_corpus(self) -> None:
        """Entries from another corpus are excluded from the run count."""
        entries = self._entries() + [
            {"engine_name": "tess", "corpus_name": "other",
             "timestamp": "2025-07-01", "cer_mean": 0.99},
        ]
        r = compute_engine_longitudinal(
            entries, "tess", corpus_name="bnf",
        )
        # The function can return None (see test_min_runs_threshold);
        # guard so a regression fails as an assertion, not a TypeError.
        assert r is not None
        # The "other" entry must not leak into the result.
        assert r["n_runs"] == 6

    def test_min_runs_threshold(self) -> None:
        """A ``min_runs_for_trend`` above the run count yields ``None``."""
        # min_runs_for_trend=10 > n_runs=6
        r = compute_engine_longitudinal(
            self._entries(), "tess", corpus_name="bnf",
            min_runs_for_trend=10,
        )
        assert r is None

    def test_change_point_threshold(self) -> None:
        """A very large threshold suppresses the change point."""
        r = compute_engine_longitudinal(
            self._entries(), "tess",
            change_point_threshold=1.0,
        )
        # Guard before subscripting, for the same reason as above.
        assert r is not None
        assert r["change_point"] is None
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. compute_corpus_longitudinal | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestCorpusLongitudinal:
    """Checks for ``compute_corpus_longitudinal`` across several engines."""

    def test_multiple_engines(self) -> None:
        """Both engines present in the entries appear in the output."""
        cer_by_month = [0.04, 0.045, 0.05, 0.06]
        entries: list[dict] = [
            {
                "engine_name": engine, "corpus_name": "bnf",
                "timestamp": f"2025-0{month}-01",
                "cer_mean": cer,
            }
            for engine in ("tess", "pero")
            for month, cer in enumerate(cer_by_month, start=1)
        ]

        results = compute_corpus_longitudinal(entries, corpus_name="bnf")

        reported_engines = [item["engine_name"] for item in results]
        assert "tess" in reported_engines
        assert "pero" in reported_engines

    def test_empty(self) -> None:
        """No entries in, empty list out."""
        result = compute_corpus_longitudinal([])
        assert result == []
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. Détecteur regression_in_history | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestDetector:
    """Checks for ``detect_regression_in_history`` on hand-built payloads.

    Each test feeds a ``{"longitudinal_trends": [...]}`` dictionary to the
    detector and inspects the list of facts it returns.
    """

    def test_silent_without_data(self) -> None:
        """No key, or an empty trend list, produces no fact."""
        assert detect_regression_in_history({}) == []
        assert detect_regression_in_history(
            {"longitudinal_trends": []},
        ) == []

    def test_silent_when_flat(self) -> None:
        """A near-zero slope with no change point produces no fact."""
        data = {"longitudinal_trends": [{
            "engine_name": "tess", "n_runs": 5,
            "trend": {"slope": 1e-7, "r_squared": 0.0,
                      "intercept": 0.05, "n_runs": 5},
            "change_point": None,
            "first_cer": 0.05, "last_cer": 0.05,
            "absolute_delta": 0.0, "absolute_delta_pct": 0.0,
        }]}
        assert detect_regression_in_history(data) == []

    def test_emits_when_slope_high(self) -> None:
        """A steep slope alone emits one MEDIUM ``trend`` fact."""
        # Slope above 1 CER point per 365 days.
        data = {"longitudinal_trends": [{
            "engine_name": "tess", "n_runs": 5,
            "trend": {"slope": 0.0005, "r_squared": 0.9,
                      "intercept": 0.04, "n_runs": 5},
            "change_point": None,
            "first_cer": 0.04, "last_cer": 0.06,
            "absolute_delta": 0.02, "absolute_delta_pct": 2.0,
        }]}
        facts = detect_regression_in_history(data)
        assert len(facts) == 1
        assert facts[0].type == FactType.REGRESSION_IN_HISTORY
        assert facts[0].importance == FactImportance.MEDIUM
        assert facts[0].payload["pattern"] == "trend"

    def test_emits_high_when_delta_large(self) -> None:
        """An absolute delta of 6 points is reported with HIGH importance."""
        # |delta| >= 5 points -> HIGH
        data = {"longitudinal_trends": [{
            "engine_name": "tess", "n_runs": 8,
            "trend": {"slope": 0.001, "r_squared": 0.95,
                      "intercept": 0.04, "n_runs": 8},
            "change_point": None,
            "first_cer": 0.04, "last_cer": 0.10,
            "absolute_delta": 0.06, "absolute_delta_pct": 6.0,
        }]}
        facts = detect_regression_in_history(data)
        # Guard before indexing: a silent detector should fail this test
        # with an assertion, not an IndexError.
        assert facts, "expected at least one regression fact"
        assert facts[0].importance == FactImportance.HIGH

    def test_emits_on_change_point_only(self) -> None:
        """A change point with a negligible slope still emits one fact."""
        # Slope is essentially zero but there is an abrupt break.
        data = {"longitudinal_trends": [{
            "engine_name": "tess", "n_runs": 8,
            "trend": {"slope": 1e-8, "r_squared": 0.0,
                      "intercept": 0.04, "n_runs": 8},
            "change_point": {
                "index": 4, "timestamp": "2025-03-01",
                "mean_before": 0.04, "mean_after": 0.07,
                "delta": 0.03, "n_before": 4, "n_after": 4,
            },
            "first_cer": 0.04, "last_cer": 0.07,
            "absolute_delta": 0.03, "absolute_delta_pct": 3.0,
        }]}
        facts = detect_regression_in_history(data)
        assert len(facts) == 1
        assert facts[0].payload["pattern"] == "change_point"
        assert "change_point_timestamp" in facts[0].payload

    def test_silent_when_lt_three_runs(self) -> None:
        """With only two runs the detector stays silent."""
        data = {"longitudinal_trends": [{
            "engine_name": "tess", "n_runs": 2,
            "trend": {"slope": 0.001, "r_squared": 0.9,
                      "intercept": 0.04, "n_runs": 2},
            "change_point": None,
            "absolute_delta": 0.05,
        }]}
        assert detect_regression_in_history(data) == []
| # ────────────────────────────────────────────────────────────────────────── | |
| # 6. Anti-hallucination synthesis | |
| # ────────────────────────────────────────────────────────────────────────── | |
def _payload_numbers(payload: dict) -> set[str]:
    """Collect the string forms of the numeric values in ``payload``.

    Every top-level value that is an ``int`` or ``float`` contributes
    ``str(value)``. A float holding a whole number also contributes its
    integer spelling (``3.0`` gives both ``"3.0"`` and ``"3"``). Nested
    containers are not inspected.
    """
    numeric_values = [
        value for value in payload.values()
        if isinstance(value, (int, float))
    ]
    spellings = {str(value) for value in numeric_values}
    spellings.update(
        str(int(value))
        for value in numeric_values
        if isinstance(value, float) and value.is_integer()
    )
    return spellings
def _numbers_in(text: str) -> set[str]:
    """Return the distinct number tokens found in ``text``.

    A token is a run of digits, optionally followed by a dot and more
    digits; only the dot is treated as a decimal separator.
    """
    return {found.group() for found in re.finditer(r"\d+(?:\.\d+)?", text)}
class TestAntiHallucination:
    """Every number in the synthesis sentence must come from the payload."""

    def _build(self, lang: str) -> tuple[list[str], dict]:
        """Return the synthesis sentences and the first fact's payload."""
        trend_entry = {
            "engine_name": "tess", "n_runs": 8,
            "trend": {"slope": 0.0002, "r_squared": 0.91,
                      "intercept": 0.04, "n_runs": 8},
            "change_point": None,
            "first_cer": 0.04, "last_cer": 0.07,
            "absolute_delta": 0.03,
            "absolute_delta_pct": 3.0,
            "first_cer_pct": 4.0, "last_cer_pct": 7.0,
        }
        data = {
            "ranking": [{"engine": "tess", "mean_cer": 0.07}],
            "engines": [{"name": "tess", "mean_cer": 0.07}],
            "meta": {"document_count": 5},
            "longitudinal_trends": [trend_entry],
        }
        synthesis = build_synthesis(data, lang=lang, max_facts=10)
        facts = detect_regression_in_history(data)
        return synthesis["sentences"], facts[0].payload

    def _find(self, sentences: list[str], lang: str) -> str:
        """Return the first sentence containing the language's marker word."""
        if lang == "fr":
            marker = "modèles"
        else:
            marker = "models"
        hit = next((s for s in sentences if marker in s), None)
        if hit is None:
            raise AssertionError(f"phrase introuvable : {sentences}")
        return hit

    def _assert_traceable(self, lang: str) -> None:
        """Shared body of the per-language traceability tests."""
        sentences, payload = self._build(lang)
        sentence = self._find(sentences, lang)
        rendered = _numbers_in(sentence)
        allowed = _payload_numbers(payload)
        assert rendered.issubset(allowed), (
            f"non traΓ§able : {rendered - allowed}"
        )

    def test_fr_traceable(self) -> None:
        """French sentence: all rendered numbers are in the payload."""
        self._assert_traceable("fr")

    def test_en_traceable(self) -> None:
        """English sentence: all rendered numbers are in the payload."""
        self._assert_traceable("en")
| # ────────────────────────────────────────────────────────────────────────── | |
| # 7. Vue HTML | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestRender:
    """Checks for ``build_longitudinal_html``: content, escaping, locale."""

    def test_empty_returns_empty(self) -> None:
        """``None`` or an empty list renders as the empty string."""
        assert build_longitudinal_html(None) == ""
        assert build_longitudinal_html([]) == ""

    def test_renders_table(self) -> None:
        """A single trend renders a table with its key values."""
        trends = [{
            "engine_name": "tess", "n_runs": 8,
            "trend": {"slope": 0.0001, "r_squared": 0.85},
            "change_point": {
                "timestamp": "2025-03-01", "delta": 0.025,
            },
            "first_cer": 0.04, "last_cer": 0.07,
            "absolute_delta": 0.03, "absolute_delta_pct": 3.0,
        }]
        html = build_longitudinal_html(trends, _load_labels("fr"))
        assert "<table" in html
        assert "tess" in html
        # The delta is shown as +3.00.
        assert "+3.00" in html
        # The change-point timestamp is shown.
        assert "2025-03-01" in html

    def test_anti_injection(self) -> None:
        """Markup in the engine name is escaped, not emitted raw."""
        trends = [{
            "engine_name": "<script>alert(1)</script>",
            "n_runs": 5,
            "trend": {"slope": 0.001, "r_squared": 0.9},
            "change_point": None,
            "first_cer": 0.04, "last_cer": 0.05,
            "absolute_delta": 0.01, "absolute_delta_pct": 1.0,
        }]
        html = build_longitudinal_html(trends, _load_labels("fr"))
        assert "<script>alert" not in html
        # The name must appear in its entity-escaped form. Asserting a raw
        # "<script>" here would require the very tag this test forbids.
        assert "&lt;script&gt;" in html

    def test_renders_in_english(self) -> None:
        """With the English labels the English section title is rendered."""
        trends = [{
            "engine_name": "tess", "n_runs": 5,
            "trend": {"slope": 0.001, "r_squared": 0.9},
            "change_point": None,
            "first_cer": 0.04, "last_cer": 0.05,
            "absolute_delta": 0.01, "absolute_delta_pct": 1.0,
        }]
        html = build_longitudinal_html(trends, _load_labels("en"))
        assert "Evolution over time" in html
| # ────────────────────────────────────────────────────────────────────────── | |
| # 8. Complétude i18n | |
| # ────────────────────────────────────────────────────────────────────────── | |
# Label keys that the i18n completeness tests look for in each locale file.
_KEYS = {
    "longitudinal_title",
    "longitudinal_note",
    "longitudinal_engine",
    "longitudinal_n_runs",
    "longitudinal_first",
    "longitudinal_last",
    "longitudinal_delta",
    "longitudinal_slope",
    "longitudinal_r2",
    "longitudinal_change",
}
class TestI18n:
    """Each locale file must define every key listed in ``_KEYS``."""

    def test_fr(self) -> None:
        """The French labels contain all required keys."""
        missing = _KEYS - _load_labels("fr").keys()
        assert not missing

    def test_en(self) -> None:
        """The English labels contain all required keys."""
        missing = _KEYS - _load_labels("en").keys()
        assert not missing