Spaces:
Running
Running
Claude
refactor(core): faire de core/ un cercle 1 strict, déplacer cercle 2 vers measurements/
"""Sprint 28 tests — UX: save/load config, compare runs, synthesis preview.

Sprint 28 reduces friction for a researcher iterating over 8 prompts:

1. ``/api/config/save`` + ``/api/config/load`` — serialisation/import of a
   benchmark configuration as JSON.
2. ``picarones/report/comparison.py`` — comparison of two JSON runs, with
   per-engine deltas and regression detection.
3. ``picarones compare`` (CLI) — command-line equivalent.
4. ``/api/benchmark/{job_id}/synthesis_preview`` — narrative synthesis of a
   finished job without reopening the HTML.
5. ``/api/history/regressions`` — exposes the Sprint 8 infrastructure.
"""
| from __future__ import annotations | |
| import json | |
| import pytest | |
| from click.testing import CliRunner | |
| from fastapi.testclient import TestClient | |
| # --------------------------------------------------------------------------- | |
| # 1. Module comparison — compare_benchmarks | |
| # --------------------------------------------------------------------------- | |
| def _benchmark_json(engines_to_cer: dict[str, float], **extra) -> dict: | |
| """Fabrique un dict ``BenchmarkResult.as_dict()``-like.""" | |
| ranking = [ | |
| { | |
| "engine": name, | |
| "mean_cer": cer, | |
| "mean_wer": cer * 1.5 if cer is not None else None, | |
| "documents": 10, | |
| "failed": 0, | |
| } | |
| for name, cer in engines_to_cer.items() | |
| ] | |
| return { | |
| "ranking": ranking, | |
| "run_date": extra.get("run_date", "2026-04-01T00:00:00+00:00"), | |
| "corpus": {"name": extra.get("corpus", "test_corpus"), "source": "fixture"}, | |
| } | |
class TestCompareBenchmarks:
    """Behaviour of ``compare_benchmarks`` and ``detect_regressions``."""

    def test_identical_runs_no_regression(self):
        """Identical runs give a zero delta and no flag on every engine."""
        from picarones.report.comparison import compare_benchmarks

        scores = {"tesseract": 0.05, "pero": 0.07}
        diff = compare_benchmarks(
            _benchmark_json(scores),
            _benchmark_json(scores),
            threshold=0.005,
        )
        for delta in diff.deltas:
            assert delta.delta_cer == 0.0
        for delta in diff.deltas:
            assert not (delta.is_regression or delta.is_improvement)

    def test_regression_detected_above_threshold(self):
        """A +1 pp CER change with a 0.5 pp threshold is one regression."""
        from picarones.report.comparison import compare_benchmarks, detect_regressions

        before = _benchmark_json({"tesseract": 0.05})
        after = _benchmark_json({"tesseract": 0.06})  # +1 pp
        regressions = detect_regressions(
            compare_benchmarks(before, after, threshold=0.005)
        )
        assert len(regressions) == 1
        only = regressions[0]
        assert only.engine == "tesseract"
        assert only.delta_cer == pytest.approx(0.01, abs=1e-9)

    def test_improvement_detected_below_threshold(self):
        """A -1 pp CER change is flagged as an improvement, not a regression."""
        from picarones.report.comparison import compare_benchmarks

        before = _benchmark_json({"tesseract": 0.05})
        after = _benchmark_json({"tesseract": 0.04})  # -1 pp
        first = compare_benchmarks(before, after, threshold=0.005).deltas[0]
        assert first.is_improvement
        assert not first.is_regression

    def test_below_threshold_is_stable(self):
        """A +0.2 pp change stays under the 0.5 pp threshold."""
        from picarones.report.comparison import compare_benchmarks

        before = _benchmark_json({"tesseract": 0.05})
        after = _benchmark_json({"tesseract": 0.052})  # +0.2 pp
        first = compare_benchmarks(before, after, threshold=0.005).deltas[0]
        assert not first.is_regression

    def test_engines_only_in_one_side(self):
        """Engines present in a single run are listed apart from the deltas."""
        from picarones.report.comparison import compare_benchmarks

        diff = compare_benchmarks(
            _benchmark_json({"tesseract": 0.05, "pero": 0.07}),
            _benchmark_json({"tesseract": 0.05, "kraken": 0.06}),
            threshold=0.005,
        )
        assert diff.only_in_a == ["pero"]
        assert diff.only_in_b == ["kraken"]
        compared = {delta.engine for delta in diff.deltas}
        assert compared == {"tesseract"}

    def test_none_cer_does_not_raise(self):
        """A missing CER on one side gives a ``None`` delta and no regression."""
        from picarones.report.comparison import compare_benchmarks

        diff = compare_benchmarks(
            _benchmark_json({"tesseract": None}),
            _benchmark_json({"tesseract": 0.05}),
        )
        first = diff.deltas[0]
        assert first.delta_cer is None
        assert not first.is_regression

    def test_regressions_sorted_by_severity(self):
        """The largest regression (engine ``b``) is listed before ``a``."""
        from picarones.report.comparison import compare_benchmarks

        before = _benchmark_json({"a": 0.05, "b": 0.05, "c": 0.05})
        after = _benchmark_json({"a": 0.07, "b": 0.10, "c": 0.06})  # b is worst
        diff = compare_benchmarks(before, after, threshold=0.005)
        order = [delta.engine for delta in diff.deltas]
        position_b = order.index("b")
        position_a = order.index("a")
        assert position_b < position_a

    def test_loads_from_file_path(self, tmp_path):
        """``compare_benchmarks`` accepts paths to JSON files as inputs."""
        from picarones.report.comparison import compare_benchmarks

        paths = []
        for stem, cer in (("a", 0.05), ("b", 0.06)):
            path = tmp_path / f"{stem}.json"
            path.write_text(json.dumps(_benchmark_json({"tesseract": cer})))
            paths.append(path)
        diff = compare_benchmarks(paths[0], paths[1], threshold=0.005)
        assert len(diff.deltas) == 1
class TestRenderComparisonHTML:
    """Rendering of a comparison result to an HTML file."""

    def test_html_is_self_contained_and_named(self, tmp_path):
        """The rendered page shows both labels, the signed delta and the verdict."""
        from picarones.report.comparison import compare_benchmarks, render_comparison_html

        diff = compare_benchmarks(
            _benchmark_json({"tesseract": 0.05}),
            _benchmark_json({"tesseract": 0.07}),
            label_a="V1",
            label_b="V2",
        )
        target = tmp_path / "diff.html"
        render_comparison_html(diff, target)
        assert target.exists()

        page = target.read_text(encoding="utf-8")
        # Both run labels must be visible.
        for label in ("V1", "V2"):
            assert label in page
        # The CER delta is displayed with an explicit sign.
        assert "+0.020" in page
        assert "régression" in page.lower()
| # --------------------------------------------------------------------------- | |
| # 2. CLI picarones compare | |
| # --------------------------------------------------------------------------- | |
class TestCompareCLI:
    """End-to-end checks of the ``picarones compare`` command."""

    @staticmethod
    def _write_runs(tmp_path, cer_a, cer_b):
        """Write two single-engine benchmark files and return their paths."""
        written = []
        for stem, cer in (("a", cer_a), ("b", cer_b)):
            path = tmp_path / f"{stem}.json"
            path.write_text(json.dumps(_benchmark_json({"tesseract": cer})))
            written.append(path)
        return written[0], written[1]

    def test_basic_compare_writes_html(self, tmp_path):
        """Comparing identical runs exits with 0 and writes the HTML report."""
        from picarones.cli import cli

        run_a, run_b = self._write_runs(tmp_path, 0.05, 0.05)
        report = tmp_path / "out.html"
        args = ["compare", str(run_a), str(run_b), "-o", str(report)]
        result = CliRunner().invoke(cli, args)
        assert result.exit_code == 0, result.output
        assert report.exists()

    def test_regression_exit_code_is_2(self, tmp_path):
        """A detected regression is signalled through exit code 2 (handy in CI)."""
        from picarones.cli import cli

        run_a, run_b = self._write_runs(tmp_path, 0.05, 0.10)
        args = ["compare", str(run_a), str(run_b), "-o", str(tmp_path / "out.html")]
        result = CliRunner().invoke(cli, args)
        assert result.exit_code == 2, result.output
        lowered = result.output.lower()
        assert any(word in lowered for word in ("régression", "tesseract"))

    def test_json_mode_outputs_dict(self, tmp_path):
        """``--json`` prints a JSON document exposing deltas and regressions."""
        from picarones.cli import cli

        run_a, run_b = self._write_runs(tmp_path, 0.05, 0.05)
        result = CliRunner().invoke(cli, ["compare", str(run_a), str(run_b), "--json"])
        assert result.exit_code == 0, result.output
        payload = json.loads(result.output)
        for key in ("deltas", "regressions"):
            assert key in payload
| # --------------------------------------------------------------------------- | |
| # 3. /api/config/save + /api/config/load | |
| # --------------------------------------------------------------------------- | |
@pytest.fixture
def client():
    """Provide a ``TestClient`` bound to the Picarones web application.

    Registered as a pytest fixture because the test methods in this module
    request it through a ``client`` parameter. The app is imported inside the
    fixture rather than at module import time.
    """
    from picarones.web.app import app

    return TestClient(app)
class TestConfigSaveLoad:
    """Tests for ``/api/config/save`` and ``/api/config/load``."""

    def test_save_returns_attachment(self, client):
        """Saving returns a downloadable JSON document named after the label."""
        payload = {
            "engines": ["tesseract"],
            "normalization_profile": "medieval_french",
            "label": "test-1",
        }
        response = client.post("/api/config/save", json=payload)
        assert response.status_code == 200

        disposition = response.headers.get("content-disposition", "")
        assert "attachment" in disposition
        assert "test-1" in disposition

        saved = response.json()
        assert saved["schema_version"] == 1
        assert saved["engines"] == ["tesseract"]
        assert "saved_at" in saved

    def test_save_filters_unknown_fields(self, client):
        """Fields outside the known config schema are not echoed back."""
        payload = {
            "engines": ["tesseract"],
            "secret_token": "ne-doit-pas-apparaitre",
            "OPENAI_API_KEY": "sk-...",
        }
        saved = client.post("/api/config/save", json=payload).json()
        for leaked in ("secret_token", "OPENAI_API_KEY"):
            assert leaked not in saved

    def test_save_sanitizes_label_for_filename(self, client):
        """A path-traversal label never reaches the download filename as-is."""
        response = client.post("/api/config/save", json={"label": "../../etc/passwd"})
        disposition = response.headers.get("content-disposition", "")
        assert ".." not in disposition
        # Alphanumeric parts of the label are kept.
        assert any(part in disposition for part in ("etc", "passwd"))

    def test_load_round_trip(self, client):
        """Every field of a saved config survives a save → load round trip."""
        original = {
            "engines": ["tesseract", "pero"],
            "normalization_profile": "medieval_french",
            "char_exclude": "',-",
            "lang": "fra",
        }
        # Step 1: save.
        saved = client.post("/api/config/save", json=original).json()
        # Step 2: load what was just saved.
        load_response = client.post("/api/config/load", json=saved)
        assert load_response.status_code == 200
        loaded = load_response.json()["config"]
        for field in original:
            assert loaded[field] == original[field]

    def test_load_rejects_bad_schema_version(self, client):
        """An unsupported schema version is refused with a 400 naming the schema."""
        response = client.post("/api/config/load", json={"schema_version": 99})
        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "schema" in detail.lower()

    def test_load_rejects_missing_schema(self, client):
        """A payload without ``schema_version`` is refused with a 400."""
        response = client.post("/api/config/load", json={"engines": ["tesseract"]})
        assert response.status_code == 400
| # --------------------------------------------------------------------------- | |
| # 4. /api/benchmark/{job_id}/synthesis_preview | |
| # --------------------------------------------------------------------------- | |
class TestSynthesisPreviewEndpoint:
    """Tests for ``GET /api/benchmark/{job_id}/synthesis_preview``."""

    @staticmethod
    def _isolated_web_app(monkeypatch, tmp_path):
        """Return the web app module wired to a fresh job store under ``tmp_path``.

        The ``PICARONES_JOBS_DB`` variable and the module-level ``_JOB_STORE``
        are both set through ``monkeypatch``, so they are restored when the
        requesting test finishes instead of leaking into later tests.
        """
        from picarones.web import app as web_app
        from picarones.web.jobs import get_default_store, reset_default_store

        monkeypatch.setenv("PICARONES_JOBS_DB", str(tmp_path / "jobs.db"))
        reset_default_store()
        monkeypatch.setattr(web_app, "_JOB_STORE", get_default_store())
        web_app._JOBS.clear()
        return web_app

    @pytest.fixture
    def job_with_results(self, monkeypatch, tmp_path):
        """Create a ``complete`` job whose result JSON exists on disk.

        Returns:
            A ``(client, job_id)`` tuple: a ``TestClient`` for the web app and
            the identifier of the job that was created.
        """
        from picarones import fixtures

        web_app = self._isolated_web_app(monkeypatch, tmp_path)

        # Generate a sample benchmark and write its JSON next to the report path.
        benchmark = fixtures.generate_sample_benchmark(n_docs=4)
        out_dir = tmp_path / "rep"
        out_dir.mkdir()
        html_path = out_dir / "report.html"
        json_path = html_path.with_suffix(".json")
        # ensure_ascii=False may emit non-ASCII characters, so the encoding is
        # pinned instead of relying on the platform default.
        json_path.write_text(
            json.dumps(benchmark.as_dict(), ensure_ascii=False),
            encoding="utf-8",
        )

        # Register the job. Only the HTML path is recorded and that file is
        # never created here — presumably the endpoint derives the JSON path
        # from ``output_path``; confirm against the endpoint implementation.
        jid = web_app._JOB_STORE.create_job(job_id="job-prev-1")
        web_app._JOB_STORE.set_status(jid, "complete")
        web_app._JOB_STORE.update_progress(jid, output_path=str(html_path))
        return TestClient(web_app.app), jid

    def test_returns_synthesis_for_complete_job(self, job_with_results):
        """A completed job yields a French synthesis with a list of sentences."""
        client, jid = job_with_results
        r = client.get(f"/api/benchmark/{jid}/synthesis_preview")
        assert r.status_code == 200, r.text
        d = r.json()
        assert d["job_id"] == jid
        assert d["lang"] == "fr"
        assert "sentences" in d and isinstance(d["sentences"], list)

    def test_404_for_unknown_job(self, client):
        """An unknown job id answers 404."""
        r = client.get("/api/benchmark/never-existed/synthesis_preview")
        assert r.status_code == 404

    def test_409_when_job_not_complete(self, monkeypatch, tmp_path):
        """A job that is still running answers 409."""
        web_app = self._isolated_web_app(monkeypatch, tmp_path)
        web_app._JOB_STORE.create_job(job_id="running-1")
        web_app._JOB_STORE.set_status("running-1", "running")
        client = TestClient(web_app.app)
        r = client.get("/api/benchmark/running-1/synthesis_preview")
        assert r.status_code == 409
| # --------------------------------------------------------------------------- | |
| # 5. /api/history/regressions | |
| # --------------------------------------------------------------------------- | |
class TestHistoryRegressionsEndpoint:
    """Tests for ``GET /api/history/regressions``."""

    # Query values go through ``params=`` so the client URL-encodes them; a
    # temporary path may contain characters (spaces, ``&``, ``#``, ``+``,
    # backslashes) that would corrupt a hand-built query string.

    def test_empty_history_returns_zero(self, client, tmp_path):
        """With no history recorded, the endpoint reports zero regressions."""
        # The database file is never created by this test.
        db = tmp_path / "history.db"
        r = client.get("/api/history/regressions", params={"db_path": str(db)})
        assert r.status_code == 200
        d = r.json()
        assert d["count"] == 0
        assert d["regressions"] == []

    def test_threshold_param_is_propagated(self, client, tmp_path):
        """The ``threshold`` query parameter is echoed back in the response."""
        db = tmp_path / "history.db"
        r = client.get(
            "/api/history/regressions",
            params={"threshold": 0.05, "db_path": str(db)},
        )
        assert r.json()["threshold"] == pytest.approx(0.05)