Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Picarones / tests /test_sprint28_ux_save_compare.py

Claude

refactor(core): faire de core/ un cercle 1 strict, déplacer cercle 2 vers measurements/

979f3c3 unverified about 2 months ago

13.8 kB

	"""Tests Sprint 28 — UX : save/load config, compare runs, synthesis preview.

	Le Sprint 28 réduit la friction du chercheur qui itère sur 8 prompts :

	1. ``/api/config/save`` + ``/api/config/load`` — sérialisation/import
	d'une configuration de benchmark en JSON.
	2. ``picarones/report/comparison.py`` — comparaison de deux runs JSON
	avec deltas par moteur et détection de régressions.
	3. ``picarones compare`` (CLI) — équivalent ligne de commande.
	4. ``/api/benchmark/{job_id}/synthesis_preview`` — synthèse narrative
	d'un job terminé sans rouvrir le HTML.
	5. ``/api/history/regressions`` — surface de l'infrastructure Sprint 8.
	"""

	from __future__ import annotations

	import json

	import pytest
	from click.testing import CliRunner
	from fastapi.testclient import TestClient


	# ---------------------------------------------------------------------------
	# 1. Module comparison — compare_benchmarks
	# ---------------------------------------------------------------------------

	def _benchmark_json(engines_to_cer: dict[str, float], **extra) -> dict:
	"""Fabrique un dict ``BenchmarkResult.as_dict()``-like."""
	ranking = [
	{
	"engine": name,
	"mean_cer": cer,
	"mean_wer": cer * 1.5 if cer is not None else None,
	"documents": 10,
	"failed": 0,
	}
	for name, cer in engines_to_cer.items()
	]
	return {
	"ranking": ranking,
	"run_date": extra.get("run_date", "2026-04-01T00:00:00+00:00"),
	"corpus": {"name": extra.get("corpus", "test_corpus"), "source": "fixture"},
	}


	class TestCompareBenchmarks:
	    """Checks of ``compare_benchmarks`` / ``detect_regressions`` on synthetic runs."""

	    def test_identical_runs_no_regression(self):
	        """Two identical runs give zero deltas flagged neither way."""
	        from picarones.report.comparison import compare_benchmarks

	        scores = {"tesseract": 0.05, "pero": 0.07}
	        before = _benchmark_json(scores)
	        after = _benchmark_json(scores)
	        report = compare_benchmarks(before, after, threshold=0.005)
	        for delta in report.deltas:
	            assert delta.delta_cer == 0.0
	        for delta in report.deltas:
	            assert not (delta.is_regression or delta.is_improvement)

	    def test_regression_detected_above_threshold(self):
	        """A +1 percentage-point CER increase is reported as a regression."""
	        from picarones.report.comparison import compare_benchmarks, detect_regressions

	        before = _benchmark_json({"tesseract": 0.05})
	        after = _benchmark_json({"tesseract": 0.06})  # +1 percentage point
	        report = compare_benchmarks(before, after, threshold=0.005)
	        regressions = detect_regressions(report)
	        assert len(regressions) == 1
	        worst = regressions[0]
	        assert worst.engine == "tesseract"
	        assert worst.delta_cer == pytest.approx(0.01, abs=1e-9)

	    def test_improvement_detected_below_threshold(self):
	        """A -1 percentage-point CER change is flagged as an improvement only."""
	        from picarones.report.comparison import compare_benchmarks

	        before = _benchmark_json({"tesseract": 0.05})
	        after = _benchmark_json({"tesseract": 0.04})  # -1 percentage point
	        report = compare_benchmarks(before, after, threshold=0.005)
	        first = report.deltas[0]
	        assert first.is_improvement
	        assert not first.is_regression

	    def test_below_threshold_is_stable(self):
	        """A change smaller than the threshold is not a regression."""
	        from picarones.report.comparison import compare_benchmarks

	        before = _benchmark_json({"tesseract": 0.05})
	        # +0.2 percentage point, below the 0.5 percentage-point threshold.
	        after = _benchmark_json({"tesseract": 0.052})
	        report = compare_benchmarks(before, after, threshold=0.005)
	        first = report.deltas[0]
	        assert not first.is_regression

	    def test_engines_only_in_one_side(self):
	        """Engines present in a single run are listed apart from the deltas."""
	        from picarones.report.comparison import compare_benchmarks

	        before = _benchmark_json({"tesseract": 0.05, "pero": 0.07})
	        after = _benchmark_json({"tesseract": 0.05, "kraken": 0.06})
	        report = compare_benchmarks(before, after, threshold=0.005)
	        assert report.only_in_a == ["pero"]
	        assert report.only_in_b == ["kraken"]
	        compared_engines = {delta.engine for delta in report.deltas}
	        assert compared_engines == {"tesseract"}

	    def test_none_cer_does_not_raise(self):
	        """A ``None`` CER on one side gives a ``None`` delta and no regression."""
	        from picarones.report.comparison import compare_benchmarks

	        before = _benchmark_json({"tesseract": None})
	        after = _benchmark_json({"tesseract": 0.05})
	        report = compare_benchmarks(before, after)
	        first = report.deltas[0]
	        assert first.delta_cer is None
	        assert not first.is_regression

	    def test_regressions_sorted_by_severity(self):
	        """The larger regression ("b") is listed before a smaller one ("a")."""
	        from picarones.report.comparison import compare_benchmarks

	        before = _benchmark_json({"a": 0.05, "b": 0.05, "c": 0.05})
	        # Engine "b" regresses the most.
	        after = _benchmark_json({"a": 0.07, "b": 0.10, "c": 0.06})
	        report = compare_benchmarks(before, after, threshold=0.005)
	        # Regressions come first, biggest one leading.
	        ordered = [delta.engine for delta in report.deltas]
	        position_b = ordered.index("b")
	        position_a = ordered.index("a")
	        assert position_b < position_a

	    def test_loads_from_file_path(self, tmp_path):
	        """Both runs may be given as paths to JSON files on disk."""
	        from picarones.report.comparison import compare_benchmarks

	        path_a = tmp_path / "a.json"
	        path_b = tmp_path / "b.json"
	        payload_a = json.dumps(_benchmark_json({"tesseract": 0.05}))
	        payload_b = json.dumps(_benchmark_json({"tesseract": 0.06}))
	        path_a.write_text(payload_a)
	        path_b.write_text(payload_b)
	        report = compare_benchmarks(path_a, path_b, threshold=0.005)
	        assert len(report.deltas) == 1


	class TestRenderComparisonHTML:
	    """Checks of the HTML file written by ``render_comparison_html``."""

	    def test_html_is_self_contained_and_named(self, tmp_path):
	        """The rendered file exists and shows both labels, the delta and the verdict."""
	        from picarones.report.comparison import compare_benchmarks, render_comparison_html

	        before = _benchmark_json({"tesseract": 0.05})
	        after = _benchmark_json({"tesseract": 0.07})
	        report = compare_benchmarks(before, after, label_a="V1", label_b="V2")
	        target = tmp_path / "diff.html"
	        render_comparison_html(report, target)
	        assert target.exists()
	        page = target.read_text(encoding="utf-8")
	        # Both run labels must be visible.
	        for label in ("V1", "V2"):
	            assert label in page
	        # The CER delta is displayed with an explicit sign.
	        assert "+0.020" in page
	        lowered = page.lower()
	        assert "régression" in lowered


	# ---------------------------------------------------------------------------
	# 2. CLI picarones compare
	# ---------------------------------------------------------------------------

	class TestCompareCLI:
	    """Checks of the ``compare`` sub-command invoked through ``CliRunner``."""

	    @staticmethod
	    def _write_runs(tmp_path, cer_a, cer_b):
	        """Write two single-engine ("tesseract") run files; return both paths."""
	        path_a = tmp_path / "a.json"
	        path_b = tmp_path / "b.json"
	        path_a.write_text(json.dumps(_benchmark_json({"tesseract": cer_a})))
	        path_b.write_text(json.dumps(_benchmark_json({"tesseract": cer_b})))
	        return path_a, path_b

	    def test_basic_compare_writes_html(self, tmp_path):
	        """Comparing identical runs exits 0 and writes the requested HTML file."""
	        from picarones.cli import cli

	        path_a, path_b = self._write_runs(tmp_path, 0.05, 0.05)
	        target = tmp_path / "out.html"
	        argv = ["compare", str(path_a), str(path_b), "-o", str(target)]
	        outcome = CliRunner().invoke(cli, argv)
	        assert outcome.exit_code == 0, outcome.output
	        assert target.exists()

	    def test_regression_exit_code_is_2(self, tmp_path):
	        """A detected regression makes the command exit with status 2."""
	        from picarones.cli import cli

	        path_a, path_b = self._write_runs(tmp_path, 0.05, 0.10)
	        argv = [
	            "compare",
	            str(path_a),
	            str(path_b),
	            "-o",
	            str(tmp_path / "out.html"),
	        ]
	        outcome = CliRunner().invoke(cli, argv)
	        # Exit status 2 means a regression was detected (useful in CI).
	        assert outcome.exit_code == 2, outcome.output
	        assert any(
	            needle in outcome.output.lower()
	            for needle in ("régression", "tesseract")
	        )

	    def test_json_mode_outputs_dict(self, tmp_path):
	        """With ``--json`` the output parses as JSON holding deltas and regressions."""
	        from picarones.cli import cli

	        path_a, path_b = self._write_runs(tmp_path, 0.05, 0.05)
	        argv = ["compare", str(path_a), str(path_b), "--json"]
	        outcome = CliRunner().invoke(cli, argv)
	        assert outcome.exit_code == 0, outcome.output
	        document = json.loads(outcome.output)
	        for key in ("deltas", "regressions"):
	            assert key in document


	# ---------------------------------------------------------------------------
	# 3. /api/config/save + /api/config/load
	# ---------------------------------------------------------------------------

	@pytest.fixture
	def client():
	    """Return a ``TestClient`` wrapping the ``app`` object of ``picarones.web.app``."""
	    from picarones.web.app import app as web_application

	    test_client = TestClient(web_application)
	    return test_client


	class TestConfigSaveLoad:
	    """Checks of the ``/api/config/save`` and ``/api/config/load`` endpoints."""

	    def test_save_returns_attachment(self, client):
	        """Saving answers 200 with an attachment header and a versioned body."""
	        r = client.post("/api/config/save", json={
	            "engines": ["tesseract"],
	            "normalization_profile": "medieval_french",
	            "label": "test-1",
	        })
	        assert r.status_code == 200
	        cd = r.headers.get("content-disposition", "")
	        assert "attachment" in cd
	        assert "test-1" in cd
	        body = r.json()
	        assert body["schema_version"] == 1
	        assert body["engines"] == ["tesseract"]
	        assert "saved_at" in body

	    def test_save_filters_unknown_fields(self, client):
	        """Fields outside the known configuration keys are not echoed back."""
	        r = client.post("/api/config/save", json={
	            "engines": ["tesseract"],
	            "secret_token": "ne-doit-pas-apparaitre",
	            "OPENAI_API_KEY": "sk-...",
	        })
	        # Without this check an error payload (which also lacks the secret
	        # keys) would make the assertions below pass vacuously.
	        assert r.status_code == 200, r.text
	        body = r.json()
	        assert "secret_token" not in body
	        assert "OPENAI_API_KEY" not in body

	    def test_save_sanitizes_label_for_filename(self, client):
	        """A path-traversal label never reaches the download filename as-is."""
	        r = client.post("/api/config/save", json={
	            "label": "../../etc/passwd",
	        })
	        assert r.status_code == 200, r.text
	        cd = r.headers.get("content-disposition", "")
	        assert ".." not in cd
	        assert "etc" in cd or "passwd" in cd  # alphanumeric characters are kept

	    def test_load_round_trip(self, client):
	        """A configuration saved then loaded keeps every original field."""
	        original = {
	            "engines": ["tesseract", "pero"],
	            "normalization_profile": "medieval_french",
	            "char_exclude": "',-",
	            "lang": "fra",
	        }
	        # 1. save
	        r1 = client.post("/api/config/save", json=original)
	        # Fail here, with the server message, rather than feeding an error
	        # payload into the load step.
	        assert r1.status_code == 200, r1.text
	        saved = r1.json()
	        # 2. load
	        r2 = client.post("/api/config/load", json=saved)
	        assert r2.status_code == 200, r2.text
	        loaded = r2.json()["config"]
	        # The original fields survive the round trip.
	        for k, v in original.items():
	            assert loaded[k] == v

	    def test_load_rejects_bad_schema_version(self, client):
	        """An unsupported ``schema_version`` is rejected with a 400 mentioning it."""
	        r = client.post("/api/config/load", json={"schema_version": 99})
	        assert r.status_code == 400
	        assert "schema" in r.json()["detail"].lower()

	    def test_load_rejects_missing_schema(self, client):
	        """A payload without ``schema_version`` is rejected with a 400."""
	        r = client.post("/api/config/load", json={"engines": ["tesseract"]})
	        assert r.status_code == 400


	# ---------------------------------------------------------------------------
	# 4. /api/benchmark/{job_id}/synthesis_preview
	# ---------------------------------------------------------------------------

	class TestSynthesisPreviewEndpoint:
	    """Checks of ``GET /api/benchmark/{job_id}/synthesis_preview``."""

	    @staticmethod
	    def _isolated_web_app(monkeypatch, tmp_path):
	        """Point the job store at a database under ``tmp_path``; return the web module.

	        Sets ``PICARONES_JOBS_DB`` for the duration of the test, resets the
	        default store, rebinds ``_JOB_STORE`` on ``picarones.web.app`` to the
	        store returned by ``get_default_store()`` and clears ``_JOBS``.
	        """
	        from picarones.web.jobs import get_default_store, reset_default_store
	        from picarones.web import app as web_app

	        monkeypatch.setenv("PICARONES_JOBS_DB", str(tmp_path / "jobs.db"))
	        reset_default_store()
	        web_app._JOB_STORE = get_default_store()
	        web_app._JOBS.clear()
	        return web_app

	    @pytest.fixture
	    def job_with_results(self, monkeypatch, tmp_path):
	        """Create a 'complete' job plus its result JSON on disk.

	        Returns a ``(TestClient, job_id)`` pair.
	        """
	        from picarones import fixtures

	        web_app = self._isolated_web_app(monkeypatch, tmp_path)
	        # Generate a benchmark and write its JSON next to the report path.
	        b = fixtures.generate_sample_benchmark(n_docs=4)
	        out_dir = tmp_path / "rep"
	        out_dir.mkdir()
	        html_path = out_dir / "report.html"
	        json_path = html_path.with_suffix(".json")
	        # ``ensure_ascii=False`` can emit non-ASCII characters, so the file
	        # encoding is pinned instead of depending on the platform locale.
	        json_path.write_text(
	            json.dumps(b.as_dict(), ensure_ascii=False),
	            encoding="utf-8",
	        )
	        # Register the job in the store.
	        jid = web_app._JOB_STORE.create_job(job_id="job-prev-1")
	        web_app._JOB_STORE.set_status(jid, "complete")
	        web_app._JOB_STORE.update_progress(jid, output_path=str(html_path))
	        return TestClient(web_app.app), jid

	    def test_returns_synthesis_for_complete_job(self, job_with_results):
	        """A complete job answers 200 with its id, language and sentence list."""
	        client, jid = job_with_results
	        r = client.get(f"/api/benchmark/{jid}/synthesis_preview")
	        assert r.status_code == 200, r.text
	        d = r.json()
	        assert d["job_id"] == jid
	        assert d["lang"] == "fr"
	        assert "sentences" in d and isinstance(d["sentences"], list)

	    def test_404_for_unknown_job(self, client):
	        """An unknown job id answers 404."""
	        r = client.get("/api/benchmark/never-existed/synthesis_preview")
	        assert r.status_code == 404

	    def test_409_when_job_not_complete(self, monkeypatch, tmp_path):
	        """A job whose status is still 'running' answers 409."""
	        web_app = self._isolated_web_app(monkeypatch, tmp_path)
	        web_app._JOB_STORE.create_job(job_id="running-1")
	        web_app._JOB_STORE.set_status("running-1", "running")
	        client = TestClient(web_app.app)
	        r = client.get("/api/benchmark/running-1/synthesis_preview")
	        assert r.status_code == 409


	# ---------------------------------------------------------------------------
	# 5. /api/history/regressions
	# ---------------------------------------------------------------------------

	class TestHistoryRegressionsEndpoint:
	    """Checks of ``GET /api/history/regressions``."""

	    def test_empty_history_returns_zero(self, client, tmp_path):
	        """A database path with no history yields zero regressions."""
	        db = tmp_path / "history.db"
	        # Pass the path through ``params`` so the client URL-encodes it; a
	        # temporary directory may contain spaces, '&', '#', '+' or backslashes.
	        r = client.get("/api/history/regressions", params={"db_path": str(db)})
	        assert r.status_code == 200
	        d = r.json()
	        assert d["count"] == 0
	        assert d["regressions"] == []

	    def test_threshold_param_is_propagated(self, client, tmp_path):
	        """The ``threshold`` query parameter is echoed back in the response."""
	        db = tmp_path / "history.db"
	        r = client.get(
	            "/api/history/regressions",
	            params={"threshold": "0.05", "db_path": str(db)},
	        )
	        assert r.status_code == 200, r.text
	        assert r.json()["threshold"] == pytest.approx(0.05)