"""Tests Sprint 7 — Rapport HTML v2 et analyses avancées. Classes de tests ---------------- TestBootstrapCI (7 tests) — intervalles de confiance par bootstrap TestWilcoxonTest (10 tests) — test de Wilcoxon signé-rangé TestPairwiseStats (6 tests) — matrice de tests par paires TestReliabilityCurve (7 tests) — courbes de fiabilité TestVennData (8 tests) — diagramme de Venn 2 et 3 ensembles TestErrorClustering (8 tests) — clustering des patterns d'erreurs TestCorrelationMatrix (8 tests) — matrice de corrélation TestDifficultyScore (10 tests) — score de difficulté intrinsèque par document TestAllDifficulties (6 tests) — compute_all_difficulties sur un corpus TestReportDataSprint7 (12 tests) — _build_report_data contient les nouvelles clés TestHTMLSprint7Features (10 tests) — HTML généré contient les nouvelles fonctionnalités """ from __future__ import annotations import pytest # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def sample_benchmark_s7(): from picarones.fixtures import generate_sample_benchmark return generate_sample_benchmark(n_docs=8, seed=7) @pytest.fixture def report_data_s7(sample_benchmark_s7): from picarones.report.generator import _build_report_data imgs = sample_benchmark_s7.metadata.get("_images_b64", {}) return _build_report_data(sample_benchmark_s7, imgs) @pytest.fixture def html_s7(sample_benchmark_s7): from picarones.report.generator import ReportGenerator import tempfile import pathlib gen = ReportGenerator(sample_benchmark_s7) with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as f: path = gen.generate(f.name) return pathlib.Path(path).read_text(encoding="utf-8") # =========================================================================== # TestBootstrapCI # =========================================================================== class TestBootstrapCI: def test_returns_tuple_of_two(self): from picarones.core.statistics import bootstrap_ci result = bootstrap_ci([0.1, 0.2, 0.3]) assert isinstance(result, tuple) and len(result) == 2 def test_lower_le_upper(self): from picarones.core.statistics import bootstrap_ci lo, hi = bootstrap_ci([0.1, 0.2, 0.3, 0.4, 0.5]) assert lo <= hi def test_ci_contains_mean(self): from picarones.core.statistics import bootstrap_ci values = [0.1, 0.15, 0.2, 0.12, 0.18, 0.13, 0.17] lo, hi = bootstrap_ci(values) mean = sum(values) / len(values) assert lo <= mean <= hi def test_empty_returns_zeros(self): from picarones.core.statistics import bootstrap_ci lo, hi = bootstrap_ci([]) assert lo == 0.0 and hi == 0.0 def test_single_value(self): from picarones.core.statistics import bootstrap_ci lo, hi = bootstrap_ci([0.25]) assert lo <= 0.25 <= hi def test_reproducible_with_seed(self): from picarones.core.statistics import bootstrap_ci vals = [0.1, 0.2, 0.3, 0.15, 0.25] r1 = bootstrap_ci(vals, seed=1) r2 = bootstrap_ci(vals, seed=1) assert r1 == r2 def test_wider_with_more_variance(self): from picarones.core.statistics import bootstrap_ci narrow = [0.10, 0.11, 0.10, 0.11, 0.10] wide = [0.01, 0.50, 0.02, 0.49, 0.01] lo_n, hi_n = bootstrap_ci(narrow, n_iter=500) lo_w, hi_w = bootstrap_ci(wide, n_iter=500) assert (hi_w - lo_w) > (hi_n - lo_n) # =========================================================================== # TestWilcoxonTest # =========================================================================== class TestWilcoxonTest: def test_returns_dict_with_keys(self): from picarones.core.statistics import wilcoxon_test r = wilcoxon_test([0.1]*5, [0.1]*5) assert "statistic" in r assert "p_value" in r assert "significant" in r assert "interpretation" in r def test_identical_series_not_significant(self): from picarones.core.statistics import wilcoxon_test vals = [0.1, 0.2, 0.3, 0.15, 0.05] r = wilcoxon_test(vals, vals) assert not r["significant"] def test_clearly_different_series_significant(self): from picarones.core.statistics import wilcoxon_test a = [0.01]*12 b = [0.80]*12 r = wilcoxon_test(a, b) assert r["significant"] assert r["p_value"] < 0.05 def test_p_value_in_range(self): from picarones.core.statistics import wilcoxon_test a = [0.1, 0.15, 0.2, 0.08] b = [0.2, 0.25, 0.3, 0.18] r = wilcoxon_test(a, b) assert 0.0 <= r["p_value"] <= 1.0 def test_interpretation_is_string(self): from picarones.core.statistics import wilcoxon_test r = wilcoxon_test([0.1, 0.2], [0.1, 0.2]) assert isinstance(r["interpretation"], str) and len(r["interpretation"]) > 10 def test_n_pairs_correct(self): from picarones.core.statistics import wilcoxon_test r = wilcoxon_test([0.1, 0.2, 0.3], [0.1, 0.2, 0.3]) # tous les diffs = 0, filtrés en mode wilcox assert r["n_pairs"] == 0 def test_mismatched_lengths_raises(self): from picarones.core.statistics import wilcoxon_test with pytest.raises(ValueError): wilcoxon_test([0.1, 0.2], [0.1]) def test_w_plus_w_minus_present(self): from picarones.core.statistics import wilcoxon_test a = [0.1, 0.2, 0.3, 0.15, 0.25, 0.18, 0.12, 0.22, 0.08, 0.27] b = [0.2, 0.3, 0.4, 0.25, 0.35, 0.28, 0.22, 0.32, 0.18, 0.37] r = wilcoxon_test(a, b) assert "W_plus" in r and "W_minus" in r def test_significant_larger_sample(self): from picarones.core.statistics import wilcoxon_test import random rng = random.Random(0) a = [rng.uniform(0.0, 0.05) for _ in range(15)] b = [rng.uniform(0.3, 0.7) for _ in range(15)] r = wilcoxon_test(a, b) assert r["significant"] def test_symmetry(self): from picarones.core.statistics import wilcoxon_test a = [0.1, 0.2, 0.3, 0.15, 0.25, 0.18, 0.22, 0.08, 0.27, 0.14] b = [0.2, 0.3, 0.4, 0.25, 0.35, 0.28, 0.32, 0.18, 0.37, 0.24] r_ab = wilcoxon_test(a, b) r_ba = wilcoxon_test(b, a) assert r_ab["p_value"] == pytest.approx(r_ba["p_value"], abs=1e-6) assert r_ab["significant"] == r_ba["significant"] # =========================================================================== # TestPairwiseStats # =========================================================================== class TestPairwiseStats: def test_returns_list(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({"A": [0.1, 0.2], "B": [0.3, 0.4]}) assert isinstance(r, list) def test_correct_pair_count_2_engines(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5}) assert len(r) == 1 def test_correct_pair_count_3_engines(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({ "A": [0.1]*5, "B": [0.2]*5, "C": [0.3]*5 }) assert len(r) == 3 def test_pair_has_engine_names(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5}) assert r[0]["engine_a"] in ["A", "B"] assert r[0]["engine_b"] in ["A", "B"] def test_pair_has_p_value(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5}) assert "p_value" in r[0] def test_single_engine_returns_empty(self): from picarones.core.statistics import compute_pairwise_stats r = compute_pairwise_stats({"A": [0.1]*5}) assert r == [] # =========================================================================== # TestReliabilityCurve # =========================================================================== class TestReliabilityCurve: def test_returns_list(self): from picarones.core.statistics import compute_reliability_curve r = compute_reliability_curve([0.1, 0.2, 0.3]) assert isinstance(r, list) def test_correct_number_of_steps(self): from picarones.core.statistics import compute_reliability_curve r = compute_reliability_curve([0.1]*10, steps=5) assert len(r) == 5 def test_pct_docs_increases(self): from picarones.core.statistics import compute_reliability_curve r = compute_reliability_curve([0.1, 0.2, 0.3, 0.4, 0.5], steps=5) pcts = [p["pct_docs"] for p in r] assert pcts == sorted(pcts) def test_mean_cer_increases(self): from picarones.core.statistics import compute_reliability_curve r = compute_reliability_curve([0.05, 0.10, 0.20, 0.30, 0.50], steps=5) cers = [p["mean_cer"] for p in r] assert cers[0] <= cers[-1] def test_empty_returns_empty(self): from picarones.core.statistics import compute_reliability_curve assert compute_reliability_curve([]) == [] def test_last_point_includes_all(self): from picarones.core.statistics import compute_reliability_curve vals = [0.1, 0.2, 0.3] r = compute_reliability_curve(vals, steps=4) last = r[-1] expected = sum(vals) / len(vals) assert last["mean_cer"] == pytest.approx(expected, rel=1e-4) def test_each_point_has_required_keys(self): from picarones.core.statistics import compute_reliability_curve r = compute_reliability_curve([0.1, 0.2, 0.3], steps=3) for p in r: assert "pct_docs" in p and "mean_cer" in p # =========================================================================== # TestVennData # =========================================================================== class TestVennData: def test_venn2_type(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"A": {"e1","e2"}, "B": {"e2","e3"}}) assert r["type"] == "venn2" def test_venn3_type(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"A": {"e1"}, "B": {"e2"}, "C": {"e3"}}) assert r["type"] == "venn3" def test_venn2_counts_correct(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"A": {"e1","e2","e3"}, "B": {"e2","e3","e4"}}) assert r["only_a"] == 1 assert r["only_b"] == 1 assert r["both"] == 2 def test_venn2_disjoint(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"A": {"e1"}, "B": {"e2"}}) assert r["both"] == 0 assert r["only_a"] == 1 assert r["only_b"] == 1 def test_venn2_subset(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"A": {"e1","e2"}, "B": {"e1","e2","e3"}}) assert r["only_a"] == 0 def test_venn3_abc_count(self): from picarones.core.statistics import compute_venn_data shared = {"e1","e2"} r = compute_venn_data({"A": shared, "B": shared, "C": shared}) assert r["abc"] == 2 def test_empty_returns_empty(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({}) assert r == {} def test_labels_present(self): from picarones.core.statistics import compute_venn_data r = compute_venn_data({"moteur_a": {"e1"}, "moteur_b": {"e2"}}) assert r["label_a"] == "moteur_a" assert r["label_b"] == "moteur_b" # =========================================================================== # TestErrorClustering # =========================================================================== class TestErrorClustering: def _sample_data(self): return [ {"engine": "tesseract", "gt": "maiſtre Froiſſart", "hypothesis": "maiftre Froiffart"}, {"engine": "tesseract", "gt": "nostre seigneur", "hypothesis": "noltre leigneur"}, {"engine": "pero", "gt": "regnoit en France", "hypothesis": "regnoit en France"}, {"engine": "pero", "gt": "en l'an de grace", "hypothesis": "en l'an de grace"}, {"engine": "mauvais", "gt": "icy commence le prologue", "hypothesis": "icy conmence le prologue"}, {"engine": "mauvais", "gt": "par la grace de Dieu", "hypothesis": "par la grce de Dieu"}, ] def test_returns_list(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) assert isinstance(result, list) def test_max_clusters_respected(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data(), max_clusters=3) assert len(result) <= 3 def test_cluster_has_required_keys(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) if result: c = result[0] assert hasattr(c, "cluster_id") assert hasattr(c, "label") assert hasattr(c, "count") assert hasattr(c, "examples") def test_as_dict_method(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) if result: d = result[0].as_dict() assert "cluster_id" in d assert "label" in d assert "count" in d assert "examples" in d def test_sorted_by_count_descending(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) if len(result) >= 2: assert result[0].count >= result[1].count def test_examples_capped_at_5(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) for c in result: assert len(c.as_dict()["examples"]) <= 5 def test_empty_data_returns_empty(self): from picarones.core.statistics import cluster_errors result = cluster_errors([]) assert result == [] def test_cluster_id_unique(self): from picarones.core.statistics import cluster_errors result = cluster_errors(self._sample_data()) ids = [c.cluster_id for c in result] assert len(ids) == len(set(ids)) # =========================================================================== # TestCorrelationMatrix # =========================================================================== class TestCorrelationMatrix: def _sample_metrics(self): return [ {"cer": 0.1, "wer": 0.2, "quality_score": 0.8}, {"cer": 0.2, "wer": 0.35, "quality_score": 0.6}, {"cer": 0.05, "wer": 0.1, "quality_score": 0.9}, {"cer": 0.3, "wer": 0.5, "quality_score": 0.5}, {"cer": 0.15, "wer": 0.25, "quality_score": 0.75}, ] def test_returns_dict_with_labels_and_matrix(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) assert "labels" in r and "matrix" in r def test_matrix_is_square(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) n = len(r["labels"]) assert len(r["matrix"]) == n for row in r["matrix"]: assert len(row) == n def test_diagonal_is_one(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) for i in range(len(r["labels"])): assert r["matrix"][i][i] == pytest.approx(1.0) def test_cer_quality_negatively_correlated(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) labels = r["labels"] if "cer" in labels and "quality_score" in labels: i = labels.index("cer") j = labels.index("quality_score") assert r["matrix"][i][j] < 0 # plus la qualité est bonne, plus le CER est bas def test_symmetric_matrix(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) n = len(r["labels"]) for i in range(n): for j in range(n): assert r["matrix"][i][j] == pytest.approx(r["matrix"][j][i], abs=1e-6) def test_empty_returns_empty(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix([]) assert r == {"labels": [], "matrix": []} def test_custom_metric_keys(self): from picarones.core.statistics import compute_correlation_matrix data = [{"a": 1.0, "b": 2.0, "c": 3.0}] * 5 r = compute_correlation_matrix(data, metric_keys=["a", "b"]) assert r["labels"] == ["a", "b"] def test_values_in_range(self): from picarones.core.statistics import compute_correlation_matrix r = compute_correlation_matrix(self._sample_metrics()) for row in r["matrix"]: for v in row: assert -1.0 <= v <= 1.0 # =========================================================================== # TestDifficultyScore # =========================================================================== class TestDifficultyScore: def test_returns_difficulty_score(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("doc1", "maiſtre Froiſſart", [0.1, 0.2, 0.3]) from picarones.core.difficulty import DifficultyScore assert isinstance(ds, DifficultyScore) def test_score_in_range(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("doc1", "hello world", [0.1, 0.2]) assert 0.0 <= ds.score <= 1.0 def test_more_variance_higher_score(self): from picarones.core.difficulty import compute_difficulty_score low_var = compute_difficulty_score("doc1", "hello", [0.1, 0.1, 0.1]) high_var = compute_difficulty_score("doc1", "hello", [0.0, 0.5, 1.0]) assert high_var.score > low_var.score def test_bad_quality_image_harder(self): from picarones.core.difficulty import compute_difficulty_score good_img = compute_difficulty_score("doc1", "hello", [0.1], image_quality_score=0.9) bad_img = compute_difficulty_score("doc1", "hello", [0.1], image_quality_score=0.1) assert bad_img.score > good_img.score def test_special_chars_increase_difficulty(self): from picarones.core.difficulty import compute_difficulty_score plain = compute_difficulty_score("doc1", "hello world plain text", [0.1]) heritage = compute_difficulty_score("doc1", "maiſtre Froiſſart ꝑ &", [0.1]) assert heritage.score > plain.score def test_components_present(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("doc1", "text", [0.1, 0.2]) assert hasattr(ds, "variance_component") assert hasattr(ds, "quality_component") assert hasattr(ds, "density_component") def test_as_dict_has_doc_id(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("folio_001", "text", [0.1]) d = ds.as_dict() assert d["doc_id"] == "folio_001" def test_as_dict_rounded(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("doc1", "text", [0.1]) d = ds.as_dict() assert isinstance(d["score"], float) def test_no_engines_gives_low_variance(self): from picarones.core.difficulty import compute_difficulty_score ds = compute_difficulty_score("doc1", "text", []) assert ds.cer_variance == 0.0 def test_difficulty_label(self): from picarones.core.difficulty import difficulty_label assert difficulty_label(0.1) == "Facile" assert difficulty_label(0.35) == "Modéré" assert difficulty_label(0.6) == "Difficile" assert difficulty_label(0.9) == "Très difficile" # =========================================================================== # TestAllDifficulties # =========================================================================== class TestAllDifficulties: def test_returns_dict(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties( ["doc1", "doc2"], {"doc1": "hello", "doc2": "world"}, {"doc1": {"A": 0.1}, "doc2": {"A": 0.2}}, ) assert isinstance(r, dict) def test_all_docs_present(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties( ["d1", "d2", "d3"], {"d1": "a", "d2": "b", "d3": "c"}, {"d1": {"E": 0.1}, "d2": {"E": 0.2}, "d3": {"E": 0.3}}, ) assert set(r.keys()) == {"d1", "d2", "d3"} def test_scores_in_range(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties( ["d1", "d2"], {"d1": "maiſtre Jean", "d2": "simple text"}, {"d1": {"A": 0.1, "B": 0.5}, "d2": {"A": 0.1, "B": 0.1}}, ) for ds in r.values(): assert 0.0 <= ds.score <= 1.0 def test_with_image_quality(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties( ["d1"], {"d1": "text"}, {"d1": {"A": 0.1}}, image_quality_map={"d1": 0.3}, ) assert "d1" in r # qualité dégradée → composante élevée assert r["d1"].quality_component > 0.5 def test_empty_corpus(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties([], {}, {}) assert r == {} def test_missing_gt_handled(self): from picarones.core.difficulty import compute_all_difficulties r = compute_all_difficulties( ["d1"], {}, # GT manquante {"d1": {"A": 0.2}}, ) assert "d1" in r # =========================================================================== # TestReportDataSprint7 # =========================================================================== class TestReportDataSprint7: def test_has_statistics_key(self, report_data_s7): assert "statistics" in report_data_s7 def test_has_reliability_curves(self, report_data_s7): assert "reliability_curves" in report_data_s7 def test_has_venn_data(self, report_data_s7): assert "venn_data" in report_data_s7 def test_has_error_clusters(self, report_data_s7): assert "error_clusters" in report_data_s7 def test_has_correlation_per_engine(self, report_data_s7): assert "correlation_per_engine" in report_data_s7 def test_pairwise_wilcoxon_non_empty(self, report_data_s7): pw = report_data_s7["statistics"]["pairwise_wilcoxon"] assert len(pw) > 0 def test_bootstrap_cis_count(self, report_data_s7): cis = report_data_s7["statistics"]["bootstrap_cis"] n_engines = len(report_data_s7["engines"]) assert len(cis) == n_engines def test_documents_have_difficulty_score(self, report_data_s7): for doc in report_data_s7["documents"]: assert "difficulty_score" in doc assert 0.0 <= doc["difficulty_score"] <= 1.0 def test_documents_have_difficulty_label(self, report_data_s7): for doc in report_data_s7["documents"]: assert "difficulty_label" in doc assert doc["difficulty_label"] in ("Facile", "Modéré", "Difficile", "Très difficile") def test_reliability_curves_count(self, report_data_s7): rc = report_data_s7["reliability_curves"] assert len(rc) == len(report_data_s7["engines"]) def test_reliability_curves_have_points(self, report_data_s7): for curve in report_data_s7["reliability_curves"]: assert "engine" in curve assert "points" in curve assert len(curve["points"]) > 0 def test_correlation_matrix_symmetric(self, report_data_s7): for entry in report_data_s7["correlation_per_engine"]: m = entry["matrix"] n = len(m) for i in range(n): for j in range(n): assert m[i][j] == pytest.approx(m[j][i], abs=1e-5) # =========================================================================== # TestHTMLSprint7Features # =========================================================================== class TestHTMLSprint7Features: def test_html_contains_export_csv_button(self, html_s7): assert "exportCSV" in html_s7 or "CSV" in html_s7 def test_html_contains_presentation_mode_button(self, html_s7): assert "togglePresentMode" in html_s7 or "Présentation" in html_s7 def test_html_contains_reliability_chart(self, html_s7): assert "chart-reliability" in html_s7 def test_html_contains_bootstrap_ci_chart(self, html_s7): assert "chart-bootstrap-ci" in html_s7 def test_html_contains_venn_container(self, html_s7): assert "venn-container" in html_s7 def test_html_contains_wilcoxon_table(self, html_s7): assert "wilcoxon-table" in html_s7 def test_html_contains_error_clusters(self, html_s7): assert "error-clusters" in html_s7 def test_html_contains_correlation_matrix(self, html_s7): assert "corr-matrix" in html_s7 or "correlation" in html_s7.lower() def test_html_contains_difficulty_badge(self, html_s7): assert "difficulty" in html_s7.lower() or "diff-badge" in html_s7 def test_html_contains_url_state(self, html_s7): assert "updateURL" in html_s7 or "history.replaceState" in html_s7