File size: 26,864 Bytes
11b4df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6362212
 
11b4df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
"""Tests Sprint 7 — Rapport HTML v2 et analyses avancées.

Classes de tests
----------------
TestBootstrapCI           (7 tests)  — intervalles de confiance par bootstrap
TestWilcoxonTest          (10 tests) — test de Wilcoxon signé-rangé
TestPairwiseStats         (6 tests)  — matrice de tests par paires
TestReliabilityCurve      (7 tests)  — courbes de fiabilité
TestVennData              (8 tests)  — diagramme de Venn 2 et 3 ensembles
TestErrorClustering       (8 tests)  — clustering des patterns d'erreurs
TestCorrelationMatrix     (8 tests)  — matrice de corrélation
TestDifficultyScore       (10 tests) — score de difficulté intrinsèque par document
TestAllDifficulties       (6 tests)  — compute_all_difficulties sur un corpus
TestReportDataSprint7     (12 tests) — _build_report_data contient les nouvelles clés
TestHTMLSprint7Features   (10 tests) — HTML généré contient les nouvelles fonctionnalités
"""

from __future__ import annotations

import pytest

# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture
def sample_benchmark_s7():
    """Provide the sample benchmark used by the Sprint 7 tests.

    The benchmark is produced by ``generate_sample_benchmark`` with
    ``n_docs=8`` and ``seed=7``.
    """
    from picarones.fixtures import generate_sample_benchmark

    benchmark = generate_sample_benchmark(n_docs=8, seed=7)
    return benchmark


@pytest.fixture
def report_data_s7(sample_benchmark_s7):
    """Provide the report data built from the sample benchmark.

    The images handed to ``_build_report_data`` are read from the benchmark
    metadata under ``"_images_b64"``; an empty dict is used when the key is
    absent.
    """
    from picarones.report.generator import _build_report_data

    return _build_report_data(
        sample_benchmark_s7,
        sample_benchmark_s7.metadata.get("_images_b64", {}),
    )


@pytest.fixture
def html_s7(sample_benchmark_s7):
    """Provide the HTML text of a report generated from the sample benchmark.

    The report is written into a temporary directory that is removed as soon
    as the HTML has been read back, so no file is left behind on disk. The
    target file is not held open while ``generate`` writes to it.

    Returns:
        The content of the generated report, decoded as UTF-8.
    """
    import pathlib
    import tempfile

    from picarones.report.generator import ReportGenerator

    gen = ReportGenerator(sample_benchmark_s7)
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = pathlib.Path(tmp_dir) / "report.html"
        # ``generate`` returns the path it wrote to; read that path back
        # before the temporary directory is cleaned up.
        path = gen.generate(str(target))
        return pathlib.Path(path).read_text(encoding="utf-8")


# ===========================================================================
# TestBootstrapCI
# ===========================================================================

class TestBootstrapCI:
    """Tests for ``bootstrap_ci`` (bootstrap confidence intervals).

    Assertions whose outcome depends on the resampling itself pass an
    explicit ``seed`` so that they stay deterministic whatever the default
    seeding of ``bootstrap_ci`` is.
    """

    def test_returns_tuple_of_two(self):
        """The interval comes back as a 2-tuple."""
        from picarones.core.statistics import bootstrap_ci
        result = bootstrap_ci([0.1, 0.2, 0.3])
        assert isinstance(result, tuple) and len(result) == 2

    def test_lower_le_upper(self):
        """The lower bound never exceeds the upper bound."""
        from picarones.core.statistics import bootstrap_ci
        lo, hi = bootstrap_ci([0.1, 0.2, 0.3, 0.4, 0.5])
        assert lo <= hi

    def test_ci_contains_mean(self):
        """The sample mean lies inside the interval."""
        from picarones.core.statistics import bootstrap_ci
        values = [0.1, 0.15, 0.2, 0.12, 0.18, 0.13, 0.17]
        # Fixed seed: the bounds come from random resampling.
        lo, hi = bootstrap_ci(values, seed=0)
        mean = sum(values) / len(values)
        assert lo <= mean <= hi

    def test_empty_returns_zeros(self):
        """An empty sample yields ``(0.0, 0.0)``."""
        from picarones.core.statistics import bootstrap_ci
        lo, hi = bootstrap_ci([])
        assert lo == 0.0 and hi == 0.0

    def test_single_value(self):
        """A one-element sample yields an interval containing that value."""
        from picarones.core.statistics import bootstrap_ci
        lo, hi = bootstrap_ci([0.25])
        assert lo <= 0.25 <= hi

    def test_reproducible_with_seed(self):
        """Two calls with the same seed return the same interval."""
        from picarones.core.statistics import bootstrap_ci
        vals = [0.1, 0.2, 0.3, 0.15, 0.25]
        r1 = bootstrap_ci(vals, seed=1)
        r2 = bootstrap_ci(vals, seed=1)
        assert r1 == r2

    def test_wider_with_more_variance(self):
        """A more dispersed sample produces a wider interval."""
        from picarones.core.statistics import bootstrap_ci
        narrow = [0.10, 0.11, 0.10, 0.11, 0.10]
        wide   = [0.01, 0.50, 0.02, 0.49, 0.01]
        # Fixed seed: both widths are compared on reproducible resamples.
        lo_n, hi_n = bootstrap_ci(narrow, n_iter=500, seed=0)
        lo_w, hi_w = bootstrap_ci(wide,   n_iter=500, seed=0)
        assert (hi_w - lo_w) > (hi_n - lo_n)


# ===========================================================================
# TestWilcoxonTest
# ===========================================================================

class TestWilcoxonTest:
    """Tests for ``wilcoxon_test`` (Wilcoxon signed-rank test)."""

    def test_returns_dict_with_keys(self):
        """The result exposes the four expected keys."""
        from picarones.core.statistics import wilcoxon_test
        outcome = wilcoxon_test([0.1]*5, [0.1]*5)
        for key in ("statistic", "p_value", "significant", "interpretation"):
            assert key in outcome

    def test_identical_series_not_significant(self):
        """Comparing a series with itself is not significant."""
        from picarones.core.statistics import wilcoxon_test
        series = [0.1, 0.2, 0.3, 0.15, 0.05]
        outcome = wilcoxon_test(series, series)
        assert not outcome["significant"]

    def test_clearly_different_series_significant(self):
        """Two far-apart constant series are flagged as significant."""
        from picarones.core.statistics import wilcoxon_test
        low = [0.01]*12
        high = [0.80]*12
        outcome = wilcoxon_test(low, high)
        assert outcome["significant"]
        assert outcome["p_value"] < 0.05

    def test_p_value_in_range(self):
        """The p-value lies in ``[0, 1]``."""
        from picarones.core.statistics import wilcoxon_test
        first = [0.1, 0.15, 0.2, 0.08]
        second = [0.2, 0.25, 0.3, 0.18]
        p_value = wilcoxon_test(first, second)["p_value"]
        assert 0.0 <= p_value <= 1.0

    def test_interpretation_is_string(self):
        """The interpretation is a string longer than 10 characters."""
        from picarones.core.statistics import wilcoxon_test
        text = wilcoxon_test([0.1, 0.2], [0.1, 0.2])["interpretation"]
        assert isinstance(text, str)
        assert len(text) > 10

    def test_n_pairs_correct(self):
        """Identical series leave no pair to count."""
        from picarones.core.statistics import wilcoxon_test
        outcome = wilcoxon_test([0.1, 0.2, 0.3], [0.1, 0.2, 0.3])
        # every difference is 0, so all pairs are filtered out ("wilcox" mode)
        assert outcome["n_pairs"] == 0

    def test_mismatched_lengths_raises(self):
        """Series of different lengths raise ``ValueError``."""
        from picarones.core.statistics import wilcoxon_test
        with pytest.raises(ValueError):
            wilcoxon_test([0.1, 0.2], [0.1])

    def test_w_plus_w_minus_present(self):
        """The result carries both ``W_plus`` and ``W_minus``."""
        from picarones.core.statistics import wilcoxon_test
        first = [0.1, 0.2, 0.3, 0.15, 0.25, 0.18, 0.12, 0.22, 0.08, 0.27]
        second = [0.2, 0.3, 0.4, 0.25, 0.35, 0.28, 0.22, 0.32, 0.18, 0.37]
        outcome = wilcoxon_test(first, second)
        assert "W_plus" in outcome
        assert "W_minus" in outcome

    def test_significant_larger_sample(self):
        """Two well-separated random samples of 15 values are significant."""
        from picarones.core.statistics import wilcoxon_test
        import random
        generator = random.Random(0)
        # Draw order matters: the low sample is generated before the high one.
        low = [generator.uniform(0.0, 0.05) for _ in range(15)]
        high = [generator.uniform(0.3, 0.7)  for _ in range(15)]
        outcome = wilcoxon_test(low, high)
        assert outcome["significant"]

    def test_symmetry(self):
        """Swapping the two series leaves p-value and significance unchanged."""
        from picarones.core.statistics import wilcoxon_test
        first = [0.1, 0.2, 0.3, 0.15, 0.25, 0.18, 0.22, 0.08, 0.27, 0.14]
        second = [0.2, 0.3, 0.4, 0.25, 0.35, 0.28, 0.32, 0.18, 0.37, 0.24]
        forward = wilcoxon_test(first, second)
        backward = wilcoxon_test(second, first)
        assert forward["p_value"] == pytest.approx(backward["p_value"], abs=1e-6)
        assert forward["significant"] == backward["significant"]


# ===========================================================================
# TestPairwiseStats
# ===========================================================================

class TestPairwiseStats:
    """Tests for ``compute_pairwise_stats`` (pairwise test matrix)."""

    def test_returns_list(self):
        """The result is a list."""
        from picarones.core.statistics import compute_pairwise_stats
        scores = {"A": [0.1, 0.2], "B": [0.3, 0.4]}
        assert isinstance(compute_pairwise_stats(scores), list)

    def test_correct_pair_count_2_engines(self):
        """Two engines give exactly one pair."""
        from picarones.core.statistics import compute_pairwise_stats
        pairs = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5})
        assert len(pairs) == 1

    def test_correct_pair_count_3_engines(self):
        """Three engines give exactly three pairs."""
        from picarones.core.statistics import compute_pairwise_stats
        scores = {"A": [0.1]*5, "B": [0.2]*5, "C": [0.3]*5}
        pairs = compute_pairwise_stats(scores)
        assert len(pairs) == 3

    def test_pair_has_engine_names(self):
        """Each side of the first pair names one of the input engines."""
        from picarones.core.statistics import compute_pairwise_stats
        first_pair = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5})[0]
        known_engines = ["A", "B"]
        assert first_pair["engine_a"] in known_engines
        assert first_pair["engine_b"] in known_engines

    def test_pair_has_p_value(self):
        """The first pair carries a ``p_value`` entry."""
        from picarones.core.statistics import compute_pairwise_stats
        first_pair = compute_pairwise_stats({"A": [0.1]*5, "B": [0.2]*5})[0]
        assert "p_value" in first_pair

    def test_single_engine_returns_empty(self):
        """A single engine yields no pair at all."""
        from picarones.core.statistics import compute_pairwise_stats
        pairs = compute_pairwise_stats({"A": [0.1]*5})
        assert pairs == []


# ===========================================================================
# TestReliabilityCurve
# ===========================================================================

class TestReliabilityCurve:
    """Tests for ``compute_reliability_curve`` (reliability curves)."""

    def test_returns_list(self):
        """The curve is returned as a list."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([0.1, 0.2, 0.3])
        assert isinstance(curve, list)

    def test_correct_number_of_steps(self):
        """``steps=5`` produces five points."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([0.1]*10, steps=5)
        assert len(curve) == 5

    def test_pct_docs_increases(self):
        """``pct_docs`` values are already in ascending order."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([0.1, 0.2, 0.3, 0.4, 0.5], steps=5)
        percentages = [point["pct_docs"] for point in curve]
        assert percentages == sorted(percentages)

    def test_mean_cer_increases(self):
        """The first point's mean CER does not exceed the last point's."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([0.05, 0.10, 0.20, 0.30, 0.50], steps=5)
        mean_cers = [point["mean_cer"] for point in curve]
        first_cer, last_cer = mean_cers[0], mean_cers[-1]
        assert first_cer <= last_cer

    def test_empty_returns_empty(self):
        """No input values means no curve points."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([])
        assert curve == []

    def test_last_point_includes_all(self):
        """The last point's mean CER equals the mean of all values."""
        from picarones.core.statistics import compute_reliability_curve
        values = [0.1, 0.2, 0.3]
        final_point = compute_reliability_curve(values, steps=4)[-1]
        overall_mean = sum(values) / len(values)
        assert final_point["mean_cer"] == pytest.approx(overall_mean, rel=1e-4)

    def test_each_point_has_required_keys(self):
        """Every point carries ``pct_docs`` and ``mean_cer``."""
        from picarones.core.statistics import compute_reliability_curve
        curve = compute_reliability_curve([0.1, 0.2, 0.3], steps=3)
        for point in curve:
            assert "pct_docs" in point
            assert "mean_cer" in point


# ===========================================================================
# TestVennData
# ===========================================================================

class TestVennData:
    """Tests for ``compute_venn_data`` (2-set and 3-set Venn diagrams)."""

    def test_venn2_type(self):
        """Two sets are reported as ``venn2``."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({"A": {"e1","e2"}, "B": {"e2","e3"}})
        assert venn["type"] == "venn2"

    def test_venn3_type(self):
        """Three sets are reported as ``venn3``."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({"A": {"e1"}, "B": {"e2"}, "C": {"e3"}})
        assert venn["type"] == "venn3"

    def test_venn2_counts_correct(self):
        """Overlapping sets: one element exclusive to each side, two shared."""
        from picarones.core.statistics import compute_venn_data
        error_sets = {"A": {"e1","e2","e3"}, "B": {"e2","e3","e4"}}
        venn = compute_venn_data(error_sets)
        assert venn["only_a"] == 1
        assert venn["only_b"] == 1
        assert venn["both"] == 2

    def test_venn2_disjoint(self):
        """Disjoint sets share nothing and keep one element each."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({"A": {"e1"}, "B": {"e2"}})
        assert venn["both"] == 0
        assert venn["only_a"] == 1
        assert venn["only_b"] == 1

    def test_venn2_subset(self):
        """When A is a subset of B, nothing is exclusive to A."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({"A": {"e1","e2"}, "B": {"e1","e2","e3"}})
        assert venn["only_a"] == 0

    def test_venn3_abc_count(self):
        """Three identical sets of two elements give ``abc == 2``."""
        from picarones.core.statistics import compute_venn_data
        common = {"e1","e2"}
        # The very same set object is used for all three engines.
        venn = compute_venn_data(dict.fromkeys(("A", "B", "C"), common))
        assert venn["abc"] == 2

    def test_empty_returns_empty(self):
        """An empty mapping yields an empty dict."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({})
        assert venn == {}

    def test_labels_present(self):
        """The engine names are echoed as ``label_a`` and ``label_b``."""
        from picarones.core.statistics import compute_venn_data
        venn = compute_venn_data({"moteur_a": {"e1"}, "moteur_b": {"e2"}})
        assert venn["label_a"] == "moteur_a"
        assert venn["label_b"] == "moteur_b"


# ===========================================================================
# TestErrorClustering
# ===========================================================================

class TestErrorClustering:
    """Tests for ``cluster_errors`` (clustering of error patterns).

    Structural checks are applied to every returned cluster rather than only
    the first one, and the ordering check covers the whole list.
    """

    def _sample_data(self):
        """Return ground-truth/hypothesis records for three engines.

        Each record has the keys ``engine``, ``gt`` and ``hypothesis``. The
        two ``pero`` records have a hypothesis identical to the ground truth;
        the others differ from it.
        """
        return [
            {"engine": "tesseract", "gt": "maiſtre Froiſſart", "hypothesis": "maiftre Froiffart"},
            {"engine": "tesseract", "gt": "nostre seigneur", "hypothesis": "noltre leigneur"},
            {"engine": "pero", "gt": "regnoit en France", "hypothesis": "regnoit en France"},
            {"engine": "pero", "gt": "en l'an de grace", "hypothesis": "en l'an de grace"},
            {"engine": "mauvais", "gt": "icy commence le prologue", "hypothesis": "icy conmence le prologue"},
            {"engine": "mauvais", "gt": "par la grace de Dieu", "hypothesis": "par la grce de Dieu"},
        ]

    def test_returns_list(self):
        """The clusters come back as a list."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        assert isinstance(result, list)

    def test_max_clusters_respected(self):
        """No more than ``max_clusters`` clusters are returned."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data(), max_clusters=3)
        assert len(result) <= 3

    def test_cluster_has_required_keys(self):
        """Every cluster exposes the four expected attributes."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        for cluster in result:
            for attr in ("cluster_id", "label", "count", "examples"):
                assert hasattr(cluster, attr)

    def test_as_dict_method(self):
        """``as_dict()`` of every cluster contains the four expected keys."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        for cluster in result:
            d = cluster.as_dict()
            for key in ("cluster_id", "label", "count", "examples"):
                assert key in d

    def test_sorted_by_count_descending(self):
        """Cluster counts are non-increasing over the whole list."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        counts = [c.count for c in result]
        # Compare the full sequence, not just the first two clusters.
        assert counts == sorted(counts, reverse=True)

    def test_examples_capped_at_5(self):
        """No cluster serialises more than five examples."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        for c in result:
            assert len(c.as_dict()["examples"]) <= 5

    def test_empty_data_returns_empty(self):
        """No input records means no clusters."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors([])
        assert result == []

    def test_cluster_id_unique(self):
        """Cluster identifiers are pairwise distinct."""
        from picarones.core.statistics import cluster_errors
        result = cluster_errors(self._sample_data())
        ids = [c.cluster_id for c in result]
        assert len(ids) == len(set(ids))


# ===========================================================================
# TestCorrelationMatrix
# ===========================================================================

class TestCorrelationMatrix:
    """Tests for ``compute_correlation_matrix`` (correlation matrix)."""

    def _sample_metrics(self):
        """Return five metric records with ``cer``, ``wer`` and ``quality_score``."""
        cer_values = (0.1, 0.2, 0.05, 0.3, 0.15)
        wer_values = (0.2, 0.35, 0.1, 0.5, 0.25)
        quality_values = (0.8, 0.6, 0.9, 0.5, 0.75)
        return [
            {"cer": cer, "wer": wer, "quality_score": quality}
            for cer, wer, quality in zip(cer_values, wer_values, quality_values)
        ]

    def test_returns_dict_with_labels_and_matrix(self):
        """The result carries both ``labels`` and ``matrix``."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        assert "labels" in corr
        assert "matrix" in corr

    def test_matrix_is_square(self):
        """The matrix has one row and one column per label."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        size = len(corr["labels"])
        assert len(corr["matrix"]) == size
        for matrix_row in corr["matrix"]:
            assert len(matrix_row) == size

    def test_diagonal_is_one(self):
        """Every diagonal entry is approximately 1."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        for idx, _ in enumerate(corr["labels"]):
            assert corr["matrix"][idx][idx] == pytest.approx(1.0)

    def test_cer_quality_negatively_correlated(self):
        """CER and quality score correlate negatively when both are present."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        names = corr["labels"]
        if "cer" not in names or "quality_score" not in names:
            return
        row = names.index("cer")
        col = names.index("quality_score")
        # the better the quality, the lower the CER
        assert corr["matrix"][row][col] < 0

    def test_symmetric_matrix(self):
        """``matrix[i][j]`` matches ``matrix[j][i]`` for every index pair."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        size = len(corr["labels"])
        for row in range(size):
            for col in range(size):
                mirrored = corr["matrix"][col][row]
                assert corr["matrix"][row][col] == pytest.approx(mirrored, abs=1e-6)

    def test_empty_returns_empty(self):
        """No records yields empty labels and an empty matrix."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix([])
        assert corr == {"labels": [], "matrix": []}

    def test_custom_metric_keys(self):
        """``metric_keys`` selects which labels are reported."""
        from picarones.core.statistics import compute_correlation_matrix
        # Five references to one and the same record.
        records = [{"a": 1.0, "b": 2.0, "c": 3.0}] * 5
        corr = compute_correlation_matrix(records, metric_keys=["a", "b"])
        assert corr["labels"] == ["a", "b"]

    def test_values_in_range(self):
        """Every coefficient lies in ``[-1, 1]``."""
        from picarones.core.statistics import compute_correlation_matrix
        corr = compute_correlation_matrix(self._sample_metrics())
        for coefficient in (v for matrix_row in corr["matrix"] for v in matrix_row):
            assert -1.0 <= coefficient <= 1.0


# ===========================================================================
# TestDifficultyScore
# ===========================================================================

class TestDifficultyScore:
    """Tests for ``compute_difficulty_score`` and ``difficulty_label``."""

    def test_returns_difficulty_score(self):
        """The function returns a ``DifficultyScore`` instance."""
        from picarones.core.difficulty import compute_difficulty_score
        from picarones.core.difficulty import DifficultyScore
        scored = compute_difficulty_score("doc1", "maiſtre Froiſſart", [0.1, 0.2, 0.3])
        assert isinstance(scored, DifficultyScore)

    def test_score_in_range(self):
        """The score lies in ``[0, 1]``."""
        from picarones.core.difficulty import compute_difficulty_score
        scored = compute_difficulty_score("doc1", "hello world", [0.1, 0.2])
        assert 0.0 <= scored.score <= 1.0

    def test_more_variance_higher_score(self):
        """More dispersed CER values give a higher score."""
        from picarones.core.difficulty import compute_difficulty_score
        steady = compute_difficulty_score("doc1", "hello", [0.1, 0.1, 0.1])
        spread = compute_difficulty_score("doc1", "hello", [0.0, 0.5, 1.0])
        assert spread.score > steady.score

    def test_bad_quality_image_harder(self):
        """A lower image quality score gives a higher difficulty score."""
        from picarones.core.difficulty import compute_difficulty_score
        sharp = compute_difficulty_score("doc1", "hello", [0.1], image_quality_score=0.9)
        blurry = compute_difficulty_score("doc1", "hello", [0.1], image_quality_score=0.1)
        assert blurry.score > sharp.score

    def test_special_chars_increase_difficulty(self):
        """Text with special characters scores higher than plain text."""
        from picarones.core.difficulty import compute_difficulty_score
        modern = compute_difficulty_score("doc1", "hello world plain text", [0.1])
        historical = compute_difficulty_score("doc1", "maiſtre Froiſſart ꝑ &", [0.1])
        assert historical.score > modern.score

    def test_components_present(self):
        """The three component attributes exist on the result."""
        from picarones.core.difficulty import compute_difficulty_score
        scored = compute_difficulty_score("doc1", "text", [0.1, 0.2])
        for component in ("variance_component", "quality_component", "density_component"):
            assert hasattr(scored, component)

    def test_as_dict_has_doc_id(self):
        """``as_dict()`` echoes the document identifier."""
        from picarones.core.difficulty import compute_difficulty_score
        serialised = compute_difficulty_score("folio_001", "text", [0.1]).as_dict()
        assert serialised["doc_id"] == "folio_001"

    def test_as_dict_rounded(self):
        """``as_dict()`` exposes the score as a float."""
        from picarones.core.difficulty import compute_difficulty_score
        serialised = compute_difficulty_score("doc1", "text", [0.1]).as_dict()
        assert isinstance(serialised["score"], float)

    def test_no_engines_gives_low_variance(self):
        """With no CER values at all, the CER variance is 0."""
        from picarones.core.difficulty import compute_difficulty_score
        scored = compute_difficulty_score("doc1", "text", [])
        assert scored.cer_variance == 0.0

    def test_difficulty_label(self):
        """Representative scores map to the four expected labels."""
        from picarones.core.difficulty import difficulty_label
        expectations = (
            (0.1, "Facile"),
            (0.35, "Modéré"),
            (0.6, "Difficile"),
            (0.9, "Très difficile"),
        )
        for value, expected in expectations:
            assert difficulty_label(value) == expected


# ===========================================================================
# TestAllDifficulties
# ===========================================================================

class TestAllDifficulties:
    """Tests for ``compute_all_difficulties`` applied to a corpus."""

    def test_returns_dict(self):
        """The result is a dict."""
        from picarones.core.difficulty import compute_all_difficulties
        doc_ids = ["doc1", "doc2"]
        ground_truths = {"doc1": "hello", "doc2": "world"}
        cer_by_doc = {"doc1": {"A": 0.1}, "doc2": {"A": 0.2}}
        scores = compute_all_difficulties(doc_ids, ground_truths, cer_by_doc)
        assert isinstance(scores, dict)

    def test_all_docs_present(self):
        """Every document identifier appears as a key, and nothing else."""
        from picarones.core.difficulty import compute_all_difficulties
        doc_ids = ["d1", "d2", "d3"]
        ground_truths = {"d1": "a", "d2": "b", "d3": "c"}
        cer_by_doc = {"d1": {"E": 0.1}, "d2": {"E": 0.2}, "d3": {"E": 0.3}}
        scores = compute_all_difficulties(doc_ids, ground_truths, cer_by_doc)
        assert set(scores.keys()) == {"d1", "d2", "d3"}

    def test_scores_in_range(self):
        """Every score lies in ``[0, 1]``."""
        from picarones.core.difficulty import compute_all_difficulties
        doc_ids = ["d1", "d2"]
        ground_truths = {"d1": "maiſtre Jean", "d2": "simple text"}
        cer_by_doc = {"d1": {"A": 0.1, "B": 0.5}, "d2": {"A": 0.1, "B": 0.1}}
        scores = compute_all_difficulties(doc_ids, ground_truths, cer_by_doc)
        for scored in scores.values():
            assert 0.0 <= scored.score <= 1.0

    def test_with_image_quality(self):
        """An image quality of 0.3 gives a quality component above 0.5."""
        from picarones.core.difficulty import compute_all_difficulties
        scores = compute_all_difficulties(
            ["d1"],
            {"d1": "text"},
            {"d1": {"A": 0.1}},
            image_quality_map={"d1": 0.3},
        )
        assert "d1" in scores
        # degraded quality → high component
        assert scores["d1"].quality_component > 0.5

    def test_empty_corpus(self):
        """An empty corpus yields an empty dict."""
        from picarones.core.difficulty import compute_all_difficulties
        scores = compute_all_difficulties([], {}, {})
        assert scores == {}

    def test_missing_gt_handled(self):
        """A document without ground truth still gets an entry."""
        from picarones.core.difficulty import compute_all_difficulties
        no_ground_truth = {}  # missing GT
        scores = compute_all_difficulties(["d1"], no_ground_truth, {"d1": {"A": 0.2}})
        assert "d1" in scores


# ===========================================================================
# TestReportDataSprint7
# ===========================================================================

class TestReportDataSprint7:
    """Tests that the built report data contains the Sprint 7 entries."""

    def test_has_statistics_key(self, report_data_s7):
        """``statistics`` is a top-level key."""
        assert "statistics" in report_data_s7

    def test_has_reliability_curves(self, report_data_s7):
        """``reliability_curves`` is a top-level key."""
        assert "reliability_curves" in report_data_s7

    def test_has_venn_data(self, report_data_s7):
        """``venn_data`` is a top-level key."""
        assert "venn_data" in report_data_s7

    def test_has_error_clusters(self, report_data_s7):
        """``error_clusters`` is a top-level key."""
        assert "error_clusters" in report_data_s7

    def test_has_correlation_per_engine(self, report_data_s7):
        """``correlation_per_engine`` is a top-level key."""
        assert "correlation_per_engine" in report_data_s7

    def test_pairwise_wilcoxon_non_empty(self, report_data_s7):
        """At least one pairwise Wilcoxon entry is present."""
        pairwise = report_data_s7["statistics"]["pairwise_wilcoxon"]
        assert len(pairwise) > 0

    def test_bootstrap_cis_count(self, report_data_s7):
        """There are as many bootstrap CIs as engines."""
        intervals = report_data_s7["statistics"]["bootstrap_cis"]
        engine_count = len(report_data_s7["engines"])
        assert len(intervals) == engine_count

    def test_documents_have_difficulty_score(self, report_data_s7):
        """Every document has a difficulty score in ``[0, 1]``."""
        for document in report_data_s7["documents"]:
            assert "difficulty_score" in document
            assert 0.0 <= document["difficulty_score"] <= 1.0

    def test_documents_have_difficulty_label(self, report_data_s7):
        """Every document has one of the four known difficulty labels."""
        known_labels = ("Facile", "Modéré", "Difficile", "Très difficile")
        for document in report_data_s7["documents"]:
            assert "difficulty_label" in document
            assert document["difficulty_label"] in known_labels

    def test_reliability_curves_count(self, report_data_s7):
        """There are as many reliability curves as engines."""
        curves = report_data_s7["reliability_curves"]
        assert len(curves) == len(report_data_s7["engines"])

    def test_reliability_curves_have_points(self, report_data_s7):
        """Every curve names its engine and has at least one point."""
        for curve in report_data_s7["reliability_curves"]:
            assert "engine" in curve
            assert "points" in curve
            assert len(curve["points"]) > 0

    def test_correlation_matrix_symmetric(self, report_data_s7):
        """Each per-engine correlation matrix equals its transpose."""
        for entry in report_data_s7["correlation_per_engine"]:
            matrix = entry["matrix"]
            size = len(matrix)
            for row in range(size):
                for col in range(size):
                    mirrored = matrix[col][row]
                    assert matrix[row][col] == pytest.approx(mirrored, abs=1e-5)


# ===========================================================================
# TestHTMLSprint7Features
# ===========================================================================

class TestHTMLSprint7Features:
    """Tests that the generated HTML contains the Sprint 7 feature markers."""

    def test_html_contains_export_csv_button(self, html_s7):
        """The page mentions ``exportCSV`` or ``CSV``."""
        assert any(marker in html_s7 for marker in ("exportCSV", "CSV"))

    def test_html_contains_presentation_mode_button(self, html_s7):
        """The page mentions ``togglePresentMode`` or ``Présentation``."""
        assert any(marker in html_s7 for marker in ("togglePresentMode", "Présentation"))

    def test_html_contains_reliability_chart(self, html_s7):
        """The page contains the ``chart-reliability`` marker."""
        marker = "chart-reliability"
        assert marker in html_s7

    def test_html_contains_bootstrap_ci_chart(self, html_s7):
        """The page contains the ``chart-bootstrap-ci`` marker."""
        marker = "chart-bootstrap-ci"
        assert marker in html_s7

    def test_html_contains_venn_container(self, html_s7):
        """The page contains the ``venn-container`` marker."""
        marker = "venn-container"
        assert marker in html_s7

    def test_html_contains_wilcoxon_table(self, html_s7):
        """The page contains the ``wilcoxon-table`` marker."""
        marker = "wilcoxon-table"
        assert marker in html_s7

    def test_html_contains_error_clusters(self, html_s7):
        """The page contains the ``error-clusters`` marker."""
        marker = "error-clusters"
        assert marker in html_s7

    def test_html_contains_correlation_matrix(self, html_s7):
        """The page has ``corr-matrix`` or, case-insensitively, ``correlation``."""
        has_marker = "corr-matrix" in html_s7
        assert has_marker or "correlation" in html_s7.lower()

    def test_html_contains_difficulty_badge(self, html_s7):
        """The page has ``difficulty`` (case-insensitively) or ``diff-badge``."""
        has_word = "difficulty" in html_s7.lower()
        assert has_word or "diff-badge" in html_s7

    def test_html_contains_url_state(self, html_s7):
        """The page mentions ``updateURL`` or ``history.replaceState``."""
        assert any(marker in html_s7 for marker in ("updateURL", "history.replaceState"))