Spaces:
Running
audit scientifique: micro-CER/WER, Wilcoxon exact, alignement Levenshtein
Browse files
Corrige quatre défauts de fiabilité scientifique identifiés à l'audit
(le chemin de production par défaut est sans scipy → fallbacks natifs).
F1 — Agrégation corpus macro-aveugle à la longueur. MetricsResult
stocke désormais les comptes bruts de l'alignement minimal jiwer
(cer_errors/cer_ref_chars, wer_errors/wer_ref_words) ; aggregate_metrics
expose cer_micro/wer_micro = Σ erreurs / Σ unités_référence (standard
ICDAR/OCR-D/HTR-United). ranking() et stratified_ranking() trient
désormais sur le micro-CER (repli médiane→moyenne). mean/median
conservés comme diagnostics de dispersion. Valeurs CER/WER
inchangées (process_characters().cer ≡ jiwer.cer).
F2 — Wilcoxon natif renvoyait des p-values fabriquées {0.04, 0.20} et
un faux positif "significatif" pour n ≤ 5 (impossible à 5 % bilatéral).
Remplacé par la distribution nulle exacte de W⁺ (DP sur 2ⁿ, n ≤ 25
sans ex-aequo), approximation normale corrigée des ex-aequo sinon.
Vérifié contre les tables (n=8 W=3 → 0.0390625, etc.).
F4 — Matrice de confusion et diffs (_diff_utils) passaient par
difflib (Ratcliff–Obershelp, non minimal) tout en annonçant
"Levenshtein" ; comptes S/D/I incohérents avec le CER affiché à côté.
Bascule sur rapidfuzz.distance.Levenshtein ; blocs replace garantis
de longueur égale → suppression de l'heuristique _align_segments.
S+D+I de la matrice = distance d'édition exacte.
F9 — Correction de continuité Wilcoxon ramenée à (|W−μ|−½)/σ bornée
à 0 ; plus de double retrait des zéros avant scipy.
F10 — char_exclude appliqué avant le court-circuit des cas vides.
Tests : régression dédiée tests/evaluation/test_scientific_audit_2026.py
(F1/F2/F4/F9) ; test_sprint44 révisé (micro par défaut, médiane=repli) ;
golden benchmark_result régénéré (champs micro additifs) ; budget LOC
benchmark_result relevé.
https://claude.ai/code/session_01KTzTK55Hxu8AR72xJUjcUW
- picarones/evaluation/_diff_utils.py +20 -18
- picarones/evaluation/benchmark_result.py +102 -35
- picarones/evaluation/metric_result.py +69 -0
- picarones/evaluation/metrics/confusion.py +27 -69
- picarones/evaluation/metrics/text_metrics.py +32 -10
- picarones/evaluation/statistics/wilcoxon.py +108 -33
- tests/architecture/test_file_budgets.py +1 -1
- tests/evaluation/metrics/test_sprint44_median_default.py +112 -33
- tests/evaluation/test_scientific_audit_2026.py +203 -0
- tests/golden/fixtures/benchmark_result_v2.json +6 -2
|
@@ -20,19 +20,19 @@ au niveau du package).
|
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
-
import difflib
|
| 24 |
-
import re
|
| 25 |
from typing import Any
|
| 26 |
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
def compute_word_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
|
@@ -53,12 +53,13 @@ def compute_word_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
|
| 53 |
ref_tokens = reference.split()
|
| 54 |
hyp_tokens = hypothesis.split()
|
| 55 |
|
| 56 |
-
matcher = difflib.SequenceMatcher(
|
| 57 |
-
None, ref_tokens, hyp_tokens, autojunk=False,
|
| 58 |
-
)
|
| 59 |
ops: list[dict[str, Any]] = []
|
| 60 |
|
| 61 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
ref_chunk = " ".join(ref_tokens[i1:i2])
|
| 63 |
hyp_chunk = " ".join(hyp_tokens[j1:j2])
|
| 64 |
|
|
@@ -76,12 +77,13 @@ def compute_word_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
|
| 76 |
|
| 77 |
def compute_char_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
| 78 |
"""Diff caractΓ¨re par caractΓ¨re β utile pour les tokens courts."""
|
| 79 |
-
matcher = difflib.SequenceMatcher(
|
| 80 |
-
None, list(reference), list(hypothesis), autojunk=False,
|
| 81 |
-
)
|
| 82 |
ops: list[dict[str, Any]] = []
|
| 83 |
|
| 84 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
ref_chunk = reference[i1:i2]
|
| 86 |
hyp_chunk = hypothesis[j1:j2]
|
| 87 |
if tag == "equal":
|
|
|
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
|
|
|
|
|
|
| 23 |
from typing import Any
|
| 24 |
|
| 25 |
+
from rapidfuzz.distance import Levenshtein
|
| 26 |
|
| 27 |
+
# Audit scientifique F4 β l'alignement utilise la distance de
|
| 28 |
+
# **Levenshtein** (rapidfuzz, coΓ»ts substitution = insertion =
|
| 29 |
+
# suppression = 1), et non plus ``difflib.SequenceMatcher``
|
| 30 |
+
# (RatcliffβObershelp, qui maximise les blocs communs et ne minimise
|
| 31 |
+
# pas le nombre d'Γ©ditions). ConsΓ©quence : le diff affichΓ©, les
|
| 32 |
+
# ensembles de Venn et les clusters d'erreurs sont dΓ©sormais
|
| 33 |
+
# **cohΓ©rents avec le CER/WER** (jiwer, lui aussi Levenshtein) montrΓ©s
|
| 34 |
+
# Γ cΓ΄tΓ©. Auparavant deux algorithmes diffΓ©rents produisaient des
|
| 35 |
+
# comptes contradictoires dans le mΓͺme rapport.
|
| 36 |
|
| 37 |
|
| 38 |
def compute_word_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
|
|
|
| 53 |
ref_tokens = reference.split()
|
| 54 |
hyp_tokens = hypothesis.split()
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
ops: list[dict[str, Any]] = []
|
| 57 |
|
| 58 |
+
for op in Levenshtein.opcodes(ref_tokens, hyp_tokens):
|
| 59 |
+
i1, i2, j1, j2 = (
|
| 60 |
+
op.src_start, op.src_end, op.dest_start, op.dest_end,
|
| 61 |
+
)
|
| 62 |
+
tag = op.tag
|
| 63 |
ref_chunk = " ".join(ref_tokens[i1:i2])
|
| 64 |
hyp_chunk = " ".join(hyp_tokens[j1:j2])
|
| 65 |
|
|
|
|
| 77 |
|
| 78 |
def compute_char_diff(reference: str, hypothesis: str) -> list[dict[str, Any]]:
|
| 79 |
"""Diff caractΓ¨re par caractΓ¨re β utile pour les tokens courts."""
|
|
|
|
|
|
|
|
|
|
| 80 |
ops: list[dict[str, Any]] = []
|
| 81 |
|
| 82 |
+
for op in Levenshtein.opcodes(reference, hypothesis):
|
| 83 |
+
i1, i2, j1, j2 = (
|
| 84 |
+
op.src_start, op.src_end, op.dest_start, op.dest_end,
|
| 85 |
+
)
|
| 86 |
+
tag = op.tag
|
| 87 |
ref_chunk = reference[i1:i2]
|
| 88 |
hyp_chunk = hypothesis[j1:j2]
|
| 89 |
if tag == "equal":
|
|
@@ -382,6 +382,26 @@ class EngineReport:
|
|
| 382 |
[dr.metrics for dr in self.document_results]
|
| 383 |
)
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
@property
|
| 386 |
def mean_cer(self) -> Optional[float]:
|
| 387 |
cer_stats = self.aggregated_metrics.get("cer", {})
|
|
@@ -540,27 +560,40 @@ class BenchmarkResult:
|
|
| 540 |
)
|
| 541 |
|
| 542 |
def ranking(self) -> list[dict]:
|
| 543 |
-
"""
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
"""
|
| 559 |
ranked = []
|
| 560 |
for report in self.engine_reports:
|
| 561 |
ranked.append(
|
| 562 |
{
|
| 563 |
"engine": report.engine_name,
|
|
|
|
|
|
|
| 564 |
"mean_cer": report.mean_cer,
|
| 565 |
"median_cer": report.median_cer,
|
| 566 |
"mean_wer": report.mean_wer,
|
|
@@ -570,8 +603,11 @@ class BenchmarkResult:
|
|
| 570 |
)
|
| 571 |
|
| 572 |
def _sort_key(entry: dict) -> tuple:
|
| 573 |
-
# PrioritΓ© :
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
| 575 |
if primary is None:
|
| 576 |
primary = entry.get("mean_cer")
|
| 577 |
return (primary is None, primary if primary is not None else float("inf"))
|
|
@@ -635,22 +671,39 @@ class BenchmarkResult:
|
|
| 635 |
# ``Optional[float]`` ; le double filtre ``error is None``
|
| 636 |
# garantit ``cer/wer is not None`` par convention, mais on
|
| 637 |
# le filtre explicitement aussi pour que mypy le voie.
|
| 638 |
-
|
| 639 |
-
dr.metrics
|
| 640 |
for dr in report.document_results
|
| 641 |
if dr.doc_id in doc_ids
|
| 642 |
and dr.metrics is not None
|
| 643 |
and dr.metrics.error is None
|
| 644 |
-
|
|
|
|
|
|
|
| 645 |
]
|
| 646 |
wers: list[float] = [
|
| 647 |
-
|
| 648 |
-
for dr in report.document_results
|
| 649 |
-
if dr.doc_id in doc_ids
|
| 650 |
-
and dr.metrics is not None
|
| 651 |
-
and dr.metrics.error is None
|
| 652 |
-
and dr.metrics.wer is not None
|
| 653 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
failed = sum(
|
| 655 |
1 for dr in report.document_results
|
| 656 |
if dr.doc_id in doc_ids
|
|
@@ -660,6 +713,8 @@ class BenchmarkResult:
|
|
| 660 |
if not cers:
|
| 661 |
entries.append({
|
| 662 |
"engine": report.engine_name,
|
|
|
|
|
|
|
| 663 |
"mean_cer": None,
|
| 664 |
"median_cer": None,
|
| 665 |
"mean_wer": None,
|
|
@@ -669,6 +724,8 @@ class BenchmarkResult:
|
|
| 669 |
continue
|
| 670 |
entries.append({
|
| 671 |
"engine": report.engine_name,
|
|
|
|
|
|
|
| 672 |
"mean_cer": _stats.mean(cers),
|
| 673 |
"median_cer": _stats.median(cers),
|
| 674 |
"mean_wer": _stats.mean(wers) if wers else None,
|
|
@@ -677,7 +734,9 @@ class BenchmarkResult:
|
|
| 677 |
})
|
| 678 |
|
| 679 |
def _sort_key(entry: dict) -> tuple:
|
| 680 |
-
primary = entry.get("
|
|
|
|
|
|
|
| 681 |
if primary is None:
|
| 682 |
primary = entry.get("mean_cer")
|
| 683 |
return (primary is None, primary if primary is not None else float("inf"))
|
|
@@ -711,24 +770,32 @@ class BenchmarkResult:
|
|
| 711 |
return None
|
| 712 |
|
| 713 |
global_ranking = self.ranking()
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
if not valid:
|
| 719 |
return None
|
| 720 |
leader = valid[0]["engine"]
|
| 721 |
|
| 722 |
-
# CER
|
|
|
|
| 723 |
per_stratum: dict[str, float] = {}
|
| 724 |
for stratum, entries in strata_rankings.items():
|
| 725 |
for entry in entries:
|
| 726 |
if entry["engine"] != leader:
|
| 727 |
continue
|
| 728 |
-
|
| 729 |
-
if
|
| 730 |
continue
|
| 731 |
-
per_stratum[stratum] =
|
| 732 |
break
|
| 733 |
|
| 734 |
if len(per_stratum) < 2:
|
|
|
|
| 382 |
[dr.metrics for dr in self.document_results]
|
| 383 |
)
|
| 384 |
|
| 385 |
+
@property
|
| 386 |
+
def micro_cer(self) -> Optional[float]:
|
| 387 |
+
"""CER **micro-moyennΓ©** corpus = Ξ£ distance_Γ©dition / Ξ£ car_rΓ©fΓ©rence.
|
| 388 |
+
|
| 389 |
+
Audit scientifique F1 — métrique d'agrégation standard du domaine
|
| 390 |
+
OCR/HTR (ICDAR, OCR-D, HTR-United, Transkribus, eScriptorium).
|
| 391 |
+
Contrairement à ``mean_cer`` / ``median_cer`` (macro, aveugles à
|
| 392 |
+
la longueur), elle pondère chaque document par son nombre de
|
| 393 |
+
caractères : une page de 5 000 caractères pèse 500à une légende
|
| 394 |
+
de 10. C'est le critère de tri par défaut de ``ranking()``.
|
| 395 |
+
``None`` si aucun document n'a de comptes bruts (jiwer absent,
|
| 396 |
+
rΓ©fΓ©rences vides).
|
| 397 |
+
"""
|
| 398 |
+
return self.aggregated_metrics.get("cer_micro", {}).get("value")
|
| 399 |
+
|
| 400 |
+
@property
|
| 401 |
+
def micro_wer(self) -> Optional[float]:
|
| 402 |
+
"""WER micro-moyennΓ© corpus = Ξ£ erreurs_mot / Ξ£ mots_rΓ©fΓ©rence."""
|
| 403 |
+
return self.aggregated_metrics.get("wer_micro", {}).get("value")
|
| 404 |
+
|
| 405 |
@property
|
| 406 |
def mean_cer(self) -> Optional[float]:
|
| 407 |
cer_stats = self.aggregated_metrics.get("cer", {})
|
|
|
|
| 560 |
)
|
| 561 |
|
| 562 |
def ranking(self) -> list[dict]:
|
| 563 |
+
"""Classement des moteurs triΓ© par **CER micro-moyennΓ©** croissant.
|
| 564 |
+
|
| 565 |
+
Audit scientifique F1 (mai 2026) β le tri par dΓ©faut bascule vers
|
| 566 |
+
le **micro-CER** (Σ distance_édition / Σ caractères_référence),
|
| 567 |
+
mΓ©trique d'agrΓ©gation standard du domaine OCR/HTR (ICDAR, OCR-D,
|
| 568 |
+
HTR-United, Transkribus, eScriptorium). C'est la seule agrΓ©gation
|
| 569 |
+
dΓ©fendable scientifiquement comme chiffre d'en-tΓͺte : elle
|
| 570 |
+
pondère chaque document par sa longueur, là où une moyenne ou une
|
| 571 |
+
mΓ©diane de taux par document donne le mΓͺme poids Γ une lΓ©gende de
|
| 572 |
+
10 caractères et à une page de 5 000 et peut inverser le
|
| 573 |
+
classement rΓ©el des moteurs.
|
| 574 |
+
|
| 575 |
+
Historique : Sprint 44 avait basculé moyenne → médiane pour la
|
| 576 |
+
robustesse Γ l'asymΓ©trie des corpus patrimoniaux. Le diagnostic
|
| 577 |
+
de fond (la *moyenne* est tirΓ©e par quelques documents
|
| 578 |
+
catastrophiques) est exact, mais la *rΓ©ponse* correcte n'est pas
|
| 579 |
+
la mΓ©diane de taux (toujours aveugle Γ la longueur) : c'est le
|
| 580 |
+
micro-CER. ``mean_cer`` et ``median_cer`` restent exposΓ©s dans
|
| 581 |
+
chaque entrΓ©e comme **diagnostics de dispersion** (un grand Γ©cart
|
| 582 |
+
microβmΓ©diane signale une distribution trΓ¨s hΓ©tΓ©rogΓ¨ne β cf.
|
| 583 |
+
détecteur ``median_mean_gap_warning``), pas comme critère de
|
| 584 |
+
classement.
|
| 585 |
+
|
| 586 |
+
Le tri prend ``micro_cer`` quand disponible et retombe sur
|
| 587 |
+
``median_cer`` puis ``mean_cer`` (corpus sans comptes bruts :
|
| 588 |
+
jiwer absent, rΓ©fΓ©rences vides).
|
| 589 |
"""
|
| 590 |
ranked = []
|
| 591 |
for report in self.engine_reports:
|
| 592 |
ranked.append(
|
| 593 |
{
|
| 594 |
"engine": report.engine_name,
|
| 595 |
+
"micro_cer": report.micro_cer,
|
| 596 |
+
"micro_wer": report.micro_wer,
|
| 597 |
"mean_cer": report.mean_cer,
|
| 598 |
"median_cer": report.median_cer,
|
| 599 |
"mean_wer": report.mean_wer,
|
|
|
|
| 603 |
)
|
| 604 |
|
| 605 |
def _sort_key(entry: dict) -> tuple:
|
| 606 |
+
# Priorité scientifique : micro-CER ; repli médiane puis
|
| 607 |
+
# moyenne ; +∞ si rien (moteur sans document exploitable).
|
| 608 |
+
primary = entry.get("micro_cer")
|
| 609 |
+
if primary is None:
|
| 610 |
+
primary = entry.get("median_cer")
|
| 611 |
if primary is None:
|
| 612 |
primary = entry.get("mean_cer")
|
| 613 |
return (primary is None, primary if primary is not None else float("inf"))
|
|
|
|
| 671 |
# ``Optional[float]`` ; le double filtre ``error is None``
|
| 672 |
# garantit ``cer/wer is not None`` par convention, mais on
|
| 673 |
# le filtre explicitement aussi pour que mypy le voie.
|
| 674 |
+
stratum_metrics = [
|
| 675 |
+
dr.metrics
|
| 676 |
for dr in report.document_results
|
| 677 |
if dr.doc_id in doc_ids
|
| 678 |
and dr.metrics is not None
|
| 679 |
and dr.metrics.error is None
|
| 680 |
+
]
|
| 681 |
+
cers: list[float] = [
|
| 682 |
+
m.cer for m in stratum_metrics if m.cer is not None
|
| 683 |
]
|
| 684 |
wers: list[float] = [
|
| 685 |
+
m.wer for m in stratum_metrics if m.wer is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
]
|
| 687 |
+
# Micro-CER/WER de la strate (audit F1) β recalcul depuis
|
| 688 |
+
# les comptes bruts, cohΓ©rent avec ``ranking()`` global.
|
| 689 |
+
tot_ce = sum(
|
| 690 |
+
m.cer_errors for m in stratum_metrics
|
| 691 |
+
if m.cer_errors is not None and m.cer_ref_chars is not None
|
| 692 |
+
)
|
| 693 |
+
tot_cr = sum(
|
| 694 |
+
m.cer_ref_chars for m in stratum_metrics
|
| 695 |
+
if m.cer_errors is not None and m.cer_ref_chars is not None
|
| 696 |
+
)
|
| 697 |
+
tot_we = sum(
|
| 698 |
+
m.wer_errors for m in stratum_metrics
|
| 699 |
+
if m.wer_errors is not None and m.wer_ref_words is not None
|
| 700 |
+
)
|
| 701 |
+
tot_wr = sum(
|
| 702 |
+
m.wer_ref_words for m in stratum_metrics
|
| 703 |
+
if m.wer_errors is not None and m.wer_ref_words is not None
|
| 704 |
+
)
|
| 705 |
+
micro_cer = round(tot_ce / tot_cr, 6) if tot_cr > 0 else None
|
| 706 |
+
micro_wer = round(tot_we / tot_wr, 6) if tot_wr > 0 else None
|
| 707 |
failed = sum(
|
| 708 |
1 for dr in report.document_results
|
| 709 |
if dr.doc_id in doc_ids
|
|
|
|
| 713 |
if not cers:
|
| 714 |
entries.append({
|
| 715 |
"engine": report.engine_name,
|
| 716 |
+
"micro_cer": None,
|
| 717 |
+
"micro_wer": None,
|
| 718 |
"mean_cer": None,
|
| 719 |
"median_cer": None,
|
| 720 |
"mean_wer": None,
|
|
|
|
| 724 |
continue
|
| 725 |
entries.append({
|
| 726 |
"engine": report.engine_name,
|
| 727 |
+
"micro_cer": micro_cer,
|
| 728 |
+
"micro_wer": micro_wer,
|
| 729 |
"mean_cer": _stats.mean(cers),
|
| 730 |
"median_cer": _stats.median(cers),
|
| 731 |
"mean_wer": _stats.mean(wers) if wers else None,
|
|
|
|
| 734 |
})
|
| 735 |
|
| 736 |
def _sort_key(entry: dict) -> tuple:
|
| 737 |
+
primary = entry.get("micro_cer")
|
| 738 |
+
if primary is None:
|
| 739 |
+
primary = entry.get("median_cer")
|
| 740 |
if primary is None:
|
| 741 |
primary = entry.get("mean_cer")
|
| 742 |
return (primary is None, primary if primary is not None else float("inf"))
|
|
|
|
| 770 |
return None
|
| 771 |
|
| 772 |
global_ranking = self.ranking()
|
| 773 |
+
|
| 774 |
+
def _repr_cer(entry: dict) -> Optional[float]:
|
| 775 |
+
# CER reprΓ©sentatif cohΓ©rent avec ``ranking()`` : micro
|
| 776 |
+
# (audit F1) puis repli mΓ©diane / moyenne.
|
| 777 |
+
for key in ("micro_cer", "median_cer", "mean_cer"):
|
| 778 |
+
v = entry.get(key)
|
| 779 |
+
if v is not None:
|
| 780 |
+
return float(v)
|
| 781 |
+
return None
|
| 782 |
+
|
| 783 |
+
valid = [r for r in global_ranking if _repr_cer(r) is not None]
|
| 784 |
if not valid:
|
| 785 |
return None
|
| 786 |
leader = valid[0]["engine"]
|
| 787 |
|
| 788 |
+
# CER reprΓ©sentatif (micro, repli mΓ©diane) du leader sur chaque
|
| 789 |
+
# strate oΓΉ il a au moins 1 document.
|
| 790 |
per_stratum: dict[str, float] = {}
|
| 791 |
for stratum, entries in strata_rankings.items():
|
| 792 |
for entry in entries:
|
| 793 |
if entry["engine"] != leader:
|
| 794 |
continue
|
| 795 |
+
rc = _repr_cer(entry)
|
| 796 |
+
if rc is None:
|
| 797 |
continue
|
| 798 |
+
per_stratum[stratum] = rc
|
| 799 |
break
|
| 800 |
|
| 801 |
if len(per_stratum) < 2:
|
|
@@ -44,6 +44,21 @@ class MetricsResult:
|
|
| 44 |
reference_length: int = 0
|
| 45 |
hypothesis_length: int = 0
|
| 46 |
error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
cer_diplomatic: Optional[float] = None
|
| 48 |
"""CER calculΓ© aprΓ¨s normalisation diplomatique (ΕΏ=s, u=v, i=jβ¦).
|
| 49 |
None si aucun profil diplomatique n'a Γ©tΓ© fourni Γ compute_metrics.
|
|
@@ -66,6 +81,14 @@ class MetricsResult:
|
|
| 66 |
"hypothesis_length": self.hypothesis_length,
|
| 67 |
"error": self.error,
|
| 68 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
if self.cer_diplomatic is not None:
|
| 70 |
d["cer_diplomatic"] = round(self.cer_diplomatic, 6)
|
| 71 |
d["diplomatic_profile_name"] = self.diplomatic_profile_name
|
|
@@ -100,6 +123,10 @@ class MetricsResult:
|
|
| 100 |
reference_length=data.get("reference_length", 0),
|
| 101 |
hypothesis_length=data.get("hypothesis_length", 0),
|
| 102 |
error=data.get("error"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
cer_diplomatic=data.get("cer_diplomatic"),
|
| 104 |
diplomatic_profile_name=data.get("diplomatic_profile_name"),
|
| 105 |
)
|
|
@@ -163,6 +190,48 @@ def aggregate_metrics(results: list[MetricsResult]) -> dict:
|
|
| 163 |
if profile_name:
|
| 164 |
aggregated["cer_diplomatic"]["profile"] = profile_name
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
aggregated["document_count"] = len(results)
|
| 167 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 168 |
|
|
|
|
| 44 |
reference_length: int = 0
|
| 45 |
hypothesis_length: int = 0
|
| 46 |
error: Optional[str] = None
|
| 47 |
+
# Audit scientifique (F1) β comptes bruts de l'alignement minimal
|
| 48 |
+
# (jiwer/Levenshtein) nΓ©cessaires pour le CER/WER **micro-moyennΓ©**
|
| 49 |
+
# corpus-wide (Ξ£ erreurs / Ξ£ unitΓ©s de rΓ©fΓ©rence), standard du domaine
|
| 50 |
+
# OCR/HTR (ICDAR, OCR-D, HTR-United). ``None`` si le calcul a Γ©chouΓ©
|
| 51 |
+
# ou pour les cas dΓ©gΓ©nΓ©rΓ©s (rΓ©fΓ©rence vide) oΓΉ le dΓ©nominateur micro
|
| 52 |
+
# n'est pas dΓ©fini β l'agrΓ©gateur micro saute alors le document.
|
| 53 |
+
cer_errors: Optional[int] = None
|
| 54 |
+
"""Distance d'édition caractère = substitutions + suppressions + insertions."""
|
| 55 |
+
cer_ref_chars: Optional[int] = None
|
| 56 |
+
"""Longueur de référence en caractères = substitutions + suppressions + hits
|
| 57 |
+
(dΓ©nominateur exact du CER, identique Γ celui utilisΓ© par jiwer)."""
|
| 58 |
+
wer_errors: Optional[int] = None
|
| 59 |
+
"""Distance d'Γ©dition mot = substitutions + suppressions + insertions."""
|
| 60 |
+
wer_ref_words: Optional[int] = None
|
| 61 |
+
"""Nombre de mots de rΓ©fΓ©rence = substitutions + suppressions + hits."""
|
| 62 |
cer_diplomatic: Optional[float] = None
|
| 63 |
"""CER calculΓ© aprΓ¨s normalisation diplomatique (ΕΏ=s, u=v, i=jβ¦).
|
| 64 |
None si aucun profil diplomatique n'a Γ©tΓ© fourni Γ compute_metrics.
|
|
|
|
| 81 |
"hypothesis_length": self.hypothesis_length,
|
| 82 |
"error": self.error,
|
| 83 |
}
|
| 84 |
+
# Comptes bruts (F1) β sΓ©rialisΓ©s seulement s'ils sont prΓ©sents
|
| 85 |
+
# pour ne pas alourdir le JSON des cas dΓ©gΓ©nΓ©rΓ©s / en erreur.
|
| 86 |
+
if self.cer_errors is not None and self.cer_ref_chars is not None:
|
| 87 |
+
d["cer_errors"] = self.cer_errors
|
| 88 |
+
d["cer_ref_chars"] = self.cer_ref_chars
|
| 89 |
+
if self.wer_errors is not None and self.wer_ref_words is not None:
|
| 90 |
+
d["wer_errors"] = self.wer_errors
|
| 91 |
+
d["wer_ref_words"] = self.wer_ref_words
|
| 92 |
if self.cer_diplomatic is not None:
|
| 93 |
d["cer_diplomatic"] = round(self.cer_diplomatic, 6)
|
| 94 |
d["diplomatic_profile_name"] = self.diplomatic_profile_name
|
|
|
|
| 123 |
reference_length=data.get("reference_length", 0),
|
| 124 |
hypothesis_length=data.get("hypothesis_length", 0),
|
| 125 |
error=data.get("error"),
|
| 126 |
+
cer_errors=data.get("cer_errors"),
|
| 127 |
+
cer_ref_chars=data.get("cer_ref_chars"),
|
| 128 |
+
wer_errors=data.get("wer_errors"),
|
| 129 |
+
wer_ref_words=data.get("wer_ref_words"),
|
| 130 |
cer_diplomatic=data.get("cer_diplomatic"),
|
| 131 |
diplomatic_profile_name=data.get("diplomatic_profile_name"),
|
| 132 |
)
|
|
|
|
| 190 |
if profile_name:
|
| 191 |
aggregated["cer_diplomatic"]["profile"] = profile_name
|
| 192 |
|
| 193 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 194 |
+
# CER / WER **micro-moyennΓ©s** (audit scientifique F1)
|
| 195 |
+
#
|
| 196 |
+
# Standard du domaine OCR/HTR (ICDAR, OCR-D, HTR-United, Transkribus,
|
| 197 |
+
# eScriptorium) : agrΓ©ger les *comptes bruts* avant de diviser β
|
| 198 |
+
# CER_micro = Σ distance_édition / Σ caractères_référence
|
| 199 |
+
# β et non moyenner des taux par document (macro), qui donne le mΓͺme
|
| 200 |
+
# poids à une légende de 10 caractères et à une page de 5 000.
|
| 201 |
+
# Le micro-CER est la mΓ©trique corpus de rΓ©fΓ©rence ; mean/median
|
| 202 |
+
# restent exposΓ©s ci-dessus comme diagnostics de dispersion.
|
| 203 |
+
# ``None`` si aucun document n'a de comptes exploitables (cas d'un
|
| 204 |
+
# jiwer absent ou de rΓ©fΓ©rences toutes vides).
|
| 205 |
+
def _micro(err_attr: str, ref_attr: str) -> Optional[dict]:
|
| 206 |
+
total_err = 0
|
| 207 |
+
total_ref = 0
|
| 208 |
+
n_docs = 0
|
| 209 |
+
for r in results:
|
| 210 |
+
if r.error is not None:
|
| 211 |
+
continue
|
| 212 |
+
e = getattr(r, err_attr)
|
| 213 |
+
d = getattr(r, ref_attr)
|
| 214 |
+
if e is None or d is None:
|
| 215 |
+
continue
|
| 216 |
+
total_err += e
|
| 217 |
+
total_ref += d
|
| 218 |
+
n_docs += 1
|
| 219 |
+
if n_docs == 0 or total_ref <= 0:
|
| 220 |
+
return None
|
| 221 |
+
return {
|
| 222 |
+
"value": round(total_err / total_ref, 6),
|
| 223 |
+
"total_errors": total_err,
|
| 224 |
+
"total_reference_units": total_ref,
|
| 225 |
+
"document_count": n_docs,
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
cer_micro = _micro("cer_errors", "cer_ref_chars")
|
| 229 |
+
if cer_micro is not None:
|
| 230 |
+
aggregated["cer_micro"] = cer_micro
|
| 231 |
+
wer_micro = _micro("wer_errors", "wer_ref_words")
|
| 232 |
+
if wer_micro is not None:
|
| 233 |
+
aggregated["wer_micro"] = wer_micro
|
| 234 |
+
|
| 235 |
aggregated["document_count"] = len(results)
|
| 236 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 237 |
|
|
@@ -6,9 +6,17 @@ caractΓ©ristique de chaque moteur ou pipeline.
|
|
| 6 |
|
| 7 |
MΓ©thode
|
| 8 |
-------
|
| 9 |
-
L'alignement caractère par caractère utilise
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
La matrice est stockΓ©e comme un dict de dict :
|
| 14 |
``{gt_char: {ocr_char: count}}``
|
|
@@ -20,10 +28,11 @@ La valeur spéciale ``"∅"`` (U+2205) représente un caractère vide :
|
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
-
import difflib
|
| 24 |
from collections import defaultdict
|
| 25 |
from dataclasses import dataclass, field
|
| 26 |
|
|
|
|
|
|
|
| 27 |
# Symbole représentant un caractère absent (insertion / suppression)
|
| 28 |
EMPTY_CHAR = "β
"
|
| 29 |
|
|
@@ -114,10 +123,15 @@ def build_confusion_matrix(
|
|
| 114 |
if not ground_truth and not hypothesis:
|
| 115 |
return ConfusionMatrix(dict(matrix), 0, 0, 0)
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
if tag == "equal":
|
| 122 |
if not ignore_correct:
|
| 123 |
for ch in ground_truth[i1:i2]:
|
|
@@ -125,17 +139,11 @@ def build_confusion_matrix(
|
|
| 125 |
continue
|
| 126 |
matrix[ch][ch] += 1
|
| 127 |
elif tag == "replace":
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
n_subs += min(len(gt_seg), len(oc_seg))
|
| 134 |
-
surplus = abs(len(gt_seg) - len(oc_seg))
|
| 135 |
-
if len(gt_seg) > len(oc_seg):
|
| 136 |
-
n_dels += surplus
|
| 137 |
-
else:
|
| 138 |
-
n_ins += surplus
|
| 139 |
elif tag == "delete":
|
| 140 |
for ch in ground_truth[i1:i2]:
|
| 141 |
if ignore_whitespace and ch in _WHITESPACE:
|
|
@@ -162,56 +170,6 @@ def build_confusion_matrix(
|
|
| 162 |
)
|
| 163 |
|
| 164 |
|
| 165 |
-
def _align_segments(
|
| 166 |
-
gt_seg: str,
|
| 167 |
-
oc_seg: str,
|
| 168 |
-
matrix: dict,
|
| 169 |
-
ignore_whitespace: bool,
|
| 170 |
-
) -> None:
|
| 171 |
-
"""Aligne deux segments de longueurs potentiellement diffΓ©rentes."""
|
| 172 |
-
if not gt_seg:
|
| 173 |
-
for ch in oc_seg:
|
| 174 |
-
if ignore_whitespace and ch in _WHITESPACE:
|
| 175 |
-
continue
|
| 176 |
-
matrix[EMPTY_CHAR][ch] += 1
|
| 177 |
-
return
|
| 178 |
-
if not oc_seg:
|
| 179 |
-
for ch in gt_seg:
|
| 180 |
-
if ignore_whitespace and ch in _WHITESPACE:
|
| 181 |
-
continue
|
| 182 |
-
matrix[ch][EMPTY_CHAR] += 1
|
| 183 |
-
return
|
| 184 |
-
|
| 185 |
-
if len(gt_seg) == len(oc_seg):
|
| 186 |
-
# Substitutions 1-pour-1
|
| 187 |
-
for g, o in zip(gt_seg, oc_seg):
|
| 188 |
-
if ignore_whitespace and (g in _WHITESPACE or o in _WHITESPACE):
|
| 189 |
-
continue
|
| 190 |
-
matrix[g][o] += 1
|
| 191 |
-
else:
|
| 192 |
-
# Longueurs diffΓ©rentes : utiliser SequenceMatcher rΓ©cursif sur segments courts
|
| 193 |
-
sub = difflib.SequenceMatcher(None, gt_seg, oc_seg, autojunk=False)
|
| 194 |
-
for tag2, i1, i2, j1, j2 in sub.get_opcodes():
|
| 195 |
-
if tag2 == "equal":
|
| 196 |
-
pass
|
| 197 |
-
elif tag2 == "replace":
|
| 198 |
-
# RΓ©gression simple : aligner par troncature
|
| 199 |
-
for g, o in zip(gt_seg[i1:i2], oc_seg[j1:j2]):
|
| 200 |
-
if ignore_whitespace and (g in _WHITESPACE or o in _WHITESPACE):
|
| 201 |
-
continue
|
| 202 |
-
matrix[g][o] += 1
|
| 203 |
-
elif tag2 == "delete":
|
| 204 |
-
for g in gt_seg[i1:i2]:
|
| 205 |
-
if ignore_whitespace and g in _WHITESPACE:
|
| 206 |
-
continue
|
| 207 |
-
matrix[g][EMPTY_CHAR] += 1
|
| 208 |
-
elif tag2 == "insert":
|
| 209 |
-
for o in oc_seg[j1:j2]:
|
| 210 |
-
if ignore_whitespace and o in _WHITESPACE:
|
| 211 |
-
continue
|
| 212 |
-
matrix[EMPTY_CHAR][o] += 1
|
| 213 |
-
|
| 214 |
-
|
| 215 |
def aggregate_confusion_matrices(matrices: list[ConfusionMatrix]) -> ConfusionMatrix:
|
| 216 |
"""Agrège plusieurs matrices de confusion en une seule.
|
| 217 |
|
|
|
|
| 6 |
|
| 7 |
MΓ©thode
|
| 8 |
-------
|
| 9 |
+
L'alignement caractère par caractère utilise la distance de
|
| 10 |
+
**Levenshtein** (``rapidfuzz.distance.Levenshtein``, coΓ»ts
|
| 11 |
+
substitution = insertion = suppression = 1) β le mΓͺme modΓ¨le d'Γ©dition
|
| 12 |
+
que le CER (jiwer). Audit scientifique F4 : auparavant l'alignement
|
| 13 |
+
passait par ``difflib.SequenceMatcher`` (RatcliffβObershelp), qui
|
| 14 |
+
maximise les blocs communs et **ne minimise pas** le nombre
|
| 15 |
+
d'Γ©ditions ; les comptes substitutions/insertions/suppressions et
|
| 16 |
+
l'empreinte d'erreur affichΓ©s divergeaient alors du CER montrΓ© Γ cΓ΄tΓ©.
|
| 17 |
+
L'alignement minimal garantit aussi que tout bloc ``replace`` est de
|
| 18 |
+
longueur Γ©gale cΓ΄tΓ© GT et cΓ΄tΓ© OCR (substitutions 1-pour-1), ce qui
|
| 19 |
+
supprime l'heuristique d'alignement positionnel des segments inΓ©gaux.
|
| 20 |
|
| 21 |
La matrice est stockΓ©e comme un dict de dict :
|
| 22 |
``{gt_char: {ocr_char: count}}``
|
|
|
|
| 28 |
|
| 29 |
from __future__ import annotations
|
| 30 |
|
|
|
|
| 31 |
from collections import defaultdict
|
| 32 |
from dataclasses import dataclass, field
|
| 33 |
|
| 34 |
+
from rapidfuzz.distance import Levenshtein
|
| 35 |
+
|
| 36 |
# Symbole représentant un caractère absent (insertion / suppression)
|
| 37 |
EMPTY_CHAR = "β
"
|
| 38 |
|
|
|
|
| 123 |
if not ground_truth and not hypothesis:
|
| 124 |
return ConfusionMatrix(dict(matrix), 0, 0, 0)
|
| 125 |
|
| 126 |
+
# Alignement minimal de Levenshtein (audit F4) β cohΓ©rent avec le
|
| 127 |
+
# CER. Sous ce modèle, un bloc ``replace`` est une suite de
|
| 128 |
+
# substitutions 1-pour-1 : longueurs GT et OCR Γ©gales, alignement
|
| 129 |
+
# positionnel exact (plus d'heuristique sur segments inΓ©gaux).
|
| 130 |
+
for op in Levenshtein.opcodes(ground_truth, hypothesis):
|
| 131 |
+
tag = op.tag
|
| 132 |
+
i1, i2, j1, j2 = (
|
| 133 |
+
op.src_start, op.src_end, op.dest_start, op.dest_end,
|
| 134 |
+
)
|
| 135 |
if tag == "equal":
|
| 136 |
if not ignore_correct:
|
| 137 |
for ch in ground_truth[i1:i2]:
|
|
|
|
| 139 |
continue
|
| 140 |
matrix[ch][ch] += 1
|
| 141 |
elif tag == "replace":
|
| 142 |
+
for g, o in zip(ground_truth[i1:i2], hypothesis[j1:j2]):
|
| 143 |
+
if ignore_whitespace and (g in _WHITESPACE or o in _WHITESPACE):
|
| 144 |
+
continue
|
| 145 |
+
matrix[g][o] += 1
|
| 146 |
+
n_subs += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
elif tag == "delete":
|
| 148 |
for ch in ground_truth[i1:i2]:
|
| 149 |
if ignore_whitespace and ch in _WHITESPACE:
|
|
|
|
| 170 |
)
|
| 171 |
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
def aggregate_confusion_matrices(matrices: list[ConfusionMatrix]) -> ConfusionMatrix:
|
| 174 |
"""Agrège plusieurs matrices de confusion en une seule.
|
| 175 |
|
|
@@ -115,6 +115,15 @@ def compute_metrics(
|
|
| 115 |
error="jiwer n'est pas installΓ© (pip install jiwer)",
|
| 116 |
)
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
# Cas dΓ©gΓ©nΓ©rΓ©s des inputs vides β jiwer 3.x lΓ¨ve sur ces cas
|
| 119 |
# (4.x les gère mais on ne dépend plus d'une majeure spécifique).
|
| 120 |
# Convention :
|
|
@@ -122,6 +131,9 @@ def compute_metrics(
|
|
| 122 |
# - vide ref vs hyp non vide → 1.0 (toute l'hypothèse est une
|
| 123 |
# insertion, error rate = 1.0).
|
| 124 |
# - ref non vide vs hyp vide → 1.0 (toute la GT manque).
|
|
|
|
|
|
|
|
|
|
| 125 |
ref_stripped = reference.strip()
|
| 126 |
hyp_stripped = hypothesis.strip() if hypothesis else ""
|
| 127 |
if not ref_stripped and not hyp_stripped:
|
|
@@ -147,13 +159,15 @@ def compute_metrics(
|
|
| 147 |
)
|
| 148 |
|
| 149 |
try:
|
| 150 |
-
#
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
# CER variants
|
| 156 |
-
cer_raw = _cer_from_strings(reference, hypothesis)
|
| 157 |
cer_nfc = _cer_from_strings(
|
| 158 |
_normalize_nfc(reference), _normalize_nfc(hypothesis)
|
| 159 |
)
|
|
@@ -161,14 +175,18 @@ def compute_metrics(
|
|
| 161 |
_normalize_caseless(reference), _normalize_caseless(hypothesis)
|
| 162 |
)
|
| 163 |
|
| 164 |
-
# WER
|
|
|
|
| 165 |
ref_norm = _normalize_whitespace(reference)
|
| 166 |
hyp_norm = _normalize_whitespace(hypothesis)
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
| 169 |
wer_normalized = jiwer.wer(ref_norm, hyp_norm)
|
| 170 |
-
mer =
|
| 171 |
-
wil =
|
| 172 |
|
| 173 |
# CER diplomatique — utilise le profil fourni ou le profil médiéval par défaut
|
| 174 |
cer_diplomatic: Optional[float] = None
|
|
@@ -193,6 +211,10 @@ def compute_metrics(
|
|
| 193 |
wil=wil,
|
| 194 |
reference_length=len(reference),
|
| 195 |
hypothesis_length=len(hypothesis),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
cer_diplomatic=cer_diplomatic,
|
| 197 |
diplomatic_profile_name=diplomatic_profile_name,
|
| 198 |
)
|
|
|
|
| 115 |
error="jiwer n'est pas installΓ© (pip install jiwer)",
|
| 116 |
)
|
| 117 |
|
| 118 |
+
# Audit scientifique (F10) — l'exclusion de caractères est appliquée
|
| 119 |
+
# **avant** le court-circuit des cas vides : si ``char_exclude`` vide
|
| 120 |
+
# entièrement un texte, le cas est traité par les conventions
|
| 121 |
+
# "texte vide" ci-dessous (rΓ©sultat dΓ©terministe) plutΓ΄t que de
|
| 122 |
+
# tomber dans le ``except`` et de renvoyer une erreur / des None.
|
| 123 |
+
if char_exclude:
|
| 124 |
+
reference = "".join(c for c in reference if c not in char_exclude)
|
| 125 |
+
hypothesis = "".join(c for c in hypothesis if c not in char_exclude)
|
| 126 |
+
|
| 127 |
# Cas dégénérés des inputs vides — jiwer 3.x lève sur ces cas
|
| 128 |
# (4.x les gère mais on ne dépend plus d'une majeure spécifique).
|
| 129 |
# Convention :
|
|
|
|
| 131 |
# - vide ref vs hyp non vide → 1.0 (toute l'hypothèse est une
|
| 132 |
# insertion, error rate = 1.0).
|
| 133 |
# - ref non vide vs hyp vide → 1.0 (toute la GT manque).
|
| 134 |
+
# Dans ces trois cas, les comptes bruts (cer_errors/cer_ref_chars…)
|
| 135 |
+
# restent ``None`` : le dénominateur micro n'est pas défini sur une
|
| 136 |
+
# référence vide, l'agrégateur micro saute donc le document.
|
| 137 |
ref_stripped = reference.strip()
|
| 138 |
hyp_stripped = hypothesis.strip() if hypothesis else ""
|
| 139 |
if not ref_stripped and not hyp_stripped:
|
|
|
|
| 159 |
)
|
| 160 |
|
| 161 |
try:
|
| 162 |
+
# CER : un seul appel ``process_characters`` fournit la valeur
|
| 163 |
+
# (``co.cer`` est bit-identique à ``jiwer.cer``) ET les comptes
|
| 164 |
+
# de l'alignement minimal (= Levenshtein) nécessaires au
|
| 165 |
+
# micro-CER corpus (audit scientifique F1).
|
| 166 |
+
co = jiwer.process_characters(reference, hypothesis)
|
| 167 |
+
cer_raw = co.cer
|
| 168 |
+
cer_errors = co.substitutions + co.deletions + co.insertions
|
| 169 |
+
cer_ref_chars = co.substitutions + co.deletions + co.hits
|
| 170 |
|
|
|
|
|
|
|
| 171 |
cer_nfc = _cer_from_strings(
|
| 172 |
_normalize_nfc(reference), _normalize_nfc(hypothesis)
|
| 173 |
)
|
|
|
|
| 175 |
_normalize_caseless(reference), _normalize_caseless(hypothesis)
|
| 176 |
)
|
| 177 |
|
| 178 |
+
# WER : idem via ``process_words`` (``wo.wer/mer/wil`` identiques
|
| 179 |
+
# aux fonctions jiwer, même tokenisation par espaces).
|
| 180 |
ref_norm = _normalize_whitespace(reference)
|
| 181 |
hyp_norm = _normalize_whitespace(hypothesis)
|
| 182 |
|
| 183 |
+
wo = jiwer.process_words(reference, hypothesis)
|
| 184 |
+
wer_raw = wo.wer
|
| 185 |
+
wer_errors = wo.substitutions + wo.deletions + wo.insertions
|
| 186 |
+
wer_ref_words = wo.substitutions + wo.deletions + wo.hits
|
| 187 |
wer_normalized = jiwer.wer(ref_norm, hyp_norm)
|
| 188 |
+
mer = wo.mer
|
| 189 |
+
wil = wo.wil
|
| 190 |
|
| 191 |
# CER diplomatique — utilise le profil fourni ou le profil médiéval par défaut
|
| 192 |
cer_diplomatic: Optional[float] = None
|
|
|
|
| 211 |
wil=wil,
|
| 212 |
reference_length=len(reference),
|
| 213 |
hypothesis_length=len(hypothesis),
|
| 214 |
+
cer_errors=cer_errors,
|
| 215 |
+
cer_ref_chars=cer_ref_chars,
|
| 216 |
+
wer_errors=wer_errors,
|
| 217 |
+
wer_ref_words=wer_ref_words,
|
| 218 |
cer_diplomatic=cer_diplomatic,
|
| 219 |
diplomatic_profile_name=diplomatic_profile_name,
|
| 220 |
)
|
|
@@ -63,11 +63,20 @@ def wilcoxon_test(
|
|
| 63 |
if len(a) != len(b):
|
| 64 |
raise ValueError("Les deux listes doivent avoir la mΓͺme longueur")
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
# Retirer les zéros (méthode "wilcox")
|
| 69 |
if zero_method == "wilcox":
|
| 70 |
-
diffs = [d for d in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
n = len(diffs)
|
| 73 |
if n == 0:
|
|
@@ -77,14 +86,22 @@ def wilcoxon_test(
|
|
| 77 |
"significant": False,
|
| 78 |
"interpretation": "Aucune diffΓ©rence entre les deux concurrents.",
|
| 79 |
"n_pairs": 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
# Rangs des valeurs absolues
|
| 83 |
abs_diffs = [abs(d) for d in diffs]
|
| 84 |
indexed = sorted(enumerate(abs_diffs), key=lambda x: x[1])
|
| 85 |
|
| 86 |
-
# Gestion des ex-aequo : rang moyen
|
|
|
|
|
|
|
|
|
|
| 87 |
ranks = [0.0] * n
|
|
|
|
| 88 |
i = 0
|
| 89 |
while i < n:
|
| 90 |
j = i
|
|
@@ -93,22 +110,39 @@ def wilcoxon_test(
|
|
| 93 |
avg_rank = (i + j + 1) / 2.0 # rang moyen (1-based)
|
| 94 |
for k in range(i, j):
|
| 95 |
ranks[indexed[k][0]] = avg_rank
|
|
|
|
| 96 |
i = j
|
|
|
|
| 97 |
|
| 98 |
W_plus = sum(ranks[k] for k in range(n) if diffs[k] > 0)
|
| 99 |
W_minus = sum(ranks[k] for k in range(n) if diffs[k] < 0)
|
| 100 |
W = min(W_plus, W_minus)
|
| 101 |
|
| 102 |
-
# Calcul de la p-value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
if _SCIPY_AVAILABLE:
|
| 104 |
try:
|
| 105 |
-
scipy_res = _scipy_wilcoxon(
|
| 106 |
p_value = float(scipy_res.pvalue)
|
|
|
|
| 107 |
except Exception: # noqa: BLE001 β fallback gracieux
|
| 108 |
-
|
| 109 |
-
p_value = _native_p_value(n, W)
|
| 110 |
else:
|
| 111 |
-
p_value = _native_p_value(n,
|
| 112 |
|
| 113 |
significant = p_value < 0.05
|
| 114 |
|
|
@@ -132,6 +166,11 @@ def wilcoxon_test(
|
|
| 132 |
"n_pairs": n,
|
| 133 |
"W_plus": round(W_plus, 4),
|
| 134 |
"W_minus": round(W_minus, 4),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
}
|
| 136 |
|
| 137 |
|
|
@@ -150,33 +189,69 @@ def _normal_sf(z: float) -> float:
|
|
| 150 |
return p if z >= 0 else 1.0 - p
|
| 151 |
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
"""
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
|
| 182 |
def compute_pairwise_stats(
|
|
|
|
| 63 |
if len(a) != len(b):
|
| 64 |
raise ValueError("Les deux listes doivent avoir la mΓͺme longueur")
|
| 65 |
|
| 66 |
+
# ``diffs_raw`` conserve les zéros : on le transmet **tel quel** à
|
| 67 |
+
# scipy (qui applique ``zero_method`` lui-même). Audit F9 : éviter
|
| 68 |
+
# le double retrait des zéros (ici puis dans scipy) qui faussait
|
| 69 |
+
# ``n`` et la p-value. L'implémentation native travaille sur
|
| 70 |
+
# ``diffs`` (zéros retirés pour la méthode "wilcox").
|
| 71 |
+
diffs_raw = [x - y for x, y in zip(a, b)]
|
| 72 |
|
|
|
|
| 73 |
if zero_method == "wilcox":
|
| 74 |
+
diffs = [d for d in diffs_raw if d != 0.0]
|
| 75 |
+
else:
|
| 76 |
+
# "pratt"/"zsplit" : non gΓ©rΓ©s par l'implΓ©mentation native ;
|
| 77 |
+
# scipy (s'il est là) les applique. En repli natif, on retombe
|
| 78 |
+
# sur "wilcox" en le signalant dans l'interprétation.
|
| 79 |
+
diffs = [d for d in diffs_raw if d != 0.0]
|
| 80 |
|
| 81 |
n = len(diffs)
|
| 82 |
if n == 0:
|
|
|
|
| 86 |
"significant": False,
|
| 87 |
"interpretation": "Aucune diffΓ©rence entre les deux concurrents.",
|
| 88 |
"n_pairs": 0,
|
| 89 |
+
"W_plus": 0.0,
|
| 90 |
+
"W_minus": 0.0,
|
| 91 |
+
"method": "exact",
|
| 92 |
+
"has_ties": False,
|
| 93 |
}
|
| 94 |
|
| 95 |
# Rangs des valeurs absolues
|
| 96 |
abs_diffs = [abs(d) for d in diffs]
|
| 97 |
indexed = sorted(enumerate(abs_diffs), key=lambda x: x[1])
|
| 98 |
|
| 99 |
+
# Gestion des ex-aequo : rang moyen. On mémorise la taille des
|
| 100 |
+
# groupes d'ex-aequo : un groupe de taille > 1 invalide la
|
| 101 |
+
# distribution exacte (rangs non distincts) → bascule vers
|
| 102 |
+
# l'approximation normale avec correction d'ex-aequo.
|
| 103 |
ranks = [0.0] * n
|
| 104 |
+
tie_sizes: list[int] = []
|
| 105 |
i = 0
|
| 106 |
while i < n:
|
| 107 |
j = i
|
|
|
|
| 110 |
avg_rank = (i + j + 1) / 2.0 # rang moyen (1-based)
|
| 111 |
for k in range(i, j):
|
| 112 |
ranks[indexed[k][0]] = avg_rank
|
| 113 |
+
tie_sizes.append(j - i)
|
| 114 |
i = j
|
| 115 |
+
has_ties = any(t > 1 for t in tie_sizes)
|
| 116 |
|
| 117 |
W_plus = sum(ranks[k] for k in range(n) if diffs[k] > 0)
|
| 118 |
W_minus = sum(ranks[k] for k in range(n) if diffs[k] < 0)
|
| 119 |
W = min(W_plus, W_minus)
|
| 120 |
|
| 121 |
+
# Calcul de la p-value bilatérale.
|
| 122 |
+
#
|
| 123 |
+
# 1. scipy si disponible : méthode exacte (n ≤ 25) ou approximation
|
| 124 |
+
# normale (n > 25), appelée sur ``diffs_raw`` (zéros inclus) avec
|
| 125 |
+
# ``zero_method`` — scipy gère le retrait lui-même (audit F9 : plus
|
| 126 |
+
# de double retrait).
|
| 127 |
+
# 2. Sinon, implémentation native **exacte** : distribution nulle de
|
| 128 |
+
# W⁺ énumérée par programmation dynamique sur les 2ⁿ assignations
|
| 129 |
+
# de signes (valable sans ex-aequo, n ≤ 25 — au-delà l'énumération
|
| 130 |
+
# est inutile, l'approximation normale converge). Avec ex-aequo
|
| 131 |
+
# ou n > 25 : approximation normale avec correction d'ex-aequo et
|
| 132 |
+
# de continuité. Plus aucune p-value fabriquée (audit F2 : la
|
| 133 |
+
# table {0.04, 0.20} retournait des faux positifs pour n ≤ 5, où
|
| 134 |
+
# la significativité bilatérale à 5 % est mathématiquement
|
| 135 |
+
# impossible).
|
| 136 |
+
method_used = "exact"
|
| 137 |
if _SCIPY_AVAILABLE:
|
| 138 |
try:
|
| 139 |
+
scipy_res = _scipy_wilcoxon(diffs_raw, zero_method=zero_method)
|
| 140 |
p_value = float(scipy_res.pvalue)
|
| 141 |
+
method_used = "scipy"
|
| 142 |
except Exception: # noqa: BLE001 β fallback gracieux
|
| 143 |
+
p_value, method_used = _native_p_value(n, W_plus, W_minus, tie_sizes)
|
|
|
|
| 144 |
else:
|
| 145 |
+
p_value, method_used = _native_p_value(n, W_plus, W_minus, tie_sizes)
|
| 146 |
|
| 147 |
significant = p_value < 0.05
|
| 148 |
|
|
|
|
| 166 |
"n_pairs": n,
|
| 167 |
"W_plus": round(W_plus, 4),
|
| 168 |
"W_minus": round(W_minus, 4),
|
| 169 |
+
# Transparence méthodologique (audit F2/F9) : quelle méthode a
|
| 170 |
+
# produit la p-value, et présence d'ex-aequo (qui force
|
| 171 |
+
# l'approximation normale en l'absence de scipy).
|
| 172 |
+
"method": method_used,
|
| 173 |
+
"has_ties": has_ties,
|
| 174 |
}
|
| 175 |
|
| 176 |
|
|
|
|
| 189 |
return p if z >= 0 else 1.0 - p
|
| 190 |
|
| 191 |
|
| 192 |
+
def _exact_signed_rank_two_sided_p(
|
| 193 |
+
n: int, w_plus: float, w_minus: float,
|
| 194 |
+
) -> float:
|
| 195 |
+
"""P-value bilatΓ©rale **exacte** du test des rangs signΓ©s (sans ex-aequo).
|
| 196 |
|
| 197 |
+
Sous H0, chacune des 2ⁿ assignations de signes aux rangs 1..n est
|
| 198 |
+
équiprobable. La distribution de W⁺ (somme des rangs portant un
|
| 199 |
+
signe +) est le nombre de sous-ensembles de ``{1,…,n}`` de somme
|
| 200 |
+
``s`` divisé par 2ⁿ — fonction génératrice ``∏(1 + xʳ)``, calculée
|
| 201 |
+
par programmation dynamique (knapsack). La p-value bilatérale vaut
|
| 202 |
+
``2·P(W⁺ ≤ T)`` avec ``T = min(W⁺, W⁻)``, bornée à 1.0. Identique
|
| 203 |
+
au mode exact de ``scipy.stats.wilcoxon``.
|
| 204 |
|
| 205 |
+
Pour n ≤ 5 la p-value minimale possible est 2/2ⁿ ≥ 0.0625 : le test
|
| 206 |
+
ne peut donc jamais être significatif à 5 % bilatéral — ce que
|
| 207 |
+
l'ancienne table ``{0.04, 0.20}`` violait (faux positifs, audit F2).
|
| 208 |
"""
|
| 209 |
+
total = n * (n + 1) // 2
|
| 210 |
+
counts = [0] * (total + 1)
|
| 211 |
+
counts[0] = 1
|
| 212 |
+
for r in range(1, n + 1):
|
| 213 |
+
for s in range(total, r - 1, -1):
|
| 214 |
+
counts[s] += counts[s - r]
|
| 215 |
+
t = int(min(w_plus, w_minus))
|
| 216 |
+
tail = sum(counts[: t + 1])
|
| 217 |
+
return min(1.0, 2.0 * tail / float(1 << n))
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _native_p_value(
|
| 221 |
+
n: int,
|
| 222 |
+
w_plus: float,
|
| 223 |
+
w_minus: float,
|
| 224 |
+
tie_sizes: list[int],
|
| 225 |
+
) -> tuple[float, str]:
|
| 226 |
+
"""P-value bilatΓ©rale native + nom de la mΓ©thode employΓ©e.
|
| 227 |
+
|
| 228 |
+
- **Sans ex-aequo et n ≤ 25** : distribution exacte (DP ci-dessus).
|
| 229 |
+
- **Sinon** (ex-aequo, ou n > 25) : approximation normale avec
|
| 230 |
+
correction d'ex-aequo sur la variance et correction de continuité
|
| 231 |
+
standard ``(|W − μ| − ½)/σ`` bornée à 0 (audit F9 : l'ancienne
|
| 232 |
+
forme ``|(W+½) − μ|`` était légèrement anti-conservatrice quand
|
| 233 |
+
W ≈ μ).
|
| 234 |
+
|
| 235 |
+
Plus aucune p-value fabriquée (audit F2).
|
| 236 |
+
"""
|
| 237 |
+
if n == 0:
|
| 238 |
+
return 1.0, "exact"
|
| 239 |
+
has_ties = any(t > 1 for t in tie_sizes)
|
| 240 |
+
if not has_ties and n <= 25:
|
| 241 |
+
return _exact_signed_rank_two_sided_p(n, w_plus, w_minus), "exact"
|
| 242 |
+
|
| 243 |
+
mu = n * (n + 1) / 4.0
|
| 244 |
+
# σ² avec correction d'ex-aequo (Wilcoxon signé-rangé) :
|
| 245 |
+
# σ² = [n(n+1)(2n+1) − ½·Σ(tⱼ³ − tⱼ)] / 24
|
| 246 |
+
tie_term = sum(t ** 3 - t for t in tie_sizes)
|
| 247 |
+
sigma2 = (n * (n + 1) * (2 * n + 1) - 0.5 * tie_term) / 24.0
|
| 248 |
+
if sigma2 <= 0:
|
| 249 |
+
return 1.0, "normal_approx"
|
| 250 |
+
W = min(w_plus, w_minus)
|
| 251 |
+
z = (abs(W - mu) - 0.5) / math.sqrt(sigma2)
|
| 252 |
+
if z < 0.0:
|
| 253 |
+
z = 0.0
|
| 254 |
+
return min(1.0, 2.0 * _normal_sf(z)), "normal_approx"
|
| 255 |
|
| 256 |
|
| 257 |
def compute_pairwise_stats(
|
|
@@ -48,7 +48,7 @@ FILE_BUDGETS: dict[str, int] = {
|
|
| 48 |
# rΓ©fΓ©rencΓ©s ailleurs. L'historique reste accessible via git log
|
| 49 |
# + CHANGELOG.
|
| 50 |
"picarones/reports/html/generator.py": 550, # actuel 471
|
| 51 |
-
"picarones/evaluation/benchmark_result.py":
|
| 52 |
"picarones/reports/html/renderers/philological.py": 700, # actuel 601
|
| 53 |
"picarones/evaluation/metrics/modern_archives.py": 700, # actuel 599
|
| 54 |
"picarones/evaluation/metrics/builtin_hooks.py": 700, # actuel 590
|
|
|
|
| 48 |
# rΓ©fΓ©rencΓ©s ailleurs. L'historique reste accessible via git log
|
| 49 |
# + CHANGELOG.
|
| 50 |
"picarones/reports/html/generator.py": 550, # actuel 471
|
| 51 |
+
"picarones/evaluation/benchmark_result.py": 1058, # actuel ~920 (audit F1 : micro-CER/WER + tri)
|
| 52 |
"picarones/reports/html/renderers/philological.py": 700, # actuel 601
|
| 53 |
"picarones/evaluation/metrics/modern_archives.py": 700, # actuel 599
|
| 54 |
"picarones/evaluation/metrics/builtin_hooks.py": 700, # actuel 590
|
|
@@ -1,13 +1,23 @@
|
|
| 1 |
-
"""Tests Sprint 44 β
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
Couvre :
|
| 4 |
|
| 5 |
1. ``EngineReport.median_cer`` lit ``aggregated_metrics["cer"]["median"]``.
|
| 6 |
2. ``BenchmarkResult.ranking()`` :
|
| 7 |
-
- inclut ``median_cer`` dans chaque entrΓ©e
|
| 8 |
-
- trie sur
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
- se dΓ©clenche quand le ratio ``|moyenne - mΓ©diane| / mΓ©diane > 30%``
|
| 12 |
- ne se dΓ©clenche pas quand symΓ©trique
|
| 13 |
- ne se dΓ©clenche pas si la mΓ©diane est nulle (corpus parfait)
|
|
@@ -35,21 +45,53 @@ from picarones.evaluation.benchmark_result import BenchmarkResult, DocumentResul
|
|
| 35 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
|
| 37 |
|
| 38 |
-
def _make_dr(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return DocumentResult(
|
| 40 |
doc_id=doc_id, image_path="/tmp/x.png",
|
| 41 |
ground_truth="x", hypothesis="x",
|
| 42 |
metrics=MetricsResult(
|
| 43 |
cer=cer, cer_nfc=cer, cer_caseless=cer,
|
| 44 |
wer=cer, wer_normalized=cer, mer=cer, wil=cer,
|
| 45 |
-
reference_length=1, hypothesis_length=1,
|
|
|
|
|
|
|
| 46 |
),
|
| 47 |
duration_seconds=0.1,
|
| 48 |
)
|
| 49 |
|
| 50 |
|
| 51 |
-
def _make_engine_report(
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
return EngineReport(
|
| 54 |
engine_name=name, engine_version="1", engine_config={},
|
| 55 |
document_results=drs,
|
|
@@ -81,39 +123,72 @@ class TestMedianCerProperty:
|
|
| 81 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 82 |
|
| 83 |
|
| 84 |
-
class
|
| 85 |
-
def
|
| 86 |
bench = BenchmarkResult(
|
| 87 |
corpus_name="c", corpus_source=None, document_count=3,
|
| 88 |
-
engine_reports=[_make_engine_report(
|
|
|
|
|
|
|
| 89 |
)
|
| 90 |
ranking = bench.ranking()
|
| 91 |
assert "median_cer" in ranking[0]
|
|
|
|
| 92 |
assert ranking[0]["median_cer"] == pytest.approx(0.2)
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
ers = [
|
| 100 |
-
_make_engine_report(
|
| 101 |
-
|
| 102 |
-
[0.03] * 8 + [0.40] * 2,
|
| 103 |
-
),
|
| 104 |
-
_make_engine_report(
|
| 105 |
-
"B_steady",
|
| 106 |
-
[0.05] * 10,
|
| 107 |
-
),
|
| 108 |
]
|
| 109 |
bench = BenchmarkResult(
|
| 110 |
corpus_name="c", corpus_source=None, document_count=10,
|
| 111 |
engine_reports=ers,
|
| 112 |
)
|
| 113 |
ranking = bench.ranking()
|
| 114 |
-
|
| 115 |
assert ranking[0]["engine"] == "A_asymmetric"
|
| 116 |
-
assert ranking[0]["mean_cer"] > ranking[1]["mean_cer"]
|
| 117 |
assert ranking[0]["median_cer"] < ranking[1]["median_cer"]
|
| 118 |
|
| 119 |
def test_falls_back_to_mean_when_median_missing(self) -> None:
|
|
@@ -126,14 +201,18 @@ class TestRankingByMedian:
|
|
| 126 |
une mΓ©diane quand il y a au moins un doc).
|
| 127 |
"""
|
| 128 |
ranked = [
|
| 129 |
-
{"engine": "x", "
|
| 130 |
-
"mean_wer": 0.0, "documents": 1, "failed": 0},
|
| 131 |
-
{"engine": "y", "
|
| 132 |
-
"mean_wer": 0.0, "documents": 1, "failed": 0},
|
| 133 |
]
|
| 134 |
|
| 135 |
def _key(e: dict) -> tuple:
|
| 136 |
-
p = e.get("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
return (p is None, p if p is not None else float("inf"))
|
| 138 |
|
| 139 |
ranking = sorted(ranked, key=_key)
|
|
|
|
| 1 |
+
"""Tests Sprint 44 (mΓ©diane) β rΓ©visΓ©s par l'audit scientifique F1.
|
| 2 |
+
|
| 3 |
+
Historique : le Sprint 44 avait fait du **CER médian** le critère de
|
| 4 |
+
tri par défaut. L'audit scientifique (mai 2026, F1) a montré que la
|
| 5 |
+
médiane de taux par document reste aveugle à la longueur ; le critère
|
| 6 |
+
de tri par défaut est désormais le **CER micro-moyenné**
|
| 7 |
+
(Σ distance_édition / Σ caractères_référence), standard du domaine
|
| 8 |
+
OCR/HTR. La médiane redevient un **repli** (corpus sans comptes
|
| 9 |
+
bruts) et un **diagnostic de dispersion** (détecteur
|
| 10 |
+
``median_mean_gap_warning``), plus un critère de classement.
|
| 11 |
|
| 12 |
Couvre :
|
| 13 |
|
| 14 |
1. ``EngineReport.median_cer`` lit ``aggregated_metrics["cer"]["median"]``.
|
| 15 |
2. ``BenchmarkResult.ranking()`` :
|
| 16 |
+
- inclut ``micro_cer`` et ``median_cer`` dans chaque entrΓ©e
|
| 17 |
+
- trie sur le **micro-CER** par dΓ©faut quand les comptes bruts
|
| 18 |
+
sont disponibles
|
| 19 |
+
- retombe sur la mΓ©diane puis la moyenne si le micro est absent
|
| 20 |
+
3. DΓ©tecteur ``MEDIAN_MEAN_GAP_WARNING`` (inchangΓ©) :
|
| 21 |
- se dΓ©clenche quand le ratio ``|moyenne - mΓ©diane| / mΓ©diane > 30%``
|
| 22 |
- ne se dΓ©clenche pas quand symΓ©trique
|
| 23 |
- ne se dΓ©clenche pas si la mΓ©diane est nulle (corpus parfait)
|
|
|
|
| 45 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
|
| 47 |
|
| 48 |
+
def _make_dr(
|
| 49 |
+
cer: float,
|
| 50 |
+
doc_id: str = "d",
|
| 51 |
+
ref_chars: int | None = None,
|
| 52 |
+
) -> DocumentResult:
|
| 53 |
+
"""DocumentResult synthΓ©tique.
|
| 54 |
+
|
| 55 |
+
Si ``ref_chars`` est fourni, on renseigne les comptes bruts
|
| 56 |
+
(``cer_errors``/``cer_ref_chars``) cohΓ©rents avec ``cer`` pour
|
| 57 |
+
activer le micro-CER ; sinon ils restent ``None`` et le tri
|
| 58 |
+
retombe sur la mΓ©diane (chemin de repli historique Sprint 44).
|
| 59 |
+
"""
|
| 60 |
+
cer_errors = None
|
| 61 |
+
cer_ref_chars = None
|
| 62 |
+
wer_errors = None
|
| 63 |
+
wer_ref_words = None
|
| 64 |
+
if ref_chars is not None:
|
| 65 |
+
cer_ref_chars = ref_chars
|
| 66 |
+
cer_errors = round(cer * ref_chars)
|
| 67 |
+
wer_ref_words = max(1, ref_chars // 5)
|
| 68 |
+
wer_errors = round(cer * wer_ref_words)
|
| 69 |
return DocumentResult(
|
| 70 |
doc_id=doc_id, image_path="/tmp/x.png",
|
| 71 |
ground_truth="x", hypothesis="x",
|
| 72 |
metrics=MetricsResult(
|
| 73 |
cer=cer, cer_nfc=cer, cer_caseless=cer,
|
| 74 |
wer=cer, wer_normalized=cer, mer=cer, wil=cer,
|
| 75 |
+
reference_length=ref_chars or 1, hypothesis_length=ref_chars or 1,
|
| 76 |
+
cer_errors=cer_errors, cer_ref_chars=cer_ref_chars,
|
| 77 |
+
wer_errors=wer_errors, wer_ref_words=wer_ref_words,
|
| 78 |
),
|
| 79 |
duration_seconds=0.1,
|
| 80 |
)
|
| 81 |
|
| 82 |
|
| 83 |
+
def _make_engine_report(
|
| 84 |
+
name: str,
|
| 85 |
+
cers: list[float],
|
| 86 |
+
ref_chars: list[int] | None = None,
|
| 87 |
+
) -> EngineReport:
|
| 88 |
+
if ref_chars is None:
|
| 89 |
+
drs = [_make_dr(c, doc_id=f"d{i}") for i, c in enumerate(cers)]
|
| 90 |
+
else:
|
| 91 |
+
drs = [
|
| 92 |
+
_make_dr(c, doc_id=f"d{i}", ref_chars=rc)
|
| 93 |
+
for i, (c, rc) in enumerate(zip(cers, ref_chars))
|
| 94 |
+
]
|
| 95 |
return EngineReport(
|
| 96 |
engine_name=name, engine_version="1", engine_config={},
|
| 97 |
document_results=drs,
|
|
|
|
| 123 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
|
| 125 |
|
| 126 |
+
class TestRankingByMicro:
|
| 127 |
+
def test_includes_micro_and_median_cer(self) -> None:
|
| 128 |
bench = BenchmarkResult(
|
| 129 |
corpus_name="c", corpus_source=None, document_count=3,
|
| 130 |
+
engine_reports=[_make_engine_report(
|
| 131 |
+
"a", [0.1, 0.2, 0.3], ref_chars=[100, 100, 100],
|
| 132 |
+
)],
|
| 133 |
)
|
| 134 |
ranking = bench.ranking()
|
| 135 |
assert "median_cer" in ranking[0]
|
| 136 |
+
assert "micro_cer" in ranking[0]
|
| 137 |
assert ranking[0]["median_cer"] == pytest.approx(0.2)
|
| 138 |
+
# micro = (10+20+30)/300 = 0.2 (longueurs Γ©gales β micro == mean)
|
| 139 |
+
assert ranking[0]["micro_cer"] == pytest.approx(0.2)
|
| 140 |
+
|
| 141 |
+
def test_micro_is_default_sort_key_and_can_beat_median(self) -> None:
|
| 142 |
+
"""Cas scientifiquement dΓ©cisif (F1) : micro β mΓ©diane.
|
| 143 |
+
|
| 144 |
+
Moteur A : excellent sur 9 courts documents (10 car, CER 0,02)
|
| 145 |
+
mais catastrophique sur 1 page longue (5 000 car, CER 0,50).
|
| 146 |
+
- mΓ©diane CER = 0,02 (tirΓ©e par les courts)
|
| 147 |
+
- micro CER = (9Β·10Β·0,02 + 5000Β·0,50) / (9Β·10 + 5000)
|
| 148 |
+
β 2502 / 5090 β 0,4916
|
| 149 |
+
Moteur B : rΓ©gulier partout (CER 0,10).
|
| 150 |
+
- mΓ©diane = 0,10 ; micro β 0,10
|
| 151 |
+
Tri mΓ©diane : A (0,02) < B (0,10) β A gagnerait Γ tort.
|
| 152 |
+
Tri micro : B (0,10) < A (0,49) β B gagne, ce qui reflΓ¨te
|
| 153 |
+
la réalité (A rate la moitié d'une page de 5 000 caractères).
|
| 154 |
+
"""
|
| 155 |
+
a = _make_engine_report(
|
| 156 |
+
"A_short_specialist",
|
| 157 |
+
[0.02] * 9 + [0.50],
|
| 158 |
+
ref_chars=[10] * 9 + [5000],
|
| 159 |
+
)
|
| 160 |
+
b = _make_engine_report(
|
| 161 |
+
"B_steady",
|
| 162 |
+
[0.10] * 10,
|
| 163 |
+
ref_chars=[500] * 10,
|
| 164 |
+
)
|
| 165 |
+
bench = BenchmarkResult(
|
| 166 |
+
corpus_name="c", corpus_source=None, document_count=10,
|
| 167 |
+
engine_reports=[a, b],
|
| 168 |
+
)
|
| 169 |
+
ranking = bench.ranking()
|
| 170 |
+
# Le tri micro doit placer B premier, contredisant la mΓ©diane.
|
| 171 |
+
assert ranking[0]["engine"] == "B_steady"
|
| 172 |
+
assert ranking[0]["micro_cer"] < ranking[1]["micro_cer"]
|
| 173 |
+
# ... alors que la mΓ©diane aurait (Γ tort) favorisΓ© A.
|
| 174 |
+
a_entry = next(r for r in ranking if r["engine"] == "A_short_specialist")
|
| 175 |
+
assert a_entry["median_cer"] < ranking[0]["median_cer"]
|
| 176 |
+
assert a_entry["micro_cer"] == pytest.approx(0.4916, abs=2e-3)
|
| 177 |
+
|
| 178 |
+
def test_falls_back_to_median_when_micro_missing(self) -> None:
|
| 179 |
+
"""Sans comptes bruts (jiwer absent / fixture legacy), le tri
|
| 180 |
+
retombe sur la mΓ©diane β comportement Sprint 44 prΓ©servΓ©."""
|
| 181 |
ers = [
|
| 182 |
+
_make_engine_report("A_asymmetric", [0.03] * 8 + [0.40] * 2),
|
| 183 |
+
_make_engine_report("B_steady", [0.05] * 10),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
]
|
| 185 |
bench = BenchmarkResult(
|
| 186 |
corpus_name="c", corpus_source=None, document_count=10,
|
| 187 |
engine_reports=ers,
|
| 188 |
)
|
| 189 |
ranking = bench.ranking()
|
| 190 |
+
assert ranking[0]["micro_cer"] is None # pas de comptes bruts
|
| 191 |
assert ranking[0]["engine"] == "A_asymmetric"
|
|
|
|
| 192 |
assert ranking[0]["median_cer"] < ranking[1]["median_cer"]
|
| 193 |
|
| 194 |
def test_falls_back_to_mean_when_median_missing(self) -> None:
|
|
|
|
| 201 |
une mΓ©diane quand il y a au moins un doc).
|
| 202 |
"""
|
| 203 |
ranked = [
|
| 204 |
+
{"engine": "x", "micro_cer": None, "mean_cer": 0.10,
|
| 205 |
+
"median_cer": None, "mean_wer": 0.0, "documents": 1, "failed": 0},
|
| 206 |
+
{"engine": "y", "micro_cer": None, "mean_cer": 0.05,
|
| 207 |
+
"median_cer": None, "mean_wer": 0.0, "documents": 1, "failed": 0},
|
| 208 |
]
|
| 209 |
|
| 210 |
def _key(e: dict) -> tuple:
|
| 211 |
+
p = e.get("micro_cer")
|
| 212 |
+
if p is None:
|
| 213 |
+
p = e.get("median_cer")
|
| 214 |
+
if p is None:
|
| 215 |
+
p = e.get("mean_cer")
|
| 216 |
return (p is None, p if p is not None else float("inf"))
|
| 217 |
|
| 218 |
ranking = sorted(ranked, key=_key)
|
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RΓ©gression β audit scientifique (mai 2026).
|
| 2 |
+
|
| 3 |
+
Chaque test verrouille une correction de l'audit de fiabilité
|
| 4 |
+
scientifique afin qu'aucune régression ne ré-introduise un calcul
|
| 5 |
+
faux ou une donnée trompeuse. Les identifiants Fxx renvoient au
|
| 6 |
+
rapport d'audit.
|
| 7 |
+
|
| 8 |
+
Ces tests s'exécutent sur le chemin **sans scipy** (installation par
|
| 9 |
+
défaut ``[dev,web]``), qui est le chemin de production le plus courant
|
| 10 |
+
et celui où les défauts F2/F9 étaient atteignables.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import math
|
| 16 |
+
|
| 17 |
+
import pytest
|
| 18 |
+
|
| 19 |
+
from picarones.evaluation._diff_utils import compute_char_diff, diff_stats
|
| 20 |
+
from picarones.evaluation.metric_result import MetricsResult, aggregate_metrics
|
| 21 |
+
from picarones.evaluation.metrics.confusion import build_confusion_matrix
|
| 22 |
+
from picarones.evaluation.metrics.text_metrics import compute_metrics
|
| 23 |
+
from picarones.evaluation.statistics.wilcoxon import (
|
| 24 |
+
_exact_signed_rank_two_sided_p,
|
| 25 |
+
wilcoxon_test,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
# F1 β CER/WER micro-moyennΓ© (pondΓ©rΓ© par la longueur)
|
| 31 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TestF1MicroAverage:
    """F1 regression tests: raw edit counts and micro-averaged CER/WER."""

    def test_compute_metrics_stores_exact_edit_counts(self) -> None:
        """The raw counts are enough to recompute the exact CER and WER."""
        m = compute_metrics("abcde fghij", "abXde fg")
        assert m.cer_errors is not None and m.cer_ref_chars is not None
        # Guard the WER counts as well: without this check a missing count
        # surfaces as a TypeError in the division below instead of a clear
        # assertion failure.
        assert m.wer_errors is not None and m.wer_ref_words is not None
        # CER = edit distance / reference characters (exact definition).
        assert m.cer == pytest.approx(m.cer_errors / m.cer_ref_chars)
        assert m.wer == pytest.approx(m.wer_errors / m.wer_ref_words)

    def test_micro_average_is_length_weighted(self) -> None:
        """Micro-CER is weighted by document length; the macro mean is not.

        Short doc: 'ab' -> 'aX'                 (1 error / 2 chars = 0.50)
        Long doc:  100*'a' -> 90*'a' + 10*'b'   (10 errors / 100 chars = 0.10)
        macro mean = (0.50 + 0.10) / 2 = 0.30
        micro      = (1 + 10) / (2 + 100) = 11/102 ~ 0.1078
        """
        docs = [
            compute_metrics("ab", "aX"),
            compute_metrics("a" * 100, "a" * 90 + "b" * 10),
        ]
        agg = aggregate_metrics(docs)
        assert agg["cer"]["mean"] == pytest.approx(0.30, abs=1e-6)
        assert agg["cer_micro"]["value"] == pytest.approx(11 / 102, abs=1e-6)
        assert agg["cer_micro"]["total_errors"] == 11
        assert agg["cer_micro"]["total_reference_units"] == 102

    def test_micro_absent_when_no_raw_counts(self) -> None:
        """Legacy results built without raw counts yield no ``cer_micro`` key."""
        legacy = [
            MetricsResult(cer=0.1, wer=0.1, reference_length=10),
            MetricsResult(cer=0.2, wer=0.2, reference_length=10),
        ]
        agg = aggregate_metrics(legacy)
        assert "cer_micro" not in agg
        assert agg["cer"]["mean"] == pytest.approx(0.15)

    def test_round_trip_preserves_counts(self) -> None:
        """``as_dict`` followed by ``from_dict`` keeps the four raw counts."""
        m = compute_metrics("le roy de France", "le roi de Frace")
        # Require real counts first: comparing None with None would make the
        # round-trip equalities below pass without checking anything.
        assert None not in (
            m.cer_errors, m.cer_ref_chars, m.wer_errors, m.wer_ref_words,
        )
        restored = MetricsResult.from_dict(m.as_dict())
        assert restored.cer_errors == m.cer_errors
        assert restored.cer_ref_chars == m.cer_ref_chars
        assert restored.wer_errors == m.wer_errors
        assert restored.wer_ref_words == m.wer_ref_words
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
+
# F2 — Wilcoxon : plus aucune p-value fabriquée pour petit n
|
| 82 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class TestF2WilcoxonExactSmallN:
    """F2 regression tests: exact signed-rank p-values for small samples."""

    def test_no_false_positive_for_n_le_5(self) -> None:
        """For n <= 5, two-sided significance at 5 % is out of reach.

        The smallest attainable p-value is 2 / 2**n >= 0.0625. The previous
        lookup table returned p=0.04 ("significant") when one engine beat
        the other on all 5 documents, which was a false positive.
        """
        # Every paired difference has the same sign and a distinct
        # magnitude, so there are no ties: exact path, W = 0.
        better = [0.10, 0.20, 0.30, 0.40, 0.50]
        worse = [0.20, 0.31, 0.42, 0.53, 0.64]
        outcome = wilcoxon_test(better, worse)
        assert outcome["method"] == "exact"
        assert outcome["p_value"] == pytest.approx(0.0625)
        assert outcome["significant"] is False

    @pytest.mark.parametrize(
        "n,w,expected",
        [
            (6, 0, 2 / 64),  # smallest n that can reach significance at 5 %
            (7, 2, 0.046875),
            (8, 3, 0.0390625),
            (8, 4, 0.0546875),  # just above the threshold
            (10, 8, 0.0488281),
        ],
    )
    def test_exact_pvalues_match_statistical_tables(
        self, n: int, w: int, expected: float,
    ) -> None:
        """Exact two-sided p-values agree with the tabulated reference values."""
        rank_sum = n * (n + 1) // 2
        complement = rank_sum - w
        observed = _exact_signed_rank_two_sided_p(n, w, complement)
        assert observed == pytest.approx(expected, abs=1e-6)

    def test_n5_pvalue_distribution_is_well_formed(self) -> None:
        """The exact p-value is a genuine quantile in ]0, 1], never a
        made-up constant such as 0.04 or 0.20."""
        rank_sum = 5 * 6 // 2
        p_values = [
            _exact_signed_rank_two_sided_p(5, w, rank_sum - w)
            for w in range(rank_sum + 1)
        ]
        assert all(0.0 < p <= 1.0 for p in p_values)
        distinct = {round(p, 6) for p in p_values}
        assert 0.04 not in distinct and 0.20 not in distinct
        assert min(distinct) == pytest.approx(0.0625)  # = 2/32

    def test_ties_use_corrected_normal_approx(self) -> None:
        """Tied differences are reported and routed to the normal approximation."""
        first = [1, 2, 2, 3, 5, 5, 7, 9, 9, 11, 2, 4]
        second = [1, 1, 2, 3, 4, 5, 6, 9, 8, 10, 2, 3]
        outcome = wilcoxon_test(first, second)
        assert outcome["has_ties"] is True
        assert outcome["method"] == "normal_approx"
        assert 0.0 < outcome["p_value"] <= 1.0
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 141 |
+
# F4 — alignement minimal de Levenshtein / F9 — correction de continuité standard, bornée à 0
|
| 142 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class TestF4MinimalAlignment:
    """Confusion matrix and character diff follow a Levenshtein alignment."""

    @pytest.mark.parametrize(
        "gt,hyp",
        [
            # Long-s and accented samples are written as real Unicode text;
            # the mis-decoded byte sequences previously found here did not
            # exercise the intended characters.
            ("maistre Jehan Froissart", "maiſtre Iehan Froiflart"),
            ("le roy de France", "le roi de la France"),
            ("abcdefghij", "aXcdefghijKL"),
            ("ſuſpicion", "fufpicion"),
            ("", "inséré"),
            ("supprimé", ""),
        ],
    )
    def test_confusion_total_equals_levenshtein_distance(
        self, gt: str, hyp: str,
    ) -> None:
        """S + D + I of the matrix equals the Levenshtein edit distance.

        Per the audit notes, this keeps the matrix consistent with the CER
        numerator (jiwer); under the former difflib (Ratcliff-Obershelp)
        alignment the equality broke as soon as an insertion or deletion
        shifted the rest of the string.
        """
        # Imported inside the test, as in the original code.
        from rapidfuzz.distance import Levenshtein

        cm = build_confusion_matrix(
            gt, hyp, ignore_whitespace=False, ignore_correct=True,
        )
        total = (
            cm.total_substitutions
            + cm.total_insertions
            + cm.total_deletions
        )
        assert total == Levenshtein.distance(gt, hyp)

    def test_char_diff_is_minimal_edit(self) -> None:
        """The character diff does not over-segment: the number of non-equal
        operations equals the Levenshtein distance (1 op = 1 edit)."""
        from rapidfuzz.distance import Levenshtein

        gt, hyp = "abcdef", "aXcdefY"
        ops = compute_char_diff(gt, hyp)
        st = diff_stats(ops)
        edits = st["replace"] + st["insert"] + st["delete"]
        # One substitution (b -> X) plus one insertion (Y).
        assert edits == Levenshtein.distance(gt, hyp) == 2
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class TestF9ContinuityCorrection:
    """F9 regression test: continuity correction clamped at zero."""

    def test_no_signal_gives_non_significant(self) -> None:
        """W ~ mu (no effect) -> z clamped at 0 -> p = 1.0, never < 1
        through over-correction (old form |(W + 1/2) - mu|)."""
        # Six repetitions of a mirrored pair: many ties and symmetric
        # differences, which the original comment says selects the normal
        # approximation.
        first = [0.10, 0.20] * 6
        second = [0.20, 0.10] * 6
        outcome = wilcoxon_test(first, second)
        assert outcome["p_value"] == pytest.approx(1.0)
        assert outcome["significant"] is False
|
|
@@ -222,7 +222,9 @@
|
|
| 222 |
"failed": 0,
|
| 223 |
"mean_cer": 0.025,
|
| 224 |
"mean_wer": 0.05,
|
| 225 |
-
"median_cer": 0.025
|
|
|
|
|
|
|
| 226 |
},
|
| 227 |
{
|
| 228 |
"documents": 2,
|
|
@@ -230,7 +232,9 @@
|
|
| 230 |
"failed": 0,
|
| 231 |
"mean_cer": 0.03125,
|
| 232 |
"mean_wer": 0.166666,
|
| 233 |
-
"median_cer": 0.03125
|
|
|
|
|
|
|
| 234 |
}
|
| 235 |
],
|
| 236 |
"run_date": "2026-05-09T00:00:00+00:00"
|
|
|
|
| 222 |
"failed": 0,
|
| 223 |
"mean_cer": 0.025,
|
| 224 |
"mean_wer": 0.05,
|
| 225 |
+
"median_cer": 0.025,
|
| 226 |
+
"micro_cer": null,
|
| 227 |
+
"micro_wer": null
|
| 228 |
},
|
| 229 |
{
|
| 230 |
"documents": 2,
|
|
|
|
| 232 |
"failed": 0,
|
| 233 |
"mean_cer": 0.03125,
|
| 234 |
"mean_wer": 0.166666,
|
| 235 |
+
"median_cer": 0.03125,
|
| 236 |
+
"micro_cer": null,
|
| 237 |
+
"micro_wer": null
|
| 238 |
}
|
| 239 |
],
|
| 240 |
"run_date": "2026-05-09T00:00:00+00:00"
|