Spaces:
Running
Running
File size: 5,898 Bytes
"""Tests unitaires pour le module picarones.measurements.metrics."""
import pytest
from picarones.measurements.metrics import aggregate_metrics, compute_metrics, MetricsResult
class TestComputeMetrics:
    """Checks compute_metrics against small inputs with known outcomes."""

    def test_perfect_match(self):
        """Identical reference and hypothesis yield CER 0, WER 0 and no error."""
        text = "Bonjour le monde"
        metrics = compute_metrics(text, text)
        assert metrics.cer == pytest.approx(0.0)
        assert metrics.wer == pytest.approx(0.0)
        assert metrics.error is None

    def test_complete_mismatch(self):
        """Entirely different texts yield a strictly positive CER and no error."""
        metrics = compute_metrics("abc", "xyz")
        assert metrics.cer > 0.0
        assert metrics.error is None

    def test_empty_reference(self):
        """Empty reference with a non-empty hypothesis yields CER 1.0."""
        metrics = compute_metrics("", "quelque chose")
        assert metrics.cer == pytest.approx(1.0)

    def test_empty_both(self):
        """Empty reference and empty hypothesis yield CER 0.0."""
        metrics = compute_metrics("", "")
        assert metrics.cer == pytest.approx(0.0)

    def test_single_substitution(self):
        """One substituted character out of four yields CER 0.25."""
        metrics = compute_metrics("abcd", "abce")
        assert metrics.cer == pytest.approx(0.25)

    def test_case_insensitive_cer(self):
        """cer_caseless ignores case differences while the raw CER does not."""
        metrics = compute_metrics("Bonjour", "bonjour")
        assert metrics.cer_caseless == pytest.approx(0.0)
        # The raw CER must still be positive because "B" differs from "b".
        assert metrics.cer > 0.0

    def test_nfc_normalization(self):
        """cer_nfc treats canonically equivalent Unicode sequences as equal."""
        # "é" exists as one precomposed code point (U+00E9) or as "e"
        # followed by the combining acute accent (U+0301).
        precomposed = "\u00e9"
        combining = "e\u0301"
        metrics = compute_metrics(precomposed, combining)
        # Both forms are expected to compare equal once NFC-normalised.
        assert metrics.cer_nfc == pytest.approx(0.0)

    def test_wer_one_word_wrong(self):
        """One wrong word out of three yields WER 1/3 (1% relative tolerance)."""
        metrics = compute_metrics("le chat dort", "le chien dort")
        assert metrics.wer == pytest.approx(1 / 3, rel=1e-2)

    def test_result_has_lengths(self):
        """reference_length equals len(reference); hypothesis_length is positive."""
        reference = "Texte de référence"
        metrics = compute_metrics(reference, "Texte différent")
        assert metrics.reference_length == len(reference)
        assert metrics.hypothesis_length > 0

    def test_metrics_result_as_dict(self):
        """as_dict() exposes every expected metric key."""
        expected_keys = (
            "cer",
            "cer_nfc",
            "cer_caseless",
            "wer",
            "wer_normalized",
            "mer",
            "wil",
        )
        exported = compute_metrics("abc", "abc").as_dict()
        for name in expected_keys:
            assert name in exported

    def test_cer_percent_property(self):
        """cer_percent is 25.0 for one substitution out of four characters."""
        metrics = compute_metrics("abcd", "abce")
        assert metrics.cer_percent == pytest.approx(25.0, rel=1e-2)

    # -- Regression (sprint 13 bug): an empty hypothesis must give CER 1.0, not 0.0 --

    def test_empty_hypothesis_cer_is_one(self):
        """Empty hypothesis with a non-empty reference yields CER 1.0 and no error."""
        metrics = compute_metrics("Bonjour le monde", "")
        assert metrics.cer == pytest.approx(1.0), (
            f"CER attendu 1.0 pour hypothèse vide, obtenu {metrics.cer}"
        )
        assert metrics.error is None

    def test_empty_hypothesis_wer_is_one(self):
        """Empty hypothesis yields WER, MER and WIL of 1.0 and no error."""
        metrics = compute_metrics("hello world", "")
        assert metrics.wer == pytest.approx(1.0)
        assert metrics.mer == pytest.approx(1.0)
        assert metrics.wil == pytest.approx(1.0)
        assert metrics.error is None

    def test_empty_hypothesis_whitespace_is_treated_as_empty(self):
        """A whitespace-only hypothesis yields CER 1.0, like an empty one."""
        metrics = compute_metrics("Bonjour", " ")
        assert metrics.cer == pytest.approx(1.0)

    def test_empty_hypothesis_hypothesis_length_is_zero(self):
        """hypothesis_length is 0 when the hypothesis is empty."""
        metrics = compute_metrics("Bonjour le monde", "")
        assert metrics.hypothesis_length == 0
class TestAggregateMetrics:
    """Checks aggregate_metrics on hand-built MetricsResult lists."""

    def _make_result(self, cer: float) -> MetricsResult:
        """Build an error-free MetricsResult whose seven metrics all equal *cer*.

        Reference and hypothesis lengths are both fixed at 100.
        """
        metric_names = (
            "cer",
            "cer_nfc",
            "cer_caseless",
            "wer",
            "wer_normalized",
            "mer",
            "wil",
        )
        return MetricsResult(
            **dict.fromkeys(metric_names, cer),
            reference_length=100,
            hypothesis_length=100,
        )

    def test_empty_list(self):
        """An empty input list aggregates to an empty dict."""
        assert aggregate_metrics([]) == {}

    def test_single_result(self):
        """With one result, the CER mean, min and max all equal its value."""
        summary = aggregate_metrics([self._make_result(0.1)])
        cer_stats = summary["cer"]
        assert cer_stats["mean"] == pytest.approx(0.1)
        assert cer_stats["min"] == pytest.approx(0.1)
        assert cer_stats["max"] == pytest.approx(0.1)

    def test_multiple_results(self):
        """Two results are averaged and counted, with no failures reported."""
        batch = [self._make_result(0.1), self._make_result(0.3)]
        summary = aggregate_metrics(batch)
        assert summary["cer"]["mean"] == pytest.approx(0.2)
        assert summary["document_count"] == 2
        assert summary["failed_count"] == 0

    def test_failed_results_excluded(self):
        """A result carrying an error is counted as failed and left out of the mean."""
        succeeded = self._make_result(0.1)
        errored = MetricsResult(
            cer=1.0,
            cer_nfc=1.0,
            cer_caseless=1.0,
            wer=1.0,
            wer_normalized=1.0,
            mer=1.0,
            wil=1.0,
            reference_length=50,
            hypothesis_length=0,
            error="Moteur en erreur",
        )
        summary = aggregate_metrics([succeeded, errored])
        # Only the error-free result contributes to the aggregated CER.
        assert summary["cer"]["mean"] == pytest.approx(0.1)
        assert summary["failed_count"] == 1
|