Picarones / tests /measurements /test_sprint83_reliability.py
Claude
test: réorganiser les 110 fichiers tests/test_*.py par cercle architectural
d109222 unverified
Raw
History Blame
9.19 kB
"""Tests Sprint 83 — A.II.4 : métriques de fiabilité.

Couvre :
1. ``cohen_kappa`` :
   - accord parfait → κ = 1
   - hasard → κ = 0
   - désaccord pire que hasard → κ < 0
   - séquences de tailles incompatibles → None
   - séquence vide → None
   - un seul label (pe = 1) → convention 1.0 ou 0.0
2. ``krippendorff_alpha`` :
   - accord total
   - missing values gérées
   - corpus avec un seul label → None
3. ``compute_iaa`` :
   - GT identiques → κ = 1, α = 1
   - différence ponctuelle → κ ∈ ]0, 1[
   - inputs vides → None
4. ``compute_multirun_stability`` :
   - 1 run → None
   - 3 runs identiques → divergence = 0, n_distinct = 1
   - 3 runs différents → divergence > 0
   - reference fournie → cer_per_run + variance + cv
5. Helper ``_aligned_char_pairs`` (privé mais central).
"""
from __future__ import annotations
import pytest
from picarones.measurements.reliability import (
_aligned_char_pairs,
cohen_kappa,
compute_iaa,
compute_multirun_stability,
krippendorff_alpha,
)
# ──────────────────────────────────────────────────────────────────────────
# 1. cohen_kappa
# ──────────────────────────────────────────────────────────────────────────
class TestCohenKappa:
    """Checks ``cohen_kappa`` on agreeing, chance-level and degenerate inputs."""

    def test_perfect_agreement(self) -> None:
        """Two identical label sequences are expected to score exactly 1.0."""
        labels = ["a", "b", "c"]
        assert cohen_kappa(labels, list(labels)) == 1.0

    def test_total_disagreement_with_balanced_classes(self) -> None:
        """Every label swapped between the two raters is expected to give -1."""
        # 4 observations, 2 equiprobable classes: total disagreement is
        # reachable when A = [1, 1, 2, 2] and B = [2, 2, 1, 1].
        rater_a = [1, 1, 2, 2]
        rater_b = [2, 2, 1, 1]
        assert cohen_kappa(rater_a, rater_b) == pytest.approx(-1.0)

    def test_chance_level(self) -> None:
        """Observed agreement equal to expected agreement should give 0."""
        # A = [1, 2, 1, 2], B = [1, 1, 2, 2] -> po = 0.5, pe = 0.5 -> kappa = 0
        rater_a = [1, 2, 1, 2]
        rater_b = [1, 1, 2, 2]
        assert cohen_kappa(rater_a, rater_b) == pytest.approx(0.0)

    def test_size_mismatch(self) -> None:
        """Sequences of different lengths are expected to yield ``None``."""
        longer, shorter = [1, 2], [1]
        assert cohen_kappa(longer, shorter) is None

    def test_empty(self) -> None:
        """Two empty sequences are expected to yield ``None``."""
        no_labels_a: list = []
        no_labels_b: list = []
        assert cohen_kappa(no_labels_a, no_labels_b) is None

    def test_single_label_identical(self) -> None:
        """A single shared label on both sides is expected to score 1.0."""
        # pe = 1 makes kappa undefined; the identity convention gives 1.0.
        only_a = ["a", "a"]
        assert cohen_kappa(only_a, list(only_a)) == 1.0
# ──────────────────────────────────────────────────────────────────────────
# 2. krippendorff_alpha
# ──────────────────────────────────────────────────────────────────────────
class TestKrippendorffAlpha:
    """Checks ``krippendorff_alpha`` on lists of per-unit rater labels."""

    def test_perfect_agreement(self) -> None:
        """All raters agreeing on every unit is expected to give exactly 1.0."""
        unanimous_units = [
            ["a", "a", "a"],
            ["b", "b", "b"],
            ["c", "c", "c"],
        ]
        assert krippendorff_alpha(unanimous_units) == 1.0

    def test_handles_missing_values(self) -> None:
        """A ``None`` entry in a unit should not break perfect agreement."""
        # First unit has 2 valid ratings, second unit has 3.
        first_unit = ["a", "a", None]
        second_unit = ["b", "b", "b"]
        assert krippendorff_alpha([first_unit, second_unit]) == 1.0

    def test_single_label_returns_none(self) -> None:
        """A corpus containing one label only is expected to yield ``None``."""
        # Only one label in the whole corpus -> De = 0 -> undefined.
        constant_units = [["a", "a"], ["a", "a"]]
        assert krippendorff_alpha(constant_units) is None

    def test_empty_returns_none(self) -> None:
        """An empty list of units is expected to yield ``None``."""
        no_units: list = []
        assert krippendorff_alpha(no_units) is None

    def test_units_with_less_than_two_skipped(self) -> None:
        """Units with fewer than two valid ratings are expected to yield ``None``."""
        # Every unit here has fewer than 2 valid ratings.
        sparse_units = [["a"], ["b"], [None]]
        assert krippendorff_alpha(sparse_units) is None
# ──────────────────────────────────────────────────────────────────────────
# 3. compute_iaa
# ──────────────────────────────────────────────────────────────────────────
class TestComputeIAA:
    """Checks ``compute_iaa`` called on two transcription strings.

    ``compute_iaa`` returns ``None`` for some inputs (see the two
    ``*_returns_none`` tests), so every test that subscripts the result first
    asserts that it is not ``None``. A regression then fails with a readable
    assertion instead of a ``TypeError`` on ``None[...]``.
    """

    def test_identical_transcriptions(self) -> None:
        """Identical strings are expected to give kappa = 1 and agreement = 1."""
        result = compute_iaa("hello world", "hello world")
        assert result is not None
        assert result["cohen_kappa"] == 1.0
        assert result["agreement_rate"] == 1.0

    def test_partial_disagreement(self) -> None:
        """One differing character should put both metrics strictly in ]0, 1[."""
        result = compute_iaa("hello", "hallo")
        assert result is not None
        assert 0 < result["agreement_rate"] < 1
        # Check for None explicitly rather than coercing it to 0, so a missing
        # kappa is reported as missing and not as an out-of-range value.
        kappa = result["cohen_kappa"]
        assert kappa is not None
        assert 0 < kappa < 1

    def test_empty_returns_none(self) -> None:
        """Two empty strings are expected to yield ``None``."""
        assert compute_iaa("", "") is None

    def test_one_empty_returns_none(self) -> None:
        """One empty string is expected to yield ``None``."""
        # `_aligned_char_pairs` can only produce `insert` or `delete` opcodes
        # here -> no two-sided alignment.
        assert compute_iaa("abc", "") is None

    def test_returns_n_aligned_chars(self) -> None:
        """Five identical characters should be reported as 5 aligned chars."""
        result = compute_iaa("hello", "hello")
        # Same guard as the other tests in this class.
        assert result is not None
        assert result["n_aligned_chars"] == 5
# ──────────────────────────────────────────────────────────────────────────
# 4. compute_multirun_stability
# ──────────────────────────────────────────────────────────────────────────
class TestMultirunStability:
    """Checks ``compute_multirun_stability`` on lists of run outputs.

    The function returns ``None`` for a single run (first test below), so each
    test that reads keys from the result first asserts that it is not
    ``None``. A regression then fails with a readable assertion instead of a
    ``TypeError`` on ``None[...]``.
    """

    def test_single_run_returns_none(self) -> None:
        """A single run is expected to yield ``None``."""
        assert compute_multirun_stability(["hello"]) is None

    def test_three_identical_runs(self) -> None:
        """Three identical runs: zero disagreement and one distinct output."""
        result = compute_multirun_stability(
            ["hello world", "hello world", "hello world"],
        )
        assert result is not None
        assert result["n_runs"] == 3
        assert result["pairwise_disagreement_mean"] == 0.0
        assert result["pairwise_disagreement_max"] == 0.0
        assert result["identical_run_rate"] == 1.0
        assert result["n_distinct_outputs"] == 1
        # No reference supplied -> cer_per_run is None.
        assert result["cer_per_run"] is None

    def test_three_distinct_runs(self) -> None:
        """Three different runs: positive disagreement, no identical pair."""
        result = compute_multirun_stability(
            ["a b c", "a b d", "a c d"],
        )
        assert result is not None
        assert result["pairwise_disagreement_mean"] > 0
        assert result["identical_run_rate"] == 0.0
        assert result["n_distinct_outputs"] == 3

    def test_with_reference_computes_cer_metrics(self) -> None:
        """With a reference, the per-run CER list and its statistics are set."""
        result = compute_multirun_stability(
            ["hello world", "helo world", "hello word"],
            reference="hello world",
        )
        assert result is not None
        assert result["cer_per_run"] is not None
        assert len(result["cer_per_run"]) == 3
        assert result["cer_mean"] is not None
        assert result["cer_stdev"] is not None
        assert result["cer_cv"] is not None
        assert result["cer_cv"] > 0

    def test_with_reference_perfect_runs(self) -> None:
        """Runs equal to the reference: CER mean and stdev are 0, CV is None."""
        # 3 identical runs, all equal to the reference.
        result = compute_multirun_stability(
            ["abc"] * 3,
            reference="abc",
        )
        assert result is not None
        assert result["cer_mean"] == 0.0
        assert result["cer_stdev"] == 0.0
        # CV is undefined when the mean is 0 -> None.
        assert result["cer_cv"] is None

    def test_partial_identical_pairs(self) -> None:
        """One identical pair out of three gives an identical-run rate of 1/3."""
        # Run 1 == run 2, run 3 differs.
        result = compute_multirun_stability(
            ["a b c", "a b c", "x y z"],
        )
        assert result is not None
        # 3 pairs: (1, 2) identical, (1, 3) and (2, 3) different.
        assert result["identical_run_rate"] == pytest.approx(1.0 / 3.0)
# ──────────────────────────────────────────────────────────────────────────
# 5. _aligned_char_pairs
# ──────────────────────────────────────────────────────────────────────────
class TestAlignedCharPairs:
    """Checks the private helper ``_aligned_char_pairs`` on short strings."""

    def test_identical(self) -> None:
        """Identical strings should pair every character with itself, in order."""
        expected = [("a", "a"), ("b", "b"), ("c", "c")]
        assert _aligned_char_pairs("abc", "abc") == expected

    def test_substitution(self) -> None:
        """A substituted character should appear as a (left, right) pair."""
        aligned = _aligned_char_pairs("abc", "axc")
        assert ("b", "x") in aligned

    def test_insertion_skipped(self) -> None:
        """An inserted character should not produce a pair with an empty side."""
        aligned = _aligned_char_pairs("ac", "abc")
        # 'b' is inserted in the second string -> no two-sided pair for that
        # position.
        assert not any(left == "" or right == "" for left, right in aligned)
        # The shared aligned characters are 'a' and 'c'.
        for shared in (("a", "a"), ("c", "c")):
            assert shared in aligned

    def test_both_empty(self) -> None:
        """Two empty strings are expected to give an empty list of pairs."""
        aligned = _aligned_char_pairs("", "")
        assert aligned == []