Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

File size: 5,898 Bytes

"""Tests unitaires pour le module picarones.measurements.metrics."""

import pytest

from picarones.measurements.metrics import aggregate_metrics, compute_metrics, MetricsResult


class TestComputeMetrics:
    """Exercise compute_metrics on inputs whose expected scores are known."""

    def test_perfect_match(self):
        """CER and WER are both 0 when the hypothesis equals the reference."""
        result = compute_metrics("Bonjour le monde", "Bonjour le monde")
        assert result.cer == pytest.approx(0.0)
        assert result.wer == pytest.approx(0.0)
        assert result.error is None

    def test_complete_mismatch(self):
        """CER is 1.0 when every character of the reference is wrong."""
        result = compute_metrics("abc", "xyz")
        # Three substitutions over a three-character reference. A bare
        # `> 0.0` check would also accept a nearly perfect score.
        assert result.cer == pytest.approx(1.0)
        assert result.error is None

    def test_empty_reference(self):
        """Empty reference: CER is 1.0 when the hypothesis is not empty."""
        result = compute_metrics("", "quelque chose")
        assert result.cer == pytest.approx(1.0)

    def test_empty_both(self):
        """Empty reference and empty hypothesis: CER is 0.0."""
        result = compute_metrics("", "")
        assert result.cer == pytest.approx(0.0)

    def test_single_substitution(self):
        """A single substitution over 4 characters gives CER = 0.25."""
        result = compute_metrics("abcd", "abce")
        assert result.cer == pytest.approx(0.25)

    def test_case_insensitive_cer(self):
        """The caseless CER ignores differences in letter case."""
        result = compute_metrics("Bonjour", "bonjour")
        assert result.cer_caseless == pytest.approx(0.0)
        # The raw CER must stay > 0 because "B" differs from "b".
        assert result.cer > 0.0

    def test_nfc_normalization(self):
        """The NFC CER treats canonically equivalent Unicode sequences as equal."""
        # "é" can be encoded precomposed (U+00E9) or decomposed (e + U+0301).
        composed = "\u00e9"       # é (NFC)
        decomposed = "e\u0301"    # e + combining acute accent (NFD)
        result = compute_metrics(composed, decomposed)
        # After NFC normalization both strings are identical, so cer_nfc is 0.
        assert result.cer_nfc == pytest.approx(0.0)

    def test_wer_one_word_wrong(self):
        """WER is 1/3 when one word out of three is wrong."""
        result = compute_metrics("le chat dort", "le chien dort")
        assert result.wer == pytest.approx(1 / 3, rel=1e-2)

    def test_result_has_lengths(self):
        """reference_length matches len(reference); hypothesis_length is positive."""
        ref = "Texte de référence"
        result = compute_metrics(ref, "Texte différent")
        assert result.reference_length == len(ref)
        assert result.hypothesis_length > 0

    def test_metrics_result_as_dict(self):
        """as_dict() must expose every expected metric key."""
        d = compute_metrics("abc", "abc").as_dict()
        expected_keys = ["cer", "cer_nfc", "cer_caseless", "wer", "wer_normalized", "mer", "wil"]
        # Collect every absent key so one failure reports the full gap
        # instead of stopping at the first missing entry.
        missing = [key for key in expected_keys if key not in d]
        assert not missing, f"as_dict() is missing keys: {missing}"

    def test_cer_percent_property(self):
        """cer_percent reports 25.0 for one substitution over four characters."""
        result = compute_metrics("abcd", "abce")
        assert result.cer_percent == pytest.approx(25.0, rel=1e-2)

    # ── Regression tests: empty hypothesis → CER must be 1.0, not 0.0 (sprint 13 bug) ──

    def test_empty_hypothesis_cer_is_one(self):
        """An empty hypothesis against a non-empty reference gives CER = 1.0."""
        result = compute_metrics("Bonjour le monde", "")
        assert result.cer == pytest.approx(1.0), (
            f"CER attendu 1.0 pour hypothèse vide, obtenu {result.cer}"
        )
        assert result.error is None

    def test_empty_hypothesis_wer_is_one(self):
        """WER, MER and WIL are 1.0 for an empty hypothesis (no ZeroDivisionError)."""
        result = compute_metrics("hello world", "")
        assert result.wer == pytest.approx(1.0)
        assert result.mer == pytest.approx(1.0)
        assert result.wil == pytest.approx(1.0)
        assert result.error is None

    def test_empty_hypothesis_whitespace_is_treated_as_empty(self):
        """A hypothesis made only of spaces is scored like an empty one."""
        result = compute_metrics("Bonjour", "   ")
        assert result.cer == pytest.approx(1.0)

    def test_empty_hypothesis_hypothesis_length_is_zero(self):
        """hypothesis_length must be 0 for an empty hypothesis."""
        result = compute_metrics("Bonjour le monde", "")
        assert result.hypothesis_length == 0


class TestAggregateMetrics:
    """Checks for aggregate_metrics over hand-built MetricsResult lists."""

    # Rate fields of MetricsResult that these tests fill with one shared value.
    _RATE_FIELDS = ("cer", "cer_nfc", "cer_caseless", "wer", "wer_normalized", "mer", "wil")

    def _make_result(self, cer: float) -> MetricsResult:
        """Build a result with every rate set to ``cer``, both lengths at 100, and no error given."""
        rates = dict.fromkeys(self._RATE_FIELDS, cer)
        return MetricsResult(reference_length=100, hypothesis_length=100, **rates)

    def test_empty_list(self):
        """Aggregating an empty list yields an empty dict."""
        summary = aggregate_metrics([])
        assert summary == {}

    def test_single_result(self):
        """With a single result, mean, min and max of CER all equal its value."""
        cer_stats = aggregate_metrics([self._make_result(0.1)])["cer"]
        for stat_name in ("mean", "min", "max"):
            assert cer_stats[stat_name] == pytest.approx(0.1)

    def test_multiple_results(self):
        """Two results are averaged; both are counted and none is marked failed."""
        batch = [self._make_result(value) for value in (0.1, 0.3)]
        summary = aggregate_metrics(batch)
        assert summary["cer"]["mean"] == pytest.approx(0.2)
        assert summary["document_count"] == 2
        assert summary["failed_count"] == 0

    def test_failed_results_excluded(self):
        """A result carrying an error is counted as failed and left out of the mean."""
        succeeded = self._make_result(0.1)
        errored = MetricsResult(
            error="Moteur en erreur",
            reference_length=50,
            hypothesis_length=0,
            **dict.fromkeys(self._RATE_FIELDS, 1.0),
        )
        summary = aggregate_metrics([succeeded, errored])
        # Only error-free results contribute to the aggregated metrics.
        assert summary["cer"]["mean"] == pytest.approx(0.1)
        assert summary["failed_count"] == 1