Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 10,016 Bytes

df4f47c

"""Tests pour les nouvelles fonctionnalités du sprint 12 :
1. Filtrage des fichiers cachés macOS (._*) dans corpus et ZIP
2. Profils de normalisation avec exclusion de caractères
3. Vue Analyses — Chart.js inline (plus de CDN)
4. Métriques robustes dans le rapport HTML
"""

from __future__ import annotations

import io
import zipfile

import pytest


# ---------------------------------------------------------------------------
# 1. Filtrage des fichiers cachés macOS
# ---------------------------------------------------------------------------

# Small PNG byte string used as stand-in image content by the tests below.
# It opens with the 8-byte PNG signature, followed by an IHDR chunk declaring
# a 1x1 image, one IDAT chunk and the closing IEND chunk.
FAKE_PNG = (
    b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
    b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
    b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
    b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
)


class TestMacOSHiddenFilesFiltering:
    """Checks that macOS metadata files stay out of corpora and ZIP extractions."""

    def test_hidden_images_ignored_in_corpus(self, tmp_path):
        """``._*`` files must not be counted as valid corpus images."""
        from picarones.core.corpus import load_corpus_from_directory

        # A genuine image together with its ground-truth transcription.
        real_image = tmp_path / "page_001.png"
        real_image.write_bytes(FAKE_PNG)
        transcription = tmp_path / "page_001.gt.txt"
        transcription.write_text("Texte réel", encoding="utf-8")

        # macOS metadata files (AppleDouble and .DS_Store), without ground truth.
        hidden_payloads = (
            ("._page_001.png", b"\x00\x05\x16\x07"),
            (".DS_Store", b"\x00\x00\x00\x01Bud1"),
        )
        for hidden_name, payload in hidden_payloads:
            (tmp_path / hidden_name).write_bytes(payload)

        loaded = load_corpus_from_directory(tmp_path)
        assert len(loaded) == 1
        assert loaded.documents[0].doc_id == "page_001"

    def test_hidden_files_not_extracted_from_zip(self, tmp_path):
        """``_flatten_zip_to_dir`` must ignore the ``._*`` entries of a ZIP."""
        from picarones.web.app import _flatten_zip_to_dir

        # Archive members: two legitimate files followed by two AppleDouble
        # entries (one at the root, one under the __MACOSX/ folder).
        members = (
            ("page_001.png", FAKE_PNG),
            ("page_001.gt.txt", "Texte réel"),
            ("._page_001.png", b"\x00\x05\x16\x07"),
            ("__MACOSX/._page_001.png", b"\x00\x05\x16\x07"),
        )
        archive = io.BytesIO()
        with zipfile.ZipFile(archive, "w") as writer:
            for member_name, payload in members:
                writer.writestr(member_name, payload)

        # Rewind so the in-memory archive can be read back from the start.
        archive.seek(0)
        target_dir = tmp_path / "corpus"
        target_dir.mkdir()
        with zipfile.ZipFile(archive) as reader:
            _flatten_zip_to_dir(reader, target_dir)

        extracted_names = {entry.name for entry in target_dir.iterdir()}
        assert "._page_001.png" not in extracted_names
        assert "page_001.png" in extracted_names
        assert "page_001.gt.txt" in extracted_names


# ---------------------------------------------------------------------------
# 2. Profils de normalisation avec exclusion de caractères
# ---------------------------------------------------------------------------

class TestExcludeCharsNormalization:
    """Checks character exclusion in normalization profiles, metrics and the runner."""

    def test_parse_exclude_chars_from_comma_string(self):
        """A comma-separated specification yields each listed character."""
        from picarones.core.normalization import _parse_exclude_chars

        parsed = _parse_exclude_chars("', -, –")
        for expected_char in ("'", "-", "–"):
            assert expected_char in parsed

    def test_parse_exclude_chars_from_plain_string(self):
        """A plain run of punctuation yields its individual characters."""
        from picarones.core.normalization import _parse_exclude_chars

        parsed = _parse_exclude_chars(".,;:!?")
        for expected_char in (".", ",", "?"):
            assert expected_char in parsed

    def test_parse_exclude_chars_empty(self):
        """An empty string and ``None`` both give an empty frozenset."""
        from picarones.core.normalization import _parse_exclude_chars

        for empty_spec in ("", None):
            assert _parse_exclude_chars(empty_spec) == frozenset()

    def test_normalize_strips_excluded_chars(self):
        """``normalize`` drops the characters listed in ``exclude_chars``."""
        from picarones.core.normalization import NormalizationProfile

        excluded = frozenset({".", ","})
        profile = NormalizationProfile(name="test", exclude_chars=excluded)
        normalized = profile.normalize("Bonjour, monde.")
        assert normalized == "Bonjour monde"

    def test_sans_ponctuation_profile_exists(self):
        """The ``sans_ponctuation`` profile is registered and excludes punctuation."""
        from picarones.core.normalization import NORMALIZATION_PROFILES

        assert "sans_ponctuation" in NORMALIZATION_PROFILES
        profile = NORMALIZATION_PROFILES["sans_ponctuation"]
        for mark in (".", ",", "?"):
            assert mark in profile.exclude_chars

    def test_sans_apostrophes_profile_exists(self):
        """The ``sans_apostrophes`` profile is registered and excludes apostrophes."""
        from picarones.core.normalization import NORMALIZATION_PROFILES

        assert "sans_apostrophes" in NORMALIZATION_PROFILES
        profile = NORMALIZATION_PROFILES["sans_apostrophes"]
        # Straight apostrophe first, then the typographic one (U+2019).
        for apostrophe in ("'", "\u2019"):
            assert apostrophe in profile.exclude_chars

    def test_compute_metrics_with_char_exclude(self):
        """Excluding the differing characters brings the CER down to zero."""
        from picarones.core.metrics import compute_metrics

        reference = "Bonjour, monde!"
        hypothesis = "Bonjour monde"

        # Without exclusion the comma and the "!" are missing, so CER > 0.
        baseline = compute_metrics(reference, hypothesis)
        assert baseline.cer > 0

        # With the comma, "!" and space excluded, both texts reduce to the
        # same string ("Bonjourmonde"), so the CER must be exactly zero.
        ignored = frozenset({",", "!", " "})
        filtered = compute_metrics(reference, hypothesis, char_exclude=ignored)
        assert filtered.cer == 0.0

    def test_char_exclude_propagated_in_run_benchmark(self, tmp_path):
        """``run_benchmark`` accepts ``char_exclude`` and the CER must not increase."""
        from picarones.core.corpus import Corpus, Document
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine, EngineResult

        class MockEngine(BaseOCREngine):
            """Engine stub returning a fixed transcription for every image."""

            name = "mock"
            version = "0.0"

            def _run_ocr(self, image_path):
                return EngineResult(text="Bonjour monde", success=True)

        def first_document_cer(benchmark):
            """Return the CER of the first document of the first engine report."""
            return benchmark.engine_reports[0].document_results[0].metrics.cer

        image_file = tmp_path / "page.png"
        document = Document(image_path=image_file, ground_truth="Bonjour, monde!")
        image_file.write_bytes(FAKE_PNG)
        corpus = Corpus(name="test", documents=[document])

        cer_raw = first_document_cer(run_benchmark(corpus, [MockEngine()]))
        cer_excl = first_document_cer(
            run_benchmark(corpus, [MockEngine()], char_exclude=frozenset({",", "!"}))
        )

        # NOTE(review): the comparison is non-strict, so it also holds when both
        # runs yield the same CER — confirm whether a strict "<" is intended.
        assert cer_excl <= cer_raw


# ---------------------------------------------------------------------------
# 3. Vue Analyses — Chart.js inline
# ---------------------------------------------------------------------------

class TestChartJsInline:
    """Checks how charting code is embedded in the generated HTML report."""

    @staticmethod
    def _read_generated_report(generator, directory):
        """Generate ``rapport.html`` inside *directory* and return its text."""
        report_path = directory / "rapport.html"
        generator.generate(report_path)
        return report_path.read_text(encoding="utf-8")

    def test_chartjs_embedded_inline(self, sample_generator, tmp_path):
        """The HTML report must embed Chart.js inline (no CDN reference)."""
        html = self._read_generated_report(sample_generator, tmp_path)

        assert "cdnjs.cloudflare.com/ajax/libs/Chart.js" not in html
        # Either marker is accepted as evidence of inlined Chart.js usage.
        inline_markers = ("Chart.js v", "new Chart(")
        assert any(marker in html for marker in inline_markers)

    def test_no_diff2html_cdn(self, sample_generator, tmp_path):
        """The report must no longer mention diff2html (CDN removed)."""
        html = self._read_generated_report(sample_generator, tmp_path)

        assert "diff2html" not in html

    def test_build_charts_function_present(self, sample_generator, tmp_path):
        """The chart-building JavaScript entry points appear in the report."""
        html = self._read_generated_report(sample_generator, tmp_path)

        for snippet in ("function buildCharts()", "buildCerHistogram", "buildRadar"):
            assert snippet in html


@pytest.fixture
def sample_generator():
    """Shared fixture: build a ReportGenerator fed with made-up benchmark data.

    The benchmark holds a single "tesseract" engine report made of three
    documents whose metrics are all set to 0.10, 0.11 and 0.12 respectively.
    """
    from picarones.report.generator import ReportGenerator
    from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
    from picarones.core.metrics import MetricsResult

    doc_total = 3

    def _uniform_metrics(cer=0.1):
        """Return a MetricsResult whose rate fields all equal *cer*."""
        rate_fields = (
            "cer", "cer_nfc", "cer_caseless",
            "wer", "wer_normalized", "mer", "wil",
        )
        rates = dict.fromkeys(rate_fields, cer)
        return MetricsResult(reference_length=100, hypothesis_length=100, **rates)

    def _document(index):
        """Return the fake DocumentResult number *index*."""
        return DocumentResult(
            doc_id=f"doc_{index}",
            image_path="",
            ground_truth="GT text",
            hypothesis="Hyp text",
            metrics=_uniform_metrics(0.1 + index * 0.01),
            duration_seconds=0.1,
        )

    engine_report = EngineReport(
        engine_name="tesseract",
        engine_version="5.0",
        engine_config={},
        document_results=[_document(index) for index in range(doc_total)],
    )
    benchmark = BenchmarkResult(
        corpus_name="TestCorpus",
        corpus_source=None,
        document_count=doc_total,
        engine_reports=[engine_report],
    )
    return ReportGenerator(benchmark)


# ---------------------------------------------------------------------------
# 4. Métriques robustes — présence dans le rapport HTML
# ---------------------------------------------------------------------------

class TestRobustMetrics:
    """Checks that the robust-metrics card is rendered in the HTML report."""

    def test_robust_metrics_card_present(self, sample_generator, tmp_path):
        """The robust-metrics card must be present in the report."""
        out = tmp_path / "rapport.html"
        sample_generator.generate(out)
        html = out.read_text(encoding="utf-8")

        assert "robust-metrics-card" in html
        assert "robust-anchor" in html
        assert "robust-ratio" in html
        assert "renderRobustMetrics" in html

    def test_robust_metrics_js_syntax_valid(self, sample_generator, tmp_path):
        """The renderRobustMetrics function must not introduce a JS SyntaxError.

        The last inline ``<script>`` block of the report is written to a file
        and checked with ``node --check``. The test is skipped when no
        ``node`` executable can be found on PATH.
        """
        import re
        import shutil
        import subprocess

        # Without Node.js, subprocess.run would raise FileNotFoundError and the
        # test would error out for a reason unrelated to the report itself.
        node = shutil.which("node")
        if node is None:
            pytest.skip("Node.js ('node') is not available on PATH")

        out = tmp_path / "rapport.html"
        sample_generator.generate(out)
        html = out.read_text(encoding="utf-8")

        scripts = re.findall(r"<script>(.*?)</script>", html, re.DOTALL)
        # Fail with a readable message instead of an IndexError on scripts[-1].
        assert scripts, "No inline <script> block found in the generated report"

        # The application block is the last script of the page.
        app_js = tmp_path / "app.js"
        app_js.write_text(scripts[-1], encoding="utf-8")

        result = subprocess.run(
            [node, "--check", str(app_js)],
            capture_output=True, text=True,
        )
        assert result.returncode == 0, f"Erreur JS : {result.stderr}"