Picarones / tests /test_sprint12_nouvelles_fonctionnalites.py
Claude
fix: résoudre les 64 erreurs ruff pré-existantes révélées par le lint actif
6362212 unverified
Raw
History Blame
10 kB
"""Tests pour les nouvelles fonctionnalités du sprint 12 :
1. Filtrage des fichiers cachés macOS (._*) dans corpus et ZIP
2. Profils de normalisation avec exclusion de caractères
3. Vue Analyses — Chart.js inline (plus de CDN)
4. Métriques robustes dans le rapport HTML
"""
from __future__ import annotations
import io
import zipfile
import pytest
# ---------------------------------------------------------------------------
# 1. Filtrage des fichiers cachés macOS
# ---------------------------------------------------------------------------
# Small PNG-like byte stream written to disk wherever a test needs an image
# file. It starts with the PNG signature followed by an IHDR header declaring
# a 1x1 image (8-bit depth, colour type 2), then IDAT and IEND markers.
# NOTE(review): as the name says this is a stand-in; it is not guaranteed to
# decode as a valid image.
FAKE_PNG = (
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
)
class TestMacOSHiddenFilesFiltering:
    """Hidden macOS files must be ignored by corpus loading and ZIP flattening."""

    def test_hidden_images_ignored_in_corpus(self, tmp_path):
        """Files named ``._*`` must not be counted as valid images."""
        from picarones.core.corpus import load_corpus_from_directory

        # One genuine image together with its ground-truth transcription.
        image_file = tmp_path / "page_001.png"
        image_file.write_bytes(FAKE_PNG)
        gt_file = tmp_path / "page_001.gt.txt"
        gt_file.write_text("Texte réel", encoding="utf-8")

        # macOS side files (AppleDouble and .DS_Store), none with ground truth.
        hidden_files = (
            ("._page_001.png", b"\x00\x05\x16\x07"),
            (".DS_Store", b"\x00\x00\x00\x01Bud1"),
        )
        for hidden_name, payload in hidden_files:
            (tmp_path / hidden_name).write_bytes(payload)

        loaded = load_corpus_from_directory(tmp_path)

        assert len(loaded) == 1
        assert loaded.documents[0].doc_id == "page_001"

    def test_hidden_files_not_extracted_from_zip(self, tmp_path):
        """``_flatten_zip_to_dir`` must skip ``._*`` entries found in the ZIP."""
        from picarones.web.app import _flatten_zip_to_dir

        # Archive members in the order they are written: two real files,
        # then the same AppleDouble payload at the root and under __MACOSX/.
        members = {
            "page_001.png": FAKE_PNG,
            "page_001.gt.txt": "Texte réel",
            "._page_001.png": b"\x00\x05\x16\x07",
            "__MACOSX/._page_001.png": b"\x00\x05\x16\x07",
        }
        archive_bytes = io.BytesIO()
        with zipfile.ZipFile(archive_bytes, "w") as writer:
            for arcname, content in members.items():
                writer.writestr(arcname, content)
        archive_bytes.seek(0)

        target_dir = tmp_path / "corpus"
        target_dir.mkdir()
        with zipfile.ZipFile(archive_bytes) as reader:
            _flatten_zip_to_dir(reader, target_dir)

        extracted_names = {entry.name for entry in target_dir.iterdir()}
        assert "._page_001.png" not in extracted_names
        assert "page_001.png" in extracted_names
        assert "page_001.gt.txt" in extracted_names
# ---------------------------------------------------------------------------
# 2. Profils de normalisation avec exclusion de caractères
# ---------------------------------------------------------------------------
class TestExcludeCharsNormalization:
    """Character-exclusion support in normalization profiles, metrics and runner."""

    def test_parse_exclude_chars_from_comma_string(self):
        """A comma-separated specification yields each listed character."""
        from picarones.core.normalization import _parse_exclude_chars

        parsed = _parse_exclude_chars("', -, –")
        for expected_char in ("'", "-", "–"):
            assert expected_char in parsed

    def test_parse_exclude_chars_from_plain_string(self):
        """A plain run of characters yields its individual characters."""
        from picarones.core.normalization import _parse_exclude_chars

        parsed = _parse_exclude_chars(".,;:!?")
        for expected_char in (".", ",", "?"):
            assert expected_char in parsed

    def test_parse_exclude_chars_empty(self):
        """Both an empty string and ``None`` give an empty frozenset."""
        from picarones.core.normalization import _parse_exclude_chars

        for empty_spec in ("", None):
            assert _parse_exclude_chars(empty_spec) == frozenset()

    def test_normalize_strips_excluded_chars(self):
        """Excluded characters are removed by ``NormalizationProfile.normalize``."""
        from picarones.core.normalization import NormalizationProfile

        excluded = frozenset({".", ","})
        profile = NormalizationProfile(name="test", exclude_chars=excluded)

        normalized = profile.normalize("Bonjour, monde.")
        assert normalized == "Bonjour monde"

    def test_sans_ponctuation_profile_exists(self):
        """The ``sans_ponctuation`` profile is registered and excludes punctuation."""
        from picarones.core.normalization import NORMALIZATION_PROFILES

        assert "sans_ponctuation" in NORMALIZATION_PROFILES
        excluded = NORMALIZATION_PROFILES["sans_ponctuation"].exclude_chars
        for punctuation_mark in (".", ",", "?"):
            assert punctuation_mark in excluded

    def test_sans_apostrophes_profile_exists(self):
        """The ``sans_apostrophes`` profile is registered and excludes apostrophes."""
        from picarones.core.normalization import NORMALIZATION_PROFILES

        assert "sans_apostrophes" in NORMALIZATION_PROFILES
        excluded = NORMALIZATION_PROFILES["sans_apostrophes"].exclude_chars
        assert "'" in excluded
        # U+2019 is the typographic (curly) apostrophe.
        assert "\u2019" in excluded

    def test_compute_metrics_with_char_exclude(self):
        """Excluding the differing characters brings the CER down to zero."""
        from picarones.core.metrics import compute_metrics

        reference = "Bonjour, monde!"
        hypothesis = "Bonjour monde"

        # Without exclusion the missing comma and "!" make the CER positive.
        baseline = compute_metrics(reference, hypothesis)
        assert baseline.cer > 0

        # With comma, "!" and space excluded, both texts are expected to
        # reduce to the same string ("Bonjourmonde"), hence a CER of 0.
        ignored = frozenset([",", "!", " "])
        filtered = compute_metrics(reference, hypothesis, char_exclude=ignored)
        assert filtered.cer == 0.0

    def test_char_exclude_propagated_in_run_benchmark(self, tmp_path):
        """``char_exclude`` passed to ``run_benchmark`` must not increase the CER."""
        from picarones.core.corpus import Corpus, Document
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine, EngineResult

        class MockEngine(BaseOCREngine):
            """Engine stub that always returns the same transcription."""

            name = "mock"
            version = "0.0"

            def _run_ocr(self, image_path):
                return EngineResult(text="Bonjour monde", success=True)

        def _first_document_cer(benchmark_result):
            # CER of the first document of the first engine report.
            first_report = benchmark_result.engine_reports[0]
            return first_report.document_results[0].metrics.cer

        image_file = tmp_path / "page.png"
        document = Document(image_path=image_file, ground_truth="Bonjour, monde!")
        image_file.write_bytes(FAKE_PNG)
        corpus = Corpus(name="test", documents=[document])

        baseline_run = run_benchmark(corpus, [MockEngine()])
        baseline_cer = _first_document_cer(baseline_run)

        filtered_run = run_benchmark(
            corpus,
            [MockEngine()],
            char_exclude=frozenset([",", "!"]),
        )
        filtered_cer = _first_document_cer(filtered_run)

        assert filtered_cer <= baseline_cer
# ---------------------------------------------------------------------------
# 3. Vue Analyses — Chart.js inline
# ---------------------------------------------------------------------------
class TestChartJsInline:
    """The HTML report must embed its charting code instead of loading a CDN."""

    @staticmethod
    def _render_html(generator, directory):
        """Generate ``rapport.html`` inside *directory* and return its text."""
        report_path = directory / "rapport.html"
        generator.generate(report_path)
        return report_path.read_text(encoding="utf-8")

    def test_chartjs_embedded_inline(self, sample_generator, tmp_path):
        """The HTML report must ship Chart.js inline (no CDN reference)."""
        markup = self._render_html(sample_generator, tmp_path)

        assert "cdnjs.cloudflare.com/ajax/libs/Chart.js" not in markup
        has_library_banner = "Chart.js v" in markup
        assert has_library_banner or "new Chart(" in markup

    def test_no_diff2html_cdn(self, sample_generator, tmp_path):
        """The report must no longer mention diff2html (CDN removed)."""
        markup = self._render_html(sample_generator, tmp_path)

        assert "diff2html" not in markup

    def test_build_charts_function_present(self, sample_generator, tmp_path):
        """The chart-building JavaScript entry points appear in the report."""
        markup = self._render_html(sample_generator, tmp_path)

        for js_symbol in ("function buildCharts()", "buildCerHistogram", "buildRadar"):
            assert js_symbol in markup
@pytest.fixture
def sample_generator():
    """Shared fixture: build a ``ReportGenerator`` over made-up benchmark data.

    The benchmark holds a single "tesseract" engine report with three
    documents whose metric values are 0.10, 0.11 and 0.12 respectively.
    """
    from picarones.report.generator import ReportGenerator
    from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
    from picarones.core.metrics import MetricsResult

    def _uniform_metrics(rate=0.1):
        # Every rate-like field receives the same value; lengths are fixed.
        return MetricsResult(
            cer=rate,
            cer_nfc=rate,
            cer_caseless=rate,
            wer=rate,
            wer_normalized=rate,
            mer=rate,
            wil=rate,
            reference_length=100,
            hypothesis_length=100,
        )

    document_results = []
    for index in range(3):
        document_results.append(
            DocumentResult(
                doc_id=f"doc_{index}",
                image_path="",
                ground_truth="GT text",
                hypothesis="Hyp text",
                metrics=_uniform_metrics(0.1 + index * 0.01),
                duration_seconds=0.1,
            )
        )

    engine_report = EngineReport(
        engine_name="tesseract",
        engine_version="5.0",
        engine_config={},
        document_results=document_results,
    )
    benchmark = BenchmarkResult(
        corpus_name="TestCorpus",
        corpus_source=None,
        document_count=3,
        engine_reports=[engine_report],
    )
    return ReportGenerator(benchmark)
# ---------------------------------------------------------------------------
# 4. Métriques robustes — présence dans le rapport HTML
# ---------------------------------------------------------------------------
class TestRobustMetrics:
    """The robust-metrics card and its JavaScript must be present in the report."""

    def test_robust_metrics_card_present(self, sample_generator, tmp_path):
        """The robust-metrics card markers must appear in the generated report."""
        out = tmp_path / "rapport.html"
        sample_generator.generate(out)
        html = out.read_text(encoding="utf-8")

        assert "robust-metrics-card" in html
        assert "robust-anchor" in html
        assert "robust-ratio" in html
        assert "renderRobustMetrics" in html

    def test_robust_metrics_js_syntax_valid(self, sample_generator, tmp_path):
        """The application script must pass ``node --check`` (no JS SyntaxError).

        The test is skipped when no ``node`` executable is found on PATH, so
        that environments without Node.js do not report a spurious error.
        """
        import re
        import shutil
        import subprocess

        # Resolve the interpreter first: calling a missing executable would
        # raise FileNotFoundError and make the test error out instead of skip.
        node = shutil.which("node")
        if node is None:
            pytest.skip("Node.js introuvable : vérification de la syntaxe JS ignorée")

        out = tmp_path / "rapport.html"
        sample_generator.generate(out)
        html = out.read_text(encoding="utf-8")

        # Only attribute-less <script> tags (inline code) are matched.
        scripts = re.findall(r"<script>(.*?)</script>", html, re.DOTALL)
        # Fail with a readable message rather than an IndexError below.
        assert scripts, "Aucun bloc <script> inline trouvé dans le rapport"

        # The application code is the last inline script of the page.
        app_js = tmp_path / "app.js"
        app_js.write_text(scripts[-1], encoding="utf-8")

        result = subprocess.run(
            [node, "--check", str(app_js)],
            capture_output=True, text=True,
        )
        assert result.returncode == 0, f"Erreur JS : {result.stderr}"