Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /test_sprint12_nouvelles_fonctionnalites.py

Claude

fix: résoudre les 64 erreurs ruff pré-existantes révélées par le lint actif

6362212 unverified 2 months ago

10 kB

	"""Tests pour les nouvelles fonctionnalités du sprint 12 :
	1. Filtrage des fichiers cachés macOS (._*) dans corpus et ZIP
	2. Profils de normalisation avec exclusion de caractères
	3. Vue Analyses — Chart.js inline (plus de CDN)
	4. Métriques robustes dans le rapport HTML
	"""

	from __future__ import annotations

	import io
	import zipfile

	import pytest


	# ---------------------------------------------------------------------------
	# 1. Filtrage des fichiers cachés macOS
	# ---------------------------------------------------------------------------

	# Minimal PNG byte string used as a stand-in image file by the tests in this
	# module: the 8-byte PNG signature followed by an IHDR chunk (width 1,
	# height 1), one IDAT chunk and the closing IEND chunk.
	FAKE_PNG = (
	b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
	b"\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00"
	b"\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18"
	b"\xd8N\x00\x00\x00\x00IEND\xaeB`\x82"
	)


	class TestMacOSHiddenFilesFiltering:
	    """Hidden macOS files must be ignored by corpus loading and ZIP import."""

	    def test_hidden_images_ignored_in_corpus(self, tmp_path):
	        """``._*`` files must not be counted as valid images."""
	        from picarones.core.corpus import load_corpus_from_directory

	        # Binary files dropped into the corpus directory: one real image plus
	        # the AppleDouble / Finder files macOS adds (no ground truth for those).
	        binary_files = {
	            "page_001.png": FAKE_PNG,
	            "._page_001.png": b"\x00\x05\x16\x07",
	            ".DS_Store": b"\x00\x00\x00\x01Bud1",
	        }
	        for filename, payload in binary_files.items():
	            (tmp_path / filename).write_bytes(payload)

	        # Ground truth for the real image only.
	        (tmp_path / "page_001.gt.txt").write_text("Texte réel", encoding="utf-8")

	        corpus = load_corpus_from_directory(tmp_path)

	        assert len(corpus) == 1
	        first_document = corpus.documents[0]
	        assert first_document.doc_id == "page_001"

	    def test_hidden_files_not_extracted_from_zip(self, tmp_path):
	        """``_flatten_zip_to_dir`` must ignore ``._*`` entries in the ZIP."""
	        from picarones.web.app import _flatten_zip_to_dir

	        # Archive members in insertion order: two real files, then the
	        # AppleDouble entries (top-level and under __MACOSX/).
	        members = {
	            "page_001.png": FAKE_PNG,
	            "page_001.gt.txt": "Texte réel",
	            "._page_001.png": b"\x00\x05\x16\x07",
	            "__MACOSX/._page_001.png": b"\x00\x05\x16\x07",
	        }
	        archive_bytes = io.BytesIO()
	        with zipfile.ZipFile(archive_bytes, "w") as writer:
	            for member_name, content in members.items():
	                writer.writestr(member_name, content)

	        archive_bytes.seek(0)
	        target_dir = tmp_path / "corpus"
	        target_dir.mkdir()
	        with zipfile.ZipFile(archive_bytes) as reader:
	            _flatten_zip_to_dir(reader, target_dir)

	        extracted = {entry.name for entry in target_dir.iterdir()}
	        assert "._page_001.png" not in extracted
	        for expected_name in ("page_001.png", "page_001.gt.txt"):
	            assert expected_name in extracted


	# ---------------------------------------------------------------------------
	# 2. Profils de normalisation avec exclusion de caractères
	# ---------------------------------------------------------------------------

	class TestExcludeCharsNormalization:
	    """Tests for character exclusion in normalization profiles and metrics."""

	    def test_parse_exclude_chars_from_comma_string(self):
	        """A comma-separated specification yields each listed character."""
	        from picarones.core.normalization import _parse_exclude_chars

	        result = _parse_exclude_chars("', -, –")
	        assert "'" in result
	        assert "-" in result
	        assert "–" in result

	    def test_parse_exclude_chars_from_plain_string(self):
	        """A plain run of characters yields the punctuation marks it contains."""
	        from picarones.core.normalization import _parse_exclude_chars

	        result = _parse_exclude_chars(".,;:!?")
	        assert "." in result
	        assert "," in result
	        assert "?" in result

	    def test_parse_exclude_chars_empty(self):
	        """Both the empty string and ``None`` produce an empty frozenset."""
	        from picarones.core.normalization import _parse_exclude_chars

	        assert _parse_exclude_chars("") == frozenset()
	        assert _parse_exclude_chars(None) == frozenset()

	    def test_normalize_strips_excluded_chars(self):
	        """``normalize`` removes every character listed in ``exclude_chars``."""
	        from picarones.core.normalization import NormalizationProfile

	        profile = NormalizationProfile(
	            name="test",
	            exclude_chars=frozenset([".", ","]),
	        )
	        assert profile.normalize("Bonjour, monde.") == "Bonjour monde"

	    def test_sans_ponctuation_profile_exists(self):
	        """The ``sans_ponctuation`` profile is registered and excludes punctuation."""
	        from picarones.core.normalization import NORMALIZATION_PROFILES

	        assert "sans_ponctuation" in NORMALIZATION_PROFILES
	        p = NORMALIZATION_PROFILES["sans_ponctuation"]
	        assert "." in p.exclude_chars
	        assert "," in p.exclude_chars
	        assert "?" in p.exclude_chars

	    def test_sans_apostrophes_profile_exists(self):
	        """The ``sans_apostrophes`` profile is registered and excludes apostrophes."""
	        from picarones.core.normalization import NORMALIZATION_PROFILES

	        assert "sans_apostrophes" in NORMALIZATION_PROFILES
	        p = NORMALIZATION_PROFILES["sans_apostrophes"]
	        assert "'" in p.exclude_chars
	        assert "\u2019" in p.exclude_chars  # typographic apostrophe

	    def test_compute_metrics_with_char_exclude(self):
	        """Excluding the differing characters brings the CER down to exactly 0."""
	        from picarones.core.metrics import compute_metrics

	        ref = "Bonjour, monde!"
	        hyp = "Bonjour monde"
	        # Without exclusion, CER > 0 (the comma and "!" are missing).
	        metrics_raw = compute_metrics(ref, hyp)
	        assert metrics_raw.cer > 0

	        # With ",", "!" and " " excluded, both texts reduce to "Bonjourmonde",
	        # so the CER must be exactly 0.
	        metrics_excl = compute_metrics(ref, hyp, char_exclude=frozenset([",", "!", " "]))
	        assert metrics_excl.cer == 0.0

	    def test_char_exclude_propagated_in_run_benchmark(self, tmp_path):
	        """char_exclude must be passed through run_benchmark and reduce the CER."""
	        from picarones.core.corpus import Corpus, Document
	        from picarones.core.runner import run_benchmark
	        from picarones.engines.base import BaseOCREngine, EngineResult

	        class MockEngine(BaseOCREngine):
	            """Engine stub that always returns the punctuation-free text."""

	            name = "mock"
	            version = "0.0"

	            def _run_ocr(self, image_path):
	                return EngineResult(text="Bonjour monde", success=True)

	        doc = Document(image_path=tmp_path / "page.png", ground_truth="Bonjour, monde!")
	        (tmp_path / "page.png").write_bytes(FAKE_PNG)
	        corpus = Corpus(name="test", documents=[doc])

	        result_raw = run_benchmark(corpus, [MockEngine()])
	        cer_raw = result_raw.engine_reports[0].document_results[0].metrics.cer

	        result_excl = run_benchmark(corpus, [MockEngine()], char_exclude=frozenset([",", "!"]))
	        cer_excl = result_excl.engine_reports[0].document_results[0].metrics.cer

	        # Precondition: the missing punctuation must cost something without
	        # exclusion, otherwise the comparison below proves nothing.
	        assert cer_raw > 0
	        # Strict inequality on purpose: "<=" would also pass if run_benchmark
	        # silently ignored char_exclude (both CER values would then be equal).
	        assert cer_excl < cer_raw


	# ---------------------------------------------------------------------------
	# 3. Vue Analyses — Chart.js inline
	# ---------------------------------------------------------------------------

	class TestChartJsInline:
	    """The generated HTML report must be self-contained for its charts."""

	    @staticmethod
	    def _rendered_html(generator, workdir):
	        """Generate the report into *workdir* and return its HTML text."""
	        report_path = workdir / "rapport.html"
	        generator.generate(report_path)
	        return report_path.read_text(encoding="utf-8")

	    def test_chartjs_embedded_inline(self, sample_generator, tmp_path):
	        """The HTML report must embed Chart.js inline (no CDN)."""
	        html = self._rendered_html(sample_generator, tmp_path)

	        assert "cdnjs.cloudflare.com/ajax/libs/Chart.js" not in html
	        # Either the library banner or a chart instantiation must be present.
	        assert any(marker in html for marker in ("Chart.js v", "new Chart("))

	    def test_no_diff2html_cdn(self, sample_generator, tmp_path):
	        """The report must no longer reference diff2html (CDN removed)."""
	        html = self._rendered_html(sample_generator, tmp_path)

	        assert "diff2html" not in html

	    def test_build_charts_function_present(self, sample_generator, tmp_path):
	        """The chart-building JavaScript entry points appear in the report."""
	        html = self._rendered_html(sample_generator, tmp_path)

	        for snippet in ("function buildCharts()", "buildCerHistogram", "buildRadar"):
	            assert snippet in html


	@pytest.fixture
	def sample_generator():
	    """Shared fixture: build a ReportGenerator backed by dummy benchmark data."""
	    from picarones.report.generator import ReportGenerator
	    from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
	    from picarones.core.metrics import MetricsResult

	    def _uniform_metrics(rate=0.1):
	        # Every error-rate field gets the same value; lengths are fixed at 100.
	        return MetricsResult(
	            cer=rate,
	            cer_nfc=rate,
	            cer_caseless=rate,
	            wer=rate,
	            wer_normalized=rate,
	            mer=rate,
	            wil=rate,
	            reference_length=100,
	            hypothesis_length=100,
	        )

	    # Three documents whose error rate grows by 0.01 per document.
	    documents = []
	    for index in range(3):
	        documents.append(
	            DocumentResult(
	                doc_id=f"doc_{index}",
	                image_path="",
	                ground_truth="GT text",
	                hypothesis="Hyp text",
	                metrics=_uniform_metrics(0.1 + index * 0.01),
	                duration_seconds=0.1,
	            )
	        )

	    engine_report = EngineReport(
	        engine_name="tesseract",
	        engine_version="5.0",
	        engine_config={},
	        document_results=documents,
	    )
	    benchmark = BenchmarkResult(
	        corpus_name="TestCorpus",
	        corpus_source=None,
	        document_count=len(documents),
	        engine_reports=[engine_report],
	    )
	    return ReportGenerator(benchmark)


	# ---------------------------------------------------------------------------
	# 4. Métriques robustes — présence dans le rapport HTML
	# ---------------------------------------------------------------------------

	class TestRobustMetrics:
	    """Tests for the robust-metrics section of the generated HTML report."""

	    def test_robust_metrics_card_present(self, sample_generator, tmp_path):
	        """The robust-metrics card must be present in the report."""
	        out = tmp_path / "rapport.html"
	        sample_generator.generate(out)
	        html = out.read_text(encoding="utf-8")

	        assert "robust-metrics-card" in html
	        assert "robust-anchor" in html
	        assert "robust-ratio" in html
	        assert "renderRobustMetrics" in html

	    def test_robust_metrics_js_syntax_valid(self, sample_generator, tmp_path):
	        """The renderRobustMetrics function must not introduce a JS SyntaxError.

	        The last inline ``<script>`` block of the report is written to a file
	        and checked with ``node --check``. The test is skipped when no ``node``
	        executable is found on PATH.
	        """
	        import re
	        import shutil
	        import subprocess

	        # Without Node.js the check cannot run; skip instead of erroring out
	        # with FileNotFoundError from subprocess.
	        node = shutil.which("node")
	        if node is None:
	            pytest.skip("node executable not found on PATH; cannot syntax-check the report JS")

	        out = tmp_path / "rapport.html"
	        sample_generator.generate(out)
	        html = out.read_text(encoding="utf-8")

	        # Only attribute-less <script> tags are matched by this pattern.
	        scripts = re.findall(r"<script>(.*?)</script>", html, re.DOTALL)
	        # Fail with a clear message rather than an IndexError on scripts[-1].
	        assert scripts, "no inline <script> block found in the generated report"

	        # The application block is the last script.
	        app_js = tmp_path / "app.js"
	        app_js.write_text(scripts[-1], encoding="utf-8")

	        result = subprocess.run(
	            [node, "--check", str(app_js)],
	            capture_output=True, text=True,
	        )
	        assert result.returncode == 0, f"Erreur JS : {result.stderr}"