Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /security /test_s1_xss_in_reports.py

Claude

fix(sprint-S1.1)!: corriger XSS critique via Jinja2 autoescape=False (Bandit B701, CWE-94)

bad7a01 unverified about 2 months ago

7.55 kB

	"""Sprint S1.3 — XSS dans le rapport HTML généré.

	Vérifie que tout contenu utilisateur (corpus_name, sentences narratives,
	nom de moteur) est échappé HTML dans le rapport produit par
	``picarones.reports.html.ReportGenerator``.

	Bandit B701 (CWE-94) avait flaggé ``Environment(autoescape=False)`` —
	ce test concrétise l'attaque que l'audit décrivait : un corpus nommé
	``</title><script>alert(1)</script>`` doit être inerte dans le HTML.
	"""

	from __future__ import annotations

	from pathlib import Path

	from picarones.evaluation.benchmark_result import (
	BenchmarkResult,
	DocumentResult,
	EngineReport,
	)


	def _make_minimal_benchmark(
	    corpus_name: str,
	    engine_name: str = "tesseract",
	) -> BenchmarkResult:
	    """Build a minimal valid BenchmarkResult carrying the given ``corpus_name``.

	    The result wraps one engine report (named ``engine_name``) that holds a
	    single document whose hypothesis equals its ground truth, with every
	    error metric set to zero.
	    """
	    from picarones.evaluation.metric_result import MetricsResult

	    # One perfectly-recognised document is enough to render a report.
	    single_document = DocumentResult(
	        doc_id="doc01",
	        image_path="doc01.png",
	        ground_truth="Bonjour",
	        hypothesis="Bonjour",
	        metrics=MetricsResult(cer=0.0, wer=0.0, mer=0.0, wil=0.0),
	        duration_seconds=0.1,
	    )
	    engine_report = EngineReport(
	        engine_name=engine_name,
	        engine_version="5.3.0",
	        engine_config={},
	        document_results=[single_document],
	    )
	    return BenchmarkResult(
	        corpus_name=corpus_name,
	        corpus_source=None,
	        document_count=1,
	        engine_reports=[engine_report],
	    )


	# ──────────────────────────────────────────────────────────────────────
	# 1. corpus_name avec script tag → doit être échappé dans <title>
	# ──────────────────────────────────────────────────────────────────────


	class TestCorpusNameXSS:
	    """Checks that ``corpus_name`` is HTML-escaped in the generated report.

	    ``corpus_name`` is injected into the ``<title>`` of ``base.html.j2``.
	    Without escaping, a malicious name compromises every shared report.
	    """

	    @staticmethod
	    def _render_report(corpus_name: str, tmp_path: Path) -> str:
	        """Generate a report for ``corpus_name`` and return its HTML text.

	        The report is written to ``tmp_path / "report.html"`` and read back.
	        """
	        from picarones.reports.html import ReportGenerator

	        bench = _make_minimal_benchmark(corpus_name=corpus_name)
	        out = tmp_path / "report.html"
	        ReportGenerator(bench).generate(out)
	        return out.read_text()

	    def test_script_tag_in_corpus_name_is_escaped(self, tmp_path: Path) -> None:
	        """A ``<script>`` payload in the corpus name must be rendered inert."""
	        html = self._render_report(
	            "</title><script>alert('xss')</script>", tmp_path,
	        )

	        # The raw payload must never appear verbatim: that would close the
	        # legitimate <title> early and attach an executable <script>.
	        assert "<script>alert('xss')</script>" not in html, (
	            "XSS confirmé : le script malicieux est exécutable dans le rapport.\n"
	            "Causes possibles : autoescape=False dans Jinja2 + corpus_name "
	            "non filtré."
	        )

	        # Positive check: the payload must be present in its *escaped* form.
	        # Looking for a literal ``<script>`` here would be satisfied by any
	        # legitimate script tag of the report and would prove nothing.
	        assert "&lt;script&gt;" in html.lower(), (
	            "Le ``<`` dans corpus_name doit être échappé en ``&lt;``."
	        )

	    def test_html_attribute_injection_in_corpus_name(self, tmp_path: Path) -> None:
	        """Attribute-based attack: ``" onerror=alert(1)`` can break out of a
	        tag if ``corpus_name`` is rendered inside an attribute value."""
	        html = self._render_report('" onerror="alert(1)" foo="', tmp_path)

	        # The ``"`` character must be escaped as ``&quot;`` or ``&#34;``
	        # wherever corpus_name is rendered, so that no attribute injection
	        # survives in the output.
	        assert ' onerror="alert(' not in html, (
	            "Attribute injection : un guillemet non échappé permet "
	            "d'injecter onerror=."
	        )

	    def test_corpus_name_with_unicode_renders_correctly(
	        self, tmp_path: Path,
	    ) -> None:
	        """Corollary: a legitimate Unicode name
	        (``Manuscrit médiéval — chartes XIII°``) stays readable after
	        escaping."""
	        html = self._render_report(
	            "Manuscrit médiéval — chartes XIII°", tmp_path,
	        )

	        # Safe Unicode characters must come through unchanged.
	        assert "Manuscrit médiéval" in html
	        assert "XIII°" in html


	# ──────────────────────────────────────────────────────────────────────
	# 2. Engine name (moteur OCR) avec injection
	# ──────────────────────────────────────────────────────────────────────


	class TestEngineNameXSS:
	    """The engine name may come from a HuggingFace import or from user
	    configuration, so every renderer that displays it must escape it."""

	    def test_engine_name_with_script_is_escaped(self, tmp_path: Path) -> None:
	        """A ``<script>`` payload used as engine name must not appear verbatim."""
	        from picarones.reports.html import ReportGenerator

	        payload = "<script>alert('engine')</script>"
	        report_path = tmp_path / "report.html"
	        benchmark = _make_minimal_benchmark(
	            corpus_name="test",
	            engine_name=payload,
	        )
	        ReportGenerator(benchmark).generate(report_path)

	        rendered = report_path.read_text()
	        assert payload not in rendered, (
	            "Engine name XSS : un nom de moteur malicieux est exécuté."
	        )


	# ──────────────────────────────────────────────────────────────────────
	# 3. Bandit B701 ne doit plus signaler autoescape=False
	# ──────────────────────────────────────────────────────────────────────


	class TestJinja2EnvIsAutoescaped:
	    """Regression guard: ``_build_jinja_env`` must return an Environment
	    whose autoescaping is enabled for HTML/J2 template names."""

	    def test_env_has_autoescape_enabled_for_html(self) -> None:
	        """``autoescape`` must be truthy and, if callable, accept HTML/J2 names."""
	        from picarones.reports.html.generator import _build_jinja_env

	        autoescape = _build_jinja_env().autoescape

	        # Either ``True`` or a callable (e.g. select_autoescape) is
	        # acceptable; ``False`` and ``None`` are not.
	        assert autoescape, (
	            f"Jinja2 Environment.autoescape={autoescape!r} — XSS exposé."
	        )

	        # A plain truthy flag needs no per-template verification.
	        if not callable(autoescape):
	            return

	        # A callable policy must answer True for both template name shapes.
	        expectations = (
	            (
	                "base.html.j2",
	                "select_autoescape doit activer l'échappement pour .j2",
	            ),
	            (
	                "any.html",
	                "select_autoescape doit activer l'échappement pour .html",
	            ),
	        )
	        for template_name, failure_message in expectations:
	            assert autoescape(template_name), failure_message