Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /web /test_benchmark_run_b3_final_fields.py

Claude

test+feat: D4 audit B3-final — assertions strictes + warning expose_alto cross-engine

b420e00 unverified about 2 months ago

11.5 kB

	"""Tests E2E API REST pour les champs B3-final de ``BenchmarkRunRequest``.

	Phase D3 of the B3-final audit (May 2026): the audit identified a lack
	of REST API coverage for the new fields added in Phase B3-final
	corr-A/B/C:

	- ``views``, ``profile``, ``partial_dir``, ``entity_extractor``,
	  ``output_json`` (BenchmarkRunRequest)
	- ``expose_alto`` (PipelineConfig)

	These tests exercise the Pydantic request models directly and check:
	1. Positive validation: valid payloads are accepted (the case the
	   REST API answers with 200).
	2. Negative validation: malformed payloads raise ``ValidationError``
	   (the case the REST API answers with 422).
	3. Path-traversal safety: values such as ``../../etc`` are rejected
	   (422 at the API level).
	"""

	from __future__ import annotations

	import pytest
	from fastapi.testclient import TestClient


	@pytest.fixture
	def client():
	    """Return a ``TestClient`` wrapping the Picarones web application.

	    The application object is imported when the fixture runs rather than
	    when this module is imported.
	    """
	    from picarones.interfaces.web.app import app as web_app

	    test_client = TestClient(web_app)
	    return test_client


	def _valid_corpus_payload(tmp_path):
	    """Fill ``tmp_path`` with a minimal corpus and return its path as a string.

	    Two files are written into the directory: a 50x50 white RGB image
	    ``doc01.png`` and a UTF-8 text file ``doc01.gt.txt`` containing
	    ``hello`` (presumably the ground-truth transcription for the image).
	    """
	    from PIL import Image

	    image_file = tmp_path / "doc01.png"
	    text_file = tmp_path / "doc01.gt.txt"

	    blank_page = Image.new("RGB", (50, 50), color=(255, 255, 255))
	    blank_page.save(image_file)
	    text_file.write_text("hello", encoding="utf-8")

	    return str(tmp_path)


	# ──────────────────────────────────────────────────────────────────────
	# 1. Validation positive — payloads B3-final acceptés
	# ──────────────────────────────────────────────────────────────────────


	class TestB3FinalFieldsAccepted:
	    """Check that the request models accept every new B3-final field.

	    Covers the fields added to ``BenchmarkRunRequest`` and
	    ``PipelineConfig`` in Phase corr-A/B/C. All tests instantiate the
	    Pydantic models directly; none of them goes through HTTP.
	    """

	    def test_request_accepts_views_field(self) -> None:
	        """``views`` accepts a list of canonical view names."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        # Isolated Pydantic validation (no HTTP, faster): the ``client``
	        # fixture is deliberately not requested, so the web app is not
	        # imported for this test.
	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            views=["text_final", "alto_documentary", "searchability"],
	        )
	        assert list(req.views) == [
	            "text_final", "alto_documentary", "searchability",
	        ]

	    def test_request_accepts_profile_field(self) -> None:
	        """``profile`` is stored as given when it is a valid profile name."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            profile="diagnostics",
	        )
	        assert req.profile == "diagnostics"

	    def test_request_accepts_partial_dir_field(self) -> None:
	        """A relative ``partial_dir`` is accepted and kept unchanged."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            partial_dir="partial/checkpoints",
	        )
	        assert req.partial_dir == "partial/checkpoints"

	    def test_request_accepts_entity_extractor_field(self) -> None:
	        """A ``module:Class`` style ``entity_extractor`` is accepted."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            entity_extractor="picarones.adapters.ner:SpacyExtractor",
	        )
	        assert req.entity_extractor == "picarones.adapters.ner:SpacyExtractor"

	    def test_request_accepts_output_json_field(self) -> None:
	        """A plain file name is accepted for ``output_json``."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            output_json="bench_legacy.json",
	        )
	        assert req.output_json == "bench_legacy.json"

	    def test_pipeline_config_accepts_expose_alto(self) -> None:
	        """``expose_alto=True`` is accepted and preserved."""
	        from picarones.interfaces.web.models import PipelineConfig

	        pc = PipelineConfig(
	            engine_name="tesseract", expose_alto=True,
	        )
	        assert pc.expose_alto is True

	    def test_pipeline_config_default_no_expose_alto(self) -> None:
	        """``expose_alto`` defaults to ``False`` when omitted."""
	        from picarones.interfaces.web.models import PipelineConfig

	        pc = PipelineConfig(engine_name="tesseract")
	        assert pc.expose_alto is False

	    def test_expose_alto_with_non_tesseract_engine_warns(
	        self, caplog: pytest.LogCaptureFixture,
	    ) -> None:
	        """A warning is logged when ``expose_alto`` targets a non-Tesseract engine.

	        Phase D4 of the B3-final audit: the UI sends ``expose_alto=true``
	        but the target engine is not Tesseract. The flag is ignored, and
	        an explicit warning is expected so that the user understands why
	        the ``alto_documentary`` view yields no metric.
	        """
	        import logging
	        from picarones.interfaces.web.benchmark_utils import (
	            _engine_from_competitor,
	        )
	        from picarones.interfaces.web.models import PipelineConfig

	        # Built outside the ``try`` so that a model validation failure is
	        # reported as such instead of being swallowed below.
	        competitor = PipelineConfig(
	            engine_name="precomputed_text", expose_alto=True,
	        )

	        with caplog.at_level(logging.WARNING):
	            try:
	                _engine_from_competitor(competitor)
	            except Exception:
	                # The factory may fail because ``precomputed_text``
	                # requires extra kwargs; the failure is tolerated, but
	                # the warning must have been emitted BEFORE that error.
	                pass

	        warnings_text = "\n".join(
	            r.getMessage() for r in caplog.records
	            if r.levelno >= logging.WARNING
	        )
	        # Case-insensitive check; it also matches a literal ``expose_alto``.
	        assert "alto" in warnings_text.lower()
	        assert "precomputed_text" in warnings_text


	# ──────────────────────────────────────────────────────────────────────
	# 2. Validation négative — payloads malformés rejetés
	# ──────────────────────────────────────────────────────────────────────


	class TestB3FinalFieldsValidation:
	    """Check that malformed values for B3-final fields are rejected."""

	    def test_invalid_view_name_rejected(self) -> None:
	        """``views`` only accepts canonical view names (Literal)."""
	        from pydantic import ValidationError
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        payload = {
	            "corpus_path": "./corpus",
	            "competitors": [{"engine_name": "tesseract"}],
	            "views": ["not_a_canonical_view"],
	        }
	        with pytest.raises(ValidationError):
	            BenchmarkRunRequest(**payload)

	    def test_invalid_profile_rejected(self) -> None:
	        """``profile`` only accepts canonical profile names (Literal)."""
	        from pydantic import ValidationError
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        payload = {
	            "corpus_path": "./corpus",
	            "competitors": [{"engine_name": "tesseract"}],
	            "profile": "not_a_real_profile",
	        }
	        with pytest.raises(ValidationError):
	            BenchmarkRunRequest(**payload)


	# ──────────────────────────────────────────────────────────────────────
	# 3. Sécurité — path traversal refusé (Phase D2 audit)
	# ──────────────────────────────────────────────────────────────────────


	class TestPathTraversalSecurity:
	    """Check that path-like request fields reject unsafe values.

	    Each rejection test builds a ``BenchmarkRunRequest`` with one unsafe
	    field and expects a ``ValidationError`` whose message matches a given
	    pattern.
	    """

	    @staticmethod
	    def _assert_rejected(message_pattern: str, **unsafe_fields) -> None:
	        """Expect ``BenchmarkRunRequest`` to reject ``unsafe_fields``.

	        ``message_pattern`` is passed to ``pytest.raises(match=...)``.
	        """
	        from pydantic import ValidationError
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        with pytest.raises(ValidationError, match=message_pattern):
	            BenchmarkRunRequest(
	                corpus_path="./corpus",
	                competitors=[{"engine_name": "tesseract"}],
	                **unsafe_fields,
	            )

	    def test_partial_dir_traversal_rejected(self) -> None:
	        """``partial_dir`` containing ``..`` segments is rejected."""
	        self._assert_rejected(
	            "path traversal", partial_dir="../../etc/passwd",
	        )

	    def test_partial_dir_absolute_rejected(self) -> None:
	        """An absolute ``partial_dir`` is rejected."""
	        self._assert_rejected("chemin absolu", partial_dir="/etc/passwd")

	    def test_output_json_traversal_rejected(self) -> None:
	        """``output_json`` containing ``..`` segments is rejected."""
	        self._assert_rejected(
	            "path traversal", output_json="../../home/user/private.json",
	        )

	    def test_entity_extractor_traversal_rejected(self) -> None:
	        """A traversal-style ``entity_extractor`` is rejected."""
	        self._assert_rejected(
	            "interdits", entity_extractor="../../etc/passwd:Bad",
	        )

	    def test_entity_extractor_with_slash_rejected(self) -> None:
	        """An ``entity_extractor`` containing a slash is rejected."""
	        self._assert_rejected(
	            "interdits", entity_extractor="some/path:Class",
	        )

	    def test_entity_extractor_with_space_rejected(self) -> None:
	        """An ``entity_extractor`` containing a space is rejected."""
	        self._assert_rejected(
	            "interdits", entity_extractor="my package:Class",
	        )

	    def test_entity_extractor_malformed_rejected(self) -> None:
	        """An ``entity_extractor`` with an invalid format is rejected."""
	        self._assert_rejected(
	            "format invalide",
	            entity_extractor="123invalid_start_with_digit",
	        )

	    def test_empty_string_path_fields_accepted(self) -> None:
	        """``""`` is explicitly allowed (meaning the feature is disabled)."""
	        from picarones.interfaces.web.models import BenchmarkRunRequest

	        disabled = {
	            "partial_dir": "",
	            "output_json": "",
	            "entity_extractor": "",
	        }
	        req = BenchmarkRunRequest(
	            corpus_path="./corpus",
	            competitors=[{"engine_name": "tesseract"}],
	            **disabled,
	        )
	        assert req.partial_dir == ""
	        assert req.output_json == ""
	        assert req.entity_extractor == ""