Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 11,462 Bytes

"""Tests E2E API REST pour les champs B3-final de ``BenchmarkRunRequest``.

Phase D3 of the B3-final audit (May 2026): the uncompromising audit found
that the REST API had no test coverage for the new fields added during
Phase B3-final corr-A/B/C:

- ``views``, ``profile``, ``partial_dir``, ``entity_extractor``,
  ``output_json`` (BenchmarkRunRequest)
- ``expose_alto`` (PipelineConfig)

The tests in this module instantiate the Pydantic request models directly
rather than going through HTTP, and they validate:
1. **Positive Pydantic validation**: valid payloads are accepted (the
   case the API is meant to answer with 200).
2. **Negative Pydantic validation**: malformed payloads raise
   ``ValidationError`` (the case the API is meant to answer with 422).
3. **Path-traversal safety**: values such as ``../../etc`` are rejected
   the same way (422 at the API level).
"""

from __future__ import annotations

import pytest
from fastapi.testclient import TestClient


@pytest.fixture
def client():
    """Provide a ``TestClient`` wrapping the Picarones FastAPI application.

    The application object is imported inside the fixture body, so the
    import only happens when a test actually requests this fixture.
    """
    from picarones.interfaces.web.app import app as fastapi_app

    test_client = TestClient(fastapi_app)
    return test_client


def _valid_corpus_payload(tmp_path):
    """Fill ``tmp_path`` with a one-document corpus and return its path.

    Two files are written into the directory:

    - ``doc01.png``: a 50x50 all-white RGB image;
    - ``doc01.gt.txt``: the UTF-8 text ``hello`` (presumably the
      ground-truth transcription paired with the image — confirm against
      the corpus loader).

    No archive is created; the return value is the directory path as ``str``.
    """
    from PIL import Image

    image_file = tmp_path / "doc01.png"
    transcript_file = tmp_path / "doc01.gt.txt"

    Image.new("RGB", (50, 50), color=(255, 255, 255)).save(image_file)
    transcript_file.write_text("hello", encoding="utf-8")
    return str(tmp_path)


# ──────────────────────────────────────────────────────────────────────
# 1. Validation positive — payloads B3-final acceptés
# ──────────────────────────────────────────────────────────────────────


class TestB3FinalFieldsAccepted:
    """Check that the request models accept the new B3-final fields.

    Covers the fields added in Phase corr-A/B/C to ``BenchmarkRunRequest``
    and ``PipelineConfig``.  Every test builds the model directly; none of
    them issues an HTTP request.
    """

    def test_request_accepts_views_field(self) -> None:
        """``views`` accepts a list of canonical view names."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        # Isolated Pydantic validation (no HTTP, hence faster).  The
        # ``client`` fixture is deliberately not requested: it would build
        # the web application for a test that never sends a request.
        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            views=["text_final", "alto_documentary", "searchability"],
        )
        assert list(req.views) == [
            "text_final", "alto_documentary", "searchability",
        ]

    def test_request_accepts_profile_field(self) -> None:
        """``profile`` keeps the value it was given (``"diagnostics"``)."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            profile="diagnostics",
        )
        assert req.profile == "diagnostics"

    def test_request_accepts_partial_dir_field(self) -> None:
        """``partial_dir`` accepts a relative directory path."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            partial_dir="partial/checkpoints",
        )
        assert req.partial_dir == "partial/checkpoints"

    def test_request_accepts_entity_extractor_field(self) -> None:
        """``entity_extractor`` accepts a ``module.path:ClassName`` string."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            entity_extractor="picarones.adapters.ner:SpacyExtractor",
        )
        assert req.entity_extractor == "picarones.adapters.ner:SpacyExtractor"

    def test_request_accepts_output_json_field(self) -> None:
        """``output_json`` accepts a bare relative file name."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            output_json="bench_legacy.json",
        )
        assert req.output_json == "bench_legacy.json"

    def test_pipeline_config_accepts_expose_alto(self) -> None:
        """``PipelineConfig`` stores ``expose_alto=True`` as given."""
        from picarones.interfaces.web.models import PipelineConfig

        pc = PipelineConfig(
            engine_name="tesseract", expose_alto=True,
        )
        assert pc.expose_alto is True

    def test_pipeline_config_default_no_expose_alto(self) -> None:
        """``expose_alto`` is ``False`` when it is not supplied."""
        from picarones.interfaces.web.models import PipelineConfig

        pc = PipelineConfig(engine_name="tesseract")
        assert pc.expose_alto is False

    def test_expose_alto_with_non_tesseract_engine_warns(
        self, caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A warning is logged when ``expose_alto`` targets a non-Tesseract engine.

        Phase D4 of the B3-final audit: the UI sends ``expose_alto=true``
        while the target engine is not Tesseract.  The flag is ignored, but
        an explicit warning is logged so the user understands why their
        ``alto_documentary`` view yields no metrics.
        """
        import logging
        from picarones.interfaces.web.benchmark_utils import (
            _engine_from_competitor,
        )
        from picarones.interfaces.web.models import PipelineConfig

        with caplog.at_level(logging.WARNING):
            try:
                _engine_from_competitor(PipelineConfig(
                    engine_name="precomputed_text", expose_alto=True,
                ))
            except Exception:
                # Deliberate best-effort: the factory may fail because
                # ``precomputed_text`` requires extra kwargs.  The failure
                # is ignored here; the warning is expected to have been
                # emitted BEFORE that error, and is asserted below.
                pass

        warnings_text = "\n".join(
            r.getMessage() for r in caplog.records
            if r.levelno >= logging.WARNING
        )
        # A case-insensitive search for "alto" also matches "expose_alto",
        # so a separate check for the longer token would be redundant.
        assert "alto" in warnings_text.lower()
        assert "precomputed_text" in warnings_text


# ──────────────────────────────────────────────────────────────────────
# 2. Validation négative — payloads malformés rejetés
# ──────────────────────────────────────────────────────────────────────


class TestB3FinalFieldsValidation:
    """Negative cases: unknown ``views`` / ``profile`` values are refused."""

    def test_invalid_view_name_rejected(self) -> None:
        """A non-canonical ``views`` entry raises ``ValidationError``.

        Presumably enforced by a ``Literal`` annotation on the model; the
        model definition is not visible from this module.
        """
        from pydantic import ValidationError
        from picarones.interfaces.web.models import BenchmarkRunRequest

        bad_payload = {
            "corpus_path": "./corpus",
            "competitors": [{"engine_name": "tesseract"}],
            "views": ["not_a_canonical_view"],
        }
        with pytest.raises(ValidationError):
            BenchmarkRunRequest(**bad_payload)

    def test_invalid_profile_rejected(self) -> None:
        """A non-canonical ``profile`` value raises ``ValidationError``.

        Presumably enforced by a ``Literal`` annotation on the model; the
        model definition is not visible from this module.
        """
        from pydantic import ValidationError
        from picarones.interfaces.web.models import BenchmarkRunRequest

        bad_payload = {
            "corpus_path": "./corpus",
            "competitors": [{"engine_name": "tesseract"}],
            "profile": "not_a_real_profile",
        }
        with pytest.raises(ValidationError):
            BenchmarkRunRequest(**bad_payload)


# ──────────────────────────────────────────────────────────────────────
# 3. Sécurité — path traversal refusé (Phase D2 audit)
# ──────────────────────────────────────────────────────────────────────


class TestPathTraversalSecurity:
    """Unsafe values for the path-like request fields must be refused.

    Each rejection test builds a ``BenchmarkRunRequest`` with one offending
    field and expects a ``ValidationError`` whose message matches a given
    pattern.  The final test checks that empty strings remain accepted.
    """

    @staticmethod
    def _assert_rejected(expected_message: str, **offending_field) -> None:
        """Expect ``ValidationError`` matching ``expected_message``.

        ``offending_field`` holds the single keyword argument under test; it
        is added to an otherwise fixed request (corpus path + one
        ``tesseract`` competitor).
        """
        from pydantic import ValidationError
        from picarones.interfaces.web.models import BenchmarkRunRequest

        with pytest.raises(ValidationError, match=expected_message):
            BenchmarkRunRequest(
                corpus_path="./corpus",
                competitors=[{"engine_name": "tesseract"}],
                **offending_field,
            )

    def test_partial_dir_traversal_rejected(self) -> None:
        """``partial_dir`` containing ``..`` segments is refused."""
        self._assert_rejected("path traversal", partial_dir="../../etc/passwd")

    def test_partial_dir_absolute_rejected(self) -> None:
        """An absolute ``partial_dir`` is refused."""
        self._assert_rejected("chemin absolu", partial_dir="/etc/passwd")

    def test_output_json_traversal_rejected(self) -> None:
        """``output_json`` containing ``..`` segments is refused."""
        self._assert_rejected(
            "path traversal", output_json="../../home/user/private.json",
        )

    def test_entity_extractor_traversal_rejected(self) -> None:
        """A traversal-looking ``entity_extractor`` is refused."""
        self._assert_rejected(
            "interdits", entity_extractor="../../etc/passwd:Bad",
        )

    def test_entity_extractor_with_slash_rejected(self) -> None:
        """An ``entity_extractor`` containing a slash is refused."""
        self._assert_rejected("interdits", entity_extractor="some/path:Class")

    def test_entity_extractor_with_space_rejected(self) -> None:
        """An ``entity_extractor`` containing a space is refused."""
        self._assert_rejected("interdits", entity_extractor="my package:Class")

    def test_entity_extractor_malformed_rejected(self) -> None:
        """An ``entity_extractor`` without a valid spec format is refused."""
        self._assert_rejected(
            "format invalide", entity_extractor="123invalid_start_with_digit",
        )

    def test_empty_string_path_fields_accepted(self) -> None:
        """``""`` is explicitly allowed (meaning: feature disabled)."""
        from picarones.interfaces.web.models import BenchmarkRunRequest

        disabled_fields = {
            "partial_dir": "",
            "output_json": "",
            "entity_extractor": "",
        }
        req = BenchmarkRunRequest(
            corpus_path="./corpus",
            competitors=[{"engine_name": "tesseract"}],
            **disabled_fields,
        )
        for field_name in disabled_fields:
            assert getattr(req, field_name) == ""