"""Tests Sprint 8 — Intégration eScriptorium et import Gallica. Classes de tests ---------------- TestEScriptoriumClient (12 tests) — client API eScriptorium (mocks HTTP) TestEScriptoriumConnect (4 tests) — fonction connect_escriptorium TestEScriptoriumExport (8 tests) — export benchmark → couche OCR eScriptorium TestGallicaRecord (6 tests) — structure GallicaRecord TestGallicaClient (12 tests) — client Gallica (mocks HTTP) TestGallicaSearchQuery (8 tests) — construction de requêtes SRU TestGallicaOCR (6 tests) — récupération OCR Gallica TestImportersInit (4 tests) — __init__.py importers TestCLIHistory (6 tests) — commande picarones history TestCLIRobustness (6 tests) — commande picarones robustness """ from __future__ import annotations import json from typing import TYPE_CHECKING from unittest.mock import patch import pytest if TYPE_CHECKING: from picarones.core.results import BenchmarkResult # =========================================================================== # TestEScriptoriumClient # =========================================================================== class TestEScriptoriumClient: def test_import_module(self): from picarones.importers.escriptorium import EScriptoriumClient assert EScriptoriumClient is not None def test_init_attributes(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://escriptorium.example.org", token="tok123", timeout=60) assert client.base_url == "https://escriptorium.example.org" assert client.token == "tok123" assert client.timeout == 60 def test_base_url_trailing_slash_stripped(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://escriptorium.example.org/", token="tok") assert not client.base_url.endswith("/") def test_headers_contain_token(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="mytoken") headers = client._headers() assert "Token mytoken" in headers.get("Authorization", "") def test_headers_contain_accept_json(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") headers = client._headers() assert "application/json" in headers.get("Accept", "") def test_test_connection_success(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") with patch.object(client, "_get", return_value={"results": [], "count": 0}): assert client.test_connection() is True def test_test_connection_failure(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="bad") with patch.object(client, "_get", side_effect=RuntimeError("403")): assert client.test_connection() is False def test_list_projects_empty(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") with patch.object(client, "_paginate", return_value=[]): projects = client.list_projects() assert projects == [] def test_list_projects_parses_items(self): from picarones.importers.escriptorium import EScriptoriumClient, EScriptoriumProject client = EScriptoriumClient("https://example.org", token="tok") mock_data = [ {"pk": 1, "name": "Projet Test", "slug": "projet-test", "owner": {"username": "user1"}, "documents_count": 5}, ] with patch.object(client, "_paginate", return_value=mock_data): projects = client.list_projects() assert len(projects) == 1 assert isinstance(projects[0], EScriptoriumProject) assert projects[0].pk == 1 assert projects[0].name == "Projet Test" assert projects[0].document_count == 5 def test_list_documents_with_project_filter(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") with patch.object(client, "_paginate", return_value=[]) as mock_pag: client.list_documents(project_pk=42) call_kwargs = mock_pag.call_args assert call_kwargs[0][1]["project"] == 42 def test_list_parts_returns_list(self): from picarones.importers.escriptorium import EScriptoriumClient, EScriptoriumPart client = EScriptoriumClient("https://example.org", token="tok") mock_data = [ {"pk": 10, "title": "f. 1r", "image": "https://example.org/img/1.jpg", "order": 0}, {"pk": 11, "title": "f. 1v", "image": "https://example.org/img/2.jpg", "order": 1}, ] with patch.object(client, "_paginate", return_value=mock_data): parts = client.list_parts(doc_pk=5) assert len(parts) == 2 assert isinstance(parts[0], EScriptoriumPart) assert parts[0].pk == 10 def test_escriptorium_project_as_dict(self): from picarones.importers.escriptorium import EScriptoriumProject p = EScriptoriumProject(pk=1, name="Test", slug="test", owner="user", document_count=3) d = p.as_dict() assert d["pk"] == 1 assert d["name"] == "Test" assert d["document_count"] == 3 # =========================================================================== # TestEScriptoriumConnect # =========================================================================== class TestEScriptoriumConnect: def test_connect_success(self): from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient with patch.object(EScriptoriumClient, "test_connection", return_value=True): client = connect_escriptorium("https://example.org", token="tok") assert isinstance(client, EScriptoriumClient) def test_connect_failure_raises(self): from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient with patch.object(EScriptoriumClient, "test_connection", return_value=False): with pytest.raises(RuntimeError, match="Impossible de se connecter"): connect_escriptorium("https://example.org", token="bad") def test_connect_returns_client_with_correct_url(self): from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient with patch.object(EScriptoriumClient, "test_connection", return_value=True): client = connect_escriptorium("https://myinstance.org", token="tok") assert "myinstance.org" in client.base_url def test_connect_timeout_passed(self): from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient with patch.object(EScriptoriumClient, "test_connection", return_value=True): client = connect_escriptorium("https://example.org", token="tok", timeout=120) assert client.timeout == 120 # =========================================================================== # TestEScriptoriumExport # =========================================================================== class TestEScriptoriumExport: def _make_benchmark(self, engine_name: str = "tesseract") -> "BenchmarkResult": from picarones.core.results import BenchmarkResult, EngineReport, DocumentResult from picarones.core.metrics import MetricsResult metrics = MetricsResult(cer=0.05, wer=0.10, cer_nfc=0.05, cer_caseless=0.04, cer_diplomatic=0.04, wer_normalized=0.09, mer=0.09, wil=0.05, reference_length=100, hypothesis_length=100) doc = DocumentResult( doc_id="part_00001", image_path="/img/1.jpg", ground_truth="texte gt", hypothesis="texte ocr", metrics=metrics, duration_seconds=1.0, ) report = EngineReport( engine_name=engine_name, engine_version="5.3", engine_config={}, document_results=[doc], ) return BenchmarkResult( corpus_name="Test", corpus_source="/test/", document_count=1, engine_reports=[report], ) def test_export_unknown_engine_raises(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") with pytest.raises(ValueError, match="unknown_engine"): client.export_benchmark_as_layer(bm, doc_pk=1, engine_name="unknown_engine") def test_export_returns_count(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") with patch.object(client, "_post", return_value={}): count = client.export_benchmark_as_layer( bm, doc_pk=1, engine_name="tesseract" ) assert count == 1 def test_export_layer_name_default(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") calls = [] with patch.object(client, "_post", side_effect=lambda path, payload: calls.append(payload) or {}): client.export_benchmark_as_layer(bm, doc_pk=1, engine_name="tesseract") assert calls[0]["name"] == "picarones_tesseract" def test_export_custom_layer_name(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") calls = [] with patch.object(client, "_post", side_effect=lambda path, payload: calls.append(payload) or {}): client.export_benchmark_as_layer( bm, doc_pk=1, engine_name="tesseract", layer_name="my_layer" ) assert calls[0]["name"] == "my_layer" def test_export_skips_error_docs(self): from picarones.importers.escriptorium import EScriptoriumClient from picarones.core.results import BenchmarkResult, EngineReport, DocumentResult from picarones.core.metrics import MetricsResult metrics = MetricsResult(cer=0.1, wer=0.2, cer_nfc=0.1, cer_caseless=0.1, cer_diplomatic=0.1, wer_normalized=0.2, mer=0.2, wil=0.1, reference_length=50, hypothesis_length=50) docs = [ DocumentResult("part_00001", "/img/1.jpg", "gt", "hyp", metrics, 1.0), DocumentResult("part_00002", "/img/2.jpg", "gt", "", metrics, 0.5, engine_error="timeout"), ] report = EngineReport("tesseract", "5.3", {}, docs) bm = BenchmarkResult("C", "/", 2, [report]) client = EScriptoriumClient("https://example.org", token="tok") with patch.object(client, "_post", return_value={}): count = client.export_benchmark_as_layer(bm, doc_pk=1, engine_name="tesseract") assert count == 1 # seul le doc sans erreur est exporté def test_export_with_part_mapping(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") calls = [] with patch.object(client, "_post", side_effect=lambda path, payload: calls.append(path) or {}): client.export_benchmark_as_layer( bm, doc_pk=1, engine_name="tesseract", part_mapping={"part_00001": 999}, ) assert "999" in calls[0] def test_export_post_error_is_logged_not_raised(self): from picarones.importers.escriptorium import EScriptoriumClient client = EScriptoriumClient("https://example.org", token="tok") bm = self._make_benchmark("tesseract") with patch.object(client, "_post", side_effect=RuntimeError("500")): count = client.export_benchmark_as_layer(bm, doc_pk=1, engine_name="tesseract") assert count == 0 def test_document_result_as_dict_used(self): from picarones.importers.escriptorium import EScriptoriumDocument d = EScriptoriumDocument(pk=42, name="Doc", project="1", part_count=10, transcription_layers=["manual", "auto"]) d_dict = d.as_dict() assert d_dict["pk"] == 42 assert "manual" in d_dict["transcription_layers"] # =========================================================================== # TestGallicaRecord # =========================================================================== class TestGallicaRecord: def test_import_module(self): from picarones.importers.gallica import GallicaRecord assert GallicaRecord is not None def test_ark_property(self): from picarones.importers.gallica import GallicaRecord r = GallicaRecord(ark="12148/btv1b8453561w", title="Test") assert "12148/btv1b8453561w" in r.url def test_manifest_url(self): from picarones.importers.gallica import GallicaRecord r = GallicaRecord(ark="12148/btv1b8453561w", title="Test") assert "manifest.json" in r.manifest_url assert "12148/btv1b8453561w" in r.manifest_url def test_as_dict_keys(self): from picarones.importers.gallica import GallicaRecord r = GallicaRecord(ark="12148/btv1b8453561w", title="Froissart", creator="Froissart") d = r.as_dict() assert "ark" in d assert "title" in d assert "manifest_url" in d assert "url" in d def test_has_ocr_default_false(self): from picarones.importers.gallica import GallicaRecord r = GallicaRecord(ark="12148/xxx", title="Test") assert r.has_ocr is False def test_has_ocr_true(self): from picarones.importers.gallica import GallicaRecord r = GallicaRecord(ark="12148/xxx", title="Test", has_ocr=True) assert r.has_ocr is True # =========================================================================== # TestGallicaClient # =========================================================================== class TestGallicaClient: def test_import_module(self): from picarones.importers.gallica import GallicaClient assert GallicaClient is not None def test_init_defaults(self): from picarones.importers.gallica import GallicaClient client = GallicaClient() assert client.timeout == 30 assert client.delay >= 0 def test_search_returns_list(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) with patch.object(client, "_fetch_url", side_effect=RuntimeError("network")): results = client.search(title="Froissart", max_results=5) assert isinstance(results, list) def test_search_empty_on_network_error(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) with patch.object(client, "_fetch_url", side_effect=RuntimeError("timeout")): results = client.search(title="test") assert results == [] def test_get_ocr_text_returns_string(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) with patch.object(client, "_fetch_url", return_value=b"Froissart transcription"): text = client.get_ocr_text("12148/btv1b8453561w", page=1) assert isinstance(text, str) assert "Froissart" in text def test_get_ocr_text_empty_on_html_response(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) html = b"
Page non disponible" with patch.object(client, "_fetch_url", return_value=html): text = client.get_ocr_text("12148/xxx", page=1) assert text == "" def test_get_ocr_text_empty_on_error(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) with patch.object(client, "_fetch_url", side_effect=RuntimeError("404")): text = client.get_ocr_text("12148/xxx", page=99) assert text == "" def test_get_metadata_returns_dict(self): from picarones.importers.gallica import GallicaClient client = GallicaClient(delay_between_requests=0) xml_bytes = b"""