Picarones / tests /app /test_sprint_h2b_canonical_in_runner.py
Claude
feat(migration): Phase B4 β€” migrer les 6 fichiers de tests catΓ©gorie A
5112943 unverified
Raw
History Blame
8.26 kB
"""Sprint H.2.b β€” ``run_benchmark_via_service`` consomme des
``BaseOCRAdapter`` canoniques.
VΓ©rifie que :
- ``engine_to_pipeline_spec`` produit une ``PipelineSpec`` valide pour
un ``BaseOCRAdapter`` canonique.
- ``build_adapter_resolver`` enregistre directement le ``BaseOCRAdapter``
(pas de wrapping intermΓ©diaire).
- Bout-en-bout : un ``PrecomputedTextAdapter`` consommΓ© par
``run_benchmark_via_service`` produit un ``BenchmarkResult`` valide.
"""
from __future__ import annotations
from pathlib import Path
from picarones.adapters.ocr import (
BaseOCRAdapter,
PrecomputedTextAdapter,
ocr_adapter_from_name,
)
from picarones.app.services.benchmark_runner import (
build_adapter_resolver,
engine_to_pipeline_spec,
)
from picarones.evaluation.corpus import Corpus, Document
from tests._migration_helpers import run_via_orchestrator
# ──────────────────────────────────────────────────────────────────────
# Mock canonique minimal
# ──────────────────────────────────────────────────────────────────────
class _MockCanonicalOCR(BaseOCRAdapter):
"""Adapter canonique trivial β€” retourne ``"text from mock"`` pour
n'importe quelle image, sans toucher au filesystem (le contrat
StepExecutor exige une URI mais on l'Γ©crit dans tmp_path)."""
def __init__(self, name: str = "mock_canonical") -> None:
self._name = name
@property
def name(self) -> str:
return self._name
def execute(self, inputs, params, context):
from picarones.domain.artifacts import Artifact, ArtifactType
out_dir = Path(context.workspace_uri)
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / f"{context.document_id}_mock.txt"
out_path.write_text("text from mock", encoding="utf-8")
return {
ArtifactType.RAW_TEXT: Artifact(
id=f"{context.document_id}:{self._name}:raw_text",
document_id=context.document_id,
type=ArtifactType.RAW_TEXT,
produced_by_step="ocr",
uri=str(out_path),
),
}
def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus:
docs = []
for i in range(n):
img = tmp_path / f"doc{i}.png"
img.write_bytes(b"x")
docs.append(Document(
image_path=img,
ground_truth="text from mock",
doc_id=f"doc{i}",
))
return Corpus(name="canonical_test", documents=docs)
# ──────────────────────────────────────────────────────────────────────
# 1. engine_to_pipeline_spec accepte un BaseOCRAdapter
# ──────────────────────────────────────────────────────────────────────
class TestEngineToPipelineSpecCanonical:
def test_canonical_adapter_produces_mono_step_spec(self) -> None:
adapter = _MockCanonicalOCR(name="my_ocr")
spec = engine_to_pipeline_spec(adapter)
assert spec.name == "ocr_only_my_ocr"
assert len(spec.steps) == 1
assert spec.steps[0].adapter_name == "my_ocr"
def test_canonical_adapter_uses_adapter_input_output_types(self) -> None:
adapter = _MockCanonicalOCR()
spec = engine_to_pipeline_spec(adapter)
from picarones.domain.artifacts import ArtifactType
# PrecomputedTextAdapter declares IMAGE β†’ RAW_TEXT.
assert ArtifactType.IMAGE in spec.steps[0].input_types
assert ArtifactType.RAW_TEXT in spec.steps[0].output_types
def test_factory_built_adapter_works(self) -> None:
adapter = ocr_adapter_from_name(
"precomputed", source_label="bnf",
)
assert isinstance(adapter, PrecomputedTextAdapter)
spec = engine_to_pipeline_spec(adapter)
assert spec.steps[0].adapter_name == "precomputed_bnf"
# ──────────────────────────────────────────────────────────────────────
# 2. build_adapter_resolver enregistre direct (sans wrapping)
# ──────────────────────────────────────────────────────────────────────
class TestBuildAdapterResolverCanonical:
def test_canonical_registered_without_wrapping(self) -> None:
adapter = _MockCanonicalOCR(name="my_ocr")
resolver = build_adapter_resolver([adapter])
registered = resolver("my_ocr")
# L'instance retournΓ©e est l'adapter lui-mΓͺme, pas un wrapper.
assert registered is adapter
# ──────────────────────────────────────────────────────────────────────
# 3. run_benchmark_via_service bout-en-bout avec adapter canonique
# ──────────────────────────────────────────────────────────────────────
class TestRunBenchmarkWithCanonical:
def test_canonical_only_run_succeeds(self, tmp_path: Path) -> None:
corpus = _make_corpus(tmp_path)
adapter = _MockCanonicalOCR(name="my_ocr")
bm = run_via_orchestrator(corpus, [adapter])
assert bm.document_count == 1
assert len(bm.engine_reports) == 1
report = bm.engine_reports[0]
assert report.engine_name == "my_ocr"
# Hypothèse correctement extraite de l'artefact RAW_TEXT.
assert report.document_results[0].hypothesis == "text from mock"
def test_canonical_adapter_no_pipeline_metadata(
self, tmp_path: Path,
) -> None:
"""Un BaseOCRAdapter n'a pas ``is_pipeline`` β†’ pas de
``pipeline_metadata`` (cohΓ©rent avec un OCR seul legacy)."""
corpus = _make_corpus(tmp_path)
adapter = _MockCanonicalOCR()
bm = run_via_orchestrator(corpus, [adapter])
assert bm.engine_reports[0].document_results[0].pipeline_metadata == {}
def test_canonical_adapter_version_unknown(self, tmp_path: Path) -> None:
"""``BaseOCRAdapter`` n'a pas de ``version()`` β€” tolΓ©rance
``_safe_engine_version`` retourne ``"unknown"``."""
corpus = _make_corpus(tmp_path)
adapter = _MockCanonicalOCR()
bm = run_via_orchestrator(corpus, [adapter])
assert bm.engine_reports[0].engine_version == "unknown"
def test_multiple_canonical_run(self, tmp_path: Path) -> None:
"""Plusieurs adapters canoniques dans la mΓͺme liste."""
corpus = _make_corpus(tmp_path)
a = _MockCanonicalOCR(name="canon_a")
b = _MockCanonicalOCR(name="canon_b")
bm = run_via_orchestrator(corpus, [a, b])
assert len(bm.engine_reports) == 2
engine_names = {r.engine_name for r in bm.engine_reports}
assert engine_names == {"canon_a", "canon_b"}
def test_canonical_with_partial_dir(self, tmp_path: Path) -> None:
"""Le chemin resumable (D.2.b) marche aussi avec des
adapters canoniques."""
corpus = _make_corpus(tmp_path, n=2)
adapter = _MockCanonicalOCR(name="resumable_canon")
bm = run_via_orchestrator(
corpus, [adapter], partial_dir=tmp_path / "partials",
)
assert bm.document_count == 2
# Le partial a été supprimé après succès.
from picarones.app.services.partial_store import (
_partial_path,
)
partial = _partial_path(corpus.name, adapter.name, tmp_path / "partials")
assert not partial.exists()