Spaces:
Running
Running
File size: 8,260 Bytes
ff7895c 9312a64 ff7895c 9312a64 de9192c 9312a64 5112943 9312a64 5112943 9312a64 5112943 9312a64 5112943 9312a64 ff7895c 9312a64 ff7895c 9312a64 5112943 9312a64 ff7895c 9312a64 5112943 9312a64 de9192c 9312a64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | """Sprint H.2.b — ``run_benchmark_via_service`` consomme des
``BaseOCRAdapter`` canoniques.
Vérifie que :
- ``engine_to_pipeline_spec`` produit une ``PipelineSpec`` valide pour
un ``BaseOCRAdapter`` canonique.
- ``build_adapter_resolver`` enregistre directement le ``BaseOCRAdapter``
(pas de wrapping intermédiaire).
- Bout-en-bout : un ``PrecomputedTextAdapter`` consommé par
``run_benchmark_via_service`` produit un ``BenchmarkResult`` valide.
"""
from __future__ import annotations
from pathlib import Path
from picarones.adapters.ocr import (
BaseOCRAdapter,
PrecomputedTextAdapter,
ocr_adapter_from_name,
)
from picarones.app.services.benchmark_runner import (
build_adapter_resolver,
engine_to_pipeline_spec,
)
from picarones.evaluation.corpus import Corpus, Document
from tests._migration_helpers import run_via_orchestrator
# ──────────────────────────────────────────────────────────────────────
# Mock canonique minimal
# ──────────────────────────────────────────────────────────────────────
class _MockCanonicalOCR(BaseOCRAdapter):
    """Minimal canonical OCR adapter used as a test double.

    ``execute`` ignores its inputs and always produces the text
    ``"text from mock"``.  The text is written to a file under
    ``context.workspace_uri`` so that the returned artifact carries a
    URI pointing at an existing file.
    """

    def __init__(self, name: str = "mock_canonical") -> None:
        # Only the adapter name is stored; nothing else is initialised here.
        self._name = name

    @property
    def name(self) -> str:
        """Return the adapter name given at construction time."""
        return self._name

    def execute(self, inputs, params, context):
        """Write the fixed mock text and return it as a ``RAW_TEXT`` artifact.

        ``inputs`` and ``params`` are not read.  ``context`` must expose
        ``workspace_uri`` and ``document_id``.  Returns a one-entry dict
        keyed by ``ArtifactType.RAW_TEXT``.
        """
        from picarones.domain.artifacts import Artifact, ArtifactType

        workspace = Path(context.workspace_uri)
        workspace.mkdir(parents=True, exist_ok=True)

        # NOTE(review): the file name depends only on the document id, so two
        # mocks sharing a workspace would write the same path (with identical
        # content) — confirm whether workspaces are per-adapter.
        text_file = workspace / f"{context.document_id}_mock.txt"
        text_file.write_text("text from mock", encoding="utf-8")

        raw_text = Artifact(
            id=f"{context.document_id}:{self._name}:raw_text",
            document_id=context.document_id,
            type=ArtifactType.RAW_TEXT,
            produced_by_step="ocr",
            uri=str(text_file),
        )
        return {ArtifactType.RAW_TEXT: raw_text}
def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus:
    """Build a ``Corpus`` named ``"canonical_test"`` holding ``n`` documents.

    For each index ``i`` a one-byte placeholder file ``doc<i>.png`` is
    written into ``tmp_path`` and wrapped in a ``Document`` whose id is
    ``doc<i>`` and whose ground truth is ``"text from mock"``.
    """
    documents = []
    for index in range(n):
        doc_id = f"doc{index}"
        image = tmp_path / f"{doc_id}.png"
        # The file content is a single placeholder byte, not a real image.
        image.write_bytes(b"x")
        documents.append(
            Document(
                image_path=image,
                ground_truth="text from mock",
                doc_id=doc_id,
            )
        )
    return Corpus(name="canonical_test", documents=documents)
# ──────────────────────────────────────────────────────────────────────
# 1. engine_to_pipeline_spec accepte un BaseOCRAdapter
# ──────────────────────────────────────────────────────────────────────
class TestEngineToPipelineSpecCanonical:
    """``engine_to_pipeline_spec`` applied to canonical ``BaseOCRAdapter`` objects."""

    def test_canonical_adapter_produces_mono_step_spec(self) -> None:
        """The spec has exactly one step, named after the adapter."""
        spec = engine_to_pipeline_spec(_MockCanonicalOCR(name="my_ocr"))

        assert spec.name == "ocr_only_my_ocr"
        assert len(spec.steps) == 1
        only_step = spec.steps[0]
        assert only_step.adapter_name == "my_ocr"

    def test_canonical_adapter_uses_adapter_input_output_types(self) -> None:
        """The single step exposes IMAGE as input and RAW_TEXT as output."""
        from picarones.domain.artifacts import ArtifactType

        step = engine_to_pipeline_spec(_MockCanonicalOCR()).steps[0]

        # The mock declares no types of its own; presumably they come from
        # BaseOCRAdapter (IMAGE -> RAW_TEXT) — TODO confirm.
        assert ArtifactType.IMAGE in step.input_types
        assert ArtifactType.RAW_TEXT in step.output_types

    def test_factory_built_adapter_works(self) -> None:
        """An adapter built by ``ocr_adapter_from_name`` is accepted as well."""
        built = ocr_adapter_from_name("precomputed", source_label="bnf")
        assert isinstance(built, PrecomputedTextAdapter)

        first_step = engine_to_pipeline_spec(built).steps[0]
        assert first_step.adapter_name == "precomputed_bnf"
# ──────────────────────────────────────────────────────────────────────
# 2. build_adapter_resolver enregistre direct (sans wrapping)
# ──────────────────────────────────────────────────────────────────────
class TestBuildAdapterResolverCanonical:
    """``build_adapter_resolver`` with a canonical adapter."""

    def test_canonical_registered_without_wrapping(self) -> None:
        """Resolving by name returns the very same adapter object."""
        original = _MockCanonicalOCR(name="my_ocr")
        resolve = build_adapter_resolver([original])

        # Identity check: the resolver must hand back the adapter itself,
        # not a wrapper around it.
        assert resolve("my_ocr") is original
# ──────────────────────────────────────────────────────────────────────
# 3. run_benchmark_via_service bout-en-bout avec adapter canonique
# ──────────────────────────────────────────────────────────────────────
class TestRunBenchmarkWithCanonical:
    """End-to-end runs through ``run_via_orchestrator`` with canonical adapters."""

    def test_canonical_only_run_succeeds(self, tmp_path: Path) -> None:
        """A single mock adapter yields one report with the mock's text."""
        result = run_via_orchestrator(
            _make_corpus(tmp_path),
            [_MockCanonicalOCR(name="my_ocr")],
        )

        assert result.document_count == 1
        assert len(result.engine_reports) == 1
        only_report = result.engine_reports[0]
        assert only_report.engine_name == "my_ocr"
        # The hypothesis must match the text the mock wrote as RAW_TEXT.
        assert only_report.document_results[0].hypothesis == "text from mock"

    def test_canonical_adapter_no_pipeline_metadata(
        self,
        tmp_path: Path,
    ) -> None:
        """A plain OCR adapter run leaves ``pipeline_metadata`` empty.

        The original note attributes this to the adapter having no
        ``is_pipeline`` attribute, consistent with a legacy OCR-only engine.
        """
        result = run_via_orchestrator(
            _make_corpus(tmp_path),
            [_MockCanonicalOCR()],
        )

        assert result.engine_reports[0].document_results[0].pipeline_metadata == {}

    def test_canonical_adapter_version_unknown(self, tmp_path: Path) -> None:
        """The mock defines no ``version()``; the report shows ``"unknown"``.

        The original note attributes the fallback to ``_safe_engine_version``.
        """
        result = run_via_orchestrator(
            _make_corpus(tmp_path),
            [_MockCanonicalOCR()],
        )

        assert result.engine_reports[0].engine_version == "unknown"

    def test_multiple_canonical_run(self, tmp_path: Path) -> None:
        """Several canonical adapters can be passed in the same list."""
        adapters = [
            _MockCanonicalOCR(name="canon_a"),
            _MockCanonicalOCR(name="canon_b"),
        ]
        result = run_via_orchestrator(_make_corpus(tmp_path), adapters)

        assert len(result.engine_reports) == 2
        reported = {report.engine_name for report in result.engine_reports}
        assert reported == {"canon_a", "canon_b"}

    def test_canonical_with_partial_dir(self, tmp_path: Path) -> None:
        """The resumable path (D.2.b) also works with canonical adapters."""
        corpus = _make_corpus(tmp_path, n=2)
        adapter = _MockCanonicalOCR(name="resumable_canon")
        partial_dir = tmp_path / "partials"

        result = run_via_orchestrator(corpus, [adapter], partial_dir=partial_dir)
        assert result.document_count == 2

        # After a successful run the partial file must be gone.
        from picarones.app.services.partial_store import _partial_path

        leftover = _partial_path(corpus.name, adapter.name, partial_dir)
        assert not leftover.exists()
|