Picarones / tests /test_sprint16_narrative_foundations.py
Claude
Sprint 1 du plan rapport — câblage Sprint 10 + fondations narratives
0aa159b unverified
Raw
History Blame
10.5 kB
"""Tests Sprint 16 — câblage line_metrics/hallucination + fondations du moteur narratif.
Couverture :
1. ``compute_document_result`` via le runner peuple bien ``line_metrics`` et
``hallucination_metrics`` sur un document réussi.
2. ``EngineReport`` expose ``aggregated_line_metrics`` et
``aggregated_hallucination`` après un benchmark.
3. Le modèle ``Fact`` et le ``DetectorRegistry`` fonctionnent.
4. Le registre par défaut est vide en Sprint 1 (les détecteurs seront activés
progressivement dans les sprints suivants).
"""
from __future__ import annotations
import json
import tempfile
from pathlib import Path
import pytest
from picarones.core.corpus import Corpus, Document
from picarones.core.narrative import (
DetectorRegistry,
Fact,
FactImportance,
FactType,
detect_all,
)
from picarones.core.runner import (
_aggregate_hallucination,
_aggregate_line_metrics,
_compute_document_result,
run_benchmark,
)
from picarones.engines.base import BaseOCREngine, EngineResult
class _FakeEngine(BaseOCREngine):
    """Stub OCR engine for tests: always yields a preconfigured text.

    Args:
        output_text: Text returned for every image, whatever its content.
        name: Engine name exposed through the ``name`` property.
        config: Forwarded untouched to ``BaseOCREngine.__init__``.
    """

    def __init__(self, output_text: str, name: str = "fake", config=None):
        super().__init__(config)
        self._output = output_text
        self._display_name = name

    @property
    def name(self) -> str:
        """Return the display name given at construction time."""
        return self._display_name

    def version(self) -> str:
        """Return a fixed placeholder version string."""
        return "test"

    def _run_ocr(self, image_path):
        """Return the canned text paired with ``None``; the image is ignored."""
        return (self._output, None)

    def run(self, image_path) -> EngineResult:
        """Build an ``EngineResult`` directly from the canned text.

        The image path is only stringified and echoed back in the result;
        the reported duration is a constant 0.01 seconds.
        """
        engine_name = self.name
        path_text = str(image_path)
        return EngineResult(
            engine_name=engine_name,
            image_path=path_text,
            text=self._output,
            duration_seconds=0.01,
        )
# ---------------------------------------------------------------------------
# 1. Câblage line_metrics et hallucination par document
# ---------------------------------------------------------------------------
class TestDocumentResultWiring:
    """Check that ``_compute_document_result`` fills in the new fields."""

    def test_line_metrics_populated_on_success(self, tmp_path: Path):
        """A successful OCR result yields a populated ``line_metrics`` dict."""
        reference_text = "ligne une\nligne deux\nligne trois"
        image_file = tmp_path / "doc.png"
        # Stub file holding only a PNG signature — image_quality will log a warning.
        image_file.write_bytes(b"\x89PNG\r\n\x1a\n")
        image_str = str(image_file)

        doc_result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth=reference_text,
            ocr_result=EngineResult(
                engine_name="fake",
                image_path=image_str,
                text="ligne une\nligne deux\nligne trois",
                duration_seconds=0.1,
            ),
            char_exclude=None,
        )

        assert doc_result.line_metrics is not None, "line_metrics doit être peuplé"
        for expected_key in ("percentiles", "gini"):
            assert expected_key in doc_result.line_metrics
        assert doc_result.line_metrics["line_count"] == 3

    def test_hallucination_metrics_populated_on_success(self, tmp_path: Path):
        """A successful OCR result yields the expected hallucination keys."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")
        image_str = str(image_file)

        # Reference and hypothesis share only their first two words.
        reference_text = "le chat est sur le tapis rouge et dort paisiblement"
        hypothesis_text = "le chat mange des bananes spatiales en orbite lunaire"

        doc_result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth=reference_text,
            ocr_result=EngineResult(
                engine_name="fake",
                image_path=image_str,
                text=hypothesis_text,
                duration_seconds=0.1,
            ),
            char_exclude=None,
        )

        assert doc_result.hallucination_metrics is not None
        for expected_key in ("anchor_score", "length_ratio", "is_hallucinating"):
            assert expected_key in doc_result.hallucination_metrics

    def test_new_fields_empty_on_engine_failure(self, tmp_path: Path):
        """An OCR result carrying an error must leave both new fields unset."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")
        image_str = str(image_file)

        failed_ocr = EngineResult(
            engine_name="fake",
            image_path=image_str,
            text="",
            duration_seconds=0.1,
            error="simulated failure",
        )
        doc_result = _compute_document_result(
            doc_id="doc1",
            image_path=image_str,
            ground_truth="ground truth text",
            ocr_result=failed_ocr,
            char_exclude=None,
        )

        assert doc_result.line_metrics is None
        assert doc_result.hallucination_metrics is None
# ---------------------------------------------------------------------------
# 2. Agrégation au niveau EngineReport
# ---------------------------------------------------------------------------
class TestAggregationWiring:
    """Check that a full benchmark run produces the aggregated metrics."""

    def test_aggregate_line_metrics_helper_with_empty_list(self):
        """With no per-document input the line-metrics helper returns ``None``."""
        aggregated = _aggregate_line_metrics([])
        assert aggregated is None

    def test_aggregate_hallucination_helper_with_empty_list(self):
        """With no per-document input the hallucination helper returns ``None``."""
        aggregated = _aggregate_hallucination([])
        assert aggregated is None

    def test_benchmark_end_to_end_produces_aggregations(self, tmp_path: Path):
        """Run two documents through one fake engine and inspect its report."""
        image_file = tmp_path / "test.png"
        image_file.write_bytes(b"")

        first_doc = Document(
            doc_id="d1",
            image_path=image_file,
            ground_truth="bonjour le monde\nligne deux\nfin",
        )
        second_doc = Document(
            doc_id="d2",
            image_path=image_file,
            ground_truth="autre document test\navec deux lignes",
        )
        two_doc_corpus = Corpus(
            name="test",
            documents=[first_doc, second_doc],
            source_path=str(tmp_path),
        )
        # The fake engine emits the same text for both documents.
        fake = _FakeEngine(
            output_text="bonjour le monde\nligne deux\nfin",
            name="fake_engine",
        )

        benchmark = run_benchmark(
            corpus=two_doc_corpus,
            engines=[fake],
            show_progress=False,
            max_workers=1,
            partial_dir=str(tmp_path / "partial"),
        )

        assert len(benchmark.engine_reports) == 1
        engine_report = benchmark.engine_reports[0]

        assert engine_report.aggregated_line_metrics is not None, (
            "aggregated_line_metrics doit être peuplé après benchmark"
        )
        for expected_key in ("gini_mean", "document_count"):
            assert expected_key in engine_report.aggregated_line_metrics
        assert engine_report.aggregated_line_metrics["document_count"] == 2

        assert engine_report.aggregated_hallucination is not None, (
            "aggregated_hallucination doit être peuplé après benchmark"
        )
        assert "anchor_score_mean" in engine_report.aggregated_hallucination
        assert engine_report.aggregated_hallucination["document_count"] == 2

    def test_json_export_includes_new_aggregations(self, tmp_path: Path):
        """The JSON written by ``run_benchmark`` exposes both aggregation keys."""
        image_file = tmp_path / "t.png"
        image_file.write_bytes(b"")
        json_path = tmp_path / "bench.json"

        only_doc = Document(doc_id="d1", image_path=image_file, ground_truth="un\ndeux")
        one_doc_corpus = Corpus(
            name="test",
            documents=[only_doc],
            source_path=str(tmp_path),
        )
        fake = _FakeEngine(output_text="un\ndeux", name="fake")

        run_benchmark(
            corpus=one_doc_corpus,
            engines=[fake],
            output_json=json_path,
            show_progress=False,
            max_workers=1,
            partial_dir=str(tmp_path / "partial"),
        )

        exported = json.loads(json_path.read_text(encoding="utf-8"))
        first_report = exported["engine_reports"][0]
        for expected_key in ("aggregated_line_metrics", "aggregated_hallucination"):
            assert expected_key in first_report
# ---------------------------------------------------------------------------
# 3. Modèle Fact et DetectorRegistry
# ---------------------------------------------------------------------------
class TestFactModel:
    """Exercise serialization and importance ordering of the ``Fact`` model."""

    def test_fact_is_serializable(self):
        """``Fact.as_dict`` returns the plain values asserted below."""
        serialized = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={"engine": "tesseract", "cer": 0.042},
            engines_involved=("tesseract",),
        ).as_dict()

        assert serialized["type"] == "global_leader_cer"
        assert serialized["importance"] == 100
        assert serialized["payload"]["cer"] == 0.042
        # The tuple passed at construction is expected back as a list.
        assert serialized["engines_involved"] == ["tesseract"]

    def test_fact_importance_ordering(self):
        """Importance levels compare as CRITICAL > HIGH > MEDIUM > LOW."""
        descending = (
            FactImportance.CRITICAL,
            FactImportance.HIGH,
            FactImportance.MEDIUM,
            FactImportance.LOW,
        )
        # Compare each level with its immediate successor.
        for higher, lower in zip(descending, descending[1:]):
            assert higher > lower
class TestDetectorRegistry:
    """Exercise registration and execution on ``DetectorRegistry``."""

    def test_registry_starts_empty(self):
        """A fresh registry lists no types and produces no facts."""
        fresh = DetectorRegistry()
        assert fresh.registered_types() == ()
        assert fresh.run({}) == []

    def test_register_and_run(self):
        """A registered detector receives the data dict and its fact is returned."""

        def leader_detector(data: dict) -> list[Fact]:
            leader_name = data.get("leader", "unknown")
            return [
                Fact(
                    type=FactType.GLOBAL_LEADER_CER,
                    importance=FactImportance.CRITICAL,
                    payload={"engine": leader_name},
                )
            ]

        reg = DetectorRegistry()
        reg.register(FactType.GLOBAL_LEADER_CER, leader_detector)
        assert FactType.GLOBAL_LEADER_CER in reg.registered_types()

        produced = reg.run({"leader": "tesseract"})
        assert len(produced) == 1
        assert produced[0].payload["engine"] == "tesseract"

    def test_registry_swallows_detector_exceptions(self):
        """A failing detector must not break the narrative pipeline."""

        def raising_detector(data: dict) -> list[Fact]:
            raise RuntimeError("boom")

        def healthy_detector(data: dict) -> list[Fact]:
            speed_fact = Fact(
                type=FactType.SPEED_WINNER,
                importance=FactImportance.HIGH,
                payload={},
            )
            return [speed_fact]

        reg = DetectorRegistry()
        # The raising detector is registered first, as in the original test.
        reg.register(FactType.GLOBAL_LEADER_CER, raising_detector)
        reg.register(FactType.SPEED_WINNER, healthy_detector)

        produced = reg.run({})
        assert len(produced) == 1
        assert produced[0].type == FactType.SPEED_WINNER

    def test_default_registry_is_empty_in_sprint_1(self):
        """Sprint 1 lays foundations only: no detector is enabled by default.

        Per the original note, detectors are to be enabled in Sprint 4
        together with their templates.
        """
        produced = detect_all({})
        assert produced == []