Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 10,523 Bytes

0aa159b

"""Tests Sprint 16 — câblage line_metrics/hallucination + fondations du moteur narratif.

Couverture :
1. ``compute_document_result`` via le runner peuple bien ``line_metrics`` et
   ``hallucination_metrics`` sur un document réussi.
2. ``EngineReport`` expose ``aggregated_line_metrics`` et
   ``aggregated_hallucination`` après un benchmark.
3. Le modèle ``Fact`` et le ``DetectorRegistry`` fonctionnent.
4. Le registre par défaut est vide en Sprint 1 (les détecteurs seront activés
   progressivement dans les sprints suivants).
"""

from __future__ import annotations

import json
import tempfile
from pathlib import Path

import pytest

from picarones.core.corpus import Corpus, Document
from picarones.core.narrative import (
    DetectorRegistry,
    Fact,
    FactImportance,
    FactType,
    detect_all,
)
from picarones.core.runner import (
    _aggregate_hallucination,
    _aggregate_line_metrics,
    _compute_document_result,
    run_benchmark,
)
from picarones.engines.base import BaseOCREngine, EngineResult


class _FakeEngine(BaseOCREngine):
    """Stub OCR engine for tests: always yields a preconfigured text.

    Args:
        output_text: Text reported for every image, whatever its content.
        name: Identifier exposed through the ``name`` property.
        config: Passed through unchanged to ``BaseOCREngine.__init__``.
    """

    def __init__(self, output_text: str, name: str = "fake", config=None):
        super().__init__(config)
        self._output, self._display_name = output_text, name

    @property
    def name(self) -> str:
        """Return the engine name supplied at construction time."""
        return self._display_name

    def version(self) -> str:
        """Return a fixed placeholder version string."""
        return "test"

    def _run_ocr(self, image_path):
        """Return the canned text paired with ``None``; the image is never read."""
        return (self._output, None)

    def run(self, image_path) -> EngineResult:
        """Build an ``EngineResult`` for ``image_path`` without reading the file.

        The result carries the canned text and a fixed duration of 0.01 s.
        """
        result_fields = {
            "engine_name": self.name,
            "image_path": str(image_path),
            "text": self._output,
            "duration_seconds": 0.01,
        }
        return EngineResult(**result_fields)


# ---------------------------------------------------------------------------
# 1. Câblage line_metrics et hallucination par document
# ---------------------------------------------------------------------------

class TestDocumentResultWiring:
    """Check that ``_compute_document_result`` fills in the new per-document fields."""

    @staticmethod
    def _evaluate(image: Path, ocr: EngineResult, ground_truth: str):
        """Call ``_compute_document_result`` for document ``doc1`` with no char exclusion."""
        return _compute_document_result(
            doc_id="doc1",
            image_path=str(image),
            ground_truth=ground_truth,
            ocr_result=ocr,
            char_exclude=None,
        )

    def test_line_metrics_populated_on_success(self, tmp_path: Path):
        """A successful three-line OCR result must carry populated line metrics."""
        image_file = tmp_path / "doc.png"
        # Stub bytes only (PNG signature); per the original note, image_quality
        # is expected to log a warning on such a file.
        image_file.write_bytes(b"\x89PNG\r\n\x1a\n")

        three_lines = "ligne une\nligne deux\nligne trois"
        ocr_output = EngineResult(
            engine_name="fake",
            image_path=str(image_file),
            text=three_lines,
            duration_seconds=0.1,
        )

        outcome = self._evaluate(image_file, ocr_output, three_lines)

        assert outcome.line_metrics is not None, "line_metrics doit être peuplé"
        for expected_key in ("percentiles", "gini"):
            assert expected_key in outcome.line_metrics
        assert outcome.line_metrics["line_count"] == 3

    def test_hallucination_metrics_populated_on_success(self, tmp_path: Path):
        """A successful OCR result must carry the hallucination metric keys."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")

        reference = "le chat est sur le tapis rouge et dort paisiblement"
        hypothesis = "le chat mange des bananes spatiales en orbite lunaire"
        ocr_output = EngineResult(
            engine_name="fake",
            image_path=str(image_file),
            text=hypothesis,
            duration_seconds=0.1,
        )

        outcome = self._evaluate(image_file, ocr_output, reference)

        assert outcome.hallucination_metrics is not None
        for expected_key in ("anchor_score", "length_ratio", "is_hallucinating"):
            assert expected_key in outcome.hallucination_metrics

    def test_new_fields_empty_on_engine_failure(self, tmp_path: Path):
        """When the OCR result carries an error, both new fields must stay ``None``."""
        image_file = tmp_path / "doc.png"
        image_file.write_bytes(b"")

        failed_ocr = EngineResult(
            engine_name="fake",
            image_path=str(image_file),
            text="",
            duration_seconds=0.1,
            error="simulated failure",
        )

        outcome = self._evaluate(image_file, failed_ocr, "ground truth text")

        assert outcome.line_metrics is None
        assert outcome.hallucination_metrics is None


# ---------------------------------------------------------------------------
# 2. Agrégation au niveau EngineReport
# ---------------------------------------------------------------------------

class TestAggregationWiring:
    """Check that a full benchmark run produces the engine-level aggregations."""

    def test_aggregate_line_metrics_helper_with_empty_list(self):
        """Aggregating an empty list of line metrics yields ``None``."""
        aggregated = _aggregate_line_metrics([])
        assert aggregated is None

    def test_aggregate_hallucination_helper_with_empty_list(self):
        """Aggregating an empty list of hallucination metrics yields ``None``."""
        aggregated = _aggregate_hallucination([])
        assert aggregated is None

    def test_benchmark_end_to_end_produces_aggregations(self, tmp_path: Path):
        """Two documents through one fake engine give aggregations counting both."""
        image_file = tmp_path / "test.png"
        image_file.write_bytes(b"")

        # Both documents deliberately point at the same empty image file.
        first_doc = Document(
            doc_id="d1",
            image_path=image_file,
            ground_truth="bonjour le monde\nligne deux\nfin",
        )
        second_doc = Document(
            doc_id="d2",
            image_path=image_file,
            ground_truth="autre document test\navec deux lignes",
        )
        corpus = Corpus(
            name="test",
            documents=[first_doc, second_doc],
            source_path=str(tmp_path),
        )
        fake = _FakeEngine(
            output_text="bonjour le monde\nligne deux\nfin",
            name="fake_engine",
        )

        benchmark = run_benchmark(
            corpus=corpus,
            engines=[fake],
            show_progress=False,
            max_workers=1,
            partial_dir=str(tmp_path / "partial"),
        )

        reports = benchmark.engine_reports
        assert len(reports) == 1
        report = reports[0]

        line_message = "aggregated_line_metrics doit être peuplé après benchmark"
        assert report.aggregated_line_metrics is not None, line_message
        for expected_key in ("gini_mean", "document_count"):
            assert expected_key in report.aggregated_line_metrics
        assert report.aggregated_line_metrics["document_count"] == 2

        hallucination_message = "aggregated_hallucination doit être peuplé après benchmark"
        assert report.aggregated_hallucination is not None, hallucination_message
        assert "anchor_score_mean" in report.aggregated_hallucination
        assert report.aggregated_hallucination["document_count"] == 2

    def test_json_export_includes_new_aggregations(self, tmp_path: Path):
        """The JSON export of a benchmark exposes both aggregation keys per engine."""
        image_file = tmp_path / "t.png"
        image_file.write_bytes(b"")

        only_doc = Document(doc_id="d1", image_path=image_file, ground_truth="un\ndeux")
        corpus = Corpus(
            name="test",
            documents=[only_doc],
            source_path=str(tmp_path),
        )
        fake = _FakeEngine(output_text="un\ndeux", name="fake")

        export_path = tmp_path / "bench.json"
        run_benchmark(
            corpus=corpus,
            engines=[fake],
            output_json=export_path,
            show_progress=False,
            max_workers=1,
            partial_dir=str(tmp_path / "partial"),
        )

        exported = json.loads(export_path.read_text(encoding="utf-8"))
        first_report = exported["engine_reports"][0]
        for expected_key in ("aggregated_line_metrics", "aggregated_hallucination"):
            assert expected_key in first_report


# ---------------------------------------------------------------------------
# 3. Modèle Fact et DetectorRegistry
# ---------------------------------------------------------------------------

class TestFactModel:
    """Serialization and ordering checks for the ``Fact`` model."""

    def test_fact_is_serializable(self):
        """``Fact.as_dict`` exposes type, importance, payload and engines as plain values."""
        serialized = Fact(
            type=FactType.GLOBAL_LEADER_CER,
            importance=FactImportance.CRITICAL,
            payload={"engine": "tesseract", "cer": 0.042},
            engines_involved=("tesseract",),
        ).as_dict()

        assert serialized["type"] == "global_leader_cer"
        assert serialized["importance"] == 100
        assert serialized["payload"]["cer"] == 0.042
        # The tuple given at construction is expected back as a list.
        assert serialized["engines_involved"] == ["tesseract"]

    def test_fact_importance_ordering(self):
        """Importance levels compare strictly in the order CRITICAL > HIGH > MEDIUM > LOW."""
        descending = [
            FactImportance.CRITICAL,
            FactImportance.HIGH,
            FactImportance.MEDIUM,
            FactImportance.LOW,
        ]
        # Compare each level with the one immediately below it.
        for higher, lower in zip(descending, descending[1:]):
            assert higher > lower


class TestDetectorRegistry:
    """Behavioral checks for ``DetectorRegistry`` and the default ``detect_all``."""

    def test_registry_starts_empty(self):
        """A fresh registry has no registered types and produces no facts."""
        fresh = DetectorRegistry()
        assert fresh.registered_types() == ()
        assert fresh.run({}) == []

    def test_register_and_run(self):
        """A registered detector is listed and receives the data passed to ``run``."""

        def leader_detector(data: dict) -> list[Fact]:
            leader_fact = Fact(
                type=FactType.GLOBAL_LEADER_CER,
                importance=FactImportance.CRITICAL,
                payload={"engine": data.get("leader", "unknown")},
            )
            return [leader_fact]

        registry = DetectorRegistry()
        registry.register(FactType.GLOBAL_LEADER_CER, leader_detector)
        assert FactType.GLOBAL_LEADER_CER in registry.registered_types()

        produced = registry.run({"leader": "tesseract"})
        assert len(produced) == 1
        assert produced[0].payload["engine"] == "tesseract"

    def test_registry_swallows_detector_exceptions(self):
        """A failing detector must not break the narrative pipeline."""

        def failing_detector(data: dict) -> list[Fact]:
            raise RuntimeError("boom")

        def healthy_detector(data: dict) -> list[Fact]:
            speed_fact = Fact(
                type=FactType.SPEED_WINNER,
                importance=FactImportance.HIGH,
                payload={},
            )
            return [speed_fact]

        registry = DetectorRegistry()
        registry.register(FactType.GLOBAL_LEADER_CER, failing_detector)
        registry.register(FactType.SPEED_WINNER, healthy_detector)

        # Only the healthy detector's fact should come back.
        produced = registry.run({})
        assert len(produced) == 1
        assert produced[0].type == FactType.SPEED_WINNER

    def test_default_registry_is_empty_in_sprint_1(self):
        """Sprint 1 lays foundations only: no detector is enabled by default.

        Per the original note, detectors are to be enabled in Sprint 4 together
        with their templates.
        """
        assert detect_all({}) == []