Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

File size: 11,873 Bytes

"""Sprint A14-S25 — projection sans hack loader.

Le test central qui démontre que le fix du protocole ``Projector``
(retourne ``(Artifact, payload, ProjectionReport)`` au lieu de
``(Artifact, ProjectionReport)``) débloque le workflow CLI :
on peut maintenant exécuter une pipeline qui produit ALTO_XML, la
faire évaluer par TextView (qui projette ALTO → texte), et obtenir
des métriques **sans pré-stocker manuellement le payload projeté
dans le loader**.

C'est précisément le cas BnF central :
- Pipeline 1 : Tesseract → RAW_TEXT (TextView direct).
- Pipeline 2 : Pero OCR → ALTO_XML (TextView via projection
  ALTO→texte).

Les deux pipelines doivent être comparables sur la même TextView.
"""

from __future__ import annotations

from pathlib import Path

from picarones.app.services import RegistryService
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.domain.evaluation_spec import MetricSpec
from picarones.evaluation.registry import MetricRegistry
from picarones.evaluation.views import (
    DefaultEvaluationViewExecutor,
    build_text_view,
)
from picarones.formats.alto.types import (
    AltoBBox,
    AltoDocument,
    AltoLine,
    AltoPage,
    AltoString,
    AltoTextBlock,
)
from picarones.formats.alto.writer import write_alto


# ──────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────


def _build_alto(text: str) -> AltoDocument:
    return AltoDocument(pages=(AltoPage(blocks=(AltoTextBlock(lines=(AltoLine(strings=tuple(
        AltoString(content=w, bbox=AltoBBox(hpos=0, vpos=0, width=10, height=10))
        for w in text.split()
    )),),),),),),)


def _stub_cer(reference: str, hypothesis: str) -> float:
    if not reference:
        return 0.0 if not hypothesis else 1.0
    common = sum(1 for a, b in zip(reference, hypothesis) if a == b)
    return 1.0 - (common / max(len(reference), len(hypothesis)))


def _strict_loader(art: Artifact):
    """Loader qui REFUSE explicitement les artefacts projetés.

    Si l'executor essaie d'appeler ``loader(art)`` sur un artefact
    dont l'id se termine par ``:projected_text``, on lève — preuve
    que le fix S25 fait que l'executor n'appelle PAS le loader sur
    les artefacts projetés.

    Pour les autres artefacts (RAW_TEXT/ALTO_XML avec URI), on lit
    depuis le filesystem.
    """
    if ":projected_text" in art.id:
        raise AssertionError(
            f"S25 régression : le loader a été appelé sur "
            f"l'artefact projeté {art.id!r} — le fix S25 garantit que "
            "le payload est utilisé directement depuis le retour du "
            "projecteur, sans repasser par le loader."
        )
    if art.type == ArtifactType.RAW_TEXT:
        return Path(art.uri).read_text(encoding="utf-8")
    if art.type == ArtifactType.ALTO_XML:
        from picarones.formats.alto.parser import parse_alto
        return parse_alto(Path(art.uri).read_bytes())
    raise KeyError(f"loader strict : type {art.type} non géré")


# ──────────────────────────────────────────────────────────────────
# Tests
# ──────────────────────────────────────────────────────────────────


class TestProjectionWithoutLoaderHack:
    """Avant S25, l'executor appelait ``loader(projected_artifact)`` —
    obligeant les tests à pré-stocker le payload projeté dans une map.
    Après S25, le projecteur retourne le payload directement et
    l'executor ne sollicite plus le loader pour les artefacts projetés.
    """

    def test_alto_to_text_projection_works_without_loader_hack(
        self, tmp_path: Path,
    ) -> None:
        # Setup : un ALTO sur disque + une GT texte sur disque.
        gt_text = "Bonjour le monde"
        alto_doc = _build_alto(gt_text)
        alto_path = tmp_path / "doc.alto.xml"
        alto_path.write_bytes(write_alto(alto_doc))

        gt_path = tmp_path / "doc.gt.txt"
        gt_path.write_text(gt_text, encoding="utf-8")

        # Bootstrap registries via le service S23.
        registries = RegistryService.bootstrap_defaults()

        # Loader strict qui ASSERTE qu'il n'est pas appelé sur l'artefact
        # projeté.
        executor = DefaultEvaluationViewExecutor.from_registries(
            registries.metrics,
            registries.projectors,
            _strict_loader,
        )

        # Candidat : ALTO_XML.  GT : RAW_TEXT.  Vue : TextView qui
        # projette ALTO → texte.
        cand = Artifact(
            id="d1:pero:alto",
            document_id="d1",
            type=ArtifactType.ALTO_XML,
            uri=str(alto_path),
        )
        gt = Artifact(
            id="d1:gt:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            uri=str(gt_path),
        )
        view = build_text_view()
        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        # Validation : la projection a bien eu lieu, le payload retourné
        # par le projecteur a été utilisé (le loader strict aurait levé
        # sinon), et le CER est 0 puisque le texte ALTO matche la GT.
        assert result.projection_report is not None
        assert result.projection_report.projector_name == "alto_to_text"
        assert result.failed_metrics == {}, (
            f"Métriques en échec inattendues : {result.failed_metrics}"
        )
        assert result.metric_values["cer"] == 0.0
        assert result.metric_values["wer"] == 0.0

    def test_canonical_to_text_projection_works_without_loader_hack(
        self, tmp_path: Path,
    ) -> None:
        # Setup : markdown sur disque + GT texte.
        md_path = tmp_path / "doc.canonical.md"
        md_path.write_text(
            "# Titre\n\nBonjour le monde\n",
            encoding="utf-8",
        )
        gt_path = tmp_path / "doc.gt.txt"
        gt_path.write_text("Titre Bonjour le monde", encoding="utf-8")

        registries = RegistryService.bootstrap_defaults()
        executor = DefaultEvaluationViewExecutor.from_registries(
            registries.metrics,
            registries.projectors,
            _strict_loader,
        )

        cand = Artifact(
            id="d1:vlm:canonical",
            document_id="d1",
            type=ArtifactType.CANONICAL_DOCUMENT,
            uri=str(md_path),
        )
        gt = Artifact(
            id="d1:gt:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            uri=str(gt_path),
        )
        view = build_text_view()
        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        assert result.projection_report is not None
        assert result.projection_report.projector_name == "canonical_to_text"
        assert result.failed_metrics == {}, (
            f"Métriques en échec inattendues : {result.failed_metrics}"
        )

    def test_loader_still_called_for_non_projected_candidate(
        self, tmp_path: Path,
    ) -> None:
        """Garde-fou : le loader EST appelé pour les artefacts non
        projetés (RAW_TEXT direct), juste pas pour les projetés.
        Vérifie qu'on n'a pas accidentellement court-circuité
        TOUS les chemins."""
        gt_text = "Identique"
        cand_path = tmp_path / "cand.txt"
        cand_path.write_text(gt_text, encoding="utf-8")
        gt_path = tmp_path / "gt.txt"
        gt_path.write_text(gt_text, encoding="utf-8")

        registries = RegistryService.bootstrap_defaults()
        executor = DefaultEvaluationViewExecutor.from_registries(
            registries.metrics,
            registries.projectors,
            _strict_loader,
        )

        cand = Artifact(
            id="d1:tess:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            uri=str(cand_path),
        )
        gt = Artifact(
            id="d1:gt:raw_text",
            document_id="d1",
            type=ArtifactType.RAW_TEXT,
            uri=str(gt_path),
        )
        view = build_text_view()
        result = executor.evaluate(view, cand, gt, pipeline_name="test")

        # Pas de projection → loader appelé sur le candidat directement.
        assert result.projection_report is None
        assert result.metric_values["cer"] == 0.0


class TestPayloadFromProjectorIsAuthoritative:
    """Garantit que le payload retourné par le projecteur est utilisé
    tel quel (l'executor ne re-réécrit pas, ne re-charge pas)."""

    def test_alto_projector_payload_drives_metric(
        self, tmp_path: Path,
    ) -> None:
        """Quand le projecteur retourne 'X', le métrique compute sur 'X'
        (pas sur autre chose)."""
        gt_text = "exact"
        alto_path = tmp_path / "alto.xml"
        alto_path.write_bytes(write_alto(_build_alto("exact")))

        gt_path = tmp_path / "gt.txt"
        gt_path.write_text(gt_text, encoding="utf-8")

        # Métrique custom qui retourne 1.0 si reference == hypothesis,
        # 0.0 sinon — preuve que la valeur passée à la métrique est
        # bien le payload du projecteur.
        from picarones.evaluation.projectors import ProjectorRegistry, AltoToText

        captured: dict[str, str] = {}

        def capturing_metric(reference: str, hypothesis: str) -> float:
            captured["reference"] = reference
            captured["hypothesis"] = hypothesis
            return 1.0 if reference == hypothesis else 0.0

        metrics = MetricRegistry()
        metrics.register(
            MetricSpec(
                name="capture",
                input_types=(ArtifactType.RAW_TEXT, ArtifactType.RAW_TEXT),
                higher_is_better=True,
            ),
            capturing_metric,
        )
        projectors = ProjectorRegistry()
        projectors.register(AltoToText())

        from picarones.domain.evaluation_spec import EvaluationView
        from picarones.domain.projection_spec import ProjectionSpec

        # On ne peut pas utiliser build_text_view car ses metric_names
        # incluent cer/wer/mer/wil non enregistrés ici — on construit
        # une vue minimale qui projette ALTO → texte.
        view = EvaluationView(
            name="test_capture",
            description="capture le payload projeté",
            candidate_types=frozenset({ArtifactType.ALTO_XML}),
            projections_by_source_type={
                ArtifactType.ALTO_XML: ProjectionSpec(
                    source_type=ArtifactType.ALTO_XML,
                    target_type=ArtifactType.RAW_TEXT,
                    projector_name="alto_to_text",
                ),
            },
            metric_names=("capture",),
        )

        executor = DefaultEvaluationViewExecutor.from_registries(
            metrics, projectors, _strict_loader,
        )
        cand = Artifact(
            id="d:alto",
            document_id="d",
            type=ArtifactType.ALTO_XML,
            uri=str(alto_path),
        )
        gt = Artifact(
            id="d:gt",
            document_id="d",
            type=ArtifactType.RAW_TEXT,
            uri=str(gt_path),
        )
        result = executor.evaluate(view, cand, gt, pipeline_name="test")
        assert captured["reference"] == "exact"
        assert captured["hypothesis"] == "exact"
        assert result.metric_values["capture"] == 1.0