# Spaces:
#
# Ma-Ri-Ba-Ku
# /
#
# Picarones
#
# Running
#
# File size: 14,180 Bytes

"""Sprint A14-S15 — AltoView (vue canonique 2).

6 cas couvrant la fidélité documentaire ALTO + le pattern
d'omission explicite des pipelines qui ne produisent pas d'ALTO.
"""

from __future__ import annotations

import pytest

from picarones.domain import (
    Artifact,
    ArtifactType,
    MetricSpec,
)
from picarones.evaluation.metrics.alto_structural import (
    compute_alto_validity,
    compute_line_count_ratio,
    compute_word_box_coverage,
)
from picarones.evaluation.projectors import ProjectorRegistry
from picarones.evaluation.registry import MetricRegistry
from picarones.evaluation.views import (
    DEFAULT_ALTO_METRICS,
    DefaultEvaluationViewExecutor,
    build_alto_view,
    build_text_view,
)
from picarones.formats.alto.types import (
    AltoBBox,
    AltoDocument,
    AltoLine,
    AltoPage,
    AltoString,
    AltoTextBlock,
)


# ──────────────────────────────────────────────────────────────────────
# Fixtures ALTO
# ──────────────────────────────────────────────────────────────────────


def _line(*words: str, with_bbox: bool = True) -> AltoLine:
    """Build an ``AltoLine`` holding one ``AltoString`` per word.

    Args:
        *words: Content of each string, in the order given.
        with_bbox: When true, every string gets its own 10x10 box at
            position (0, 0); when false, every string has ``bbox=None``.

    Returns:
        An ``AltoLine`` whose ``strings`` is a tuple of the built strings.
    """
    entries = []
    for word in words:
        if with_bbox:
            # A fresh box per word, all with the same dummy geometry.
            box = AltoBBox(hpos=0, vpos=0, width=10, height=10)
        else:
            box = None
        entries.append(AltoString(content=word, bbox=box))
    return AltoLine(strings=tuple(entries))


def _doc(*lines: AltoLine, n_blocks: int = 1) -> AltoDocument:
    """Build a single-page ``AltoDocument`` made of ``n_blocks`` text blocks.

    The lines are not split between blocks: every block receives the
    same full ``lines`` tuple, which keeps the fixture simple.

    Args:
        *lines: Lines placed in each text block.
        n_blocks: Number of text blocks on the page.

    Returns:
        An ``AltoDocument`` with one ``AltoPage`` containing the blocks.
    """
    # Any value comparing equal to 1 yields exactly one block; other
    # values repeat the same line tuple that many times.
    copies = 1 if n_blocks == 1 else n_blocks
    blocks = tuple(AltoTextBlock(lines=chunk) for chunk in (lines,) * copies)
    return AltoDocument(pages=(AltoPage(blocks=blocks),))


def _empty_doc() -> AltoDocument:
    """Return an ``AltoDocument`` constructed with no arguments."""
    document = AltoDocument()
    return document


# ──────────────────────────────────────────────────────────────────────
# Métriques individuelles
# ──────────────────────────────────────────────────────────────────────


class TestAltoMetrics:
    """Direct checks of the three ALTO structural metric functions.

    Each test calls a ``compute_*`` function with two in-memory
    ``AltoDocument`` fixtures and pins the value it returns.
    """

    def test_validity_full_doc(self) -> None:
        """A populated document compared with itself scores 1.0."""
        document = _doc(_line("a", "b"))
        score = compute_alto_validity(document, document)
        assert score == 1.0

    def test_validity_empty_doc(self) -> None:
        """An empty document passed as second argument scores 0.0."""
        populated = _doc(_line("a"))
        score = compute_alto_validity(populated, _empty_doc())
        assert score == 0.0

    def test_line_count_ratio_equal(self) -> None:
        """Two documents with three lines each give a ratio of 1.0."""
        first = _doc(_line("a"), _line("b"), _line("c"))
        second = _doc(_line("x"), _line("y"), _line("z"))
        ratio = compute_line_count_ratio(first, second)
        assert ratio == 1.0

    def test_line_count_ratio_partial(self) -> None:
        """Four lines against two lines gives a ratio of 0.5."""
        four_lines = _doc(_line("a"), _line("b"), _line("c"), _line("d"))
        two_lines = _doc(_line("x"), _line("y"))
        ratio = compute_line_count_ratio(four_lines, two_lines)
        assert ratio == 0.5

    def test_line_count_ratio_both_empty(self) -> None:
        """Two empty documents give a ratio of 1.0."""
        ratio = compute_line_count_ratio(_empty_doc(), _empty_doc())
        assert ratio == 1.0

    def test_word_box_coverage_full(self) -> None:
        """Coverage is 1.0 when every word carries a bounding box."""
        document = _doc(_line("a", "b", "c", with_bbox=True))
        coverage = compute_word_box_coverage(document, document)
        assert coverage == 1.0

    def test_word_box_coverage_partial(self) -> None:
        """Two boxed words out of three give a coverage of 2/3."""
        boxed = tuple(
            AltoString(
                content=letter,
                bbox=AltoBBox(hpos=0, vpos=0, width=1, height=1),
            )
            for letter in ("a", "b")
        )
        unboxed = AltoString(content="c", bbox=None)
        mixed_line = AltoLine(strings=boxed + (unboxed,))
        block = AltoTextBlock(lines=(mixed_line,))
        document = AltoDocument(pages=(AltoPage(blocks=(block,)),))
        coverage = compute_word_box_coverage(document, document)
        # Tolerance-based comparison because 2/3 is not exact in binary.
        assert abs(coverage - 2 / 3) < 1e-9

    def test_word_box_coverage_no_bbox(self) -> None:
        """Coverage is 0.0 when no word carries a bounding box."""
        document = _doc(_line("a", "b", with_bbox=False))
        coverage = compute_word_box_coverage(document, document)
        assert coverage == 0.0


# ──────────────────────────────────────────────────────────────────────
# AltoView shape
# ──────────────────────────────────────────────────────────────────────


class TestAltoViewShape:
    """Checks on the static configuration returned by ``build_alto_view``."""

    def test_default_view_accepts_only_alto_xml(self) -> None:
        """Case 1 — the view accepts ALTO_XML and rejects the other
        artifact types checked here."""
        alto_view = build_alto_view()
        assert alto_view.accepts(ArtifactType.ALTO_XML)
        rejected_types = (
            ArtifactType.RAW_TEXT,
            ArtifactType.PAGE_XML,
            ArtifactType.CANONICAL_DOCUMENT,
            ArtifactType.IMAGE,
        )
        for artifact_type in rejected_types:
            assert not alto_view.accepts(artifact_type)

    def test_default_metrics(self) -> None:
        """The view's metric names equal ``DEFAULT_ALTO_METRICS`` and
        include the three ALTO structural metrics."""
        alto_view = build_alto_view()
        assert alto_view.metric_names == DEFAULT_ALTO_METRICS
        expected_names = (
            "alto_validity",
            "alto_line_count_ratio",
            "alto_word_box_coverage",
        )
        for metric_name in expected_names:
            assert metric_name in alto_view.metric_names

    def test_no_projection(self) -> None:
        """The view declares no projection, globally or for ALTO_XML."""
        alto_view = build_alto_view()
        assert alto_view.projection is None
        # No projection either when asked for a specific source type.
        per_type = alto_view.projection_for(ArtifactType.ALTO_XML)
        assert per_type is None

    def test_warnings_signal_omission_pattern(self) -> None:
        """The view's warnings mention omission ("OMIS" or "omis")."""
        alto_view = build_alto_view()
        joined = " ".join(alto_view.warnings)
        assert any(marker in joined for marker in ("OMIS", "omis"))


# ──────────────────────────────────────────────────────────────────────
# AltoView avec executor
# ──────────────────────────────────────────────────────────────────────


def _build_alto_executor(payloads: dict[str, AltoDocument]) -> DefaultEvaluationViewExecutor:
    """Build an executor wired with the three ALTO metrics and an
    in-memory payload loader.

    Args:
        payloads: Maps an artifact id to the object the loader returns
            for that artifact.

    Returns:
        The executor produced by
        ``DefaultEvaluationViewExecutor.from_registries`` from the metric
        registry, an empty projector registry and the loader.
    """
    alto_pair = (ArtifactType.ALTO_XML, ArtifactType.ALTO_XML)
    metric_table = (
        ("alto_validity", compute_alto_validity),
        ("alto_line_count_ratio", compute_line_count_ratio),
        ("alto_word_box_coverage", compute_word_box_coverage),
    )

    registry = MetricRegistry()
    for metric_name, compute in metric_table:
        spec = MetricSpec(
            name=metric_name,
            input_types=alto_pair,
            higher_is_better=True,
        )
        registry.register(spec, compute)

    # Left empty: the ALTO view needs no projector.
    empty_projectors = ProjectorRegistry()

    def loader(art: Artifact) -> AltoDocument:
        """Return the payload stored under ``art.id``.

        Raises:
            KeyError: If ``payloads`` has no entry for ``art.id``.
        """
        if art.id in payloads:
            return payloads[art.id]
        raise KeyError(f"missing payload {art.id}")

    return DefaultEvaluationViewExecutor.from_registries(
        registry, empty_projectors, loader
    )


class TestAltoViewWithExecutor:
    """Runs the ALTO view through the executor on in-memory documents."""

    @staticmethod
    def _alto_artifact(artifact_id: str) -> Artifact:
        """Return an ALTO_XML artifact with the given id on document "d"."""
        return Artifact(
            id=artifact_id,
            document_id="d",
            type=ArtifactType.ALTO_XML,
        )

    def test_perfect_alto_yields_all_ones(self) -> None:
        """Case 2 — a hypothesis identical to the ground truth gives
        1.0 for every metric and no failed metric."""
        reference = _doc(_line("a", "b"), _line("c", "d"))
        executor = _build_alto_executor({"gt": reference, "cand": reference})
        alto_view = build_alto_view()
        gt_artifact = self._alto_artifact("gt")
        cand_artifact = self._alto_artifact("cand")

        outcome = executor.evaluate(
            alto_view, cand_artifact, gt_artifact, pipeline_name="test"
        )

        expected_values = {
            "alto_validity": 1.0,
            "alto_line_count_ratio": 1.0,
            "alto_word_box_coverage": 1.0,
        }
        for metric_name, expected in expected_values.items():
            assert outcome.metric_values[metric_name] == expected
        assert outcome.failed_metrics == {}

    def test_partial_quality_alto(self) -> None:
        """Case 3 — a hypothesis with fewer lines lowers the line-count
        ratio while the other two metrics stay at 1.0."""
        reference = _doc(_line("a"), _line("b"), _line("c"), _line("d"))  # 4 lines
        hypothesis = _doc(_line("x"), _line("y"))  # 2 lines
        executor = _build_alto_executor({"gt": reference, "cand": hypothesis})
        alto_view = build_alto_view()
        gt_artifact = self._alto_artifact("gt")
        cand_artifact = self._alto_artifact("cand")

        outcome = executor.evaluate(
            alto_view, cand_artifact, gt_artifact, pipeline_name="test"
        )

        expected_values = {
            "alto_validity": 1.0,
            "alto_line_count_ratio": 0.5,
            "alto_word_box_coverage": 1.0,
        }
        for metric_name, expected in expected_values.items():
            assert outcome.metric_values[metric_name] == expected


# ──────────────────────────────────────────────────────────────────────
# Pattern d'omission : pipelines sans ALTO ne sont PAS dans AltoView
# ──────────────────────────────────────────────────────────────────────


class TestOmissionPattern:
    """The caller (application service) must OMIT pipelines that do not
    produce ALTO_XML, rather than giving them a fake score of 0.

    These tests demonstrate the recommended pattern.
    """

    def test_caller_filters_pipelines_by_view_acceptance(self) -> None:
        """Case 4 — pattern: go through the candidates and keep only
        those whose artifact type the view accepts."""
        alto_view = build_alto_view()

        # Three simulated pipelines with their main output type.
        candidates = [
            ("tesseract_text", ArtifactType.RAW_TEXT),          # no ALTO
            ("ocr_llm_alto", ArtifactType.ALTO_XML),            # ALTO
            ("vlm_alto_reconstructed", ArtifactType.ALTO_XML),  # ALTO
        ]

        # What the caller keeps.
        eligible = []
        for entry in candidates:
            if alto_view.accepts(entry[1]):
                eligible.append(entry)

        # What the caller leaves out.
        omitted = []
        for entry in candidates:
            if not alto_view.accepts(entry[1]):
                omitted.append(entry)

        assert len(eligible) == 2
        for kept_name in ("ocr_llm_alto", "vlm_alto_reconstructed"):
            assert (kept_name, ArtifactType.ALTO_XML) in eligible

        assert len(omitted) == 1
        omitted_name = omitted[0][0]
        assert omitted_name == "tesseract_text"

    def test_executor_raises_value_error_if_caller_doesnt_filter(self) -> None:
        """Case 5 — safeguard: when the caller did not filter and hands
        a RAW_TEXT candidate to the ALTO view, ``executor.evaluate``
        raises an explicit ``ValueError``."""
        executor = _build_alto_executor(
            {"cand": "this is text", "gt": _doc(_line("a"))}
        )
        alto_view = build_alto_view()
        text_candidate = Artifact(
            id="cand", document_id="d", type=ArtifactType.RAW_TEXT
        )
        gt_artifact = Artifact(
            id="gt", document_id="d", type=ArtifactType.ALTO_XML
        )
        with pytest.raises(ValueError, match="n'accepte pas"):
            executor.evaluate(
                alto_view, text_candidate, gt_artifact, pipeline_name="test"
            )


# ──────────────────────────────────────────────────────────────────────
# Cas central BnF : TextView + AltoView complémentaires
# ──────────────────────────────────────────────────────────────────────


class TestBnFDualViewUsage:
    """Shows that the target BnF report can present TextView AND
    AltoView side by side, each covering its own set of pipelines.

    Pipeline 1: Tesseract raw text → in TextView, OMITTED from AltoView.
    Pipeline 2: OCR+LLM with ALTO → in BOTH.
    Pipeline 3: VLM with reconstructed ALTO → in BOTH.

    The test does NOT run a full evaluation.  It checks the
    **pattern**: for each view, which pipelines are eligible.
    """

    def test_two_views_select_different_pipeline_sets(self) -> None:
        """Case 6 — definition of done for S15:
          * Tesseract → omitted from AltoView, present in TextView
          * OCR+LLM+ALTO → in both
          * VLM+ALTO → in both
        """
        text_view = build_text_view()
        alto_view = build_alto_view()

        pipelines = [
            ("tesseract", ArtifactType.RAW_TEXT),
            ("ocr_llm_alto", ArtifactType.ALTO_XML),
            ("vlm_alto", ArtifactType.ALTO_XML),
        ]

        def eligible_names(view):
            """Collect the names of pipelines whose type ``view`` accepts."""
            names = set()
            for pipeline_name, output_type in pipelines:
                if view.accepts(output_type):
                    names.add(pipeline_name)
            return names

        text_eligible = eligible_names(text_view)
        alto_eligible = eligible_names(alto_view)

        # TextView accepts all three.
        assert text_eligible == {"tesseract", "ocr_llm_alto", "vlm_alto"}

        # AltoView leaves Tesseract out and keeps the two ALTO pipelines.
        assert alto_eligible == {"ocr_llm_alto", "vlm_alto"}
        assert "tesseract" not in alto_eligible

        # Pipelines present in AltoView are a SUBSET of those present in
        # TextView (consistency: a pipeline that produces ALTO also has
        # extractable text).
        assert alto_eligible.issubset(text_eligible)