"""Sprint A14-S9 — PAGE XML parser, projector."""
from __future__ import annotations
import pytest
from picarones.domain import Artifact, ArtifactType
from picarones.domain.errors import ProjectionError
from picarones.evaluation.projectors import PageToText, page_document_to_text
from picarones.formats.pagexml import (
PageDocument,
PageParseError,
PagePage,
PageTextLine,
PageTextRegion,
parse_pagexml,
)
_SAMPLE_PAGE_XML = '''
Premier ligne
deuxième ligne
Titre
'''.encode("utf-8")
class TestParser:
def test_parse_simple_page(self) -> None:
doc = parse_pagexml(_SAMPLE_PAGE_XML)
assert len(doc.pages) == 1
page = doc.pages[0]
assert page.image_filename == "folio_001.png"
assert page.image_width == 1200
assert page.image_height == 1800
assert len(page.text_regions) == 2
def test_text_lines_extracted(self) -> None:
doc = parse_pagexml(_SAMPLE_PAGE_XML)
r1 = doc.pages[0].text_regions[0]
assert len(r1.text_lines) == 2
assert r1.text_lines[0].text == "Premier ligne"
assert r1.text_lines[0].coords is not None
assert r1.text_lines[0].baseline is not None
def test_region_type_preserved(self) -> None:
doc = parse_pagexml(_SAMPLE_PAGE_XML)
assert doc.pages[0].text_regions[0].region_type == "paragraph"
assert doc.pages[0].text_regions[1].region_type == "heading"
def test_namespace_detected(self) -> None:
doc = parse_pagexml(_SAMPLE_PAGE_XML)
assert doc.source_namespace is not None
assert "primaresearch" in doc.source_namespace
def test_empty_raises(self) -> None:
with pytest.raises(PageParseError, match="vide"):
parse_pagexml(b"")
def test_invalid_xml_raises(self) -> None:
with pytest.raises(PageParseError, match="invalide"):
parse_pagexml(b" None:
xml = b'''
]>
&xxe;'''
with pytest.raises(PageParseError):
parse_pagexml(xml)
class TestExtractText:
def test_full_extraction(self) -> None:
doc = parse_pagexml(_SAMPLE_PAGE_XML)
text = page_document_to_text(doc)
# 2 régions séparées par ligne vide, lignes par \n.
assert text == "Premier ligne\ndeuxième ligne\n\nTitre"
def test_empty_document(self) -> None:
doc = PageDocument()
assert page_document_to_text(doc) == ""
def test_region_without_lines_skipped(self) -> None:
doc = PageDocument(pages=(PagePage(
text_regions=(
PageTextRegion(id="empty"),
PageTextRegion(
id="full",
text_lines=(PageTextLine(text="hello"),),
),
),
),),)
assert page_document_to_text(doc) == "hello"
class TestProjector:
def test_protocol_satisfied(self) -> None:
from picarones.evaluation.projectors import Projector
assert isinstance(PageToText(), Projector)
def test_project_from_filesystem(self, tmp_path) -> None:
path = tmp_path / "doc.page.xml"
path.write_bytes(_SAMPLE_PAGE_XML)
artifact = Artifact(
id="d:page",
document_id="d",
type=ArtifactType.PAGE_XML,
uri=str(path),
)
target, payload, report = PageToText().project(artifact, {})
assert target.type == ArtifactType.RAW_TEXT
# Sprint S25 — le projecteur retourne le texte calculé.
assert isinstance(payload, str)
assert len(payload) > 0
assert "geometry" in report.ignored_dimensions
def test_wrong_type_rejected(self) -> None:
artifact = Artifact(
id="d:alto", document_id="d", type=ArtifactType.ALTO_XML,
)
with pytest.raises(ProjectionError, match="PAGE_XML"):
PageToText().project(artifact, {})