"""Tolerant PAGE XML parser — Sprint A14-S9.

The PRIMA namespace is detected automatically because several versions
co-exist in the wild (``2010-03-19``, ``2013-07-15``, ``2017-07-15``,
``2019-07-15``). Elements are therefore matched on their local name only,
never on a hard-coded namespace URI.

Parsing goes through ``defusedxml`` for XXE safety.
"""

from __future__ import annotations

import logging
import re
from typing import Any

import defusedxml.ElementTree as _SafeET

from picarones.domain.errors import PicaronesError
from picarones.formats.pagexml.types import (
    PageDocument,
    PagePage,
    PageTextLine,
    PageTextRegion,
)

logger = logging.getLogger(__name__)


class PageParseError(PicaronesError):
    """The PAGE XML input could not be parsed."""


# Captures the namespace URI of a Clark-notation tag (``{uri}local``).
_NS_RE = re.compile(r"^\{([^}]*)\}")
# Matches the ``{uri}`` part so it can be removed from a tag.
_LOCAL_NAME_RE = re.compile(r"\{[^}]*\}")


def _local(tag: str) -> str:
    """Return *tag* with its ``{namespace}`` part removed."""
    return _LOCAL_NAME_RE.sub("", tag)


def _detect_namespace(root_tag: str) -> str | None:
    """Return the namespace URI of *root_tag*, or ``None`` if it has none."""
    m = _NS_RE.match(root_tag)
    return m.group(1) if m else None


def _extract_unicode(elem: Any) -> str:
    """Return the stripped text of *elem*'s ``Unicode`` element, or ``""``.

    PAGE XML stores text as a ``TextEquiv`` element wrapping a ``Unicode``
    element. Several ``TextEquiv`` may coexist (OCR variants); the first one
    that carries a ``Unicode`` child wins.

    The ``TextEquiv`` that is a *direct* child of *elem* is preferred. In the
    PAGE schema the ``Word`` children of a ``TextLine`` come before the line's
    own ``TextEquiv``, so a plain document-order search would return the text
    of the first word instead of the text of the whole line.

    If *elem* has no direct ``TextEquiv`` with a ``Unicode`` child, the first
    ``Unicode`` found anywhere below *elem* is used, which keeps non-standard
    nestings readable.
    """
    for child in elem:
        if _local(child.tag) != "TextEquiv":
            continue
        for grandchild in child:
            if _local(grandchild.tag) == "Unicode":
                return (grandchild.text or "").strip()

    # Tolerant fallback: any Unicode element in document order.
    for node in elem.iter():
        if _local(node.tag) == "Unicode":
            return (node.text or "").strip()
    return ""


def _parse_coords(elem: Any) -> str | None:
    """Return the ``points`` attribute of the first direct ``Coords`` child.

    Returns ``None`` when there is no such child or it has no ``points``
    attribute.
    """
    for child in elem:
        if _local(child.tag) == "Coords":
            return child.attrib.get("points")
    return None


def _parse_baseline(elem: Any) -> str | None:
    """Return the ``points`` attribute of the first direct ``Baseline`` child.

    Returns ``None`` when there is no such child or it has no ``points``
    attribute.
    """
    for child in elem:
        if _local(child.tag) == "Baseline":
            return child.attrib.get("points")
    return None


def _parse_text_line(elem: Any) -> PageTextLine:
    """Build a ``PageTextLine`` from a ``TextLine`` element."""
    return PageTextLine(
        id=elem.attrib.get("id"),
        coords=_parse_coords(elem),
        baseline=_parse_baseline(elem),
        text=_extract_unicode(elem),
    )


def _parse_text_region(elem: Any) -> PageTextRegion:
    """Build a ``PageTextRegion`` from a ``TextRegion`` element.

    Only ``TextLine`` elements that are direct children of *elem* are
    collected, in document order.
    """
    lines = tuple(
        _parse_text_line(child)
        for child in elem
        if _local(child.tag) == "TextLine"
    )
    return PageTextRegion(
        id=elem.attrib.get("id"),
        coords=_parse_coords(elem),
        region_type=elem.attrib.get("type"),
        text_lines=lines,
    )


def _parse_int_attr(elem: Any, name: str) -> int | None:
    """Read attribute *name* of *elem* as an integer.

    The value goes through ``float`` first so that forms such as ``"2480.0"``
    are accepted; the fractional part is truncated.

    Returns ``None`` when the attribute is missing or is not a finite number.
    """
    raw = elem.attrib.get(name)
    if raw is None:
        return None
    try:
        return int(float(raw))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # OverflowError: infinity, e.g. "inf" or "1e999".
        return None


def _parse_page(elem: Any) -> PagePage:
    """Build a ``PagePage`` from a ``Page`` element.

    Only ``TextRegion`` elements that are direct children of *elem* are
    collected, in document order.
    """
    regions = tuple(
        _parse_text_region(child)
        for child in elem
        if _local(child.tag) == "TextRegion"
    )
    return PagePage(
        image_filename=elem.attrib.get("imageFilename"),
        image_width=_parse_int_attr(elem, "imageWidth"),
        image_height=_parse_int_attr(elem, "imageHeight"),
        text_regions=regions,
    )


def parse_pagexml(xml: bytes | str) -> PageDocument:
    """Parse a PAGE XML document and return the internal structure.

    Every element whose local name is ``Page`` is collected, at any depth and
    in document order. The namespace of the root element is reported as
    ``source_namespace``.

    Parameters
    ----------
    xml
        The document, either as raw bytes or as already-decoded text.

    Raises
    ------
    PageParseError
        Empty input, malformed XML, input rejected by the XXE defences, or
        missing root element.
    """
    if not xml.strip():
        raise PageParseError("PAGE XML vide.")

    try:
        # Text is handed to the parser as-is. Re-encoding it to UTF-8 bytes
        # would let an ``encoding="..."`` declaration in the prolog take
        # effect on bytes that are no longer in that encoding, corrupting
        # every non-ASCII character.
        root = _SafeET.fromstring(xml)
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: any parser or defusedxml failure is reported
        # to callers as one domain error.
        raise PageParseError(f"XML invalide ou XXE bloqué : {exc}") from exc

    if root is None:
        raise PageParseError("PAGE sans root element.")

    ns = _detect_namespace(root.tag)
    pages = tuple(
        _parse_page(elem)
        for elem in root.iter()
        if _local(elem.tag) == "Page"
    )
    return PageDocument(pages=pages, source_namespace=ns)


__all__ = ["parse_pagexml", "PageParseError"]