Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 16,900 Bytes

"""Import depuis le catalogue HTR-United.

HTR-United est un catalogue communautaire de vérités terrain HTR/OCR publiées
sur GitHub sous licence ouverte. Les métadonnées sont stockées dans un fichier
YAML (catalogue.yml) sur https://github.com/HTR-United/htr-united.

Ce module fournit :
- :class:`HTRUnitedCatalogue` — chargement et recherche dans le catalogue
- :func:`fetch_catalogue` — téléchargement du catalogue depuis GitHub
- :func:`import_htr_united_corpus` — téléchargement et import d'un corpus

Exemple
-------
    catalogue = HTRUnitedCatalogue.from_remote()
    results = catalogue.search("français médiéval")
    corpus = import_htr_united_corpus(results[0], output_dir="./corpus/")
"""

from __future__ import annotations

import json
import logging
import re
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Catalogue remote URL
# ---------------------------------------------------------------------------

# Raw YAML catalogue file on the `master` branch of the HTR-United repository;
# this is the URL fetched by `HTRUnitedCatalogue.from_remote`.
_CATALOGUE_URL = (
    "https://raw.githubusercontent.com/HTR-United/htr-united/master/htr-united.yml"
)
# GitHub "contents" API endpoint for the same file.
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether it is still needed.
_CATALOGUE_API_URL = (
    "https://api.github.com/repos/HTR-United/htr-united/contents/htr-united.yml"
)

# Built-in demonstration / offline fallback catalogue.
# Each record uses the same keys as the fields of `HTRUnitedEntry`. It is
# loaded by `HTRUnitedCatalogue.from_demo` and is also what callers get when
# the remote catalogue cannot be downloaded or parsed.
_DEMO_CATALOGUE: list[dict] = [
    {
        "id": "lectaurep-repertoires",
        "title": "Lectaurep — Répertoires de notaires parisiens",
        "url": "https://github.com/HTR-United/lectaurep-repertoires",
        "language": ["French"],
        "script": ["Cursiva"],
        "century": [17, 18],
        "institution": "Archives nationales (France)",
        "description": "Transcriptions de répertoires de notaires, XVIIe-XVIIIe siècles.",
        "license": "CC-BY 4.0",
        "lines": 12400,
        "format": "ALTO",
        "tags": ["notaires", "Paris", "cursive", "imprimé"],
    },
    {
        "id": "bvmm-manuscripts",
        "title": "BVMM — Manuscrits enluminés",
        "url": "https://github.com/HTR-United/bvmm-manuscripts",
        "language": ["Latin", "French"],
        "script": ["Gothic"],
        "century": [13, 14, 15],
        "institution": "IRHT",
        "description": "Manuscrits médiévaux latins et français, XIIIe-XVe siècles.",
        "license": "CC-BY 4.0",
        "lines": 8700,
        "format": "ALTO",
        "tags": ["manuscrits", "latin", "médiéval", "enluminure"],
    },
    {
        "id": "cremma-medieval",
        "title": "CREMMA Médiéval",
        "url": "https://github.com/HTR-United/cremma-medieval",
        "language": ["French", "Latin"],
        "script": ["Gothic", "Humanistica"],
        "century": [12, 13, 14, 15],
        "institution": "École des chartes / Inria",
        "description": "Corpus CREMMA de manuscrits médiévaux français et latins.",
        "license": "CC-BY 4.0",
        "lines": 6200,
        "format": "ALTO",
        "tags": ["médiéval", "chartes", "manuscrits"],
    },
    {
        "id": "simssa-ocr-printed",
        "title": "SIMSSA — Imprimés anciens (XVe-XVIIe)",
        "url": "https://github.com/HTR-United/simssa-printed",
        "language": ["French", "Latin"],
        "script": ["Rotunda", "Roman"],
        "century": [15, 16, 17],
        "institution": "McGill University",
        "description": "Corpus d'imprimés anciens romains et gothiques.",
        "license": "CC-BY 4.0",
        "lines": 4500,
        "format": "PAGE",
        "tags": ["imprimés", "incunables", "roman", "gothique"],
    },
    {
        "id": "fonds-gallica-presse",
        "title": "Presse ancienne — Gallica (XIXe)",
        "url": "https://github.com/HTR-United/gallica-presse-xix",
        "language": ["French"],
        "script": ["Roman"],
        "century": [19],
        "institution": "Gallica",
        "description": "Numérisations de journaux du XIXe siècle (Gallica).",
        "license": "etalab-2.0",
        "lines": 31000,
        "format": "ALTO",
        "tags": ["presse", "XIXe", "Gallica", "journaux"],
    },
    {
        "id": "archives-departem-correspondances",
        "title": "Correspondances administratives (XVIIIe-XIXe)",
        "url": "https://github.com/HTR-United/correspondances-admin",
        "language": ["French"],
        "script": ["Cursiva"],
        "century": [18, 19],
        "institution": "Archives départementales",
        "description": "Lettres et correspondances administratives manuscrites.",
        "license": "CC-BY 4.0",
        "lines": 9800,
        "format": "ALTO",
        "tags": ["correspondances", "administratif", "cursive"],
    },
    {
        "id": "e-codices-latin",
        "title": "e-codices — Manuscrits latins (Suisse)",
        "url": "https://github.com/HTR-United/e-codices-latin",
        "language": ["Latin"],
        "script": ["Caroline", "Gothic"],
        "century": [9, 10, 11, 12],
        "institution": "Bibliothèque cantonale universitaire de Lausanne",
        "description": "Manuscrits carolingiens et gothiques des bibliothèques suisses.",
        "license": "CC-BY 4.0",
        "lines": 3100,
        "format": "ALTO",
        "tags": ["caroline", "latin", "médiéval", "Suisse"],
    },
    {
        "id": "registres-paroissiaux-17",
        "title": "Registres paroissiaux — Bretagne (XVIIe)",
        "url": "https://github.com/HTR-United/registres-paroissiaux-bretagne",
        "language": ["French", "Latin"],
        "script": ["Cursiva"],
        "century": [17],
        "institution": "Archives départementales du Finistère",
        "description": "Registres paroissiaux bretons du XVIIe siècle.",
        "license": "CC-BY 4.0",
        "lines": 15600,
        "format": "ALTO",
        "tags": ["registres", "Bretagne", "paroissial", "cursive"],
    },
]


# ---------------------------------------------------------------------------
# Catalogue entry dataclass
# ---------------------------------------------------------------------------

@dataclass
class HTRUnitedEntry:
    """A single dataset record of the HTR-United catalogue.

    The fields mirror the keys of a catalogue record. ``language``,
    ``script``, ``century`` and ``tags`` are always lists so that the
    catalogue search can iterate over them safely.
    """

    id: str
    title: str
    url: str
    language: list[str] = field(default_factory=list)
    script: list[str] = field(default_factory=list)
    century: list[int] = field(default_factory=list)
    institution: str = ""
    description: str = ""
    license: str = ""
    lines: int = 0
    format: str = "ALTO"
    tags: list[str] = field(default_factory=list)

    def as_dict(self) -> dict:
        """Return the entry as a plain dict keyed by field name."""
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "language": self.language,
            "script": self.script,
            "century": self.century,
            "institution": self.institution,
            "description": self.description,
            "license": self.license,
            "lines": self.lines,
            "format": self.format,
            "tags": self.tags,
        }

    @staticmethod
    def _as_list(value: object) -> list:
        """Normalise a raw record value into a fresh list.

        ``None`` becomes ``[]``, a list/tuple/set is copied, and any other
        scalar (e.g. ``language: French`` in YAML) is wrapped in a
        one-element list.
        """
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [value]

    @classmethod
    def from_dict(cls, d: dict) -> "HTRUnitedEntry":
        """Build an entry from a catalogue record.

        Missing keys and explicit ``None`` values fall back to the field
        defaults. Text fields are coerced to ``str`` and list fields are
        copied, so the resulting entry never shares mutable state with ``d``
        (important because ``d`` may be a record of the module-level demo
        catalogue).
        """

        def text(key: str, default: str = "") -> str:
            value = d.get(key)
            return default if value is None else str(value)

        def text_list(key: str) -> list[str]:
            return [str(item) for item in cls._as_list(d.get(key))]

        lines = d.get("lines")
        return cls(
            id=text("id"),
            title=text("title"),
            url=text("url"),
            language=text_list("language"),
            script=text_list("script"),
            century=cls._as_list(d.get("century")),
            institution=text("institution"),
            description=text("description"),
            license=text("license"),
            lines=0 if lines is None else lines,
            format=text("format", "ALTO"),
            tags=text_list("tags"),
        )

    @property
    def century_str(self) -> str:
        """Centuries formatted as French ordinal Roman numerals, comma-separated.

        Values outside 1-20 are rendered as ``"<number>e"``.
        """
        roman = {
            1: "Ier", 2: "IIe", 3: "IIIe", 4: "IVe", 5: "Ve",
            6: "VIe", 7: "VIIe", 8: "VIIIe", 9: "IXe", 10: "Xe",
            11: "XIe", 12: "XIIe", 13: "XIIIe", 14: "XIVe", 15: "XVe",
            16: "XVIe", 17: "XVIIe", 18: "XVIIIe", 19: "XIXe", 20: "XXe",
        }
        return ", ".join(roman.get(c, f"{c}e") for c in self.century)


# ---------------------------------------------------------------------------
# Catalogue
# ---------------------------------------------------------------------------

class HTRUnitedCatalogue:
    """In-memory HTR-United catalogue with search and filtering helpers."""

    def __init__(self, entries: list[HTRUnitedEntry], source: str = "demo") -> None:
        """Wrap ``entries``; ``source`` records where they came from."""
        self.entries = entries
        self.source = source  # "remote" | "demo" | "cache"

    def __len__(self) -> int:
        """Return the number of entries in the catalogue."""
        return len(self.entries)

    @classmethod
    def from_demo(cls) -> "HTRUnitedCatalogue":
        """Load the built-in demonstration catalogue."""
        entries = [HTRUnitedEntry.from_dict(d) for d in _DEMO_CATALOGUE]
        return cls(entries, source="demo")

    @classmethod
    def from_remote(cls, timeout: int = 10) -> "HTRUnitedCatalogue":
        """Download the catalogue from GitHub.

        Args:
            timeout: Network timeout in seconds for the HTTP request.

        Returns:
            A catalogue with ``source="remote"`` on success. On any failure
            (network, decoding, ...) a warning is logged and the
            demonstration catalogue is returned instead.
        """
        try:
            req = urllib.request.Request(
                _CATALOGUE_URL,
                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
            )
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                raw = resp.read().decode("utf-8")
            # NOTE: `_parse_yml_catalogue` substitutes the demo entries itself
            # when the payload cannot be parsed; that case is still labelled
            # "remote" here.
            entries = _parse_yml_catalogue(raw)
            return cls(entries, source="remote")
        except Exception as exc:  # noqa: BLE001 — deliberate best-effort fallback
            # URLError is a subclass of Exception, so one clause covers both.
            logger.warning(
                "[HTR-United] impossible de charger le catalogue distant (%s) : %s. "
                "Utilisation des données de démonstration.",
                _CATALOGUE_URL, exc,
            )
            return cls.from_demo()

    def search(
        self,
        query: str = "",
        language: Optional[str] = None,
        script: Optional[str] = None,
        century_min: Optional[int] = None,
        century_max: Optional[int] = None,
    ) -> list[HTRUnitedEntry]:
        """Search the catalogue; all given filters must match.

        Args:
            query: Case-insensitive substring looked up in the title,
                description, institution, tags and languages.
            language: Case-insensitive substring of one of the entry languages.
            script: Case-insensitive substring of one of the entry scripts.
            century_min: Keep entries having at least one century >= this value.
            century_max: Keep entries having at least one century <= this value.

        Returns:
            A new list of matching entries, in catalogue order.
        """
        # Work on a copy so that callers mutating the result never alter
        # the catalogue's own entry list (also when no filter is given).
        results = list(self.entries)

        if query:
            q = query.lower()
            results = [
                e for e in results
                if (q in e.title.lower()
                    or q in e.description.lower()
                    or q in e.institution.lower()
                    or any(q in t.lower() for t in e.tags)
                    or any(q in lang.lower() for lang in e.language))
            ]

        if language:
            lang_lower = language.lower()
            results = [
                e for e in results
                if any(lang_lower in lg.lower() for lg in e.language)
            ]

        if script:
            sc_lower = script.lower()
            results = [
                e for e in results
                if any(sc_lower in s.lower() for s in e.script)
            ]

        if century_min is not None:
            results = [
                e for e in results
                if any(c >= century_min for c in e.century)
            ]

        if century_max is not None:
            results = [
                e for e in results
                if any(c <= century_max for c in e.century)
            ]

        return results

    def get_by_id(self, entry_id: str) -> Optional[HTRUnitedEntry]:
        """Return the first entry whose ``id`` equals ``entry_id``, else ``None``."""
        return next((e for e in self.entries if e.id == entry_id), None)

    def available_languages(self) -> list[str]:
        """Return the sorted list of distinct languages across all entries."""
        return sorted({lang for e in self.entries for lang in e.language})

    def available_scripts(self) -> list[str]:
        """Return the sorted list of distinct scripts across all entries."""
        return sorted({sc for e in self.entries for sc in e.script})


# ---------------------------------------------------------------------------
# Corpus import
# ---------------------------------------------------------------------------

def import_htr_united_corpus(
    entry: HTRUnitedEntry,
    output_dir: str | Path,
    max_samples: int = 100,
    show_progress: bool = True,
) -> dict:
    """Import an HTR-United corpus into a local directory.

    The output directory is created if needed, a ``htr_united_meta.json``
    file describing the entry is written into it, and a download of the
    ground-truth files from the entry's GitHub repository is then attempted.

    Args:
        entry: Catalogue entry to import.
        output_dir: Destination directory.
        max_samples: Upper bound on the number of files to extract; forwarded
            to the download helper.
        show_progress: Forwarded to the download helper.

    Returns:
        A dict with the entry id and title, the output directory, the number
        of files imported (0 when the download could not be performed) and
        the path of the metadata file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    meta_file = target_dir / "htr_united_meta.json"

    # Persist the provenance of the import next to the data.
    metadata = {
        "source": "htr-united",
        "entry_id": entry.id,
        "title": entry.title,
        "url": entry.url,
        "language": entry.language,
        "script": entry.script,
        "century": entry.century,
        "institution": entry.institution,
        "license": entry.license,
        "format": entry.format,
        "imported_at": _iso_now(),
    }
    serialized = json.dumps(metadata, ensure_ascii=False, indent=2)
    meta_file.write_text(serialized, encoding="utf-8")

    # Best-effort download of the repository archive from GitHub.
    imported_count = _try_download_corpus(
        entry, target_dir, max_samples, show_progress
    )

    summary = {
        "entry_id": entry.id,
        "title": entry.title,
        "output_dir": str(target_dir),
        "files_imported": imported_count,
        "metadata_file": str(meta_file),
    }
    return summary


def _try_download_corpus(
    entry: HTRUnitedEntry,
    output_path: Path,
    max_samples: int,
    show_progress: bool,
) -> int:
    """Try to download the corpus from GitHub and extract its ground truth.

    The repository archive is fetched as a ZIP, trying the ``main`` branch
    first and ``master`` second. At most ``max_samples`` files ending in
    ``.xml`` or ``.gt.txt`` are written flat into ``output_path``.

    Args:
        entry: Catalogue entry whose ``url`` points at a GitHub repository.
        output_path: Existing directory receiving the extracted files.
        max_samples: Maximum number of files to extract.
        show_progress: Currently unused by this function.

    Returns:
        The number of files written, or 0 when the URL is not a GitHub
        repository URL or when anything fails (the failure is recorded
        through ``record_fallback``).
    """
    import io
    import zipfile

    # Derive "owner/repo" from the entry URL to build the archive URL.
    repo_path = _extract_github_repo(entry.url)
    if not repo_path:
        return 0

    # Repositories do not all share the same default branch name.
    branches = ("main", "master")
    try:
        for branch in branches:
            zip_url = f"https://github.com/{repo_path}/archive/refs/heads/{branch}.zip"
            req = urllib.request.Request(
                zip_url,
                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
            )
            try:
                with urllib.request.urlopen(req, timeout=30) as resp:
                    data = resp.read()
            except urllib.error.HTTPError as exc:
                # A missing branch is only fatal once every candidate failed.
                if exc.code == 404 and branch != branches[-1]:
                    continue
                raise
            break

        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            # ".xml" also covers ".alto.xml" / ".page.xml"; directory members
            # end with "/" and therefore never match these suffixes.
            gt_files = [
                n for n in zf.namelist()
                if n.endswith((".xml", ".gt.txt"))
            ][:max_samples]

            used_names: set[str] = set()
            for fname in gt_files:
                # Files are written flat. When two archive members share a
                # basename, fall back to the flattened archive path so that
                # neither overwrites the other.
                dest_name = Path(fname).name
                if dest_name in used_names:
                    flat = fname.replace("/", "__")
                    dest_name = flat
                    counter = 1
                    while dest_name in used_names:
                        dest_name = f"{counter}__{flat}"
                        counter += 1
                used_names.add(dest_name)
                (output_path / dest_name).write_bytes(zf.read(fname))
            return len(gt_files)
    except Exception as exc:  # noqa: BLE001 — wide surface (network, ZIP, FS)
        # Sprint A3 (B-3): record the incident instead of hiding it; the
        # caller still receives 0 so the numeric return contract holds.
        from picarones.extras.importers._fallback_log import record_fallback
        record_fallback(
            importer="htr_united",
            operation="download_zip_samples",
            error=exc,
            extra={"output_path": str(output_path)},
        )
        return 0


def _extract_github_repo(url: str) -> Optional[str]:
    """Extrait 'owner/repo' depuis une URL GitHub."""
    m = re.match(r"https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$", url)
    return m.group(1) if m else None


def _parse_yml_catalogue(raw: str) -> list[HTRUnitedEntry]:
    """Parse the HTR-United YAML catalogue into entries.

    The document is expected to be a top-level list of mappings; items that
    are not mappings are skipped.

    Args:
        raw: YAML text of the catalogue.

    Returns:
        The parsed entries. When PyYAML is unavailable, the YAML is
        malformed, or the top-level value is not a list, the incident is
        recorded through ``record_fallback`` and the demonstration entries
        are returned instead.
    """
    try:
        import yaml
        data = yaml.safe_load(raw)
        if not isinstance(data, list):
            # Route an unexpected document shape through the same recorded
            # fallback as a parse error instead of falling back silently.
            raise TypeError(
                f"expected a top-level YAML list, got {type(data).__name__}"
            )
        return [HTRUnitedEntry.from_dict(d) for d in data if isinstance(d, dict)]
    except Exception as exc:  # noqa: BLE001 — yaml + parsing user-supplied
        # Sprint A3 (B-3): an unusable catalogue switches to demo mode; log
        # it and emit a Fact so the report summary mentions the incident.
        from picarones.extras.importers._fallback_log import record_fallback
        record_fallback(
            importer="htr_united",
            operation="yaml_catalogue_parse",
            error=exc,
        )
    return [HTRUnitedEntry.from_dict(d) for d in _DEMO_CATALOGUE]


def _iso_now() -> str:
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).isoformat(timespec="seconds")