Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / picarones /importers /htr_united.py

Claude

fix: résoudre les 64 erreurs ruff pré-existantes révélées par le lint actif

6362212 unverified 2 months ago

15.9 kB

	"""Import from the HTR-United catalogue.

	HTR-United is a community catalogue of HTR/OCR ground truth published on
	GitHub under open licences. Its metadata is stored in a YAML file
	(htr-united.yml) at https://github.com/HTR-United/htr-united.

	This module provides:
	- :class:`HTRUnitedEntry` — one catalogue record
	- :class:`HTRUnitedCatalogue` — loading and searching the catalogue
	- :meth:`HTRUnitedCatalogue.from_remote` — downloading the catalogue from GitHub
	- :func:`import_htr_united_corpus` — downloading and importing a corpus

	Example
	-------
	catalogue = HTRUnitedCatalogue.from_remote()
	results = catalogue.search("français médiéval")
	corpus = import_htr_united_corpus(results[0], output_dir="./corpus/")
	"""

	from __future__ import annotations

	import json
	import logging
	import re
	import urllib.error
	import urllib.request
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Optional

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# ---------------------------------------------------------------------------
	# Catalogue remote URL
	# ---------------------------------------------------------------------------

	# Raw YAML catalogue file, served from the `master` branch of the
	# HTR-United/htr-united repository. This is what `from_remote` downloads.
	_CATALOGUE_URL = (
	    "https://raw.githubusercontent.com/HTR-United/htr-united/master/htr-united.yml"
	)
	# GitHub contents-API address of the same file.
	# NOTE(review): not referenced by any code visible in this module — confirm
	# whether another module uses it before removing.
	_CATALOGUE_API_URL = (
	    "https://api.github.com/repos/HTR-United/htr-united/contents/htr-united.yml"
	)

	# Demonstration / offline fallback catalogue.
	# Each dict uses the keys read by `HTRUnitedEntry.from_dict`. The list is
	# loaded by `HTRUnitedCatalogue.from_demo()` and is also what
	# `_parse_yml_catalogue()` returns when the downloaded YAML cannot be used.
	_DEMO_CATALOGUE: list[dict] = [
	    {
	        "id": "lectaurep-repertoires",
	        "title": "Lectaurep — Répertoires de notaires parisiens",
	        "url": "https://github.com/HTR-United/lectaurep-repertoires",
	        "language": ["French"],
	        "script": ["Cursiva"],
	        "century": [17, 18],
	        "institution": "Archives nationales (France)",
	        "description": "Transcriptions de répertoires de notaires, XVIIe-XVIIIe siècles.",
	        "license": "CC-BY 4.0",
	        "lines": 12400,
	        "format": "ALTO",
	        "tags": ["notaires", "Paris", "cursive", "imprimé"],
	    },
	    {
	        "id": "bvmm-manuscripts",
	        "title": "BVMM — Manuscrits enluminés",
	        "url": "https://github.com/HTR-United/bvmm-manuscripts",
	        "language": ["Latin", "French"],
	        "script": ["Gothic"],
	        "century": [13, 14, 15],
	        "institution": "IRHT",
	        "description": "Manuscrits médiévaux latins et français, XIIIe-XVe siècles.",
	        "license": "CC-BY 4.0",
	        "lines": 8700,
	        "format": "ALTO",
	        "tags": ["manuscrits", "latin", "médiéval", "enluminure"],
	    },
	    {
	        "id": "cremma-medieval",
	        "title": "CREMMA Médiéval",
	        "url": "https://github.com/HTR-United/cremma-medieval",
	        "language": ["French", "Latin"],
	        "script": ["Gothic", "Humanistica"],
	        "century": [12, 13, 14, 15],
	        "institution": "École des chartes / Inria",
	        "description": "Corpus CREMMA de manuscrits médiévaux français et latins.",
	        "license": "CC-BY 4.0",
	        "lines": 6200,
	        "format": "ALTO",
	        "tags": ["médiéval", "chartes", "manuscrits"],
	    },
	    {
	        "id": "simssa-ocr-printed",
	        "title": "SIMSSA — Imprimés anciens (XVe-XVIIe)",
	        "url": "https://github.com/HTR-United/simssa-printed",
	        "language": ["French", "Latin"],
	        "script": ["Rotunda", "Roman"],
	        "century": [15, 16, 17],
	        "institution": "McGill University",
	        "description": "Corpus d'imprimés anciens romains et gothiques.",
	        "license": "CC-BY 4.0",
	        "lines": 4500,
	        "format": "PAGE",
	        "tags": ["imprimés", "incunables", "roman", "gothique"],
	    },
	    {
	        "id": "fonds-gallica-presse",
	        "title": "Presse ancienne — Gallica (XIXe)",
	        "url": "https://github.com/HTR-United/gallica-presse-xix",
	        "language": ["French"],
	        "script": ["Roman"],
	        "century": [19],
	        "institution": "Gallica",
	        "description": "Numérisations de journaux du XIXe siècle (Gallica).",
	        "license": "etalab-2.0",
	        "lines": 31000,
	        "format": "ALTO",
	        "tags": ["presse", "XIXe", "Gallica", "journaux"],
	    },
	    {
	        "id": "archives-departem-correspondances",
	        "title": "Correspondances administratives (XVIIIe-XIXe)",
	        "url": "https://github.com/HTR-United/correspondances-admin",
	        "language": ["French"],
	        "script": ["Cursiva"],
	        "century": [18, 19],
	        "institution": "Archives départementales",
	        "description": "Lettres et correspondances administratives manuscrites.",
	        "license": "CC-BY 4.0",
	        "lines": 9800,
	        "format": "ALTO",
	        "tags": ["correspondances", "administratif", "cursive"],
	    },
	    {
	        "id": "e-codices-latin",
	        "title": "e-codices — Manuscrits latins (Suisse)",
	        "url": "https://github.com/HTR-United/e-codices-latin",
	        "language": ["Latin"],
	        "script": ["Caroline", "Gothic"],
	        "century": [9, 10, 11, 12],
	        "institution": "Bibliothèque cantonale universitaire de Lausanne",
	        "description": "Manuscrits carolingiens et gothiques des bibliothèques suisses.",
	        "license": "CC-BY 4.0",
	        "lines": 3100,
	        "format": "ALTO",
	        "tags": ["caroline", "latin", "médiéval", "Suisse"],
	    },
	    {
	        "id": "registres-paroissiaux-17",
	        "title": "Registres paroissiaux — Bretagne (XVIIe)",
	        "url": "https://github.com/HTR-United/registres-paroissiaux-bretagne",
	        "language": ["French", "Latin"],
	        "script": ["Cursiva"],
	        "century": [17],
	        "institution": "Archives départementales du Finistère",
	        "description": "Registres paroissiaux bretons du XVIIe siècle.",
	        "license": "CC-BY 4.0",
	        "lines": 15600,
	        "format": "ALTO",
	        "tags": ["registres", "Bretagne", "paroissial", "cursive"],
	    },
	]


	# ---------------------------------------------------------------------------
	# Dataclass entrée catalogue
	# ---------------------------------------------------------------------------

	def _coerce_list(value: object) -> list:
	    """Return *value* as a fresh list.

	    ``None`` becomes ``[]``, a list or tuple is shallow-copied, and any other
	    value (e.g. a single string) is wrapped in a one-element list.
	    """
	    if value is None:
	        return []
	    if isinstance(value, (list, tuple)):
	        return list(value)
	    return [value]


	@dataclass
	class HTRUnitedEntry:
	    """One record of the HTR-United catalogue.

	    Attributes:
	        id: Identifier of the entry (looked up by ``HTRUnitedCatalogue.get_by_id``).
	        title: Human-readable title.
	        url: Repository URL of the corpus.
	        language: Language names, e.g. ``["French", "Latin"]``.
	        script: Script names, e.g. ``["Gothic"]``.
	        century: Century numbers, e.g. ``[17, 18]``.
	        institution: Name of the publishing institution.
	        description: Free-text description.
	        license: Licence label.
	        lines: Line count announced by the catalogue (presumably transcribed
	            lines — confirm against the catalogue schema).
	        format: Ground-truth format label, ``"ALTO"`` by default.
	        tags: Free-form keywords.
	    """

	    id: str
	    title: str
	    url: str
	    language: list[str] = field(default_factory=list)
	    script: list[str] = field(default_factory=list)
	    century: list[int] = field(default_factory=list)
	    institution: str = ""
	    description: str = ""
	    license: str = ""
	    lines: int = 0
	    format: str = "ALTO"
	    tags: list[str] = field(default_factory=list)

	    # Century number -> French ordinal in Roman numerals. Unannotated on
	    # purpose so that @dataclass does not turn it into a field.
	    _ROMAN_CENTURIES = {
	        1: "Ier", 2: "IIe", 3: "IIIe", 4: "IVe", 5: "Ve",
	        6: "VIe", 7: "VIIe", 8: "VIIIe", 9: "IXe", 10: "Xe",
	        11: "XIe", 12: "XIIe", 13: "XIIIe", 14: "XIVe", 15: "XVe",
	        16: "XVIe", 17: "XVIIe", 18: "XVIIIe", 19: "XIXe", 20: "XXe",
	    }

	    def as_dict(self) -> dict:
	        """Return the entry as a plain dict.

	        List fields are copied so that mutating the result cannot alter the
	        entry.
	        """
	        return {
	            "id": self.id,
	            "title": self.title,
	            "url": self.url,
	            "language": list(self.language),
	            "script": list(self.script),
	            "century": list(self.century),
	            "institution": self.institution,
	            "description": self.description,
	            "license": self.license,
	            "lines": self.lines,
	            "format": self.format,
	            "tags": list(self.tags),
	        }

	    @classmethod
	    def from_dict(cls, d: dict) -> "HTRUnitedEntry":
	        """Build an entry from a mapping using the keys produced by ``as_dict``.

	        Missing keys and keys explicitly set to ``None`` both fall back to the
	        field default. List fields are always copied (a scalar is wrapped in a
	        list), so the entry never shares mutable state with *d*.
	        """

	        def pick(key: str, default):
	            # Treat an explicit null exactly like a missing key.
	            value = d.get(key)
	            return default if value is None else value

	        return cls(
	            id=pick("id", ""),
	            title=pick("title", ""),
	            url=pick("url", ""),
	            language=_coerce_list(d.get("language")),
	            script=_coerce_list(d.get("script")),
	            century=_coerce_list(d.get("century")),
	            institution=pick("institution", ""),
	            description=pick("description", ""),
	            license=pick("license", ""),
	            lines=pick("lines", 0),
	            format=pick("format", "ALTO"),
	            tags=_coerce_list(d.get("tags")),
	        )

	    @property
	    def century_str(self) -> str:
	        """Centuries formatted as French Roman-numeral ordinals, comma-separated.

	        Values outside 1-20 are rendered as ``"<n>e"``.
	        """
	        return ", ".join(
	            self._ROMAN_CENTURIES.get(c, f"{c}e") for c in self.century
	        )


	# ---------------------------------------------------------------------------
	# Catalogue
	# ---------------------------------------------------------------------------

	class HTRUnitedCatalogue:
	    """In-memory HTR-United catalogue with search and filtering.

	    Attributes:
	        entries: Catalogue records, in the order they were supplied.
	        source: Origin label. The constructors in this class set ``"remote"``
	            or ``"demo"``.
	    """

	    def __init__(self, entries: list[HTRUnitedEntry], source: str = "demo") -> None:
	        """Wrap *entries*; the list is stored as given, not copied."""
	        self.entries = entries
	        self.source = source  # "remote" | "demo" | "cache"

	    def __len__(self) -> int:
	        """Return the number of entries."""
	        return len(self.entries)

	    @classmethod
	    def from_demo(cls) -> "HTRUnitedCatalogue":
	        """Load the built-in demonstration catalogue."""
	        entries = [HTRUnitedEntry.from_dict(d) for d in _DEMO_CATALOGUE]
	        return cls(entries, source="demo")

	    @classmethod
	    def from_remote(cls, timeout: int = 10) -> "HTRUnitedCatalogue":
	        """Download the catalogue from GitHub.

	        Args:
	            timeout: Network timeout in seconds, passed to ``urlopen``.

	        Returns:
	            A catalogue labelled ``"remote"``, or the demonstration catalogue
	            (labelled ``"demo"``) when the download or decoding fails. This
	            method never raises for such failures; it logs a warning instead.
	        """
	        try:
	            request = urllib.request.Request(
	                _CATALOGUE_URL,
	                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
	            )
	            with urllib.request.urlopen(request, timeout=timeout) as resp:
	                raw = resp.read().decode("utf-8")
	            # NOTE: _parse_yml_catalogue returns the demo entries itself when
	            # the YAML is unusable; in that case `source` still says "remote".
	            entries = _parse_yml_catalogue(raw)
	        except Exception as exc:
	            # Deliberately broad: this is a best-effort boundary and any
	            # failure (network, HTTP, decoding) must end in the demo fallback.
	            # URLError is already an Exception, so listing it was redundant.
	            logger.warning(
	                "[HTR-United] impossible de charger le catalogue distant (%s) : %s. "
	                "Utilisation des données de démonstration.",
	                _CATALOGUE_URL, exc,
	            )
	            return cls.from_demo()
	        return cls(entries, source="remote")

	    def search(
	        self,
	        query: str = "",
	        language: Optional[str] = None,
	        script: Optional[str] = None,
	        century_min: Optional[int] = None,
	        century_max: Optional[int] = None,
	    ) -> list[HTRUnitedEntry]:
	        """Search the catalogue; every supplied filter must match.

	        All text comparisons are case-insensitive substring tests.

	        Args:
	            query: Matched against title, description, institution, tags and
	                languages. Empty means no text filter.
	            language: Keeps entries with at least one matching language.
	            script: Keeps entries with at least one matching script.
	            century_min: Keeps entries with at least one century >= this value.
	            century_max: Keeps entries with at least one century <= this value.

	        Returns:
	            A new list (never the internal ``entries`` list), in catalogue order.
	        """
	        # Copy first so that a filter-less call does not hand out self.entries.
	        results = list(self.entries)

	        if query:
	            q = query.lower()
	            results = [
	                e for e in results
	                if (q in e.title.lower()
	                    or q in e.description.lower()
	                    or q in e.institution.lower()
	                    or any(q in t.lower() for t in e.tags)
	                    or any(q in lang.lower() for lang in e.language))
	            ]

	        if language:
	            wanted_lang = language.lower()
	            results = [
	                e for e in results
	                if any(wanted_lang in lg.lower() for lg in e.language)
	            ]

	        if script:
	            wanted_script = script.lower()
	            results = [
	                e for e in results
	                if any(wanted_script in s.lower() for s in e.script)
	            ]

	        # The two bounds are tested independently of each other.
	        if century_min is not None:
	            results = [
	                e for e in results
	                if any(c >= century_min for c in e.century)
	            ]

	        if century_max is not None:
	            results = [
	                e for e in results
	                if any(c <= century_max for c in e.century)
	            ]

	        return results

	    def get_by_id(self, entry_id: str) -> Optional[HTRUnitedEntry]:
	        """Return the first entry whose ``id`` equals *entry_id*, else ``None``."""
	        return next((e for e in self.entries if e.id == entry_id), None)

	    def available_languages(self) -> list[str]:
	        """Return the distinct languages of all entries, sorted."""
	        return sorted({lang for e in self.entries for lang in e.language})

	    def available_scripts(self) -> list[str]:
	        """Return the distinct scripts of all entries, sorted."""
	        return sorted({sc for e in self.entries for sc in e.script})


	# ---------------------------------------------------------------------------
	# Import de corpus
	# ---------------------------------------------------------------------------

	def import_htr_united_corpus(
	    entry: HTRUnitedEntry,
	    output_dir: str | Path,
	    max_samples: int = 100,
	    show_progress: bool = True,
	) -> dict:
	    """Import an HTR-United corpus into a local directory.

	    The directory is created if needed, a ``htr_united_meta.json`` file
	    describing the entry is written into it, and the corpus files are then
	    fetched through ``_try_download_corpus``.

	    Args:
	        entry: Catalogue entry to import.
	        output_dir: Destination directory.
	        max_samples: Forwarded to ``_try_download_corpus``.
	        show_progress: Forwarded to ``_try_download_corpus``.

	    Returns:
	        A dict with ``entry_id``, ``title``, ``output_dir``,
	        ``files_imported`` (the value returned by ``_try_download_corpus``)
	        and ``metadata_file``.
	    """
	    target_dir = Path(output_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)
	    meta_file = target_dir / "htr_united_meta.json"

	    # Persist the entry's metadata next to the corpus files.
	    metadata = dict(
	        source="htr-united",
	        entry_id=entry.id,
	        title=entry.title,
	        url=entry.url,
	        language=entry.language,
	        script=entry.script,
	        century=entry.century,
	        institution=entry.institution,
	        license=entry.license,
	        format=entry.format,
	        imported_at=_iso_now(),
	    )
	    serialized = json.dumps(metadata, ensure_ascii=False, indent=2)
	    meta_file.write_text(serialized, encoding="utf-8")

	    # Attempt the actual download from GitHub.
	    imported_count = _try_download_corpus(
	        entry, target_dir, max_samples, show_progress
	    )

	    summary = {
	        "entry_id": entry.id,
	        "title": entry.title,
	        "output_dir": str(target_dir),
	        "files_imported": imported_count,
	        "metadata_file": str(meta_file),
	    }
	    return summary


	# Branch names tried, in order, when fetching a repository archive.
	_ARCHIVE_BRANCHES = ("main", "master")

	# Member-name suffixes treated as ground truth. ".xml" already covers the
	# ".alto.xml" and ".page.xml" variants.
	_GT_SUFFIXES = (".gt.txt", ".xml")


	def _unique_flat_name(member: str, used: set) -> str:
	    """Return the basename of *member*, prefixed with a counter if already in *used*.

	    The chosen name is added to *used*.
	    """
	    base = Path(member).name
	    candidate = base
	    counter = 1
	    while candidate in used:
	        candidate = f"{counter}_{base}"
	        counter += 1
	    used.add(candidate)
	    return candidate


	def _try_download_corpus(
	    entry: HTRUnitedEntry,
	    output_path: Path,
	    max_samples: int,
	    show_progress: bool,
	) -> int:
	    """Try to download the corpus from GitHub and extract its ground-truth files.

	    The repository ZIP archive is requested for each branch of
	    ``_ARCHIVE_BRANCHES`` until one exists. Matching members are written flat
	    into *output_path*; members that share a basename get a numeric prefix
	    instead of overwriting one another.

	    Args:
	        entry: Catalogue entry whose ``url`` points at a GitHub repository.
	        output_path: Existing directory receiving the files.
	        max_samples: Maximum number of files to extract (``None`` = no limit,
	            negative values are treated as 0).
	        show_progress: When true, the outcome is logged at INFO level.

	    Returns:
	        The number of files actually written; 0 when nothing could be
	        downloaded. This function never raises for network or archive
	        errors — they are logged.
	    """
	    import io
	    import zipfile

	    repo_path = _extract_github_repo(entry.url or "")
	    if not repo_path:
	        return 0

	    archive = None
	    for branch in _ARCHIVE_BRANCHES:
	        zip_url = f"https://github.com/{repo_path}/archive/refs/heads/{branch}.zip"
	        try:
	            request = urllib.request.Request(
	                zip_url,
	                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
	            )
	            with urllib.request.urlopen(request, timeout=30) as resp:
	                archive = resp.read()
	            break
	        except urllib.error.HTTPError as exc:
	            if exc.code == 404:
	                # This branch does not exist: try the next candidate.
	                logger.debug(
	                    "[HTR-United] archive introuvable pour %s (branche %s)",
	                    repo_path, branch,
	                )
	                continue
	            logger.warning(
	                "[HTR-United] téléchargement du corpus impossible (%s) : %s",
	                zip_url, exc,
	            )
	            return 0
	        except Exception as exc:
	            # Any other failure (no network, timeout, ...) would repeat for the
	            # next branch, so stop here rather than wait for a second timeout.
	            logger.warning(
	                "[HTR-United] téléchargement du corpus impossible (%s) : %s",
	                zip_url, exc,
	            )
	            return 0

	    if archive is None:
	        logger.warning(
	            "[HTR-United] aucune archive trouvée pour %s (branches essayées : %s)",
	            repo_path, ", ".join(_ARCHIVE_BRANCHES),
	        )
	        return 0

	    limit = None if max_samples is None else max(max_samples, 0)
	    written = 0
	    try:
	        with zipfile.ZipFile(io.BytesIO(archive)) as zf:
	            members = [n for n in zf.namelist() if n.endswith(_GT_SUFFIXES)][:limit]
	            used_names: set = set()
	            for member in members:
	                # Only the basename is used, so archive paths cannot escape
	                # output_path.
	                dest = output_path / _unique_flat_name(member, used_names)
	                dest.write_bytes(zf.read(member))
	                written += 1
	    except Exception as exc:
	        # Best-effort boundary: report what was written before the failure.
	        logger.warning(
	            "[HTR-United] extraction interrompue pour %s : %s", repo_path, exc
	        )

	    if show_progress:
	        logger.info(
	            "[HTR-United] %d fichier(s) importé(s) depuis %s", written, repo_path
	        )
	    return written


	# Repository home URL: optional "www.", optional ".git" suffix, optional
	# trailing slash, optional query string / fragment. Deeper paths are rejected
	# on purpose: "github.com/orgs/<name>/..." must not be read as a repository.
	_GITHUB_REPO_RE = re.compile(
	    r"https?://(?:www\.)?github\.com/([^/\s?#]+/[^/\s?#]+?)(?:\.git)?/?(?:[?#]\S*)?",
	    re.IGNORECASE,
	)


	def _extract_github_repo(url: str) -> Optional[str]:
	    """Extract ``'owner/repo'`` from a GitHub repository URL.

	    Surrounding whitespace, the host's letter case, a ``www.`` prefix, a
	    ``.git`` suffix, a trailing slash and any query string or fragment are
	    ignored.

	    Returns:
	        ``'owner/repo'`` with its original letter case, or ``None`` when *url*
	        is not a string or is not a GitHub repository home URL.
	    """
	    if not isinstance(url, str):
	        return None
	    m = _GITHUB_REPO_RE.fullmatch(url.strip())
	    return m.group(1) if m else None


	def _parse_yml_catalogue(raw: str) -> list[HTRUnitedEntry]:
	    """Parse the HTR-United YAML catalogue, falling back to the demo data.

	    Args:
	        raw: YAML document text. A top-level list of mappings is expected;
	            list items that are not mappings are skipped.

	    Returns:
	        The parsed entries. When PyYAML is not installed, the document cannot
	        be parsed, or its top level is not a list, the demonstration entries
	        are returned instead and the reason is logged as a warning.
	    """
	    try:
	        import yaml  # PyYAML, imported lazily so the module loads without it

	        data = yaml.safe_load(raw)
	        if isinstance(data, list):
	            return [HTRUnitedEntry.from_dict(d) for d in data if isinstance(d, dict)]
	        logger.warning(
	            "[HTR-United] structure de catalogue inattendue (%s) ; "
	            "utilisation des données de démonstration.",
	            type(data).__name__,
	        )
	    except ImportError:
	        logger.warning(
	            "[HTR-United] PyYAML n'est pas installé ; "
	            "utilisation des données de démonstration."
	        )
	    except Exception as exc:
	        # Kept broad so the fallback stays best-effort. NOTE(review): PyYAML
	        # may raise errors other than yaml.YAMLError while constructing
	        # values — confirm before narrowing.
	        logger.warning(
	            "[HTR-United] catalogue YAML illisible : %s ; "
	            "utilisation des données de démonstration.",
	            exc,
	        )
	    return [HTRUnitedEntry.from_dict(d) for d in _DEMO_CATALOGUE]


	def _iso_now() -> str:
	    """Return the current UTC time as an ISO 8601 string with second precision."""
	    import datetime as _dt

	    current = _dt.datetime.now(tz=_dt.timezone.utc)
	    # Dropping the microseconds gives the same text as timespec="seconds".
	    return current.replace(microsecond=0).isoformat()