# Picarones / picarones/adapters/corpus/huggingface.py
# Last commit c813aa1 (unverified), authored by Claude:
#   feat(migration): Lots H + I + J — statistics, htr_united/huggingface, MetricsResult
# File size: 17.4 kB
"""Import OCR/HTR datasets from the HuggingFace Hub.

⚠ **Status: experimental** (phase C of the three-circle refactoring effort).
The HuggingFace ``datasets`` API changes frequently and this module has no
integration tests. Use it at your own risk until an institutional use case
validates its behaviour. A ``UserWarning`` is emitted at import time as a
reminder.

This module provides:

- :class:`HuggingFaceDataset` — metadata for one HuggingFace dataset
- :class:`HuggingFaceImporter` — dataset search and import
- :func:`search_hf_datasets` — tag-based search against the HuggingFace API
- :func:`import_hf_dataset` — download of a dataset into a local folder

Reference heritage datasets are pre-registered so that they can be
discovered quickly without a network request.

Example
-------
::

    importer = HuggingFaceImporter()
    results = importer.search("medieval OCR", tags=["ocr"])
    corpus = importer.import_dataset(results[0].dataset_id, output_dir="./corpus/")
"""
from __future__ import annotations
import json
import os
import urllib.error
import urllib.parse
import urllib.request
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
# Emit the "experimental" warning as soon as the module is imported
# (phase C of the refactoring effort — see the module docstring).
warnings.warn(
    (
        "picarones.adapters.corpus.huggingface is experimental "
        "and may change or be removed without notice. "
        "Use at your own risk until an institutional use case "
        "validates the API."
    ),
    UserWarning,
    stacklevel=2,
)
# ---------------------------------------------------------------------------
# Pre-registered reference datasets
# ---------------------------------------------------------------------------
# Static catalogue of well-known heritage OCR/HTR datasets, searchable
# without a network request. Each entry is a plain dict whose keys match
# the fields of ``HuggingFaceDataset`` (``source`` is not stored here).
# All values, including ``downloads``, are hard-coded literals; nothing in
# this module refreshes them at runtime.
_REFERENCE_DATASETS: list[dict] = [
    {
        "dataset_id": "Teklia/RIMES",
        "title": "RIMES — Reconnaissance et Indexation de données Manuscrites et de fac-similEs",
        "description": "Corpus de courriers manuscrits français modernes. Standard de référence pour la reconnaissance d'écriture manuscrite.",
        "language": ["French"],
        "tags": ["htr", "ocr", "handwritten", "french", "modern"],
        "license": "cc-by-4.0",
        "size_category": "1K<n<10K",
        "task": "image-to-text",
        "institution": "IRISA / A2iA",
        "downloads": 1200,
    },
    {
        "dataset_id": "Teklia/IAM",
        "title": "IAM Handwriting Database",
        "description": "Corpus de référence anglais pour la reconnaissance d'écriture manuscrite.",
        "language": ["English"],
        "tags": ["htr", "ocr", "handwritten", "english"],
        "license": "other",
        "size_category": "10K<n<100K",
        "task": "image-to-text",
        "institution": "University of Bern",
        "downloads": 8400,
    },
    {
        "dataset_id": "CATMuS/medieval",
        "title": "CATMuS Medieval — Consistent Approaches to Transcribing ManuScripts",
        "description": "Dataset multilingue de manuscrits médiévaux (latin, français, occitan, espagnol) pour l'entraînement de modèles HTR.",
        "language": ["Latin", "French", "Occitan", "Spanish"],
        "tags": ["htr", "medieval", "manuscripts", "latin", "french", "historical"],
        "license": "cc-by-4.0",
        "size_category": "100K<n<1M",
        "task": "image-to-text",
        "institution": "Inria / EPHE",
        "downloads": 3100,
    },
    {
        "dataset_id": "htr-united/cremma-medieval",
        "title": "CREMMA Medieval",
        "description": "Corpus de manuscrits médiévaux français XIIe-XVe siècles.",
        "language": ["French", "Latin"],
        "tags": ["htr", "medieval", "french", "manuscripts", "htr-united"],
        "license": "cc-by-4.0",
        "size_category": "1K<n<10K",
        "task": "image-to-text",
        "institution": "Inria",
        "downloads": 520,
    },
    {
        "dataset_id": "biglam/europeana_newspapers",
        "title": "Europeana Newspapers",
        "description": "Journaux numérisés européens du XIXe siècle (OCR + images).",
        "language": ["French", "German", "Dutch", "Finnish"],
        "tags": ["ocr", "newspapers", "historical", "19th-century", "europeana"],
        "license": "cc0-1.0",
        "size_category": "1M<n<10M",
        "task": "image-to-text",
        "institution": "Europeana Foundation",
        "downloads": 15200,
    },
    {
        "dataset_id": "stefanklut/esposalles",
        "title": "Esposalles Dataset",
        "description": "Registres de mariage catalans du XVIIe siècle pour la reconnaissance d'écriture historique.",
        "language": ["Catalan", "Latin"],
        "tags": ["htr", "historical", "registers", "catalan", "17th-century"],
        "license": "cc-by-4.0",
        "size_category": "1K<n<10K",
        "task": "image-to-text",
        "institution": "Universitat Autònoma de Barcelona",
        "downloads": 340,
    },
    {
        "dataset_id": "bnf-gallica/gallica-ocr",
        "title": "Gallica OCR",
        "description": "Extraits d'imprimés anciens numérisés depuis Gallica avec vérité terrain.",
        "language": ["French", "Latin"],
        "tags": ["ocr", "historical", "printed", "gallica", "french"],
        "license": "etalab-2.0",
        "size_category": "10K<n<100K",
        "task": "image-to-text",
        "institution": "Gallica",
        "downloads": 2800,
    },
    {
        "dataset_id": "Bozen-Baptism/baptism-records",
        "title": "Bozen Baptism Records",
        "description": "Registres de baptêmes de Bozen (Italie/Autriche) du XVIIIe siècle.",
        "language": ["German", "Latin"],
        "tags": ["htr", "historical", "registers", "german", "latin", "18th-century"],
        "license": "cc-by-4.0",
        "size_category": "1K<n<10K",
        "task": "image-to-text",
        "institution": "University of Innsbruck",
        "downloads": 190,
    },
    {
        "dataset_id": "read-bad/readbad",
        "title": "READ-BAD — Recognition and Enrichment of Archival Documents",
        "description": "Corpus multilingue de documents d'archives pour l'OCR historique (Latin, Allemand, Anglais).",
        "language": ["German", "English", "Latin"],
        "tags": ["ocr", "htr", "historical", "archives", "read"],
        "license": "cc-by-4.0",
        "size_category": "10K<n<100K",
        "task": "image-to-text",
        "institution": "University of Graz",
        "downloads": 1050,
    },
]
# ---------------------------------------------------------------------------
# Dataclass
# ---------------------------------------------------------------------------
@dataclass
class HuggingFaceDataset:
    """Metadata describing a single HuggingFace dataset.

    ``source`` records where the entry came from: ``"reference"`` for the
    built-in catalogue, ``"api"`` for entries built from a Hub API payload.
    """

    dataset_id: str
    title: str
    description: str = ""
    language: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    license: str = ""
    size_category: str = ""
    task: str = "image-to-text"
    institution: str = ""
    downloads: int = 0
    source: str = "reference"  # "reference" | "api"

    def as_dict(self) -> dict:
        """Return the dataset fields as a plain dict (one key per field)."""
        return {
            "dataset_id": self.dataset_id,
            "title": self.title,
            "description": self.description,
            "language": self.language,
            "tags": self.tags,
            "license": self.license,
            "size_category": self.size_category,
            "task": self.task,
            "institution": self.institution,
            "downloads": self.downloads,
            "source": self.source,
        }

    @staticmethod
    def _as_str_list(value: object) -> list[str]:
        """Normalise a list-like payload value into a fresh list.

        A lone string becomes a one-element list; ``None`` or any other
        unexpected type becomes an empty list. Lists are copied so the
        instance never shares a mutable list with the input dict.
        """
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    @staticmethod
    def _size_category_from_card(card_data: object) -> str:
        """Extract the first ``size_categories`` entry from a ``cardData`` dict.

        Returns ``""`` when ``card_data`` is not a dict or when the entry
        is missing or empty. Accepts either a list or a bare string.
        """
        if not isinstance(card_data, dict):
            return ""
        raw = card_data.get("size_categories")
        if isinstance(raw, str):
            return raw
        if isinstance(raw, (list, tuple)) and raw:
            return str(raw[0])
        return ""

    @classmethod
    def from_dict(cls, d: dict) -> "HuggingFaceDataset":
        """Build an instance from a catalogue entry or an API-style payload.

        Args:
            d: Mapping using this class's field names; the API-style keys
                ``id``, ``downloadsAllTime`` and ``cardData.size_categories``
                are accepted as fallbacks.

        Returns:
            A new dataset. ``source`` defaults to ``"api"`` when absent and
            ``title`` falls back to the resolved dataset id.
        """
        dataset_id = d.get("dataset_id", d.get("id", ""))

        # Only consult cardData when no explicit size_category is given, and
        # never index into an empty list (the former eager default did).
        size_category = d.get("size_category")
        if size_category is None:
            size_category = cls._size_category_from_card(d.get("cardData"))

        return cls(
            dataset_id=dataset_id,
            title=d.get("title", dataset_id),
            description=d.get("description", ""),
            language=cls._as_str_list(d.get("language")),
            tags=cls._as_str_list(d.get("tags")),
            license=d.get("license", ""),
            size_category=size_category,
            task=d.get("task", "image-to-text"),
            institution=d.get("institution", ""),
            downloads=d.get("downloads", d.get("downloadsAllTime", 0)),
            source=d.get("source", "api"),
        )

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        """Return a copy of this dataset with ``source`` replaced."""
        from dataclasses import replace

        return replace(self, source=source)

    @property
    def hf_url(self) -> str:
        """URL of the dataset page on huggingface.co."""
        return f"https://huggingface.co/datasets/{self.dataset_id}"
# ---------------------------------------------------------------------------
# Importer principal
# ---------------------------------------------------------------------------
class HuggingFaceImporter:
    """Search the HuggingFace Hub for datasets and import them locally.

    Searches combine the module's built-in reference catalogue with the
    Hub's HTTP API; imports are delegated to the module-level helper
    ``_try_import_with_datasets_lib``.
    """

    _API_BASE = "https://huggingface.co/api"

    def __init__(self, token: Optional[str] = None) -> None:
        """Store the token used to authenticate Hub requests.

        Args:
            token: Explicit access token. When empty, the ``HF_TOKEN`` and
                then ``HUGGINGFACE_TOKEN`` environment variables are used.
        """
        self._token = token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

    def _headers(self) -> dict:
        """Return the HTTP headers for API calls (bearer auth if a token is set)."""
        headers = {"User-Agent": "picarones-hf-importer/1.0"}
        if self._token:
            headers["Authorization"] = f"Bearer {self._token}"
        return headers

    def search(
        self,
        query: str = "",
        tags: Optional[list[str]] = None,
        language: Optional[str] = None,
        limit: int = 20,
        use_reference: bool = True,
    ) -> list[HuggingFaceDataset]:
        """Search datasets in the reference catalogue and on the Hub API.

        Reference entries come first; API entries are appended unless
        their ``dataset_id`` is already present. An API failure never
        propagates: it is recorded through ``record_fallback`` and the
        reference results alone are returned.

        Args:
            query: Free-text filter (empty string disables it).
            tags: Tags that must all match.
            language: Language filter.
            limit: Maximum number of results returned.
            use_reference: Whether to include the built-in catalogue.

        Returns:
            At most ``limit`` datasets.
        """
        results: list[HuggingFaceDataset] = []
        if use_reference:
            results.extend(self._search_reference(query, tags, language))

        # Only the network call is guarded, so a failure here cannot mask a
        # bug in the merge logic below.
        try:
            api_results = self._search_api(query, tags, language, limit)
        except Exception as exc:  # noqa: BLE001 — network / third-party API
            # Sprint A3 (B-3): without this record the user would only see
            # the reference datasets and conclude the API returned nothing.
            from picarones.adapters.corpus._fallback_log import record_fallback

            record_fallback(
                importer="huggingface",
                operation="hub_search_api",
                error=exc,
                extra={"query": query, "language": language, "limit": limit},
            )
        else:
            # De-duplicate, giving priority to reference entries.
            seen_ids = {ds.dataset_id for ds in results}
            for ds in api_results:
                if ds.dataset_id not in seen_ids:
                    results.append(ds)
                    seen_ids.add(ds.dataset_id)
        return results[:limit]

    def _search_reference(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
    ) -> list[HuggingFaceDataset]:
        """Filter the built-in catalogue; all matching is case-insensitive substring.

        ``query`` is matched as one whole string against title, description,
        id, tags and languages. Every requested tag must match at least one
        dataset tag.
        """
        # Setting ``source`` in the payload avoids depending on the helper
        # that is attached to HuggingFaceDataset at the bottom of the module.
        datasets = [
            HuggingFaceDataset.from_dict({**entry, "source": "reference"})
            for entry in _REFERENCE_DATASETS
        ]

        if query:
            needle = query.lower()
            datasets = [
                ds
                for ds in datasets
                if needle in ds.title.lower()
                or needle in ds.description.lower()
                or needle in ds.dataset_id.lower()
                or any(needle in tag.lower() for tag in ds.tags)
                or any(needle in lang.lower() for lang in ds.language)
            ]

        if tags:
            wanted = [tag.lower() for tag in tags]
            datasets = [
                ds
                for ds in datasets
                if all(any(w in ds_tag.lower() for ds_tag in ds.tags) for w in wanted)
            ]

        if language:
            lang_needle = language.lower()
            datasets = [
                ds
                for ds in datasets
                if any(lang_needle in lang.lower() for lang in ds.language)
            ]
        return datasets

    @staticmethod
    def _card_size_category(item: dict) -> str:
        """Return the first ``cardData.size_categories`` value of an API item.

        Returns ``""`` when ``cardData`` is missing or not a dict, or when
        the value is missing or empty. A bare string value is returned as is.
        """
        card = item.get("cardData")
        if not isinstance(card, dict):
            return ""
        raw = card.get("size_categories")
        if isinstance(raw, str):
            return raw
        if isinstance(raw, (list, tuple)) and raw:
            return str(raw[0])
        return ""

    def _search_api(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
        limit: int,
    ) -> list[HuggingFaceDataset]:
        """Query ``{_API_BASE}/datasets`` and convert the JSON payload.

        At most 50 results are requested. Network and JSON errors
        propagate to the caller. Items that are not dicts or that have no
        ``id`` are skipped, so one malformed entry cannot discard the rest.
        """
        params: dict[str, str] = {
            "task_categories": "image-to-text",
            "limit": str(min(limit, 50)),
            "full": "False",
        }
        if query:
            params["search"] = query
        if language:
            params["language"] = language
        if tags:
            params["tags"] = ",".join(tags)

        url = f"{self._API_BASE}/datasets?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(url, headers=self._headers())
        with urllib.request.urlopen(req, timeout=10) as resp:
            payload = json.loads(resp.read().decode("utf-8"))

        if not isinstance(payload, list):
            return []

        results = []
        for item in payload:
            if not isinstance(item, dict):
                continue
            dataset_id = item.get("id", "")
            if not dataset_id:
                continue
            results.append(
                HuggingFaceDataset(
                    dataset_id=dataset_id,
                    title=dataset_id,
                    description=item.get("description", ""),
                    language=item.get("language", []),
                    tags=item.get("tags", []),
                    license=item.get("license", ""),
                    size_category=self._card_size_category(item),
                    task="image-to-text",
                    downloads=item.get("downloadsAllTime", 0),
                    source="api",
                )
            )
        return results

    def import_dataset(
        self,
        dataset_id: str,
        output_dir: str | Path,
        split: str = "train",
        max_samples: int = 100,
        show_progress: bool = True,
    ) -> dict:
        """Import a HuggingFace dataset into a local folder.

        The folder is created if needed and ``huggingface_meta.json``
        (source, dataset id, split, sample cap, UTC timestamp) is written
        before any sample is fetched.

        Args:
            dataset_id: Hub identifier of the dataset.
            output_dir: Destination folder.
            split: Dataset split to read.
            max_samples: Upper bound on the number of samples read.
            show_progress: Forwarded to the import helper.

        Returns:
            A dict with ``dataset_id``, ``output_dir``, ``files_imported``
            (the count reported by the import helper) and ``metadata_file``.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        meta = {
            "source": "huggingface",
            "dataset_id": dataset_id,
            "split": split,
            "max_samples": max_samples,
            "imported_at": _iso_now(),
        }
        meta_file = output_path / "huggingface_meta.json"
        meta_file.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")

        # Attempt the import through the ``datasets`` library if available.
        files_imported = _try_import_with_datasets_lib(
            dataset_id, output_path, split, max_samples, show_progress
        )
        return {
            "dataset_id": dataset_id,
            "output_dir": str(output_path),
            "files_imported": files_imported,
            "metadata_file": str(meta_file),
        }
def _try_import_with_datasets_lib(
    dataset_id: str,
    output_path: Path,
    split: str,
    max_samples: int,
    show_progress: bool,
) -> int:
    """Stream samples through the optional HuggingFace ``datasets`` library.

    For each of the first ``max_samples`` items that carries an image
    (``image`` or ``img`` key), writes ``doc_NNNN.jpg`` and
    ``doc_NNNN.gt.txt`` into ``output_path``. ``NNNN`` is the item's
    position in the stream. The transcription is read from ``text``,
    ``transcription`` or ``ground_truth``; a missing one yields an empty file.

    Args:
        dataset_id: Hub identifier passed to ``load_dataset``.
        output_path: Existing directory receiving the files.
        split: Dataset split to stream.
        max_samples: Maximum number of stream items to read.
        show_progress: Accepted for interface compatibility; currently unused.

    Returns:
        The number of samples written. ``0`` when ``datasets`` is not
        installed. If loading or streaming fails, the failure is recorded
        through ``record_fallback`` and the count written so far is returned.
    """
    from itertools import islice

    try:
        from datasets import load_dataset  # type: ignore
    except ImportError:
        # ``datasets`` is an optional dependency: without it nothing can be
        # imported, which is a supported best-effort outcome.
        return 0

    # Image modes Pillow's JPEG writer accepts; others are converted to RGB.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    count = 0
    try:
        ds = load_dataset(dataset_id, split=split, streaming=True)
        # islice stops without pulling an extra item from the stream.
        for i, item in enumerate(islice(ds, max(max_samples, 0))):
            # Look up the image and text fields.
            image = item.get("image")
            if image is None:
                image = item.get("img")
            if image is None:
                continue

            # ``or ""`` keeps a None transcription from being written as "None".
            text = (
                item.get("text")
                or item.get("transcription")
                or item.get("ground_truth")
                or ""
            )

            img_file = output_path / f"doc_{i:04d}.jpg"
            try:
                mode = getattr(image, "mode", None)
                if mode is not None and mode not in jpeg_modes:
                    image = image.convert("RGB")
                image.save(str(img_file))
            except Exception as exc:  # noqa: BLE001 — PIL / image I/O
                # Sprint A3 (B-3): a failed image save leaves an orphan GT
                # (text without image). The incident is recorded and the GT
                # is still written so the counter stays consistent.
                from picarones.adapters.corpus._fallback_log import record_fallback

                record_fallback(
                    importer="huggingface",
                    operation="image_save",
                    error=exc,
                    extra={"img_file": str(img_file), "doc_index": i},
                )

            gt_file = output_path / f"doc_{i:04d}.gt.txt"
            gt_file.write_text(str(text), encoding="utf-8")
            count += 1
    except Exception as exc:  # noqa: BLE001 — network / third-party library
        # Record the failure and report what was actually written, instead
        # of returning 0 while files already exist on disk.
        from picarones.adapters.corpus._fallback_log import record_fallback

        record_fallback(
            importer="huggingface",
            operation="dataset_import",
            error=exc,
            extra={"dataset_id": dataset_id, "split": split, "files_imported": count},
        )
    return count
def _iso_now() -> str:
    """Return the current UTC time as an ISO-8601 string, to the second."""
    import datetime as _dt

    current = _dt.datetime.now(tz=_dt.timezone.utc)
    return current.isoformat(timespec="seconds")
# ---------------------------------------------------------------------------
# HuggingFaceDataset extension (private helper)
# ---------------------------------------------------------------------------
def _patch_dataset_replace_source() -> None:
    """Attach a ``_replace_source`` helper method to ``HuggingFaceDataset``."""
    import dataclasses

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        # dataclasses.replace returns a new instance; ``self`` is untouched.
        return dataclasses.replace(self, source=source)

    setattr(HuggingFaceDataset, "_replace_source", _replace_source)


# Applied once, at import time.
_patch_dataset_replace_source()