Spaces:
Sleeping
Sleeping
Claude
feat(migration): Lots H + I + J — statistics, htr_united/huggingface, MetricsResult
c813aa1 unverified | """Import de datasets OCR/HTR depuis HuggingFace Hub. | |
| ⚠ **Statut : expérimental** (phase C du chantier de refonte en 3 cercles). | |
| L'API ``datasets`` HuggingFace évolue fréquemment et ce module n'a pas | |
| de tests d'intégration. À utiliser à vos risques jusqu'à ce qu'un cas | |
| d'usage institutionnel valide son comportement. Un ``UserWarning`` est | |
| émis à l'import pour le rappeler. | |
| Ce module fournit : | |
| - :class:`HuggingFaceDataset` — métadonnées d'un dataset HuggingFace | |
| - :class:`HuggingFaceImporter` — recherche et import de datasets | |
| - :func:`search_hf_datasets` — recherche par tags dans l'API HuggingFace | |
| - :func:`import_hf_dataset` — téléchargement d'un dataset vers un dossier local | |
| Les datasets patrimoniaux de référence sont pré-référencés pour une découverte | |
| rapide sans requête réseau. | |
| Exemple | |
| ------- | |
| importer = HuggingFaceImporter() | |
| results = importer.search("medieval OCR", tags=["ocr"]) | |
| corpus = importer.import_dataset(results[0].dataset_id, output_dir="./corpus/") | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import urllib.error | |
| import urllib.parse | |
| import urllib.request | |
| import warnings | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| # Émission du warning ``experimental`` à l'import. Phase C du chantier | |
| # de refonte — voir docstring du module ci-dessus. | |
| warnings.warn( | |
| "picarones.adapters.corpus.huggingface is experimental and may " | |
| "change or be removed without notice. Use at your own risk until " | |
| "an institutional use case validates the API.", | |
| category=UserWarning, | |
| stacklevel=2, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Datasets de référence pré-référencés | |
| # --------------------------------------------------------------------------- | |
| _REFERENCE_DATASETS: list[dict] = [ | |
| { | |
| "dataset_id": "Teklia/RIMES", | |
| "title": "RIMES — Reconnaissance et Indexation de données Manuscrites et de fac-similEs", | |
| "description": "Corpus de courriers manuscrits français modernes. Standard de référence pour la reconnaissance d'écriture manuscrite.", | |
| "language": ["French"], | |
| "tags": ["htr", "ocr", "handwritten", "french", "modern"], | |
| "license": "cc-by-4.0", | |
| "size_category": "1K<n<10K", | |
| "task": "image-to-text", | |
| "institution": "IRISA / A2iA", | |
| "downloads": 1200, | |
| }, | |
| { | |
| "dataset_id": "Teklia/IAM", | |
| "title": "IAM Handwriting Database", | |
| "description": "Corpus de référence anglais pour la reconnaissance d'écriture manuscrite.", | |
| "language": ["English"], | |
| "tags": ["htr", "ocr", "handwritten", "english"], | |
| "license": "other", | |
| "size_category": "10K<n<100K", | |
| "task": "image-to-text", | |
| "institution": "University of Bern", | |
| "downloads": 8400, | |
| }, | |
| { | |
| "dataset_id": "CATMuS/medieval", | |
| "title": "CATMuS Medieval — Consistent Approaches to Transcribing ManuScripts", | |
| "description": "Dataset multilingue de manuscrits médiévaux (latin, français, occitan, espagnol) pour l'entraînement de modèles HTR.", | |
| "language": ["Latin", "French", "Occitan", "Spanish"], | |
| "tags": ["htr", "medieval", "manuscripts", "latin", "french", "historical"], | |
| "license": "cc-by-4.0", | |
| "size_category": "100K<n<1M", | |
| "task": "image-to-text", | |
| "institution": "Inria / EPHE", | |
| "downloads": 3100, | |
| }, | |
| { | |
| "dataset_id": "htr-united/cremma-medieval", | |
| "title": "CREMMA Medieval", | |
| "description": "Corpus de manuscrits médiévaux français XIIe-XVe siècles.", | |
| "language": ["French", "Latin"], | |
| "tags": ["htr", "medieval", "french", "manuscripts", "htr-united"], | |
| "license": "cc-by-4.0", | |
| "size_category": "1K<n<10K", | |
| "task": "image-to-text", | |
| "institution": "Inria", | |
| "downloads": 520, | |
| }, | |
| { | |
| "dataset_id": "biglam/europeana_newspapers", | |
| "title": "Europeana Newspapers", | |
| "description": "Journaux numérisés européens du XIXe siècle (OCR + images).", | |
| "language": ["French", "German", "Dutch", "Finnish"], | |
| "tags": ["ocr", "newspapers", "historical", "19th-century", "europeana"], | |
| "license": "cc0-1.0", | |
| "size_category": "1M<n<10M", | |
| "task": "image-to-text", | |
| "institution": "Europeana Foundation", | |
| "downloads": 15200, | |
| }, | |
| { | |
| "dataset_id": "stefanklut/esposalles", | |
| "title": "Esposalles Dataset", | |
| "description": "Registres de mariage catalans du XVIIe siècle pour la reconnaissance d'écriture historique.", | |
| "language": ["Catalan", "Latin"], | |
| "tags": ["htr", "historical", "registers", "catalan", "17th-century"], | |
| "license": "cc-by-4.0", | |
| "size_category": "1K<n<10K", | |
| "task": "image-to-text", | |
| "institution": "Universitat Autònoma de Barcelona", | |
| "downloads": 340, | |
| }, | |
| { | |
| "dataset_id": "bnf-gallica/gallica-ocr", | |
| "title": "Gallica OCR", | |
| "description": "Extraits d'imprimés anciens numérisés depuis Gallica avec vérité terrain.", | |
| "language": ["French", "Latin"], | |
| "tags": ["ocr", "historical", "printed", "gallica", "french"], | |
| "license": "etalab-2.0", | |
| "size_category": "10K<n<100K", | |
| "task": "image-to-text", | |
| "institution": "Gallica", | |
| "downloads": 2800, | |
| }, | |
| { | |
| "dataset_id": "Bozen-Baptism/baptism-records", | |
| "title": "Bozen Baptism Records", | |
| "description": "Registres de baptêmes de Bozen (Italie/Autriche) du XVIIIe siècle.", | |
| "language": ["German", "Latin"], | |
| "tags": ["htr", "historical", "registers", "german", "latin", "18th-century"], | |
| "license": "cc-by-4.0", | |
| "size_category": "1K<n<10K", | |
| "task": "image-to-text", | |
| "institution": "University of Innsbruck", | |
| "downloads": 190, | |
| }, | |
| { | |
| "dataset_id": "read-bad/readbad", | |
| "title": "READ-BAD — Recognition and Enrichment of Archival Documents", | |
| "description": "Corpus multilingue de documents d'archives pour l'OCR historique (Latin, Allemand, Anglais).", | |
| "language": ["German", "English", "Latin"], | |
| "tags": ["ocr", "htr", "historical", "archives", "read"], | |
| "license": "cc-by-4.0", | |
| "size_category": "10K<n<100K", | |
| "task": "image-to-text", | |
| "institution": "University of Graz", | |
| "downloads": 1050, | |
| }, | |
| ] | |
| # --------------------------------------------------------------------------- | |
| # Dataclass | |
| # --------------------------------------------------------------------------- | |
| class HuggingFaceDataset: | |
| """Métadonnées d'un dataset HuggingFace.""" | |
| dataset_id: str | |
| title: str | |
| description: str = "" | |
| language: list[str] = field(default_factory=list) | |
| tags: list[str] = field(default_factory=list) | |
| license: str = "" | |
| size_category: str = "" | |
| task: str = "image-to-text" | |
| institution: str = "" | |
| downloads: int = 0 | |
| source: str = "reference" # "reference" | "api" | |
| def as_dict(self) -> dict: | |
| return { | |
| "dataset_id": self.dataset_id, | |
| "title": self.title, | |
| "description": self.description, | |
| "language": self.language, | |
| "tags": self.tags, | |
| "license": self.license, | |
| "size_category": self.size_category, | |
| "task": self.task, | |
| "institution": self.institution, | |
| "downloads": self.downloads, | |
| "source": self.source, | |
| } | |
| def from_dict(cls, d: dict) -> "HuggingFaceDataset": | |
| return cls( | |
| dataset_id=d.get("dataset_id", d.get("id", "")), | |
| title=d.get("title", d.get("dataset_id", "")), | |
| description=d.get("description", ""), | |
| language=d.get("language", []), | |
| tags=d.get("tags", []), | |
| license=d.get("license", ""), | |
| size_category=d.get("size_category", d.get("cardData", {}).get("size_categories", [""])[0] if isinstance(d.get("cardData"), dict) else ""), | |
| task=d.get("task", "image-to-text"), | |
| institution=d.get("institution", ""), | |
| downloads=d.get("downloads", d.get("downloadsAllTime", 0)), | |
| source=d.get("source", "api"), | |
| ) | |
| def hf_url(self) -> str: | |
| return f"https://huggingface.co/datasets/{self.dataset_id}" | |
| # --------------------------------------------------------------------------- | |
| # Importer principal | |
| # --------------------------------------------------------------------------- | |
| class HuggingFaceImporter: | |
| """Recherche et importe des datasets depuis HuggingFace Hub.""" | |
| _API_BASE = "https://huggingface.co/api" | |
| def __init__(self, token: Optional[str] = None) -> None: | |
| self._token = token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") | |
| def _headers(self) -> dict: | |
| h = {"User-Agent": "picarones-hf-importer/1.0"} | |
| if self._token: | |
| h["Authorization"] = f"Bearer {self._token}" | |
| return h | |
| def search( | |
| self, | |
| query: str = "", | |
| tags: Optional[list[str]] = None, | |
| language: Optional[str] = None, | |
| limit: int = 20, | |
| use_reference: bool = True, | |
| ) -> list[HuggingFaceDataset]: | |
| """Recherche des datasets avec filtres. | |
| Interroge d'abord les datasets de référence pré-intégrés, puis | |
| l'API HuggingFace si disponible. | |
| """ | |
| results: list[HuggingFaceDataset] = [] | |
| # Datasets de référence | |
| if use_reference: | |
| ref_results = self._search_reference(query, tags, language) | |
| results.extend(ref_results) | |
| # API HuggingFace (optionnel, peut échouer silencieusement) | |
| try: | |
| api_results = self._search_api(query, tags, language, limit) | |
| # Déduplique (priorité aux références) | |
| existing_ids = {r.dataset_id for r in results} | |
| for ds in api_results: | |
| if ds.dataset_id not in existing_ids: | |
| results.append(ds) | |
| existing_ids.add(ds.dataset_id) | |
| except Exception as exc: # noqa: BLE001 — réseau/API tierce | |
| # Sprint A3 (B-3) : la recherche API échoue silencieusement → | |
| # l'utilisateur ne voit que les datasets de référence et croit | |
| # que l'API est vide. On documente l'incident. | |
| from picarones.adapters.corpus._fallback_log import record_fallback | |
| record_fallback( | |
| importer="huggingface", | |
| operation="hub_search_api", | |
| error=exc, | |
| extra={"query": query, "language": language, "limit": limit}, | |
| ) | |
| return results[:limit] | |
| def _search_reference( | |
| self, | |
| query: str, | |
| tags: Optional[list[str]], | |
| language: Optional[str], | |
| ) -> list[HuggingFaceDataset]: | |
| datasets = [HuggingFaceDataset.from_dict(d) for d in _REFERENCE_DATASETS] | |
| datasets = [ds._replace_source("reference") for ds in datasets] | |
| if query: | |
| q = query.lower() | |
| datasets = [ | |
| ds for ds in datasets | |
| if (q in ds.title.lower() | |
| or q in ds.description.lower() | |
| or q in ds.dataset_id.lower() | |
| or any(q in t.lower() for t in ds.tags) | |
| or any(q in lg.lower() for lg in ds.language)) | |
| ] | |
| if tags: | |
| for tag in tags: | |
| t_lower = tag.lower() | |
| datasets = [ | |
| ds for ds in datasets | |
| if any(t_lower in dt.lower() for dt in ds.tags) | |
| ] | |
| if language: | |
| lang_lower = language.lower() | |
| datasets = [ | |
| ds for ds in datasets | |
| if any(lang_lower in lg.lower() for lg in ds.language) | |
| ] | |
| return datasets | |
| def _search_api( | |
| self, | |
| query: str, | |
| tags: Optional[list[str]], | |
| language: Optional[str], | |
| limit: int, | |
| ) -> list[HuggingFaceDataset]: | |
| params: dict[str, str] = { | |
| "task_categories": "image-to-text", | |
| "limit": str(min(limit, 50)), | |
| "full": "False", | |
| } | |
| if query: | |
| params["search"] = query | |
| if language: | |
| params["language"] = language | |
| if tags: | |
| params["tags"] = ",".join(tags) | |
| url = f"{self._API_BASE}/datasets?" + urllib.parse.urlencode(params) | |
| req = urllib.request.Request(url, headers=self._headers()) | |
| with urllib.request.urlopen(req, timeout=10) as resp: | |
| data = json.loads(resp.read().decode("utf-8")) | |
| results = [] | |
| for item in data if isinstance(data, list) else []: | |
| ds = HuggingFaceDataset( | |
| dataset_id=item.get("id", ""), | |
| title=item.get("id", ""), | |
| description=item.get("description", ""), | |
| language=item.get("language", []), | |
| tags=item.get("tags", []), | |
| license=item.get("license", ""), | |
| size_category=( | |
| item.get("cardData", {}).get("size_categories", [""])[0] | |
| if isinstance(item.get("cardData"), dict) | |
| else "" | |
| ), | |
| task="image-to-text", | |
| downloads=item.get("downloadsAllTime", 0), | |
| source="api", | |
| ) | |
| if ds.dataset_id: | |
| results.append(ds) | |
| return results | |
| def import_dataset( | |
| self, | |
| dataset_id: str, | |
| output_dir: str | Path, | |
| split: str = "train", | |
| max_samples: int = 100, | |
| show_progress: bool = True, | |
| ) -> dict: | |
| """Importe un dataset depuis HuggingFace vers un dossier local. | |
| Retourne les métadonnées de l'import. | |
| """ | |
| output_path = Path(output_dir) | |
| output_path.mkdir(parents=True, exist_ok=True) | |
| meta = { | |
| "source": "huggingface", | |
| "dataset_id": dataset_id, | |
| "split": split, | |
| "max_samples": max_samples, | |
| "imported_at": _iso_now(), | |
| } | |
| meta_file = output_path / "huggingface_meta.json" | |
| meta_file.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8") | |
| # Tentative d'import via datasets library si disponible | |
| files_imported = _try_import_with_datasets_lib( | |
| dataset_id, output_path, split, max_samples, show_progress | |
| ) | |
| return { | |
| "dataset_id": dataset_id, | |
| "output_dir": str(output_path), | |
| "files_imported": files_imported, | |
| "metadata_file": str(meta_file), | |
| } | |
| def _try_import_with_datasets_lib( | |
| dataset_id: str, | |
| output_path: Path, | |
| split: str, | |
| max_samples: int, | |
| show_progress: bool, | |
| ) -> int: | |
| """Essaie d'importer avec la librairie `datasets` de HuggingFace.""" | |
| try: | |
| from datasets import load_dataset # type: ignore | |
| ds = load_dataset(dataset_id, split=split, streaming=True) | |
| count = 0 | |
| for i, item in enumerate(ds): | |
| if i >= max_samples: | |
| break | |
| # Cherche champ image et texte | |
| image = item.get("image") or item.get("img") | |
| text = item.get("text") or item.get("transcription") or item.get("ground_truth", "") | |
| if image is not None: | |
| img_file = output_path / f"doc_{i:04d}.jpg" | |
| try: | |
| image.save(str(img_file)) | |
| except Exception as exc: # noqa: BLE001 — PIL/PIL-IO | |
| # Sprint A3 (B-3) : un échec de sauvegarde d'image | |
| # produirait un GT orphelin (texte sans image). On | |
| # documente et on continue — le GT est tout de même | |
| # écrit pour préserver la cohérence numérique du compteur. | |
| from picarones.adapters.corpus._fallback_log import record_fallback | |
| record_fallback( | |
| importer="huggingface", | |
| operation="image_save", | |
| error=exc, | |
| extra={"img_file": str(img_file), "doc_index": i}, | |
| ) | |
| gt_file = output_path / f"doc_{i:04d}.gt.txt" | |
| gt_file.write_text(str(text), encoding="utf-8") | |
| count += 1 | |
| return count | |
| except (ImportError, Exception): | |
| return 0 | |
| def _iso_now() -> str: | |
| from datetime import datetime, timezone | |
| return datetime.now(timezone.utc).isoformat(timespec="seconds") | |
| # --------------------------------------------------------------------------- | |
| # Extension de HuggingFaceDataset (helper privé) | |
| # --------------------------------------------------------------------------- | |
| def _patch_dataset_replace_source() -> None: | |
| """Ajoute un helper _replace_source à HuggingFaceDataset.""" | |
| def _replace_source(self, source: str) -> "HuggingFaceDataset": | |
| from dataclasses import replace | |
| return replace(self, source=source) | |
| HuggingFaceDataset._replace_source = _replace_source | |
| _patch_dataset_replace_source() | |