"""Router for remote corpus importers: HTR-United and HuggingFace."""
from __future__ import annotations
from fastapi import APIRouter, HTTPException, Query
from picarones.web.models import HTRUnitedImportRequest, HuggingFaceImportRequest
router = APIRouter()
# ──────────────────────────────────────────────────────────────────────────
# HTR-United
# ──────────────────────────────────────────────────────────────────────────
@router.get("/api/htr-united/catalogue")
async def api_htr_united_catalogue(
    query: str = Query(default="", description="Recherche textuelle"),
    language: str = Query(default="", description="Filtre langue"),
    script: str = Query(default="", description="Filtre type d'écriture"),
) -> dict:
    """Return the HTR-United catalogue, filtered by the query parameters.

    The catalogue is built with ``HTRUnitedCatalogue.from_demo()`` and
    searched with the given free-text ``query``. Empty ``language`` and
    ``script`` values are forwarded as ``None`` (presumably meaning
    "no filter" -- confirm against ``HTRUnitedCatalogue.search``).

    Returns:
        A dict with the keys ``source``, ``total`` (number of matching
        entries), ``entries`` (each match serialized via ``as_dict()``),
        ``available_languages`` and ``available_scripts``.
    """
    # Function-scope import: the importer module is only loaded when the
    # endpoint is actually called.
    from picarones.extras.importers.htr_united import HTRUnitedCatalogue

    cat = HTRUnitedCatalogue.from_demo()
    results = cat.search(
        query=query,
        # An empty query-string value becomes None.
        language=language or None,
        script=script or None,
    )
    return {
        "source": cat.source,
        "total": len(results),
        "entries": [e.as_dict() for e in results],
        "available_languages": cat.available_languages(),
        "available_scripts": cat.available_scripts(),
    }
@router.post("/api/htr-united/import")
async def api_htr_united_import(req: HTRUnitedImportRequest) -> dict:
    """Import one HTR-United catalogue entry into ``req.output_dir``.

    The entry is looked up by ``req.entry_id`` in the catalogue returned by
    ``HTRUnitedCatalogue.from_demo()`` and handed to
    ``import_htr_united_corpus`` together with ``req.output_dir`` and
    ``req.max_samples``.

    Returns:
        The value returned by ``import_htr_united_corpus``.

    Raises:
        HTTPException: 404 when no entry matches ``req.entry_id``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.htr_united import (
        HTRUnitedCatalogue,
        import_htr_united_corpus,
    )

    cat = HTRUnitedCatalogue.from_demo()
    entry = cat.get_by_id(req.entry_id)
    if not entry:
        raise HTTPException(
            status_code=404, detail=f"Entrée non trouvée : {req.entry_id}",
        )
    # The import is a synchronous call (presumably download + disk I/O);
    # running it directly in this coroutine would stall the event loop for
    # every other request, so it is executed in a worker thread instead.
    return await run_in_threadpool(
        import_htr_united_corpus,
        entry=entry,
        output_dir=req.output_dir,
        max_samples=req.max_samples,
    )
# ──────────────────────────────────────────────────────────────────────────
# HuggingFace Datasets
# ──────────────────────────────────────────────────────────────────────────
@router.get("/api/huggingface/search")
async def api_huggingface_search(
    query: str = Query(default="", description="Requête de recherche"),
    language: str = Query(default="", description="Filtre langue"),
    tags: str = Query(default="", description="Tags séparés par des virgules"),
    limit: int = Query(default=20, ge=1, le=50),
) -> dict:
    """Search datasets through ``HuggingFaceImporter.search``.

    ``tags`` is a comma-separated string; it is split, each item is
    stripped of surrounding whitespace and blank items are dropped. When no
    tag remains (empty string, or only commas/whitespace) ``None`` is passed
    instead of a list. An empty ``language`` is likewise passed as ``None``.
    ``limit`` is validated by FastAPI to lie in ``[1, 50]``.

    Returns:
        A dict with ``total`` (number of results) and ``datasets`` (each
        result serialized via ``as_dict()``).
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.huggingface import HuggingFaceImporter

    # Strip once per item, drop blanks, and collapse "no usable tag" to None
    # so that inputs such as "" and " , " are treated the same way.
    stripped = (t.strip() for t in tags.split(","))
    tag_list = [t for t in stripped if t] or None

    importer = HuggingFaceImporter()
    # search() is synchronous (presumably a network call to the Hub); run it
    # in a worker thread so the event loop keeps serving other requests.
    results = await run_in_threadpool(
        importer.search,
        query=query,
        tags=tag_list,
        language=language or None,
        limit=limit,
    )
    return {
        "total": len(results),
        "datasets": [ds.as_dict() for ds in results],
    }
@router.post("/api/huggingface/import")
async def api_huggingface_import(req: HuggingFaceImportRequest) -> dict:
    """Import a HuggingFace dataset into ``req.output_dir``.

    Forwards ``req.dataset_id``, ``req.output_dir``, ``req.split`` and
    ``req.max_samples`` to ``HuggingFaceImporter.import_dataset``.

    Returns:
        The value returned by ``import_dataset``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.huggingface import HuggingFaceImporter

    importer = HuggingFaceImporter()
    # import_dataset() is synchronous (presumably download + disk I/O);
    # calling it directly inside this coroutine would block the event loop
    # for the whole import, so it is executed in a worker thread.
    return await run_in_threadpool(
        importer.import_dataset,
        dataset_id=req.dataset_id,
        output_dir=req.output_dir,
        split=req.split,
        max_samples=req.max_samples,
    )
|