File size: 4,002 Bytes
f14102c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Router des importers de corpus distants : HTR-United et HuggingFace."""

from __future__ import annotations

from fastapi import APIRouter, HTTPException, Query

from picarones.web.models import HTRUnitedImportRequest, HuggingFaceImportRequest

# Router on which the endpoint decorators in this module register their routes.
router = APIRouter()


# ──────────────────────────────────────────────────────────────────────────
# HTR-United
# ──────────────────────────────────────────────────────────────────────────

@router.get("/api/htr-united/catalogue")
async def api_htr_united_catalogue(
    query: str = Query(default="", description="Recherche textuelle"),
    language: str = Query(default="", description="Filtre langue"),
    script: str = Query(default="", description="Filtre type d'écriture"),
) -> dict:
    """Return the HTR-United catalogue, optionally filtered.

    Args:
        query: Free-text search string, forwarded unchanged to the catalogue.
        language: Language filter; an empty string is forwarded as ``None``.
        script: Script filter; an empty string is forwarded as ``None``.

    Returns:
        A dict with the catalogue ``source``, the ``total`` number of
        matches, the matching ``entries`` (each serialised with
        ``as_dict()``), and the ``available_languages`` and
        ``available_scripts`` reported by the catalogue.
    """
    # Imported inside the handler, so the importer module is only imported
    # when the endpoint actually runs.
    from picarones.extras.importers.htr_united import HTRUnitedCatalogue

    cat = HTRUnitedCatalogue.from_demo()
    results = cat.search(
        query=query,
        # Empty query-string values mean "no filter".
        language=language or None,
        script=script or None,
    )
    return {
        "source": cat.source,
        "total": len(results),
        "entries": [e.as_dict() for e in results],
        "available_languages": cat.available_languages(),
        "available_scripts": cat.available_scripts(),
    }


@router.post("/api/htr-united/import")
async def api_htr_united_import(req: HTRUnitedImportRequest) -> dict:
    """Import one HTR-United catalogue entry into ``req.output_dir``.

    Args:
        req: Request body providing ``entry_id``, ``output_dir`` and
            ``max_samples``.

    Returns:
        The value returned by ``import_htr_united_corpus`` for the entry.

    Raises:
        HTTPException: 404 when the catalogue lookup for ``req.entry_id``
            yields nothing.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.htr_united import (
        HTRUnitedCatalogue,
        import_htr_united_corpus,
    )

    cat = HTRUnitedCatalogue.from_demo()
    entry = cat.get_by_id(req.entry_id)
    if not entry:
        raise HTTPException(
            status_code=404, detail=f"Entrée non trouvée : {req.entry_id}",
        )

    # NOTE(review): ``req.output_dir`` is taken from the request as-is;
    # confirm that it is validated or sandboxed before being written to.
    #
    # The importer is a plain synchronous call. Running it in the threadpool
    # keeps the event loop free to serve other requests while it works.
    return await run_in_threadpool(
        import_htr_united_corpus,
        entry=entry,
        output_dir=req.output_dir,
        max_samples=req.max_samples,
    )


# ──────────────────────────────────────────────────────────────────────────
# HuggingFace Datasets
# ──────────────────────────────────────────────────────────────────────────

@router.get("/api/huggingface/search")
async def api_huggingface_search(
    query: str = Query(default="", description="Requête de recherche"),
    language: str = Query(default="", description="Filtre langue"),
    tags: str = Query(default="", description="Tags séparés par des virgules"),
    limit: int = Query(default=20, ge=1, le=50),
) -> dict:
    """Search datasets through ``HuggingFaceImporter.search``.

    Args:
        query: Search string, forwarded unchanged.
        language: Language filter; an empty string is forwarded as ``None``.
        tags: Comma-separated tag names. Surrounding whitespace is stripped
            and empty items are dropped; if nothing remains, ``None`` is
            forwarded instead of an empty list.
        limit: Maximum number of results, constrained to 1..50 (default 20).

    Returns:
        A dict with the ``total`` number of results and the ``datasets``
        themselves, each serialised with ``as_dict()``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.huggingface import HuggingFaceImporter

    # Normalise "a, b,,c" -> ["a", "b", "c"]; input made only of separators
    # or whitespace collapses to None, matching the ``language or None`` rule.
    stripped = (t.strip() for t in tags.split(","))
    tag_list = [t for t in stripped if t] or None

    importer = HuggingFaceImporter()
    # The search is a plain synchronous call. Running it in the threadpool
    # keeps the event loop free to serve other requests while it works.
    results = await run_in_threadpool(
        importer.search,
        query=query,
        tags=tag_list,
        language=language or None,
        limit=limit,
    )
    return {
        "total": len(results),
        "datasets": [ds.as_dict() for ds in results],
    }


@router.post("/api/huggingface/import")
async def api_huggingface_import(req: HuggingFaceImportRequest) -> dict:
    """Import a HuggingFace dataset into ``req.output_dir``.

    Args:
        req: Request body providing ``dataset_id``, ``output_dir``,
            ``split`` and ``max_samples``.

    Returns:
        The value returned by ``HuggingFaceImporter.import_dataset``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.extras.importers.huggingface import HuggingFaceImporter

    importer = HuggingFaceImporter()
    # NOTE(review): ``req.output_dir`` is taken from the request as-is;
    # confirm that it is validated or sandboxed before being written to.
    #
    # The importer is a plain synchronous call. Running it in the threadpool
    # keeps the event loop free to serve other requests while it works.
    return await run_in_threadpool(
        importer.import_dataset,
        dataset_id=req.dataset_id,
        output_dir=req.output_dir,
        split=req.split,
        max_samples=req.max_samples,
    )