Spaces:
Sleeping
Sleeping
| """Interface web locale Picarones — application FastAPI. | |
| Lance avec : | |
| picarones serve [--port 8000] [--host 127.0.0.1] | |
| ou directement : | |
| uvicorn picarones.web.app:app --reload --port 8000 | |
| Routes | |
| ------ | |
| GET / Page principale (SPA) | |
| GET /api/status Version et état de l'application | |
| GET /api/engines Statut des moteurs OCR et LLMs disponibles | |
| GET /api/corpus/browse Parcourir les dossiers du serveur | |
| GET /api/reports Liste des rapports générés | |
| GET /api/normalization/profiles Profils de normalisation disponibles | |
| POST /api/benchmark/start Lancer un benchmark (retourne job_id) | |
| GET /api/benchmark/{job_id}/stream Stream SSE de progression | |
| GET /api/benchmark/{job_id}/status Statut courant d'un job | |
| POST /api/benchmark/{job_id}/cancel Annuler un job | |
| GET /api/htr-united/catalogue Catalogue HTR-United | |
| POST /api/htr-united/import Importer un corpus HTR-United | |
| GET /api/huggingface/search Rechercher des datasets HuggingFace | |
| POST /api/huggingface/import Importer un dataset HuggingFace | |
| GET /reports/{filename} Accéder à un rapport HTML généré | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| import logging | |
| import os | |
| import shutil | |
| import tempfile | |
| import threading | |
| import uuid | |
| import xml.etree.ElementTree as ET | |
| import zipfile | |
| from dataclasses import dataclass, field | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any, AsyncIterator, Optional | |
| from fastapi import Cookie, FastAPI, File, HTTPException, Query, Response, UploadFile | |
| from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse | |
| from pydantic import BaseModel | |
| from picarones import __version__ | |
| # --------------------------------------------------------------------------- | |
| # App initialization | |
| # --------------------------------------------------------------------------- | |
# Application object. The interactive documentation is served under /api/docs
# (Swagger UI) and /api/redoc.
app = FastAPI(
    title="Picarones",
    description="Plateforme de comparaison de moteurs OCR/HTR pour documents patrimoniaux",
    version=__version__,
    docs_url="/api/docs",
    redoc_url="/api/redoc",
)

# Static assets (CSS, icons, ...): mounted only when a "static" directory
# exists next to this module.
_STATIC_DIR = Path(__file__).parent.joinpath("static")
if _STATIC_DIR.is_dir():
    from fastapi.staticfiles import StaticFiles

    _static_files = StaticFiles(directory=str(_STATIC_DIR))
    app.mount("/static", _static_files, name="static")

# ---------------------------------------------------------------------------
# Job management
# ---------------------------------------------------------------------------

_logger = logging.getLogger(__name__)
@dataclass
class BenchmarkJob:
    """In-memory state of one benchmark run plus its event fan-out.

    The class relies on ``dataclasses.field(default_factory=...)`` for its
    mutable members, so it must be a dataclass: without the decorator there is
    no generated ``__init__`` and ``events`` / ``_lock`` would be bare
    ``Field`` objects shared at class level.

    Every event passed to :meth:`add_event` is appended to ``events`` and
    pushed to each queue obtained through :meth:`subscribe`.
    """

    job_id: str
    status: str = "pending"  # pending | running | complete | error | cancelled
    progress: float = 0.0  # 0.0 – 1.0
    current_engine: str = ""
    total_docs: int = 0
    processed_docs: int = 0
    output_path: str = ""
    error: str = ""
    started_at: Optional[str] = None
    finished_at: Optional[str] = None
    # Full event history, in emission order.
    events: list[dict] = field(default_factory=list)
    # One bounded queue per live subscriber.
    _subscribers: list[asyncio.Queue] = field(default_factory=list)
    # Guards ``events`` and ``_subscribers``.
    _lock: threading.Lock = field(default_factory=threading.Lock)
    # Cancellation flag; it is not read or set by any method of this class.
    _cancel_event: threading.Event = field(default_factory=threading.Event)

    def add_event(self, kind: str, data: Any) -> None:
        """Record an event and offer it to every current subscriber.

        Args:
            kind: Event category label.
            data: Payload stored under the ``"data"`` key.

        A subscriber whose queue is full silently misses the event; the event
        is still kept in ``events``.
        """
        event = {"kind": kind, "data": data, "ts": _iso_now()}
        with self._lock:
            self.events.append(event)
            # Snapshot so that queues are fed outside the lock.
            subscribers = list(self._subscribers)
        # NOTE(review): asyncio.Queue is documented as not thread-safe; if this
        # method is called from a worker thread, confirm the hand-off is safe.
        for q in subscribers:
            try:
                q.put_nowait(event)
            except asyncio.QueueFull:
                pass

    def subscribe(self) -> asyncio.Queue:
        """Register and return a new queue (capacity 200) for future events."""
        q: asyncio.Queue = asyncio.Queue(maxsize=200)
        with self._lock:
            self._subscribers.append(q)
        return q

    def unsubscribe(self, q: asyncio.Queue) -> None:
        """Remove ``q`` from the subscribers; unknown queues are ignored."""
        with self._lock:
            try:
                self._subscribers.remove(q)
            except ValueError:
                pass

    def as_dict(self) -> dict:
        """Return the public scalar fields (no events, no private members)."""
        return {
            "job_id": self.job_id,
            "status": self.status,
            "progress": self.progress,
            "current_engine": self.current_engine,
            "total_docs": self.total_docs,
            "processed_docs": self.processed_docs,
            "output_path": self.output_path,
            "error": self.error,
            "started_at": self.started_at,
            "finished_at": self.finished_at,
        }
_JOBS: dict[str, BenchmarkJob] = {}
_JOBS_MAX = 100  # Maximum number of jobs kept in memory
_JOBS_LOCK = threading.Lock()


def _cleanup_old_jobs() -> None:
    """Evict the oldest finished jobs once the registry exceeds ``_JOBS_MAX``.

    Only jobs whose status is ``complete``, ``error`` or ``cancelled`` are
    candidates. They are removed in ascending ``finished_at`` order (a missing
    timestamp sorts first) until the excess is absorbed or no candidate is
    left.
    """
    terminal_states = ("complete", "error", "cancelled")
    with _JOBS_LOCK:
        excess = len(_JOBS) - _JOBS_MAX
        if excess <= 0:
            return
        evictable = sorted(
            ((jid, job) for jid, job in _JOBS.items() if job.status in terminal_states),
            key=lambda pair: pair[1].finished_at or "",
        )
        for jid, _job in evictable[:excess]:
            del _JOBS[jid]
# Lower-case file suffixes treated as corpus images.
_IMAGE_EXTS = frozenset((".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp"))
# Root folder for uploaded corpora, relative to the process working directory.
_UPLOADS_DIR = Path("./uploads")
| # --------------------------------------------------------------------------- | |
| # Pydantic models | |
| # --------------------------------------------------------------------------- | |
class BenchmarkRequest(BaseModel):
    # Benchmark parameters expressed as a flat list of engine identifiers.
    # Only ``corpus_path`` is required; every other field has a default.
    corpus_path: str
    engines: list[str] = ["tesseract"]
    normalization_profile: str = "nfc"
    # Characters to ignore (comma-separated, e.g. "',–").
    char_exclude: str = ""
    output_dir: str = "./rapports/"
    report_name: str = ""
    lang: str = "fra"
    # Language of the HTML report: "fr" or "en".
    report_lang: str = "fr"
class HTRUnitedImportRequest(BaseModel):
    # Parameters for importing one HTR-United catalogue entry.
    # Only ``entry_id`` is required.
    entry_id: str
    output_dir: str = "./corpus/"
    max_samples: int = 100
class HuggingFaceImportRequest(BaseModel):
    # Parameters for importing a HuggingFace dataset.
    # Only ``dataset_id`` is required.
    dataset_id: str
    output_dir: str = "./corpus/"
    split: str = "train"
    max_samples: int = 100
class CompetitorConfig(BaseModel):
    # One benchmark competitor: an OCR engine, optionally combined with an LLM.
    # Every field defaults to the empty string.
    name: str = ""
    # OCR engine: 'tesseract', 'mistral_ocr', ... or 'corpus' to use the
    # pre-computed OCR shipped with the corpus.
    ocr_engine: str = ""
    ocr_model: str = ""
    llm_provider: str = ""
    llm_model: str = ""
    pipeline_mode: str = ""
    prompt_file: str = ""
class BenchmarkRunRequest(BaseModel):
    # Benchmark parameters expressed as a list of ``CompetitorConfig`` entries.
    # ``corpus_path`` and ``competitors`` are required.
    corpus_path: str
    competitors: list[CompetitorConfig]
    normalization_profile: str = "nfc"
    # Characters to ignore (comma-separated, e.g. "',–").
    char_exclude: str = ""
    output_dir: str = "./rapports/"
    report_name: str = ""
    report_lang: str = "fr"
| # --------------------------------------------------------------------------- | |
| # API — status | |
| # --------------------------------------------------------------------------- | |
# NOTE(review): no route decorator is visible on this handler in this view of
# the file; the module docstring lists it as GET /api/status. Confirm that it
# is registered on ``app``.
async def api_status() -> dict:
    """Return the application name, its version, an "ok" flag and a timestamp."""
    payload = {"app": "Picarones", "version": __version__}
    payload["status"] = "ok"
    payload["timestamp"] = _iso_now()
    return payload
| # --------------------------------------------------------------------------- | |
| # API — langue / i18n | |
| # --------------------------------------------------------------------------- | |
_SUPPORTED_LANGS = ("fr", "en")
_LANG_COOKIE = "picarones_lang"


async def api_get_lang(
    picarones_lang: str = Cookie(default="fr"),
) -> dict:
    """Return the current interface language, read from the language cookie.

    An absent or unsupported cookie value falls back to ``"fr"``. The list of
    supported language codes is returned alongside.
    """
    if picarones_lang in _SUPPORTED_LANGS:
        current = picarones_lang
    else:
        current = "fr"
    return {"lang": current, "supported": [*_SUPPORTED_LANGS]}
async def api_set_lang(lang_code: str, response: Response) -> dict:
    """Set the interface language and persist it in a cookie.

    Supported languages: ``fr`` (French) and ``en`` (English).

    Raises:
        HTTPException: 400 when ``lang_code`` is not a supported language.
    """
    if lang_code in _SUPPORTED_LANGS:
        one_year_seconds = 60 * 60 * 24 * 365
        response.set_cookie(
            key=_LANG_COOKIE,
            value=lang_code,
            max_age=one_year_seconds,
            httponly=False,
            samesite="lax",
        )
        return {"lang": lang_code, "message": f"Langue définie : {lang_code}"}

    raise HTTPException(
        status_code=400,
        detail=f"Langue non supportée : '{lang_code}'. Disponibles : {', '.join(_SUPPORTED_LANGS)}",
    )
| # --------------------------------------------------------------------------- | |
| # API — engines | |
| # --------------------------------------------------------------------------- | |
async def api_engines() -> dict:
    """Report which OCR engines and LLM providers are usable on this host.

    Local OCR engines are probed by importing their Python package. Cloud
    services count as configured when their API-key environment variable is
    set and non-empty. Ollama is probed over HTTP on localhost.

    The local probes (package imports, Tesseract language listing and the
    Ollama HTTP call) are blocking, so they run in a worker thread instead of
    stalling the event loop.

    Returns:
        ``{"engines": [...], "llms": [...]}`` where each entry is a dict
        describing one engine or provider.
    """

    def _has_key(*env_names: str) -> bool:
        # True when at least one of the variables is set to a non-empty value.
        return any(os.environ.get(name) for name in env_names)

    def _cloud_ocr(engine_id: str, label: str, key_env: str, *alt_envs: str) -> dict:
        # ``key_env`` is the variable advertised to the client; ``alt_envs``
        # are accepted as alternatives when checking the configuration.
        configured = _has_key(key_env, *alt_envs)
        return {
            "id": engine_id,
            "label": label,
            "type": "ocr_cloud",
            "available": configured,
            "key_env": key_env,
            "status": "configured" if configured else "missing_key",
            "version": "",
        }

    def _cloud_llm(llm_id: str, label: str, key_env: str) -> dict:
        configured = _has_key(key_env)
        return {
            "id": llm_id,
            "label": label,
            "type": "llm",
            "available": configured,
            "key_env": key_env,
            "status": "configured" if configured else "missing_key",
        }

    def _probe_local() -> tuple:
        # Everything here may block: imports, Tesseract calls, HTTP request.
        tess = _check_engine("tesseract", "pytesseract")
        tess["langs"] = _get_tesseract_langs()
        local = [
            tess,
            _check_engine("pero_ocr", "pero_ocr", label="Pero OCR"),
            _check_engine("kraken", "kraken", label="Kraken"),
            _check_engine("calamari", "calamari_ocr", label="Calamari"),
        ]
        # Single HTTP call for both availability and model list.
        ollama_ok, ollama_model_names = _fetch_ollama_info()
        return local, ollama_ok, ollama_model_names

    local_engines, ollama_available, ollama_models = await asyncio.to_thread(_probe_local)

    engines = [
        *local_engines,
        _cloud_ocr("mistral_ocr", "Mistral OCR (Pixtral / mistral-ocr-latest)", "MISTRAL_API_KEY"),
        _cloud_ocr(
            "google_vision",
            "Google Vision API",
            "GOOGLE_APPLICATION_CREDENTIALS",
            "GOOGLE_API_KEY",
        ),
        _cloud_ocr("azure_doc_intel", "Azure Document Intelligence", "AZURE_DOC_INTEL_KEY"),
    ]

    llms = [
        _cloud_llm("openai", "OpenAI (GPT-4o, GPT-4o mini)", "OPENAI_API_KEY"),
        _cloud_llm("anthropic", "Anthropic (Claude Sonnet, Haiku)", "ANTHROPIC_API_KEY"),
        _cloud_llm("mistral", "Mistral LLM (Mistral Large, Small…)", "MISTRAL_API_KEY"),
        {
            "id": "ollama",
            "label": "Ollama (Llama 3, Gemma, Phi — local)",
            "type": "llm_local",
            "available": ollama_available,
            "status": "running" if ollama_available else "not_running",
            "models": ollama_models,
            "base_url": "http://localhost:11434",
        },
    ]
    return {"engines": engines, "llms": llms}
def _check_engine(engine_id: str, module_name: str, label: str = "") -> dict:
    """Describe a local OCR engine by trying to import its Python package.

    Args:
        engine_id: Identifier reported under ``"id"``.
        module_name: Name of the package whose import proves installation.
        label: Display name; defaults to ``engine_id`` with underscores
            replaced by spaces and title-cased.

    Returns:
        A dict with keys ``id``, ``label``, ``type`` (always ``"ocr"``),
        ``available``, ``version`` and ``status``. ``version`` is empty when
        the package is missing and ``"installé"`` when it cannot be
        determined.
    """
    display_name = label if label else engine_id.replace("_", " ").title()

    def _describe(available: bool, version) -> dict:
        return {
            "id": engine_id,
            "label": display_name,
            "type": "ocr",
            "available": available,
            "version": version,
            "status": "available" if available else "not_installed",
        }

    try:
        module = __import__(module_name)
    except ImportError:
        return _describe(False, "")

    if engine_id == "tesseract":
        # The Tesseract version comes from the binary, not from the wrapper.
        try:
            import pytesseract

            detected = str(pytesseract.get_tesseract_version())
        except Exception:
            detected = "installé"
    else:
        try:
            detected = getattr(module, "__version__", "installé")
        except Exception:
            detected = "installé"
    return _describe(True, detected)
def _fetch_ollama_info() -> tuple[bool, list[str]]:
    """Check Ollama availability and list its models with one HTTP call.

    Returns:
        ``(True, model_names)`` when ``http://localhost:11434/api/tags``
        answers with status 200, otherwise ``(False, [])``. Any exception
        (connection refused, timeout after 2 seconds, invalid JSON, ...) is
        reported as unavailable.
    """
    import urllib.error
    import urllib.request

    tags_url = "http://localhost:11434/api/tags"
    try:
        with urllib.request.urlopen(tags_url, timeout=2) as response:
            if response.status == 200:
                payload = json.loads(response.read().decode())
                names = [entry.get("name", "") for entry in payload.get("models", [])]
                return True, names
            return False, []
    except Exception:
        return False, []
def _check_ollama() -> bool:
    """Return the availability flag reported by ``_fetch_ollama_info``."""
    return _fetch_ollama_info()[0]
def _list_ollama_models() -> list[str]:
    """Return the model names reported by ``_fetch_ollama_info``."""
    return _fetch_ollama_info()[1]
def _get_tesseract_langs() -> list[str]:
    """Return the sorted Tesseract language codes, without ``"osd"``.

    When pytesseract cannot be imported or queried, a fixed default list is
    returned instead.
    """
    try:
        import pytesseract

        codes = [code for code in pytesseract.get_languages(config="") if code != "osd"]
        codes.sort()
        return codes
    except Exception:
        return ["fra", "lat", "eng", "deu", "ita", "spa"]
| # --------------------------------------------------------------------------- | |
| # API — models (dynamic per provider, with capability metadata) | |
| # --------------------------------------------------------------------------- | |
# Mistral models that are text-only (no vision support), matched by exact id.
_MISTRAL_TEXT_ONLY = frozenset((
    "ministral-3b-latest",
    "ministral-8b-latest",
    "mistral-small-2409",
    "mistral-small-latest",
    "mistral-tiny",
    "mistral-tiny-latest",
    "open-mistral-7b",
    "open-mixtral-8x7b",
))

# Prefixes of Mistral model ids that are text-only (no vision support).
_MISTRAL_TEXT_ONLY_PREFIXES = (
    "ministral",
    "open-mistral",
    "open-mixtral",
    "codestral",
    "mistral-embed",
    "mistral-tiny",
)

# Known multimodal Ollama model families.
_OLLAMA_VISION_FAMILIES = frozenset((
    "bakllava",
    "llama3.2-vision",
    "llava",
    "llava-llama3",
    "llava-phi3",
    "minicpm-v",
    "moondream",
    "nanollava",
))
def _model_entry(model_id: str, capabilities: list[str]) -> dict:
    """Build a model entry holding the model id and its capability list."""
    return dict(id=model_id, capabilities=capabilities)
def _infer_mistral_capabilities(model_id: str) -> list[str]:
    """Guess the capabilities of a Mistral model from its id.

    The checks run in this order on the lower-cased id:

    1. contains ``"pixtral"``: text and vision;
    2. listed in ``_MISTRAL_TEXT_ONLY`` or starting with one of
       ``_MISTRAL_TEXT_ONLY_PREFIXES``: text only;
    3. contains ``"mistral-large"`` or ``"mistral-medium"``: text and vision;
    4. anything else: text only (safer than assuming vision support).
    """
    name = model_id.lower()
    if "pixtral" in name:
        return ["text", "vision"]

    known_text_only = name in _MISTRAL_TEXT_ONLY or name.startswith(_MISTRAL_TEXT_ONLY_PREFIXES)
    if not known_text_only and ("mistral-large" in name or "mistral-medium" in name):
        return ["text", "vision"]
    return ["text"]
def _infer_openai_capabilities(model_id: str) -> list[str]:
    """Guess the capabilities of an OpenAI model from its id.

    A model is reported as text and vision when its lower-cased id contains
    one of ``gpt-4o``, ``gpt-4-turbo``, ``gpt-4.1``, ``o1`` or ``o3``;
    otherwise it is reported as text only.
    """
    name = model_id.lower()
    vision_markers = ("gpt-4o", "gpt-4-turbo", "gpt-4.1", "o1", "o3")
    if any(marker in name for marker in vision_markers):
        return ["text", "vision"]
    return ["text"]
def _infer_ollama_capabilities(model_name: str) -> list[str]:
    """Guess the capabilities of an Ollama model from its name.

    The part before the first ``":"`` is lower-cased and compared against
    ``_OLLAMA_VISION_FAMILIES``: a name starting with a known family is
    reported as text and vision, anything else as text only.
    """
    family = model_name.partition(":")[0].lower()
    if family.startswith(tuple(_OLLAMA_VISION_FAMILIES)):
        return ["text", "vision"]
    return ["text"]
async def api_models(
    provider: str,
    capability: str = Query(default="", description="Filtre par capacité : 'text', 'vision', ou vide pour tout"),
) -> dict:
    """Return the models available for a provider, with their capabilities.

    Remote providers are queried live. Capabilities (``text``, ``vision``) are
    inferred from the model name when the API does not expose them. When a
    remote query fails, a hard-coded fallback list is returned together with
    an ``"error"`` message.

    All blocking work (HTTP requests with a 10-second timeout, Tesseract and
    Ollama probes) runs in a worker thread so that the event loop stays
    responsive.

    Args:
        provider: One of ``tesseract``, ``mistral_ocr``, ``openai``,
            ``anthropic``, ``mistral``, ``ollama``, ``google_vision``,
            ``azure_doc_intel`` or ``prompts``.
        capability: Optional filter (e.g. ``?capability=vision`` keeps only
            models supporting vision). It is not applied to ``tesseract`` and
            ``prompts``, whose entries are plain strings.

    Returns:
        A dict with ``provider``, ``models`` and ``model_ids``, plus ``error``
        when the API key is missing or the remote query failed.

    Raises:
        HTTPException: 404 when ``provider`` is unknown.
    """
    import urllib.request as _urlreq

    def _fetch_json(url: str, headers: dict) -> dict:
        # Blocking helper: always call it through asyncio.to_thread.
        req = _urlreq.Request(url, headers=headers)
        with _urlreq.urlopen(req, timeout=10) as resp:
            return json.loads(resp.read().decode())

    def _filter_and_format(models: list[dict]) -> dict:
        if capability:
            models = [m for m in models if capability in m["capabilities"]]
        return {
            "provider": provider,
            "models": models,
            "model_ids": [m["id"] for m in models],
        }

    def _missing_key(env_name: str) -> dict:
        return {
            "provider": provider,
            "models": [],
            "model_ids": [],
            "error": f"{env_name} non définie",
        }

    if provider == "tesseract":
        langs = await asyncio.to_thread(_get_tesseract_langs)
        return {"provider": provider, "models": langs, "model_ids": langs}

    if provider == "mistral_ocr":
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            return _missing_key("MISTRAL_API_KEY")
        try:
            data = await asyncio.to_thread(
                _fetch_json,
                "https://api.mistral.ai/v1/models",
                {"Authorization": f"Bearer {api_key}"},
            )
            # Only the OCR-capable models are relevant for this provider.
            models = [
                _model_entry(m["id"], _infer_mistral_capabilities(m["id"]))
                for m in data.get("data", [])
                if "pixtral" in m["id"].lower() or "mistral-ocr" in m["id"].lower()
            ]
            return _filter_and_format(sorted(models, key=lambda m: m["id"]))
        except Exception as exc:
            fallback = [
                _model_entry("pixtral-12b-2409", ["text", "vision"]),
                _model_entry("pixtral-large-latest", ["text", "vision"]),
                _model_entry("mistral-ocr-latest", ["text", "vision"]),
            ]
            return {**_filter_and_format(fallback), "error": str(exc)}

    if provider == "openai":
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            return _missing_key("OPENAI_API_KEY")
        try:
            data = await asyncio.to_thread(
                _fetch_json,
                "https://api.openai.com/v1/models",
                {"Authorization": f"Bearer {api_key}"},
            )
            models = [
                _model_entry(m["id"], _infer_openai_capabilities(m["id"]))
                for m in data.get("data", [])
                if "gpt-4" in m["id"].lower() or "o1" in m["id"].lower() or "o3" in m["id"].lower()
            ]
            return _filter_and_format(sorted(models, key=lambda m: m["id"], reverse=True))
        except Exception as exc:
            fallback = [
                _model_entry("gpt-4o", ["text", "vision"]),
                _model_entry("gpt-4o-mini", ["text", "vision"]),
                _model_entry("gpt-4-turbo", ["text", "vision"]),
            ]
            return {**_filter_and_format(fallback), "error": str(exc)}

    if provider == "anthropic":
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            return _missing_key("ANTHROPIC_API_KEY")
        try:
            data = await asyncio.to_thread(
                _fetch_json,
                "https://api.anthropic.com/v1/models",
                {"x-api-key": api_key, "anthropic-version": "2023-06-01"},
            )
            # Every Claude 3+ model supports vision.
            models = [_model_entry(m["id"], ["text", "vision"]) for m in data.get("data", [])]
            return _filter_and_format(models)
        except Exception as exc:
            fallback = [
                _model_entry("claude-sonnet-4-6", ["text", "vision"]),
                _model_entry("claude-haiku-4-5-20251001", ["text", "vision"]),
                _model_entry("claude-opus-4-6", ["text", "vision"]),
            ]
            return {**_filter_and_format(fallback), "error": str(exc)}

    if provider == "mistral":
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            return _missing_key("MISTRAL_API_KEY")
        try:
            data = await asyncio.to_thread(
                _fetch_json,
                "https://api.mistral.ai/v1/models",
                {"Authorization": f"Bearer {api_key}"},
            )
            # Keep ALL Mistral models (including Pixtral for vision) except
            # mistral-ocr, which is a dedicated OCR endpoint, not a chat LLM.
            models = [
                _model_entry(m["id"], _infer_mistral_capabilities(m["id"]))
                for m in data.get("data", [])
                if "mistral-ocr" not in m["id"].lower()
            ]
            return _filter_and_format(sorted(models, key=lambda m: m["id"]))
        except Exception as exc:
            fallback = [
                _model_entry("mistral-large-latest", ["text", "vision"]),
                _model_entry("pixtral-large-latest", ["text", "vision"]),
                _model_entry("pixtral-12b-2409", ["text", "vision"]),
                _model_entry("mistral-small-latest", ["text"]),
            ]
            return {**_filter_and_format(fallback), "error": str(exc)}

    if provider == "ollama":
        _, model_names = await asyncio.to_thread(_fetch_ollama_info)
        models = [
            _model_entry(name, _infer_ollama_capabilities(name))
            for name in model_names
        ]
        return _filter_and_format(models)

    if provider == "google_vision":
        models = [
            _model_entry("document_text_detection", ["vision"]),
            _model_entry("text_detection", ["vision"]),
        ]
        return _filter_and_format(models)

    if provider == "azure_doc_intel":
        models = [
            _model_entry("prebuilt-document", ["vision"]),
            _model_entry("prebuilt-read", ["vision"]),
        ]
        return _filter_and_format(models)

    if provider == "prompts":
        # Prompt files are the *.txt files of the sibling "prompts" directory.
        prompts_dir = Path(__file__).parent.parent / "prompts"
        if prompts_dir.exists():
            prompts = sorted(f.name for f in prompts_dir.glob("*.txt"))
        else:
            prompts = []
        return {"provider": provider, "models": prompts, "model_ids": prompts}

    raise HTTPException(status_code=404, detail=f"Provider inconnu : {provider}")
| # --------------------------------------------------------------------------- | |
| # API — corpus browse | |
| # --------------------------------------------------------------------------- | |
# Directories under which the corpus browser is allowed to navigate.
_BROWSE_ROOTS = [
    Path(".").resolve(),
    _UPLOADS_DIR.resolve(),
    Path("/workspaces").resolve(),
    Path(tempfile.gettempdir()).resolve(),
]


def _is_path_allowed(target: Path) -> bool:
    """Tell whether a resolved path equals or lies under an allowed root.

    Roots for which the comparison raises ``ValueError`` or ``TypeError`` are
    skipped.
    """

    def _within(root: Path) -> bool:
        try:
            return target == root or target.is_relative_to(root)
        except (ValueError, TypeError):
            return False

    return any(_within(root) for root in _BROWSE_ROOTS)
async def api_corpus_browse(path: str = Query(default=".", description="Chemin à explorer")) -> dict:
    """List the entries of a server directory for the corpus picker.

    Args:
        path: Directory to explore; it is resolved before any check.

    Returns:
        ``current_path``, ``parent_path`` (``None`` at the filesystem root)
        and ``items``. Each item carries ``name``, ``path`` and ``is_dir``;
        directories also carry ``gt_count`` (number of ``*.gt.txt`` files
        directly inside) and ``has_corpus``.

    Raises:
        HTTPException: 403 when the path is outside the allowed roots or the
            directory cannot be read; 404 when it is not an existing
            directory.
    """
    target = Path(path).resolve()

    # Security: authorise BEFORE touching the filesystem. Checking existence
    # first would let a caller tell existing from missing paths anywhere on
    # the server by comparing 404 and 403 answers.
    if not _is_path_allowed(target):
        raise HTTPException(status_code=403, detail="Accès refusé : chemin hors des répertoires autorisés")

    if not target.is_dir():
        raise HTTPException(status_code=404, detail=f"Dossier non trouvé : {path}")

    items = []
    try:
        for entry in sorted(target.iterdir()):
            is_dir = entry.is_dir()
            item: dict[str, Any] = {
                "name": entry.name,
                "path": str(entry),
                "is_dir": is_dir,
            }
            if is_dir:
                # Count the ground-truth files (one per image/GT pair). An
                # unreadable or vanished sub-directory must not abort the
                # whole listing, so it is reported as empty.
                try:
                    gt_count = sum(1 for f in entry.iterdir() if f.name.endswith(".gt.txt"))
                except OSError:
                    gt_count = 0
                item["gt_count"] = gt_count
                item["has_corpus"] = gt_count > 0
            items.append(item)
    except PermissionError as exc:
        raise HTTPException(status_code=403, detail=str(exc)) from exc

    return {
        "current_path": str(target),
        "parent_path": str(target.parent) if target.parent != target else None,
        "items": items,
    }
| # --------------------------------------------------------------------------- | |
| # API — corpus upload | |
| # --------------------------------------------------------------------------- | |
def _safe_parse_xml(xml_bytes: bytes) -> Optional[ET.Element]:
    """Parse untrusted XML, refusing entity declarations (XXE / entity bombs).

    ``defusedxml`` is used when it is installed. Otherwise the standard
    parser is used, but only after rejecting any document that declares an
    entity, which mirrors the default policy of defusedxml.

    Args:
        xml_bytes: Raw document content.

    Returns:
        The root element, or ``None`` when the document is malformed or
        rejected for safety. The function never raises on bad input, so both
        code paths honour the same contract.
    """
    try:
        import defusedxml.ElementTree as SafeET
    except ImportError:
        SafeET = None

    if SafeET is not None:
        try:
            return SafeET.fromstring(xml_bytes)
        except (ET.ParseError, ValueError):
            # ParseError: malformed XML. ValueError: defusedxml's own
            # exceptions (forbidden entities, external references, ...).
            return None

    # Fallback without defusedxml: ALTO and PAGE files do not need custom
    # entities, so a document declaring one is treated as hostile.
    # NOTE(review): this byte search does not see declarations in UTF-16
    # encoded documents; install defusedxml for complete protection.
    if b"<!ENTITY" in xml_bytes:
        return None
    try:
        return ET.fromstring(xml_bytes)
    except (ET.ParseError, ValueError, LookupError):
        # ValueError / LookupError: unsupported or unknown declared encoding.
        return None
def _detect_xml_gt(xml_bytes: bytes) -> tuple[str, str] | None:
    """Recognise an ALTO or PAGE XML document and extract its ground truth.

    Returns:
        ``("ALTO XML", text)`` or ``("PAGE XML", text)``, or ``None`` when the
        document cannot be parsed or its root element is neither format.
    """
    root = _safe_parse_xml(xml_bytes)
    if root is None:
        return None

    # The root tag may look like "{namespace}alto", "alto" or "{ns}PcGts".
    tag = root.tag

    # ALTO: the namespace mentions loc.gov/standards/alto, or the root element
    # is named "alto" (with or without a namespace).
    alto_namespace = "http://www.loc.gov/standards/alto"
    if alto_namespace in tag:
        looks_like_alto = True
    elif tag.lower() == "alto":
        looks_like_alto = True
    else:
        looks_like_alto = tag.startswith("{") and tag.split("}")[1].lower() == "alto"
    if looks_like_alto:
        return ("ALTO XML", _extract_alto_text(root))

    # PAGE: the root element is PcGts (with or without a namespace).
    local_name = tag.split("}")[-1] if "}" in tag else tag
    if local_name != "PcGts":
        return None
    return ("PAGE XML", _extract_page_text(root))
def _extract_alto_text(root: ET.Element) -> str:
    """Extract the plain text of an ALTO XML tree.

    For every ``TextLine`` element (namespaced or not), the non-empty
    ``CONTENT`` attributes of its ``String`` descendants are joined with a
    space. Lines are joined with a newline; lines without words are dropped.
    """

    def _local(tag: str) -> str:
        # Strip a leading "{namespace}" when present.
        return tag.split("}")[-1] if "}" in tag else tag

    rendered: list[str] = []
    for line_el in root.iter():
        if _local(line_el.tag) != "TextLine":
            continue
        contents = [
            node.get("CONTENT", "")
            for node in line_el.iter()
            if _local(node.tag) == "String"
        ]
        words = [word for word in contents if word]
        if words:
            rendered.append(" ".join(words))
    return "\n".join(rendered)
def _extract_page_text(root: ET.Element) -> str:
    """Extract the plain text of a PAGE XML tree.

    The stripped text of every ``Unicode`` element (namespaced or not) is
    collected in document order and joined with newlines; empty values are
    dropped.
    """

    def _local(tag: str) -> str:
        return tag.split("}")[-1] if "}" in tag else tag

    stripped = (
        node.text.strip()
        for node in root.iter()
        if _local(node.tag) == "Unicode" and node.text
    )
    return "\n".join(chunk for chunk in stripped if chunk)
def _analyze_corpus_dir(path: Path) -> dict:
    """Scan a folder and summarise the image / ground-truth pairs it holds.

    For each image, ``<stem>.gt.txt`` is used when it exists. Otherwise
    ``<stem>.xml`` is tried: when it is recognised as ALTO or PAGE XML, its
    text is written to ``<stem>.gt.txt`` (side effect) so that the corpus
    loader can read it. Images with neither are reported as missing.

    Returns:
        A summary dict: pair counts, the first 20 pairs, the first 10 images
        without ground truth, up to 5 warnings, the dominant ground-truth
        format and the number of pairs that also have a ``<stem>.ocr.txt``.
    """
    # Hidden files are skipped (macOS "._*" AppleDouble entries and any other
    # name starting with a dot).
    image_names = sorted(
        entry.name
        for entry in path.iterdir()
        if not entry.name.startswith(".") and entry.suffix.lower() in _IMAGE_EXTS
    )

    pairs: list[dict] = []
    missing_gt: list[str] = []
    for image_name in image_names:
        stem = Path(image_name).stem
        gt_name = stem + ".gt.txt"
        gt_txt = path / gt_name
        if gt_txt.exists():
            pairs.append({"image": image_name, "gt": gt_name, "gt_format": "texte brut"})
            continue

        gt_xml = path / (stem + ".xml")
        detected = _detect_xml_gt(gt_xml.read_bytes()) if gt_xml.exists() else None
        if detected is None:
            missing_gt.append(image_name)
            continue

        fmt, text = detected
        # Materialise the ground truth as .gt.txt for the corpus loader.
        gt_txt.write_text(text, encoding="utf-8")
        pairs.append({"image": image_name, "gt": gt_name, "gt_format": fmt})

    # Dominant format for the global summary.
    formats = {pair["gt_format"] for pair in pairs}
    if not formats:
        dominant_format = "texte brut"
    elif len(formats) == 1:
        dominant_format = next(iter(formats))
    else:
        dominant_format = "mixte"

    # Noisy OCR files (.ocr.txt) mark "triplet" corpora.
    ocr_text_count = sum(
        (path / (Path(pair["image"]).stem + ".ocr.txt")).exists() for pair in pairs
    )

    pair_total = len(pairs)
    return {
        "doc_count": pair_total,
        "pairs": pairs[:20],
        "total_pairs": pair_total,
        "missing_gt": missing_gt[:10],
        "has_missing_gt": len(missing_gt) > 0,
        "warnings": [f"GT manquant : {img}" for img in missing_gt[:5]],
        "usable": pair_total > 0,
        "gt_format": dominant_format,
        "has_ocr_text": ocr_text_count > 0,
        "ocr_text_count": ocr_text_count,
    }
_MAX_ZIP_TOTAL_SIZE = 500 * 1024 * 1024  # at most 500 MB once decompressed
_MAX_ZIP_FILES = 2000  # maximum number of extracted files


def _flatten_zip_to_dir(zf: zipfile.ZipFile, dest: Path) -> None:
    """Extract the corpus files of a ZIP into ``dest``, dropping sub-folders.

    Only images, ``.gt.txt``, ``.ocr.txt`` and ``.xml`` (ALTO/PAGE) members
    are extracted, each under its base name. Directories and members whose
    base name starts with a dot are skipped.

    Raises:
        ValueError: when the cumulated declared size of the accepted members
            exceeds ``_MAX_ZIP_TOTAL_SIZE`` or their count exceeds
            ``_MAX_ZIP_FILES``.
    """
    dest.mkdir(parents=True, exist_ok=True)
    extracted_bytes = 0
    extracted_files = 0
    for info in zf.infolist():
        if info.is_dir():
            continue
        member_path = Path(info.filename)
        base_name = member_path.name
        # Skip hidden files (macOS "._*" AppleDouble entries inside ZIPs).
        if base_name.startswith("."):
            continue
        extension = member_path.suffix.lower()
        accepted = (
            extension in _IMAGE_EXTS
            or base_name.endswith(".gt.txt")
            or base_name.endswith(".ocr.txt")
            or extension == ".xml"
        )
        if not accepted:
            continue

        # ZIP-bomb protection: bound the decompressed size and the file count.
        extracted_bytes += info.file_size
        if extracted_bytes > _MAX_ZIP_TOTAL_SIZE:
            raise ValueError(
                f"ZIP trop volumineux : taille décompressée > {_MAX_ZIP_TOTAL_SIZE // (1024*1024)} Mo"
            )
        extracted_files += 1
        if extracted_files > _MAX_ZIP_FILES:
            raise ValueError(f"ZIP contient trop de fichiers (> {_MAX_ZIP_FILES})")

        (dest / base_name).write_bytes(zf.read(info.filename))
async def api_corpus_upload(files: list[UploadFile] = File(...)) -> dict:
    """Store an uploaded corpus: a .zip, or a selection of images and GT files.

    A fresh ``<uploads>/<uuid>`` folder is created. ZIP archives are flattened
    into it; images, ``.txt`` and ``.xml`` files are written as-is; any other
    file type is ignored. The folder is then analysed.

    Returns:
        ``corpus_id`` and ``corpus_path`` plus the analysis summary.

    Raises:
        HTTPException: 422 when no usable image/ground-truth pair was found,
            500 on any other failure. The folder is removed in both cases.
    """
    corpus_id = str(uuid.uuid4())
    corpus_dir = _UPLOADS_DIR / corpus_id
    corpus_dir.mkdir(parents=True, exist_ok=True)
    try:
        for uf in files:
            data = await uf.read()
            # Security: the file name is chosen by the client. Keep only its
            # last path component so that "../x.txt", "C:\\x.txt" or an
            # absolute path cannot be written outside the corpus folder.
            raw_name = uf.filename or "upload"
            filename = Path(raw_name.replace("\\", "/")).name
            if not filename or filename == "..":
                continue
            suffix = Path(filename).suffix.lower()
            if suffix == ".zip":
                # Extract the archive, flattening the pairs.
                import io

                with zipfile.ZipFile(io.BytesIO(data)) as zf:
                    _flatten_zip_to_dir(zf, corpus_dir)
            elif (
                suffix in _IMAGE_EXTS
                or filename.endswith(".gt.txt")
                or filename.endswith(".ocr.txt")
                or suffix in (".txt", ".xml")
            ):
                (corpus_dir / filename).write_bytes(data)
            # Any other file type is ignored.

        summary = _analyze_corpus_dir(corpus_dir)
        if not summary["usable"]:
            shutil.rmtree(corpus_dir, ignore_errors=True)
            raise HTTPException(
                status_code=422,
                detail="Aucune paire image/.gt.txt valide trouvée dans les fichiers uploadés.",
            )
        return {
            "corpus_id": corpus_id,
            "corpus_path": str(corpus_dir),
            **summary,
        }
    except HTTPException:
        raise
    except Exception as exc:
        shutil.rmtree(corpus_dir, ignore_errors=True)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
async def api_corpus_uploads() -> dict:
    """List the uploaded corpora found in the uploads folder.

    Each sub-directory is analysed; those that cannot be inspected are logged
    and left out. An absent uploads folder yields an empty list.
    """
    if not _UPLOADS_DIR.exists():
        return {"uploads": []}

    listing = []
    for folder in sorted(_UPLOADS_DIR.iterdir()):
        if folder.is_dir():
            try:
                info = _analyze_corpus_dir(folder)
                listing.append({
                    "corpus_id": folder.name,
                    "corpus_path": str(folder),
                    "doc_count": info["doc_count"],
                    "has_missing_gt": info["has_missing_gt"],
                })
            except Exception as e:
                _logger.warning(
                    "[api_corpus_uploads] upload '%s' ignoré — inspection impossible : %s",
                    folder.name, e,
                )
    return {"uploads": listing}
async def api_corpus_image(upload_id: str, filename: str) -> FileResponse:
    """Serve one image stored inside an upload directory.

    Raises:
        HTTPException: 400 when either identifier contains a path separator
            or ``..``; 404 when the target is missing or not a regular file.
    """
    forbidden = ("/", "\\", "..")
    # Security: refuse anything that could escape the upload directory.
    if any(token in upload_id for token in forbidden):
        raise HTTPException(status_code=400, detail="upload_id invalide")
    if any(token in filename for token in forbidden):
        raise HTTPException(status_code=400, detail="filename invalide")

    target = _UPLOADS_DIR / upload_id / filename
    if not (target.exists() and target.is_file()):
        raise HTTPException(status_code=404, detail="Image non trouvée")

    known_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".webp": "image/webp",
    }
    # Unknown extensions fall back to a generic binary content type.
    content_type = known_types.get(target.suffix.lower(), "application/octet-stream")
    return FileResponse(str(target), media_type=content_type)
async def api_corpus_delete(corpus_id: str) -> dict:
    """Delete one uploaded corpus directory.

    Raises:
        HTTPException: 400 when ``corpus_id`` is empty, ``"."``, or contains a
            path separator or ``..``; 404 when no such directory exists.
    """
    # Security: forbid path traversal. "" and "." must be rejected as well:
    # both resolve to _UPLOADS_DIR itself, and the rmtree below would then
    # wipe every uploaded corpus at once.
    if corpus_id in ("", ".") or any(token in corpus_id for token in ("/", "\\", "..")):
        raise HTTPException(status_code=400, detail="corpus_id invalide")
    corpus_dir = _UPLOADS_DIR / corpus_id
    if not corpus_dir.exists() or not corpus_dir.is_dir():
        raise HTTPException(status_code=404, detail=f"Corpus non trouvé : {corpus_id}")
    shutil.rmtree(corpus_dir)
    return {"deleted": corpus_id}
| # --------------------------------------------------------------------------- | |
| # API — normalization profiles | |
| # --------------------------------------------------------------------------- | |
async def api_normalization_profiles() -> dict:
    """Describe every entry of ``NORMALIZATION_PROFILES`` as a plain dict."""
    from picarones.core.normalization import NORMALIZATION_PROFILES

    described = []
    for profile_id, profile in NORMALIZATION_PROFILES.items():
        described.append({
            "id": profile_id,
            "name": profile.name,
            # Fall back to the name when the profile has no description.
            "description": profile.description or profile.name,
            "caseless": profile.caseless,
            "diplomatic_rules": len(profile.diplomatic_table),
            "exclude_chars": sorted(profile.exclude_chars),
        })
    return {"profiles": described}
| # --------------------------------------------------------------------------- | |
| # API — reports | |
| # --------------------------------------------------------------------------- | |
async def api_reports(reports_dir: str = Query(default=".", description="Dossier rapports")) -> dict:
    """List the ``*.html`` files found in the report directories.

    The requested directory, the current directory and ``./rapports`` are
    scanned in that order; within each directory files are ordered newest
    first, and a file already listed is not repeated.
    """
    requested = Path(reports_dir).resolve()
    candidates = [requested, Path(".").resolve(), Path("./rapports").resolve()]

    def _mtime(path: Path) -> float:
        return path.stat().st_mtime

    listed: set[str] = set()
    found = []
    for folder in candidates:
        if not folder.exists():
            continue
        for html_file in sorted(folder.glob("*.html"), key=_mtime, reverse=True):
            key = str(html_file)
            if key in listed:
                continue
            listed.add(key)
            info = html_file.stat()
            found.append({
                "filename": html_file.name,
                "path": key,
                "size_kb": round(info.st_size / 1024, 1),
                "modified": datetime.fromtimestamp(info.st_mtime, tz=timezone.utc).isoformat(),
                "url": f"/reports/{html_file.name}",
            })
    return {"reports": found}
async def serve_report(filename: str) -> HTMLResponse:
    """Return the content of a generated HTML report.

    The file is looked up in the current directory, then in ``./rapports``.

    Raises:
        HTTPException: 400 when ``filename`` contains a path separator or
            ``..``; 404 when no matching ``.html`` file is found.
    """
    # Security: forbid path traversal.
    if "/" in filename or "\\" in filename or ".." in filename:
        raise HTTPException(status_code=400, detail="Nom de fichier invalide")
    # The file is read and returned as text/html directly (no redirect to a
    # static file) so that it also works from a Codespace or behind any
    # remote reverse proxy.
    for d in [Path("."), Path("./rapports")]:
        f = d / filename
        # is_file() rather than exists(): a directory named "x.html" must end
        # up as a 404 instead of crashing read_text().
        if f.is_file() and f.suffix == ".html":
            content = f.read_text(encoding="utf-8")
            return HTMLResponse(content=content)
    raise HTTPException(status_code=404, detail=f"Rapport non trouvé : {filename}")
| # --------------------------------------------------------------------------- | |
| # API — HTR-United | |
| # --------------------------------------------------------------------------- | |
async def api_htr_united_catalogue(
    query: str = Query(default="", description="Recherche textuelle"),
    language: str = Query(default="", description="Filtre langue"),
    script: str = Query(default="", description="Filtre type d'écriture"),
) -> dict:
    """Search the HTR-United catalogue obtained from ``HTRUnitedCatalogue.from_demo()``.

    Empty ``language`` / ``script`` filters are passed to the search as ``None``.
    """
    from picarones.importers.htr_united import HTRUnitedCatalogue

    catalogue = HTRUnitedCatalogue.from_demo()
    language_filter = language or None
    script_filter = script or None
    matches = catalogue.search(
        query=query,
        language=language_filter,
        script=script_filter,
    )
    payload = {
        "source": catalogue.source,
        "total": len(matches),
        "entries": [entry.as_dict() for entry in matches],
        "available_languages": catalogue.available_languages(),
        "available_scripts": catalogue.available_scripts(),
    }
    return payload
async def api_htr_united_import(req: HTRUnitedImportRequest) -> dict:
    """Import the HTR-United corpus designated by ``req.entry_id``.

    Raises:
        HTTPException: 404 when the catalogue has no entry with that id.
    """
    from picarones.importers.htr_united import HTRUnitedCatalogue, import_htr_united_corpus

    selected = HTRUnitedCatalogue.from_demo().get_by_id(req.entry_id)
    if not selected:
        raise HTTPException(status_code=404, detail=f"Entrée non trouvée : {req.entry_id}")
    return import_htr_united_corpus(
        entry=selected,
        output_dir=req.output_dir,
        max_samples=req.max_samples,
    )
| # --------------------------------------------------------------------------- | |
| # API — HuggingFace | |
| # --------------------------------------------------------------------------- | |
async def api_huggingface_search(
    query: str = Query(default="", description="Requête de recherche"),
    language: str = Query(default="", description="Filtre langue"),
    tags: str = Query(default="", description="Tags séparés par des virgules"),
    limit: int = Query(default=20, ge=1, le=50),
) -> dict:
    """Search HuggingFace datasets through ``HuggingFaceImporter.search``.

    ``tags`` is a comma-separated string; blank items are dropped. An empty
    ``tags`` string is forwarded as ``None``, and so is an empty ``language``.
    """
    from picarones.importers.huggingface import HuggingFaceImporter

    tag_list = None
    if tags:
        stripped = (piece.strip() for piece in tags.split(","))
        tag_list = [piece for piece in stripped if piece]

    found = HuggingFaceImporter().search(
        query=query,
        tags=tag_list,
        language=language or None,
        limit=limit,
    )
    return {
        "total": len(found),
        "datasets": [dataset.as_dict() for dataset in found],
    }
async def api_huggingface_import(req: HuggingFaceImportRequest) -> dict:
    """Import a HuggingFace dataset and return the importer's result unchanged."""
    from picarones.importers.huggingface import HuggingFaceImporter

    return HuggingFaceImporter().import_dataset(
        dataset_id=req.dataset_id,
        output_dir=req.output_dir,
        split=req.split,
        max_samples=req.max_samples,
    )
| # --------------------------------------------------------------------------- | |
| # API — benchmark | |
| # --------------------------------------------------------------------------- | |
async def api_benchmark_start(req: BenchmarkRequest) -> dict:
    """Register a new benchmark job and start it in a daemon thread.

    Returns:
        ``{"job_id": <uuid>, "status": "pending"}``.

    Raises:
        HTTPException: 400 when ``req.corpus_path`` is not an existing directory.
    """
    corpus_dir = Path(req.corpus_path)
    if not (corpus_dir.exists() and corpus_dir.is_dir()):
        raise HTTPException(status_code=400, detail=f"Corpus non trouvé : {req.corpus_path}")

    new_id = str(uuid.uuid4())
    new_job = BenchmarkJob(job_id=new_id)
    _JOBS[new_id] = new_job
    _cleanup_old_jobs()

    # The benchmark itself runs in a separate thread.
    worker = threading.Thread(target=_run_benchmark_thread, args=(new_job, req), daemon=True)
    worker.start()
    return {"job_id": new_id, "status": "pending"}
async def api_benchmark_status(job_id: str) -> dict:
    """Return ``job.as_dict()`` for a registered job, or raise a 404."""
    found = _JOBS.get(job_id)
    if found:
        return found.as_dict()
    raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")
async def api_benchmark_cancel(job_id: str) -> dict:
    """Request cancellation of a benchmark job.

    A job whose status is already ``complete`` or ``error`` is left untouched
    and reported as finished. Otherwise the job is marked ``cancelled``, its
    cancel event is set and a ``cancelled`` event is recorded.

    Raises:
        HTTPException: 404 when the job id is unknown.
    """
    target = _JOBS.get(job_id)
    if not target:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")

    already_finished = target.status in ("complete", "error")
    if already_finished:
        return {"job_id": job_id, "status": target.status, "message": "Job déjà terminé."}

    target.status = "cancelled"
    # Cancellation signal handed to run_benchmark.
    target._cancel_event.set()
    target.add_event("cancelled", {"message": "Benchmark annulé par l'utilisateur."})
    return {"job_id": job_id, "status": "cancelled"}
async def api_benchmark_stream(job_id: str) -> StreamingResponse:
    """Stream the events of a benchmark job as Server-Sent Events.

    Raises:
        HTTPException: 404 when the job id is unknown.
    """
    job = _JOBS.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")

    finished_states = ("complete", "error", "cancelled")
    closing_kinds = ("complete", "error", "cancelled", "done")

    async def event_generator() -> AsyncIterator[str]:
        # Subscribe BEFORE reading the recorded events so that nothing is lost.
        inbox = job.subscribe()
        try:
            # Replay what has already been produced (snapshot taken under the lock).
            with job._lock:
                backlog = list(job.events)
            for recorded in backlog:
                yield _sse_format(recorded["kind"], recorded["data"])

            if job.status in finished_states:
                yield _sse_format("done", {"status": job.status})
                return

            while True:
                try:
                    incoming = await asyncio.wait_for(inbox.get(), timeout=30.0)
                    yield _sse_format(incoming["kind"], incoming["data"])
                    if incoming["kind"] in closing_kinds:
                        break
                except asyncio.TimeoutError:
                    # Nothing arrived for 30 s: emit an SSE comment as keepalive.
                    yield ": keepalive\n\n"
                    if job.status in finished_states:
                        yield _sse_format("done", {"status": job.status})
                        break
        finally:
            job.unsubscribe(inbox)

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
def _sse_format(event_type: str, data: Any) -> str:
    """Render one Server-Sent Events frame: an ``event:`` line, a ``data:`` line
    carrying ``data`` as JSON (non-ASCII kept as-is), then a blank line."""
    frame_lines = (
        f"event: {event_type}",
        f"data: {json.dumps(data, ensure_ascii=False)}",
        "",
        "",
    )
    return "\n".join(frame_lines)
| # --------------------------------------------------------------------------- | |
| # API — benchmark/run (concurrents composés) | |
| # --------------------------------------------------------------------------- | |
async def api_benchmark_run(req: BenchmarkRunRequest) -> dict:
    """Register a benchmark job built from composed competitors and start it
    in a daemon thread.

    Returns:
        ``{"job_id": <uuid>, "status": "pending"}``.

    Raises:
        HTTPException: 400 when ``req.corpus_path`` is not an existing
            directory or when ``req.competitors`` is empty.
    """
    corpus_path = Path(req.corpus_path)
    if not corpus_path.exists() or not corpus_path.is_dir():
        raise HTTPException(status_code=400, detail=f"Corpus non trouvé : {req.corpus_path}")
    if not req.competitors:
        raise HTTPException(status_code=400, detail="Aucun concurrent défini.")
    job_id = str(uuid.uuid4())
    job = BenchmarkJob(job_id=job_id)
    _JOBS[job_id] = job
    # Same housekeeping as api_benchmark_start: without it, jobs created
    # through this route are never purged from _JOBS.
    _cleanup_old_jobs()
    thread = threading.Thread(
        target=_run_benchmark_thread_v2,
        args=(job, req),
        daemon=True,
    )
    thread.start()
    return {"job_id": job_id, "status": "pending"}
def _build_llm_adapter(comp: CompetitorConfig) -> Any:
    """Instantiate the LLM adapter selected by ``comp.llm_provider``.

    The adapter receives ``comp.llm_model``, or ``None`` when it is empty.

    Raises:
        ValueError: when the provider is not one of ``openai``, ``anthropic``,
            ``mistral`` or ``ollama``.
    """
    provider = comp.llm_provider
    # Each adapter module is imported only when its provider is selected.
    if provider == "openai":
        from picarones.llm.openai_adapter import OpenAIAdapter as adapter_cls
    elif provider == "anthropic":
        from picarones.llm.anthropic_adapter import AnthropicAdapter as adapter_cls
    elif provider == "mistral":
        from picarones.llm.mistral_adapter import MistralAdapter as adapter_cls
    elif provider == "ollama":
        from picarones.llm.ollama_adapter import OllamaAdapter as adapter_cls
    else:
        raise ValueError(f"Provider LLM inconnu : {comp.llm_provider}")
    return adapter_cls(model=comp.llm_model or None)
def _engine_from_competitor(comp: CompetitorConfig) -> Any:
    """Instantiate an OCR engine (or an OCR+LLM pipeline) from a CompetitorConfig.

    Supported modes:

    - ``ocr_engine`` = 'tesseract', 'mistral_ocr', etc. → OCR engine alone
    - ``ocr_engine`` + ``llm_provider`` → live OCR + LLM pipeline
    - ``ocr_engine`` = 'corpus' + ``llm_provider`` → LLM post-correction with
      pre-computed OCR (.ocr.txt files of the triplet corpus)
    - ``ocr_engine`` = '' + ``llm_provider`` → LLM alone (zero-shot or post-correction)

    Raises:
        ValueError: when no live OCR engine is selected and no LLM provider is
            given, or when the OCR engine / LLM provider name is unknown.
        RuntimeError: when the Google Vision or Azure engine module cannot be
            imported.
    """
    engine_id = comp.ocr_engine
    # Post-correction pipeline with pre-computed OCR (triplet corpus).
    is_corpus_ocr = engine_id in ("corpus", "")
    if is_corpus_ocr and not comp.llm_provider:
        raise ValueError(
            "ocr_engine='corpus' nécessite un llm_provider "
            "(pour la post-correction ou le zero-shot)"
        )
    ocr = None
    if not is_corpus_ocr:
        # Each engine module is imported only inside its own branch, so that
        # a module failing to import cannot prevent using the other engines
        # and an unknown engine name is reported as such.
        if engine_id == "tesseract":
            from picarones.engines.tesseract import TesseractEngine
            ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
        elif engine_id == "mistral_ocr":
            from picarones.engines.mistral_ocr import MistralOCREngine
            ocr = MistralOCREngine(config={"model": comp.ocr_model or "mistral-ocr-latest"})
        elif engine_id == "google_vision":
            try:
                from picarones.engines.google_vision import GoogleVisionEngine
                ocr = GoogleVisionEngine(config={"detection_type": comp.ocr_model or "document_text_detection"})
            except ImportError as exc:
                raise RuntimeError("Google Vision non disponible.") from exc
        elif engine_id == "azure_doc_intel":
            try:
                from picarones.engines.azure_doc_intel import AzureDocIntelEngine
                ocr = AzureDocIntelEngine(config={"model": comp.ocr_model or "prebuilt-document"})
            except ImportError as exc:
                raise RuntimeError("Azure Document Intelligence non disponible.") from exc
        else:
            raise ValueError(f"Moteur OCR inconnu : {engine_id}")
    if not comp.llm_provider:
        return ocr
    # OCR+LLM pipeline (live or post-correction). UI mode names are mapped to
    # the pipeline's modes; an unrecognised mode falls back to "text_only".
    _mode_map = {
        "text_only": "text_only",
        "post_correction_text": "text_only",
        "text_and_image": "text_and_image",
        "post_correction_image": "text_and_image",
        "zero_shot": "zero_shot",
    }
    mode = _mode_map.get(comp.pipeline_mode, "text_only")
    llm = _build_llm_adapter(comp)
    from picarones.pipelines.base import OCRLLMPipeline
    prompt = comp.prompt_file or "correction_medieval_french.txt"
    if is_corpus_ocr:
        pipeline_name = comp.name or f"corpus_ocr → {comp.llm_model or comp.llm_provider}"
    else:
        pipeline_name = comp.name or f"{engine_id} → {comp.llm_model or comp.llm_provider}"
    return OCRLLMPipeline(
        ocr_engine=ocr,
        llm_adapter=llm,
        mode=mode,
        prompt=prompt,
        pipeline_name=pipeline_name,
    )
def _run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
    """Run a benchmark described by a list of CompetitorConfig entries.

    Progress, warnings and the final outcome are reported through ``job``
    (status fields and events). Any exception is turned into an ``error``
    status and event instead of propagating.
    """
    job.status = "running"
    job.started_at = _iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})

    def _is_cancelled() -> bool:
        return job.status == "cancelled"

    try:
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
        if _is_cancelled():
            return

        # Build the competitors; one that cannot be built is reported and skipped.
        competitors = []
        for spec in req.competitors:
            try:
                built = _engine_from_competitor(spec)
                competitors.append(built)
                job.add_event("log", {"message": f"Concurrent : {built.name}"})
            except Exception as build_err:
                job.add_event("warning", {
                    "message": f"Concurrent ignoré '{spec.name or spec.ocr_engine}' : {build_err}"
                })
        if not competitors:
            raise ValueError("Aucun concurrent valide disponible.")

        # Output files.
        out_dir = Path(req.output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        base_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(out_dir / f"{base_name}.json")
        output_html = str(out_dir / f"{base_name}.html")

        total_steps = job.total_docs * len(competitors)
        steps_done = 0

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            nonlocal steps_done
            if _is_cancelled():
                return
            steps_done += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            job.progress = steps_done / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": steps_done,
                "total": total_steps,
            })

        from picarones.core.normalization import _parse_exclude_chars

        excluded_chars = None
        if req.char_exclude:
            excluded_chars = _parse_exclude_chars(req.char_exclude)

        result = run_benchmark(
            corpus=corpus,
            engines=competitors,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
            char_exclude=excluded_chars,
            cancel_event=job._cancel_event,
        )
        if _is_cancelled():
            return

        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator

        ReportGenerator(result, lang=req.report_lang).generate(output_html)

        job.output_path = output_html
        job.progress = 1.0
        job.status = "complete"
        job.finished_at = _iso_now()
        final_ranking = result.ranking()
        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": final_ranking,
        })
    except Exception as failure:
        job.status = "error"
        job.error = str(failure)
        job.finished_at = _iso_now()
        job.add_event("error", {"message": f"Erreur : {failure}"})
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
    """Run the benchmark in a thread and publish its progress as job events.

    Engines are built by name through ``picarones.cli._engine_from_name``; an
    engine that cannot be built is reported with a ``warning`` event and
    skipped. Any exception is turned into an ``error`` status and event
    instead of propagating.
    """
    job.status = "running"
    job.started_at = _iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
    try:
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        # Load the corpus.
        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
        if job.status == "cancelled":
            return

        # Instantiate the engines.
        from picarones.cli import _engine_from_name

        ocr_engines = []
        for engine_name in req.engines:
            try:
                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
                ocr_engines.append(eng)
                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
            except Exception as exc:
                # `Exception` already covers click.BadParameter, so no
                # click-specific clause (nor `import click`) is needed here.
                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})
        if not ocr_engines:
            raise ValueError("Aucun moteur valide disponible.")

        # Output directory and file names.
        output_dir = Path(req.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(output_dir / f"{report_name}.json")
        output_html = str(output_dir / f"{report_name}.html")

        # Progress callback handed to run_benchmark.
        n_engines = len(ocr_engines)
        total_steps = job.total_docs * n_engines
        # One-element list: lets the nested callback mutate the counter.
        step_counter = [0]

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            if job.status == "cancelled":
                return
            step_counter[0] += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) guards against a division by zero on an empty corpus.
            job.progress = step_counter[0] / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": step_counter[0],
                "total": total_steps,
            })

        from picarones.core.normalization import _parse_exclude_chars

        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None

        # Run the benchmark.
        result = run_benchmark(
            corpus=corpus,
            engines=ocr_engines,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
            char_exclude=char_excl,
            cancel_event=job._cancel_event,
        )
        if job.status == "cancelled":
            return

        # Generate the HTML report.
        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator

        # The request may not carry a report language: default to French.
        report_lang = getattr(req, "report_lang", "fr")
        gen = ReportGenerator(result, lang=report_lang)
        gen.generate(output_html)

        job.output_path = output_html
        job.progress = 1.0
        job.status = "complete"
        job.finished_at = _iso_now()

        # Final ranking.
        ranking = result.ranking()
        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": ranking,
        })
    except Exception as exc:
        job.status = "error"
        job.error = str(exc)
        job.finished_at = _iso_now()
        job.add_event("error", {"message": f"Erreur : {exc}"})
| # --------------------------------------------------------------------------- | |
| # Page principale HTML (SPA) | |
| # --------------------------------------------------------------------------- | |
async def index(picarones_lang: str = Cookie(default="fr")) -> HTMLResponse:
    """Serve the single-page application.

    The language comes from the ``picarones_lang`` cookie and falls back to
    ``"fr"`` when the value is not in ``_SUPPORTED_LANGS``.
    """
    lang = "fr"
    if picarones_lang in _SUPPORTED_LANGS:
        lang = picarones_lang
    # The language code is handed to the SPA through a meta tag placed right
    # after the first <head>.
    head_with_meta = f'<head>\n<meta name="picarones-lang" content="{lang}">'
    page = _HTML_TEMPLATE.replace("<head>", head_with_meta, 1)
    page = page.replace("__VERSION__", __version__)
    return HTMLResponse(content=page)
| # --------------------------------------------------------------------------- | |
| # Helper | |
| # --------------------------------------------------------------------------- | |
def _iso_now() -> str:
    """Return the current UTC time as an ISO 8601 string, truncated to whole seconds."""
    current = datetime.now(tz=timezone.utc)
    return current.replace(microsecond=0).isoformat()
| # --------------------------------------------------------------------------- | |
| # HTML Template (SPA, French/English, Vanilla JS) | |
| # --------------------------------------------------------------------------- | |
| _HTML_TEMPLATE = r"""<!DOCTYPE html> | |
| <html lang="fr"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Picarones — OCR Benchmark</title> | |
| <link rel="stylesheet" href="/static/retro.css?v=__VERSION__"> | |
| <style> | |
| /* Overrides locaux minimaux — le gros du CSS est dans /static/retro.css */ | |
| </style> | |
| </head> | |
| <body> | |
| <div id="ascii-banner"> | |
| <pre>██████╗ ██╗ ██████╗ █████╗ ██████╗ ██████╗ ███╗ ██╗███████╗███████╗ | |
| ██╔══██╗██║██╔════╝██╔══██╗██╔══██╗██╔═══██╗████╗ ██║██╔════╝██╔════╝ | |
| ██████╔╝██║██║ ███████║██████╔╝██║ ██║██╔██╗ ██║█████╗ ███████╗ | |
| ██╔═══╝ ██║██║ ██╔══██║██╔══██╗██║ ██║██║╚██╗██║██╔══╝ ╚════██║ | |
| ██║ ██║╚██████╗██║ ██║██║ ██║╚██████╔╝██║ ╚████║███████╗███████║ | |
| ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝╚══════╝╚══════╝</pre> | |
| <span class="ascii-subtitle">OCR/HTR Benchmark Platform</span> | |
| </div> | |
| <div id="header"> | |
| <h1 data-i18n="app_title">Picarones <span class="version" id="app-version"></span></h1> | |
| <nav id="nav"> | |
| <button class="nav-btn active" onclick="showView('benchmark')" data-i18n="nav_benchmark">Benchmark</button> | |
| <button class="nav-btn" onclick="showView('reports')" data-i18n="nav_reports">Rapports</button> | |
| <button class="nav-btn" onclick="showView('engines')" data-i18n="nav_engines">Moteurs</button> | |
| <button class="nav-btn" onclick="showView('import')" data-i18n="nav_import">Import</button> | |
| </nav> | |
| <button id="lang-btn" onclick="toggleLang()">EN</button> | |
| </div> | |
| <div id="main"> | |
| <!-- ===== VUE BENCHMARK ===== --> | |
| <div id="view-benchmark" class="view active"> | |
| <div class="card"> | |
| <h2 data-i18n="bench_corpus_title">1. Corpus</h2> | |
| <!-- Tab bar --> | |
| <div class="corpus-tabs"> | |
| <button class="corpus-tab active" id="ctab-browse" onclick="switchCorpusTab('browse')" data-i18n="corpus_tab_browse">📁 Parcourir</button> | |
| <button class="corpus-tab" id="ctab-upload" onclick="switchCorpusTab('upload')" data-i18n="corpus_tab_upload">⬆ Uploader</button> | |
| </div> | |
| <!-- Browse tab --> | |
| <div id="corpus-tab-browse"> | |
| <div class="form-group"> | |
| <label data-i18n="bench_corpus_label">Chemin vers le dossier corpus (paires image/.gt.txt)</label> | |
| <div class="path-input-row"> | |
| <input type="text" id="corpus-path" placeholder="./corpus/" value="" /> | |
| <button class="btn btn-secondary btn-sm" onclick="openFileBrowser()" data-i18n="bench_browse">Parcourir</button> | |
| </div> | |
| </div> | |
| <div id="file-browser-container" style="display:none; margin-top:10px;"> | |
| <div class="fb-path" id="fb-current-path">.</div> | |
| <div id="file-browser"></div> | |
| </div> | |
| </div> | |
| <!-- Upload tab --> | |
| <div id="corpus-tab-upload" style="display:none;"> | |
| <div class="upload-mode-row"> | |
| <label><input type="radio" name="upload-mode" value="zip" checked onchange="onUploadModeChange()"> 🗜 <span data-i18n="upload_zip_mode">Archive ZIP</span></label> | |
| <label><input type="radio" name="upload-mode" value="files" onchange="onUploadModeChange()"> 🖼 <span data-i18n="upload_files_mode">Fichiers individuels</span></label> | |
| </div> | |
| <!-- Drop zone --> | |
| <div id="upload-dropzone" class="upload-dropzone" | |
| onclick="document.getElementById('upload-file-input').click()" | |
| ondragover="event.preventDefault(); this.classList.add('dragover')" | |
| ondragleave="this.classList.remove('dragover')" | |
| ondrop="onDropFiles(event)"> | |
| <span class="upload-icon">⬆</span> | |
| <span id="upload-dropzone-text" data-i18n="upload_drop_zip">Glissez un .zip ici ou cliquez pour sélectionner</span> | |
| <input type="file" id="upload-file-input" style="display:none" accept=".zip" onchange="onFileInputChange(event)" /> | |
| </div> | |
| <!-- Progress --> | |
| <div id="upload-progress-container" style="display:none; margin-top:10px;"> | |
| <div class="progress-bar-outer"> | |
| <div class="progress-bar-inner" id="upload-progress-bar" style="width:0%; transition:width 0.2s;"></div> | |
| </div> | |
| <div id="upload-progress-text" style="font-size:12px; color:var(--text-muted); margin-top:4px;"></div> | |
| </div> | |
| <!-- Preview after upload --> | |
| <div id="upload-preview" style="margin-top:10px;"></div> | |
| <!-- Previously uploaded corpora --> | |
| <div id="uploads-list" style="margin-top:14px;"></div> | |
| </div> | |
| <div id="corpus-info" style="margin-top:8px; font-size:12px; color: var(--text-muted);"></div> | |
| </div> | |
| <!-- ── Section 1 : Moteurs OCR ─────────────────────────────────── --> | |
| <div class="card"> | |
| <h2 data-i18n="bench_ocr_title">2. Moteurs OCR</h2> | |
| <div id="ocr-engines-status-list"> | |
| <div style="color: var(--text-muted); font-size: 12px;"><span class="spinner"></span> Chargement…</div> | |
| </div> | |
| </div> | |
| <!-- ── Section 2 : Modèles LLM ──────────────────────────────────── --> | |
| <div class="card"> | |
| <h2 data-i18n="bench_llm_title">3. Modèles LLM</h2> | |
| <div id="llm-status-list"> | |
| <div style="color: var(--text-muted); font-size: 12px;"><span class="spinner"></span> Chargement…</div> | |
| </div> | |
| </div> | |
| <!-- ── Section 3 : Composition des concurrents ──────────────────── --> | |
| <div class="card"> | |
| <h2 data-i18n="bench_compose_title">4. Concurrents à benchmarker</h2> | |
| <div class="mode-toggle"> | |
| <label><input type="radio" name="compose-mode" value="ocr" checked onchange="onComposeModeChange()"> 🔍 <span data-i18n="compose_ocr_only">OCR seul</span></label> | |
| <label><input type="radio" name="compose-mode" value="pipeline" onchange="onComposeModeChange()"> ⛓ <span data-i18n="compose_pipeline">Pipeline OCR+LLM</span></label> | |
| <label><input type="radio" name="compose-mode" value="postcorrection" onchange="onComposeModeChange()"> 📝 <span data-i18n="compose_postcorrection">Post-correction (corpus OCR)</span></label> | |
| </div> | |
| <div id="corpus-ocr-notice" style="display:none; margin:8px 0; padding:8px 12px; background:var(--bg-highlight,#f0fdf4); border-radius:6px; font-size:12px; color:var(--success,#16a34a);"> | |
| 📝 <span data-i18n="corpus_has_ocr">Ce corpus contient des fichiers OCR pré-calculés (.ocr.txt) — post-correction disponible.</span> | |
| </div> | |
| <div id="compose-ocr-section" class="composer-row"> | |
| <div class="form-group"> | |
| <label data-i18n="compose_ocr_engine">Moteur OCR</label> | |
| <select id="compose-ocr-engine" onchange="onComposeOCRChange()"> | |
| <option value="tesseract">Tesseract</option> | |
| <option value="mistral_ocr">Mistral OCR</option> | |
| <option value="google_vision">Google Vision</option> | |
| <option value="azure_doc_intel">Azure Doc Intel</option> | |
| </select> | |
| </div> | |
| <div class="form-group" style="flex:1;"> | |
| <label data-i18n="compose_ocr_model">Modèle / Langue <span class="spinner" id="sp-ocr-model" style="display:none"></span></label> | |
| <select id="compose-ocr-model"></select> | |
| </div> | |
| </div> | |
| <div id="compose-pipeline-section" style="display:none;"> | |
| <div class="composer-row"> | |
| <div class="form-group"> | |
| <label data-i18n="compose_llm_provider">Provider LLM</label> | |
| <select id="compose-llm-provider" onchange="onComposeLLMChange()"> | |
| <option value="openai">OpenAI</option> | |
| <option value="anthropic">Anthropic</option> | |
| <option value="mistral">Mistral LLM</option> | |
| <option value="ollama">Ollama</option> | |
| </select> | |
| </div> | |
| <div class="form-group" style="flex:1;"> | |
| <label data-i18n="compose_llm_model">Modèle LLM <span class="spinner" id="sp-llm-model" style="display:none"></span></label> | |
| <select id="compose-llm-model"></select> | |
| </div> | |
| </div> | |
| <div class="composer-row"> | |
| <div class="form-group"> | |
| <label data-i18n="compose_mode">Mode pipeline</label> | |
| <select id="compose-pipeline-mode" onchange="onComposePipelineModeChange()"> | |
| <option value="text_only" data-i18n="mode_text_only">Post-correction texte</option> | |
| <option value="text_and_image" data-i18n="mode_text_image">Post-correction image+texte</option> | |
| <option value="zero_shot" data-i18n="mode_zero_shot">Zero-shot</option> | |
| </select> | |
| </div> | |
| <div class="form-group" style="flex:1;"> | |
| <label data-i18n="compose_prompt">Prompt <span class="spinner" id="sp-prompt" style="display:none"></span></label> | |
| <select id="compose-prompt"></select> | |
| </div> | |
| </div> | |
| </div> | |
| <div style="display:flex; gap:10px; align-items:center; margin-top:10px;"> | |
| <button class="btn btn-primary btn-sm" onclick="addCompetitor()" data-i18n="compose_add">+ Ajouter</button> | |
| <span id="compose-error" style="color: var(--danger); font-size:12px;"></span> | |
| </div> | |
| <div id="competitors-list" style="margin-top:14px;"> | |
| <div style="color: var(--text-muted); font-size:12px;" data-i18n="compose_empty">Aucun concurrent ajouté.</div> | |
| </div> | |
| </div> | |
| <!-- ── 5. Options ─────────────────────────────────────────────────── --> | |
| <div class="card"> | |
| <h2 data-i18n="bench_options_title">5. Options</h2> | |
| <div class="form-row"> | |
| <div class="form-group"> | |
| <label data-i18n="bench_norm_label">Profil de normalisation</label> | |
| <select id="norm-profile"> | |
| <option value="nfc">NFC (standard)</option> | |
| </select> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="bench_char_exclude_label">Caractères à ignorer <span style="color:var(--text-muted);font-size:.75rem">(séparés par virgule, ex : ', -, –)</span></label> | |
| <input type="text" id="char-exclude" placeholder="ex: ', -, –, ." style="font-family:monospace" /> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="bench_output_label">Dossier de sortie</label> | |
| <input type="text" id="output-dir" value="./rapports/" /> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="bench_name_label">Nom du rapport (optionnel)</label> | |
| <input type="text" id="report-name" placeholder="rapport_2024_01_15" /> | |
| </div> | |
| </div> | |
| </div> | |
| <div style="display:flex; gap:10px; align-items:center; margin-bottom:16px;"> | |
| <button class="btn btn-primary" id="start-btn" onclick="startBenchmark()" data-i18n="bench_start">▶ Lancer le benchmark</button> | |
| <button class="btn btn-secondary" id="cancel-btn" style="display:none;" onclick="cancelBenchmark()" data-i18n="bench_cancel">✕ Annuler</button> | |
| <span id="bench-status-text" style="font-size:12px; color: var(--text-muted);"></span> | |
| </div> | |
| <div id="bench-progress-section" style="display:none;"> | |
| <div class="card"> | |
| <h2 data-i18n="bench_progress_title">Progression</h2> | |
| <div id="engine-progress-list"></div> | |
| <div style="margin-top: 12px;"> | |
| <label style="font-size:12px; color: var(--text-muted); display:block; margin-bottom:4px;" data-i18n="bench_log">Journal</label> | |
| <div class="log-box" id="bench-log"></div> | |
| </div> | |
| </div> | |
| </div> | |
| <div id="bench-result-section" style="display:none;"> | |
| <div class="card"> | |
| <h2 data-i18n="bench_result_title">Résultats</h2> | |
| <div id="bench-ranking-table"></div> | |
| <div style="margin-top:12px;"> | |
| <a id="bench-report-link" href="#" class="btn btn-primary" target="_blank" data-i18n="bench_open_report">Ouvrir le rapport</a> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===== VUE RAPPORTS ===== --> | |
| <div id="view-reports" class="view"> | |
| <div class="card"> | |
| <h2 data-i18n="reports_title">Rapports générés</h2> | |
| <div class="form-row" style="margin-bottom:12px;"> | |
| <div class="form-group" style="max-width:320px;"> | |
| <label data-i18n="reports_dir_label">Dossier de rapports</label> | |
| <div class="path-input-row"> | |
| <input type="text" id="reports-dir" value="." /> | |
| <button class="btn btn-secondary btn-sm" onclick="loadReports()" data-i18n="reports_refresh">Rafraîchir</button> | |
| </div> | |
| </div> | |
| </div> | |
| <div id="reports-list"> | |
| <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===== VUE MOTEURS ===== --> | |
| <div id="view-engines" class="view"> | |
| <div class="card"> | |
| <h2 data-i18n="engines_ocr_title">Moteurs OCR</h2> | |
| <div id="engines-ocr-list"> | |
| <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <h2 data-i18n="engines_llm_title">LLMs disponibles</h2> | |
| <div id="engines-llm-list"> | |
| <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- ===== VUE IMPORT ===== --> | |
| <div id="view-import" class="view"> | |
| <!-- HTR-United --> | |
| <div class="card"> | |
| <h2 data-i18n="import_htr_title">Import HTR-United</h2> | |
| <p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_htr_desc"> | |
| Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux. | |
| </p> | |
| <div class="form-row"> | |
| <div class="form-group" style="flex:2;"> | |
| <label data-i18n="import_search_label">Recherche</label> | |
| <input type="text" id="htr-search" placeholder="médiéval, latin, manuscrits…" /> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="import_lang_filter">Langue</label> | |
| <select id="htr-lang-filter"> | |
| <option value="" data-i18n="all">Toutes</option> | |
| </select> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="import_script_filter">Type d'écriture</label> | |
| <select id="htr-script-filter"> | |
| <option value="" data-i18n="all">Tous</option> | |
| </select> | |
| </div> | |
| <div class="form-group" style="justify-content: flex-end; padding-top: 18px;"> | |
| <button class="btn btn-primary btn-sm" onclick="searchHTRUnited()" data-i18n="search">Rechercher</button> | |
| </div> | |
| </div> | |
| <div id="htr-results" class="ds-grid"></div> | |
| </div> | |
| <!-- HuggingFace --> | |
| <div class="card"> | |
| <h2 data-i18n="import_hf_title">Import HuggingFace Datasets</h2> | |
| <p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_hf_desc"> | |
| Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…). | |
| </p> | |
| <div class="form-row"> | |
| <div class="form-group" style="flex:2;"> | |
| <label data-i18n="import_search_label">Recherche</label> | |
| <input type="text" id="hf-search" placeholder="medieval OCR, IAM, RIMES…" /> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="import_lang_filter">Langue</label> | |
| <input type="text" id="hf-lang-filter" placeholder="French, Latin…" /> | |
| </div> | |
| <div class="form-group"> | |
| <label data-i18n="import_tag_filter">Tags</label> | |
| <input type="text" id="hf-tags" placeholder="ocr, htr, historical…" /> | |
| </div> | |
| <div class="form-group" style="justify-content: flex-end; padding-top: 18px;"> | |
| <button class="btn btn-primary btn-sm" onclick="searchHuggingFace()" data-i18n="search">Rechercher</button> | |
| </div> | |
| </div> | |
| <div id="hf-results" class="ds-grid"></div> | |
| </div> | |
| </div><!-- end view-import --> | |
| </div><!-- end #main --> | |
| <!-- Import modal --> | |
| <div id="import-modal" style="display:none; position:fixed; inset:0; background:rgba(0,0,0,0.4); z-index:200; align-items:center; justify-content:center;"> | |
| <div class="card" style="width: 420px; max-width: 95vw;"> | |
| <h2 id="import-modal-title" data-i18n="import_modal_title">Importer le corpus</h2> | |
| <input type="hidden" id="import-modal-type" /> | |
| <input type="hidden" id="import-modal-id" /> | |
| <div class="form-group" style="margin-bottom:12px;"> | |
| <label data-i18n="import_output_dir">Dossier de destination</label> | |
| <input type="text" id="import-modal-output" value="./corpus/" /> | |
| </div> | |
| <div class="form-group" style="margin-bottom:16px;"> | |
| <label data-i18n="import_max_samples">Nombre max de documents</label> | |
| <input type="number" id="import-modal-max" value="100" min="1" max="10000" /> | |
| </div> | |
| <div id="import-modal-status" style="margin-bottom:12px;"></div> | |
| <div style="display:flex; gap:8px;"> | |
| <button class="btn btn-primary" onclick="confirmImport()" data-i18n="import_confirm">Importer</button> | |
| <button class="btn btn-secondary" onclick="closeImportModal()" data-i18n="cancel">Annuler</button> | |
| </div> | |
| </div> | |
| </div> | |
| <script> | |
| // ─── i18n ──────────────────────────────────────────────────────────────────── | |
// UI string table, keyed by language code, then by the key used in
// data-i18n attributes and in t() calls. Both languages carry the same keys.
const T = {
  fr: {
    // Generic / navigation
    app_title: "Picarones",
    nav_benchmark: "Benchmark",
    nav_reports: "Rapports",
    nav_engines: "Moteurs",
    nav_import: "Import",
    loading: "Chargement…",
    search: "Rechercher",
    all: "Tous",
    cancel: "Annuler",
    // Corpus selection and upload
    bench_corpus_title: "1. Corpus",
    bench_corpus_label: "Chemin vers le dossier corpus (paires image / .gt.txt)",
    bench_browse: "Parcourir",
    corpus_tab_browse: "📁 Parcourir",
    corpus_tab_upload: "⬆ Uploader",
    upload_zip_mode: "Archive ZIP",
    upload_files_mode: "Fichiers individuels",
    upload_drop_zip: "Glissez un .zip ici ou cliquez pour sélectionner",
    upload_drop_files: "Glissez des images + .gt.txt ou cliquez pour sélectionner",
    upload_uploading: "Upload en cours…",
    upload_success: "Corpus chargé avec succès",
    upload_no_corpus: "Aucun corpus uploadé.",
    upload_select: "Utiliser ce corpus",
    upload_delete: "Supprimer",
    upload_pairs: "paires",
    upload_missing_gt: "GT manquant(s)",
    // Section titles
    bench_engines_title: "2. Moteurs et pipelines",
    bench_ocr_title: "2. Moteurs OCR",
    bench_llm_title: "3. Modèles LLM",
    bench_compose_title: "4. Concurrents à benchmarker",
    bench_options_title: "5. Options",
    // Competitor composer
    compose_ocr_only: "OCR seul",
    compose_pipeline: "Pipeline OCR+LLM",
    compose_postcorrection: "Post-correction (corpus OCR)",
    corpus_has_ocr: "Ce corpus contient des fichiers OCR pré-calculés (.ocr.txt) — post-correction disponible.",
    corpus_no_ocr_warn: "Ce corpus ne contient pas de fichiers .ocr.txt — uploadez un corpus triplet pour la post-correction.",
    compose_ocr_engine: "Moteur OCR",
    compose_ocr_model: "Modèle / Langue",
    compose_llm_provider: "Provider LLM",
    compose_llm_model: "Modèle LLM",
    compose_mode: "Mode pipeline",
    compose_prompt: "Prompt",
    compose_add: "+ Ajouter",
    compose_empty: "Aucun concurrent ajouté.",
    mode_text_only: "Post-correction texte",
    mode_text_image: "Post-correction image+texte",
    mode_zero_shot: "Zero-shot",
    // Benchmark options, progress and results
    bench_norm_label: "Profil de normalisation",
    bench_lang_label: "Langue (Tesseract)",
    bench_output_label: "Dossier de sortie",
    bench_name_label: "Nom du rapport (optionnel)",
    bench_start: "▶ Lancer le benchmark",
    bench_cancel: "✕ Annuler",
    bench_progress_title: "Progression",
    bench_log: "Journal",
    bench_result_title: "Résultats",
    bench_open_report: "Ouvrir le rapport",
    // Reports view
    reports_title: "Rapports générés",
    reports_dir_label: "Dossier de rapports",
    reports_refresh: "Rafraîchir",
    // Engines view
    engines_ocr_title: "Moteurs OCR",
    engines_llm_title: "LLMs disponibles",
    // Import view
    import_htr_title: "Import HTR-United",
    import_htr_desc: "Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux.",
    import_hf_title: "Import HuggingFace Datasets",
    import_hf_desc: "Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
    import_search_label: "Recherche",
    import_lang_filter: "Langue",
    import_script_filter: "Type d'écriture",
    import_tag_filter: "Tags",
    import_modal_title: "Importer le corpus",
    import_output_dir: "Dossier de destination",
    import_max_samples: "Nombre max de documents",
    import_confirm: "Importer",
    // Status labels
    available: "disponible",
    not_installed: "non installé",
    configured: "configuré",
    missing_key: "clé manquante",
    running: "actif",
    not_running: "inactif",
    no_reports: "Aucun rapport trouvé.",
    lines: "lignes",
    centuries: "siècles",
  },
  en: {
    // Generic / navigation
    app_title: "Picarones",
    nav_benchmark: "Benchmark",
    nav_reports: "Reports",
    nav_engines: "Engines",
    nav_import: "Import",
    loading: "Loading…",
    search: "Search",
    all: "All",
    cancel: "Cancel",
    // Corpus selection and upload
    bench_corpus_title: "1. Corpus",
    bench_corpus_label: "Path to corpus directory (image / .gt.txt pairs)",
    bench_browse: "Browse",
    corpus_tab_browse: "📁 Browse",
    corpus_tab_upload: "⬆ Upload",
    upload_zip_mode: "ZIP archive",
    upload_files_mode: "Individual files",
    upload_drop_zip: "Drop a .zip here or click to select",
    upload_drop_files: "Drop images + .gt.txt files or click to select",
    upload_uploading: "Uploading…",
    upload_success: "Corpus loaded successfully",
    upload_no_corpus: "No corpus uploaded.",
    upload_select: "Use this corpus",
    upload_delete: "Delete",
    upload_pairs: "pairs",
    upload_missing_gt: "missing GT",
    // Section titles
    bench_engines_title: "2. Engines & pipelines",
    bench_ocr_title: "2. OCR Engines",
    bench_llm_title: "3. LLM Models",
    bench_compose_title: "4. Competitors",
    bench_options_title: "5. Options",
    // Competitor composer
    compose_ocr_only: "OCR only",
    compose_pipeline: "OCR+LLM Pipeline",
    compose_postcorrection: "Post-correction (corpus OCR)",
    corpus_has_ocr: "This corpus contains pre-computed OCR files (.ocr.txt) — post-correction available.",
    corpus_no_ocr_warn: "This corpus has no .ocr.txt files — upload a triplet corpus for post-correction.",
    compose_ocr_engine: "OCR Engine",
    compose_ocr_model: "Model / Language",
    compose_llm_provider: "LLM Provider",
    compose_llm_model: "LLM Model",
    compose_mode: "Pipeline mode",
    compose_prompt: "Prompt",
    compose_add: "+ Add",
    compose_empty: "No competitors added.",
    mode_text_only: "Text post-correction",
    mode_text_image: "Image+text post-correction",
    mode_zero_shot: "Zero-shot",
    // Benchmark options, progress and results
    bench_norm_label: "Normalization profile",
    bench_lang_label: "Language (Tesseract)",
    bench_output_label: "Output directory",
    bench_name_label: "Report name (optional)",
    bench_start: "▶ Start benchmark",
    bench_cancel: "✕ Cancel",
    bench_progress_title: "Progress",
    bench_log: "Log",
    bench_result_title: "Results",
    bench_open_report: "Open report",
    // Reports view
    reports_title: "Generated reports",
    reports_dir_label: "Reports directory",
    reports_refresh: "Refresh",
    // Engines view
    engines_ocr_title: "OCR Engines",
    engines_llm_title: "Available LLMs",
    // Import view
    import_htr_title: "Import from HTR-United",
    import_htr_desc: "Community catalogue of HTR/OCR datasets for heritage documents.",
    import_hf_title: "Import from HuggingFace Datasets",
    import_hf_desc: "Public OCR/HTR datasets from HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
    import_search_label: "Search",
    import_lang_filter: "Language",
    import_script_filter: "Script type",
    import_tag_filter: "Tags",
    import_modal_title: "Import corpus",
    import_output_dir: "Output directory",
    import_max_samples: "Max documents",
    import_confirm: "Import",
    // Status labels
    available: "available",
    not_installed: "not installed",
    configured: "configured",
    missing_key: "key missing",
    running: "running",
    not_running: "not running",
    no_reports: "No reports found.",
    lines: "lines",
    centuries: "centuries",
  },
};

// Currently active UI language; flipped between "fr" and "en" by toggleLang().
let lang = "fr";
// Return the UI string for `key` in the active language, or the key itself
// when the table has no (non-empty) entry for it.
function t(key) {
  const translated = T[lang][key];
  return translated ? translated : key;
}
// Switch the UI between French and English and re-translate every element
// carrying a data-i18n attribute.
//
// Several translated labels contain nested elements (for example the model
// spinners inside the composer labels). Overwriting textContent on those would
// delete the nested nodes, and later lookups of the spinners by id would
// return null. For such elements only the label's own text node is replaced.
function toggleLang() {
  lang = lang === "fr" ? "en" : "fr";
  // The button advertises the language you would switch TO.
  document.getElementById("lang-btn").textContent = lang === "fr" ? "EN" : "FR";
  document.querySelectorAll("[data-i18n]").forEach(el => {
    const translated = T[lang][el.getAttribute("data-i18n")];
    if (!translated) return; // unknown key: leave the markup as it is

    // nodeType 3 === text node; pick the first one that carries visible text.
    const ownText = el.children.length === 0
      ? null
      : Array.from(el.childNodes).find(n => n.nodeType === 3 && n.nodeValue.trim() !== "");

    if (!ownText) {
      // Plain element (or no direct text to swap): same behaviour as before.
      el.textContent = translated;
      return;
    }
    // Keep a separating space before the nested element when there was one.
    const raw = ownText.nodeValue;
    ownText.nodeValue = translated + (raw !== raw.trimEnd() ? " " : "");
  });
}
| // ─── Navigation ────────────────────────────────────────────────────────────── | |
// Activate the view called `name` ("benchmark", "reports", "engines" or
// "import"), highlight the matching navigation button and trigger the data
// load that view needs. Buttons are matched by position in the nav bar.
function showView(name) {
  for (const panel of document.querySelectorAll(".view")) {
    panel.classList.remove("active");
  }
  const navButtons = document.querySelectorAll(".nav-btn");
  for (const button of navButtons) {
    button.classList.remove("active");
  }

  const target = document.getElementById("view-" + name);
  if (target) target.classList.add("active");

  const viewOrder = ["benchmark", "reports", "engines", "import"];
  const activeButton = navButtons[viewOrder.indexOf(name)];
  if (activeButton) activeButton.classList.add("active");

  switch (name) {
    case "reports":
      loadReports();
      break;
    case "engines":
      loadEngines();
      break;
    case "import":
      searchHTRUnited();
      searchHuggingFace();
      break;
  }
}
| // ─── Status / version ──────────────────────────────────────────────────────── | |
// Fetch /api/status and show the reported version in the header.
// Best effort: any failure is ignored and the header is left unchanged.
async function loadStatus() {
  try {
    const response = await fetch("/api/status");
    const payload = await response.json();
    const versionEl = document.getElementById("app-version");
    versionEl.textContent = "v" + payload.version;
  } catch (ignored) {
    // Deliberately silent.
  }
}
| // ─── Models cache & fetching ───────────────────────────────────────────────── | |
// Shared state for the benchmark page.
let _modelsCache = {},          // model lists already fetched, keyed by provider (and capability)
    _enginesData = null,        // last /api/engines payload received
    _competitors = [],          // competitors composed by the user
    _refreshIntervalId = null,  // handle of the engines auto-refresh timer
    _pendingOCREngine = null;   // guard against stale responses (race condition)
// Return the list of model ids for `provider`, optionally filtered by
// `capability`, using _modelsCache to avoid repeated requests.
//
// Accepts both response shapes: a flat list of strings, or a list of objects
// carrying an `id`. Rejects on a non-2xx HTTP status so that callers' error
// handlers run and an error payload is never cached as an empty model list.
async function fetchModels(provider, capability) {
  const cacheKey = capability ? `${provider}__${capability}` : provider;
  if (_modelsCache[cacheKey]) return _modelsCache[cacheKey];

  // Encode both parts: they end up in the URL path and query string.
  const base = `/api/models/${encodeURIComponent(provider)}`;
  const url = capability ? `${base}?capability=${encodeURIComponent(capability)}` : base;

  const r = await fetch(url);
  if (!r.ok) throw new Error(`HTTP ${r.status}`);
  const d = await r.json();

  // Support both new format (objects with id+capabilities) and old format (flat strings)
  let models = d.model_ids || d.models || [];
  if (models.length > 0 && typeof models[0] === "object") {
    models = models.map(m => m.id || m);
  }
  _modelsCache[cacheKey] = models;
  return models;
}
// Fill the <select> with id `selectId` with one option per model and hide the
// spinner with id `spinnerId` (when given). `models` may contain strings or
// objects with an `id`. An empty list yields a single placeholder option.
//
// Options are built through the DOM rather than an HTML string so that a
// model id containing quotes or angle brackets cannot break the markup.
function populateSelect(selectId, models, spinnerId) {
  const sel = document.getElementById(selectId);
  if (spinnerId) {
    const sp = document.getElementById(spinnerId);
    if (sp) sp.style.display = "none";
  }
  if (!sel) return;

  // Handle both string arrays and object arrays (null entries pass through).
  const items = (models || []).map(m => (m && typeof m === "object") ? (m.id || m) : m);

  const addOption = (value, label) => {
    const opt = document.createElement("option");
    opt.value = value;
    opt.textContent = label;
    sel.appendChild(opt);
  };

  sel.innerHTML = "";
  if (items.length === 0) {
    addOption("", "— aucun modèle —");
    return;
  }
  for (const item of items) {
    const text = String(item);
    addOption(text, text);
  }
}
| // ─── Benchmark sections (OCR + LLM status + composer init) ─────────────────── | |
// Load /api/engines once and render the OCR-engine and LLM status sections.
// On any failure (network error, non-2xx status, rendering error) an error
// line is shown in the OCR engines list.
async function loadBenchmarkSections() {
  try {
    const r = await fetch("/api/engines");
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    _enginesData = d;
    // Both renderers are started before awaiting, so one failing does not
    // prevent the other from running; awaiting routes rejections to catch.
    await Promise.all([
      renderOCREnginesSection(d.engines),
      renderLLMSection(d.llms),
    ]);
  } catch (e) {
    // textContent keeps the error message from being interpreted as markup.
    const msg = document.createElement("div");
    msg.style.cssText = "color:var(--danger);font-size:12px;";
    msg.textContent = `Erreur : ${e.message}`;
    const box = document.getElementById("ocr-engines-status-list");
    box.innerHTML = "";
    box.appendChild(msg);
  }
}
// Build one status row for an OCR engine or LLM provider.
//
// `eng` is an entry of the /api/engines payload; the fields read here are
// available, status, version, key_env and label. `msId` becomes the id of the
// cell that later receives the model preview; it starts with a spinner when
// the provider is available.
//
// Status labels come from the shared translation table via t() instead of
// being duplicated here, and all server-provided strings are inserted as text.
function _makeProviderRow(eng, msId) {
  const dotCls = eng.available
    ? "status-ok"
    : (eng.status === "not_running" ? "status-warn" : "status-err");

  const row = document.createElement("div");
  row.className = "provider-row";

  // Coloured dot + provider name.
  const labelCell = document.createElement("div");
  labelCell.className = "provider-label";
  const dot = document.createElement("span");
  dot.className = `engine-status ${dotCls}`;
  const title = document.createElement("strong");
  title.textContent = eng.label;
  labelCell.append(dot, title);

  // Version when available, otherwise the reason it is not usable.
  const statusCell = document.createElement("div");
  statusCell.className = "provider-status";
  if (eng.available) {
    statusCell.textContent = eng.version ? eng.version : t("available");
  } else if (eng.status === "missing_key") {
    if (eng.key_env) {
      // Show the name of the environment variable that must be set.
      const code = document.createElement("code");
      code.style.cssText = "font-size:11px;color:var(--warning)";
      code.textContent = eng.key_env;
      statusCell.appendChild(code);
    } else {
      statusCell.textContent = t("missing_key");
    }
  } else if (eng.status === "not_running") {
    statusCell.textContent = t("not_running");
  } else {
    statusCell.textContent = t("not_installed");
  }

  // Placeholder for the model preview.
  const modelCell = document.createElement("div");
  modelCell.className = "provider-model-select";
  modelCell.id = msId;
  if (eng.available) {
    const spinner = document.createElement("span");
    spinner.className = "spinner";
    modelCell.appendChild(spinner);
  }

  row.append(labelCell, statusCell, modelCell);
  return row;
}
// Render one status row per OCR engine and, for available engines, fill in a
// preview of up to five model names once they have been fetched.
// A missing `engines` list renders an empty section instead of throwing.
async function renderOCREnginesSection(engines) {
  const container = document.getElementById("ocr-engines-status-list");
  container.innerHTML = "";

  // Replace the content of `cell` with a single styled <span> holding plain text.
  const showText = (cell, css, text) => {
    const span = document.createElement("span");
    span.style.cssText = css;
    span.textContent = text; // model names are never interpreted as markup
    cell.innerHTML = "";
    cell.appendChild(span);
  };

  for (const eng of engines || []) {
    const msId = `ms-ocr-${eng.id}`;
    container.appendChild(_makeProviderRow(eng, msId));
    if (!eng.available) continue;

    // Not awaited: rows appear immediately, previews fill in as they arrive.
    fetchModels(eng.id).then(models => {
      const div = document.getElementById(msId);
      if (!div) return; // the row was re-rendered in the meantime
      if (models.length === 0) {
        showText(div, "color:var(--text-muted);font-size:11px;", "—");
      } else {
        const extra = models.length > 5 ? ` +${models.length - 5}` : "";
        showText(div, "font-size:12px;", models.slice(0, 5).join(", ") + extra);
      }
    }).catch(() => {
      const div = document.getElementById(msId);
      if (div) showText(div, "color:var(--danger);font-size:11px;", "Erreur API");
    });
  }
}
// Render one status row per LLM provider and, for available providers, fill in
// a preview of up to three model names once they have been fetched.
// A missing `llms` list renders an empty section instead of throwing.
async function renderLLMSection(llms) {
  const container = document.getElementById("llm-status-list");
  container.innerHTML = "";

  // Replace the content of `cell` with a single styled <span> holding plain text.
  const showText = (cell, css, text) => {
    const span = document.createElement("span");
    span.style.cssText = css;
    span.textContent = text; // model names are never interpreted as markup
    cell.innerHTML = "";
    cell.appendChild(span);
  };

  for (const llm of llms || []) {
    const msId = `ms-llm-${llm.id}`;
    container.appendChild(_makeProviderRow(llm, msId));
    if (!llm.available) continue;

    // Not awaited: rows appear immediately, previews fill in as they arrive.
    fetchModels(llm.id).then(models => {
      const div = document.getElementById(msId);
      if (!div) return; // the row was re-rendered in the meantime
      if (models.length === 0) {
        showText(div, "color:var(--text-muted);font-size:11px;", "—");
      } else {
        const extra = models.length > 3 ? ` +${models.length - 3}` : "";
        showText(div, "font-size:12px;", models.slice(0, 3).join(", ") + extra);
      }
    }).catch(() => {
      const div = document.getElementById(msId);
      if (div) showText(div, "color:var(--danger);font-size:11px;", "Erreur API");
    });
  }
}
// Poll /api/engines every 10 seconds and re-render the status sections when
// the payload changed. Restarting replaces any previous timer.
// Polling is best effort: network errors and non-2xx responses are skipped so
// that a transient server error never replaces the last good engine data.
function startAutoRefresh() {
  if (_refreshIntervalId) clearInterval(_refreshIntervalId);
  _refreshIntervalId = setInterval(async () => {
    try {
      const r = await fetch("/api/engines");
      if (!r.ok) return; // keep showing the last good state
      const d = await r.json();
      if (!_enginesData || JSON.stringify(d) !== JSON.stringify(_enginesData)) {
        // Availability changed, so cached model lists may be outdated too.
        _modelsCache = {};
        _enginesData = d;
        renderOCREnginesSection(d.engines);
        renderLLMSection(d.llms);
      }
    } catch (e) {
      // Deliberately silent; the next tick retries.
    }
  }, 10000);
}
| // ─── Competitor composer ────────────────────────────────────────────────────── | |
// React to a change of the composer's OCR engine: refill the model/language
// select for that engine. Uses _pendingOCREngine to drop responses that
// arrive after the user has already picked another engine.
async function onComposeOCRChange() {
  const chosen = document.getElementById("compose-ocr-engine").value;
  _pendingOCREngine = chosen; // remember which request is the current one
  const spinner = document.getElementById("sp-ocr-model");

  // Google Vision and Azure have fixed model lists: no API call needed.
  const fixedLists = new Map([
    ["google_vision", ["document_text_detection", "text_detection"]],
    ["azure_doc_intel", ["prebuilt-document", "prebuilt-read"]],
  ]);
  const fixed = fixedLists.get(chosen);
  if (fixed) {
    spinner.style.display = "none";
    populateSelect("compose-ocr-model", fixed, null);
    return;
  }

  // Every other engine gets its list from the models API.
  spinner.style.display = "inline-block";
  const isStale = () => _pendingOCREngine !== chosen;
  try {
    const fetched = await fetchModels(chosen);
    if (isStale()) return; // outdated response, drop it
    populateSelect("compose-ocr-model", fetched, "sp-ocr-model");
  } catch (err) {
    if (isStale()) return;
    spinner.style.display = "none";
    document.getElementById("compose-ocr-model").innerHTML = '<option value="">Erreur</option>';
  }
}
// React to a change of the LLM provider: reload the LLM model select,
// restricted to vision-capable models when the compose mode uses an LLM and
// the pipeline mode works on images.
async function onComposeLLMChange() {
  const provider = document.getElementById("compose-llm-provider").value;
  const composeMode = document.querySelector("input[name=compose-mode]:checked").value;
  const pipelineMode = document.getElementById("compose-pipeline-mode").value;

  const usesLLM = ["postcorrection", "pipeline"].includes(composeMode);
  const imageBased = ["text_and_image", "zero_shot"].includes(pipelineMode);

  let capability = "";
  if (usesLLM && imageBased) capability = "vision";
  _loadLLMModelsWithCapability(provider, capability);
}
// Show or hide the OCR and LLM parts of the composer for the selected compose
// mode ("ocr", "pipeline" or "postcorrection") and reload the LLM model list
// for the two modes that use one. Any other value changes nothing.
function onComposeModeChange() {
  const selectedMode = document.querySelector("input[name=compose-mode]:checked").value;
  const ocrBox = document.getElementById("compose-ocr-section");
  const llmBox = document.getElementById("compose-pipeline-section");

  switch (selectedMode) {
    case "ocr":
      ocrBox.style.display = "flex";
      llmBox.style.display = "none";
      break;
    case "pipeline":
      ocrBox.style.display = "flex";
      llmBox.style.display = "block";
      onComposeLLMChange();
      break;
    case "postcorrection":
      // The OCR text comes from the corpus, so no engine has to be chosen.
      ocrBox.style.display = "none";
      llmBox.style.display = "block";
      onComposePipelineModeChange();
      break;
  }
}
// React to a change of the pipeline mode: when the compose mode uses an LLM,
// drop the cached model list for the current provider/capability pair and
// fetch it again (vision-only for the image-based pipeline modes).
function onComposePipelineModeChange() {
  const composeMode = document.querySelector("input[name=compose-mode]:checked").value;
  const usesLLM = composeMode === "postcorrection" || composeMode === "pipeline";
  if (!usesLLM) return;

  const pipelineMode = document.getElementById("compose-pipeline-mode").value;
  const imageBased = ["text_and_image", "zero_shot"].includes(pipelineMode);
  const capability = imageBased ? "vision" : "";
  const provider = document.getElementById("compose-llm-provider").value;

  // Same key format as fetchModels, so the next call really hits the API.
  delete _modelsCache[capability ? `${provider}__${capability}` : provider];
  _loadLLMModelsWithCapability(provider, capability);
}
// Fill the composer's LLM model select with the models of `provider`,
// optionally restricted to `capability` (for example "vision").
//
// Requests can overlap when the user switches provider or pipeline mode
// quickly. As with the OCR select, only the response belonging to the most
// recent request may update the UI; older ones are dropped. The marker is
// stored on the function itself so no extra global is needed.
async function _loadLLMModelsWithCapability(provider, capability) {
  const requestKey = `${provider}__${capability || ""}`;
  _loadLLMModelsWithCapability._pending = requestKey;
  const isStale = () => _loadLLMModelsWithCapability._pending !== requestKey;

  document.getElementById("sp-llm-model").style.display = "inline-block";
  try {
    const models = await fetchModels(provider, capability);
    if (isStale()) return; // a newer request owns the select and the spinner
    populateSelect("compose-llm-model", models, "sp-llm-model");
  } catch (e) {
    if (isStale()) return;
    document.getElementById("sp-llm-model").style.display = "none";
    document.getElementById("compose-llm-model").innerHTML = '<option value="">Erreur</option>';
  }
}
// Fill the composer's prompt select from the models API, using the pseudo
// provider "prompts". On failure only the spinner is hidden; the select is
// left as it was.
async function loadComposePrompts() {
  const spinner = document.getElementById("sp-prompt");
  spinner.style.display = "inline-block";
  try {
    const prompts = await fetchModels("prompts");
    populateSelect("compose-prompt", prompts, "sp-prompt");
  } catch (err) {
    document.getElementById("sp-prompt").style.display = "none";
  }
}
// Build a competitor from the composer form and append it to _competitors.
//
// Three shapes exist, depending on the selected compose mode:
//   - "postcorrection": OCR text comes from the corpus, an LLM provider and
//     model are required;
//   - "pipeline": an OCR engine and an LLM provider are required;
//   - anything else: OCR only, an OCR engine is required.
// A validation failure writes a message into #compose-error and adds nothing.
function addCompetitor() {
  const mode = document.querySelector("input[name=compose-mode]:checked").value;
  const errEl = document.getElementById("compose-error");

  const fieldValue = id => document.getElementById(id).value;
  const reject = (frMsg, enMsg) => {
    errEl.textContent = lang === "fr" ? frMsg : enMsg;
  };

  const comp = { name: "", ocr_engine: "", ocr_model: "",
                 llm_provider: "", llm_model: "", pipeline_mode: "", prompt_file: "" };

  // Copy the four LLM-related form fields into the competitor.
  const readLLMFields = () => {
    comp.llm_provider = fieldValue("compose-llm-provider");
    comp.llm_model = fieldValue("compose-llm-model");
    comp.pipeline_mode = fieldValue("compose-pipeline-mode");
    comp.prompt_file = fieldValue("compose-prompt");
  };

  if (mode === "postcorrection") {
    // Post-correction: the OCR text comes from the corpus (.ocr.txt)
    comp.ocr_engine = "corpus";
    readLLMFields();
    if (!(comp.llm_provider && comp.llm_model)) {
      reject("Sélectionnez un provider et un modèle LLM.", "Select an LLM provider and model.");
      return;
    }
    const shortLabels = {"text_only":"texte","text_and_image":"img+texte","zero_shot":"zero-shot"};
    const modeLabel = shortLabels[comp.pipeline_mode] || comp.pipeline_mode;
    comp.name = `📝 ${comp.llm_model} [${modeLabel}]`;
  } else {
    // Both remaining modes start from an OCR engine chosen in the form.
    const ocrEngine = fieldValue("compose-ocr-engine");
    const ocrModel = fieldValue("compose-ocr-model");
    if (!ocrEngine) {
      reject("Sélectionnez un moteur OCR.", "Select an OCR engine.");
      return;
    }
    comp.ocr_engine = ocrEngine;
    comp.ocr_model = ocrModel;

    if (mode === "pipeline") {
      readLLMFields();
      if (!comp.llm_provider) {
        reject("Sélectionnez un provider LLM.", "Select an LLM provider.");
        return;
      }
      const ocrPart = ocrModel ? ocrEngine + ":" + ocrModel : ocrEngine;
      comp.name = `${ocrPart} → ${comp.llm_model || comp.llm_provider}`;
    } else {
      // OCR only
      comp.name = ocrModel ? `${ocrEngine} (${ocrModel})` : `${ocrEngine}`;
    }
  }

  errEl.textContent = "";
  _competitors.push(comp);
  renderCompetitors();
}
// Remove the competitor at position `idx` and redraw the list.
function removeCompetitor(idx) {
  const removeCount = 1;
  _competitors.splice(idx, removeCount);
  renderCompetitors();
}
// Redraw the list of composed competitors, one card per entry with a badge
// for its kind (post-correction, pipeline or OCR only) and a remove button.
// An empty list shows the translated "nothing added" placeholder.
function renderCompetitors() {
  const container = document.getElementById("competitors-list");
  if (_competitors.length === 0) {
    container.innerHTML = `<div style="color:var(--text-muted);font-size:12px;">${t("compose_empty")}</div>`;
    return;
  }

  // Work out the badge and the one-line description for one competitor.
  const describe = c => {
    const fromCorpus = c.ocr_engine === "corpus" || (c.ocr_engine === "" && c.llm_provider);
    if (fromCorpus) {
      return {
        badge: "📝 Post-correction",
        detail: `corpus_ocr → ${c.llm_provider}:${c.llm_model} [${c.pipeline_mode}]`,
      };
    }
    if (c.llm_provider) {
      return {
        badge: "⛓ Pipeline",
        detail: `${c.ocr_engine}:${c.ocr_model} → ${c.llm_provider}:${c.llm_model} [${c.pipeline_mode}]`,
      };
    }
    return { badge: "🔍 OCR", detail: `${c.ocr_engine}:${c.ocr_model}` };
  };

  const cards = [];
  _competitors.forEach((c, i) => {
    const { badge, detail } = describe(c);
    cards.push(`<div class="competitor-card">
<div class="competitor-info">
<span class="competitor-badge">${badge}</span>
<span class="competitor-name">${c.name}</span>
<span class="competitor-detail">${detail}</span>
</div>
<button class="btn btn-danger btn-sm" onclick="removeCompetitor(${i})">✕</button>
</div>`);
  });
  container.innerHTML = cards.join("");
}
| // ─── Normalization profiles ────────────────────────────────────────────────── | |
// Normalization profiles as returned by /api/normalization/profiles.
let _normProfilesData = [];

// Load the normalization profiles into the #norm-profile select ("nfc" is
// preselected) and register a change handler that copies the chosen profile's
// exclude_chars, when it has any, into the #char-exclude field.
// Best effort: on any failure the select keeps whatever it contained.
async function loadNormProfiles() {
  try {
    const response = await fetch("/api/normalization/profiles");
    const payload = await response.json();
    _normProfilesData = payload.profiles || [];

    const select = document.getElementById("norm-profile");
    select.innerHTML = "";
    for (const profile of _normProfilesData) {
      const option = document.createElement("option");
      option.value = profile.id;
      option.textContent = `${profile.name} — ${profile.description}`;
      if (profile.id === "nfc") option.selected = true;
      select.appendChild(option);
    }

    select.addEventListener("change", () => {
      const chosen = _normProfilesData.find(candidate => candidate.id === select.value);
      const hasExcludes = chosen && chosen.exclude_chars && chosen.exclude_chars.length;
      if (!hasExcludes) return;
      document.getElementById("char-exclude").value = chosen.exclude_chars.join(", ");
    });
  } catch (ignored) {
    // Deliberately silent.
  }
}
| // ─── File browser ──────────────────────────────────────────────────────────── | |
// Whether the server-side folder browser is currently shown.
let _fbVisible = false;

// Toggle the folder browser; when it becomes visible, list the directory ".".
function openFileBrowser() {
  _fbVisible = !_fbVisible;
  const panel = document.getElementById("file-browser-container");
  if (_fbVisible) {
    panel.style.display = "block";
    browsePath(".");
  } else {
    panel.style.display = "none";
  }
}
// List the sub-directories of `path` (via /api/corpus/browse) inside
// #file-browser. Clicking a directory flagged `has_corpus` selects it as the
// corpus path and hides the browser; clicking any other directory descends
// into it. A ".." row is shown when the server reports a parent path.
//
// Rows are built with DOM nodes and textContent rather than innerHTML, so
// directory names and error messages containing markup are displayed
// literally instead of being interpreted as HTML.
async function browsePath(path) {
  const fb = document.getElementById("file-browser");

  // Build one clickable row: icon + name (+ optional trailing badge node).
  const makeRow = (icon, name, onClick, badge) => {
    const row = document.createElement("div");
    row.className = "fb-item";
    const iconEl = document.createElement("span");
    iconEl.className = "fb-icon";
    iconEl.textContent = icon;
    const nameEl = document.createElement("span");
    nameEl.className = "fb-name";
    nameEl.textContent = name;
    row.appendChild(iconEl);
    row.appendChild(nameEl);
    if (badge) row.appendChild(badge);
    row.onclick = onClick;
    return row;
  };

  try {
    const r = await fetch(`/api/corpus/browse?path=${encodeURIComponent(path)}`);
    if (!r.ok) {
      // Surface the server-provided detail when there is one.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    document.getElementById("fb-current-path").textContent = d.current_path;
    fb.innerHTML = "";

    if (d.parent_path) {
      fb.appendChild(makeRow("⬆", "..", () => browsePath(d.parent_path)));
    }

    (d.items || []).filter(i => i.is_dir).forEach(item => {
      let badge = null;
      if (item.has_corpus) {
        badge = document.createElement("span");
        badge.className = "fb-badge";
        badge.style.color = "var(--success)";
        badge.textContent = `✓ ${item.gt_count} GT`;
      }
      fb.appendChild(makeRow("📁", item.name, () => {
        if (item.has_corpus) {
          document.getElementById("corpus-path").value = item.path;
          document.getElementById("corpus-info").textContent = `✓ ${item.gt_count} documents GT trouvés.`;
          _fbVisible = false;
          document.getElementById("file-browser-container").style.display = "none";
        } else {
          browsePath(item.path);
        }
      }, badge));
    });

    if (fb.children.length === 0) {
      fb.innerHTML = '<div style="padding:12px; color: var(--text-muted); font-size:12px;">Dossier vide</div>';
    }
  } catch (e) {
    fb.innerHTML = "";
    const errEl = document.createElement("div");
    errEl.style.cssText = "padding:12px; color: var(--danger); font-size:12px;";
    errEl.textContent = `Erreur : ${e.message}`;
    fb.appendChild(errEl);
  }
}
| // ─── Benchmark ─────────────────────────────────────────────────────────────── | |
// Id of the most recently started benchmark job (null until one is started).
let _currentJobId = null;
// Active EventSource for the job's progress stream, if any.
let _eventSource = null;

// Validate the form, POST the benchmark configuration to the server and,
// on success, subscribe to the returned job's progress stream.
// On failure the error is logged in the benchmark log and the controls are
// restored to their idle state.
async function startBenchmark() {
  const byId = id => document.getElementById(id);

  // --- Validation -----------------------------------------------------
  const corpusPath = byId("corpus-path").value.trim();
  if (!corpusPath) {
    alert(lang === "fr" ? "Veuillez sélectionner un dossier corpus." : "Please select a corpus directory.");
    return;
  }
  if (_competitors.length === 0) {
    alert(lang === "fr" ? "Ajoutez au moins un concurrent (Section 4)." : "Add at least one competitor (Section 4).");
    return;
  }

  // --- Request body ---------------------------------------------------
  const payload = {
    corpus_path: corpusPath,
    competitors: _competitors,
    normalization_profile: byId("norm-profile").value,
    char_exclude: byId("char-exclude").value.trim(),
    output_dir: byId("output-dir").value,
    report_name: byId("report-name").value,
  };

  // --- Switch the UI to "running" -------------------------------------
  const startBtn = byId("start-btn");
  const cancelBtn = byId("cancel-btn");
  const statusText = byId("bench-status-text");
  startBtn.disabled = true;
  cancelBtn.style.display = "inline-flex";
  byId("bench-progress-section").style.display = "block";
  byId("bench-result-section").style.display = "none";
  byId("bench-log").textContent = "";
  byId("engine-progress-list").innerHTML = "";
  statusText.textContent = lang === "fr" ? "Démarrage…" : "Starting…";

  try {
    const response = await fetch("/api/benchmark/run", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const failure = await response.json();
      throw new Error(failure.detail || "Erreur serveur");
    }
    const started = await response.json();
    _currentJobId = started.job_id;
    _startSSE(_currentJobId);
  } catch (err) {
    // Report the problem and put the controls back to idle.
    appendLog(`Erreur : ${err.message}`, "error");
    startBtn.disabled = false;
    cancelBtn.style.display = "none";
    statusText.textContent = "";
  }
}
// Subscribe to the server-sent event stream of benchmark job `jobId` and
// mirror its events in the UI: log lines, one progress bar per engine, the
// final ranking, and the terminal states (complete / error / cancelled / done).
//
// Changes compared with the previous version:
//  * The "error" listener also receives the browser's native connection-error
//    event, which carries no `data`; parsing it used to throw. Such events
//    are now ignored here and left to the `onerror` handler.
//  * Payloads are parsed defensively, so one malformed event cannot raise
//    inside a listener.
//  * The per-engine progress row is built with DOM nodes, so the engine name
//    is shown as text rather than interpreted as HTML.
function _startSSE(jobId) {
  if (_eventSource) _eventSource.close();
  const pl = document.getElementById("engine-progress-list");
  pl.innerHTML = "";
  const seenEngines = {};

  // Parse an event payload; returns null when absent or not valid JSON.
  const parse = evt => {
    if (typeof evt.data !== "string") return null;
    try {
      return JSON.parse(evt.data);
    } catch (_) {
      return null;
    }
  };

  _eventSource = new EventSource(`/api/benchmark/${jobId}/stream`);

  _eventSource.addEventListener("start", e => {
    const d = parse(e);
    if (d) appendLog(d.message, "success");
    document.getElementById("bench-status-text").textContent = lang === "fr" ? "En cours…" : "Running…";
  });

  _eventSource.addEventListener("log", e => {
    const d = parse(e);
    if (d) appendLog(d.message);
  });

  _eventSource.addEventListener("warning", e => {
    const d = parse(e);
    if (d) appendLog(d.message, "warn");
  });

  _eventSource.addEventListener("progress", e => {
    const d = parse(e);
    if (!d) return;
    const pct = Math.round(d.progress * 100);
    const engineName = String(d.engine);
    // Sanitized engine name, used as a DOM id suffix.
    const engId = engineName.replace(/[^a-z0-9_-]/gi, "_");

    if (!seenEngines[engId]) {
      seenEngines[engId] = true;
      const row = document.createElement("div");
      row.style.cssText = "margin-bottom: 8px;";

      const head = document.createElement("div");
      head.style.cssText = "display:flex;justify-content:space-between;font-size:12px;margin-bottom:3px;";
      const nameEl = document.createElement("span");
      nameEl.textContent = engineName;
      const pctLabel = document.createElement("span");
      pctLabel.id = `eng-pct-${engId}`;
      pctLabel.textContent = "0%";
      head.appendChild(nameEl);
      head.appendChild(pctLabel);

      const outer = document.createElement("div");
      outer.className = "progress-bar-outer";
      const inner = document.createElement("div");
      inner.className = "progress-bar-inner";
      inner.id = `eng-bar-${engId}`;
      inner.style.width = "0%";
      outer.appendChild(inner);

      row.appendChild(head);
      row.appendChild(outer);
      pl.appendChild(row);
    }

    const bar = document.getElementById(`eng-bar-${engId}`);
    const pctEl = document.getElementById(`eng-pct-${engId}`);
    if (bar) bar.style.width = pct + "%";
    if (pctEl) pctEl.textContent = pct + "%";
    document.getElementById("bench-status-text").textContent =
      `${pct}% — ${engineName} (${d.processed}/${d.total})`;
  });

  _eventSource.addEventListener("complete", e => {
    const d = parse(e);
    if (d) {
      appendLog(d.message, "success");
      _showResults(d);
    }
    _finishBenchmark();
  });

  _eventSource.addEventListener("error", e => {
    // Native connection errors are dispatched as "error" too, but without a
    // data payload; they are handled by `onerror` below.
    const d = parse(e);
    if (!d) return;
    appendLog(d.message, "error");
    _finishBenchmark();
  });

  _eventSource.addEventListener("cancelled", () => {
    appendLog(lang === "fr" ? "Benchmark annulé." : "Benchmark cancelled.", "warn");
    _finishBenchmark();
  });

  _eventSource.addEventListener("done", () => { _finishBenchmark(); });

  // Connection-level failure: close the stream and restore the controls.
  _eventSource.onerror = () => { if (_currentJobId) _finishBenchmark(); };
}
// Reveal the result section, point the report link at the generated HTML
// file and render the ranking table (engine, mean CER, mean WER, doc count).
//
// `data` is the payload of the "complete" event; `output_html` and `ranking`
// are both optional.
function _showResults(data) {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const section = document.getElementById("bench-result-section");
  section.style.display = "block";

  if (data.output_html) {
    // Keep only the file name, whichever path separator the server used.
    // The backslash is produced with fromCharCode so that this script, which
    // is embedded in a Python string literal, contains no escape sequences.
    const backslash = String.fromCharCode(92);
    const fileName = String(data.output_html).split("/").pop().split(backslash).pop();
    const link = document.getElementById("bench-report-link");
    link.href = `/reports/${encodeURIComponent(fileName)}`;
  }

  if (data.ranking) {
    // Ratio -> percentage with two decimals, or "N/A" when missing.
    const asPercent = v => (v != null ? (v * 100).toFixed(2) + "%" : "N/A");
    let html = `<table><thead><tr><th>#</th><th>${lang==="fr"?"Moteur":"Engine"}</th><th>CER</th><th>WER</th><th>Docs</th></tr></thead><tbody>`;
    data.ranking.forEach((row, i) => {
      // Show an explicit 0 rather than an empty cell when no document was processed.
      const docs = row.total_docs != null ? row.total_docs : "";
      html += `<tr><td>${i+1}</td><td>${esc(row.engine)}</td><td>${asPercent(row.mean_cer)}</td><td>${asPercent(row.mean_wer)}</td><td>${esc(docs)}</td></tr>`;
    });
    html += "</tbody></table>";
    document.getElementById("bench-ranking-table").innerHTML = html;
  }
}
// Tear down the progress stream (if one is open) and put the benchmark
// controls back in their idle state. Safe to call more than once.
function _finishBenchmark() {
  if (_eventSource) {
    _eventSource.close();
    _eventSource = null;
  }
  const byId = id => document.getElementById(id);
  byId("start-btn").disabled = false;
  byId("cancel-btn").style.display = "none";
  byId("bench-status-text").textContent = "";
}
// Ask the server to cancel the current benchmark job (no-op when no job has
// been started). The UI itself is reset when the stream reports the
// cancellation; this function only sends the request and, if the request
// fails, writes the reason to the benchmark log instead of leaving an
// unhandled promise rejection.
async function cancelBenchmark() {
  if (!_currentJobId) return;
  try {
    const r = await fetch(`/api/benchmark/${encodeURIComponent(_currentJobId)}/cancel`, {method: "POST"});
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
  } catch (e) {
    appendLog(`Erreur : ${e.message}`, "error");
  }
}
// Append one line to the benchmark log box and keep the box scrolled to the
// bottom. `cls` may be "error", "warn" or "success" to colour the line; any
// other value (or none) leaves the line unstyled.
function appendLog(msg, cls) {
  const entry = document.createElement("div");
  switch (cls) {
    case "error":
      entry.className = "log-error";
      break;
    case "warn":
      entry.className = "log-warn";
      break;
    case "success":
      entry.className = "log-success";
      break;
    default:
      break;
  }
  entry.textContent = msg;

  const logBox = document.getElementById("bench-log");
  logBox.appendChild(entry);
  logBox.scrollTop = logBox.scrollHeight;
}
| // ─── Reports ───────────────────────────────────────────────────────────────── | |
// Fetch the list of generated reports for the directory typed in
// #reports-dir (default ".") and render it as a table in #reports-list.
// HTTP errors are reported explicitly, and server-provided values
// (file name, URL) are HTML-escaped before being inserted.
async function loadReports() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const dir = document.getElementById("reports-dir").value || ".";
  const container = document.getElementById("reports-list");
  container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("loading")}</div>`;
  try {
    const r = await fetch(`/api/reports?reports_dir=${encodeURIComponent(dir)}`);
    if (!r.ok) {
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    const reports = d.reports || [];
    if (reports.length === 0) {
      container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("no_reports")}</div>`;
      return;
    }
    let html = `<table><thead><tr><th>${lang==="fr"?"Fichier":"File"}</th><th>${lang==="fr"?"Taille":"Size"}</th><th>${lang==="fr"?"Modifié":"Modified"}</th><th></th></tr></thead><tbody>`;
    reports.forEach(rep => {
      const date = new Date(rep.modified).toLocaleString(lang === "fr" ? "fr-FR" : "en-US");
      html += `<tr><td>${esc(rep.filename)}</td><td>${esc(rep.size_kb)} Ko</td><td>${esc(date)}</td>
<td><a href="${esc(rep.url)}" target="_blank" class="btn btn-primary btn-sm">${lang==="fr"?"Ouvrir":"Open"}</a></td></tr>`;
    });
    html += "</tbody></table>";
    container.innerHTML = html;
  } catch (e) {
    container.innerHTML = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(e.message)}</div>`;
  }
}
| // ─── Engines status ────────────────────────────────────────────────────────── | |
// Fetch /api/engines and render two tables: OCR engines (#engines-ocr-list)
// and LLM back-ends (#engines-llm-list).
// Server-provided values are HTML-escaped; an HTTP or network failure is
// shown in both containers, so the LLM list cannot stay stale.
async function loadEngines() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const ocrBox = document.getElementById("engines-ocr-list");
  const llmBox = document.getElementById("engines-llm-list");
  try {
    const r = await fetch("/api/engines");
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();

    // OCR engines
    let html = `<table><thead><tr><th>ID</th><th>${lang==="fr"?"Nom":"Name"}</th><th>Version</th><th>Statut</th></tr></thead><tbody>`;
    (d.engines || []).forEach(eng => {
      const cls = eng.available ? "badge-ok" : "badge-err";
      const lbl = eng.available ? t("available") : t("not_installed");
      html += `<tr><td><code>${esc(eng.id)}</code></td><td>${esc(eng.label)}</td><td>${esc(eng.version||"—")}</td>
<td><span class="badge ${cls}">${lbl}</span></td></tr>`;
    });
    html += "</tbody></table>";
    ocrBox.innerHTML = html;

    // LLMs
    let llmHtml = `<table><thead><tr><th>ID</th><th>${lang==="fr"?"Nom":"Name"}</th><th>Statut</th><th>${lang==="fr"?"Détail":"Detail"}</th></tr></thead><tbody>`;
    (d.llms || []).forEach(llm => {
      const cls = llm.available ? "badge-ok" : "badge-warn";
      // Any status other than the three known ones is shown as "missing_key".
      const statusKey = llm.status === "configured" ? "configured"
        : llm.status === "running" ? "running"
        : llm.status === "not_running" ? "not_running"
        : "missing_key";
      const lbl = t(statusKey);
      // The detail column shows the key's env-var name, unless a model list
      // is available, in which case the first three models take precedence.
      let detail = "";
      if (llm.key_env) detail = `<code style="font-size:11px;">${esc(llm.key_env)}</code>`;
      if (llm.models && llm.models.length > 0) detail = esc(llm.models.slice(0, 3).join(", "));
      llmHtml += `<tr><td><code>${esc(llm.id)}</code></td><td>${esc(llm.label)}</td>
<td><span class="badge ${cls}">${lbl}</span></td><td>${detail}</td></tr>`;
    });
    llmHtml += "</tbody></table>";
    llmBox.innerHTML = llmHtml;
  } catch (err) {
    const msg = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(err.message)}</div>`;
    ocrBox.innerHTML = msg;
    llmBox.innerHTML = msg;
  }
}
| // ─── HTR-United ────────────────────────────────────────────────────────────── | |
// Populate the HTR-United language and script filter <select>s from the
// catalogue endpoint. Each select starts with an "all" entry (empty value).
// Options are created as DOM nodes: this avoids re-parsing the whole select
// on every `innerHTML +=` and keeps server values from being read as HTML.
// Failures are non-fatal but are logged.
async function initHTRFilters() {
  // Replace the content of `select` with "all" + one option per value.
  const fill = (select, values) => {
    select.innerHTML = `<option value="">${t("all")}</option>`;
    (values || []).forEach(v => {
      const opt = document.createElement("option");
      opt.value = v;
      opt.textContent = v;
      select.appendChild(opt);
    });
  };

  try {
    const r = await fetch("/api/htr-united/catalogue");
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    fill(document.getElementById("htr-lang-filter"), d.available_languages);
    fill(document.getElementById("htr-script-filter"), d.available_scripts);
  } catch (e) {
    console.warn("initHTRFilters failed:", e);
  }
}
// Query the HTR-United catalogue with the current search text and filters,
// and render one card per entry in #htr-results. Each card has an import
// button that opens the import modal for that entry.
//
// The import handlers are attached as closures after rendering, instead of
// being serialized into an inline `onclick` attribute: ids or titles that
// contain quotes, backslashes or markup used to break the button.
// All catalogue text is HTML-escaped.
async function searchHTRUnited() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const q = document.getElementById("htr-search").value;
  const lang2 = document.getElementById("htr-lang-filter").value;
  const script = document.getElementById("htr-script-filter").value;
  const container = document.getElementById("htr-results");
  container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("loading")}</div>`;
  try {
    const url = `/api/htr-united/catalogue?query=${encodeURIComponent(q)}&language=${encodeURIComponent(lang2)}&script=${encodeURIComponent(script)}`;
    const r = await fetch(url);
    if (!r.ok) {
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    const entries = d.entries || [];
    if (entries.length === 0) {
      container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${lang==="fr"?"Aucun résultat.":"No results."}</div>`;
      return;
    }
    container.innerHTML = entries.map(entry => {
      const tags = [...(entry.language || []), ...(entry.script || [])]
        .map(s => `<span class="ds-tag">${esc(s)}</span>`).join("");
      const lines = entry.lines != null ? entry.lines.toLocaleString() : "";
      return `<div class="ds-card">
<div style="display:flex; justify-content:space-between; align-items:flex-start;">
<h4>${esc(entry.title)}</h4>
<button class="btn btn-primary btn-sm">
${lang==="fr"?"Importer":"Import"}
</button>
</div>
<p>${esc(entry.description)}</p>
<p style="color: var(--text-muted);">${esc(entry.institution)} — ${esc(lines)} ${t("lines")} — ${esc(entry.format)}</p>
<div class="ds-meta">${tags}</div>
</div>`;
    }).join("");
    // Cards are direct children in the same order as `entries`.
    Array.from(container.children).forEach((card, idx) => {
      const entry = entries[idx];
      card.querySelector("button").onclick = () => openImportModal("htr", entry.id, entry.title);
    });
  } catch (err) {
    container.innerHTML = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(err.message)}</div>`;
  }
}
// Search HuggingFace datasets with the current query, language and tags,
// and render one card per dataset in #hf-results. Each card has an import
// button that opens the import modal for that dataset.
//
// The import handlers are attached as closures after rendering, instead of
// being serialized into an inline `onclick` attribute: ids or titles that
// contain quotes, backslashes or markup used to break the button.
// All dataset text is HTML-escaped, and a missing `tags` array is tolerated.
async function searchHuggingFace() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const q = document.getElementById("hf-search").value;
  const langFilter = document.getElementById("hf-lang-filter").value;
  const tags = document.getElementById("hf-tags").value;
  const container = document.getElementById("hf-results");
  container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("loading")}</div>`;
  try {
    const url = `/api/huggingface/search?query=${encodeURIComponent(q)}&language=${encodeURIComponent(langFilter)}&tags=${encodeURIComponent(tags)}`;
    const r = await fetch(url);
    if (!r.ok) {
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    const datasets = d.datasets || [];
    if (datasets.length === 0) {
      container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${lang==="fr"?"Aucun résultat.":"No results."}</div>`;
      return;
    }
    container.innerHTML = datasets.map(ds => {
      // Show at most five tags per card.
      const tags2 = (ds.tags || []).slice(0,5).map(s => `<span class="ds-tag">${esc(s)}</span>`).join("");
      const downloads = ds.downloads ? "— " + ds.downloads.toLocaleString() + " téléchargements" : "";
      return `<div class="ds-card">
<div style="display:flex; justify-content:space-between; align-items:flex-start;">
<h4>${esc(ds.title)}</h4>
<button class="btn btn-primary btn-sm">
${lang==="fr"?"Importer":"Import"}
</button>
</div>
<p>${esc(ds.description)}</p>
<p style="color: var(--text-muted);">${esc(ds.institution||ds.dataset_id)} ${esc(downloads)}</p>
<div class="ds-meta">${tags2}</div>
</div>`;
    }).join("");
    // Cards are direct children in the same order as `datasets`.
    Array.from(container.children).forEach((card, idx) => {
      const ds = datasets[idx];
      card.querySelector("button").onclick = () => openImportModal("hf", ds.dataset_id, ds.title);
    });
  } catch (err) {
    container.innerHTML = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(err.message)}</div>`;
  }
}
| // ─── Import modal ───────────────────────────────────────────────────────────── | |
// Open the import dialog for one catalogue entry.
//   type  - source kind, stored in the hidden #import-modal-type field
//   id    - entry / dataset id, stored in the hidden #import-modal-id field
//   title - human-readable title shown in the dialog header
// Any status message left from a previous import is cleared.
function openImportModal(type, id, title) {
  const byId = elId => document.getElementById(elId);
  byId("import-modal-type").value = type;
  byId("import-modal-id").value = id;
  byId("import-modal-title").textContent = `${t("import_modal_title")} : ${title}`;
  byId("import-modal-status").innerHTML = "";
  byId("import-modal").style.display = "flex";
}
// Hide the import dialog.
function closeImportModal() {
  const modal = document.getElementById("import-modal");
  modal.style.display = "none";
}
// Run the import selected in the import dialog: POST to the HTR-United or
// HuggingFace import endpoint (chosen by the hidden type field), show the
// outcome in the dialog and, on success, suggest the output directory as the
// corpus path.
//
// Fixes compared with the previous version:
//  * `parseInt` gets an explicit radix, and an empty or invalid "max samples"
//    field is sent as an explicit null (what NaN serialized to implicitly).
//  * A non-JSON error body no longer hides the HTTP failure behind a JSON
//    parse error.
//  * Server-provided text is HTML-escaped before being shown.
//  * The corpus path is only overwritten when the server returned one.
async function confirmImport() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const type = document.getElementById("import-modal-type").value;
  const id = document.getElementById("import-modal-id").value;
  const outputDir = document.getElementById("import-modal-output").value;
  const parsedMax = parseInt(document.getElementById("import-modal-max").value, 10);
  const maxSamples = Number.isNaN(parsedMax) ? null : parsedMax;
  const statusDiv = document.getElementById("import-modal-status");
  statusDiv.innerHTML = `<div class="alert alert-info"><span class="spinner"></span> ${lang==="fr"?"Import en cours…":"Importing…"}</div>`;
  try {
    let url, body;
    if (type === "htr") {
      url = "/api/htr-united/import";
      body = {entry_id: id, output_dir: outputDir, max_samples: maxSamples};
    } else {
      url = "/api/huggingface/import";
      body = {dataset_id: id, output_dir: outputDir, max_samples: maxSamples};
    }
    const r = await fetch(url, {method:"POST", headers:{"Content-Type":"application/json"}, body: JSON.stringify(body)});
    // Tolerate non-JSON bodies so that the HTTP status is what gets reported.
    const d = await r.json().catch(() => ({}));
    if (!r.ok) throw new Error(d.detail || "Erreur");
    const msg = lang === "fr"
      ? `✓ Import terminé. ${esc(d.files_imported || 0)} fichiers dans <code>${esc(d.output_dir)}</code>`
      : `✓ Import done. ${esc(d.files_imported || 0)} files in <code>${esc(d.output_dir)}</code>`;
    statusDiv.innerHTML = `<div class="alert alert-success">${msg}</div>`;
    // Suggest the imported directory as the corpus path.
    if (d.output_dir) document.getElementById("corpus-path").value = d.output_dir;
  } catch (e) {
    statusDiv.innerHTML = `<div class="alert alert-error">Erreur : ${esc(e.message)}</div>`;
  }
}
| // ─── Corpus upload ──────────────────────────────────────────────────────────── | |
// Current upload mode: "zip" | "files".
let _uploadMode = "zip";

// Show the requested corpus tab ("browse" or "upload"), update the active
// state of the two tab buttons, and refresh the list of uploaded corpora
// when the upload tab is opened.
function switchCorpusTab(tab) {
  const browseActive = tab === "browse";
  const uploadActive = tab === "upload";

  document.getElementById("corpus-tab-browse").style.display = browseActive ? "block" : "none";
  document.getElementById("corpus-tab-upload").style.display = uploadActive ? "block" : "none";
  document.getElementById("ctab-browse").classList.toggle("active", browseActive);
  document.getElementById("ctab-upload").classList.toggle("active", uploadActive);

  if (uploadActive) {
    loadUploadedCorpora();
  }
}
// React to the upload-mode radio buttons: record the chosen mode and adapt
// the file input (accepted extensions, single vs multiple selection) and the
// drop-zone hint accordingly.
function onUploadModeChange() {
  _uploadMode = document.querySelector("input[name=upload-mode]:checked").value;
  const zipMode = _uploadMode === "zip";

  const fileInput = document.getElementById("upload-file-input");
  fileInput.accept = zipMode ? ".zip" : ".jpg,.jpeg,.png,.tif,.tiff,.webp,.gt.txt,.txt";
  fileInput.multiple = !zipMode;

  const hint = document.getElementById("upload-dropzone-text");
  hint.textContent = zipMode ? t("upload_drop_zip") : t("upload_drop_files");
}
// Handle a selection made through the file input: upload the chosen files.
// The input is cleared afterwards so that picking the same file again (for
// example to retry after a failed upload) fires a new "change" event.
function onFileInputChange(event) {
  // Copy the File objects first: clearing the input empties its FileList.
  const files = Array.from(event.target.files);
  event.target.value = "";
  if (files.length > 0) uploadCorpus(files);
}
// Handle files dropped on the upload drop zone: suppress the browser's
// default handling, remove the drag highlight and upload the dropped files.
function onDropFiles(event) {
  event.preventDefault();
  const dropzone = document.getElementById("upload-dropzone");
  dropzone.classList.remove("dragover");

  const dropped = Array.from(event.dataTransfer.files);
  if (dropped.length === 0) return;
  uploadCorpus(dropped);
}
// Upload `files` (an array of File objects) to /api/corpus/upload as
// multipart form data, animating a simulated progress bar meanwhile.
// On success: show the pair preview and the OCR notice, select the new
// corpus, and refresh the list of uploaded corpora.
// On failure: turn the bar red and show the error message.
//
// Fixes compared with the previous version:
//  * The progress timer is cleared in a `finally`, so a rejected fetch no
//    longer leaves an interval running that keeps overwriting the bar width.
//  * The bar colour is reset at the start, so a new upload does not begin
//    with the colour left by the previous one.
//  * A non-JSON error body no longer masks the HTTP failure.
async function uploadCorpus(files) {
  const progressContainer = document.getElementById("upload-progress-container");
  const progressBar = document.getElementById("upload-progress-bar");
  const progressText = document.getElementById("upload-progress-text");
  const previewEl = document.getElementById("upload-preview");

  progressContainer.style.display = "block";
  progressBar.style.width = "10%";
  progressBar.style.background = "";  // drop any inline colour from a previous run
  progressText.textContent = t("upload_uploading");
  previewEl.innerHTML = "";

  const fd = new FormData();
  for (const f of files) fd.append("files", f);

  // Simulated progress: creep from 10% up to at most 85% while waiting.
  let pct = 10;
  const timer = setInterval(() => {
    pct = Math.min(pct + 5, 85);
    progressBar.style.width = pct + "%";
  }, 200);

  try {
    let r;
    try {
      r = await fetch("/api/corpus/upload", {method: "POST", body: fd});
    } finally {
      clearInterval(timer);
    }
    progressBar.style.width = "100%";
    if (!r.ok) {
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || "Erreur serveur");
    }
    const d = await r.json();
    progressText.textContent = `✓ ${t("upload_success")} — ${d.doc_count} ${t("upload_pairs")}`;
    progressBar.style.background = "var(--success)";
    // Show preview
    renderUploadPreview(d, previewEl);
    // Show corpus OCR notice if triplet corpus
    _updateCorpusOCRNotice(d);
    // Set corpus path and auto-select
    setCorpusPath(d.corpus_path, `upload:${d.corpus_id} (${d.doc_count} docs)`);
    // Refresh list
    loadUploadedCorpora();
  } catch (e) {
    progressBar.style.width = "100%";
    progressBar.style.background = "var(--danger)";
    progressText.textContent = `✗ ${e.message}`;
  }
}
// Render a summary of an uploaded corpus into `container`: the document
// count, optional badges (OCR text files, missing ground truth), the
// image / ground-truth pairs returned by the server, a "… and N more" line
// when the server truncated the pair list, and any warnings.
//
// File names and warnings originate from user-uploaded content, so they are
// HTML-escaped. Missing `pairs` / `missing_gt` arrays are tolerated.
function renderUploadPreview(data, container) {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const pairs = data.pairs || [];
  const missingCount = (data.missing_gt || []).length;

  const missingBadge = data.has_missing_gt
    ? `<span class="badge badge-err" style="margin-left:8px;">${missingCount} ${t("upload_missing_gt")}</span>`
    : "";
  const ocrBadge = (data.has_ocr_text && data.ocr_text_count > 0)
    ? `<span class="badge" style="margin-left:8px; background:#dcfce7; color:#16a34a;">📝 ${esc(data.ocr_text_count)} .ocr.txt</span>`
    : "";

  let html = `<div class="corpus-preview">
<div class="corpus-preview-header">
<span>📄 ${esc(data.doc_count)} ${t("upload_pairs")}</span>${ocrBadge}${missingBadge}
</div>`;
  for (const p of pairs) {
    html += `<div class="corpus-preview-pair">
<span style="color:var(--text-muted);">🖼</span><span>${esc(p.image)}</span>
<span style="color:var(--text-muted); margin-left:auto;">↔</span>
<span style="color:var(--success);">${esc(p.gt)}</span>
</div>`;
  }
  if (data.total_pairs > pairs.length) {
    html += `<div class="corpus-preview-more">… et ${data.total_pairs - pairs.length} autres paires</div>`;
  }
  for (const w of (data.warnings || [])) {
    html += `<div style="padding:5px 12px; font-size:11px; color:var(--warning);">⚠ ${esc(w)}</div>`;
  }
  html += `</div>`;
  container.innerHTML = html;
}
// Select `path` as the corpus directory and show `label` (prefixed with a
// check mark) as the accompanying info text.
function setCorpusPath(path, label) {
  const pathInput = document.getElementById("corpus-path");
  const infoEl = document.getElementById("corpus-info");
  pathInput.value = path;
  infoEl.textContent = `✓ ${label}`;
}
// Show or hide the #corpus-ocr-notice banner depending on whether the given
// corpus description reports at least one OCR text file. Does nothing when
// the banner element is absent from the page.
function _updateCorpusOCRNotice(corpusData) {
  const notice = document.getElementById("corpus-ocr-notice");
  if (!notice) return;

  const hasOcrText = corpusData && corpusData.has_ocr_text && corpusData.ocr_text_count > 0;
  if (!hasOcrText) {
    notice.style.display = "none";
    return;
  }
  notice.style.display = "block";
  notice.innerHTML = `📝 ${t("corpus_has_ocr")} <strong>(${corpusData.ocr_text_count} fichiers .ocr.txt)</strong>`;
}
// Fetch the list of uploaded corpora and render it in #uploads-list.
// Clicking an item selects it as the corpus path (and re-renders the list to
// move the "selected" highlight); the ✕ button deletes the corpus.
//
// Click handlers are attached as closures after rendering, instead of being
// serialized into inline `onclick` attributes: a path containing quotes or
// backslashes (such as a Windows path) used to corrupt the generated
// JavaScript. Server values are HTML-escaped and HTTP errors are reported.
async function loadUploadedCorpora() {
  // Escape a value for safe interpolation into an HTML string.
  const esc = v => String(v == null ? "" : v).replace(/[&<>"']/g,
    ch => ({"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]));

  const container = document.getElementById("uploads-list");
  try {
    const r = await fetch("/api/corpus/uploads");
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    const uploads = d.uploads || [];
    if (uploads.length === 0) {
      container.innerHTML = `<div style="color:var(--text-muted); font-size:12px;">${t("upload_no_corpus")}</div>`;
      return;
    }
    const currentPath = document.getElementById("corpus-path").value;
    container.innerHTML = uploads.map(u => {
      const isSelected = u.corpus_path === currentPath;
      const missing = u.has_missing_gt
        ? `<span class="badge badge-warn" style="margin-left:6px;">${t("upload_missing_gt")}</span>` : "";
      return `<div class="upload-corpus-item${isSelected ? " selected" : ""}">
<span class="upload-corpus-label">
<strong>${esc(u.doc_count)} ${t("upload_pairs")}</strong>${missing}
<span style="display:block; font-size:11px; color:var(--text-muted); font-family:monospace;">${esc(u.corpus_path)}</span>
</span>
<button class="btn btn-danger btn-sm" title="${t("upload_delete")}">✕</button>
</div>`;
    }).join("");
    // Items are direct children in the same order as `uploads`.
    Array.from(container.children).forEach((item, idx) => {
      const u = uploads[idx];
      item.onclick = () => {
        setCorpusPath(u.corpus_path, `upload (${u.doc_count} docs)`);
        loadUploadedCorpora();
      };
      item.querySelector("button").onclick = evt => {
        // Do not let the click also select the corpus being deleted.
        evt.stopPropagation();
        deleteUploadedCorpus(u.corpus_id);
      };
    });
  } catch (e) {
    container.innerHTML = `<div style="color:var(--danger); font-size:12px;">Erreur : ${esc(e.message)}</div>`;
  }
}
// Delete the uploaded corpus `corpusId` on the server, then refresh the list
// of uploaded corpora. If the corpus path field currently refers to the
// deleted corpus (its value contains the id), the selection is cleared.
//
// Changes compared with the previous version: the id is URL-encoded, the
// selection is only cleared when the server confirmed the deletion, and
// failures are logged instead of being silently swallowed. The list is
// refreshed in every case so that it reflects the server state.
async function deleteUploadedCorpus(corpusId) {
  try {
    const r = await fetch(`/api/corpus/uploads/${encodeURIComponent(corpusId)}`, {method: "DELETE"});
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    // Clear corpus path if it was the deleted one
    const pathInput = document.getElementById("corpus-path");
    if (pathInput.value.includes(corpusId)) {
      pathInput.value = "";
      document.getElementById("corpus-info").textContent = "";
    }
  } catch (e) {
    console.warn("deleteUploadedCorpus failed:", e);
  } finally {
    loadUploadedCorpora();
  }
}
| // ─── Init ──────────────────────────────────────────────────────────────────── | |
// Page bootstrap: runs once the DOM is ready.
document.addEventListener("DOMContentLoaded", async () => {
  // Independent loaders, started without waiting for one another.
  loadStatus();
  loadNormProfiles();
  initHTRFilters();

  // Load OCR engines, LLM models, initialize composer. The composer helpers
  // below are only called once this has completed.
  await loadBenchmarkSections();
  onComposeOCRChange();  // Pre-populate Tesseract languages
  loadComposePrompts();  // Pre-load prompt files
  startAutoRefresh();    // Auto-detect new API keys every 10 s

  // Close modal on backdrop click (clicks inside the dialog are ignored).
  const importModal = document.getElementById("import-modal");
  importModal.addEventListener("click", evt => {
    if (evt.target === document.getElementById("import-modal")) {
      closeImportModal();
    }
  });
});
| </script> | |
| </body> | |
| </html>""" | |