Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Claude committed on May 1

Commit

ec6e89d

unverified ·

1 Parent(s): 0171c0a

refactor(web): extraire benchmark_utils.py et config_utils.py

Suite de la décomposition de ``picarones/web/app.py``. Deux familles
d'utilitaires sortent du fichier monolithique :

- ``picarones/web/benchmark_utils.py`` (346 l) : la machinerie
d'exécution d'un benchmark — ``sse_format`` (Server-Sent Events
avec ``Last-Event-ID``), ``build_llm_adapter`` (factory adapter
LLM par provider), ``engine_from_competitor`` (factory moteur OCR
ou pipeline OCR+LLM depuis ``CompetitorConfig``),
``run_benchmark_thread`` et ``run_benchmark_thread_v2`` (workers
threadés qui exécutent le benchmark, émettent des événements SSE,
génèrent le rapport HTML final).

- ``picarones/web/config_utils.py`` (57 l) : validation et migration
des configs utilisateur — ``CONFIG_SCHEMA_VERSION``,
``ALLOWED_CONFIG_FIELDS`` (liste blanche pour ne pas embarquer de
secrets), ``filter_config``, ``upgrade_config`` (point
d'extension pour les futures migrations de format).

``app.py`` passe de 1586 à 1244 lignes (~22 % retirés sur ce commit ;
~40 % depuis le début du chantier A). Les noms historiques avec
préfixe ``_`` sont préservés via des alias d'import dans ``app.py``.

Pytest : 3354 passed, 2 skipped, 0 failed. Ruff : All checks passed.

https://claude.ai/code/session_01Hsd7kL8yeCbXn1mA7GQK9L

Files changed (3) hide show

picarones/web/app.py +10 -352
picarones/web/benchmark_utils.py +346 -0
picarones/web/config_utils.py +57 -0

picarones/web/app.py CHANGED Viewed

@@ -44,6 +44,16 @@ from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, Respon
 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
 from picarones import __version__
 from picarones.web.corpus_utils import (
     analyze_corpus_dir as _analyze_corpus_dir,
     flatten_zip_to_dir as _flatten_zip_to_dir,
@@ -60,7 +70,6 @@ from picarones.web.engine_utils import (
 from picarones.web.models import (
     BenchmarkRequest,
     BenchmarkRunRequest,
-    CompetitorConfig,
     HTRUnitedImportRequest,
     HuggingFaceImportRequest,
 )
@@ -647,46 +656,6 @@ async def api_normalization_profiles() -> dict:
 # API — config save/load (Sprint 28)
 # ---------------------------------------------------------------------------
-#: Schéma versionné des configs utilisateur. Si on change le format,
-#: bumpez ce nombre et rajoutez un upgrade path dans ``_upgrade_config``.
-_CONFIG_SCHEMA_VERSION = 1
-#: Champs autorisés dans une config sauvegardée. On filtre explicitement
-#: pour ne pas embarquer des secrets ou des clefs serveur si le client
-#: pousse un dict trop riche.
-_ALLOWED_CONFIG_FIELDS: frozenset[str] = frozenset({
-    "schema_version",
-    "saved_at",
-    "label",
-    "corpus_path",
-    "engines",
-    "normalization_profile",
-    "char_exclude",
-    "lang",
-    "report_lang",
-    "output_dir",
-    "report_name",
-    "competitors",
-})
-def _filter_config(payload: dict) -> dict:
-    """Ne garde que les champs autorisés, dans un ordre stable pour les diffs."""
-    out: dict[str, Any] = {}
-    for k in sorted(_ALLOWED_CONFIG_FIELDS):
-        if k in payload:
-            out[k] = payload[k]
-    return out
-def _upgrade_config(payload: dict) -> dict:
-    """Migre les anciennes configs vers le schéma courant.
-    Schéma 1 (Sprint 28) : pas de migration nécessaire — on retourne tel quel.
-    """
-    return payload
 @app.post("/api/config/save")
 async def api_config_save(payload: dict) -> Response:
     """Sérialise un dict de config en JSON téléchargeable.
@@ -1182,19 +1151,6 @@ async def api_benchmark_stream(job_id: str, request: Request) -> StreamingRespon
     )
-def _sse_format(event_type: str, data: Any, seq: Optional[int] = None) -> str:
-    """Format SSE.
-    Sprint 26 — émet une ligne ``id: <seq>`` quand le ``seq`` est connu.
-    C'est la valeur que le navigateur renvoie automatiquement dans
-    ``Last-Event-ID`` à la prochaine connexion (cf.
-    https://html.spec.whatwg.org/multipage/server-sent-events.html).
-    """
-    payload = json.dumps(data, ensure_ascii=False)
-    head = f"id: {seq}\n" if seq is not None else ""
-    return f"{head}event: {event_type}\ndata: {payload}\n\n"
 # ---------------------------------------------------------------------------
 # API — benchmark/run (concurrents composés)
 # ---------------------------------------------------------------------------
@@ -1247,304 +1203,6 @@ async def api_benchmark_run(req: BenchmarkRunRequest, request: Request) -> dict:
     return {"job_id": job_id, "status": "pending"}
-def _build_llm_adapter(comp: CompetitorConfig) -> Any:
-    """Instancie un adaptateur LLM depuis la config d'un concurrent."""
-    if comp.llm_provider == "openai":
-        from picarones.llm.openai_adapter import OpenAIAdapter
-        return OpenAIAdapter(model=comp.llm_model or None)
-    elif comp.llm_provider == "anthropic":
-        from picarones.llm.anthropic_adapter import AnthropicAdapter
-        return AnthropicAdapter(model=comp.llm_model or None)
-    elif comp.llm_provider == "mistral":
-        from picarones.llm.mistral_adapter import MistralAdapter
-        return MistralAdapter(model=comp.llm_model or None)
-    elif comp.llm_provider == "ollama":
-        from picarones.llm.ollama_adapter import OllamaAdapter
-        return OllamaAdapter(model=comp.llm_model or None)
-    else:
-        raise ValueError(f"Provider LLM inconnu : {comp.llm_provider}")
-def _engine_from_competitor(comp: CompetitorConfig) -> Any:
-    """Instancie un moteur OCR (ou pipeline OCR+LLM) depuis une CompetitorConfig.
-    Modes supportés :
-    - ``ocr_engine`` = 'tesseract', 'mistral_ocr', etc. → moteur OCR seul
-    - ``ocr_engine`` + ``llm_provider`` → pipeline OCR live + LLM
-    - ``ocr_engine`` = 'corpus' + ``llm_provider`` → post-correction LLM
-      avec OCR pré-calculé (fichiers .ocr.txt du corpus triplet)
-    - ``ocr_engine`` = '' + ``llm_provider`` → LLM seul (zero-shot ou post-correction)
-    """
-    engine_id = comp.ocr_engine
-    # Pipeline post-correction avec OCR pré-calculé (corpus triplet)
-    is_corpus_ocr = engine_id in ("corpus", "")
-    if is_corpus_ocr and not comp.llm_provider:
-        raise ValueError(
-            "ocr_engine='corpus' nécessite un llm_provider "
-            "(pour la post-correction ou le zero-shot)"
-        )
-    ocr = None
-    if not is_corpus_ocr:
-        from picarones.engines.tesseract import TesseractEngine
-        from picarones.engines.mistral_ocr import MistralOCREngine
-        if engine_id == "tesseract":
-            ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
-        elif engine_id == "mistral_ocr":
-            ocr = MistralOCREngine(config={"model": comp.ocr_model or "mistral-ocr-latest"})
-        elif engine_id == "google_vision":
-            try:
-                from picarones.engines.google_vision import GoogleVisionEngine
-                ocr = GoogleVisionEngine(config={"detection_type": comp.ocr_model or "document_text_detection"})
-            except ImportError as exc:
-                raise RuntimeError("Google Vision non disponible.") from exc
-        elif engine_id == "azure_doc_intel":
-            try:
-                from picarones.engines.azure_doc_intel import AzureDocIntelEngine
-                ocr = AzureDocIntelEngine(config={"model": comp.ocr_model or "prebuilt-document"})
-            except ImportError as exc:
-                raise RuntimeError("Azure Document Intelligence non disponible.") from exc
-        else:
-            raise ValueError(f"Moteur OCR inconnu : {engine_id}")
-        if not comp.llm_provider:
-            return ocr
-    # Pipeline OCR+LLM (live ou post-correction)
-    _mode_map = {
-        "text_only": "text_only",
-        "post_correction_text": "text_only",
-        "text_and_image": "text_and_image",
-        "post_correction_image": "text_and_image",
-        "zero_shot": "zero_shot",
-    }
-    mode = _mode_map.get(comp.pipeline_mode, "text_only")
-    llm = _build_llm_adapter(comp)
-    from picarones.pipelines.base import OCRLLMPipeline
-    prompt = comp.prompt_file or "correction_medieval_french.txt"
-    if is_corpus_ocr:
-        pipeline_name = comp.name or f"corpus_ocr → {comp.llm_model or comp.llm_provider}"
-    else:
-        pipeline_name = comp.name or f"{engine_id} → {comp.llm_model or comp.llm_provider}"
-    return OCRLLMPipeline(
-        ocr_engine=ocr,
-        llm_adapter=llm,
-        mode=mode,
-        prompt=prompt,
-        pipeline_name=pipeline_name,
-    )
-def _run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
-    """Exécute un benchmark à partir d'une liste de CompetitorConfig."""
-    job.set_status("running")
-    job.started_at = _iso_now()
-    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
-    try:
-        from picarones.core.corpus import load_corpus_from_directory
-        from picarones.measurements.runner import run_benchmark
-        corpus = load_corpus_from_directory(req.corpus_path)
-        job.total_docs = len(corpus)
-        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
-        if job.status == "cancelled":
-            return
-        engines = []
-        for comp in req.competitors:
-            try:
-                eng = _engine_from_competitor(comp)
-                engines.append(eng)
-                job.add_event("log", {"message": f"Concurrent : {eng.name}"})
-            except Exception as exc:
-                job.add_event("warning", {
-                    "message": f"Concurrent ignoré '{comp.name or comp.ocr_engine}' : {exc}"
-                })
-        if not engines:
-            raise ValueError("Aucun concurrent valide disponible.")
-        output_dir = Path(req.output_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-        output_json = str(output_dir / f"{report_name}.json")
-        output_html = str(output_dir / f"{report_name}.html")
-        n_engines = len(engines)
-        total_steps = job.total_docs * n_engines
-        step_counter = [0]
-        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
-            if job.status == "cancelled":
-                return
-            step_counter[0] += 1
-            job.current_engine = engine_name
-            job.processed_docs = doc_idx
-            job.progress = step_counter[0] / max(total_steps, 1)
-            job.add_event("progress", {
-                "engine": engine_name,
-                "doc_idx": doc_idx,
-                "doc_id": doc_id,
-                "progress": job.progress,
-                "processed": step_counter[0],
-                "total": total_steps,
-            })
-        from picarones.measurements.normalization import _parse_exclude_chars
-        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
-        result = run_benchmark(
-            corpus=corpus,
-            engines=engines,
-            output_json=output_json,
-            show_progress=False,
-            progress_callback=_progress_callback,
-            char_exclude=char_excl,
-            cancel_event=job._cancel_event,
-        )
-        if job.status == "cancelled":
-            return
-        job.add_event("log", {"message": "Génération du rapport HTML…"})
-        from picarones.report.generator import ReportGenerator
-        gen = ReportGenerator(result, lang=req.report_lang)
-        gen.generate(output_html)
-        job.output_path = output_html
-        job.progress = 1.0
-        job.set_status("complete")
-        ranking = result.ranking()
-        job.add_event("complete", {
-            "message": "Benchmark terminé.",
-            "output_html": output_html,
-            "output_json": output_json,
-            "ranking": ranking,
-        })
-    except Exception as exc:
-        job.set_status("error", error=str(exc))
-        job.add_event("error", {"message": f"Erreur : {exc}"})
-def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
-    """Exécute le benchmark dans un thread et envoie des événements SSE."""
-    job.set_status("running")
-    job.started_at = _iso_now()
-    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
-    try:
-        from picarones.core.corpus import load_corpus_from_directory
-        from picarones.measurements.runner import run_benchmark
-        # Charger le corpus
-        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
-        corpus = load_corpus_from_directory(req.corpus_path)
-        job.total_docs = len(corpus)
-        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
-        if job.status == "cancelled":
-            return
-        # Instancier les moteurs
-        from picarones.cli import _engine_from_name
-        import click
-        ocr_engines = []
-        for engine_name in req.engines:
-            try:
-                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
-                ocr_engines.append(eng)
-                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
-            except (click.BadParameter, Exception) as exc:
-                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})
-        if not ocr_engines:
-            raise ValueError("Aucun moteur valide disponible.")
-        # Répertoire de sortie
-        output_dir = Path(req.output_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-        output_json = str(output_dir / f"{report_name}.json")
-        output_html = str(output_dir / f"{report_name}.html")
-        # Callback de progression (injecté dans un wrapper)
-        n_engines = len(ocr_engines)
-        total_steps = job.total_docs * n_engines
-        step_counter = [0]
-        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
-            if job.status == "cancelled":
-                return
-            step_counter[0] += 1
-            job.current_engine = engine_name
-            job.processed_docs = doc_idx
-            job.progress = step_counter[0] / max(total_steps, 1)
-            job.add_event("progress", {
-                "engine": engine_name,
-                "doc_idx": doc_idx,
-                "doc_id": doc_id,
-                "progress": job.progress,
-                "processed": step_counter[0],
-                "total": total_steps,
-            })
-        from picarones.measurements.normalization import _parse_exclude_chars
-        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
-        # Lancer le benchmark
-        result = run_benchmark(
-            corpus=corpus,
-            engines=ocr_engines,
-            output_json=output_json,
-            show_progress=False,
-            progress_callback=_progress_callback,
-            char_exclude=char_excl,
-            cancel_event=job._cancel_event,
-        )
-        if job.status == "cancelled":
-            return
-        # Générer le rapport HTML
-        job.add_event("log", {"message": "Génération du rapport HTML…"})
-        from picarones.report.generator import ReportGenerator
-        report_lang = getattr(req, "report_lang", "fr")
-        gen = ReportGenerator(result, lang=report_lang)
-        gen.generate(output_html)
-        job.output_path = output_html
-        job.progress = 1.0
-        job.set_status("complete")
-        # Classement final
-        ranking = result.ranking()
-        job.add_event("complete", {
-            "message": "Benchmark terminé.",
-            "output_html": output_html,
-            "output_json": output_json,
-            "ranking": ranking,
-        })
-    except Exception as exc:
-        job.set_status("error", error=str(exc))
-        job.add_event("error", {"message": f"Erreur : {exc}"})
 # ---------------------------------------------------------------------------
 # Page principale HTML (SPA)
 # ---------------------------------------------------------------------------

 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
 from picarones import __version__
+from picarones.web.benchmark_utils import (
+    run_benchmark_thread as _run_benchmark_thread,
+    run_benchmark_thread_v2 as _run_benchmark_thread_v2,
+    sse_format as _sse_format,
+)
+from picarones.web.config_utils import (
+    CONFIG_SCHEMA_VERSION as _CONFIG_SCHEMA_VERSION,
+    filter_config as _filter_config,
+    upgrade_config as _upgrade_config,
+)
 from picarones.web.corpus_utils import (
     analyze_corpus_dir as _analyze_corpus_dir,
     flatten_zip_to_dir as _flatten_zip_to_dir,
 from picarones.web.models import (
     BenchmarkRequest,
     BenchmarkRunRequest,
     HTRUnitedImportRequest,
     HuggingFaceImportRequest,
 )
 # API — config save/load (Sprint 28)
 # ---------------------------------------------------------------------------
 @app.post("/api/config/save")
 async def api_config_save(payload: dict) -> Response:
     """Sérialise un dict de config en JSON téléchargeable.
     )
 # ---------------------------------------------------------------------------
 # API — benchmark/run (concurrents composés)
 # ---------------------------------------------------------------------------
     return {"job_id": job_id, "status": "pending"}
 # ---------------------------------------------------------------------------
 # Page principale HTML (SPA)
 # ---------------------------------------------------------------------------

picarones/web/benchmark_utils.py ADDED Viewed

	@@ -0,0 +1,346 @@

+"""Utilitaires d'exécution de benchmark côté web.
+- ``sse_format`` : sérialisation d'un événement Server-Sent Events
+  avec ``Last-Event-ID`` (Sprint 26).
+- ``build_llm_adapter`` : factory adapter LLM depuis une config
+  ``CompetitorConfig``.
+- ``engine_from_competitor`` : factory moteur OCR ou pipeline
+  OCR+LLM depuis une ``CompetitorConfig``.
+- ``run_benchmark_thread`` / ``run_benchmark_thread_v2`` : workers
+  threadés qui exécutent le benchmark, émettent des événements SSE
+  via le ``BenchmarkJob``, génèrent le rapport HTML final.
+Ces utilitaires sont consommés par les routeurs ``/api/benchmark/*``.
+"""
+from __future__ import annotations
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+from picarones.web.models import (
+    BenchmarkRequest,
+    BenchmarkRunRequest,
+    CompetitorConfig,
+)
+from picarones.web.state import BenchmarkJob, iso_now
+def sse_format(event_type: str, data: Any, seq: Optional[int] = None) -> str:
+    """Format Server-Sent Events.
+    Sprint 26 — émet une ligne ``id: <seq>`` quand le ``seq`` est connu.
+    C'est la valeur que le navigateur renvoie automatiquement dans
+    ``Last-Event-ID`` à la prochaine connexion (cf.
+    https://html.spec.whatwg.org/multipage/server-sent-events.html).
+    """
+    payload = json.dumps(data, ensure_ascii=False)
+    head = f"id: {seq}\n" if seq is not None else ""
+    return f"{head}event: {event_type}\ndata: {payload}\n\n"
+def build_llm_adapter(comp: CompetitorConfig) -> Any:
+    """Instancie un adaptateur LLM depuis la config d'un concurrent."""
+    if comp.llm_provider == "openai":
+        from picarones.llm.openai_adapter import OpenAIAdapter
+        return OpenAIAdapter(model=comp.llm_model or None)
+    elif comp.llm_provider == "anthropic":
+        from picarones.llm.anthropic_adapter import AnthropicAdapter
+        return AnthropicAdapter(model=comp.llm_model or None)
+    elif comp.llm_provider == "mistral":
+        from picarones.llm.mistral_adapter import MistralAdapter
+        return MistralAdapter(model=comp.llm_model or None)
+    elif comp.llm_provider == "ollama":
+        from picarones.llm.ollama_adapter import OllamaAdapter
+        return OllamaAdapter(model=comp.llm_model or None)
+    else:
+        raise ValueError(f"Provider LLM inconnu : {comp.llm_provider}")
+def engine_from_competitor(comp: CompetitorConfig) -> Any:
+    """Instancie un moteur OCR (ou pipeline OCR+LLM) depuis une CompetitorConfig.
+    Modes supportés :
+    - ``ocr_engine`` = ``tesseract``, ``mistral_ocr``, … → moteur OCR seul.
+    - ``ocr_engine`` + ``llm_provider`` → pipeline OCR live + LLM.
+    - ``ocr_engine`` = ``corpus`` + ``llm_provider`` → post-correction LLM
+      avec OCR pré-calculé (fichiers ``.ocr.txt`` du corpus triplet).
+    - ``ocr_engine`` = ``""`` + ``llm_provider`` → LLM seul (zero-shot
+      ou post-correction).
+    """
+    engine_id = comp.ocr_engine
+    is_corpus_ocr = engine_id in ("corpus", "")
+    if is_corpus_ocr and not comp.llm_provider:
+        raise ValueError(
+            "ocr_engine='corpus' nécessite un llm_provider "
+            "(pour la post-correction ou le zero-shot)"
+        )
+    ocr = None
+    if not is_corpus_ocr:
+        from picarones.engines.tesseract import TesseractEngine
+        from picarones.engines.mistral_ocr import MistralOCREngine
+        if engine_id == "tesseract":
+            ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
+        elif engine_id == "mistral_ocr":
+            ocr = MistralOCREngine(config={"model": comp.ocr_model or "mistral-ocr-latest"})
+        elif engine_id == "google_vision":
+            try:
+                from picarones.engines.google_vision import GoogleVisionEngine
+                ocr = GoogleVisionEngine(
+                    config={"detection_type": comp.ocr_model or "document_text_detection"},
+                )
+            except ImportError as exc:
+                raise RuntimeError("Google Vision non disponible.") from exc
+        elif engine_id == "azure_doc_intel":
+            try:
+                from picarones.engines.azure_doc_intel import AzureDocIntelEngine
+                ocr = AzureDocIntelEngine(
+                    config={"model": comp.ocr_model or "prebuilt-document"},
+                )
+            except ImportError as exc:
+                raise RuntimeError("Azure Document Intelligence non disponible.") from exc
+        else:
+            raise ValueError(f"Moteur OCR inconnu : {engine_id}")
+        if not comp.llm_provider:
+            return ocr
+    # Pipeline OCR+LLM (live ou post-correction)
+    mode_map = {
+        "text_only": "text_only",
+        "post_correction_text": "text_only",
+        "text_and_image": "text_and_image",
+        "post_correction_image": "text_and_image",
+        "zero_shot": "zero_shot",
+    }
+    mode = mode_map.get(comp.pipeline_mode, "text_only")
+    llm = build_llm_adapter(comp)
+    from picarones.pipelines.base import OCRLLMPipeline
+    prompt = comp.prompt_file or "correction_medieval_french.txt"
+    if is_corpus_ocr:
+        pipeline_name = comp.name or f"corpus_ocr → {comp.llm_model or comp.llm_provider}"
+    else:
+        pipeline_name = comp.name or f"{engine_id} → {comp.llm_model or comp.llm_provider}"
+    return OCRLLMPipeline(
+        ocr_engine=ocr,
+        llm_adapter=llm,
+        mode=mode,
+        prompt=prompt,
+        pipeline_name=pipeline_name,
+    )
+def run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
+    """Exécute un benchmark à partir d'une liste de ``CompetitorConfig``."""
+    job.set_status("running")
+    job.started_at = iso_now()
+    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
+    try:
+        from picarones.core.corpus import load_corpus_from_directory
+        from picarones.measurements.runner import run_benchmark
+        corpus = load_corpus_from_directory(req.corpus_path)
+        job.total_docs = len(corpus)
+        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
+        if job.status == "cancelled":
+            return
+        engines = []
+        for comp in req.competitors:
+            try:
+                eng = engine_from_competitor(comp)
+                engines.append(eng)
+                job.add_event("log", {"message": f"Concurrent : {eng.name}"})
+            except Exception as exc:  # noqa: BLE001
+                job.add_event("warning", {
+                    "message": f"Concurrent ignoré '{comp.name or comp.ocr_engine}' : {exc}"
+                })
+        if not engines:
+            raise ValueError("Aucun concurrent valide disponible.")
+        output_dir = Path(req.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        output_json = str(output_dir / f"{report_name}.json")
+        output_html = str(output_dir / f"{report_name}.html")
+        n_engines = len(engines)
+        total_steps = job.total_docs * n_engines
+        step_counter = [0]
+        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
+            if job.status == "cancelled":
+                return
+            step_counter[0] += 1
+            job.current_engine = engine_name
+            job.processed_docs = doc_idx
+            job.progress = step_counter[0] / max(total_steps, 1)
+            job.add_event("progress", {
+                "engine": engine_name,
+                "doc_idx": doc_idx,
+                "doc_id": doc_id,
+                "progress": job.progress,
+                "processed": step_counter[0],
+                "total": total_steps,
+            })
+        from picarones.measurements.normalization import _parse_exclude_chars
+        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
+        result = run_benchmark(
+            corpus=corpus,
+            engines=engines,
+            output_json=output_json,
+            show_progress=False,
+            progress_callback=_progress_callback,
+            char_exclude=char_excl,
+            cancel_event=job._cancel_event,
+        )
+        if job.status == "cancelled":
+            return
+        job.add_event("log", {"message": "Génération du rapport HTML…"})
+        from picarones.report.generator import ReportGenerator
+        gen = ReportGenerator(result, lang=req.report_lang)
+        gen.generate(output_html)
+        job.output_path = output_html
+        job.progress = 1.0
+        job.set_status("complete")
+        ranking = result.ranking()
+        job.add_event("complete", {
+            "message": "Benchmark terminé.",
+            "output_html": output_html,
+            "output_json": output_json,
+            "ranking": ranking,
+        })
+    except Exception as exc:  # noqa: BLE001
+        job.set_status("error", error=str(exc))
+        job.add_event("error", {"message": f"Erreur : {exc}"})
+def run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
+    """Exécute le benchmark legacy (route ``/api/benchmark/start``)."""
+    job.set_status("running")
+    job.started_at = iso_now()
+    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
+    try:
+        from picarones.core.corpus import load_corpus_from_directory
+        from picarones.measurements.runner import run_benchmark
+        # Charger le corpus
+        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
+        corpus = load_corpus_from_directory(req.corpus_path)
+        job.total_docs = len(corpus)
+        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
+        if job.status == "cancelled":
+            return
+        # Instancier les moteurs
+        from picarones.cli import _engine_from_name
+        import click
+        ocr_engines = []
+        for engine_name in req.engines:
+            try:
+                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
+                ocr_engines.append(eng)
+                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
+            except (click.BadParameter, Exception) as exc:
+                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})
+        if not ocr_engines:
+            raise ValueError("Aucun moteur valide disponible.")
+        # Répertoire de sortie
+        output_dir = Path(req.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        output_json = str(output_dir / f"{report_name}.json")
+        output_html = str(output_dir / f"{report_name}.html")
+        # Callback de progression
+        n_engines = len(ocr_engines)
+        total_steps = job.total_docs * n_engines
+        step_counter = [0]
+        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
+            if job.status == "cancelled":
+                return
+            step_counter[0] += 1
+            job.current_engine = engine_name
+            job.processed_docs = doc_idx
+            job.progress = step_counter[0] / max(total_steps, 1)
+            job.add_event("progress", {
+                "engine": engine_name,
+                "doc_idx": doc_idx,
+                "doc_id": doc_id,
+                "progress": job.progress,
+                "processed": step_counter[0],
+                "total": total_steps,
+            })
+        from picarones.measurements.normalization import _parse_exclude_chars
+        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
+        result = run_benchmark(
+            corpus=corpus,
+            engines=ocr_engines,
+            output_json=output_json,
+            show_progress=False,
+            progress_callback=_progress_callback,
+            char_exclude=char_excl,
+            cancel_event=job._cancel_event,
+        )
+        if job.status == "cancelled":
+            return
+        job.add_event("log", {"message": "Génération du rapport HTML…"})
+        from picarones.report.generator import ReportGenerator
+        report_lang = getattr(req, "report_lang", "fr")
+        gen = ReportGenerator(result, lang=report_lang)
+        gen.generate(output_html)
+        job.output_path = output_html
+        job.progress = 1.0
+        job.set_status("complete")
+        ranking = result.ranking()
+        job.add_event("complete", {
+            "message": "Benchmark terminé.",
+            "output_html": output_html,
+            "output_json": output_json,
+            "ranking": ranking,
+        })
+    except Exception as exc:  # noqa: BLE001
+        job.set_status("error", error=str(exc))
+        job.add_event("error", {"message": f"Erreur : {exc}"})
+__all__ = [
+    "sse_format",
+    "build_llm_adapter",
+    "engine_from_competitor",
+    "run_benchmark_thread",
+    "run_benchmark_thread_v2",
+]

picarones/web/config_utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Utilitaires de validation et migration des configs utilisateur.
+Sprint 28 — supprime la friction *« reconfigurer chaque session »* :
+le client peut télécharger sa config en JSON et la réimporter plus
+tard. Ce module définit le schéma versionné et les règles de filtrage
+qui empêchent qu'un payload trop riche n'embarque des secrets ou des
+clés serveur.
+"""
+from __future__ import annotations
+from typing import Any
+CONFIG_SCHEMA_VERSION = 1
+"""Bump quand le format change ; ajouter un upgrade path dans ``upgrade_config``."""
+ALLOWED_CONFIG_FIELDS: frozenset[str] = frozenset({
+    "schema_version",
+    "saved_at",
+    "label",
+    "corpus_path",
+    "engines",
+    "normalization_profile",
+    "char_exclude",
+    "lang",
+    "report_lang",
+    "output_dir",
+    "report_name",
+    "competitors",
+})
+"""Liste blanche des champs autorisés dans une config sauvegardée."""
+def filter_config(payload: dict) -> dict:
+    """Ne garde que les champs autorisés, dans un ordre stable pour les diffs."""
+    out: dict[str, Any] = {}
+    for k in sorted(ALLOWED_CONFIG_FIELDS):
+        if k in payload:
+            out[k] = payload[k]
+    return out
+def upgrade_config(payload: dict) -> dict:
+    """Migre les anciennes configs vers le schéma courant.
+    Schéma 1 (Sprint 28) : pas de migration nécessaire — on retourne tel quel.
+    Future : ajouter des branches sur ``schema_version`` quand le format évolue.
+    """
+    return payload
+__all__ = [
+    "CONFIG_SCHEMA_VERSION",
+    "ALLOWED_CONFIG_FIELDS",
+    "filter_config",
+    "upgrade_config",
+]