Spaces:
Running
refactor(web): extraire benchmark_utils.py et config_utils.py
Browse files
Suite de la décomposition de ``picarones/web/app.py``. Deux familles
d'utilitaires sortent du fichier monolithique :
- ``picarones/web/benchmark_utils.py`` (346 l) : la machinerie
d'exécution d'un benchmark — ``sse_format`` (Server-Sent Events
avec ``Last-Event-ID``), ``build_llm_adapter`` (factory adapter
LLM par provider), ``engine_from_competitor`` (factory moteur OCR
ou pipeline OCR+LLM depuis ``CompetitorConfig``),
``run_benchmark_thread`` et ``run_benchmark_thread_v2`` (workers
threadés qui exécutent le benchmark, émettent des événements SSE,
génèrent le rapport HTML final).
- ``picarones/web/config_utils.py`` (58 l) : validation et migration
des configs utilisateur — ``CONFIG_SCHEMA_VERSION``,
``ALLOWED_CONFIG_FIELDS`` (liste blanche pour ne pas embarquer de
secrets), ``filter_config``, ``upgrade_config`` (point de
divergence pour futures migrations de format).
``app.py`` passe de 1586 à 1244 lignes (~22 % retirés sur ce commit ;
~40 % depuis le début du chantier A). Les noms historiques avec
préfixe ``_`` sont préservés via des alias d'import dans ``app.py``.
Pytest : 3354 passed, 2 skipped, 0 failed. Ruff : All checks passed.
https://claude.ai/code/session_01Hsd7kL8yeCbXn1mA7GQK9L
- picarones/web/app.py +10 -352
- picarones/web/benchmark_utils.py +346 -0
- picarones/web/config_utils.py +57 -0
|
@@ -44,6 +44,16 @@ from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, Respon
|
|
| 44 |
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
|
| 45 |
|
| 46 |
from picarones import __version__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
from picarones.web.corpus_utils import (
|
| 48 |
analyze_corpus_dir as _analyze_corpus_dir,
|
| 49 |
flatten_zip_to_dir as _flatten_zip_to_dir,
|
|
@@ -60,7 +70,6 @@ from picarones.web.engine_utils import (
|
|
| 60 |
from picarones.web.models import (
|
| 61 |
BenchmarkRequest,
|
| 62 |
BenchmarkRunRequest,
|
| 63 |
-
CompetitorConfig,
|
| 64 |
HTRUnitedImportRequest,
|
| 65 |
HuggingFaceImportRequest,
|
| 66 |
)
|
|
@@ -647,46 +656,6 @@ async def api_normalization_profiles() -> dict:
|
|
| 647 |
# API — config save/load (Sprint 28)
|
| 648 |
# ---------------------------------------------------------------------------
|
| 649 |
|
| 650 |
-
#: Schéma versionné des configs utilisateur. Si on change le format,
|
| 651 |
-
#: bumpez ce nombre et rajoutez un upgrade path dans ``_upgrade_config``.
|
| 652 |
-
_CONFIG_SCHEMA_VERSION = 1
|
| 653 |
-
|
| 654 |
-
#: Champs autorisés dans une config sauvegardée. On filtre explicitement
|
| 655 |
-
#: pour ne pas embarquer des secrets ou des clefs serveur si le client
|
| 656 |
-
#: pousse un dict trop riche.
|
| 657 |
-
_ALLOWED_CONFIG_FIELDS: frozenset[str] = frozenset({
|
| 658 |
-
"schema_version",
|
| 659 |
-
"saved_at",
|
| 660 |
-
"label",
|
| 661 |
-
"corpus_path",
|
| 662 |
-
"engines",
|
| 663 |
-
"normalization_profile",
|
| 664 |
-
"char_exclude",
|
| 665 |
-
"lang",
|
| 666 |
-
"report_lang",
|
| 667 |
-
"output_dir",
|
| 668 |
-
"report_name",
|
| 669 |
-
"competitors",
|
| 670 |
-
})
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
def _filter_config(payload: dict) -> dict:
|
| 674 |
-
"""Ne garde que les champs autorisés, dans un ordre stable pour les diffs."""
|
| 675 |
-
out: dict[str, Any] = {}
|
| 676 |
-
for k in sorted(_ALLOWED_CONFIG_FIELDS):
|
| 677 |
-
if k in payload:
|
| 678 |
-
out[k] = payload[k]
|
| 679 |
-
return out
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
def _upgrade_config(payload: dict) -> dict:
|
| 683 |
-
"""Migre les anciennes configs vers le schéma courant.
|
| 684 |
-
|
| 685 |
-
Schéma 1 (Sprint 28) : pas de migration nécessaire — on retourne tel quel.
|
| 686 |
-
"""
|
| 687 |
-
return payload
|
| 688 |
-
|
| 689 |
-
|
| 690 |
@app.post("/api/config/save")
|
| 691 |
async def api_config_save(payload: dict) -> Response:
|
| 692 |
"""Sérialise un dict de config en JSON téléchargeable.
|
|
@@ -1182,19 +1151,6 @@ async def api_benchmark_stream(job_id: str, request: Request) -> StreamingRespon
|
|
| 1182 |
)
|
| 1183 |
|
| 1184 |
|
| 1185 |
-
def _sse_format(event_type: str, data: Any, seq: Optional[int] = None) -> str:
|
| 1186 |
-
"""Format SSE.
|
| 1187 |
-
|
| 1188 |
-
Sprint 26 — émet une ligne ``id: <seq>`` quand le ``seq`` est connu.
|
| 1189 |
-
C'est la valeur que le navigateur renvoie automatiquement dans
|
| 1190 |
-
``Last-Event-ID`` à la prochaine connexion (cf.
|
| 1191 |
-
https://html.spec.whatwg.org/multipage/server-sent-events.html).
|
| 1192 |
-
"""
|
| 1193 |
-
payload = json.dumps(data, ensure_ascii=False)
|
| 1194 |
-
head = f"id: {seq}\n" if seq is not None else ""
|
| 1195 |
-
return f"{head}event: {event_type}\ndata: {payload}\n\n"
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
# ---------------------------------------------------------------------------
|
| 1199 |
# API — benchmark/run (concurrents composés)
|
| 1200 |
# ---------------------------------------------------------------------------
|
|
@@ -1247,304 +1203,6 @@ async def api_benchmark_run(req: BenchmarkRunRequest, request: Request) -> dict:
|
|
| 1247 |
return {"job_id": job_id, "status": "pending"}
|
| 1248 |
|
| 1249 |
|
| 1250 |
-
def _build_llm_adapter(comp: CompetitorConfig) -> Any:
|
| 1251 |
-
"""Instancie un adaptateur LLM depuis la config d'un concurrent."""
|
| 1252 |
-
if comp.llm_provider == "openai":
|
| 1253 |
-
from picarones.llm.openai_adapter import OpenAIAdapter
|
| 1254 |
-
return OpenAIAdapter(model=comp.llm_model or None)
|
| 1255 |
-
elif comp.llm_provider == "anthropic":
|
| 1256 |
-
from picarones.llm.anthropic_adapter import AnthropicAdapter
|
| 1257 |
-
return AnthropicAdapter(model=comp.llm_model or None)
|
| 1258 |
-
elif comp.llm_provider == "mistral":
|
| 1259 |
-
from picarones.llm.mistral_adapter import MistralAdapter
|
| 1260 |
-
return MistralAdapter(model=comp.llm_model or None)
|
| 1261 |
-
elif comp.llm_provider == "ollama":
|
| 1262 |
-
from picarones.llm.ollama_adapter import OllamaAdapter
|
| 1263 |
-
return OllamaAdapter(model=comp.llm_model or None)
|
| 1264 |
-
else:
|
| 1265 |
-
raise ValueError(f"Provider LLM inconnu : {comp.llm_provider}")
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
-
def _engine_from_competitor(comp: CompetitorConfig) -> Any:
|
| 1269 |
-
"""Instancie un moteur OCR (ou pipeline OCR+LLM) depuis une CompetitorConfig.
|
| 1270 |
-
|
| 1271 |
-
Modes supportés :
|
| 1272 |
-
- ``ocr_engine`` = 'tesseract', 'mistral_ocr', etc. → moteur OCR seul
|
| 1273 |
-
- ``ocr_engine`` + ``llm_provider`` → pipeline OCR live + LLM
|
| 1274 |
-
- ``ocr_engine`` = 'corpus' + ``llm_provider`` → post-correction LLM
|
| 1275 |
-
avec OCR pré-calculé (fichiers .ocr.txt du corpus triplet)
|
| 1276 |
-
- ``ocr_engine`` = '' + ``llm_provider`` → LLM seul (zero-shot ou post-correction)
|
| 1277 |
-
"""
|
| 1278 |
-
engine_id = comp.ocr_engine
|
| 1279 |
-
|
| 1280 |
-
# Pipeline post-correction avec OCR pré-calculé (corpus triplet)
|
| 1281 |
-
is_corpus_ocr = engine_id in ("corpus", "")
|
| 1282 |
-
|
| 1283 |
-
if is_corpus_ocr and not comp.llm_provider:
|
| 1284 |
-
raise ValueError(
|
| 1285 |
-
"ocr_engine='corpus' nécessite un llm_provider "
|
| 1286 |
-
"(pour la post-correction ou le zero-shot)"
|
| 1287 |
-
)
|
| 1288 |
-
|
| 1289 |
-
ocr = None
|
| 1290 |
-
if not is_corpus_ocr:
|
| 1291 |
-
from picarones.engines.tesseract import TesseractEngine
|
| 1292 |
-
from picarones.engines.mistral_ocr import MistralOCREngine
|
| 1293 |
-
|
| 1294 |
-
if engine_id == "tesseract":
|
| 1295 |
-
ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
|
| 1296 |
-
elif engine_id == "mistral_ocr":
|
| 1297 |
-
ocr = MistralOCREngine(config={"model": comp.ocr_model or "mistral-ocr-latest"})
|
| 1298 |
-
elif engine_id == "google_vision":
|
| 1299 |
-
try:
|
| 1300 |
-
from picarones.engines.google_vision import GoogleVisionEngine
|
| 1301 |
-
ocr = GoogleVisionEngine(config={"detection_type": comp.ocr_model or "document_text_detection"})
|
| 1302 |
-
except ImportError as exc:
|
| 1303 |
-
raise RuntimeError("Google Vision non disponible.") from exc
|
| 1304 |
-
elif engine_id == "azure_doc_intel":
|
| 1305 |
-
try:
|
| 1306 |
-
from picarones.engines.azure_doc_intel import AzureDocIntelEngine
|
| 1307 |
-
ocr = AzureDocIntelEngine(config={"model": comp.ocr_model or "prebuilt-document"})
|
| 1308 |
-
except ImportError as exc:
|
| 1309 |
-
raise RuntimeError("Azure Document Intelligence non disponible.") from exc
|
| 1310 |
-
else:
|
| 1311 |
-
raise ValueError(f"Moteur OCR inconnu : {engine_id}")
|
| 1312 |
-
|
| 1313 |
-
if not comp.llm_provider:
|
| 1314 |
-
return ocr
|
| 1315 |
-
|
| 1316 |
-
# Pipeline OCR+LLM (live ou post-correction)
|
| 1317 |
-
_mode_map = {
|
| 1318 |
-
"text_only": "text_only",
|
| 1319 |
-
"post_correction_text": "text_only",
|
| 1320 |
-
"text_and_image": "text_and_image",
|
| 1321 |
-
"post_correction_image": "text_and_image",
|
| 1322 |
-
"zero_shot": "zero_shot",
|
| 1323 |
-
}
|
| 1324 |
-
mode = _mode_map.get(comp.pipeline_mode, "text_only")
|
| 1325 |
-
|
| 1326 |
-
llm = _build_llm_adapter(comp)
|
| 1327 |
-
|
| 1328 |
-
from picarones.pipelines.base import OCRLLMPipeline
|
| 1329 |
-
prompt = comp.prompt_file or "correction_medieval_french.txt"
|
| 1330 |
-
|
| 1331 |
-
if is_corpus_ocr:
|
| 1332 |
-
pipeline_name = comp.name or f"corpus_ocr → {comp.llm_model or comp.llm_provider}"
|
| 1333 |
-
else:
|
| 1334 |
-
pipeline_name = comp.name or f"{engine_id} → {comp.llm_model or comp.llm_provider}"
|
| 1335 |
-
|
| 1336 |
-
return OCRLLMPipeline(
|
| 1337 |
-
ocr_engine=ocr,
|
| 1338 |
-
llm_adapter=llm,
|
| 1339 |
-
mode=mode,
|
| 1340 |
-
prompt=prompt,
|
| 1341 |
-
pipeline_name=pipeline_name,
|
| 1342 |
-
)
|
| 1343 |
-
|
| 1344 |
-
|
| 1345 |
-
def _run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
|
| 1346 |
-
"""Exécute un benchmark à partir d'une liste de CompetitorConfig."""
|
| 1347 |
-
|
| 1348 |
-
job.set_status("running")
|
| 1349 |
-
job.started_at = _iso_now()
|
| 1350 |
-
job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
|
| 1351 |
-
|
| 1352 |
-
try:
|
| 1353 |
-
from picarones.core.corpus import load_corpus_from_directory
|
| 1354 |
-
from picarones.measurements.runner import run_benchmark
|
| 1355 |
-
|
| 1356 |
-
corpus = load_corpus_from_directory(req.corpus_path)
|
| 1357 |
-
job.total_docs = len(corpus)
|
| 1358 |
-
job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
|
| 1359 |
-
|
| 1360 |
-
if job.status == "cancelled":
|
| 1361 |
-
return
|
| 1362 |
-
|
| 1363 |
-
engines = []
|
| 1364 |
-
for comp in req.competitors:
|
| 1365 |
-
try:
|
| 1366 |
-
eng = _engine_from_competitor(comp)
|
| 1367 |
-
engines.append(eng)
|
| 1368 |
-
job.add_event("log", {"message": f"Concurrent : {eng.name}"})
|
| 1369 |
-
except Exception as exc:
|
| 1370 |
-
job.add_event("warning", {
|
| 1371 |
-
"message": f"Concurrent ignoré '{comp.name or comp.ocr_engine}' : {exc}"
|
| 1372 |
-
})
|
| 1373 |
-
|
| 1374 |
-
if not engines:
|
| 1375 |
-
raise ValueError("Aucun concurrent valide disponible.")
|
| 1376 |
-
|
| 1377 |
-
output_dir = Path(req.output_dir)
|
| 1378 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 1379 |
-
report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 1380 |
-
output_json = str(output_dir / f"{report_name}.json")
|
| 1381 |
-
output_html = str(output_dir / f"{report_name}.html")
|
| 1382 |
-
|
| 1383 |
-
n_engines = len(engines)
|
| 1384 |
-
total_steps = job.total_docs * n_engines
|
| 1385 |
-
step_counter = [0]
|
| 1386 |
-
|
| 1387 |
-
def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
|
| 1388 |
-
if job.status == "cancelled":
|
| 1389 |
-
return
|
| 1390 |
-
step_counter[0] += 1
|
| 1391 |
-
job.current_engine = engine_name
|
| 1392 |
-
job.processed_docs = doc_idx
|
| 1393 |
-
job.progress = step_counter[0] / max(total_steps, 1)
|
| 1394 |
-
job.add_event("progress", {
|
| 1395 |
-
"engine": engine_name,
|
| 1396 |
-
"doc_idx": doc_idx,
|
| 1397 |
-
"doc_id": doc_id,
|
| 1398 |
-
"progress": job.progress,
|
| 1399 |
-
"processed": step_counter[0],
|
| 1400 |
-
"total": total_steps,
|
| 1401 |
-
})
|
| 1402 |
-
|
| 1403 |
-
from picarones.measurements.normalization import _parse_exclude_chars
|
| 1404 |
-
char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
|
| 1405 |
-
|
| 1406 |
-
result = run_benchmark(
|
| 1407 |
-
corpus=corpus,
|
| 1408 |
-
engines=engines,
|
| 1409 |
-
output_json=output_json,
|
| 1410 |
-
show_progress=False,
|
| 1411 |
-
progress_callback=_progress_callback,
|
| 1412 |
-
char_exclude=char_excl,
|
| 1413 |
-
cancel_event=job._cancel_event,
|
| 1414 |
-
)
|
| 1415 |
-
|
| 1416 |
-
if job.status == "cancelled":
|
| 1417 |
-
return
|
| 1418 |
-
|
| 1419 |
-
job.add_event("log", {"message": "Génération du rapport HTML…"})
|
| 1420 |
-
from picarones.report.generator import ReportGenerator
|
| 1421 |
-
gen = ReportGenerator(result, lang=req.report_lang)
|
| 1422 |
-
gen.generate(output_html)
|
| 1423 |
-
|
| 1424 |
-
job.output_path = output_html
|
| 1425 |
-
job.progress = 1.0
|
| 1426 |
-
job.set_status("complete")
|
| 1427 |
-
|
| 1428 |
-
ranking = result.ranking()
|
| 1429 |
-
job.add_event("complete", {
|
| 1430 |
-
"message": "Benchmark terminé.",
|
| 1431 |
-
"output_html": output_html,
|
| 1432 |
-
"output_json": output_json,
|
| 1433 |
-
"ranking": ranking,
|
| 1434 |
-
})
|
| 1435 |
-
|
| 1436 |
-
except Exception as exc:
|
| 1437 |
-
job.set_status("error", error=str(exc))
|
| 1438 |
-
job.add_event("error", {"message": f"Erreur : {exc}"})
|
| 1439 |
-
|
| 1440 |
-
|
| 1441 |
-
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
|
| 1442 |
-
"""Exécute le benchmark dans un thread et envoie des événements SSE."""
|
| 1443 |
-
|
| 1444 |
-
job.set_status("running")
|
| 1445 |
-
job.started_at = _iso_now()
|
| 1446 |
-
job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})
|
| 1447 |
-
|
| 1448 |
-
try:
|
| 1449 |
-
from picarones.core.corpus import load_corpus_from_directory
|
| 1450 |
-
from picarones.measurements.runner import run_benchmark
|
| 1451 |
-
|
| 1452 |
-
# Charger le corpus
|
| 1453 |
-
job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
|
| 1454 |
-
corpus = load_corpus_from_directory(req.corpus_path)
|
| 1455 |
-
job.total_docs = len(corpus)
|
| 1456 |
-
job.add_event("log", {"message": f"{job.total_docs} documents chargés."})
|
| 1457 |
-
|
| 1458 |
-
if job.status == "cancelled":
|
| 1459 |
-
return
|
| 1460 |
-
|
| 1461 |
-
# Instancier les moteurs
|
| 1462 |
-
from picarones.cli import _engine_from_name
|
| 1463 |
-
import click
|
| 1464 |
-
|
| 1465 |
-
ocr_engines = []
|
| 1466 |
-
for engine_name in req.engines:
|
| 1467 |
-
try:
|
| 1468 |
-
eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
|
| 1469 |
-
ocr_engines.append(eng)
|
| 1470 |
-
job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
|
| 1471 |
-
except (click.BadParameter, Exception) as exc:
|
| 1472 |
-
job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})
|
| 1473 |
-
|
| 1474 |
-
if not ocr_engines:
|
| 1475 |
-
raise ValueError("Aucun moteur valide disponible.")
|
| 1476 |
-
|
| 1477 |
-
# Répertoire de sortie
|
| 1478 |
-
output_dir = Path(req.output_dir)
|
| 1479 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 1480 |
-
report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 1481 |
-
output_json = str(output_dir / f"{report_name}.json")
|
| 1482 |
-
output_html = str(output_dir / f"{report_name}.html")
|
| 1483 |
-
|
| 1484 |
-
# Callback de progression (injecté dans un wrapper)
|
| 1485 |
-
n_engines = len(ocr_engines)
|
| 1486 |
-
total_steps = job.total_docs * n_engines
|
| 1487 |
-
|
| 1488 |
-
step_counter = [0]
|
| 1489 |
-
|
| 1490 |
-
def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
|
| 1491 |
-
if job.status == "cancelled":
|
| 1492 |
-
return
|
| 1493 |
-
step_counter[0] += 1
|
| 1494 |
-
job.current_engine = engine_name
|
| 1495 |
-
job.processed_docs = doc_idx
|
| 1496 |
-
job.progress = step_counter[0] / max(total_steps, 1)
|
| 1497 |
-
job.add_event("progress", {
|
| 1498 |
-
"engine": engine_name,
|
| 1499 |
-
"doc_idx": doc_idx,
|
| 1500 |
-
"doc_id": doc_id,
|
| 1501 |
-
"progress": job.progress,
|
| 1502 |
-
"processed": step_counter[0],
|
| 1503 |
-
"total": total_steps,
|
| 1504 |
-
})
|
| 1505 |
-
|
| 1506 |
-
from picarones.measurements.normalization import _parse_exclude_chars
|
| 1507 |
-
char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None
|
| 1508 |
-
|
| 1509 |
-
# Lancer le benchmark
|
| 1510 |
-
result = run_benchmark(
|
| 1511 |
-
corpus=corpus,
|
| 1512 |
-
engines=ocr_engines,
|
| 1513 |
-
output_json=output_json,
|
| 1514 |
-
show_progress=False,
|
| 1515 |
-
progress_callback=_progress_callback,
|
| 1516 |
-
char_exclude=char_excl,
|
| 1517 |
-
cancel_event=job._cancel_event,
|
| 1518 |
-
)
|
| 1519 |
-
|
| 1520 |
-
if job.status == "cancelled":
|
| 1521 |
-
return
|
| 1522 |
-
|
| 1523 |
-
# Générer le rapport HTML
|
| 1524 |
-
job.add_event("log", {"message": "Génération du rapport HTML…"})
|
| 1525 |
-
from picarones.report.generator import ReportGenerator
|
| 1526 |
-
report_lang = getattr(req, "report_lang", "fr")
|
| 1527 |
-
gen = ReportGenerator(result, lang=report_lang)
|
| 1528 |
-
gen.generate(output_html)
|
| 1529 |
-
|
| 1530 |
-
job.output_path = output_html
|
| 1531 |
-
job.progress = 1.0
|
| 1532 |
-
job.set_status("complete")
|
| 1533 |
-
|
| 1534 |
-
# Classement final
|
| 1535 |
-
ranking = result.ranking()
|
| 1536 |
-
job.add_event("complete", {
|
| 1537 |
-
"message": "Benchmark terminé.",
|
| 1538 |
-
"output_html": output_html,
|
| 1539 |
-
"output_json": output_json,
|
| 1540 |
-
"ranking": ranking,
|
| 1541 |
-
})
|
| 1542 |
-
|
| 1543 |
-
except Exception as exc:
|
| 1544 |
-
job.set_status("error", error=str(exc))
|
| 1545 |
-
job.add_event("error", {"message": f"Erreur : {exc}"})
|
| 1546 |
-
|
| 1547 |
-
|
| 1548 |
# ---------------------------------------------------------------------------
|
| 1549 |
# Page principale HTML (SPA)
|
| 1550 |
# ---------------------------------------------------------------------------
|
|
|
|
| 44 |
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
|
| 45 |
|
| 46 |
from picarones import __version__
|
| 47 |
+
from picarones.web.benchmark_utils import (
|
| 48 |
+
run_benchmark_thread as _run_benchmark_thread,
|
| 49 |
+
run_benchmark_thread_v2 as _run_benchmark_thread_v2,
|
| 50 |
+
sse_format as _sse_format,
|
| 51 |
+
)
|
| 52 |
+
from picarones.web.config_utils import (
|
| 53 |
+
CONFIG_SCHEMA_VERSION as _CONFIG_SCHEMA_VERSION,
|
| 54 |
+
filter_config as _filter_config,
|
| 55 |
+
upgrade_config as _upgrade_config,
|
| 56 |
+
)
|
| 57 |
from picarones.web.corpus_utils import (
|
| 58 |
analyze_corpus_dir as _analyze_corpus_dir,
|
| 59 |
flatten_zip_to_dir as _flatten_zip_to_dir,
|
|
|
|
| 70 |
from picarones.web.models import (
|
| 71 |
BenchmarkRequest,
|
| 72 |
BenchmarkRunRequest,
|
|
|
|
| 73 |
HTRUnitedImportRequest,
|
| 74 |
HuggingFaceImportRequest,
|
| 75 |
)
|
|
|
|
| 656 |
# API — config save/load (Sprint 28)
|
| 657 |
# ---------------------------------------------------------------------------
|
| 658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
@app.post("/api/config/save")
|
| 660 |
async def api_config_save(payload: dict) -> Response:
|
| 661 |
"""Sérialise un dict de config en JSON téléchargeable.
|
|
|
|
| 1151 |
)
|
| 1152 |
|
| 1153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1154 |
# ---------------------------------------------------------------------------
|
| 1155 |
# API — benchmark/run (concurrents composés)
|
| 1156 |
# ---------------------------------------------------------------------------
|
|
|
|
| 1203 |
return {"job_id": job_id, "status": "pending"}
|
| 1204 |
|
| 1205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1206 |
# ---------------------------------------------------------------------------
|
| 1207 |
# Page principale HTML (SPA)
|
| 1208 |
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utilitaires d'exécution de benchmark côté web.
|
| 2 |
+
|
| 3 |
+
- ``sse_format`` : sérialisation d'un événement Server-Sent Events
|
| 4 |
+
avec ``Last-Event-ID`` (Sprint 26).
|
| 5 |
+
- ``build_llm_adapter`` : factory adapter LLM depuis une config
|
| 6 |
+
``CompetitorConfig``.
|
| 7 |
+
- ``engine_from_competitor`` : factory moteur OCR ou pipeline
|
| 8 |
+
OCR+LLM depuis une ``CompetitorConfig``.
|
| 9 |
+
- ``run_benchmark_thread`` / ``run_benchmark_thread_v2`` : workers
|
| 10 |
+
threadés qui exécutent le benchmark, émettent des événements SSE
|
| 11 |
+
via le ``BenchmarkJob``, génèrent le rapport HTML final.
|
| 12 |
+
|
| 13 |
+
Ces utilitaires sont consommés par les routeurs ``/api/benchmark/*``.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Any, Optional
|
| 22 |
+
|
| 23 |
+
from picarones.web.models import (
|
| 24 |
+
BenchmarkRequest,
|
| 25 |
+
BenchmarkRunRequest,
|
| 26 |
+
CompetitorConfig,
|
| 27 |
+
)
|
| 28 |
+
from picarones.web.state import BenchmarkJob, iso_now
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def sse_format(event_type: str, data: Any, seq: Optional[int] = None) -> str:
    """Serialize one Server-Sent Events frame.

    The frame consists of an optional ``id: <seq>`` line (written only when
    ``seq`` is not ``None``), an ``event:`` line carrying ``event_type`` and a
    ``data:`` line carrying ``data`` encoded as JSON with non-ASCII characters
    left unescaped. A blank line terminates the frame.

    Sprint 26: the ``id`` value is what the browser sends back automatically
    in ``Last-Event-ID`` on its next connection (see
    https://html.spec.whatwg.org/multipage/server-sent-events.html).
    """
    # Encode first so that a non-serializable payload fails before any
    # part of the frame is assembled.
    body = json.dumps(data, ensure_ascii=False)

    lines = []
    if seq is not None:
        lines.append(f"id: {seq}")
    lines.append(f"event: {event_type}")
    lines.append(f"data: {body}")

    # Two trailing newlines: one ends the data line, one ends the frame.
    return "\n".join(lines) + "\n\n"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def build_llm_adapter(comp: CompetitorConfig) -> Any:
    """Build the LLM adapter selected by a competitor configuration.

    ``comp.llm_provider`` picks the adapter class (``openai``, ``anthropic``,
    ``mistral`` or ``ollama``). Each adapter module is imported only inside
    the branch of its own provider. ``comp.llm_model`` is forwarded as the
    ``model`` argument; a falsy value is passed as ``None``.

    Raises:
        ValueError: if ``comp.llm_provider`` is not one of the providers above.
    """
    provider = comp.llm_provider

    # Resolve the adapter class first; it is instantiated once, below.
    if provider == "openai":
        from picarones.llm.openai_adapter import OpenAIAdapter as adapter_cls
    elif provider == "anthropic":
        from picarones.llm.anthropic_adapter import AnthropicAdapter as adapter_cls
    elif provider == "mistral":
        from picarones.llm.mistral_adapter import MistralAdapter as adapter_cls
    elif provider == "ollama":
        from picarones.llm.ollama_adapter import OllamaAdapter as adapter_cls
    else:
        raise ValueError(f"Provider LLM inconnu : {provider}")

    return adapter_cls(model=comp.llm_model or None)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def engine_from_competitor(comp: CompetitorConfig) -> Any:
    """Build an OCR engine, or an OCR+LLM pipeline, from a ``CompetitorConfig``.

    Supported modes:

    - ``ocr_engine`` = ``tesseract``, ``mistral_ocr``, ``google_vision`` or
      ``azure_doc_intel`` with no ``llm_provider`` -> the bare OCR engine.
    - ``ocr_engine`` + ``llm_provider`` -> live OCR followed by an LLM.
    - ``ocr_engine`` = ``corpus`` + ``llm_provider`` -> LLM post-correction on
      pre-computed OCR (the ``.ocr.txt`` files of the triplet corpus).
    - ``ocr_engine`` = ``""`` + ``llm_provider`` -> LLM alone (zero-shot or
      post-correction).

    Each engine module is imported only inside the branch that needs it, so
    choosing one engine (or passing an unknown id) never depends on another
    engine's module being importable.

    Raises:
        ValueError: if ``ocr_engine`` is ``corpus``/``""`` without an
            ``llm_provider``, or if ``ocr_engine`` is not a known engine id.
        RuntimeError: if the Google Vision or Azure engine raises
            ``ImportError`` while being imported or constructed.
    """
    engine_id = comp.ocr_engine
    # "corpus" and "" both mean that no live OCR engine is instantiated here.
    is_corpus_ocr = engine_id in ("corpus", "")

    if is_corpus_ocr and not comp.llm_provider:
        raise ValueError(
            "ocr_engine='corpus' nécessite un llm_provider "
            "(pour la post-correction ou le zero-shot)"
        )

    ocr = None
    if not is_corpus_ocr:
        if engine_id == "tesseract":
            from picarones.engines.tesseract import TesseractEngine

            ocr = TesseractEngine(config={"lang": comp.ocr_model or "fra", "psm": 6})
        elif engine_id == "mistral_ocr":
            from picarones.engines.mistral_ocr import MistralOCREngine

            ocr = MistralOCREngine(config={"model": comp.ocr_model or "mistral-ocr-latest"})
        elif engine_id == "google_vision":
            try:
                from picarones.engines.google_vision import GoogleVisionEngine

                ocr = GoogleVisionEngine(
                    config={"detection_type": comp.ocr_model or "document_text_detection"},
                )
            except ImportError as exc:
                raise RuntimeError("Google Vision non disponible.") from exc
        elif engine_id == "azure_doc_intel":
            try:
                from picarones.engines.azure_doc_intel import AzureDocIntelEngine

                ocr = AzureDocIntelEngine(
                    config={"model": comp.ocr_model or "prebuilt-document"},
                )
            except ImportError as exc:
                raise RuntimeError("Azure Document Intelligence non disponible.") from exc
        else:
            raise ValueError(f"Moteur OCR inconnu : {engine_id}")

    # No LLM requested: the bare OCR engine is the competitor.
    if not comp.llm_provider:
        return ocr

    # OCR+LLM pipeline (live or post-correction). Several UI-facing mode names
    # collapse onto the three pipeline modes; unknown names fall back to
    # "text_only".
    mode_map = {
        "text_only": "text_only",
        "post_correction_text": "text_only",
        "text_and_image": "text_and_image",
        "post_correction_image": "text_and_image",
        "zero_shot": "zero_shot",
    }
    mode = mode_map.get(comp.pipeline_mode, "text_only")

    llm = build_llm_adapter(comp)

    from picarones.pipelines.base import OCRLLMPipeline

    prompt = comp.prompt_file or "correction_medieval_french.txt"

    # Default display name: "<ocr source> → <LLM model or provider>".
    ocr_label = "corpus_ocr" if is_corpus_ocr else engine_id
    pipeline_name = comp.name or f"{ocr_label} → {comp.llm_model or comp.llm_provider}"

    return OCRLLMPipeline(
        ocr_engine=ocr,
        llm_adapter=llm,
        mode=mode,
        prompt=prompt,
        pipeline_name=pipeline_name,
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def run_benchmark_thread_v2(job: BenchmarkJob, req: BenchmarkRunRequest) -> None:
    """Run a benchmark described by a list of ``CompetitorConfig`` entries.

    The function marks ``job`` as running, loads the corpus found at
    ``req.corpus_path``, builds one engine per entry of ``req.competitors``
    (an entry whose construction raises is reported through a ``warning``
    event and skipped), calls ``run_benchmark`` and then generates the HTML
    report under ``req.output_dir``.

    Everything is reported through ``job.add_event``: ``start``, ``log``,
    ``warning``, ``progress`` and finally ``complete`` (with the ranking).
    An exception escaping the main sequence is not re-raised; it becomes an
    ``error`` status plus an ``error`` event. When ``job.status`` is found to
    be ``"cancelled"`` at one of the checkpoints, the function returns
    without emitting further events.
    """
    job.set_status("running")
    job.started_at = iso_now()
    start_payload = {"message": "Démarrage du benchmark…", "corpus": req.corpus_path}
    job.add_event("start", start_payload)

    try:
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.measurements.runner import run_benchmark

        documents = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(documents)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})

        # Checkpoint: cancelled while the corpus was loading.
        if job.status == "cancelled":
            return

        contenders = []
        for entry in req.competitors:
            try:
                built = engine_from_competitor(entry)
                contenders.append(built)
                job.add_event("log", {"message": f"Concurrent : {built.name}"})
            except Exception as err:  # noqa: BLE001
                # Best effort: one broken competitor must not abort the run.
                job.add_event("warning", {
                    "message": f"Concurrent ignoré '{entry.name or entry.ocr_engine}' : {err}"
                })

        if not contenders:
            raise ValueError("Aucun concurrent valide disponible.")

        target_dir = Path(req.output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        stem = req.report_name
        if not stem:
            # Fall back to a timestamped report name.
            stem = f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        json_path = str(target_dir / f"{stem}.json")
        html_path = str(target_dir / f"{stem}.html")

        # One step per (document, engine) pair.
        expected_steps = job.total_docs * len(contenders)
        completed_steps = 0

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            nonlocal completed_steps
            if job.status == "cancelled":
                return
            completed_steps += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) avoids a division by zero when there are no steps.
            job.progress = completed_steps / max(expected_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": completed_steps,
                "total": expected_steps,
            })

        from picarones.measurements.normalization import _parse_exclude_chars
        if req.char_exclude:
            excluded_chars = _parse_exclude_chars(req.char_exclude)
        else:
            excluded_chars = None

        outcome = run_benchmark(
            corpus=documents,
            engines=contenders,
            output_json=json_path,
            show_progress=False,
            progress_callback=_progress_callback,
            char_exclude=excluded_chars,
            cancel_event=job._cancel_event,
        )

        # Checkpoint: cancelled during the run, so no report is generated.
        if job.status == "cancelled":
            return

        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator
        ReportGenerator(outcome, lang=req.report_lang).generate(html_path)

        job.output_path = html_path
        job.progress = 1.0
        job.set_status("complete")

        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": html_path,
            "output_json": json_path,
            "ranking": outcome.ranking(),
        })

    except Exception as err:  # noqa: BLE001
        # Worker boundary: surface the failure on the job instead of raising.
        job.set_status("error", error=str(err))
        job.add_event("error", {"message": f"Erreur : {err}"})
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
    """Run the legacy benchmark (route ``/api/benchmark/start``).

    Worker entry point. Everything is reported through ``job``: status
    changes, progress fields and events. Any exception raised inside the
    ``try`` block is caught and recorded as an ``"error"`` status plus an
    ``"error"`` event instead of propagating.

    The function returns early, without adding further events, when
    ``job.status`` is ``"cancelled"`` right after the corpus is loaded or
    right after ``run_benchmark`` returns.

    Args:
        job: Job record updated in place as the benchmark advances.
        req: Legacy request. The fields read here are ``corpus_path``,
            ``engines``, ``lang``, ``output_dir``, ``report_name`` and
            ``char_exclude``; ``report_lang`` is read if present and
            defaults to ``"fr"`` otherwise.
    """
    job.set_status("running")
    job.started_at = iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})

    try:
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.measurements.runner import run_benchmark

        # Load the corpus.
        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})

        if job.status == "cancelled":
            return

        # Instantiate the engines. An engine that cannot be built is reported
        # as a warning and skipped rather than aborting the run.
        from picarones.cli import _engine_from_name

        ocr_engines = []
        for engine_name in req.engines:
            try:
                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
                ocr_engines.append(eng)
                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
            # ``click.BadParameter`` is an ``Exception`` subclass, so a plain
            # ``except Exception`` already covers it.
            except Exception as exc:  # noqa: BLE001
                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})

        if not ocr_engines:
            raise ValueError("Aucun moteur valide disponible.")

        # Output locations: <output_dir>/<report_name>.json and .html.
        output_dir = Path(req.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(output_dir / f"{report_name}.json")
        output_html = str(output_dir / f"{report_name}.html")

        # Progress callback: one step per (engine, document) pair.
        total_steps = job.total_docs * len(ocr_engines)
        steps_done = 0

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            """Record one finished step on the job and emit a progress event."""
            nonlocal steps_done
            if job.status == "cancelled":
                return
            steps_done += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) guards against a division by zero on an empty corpus.
            job.progress = steps_done / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": steps_done,
                "total": total_steps,
            })

        from picarones.measurements.normalization import _parse_exclude_chars
        char_excl = _parse_exclude_chars(req.char_exclude) if req.char_exclude else None

        result = run_benchmark(
            corpus=corpus,
            engines=ocr_engines,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
            char_exclude=char_excl,
            # Presumably lets the runner stop early on cancellation; confirm
            # against run_benchmark.
            cancel_event=job._cancel_event,
        )

        if job.status == "cancelled":
            return

        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator
        # The legacy request model may not carry ``report_lang``.
        report_lang = getattr(req, "report_lang", "fr")
        gen = ReportGenerator(result, lang=report_lang)
        gen.generate(output_html)

        # Compute the ranking before switching to the terminal status: if it
        # raises, the job goes straight to "error" instead of being marked
        # "complete" first and then rewritten to "error".
        ranking = result.ranking()

        job.output_path = output_html
        job.progress = 1.0
        job.set_status("complete")
        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": ranking,
        })

    except Exception as exc:  # noqa: BLE001
        job.set_status("error", error=str(exc))
        job.add_event("error", {"message": f"Erreur : {exc}"})
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
# Public API of the benchmark utilities module.
__all__ = [
    "sse_format",
    "build_llm_adapter", "engine_from_competitor",
    "run_benchmark_thread", "run_benchmark_thread_v2",
]
|
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utilitaires de validation et migration des configs utilisateur.
|
| 2 |
+
|
| 3 |
+
Sprint 28 — supprime la friction *« reconfigurer chaque session »* :
|
| 4 |
+
le client peut télécharger sa config en JSON et la réimporter plus
|
| 5 |
+
tard. Ce module définit le schéma versionné et les règles de filtrage
|
| 6 |
+
qui empêchent qu'un payload trop riche n'embarque des secrets ou des
|
| 7 |
+
clés serveur.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from typing import Any
|
| 13 |
+
|
| 14 |
+
# Version of the saved-config format. Bump it whenever the format changes and
# add the matching upgrade path in ``upgrade_config``.
CONFIG_SCHEMA_VERSION = 1

# Allow-list of the fields that may appear in a saved configuration.
ALLOWED_CONFIG_FIELDS: frozenset[str] = frozenset((
    "char_exclude",
    "competitors",
    "corpus_path",
    "engines",
    "label",
    "lang",
    "normalization_profile",
    "output_dir",
    "report_lang",
    "report_name",
    "saved_at",
    "schema_version",
))
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def filter_config(payload: dict) -> dict:
    """Return a new dict holding only the allow-listed fields of ``payload``.

    Fields are inserted in sorted key order so that saved configurations
    produce stable diffs. Keys absent from ``payload`` are simply omitted.
    """
    return {
        field: payload[field]
        for field in sorted(ALLOWED_CONFIG_FIELDS)
        if field in payload
    }
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def upgrade_config(payload: dict) -> dict:
    """Bring a saved configuration up to the current schema.

    Schema 1 (Sprint 28) is the only format so far, so there is nothing to
    migrate: the very same ``payload`` object is handed back unchanged.
    When the format evolves, branch on ``schema_version`` here.
    """
    upgraded = payload  # no migration step exists yet
    return upgraded
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Public API of the config utilities module.
__all__ = [
    "CONFIG_SCHEMA_VERSION", "ALLOWED_CONFIG_FIELDS",
    "filter_config", "upgrade_config",
]
|