Spaces:
Sleeping
Sleeping
Merge pull request #3 from maribakulj/claude/setup-picarones-project-FKKns
Browse files- .gitignore +3 -0
- picarones/cli.py +165 -0
- picarones/core/char_scores.py +360 -0
- picarones/core/confusion.py +264 -0
- picarones/core/image_quality.py +395 -0
- picarones/core/metrics.py +58 -6
- picarones/core/normalization.py +286 -0
- picarones/core/results.py +56 -0
- picarones/core/runner.py +171 -1
- picarones/core/structure.py +230 -0
- picarones/core/taxonomy.py +351 -0
- picarones/engines/__init__.py +11 -1
- picarones/engines/azure_doc_intel.py +153 -0
- picarones/engines/google_vision.py +133 -0
- picarones/engines/mistral_ocr.py +91 -0
- picarones/fixtures.py +78 -12
- picarones/importers/__init__.py +5 -0
- picarones/importers/htr_united.py +449 -0
- picarones/importers/huggingface.py +427 -0
- picarones/importers/iiif.py +583 -0
- picarones/report/generator.py +436 -4
- picarones/web/__init__.py +1 -0
- picarones/web/app.py +1634 -0
- pyproject.toml +3 -1
- rapport_demo.html +0 -0
- tests/test_sprint4_normalization_iiif.py +834 -0
- tests/test_sprint5_advanced_metrics.py +876 -0
- tests/test_sprint6_web_interface.py +982 -0
.gitignore
CHANGED
|
@@ -16,3 +16,6 @@ venv/
|
|
| 16 |
*.html
|
| 17 |
results*.json
|
| 18 |
rapport*.html
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
*.html
|
| 17 |
results*.json
|
| 18 |
rapport*.html
|
| 19 |
+
rapports/
|
| 20 |
+
corpus_*/
|
| 21 |
+
corpus/
|
picarones/cli.py
CHANGED
|
@@ -381,5 +381,170 @@ def demo_cmd(output: str, docs: int, json_output: str | None) -> None:
|
|
| 381 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 382 |
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if __name__ == "__main__":
|
| 385 |
cli()
|
|
|
|
| 381 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 382 |
|
| 383 |
|
| 384 |
+
# ---------------------------------------------------------------------------
|
| 385 |
+
# picarones import (groupe de sous-commandes)
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
# Sub-command group `picarones import ...`. The docstring below is shown by
# click as the group's help text, so it is kept in the CLI's language.
@cli.group("import")
def import_group() -> None:
    """Importe un corpus depuis une source distante (IIIF, HuggingFace…)."""
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
@import_group.command("iiif")
@click.argument("manifest_url")
@click.option(
    "--pages", "-p",
    default="all",
    show_default=True,
    help=(
        "Pages à importer. Formats : '1-10', '1,3,5', '1-5,10,15-20', 'all'. "
        "Les numéros sont 1-based (1 = première page du manifeste)."
    ),
)
@click.option(
    "--output", "-o",
    default="./corpus_iiif/",
    show_default=True,
    type=click.Path(resolve_path=True),
    help="Dossier de destination pour les images et les fichiers .gt.txt",
)
@click.option(
    "--max-resolution",
    default=0,
    type=int,
    show_default=True,
    help="Résolution maximale des images téléchargées (largeur en pixels). 0 = max disponible.",
)
@click.option("--no-progress", is_flag=True, default=False, help="Désactive la barre de progression")
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
def import_iiif_cmd(
    manifest_url: str,
    pages: str,
    output: str,
    max_resolution: int,
    no_progress: bool,
    verbose: bool,
) -> None:
    """Importe un corpus depuis un manifeste IIIF (v2 ou v3).

    MANIFEST_URL : URL du manifeste IIIF (Gallica, Bodleian, BL, BSB…)

    Exemples :

    \b
      picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json
      picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1-10
      picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1,3,5-8 --output ./mon_corpus/

    Les images sont téléchargées dans le dossier de sortie.
    Des fichiers .gt.txt vides (ou remplis si le manifeste contient des annotations
    de transcription) sont créés à côté de chaque image.
    """
    # NOTE: the docstring above is the command's --help text (click reads it
    # at runtime), which is why it is not translated.
    _setup_logging(verbose)

    # Imported lazily, inside the command, as in the original code.
    from picarones.importers.iiif import IIIFImporter

    click.echo(f"Manifeste IIIF : {manifest_url}")

    try:
        importer = IIIFImporter(manifest_url, max_resolution=max_resolution)
        importer.load()

        # Report what the manifest offers before downloading anything.
        all_canvases = importer.parser.canvases()
        click.echo(
            f"Manifeste IIIF v{importer.parser.version} — "
            f"titre : {importer.parser.label} — "
            f"{len(all_canvases)} canvas disponibles"
        )

        # Only the size of the selection is used here; import_corpus() is
        # given the raw page spec again.
        selected = importer.list_canvases(pages)
        click.echo(f"Pages sélectionnées : {len(selected)} sur {len(all_canvases)}")

        corpus = importer.import_corpus(
            pages=pages,
            output_dir=output,
            show_progress=not no_progress,
        )

    except (ValueError, RuntimeError) as exc:
        # Any failure in the block above ends the command with exit status 1.
        click.echo(f"Erreur import IIIF : {exc}", err=True)
        sys.exit(1)

    click.echo(f"\n{len(corpus)} documents importés dans : {output}")

    # Summary: how many documents came with a non-blank ground truth.
    gt_filled = sum(1 for d in corpus.documents if d.ground_truth.strip())
    if gt_filled:
        click.echo(f"Transcriptions trouvées dans le manifeste : {gt_filled}/{len(corpus)}")
    else:
        click.echo(
            "Aucune transcription dans le manifeste — "
            "les fichiers .gt.txt sont vides (à remplir manuellement ou via OCR)."
        )

    click.echo("\nPour lancer un benchmark sur ce corpus :")
    click.echo(f" picarones run --corpus {output} --engines tesseract")
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# ---------------------------------------------------------------------------
|
| 490 |
+
# picarones serve
|
| 491 |
+
# ---------------------------------------------------------------------------
|
| 492 |
+
|
| 493 |
+
@cli.command("serve")
@click.option(
    "--host",
    default="127.0.0.1",
    show_default=True,
    help="Adresse d'écoute du serveur web",
)
@click.option(
    "--port", "-p",
    default=8000,
    show_default=True,
    type=click.IntRange(1, 65535),
    help="Port d'écoute du serveur web",
)
@click.option("--reload", is_flag=True, default=False, help="Mode rechargement automatique (développement)")
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
def serve_cmd(host: str, port: int, reload: bool, verbose: bool) -> None:
    """Lance l'interface web locale Picarones sur localhost.

    Accessible dans le navigateur à l'adresse : http://HOST:PORT

    \b
    Exemples :
      picarones serve
      picarones serve --port 8080
      picarones serve --host 0.0.0.0 --port 8000
    """
    # NOTE: the docstring above is the command's --help text (click reads it
    # at runtime), which is why it is not translated.
    _setup_logging(verbose)

    # uvicorn is optional: fail with an install hint instead of a traceback.
    try:
        import uvicorn
    except ImportError:
        click.echo(
            "uvicorn n'est pas installé. Installez-le avec :\n"
            " pip install uvicorn[standard]\n"
            "ou :\n"
            " pip install picarones[web]",
            err=True,
        )
        sys.exit(1)

    # An IPv6 literal must be bracketed inside a URL (RFC 3986 §3.2.2);
    # this only affects the message printed below, not the bind address.
    url_host = f"[{host}]" if ":" in host else host
    url = f"http://{url_host}:{port}"
    click.echo("Picarones — Interface web locale")
    click.echo(f"Démarrage du serveur sur {url}")
    click.echo("Appuyez sur Ctrl+C pour arrêter.\n")

    log_level = "debug" if verbose else "info"
    # The app is passed as an import string rather than as an object.
    uvicorn.run(
        "picarones.web.app:app",
        host=host,
        port=port,
        reload=reload,
        log_level=log_level,
    )
|
| 547 |
+
|
| 548 |
+
|
| 549 |
# Allow running the CLI module directly as a script.
if __name__ == "__main__":
    cli()
|
picarones/core/char_scores.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scores de reconnaissance des ligatures et des diacritiques.
|
| 2 |
+
|
| 3 |
+
Ces métriques sont spécifiques aux documents patrimoniaux (manuscrits, imprimés
|
| 4 |
+
anciens) où ligatures et diacritiques jouent un rôle paléographique essentiel.
|
| 5 |
+
|
| 6 |
+
Ligatures
|
| 7 |
+
---------
|
| 8 |
+
Caractères encodés comme une séquence unique dans Unicode mais représentant
|
| 9 |
+
deux ou plusieurs glyphes fusionnés : ﬁ (fi), ﬂ (fl), œ, æ, etc.
|
| 10 |
+
|
| 11 |
+
Pour chaque ligature présente dans le GT, on vérifie si l'OCR a produit
|
| 12 |
+
soit le caractère Unicode équivalent, soit la séquence décomposée équivalente.
|
| 13 |
+
|
| 14 |
+
Diacritiques
|
| 15 |
+
-----------
|
| 16 |
+
Accents, cédilles, trémas et autres signes diacritiques. Pour chaque caractère
|
| 17 |
+
accentué dans le GT, on vérifie si l'OCR a conservé le diacritique ou l'a
|
| 18 |
+
remplacé par la lettre de base.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
from typing import Optional
|
| 25 |
+
|
| 26 |
+
import unicodedata
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Tables de ligatures (char ligature → séquences équivalentes acceptées)
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
#: Main ligature table and the expansions accepted as equivalent.
#: Key = Unicode ligature character; value = list of equivalent sequences.
LIGATURE_TABLE: dict[str, list[str]] = {
    # Latin typographic ligatures (Unicode block "Alphabetic Presentation Forms")
    "\uFB00": ["ff"],              # ﬀ -> ff
    "\uFB01": ["fi"],              # ﬁ -> fi
    "\uFB02": ["fl"],              # ﬂ -> fl
    "\uFB03": ["ffi"],             # ﬃ -> ffi
    "\uFB04": ["ffl"],             # ﬄ -> ffl
    "\uFB05": ["st", "\u017Ft"],   # ﬅ -> st / ſt (long s + t)
    "\uFB06": ["st"],              # ﬆ -> st (variant)
    # Heritage Latin digraph letters (Latin-1 Supplement / Latin Extended-A)
    "\u0153": ["oe"],              # œ -> oe
    "\u00E6": ["ae"],              # æ -> ae
    "\u0152": ["OE"],              # Œ -> OE
    "\u00C6": ["AE"],              # Æ -> AE
    # Latin / medieval abbreviations (Latin Extended-D)
    "\uA751": ["per", "p\u0332"],  # ꝑ -> per / p + combining low line
    "\uA753": ["pro"],             # ꝓ -> pro
    "\uA757": ["que"],             # ꝗ -> que
    # Germanic ligatures
    "\u00DF": ["ss"],              # ß -> ss
    "\u1E9E": ["SS"],              # ẞ -> SS
}

# Set of every ligature character, for fast membership tests.
_ALL_LIGATURES: frozenset[str] = frozenset(LIGATURE_TABLE)

# Reverse mapping: expanded sequence -> ligature character.
# When a sequence is listed under several ligatures (e.g. "st"), the entry
# that comes last in LIGATURE_TABLE wins.
_SEQ_TO_LIGATURE: dict[str, str] = {}
for _lig, _seqs in LIGATURE_TABLE.items():
    for _seq in _seqs:
        _SEQ_TO_LIGATURE[_seq] = _lig
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
# Table des caractères diacritiques
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
|
| 72 |
+
def _build_diacritic_map() -> dict[str, str]:
    """Build the accented-character -> base-letter table from Unicode data.

    Every code point in U+00C0..U+024F (upper part of Latin-1 Supplement,
    Latin Extended-A and Latin Extended-B) whose NFD form has more than one
    code point is mapped to the first code point of that decomposition,
    provided this first code point is alphabetic and differs from the
    character itself.

    A few manual entries are then added. Most of them duplicate what the
    automatic pass already produces; the stroked ``ł`` / ``Ł`` have no
    canonical decomposition and can only come from the manual list.

    Returns
    -------
    dict[str, str]
        Mapping from a precomposed character to its base letter.
    """
    table: dict[str, str] = {}
    for codepoint in range(0x00C0, 0x0250):
        ch = chr(codepoint)
        nfd = unicodedata.normalize("NFD", ch)
        if len(nfd) > 1:  # the character has a canonical decomposition
            base = nfd[0]  # base letter comes first, combining marks follow
            if base.isalpha() and base != ch:
                table[ch] = base
    # Manual complements
    table.update({
        "\u0107": "c",  # ć
        "\u0119": "e",  # ę
        "\u0141": "L",  # Ł (uppercase counterpart of ł below)
        "\u0142": "l",  # ł
        "\u0144": "n",  # ń
        "\u015B": "s",  # ś
        "\u017A": "z",  # ź
        "\u017C": "z",  # ż
    })
    return table
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
#: Accented character -> base letter, built once at import time.
DIACRITIC_MAP: dict[str, str] = _build_diacritic_map()
# Set of every accented character, for fast membership tests.
_ALL_DIACRITICS: frozenset[str] = frozenset(DIACRITIC_MAP)

# Ligature characters, which are NOT diacritics; the diacritic scorer uses
# this set to avoid counting a character twice.
_LIGATURE_SET: frozenset[str] = frozenset(LIGATURE_TABLE)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# Résultats structurés
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
@dataclass
class LigatureScore:
    """Ligature recognition score for one (GT, OCR) pair."""

    #: Number of ligatures present in the ground truth.
    total_in_gt: int = 0
    #: Number of ligatures transcribed correctly (Unicode form or equivalent).
    correctly_recognized: int = 0
    #: Recognition rate = correctly_recognized / total_in_gt; 1.0 when total is 0.
    score: float = 0.0
    #: Per-ligature detail, keyed by the ligature character, e.g.
    #: ``{'ﬁ': {'gt_count': 5, 'ocr_correct': 3, 'score': 0.6}}``.
    per_ligature: dict[str, dict] = field(default_factory=dict)

    def as_dict(self) -> dict:
        """Return a plain-dict view with every float rounded to 4 decimals."""
        rounded_detail = {
            lig: {
                key: round(value, 4) if isinstance(value, float) else value
                for key, value in stats.items()
            }
            for lig, stats in self.per_ligature.items()
        }
        return {
            "total_in_gt": self.total_in_gt,
            "correctly_recognized": self.correctly_recognized,
            "score": round(self.score, 4),
            "per_ligature": rounded_detail,
        }
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@dataclass
class DiacriticScore:
    """Diacritic preservation score for one (GT, OCR) pair."""

    #: Number of accented characters in the ground truth.
    total_in_gt: int = 0
    #: Number of diacritics preserved correctly.
    correctly_recognized: int = 0
    #: Preservation rate = correctly_recognized / total_in_gt; 1.0 when total is 0.
    score: float = 0.0
    #: Per-character detail, keyed by the accented character.
    per_diacritic: dict[str, dict] = field(default_factory=dict)

    def as_dict(self) -> dict:
        """Return a plain-dict view with every float rounded to 4 decimals."""
        rounded_detail = {
            char: {
                key: round(value, 4) if isinstance(value, float) else value
                for key, value in stats.items()
            }
            for char, stats in self.per_diacritic.items()
        }
        return {
            "total_in_gt": self.total_in_gt,
            "correctly_recognized": self.correctly_recognized,
            "score": round(self.score, 4),
            "per_diacritic": rounded_detail,
        }
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ---------------------------------------------------------------------------
|
| 157 |
+
# Calcul des scores
|
| 158 |
+
# ---------------------------------------------------------------------------
|
| 159 |
+
|
| 160 |
+
def compute_ligature_score(ground_truth: str, hypothesis: str) -> LigatureScore:
    """Compute the ligature recognition score.

    For each ligature character found in the ground truth, the OCR output is
    accepted when it contains either:

    - the same Unicode ligature character (e.g. ﬁ -> ﬁ), or
    - one of the equivalent letter sequences from ``LIGATURE_TABLE``
      (e.g. ﬁ -> fi).

    Both count as correct, matching editorial practice for heritage texts
    (some editors expand ligatures).

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR.

    Returns
    -------
    LigatureScore
        Overall and per-ligature counts and rates. The score is 1.0 when the
        ground truth is empty or contains no ligature.
    """
    if not ground_truth:
        return LigatureScore(score=1.0)

    # Both texts are put in NFC before being scanned and compared.
    hyp_norm = unicodedata.normalize("NFC", hypothesis)
    gt_norm = unicodedata.normalize("NFC", ground_truth)

    per_lig: dict[str, dict] = {}
    total = 0
    correct = 0

    # Visit every ligature occurrence in the ground truth.
    for i, ch in enumerate(gt_norm):
        if ch not in _ALL_LIGATURES:
            continue
        total += 1
        # Accepted outputs: the ligature itself or any listed expansion.
        equivalents = [ch] + LIGATURE_TABLE[ch]
        is_correct = _check_char_at_context(gt_norm, hyp_norm, i, ch, equivalents)

        stats = per_lig.setdefault(ch, {"gt_count": 0, "ocr_correct": 0, "score": 0.0})
        stats["gt_count"] += 1
        if is_correct:
            correct += 1
            stats["ocr_correct"] += 1

    # Per-ligature rates; gt_count is at least 1 for every recorded entry.
    for lig_data in per_lig.values():
        lig_data["score"] = lig_data["ocr_correct"] / lig_data["gt_count"]

    score = correct / total if total > 0 else 1.0
    return LigatureScore(
        total_in_gt=total,
        correctly_recognized=correct,
        score=score,
        per_ligature=per_lig,
    )
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def compute_diacritic_score(ground_truth: str, hypothesis: str) -> DiacriticScore:
    """Compute the diacritic preservation score.

    For each accented character of the ground truth, check whether the OCR
    produced the same character at the aligned position. Both texts are
    NFC-normalized first, so decomposed (NFD) input compares equal to its
    precomposed form.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR.

    Returns
    -------
    DiacriticScore
        Overall and per-character counts and rates. The score is 1.0 when
        the ground truth is empty or contains no accented character.
    """
    if not ground_truth:
        return DiacriticScore(score=1.0)

    gt_norm = unicodedata.normalize("NFC", ground_truth)
    hyp_norm = unicodedata.normalize("NFC", hypothesis)

    per_diac: dict[str, dict] = {}
    total = 0
    correct = 0

    # Align the two strings with difflib (imported locally, as in the original).
    import difflib
    matcher = difflib.SequenceMatcher(None, gt_norm, hyp_norm, autojunk=False)

    # GT index -> hypothesis index. Only 'equal' runs and same-length
    # 'replace' runs give a one-to-one mapping; positions in any other
    # opcode stay unmapped and are scored as not preserved.
    gt_to_hyp: dict[int, int] = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        same_length_replace = tag == "replace" and (i2 - i1) == (j2 - j1)
        if tag == "equal" or same_length_replace:
            for k in range(i2 - i1):
                gt_to_hyp[i1 + k] = j1 + k

    for i, ch in enumerate(gt_norm):
        # Ligature characters are excluded from this metric.
        if ch not in _ALL_DIACRITICS or ch in _LIGATURE_SET:
            continue
        total += 1
        hyp_pos = gt_to_hyp.get(i)
        is_correct = hyp_pos is not None and hyp_norm[hyp_pos] == ch

        stats = per_diac.setdefault(ch, {"gt_count": 0, "ocr_correct": 0, "score": 0.0})
        stats["gt_count"] += 1
        if is_correct:
            correct += 1
            stats["ocr_correct"] += 1

    # Per-character rates; gt_count is at least 1 for every recorded entry.
    for diac_data in per_diac.values():
        diac_data["score"] = diac_data["ocr_correct"] / diac_data["gt_count"]

    score = correct / total if total > 0 else 1.0
    return DiacriticScore(
        total_in_gt=total,
        correctly_recognized=correct,
        score=score,
        per_diacritic=per_diac,
    )
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def _check_char_at_context(
    gt: str,
    hyp: str,
    gt_pos: int,
    gt_char: str,
    equivalents: list[str],
) -> bool:
    """Return True if the hypothesis contains any of the accepted equivalents.

    NOTE: despite its name, this check is not positional. ``gt``, ``gt_pos``
    and ``gt_char`` are accepted but not used; every equivalent is searched
    for in the whole of ``hyp``. A single occurrence anywhere in the
    hypothesis therefore validates every occurrence of the ligature in the
    ground truth.

    Parameters
    ----------
    gt:
        Ground-truth text (currently unused).
    hyp:
        OCR text that is searched.
    gt_pos:
        Index of the character in ``gt`` (currently unused).
    gt_char:
        The ground-truth character being checked (currently unused).
    equivalents:
        Strings accepted as a correct rendering.

    Returns
    -------
    bool
        True when at least one equivalent is a substring of ``hyp``.
    """
    return any(equiv in hyp for equiv in equivalents)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def aggregate_ligature_scores(scores: list[LigatureScore]) -> dict:
    """Aggregate ligature scores over a corpus.

    Counts are summed across documents (micro-average), both globally and
    per ligature; the input objects are not modified.

    Parameters
    ----------
    scores:
        Per-document ligature scores.

    Returns
    -------
    dict
        ``score`` (rounded to 4 decimals, 1.0 when no ligature was seen),
        ``total_in_gt``, ``correctly_recognized`` and ``per_ligature``
        (``gt_count`` / ``ocr_correct`` / unrounded ``score`` per ligature).
    """
    total_gt = sum(s.total_in_gt for s in scores)
    total_correct = sum(s.correctly_recognized for s in scores)
    score = total_correct / total_gt if total_gt > 0 else 1.0

    # Sum the per-ligature counters across documents.
    per_lig: dict[str, dict] = {}
    for s in scores:
        for lig, data in s.per_ligature.items():
            merged = per_lig.setdefault(lig, {"gt_count": 0, "ocr_correct": 0})
            merged["gt_count"] += data["gt_count"]
            merged["ocr_correct"] += data["ocr_correct"]

    for lig_data in per_lig.values():
        lig_data["score"] = (
            lig_data["ocr_correct"] / lig_data["gt_count"]
            if lig_data["gt_count"] > 0 else 1.0
        )

    return {
        "score": round(score, 4),
        "total_in_gt": total_gt,
        "correctly_recognized": total_correct,
        "per_ligature": per_lig,
    }
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def aggregate_diacritic_scores(scores: list[DiacriticScore]) -> dict:
    """Aggregate diacritic scores over a corpus.

    Counts are summed across documents (micro-average).

    Parameters
    ----------
    scores:
        Per-document diacritic scores.

    Returns
    -------
    dict
        ``score`` (rounded to 4 decimals, 1.0 when no accented character was
        seen), ``total_in_gt`` and ``correctly_recognized``.
    """
    total_gt = sum(s.total_in_gt for s in scores)
    total_correct = sum(s.correctly_recognized for s in scores)
    score = total_correct / total_gt if total_gt > 0 else 1.0
    return {
        "score": round(score, 4),
        "total_in_gt": total_gt,
        "correctly_recognized": total_correct,
    }
|
picarones/core/confusion.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Matrice de confusion unicode pour l'analyse fine des erreurs OCR.
|
| 2 |
+
|
| 3 |
+
Pour chaque moteur, on calcule quels caractères du GT sont transcrits par
|
| 4 |
+
quels caractères OCR (substitutions). Cette "empreinte d'erreur" est
|
| 5 |
+
caractéristique de chaque moteur ou pipeline.
|
| 6 |
+
|
| 7 |
+
Méthode
|
| 8 |
+
-------
|
| 9 |
+
L'alignement caractère par caractère utilise les opérations d'édition
|
| 10 |
+
calculées par difflib.SequenceMatcher (blocs communs les plus longs ; ce n'est pas un script d'édition minimal au sens de Levenshtein), ce qui permet
|
| 11 |
+
d'identifier les substitutions, insertions et suppressions.
|
| 12 |
+
|
| 13 |
+
La matrice est stockée comme un dict de dict :
|
| 14 |
+
``{gt_char: {ocr_char: count}}``
|
| 15 |
+
|
| 16 |
+
La valeur spéciale ``"∅"`` (U+2205) représente un caractère vide :
|
| 17 |
+
- ``{"a": {"∅": 3}}`` → 'a' supprimé 3 fois dans l'OCR
|
| 18 |
+
- ``{"∅": {"x": 2}}`` → 'x' inséré 2 fois dans l'OCR (absent du GT)
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import difflib
|
| 24 |
+
from collections import defaultdict
|
| 25 |
+
from dataclasses import dataclass, field
|
| 26 |
+
from typing import Optional
|
| 27 |
+
|
| 28 |
+
# Symbole représentant un caractère absent (insertion / suppression)
|
| 29 |
+
EMPTY_CHAR = "∅"
|
| 30 |
+
|
| 31 |
+
# Caractères non pertinents à ignorer dans la matrice (espaces, sauts de ligne)
|
| 32 |
+
_WHITESPACE = set(" \t\n\r")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class ConfusionMatrix:
    """Unicode confusion matrix for one (GT, OCR) pair."""

    #: Outer key = GT character; inner key = OCR character; value = count.
    matrix: dict[str, dict[str, int]] = field(default_factory=dict)

    total_substitutions: int = 0
    total_insertions: int = 0
    total_deletions: int = 0

    @property
    def total_errors(self) -> int:
        """Sum of substitutions, insertions and deletions."""
        return self.total_substitutions + self.total_insertions + self.total_deletions

    def top_confusions(self, n: int = 20) -> list[dict]:
        """Return the ``n`` most frequent confusions (substitutions only).

        Rows and columns keyed by ``EMPTY_CHAR`` (insertions and deletions)
        and identical pairs are left out. Entries with equal counts keep the
        order in which they appear in the matrix.
        """
        pairs: list[tuple[str, str, int]] = [
            (gt_char, ocr_char, count)
            for gt_char, ocr_counts in self.matrix.items()
            if gt_char != EMPTY_CHAR  # skip insertions
            for ocr_char, count in ocr_counts.items()
            if ocr_char != EMPTY_CHAR and gt_char != ocr_char  # skip deletions / matches
        ]
        pairs.sort(key=lambda x: -x[2])
        return [
            {"gt": gt, "ocr": ocr, "count": cnt}
            for gt, ocr, cnt in pairs[:n]
        ]

    def as_compact_dict(self, min_count: int = 1) -> dict:
        """Serialize the matrix, dropping entries rarer than ``min_count``.

        GT characters left with no entry are omitted. The three totals are
        reported unchanged, whatever the filtering.
        """
        compact: dict[str, dict[str, int]] = {}
        for gt_char, ocr_counts in self.matrix.items():
            filtered = {
                oc: cnt for oc, cnt in ocr_counts.items()
                if cnt >= min_count
            }
            if filtered:
                compact[gt_char] = filtered
        return {
            "matrix": compact,
            "total_substitutions": self.total_substitutions,
            "total_insertions": self.total_insertions,
            "total_deletions": self.total_deletions,
        }

    def as_dict(self) -> dict:
        """Serialize the matrix, keeping every entry with a count of at least 1."""
        return self.as_compact_dict(min_count=1)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def build_confusion_matrix(
    ground_truth: str,
    hypothesis: str,
    ignore_whitespace: bool = True,
    ignore_correct: bool = True,
) -> ConfusionMatrix:
    """Build the unicode confusion matrix for one ground-truth / OCR pair.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR engine.
    ignore_whitespace:
        If True, characters found in ``_WHITESPACE`` are neither recorded
        in the matrix nor counted in the totals.
    ignore_correct:
        If True (default), identical pairs (gt_char == ocr_char) are not
        recorded, which keeps the matrix small.

    Returns
    -------
    ConfusionMatrix
        Outer key = ground-truth char, inner key = OCR char, value = count.
        ``EMPTY_CHAR`` stands for "no character" (insertions / deletions).
    """
    matrix: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    n_subs = n_ins = n_dels = 0

    if not ground_truth and not hypothesis:
        return ConfusionMatrix(dict(matrix), 0, 0, 0)

    def _counted(ch: str) -> bool:
        """Tell whether ``ch`` takes part in the statistics."""
        return not (ignore_whitespace and ch in _WHITESPACE)

    # Character-level alignment; autojunk is disabled so frequent characters
    # are never treated as junk on long texts.
    matcher = difflib.SequenceMatcher(None, ground_truth, hypothesis, autojunk=False)

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        gt_seg = ground_truth[i1:i2]
        oc_seg = hypothesis[j1:j2]

        if tag == "equal":
            if not ignore_correct:
                for ch in gt_seg:
                    if _counted(ch):
                        matrix[ch][ch] += 1
        elif tag == "replace":
            _align_segments(gt_seg, oc_seg, matrix, ignore_whitespace)
            # A replaced run of unequal lengths is min(len) substitutions plus
            # a surplus of deletions (GT longer) or insertions (OCR longer).
            # Whitespace is skipped exactly as in the insert/delete branches.
            paired = min(len(gt_seg), len(oc_seg))
            n_subs += sum(
                1 for g, o in zip(gt_seg, oc_seg) if _counted(g) and _counted(o)
            )
            n_dels += sum(1 for g in gt_seg[paired:] if _counted(g))
            n_ins += sum(1 for o in oc_seg[paired:] if _counted(o))
        elif tag == "delete":
            for ch in gt_seg:
                if _counted(ch):
                    matrix[ch][EMPTY_CHAR] += 1
                    n_dels += 1
        elif tag == "insert":
            for ch in oc_seg:
                if _counted(ch):
                    matrix[EMPTY_CHAR][ch] += 1
                    n_ins += 1

    # Freeze the nested defaultdicts into plain dicts.
    result_matrix: dict[str, dict[str, int]] = {
        k: dict(v) for k, v in matrix.items()
    }

    return ConfusionMatrix(
        matrix=result_matrix,
        total_substitutions=n_subs,
        total_insertions=n_ins,
        total_deletions=n_dels,
    )
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _align_segments(
    gt_seg: str,
    oc_seg: str,
    matrix: dict,
    ignore_whitespace: bool,
) -> None:
    """Record the confusions between two mismatching segments into ``matrix``.

    ``matrix`` is updated in place with ``matrix[gt][ocr] += 1`` and must
    therefore create missing rows/cells on access (nested ``defaultdict``).

    Characters are paired position by position. When one segment is longer,
    the surplus is recorded as deletions (ground truth longer) or insertions
    (OCR longer) instead of being dropped.
    """

    def _skipped(ch: str) -> bool:
        return ignore_whitespace and ch in _WHITESPACE

    def _record_deletions(chars: str) -> None:
        for g in chars:
            if not _skipped(g):
                matrix[g][EMPTY_CHAR] += 1

    def _record_insertions(chars: str) -> None:
        for o in chars:
            if not _skipped(o):
                matrix[EMPTY_CHAR][o] += 1

    def _record_pairs(gt_part: str, oc_part: str) -> None:
        for g, o in zip(gt_part, oc_part):
            if _skipped(g) or _skipped(o):
                continue
            matrix[g][o] += 1
        # zip() stops at the shorter side: account for what is left over.
        paired = min(len(gt_part), len(oc_part))
        _record_deletions(gt_part[paired:])
        _record_insertions(oc_part[paired:])

    if not gt_seg:
        _record_insertions(oc_seg)
        return
    if not oc_seg:
        _record_deletions(gt_seg)
        return

    if len(gt_seg) == len(oc_seg):
        # One-to-one substitutions.
        _record_pairs(gt_seg, oc_seg)
        return

    # Different lengths: re-align the two segments first so that any shared
    # characters anchor the pairing, then pair what remains positionally.
    sub = difflib.SequenceMatcher(None, gt_seg, oc_seg, autojunk=False)
    for tag, i1, i2, j1, j2 in sub.get_opcodes():
        if tag == "replace":
            _record_pairs(gt_seg[i1:i2], oc_seg[j1:j2])
        elif tag == "delete":
            _record_deletions(gt_seg[i1:i2])
        elif tag == "insert":
            _record_insertions(oc_seg[j1:j2])
        # "equal" runs are correct characters: nothing to record.
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def aggregate_confusion_matrices(matrices: list[ConfusionMatrix]) -> ConfusionMatrix:
    """Merge several confusion matrices into a single one.

    Cell counts and the three error totals are summed, which yields the
    corpus-level matrix from per-document matrices.
    """
    merged: dict[str, dict[str, int]] = {}
    subs_sum = 0
    ins_sum = 0
    dels_sum = 0

    for current in matrices:
        for gt_char, ocr_counts in current.matrix.items():
            for ocr_char, count in ocr_counts.items():
                # The row is created only when it receives a first cell, so
                # empty source rows leave no trace in the result.
                row = merged.setdefault(gt_char, {})
                row[ocr_char] = row.get(ocr_char, 0) + count
        subs_sum += current.total_substitutions
        ins_sum += current.total_insertions
        dels_sum += current.total_deletions

    return ConfusionMatrix(
        matrix={gt_char: dict(row) for gt_char, row in merged.items()},
        total_substitutions=subs_sum,
        total_insertions=ins_sum,
        total_deletions=dels_sum,
    )
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def top_confused_chars(
    matrix: ConfusionMatrix,
    n: int = 15,
    exclude_empty: bool = True,
) -> list[dict]:
    """Return the ground-truth characters that are most often misrecognized.

    Parameters
    ----------
    matrix:
        Confusion matrix to analyse (only its ``matrix`` mapping is read).
    n:
        Maximum number of characters returned.
    exclude_empty:
        If True, the ``EMPTY_CHAR`` row (pure insertions) is skipped.
        Deletions of a real character always count as errors for it.

    Returns
    -------
    list[dict]
        Sorted by decreasing error count, e.g.
        ``[{"char": "ſ", "total_errors": 47, "top_substitutes": [...]}, ...]``
        where ``top_substitutes`` holds at most the 5 most frequent outputs.
    """
    ranked: list[dict] = []
    for gt_char, ocr_counts in matrix.matrix.items():
        if exclude_empty and gt_char == EMPTY_CHAR:
            continue

        # Every cell off the diagonal is an error for this character.
        wrong = [
            {"ocr": oc, "count": cnt}
            for oc, cnt in ocr_counts.items()
            if oc != gt_char
        ]
        error_count = sum(entry["count"] for entry in wrong)
        if error_count > 0:
            wrong.sort(key=lambda entry: -entry["count"])
            ranked.append(
                {
                    "char": gt_char,
                    "total_errors": error_count,
                    "top_substitutes": wrong[:5],
                }
            )

    ranked.sort(key=lambda entry: -entry["total_errors"])
    return ranked[:n]
|
picarones/core/image_quality.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyse automatique de la qualité des images de documents numérisés.
|
| 2 |
+
|
| 3 |
+
Métriques
|
| 4 |
+
---------
|
| 5 |
+
- **Score de netteté** : variance du laplacien (plus élevé = plus net)
|
| 6 |
+
- **Niveau de bruit** : écart-type des résidus haute-fréquence
|
| 7 |
+
- **Angle de rotation résiduel** : estimé par projection horizontale
|
| 8 |
+
- **Score de contraste** : ratio Michelson entre zones sombres (encre) et claires (fond)
|
| 9 |
+
- **Score de qualité global** : combinaison normalisée des métriques ci-dessus
|
| 10 |
+
|
| 11 |
+
Ces calculs sont réalisés en pur Python + bibliothèques stdlib ou Pillow.
|
| 12 |
+
NumPy est utilisé si disponible (calculs plus rapides), mais les méthodes
|
| 13 |
+
de fallback n'en dépendent pas.
|
| 14 |
+
|
| 15 |
+
Note
|
| 16 |
+
----
|
| 17 |
+
Pour les images placeholder (fixtures), des valeurs fictives cohérentes
|
| 18 |
+
sont générées via `generate_mock_quality_scores()`.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import math
|
| 24 |
+
import statistics
|
| 25 |
+
from dataclasses import dataclass
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
from typing import Optional
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class ImageQualityResult:
    """Quality metrics computed for one document image."""

    sharpness_score: float = 0.0
    """Sharpness in [0, 1], from the normalized variance of the Laplacian."""

    noise_level: float = 0.0
    """Noise in [0, 1]: 0 = clean, 1 = very noisy."""

    rotation_degrees: float = 0.0
    """Estimated residual rotation in degrees (positive = clockwise)."""

    contrast_score: float = 0.0
    """Contrast in [0, 1] (Michelson ratio between ink and background)."""

    quality_score: float = 0.0
    """Overall quality in [0, 1], a weighted mix of the other metrics."""

    analysis_method: str = "none"
    """Backend that produced the values: 'pillow', 'numpy', 'mock' or 'none'."""

    error: Optional[str] = None
    """Error message when the analysis failed, otherwise None."""

    @property
    def quality_tier(self) -> str:
        """Quality bucket: 'good' (>= 0.7), 'medium' (>= 0.4) or 'poor'."""
        score = self.quality_score
        if score >= 0.7:
            return "good"
        if score >= 0.4:
            return "medium"
        return "poor"

    @property
    def is_good_quality(self) -> bool:
        """True when the overall quality score is at least 0.7."""
        return self.quality_tier == "good"

    def as_dict(self) -> dict:
        """Serialize to a plain dict; ``error`` is included only when set."""
        payload = {
            "sharpness_score": round(self.sharpness_score, 4),
            "noise_level": round(self.noise_level, 4),
            "rotation_degrees": round(self.rotation_degrees, 2),
            "contrast_score": round(self.contrast_score, 4),
            "quality_score": round(self.quality_score, 4),
            "quality_tier": self.quality_tier,
            "analysis_method": self.analysis_method,
        }
        if self.error:
            payload["error"] = self.error
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "ImageQualityResult":
        """Rebuild an instance from ``as_dict`` output; missing keys use defaults."""
        fallbacks = {
            "sharpness_score": 0.0,
            "noise_level": 0.0,
            "rotation_degrees": 0.0,
            "contrast_score": 0.0,
            "quality_score": 0.0,
            "analysis_method": "none",
            "error": None,
        }
        return cls(**{key: data.get(key, default) for key, default in fallbacks.items()})
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def analyze_image_quality(image_path: str | Path) -> ImageQualityResult:
    """Analyse the quality of a scanned document image.

    Backends are tried in this order:
    1. Pillow + NumPy (full analysis)
    2. Pillow alone (simplified analysis)
    3. Neither available: a result carrying an ``error`` message

    Parameters
    ----------
    image_path:
        Path to the image file.

    Returns
    -------
    ImageQualityResult
        On failure (missing file, Pillow not installed, unreadable image)
        the ``error`` field is set instead of raising.
    """
    path = Path(image_path)
    if not path.exists():
        return ImageQualityResult(
            error=f"Fichier image introuvable : {image_path}",
            analysis_method="none",
        )

    # Pillow is mandatory for both backends.
    try:
        from PIL import Image
    except ImportError:
        return ImageQualityResult(
            error="Pillow non disponible (pip install Pillow)",
            analysis_method="none",
            quality_score=0.5,  # neutral value
        )

    # NumPy is optional: it only selects the more complete backend.
    try:
        import numpy as np
    except ImportError:
        np = None

    try:
        if np is not None:
            return _analyze_with_numpy(path, np, Image)
        return _analyze_with_pillow(path, Image)
    except (OSError, ValueError) as exc:
        # Unreadable, truncated or non-image file: report it through the
        # result object, like the other failure modes, rather than raising.
        return ImageQualityResult(
            error=f"Analyse de l'image impossible : {exc}",
            analysis_method="none",
        )
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _analyze_with_numpy(path: Path, np, Image) -> ImageQualityResult:
    """Run the full analysis (sharpness, noise, rotation, contrast) with NumPy.

    ``np`` and ``Image`` are the already-imported ``numpy`` module and
    ``PIL.Image`` module, injected by the caller.
    """
    # The context manager guarantees the file handle opened by Pillow is
    # released as soon as the pixels have been copied into the array.
    with Image.open(path) as source:
        arr = np.array(source.convert("L"), dtype=np.float32)  # grayscale

    # 1. Sharpness: variance of the Laplacian, empirically normalized
    #    (a variance of 500 or more maps to the maximum score).
    laplacian = _laplacian_variance_numpy(arr, np)
    sharpness = min(1.0, laplacian / 500.0)

    # 2. Noise level estimated from pixel-to-pixel differences.
    noise = _noise_level_numpy(arr, np)

    # 3. Residual rotation (skew) estimate, in degrees.
    rotation = _estimate_rotation_numpy(arr, np)

    # 4. Contrast: Michelson ratio.
    contrast = _contrast_score_numpy(arr, np)

    # 5. Weighted overall score (the rotation sign is irrelevant here).
    quality = _global_quality_score(sharpness, noise, abs(rotation), contrast)

    return ImageQualityResult(
        sharpness_score=float(sharpness),
        noise_level=float(noise),
        rotation_degrees=float(rotation),
        contrast_score=float(contrast),
        quality_score=float(quality),
        analysis_method="numpy",
    )
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _analyze_with_pillow(path: Path, Image) -> ImageQualityResult:
    """Run the simplified analysis with Pillow only (no NumPy).

    ``Image`` is the already-imported ``PIL.Image`` module. Rotation is not
    estimated by this backend and is reported as 0.0.
    """
    # The context manager guarantees the file handle opened by Pillow is
    # released once the grayscale pixels have been extracted.
    with Image.open(path) as source:
        pixels = list(source.convert("L").getdata())

    if not pixels:
        return ImageQualityResult(quality_score=0.5, analysis_method="pillow")

    # Contrast: Michelson ratio over the full value range.
    min_val = min(pixels)
    max_val = max(pixels)
    if max_val + min_val > 0:
        contrast = (max_val - min_val) / (max_val + min_val)
    else:
        contrast = 0.0

    # Sharpness proxy: global standard deviation of the pixel values.
    try:
        variance = statistics.variance(pixels)
    except statistics.StatisticsError:  # fewer than two pixels
        variance = 0.0
    sharpness = min(1.0, math.sqrt(variance) / 128.0)

    # Noise: rough approximation from the first 1000 pixels only.
    if len(pixels) > 1:
        noise = min(1.0, statistics.stdev(pixels[:1000]) / 64.0)
    else:
        noise = 0.0

    quality = _global_quality_score(sharpness, noise, 0.0, contrast)

    return ImageQualityResult(
        sharpness_score=sharpness,
        noise_level=noise,
        rotation_degrees=0.0,  # not computed without NumPy
        contrast_score=contrast,
        quality_score=quality,
        analysis_method="pillow",
    )
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _laplacian_variance_numpy(arr, np) -> float:
|
| 215 |
+
"""Calcule la variance du laplacien (mesure de netteté)."""
|
| 216 |
+
# Filtre laplacien 3x3
|
| 217 |
+
laplacian_kernel = np.array([
|
| 218 |
+
[0, 1, 0],
|
| 219 |
+
[1, -4, 1],
|
| 220 |
+
[0, 1, 0],
|
| 221 |
+
], dtype=np.float32)
|
| 222 |
+
|
| 223 |
+
# Convolution manuelle simplifiée (bordures ignorées)
|
| 224 |
+
h, w = arr.shape
|
| 225 |
+
if h < 3 or w < 3:
|
| 226 |
+
return float(np.var(arr))
|
| 227 |
+
|
| 228 |
+
# Utiliser une convolution rapide avec slicing
|
| 229 |
+
center = arr[1:-1, 1:-1]
|
| 230 |
+
top = arr[:-2, 1:-1]
|
| 231 |
+
bottom = arr[2:, 1:-1]
|
| 232 |
+
left = arr[1:-1, :-2]
|
| 233 |
+
right = arr[1:-1, 2:]
|
| 234 |
+
lap = top + bottom + left + right - 4 * center
|
| 235 |
+
|
| 236 |
+
return float(np.var(lap))
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _noise_level_numpy(arr, np) -> float:
|
| 240 |
+
"""Estime le niveau de bruit par la MAD (Median Absolute Deviation) des gradients."""
|
| 241 |
+
h, w = arr.shape
|
| 242 |
+
if h < 2 or w < 2:
|
| 243 |
+
return 0.0
|
| 244 |
+
# Différences horizontales et verticales
|
| 245 |
+
diff_h = np.abs(arr[:, 1:] - arr[:, :-1])
|
| 246 |
+
diff_v = np.abs(arr[1:, :] - arr[:-1, :])
|
| 247 |
+
noise_std = float(np.median(np.concatenate([diff_h.ravel(), diff_v.ravel()])))
|
| 248 |
+
# Normaliser : 0 = pas de bruit, 1 = très bruité (seuil à ~30)
|
| 249 |
+
return min(1.0, noise_std / 30.0)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _estimate_rotation_numpy(arr, np) -> float:
|
| 253 |
+
"""Estime l'angle de rotation par projection horizontale simplifiée.
|
| 254 |
+
|
| 255 |
+
Retourne l'angle estimé en degrés [-45, 45].
|
| 256 |
+
"""
|
| 257 |
+
# Méthode simplifiée : analyse de la variance des projections à différents angles
|
| 258 |
+
# Limiter à quelques angles pour la performance
|
| 259 |
+
h, w = arr.shape
|
| 260 |
+
if h < 20 or w < 20:
|
| 261 |
+
return 0.0
|
| 262 |
+
|
| 263 |
+
# Sous-échantillonnage pour la performance
|
| 264 |
+
step = max(1, h // 100)
|
| 265 |
+
sample = arr[::step, :]
|
| 266 |
+
|
| 267 |
+
best_angle = 0.0
|
| 268 |
+
best_var = -1.0
|
| 269 |
+
|
| 270 |
+
for angle_deg in range(-5, 6): # ±5 degrés, pas de 1°
|
| 271 |
+
angle_rad = math.radians(angle_deg)
|
| 272 |
+
# Projection horizontale après rotation approximative
|
| 273 |
+
# (approximation linéaire rapide)
|
| 274 |
+
offsets = np.round(
|
| 275 |
+
np.arange(sample.shape[0]) * math.tan(angle_rad)
|
| 276 |
+
).astype(int)
|
| 277 |
+
offsets = np.clip(offsets, 0, w - 1)
|
| 278 |
+
|
| 279 |
+
# Variance des sommes de lignes décalées
|
| 280 |
+
try:
|
| 281 |
+
row_sums = np.array([
|
| 282 |
+
float(np.sum(sample[i, max(0, offsets[i]):min(w, offsets[i]+w)]))
|
| 283 |
+
for i in range(sample.shape[0])
|
| 284 |
+
])
|
| 285 |
+
var = float(np.var(row_sums))
|
| 286 |
+
if var > best_var:
|
| 287 |
+
best_var = var
|
| 288 |
+
best_angle = float(angle_deg)
|
| 289 |
+
except Exception:
|
| 290 |
+
pass
|
| 291 |
+
|
| 292 |
+
return best_angle
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def _contrast_score_numpy(arr, np) -> float:
|
| 296 |
+
"""Score de contraste Michelson [0, 1]."""
|
| 297 |
+
p5 = float(np.percentile(arr, 5)) # fond clair
|
| 298 |
+
p95 = float(np.percentile(arr, 95)) # encre sombre
|
| 299 |
+
if p5 + p95 == 0:
|
| 300 |
+
return 0.0
|
| 301 |
+
# Michelson : (Imax - Imin) / (Imax + Imin)
|
| 302 |
+
return float((p95 - p5) / (p95 + p5))
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _global_quality_score(
|
| 306 |
+
sharpness: float,
|
| 307 |
+
noise: float,
|
| 308 |
+
rotation_abs: float,
|
| 309 |
+
contrast: float,
|
| 310 |
+
) -> float:
|
| 311 |
+
"""Calcule le score de qualité global pondéré."""
|
| 312 |
+
# Poids : netteté (40%), contraste (30%), bruit (20%), rotation (10%)
|
| 313 |
+
score = (
|
| 314 |
+
0.40 * sharpness
|
| 315 |
+
+ 0.30 * contrast
|
| 316 |
+
+ 0.20 * (1.0 - noise) # moins de bruit = mieux
|
| 317 |
+
+ 0.10 * max(0.0, 1.0 - rotation_abs / 10.0) # ±10° max
|
| 318 |
+
)
|
| 319 |
+
return round(min(1.0, max(0.0, score)), 4)
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# ---------------------------------------------------------------------------
|
| 323 |
+
# Données fictives pour les fixtures de démo
|
| 324 |
+
# ---------------------------------------------------------------------------
|
| 325 |
+
|
| 326 |
+
def generate_mock_quality_scores(
    doc_id: str,
    seed: Optional[int] = None,
) -> ImageQualityResult:
    """Generate fake but coherent quality metrics for a document.

    Used by the demo fixtures to simulate a realistic spread of image
    qualities (good, medium, degraded).

    Parameters
    ----------
    doc_id:
        Document identifier. When ``seed`` is None the random generator is
        seeded from a CRC-32 of this identifier, so the same ``doc_id``
        yields the same metrics in every process.
    seed:
        Optional explicit random seed (0 is a valid seed).
    """
    import random
    import zlib

    if seed is None:
        # The built-in hash() of a str is salted per process, so it cannot
        # provide reproducible values; CRC-32 is stable across runs.
        seed = zlib.crc32(doc_id.encode("utf-8"))
    rng = random.Random(seed)

    # One base quality per document drives all metrics so they stay coherent.
    base_quality = 0.3 + rng.random() * 0.6  # 0.3 to 0.9

    sharpness = max(0.1, min(1.0, base_quality + rng.gauss(0, 0.1)))
    noise = max(0.0, min(1.0, (1.0 - base_quality) * 0.8 + rng.gauss(0, 0.05)))
    rotation = rng.gauss(0, 1.5)  # standard deviation of 1.5 degrees
    contrast = max(0.2, min(1.0, base_quality + rng.gauss(0, 0.15)))

    quality = _global_quality_score(sharpness, noise, abs(rotation), contrast)

    return ImageQualityResult(
        sharpness_score=round(sharpness, 4),
        noise_level=round(noise, 4),
        rotation_degrees=round(rotation, 2),
        contrast_score=round(contrast, 4),
        quality_score=round(quality, 4),
        analysis_method="mock",
    )
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def aggregate_image_quality(results: list[ImageQualityResult]) -> dict:
    """Aggregate image-quality metrics over a corpus.

    Results carrying an error are ignored. Returns ``{}`` for an empty
    input and an ``{"error": ...}`` dict when no analysis succeeded.
    """
    if not results:
        return {}

    usable = [item for item in results if item.error is None]
    if not usable:
        return {"error": "Aucune analyse réussie"}

    def average(values: list[float]) -> float:
        return round(statistics.mean(values), 4) if values else 0.0

    scores = [item.quality_score for item in usable]

    # Number of documents per quality tier.
    distribution = dict.fromkeys(("good", "medium", "poor"), 0)
    for item in usable:
        distribution[item.quality_tier] += 1

    return {
        "mean_quality_score": average(scores),
        "mean_sharpness": average([item.sharpness_score for item in usable]),
        "mean_noise_level": average([item.noise_level for item in usable]),
        "quality_distribution": distribution,
        "document_count": len(usable),
        "scores": list(scores),  # raw values, for the scatter plot
    }
|
picarones/core/metrics.py
CHANGED
|
@@ -5,6 +5,8 @@ Métriques implémentées
|
|
| 5 |
- CER brut : distance d'édition caractère / longueur GT
|
| 6 |
- CER normalisé NFC : après normalisation Unicode NFC
|
| 7 |
- CER sans casse : insensible aux majuscules/minuscules
|
|
|
|
|
|
|
| 8 |
- WER brut : word error rate standard
|
| 9 |
- WER normalisé : après normalisation des espaces
|
| 10 |
- MER : Match Error Rate (jiwer)
|
|
@@ -41,9 +43,6 @@ def _normalize_whitespace(text: str) -> str:
|
|
| 41 |
return " ".join(text.split())
|
| 42 |
|
| 43 |
|
| 44 |
-
# Transformations jiwer pour le CER (chaque char devient un "mot")
|
| 45 |
-
_CHAR_TRANSFORM = jiwer.transforms.Compose([]) if _JIWER_AVAILABLE else None
|
| 46 |
-
|
| 47 |
# Transformations jiwer pour le WER (normalisation légère des espaces)
|
| 48 |
_WER_TRANSFORM = (
|
| 49 |
jiwer.transforms.Compose(
|
|
@@ -62,7 +61,6 @@ def _cer_from_strings(reference: str, hypothesis: str) -> float:
|
|
| 62 |
"""CER brut : distance d'édition sur les caractères."""
|
| 63 |
if not reference:
|
| 64 |
return 0.0 if not hypothesis else 1.0
|
| 65 |
-
# jiwer.cer traite chaque caractère comme un token
|
| 66 |
return jiwer.cer(reference, hypothesis)
|
| 67 |
|
| 68 |
|
|
@@ -84,9 +82,15 @@ class MetricsResult:
|
|
| 84 |
reference_length: int
|
| 85 |
hypothesis_length: int
|
| 86 |
error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def as_dict(self) -> dict:
|
| 89 |
-
|
| 90 |
"cer": round(self.cer, 6),
|
| 91 |
"cer_nfc": round(self.cer_nfc, 6),
|
| 92 |
"cer_caseless": round(self.cer_caseless, 6),
|
|
@@ -98,6 +102,10 @@ class MetricsResult:
|
|
| 98 |
"hypothesis_length": self.hypothesis_length,
|
| 99 |
"error": self.error,
|
| 100 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
@property
|
| 103 |
def cer_percent(self) -> float:
|
|
@@ -108,7 +116,11 @@ class MetricsResult:
|
|
| 108 |
return round(self.wer * 100, 2)
|
| 109 |
|
| 110 |
|
| 111 |
-
def compute_metrics(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""Calcule l'ensemble des métriques CER/WER pour une paire de textes.
|
| 113 |
|
| 114 |
Parameters
|
|
@@ -117,6 +129,10 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 117 |
Texte de vérité terrain (ground truth).
|
| 118 |
hypothesis:
|
| 119 |
Texte produit par le moteur OCR.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
Returns
|
| 122 |
-------
|
|
@@ -151,6 +167,19 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 151 |
mer = jiwer.mer(reference, hypothesis)
|
| 152 |
wil = jiwer.wil(reference, hypothesis)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
return MetricsResult(
|
| 155 |
cer=cer_raw,
|
| 156 |
cer_nfc=cer_nfc,
|
|
@@ -161,6 +190,8 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 161 |
wil=wil,
|
| 162 |
reference_length=len(reference),
|
| 163 |
hypothesis_length=len(hypothesis),
|
|
|
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
except Exception as exc: # noqa: BLE001
|
|
@@ -208,7 +239,28 @@ def aggregate_metrics(results: list[MetricsResult]) -> dict:
|
|
| 208 |
values = [getattr(r, metric) for r in results if r.error is None]
|
| 209 |
aggregated[metric] = _stats(values)
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
aggregated["document_count"] = len(results)
|
| 212 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 213 |
|
| 214 |
return aggregated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
- CER brut : distance d'édition caractère / longueur GT
|
| 6 |
- CER normalisé NFC : après normalisation Unicode NFC
|
| 7 |
- CER sans casse : insensible aux majuscules/minuscules
|
| 8 |
+
- CER diplomatique : après application d'une table de correspondances
|
| 9 |
+
historiques (ſ=s, u=v, i=j…) — configurable
|
| 10 |
- WER brut : word error rate standard
|
| 11 |
- WER normalisé : après normalisation des espaces
|
| 12 |
- MER : Match Error Rate (jiwer)
|
|
|
|
| 43 |
return " ".join(text.split())
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
| 46 |
# Transformations jiwer pour le WER (normalisation légère des espaces)
|
| 47 |
_WER_TRANSFORM = (
|
| 48 |
jiwer.transforms.Compose(
|
|
|
|
| 61 |
"""CER brut : distance d'édition sur les caractères."""
|
| 62 |
if not reference:
|
| 63 |
return 0.0 if not hypothesis else 1.0
|
|
|
|
| 64 |
return jiwer.cer(reference, hypothesis)
|
| 65 |
|
| 66 |
|
|
|
|
| 82 |
reference_length: int
|
| 83 |
hypothesis_length: int
|
| 84 |
error: Optional[str] = None
|
| 85 |
+
cer_diplomatic: Optional[float] = None
|
| 86 |
+
"""CER calculé après normalisation diplomatique (ſ=s, u=v, i=j…).
|
| 87 |
+
None si aucun profil diplomatique n'a été fourni à compute_metrics.
|
| 88 |
+
"""
|
| 89 |
+
diplomatic_profile_name: Optional[str] = None
|
| 90 |
+
"""Nom du profil de normalisation diplomatique utilisé."""
|
| 91 |
|
| 92 |
def as_dict(self) -> dict:
|
| 93 |
+
d = {
|
| 94 |
"cer": round(self.cer, 6),
|
| 95 |
"cer_nfc": round(self.cer_nfc, 6),
|
| 96 |
"cer_caseless": round(self.cer_caseless, 6),
|
|
|
|
| 102 |
"hypothesis_length": self.hypothesis_length,
|
| 103 |
"error": self.error,
|
| 104 |
}
|
| 105 |
+
if self.cer_diplomatic is not None:
|
| 106 |
+
d["cer_diplomatic"] = round(self.cer_diplomatic, 6)
|
| 107 |
+
d["diplomatic_profile_name"] = self.diplomatic_profile_name
|
| 108 |
+
return d
|
| 109 |
|
| 110 |
@property
|
| 111 |
def cer_percent(self) -> float:
|
|
|
|
| 116 |
return round(self.wer * 100, 2)
|
| 117 |
|
| 118 |
|
| 119 |
+
def compute_metrics(
|
| 120 |
+
reference: str,
|
| 121 |
+
hypothesis: str,
|
| 122 |
+
normalization_profile: "Optional[NormalizationProfile]" = None, # noqa: F821
|
| 123 |
+
) -> MetricsResult:
|
| 124 |
"""Calcule l'ensemble des métriques CER/WER pour une paire de textes.
|
| 125 |
|
| 126 |
Parameters
|
|
|
|
| 129 |
Texte de vérité terrain (ground truth).
|
| 130 |
hypothesis:
|
| 131 |
Texte produit par le moteur OCR.
|
| 132 |
+
normalization_profile:
|
| 133 |
+
Profil de normalisation diplomatique optionnel.
|
| 134 |
+
Si fourni, calcule ``cer_diplomatic`` en plus des métriques standard.
|
| 135 |
+
Si None, utilise le profil medieval_french par défaut.
|
| 136 |
|
| 137 |
Returns
|
| 138 |
-------
|
|
|
|
| 167 |
mer = jiwer.mer(reference, hypothesis)
|
| 168 |
wil = jiwer.wil(reference, hypothesis)
|
| 169 |
|
| 170 |
+
# CER diplomatique — utilise le profil fourni ou le profil médiéval par défaut
|
| 171 |
+
cer_diplomatic: Optional[float] = None
|
| 172 |
+
diplomatic_profile_name: Optional[str] = None
|
| 173 |
+
try:
|
| 174 |
+
from picarones.core.normalization import DEFAULT_DIPLOMATIC_PROFILE
|
| 175 |
+
profile = normalization_profile or DEFAULT_DIPLOMATIC_PROFILE
|
| 176 |
+
ref_diplo = profile.normalize(reference)
|
| 177 |
+
hyp_diplo = profile.normalize(hypothesis)
|
| 178 |
+
cer_diplomatic = _cer_from_strings(ref_diplo, hyp_diplo)
|
| 179 |
+
diplomatic_profile_name = profile.name
|
| 180 |
+
except Exception: # noqa: BLE001
|
| 181 |
+
pass # CER diplomatique non critique
|
| 182 |
+
|
| 183 |
return MetricsResult(
|
| 184 |
cer=cer_raw,
|
| 185 |
cer_nfc=cer_nfc,
|
|
|
|
| 190 |
wil=wil,
|
| 191 |
reference_length=len(reference),
|
| 192 |
hypothesis_length=len(hypothesis),
|
| 193 |
+
cer_diplomatic=cer_diplomatic,
|
| 194 |
+
diplomatic_profile_name=diplomatic_profile_name,
|
| 195 |
)
|
| 196 |
|
| 197 |
except Exception as exc: # noqa: BLE001
|
|
|
|
| 239 |
values = [getattr(r, metric) for r in results if r.error is None]
|
| 240 |
aggregated[metric] = _stats(values)
|
| 241 |
|
| 242 |
+
# CER diplomatique (optionnel — présent seulement si calculé)
|
| 243 |
+
diplo_values = [
|
| 244 |
+
r.cer_diplomatic for r in results
|
| 245 |
+
if r.error is None and r.cer_diplomatic is not None
|
| 246 |
+
]
|
| 247 |
+
if diplo_values:
|
| 248 |
+
aggregated["cer_diplomatic"] = _stats(diplo_values)
|
| 249 |
+
# Nom du profil (même pour tous les docs d'un corpus)
|
| 250 |
+
profile_name = next(
|
| 251 |
+
(r.diplomatic_profile_name for r in results if r.diplomatic_profile_name),
|
| 252 |
+
None,
|
| 253 |
+
)
|
| 254 |
+
if profile_name:
|
| 255 |
+
aggregated["cer_diplomatic"]["profile"] = profile_name
|
| 256 |
+
|
| 257 |
aggregated["document_count"] = len(results)
|
| 258 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 259 |
|
| 260 |
return aggregated
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# Import paresseux pour éviter les imports circulaires
|
| 264 |
+
from typing import TYPE_CHECKING
|
| 265 |
+
if TYPE_CHECKING:
|
| 266 |
+
from picarones.core.normalization import NormalizationProfile
|
picarones/core/normalization.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Profils de normalisation unicode pour le calcul du CER diplomatique.
|
| 2 |
+
|
| 3 |
+
La normalisation diplomatique permet de calculer un CER tenant compte des
|
| 4 |
+
équivalences graphiques propres aux documents historiques : ſ=s, u=v, i=j, etc.
|
| 5 |
+
|
| 6 |
+
En appliquant la même table aux deux textes (GT et OCR), on mesure les erreurs
|
| 7 |
+
"substantielles" (transcription erronée) en ignorant les variations graphiques
|
| 8 |
+
codifiées connues.
|
| 9 |
+
|
| 10 |
+
Trois niveaux de normalisation sont disponibles :
|
| 11 |
+
|
| 12 |
+
1. NFC : normalisation Unicode canonique (décomposition+recomposition)
|
| 13 |
+
2. caseless : NFC + pliage de casse (casefold)
|
| 14 |
+
3. diplomatic: NFC + table de correspondances historiques configurables
|
| 15 |
+
|
| 16 |
+
Les profils préconfigurés couvrent les cas d'usage patrimoniaux courants.
|
| 17 |
+
Ils sont également chargeables depuis un fichier YAML.
|
| 18 |
+
|
| 19 |
+
Exemple YAML
|
| 20 |
+
------------
|
| 21 |
+
name: medieval_custom
|
| 22 |
+
caseless: false
|
| 23 |
+
diplomatic:
|
| 24 |
+
ſ: s
|
| 25 |
+
u: v
|
| 26 |
+
i: j
|
| 27 |
+
y: i
|
| 28 |
+
æ: ae
|
| 29 |
+
œ: oe
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import unicodedata
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import Optional
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Tables de correspondances diplomatiques préconfigurées
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
#: Medieval French (12th–15th century)
# NOTE(review): the single-character entries are looked up once per character,
# so "y" becomes "i" while "i" becomes "j" — "y" and "i" therefore do NOT end up
# as the same character after normalization. Confirm this is intended.
DIPLOMATIC_FR_MEDIEVAL: dict[str, str] = {
    "ſ": "s",  # long s → s
    "u": "v",  # u/v interchangeable (the lookup is per character, not limited to initial position)
    "i": "j",  # i/j interchangeable
    "y": "i",  # vocalic y → i
    "æ": "ae",  # æ ligature
    "œ": "oe",  # œ ligature
    "ꝑ": "per",  # per/par abbreviation
    "ꝓ": "pro",  # pro abbreviation
    "\u0026": "et",  # & → et
}

#: Early modern French / early printed books (16th–18th century)
DIPLOMATIC_FR_EARLY_MODERN: dict[str, str] = {
    "ſ": "s",  # long s
    "æ": "ae",
    "œ": "oe",
    "\u0026": "et",
    "ỹ": "yn",  # y with tilde
}

#: Medieval Latin
DIPLOMATIC_LATIN_MEDIEVAL: dict[str, str] = {
    "ſ": "s",
    "u": "v",
    "i": "j",
    "y": "i",
    "æ": "ae",
    "œ": "oe",
    "ꝑ": "per",
    "ꝓ": "pro",
    "ꝗ": "que",  # barred q → que
    "\u0026": "et",
}

#: Minimal profile — long s only (NFC is handled by the profile, not by this table)
DIPLOMATIC_MINIMAL: dict[str, str] = {
    "ſ": "s",
}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Profil de normalisation
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
@dataclass
class NormalizationProfile:
    """Normalization strategy applied to a text before computing the diplomatic CER.

    Parameters
    ----------
    name:
        Human-readable identifier of the profile (e.g. ``"medieval_french"``).
    nfc:
        Apply Unicode NFC normalization first (enabled by default).
    caseless:
        Case-fold the text after the NFC step.
    diplomatic_table:
        Table of historical graphic equivalences, applied last through
        ``_apply_diplomatic_table``.
    description:
        Short free-text description of the profile.
    """

    name: str
    nfc: bool = True
    caseless: bool = False
    diplomatic_table: dict[str, str] = field(default_factory=dict)
    description: str = ""

    def normalize(self, text: str) -> str:
        """Return *text* after NFC, case folding and table substitution.

        Each step runs only when enabled on the profile; the order is fixed
        (NFC, then casefold, then the diplomatic table).
        """
        if self.nfc:
            text = unicodedata.normalize("NFC", text)
        if self.caseless:
            text = text.casefold()
        if self.diplomatic_table:
            text = _apply_diplomatic_table(text, self.diplomatic_table)
        return text

    def as_dict(self) -> dict:
        """Return the profile fields as a plain dictionary."""
        return {
            "name": self.name,
            "nfc": self.nfc,
            "caseless": self.caseless,
            "diplomatic_table": self.diplomatic_table,
            "description": self.description,
        }

    @classmethod
    def from_yaml(cls, path: str | Path) -> "NormalizationProfile":
        """Load a profile from a YAML file.

        The file is expected to hold a mapping with the optional keys
        ``name`` (defaults to the file stem), ``nfc``, ``caseless``,
        ``description`` and ``diplomatic`` (mapping str → str). An empty
        file yields a profile made only of defaults.

        Example
        -------
        .. code-block:: yaml

            name: medieval_custom
            caseless: false
            description: Français médiéval personnalisé
            diplomatic:
              ſ: s
              u: v

        Raises
        ------
        RuntimeError
            If the ``yaml`` package cannot be imported.
        ValueError
            If the top-level YAML value is not a mapping.
        """
        try:
            import yaml
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'pyyaml' est requis pour charger les profils YAML. "
                "Installez-le avec : pip install pyyaml"
            ) from exc

        yaml_path = Path(path)
        data = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))
        if data is None:
            # safe_load returns None for an empty document
            data = {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Profil YAML invalide ({yaml_path}) : un mapping est attendu, "
                f"obtenu {type(data).__name__}"
            )
        # Only the default name differs from from_dict: the file stem
        return cls.from_dict({**data, "name": data.get("name", yaml_path.stem)})

    @classmethod
    def from_dict(cls, data: dict) -> "NormalizationProfile":
        """Build a profile from a dictionary (e.g. an inline YAML section).

        Missing keys fall back to the defaults (``name`` → ``"custom"``).
        The ``diplomatic`` mapping is copied so the profile never shares
        state with the caller's dictionary; a missing or null value gives
        an empty table.
        """
        return cls(
            name=data.get("name", "custom"),
            nfc=bool(data.get("nfc", True)),
            caseless=bool(data.get("caseless", False)),
            diplomatic_table=dict(data.get("diplomatic") or {}),
            description=data.get("description", ""),
        )
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
# Profils préconfigurés
|
| 183 |
+
# ---------------------------------------------------------------------------
|
| 184 |
+
|
| 185 |
+
def get_builtin_profile(name: str) -> NormalizationProfile:
    """Return a newly built preconfigured profile for the given identifier.

    Available identifiers
    ---------------------
    - ``"medieval_french"``     : uses ``DIPLOMATIC_FR_MEDIEVAL``
    - ``"early_modern_french"`` : uses ``DIPLOMATIC_FR_EARLY_MODERN``
    - ``"medieval_latin"``      : uses ``DIPLOMATIC_LATIN_MEDIEVAL``
    - ``"minimal"``             : NFC + long s only
    - ``"nfc"``                 : NFC only (no diplomatic table)
    - ``"caseless"``            : NFC + case folding

    Raises
    ------
    KeyError
        If the name is not one of the identifiers above.
    """
    # Every table is copied: a returned profile must not alias the
    # module-level constants, otherwise mutating one profile's table would
    # silently change the constant and every profile built afterwards.
    profiles = {
        "medieval_french": NormalizationProfile(
            name="medieval_french",
            nfc=True,
            caseless=False,
            diplomatic_table=dict(DIPLOMATIC_FR_MEDIEVAL),
            description="Français médiéval (XIIe–XVe) : ſ=s, u=v, i=j, æ=ae, œ=oe",
        ),
        "early_modern_french": NormalizationProfile(
            name="early_modern_french",
            nfc=True,
            caseless=False,
            diplomatic_table=dict(DIPLOMATIC_FR_EARLY_MODERN),
            description="Imprimés anciens (XVIe–XVIIIe) : ſ=s, æ=ae, œ=oe",
        ),
        "medieval_latin": NormalizationProfile(
            name="medieval_latin",
            nfc=True,
            caseless=False,
            diplomatic_table=dict(DIPLOMATIC_LATIN_MEDIEVAL),
            description="Latin médiéval : ſ=s, u=v, i=j, ꝑ=per, ꝓ=pro",
        ),
        "minimal": NormalizationProfile(
            name="minimal",
            nfc=True,
            caseless=False,
            diplomatic_table=dict(DIPLOMATIC_MINIMAL),
            description="Minimal : NFC + s long seulement",
        ),
        "nfc": NormalizationProfile(
            name="nfc",
            nfc=True,
            caseless=False,
            diplomatic_table={},
            description="Normalisation NFC uniquement",
        ),
        "caseless": NormalizationProfile(
            name="caseless",
            nfc=True,
            caseless=True,
            diplomatic_table={},
            description="NFC + insensible à la casse",
        ),
    }
    if name not in profiles:
        raise KeyError(
            f"Profil de normalisation inconnu : '{name}'. "
            f"Disponibles : {', '.join(profiles)}"
        )
    return profiles[name]
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ---------------------------------------------------------------------------
|
| 255 |
+
# Fonctions utilitaires
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
|
| 258 |
+
def _apply_diplomatic_table(text: str, table: dict[str, str]) -> str:
|
| 259 |
+
"""Applique une table de correspondances diplomatiques caractère par caractère.
|
| 260 |
+
|
| 261 |
+
Les clés multi-caractères (ex : ``"ae"`` → ``"æ"``) sont gérées en priorité
|
| 262 |
+
sur les correspondances simples.
|
| 263 |
+
"""
|
| 264 |
+
if not table:
|
| 265 |
+
return text
|
| 266 |
+
|
| 267 |
+
# Séparer les clés simples (1 char) des clés multi-chars pour traitement ordonné
|
| 268 |
+
multi_keys = sorted(
|
| 269 |
+
(k for k in table if len(k) > 1), key=len, reverse=True
|
| 270 |
+
)
|
| 271 |
+
simple_table = {k: v for k, v in table.items() if len(k) == 1}
|
| 272 |
+
|
| 273 |
+
result = text
|
| 274 |
+
# Remplacements multi-chars en premier (évite les conflits)
|
| 275 |
+
for key in multi_keys:
|
| 276 |
+
result = result.replace(key, table[key])
|
| 277 |
+
|
| 278 |
+
# Remplacements char par char
|
| 279 |
+
if simple_table:
|
| 280 |
+
result = "".join(simple_table.get(c, c) for c in result)
|
| 281 |
+
|
| 282 |
+
return result
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
# Default profile used for the built-in diplomatic CER
DEFAULT_DIPLOMATIC_PROFILE: NormalizationProfile = get_builtin_profile("medieval_french")
|
picarones/core/results.py
CHANGED
|
@@ -35,6 +35,17 @@ class DocumentResult:
|
|
| 35 |
"""Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
|
| 36 |
pipeline_metadata: dict = field(default_factory=dict)
|
| 37 |
"""Métadonnées du pipeline : mode, prompt, over-normalization…"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def as_dict(self) -> dict:
|
| 40 |
d = {
|
|
@@ -50,6 +61,16 @@ class DocumentResult:
|
|
| 50 |
d["ocr_intermediate"] = self.ocr_intermediate
|
| 51 |
if self.pipeline_metadata:
|
| 52 |
d["pipeline_metadata"] = self.pipeline_metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
return d
|
| 54 |
|
| 55 |
|
|
@@ -67,6 +88,17 @@ class EngineReport:
|
|
| 67 |
Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
|
| 68 |
over_normalization (score agrégé, classe 10 de la taxonomie).
|
| 69 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def __post_init__(self) -> None:
|
| 72 |
if not self.aggregated_metrics and self.document_results:
|
|
@@ -84,6 +116,20 @@ class EngineReport:
|
|
| 84 |
wer_stats = self.aggregated_metrics.get("wer", {})
|
| 85 |
return wer_stats.get("mean")
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
@property
|
| 88 |
def is_pipeline(self) -> bool:
|
| 89 |
"""Vrai si ce rapport correspond à un pipeline OCR+LLM."""
|
|
@@ -99,6 +145,16 @@ class EngineReport:
|
|
| 99 |
}
|
| 100 |
if self.pipeline_info:
|
| 101 |
d["pipeline_info"] = self.pipeline_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
return d
|
| 103 |
|
| 104 |
|
|
|
|
| 35 |
"""Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
|
| 36 |
pipeline_metadata: dict = field(default_factory=dict)
|
| 37 |
"""Métadonnées du pipeline : mode, prompt, over-normalization…"""
|
| 38 |
+
# Champs Sprint 5 — métriques avancées patrimoniales
|
| 39 |
+
confusion_matrix: Optional[dict] = None
|
| 40 |
+
"""Matrice de confusion unicode sérialisée."""
|
| 41 |
+
char_scores: Optional[dict] = None
|
| 42 |
+
"""Scores ligatures et diacritiques."""
|
| 43 |
+
taxonomy: Optional[dict] = None
|
| 44 |
+
"""Classification taxonomique des erreurs (classes 1-9)."""
|
| 45 |
+
structure: Optional[dict] = None
|
| 46 |
+
"""Analyse structurelle (segmentation lignes, ordre lecture)."""
|
| 47 |
+
image_quality: Optional[dict] = None
|
| 48 |
+
"""Métriques de qualité image."""
|
| 49 |
|
| 50 |
def as_dict(self) -> dict:
|
| 51 |
d = {
|
|
|
|
| 61 |
d["ocr_intermediate"] = self.ocr_intermediate
|
| 62 |
if self.pipeline_metadata:
|
| 63 |
d["pipeline_metadata"] = self.pipeline_metadata
|
| 64 |
+
if self.confusion_matrix is not None:
|
| 65 |
+
d["confusion_matrix"] = self.confusion_matrix
|
| 66 |
+
if self.char_scores is not None:
|
| 67 |
+
d["char_scores"] = self.char_scores
|
| 68 |
+
if self.taxonomy is not None:
|
| 69 |
+
d["taxonomy"] = self.taxonomy
|
| 70 |
+
if self.structure is not None:
|
| 71 |
+
d["structure"] = self.structure
|
| 72 |
+
if self.image_quality is not None:
|
| 73 |
+
d["image_quality"] = self.image_quality
|
| 74 |
return d
|
| 75 |
|
| 76 |
|
|
|
|
| 88 |
Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
|
| 89 |
over_normalization (score agrégé, classe 10 de la taxonomie).
|
| 90 |
"""
|
| 91 |
+
# Métriques agrégées Sprint 5
|
| 92 |
+
aggregated_confusion: Optional[dict] = None
|
| 93 |
+
"""Matrice de confusion unicode agrégée sur le corpus."""
|
| 94 |
+
aggregated_char_scores: Optional[dict] = None
|
| 95 |
+
"""Scores ligatures/diacritiques agrégés."""
|
| 96 |
+
aggregated_taxonomy: Optional[dict] = None
|
| 97 |
+
"""Distribution taxonomique des erreurs agrégée."""
|
| 98 |
+
aggregated_structure: Optional[dict] = None
|
| 99 |
+
"""Métriques structurelles agrégées."""
|
| 100 |
+
aggregated_image_quality: Optional[dict] = None
|
| 101 |
+
"""Métriques de qualité image agrégées."""
|
| 102 |
|
| 103 |
def __post_init__(self) -> None:
|
| 104 |
if not self.aggregated_metrics and self.document_results:
|
|
|
|
| 116 |
wer_stats = self.aggregated_metrics.get("wer", {})
|
| 117 |
return wer_stats.get("mean")
|
| 118 |
|
| 119 |
+
@property
def ligature_score(self) -> Optional[float]:
    """Aggregated ligature score, or None when no character scores are available."""
    scores = self.aggregated_char_scores
    if not scores:
        return None
    return scores.get("ligature", {}).get("score")
|
| 125 |
+
|
| 126 |
+
@property
def diacritic_score(self) -> Optional[float]:
    """Aggregated diacritic score, or None when no character scores are available."""
    scores = self.aggregated_char_scores
    if not scores:
        return None
    return scores.get("diacritic", {}).get("score")
|
| 132 |
+
|
| 133 |
@property
|
| 134 |
def is_pipeline(self) -> bool:
|
| 135 |
"""Vrai si ce rapport correspond à un pipeline OCR+LLM."""
|
|
|
|
| 145 |
}
|
| 146 |
if self.pipeline_info:
|
| 147 |
d["pipeline_info"] = self.pipeline_info
|
| 148 |
+
if self.aggregated_confusion is not None:
|
| 149 |
+
d["aggregated_confusion"] = self.aggregated_confusion
|
| 150 |
+
if self.aggregated_char_scores is not None:
|
| 151 |
+
d["aggregated_char_scores"] = self.aggregated_char_scores
|
| 152 |
+
if self.aggregated_taxonomy is not None:
|
| 153 |
+
d["aggregated_taxonomy"] = self.aggregated_taxonomy
|
| 154 |
+
if self.aggregated_structure is not None:
|
| 155 |
+
d["aggregated_structure"] = self.aggregated_structure
|
| 156 |
+
if self.aggregated_image_quality is not None:
|
| 157 |
+
d["aggregated_image_quality"] = self.aggregated_image_quality
|
| 158 |
return d
|
| 159 |
|
| 160 |
|
picarones/core/runner.py
CHANGED
|
@@ -21,6 +21,7 @@ def run_benchmark(
|
|
| 21 |
engines: list[BaseOCREngine],
|
| 22 |
output_json: Optional[str | Path] = None,
|
| 23 |
show_progress: bool = True,
|
|
|
|
| 24 |
) -> BenchmarkResult:
|
| 25 |
"""Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
|
| 26 |
|
|
@@ -62,7 +63,12 @@ def run_benchmark(
|
|
| 62 |
disable=not show_progress,
|
| 63 |
)
|
| 64 |
|
| 65 |
-
for doc in iterator:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
ocr_result = engine.run(doc.image_path)
|
| 67 |
|
| 68 |
if ocr_result.success:
|
|
@@ -97,6 +103,57 @@ def run_benchmark(
|
|
| 97 |
)
|
| 98 |
pipeline_meta["over_normalization"] = over_norm.as_dict()
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
document_results.append(
|
| 101 |
DocumentResult(
|
| 102 |
doc_id=doc.doc_id,
|
|
@@ -108,18 +165,35 @@ def run_benchmark(
|
|
| 108 |
engine_error=ocr_result.error,
|
| 109 |
ocr_intermediate=ocr_intermediate,
|
| 110 |
pipeline_metadata=pipeline_meta,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
)
|
| 112 |
)
|
| 113 |
|
| 114 |
engine_version = engine._safe_version()
|
| 115 |
pipeline_info = _build_pipeline_info(engine, document_results)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
report = EngineReport(
|
| 118 |
engine_name=engine.name,
|
| 119 |
engine_version=engine_version,
|
| 120 |
engine_config=engine.config,
|
| 121 |
document_results=document_results,
|
| 122 |
pipeline_info=pipeline_info,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
)
|
| 124 |
engine_reports.append(report)
|
| 125 |
logger.info(
|
|
@@ -184,3 +258,99 @@ def _build_pipeline_info(engine: BaseOCREngine, doc_results: list[DocumentResult
|
|
| 184 |
}
|
| 185 |
|
| 186 |
return info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
engines: list[BaseOCREngine],
|
| 22 |
output_json: Optional[str | Path] = None,
|
| 23 |
show_progress: bool = True,
|
| 24 |
+
progress_callback: Optional[callable] = None,
|
| 25 |
) -> BenchmarkResult:
|
| 26 |
"""Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
|
| 27 |
|
|
|
|
| 63 |
disable=not show_progress,
|
| 64 |
)
|
| 65 |
|
| 66 |
+
for doc_idx, doc in enumerate(iterator):
|
| 67 |
+
if progress_callback is not None:
|
| 68 |
+
try:
|
| 69 |
+
progress_callback(engine.name, doc_idx, doc.doc_id)
|
| 70 |
+
except Exception:
|
| 71 |
+
pass
|
| 72 |
ocr_result = engine.run(doc.image_path)
|
| 73 |
|
| 74 |
if ocr_result.success:
|
|
|
|
| 103 |
)
|
| 104 |
pipeline_meta["over_normalization"] = over_norm.as_dict()
|
| 105 |
|
| 106 |
+
# Sprint 5 : métriques avancées patrimoniales
|
| 107 |
+
confusion_data = None
|
| 108 |
+
char_scores_data = None
|
| 109 |
+
taxonomy_data = None
|
| 110 |
+
structure_data = None
|
| 111 |
+
image_quality_data = None
|
| 112 |
+
|
| 113 |
+
if ocr_result.success:
|
| 114 |
+
try:
|
| 115 |
+
from picarones.core.confusion import build_confusion_matrix
|
| 116 |
+
cm = build_confusion_matrix(doc.ground_truth, ocr_result.text)
|
| 117 |
+
confusion_data = cm.as_dict()
|
| 118 |
+
except Exception:
|
| 119 |
+
pass
|
| 120 |
+
|
| 121 |
+
try:
|
| 122 |
+
from picarones.core.char_scores import (
|
| 123 |
+
compute_ligature_score, compute_diacritic_score
|
| 124 |
+
)
|
| 125 |
+
lig = compute_ligature_score(doc.ground_truth, ocr_result.text)
|
| 126 |
+
diac = compute_diacritic_score(doc.ground_truth, ocr_result.text)
|
| 127 |
+
char_scores_data = {
|
| 128 |
+
"ligature": lig.as_dict(),
|
| 129 |
+
"diacritic": diac.as_dict(),
|
| 130 |
+
}
|
| 131 |
+
except Exception:
|
| 132 |
+
pass
|
| 133 |
+
|
| 134 |
+
try:
|
| 135 |
+
from picarones.core.taxonomy import classify_errors
|
| 136 |
+
tax = classify_errors(doc.ground_truth, ocr_result.text)
|
| 137 |
+
taxonomy_data = tax.as_dict()
|
| 138 |
+
except Exception:
|
| 139 |
+
pass
|
| 140 |
+
|
| 141 |
+
try:
|
| 142 |
+
from picarones.core.structure import analyze_structure
|
| 143 |
+
struct = analyze_structure(doc.ground_truth, ocr_result.text)
|
| 144 |
+
structure_data = struct.as_dict()
|
| 145 |
+
except Exception:
|
| 146 |
+
pass
|
| 147 |
+
|
| 148 |
+
# Qualité image (indépendant du succès OCR)
|
| 149 |
+
try:
|
| 150 |
+
from picarones.core.image_quality import analyze_image_quality
|
| 151 |
+
iq = analyze_image_quality(doc.image_path)
|
| 152 |
+
if iq.error is None:
|
| 153 |
+
image_quality_data = iq.as_dict()
|
| 154 |
+
except Exception:
|
| 155 |
+
pass
|
| 156 |
+
|
| 157 |
document_results.append(
|
| 158 |
DocumentResult(
|
| 159 |
doc_id=doc.doc_id,
|
|
|
|
| 165 |
engine_error=ocr_result.error,
|
| 166 |
ocr_intermediate=ocr_intermediate,
|
| 167 |
pipeline_metadata=pipeline_meta,
|
| 168 |
+
confusion_matrix=confusion_data,
|
| 169 |
+
char_scores=char_scores_data,
|
| 170 |
+
taxonomy=taxonomy_data,
|
| 171 |
+
structure=structure_data,
|
| 172 |
+
image_quality=image_quality_data,
|
| 173 |
)
|
| 174 |
)
|
| 175 |
|
| 176 |
engine_version = engine._safe_version()
|
| 177 |
pipeline_info = _build_pipeline_info(engine, document_results)
|
| 178 |
|
| 179 |
+
# Agrégation Sprint 5
|
| 180 |
+
agg_confusion = _aggregate_confusion(document_results)
|
| 181 |
+
agg_char_scores = _aggregate_char_scores(document_results)
|
| 182 |
+
agg_taxonomy = _aggregate_taxonomy(document_results)
|
| 183 |
+
agg_structure = _aggregate_structure(document_results)
|
| 184 |
+
agg_image_quality = _aggregate_image_quality(document_results)
|
| 185 |
+
|
| 186 |
report = EngineReport(
|
| 187 |
engine_name=engine.name,
|
| 188 |
engine_version=engine_version,
|
| 189 |
engine_config=engine.config,
|
| 190 |
document_results=document_results,
|
| 191 |
pipeline_info=pipeline_info,
|
| 192 |
+
aggregated_confusion=agg_confusion,
|
| 193 |
+
aggregated_char_scores=agg_char_scores,
|
| 194 |
+
aggregated_taxonomy=agg_taxonomy,
|
| 195 |
+
aggregated_structure=agg_structure,
|
| 196 |
+
aggregated_image_quality=agg_image_quality,
|
| 197 |
)
|
| 198 |
engine_reports.append(report)
|
| 199 |
logger.info(
|
|
|
|
| 258 |
}
|
| 259 |
|
| 260 |
return info
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ---------------------------------------------------------------------------
|
| 264 |
+
# Helpers d'agrégation Sprint 5
|
| 265 |
+
# ---------------------------------------------------------------------------
|
| 266 |
+
|
| 267 |
+
def _aggregate_confusion(doc_results: list) -> Optional[dict]:
|
| 268 |
+
"""Agrège les matrices de confusion unicode sur tous les documents."""
|
| 269 |
+
try:
|
| 270 |
+
from picarones.core.confusion import aggregate_confusion_matrices, ConfusionMatrix
|
| 271 |
+
matrices = [
|
| 272 |
+
ConfusionMatrix(**dr.confusion_matrix)
|
| 273 |
+
for dr in doc_results
|
| 274 |
+
if dr.confusion_matrix is not None
|
| 275 |
+
]
|
| 276 |
+
if not matrices:
|
| 277 |
+
return None
|
| 278 |
+
agg = aggregate_confusion_matrices(matrices)
|
| 279 |
+
return agg.as_compact_dict(min_count=2)
|
| 280 |
+
except Exception:
|
| 281 |
+
return None
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def _aggregate_char_scores(doc_results: list) -> Optional[dict]:
|
| 285 |
+
"""Agrège les scores ligatures/diacritiques."""
|
| 286 |
+
try:
|
| 287 |
+
from picarones.core.char_scores import (
|
| 288 |
+
aggregate_ligature_scores, aggregate_diacritic_scores,
|
| 289 |
+
LigatureScore, DiacriticScore,
|
| 290 |
+
)
|
| 291 |
+
lig_scores = [
|
| 292 |
+
LigatureScore(**dr.char_scores["ligature"])
|
| 293 |
+
for dr in doc_results
|
| 294 |
+
if dr.char_scores is not None
|
| 295 |
+
]
|
| 296 |
+
diac_scores = [
|
| 297 |
+
DiacriticScore(**dr.char_scores["diacritic"])
|
| 298 |
+
for dr in doc_results
|
| 299 |
+
if dr.char_scores is not None
|
| 300 |
+
]
|
| 301 |
+
if not lig_scores:
|
| 302 |
+
return None
|
| 303 |
+
return {
|
| 304 |
+
"ligature": aggregate_ligature_scores(lig_scores),
|
| 305 |
+
"diacritic": aggregate_diacritic_scores(diac_scores),
|
| 306 |
+
}
|
| 307 |
+
except Exception:
|
| 308 |
+
return None
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def _aggregate_taxonomy(doc_results: list) -> Optional[dict]:
|
| 312 |
+
"""Agrège les classifications taxonomiques."""
|
| 313 |
+
try:
|
| 314 |
+
from picarones.core.taxonomy import aggregate_taxonomy, TaxonomyResult
|
| 315 |
+
results = [
|
| 316 |
+
TaxonomyResult.from_dict(dr.taxonomy)
|
| 317 |
+
for dr in doc_results
|
| 318 |
+
if dr.taxonomy is not None
|
| 319 |
+
]
|
| 320 |
+
if not results:
|
| 321 |
+
return None
|
| 322 |
+
return aggregate_taxonomy(results)
|
| 323 |
+
except Exception:
|
| 324 |
+
return None
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def _aggregate_structure(doc_results: list) -> Optional[dict]:
|
| 328 |
+
"""Agrège les métriques structurelles."""
|
| 329 |
+
try:
|
| 330 |
+
from picarones.core.structure import aggregate_structure, StructureResult
|
| 331 |
+
results = [
|
| 332 |
+
StructureResult.from_dict(dr.structure)
|
| 333 |
+
for dr in doc_results
|
| 334 |
+
if dr.structure is not None
|
| 335 |
+
]
|
| 336 |
+
if not results:
|
| 337 |
+
return None
|
| 338 |
+
return aggregate_structure(results)
|
| 339 |
+
except Exception:
|
| 340 |
+
return None
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def _aggregate_image_quality(doc_results: list) -> Optional[dict]:
|
| 344 |
+
"""Agrège les métriques de qualité image."""
|
| 345 |
+
try:
|
| 346 |
+
from picarones.core.image_quality import aggregate_image_quality, ImageQualityResult
|
| 347 |
+
results = [
|
| 348 |
+
ImageQualityResult.from_dict(dr.image_quality)
|
| 349 |
+
for dr in doc_results
|
| 350 |
+
if dr.image_quality is not None
|
| 351 |
+
]
|
| 352 |
+
if not results:
|
| 353 |
+
return None
|
| 354 |
+
return aggregate_image_quality(results)
|
| 355 |
+
except Exception:
|
| 356 |
+
return None
|
picarones/core/structure.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyse structurelle des résultats OCR.
|
| 2 |
+
|
| 3 |
+
Mesures
|
| 4 |
+
-------
|
| 5 |
+
- **Taux de fusion de lignes** : l'OCR produit moins de lignes que le GT
|
| 6 |
+
(plusieurs lignes GT fusionnées en une seule).
|
| 7 |
+
- **Taux de fragmentation** : l'OCR produit plus de lignes que le GT
|
| 8 |
+
(une ligne GT découpée en plusieurs).
|
| 9 |
+
- **Score d'ordre de lecture** : corrélation entre l'ordre des mots GT et OCR,
|
| 10 |
+
approximé par la longueur de la sous-séquence commune la plus longue (LCS).
|
| 11 |
+
- **Taux de conservation des paragraphes** : respect des sauts de paragraphe.
|
| 12 |
+
|
| 13 |
+
Ces métriques sont calculées indépendamment du contenu textuel — elles mesurent
|
| 14 |
+
la fidélité de la mise en page, pas la qualité des caractères.
|
| 15 |
+
|
| 16 |
+
Note : sans bounding boxes disponibles, l'analyse se base uniquement sur les
|
| 17 |
+
sauts de ligne présents dans les textes GT et OCR.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import difflib
|
| 23 |
+
from dataclasses import dataclass
|
| 24 |
+
from typing import Optional
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class StructureResult:
    """Structural (layout) comparison of one OCR output against its ground truth.

    Stores raw line counts and two pre-computed scores; the fusion rate,
    fragmentation rate and line-count accuracy are derived on access.
    """

    # Number of lines in the ground truth (GT).
    gt_line_count: int = 0
    # Number of lines in the OCR output.
    ocr_line_count: int = 0

    # Number of line fusions (GT lines absorbed into another line).
    line_fusion_count: int = 0
    # Number of line fragmentations (GT lines split apart).
    line_fragmentation_count: int = 0

    # Reading-order score in [0, 1]; 1 means perfect order.
    reading_order_score: float = 1.0

    # Paragraph conservation score in [0, 1].
    paragraph_conservation_score: float = 1.0

    @property
    def line_fusion_rate(self) -> float:
        """Return fusions divided by GT lines, or 0.0 when there are no GT lines."""
        if self.gt_line_count > 0:
            return self.line_fusion_count / self.gt_line_count
        return 0.0

    @property
    def line_fragmentation_rate(self) -> float:
        """Return fragmentations divided by GT lines, or 0.0 when there are no GT lines."""
        if self.gt_line_count > 0:
            return self.line_fragmentation_count / self.gt_line_count
        return 0.0

    @property
    def line_accuracy(self) -> float:
        """Return ``1 - |gt - ocr| / max(gt, ocr)``, clamped at 0.0.

        Two empty documents (both counts equal to zero) score 1.0.
        """
        if (self.gt_line_count, self.ocr_line_count) == (0, 0):
            return 1.0
        gap = abs(self.gt_line_count - self.ocr_line_count)
        widest = max(self.gt_line_count, self.ocr_line_count)
        return max(0.0, 1.0 - gap / widest)

    def as_dict(self) -> dict:
        """Serialize the counts as-is and every rate/score rounded to 4 decimals."""
        raw_fields = (
            "gt_line_count",
            "ocr_line_count",
            "line_fusion_count",
            "line_fragmentation_count",
        )
        rounded_fields = (
            "line_fusion_rate",
            "line_fragmentation_rate",
            "line_accuracy",
            "reading_order_score",
            "paragraph_conservation_score",
        )
        payload = {key: getattr(self, key) for key in raw_fields}
        for key in rounded_fields:
            payload[key] = round(getattr(self, key), 4)
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "StructureResult":
        """Rebuild an instance from a mapping; missing keys take the field defaults.

        Derived keys written by ``as_dict`` (rates, accuracy) are ignored.
        """
        fallbacks = {
            "gt_line_count": 0,
            "ocr_line_count": 0,
            "line_fusion_count": 0,
            "line_fragmentation_count": 0,
            "reading_order_score": 1.0,
            "paragraph_conservation_score": 1.0,
        }
        return cls(**{key: data.get(key, default) for key, default in fallbacks.items()})
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def analyze_structure(ground_truth: str, hypothesis: str) -> StructureResult:
    """Compare the line/paragraph structure of an OCR output with its ground truth.

    Parameters
    ----------
    ground_truth:
        Reference text, including its line breaks.
    hypothesis:
        Text produced by the OCR engine, including its line breaks.

    Returns
    -------
    StructureResult
        Line counts (blank and whitespace-only lines excluded), fusion and
        fragmentation counts, reading-order score and paragraph score.
    """
    # Only lines that contain something other than whitespace are counted.
    reference_lines = [line for line in ground_truth.splitlines() if line.strip()]
    produced_lines = [line for line in hypothesis.splitlines() if line.strip()]

    fusions, fragmentations = _count_line_changes(reference_lines, produced_lines)

    # The two scores below work on the raw texts, not on the filtered lines.
    return StructureResult(
        gt_line_count=len(reference_lines),
        ocr_line_count=len(produced_lines),
        line_fusion_count=fusions,
        line_fragmentation_count=fragmentations,
        reading_order_score=_reading_order_score(ground_truth, hypothesis),
        paragraph_conservation_score=_paragraph_conservation_score(ground_truth, hypothesis),
    )
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _count_line_changes(gt_lines: list[str], ocr_lines: list[str]) -> tuple[int, int]:
|
| 131 |
+
"""Compte les fusions et fragmentations de lignes via SequenceMatcher."""
|
| 132 |
+
if not gt_lines or not ocr_lines:
|
| 133 |
+
return 0, 0
|
| 134 |
+
|
| 135 |
+
fusion_count = 0
|
| 136 |
+
frag_count = 0
|
| 137 |
+
|
| 138 |
+
# Aligner les lignes par contenu
|
| 139 |
+
matcher = difflib.SequenceMatcher(
|
| 140 |
+
None,
|
| 141 |
+
[l.strip()[:30] for l in gt_lines], # fingerprint court pour la comparaison
|
| 142 |
+
[l.strip()[:30] for l in ocr_lines],
|
| 143 |
+
autojunk=False,
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
| 147 |
+
if tag == "replace":
|
| 148 |
+
gt_len = i2 - i1
|
| 149 |
+
ocr_len = j2 - j1
|
| 150 |
+
if ocr_len < gt_len:
|
| 151 |
+
# Moins de lignes OCR → fusions
|
| 152 |
+
fusion_count += gt_len - ocr_len
|
| 153 |
+
elif ocr_len > gt_len:
|
| 154 |
+
# Plus de lignes OCR → fragmentations
|
| 155 |
+
frag_count += ocr_len - gt_len
|
| 156 |
+
elif tag == "delete":
|
| 157 |
+
# Lignes GT supprimées dans l'OCR → lacunes (pas fusion/frag)
|
| 158 |
+
pass
|
| 159 |
+
elif tag == "insert":
|
| 160 |
+
# Lignes insérées par l'OCR
|
| 161 |
+
frag_count += j2 - j1
|
| 162 |
+
|
| 163 |
+
return fusion_count, frag_count
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _reading_order_score(ground_truth: str, hypothesis: str) -> float:
|
| 167 |
+
"""Score d'ordre de lecture [0, 1] basé sur la LCS des mots.
|
| 168 |
+
|
| 169 |
+
On calcule la longueur de la sous-séquence commune la plus longue (LCS)
|
| 170 |
+
entre les listes de mots GT et OCR. Un score de 1 signifie que tous les
|
| 171 |
+
mots communs apparaissent dans le même ordre.
|
| 172 |
+
"""
|
| 173 |
+
gt_words = ground_truth.split()
|
| 174 |
+
hyp_words = hypothesis.split()
|
| 175 |
+
|
| 176 |
+
if not gt_words or not hyp_words:
|
| 177 |
+
return 1.0
|
| 178 |
+
|
| 179 |
+
# Utiliser SequenceMatcher pour approximer la LCS
|
| 180 |
+
matcher = difflib.SequenceMatcher(None, gt_words, hyp_words, autojunk=False)
|
| 181 |
+
# Ratio est 2 * nb_correspondances / (len_gt + len_ocr)
|
| 182 |
+
# C'est un proxy raisonnable de l'ordre de lecture
|
| 183 |
+
ratio = matcher.ratio()
|
| 184 |
+
return round(ratio, 4)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _paragraph_conservation_score(ground_truth: str, hypothesis: str) -> float:
|
| 188 |
+
"""Score de conservation des paragraphes [0, 1].
|
| 189 |
+
|
| 190 |
+
Compte les sauts de paragraphe (lignes vides) dans le GT et mesure
|
| 191 |
+
le taux de conservation dans l'OCR.
|
| 192 |
+
"""
|
| 193 |
+
# Un saut de paragraphe = deux sauts de ligne consécutifs
|
| 194 |
+
gt_paras = [p for p in ground_truth.split("\n\n") if p.strip()]
|
| 195 |
+
ocr_paras = [p for p in hypothesis.split("\n\n") if p.strip()]
|
| 196 |
+
|
| 197 |
+
n_gt_paras = len(gt_paras)
|
| 198 |
+
if n_gt_paras <= 1:
|
| 199 |
+
return 1.0 # pas de paragraphe distinct → score parfait
|
| 200 |
+
|
| 201 |
+
n_ocr_paras = len(ocr_paras)
|
| 202 |
+
delta = abs(n_gt_paras - n_ocr_paras)
|
| 203 |
+
score = max(0.0, 1.0 - delta / n_gt_paras)
|
| 204 |
+
return round(score, 4)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def aggregate_structure(results: list[StructureResult]) -> dict:
    """Average the per-document structural metrics over a corpus.

    Returns an empty dict when *results* is empty; otherwise a dict holding
    the mean of each metric (rounded to 4 decimals) and ``document_count``.
    """
    if not results:
        return {}

    import statistics

    # (output key, attribute read on each result), in output order.
    averaged_metrics = (
        ("mean_line_fusion_rate", "line_fusion_rate"),
        ("mean_line_fragmentation_rate", "line_fragmentation_rate"),
        ("mean_reading_order_score", "reading_order_score"),
        ("mean_paragraph_conservation", "paragraph_conservation_score"),
        ("mean_line_accuracy", "line_accuracy"),
    )

    summary: dict = {}
    for output_key, attribute in averaged_metrics:
        samples = [getattr(item, attribute) for item in results]
        summary[output_key] = round(statistics.mean(samples), 4)
    summary["document_count"] = len(results)
    return summary
|
picarones/core/taxonomy.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Taxonomie des erreurs OCR — classification automatique (classes 1 à 9).
|
| 2 |
+
|
| 3 |
+
Chaque erreur identifiée par l'alignement GT↔OCR est catégorisée selon
|
| 4 |
+
la taxonomie Picarones :
|
| 5 |
+
|
| 6 |
+
| Classe | Nom | Description |
|
| 7 |
+
|--------|-------------------|----------------------------------------------------|
|
| 8 |
+
| 1 | visual_confusion | Confusion morphologique (rn/m, l/1, O/0, u/n…) |
|
| 9 |
+
| 2 | diacritic_error | Diacritique absent, incorrect ou ajouté |
|
| 10 |
+
| 3 | case_error | Erreur de casse uniquement (A/a) |
|
| 11 |
+
| 4 | ligature_error | Ligature non résolue ou mal résolue |
|
| 12 |
+
| 5 | abbreviation_error| Abréviation médiévale non développée |
|
| 13 |
+
| 6 | hapax | Mot introuvable dans tout lexique |
|
| 14 |
+
| 7 | segmentation_error| Fusion ou fragmentation de tokens (mots/lignes) |
|
| 15 |
+
| 8 | oov_character | Caractère hors-vocabulaire du moteur |
|
| 16 |
+
| 9 | lacuna | Texte présent dans le GT absent de l'OCR |
|
| 17 |
+
| 10 | over_normalization| Sur-normalisation LLM (voir pipelines/) |
|
| 18 |
+
|
| 19 |
+
Note : la classe 10 est calculée par picarones/pipelines/over_normalization.py.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import difflib
|
| 25 |
+
import unicodedata
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
+
from typing import Optional
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# Tables de référence pour la classification
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
|
| 34 |
+
#: Confusions visuelles bien connues en OCR (caractères morphologiquement proches)
|
| 35 |
+
VISUAL_CONFUSIONS: dict[frozenset, str] = {}
|
| 36 |
+
_VISUAL_PAIRS: list[tuple[str, str]] = [
|
| 37 |
+
# Minuscules
|
| 38 |
+
("r", "n"), ("rn", "m"), ("l", "1"), ("l", "i"), ("l", "|"),
|
| 39 |
+
("O", "0"), ("O", "o"), ("u", "n"), ("n", "u"), ("v", "u"),
|
| 40 |
+
("c", "e"), ("e", "c"), ("a", "o"), ("o", "a"),
|
| 41 |
+
("f", "ſ"), ("ſ", "f"), ("f", "t"),
|
| 42 |
+
("h", "li"), ("h", "lı"),
|
| 43 |
+
("m", "rn"), ("m", "in"),
|
| 44 |
+
("d", "cl"), ("d", "a"),
|
| 45 |
+
("q", "g"), ("p", "q"),
|
| 46 |
+
# Majuscules ↔ minuscules homographes (classe 1, pas classe 3)
|
| 47 |
+
("I", "l"), ("I", "1"),
|
| 48 |
+
# Chiffres
|
| 49 |
+
("1", "I"), ("1", "l"), ("0", "O"),
|
| 50 |
+
# Ponctuation
|
| 51 |
+
(".", ","), (",", "."),
|
| 52 |
+
]
|
| 53 |
+
for _a, _b in _VISUAL_PAIRS:
|
| 54 |
+
VISUAL_CONFUSIONS[frozenset({_a, _b})] = f"{_a}/{_b}"
|
| 55 |
+
|
| 56 |
+
#: Couples de ligatures pour la détection des erreurs de ligatures
|
| 57 |
+
from picarones.core.char_scores import LIGATURE_TABLE, DIACRITIC_MAP # noqa: E402
|
| 58 |
+
|
| 59 |
+
# Caractères hors-ASCII présumés hors-vocabulaire (alphabet non latin de base)
|
| 60 |
+
_LATIN_BASIC = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
|
| 61 |
+
" \t\n.,;:!?-_'\"«»()[]{}/@#%&*+=/\\|<>~^")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# Résultat structuré
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
@dataclass
class TaxonomyResult:
    """Outcome of the taxonomic error classification for one document."""

    # Error count per class; keys are class names such as 'visual_confusion'
    # or 'diacritic_error'.
    counts: dict[str, int] = field(default_factory=dict)

    # Sample errors per class, each a dict such as
    # {'gt': 'text', 'ocr': 'text', 'position': int}.
    examples: dict[str, list[dict]] = field(default_factory=dict)

    # Total number of classified errors.
    total_errors: int = 0

    @property
    def class_distribution(self) -> dict[str, float]:
        """Return each class's share of ``total_errors`` (0-1, 4 decimals).

        The result is empty when ``total_errors`` is zero.
        """
        if not self.total_errors:
            return {}
        shares: dict[str, float] = {}
        for label, occurrences in self.counts.items():
            shares[label] = round(occurrences / self.total_errors, 4)
        return shares

    def as_dict(self) -> dict:
        """Serialize the result, keeping at most 3 examples per class."""
        payload: dict = {}
        payload["counts"] = self.counts
        payload["total_errors"] = self.total_errors
        payload["class_distribution"] = self.class_distribution
        payload["examples"] = {
            label: samples[:3] for label, samples in self.examples.items()
        }
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TaxonomyResult":
        """Rebuild an instance from a mapping; missing keys take the field defaults."""
        stored_counts = data.get("counts", {})
        stored_examples = data.get("examples", {})
        stored_total = data.get("total_errors", 0)
        return cls(
            counts=stored_counts,
            examples=stored_examples,
            total_errors=stored_total,
        )
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Class names, in taxonomy order (classes 1 to 9).
ERROR_CLASSES = [
    "visual_confusion",    # 1
    "diacritic_error",     # 2
    "case_error",          # 3
    "ligature_error",      # 4
    "abbreviation_error",  # 5
    "hapax",               # 6
    "segmentation_error",  # 7
    "oov_character",       # 8
    "lacuna",              # 9
]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
# Classification principale
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
|
| 130 |
+
def classify_errors(
    ground_truth: str,
    hypothesis: str,
    max_examples: int = 5,
) -> TaxonomyResult:
    """Classify the OCR errors found in one GT/OCR text pair.

    Both texts are split on whitespace and aligned word by word with
    ``difflib.SequenceMatcher``. Each alignment region is handled as follows:

    - ``delete``: every GT word missing from the OCR is a ``lacuna``.
    - ``insert``: every OCR-only word that ``_is_oov_word`` flags is an
      ``oov_character``; other inserted words are not counted.
    - ``replace`` with different word counts: the difference in word count is
      added to ``segmentation_error``.
    - ``replace`` with equal word counts: words are paired in order and each
      differing pair is classified by ``_classify_word_error``.

    Parameters
    ----------
    ground_truth:
        Reference text (ground truth).
    hypothesis:
        Text produced by the OCR engine.
    max_examples:
        Maximum number of examples kept per class.

    Returns
    -------
    TaxonomyResult
        Counts and examples for every class in ``ERROR_CLASSES``, plus the
        total number of errors. Examples recorded here carry a ``position``
        key holding a GT word index.
    """
    counts: dict[str, int] = {cls: 0 for cls in ERROR_CLASSES}
    examples: dict[str, list[dict]] = {cls: [] for cls in ERROR_CLASSES}
    total = 0

    if not ground_truth and not hypothesis:
        return TaxonomyResult(counts=counts, examples=examples, total_errors=0)

    gt_words = ground_truth.split()
    hyp_words = hypothesis.split()

    word_matcher = difflib.SequenceMatcher(None, gt_words, hyp_words, autojunk=False)
    for tag, i1, i2, j1, j2 in word_matcher.get_opcodes():
        if tag == "delete":
            # GT words absent from the OCR -> lacuna (class 9). Each example
            # records the index of its own word, not the start of the run.
            for position, word in enumerate(gt_words[i1:i2], start=i1):
                counts["lacuna"] += 1
                total += 1
                if len(examples["lacuna"]) < max_examples:
                    examples["lacuna"].append(
                        {"gt": word, "ocr": "", "position": position}
                    )

        elif tag == "insert":
            # Words added by the OCR -> class 8 when flagged out-of-vocabulary.
            for word in hyp_words[j1:j2]:
                if _is_oov_word(word):
                    counts["oov_character"] += 1
                    total += 1
                    if len(examples["oov_character"]) < max_examples:
                        # position = GT index at which the insertion occurs.
                        examples["oov_character"].append(
                            {"gt": "", "ocr": word, "position": i1}
                        )

        elif tag == "replace":
            gt_seg = gt_words[i1:i2]
            hyp_seg = hyp_words[j1:j2]
            if len(hyp_seg) != len(gt_seg):
                # Different word counts -> word fusion or fragmentation (class 7).
                n_seg = abs(len(gt_seg) - len(hyp_seg))
                counts["segmentation_error"] += n_seg
                total += n_seg
                if len(examples["segmentation_error"]) < max_examples:
                    examples["segmentation_error"].append({
                        "gt": " ".join(gt_seg),
                        "ocr": " ".join(hyp_seg),
                        "position": i1,
                    })
            else:
                # Same word count -> classify each differing pair on its own.
                for gt_w, hyp_w in zip(gt_seg, hyp_seg):
                    if gt_w != hyp_w:
                        _classify_word_error(
                            gt_w, hyp_w, counts, examples, max_examples
                        )
                        total += 1

    return TaxonomyResult(
        counts=counts,
        examples=examples,
        total_errors=total,
    )
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def _classify_word_error(
    gt_word: str,
    hyp_word: str,
    counts: dict[str, int],
    examples: dict[str, list[dict]],
    max_examples: int,
) -> None:
    """Assign exactly one error class to a pair of differing words.

    The first matching rule wins, in this order: case, ligature,
    abbreviation, diacritic, visual confusion, out-of-vocabulary character,
    and finally ``hapax`` as the catch-all. ``counts`` and ``examples`` are
    updated in place; at most ``max_examples`` examples are kept per class.
    """

    def _record(label: str) -> None:
        counts[label] += 1
        bucket = examples[label]
        if len(bucket) < max_examples:
            bucket.append({"gt": gt_word, "ocr": hyp_word})

    # Class 3: the words differ only by letter case.
    if gt_word != hyp_word and gt_word.casefold() == hyp_word.casefold():
        _record("case_error")
        return

    # The next four rules compare NFC-normalized forms.
    gt_nfc = unicodedata.normalize("NFC", gt_word)
    hyp_nfc = unicodedata.normalize("NFC", hyp_word)
    ordered_rules = (
        ("ligature_error", _is_ligature_error),          # class 4
        ("abbreviation_error", _is_abbreviation_error),  # class 5
        ("diacritic_error", _is_diacritic_error),        # class 2
        ("visual_confusion", _is_visual_confusion),      # class 1
    )
    for label, matches in ordered_rules:
        if matches(gt_nfc, hyp_nfc):
            _record(label)
            return

    # Class 8 is decided on the raw OCR word; class 6 is the residual bucket.
    _record("oov_character" if _is_oov_word(hyp_word) else "hapax")
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def _is_ligature_error(gt: str, hyp: str) -> bool:
    """Return True when the difference involves a ligature from ``LIGATURE_TABLE``.

    Two situations are detected for each ligature and its listed sequences:

    - the GT contains the ligature, the OCR does not, and the OCR contains
      one of the sequences;
    - the OCR contains the ligature and the GT contains one of the sequences
      (whether or not the GT also contains the ligature).
    """
    for ligature, expansions in LIGATURE_TABLE.items():
        decomposed_by_ocr = (
            ligature in gt
            and ligature not in hyp
            and any(expansion in hyp for expansion in expansions)
        )
        if decomposed_by_ocr:
            return True

        composed_by_ocr = ligature in hyp and any(
            expansion in gt for expansion in expansions
        )
        if composed_by_ocr:
            return True

    return False
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _is_abbreviation_error(gt: str, hyp: str) -> bool:
|
| 286 |
+
"""Vrai si le GT contient un caractère d'abréviation médiévale."""
|
| 287 |
+
abbreviation_chars = "\uA751\uA753\uA757" # ꝑ ꝓ ꝗ
|
| 288 |
+
return any(c in gt for c in abbreviation_chars)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def _is_diacritic_error(gt: str, hyp: str) -> bool:
    """Return True when the difference is mainly due to diacritics.

    Both words are stripped of combining marks (NFD decomposition, then every
    character of Unicode category ``Mn`` is dropped). If the stripped forms
    are equal ignoring case while the words themselves differ, the error is
    diacritic.
    """

    def strip_diacritics(text: str) -> str:
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")

    gt_stripped = strip_diacritics(gt)
    hyp_stripped = strip_diacritics(hyp)

    # Same word once diacritics (and case) are ignored -> diacritic error.
    if gt != hyp and gt_stripped.casefold() == hyp_stripped.casefold():
        return True

    # Fallback kept from the original implementation. Whenever the stripped
    # forms are equal and the words differ, the test above has already
    # returned, so this can only be True when ``gt == hyp`` and ``gt``
    # contains a character listed in DIACRITIC_MAP.
    # NOTE(review): that looks unintended — confirm the desired behaviour
    # before changing it.
    gt_has_diac = any(c in DIACRITIC_MAP for c in gt)
    return gt_has_diac and len(gt) == len(hyp) and gt_stripped == hyp_stripped
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def _is_visual_confusion(gt: str, hyp: str) -> bool:
    """Return True when the difference matches a known visual confusion.

    Words whose lengths differ by more than 2 are never matched. Otherwise a
    pair ``{a, b}`` from ``VISUAL_CONFUSIONS`` matches when one member occurs
    in the GT, the other occurs in the OCR word, and the GT-side member is
    absent from the OCR word. This is a substring test, not a positional
    alignment.
    """
    if abs(len(gt) - len(hyp)) > 2:
        return False

    for pair in VISUAL_CONFUSIONS:
        members = list(pair)
        if len(members) != 2:
            continue
        first, second = members
        first_became_second = first in gt and second in hyp and first not in hyp
        second_became_first = second in gt and first in hyp and second not in hyp
        if first_became_second or second_became_first:
            return True

    return False
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def _is_oov_word(word: str) -> bool:
    """Return True when *word* contains an out-of-vocabulary character.

    A character is out-of-vocabulary when it is neither in ``_LATIN_BASIC``
    nor alphabetic according to ``str.isalpha``; letters outside the basic
    Latin alphabet (accented letters, for instance) therefore do not make a
    word out-of-vocabulary.
    """
    for char in word:
        if not (char in _LATIN_BASIC or char.isalpha()):
            return True
    return False
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
# ---------------------------------------------------------------------------
|
| 331 |
+
# Agrégation
|
| 332 |
+
# ---------------------------------------------------------------------------
|
| 333 |
+
|
| 334 |
+
def aggregate_taxonomy(results: list[TaxonomyResult]) -> dict:
    """Sum the per-document taxonomy results over a corpus.

    Returns a dict with the summed ``counts`` (every class of
    ``ERROR_CLASSES`` is present, plus any extra key found in the results),
    the summed ``total_errors`` and the resulting ``class_distribution``
    (all 0.0 when there is no error at all).
    """
    merged: dict[str, int] = dict.fromkeys(ERROR_CLASSES, 0)
    grand_total = 0
    for document in results:
        for label, occurrences in document.counts.items():
            merged[label] = merged.get(label, 0) + occurrences
        grand_total += document.total_errors

    if grand_total > 0:
        shares = {
            label: round(occurrences / grand_total, 4)
            for label, occurrences in merged.items()
        }
    else:
        shares = {label: 0.0 for label in merged}

    return {
        "counts": merged,
        "total_errors": grand_total,
        "class_distribution": shares,
    }
|
picarones/engines/__init__.py
CHANGED
|
@@ -2,8 +2,18 @@
|
|
| 2 |
|
| 3 |
from picarones.engines.base import BaseOCREngine, EngineResult
|
| 4 |
from picarones.engines.tesseract import TesseractEngine
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
try:
|
| 9 |
from picarones.engines.pero_ocr import PeroOCREngine
|
|
|
|
| 2 |
|
| 3 |
from picarones.engines.base import BaseOCREngine, EngineResult
|
| 4 |
from picarones.engines.tesseract import TesseractEngine
|
| 5 |
+
from picarones.engines.mistral_ocr import MistralOCREngine
|
| 6 |
+
from picarones.engines.google_vision import GoogleVisionEngine
|
| 7 |
+
from picarones.engines.azure_doc_intel import AzureDocIntelEngine
|
| 8 |
|
| 9 |
+
__all__ = [
|
| 10 |
+
"BaseOCREngine",
|
| 11 |
+
"EngineResult",
|
| 12 |
+
"TesseractEngine",
|
| 13 |
+
"MistralOCREngine",
|
| 14 |
+
"GoogleVisionEngine",
|
| 15 |
+
"AzureDocIntelEngine",
|
| 16 |
+
]
|
| 17 |
|
| 18 |
try:
|
| 19 |
from picarones.engines.pero_ocr import PeroOCREngine
|
picarones/engines/azure_doc_intel.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Azure Document Intelligence (anciennement Form Recognizer).
|
| 2 |
+
|
| 3 |
+
Utilise l'API Azure Document Intelligence pour la reconnaissance de texte
|
| 4 |
+
dans des documents historiques.
|
| 5 |
+
|
| 6 |
+
Variables d'environnement requises :
|
| 7 |
+
- ``AZURE_DOC_INTEL_KEY`` : clé API Azure
|
| 8 |
+
- ``AZURE_DOC_INTEL_ENDPOINT`` : URL de l'endpoint (ex : https://moninstance.cognitiveservices.azure.com/)
|
| 9 |
+
|
| 10 |
+
Documentation : https://learn.microsoft.com/azure/ai-services/document-intelligence/
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import base64
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import time
|
| 19 |
+
import urllib.error
|
| 20 |
+
import urllib.request
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
from picarones.engines.base import BaseOCREngine
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class AzureDocIntelEngine(BaseOCREngine):
|
| 28 |
+
"""Moteur OCR via Azure Document Intelligence.
|
| 29 |
+
|
| 30 |
+
Configuration
|
| 31 |
+
-------------
|
| 32 |
+
model_id : str
|
| 33 |
+
Modèle Azure à utiliser. Défaut : ``"prebuilt-read"`` (lecture générique).
|
| 34 |
+
Alternatives : ``"prebuilt-document"``, ``"prebuilt-layout"``
|
| 35 |
+
ou un modèle entraîné personnalisé.
|
| 36 |
+
locale : str
|
| 37 |
+
Paramètre de locale pour améliorer la précision (ex : ``"fr-FR"``).
|
| 38 |
+
api_version : str
|
| 39 |
+
Version de l'API Azure (défaut : ``"2024-02-29-preview"``).
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
@property
|
| 43 |
+
def name(self) -> str:
|
| 44 |
+
return "azure_doc_intel"
|
| 45 |
+
|
| 46 |
+
def version(self) -> str:
|
| 47 |
+
return self.config.get("api_version", "2024-02-29-preview")
|
| 48 |
+
|
| 49 |
+
def __init__(self, config: Optional[dict] = None) -> None:
|
| 50 |
+
super().__init__(config)
|
| 51 |
+
self._api_key = os.environ.get("AZURE_DOC_INTEL_KEY")
|
| 52 |
+
self._endpoint = (
|
| 53 |
+
os.environ.get("AZURE_DOC_INTEL_ENDPOINT", "").rstrip("/")
|
| 54 |
+
or self.config.get("endpoint", "").rstrip("/")
|
| 55 |
+
)
|
| 56 |
+
self._model_id: str = self.config.get("model_id", "prebuilt-read")
|
| 57 |
+
self._locale: str = self.config.get("locale", "fr-FR")
|
| 58 |
+
self._api_version: str = self.config.get("api_version", "2024-02-29-preview")
|
| 59 |
+
|
| 60 |
+
def _run_ocr(self, image_path: Path) -> str:
|
| 61 |
+
if not self._api_key:
|
| 62 |
+
raise RuntimeError(
|
| 63 |
+
"Clé API Azure manquante — définissez la variable d'environnement AZURE_DOC_INTEL_KEY"
|
| 64 |
+
)
|
| 65 |
+
if not self._endpoint:
|
| 66 |
+
raise RuntimeError(
|
| 67 |
+
"Endpoint Azure manquant — définissez la variable d'environnement AZURE_DOC_INTEL_ENDPOINT"
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Essai via SDK Azure si disponible, sinon REST direct
|
| 71 |
+
try:
|
| 72 |
+
return self._run_via_sdk(image_path)
|
| 73 |
+
except ImportError:
|
| 74 |
+
return self._run_via_rest(image_path)
|
| 75 |
+
|
| 76 |
+
def _run_via_sdk(self, image_path: Path) -> str:
|
| 77 |
+
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
| 78 |
+
from azure.core.credentials import AzureKeyCredential
|
| 79 |
+
|
| 80 |
+
client = DocumentIntelligenceClient(
|
| 81 |
+
endpoint=self._endpoint,
|
| 82 |
+
credential=AzureKeyCredential(self._api_key),
|
| 83 |
+
)
|
| 84 |
+
with open(image_path, "rb") as f:
|
| 85 |
+
poller = client.begin_analyze_document(
|
| 86 |
+
model_id=self._model_id,
|
| 87 |
+
body=f,
|
| 88 |
+
locale=self._locale,
|
| 89 |
+
content_type="application/octet-stream",
|
| 90 |
+
)
|
| 91 |
+
result = poller.result()
|
| 92 |
+
return "\n".join(
|
| 93 |
+
line.content
|
| 94 |
+
for page in result.pages
|
| 95 |
+
for line in (page.lines or [])
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
def _run_via_rest(self, image_path: Path) -> str:
    """Analyze the image with direct REST calls (no Azure SDK required).

    The raw image bytes are posted to the model's ``:analyze`` endpoint. The
    URL returned in the ``Operation-Location`` response header is then polled
    until the analysis reports ``succeeded``, ``failed`` or ``canceled``.
    Polling makes at most 30 attempts, sleeping ``1 + attempt * 0.5`` seconds
    before each one.

    Returns:
        The recognized text as produced by ``_extract_text_from_result``.

    Raises:
        RuntimeError: on an HTTP error while submitting or polling, when the
            submit response carries no ``Operation-Location`` header, when
            the analysis ends as ``failed``/``canceled``, or when it has not
            finished after the last polling attempt.
    """
    # Local import: only this method needs to build an escaped query string.
    from urllib.parse import quote, urlencode

    def http_error(exc: urllib.error.HTTPError) -> RuntimeError:
        # ``errors="replace"`` keeps a non-UTF-8 error body from masking the
        # HTTP failure behind a UnicodeDecodeError.
        body = exc.read().decode("utf-8", errors="replace")
        return RuntimeError(f"Azure Document Intelligence erreur {exc.code}: {body}")

    image_bytes = image_path.read_bytes()
    # Escape configuration-supplied values so that an unusual model id or
    # locale cannot corrupt the request URL.
    query = urlencode({"api-version": self._api_version, "locale": self._locale})
    analyze_url = (
        f"{self._endpoint}/documentintelligence/documentModels/"
        f"{quote(str(self._model_id), safe='')}:analyze?{query}"
    )

    # Submit the image.
    submit_req = urllib.request.Request(
        analyze_url,
        data=image_bytes,
        headers={
            "Ocp-Apim-Subscription-Key": self._api_key,
            "Content-Type": "application/octet-stream",
        },
    )
    try:
        with urllib.request.urlopen(submit_req, timeout=60) as resp:
            operation_url = resp.headers.get("Operation-Location", "")
    except urllib.error.HTTPError as exc:
        raise http_error(exc) from exc

    if not operation_url:
        raise RuntimeError("Azure : pas d'Operation-Location dans la réponse")

    # Poll for the result (the analysis runs asynchronously on the service).
    headers = {"Ocp-Apim-Subscription-Key": self._api_key}
    for attempt in range(30):
        time.sleep(1 + attempt * 0.5)
        poll_req = urllib.request.Request(operation_url, headers=headers)
        try:
            with urllib.request.urlopen(poll_req, timeout=30) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            # Report polling failures the same way as submit failures.
            raise http_error(exc) from exc

        status = result.get("status", "")
        if status == "succeeded":
            return self._extract_text_from_result(result)
        if status in {"failed", "canceled"}:
            raise RuntimeError(f"Azure Document Intelligence : analyse {status}")
        # Any other status (e.g. "running"): keep waiting.

    raise RuntimeError("Azure Document Intelligence : timeout — analyse trop longue")
|
| 142 |
+
|
| 143 |
+
@staticmethod
|
| 144 |
+
def _extract_text_from_result(result: dict) -> str:
|
| 145 |
+
"""Extrait le texte brut depuis la réponse JSON Azure."""
|
| 146 |
+
pages = result.get("analyzeResult", {}).get("pages", [])
|
| 147 |
+
lines: list[str] = []
|
| 148 |
+
for page in pages:
|
| 149 |
+
for line in page.get("lines", []):
|
| 150 |
+
content = line.get("content", "")
|
| 151 |
+
if content:
|
| 152 |
+
lines.append(content)
|
| 153 |
+
return "\n".join(lines)
|
picarones/engines/google_vision.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Google Cloud Vision API.
|
| 2 |
+
|
| 3 |
+
Utilise l'API Google Cloud Vision pour la détection de texte dans des
|
| 4 |
+
documents (méthode ``DOCUMENT_TEXT_DETECTION``, optimisée pour les textes
|
| 5 |
+
denses et multilingues).
|
| 6 |
+
|
| 7 |
+
Authentification :
|
| 8 |
+
- Via service account JSON : variable d'environnement
|
| 9 |
+
``GOOGLE_APPLICATION_CREDENTIALS`` → chemin vers le fichier JSON
|
| 10 |
+
- Via clé API simple : variable d'environnement ``GOOGLE_API_KEY``
|
| 11 |
+
|
| 12 |
+
Le mode service account est recommandé pour la production.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import base64
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
import urllib.error
|
| 21 |
+
import urllib.request
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
from picarones.engines.base import BaseOCREngine
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class GoogleVisionEngine(BaseOCREngine):
    """OCR engine backed by the Google Cloud Vision API.

    Configuration
    -------------
    language_hints : list[str]
        Language hints sent with every request (default ``["fr"]``).
    feature_type : str
        Detection type: ``"DOCUMENT_TEXT_DETECTION"`` (default, intended for
        dense text) or ``"TEXT_DETECTION"`` (intended for short text).

    Authentication is read from the environment:
    ``GOOGLE_APPLICATION_CREDENTIALS`` selects the SDK code path, otherwise
    ``GOOGLE_API_KEY`` selects the REST code path.
    """

    @property
    def name(self) -> str:
        """Identifier of this engine."""
        return "google_vision"

    def version(self) -> str:
        """Version label reported for this engine."""
        return "v1"

    def __init__(self, config: Optional[dict] = None) -> None:
        """Read credentials from the environment and options from ``config``."""
        super().__init__(config)
        self._api_key = os.environ.get("GOOGLE_API_KEY")
        self._credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        self._language_hints: list[str] = self.config.get("language_hints", ["fr"])
        self._feature_type: str = self.config.get("feature_type", "DOCUMENT_TEXT_DETECTION")

    def _run_ocr(self, image_path: Path) -> str:
        """Transcribe ``image_path`` using whichever credentials are configured.

        Raises:
            RuntimeError: if neither credential environment variable is set.
        """
        # The choice depends on the credentials, not on SDK availability:
        # service-account credentials use the SDK, an API key uses REST.
        if self._credentials_path:
            return self._run_via_sdk(image_path)
        if self._api_key:
            return self._run_via_rest(image_path)
        raise RuntimeError(
            "Authentification Google Vision manquante. Définissez "
            "GOOGLE_APPLICATION_CREDENTIALS (service account JSON) "
            "ou GOOGLE_API_KEY."
        )

    def _run_via_sdk(self, image_path: Path) -> str:
        """Transcribe the image through the ``google-cloud-vision`` client.

        Raises:
            RuntimeError: if the package is not installed, or if the response
                carries an error message.
        """
        try:
            from google.cloud import vision
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'google-cloud-vision' n'est pas installé. "
                "Lancez : pip install google-cloud-vision"
            ) from exc

        client = vision.ImageAnnotatorClient()
        image = vision.Image(content=image_path.read_bytes())
        context = vision.ImageContext(language_hints=self._language_hints)
        document_mode = self._feature_type == "DOCUMENT_TEXT_DETECTION"

        if document_mode:
            response = client.document_text_detection(image=image, image_context=context)
        else:
            response = client.text_detection(image=image, image_context=context)

        # Per-image failures are reported in ``response.error`` rather than
        # raised; without this check an API error would be returned as empty
        # text. The REST path performs the equivalent check.
        if response.error.message:
            raise RuntimeError(f"Google Vision API erreur : {response.error.message}")

        if document_mode:
            return response.full_text_annotation.text
        texts = response.text_annotations
        return texts[0].description if texts else ""

    def _run_via_rest(self, image_path: Path) -> str:
        """Transcribe the image with a direct REST call authenticated by API key.

        Raises:
            RuntimeError: on an HTTP error or when the response contains an
                ``error`` entry.
        """
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
        payload = {
            "requests": [
                {
                    "image": {"content": image_b64},
                    "features": [{"type": self._feature_type, "maxResults": 1}],
                    "imageContext": {"languageHints": self._language_hints},
                }
            ]
        }
        url = f"https://vision.googleapis.com/v1/images:annotate?key={self._api_key}"
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            raise RuntimeError(f"Google Vision API erreur {exc.code}: {exc.read().decode()}") from exc

        responses = result.get("responses", [{}])
        if not responses:
            return ""
        first = responses[0]
        if "error" in first:
            raise RuntimeError(f"Google Vision API erreur : {first['error']}")

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            return first.get("fullTextAnnotation", {}).get("text", "")
        texts = first.get("textAnnotations", [])
        return texts[0]["description"] if texts else ""
|
picarones/engines/mistral_ocr.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Mistral OCR (API vision Mistral AI).
|
| 2 |
+
|
| 3 |
+
Utilise l'API Mistral pour la reconnaissance de texte sur documents
|
| 4 |
+
patrimoniaux via le modèle multimodal Mistral.
|
| 5 |
+
|
| 6 |
+
Clé API : variable d'environnement ``MISTRAL_API_KEY``.
|
| 7 |
+
|
| 8 |
+
Documentation API : https://docs.mistral.ai/
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import base64
|
| 14 |
+
import os
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
from picarones.engines.base import BaseOCREngine
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MistralOCREngine(BaseOCREngine):
    """OCR engine backed by the Mistral AI chat API with a vision model.

    Configuration
    -------------
    model : str
        Mistral model to use (default ``"pixtral-12b-2409"``).
    prompt : str
        Prompt sent along with the image. Defaults to a generic French
        transcription instruction.
    max_tokens : int
        Output token limit (default 4096).

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.
    """

    # Media type used in the data URL, keyed by lowercase file suffix.
    # Unknown suffixes fall back to ``image/jpeg``.
    _MEDIA_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".webp": "image/webp",
    }

    @property
    def name(self) -> str:
        """Identifier of this engine."""
        return "mistral_ocr"

    def version(self) -> str:
        """Return the configured model name, which serves as the version label."""
        return self.config.get("model", "pixtral-12b-2409")

    def __init__(self, config: Optional[dict] = None) -> None:
        """Read the API key from the environment and options from ``config``."""
        super().__init__(config)
        self._api_key = os.environ.get("MISTRAL_API_KEY")
        self._model = self.config.get("model", "pixtral-12b-2409")
        self._prompt = self.config.get(
            "prompt",
            "Transcris fidèlement le texte visible sur cette image de document "
            "historique. Retourne uniquement le texte, sans commentaire.",
        )
        self._max_tokens = int(self.config.get("max_tokens", 4096))

    def _run_ocr(self, image_path: Path) -> str:
        """Send the image to the chat API and return the transcribed text.

        Raises:
            RuntimeError: if the API key is missing, the ``mistralai`` package
                is not installed, or the response contains no choice.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
            )
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
            ) from exc

        # Embed the image as a base64 data URL tagged with its media type.
        media_type = self._MEDIA_TYPES.get(image_path.suffix.lower(), "image/jpeg")
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
        image_url = f"data:{media_type};base64,{image_b64}"

        client = Mistral(api_key=self._api_key)
        response = client.chat.complete(
            model=self._model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self._prompt},
                        {"type": "image_url", "image_url": image_url},
                    ],
                }
            ],
            max_tokens=self._max_tokens,
        )

        choices = response.choices or []
        if not choices:
            # Raise an explicit error instead of a bare IndexError.
            raise RuntimeError("Mistral : réponse vide (aucun choix retourné)")
        return self._content_to_text(choices[0].message.content)

    @staticmethod
    def _content_to_text(content) -> str:
        """Normalize a message ``content`` value to a plain string.

        ``None`` or empty content yields ``""`` and a string is returned
        unchanged. Any other iterable is treated as a sequence of chunks:
        string items and the ``text`` attribute of other items are
        concatenated, and items without text are skipped.
        """
        if not content:
            return ""
        if isinstance(content, str):
            return content
        return "".join(
            chunk if isinstance(chunk, str) else (getattr(chunk, "text", None) or "")
            for chunk in content
        )
|
picarones/fixtures.py
CHANGED
|
@@ -18,24 +18,32 @@ from typing import Optional
|
|
| 18 |
from picarones.core.metrics import MetricsResult, aggregate_metrics
|
| 19 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
| 20 |
from picarones.pipelines.over_normalization import detect_over_normalization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
| 24 |
# ---------------------------------------------------------------------------
|
| 25 |
|
| 26 |
_GT_TEXTS = [
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
|
|
|
| 39 |
]
|
| 40 |
|
| 41 |
# ---------------------------------------------------------------------------
|
|
@@ -289,6 +297,14 @@ def generate_sample_benchmark(
|
|
| 289 |
|
| 290 |
metrics = _make_metrics(gt, hypothesis)
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
doc_results.append(
|
| 293 |
DocumentResult(
|
| 294 |
doc_id=doc_id,
|
|
@@ -299,6 +315,14 @@ def generate_sample_benchmark(
|
|
| 299 |
duration_seconds=duration,
|
| 300 |
ocr_intermediate=ocr_intermediate,
|
| 301 |
pipeline_metadata=pipeline_meta,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
)
|
| 303 |
)
|
| 304 |
|
|
@@ -320,12 +344,54 @@ def generate_sample_benchmark(
|
|
| 320 |
"document_count": len(over_norms),
|
| 321 |
}
|
| 322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
report = EngineReport(
|
| 324 |
engine_name=engine_name,
|
| 325 |
engine_version=engine_version,
|
| 326 |
engine_config=engine_cfg,
|
| 327 |
document_results=doc_results,
|
| 328 |
pipeline_info=effective_pipeline_info,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
)
|
| 330 |
engine_reports.append(report)
|
| 331 |
|
|
|
|
| 18 |
from picarones.core.metrics import MetricsResult, aggregate_metrics
|
| 19 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
| 20 |
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 21 |
+
# Sprint 5 — métriques avancées
|
| 22 |
+
from picarones.core.confusion import build_confusion_matrix
|
| 23 |
+
from picarones.core.char_scores import compute_ligature_score, compute_diacritic_score
|
| 24 |
+
from picarones.core.taxonomy import classify_errors, aggregate_taxonomy
|
| 25 |
+
from picarones.core.structure import analyze_structure, aggregate_structure
|
| 26 |
+
from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
|
| 27 |
+
from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
|
| 28 |
|
| 29 |
# ---------------------------------------------------------------------------
|
| 30 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
| 31 |
# ---------------------------------------------------------------------------
|
| 32 |
|
| 33 |
_GT_TEXTS = [
|
| 34 |
+
# Textes avec graphies médiévales incluant ſ, &, u/v — pour démontrer le CER diplomatique
|
| 35 |
+
"Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre.",
|
| 36 |
+
"En l'an de grace mil trois cens ſoixante, regnoit en France le noble roy Jehan, filz du roy Phelippe de Valois.",
|
| 37 |
+
"Item ledit iour furent menez en ladicte ville de Paris pluſieurs priſonniers ſaraſins & mahommetans.",
|
| 38 |
+
"Le chancellier du roy manda à tous les baillifs & ſeneſchaulx que on feiſt crier & publier par tous les carrefours.",
|
| 39 |
+
"Cy après ſenſuyt la copie des lettres patentes données par noſtre ſeigneur le roy à ſes très chiers & feaulx.",
|
| 40 |
+
"Nous Charles, par la grace de Dieu roy de France, à tous ceulx qui ces preſentes lettres verront, ſalut.",
|
| 41 |
+
"Sauoir faiſons que pour conſidéracion des bons & aggreables ſeruices que noſtre amé & feal conſeillier.",
|
| 42 |
+
"Donné à Paris, le vingt & deuxième iour du mois de iuillet, l'an de grace mil quatre cens & troys.",
|
| 43 |
+
"Les deſſus ditz ambaſſadeurs reſpondirent que leur ſeigneur & maiſtre eſtoit très ioyeulx de ceſte aliance.",
|
| 44 |
+
"Après lesquelles choſes ainſi faictes & paſſées, le dit traictié fut ratiffié & confirmé de toutes parties.",
|
| 45 |
+
"Item, en ladicte année, fut faicte grant aſſemblée de gens d'armes tant à cheual que à pied.",
|
| 46 |
+
"Et pour ce que la choſe eſt notoire & manifeſte, nous auons fait mettre noſtre ſcel à ces preſentes.",
|
| 47 |
]
|
| 48 |
|
| 49 |
# ---------------------------------------------------------------------------
|
|
|
|
| 297 |
|
| 298 |
metrics = _make_metrics(gt, hypothesis)
|
| 299 |
|
| 300 |
+
# Sprint 5 — métriques avancées patrimoniales
|
| 301 |
+
cm = build_confusion_matrix(gt, hypothesis)
|
| 302 |
+
lig_score = compute_ligature_score(gt, hypothesis)
|
| 303 |
+
diac_score = compute_diacritic_score(gt, hypothesis)
|
| 304 |
+
taxonomy_result = classify_errors(gt, hypothesis)
|
| 305 |
+
struct_result = analyze_structure(gt, hypothesis)
|
| 306 |
+
iq_result = generate_mock_quality_scores(doc_id, seed=rng.randint(0, 999999))
|
| 307 |
+
|
| 308 |
doc_results.append(
|
| 309 |
DocumentResult(
|
| 310 |
doc_id=doc_id,
|
|
|
|
| 315 |
duration_seconds=duration,
|
| 316 |
ocr_intermediate=ocr_intermediate,
|
| 317 |
pipeline_metadata=pipeline_meta,
|
| 318 |
+
confusion_matrix=cm.as_dict(),
|
| 319 |
+
char_scores={
|
| 320 |
+
"ligature": lig_score.as_dict(),
|
| 321 |
+
"diacritic": diac_score.as_dict(),
|
| 322 |
+
},
|
| 323 |
+
taxonomy=taxonomy_result.as_dict(),
|
| 324 |
+
structure=struct_result.as_dict(),
|
| 325 |
+
image_quality=iq_result.as_dict(),
|
| 326 |
)
|
| 327 |
)
|
| 328 |
|
|
|
|
| 344 |
"document_count": len(over_norms),
|
| 345 |
}
|
| 346 |
|
| 347 |
+
# Agrégation Sprint 5
|
| 348 |
+
from picarones.core.confusion import aggregate_confusion_matrices, ConfusionMatrix
|
| 349 |
+
from picarones.core.char_scores import LigatureScore, DiacriticScore
|
| 350 |
+
from picarones.core.taxonomy import TaxonomyResult
|
| 351 |
+
from picarones.core.structure import StructureResult
|
| 352 |
+
from picarones.core.image_quality import ImageQualityResult
|
| 353 |
+
|
| 354 |
+
agg_confusion = aggregate_confusion_matrices([
|
| 355 |
+
ConfusionMatrix(**dr.confusion_matrix)
|
| 356 |
+
for dr in doc_results if dr.confusion_matrix
|
| 357 |
+
]).as_compact_dict(min_count=1)
|
| 358 |
+
|
| 359 |
+
agg_lig = aggregate_ligature_scores([
|
| 360 |
+
LigatureScore(**dr.char_scores["ligature"])
|
| 361 |
+
for dr in doc_results if dr.char_scores
|
| 362 |
+
])
|
| 363 |
+
agg_diac = aggregate_diacritic_scores([
|
| 364 |
+
DiacriticScore(**dr.char_scores["diacritic"])
|
| 365 |
+
for dr in doc_results if dr.char_scores
|
| 366 |
+
])
|
| 367 |
+
agg_char_scores = {"ligature": agg_lig, "diacritic": agg_diac}
|
| 368 |
+
|
| 369 |
+
agg_taxonomy = aggregate_taxonomy([
|
| 370 |
+
TaxonomyResult.from_dict(dr.taxonomy)
|
| 371 |
+
for dr in doc_results if dr.taxonomy
|
| 372 |
+
])
|
| 373 |
+
|
| 374 |
+
agg_structure = aggregate_structure([
|
| 375 |
+
StructureResult.from_dict(dr.structure)
|
| 376 |
+
for dr in doc_results if dr.structure
|
| 377 |
+
])
|
| 378 |
+
|
| 379 |
+
agg_iq = aggregate_image_quality([
|
| 380 |
+
ImageQualityResult.from_dict(dr.image_quality)
|
| 381 |
+
for dr in doc_results if dr.image_quality
|
| 382 |
+
])
|
| 383 |
+
|
| 384 |
report = EngineReport(
|
| 385 |
engine_name=engine_name,
|
| 386 |
engine_version=engine_version,
|
| 387 |
engine_config=engine_cfg,
|
| 388 |
document_results=doc_results,
|
| 389 |
pipeline_info=effective_pipeline_info,
|
| 390 |
+
aggregated_confusion=agg_confusion,
|
| 391 |
+
aggregated_char_scores=agg_char_scores,
|
| 392 |
+
aggregated_taxonomy=agg_taxonomy,
|
| 393 |
+
aggregated_structure=agg_structure,
|
| 394 |
+
aggregated_image_quality=agg_iq,
|
| 395 |
)
|
| 396 |
engine_reports.append(report)
|
| 397 |
|
picarones/importers/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Importeurs de corpus depuis des sources distantes (IIIF, HuggingFace, HTR-United…)."""
|
| 2 |
+
|
| 3 |
+
from picarones.importers.iiif import IIIFImporter, import_iiif_manifest
|
| 4 |
+
|
| 5 |
+
__all__ = ["IIIFImporter", "import_iiif_manifest"]
|
picarones/importers/htr_united.py
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import depuis le catalogue HTR-United.
|
| 2 |
+
|
| 3 |
+
HTR-United est un catalogue communautaire de vérités terrain HTR/OCR publiées
|
| 4 |
+
sur GitHub sous licence ouverte. Les métadonnées sont stockées dans un fichier
|
| 5 |
+
YAML (htr-united.yml) sur https://github.com/HTR-United/htr-united.
|
| 6 |
+
|
| 7 |
+
Ce module fournit :
|
| 8 |
+
- :class:`HTRUnitedCatalogue` — chargement et recherche dans le catalogue
|
| 9 |
+
- :func:`fetch_catalogue` — téléchargement du catalogue depuis GitHub
|
| 10 |
+
- :func:`import_htr_united_corpus` — téléchargement et import d'un corpus
|
| 11 |
+
|
| 12 |
+
Exemple
|
| 13 |
+
-------
|
| 14 |
+
catalogue = HTRUnitedCatalogue.from_remote()
|
| 15 |
+
results = catalogue.search("français médiéval")
|
| 16 |
+
corpus = import_htr_united_corpus(results[0], output_dir="./corpus/")
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import re
|
| 24 |
+
import time
|
| 25 |
+
import urllib.error
|
| 26 |
+
import urllib.request
|
| 27 |
+
from dataclasses import dataclass, field
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from typing import Optional
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Catalogue remote URL
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
_CATALOGUE_URL = (
|
| 36 |
+
"https://raw.githubusercontent.com/HTR-United/htr-united/master/htr-united.yml"
|
| 37 |
+
)
|
| 38 |
+
_CATALOGUE_API_URL = (
|
| 39 |
+
"https://api.github.com/repos/HTR-United/htr-united/contents/htr-united.yml"
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Catalogue de démonstration / fallback (hors-ligne)
|
| 43 |
+
_DEMO_CATALOGUE: list[dict] = [
|
| 44 |
+
{
|
| 45 |
+
"id": "lectaurep-repertoires",
|
| 46 |
+
"title": "Lectaurep — Répertoires de notaires parisiens",
|
| 47 |
+
"url": "https://github.com/HTR-United/lectaurep-repertoires",
|
| 48 |
+
"language": ["French"],
|
| 49 |
+
"script": ["Cursiva"],
|
| 50 |
+
"century": [17, 18],
|
| 51 |
+
"institution": "Archives nationales (France)",
|
| 52 |
+
"description": "Transcriptions de répertoires de notaires, XVIIe-XVIIIe siècles.",
|
| 53 |
+
"license": "CC-BY 4.0",
|
| 54 |
+
"lines": 12400,
|
| 55 |
+
"format": "ALTO",
|
| 56 |
+
"tags": ["notaires", "Paris", "cursive", "imprimé"],
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"id": "bvmm-manuscripts",
|
| 60 |
+
"title": "BVMM — Manuscrits enluminés",
|
| 61 |
+
"url": "https://github.com/HTR-United/bvmm-manuscripts",
|
| 62 |
+
"language": ["Latin", "French"],
|
| 63 |
+
"script": ["Gothic"],
|
| 64 |
+
"century": [13, 14, 15],
|
| 65 |
+
"institution": "IRHT / BnF",
|
| 66 |
+
"description": "Manuscrits médiévaux latins et français, XIIIe-XVe siècles.",
|
| 67 |
+
"license": "CC-BY 4.0",
|
| 68 |
+
"lines": 8700,
|
| 69 |
+
"format": "ALTO",
|
| 70 |
+
"tags": ["manuscrits", "latin", "médiéval", "enluminure"],
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"id": "cremma-medieval",
|
| 74 |
+
"title": "CREMMA Médiéval",
|
| 75 |
+
"url": "https://github.com/HTR-United/cremma-medieval",
|
| 76 |
+
"language": ["French", "Latin"],
|
| 77 |
+
"script": ["Gothic", "Humanistica"],
|
| 78 |
+
"century": [12, 13, 14, 15],
|
| 79 |
+
"institution": "École des chartes / Inria",
|
| 80 |
+
"description": "Corpus CREMMA de manuscrits médiévaux français et latins.",
|
| 81 |
+
"license": "CC-BY 4.0",
|
| 82 |
+
"lines": 6200,
|
| 83 |
+
"format": "ALTO",
|
| 84 |
+
"tags": ["médiéval", "chartes", "manuscrits"],
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"id": "simssa-ocr-printed",
|
| 88 |
+
"title": "SIMSSA — Imprimés anciens (XVe-XVIIe)",
|
| 89 |
+
"url": "https://github.com/HTR-United/simssa-printed",
|
| 90 |
+
"language": ["French", "Latin"],
|
| 91 |
+
"script": ["Rotunda", "Roman"],
|
| 92 |
+
"century": [15, 16, 17],
|
| 93 |
+
"institution": "McGill University",
|
| 94 |
+
"description": "Corpus d'imprimés anciens romains et gothiques.",
|
| 95 |
+
"license": "CC-BY 4.0",
|
| 96 |
+
"lines": 4500,
|
| 97 |
+
"format": "PAGE",
|
| 98 |
+
"tags": ["imprimés", "incunables", "roman", "gothique"],
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"id": "fonds-gallica-presse",
|
| 102 |
+
"title": "Presse ancienne — Gallica (XIXe)",
|
| 103 |
+
"url": "https://github.com/HTR-United/gallica-presse-xix",
|
| 104 |
+
"language": ["French"],
|
| 105 |
+
"script": ["Roman"],
|
| 106 |
+
"century": [19],
|
| 107 |
+
"institution": "BnF",
|
| 108 |
+
"description": "Numérisations de journaux du XIXe siècle (Gallica).",
|
| 109 |
+
"license": "etalab-2.0",
|
| 110 |
+
"lines": 31000,
|
| 111 |
+
"format": "ALTO",
|
| 112 |
+
"tags": ["presse", "XIXe", "Gallica", "journaux"],
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"id": "archives-departem-correspondances",
|
| 116 |
+
"title": "Correspondances administratives (XVIIIe-XIXe)",
|
| 117 |
+
"url": "https://github.com/HTR-United/correspondances-admin",
|
| 118 |
+
"language": ["French"],
|
| 119 |
+
"script": ["Cursiva"],
|
| 120 |
+
"century": [18, 19],
|
| 121 |
+
"institution": "Archives départementales",
|
| 122 |
+
"description": "Lettres et correspondances administratives manuscrites.",
|
| 123 |
+
"license": "CC-BY 4.0",
|
| 124 |
+
"lines": 9800,
|
| 125 |
+
"format": "ALTO",
|
| 126 |
+
"tags": ["correspondances", "administratif", "cursive"],
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"id": "e-codices-latin",
|
| 130 |
+
"title": "e-codices — Manuscrits latins (Suisse)",
|
| 131 |
+
"url": "https://github.com/HTR-United/e-codices-latin",
|
| 132 |
+
"language": ["Latin"],
|
| 133 |
+
"script": ["Caroline", "Gothic"],
|
| 134 |
+
"century": [9, 10, 11, 12],
|
| 135 |
+
"institution": "Bibliothèque cantonale universitaire de Lausanne",
|
| 136 |
+
"description": "Manuscrits carolingiens et gothiques des bibliothèques suisses.",
|
| 137 |
+
"license": "CC-BY 4.0",
|
| 138 |
+
"lines": 3100,
|
| 139 |
+
"format": "ALTO",
|
| 140 |
+
"tags": ["caroline", "latin", "médiéval", "Suisse"],
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"id": "registres-paroissiaux-17",
|
| 144 |
+
"title": "Registres paroissiaux — Bretagne (XVIIe)",
|
| 145 |
+
"url": "https://github.com/HTR-United/registres-paroissiaux-bretagne",
|
| 146 |
+
"language": ["French", "Latin"],
|
| 147 |
+
"script": ["Cursiva"],
|
| 148 |
+
"century": [17],
|
| 149 |
+
"institution": "Archives départementales du Finistère",
|
| 150 |
+
"description": "Registres paroissiaux bretons du XVIIe siècle.",
|
| 151 |
+
"license": "CC-BY 4.0",
|
| 152 |
+
"lines": 15600,
|
| 153 |
+
"format": "ALTO",
|
| 154 |
+
"tags": ["registres", "Bretagne", "paroissial", "cursive"],
|
| 155 |
+
},
|
| 156 |
+
]
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ---------------------------------------------------------------------------
|
| 160 |
+
# Dataclass entrée catalogue
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
|
| 163 |
+
@dataclass
class HTRUnitedEntry:
    """One entry of the HTR-United catalogue."""

    id: str
    title: str
    url: str
    language: list[str] = field(default_factory=list)
    script: list[str] = field(default_factory=list)
    century: list[int] = field(default_factory=list)
    institution: str = ""
    description: str = ""
    license: str = ""
    lines: int = 0
    format: str = "ALTO"
    tags: list[str] = field(default_factory=list)

    # Value/symbol pairs for Roman numeral conversion, largest first.
    # Not annotated, so the dataclass does not treat it as a field.
    _ROMAN_DIGITS = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
    )

    def as_dict(self) -> dict:
        """Return the entry as a plain dict keyed by field name."""
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "language": self.language,
            "script": self.script,
            "century": self.century,
            "institution": self.institution,
            "description": self.description,
            "license": self.license,
            "lines": self.lines,
            "format": self.format,
            "tags": self.tags,
        }

    @classmethod
    def from_dict(cls, d: dict) -> "HTRUnitedEntry":
        """Build an entry from a catalogue dict.

        Missing or ``None`` values fall back to the field defaults. List
        fields are copied, so mutating the entry never mutates ``d``; a
        scalar given for a list field is wrapped in a one-item list.
        """
        return cls(
            id=d.get("id") or "",
            title=d.get("title") or "",
            url=d.get("url") or "",
            language=cls._as_list(d.get("language")),
            script=cls._as_list(d.get("script")),
            century=cls._as_list(d.get("century")),
            institution=d.get("institution") or "",
            description=d.get("description") or "",
            license=d.get("license") or "",
            lines=d.get("lines") or 0,
            format=d.get("format") or "ALTO",
            tags=cls._as_list(d.get("tags")),
        )

    @property
    def century_str(self) -> str:
        """Centuries formatted as French ordinals in Roman numerals.

        ``1`` becomes ``"Ier"`` and any other positive integer becomes its
        Roman numeral followed by ``"e"`` (``17`` -> ``"XVIIe"``). Values that
        are not positive integers are rendered as ``f"{value}e"``.
        """
        return ", ".join(self._format_century(c) for c in self.century)

    @staticmethod
    def _as_list(value) -> list:
        """Return ``value`` as a new list (``None`` -> ``[]``, scalar -> ``[value]``)."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    @classmethod
    def _format_century(cls, century) -> str:
        """Format one century number as a French ordinal."""
        if not isinstance(century, int) or century <= 0:
            return f"{century}e"
        if century == 1:
            return "Ier"
        return f"{cls._to_roman(century)}e"

    @classmethod
    def _to_roman(cls, number: int) -> str:
        """Convert a positive integer to its Roman numeral."""
        parts = []
        for value, symbol in cls._ROMAN_DIGITS:
            count, number = divmod(number, value)
            parts.append(symbol * count)
        return "".join(parts)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# ---------------------------------------------------------------------------
|
| 226 |
+
# Catalogue
|
| 227 |
+
# ---------------------------------------------------------------------------
|
| 228 |
+
|
| 229 |
+
class HTRUnitedCatalogue:
    """HTR-United catalogue with free-text search and metadata filters."""

    def __init__(self, entries: list[HTRUnitedEntry], source: str = "demo") -> None:
        self.entries = entries
        self.source = source  # "remote" | "demo" | "cache"

    def __len__(self) -> int:
        return len(self.entries)

    @classmethod
    def from_demo(cls) -> "HTRUnitedCatalogue":
        """Load the built-in demonstration catalogue."""
        entries = [HTRUnitedEntry.from_dict(d) for d in _DEMO_CATALOGUE]
        return cls(entries, source="demo")

    @classmethod
    def from_remote(cls, timeout: int = 10) -> "HTRUnitedCatalogue":
        """Download the catalogue from its remote URL.

        Best-effort: on any download or parsing failure the demonstration
        catalogue is returned instead of raising.
        """
        try:
            req = urllib.request.Request(
                _CATALOGUE_URL,
                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
            )
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                raw = resp.read().decode("utf-8")
            entries = _parse_yml_catalogue(raw)
            return cls(entries, source="remote")
        except Exception:
            # Deliberate fallback: network, decoding and parsing errors all
            # end up on the demo data.
            return cls.from_demo()

    def search(
        self,
        query: str = "",
        language: Optional[str] = None,
        script: Optional[str] = None,
        century_min: Optional[int] = None,
        century_max: Optional[int] = None,
    ) -> list[HTRUnitedEntry]:
        """Return the entries matching every supplied filter.

        Args:
            query: Case-insensitive substring looked up in title, description,
                institution, tags and languages.
            language: Case-insensitive substring of one of the entry languages.
            script: Case-insensitive substring of one of the entry scripts.
            century_min: Lowest accepted century (inclusive).
            century_max: Highest accepted century (inclusive).

        Returns:
            A new list, in catalogue order. When both century bounds are
            given, an entry matches only if a single one of its centuries
            lies inside the range.
        """
        # Copy so callers can mutate the result without touching the catalogue.
        results = list(self.entries)

        if query:
            q = query.lower()
            results = [
                e for e in results
                if (q in e.title.lower()
                    or q in e.description.lower()
                    or q in e.institution.lower()
                    or any(q in t.lower() for t in e.tags)
                    or any(q in lang.lower() for lang in e.language))
            ]

        if language:
            lang_lower = language.lower()
            results = [
                e for e in results
                if any(lang_lower in lang.lower() for lang in e.language)
            ]

        if script:
            sc_lower = script.lower()
            results = [
                e for e in results
                if any(sc_lower in s.lower() for s in e.script)
            ]

        if century_min is not None or century_max is not None:
            # Both bounds must hold for the SAME century; testing them
            # separately would accept [9, 17] for the range 12-14.
            def in_range(c: int) -> bool:
                return ((century_min is None or c >= century_min)
                        and (century_max is None or c <= century_max))

            results = [e for e in results if any(in_range(c) for c in e.century)]

        return results

    def get_by_id(self, entry_id: str) -> Optional[HTRUnitedEntry]:
        """Return the first entry whose ``id`` equals ``entry_id``, else ``None``."""
        return next((e for e in self.entries if e.id == entry_id), None)

    def available_languages(self) -> list[str]:
        """Return the distinct languages present in the catalogue, sorted."""
        return sorted({lang for e in self.entries for lang in e.language})

    def available_scripts(self) -> list[str]:
        """Return the distinct scripts present in the catalogue, sorted."""
        return sorted({sc for e in self.entries for sc in e.script})
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
# ---------------------------------------------------------------------------
|
| 343 |
+
# Import de corpus
|
| 344 |
+
# ---------------------------------------------------------------------------
|
| 345 |
+
|
| 346 |
+
def import_htr_united_corpus(
    entry: HTRUnitedEntry,
    output_dir: str | Path,
    max_samples: int = 100,
    show_progress: bool = True,
) -> dict:
    """Import an HTR-United corpus into a local folder.

    The output folder is created if needed, a ``htr_united_meta.json``
    provenance file is written into it, and a download of the corpus files is
    then attempted through ``_try_download_corpus``.

    Returns:
        A summary dict with the entry id and title, the output folder, the
        number of files imported (as reported by the download helper) and the
        path of the metadata file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    meta_file = target_dir / "htr_united_meta.json"

    # Provenance record; key order is the order written to the JSON file.
    provenance = {"source": "htr-united", "entry_id": entry.id}
    for attr in (
        "title",
        "url",
        "language",
        "script",
        "century",
        "institution",
        "license",
        "format",
    ):
        provenance[attr] = getattr(entry, attr)
    provenance["imported_at"] = _iso_now()

    payload = json.dumps(provenance, ensure_ascii=False, indent=2)
    meta_file.write_text(payload, encoding="utf-8")

    # Attempt the actual download from the GitHub repository archive.
    imported_count = _try_download_corpus(entry, target_dir, max_samples, show_progress)

    summary = {
        "entry_id": entry.id,
        "title": entry.title,
        "output_dir": str(target_dir),
        "files_imported": imported_count,
        "metadata_file": str(meta_file),
    }
    return summary
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def _try_download_corpus(
    entry: HTRUnitedEntry,
    output_path: Path,
    max_samples: int,
    show_progress: bool,
) -> int:
    """Try to download the corpus ground-truth files from GitHub.

    The repository archive of the ``main`` branch is fetched, falling back to
    ``master`` when ``main`` cannot be downloaded. At most ``max_samples``
    members ending in ``.xml`` or ``.gt.txt`` are written flat into
    ``output_path``, so members sharing a base name overwrite each other.

    Args:
        entry: Catalogue entry whose ``url`` points at the GitHub repository.
        output_path: Existing folder receiving the extracted files.
        max_samples: Maximum number of archive members to extract.
        show_progress: Currently unused by this helper.

    Returns:
        The number of files written; ``0`` when the URL is not a GitHub
        repository or no archive could be downloaded.
    """
    import io
    import zipfile

    repo_path = _extract_github_repo(entry.url)
    if not repo_path:
        return 0

    # ".xml" also covers the ".alto.xml" and ".page.xml" variants.
    gt_suffixes = (".xml", ".gt.txt")
    headers = {"User-Agent": "picarones-htr-united-importer/1.0"}

    for branch in ("main", "master"):
        zip_url = f"https://github.com/{repo_path}/archive/refs/heads/{branch}.zip"
        req = urllib.request.Request(zip_url, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
        except Exception:
            # Best-effort: a missing branch or network failure just moves on
            # to the next candidate branch.
            continue

        written = 0
        try:
            with zipfile.ZipFile(io.BytesIO(data)) as zf:
                members = [
                    n for n in zf.namelist() if n.endswith(gt_suffixes)
                ][:max_samples]
                for member in members:
                    # Only the base name is used, so nothing can be written
                    # outside output_path.
                    dest = output_path / Path(member).name
                    dest.write_bytes(zf.read(member))
                    written += 1
        except Exception:
            # Best-effort: a corrupt archive or write error stops the import,
            # but files already written are still reported.
            pass
        return written

    return 0
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def _extract_github_repo(url: str) -> Optional[str]:
|
| 430 |
+
"""Extrait 'owner/repo' depuis une URL GitHub."""
|
| 431 |
+
m = re.match(r"https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$", url)
|
| 432 |
+
return m.group(1) if m else None
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
def _parse_yml_catalogue(raw: str) -> list[HTRUnitedEntry]:
    """Parse the HTR-United YAML catalogue text into entries.

    When the YAML document is a list, each dict item becomes an entry and
    non-dict items are ignored. If PyYAML is unavailable, parsing fails, or
    the document is not a list, the built-in demo catalogue is returned.
    """
    parsed = None
    try:
        import yaml

        document = yaml.safe_load(raw)
        if isinstance(document, list):
            parsed = [
                HTRUnitedEntry.from_dict(item)
                for item in document
                if isinstance(item, dict)
            ]
    except Exception:
        # Best-effort parser: any failure falls through to the demo data.
        parsed = None

    if parsed is not None:
        return parsed
    return [HTRUnitedEntry.from_dict(item) for item in _DEMO_CATALOGUE]
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def _iso_now() -> str:
|
| 448 |
+
from datetime import datetime, timezone
|
| 449 |
+
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
picarones/importers/huggingface.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import de datasets OCR/HTR depuis HuggingFace Hub.
|
| 2 |
+
|
| 3 |
+
Ce module fournit :
|
| 4 |
+
- :class:`HuggingFaceDataset` — métadonnées d'un dataset HuggingFace
|
| 5 |
+
- :class:`HuggingFaceImporter` — recherche et import de datasets
|
| 6 |
+
- :func:`search_hf_datasets` — recherche par tags dans l'API HuggingFace
|
| 7 |
+
- :func:`import_hf_dataset` — téléchargement d'un dataset vers un dossier local
|
| 8 |
+
|
| 9 |
+
Les datasets patrimoniaux de référence sont pré-référencés pour une découverte
|
| 10 |
+
rapide sans requête réseau.
|
| 11 |
+
|
| 12 |
+
Exemple
|
| 13 |
+
-------
|
| 14 |
+
importer = HuggingFaceImporter()
|
| 15 |
+
results = importer.search("medieval OCR", tags=["ocr"])
|
| 16 |
+
corpus = importer.import_dataset(results[0].dataset_id, output_dir="./corpus/")
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import urllib.error
|
| 24 |
+
import urllib.parse
|
| 25 |
+
import urllib.request
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import Optional
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# Datasets de référence pré-référencés
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
|
| 34 |
+
_REFERENCE_DATASETS: list[dict] = [
|
| 35 |
+
{
|
| 36 |
+
"dataset_id": "Teklia/RIMES",
|
| 37 |
+
"title": "RIMES — Reconnaissance et Indexation de données Manuscrites et de fac-similEs",
|
| 38 |
+
"description": "Corpus de courriers manuscrits français modernes. Standard de référence pour la reconnaissance d'écriture manuscrite.",
|
| 39 |
+
"language": ["French"],
|
| 40 |
+
"tags": ["htr", "ocr", "handwritten", "french", "modern"],
|
| 41 |
+
"license": "cc-by-4.0",
|
| 42 |
+
"size_category": "1K<n<10K",
|
| 43 |
+
"task": "image-to-text",
|
| 44 |
+
"institution": "IRISA / A2iA",
|
| 45 |
+
"downloads": 1200,
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"dataset_id": "Teklia/IAM",
|
| 49 |
+
"title": "IAM Handwriting Database",
|
| 50 |
+
"description": "Corpus de référence anglais pour la reconnaissance d'écriture manuscrite.",
|
| 51 |
+
"language": ["English"],
|
| 52 |
+
"tags": ["htr", "ocr", "handwritten", "english"],
|
| 53 |
+
"license": "other",
|
| 54 |
+
"size_category": "10K<n<100K",
|
| 55 |
+
"task": "image-to-text",
|
| 56 |
+
"institution": "University of Bern",
|
| 57 |
+
"downloads": 8400,
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"dataset_id": "CATMuS/medieval",
|
| 61 |
+
"title": "CATMuS Medieval — Consistent Approaches to Transcribing ManuScripts",
|
| 62 |
+
"description": "Dataset multilingue de manuscrits médiévaux (latin, français, occitan, espagnol) pour l'entraînement de modèles HTR.",
|
| 63 |
+
"language": ["Latin", "French", "Occitan", "Spanish"],
|
| 64 |
+
"tags": ["htr", "medieval", "manuscripts", "latin", "french", "historical"],
|
| 65 |
+
"license": "cc-by-4.0",
|
| 66 |
+
"size_category": "100K<n<1M",
|
| 67 |
+
"task": "image-to-text",
|
| 68 |
+
"institution": "Inria / EPHE",
|
| 69 |
+
"downloads": 3100,
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"dataset_id": "htr-united/cremma-medieval",
|
| 73 |
+
"title": "CREMMA Medieval",
|
| 74 |
+
"description": "Corpus de manuscrits médiévaux français XIIe-XVe siècles.",
|
| 75 |
+
"language": ["French", "Latin"],
|
| 76 |
+
"tags": ["htr", "medieval", "french", "manuscripts", "htr-united"],
|
| 77 |
+
"license": "cc-by-4.0",
|
| 78 |
+
"size_category": "1K<n<10K",
|
| 79 |
+
"task": "image-to-text",
|
| 80 |
+
"institution": "Inria",
|
| 81 |
+
"downloads": 520,
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"dataset_id": "biglam/europeana_newspapers",
|
| 85 |
+
"title": "Europeana Newspapers",
|
| 86 |
+
"description": "Journaux numérisés européens du XIXe siècle (OCR + images).",
|
| 87 |
+
"language": ["French", "German", "Dutch", "Finnish"],
|
| 88 |
+
"tags": ["ocr", "newspapers", "historical", "19th-century", "europeana"],
|
| 89 |
+
"license": "cc0-1.0",
|
| 90 |
+
"size_category": "1M<n<10M",
|
| 91 |
+
"task": "image-to-text",
|
| 92 |
+
"institution": "Europeana Foundation",
|
| 93 |
+
"downloads": 15200,
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"dataset_id": "stefanklut/esposalles",
|
| 97 |
+
"title": "Esposalles Dataset",
|
| 98 |
+
"description": "Registres de mariage catalans du XVIIe siècle pour la reconnaissance d'écriture historique.",
|
| 99 |
+
"language": ["Catalan", "Latin"],
|
| 100 |
+
"tags": ["htr", "historical", "registers", "catalan", "17th-century"],
|
| 101 |
+
"license": "cc-by-4.0",
|
| 102 |
+
"size_category": "1K<n<10K",
|
| 103 |
+
"task": "image-to-text",
|
| 104 |
+
"institution": "Universitat Autònoma de Barcelona",
|
| 105 |
+
"downloads": 340,
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"dataset_id": "bnf-gallica/gallica-ocr",
|
| 109 |
+
"title": "Gallica OCR — BnF",
|
| 110 |
+
"description": "Extraits d'imprimés anciens numérisés depuis Gallica avec vérité terrain.",
|
| 111 |
+
"language": ["French", "Latin"],
|
| 112 |
+
"tags": ["ocr", "historical", "printed", "gallica", "bnf", "french"],
|
| 113 |
+
"license": "etalab-2.0",
|
| 114 |
+
"size_category": "10K<n<100K",
|
| 115 |
+
"task": "image-to-text",
|
| 116 |
+
"institution": "Bibliothèque nationale de France",
|
| 117 |
+
"downloads": 2800,
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"dataset_id": "Bozen-Baptism/baptism-records",
|
| 121 |
+
"title": "Bozen Baptism Records",
|
| 122 |
+
"description": "Registres de baptêmes de Bozen (Italie/Autriche) du XVIIIe siècle.",
|
| 123 |
+
"language": ["German", "Latin"],
|
| 124 |
+
"tags": ["htr", "historical", "registers", "german", "latin", "18th-century"],
|
| 125 |
+
"license": "cc-by-4.0",
|
| 126 |
+
"size_category": "1K<n<10K",
|
| 127 |
+
"task": "image-to-text",
|
| 128 |
+
"institution": "University of Innsbruck",
|
| 129 |
+
"downloads": 190,
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"dataset_id": "read-bad/readbad",
|
| 133 |
+
"title": "READ-BAD — Recognition and Enrichment of Archival Documents",
|
| 134 |
+
"description": "Corpus multilingue de documents d'archives pour l'OCR historique (Latin, Allemand, Anglais).",
|
| 135 |
+
"language": ["German", "English", "Latin"],
|
| 136 |
+
"tags": ["ocr", "htr", "historical", "archives", "read"],
|
| 137 |
+
"license": "cc-by-4.0",
|
| 138 |
+
"size_category": "10K<n<100K",
|
| 139 |
+
"task": "image-to-text",
|
| 140 |
+
"institution": "University of Graz",
|
| 141 |
+
"downloads": 1050,
|
| 142 |
+
},
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
# ---------------------------------------------------------------------------
|
| 146 |
+
# Dataclass
|
| 147 |
+
# ---------------------------------------------------------------------------
|
| 148 |
+
|
| 149 |
+
@dataclass
class HuggingFaceDataset:
    """Metadata describing one dataset hosted on the HuggingFace Hub."""

    dataset_id: str
    title: str
    description: str = ""
    language: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    license: str = ""
    size_category: str = ""
    task: str = "image-to-text"
    institution: str = ""
    downloads: int = 0
    source: str = "reference"  # "reference" | "api"

    def as_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name."""
        return {
            "dataset_id": self.dataset_id,
            "title": self.title,
            "description": self.description,
            "language": self.language,
            "tags": self.tags,
            "license": self.license,
            "size_category": self.size_category,
            "task": self.task,
            "institution": self.institution,
            "downloads": self.downloads,
            "source": self.source,
        }

    @staticmethod
    def _as_list(value: object) -> list:
        """Normalise a value to a fresh list (``None`` -> ``[]``, scalar -> ``[x]``)."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [value]

    @staticmethod
    def _first_size_category(card_data: object) -> str:
        """Return the first ``size_categories`` value of a ``cardData`` dict.

        Returns ``""`` when ``card_data`` is not a dict or the value is
        missing or empty; a plain string value is returned whole.
        """
        if not isinstance(card_data, dict):
            return ""
        sizes = card_data.get("size_categories")
        if isinstance(sizes, str):
            return sizes
        if isinstance(sizes, (list, tuple)) and sizes:
            return sizes[0]
        return ""

    @classmethod
    def from_dict(cls, d: dict) -> "HuggingFaceDataset":
        """Build a record from a reference dict or an API-style dict.

        ``dataset_id`` falls back to the ``id`` key and ``title`` falls back
        to the resolved dataset id. ``size_category`` falls back to the first
        ``cardData.size_categories`` value, and ``downloads`` to
        ``downloadsAllTime``. ``source`` defaults to ``"api"`` when absent.
        List fields are copied so the record never shares a list with ``d``.
        """
        dataset_id = d.get("dataset_id", d.get("id", ""))
        if "size_category" in d:
            size_category = d["size_category"]
        else:
            size_category = cls._first_size_category(d.get("cardData"))
        return cls(
            dataset_id=dataset_id,
            title=d.get("title", dataset_id),
            description=d.get("description", ""),
            language=cls._as_list(d.get("language")),
            tags=cls._as_list(d.get("tags")),
            license=d.get("license", ""),
            size_category=size_category,
            task=d.get("task", "image-to-text"),
            institution=d.get("institution", ""),
            downloads=d.get("downloads", d.get("downloadsAllTime", 0)),
            source=d.get("source", "api"),
        )

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        """Return a copy of this record with ``source`` replaced."""
        from dataclasses import replace

        return replace(self, source=source)

    @property
    def hf_url(self) -> str:
        """URL of the dataset page on huggingface.co."""
        return f"https://huggingface.co/datasets/{self.dataset_id}"
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# ---------------------------------------------------------------------------
|
| 202 |
+
# Importer principal
|
| 203 |
+
# ---------------------------------------------------------------------------
|
| 204 |
+
|
| 205 |
+
class HuggingFaceImporter:
    """Search the HuggingFace Hub for datasets and import them locally."""

    _API_BASE = "https://huggingface.co/api"

    def __init__(self, token: Optional[str] = None) -> None:
        # An explicit token wins; otherwise fall back to the environment.
        self._token = token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

    def _headers(self) -> dict:
        """Return the HTTP headers for API requests (bearer auth when a token is set)."""
        h = {"User-Agent": "picarones-hf-importer/1.0"}
        if self._token:
            h["Authorization"] = f"Bearer {self._token}"
        return h

    def search(
        self,
        query: str = "",
        tags: Optional[list[str]] = None,
        language: Optional[str] = None,
        limit: int = 20,
        use_reference: bool = True,
    ) -> list[HuggingFaceDataset]:
        """Search datasets with optional filters.

        The built-in reference datasets are consulted first (unless
        ``use_reference`` is false). The HuggingFace API is then queried only
        if the reference hits do not already fill ``limit``; API failures are
        ignored. Reference entries take priority over API entries with the
        same ``dataset_id``.

        Returns:
            At most ``limit`` datasets, reference hits first.
        """
        results: list[HuggingFaceDataset] = []

        if use_reference:
            results.extend(self._search_reference(query, tags, language))

        # API results are only ever appended after the reference hits, so a
        # network round-trip cannot change the outcome once `limit` is reached.
        already_full = limit > 0 and len(results) >= limit
        if not already_full:
            try:
                api_results = self._search_api(query, tags, language, limit)
                existing_ids = {r.dataset_id for r in results}
                for ds in api_results:
                    if ds.dataset_id not in existing_ids:
                        results.append(ds)
                        existing_ids.add(ds.dataset_id)
            except Exception:
                # The API is optional: offline use must still return the
                # reference hits.
                pass

        return results[:limit]

    def _search_reference(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
    ) -> list[HuggingFaceDataset]:
        """Filter the built-in reference datasets.

        ``query`` is a case-insensitive substring of the title, description,
        id, tags or languages. Every requested tag must match, and
        ``language`` must be a substring of one of the dataset languages.
        """
        datasets = []
        for raw in _REFERENCE_DATASETS:
            ds = HuggingFaceDataset.from_dict(raw)
            # from_dict labels records "api" by default; these come from the
            # built-in list.
            ds.source = "reference"
            datasets.append(ds)

        if query:
            q = query.lower()
            datasets = [
                ds for ds in datasets
                if (q in ds.title.lower()
                    or q in ds.description.lower()
                    or q in ds.dataset_id.lower()
                    or any(q in t.lower() for t in ds.tags)
                    or any(q in lang.lower() for lang in ds.language))
            ]

        if tags:
            wanted = [t.lower() for t in tags]
            datasets = [
                ds for ds in datasets
                if all(any(w in dt.lower() for dt in ds.tags) for w in wanted)
            ]

        if language:
            lang_lower = language.lower()
            datasets = [
                ds for ds in datasets
                if any(lang_lower in lang.lower() for lang in ds.language)
            ]

        return datasets

    @staticmethod
    def _size_category_of(item: dict) -> str:
        """Return the first ``cardData.size_categories`` value of an API item, or ``""``."""
        card = item.get("cardData")
        if not isinstance(card, dict):
            return ""
        sizes = card.get("size_categories")
        if isinstance(sizes, str):
            return sizes
        if isinstance(sizes, (list, tuple)) and sizes:
            return sizes[0]
        return ""

    def _search_api(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
        limit: int,
    ) -> list[HuggingFaceDataset]:
        """Query the HuggingFace datasets endpoint and map items to records.

        Raises whatever ``urllib`` or ``json`` raise on network or decoding
        failure; ``search`` treats that as "no API results". Items that are
        not dicts or carry no ``id`` are skipped.
        """
        # NOTE(review): the public Hub API appears to take tag-style filters
        # through a `filter` parameter (e.g. "task_categories:image-to-text");
        # confirm that `task_categories`, `language` and `tags` are honoured
        # as top-level parameters.
        params: dict[str, str] = {
            "task_categories": "image-to-text",
            "limit": str(min(limit, 50)),
            "full": "False",
        }
        if query:
            params["search"] = query
        if language:
            params["language"] = language
        if tags:
            params["tags"] = ",".join(tags)

        url = f"{self._API_BASE}/datasets?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(url, headers=self._headers())
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        results = []
        for item in data if isinstance(data, list) else []:
            if not isinstance(item, dict):
                # One malformed item must not discard the whole response.
                continue
            ds = HuggingFaceDataset(
                dataset_id=item.get("id", ""),
                title=item.get("id", ""),
                description=item.get("description") or "",
                language=item.get("language") or [],
                tags=item.get("tags") or [],
                license=item.get("license", ""),
                size_category=self._size_category_of(item),
                task="image-to-text",
                downloads=item.get("downloadsAllTime", item.get("downloads", 0)),
                source="api",
            )
            if ds.dataset_id:
                results.append(ds)
        return results

    def import_dataset(
        self,
        dataset_id: str,
        output_dir: str | Path,
        split: str = "train",
        max_samples: int = 100,
        show_progress: bool = True,
    ) -> dict:
        """Import a HuggingFace dataset into a local folder.

        Creates ``output_dir`` if needed, writes a ``huggingface_meta.json``
        provenance file, then delegates the sample export to
        ``_try_import_with_datasets_lib``.

        Returns:
            A dict with the dataset id, output folder, number of files
            imported and the metadata file path.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        meta = {
            "source": "huggingface",
            "dataset_id": dataset_id,
            "split": split,
            "max_samples": max_samples,
            "imported_at": _iso_now(),
        }
        meta_file = output_path / "huggingface_meta.json"
        meta_file.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")

        # Export through the `datasets` library when it is available.
        files_imported = _try_import_with_datasets_lib(
            dataset_id, output_path, split, max_samples, show_progress
        )

        return {
            "dataset_id": dataset_id,
            "output_dir": str(output_path),
            "files_imported": files_imported,
            "metadata_file": str(meta_file),
        }
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def _try_import_with_datasets_lib(
|
| 375 |
+
dataset_id: str,
|
| 376 |
+
output_path: Path,
|
| 377 |
+
split: str,
|
| 378 |
+
max_samples: int,
|
| 379 |
+
show_progress: bool,
|
| 380 |
+
) -> int:
|
| 381 |
+
"""Essaie d'importer avec la librairie `datasets` de HuggingFace."""
|
| 382 |
+
try:
|
| 383 |
+
from datasets import load_dataset # type: ignore
|
| 384 |
+
|
| 385 |
+
ds = load_dataset(dataset_id, split=split, streaming=True)
|
| 386 |
+
count = 0
|
| 387 |
+
for i, item in enumerate(ds):
|
| 388 |
+
if i >= max_samples:
|
| 389 |
+
break
|
| 390 |
+
# Cherche champ image et texte
|
| 391 |
+
image = item.get("image") or item.get("img")
|
| 392 |
+
text = item.get("text") or item.get("transcription") or item.get("ground_truth", "")
|
| 393 |
+
|
| 394 |
+
if image is not None:
|
| 395 |
+
img_file = output_path / f"doc_{i:04d}.jpg"
|
| 396 |
+
try:
|
| 397 |
+
image.save(str(img_file))
|
| 398 |
+
except Exception:
|
| 399 |
+
pass
|
| 400 |
+
|
| 401 |
+
gt_file = output_path / f"doc_{i:04d}.gt.txt"
|
| 402 |
+
gt_file.write_text(str(text), encoding="utf-8")
|
| 403 |
+
count += 1
|
| 404 |
+
|
| 405 |
+
return count
|
| 406 |
+
except (ImportError, Exception):
|
| 407 |
+
return 0
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def _iso_now() -> str:
|
| 411 |
+
from datetime import datetime, timezone
|
| 412 |
+
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# ---------------------------------------------------------------------------
|
| 416 |
+
# Extension de HuggingFaceDataset (helper privé)
|
| 417 |
+
# ---------------------------------------------------------------------------
|
| 418 |
+
|
| 419 |
+
def _patch_dataset_replace_source() -> None:
    """Attach a ``_replace_source`` helper method to ``HuggingFaceDataset``.

    The attached method returns a copy of the record whose ``source`` field
    is set to the given value.
    """
    from dataclasses import replace

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        return replace(self, source=source)

    setattr(HuggingFaceDataset, "_replace_source", _replace_source)


# Applied once at import time so the helper exists before any search runs.
_patch_dataset_replace_source()
|
picarones/importers/iiif.py
ADDED
|
@@ -0,0 +1,583 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import de corpus depuis des manifestes IIIF v2 et v3.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
1. Téléchargement et parsing du manifeste JSON (v2 ou v3 auto-détecté)
|
| 6 |
+
2. Extraction de la liste des canvases (pages) avec leurs URL d'image
|
| 7 |
+
3. Sélection optionnelle d'un sous-ensemble de pages (ex : ``--pages 1-10``)
|
| 8 |
+
4. Téléchargement des images dans un dossier local
|
| 9 |
+
5. Création de fichiers GT vides (``.gt.txt``) à remplir manuellement,
|
| 10 |
+
OU chargement des annotations de transcription si présentes dans le manifeste
|
| 11 |
+
6. Construction et retour d'un objet ``Corpus``
|
| 12 |
+
|
| 13 |
+
Compatibilité
|
| 14 |
+
-------------
|
| 15 |
+
- IIIF Image API v2 et v3
|
| 16 |
+
- Manifestes Presentation API v2 et v3
|
| 17 |
+
- Instances : Gallica (BnF), Bodleian, British Library, BSB, e-codices,
|
| 18 |
+
Europeana, et tout entrepôt IIIF-compliant
|
| 19 |
+
|
| 20 |
+
Utilisation
|
| 21 |
+
-----------
|
| 22 |
+
>>> from picarones.importers.iiif import IIIFImporter
|
| 23 |
+
>>> importer = IIIFImporter("https://gallica.bnf.fr/ark:/12148/xxx/manifest.json")
|
| 24 |
+
>>> corpus = importer.import_corpus(pages="1-10", output_dir="./corpus/")
|
| 25 |
+
>>> print(f"{len(corpus)} documents téléchargés")
|
| 26 |
+
|
| 27 |
+
Ou via la fonction de commodité :
|
| 28 |
+
>>> from picarones.importers.iiif import import_iiif_manifest
|
| 29 |
+
>>> corpus = import_iiif_manifest("https://...", pages="1-5", output_dir="./corpus/")
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import json
|
| 35 |
+
import logging
|
| 36 |
+
import re
|
| 37 |
+
import time
|
| 38 |
+
import urllib.error
|
| 39 |
+
import urllib.request
|
| 40 |
+
from dataclasses import dataclass, field
|
| 41 |
+
from pathlib import Path
|
| 42 |
+
from typing import Iterator, Optional
|
| 43 |
+
|
| 44 |
+
from picarones.core.corpus import Corpus, Document
|
| 45 |
+
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Parsing du sélecteur de pages
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
def parse_page_selector(pages: str, total: int) -> list[int]:
    """Parse a page selector string into a list of 0-based indices.

    Accepted formats (page numbers are 1-based):

    - ``"1-10"``          -> pages 1 through 10
    - ``"1,3,5"``         -> pages 1, 3 and 5
    - ``"1-5,10,15-20"``  -> any combination of the above
    - ``"all"`` / ``""``  -> every page (a whitespace-only string counts as empty)

    Parameters
    ----------
    pages:
        Page selector as a string.
    total:
        Total number of pages in the manifest.

    Returns
    -------
    list[int]
        0-based indices of the selected pages, sorted and de-duplicated.

    Raises
    ------
    ValueError
        If the syntax is invalid or a page number is out of range.
    """
    # Blank selectors (including whitespace-only) mean "everything".
    if not pages or pages.strip().lower() in ("", "all"):
        return list(range(total))

    indices: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if "-" in part:
            m = re.fullmatch(r"(\d+)-(\d+)", part)
            if not m:
                raise ValueError(f"Sélecteur de pages invalide : '{part}'")
            start, end = int(m.group(1)), int(m.group(2))
            if start < 1 or end > total or start > end:
                raise ValueError(
                    f"Plage {start}-{end} hors bornes (1–{total})"
                )
            indices.update(range(start - 1, end))
        else:
            try:
                n = int(part)
            except ValueError:
                # Report bad tokens with the same message as malformed ranges
                # instead of leaking Python's "invalid literal for int()".
                raise ValueError(f"Sélecteur de pages invalide : '{part}'") from None
            if n < 1 or n > total:
                raise ValueError(f"Page {n} hors bornes (1–{total})")
            indices.add(n - 1)
    return sorted(indices)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
# Données d'un canvas IIIF
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
@dataclass
class IIIFCanvas:
    """One canvas (page) of a IIIF manifest.

    Attributes
    ----------
    index:
        0-based position of the canvas in the manifest.
    label:
        Human-readable label (e.g. "f. 1r", "Page 1").
    image_url:
        URL of the full-resolution image.
    width, height:
        Canvas dimensions as declared in the manifest, when present.
    transcription:
        Ground-truth text when the manifest carries a transcription annotation.
    """

    index: int
    label: str
    image_url: str
    width: Optional[int] = None
    height: Optional[int] = None
    transcription: Optional[str] = None
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
# Parseur de manifeste IIIF
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
class IIIFManifestParser:
    """Parse a IIIF Presentation API v2 or v3 manifest."""

    def __init__(self, manifest: dict) -> None:
        self._manifest = manifest
        self._version = self._detect_version()

    def _detect_version(self) -> int:
        """Detect the manifest version (2 or 3).

        Version 3 is assumed when ``@context`` mentions ``presentation/3`` or
        when the top-level ``type`` is ``"Manifest"``; anything else is
        treated as version 2.
        """
        context = self._manifest.get("@context", "")
        if isinstance(context, list):
            # A JSON-LD context array may mix URI strings with inline objects;
            # only the strings are relevant for version sniffing.
            context = " ".join(c for c in context if isinstance(c, str))
        elif not isinstance(context, str):
            context = ""
        if "presentation/3" in context or self._manifest.get("type") == "Manifest":
            return 3
        return 2

    @property
    def version(self) -> int:
        """Detected Presentation API version (2 or 3)."""
        return self._version

    @property
    def label(self) -> str:
        """Title of the manifest."""
        raw = self._manifest.get("label", "")
        return _extract_label(raw)

    @property
    def attribution(self) -> str:
        """Attribution text (``attribution``, else ``requiredStatement``)."""
        raw = self._manifest.get("attribution", self._manifest.get("requiredStatement", ""))
        # A v3 requiredStatement is {"label": ..., "value": ...}; the text to
        # display is its value, not its label.
        if isinstance(raw, dict) and "value" in raw:
            raw = raw["value"]
        return _extract_label(raw)

    def canvases(self) -> list[IIIFCanvas]:
        """Return the list of canvases of the manifest."""
        if self._version == 3:
            return self._parse_v3_canvases()
        return self._parse_v2_canvases()

    def _parse_v2_canvases(self) -> list[IIIFCanvas]:
        """Build canvases from ``sequences[0].canvases`` (v2 layout)."""
        canvases: list[IIIFCanvas] = []
        sequences = self._manifest.get("sequences", [])
        if not sequences:
            return canvases
        raw_canvases = sequences[0].get("canvases", [])
        for i, canvas in enumerate(raw_canvases):
            label = _extract_label(canvas.get("label", f"canvas_{i+1}"))
            # Main image: images[0].resource.@id, or its image service.
            images = canvas.get("images", [])
            image_url = ""
            if images:
                resource = images[0].get("resource", {})
                image_url = _best_image_url_v2(resource, canvas)

            # Transcription annotations (Open Annotation).
            transcription = _extract_v2_transcription(canvas)

            canvases.append(IIIFCanvas(
                index=i,
                label=label,
                image_url=image_url,
                width=canvas.get("width"),
                height=canvas.get("height"),
                transcription=transcription,
            ))
        return canvases

    def _parse_v3_canvases(self) -> list[IIIFCanvas]:
        """Build canvases from the top-level ``items`` array (v3 layout)."""
        canvases: list[IIIFCanvas] = []
        items = self._manifest.get("items", [])
        for i, canvas in enumerate(items):
            label = _extract_label(canvas.get("label", f"canvas_{i+1}"))
            image_url = _best_image_url_v3(canvas)
            transcription = _extract_v3_transcription(canvas)
            canvases.append(IIIFCanvas(
                index=i,
                label=label,
                image_url=image_url,
                width=canvas.get("width"),
                height=canvas.get("height"),
                transcription=transcription,
            ))
        return canvases
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ---------------------------------------------------------------------------
|
| 207 |
+
# Helpers extraction URL et label
|
| 208 |
+
# ---------------------------------------------------------------------------
|
| 209 |
+
|
| 210 |
+
def _extract_label(raw: object) -> str:
|
| 211 |
+
"""Extrait une chaîne lisible depuis les différents formats de label IIIF."""
|
| 212 |
+
if isinstance(raw, str):
|
| 213 |
+
return raw
|
| 214 |
+
if isinstance(raw, list) and raw:
|
| 215 |
+
return _extract_label(raw[0])
|
| 216 |
+
if isinstance(raw, dict):
|
| 217 |
+
# IIIF v3 : {"fr": ["titre"], "en": ["title"]}
|
| 218 |
+
for lang in ("fr", "en", "none", "@value"):
|
| 219 |
+
val = raw.get(lang, "")
|
| 220 |
+
if val:
|
| 221 |
+
if isinstance(val, list):
|
| 222 |
+
return val[0] if val else ""
|
| 223 |
+
return str(val)
|
| 224 |
+
# Fallback: première valeur
|
| 225 |
+
for v in raw.values():
|
| 226 |
+
return _extract_label(v)
|
| 227 |
+
return str(raw) if raw else ""
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def _best_image_url_v2(resource: dict, canvas: dict) -> str:
|
| 231 |
+
"""Construit l'URL d'image optimale depuis une ressource IIIF v2."""
|
| 232 |
+
# 1. URL directe de la ressource
|
| 233 |
+
direct = resource.get("@id", "")
|
| 234 |
+
if direct and not direct.endswith("/info.json"):
|
| 235 |
+
return direct
|
| 236 |
+
|
| 237 |
+
# 2. Via le service IIIF Image API
|
| 238 |
+
service = resource.get("service", {})
|
| 239 |
+
if isinstance(service, list) and service:
|
| 240 |
+
service = service[0]
|
| 241 |
+
service_id = service.get("@id", service.get("id", ""))
|
| 242 |
+
if service_id:
|
| 243 |
+
return f"{service_id.rstrip('/')}/full/max/0/default.jpg"
|
| 244 |
+
|
| 245 |
+
return direct
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _best_image_url_v3(canvas: dict) -> str:
|
| 249 |
+
"""Extrait l'URL d'image depuis un canvas IIIF v3."""
|
| 250 |
+
items = canvas.get("items", [])
|
| 251 |
+
for annotation_page in items:
|
| 252 |
+
for annotation in annotation_page.get("items", []):
|
| 253 |
+
body = annotation.get("body", {})
|
| 254 |
+
if isinstance(body, list):
|
| 255 |
+
body = body[0] if body else {}
|
| 256 |
+
# URL directe
|
| 257 |
+
url = body.get("id", body.get("@id", ""))
|
| 258 |
+
if url and body.get("type", "") == "Image":
|
| 259 |
+
return url
|
| 260 |
+
# Via service IIIF Image API
|
| 261 |
+
service = body.get("service", [])
|
| 262 |
+
if isinstance(service, dict):
|
| 263 |
+
service = [service]
|
| 264 |
+
for svc in service:
|
| 265 |
+
svc_id = svc.get("id", svc.get("@id", ""))
|
| 266 |
+
if svc_id:
|
| 267 |
+
return f"{svc_id.rstrip('/')}/full/max/0/default.jpg"
|
| 268 |
+
if url:
|
| 269 |
+
return url
|
| 270 |
+
return ""
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _extract_v2_transcription(canvas: dict) -> Optional[str]:
|
| 274 |
+
"""Tente d'extraire le texte GT depuis les annotations OA d'un canvas v2."""
|
| 275 |
+
other_content = canvas.get("otherContent", [])
|
| 276 |
+
for oc in other_content:
|
| 277 |
+
if not isinstance(oc, dict):
|
| 278 |
+
continue
|
| 279 |
+
motivation = oc.get("motivation", "")
|
| 280 |
+
if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
|
| 281 |
+
resources = oc.get("resources", [])
|
| 282 |
+
texts = []
|
| 283 |
+
for res in resources:
|
| 284 |
+
body = res.get("resource", {})
|
| 285 |
+
if body.get("@type") == "cnt:ContentAsText":
|
| 286 |
+
texts.append(body.get("chars", ""))
|
| 287 |
+
if texts:
|
| 288 |
+
return "\n".join(texts)
|
| 289 |
+
return None
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def _extract_v3_transcription(canvas: dict) -> Optional[str]:
|
| 293 |
+
"""Tente d'extraire le texte GT depuis les annotations d'un canvas v3."""
|
| 294 |
+
annotations = canvas.get("annotations", [])
|
| 295 |
+
for ann_page in annotations:
|
| 296 |
+
items = ann_page.get("items", [])
|
| 297 |
+
for ann in items:
|
| 298 |
+
motivation = ann.get("motivation", "")
|
| 299 |
+
if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
|
| 300 |
+
body = ann.get("body", {})
|
| 301 |
+
if isinstance(body, dict) and body.get("type") == "TextualBody":
|
| 302 |
+
return body.get("value", "")
|
| 303 |
+
return None
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# ---------------------------------------------------------------------------
|
| 307 |
+
# Téléchargement avec retry
|
| 308 |
+
# ---------------------------------------------------------------------------
|
| 309 |
+
|
| 310 |
+
def _download_url(
|
| 311 |
+
url: str,
|
| 312 |
+
retries: int = 4,
|
| 313 |
+
backoff: float = 2.0,
|
| 314 |
+
timeout: int = 60,
|
| 315 |
+
) -> bytes:
|
| 316 |
+
"""Télécharge une URL avec retry exponentiel."""
|
| 317 |
+
headers = {
|
| 318 |
+
"User-Agent": "Picarones/1.0 (BnF OCR benchmark platform; https://github.com/bnf/picarones)"
|
| 319 |
+
}
|
| 320 |
+
last_exc: Optional[Exception] = None
|
| 321 |
+
for attempt in range(retries):
|
| 322 |
+
if attempt > 0:
|
| 323 |
+
wait = backoff ** attempt
|
| 324 |
+
logger.debug("Retry %d/%d dans %.1fs — %s", attempt, retries - 1, wait, url)
|
| 325 |
+
time.sleep(wait)
|
| 326 |
+
try:
|
| 327 |
+
req = urllib.request.Request(url, headers=headers)
|
| 328 |
+
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
| 329 |
+
return resp.read()
|
| 330 |
+
except (urllib.error.URLError, urllib.error.HTTPError) as exc:
|
| 331 |
+
last_exc = exc
|
| 332 |
+
logger.warning("Erreur téléchargement %s : %s", url, exc)
|
| 333 |
+
raise RuntimeError(f"Impossible de télécharger {url} après {retries} tentatives") from last_exc
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def _fetch_manifest(url: str) -> dict:
    """Download and parse a IIIF manifest as JSON.

    Raises
    ------
    ValueError
        If the payload is not valid UTF-8 JSON, or if its top-level value is
        not a JSON object.
    RuntimeError
        Propagated from ``_download_url`` when the download fails.
    """
    data = _download_url(url)
    try:
        # "utf-8-sig" tolerates a leading BOM, which ``json.loads`` rejects.
        manifest = json.loads(data.decode("utf-8-sig"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise ValueError(f"Manifeste IIIF invalide (JSON mal formé) : {url}") from exc
    if not isinstance(manifest, dict):
        # Downstream parsing calls ``.get`` on the manifest; fail early and clearly.
        raise ValueError(f"Manifeste IIIF invalide (objet JSON attendu) : {url}")
    return manifest
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# ---------------------------------------------------------------------------
|
| 346 |
+
# Importeur principal
|
| 347 |
+
# ---------------------------------------------------------------------------
|
| 348 |
+
|
| 349 |
+
class IIIFImporter:
    """Import a corpus from a IIIF manifest.

    Parameters
    ----------
    manifest_url:
        URL of the IIIF manifest (Presentation API v2 or v3).
    max_resolution:
        Maximum width, in pixels, of the downloaded images.
        0 = largest size available.
    """

    def __init__(
        self,
        manifest_url: str,
        max_resolution: int = 0,
    ) -> None:
        self.manifest_url = manifest_url
        self.max_resolution = max_resolution
        self._manifest: Optional[dict] = None
        self._parser: Optional[IIIFManifestParser] = None

    def load(self) -> "IIIFImporter":
        """Download and parse the manifest; returns ``self`` for chaining."""
        logger.info("Téléchargement du manifeste IIIF : %s", self.manifest_url)
        self._manifest = _fetch_manifest(self.manifest_url)
        self._parser = IIIFManifestParser(self._manifest)
        logger.info(
            "Manifeste chargé — version IIIF %d — titre : %s — %d canvas",
            self._parser.version,
            self._parser.label,
            len(self._parser.canvases()),
        )
        return self

    @property
    def parser(self) -> IIIFManifestParser:
        """Manifest parser, loading the manifest on first access."""
        if self._parser is None:
            self.load()
        return self._parser  # type: ignore[return-value]

    def list_canvases(self, pages: str = "all") -> list[IIIFCanvas]:
        """Return the canvases selected by ``pages``."""
        all_canvases = self.parser.canvases()
        indices = parse_page_selector(pages, len(all_canvases))
        return [all_canvases[i] for i in indices]

    def import_corpus(
        self,
        pages: str = "all",
        output_dir: Optional[str | Path] = None,
        show_progress: bool = True,
    ) -> Corpus:
        """Download the images and build a Picarones corpus.

        When a canvas carries a transcription annotation (ground truth), it
        is saved to the matching ``.gt.txt`` file; otherwise an empty
        ``.gt.txt`` file is created. Canvases without an image URL, or whose
        download fails, are logged and skipped.

        Parameters
        ----------
        pages:
            Page selector (e.g. ``"1-10"``, ``"1,3,5"``).
        output_dir:
            Destination folder for images and GT files. If None, each image
            is written to a temporary file that is not deleted
            automatically, and no GT file is written.
        show_progress:
            Show a tqdm progress bar (silently skipped if tqdm is missing).

        Returns
        -------
        Corpus
            The imported corpus.

        Raises
        ------
        ValueError
            If no canvas is selected or no document could be imported.
        """
        import tempfile

        canvases = self.list_canvases(pages)
        if not canvases:
            raise ValueError("Aucun canvas sélectionné.")

        out_dir: Optional[Path] = Path(output_dir) if output_dir else None
        if out_dir is not None:
            out_dir.mkdir(parents=True, exist_ok=True)

        # Corpus name comes from the manifest title.
        corpus_name = self.parser.label or "iiif_corpus"

        documents: list[Document] = []
        used_ids: set[str] = set()
        iterator: Iterator[IIIFCanvas] = iter(canvases)

        if show_progress:
            try:
                from tqdm import tqdm
                iterator = tqdm(canvases, desc="Import IIIF", unit="page")
            except ImportError:
                pass

        for canvas in iterator:
            if not canvas.image_url:
                logger.warning("Canvas %s : pas d'URL d'image — ignoré.", canvas.label)
                continue

            # Apply max_resolution, if any.
            image_url = self._adjust_resolution(canvas.image_url, canvas.width)

            try:
                image_bytes = _download_url(image_url)
            except RuntimeError as exc:
                logger.error("Canvas %s : erreur téléchargement : %s", canvas.label, exc)
                continue

            ext = _guess_extension(image_url)
            # Labels are often repeated within a manifest; without a unique
            # id, later pages would overwrite earlier files.
            doc_id = self._unique_doc_id(_slugify(canvas.label), canvas.index, used_ids)
            gt_text = canvas.transcription or ""

            if out_dir is not None:
                image_path = out_dir / f"{doc_id}{ext}"
                image_path.write_bytes(image_bytes)
                gt_path = out_dir / f"{doc_id}.gt.txt"
                gt_path.write_text(gt_text, encoding="utf-8")
            else:
                # No output folder: the image lives in a persistent temp file.
                with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                    tmp.write(image_bytes)
                image_path = Path(tmp.name)

            documents.append(Document(
                image_path=image_path,
                ground_truth=gt_text,
                doc_id=doc_id,
                metadata={"iiif_label": canvas.label, "canvas_index": canvas.index},
            ))

        if not documents:
            raise ValueError("Aucun document importé depuis le manifeste IIIF.")

        logger.info("Import IIIF terminé : %d documents.", len(documents))

        return Corpus(
            name=corpus_name,
            documents=documents,
            source_path=self.manifest_url,
            metadata={
                "iiif_manifest_url": self.manifest_url,
                "iiif_version": self.parser.version,
                "iiif_attribution": self.parser.attribution,
                "pages_selected": pages,
            },
        )

    @staticmethod
    def _unique_doc_id(base: str, index: int, used: set[str]) -> str:
        """Return a document id not yet in ``used`` and record it there.

        ``base`` is the slugified label (may be empty) and ``index`` the
        0-based canvas position. The first occurrence of a label keeps the
        plain slug; repeats get the 1-based canvas number appended.
        """
        fallback = f"canvas_{index + 1:04d}"
        candidate = base or fallback
        if candidate in used:
            candidate = f"{base}_{index + 1:04d}" if base else fallback
        # Last-resort numbering if a label happens to equal a generated id.
        stem = candidate
        counter = 2
        while candidate in used:
            candidate = f"{stem}_{counter}"
            counter += 1
        used.add(candidate)
        return candidate

    def _adjust_resolution(self, image_url: str, canvas_width: Optional[int]) -> str:
        """Rewrite a IIIF Image API URL so that it honours ``max_resolution``.

        The URL is returned unchanged when no limit is set, when the canvas
        width is unknown, or when the canvas is already narrow enough.
        """
        if not self.max_resolution or not canvas_width:
            return image_url
        if canvas_width <= self.max_resolution:
            return image_url
        # Replace /full/max/ or /full/full/ with /full/{w},/
        url = re.sub(
            r"/full/(max|full)/",
            f"/full/{self.max_resolution},/",
            image_url,
        )
        return url
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
# ---------------------------------------------------------------------------
|
| 525 |
+
# Helpers utilitaires
|
| 526 |
+
# ---------------------------------------------------------------------------
|
| 527 |
+
|
| 528 |
+
def _slugify(text: str) -> str:
|
| 529 |
+
"""Convertit un label IIIF en identifiant de fichier sûr."""
|
| 530 |
+
text = re.sub(r"[^\w\s-]", "", text.strip())
|
| 531 |
+
text = re.sub(r"[\s_-]+", "_", text)
|
| 532 |
+
return text[:60]
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
def _guess_extension(url: str) -> str:
|
| 536 |
+
"""Détermine l'extension de l'image depuis l'URL."""
|
| 537 |
+
url_lower = url.lower().split("?")[0]
|
| 538 |
+
for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp"):
|
| 539 |
+
if url_lower.endswith(ext):
|
| 540 |
+
return ext
|
| 541 |
+
# Par défaut pour les URLs IIIF Image API
|
| 542 |
+
if "/default." in url_lower or "/native." in url_lower:
|
| 543 |
+
return ".jpg"
|
| 544 |
+
return ".jpg"
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
# ---------------------------------------------------------------------------
|
| 548 |
+
# Fonction de commodité
|
| 549 |
+
# ---------------------------------------------------------------------------
|
| 550 |
+
|
| 551 |
+
def import_iiif_manifest(
    manifest_url: str,
    pages: str = "all",
    output_dir: Optional[str | Path] = None,
    max_resolution: int = 0,
    show_progress: bool = True,
) -> Corpus:
    """Import a corpus from a IIIF manifest in a single call.

    Parameters
    ----------
    manifest_url:
        URL of the IIIF manifest (v2 or v3).
    pages:
        Page selector (e.g. ``"1-10"``, ``"1,3,5"``). Defaults to ``"all"``.
    output_dir:
        Destination folder. If None, nothing is written to ``output_dir``.
    max_resolution:
        Maximum image width in pixels. 0 = no limit.
    show_progress:
        Show a progress bar.

    Returns
    -------
    Corpus
    """
    # ``load()`` returns the importer itself, so the calls can be chained.
    loaded = IIIFImporter(manifest_url, max_resolution=max_resolution).load()
    corpus = loaded.import_corpus(
        pages=pages,
        output_dir=output_dir,
        show_progress=show_progress,
    )
    return corpus
|
picarones/report/generator.py
CHANGED
|
@@ -69,6 +69,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
|
|
|
| 72 |
entry: dict = {
|
| 73 |
"name": report.engine_name,
|
| 74 |
"version": report.engine_version,
|
|
@@ -81,15 +82,30 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 81 |
"cer_max": _safe(agg.get("cer", {}).get("max")),
|
| 82 |
"doc_count": agg.get("document_count", 0),
|
| 83 |
"failed": agg.get("failed_count", 0),
|
|
|
|
|
|
|
|
|
|
| 84 |
# Distribution pour l'histogramme : liste des CER individuels
|
| 85 |
"cer_values": [
|
| 86 |
_safe(dr.metrics.cer)
|
| 87 |
for dr in report.document_results
|
| 88 |
if dr.metrics.error is None
|
| 89 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
|
| 91 |
"is_pipeline": report.is_pipeline,
|
| 92 |
"pipeline_info": report.pipeline_info,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
}
|
| 94 |
engines_summary.append(entry)
|
| 95 |
|
|
@@ -121,6 +137,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 121 |
"engine": engine_name,
|
| 122 |
"hypothesis": dr.hypothesis,
|
| 123 |
"cer": _safe(dr.metrics.cer),
|
|
|
|
| 124 |
"wer": _safe(dr.metrics.wer),
|
| 125 |
"duration": dr.duration_seconds,
|
| 126 |
"error": dr.engine_error,
|
|
@@ -136,6 +153,16 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 136 |
if on is not None:
|
| 137 |
er_entry["over_normalization"] = on
|
| 138 |
er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
engine_results.append(er_entry)
|
| 140 |
|
| 141 |
# CER moyen sur ce document (pour le badge galerie)
|
|
@@ -603,6 +630,7 @@ footer {{
|
|
| 603 |
<button class="tab-btn active" onclick="showView('ranking')">Classement</button>
|
| 604 |
<button class="tab-btn" onclick="showView('gallery')">Galerie</button>
|
| 605 |
<button class="tab-btn" onclick="showView('document')">Document</button>
|
|
|
|
| 606 |
<button class="tab-btn" onclick="showView('analyses')">Analyses</button>
|
| 607 |
</div>
|
| 608 |
<div class="meta" id="nav-meta">—</div>
|
|
@@ -622,10 +650,13 @@ footer {{
|
|
| 622 |
<tr>
|
| 623 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 624 |
<th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
|
| 625 |
-
<th data-col="cer" class="sortable">CER<i class="sort-icon">↕</i></th>
|
|
|
|
| 626 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 627 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
| 628 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
|
|
|
|
|
|
| 629 |
<th>CER médian</th>
|
| 630 |
<th>CER min</th>
|
| 631 |
<th>CER max</th>
|
|
@@ -775,6 +806,59 @@ footer {{
|
|
| 775 |
</div>
|
| 776 |
</div>
|
| 777 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
</div>
|
| 779 |
</div>
|
| 780 |
|
|
@@ -808,13 +892,15 @@ function showView(name) {{
|
|
| 808 |
document.querySelectorAll('.view').forEach(v => v.classList.remove('active'));
|
| 809 |
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
| 810 |
document.getElementById('view-' + name).classList.add('active');
|
|
|
|
|
|
|
|
|
|
| 811 |
document.querySelectorAll('.tab-btn').forEach(b => {{
|
| 812 |
-
if (b.textContent.toLowerCase().startsWith(
|
| 813 |
-
{{ranking:'c',gallery:'g',document:'d',analyses:'a'}}[name]
|
| 814 |
-
)) b.classList.add('active');
|
| 815 |
}});
|
| 816 |
currentView = name;
|
| 817 |
if (name === 'analyses' && !chartsBuilt) buildCharts();
|
|
|
|
| 818 |
}}
|
| 819 |
|
| 820 |
// ── Formatage ───────────────────────────────────────────────────
|
|
@@ -857,6 +943,15 @@ function renderDiff(ops) {{
|
|
| 857 |
}}).join(' ');
|
| 858 |
}}
|
| 859 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
// ── Vue Classement ──────────────────────────────────────────────
|
| 861 |
let rankingSort = {{ col: 'cer', dir: 'asc' }};
|
| 862 |
|
|
@@ -906,6 +1001,18 @@ function renderRanking() {{
|
|
| 906 |
overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
|
| 907 |
}}
|
| 908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
return `<tr>
|
| 910 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 911 |
<td>
|
|
@@ -918,9 +1025,12 @@ function renderRanking() {{
|
|
| 918 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
| 919 |
<span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
|
| 920 |
</td>
|
|
|
|
| 921 |
<td>${{pct(e.wer)}}</td>
|
| 922 |
<td>${{pct(e.mer)}}</td>
|
| 923 |
<td>${{pct(e.wil)}}</td>
|
|
|
|
|
|
|
| 924 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 925 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 926 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
|
@@ -1109,12 +1219,23 @@ function loadDocument(docId) {{
|
|
| 1109 |
</div>`;
|
| 1110 |
}}
|
| 1111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1112 |
return `<div class="diff-panel">
|
| 1113 |
<div class="diff-panel-header">
|
| 1114 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
| 1115 |
${{pipeTagPanel}}
|
| 1116 |
<span class="diff-panel-metrics">
|
| 1117 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
|
|
|
| 1118 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
| 1119 |
${{onBadge}}
|
| 1120 |
${{errBadge}}
|
|
@@ -1187,6 +1308,8 @@ function buildCharts() {{
|
|
| 1187 |
buildRadar();
|
| 1188 |
buildCerPerDoc();
|
| 1189 |
buildDurationChart();
|
|
|
|
|
|
|
| 1190 |
}}
|
| 1191 |
|
| 1192 |
function buildCerHistogram() {{
|
|
@@ -1330,6 +1453,315 @@ function buildDurationChart() {{
|
|
| 1330 |
}});
|
| 1331 |
}}
|
| 1332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1333 |
// ── Init ────────────────────────────────────────────────────────
|
| 1334 |
function init() {{
|
| 1335 |
// Méta nav
|
|
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
| 72 |
+
diplo_agg = agg.get("cer_diplomatic", {})
|
| 73 |
entry: dict = {
|
| 74 |
"name": report.engine_name,
|
| 75 |
"version": report.engine_version,
|
|
|
|
| 82 |
"cer_max": _safe(agg.get("cer", {}).get("max")),
|
| 83 |
"doc_count": agg.get("document_count", 0),
|
| 84 |
"failed": agg.get("failed_count", 0),
|
| 85 |
+
# CER diplomatique (après normalisation historique : ſ=s, u=v, i=j…)
|
| 86 |
+
"cer_diplomatic": _safe(diplo_agg.get("mean")) if diplo_agg else None,
|
| 87 |
+
"cer_diplomatic_profile": diplo_agg.get("profile"),
|
| 88 |
# Distribution pour l'histogramme : liste des CER individuels
|
| 89 |
"cer_values": [
|
| 90 |
_safe(dr.metrics.cer)
|
| 91 |
for dr in report.document_results
|
| 92 |
if dr.metrics.error is None
|
| 93 |
],
|
| 94 |
+
"cer_diplomatic_values": [
|
| 95 |
+
_safe(dr.metrics.cer_diplomatic)
|
| 96 |
+
for dr in report.document_results
|
| 97 |
+
if dr.metrics.error is None and dr.metrics.cer_diplomatic is not None
|
| 98 |
+
],
|
| 99 |
# Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
|
| 100 |
"is_pipeline": report.is_pipeline,
|
| 101 |
"pipeline_info": report.pipeline_info,
|
| 102 |
+
# Sprint 5 — métriques avancées patrimoniales
|
| 103 |
+
"ligature_score": _safe(report.ligature_score) if report.ligature_score is not None else None,
|
| 104 |
+
"diacritic_score": _safe(report.diacritic_score) if report.diacritic_score is not None else None,
|
| 105 |
+
"aggregated_confusion": report.aggregated_confusion,
|
| 106 |
+
"aggregated_taxonomy": report.aggregated_taxonomy,
|
| 107 |
+
"aggregated_structure": report.aggregated_structure,
|
| 108 |
+
"aggregated_image_quality": report.aggregated_image_quality,
|
| 109 |
}
|
| 110 |
engines_summary.append(entry)
|
| 111 |
|
|
|
|
| 137 |
"engine": engine_name,
|
| 138 |
"hypothesis": dr.hypothesis,
|
| 139 |
"cer": _safe(dr.metrics.cer),
|
| 140 |
+
"cer_diplomatic": _safe(dr.metrics.cer_diplomatic) if dr.metrics.cer_diplomatic is not None else None,
|
| 141 |
"wer": _safe(dr.metrics.wer),
|
| 142 |
"duration": dr.duration_seconds,
|
| 143 |
"error": dr.engine_error,
|
|
|
|
| 153 |
if on is not None:
|
| 154 |
er_entry["over_normalization"] = on
|
| 155 |
er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
|
| 156 |
+
# Sprint 5 — métriques avancées par document
|
| 157 |
+
if dr.char_scores is not None:
|
| 158 |
+
er_entry["ligature_score"] = _safe(dr.char_scores.get("ligature", {}).get("score"))
|
| 159 |
+
er_entry["diacritic_score"] = _safe(dr.char_scores.get("diacritic", {}).get("score"))
|
| 160 |
+
if dr.taxonomy is not None:
|
| 161 |
+
er_entry["taxonomy"] = dr.taxonomy
|
| 162 |
+
if dr.structure is not None:
|
| 163 |
+
er_entry["structure"] = dr.structure
|
| 164 |
+
if dr.image_quality is not None:
|
| 165 |
+
er_entry["image_quality"] = dr.image_quality
|
| 166 |
engine_results.append(er_entry)
|
| 167 |
|
| 168 |
# CER moyen sur ce document (pour le badge galerie)
|
|
|
|
| 630 |
<button class="tab-btn active" onclick="showView('ranking')">Classement</button>
|
| 631 |
<button class="tab-btn" onclick="showView('gallery')">Galerie</button>
|
| 632 |
<button class="tab-btn" onclick="showView('document')">Document</button>
|
| 633 |
+
<button class="tab-btn" onclick="showView('characters')">Caractères</button>
|
| 634 |
<button class="tab-btn" onclick="showView('analyses')">Analyses</button>
|
| 635 |
</div>
|
| 636 |
<div class="meta" id="nav-meta">—</div>
|
|
|
|
| 650 |
<tr>
|
| 651 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 652 |
<th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
|
| 653 |
+
<th data-col="cer" class="sortable">CER exact<i class="sort-icon">↕</i></th>
|
| 654 |
+
<th data-col="cer_diplomatic" class="sortable" title="CER après normalisation diplomatique (ſ=s, u=v, i=j…) — mesure les erreurs substantielles en ignorant les variantes graphiques codifiées">CER diplo.<i class="sort-icon">↕</i></th>
|
| 655 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 656 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
| 657 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
| 658 |
+
<th data-col="ligature_score" class="sortable" title="Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)">Ligatures<i class="sort-icon">↕</i></th>
|
| 659 |
+
<th data-col="diacritic_score" class="sortable" title="Taux de conservation des diacritiques (accents, cédilles, trémas…)">Diacritiques<i class="sort-icon">↕</i></th>
|
| 660 |
<th>CER médian</th>
|
| 661 |
<th>CER min</th>
|
| 662 |
<th>CER max</th>
|
|
|
|
| 806 |
</div>
|
| 807 |
</div>
|
| 808 |
|
| 809 |
+
<div class="chart-card">
|
| 810 |
+
<h3>Qualité image ↔ CER (scatter plot)</h3>
|
| 811 |
+
<div class="chart-canvas-wrap">
|
| 812 |
+
<canvas id="chart-quality-cer"></canvas>
|
| 813 |
+
</div>
|
| 814 |
+
<div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
|
| 815 |
+
Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.
|
| 816 |
+
</div>
|
| 817 |
+
</div>
|
| 818 |
+
|
| 819 |
+
<div class="chart-card" style="grid-column:1/-1">
|
| 820 |
+
<h3>Taxonomie des erreurs par moteur</h3>
|
| 821 |
+
<div class="chart-canvas-wrap" style="max-height:300px">
|
| 822 |
+
<canvas id="chart-taxonomy"></canvas>
|
| 823 |
+
</div>
|
| 824 |
+
<div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
|
| 825 |
+
Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).
|
| 826 |
+
</div>
|
| 827 |
+
</div>
|
| 828 |
+
|
| 829 |
+
</div>
|
| 830 |
+
</div>
|
| 831 |
+
|
| 832 |
+
<!-- ════ Vue 5 : Caractères ════════════════════════════════════════ -->
|
| 833 |
+
<div id="view-characters" class="view">
|
| 834 |
+
<div class="card">
|
| 835 |
+
<h2>Analyse des caractères</h2>
|
| 836 |
+
|
| 837 |
+
<!-- Sélecteur de moteur -->
|
| 838 |
+
<div class="stat-row" style="margin-bottom:1rem">
|
| 839 |
+
<label for="char-engine-select" style="font-weight:600;margin-right:.5rem">Moteur :</label>
|
| 840 |
+
<select id="char-engine-select" onchange="renderCharView()"
|
| 841 |
+
style="padding:.35rem .7rem;border-radius:6px;border:1px solid var(--border)"></select>
|
| 842 |
+
</div>
|
| 843 |
+
|
| 844 |
+
<!-- Scores ligatures / diacritiques -->
|
| 845 |
+
<div class="stat-row" id="char-scores-row" style="gap:1.5rem;margin-bottom:1.5rem"></div>
|
| 846 |
+
|
| 847 |
+
<!-- Matrice de confusion unicode -->
|
| 848 |
+
<h3 style="margin-bottom:.75rem">Matrice de confusion unicode
|
| 849 |
+
<span style="font-size:.75rem;font-weight:400;color:var(--text-muted)">
|
| 850 |
+
— substitutions les plus fréquentes (caractère GT → caractère OCR)
|
| 851 |
+
</span>
|
| 852 |
+
</h3>
|
| 853 |
+
<div id="confusion-heatmap" style="overflow-x:auto;margin-bottom:1.5rem"></div>
|
| 854 |
+
|
| 855 |
+
<!-- Détail ligatures par type -->
|
| 856 |
+
<h3 style="margin-bottom:.75rem">Reconnaissance des ligatures</h3>
|
| 857 |
+
<div id="ligature-detail" style="margin-bottom:1.5rem"></div>
|
| 858 |
+
|
| 859 |
+
<!-- Taxonomie détaillée -->
|
| 860 |
+
<h3 style="margin-bottom:.75rem">Distribution taxonomique des erreurs</h3>
|
| 861 |
+
<div id="taxonomy-detail"></div>
|
| 862 |
</div>
|
| 863 |
</div>
|
| 864 |
|
|
|
|
| 892 |
document.querySelectorAll('.view').forEach(v => v.classList.remove('active'));
|
| 893 |
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
| 894 |
document.getElementById('view-' + name).classList.add('active');
|
| 895 |
+
// Activer le bon onglet nav
|
| 896 |
+
const tabMap = {{ranking:'classement',gallery:'galerie',document:'document',characters:'caract',analyses:'analyses'}};
|
| 897 |
+
const prefix = tabMap[name] || name;
|
| 898 |
document.querySelectorAll('.tab-btn').forEach(b => {{
|
| 899 |
+
if (b.textContent.toLowerCase().startsWith(prefix.toLowerCase())) b.classList.add('active');
|
|
|
|
|
|
|
| 900 |
}});
|
| 901 |
currentView = name;
|
| 902 |
if (name === 'analyses' && !chartsBuilt) buildCharts();
|
| 903 |
+
if (name === 'characters' && !charViewBuilt) initCharView();
|
| 904 |
}}
|
| 905 |
|
| 906 |
// ── Formatage ───────────────────────────────────────────────────
|
|
|
|
| 943 |
}}).join(' ');
|
| 944 |
}}
|
| 945 |
|
| 946 |
+
// ── Score badge (ligatures / diacritiques) ───────────────────────
|
| 947 |
+
function _scoreBadge(v, label) {{
  // Render a 0–1 score as a colored percentage badge (green from 0.9, amber from 0.7,
  // red below). A null or undefined score renders as a muted em dash instead.
  if (v === null || v === undefined) return '<span style="color:var(--text-muted)">—</span>';
  const pctVal = (v * 100).toFixed(1);
  let color = '#dc2626';
  let bg = '#fef2f2';
  if (v >= 0.9) {{ color = '#16a34a'; bg = '#f0fdf4'; }}
  else if (v >= 0.7) {{ color = '#ca8a04'; bg = '#fefce8'; }}
  return `<span class="cer-badge" style="color:${{color}};background:${{bg}}" title="${{label}} : ${{pctVal}}%">${{pctVal}}%</span>`;
}}
|
| 954 |
+
|
| 955 |
// ── Vue Classement ──────────────────────────────────────────────
|
| 956 |
let rankingSort = {{ col: 'cer', dir: 'asc' }};
|
| 957 |
|
|
|
|
| 1001 |
overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
|
| 1002 |
}}
|
| 1003 |
|
| 1004 |
+
// CER diplomatique
|
| 1005 |
+
let diploCerCell = '<td style="color:var(--text-muted)">—</td>';
|
| 1006 |
+
if (e.cer_diplomatic !== null && e.cer_diplomatic !== undefined) {{
|
| 1007 |
+
const dipC = cerColor(e.cer_diplomatic); const dipB = cerBg(e.cer_diplomatic);
|
| 1008 |
+
const delta = e.cer - e.cer_diplomatic;
|
| 1009 |
+
const deltaStr = delta > 0.001 ? ` <span style="font-size:.65rem;color:#059669">-${{(delta*100).toFixed(1)}}%</span>` : '';
|
| 1010 |
+
const profileHint = e.cer_diplomatic_profile ? ` title="Profil : ${{esc(e.cer_diplomatic_profile)}}"` : '';
|
| 1011 |
+
diploCerCell = `<td${{profileHint}}>
|
| 1012 |
+
<span class="cer-badge" style="color:${{dipC}};background:${{dipB}}">${{pct(e.cer_diplomatic)}}</span>${{deltaStr}}
|
| 1013 |
+
</td>`;
|
| 1014 |
+
}}
|
| 1015 |
+
|
| 1016 |
return `<tr>
|
| 1017 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 1018 |
<td>
|
|
|
|
| 1025 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
| 1026 |
<span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
|
| 1027 |
</td>
|
| 1028 |
+
${{diploCerCell}}
|
| 1029 |
<td>${{pct(e.wer)}}</td>
|
| 1030 |
<td>${{pct(e.mer)}}</td>
|
| 1031 |
<td>${{pct(e.wil)}}</td>
|
| 1032 |
+
<td>${{_scoreBadge(e.ligature_score, 'Ligatures')}}</td>
|
| 1033 |
+
<td>${{_scoreBadge(e.diacritic_score, 'Diacritiques')}}</td>
|
| 1034 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 1035 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 1036 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
|
|
|
| 1219 |
</div>`;
|
| 1220 |
}}
|
| 1221 |
|
| 1222 |
+
// CER diplomatique par document
|
| 1223 |
+
let diplomaBadge = '';
|
| 1224 |
+
if (er.cer_diplomatic !== null && er.cer_diplomatic !== undefined) {{
|
| 1225 |
+
const dipC = cerColor(er.cer_diplomatic); const dipB = cerBg(er.cer_diplomatic);
|
| 1226 |
+
const delta = er.cer - er.cer_diplomatic;
|
| 1227 |
+
const deltaHint = delta > 0.001 ? ` (−${{(delta*100).toFixed(1)}}% avec normalisation)` : '';
|
| 1228 |
+
diplomaBadge = `<span class="cer-badge" style="color:${{dipC}};background:${{dipB}};opacity:.85"
|
| 1229 |
+
title="CER diplomatique (ſ=s, u=v, i=j…)${{deltaHint}}">diplo. ${{pct(er.cer_diplomatic)}}</span>`;
|
| 1230 |
+
}}
|
| 1231 |
+
|
| 1232 |
return `<div class="diff-panel">
|
| 1233 |
<div class="diff-panel-header">
|
| 1234 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
| 1235 |
${{pipeTagPanel}}
|
| 1236 |
<span class="diff-panel-metrics">
|
| 1237 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
| 1238 |
+
${{diplomaBadge}}
|
| 1239 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
| 1240 |
${{onBadge}}
|
| 1241 |
${{errBadge}}
|
|
|
|
| 1308 |
buildRadar();
|
| 1309 |
buildCerPerDoc();
|
| 1310 |
buildDurationChart();
|
| 1311 |
+
buildQualityCerScatter();
|
| 1312 |
+
buildTaxonomyChart();
|
| 1313 |
}}
|
| 1314 |
|
| 1315 |
function buildCerHistogram() {{
|
|
|
|
| 1453 |
}});
|
| 1454 |
}}
|
| 1455 |
|
| 1456 |
+
function buildQualityCerScatter() {{
|
| 1457 |
+
const ctx = document.getElementById('chart-quality-cer');
|
| 1458 |
+
if (!ctx) return;
|
| 1459 |
+
// Construire les points : un par document, un dataset par moteur
|
| 1460 |
+
const datasets = DATA.engines.map((e, ei) => {{
|
| 1461 |
+
const points = DATA.documents.flatMap(doc => {{
|
| 1462 |
+
const er = doc.engine_results.find(r => r.engine === e.name);
|
| 1463 |
+
if (!er || er.error || !er.image_quality) return [];
|
| 1464 |
+
return [{{ x: er.image_quality.quality_score, y: er.cer * 100 }}];
|
| 1465 |
+
}});
|
| 1466 |
+
return {{
|
| 1467 |
+
label: e.name, data: points,
|
| 1468 |
+
backgroundColor: engineColor(ei) + 'bb',
|
| 1469 |
+
borderColor: engineColor(ei),
|
| 1470 |
+
borderWidth: 1, pointRadius: 5, pointHoverRadius: 7,
|
| 1471 |
+
}};
|
| 1472 |
+
}}).filter(d => d.data.length > 0);
|
| 1473 |
+
|
| 1474 |
+
if (!datasets.length) {{ ctx.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Aucune donnée de qualité image disponible.</p>'; return; }}
|
| 1475 |
+
|
| 1476 |
+
chartInstances['quality-cer'] = new Chart(ctx.getContext('2d'), {{
|
| 1477 |
+
type: 'scatter',
|
| 1478 |
+
data: {{ datasets }},
|
| 1479 |
+
options: {{
|
| 1480 |
+
responsive: true, maintainAspectRatio: false,
|
| 1481 |
+
plugins: {{
|
| 1482 |
+
legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
|
| 1483 |
+
tooltip: {{ callbacks: {{
|
| 1484 |
+
label: ctx => `${{ctx.dataset.label}}: qualité=${{ctx.parsed.x.toFixed(2)}}, CER=${{ctx.parsed.y.toFixed(1)}}%`,
|
| 1485 |
+
}} }},
|
| 1486 |
+
}},
|
| 1487 |
+
scales: {{
|
| 1488 |
+
x: {{ min: 0, max: 1, title: {{ display: true, text: 'Score qualité image [0–1]', font: {{ size: 11 }} }} }},
|
| 1489 |
+
y: {{ min: 0, title: {{ display: true, text: 'CER (%)', font: {{ size: 11 }} }} }},
|
| 1490 |
+
}},
|
| 1491 |
+
}},
|
| 1492 |
+
}});
|
| 1493 |
+
}}
|
| 1494 |
+
|
| 1495 |
+
function buildTaxonomyChart() {{
|
| 1496 |
+
const ctx = document.getElementById('chart-taxonomy');
|
| 1497 |
+
if (!ctx) return;
|
| 1498 |
+
const taxLabels = ['Confusion visuelle','Diacritique','Casse','Ligature','Abréviation','Hapax','Segmentation','Hors-vocab.','Lacune'];
|
| 1499 |
+
const taxKeys = ['visual_confusion','diacritic_error','case_error','ligature_error','abbreviation_error','hapax','segmentation_error','oov_character','lacuna'];
|
| 1500 |
+
const taxColors = ['#6366f1','#f59e0b','#ec4899','#14b8a6','#8b5cf6','#64748b','#f97316','#06b6d4','#ef4444'];
|
| 1501 |
+
|
| 1502 |
+
const datasets = DATA.engines.map((e, ei) => {{
|
| 1503 |
+
const tax = e.aggregated_taxonomy;
|
| 1504 |
+
const data = taxKeys.map(k => tax && tax.counts ? (tax.counts[k] || 0) : 0);
|
| 1505 |
+
return {{
|
| 1506 |
+
label: e.name, data,
|
| 1507 |
+
backgroundColor: engineColor(ei) + '99',
|
| 1508 |
+
borderColor: engineColor(ei),
|
| 1509 |
+
borderWidth: 1,
|
| 1510 |
+
}};
|
| 1511 |
+
}});
|
| 1512 |
+
|
| 1513 |
+
chartInstances['taxonomy'] = new Chart(ctx.getContext('2d'), {{
|
| 1514 |
+
type: 'bar',
|
| 1515 |
+
data: {{ labels: taxLabels, datasets }},
|
| 1516 |
+
options: {{
|
| 1517 |
+
responsive: true, maintainAspectRatio: false,
|
| 1518 |
+
plugins: {{ legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }} }},
|
| 1519 |
+
scales: {{
|
| 1520 |
+
x: {{ ticks: {{ font: {{ size: 10 }} }} }},
|
| 1521 |
+
y: {{ title: {{ display: true, text: "Nb d'erreurs", font: {{ size: 11 }} }}, min: 0, ticks: {{ stepSize: 1 }} }},
|
| 1522 |
+
}},
|
| 1523 |
+
}},
|
| 1524 |
+
}});
|
| 1525 |
+
}}
|
| 1526 |
+
|
| 1527 |
+
// ── Vue Caractères ───────────────────────────────────────────────
|
| 1528 |
+
let charViewBuilt = false;
|
| 1529 |
+
|
| 1530 |
+
function initCharView() {{
  // One-time setup of the Characters view: mark it as built, fill the engine
  // selector with one option per engine, then render the first engine.
  charViewBuilt = true;
  const sel = document.getElementById('char-engine-select');
  sel.innerHTML = '';
  for (const engine of DATA.engines) {{
    // Option(text, value): the engine name serves as both label and value
    sel.appendChild(new Option(engine.name, engine.name));
  }}
  renderCharView();
}}
|
| 1542 |
+
|
| 1543 |
+
function renderCharView() {{
|
| 1544 |
+
const engineName = document.getElementById('char-engine-select').value;
|
| 1545 |
+
const eng = DATA.engines.find(e => e.name === engineName);
|
| 1546 |
+
if (!eng) return;
|
| 1547 |
+
|
| 1548 |
+
// Scores ligatures / diacritiques
|
| 1549 |
+
const scoresRow = document.getElementById('char-scores-row');
|
| 1550 |
+
const ligScore = eng.ligature_score;
|
| 1551 |
+
const diacScore = eng.diacritic_score;
|
| 1552 |
+
scoresRow.innerHTML = `
|
| 1553 |
+
<div class="stat">Ligatures <b>${{_scoreBadge(ligScore, 'Ligatures')}}</b></div>
|
| 1554 |
+
<div class="stat">Diacritiques <b>${{_scoreBadge(diacScore, 'Diacritiques')}}</b></div>
|
| 1555 |
+
${{eng.aggregated_structure ? `
|
| 1556 |
+
<div class="stat">Précision lignes <b>${{_scoreBadge(eng.aggregated_structure.mean_line_accuracy, 'Précision nb lignes')}}</b></div>
|
| 1557 |
+
<div class="stat">Ordre lecture <b>${{_scoreBadge(eng.aggregated_structure.mean_reading_order_score, 'Score ordre de lecture')}}</b></div>
|
| 1558 |
+
` : ''}}
|
| 1559 |
+
${{eng.aggregated_image_quality ? `
|
| 1560 |
+
<div class="stat">Qualité image moy. <b>${{_scoreBadge(eng.aggregated_image_quality.mean_quality_score, 'Qualité image moyenne')}}</b></div>
|
| 1561 |
+
` : ''}}
|
| 1562 |
+
`;
|
| 1563 |
+
|
| 1564 |
+
// Matrice de confusion heatmap
|
| 1565 |
+
renderConfusionHeatmap(eng);
|
| 1566 |
+
|
| 1567 |
+
// Détail ligatures
|
| 1568 |
+
renderLigatureDetail(eng);
|
| 1569 |
+
|
| 1570 |
+
// Taxonomie détaillée
|
| 1571 |
+
renderTaxonomyDetail(eng);
|
| 1572 |
+
}}
|
| 1573 |
+
|
| 1574 |
+
function renderConfusionHeatmap(eng) {{
|
| 1575 |
+
const container = document.getElementById('confusion-heatmap');
|
| 1576 |
+
const cm = eng.aggregated_confusion;
|
| 1577 |
+
if (!cm || !cm.matrix) {{
|
| 1578 |
+
container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée de confusion disponible.</p>';
|
| 1579 |
+
return;
|
| 1580 |
+
}}
|
| 1581 |
+
|
| 1582 |
+
// Collecter les top confusions (substitutions uniquement, hors ∅)
|
| 1583 |
+
const pairs = [];
|
| 1584 |
+
for (const [gt, ocrs] of Object.entries(cm.matrix)) {{
|
| 1585 |
+
if (gt === '∅') continue;
|
| 1586 |
+
for (const [ocr, cnt] of Object.entries(ocrs)) {{
|
| 1587 |
+
if (ocr !== gt && ocr !== '∅' && cnt > 0) {{
|
| 1588 |
+
pairs.push({{ gt, ocr, cnt }});
|
| 1589 |
+
}}
|
| 1590 |
+
}}
|
| 1591 |
+
}}
|
| 1592 |
+
pairs.sort((a,b) => b.cnt - a.cnt);
|
| 1593 |
+
const top = pairs.slice(0, 30);
|
| 1594 |
+
|
| 1595 |
+
if (!top.length) {{
|
| 1596 |
+
container.innerHTML = '<p style="color:var(--text-muted)">Aucune substitution détectée.</p>';
|
| 1597 |
+
return;
|
| 1598 |
+
}}
|
| 1599 |
+
|
| 1600 |
+
// Heatmap sous forme de tableau compact
|
| 1601 |
+
const maxCnt = top[0].cnt;
|
| 1602 |
+
const rows = top.map(p => {{
|
| 1603 |
+
const intensity = Math.round((p.cnt / maxCnt) * 200 + 55); // 55–255
|
| 1604 |
+
const bg = `rgb(${{intensity}},50,50)`;
|
| 1605 |
+
const fg = intensity > 150 ? '#fff' : '#222';
|
| 1606 |
+
return `<tr onclick="showConfusionExamples('${{esc(p.gt)}}','${{esc(p.ocr)}}')" style="cursor:pointer" title="GT='${{esc(p.gt)}}' → OCR='${{esc(p.ocr)}}' : ${{p.cnt}} fois">
|
| 1607 |
+
<td style="font-family:monospace;font-size:1.1rem;padding:.3rem .6rem;text-align:center">${{esc(p.gt)}}</td>
|
| 1608 |
+
<td style="padding:.1rem .3rem;color:var(--text-muted)">→</td>
|
| 1609 |
+
<td style="font-family:monospace;font-size:1.1rem;padding:.3rem .6rem;text-align:center">${{esc(p.ocr)}}</td>
|
| 1610 |
+
<td style="padding:.3rem 1rem">
|
| 1611 |
+
<div style="display:flex;align-items:center;gap:.5rem">
|
| 1612 |
+
<div style="width:${{Math.round(p.cnt/maxCnt*120)}}px;height:12px;border-radius:3px;background:${{bg}}"></div>
|
| 1613 |
+
<span style="font-size:.8rem;color:var(--text-muted)">${{p.cnt}}×</span>
|
| 1614 |
+
</div>
|
| 1615 |
+
</td>
|
| 1616 |
+
</tr>`;
|
| 1617 |
+
}}).join('');
|
| 1618 |
+
|
| 1619 |
+
container.innerHTML = `
|
| 1620 |
+
<p style="font-size:.75rem;color:var(--text-muted);margin-bottom:.5rem">
|
| 1621 |
+
Cliquer sur une ligne pour voir les exemples dans la vue Document.
|
| 1622 |
+
Total substitutions : <b>${{cm.total_substitutions}}</b>
|
| 1623 |
+
· Insertions : <b>${{cm.total_insertions}}</b>
|
| 1624 |
+
· Suppressions : <b>${{cm.total_deletions}}</b>
|
| 1625 |
+
</p>
|
| 1626 |
+
<table style="border-collapse:collapse;font-size:.85rem">
|
| 1627 |
+
<thead><tr>
|
| 1628 |
+
<th style="padding:.3rem .6rem;text-align:left">GT</th>
|
| 1629 |
+
<th></th>
|
| 1630 |
+
<th style="padding:.3rem .6rem;text-align:left">OCR</th>
|
| 1631 |
+
<th style="padding:.3rem 1rem;text-align:left">Fréquence</th>
|
| 1632 |
+
</tr></thead>
|
| 1633 |
+
<tbody>${{rows}}</tbody>
|
| 1634 |
+
</table>
|
| 1635 |
+
`;
|
| 1636 |
+
}}
|
| 1637 |
+
|
| 1638 |
+
function showConfusionExamples(gtChar, ocrChar) {{
  // Switch to the Document view, then load the first document whose ground truth
  // contains gtChar while at least one engine hypothesis contains ocrChar.
  // This is a containment heuristic, not a check on aligned positions.
  showView('document');
  const match = DATA.documents.find(doc => {{
    const truth = doc.ground_truth || '';
    return doc.engine_results.some(er =>
      truth.includes(gtChar) && (er.hypothesis || '').includes(ocrChar)
    );
  }});
  if (match) loadDocument(match.doc_id);
}}
|
| 1650 |
+
|
| 1651 |
+
function renderLigatureDetail(eng) {{
|
| 1652 |
+
const container = document.getElementById('ligature-detail');
|
| 1653 |
+
// Agrégation sur tous les documents pour ce moteur
|
| 1654 |
+
const ligData = {{}};
|
| 1655 |
+
DATA.documents.forEach(doc => {{
|
| 1656 |
+
const er = doc.engine_results.find(r => r.engine === eng.name);
|
| 1657 |
+
if (!er || !er.ligature_score) return;
|
| 1658 |
+
// On n'a que le score global par doc; pour le détail, utiliser aggregated_char_scores
|
| 1659 |
+
}});
|
| 1660 |
+
|
| 1661 |
+
const agg = eng.aggregated_char_scores;
|
| 1662 |
+
if (!agg || !agg.ligature || !agg.ligature.per_ligature) {{
|
| 1663 |
+
const overallScore = eng.ligature_score;
|
| 1664 |
+
if (overallScore !== null && overallScore !== undefined) {{
|
| 1665 |
+
container.innerHTML = `<div class="stat">Score global ligatures : ${{_scoreBadge(overallScore, 'Ligatures')}}</div>`;
|
| 1666 |
+
}} else {{
|
| 1667 |
+
container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée ligature disponible (pas de ligatures dans le corpus).</p>';
|
| 1668 |
+
}}
|
| 1669 |
+
return;
|
| 1670 |
+
}}
|
| 1671 |
+
|
| 1672 |
+
const perLig = agg.ligature.per_ligature;
|
| 1673 |
+
if (!Object.keys(perLig).length) {{
|
| 1674 |
+
container.innerHTML = '<p style="color:var(--text-muted)">Aucune ligature trouvée dans le corpus GT.</p>';
|
| 1675 |
+
return;
|
| 1676 |
+
}}
|
| 1677 |
+
|
| 1678 |
+
const rows = Object.entries(perLig)
|
| 1679 |
+
.sort((a,b) => b[1].gt_count - a[1].gt_count)
|
| 1680 |
+
.map(([lig, d]) => {{
|
| 1681 |
+
const sc = d.score;
|
| 1682 |
+
const color = sc >= 0.9 ? '#16a34a' : sc >= 0.7 ? '#ca8a04' : '#dc2626';
|
| 1683 |
+
const barW = Math.round(sc * 120);
|
| 1684 |
+
return `<tr>
|
| 1685 |
+
<td style="font-family:monospace;font-size:1.2rem;padding:.3rem .6rem">${{esc(lig)}}</td>
|
| 1686 |
+
<td style="padding:.3rem .6rem;font-size:.8rem;color:var(--text-muted)">${{esc(lig.codePointAt(0).toString(16).toUpperCase().padStart(4,'0'))}}</td>
|
| 1687 |
+
<td style="padding:.3rem .6rem">${{d.gt_count}} GT</td>
|
| 1688 |
+
<td style="padding:.3rem .6rem">${{d.ocr_correct}} corrects</td>
|
| 1689 |
+
<td style="padding:.3rem 1rem">
|
| 1690 |
+
<div style="display:flex;align-items:center;gap:.5rem">
|
| 1691 |
+
<div style="width:${{barW}}px;height:10px;border-radius:3px;background:${{color}}"></div>
|
| 1692 |
+
<span style="color:${{color}};font-weight:600">${{(sc*100).toFixed(0)}}%</span>
|
| 1693 |
+
</div>
|
| 1694 |
+
</td>
|
| 1695 |
+
</tr>`;
|
| 1696 |
+
}}).join('');
|
| 1697 |
+
|
| 1698 |
+
container.innerHTML = `
|
| 1699 |
+
<table style="border-collapse:collapse;font-size:.85rem">
|
| 1700 |
+
<thead><tr>
|
| 1701 |
+
<th style="padding:.3rem .6rem;text-align:left">Ligature</th>
|
| 1702 |
+
<th style="padding:.3rem .6rem;text-align:left">Unicode</th>
|
| 1703 |
+
<th style="padding:.3rem .6rem">GT</th>
|
| 1704 |
+
<th style="padding:.3rem .6rem">Corrects</th>
|
| 1705 |
+
<th style="padding:.3rem 1rem;text-align:left">Score</th>
|
| 1706 |
+
</tr></thead>
|
| 1707 |
+
<tbody>${{rows}}</tbody>
|
| 1708 |
+
</table>
|
| 1709 |
+
`;
|
| 1710 |
+
}}
|
| 1711 |
+
|
| 1712 |
+
function renderTaxonomyDetail(eng) {{
|
| 1713 |
+
const container = document.getElementById('taxonomy-detail');
|
| 1714 |
+
const tax = eng.aggregated_taxonomy;
|
| 1715 |
+
if (!tax || !tax.counts) {{
|
| 1716 |
+
container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée taxonomique disponible.</p>';
|
| 1717 |
+
return;
|
| 1718 |
+
}}
|
| 1719 |
+
|
| 1720 |
+
const classNames = {{
|
| 1721 |
+
visual_confusion: '1 — Confusion visuelle',
|
| 1722 |
+
diacritic_error: '2 — Erreur diacritique',
|
| 1723 |
+
case_error: '3 — Erreur de casse',
|
| 1724 |
+
ligature_error: '4 — Ligature',
|
| 1725 |
+
abbreviation_error: '5 — Abréviation',
|
| 1726 |
+
hapax: '6 — Hapax',
|
| 1727 |
+
segmentation_error: '7 — Segmentation',
|
| 1728 |
+
oov_character: '8 — Hors-vocabulaire',
|
| 1729 |
+
lacuna: '9 — Lacune',
|
| 1730 |
+
}};
|
| 1731 |
+
const total = tax.total_errors || 1;
|
| 1732 |
+
const maxCnt = Math.max(...Object.values(tax.counts));
|
| 1733 |
+
|
| 1734 |
+
const rows = Object.entries(tax.counts)
|
| 1735 |
+
.filter(([, cnt]) => cnt > 0)
|
| 1736 |
+
.sort((a,b) => b[1]-a[1])
|
| 1737 |
+
.map(([cls, cnt]) => {{
|
| 1738 |
+
const pctVal = (cnt / total * 100).toFixed(1);
|
| 1739 |
+
const barW = maxCnt > 0 ? Math.round(cnt/maxCnt * 200) : 0;
|
| 1740 |
+
return `<tr>
|
| 1741 |
+
<td style="padding:.3rem .6rem;font-size:.85rem">${{esc(classNames[cls] || cls)}}</td>
|
| 1742 |
+
<td style="padding:.3rem .6rem;text-align:right;font-variant-numeric:tabular-nums">${{cnt}}</td>
|
| 1743 |
+
<td style="padding:.3rem 1rem">
|
| 1744 |
+
<div style="display:flex;align-items:center;gap:.5rem">
|
| 1745 |
+
<div style="width:${{barW}}px;height:10px;border-radius:3px;background:#6366f1"></div>
|
| 1746 |
+
<span style="color:var(--text-muted);font-size:.8rem">${{pctVal}}%</span>
|
| 1747 |
+
</div>
|
| 1748 |
+
</td>
|
| 1749 |
+
</tr>`;
|
| 1750 |
+
}}).join('');
|
| 1751 |
+
|
| 1752 |
+
container.innerHTML = `
|
| 1753 |
+
<p style="font-size:.75rem;color:var(--text-muted);margin-bottom:.5rem">Total : <b>${{tax.total_errors}}</b> erreurs classifiées.</p>
|
| 1754 |
+
<table style="border-collapse:collapse;font-size:.85rem;min-width:400px">
|
| 1755 |
+
<thead><tr>
|
| 1756 |
+
<th style="padding:.3rem .6rem;text-align:left">Classe</th>
|
| 1757 |
+
<th style="padding:.3rem .6rem;text-align:right">N</th>
|
| 1758 |
+
<th style="padding:.3rem 1rem;text-align:left">Proportion</th>
|
| 1759 |
+
</tr></thead>
|
| 1760 |
+
<tbody>${{rows}}</tbody>
|
| 1761 |
+
</table>
|
| 1762 |
+
`;
|
| 1763 |
+
}}
|
| 1764 |
+
|
| 1765 |
// ── Init ────────────────────────────────────────────────────────
|
| 1766 |
function init() {{
|
| 1767 |
// Méta nav
|
picarones/web/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Interface web locale Picarones — FastAPI."""
|
picarones/web/app.py
ADDED
|
@@ -0,0 +1,1634 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Interface web locale Picarones — application FastAPI.
|
| 2 |
+
|
| 3 |
+
Lance avec :
|
| 4 |
+
picarones serve [--port 8000] [--host 127.0.0.1]
|
| 5 |
+
ou directement :
|
| 6 |
+
uvicorn picarones.web.app:app --reload --port 8000
|
| 7 |
+
|
| 8 |
+
Routes
|
| 9 |
+
------
|
| 10 |
+
GET / Page principale (SPA)
|
| 11 |
+
GET /api/status Version et état de l'application
|
| 12 |
+
GET /api/engines Statut des moteurs OCR et LLMs disponibles
|
| 13 |
+
GET /api/corpus/browse Parcourir les dossiers du serveur
|
| 14 |
+
GET /api/reports Liste des rapports générés
|
| 15 |
+
GET /api/normalization/profiles Profils de normalisation disponibles
|
| 16 |
+
POST /api/benchmark/start Lancer un benchmark (retourne job_id)
|
| 17 |
+
GET /api/benchmark/{job_id}/stream Stream SSE de progression
|
| 18 |
+
GET /api/benchmark/{job_id}/status Statut courant d'un job
|
| 19 |
+
POST /api/benchmark/{job_id}/cancel Annuler un job
|
| 20 |
+
GET /api/htr-united/catalogue Catalogue HTR-United
|
| 21 |
+
POST /api/htr-united/import Importer un corpus HTR-United
|
| 22 |
+
GET /api/huggingface/search Rechercher des datasets HuggingFace
|
| 23 |
+
POST /api/huggingface/import Importer un dataset HuggingFace
|
| 24 |
+
GET /reports/{filename} Accéder à un rapport HTML généré
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import asyncio
|
| 30 |
+
import json
|
| 31 |
+
import os
|
| 32 |
+
import threading
|
| 33 |
+
import time
|
| 34 |
+
import uuid
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
from datetime import datetime, timezone
|
| 37 |
+
from pathlib import Path
|
| 38 |
+
from typing import Any, AsyncIterator, Optional
|
| 39 |
+
|
| 40 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 41 |
+
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
|
| 42 |
+
from pydantic import BaseModel
|
| 43 |
+
|
| 44 |
+
from picarones import __version__
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# App initialization
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
|
| 50 |
+
# Single FastAPI application object; every route in this module is registered on it.
# The interactive API documentation is served under /api/docs (Swagger UI) and
# /api/redoc instead of FastAPI's default /docs and /redoc locations.
app = FastAPI(
    title="Picarones",
    description="Plateforme de comparaison de moteurs OCR/HTR pour documents patrimoniaux",
    version=__version__,
    docs_url="/api/docs",
    redoc_url="/api/redoc",
)
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Job management
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
@dataclass
class BenchmarkJob:
    """State and event log of one benchmark run.

    A job is mutated by the benchmark worker thread (which calls
    ``add_event``) and observed by SSE handlers running on the asyncio event
    loop (which call ``subscribe`` / ``unsubscribe`` and await the returned
    queue).  ``asyncio.Queue`` is not thread-safe, so events produced on
    another thread are handed to the subscriber's own loop with
    ``call_soon_threadsafe`` instead of being pushed directly.
    """

    job_id: str
    status: str = "pending"  # pending | running | complete | error | cancelled
    progress: float = 0.0  # 0.0 – 1.0
    current_engine: str = ""
    total_docs: int = 0
    processed_docs: int = 0
    output_path: str = ""
    error: str = ""
    started_at: Optional[str] = None
    finished_at: Optional[str] = None
    # Append-only log of every event emitted so far (replayed to late subscribers).
    events: list[dict] = field(default_factory=list)
    # Live subscriber queues, one per connected stream.
    _subscribers: list[asyncio.Queue] = field(default_factory=list)
    # Event loop that owns each subscriber queue, keyed by id(queue).  A queue
    # without an entry was created outside a running loop and is fed inline.
    _subscriber_loops: dict[int, Any] = field(
        default_factory=dict, repr=False, compare=False
    )

    @staticmethod
    def _offer(q: asyncio.Queue, event: dict) -> None:
        """Put *event* on *q*, silently dropping it when the queue is full."""
        try:
            q.put_nowait(event)
        except asyncio.QueueFull:
            # Best effort: a slow consumer loses events rather than blocking
            # the producer; the event is still kept in ``self.events``.
            pass

    def add_event(self, kind: str, data: Any) -> None:
        """Record an event and fan it out to every current subscriber.

        Safe to call from the event loop or from a worker thread.
        """
        event = {"kind": kind, "data": data, "ts": _iso_now()}
        self.events.append(event)

        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            current_loop = None  # called from a plain thread

        # Iterate over a snapshot: subscribers may come and go concurrently.
        for q in list(self._subscribers):
            owner = self._subscriber_loops.get(id(q))
            if owner is None or owner is current_loop:
                # Same loop (or no loop at all): direct delivery is safe.
                self._offer(q, event)
                continue
            try:
                owner.call_soon_threadsafe(self._offer, q, event)
            except RuntimeError:
                # The subscriber's loop is closed; nobody is listening anymore.
                pass

    def subscribe(self) -> asyncio.Queue:
        """Register and return a new bounded queue that receives future events."""
        q: asyncio.Queue = asyncio.Queue(maxsize=200)
        try:
            # Record the owning loop *before* publishing the queue so that a
            # concurrent add_event never sees the queue without its loop.
            self._subscriber_loops[id(q)] = asyncio.get_running_loop()
        except RuntimeError:
            pass  # no running loop: events will be delivered inline
        self._subscribers.append(q)
        return q

    def unsubscribe(self, q: asyncio.Queue) -> None:
        """Stop delivering events to *q*; unknown queues are ignored."""
        try:
            self._subscribers.remove(q)
        except ValueError:
            pass
        self._subscriber_loops.pop(id(q), None)

    def as_dict(self) -> dict:
        """Return the JSON-serialisable public state (events and subscribers excluded)."""
        return {
            "job_id": self.job_id,
            "status": self.status,
            "progress": self.progress,
            "current_engine": self.current_engine,
            "total_docs": self.total_docs,
            "processed_docs": self.processed_docs,
            "output_path": self.output_path,
            "error": self.error,
            "started_at": self.started_at,
            "finished_at": self.finished_at,
        }
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# In-memory registry of benchmark jobs keyed by job_id.  Entries are added by
# api_benchmark_start; nothing in the visible code ever removes them, so the
# registry lives (and grows) for the lifetime of the process.
_JOBS: dict[str, BenchmarkJob] = {}
|
| 113 |
+
|
| 114 |
+
# ---------------------------------------------------------------------------
|
| 115 |
+
# Pydantic models
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
|
| 118 |
+
class BenchmarkRequest(BaseModel):
    # Request body of POST /api/benchmark/start.

    # Corpus directory; api_benchmark_start rejects the request with HTTP 400
    # when it does not exist or is not a directory.
    corpus_path: str
    # Engine names, each resolved by picarones.cli._engine_from_name in the worker.
    engines: list[str] = ["tesseract"]
    # presumably one of the ids listed by /api/normalization/profiles — TODO confirm
    normalization_profile: str = "nfc"
    # Output directory for the report; created by the worker when missing.
    output_dir: str = "./rapports/"
    # Report base name; when empty the worker falls back to a timestamped "rapport_…" name.
    report_name: str = ""
    # Forwarded as ``lang`` to the engine factory.
    lang: str = "fra"
|
| 125 |
+
|
| 126 |
+
class HTRUnitedImportRequest(BaseModel):
    # Request body of POST /api/htr-united/import.

    # Catalogue entry identifier, looked up with HTRUnitedCatalogue.get_by_id.
    entry_id: str
    # Forwarded unchanged to import_htr_united_corpus.
    output_dir: str = "./corpus/"
    # Forwarded unchanged to import_htr_united_corpus.
    max_samples: int = 100
|
| 130 |
+
|
| 131 |
+
class HuggingFaceImportRequest(BaseModel):
    # Request body of POST /api/huggingface/import; every field is forwarded
    # unchanged to HuggingFaceImporter.import_dataset.

    dataset_id: str
    output_dir: str = "./corpus/"
    split: str = "train"
    max_samples: int = 100
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# ---------------------------------------------------------------------------
|
| 139 |
+
# API — status
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
|
| 142 |
+
@app.get("/api/status")
async def api_status() -> dict:
    # Liveness endpoint: application name, package version, a constant "ok"
    # status and the current timestamp as produced by _iso_now().
    payload = dict(app="Picarones", version=__version__, status="ok")
    payload["timestamp"] = _iso_now()
    return payload
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ---------------------------------------------------------------------------
|
| 153 |
+
# API — engines
|
| 154 |
+
# ---------------------------------------------------------------------------
|
| 155 |
+
|
| 156 |
+
@app.get("/api/engines")
async def api_engines() -> dict:
    # Report which OCR engines are importable and which LLM back-ends are usable.
    #
    # Returns {"engines": [...], "llms": [...]}: one entry per OCR engine (built
    # by _check_engine) and one per LLM provider.  Cloud providers count as
    # available when their API-key environment variable is set to a non-empty
    # value; Ollama counts as available when its local HTTP API answers.

    # (engine id, Python module to import, display label — "" lets
    # _check_engine derive the label from the id)
    ocr_specs = (
        ("tesseract", "pytesseract", ""),
        ("pero_ocr", "pero_ocr", "Pero OCR"),
        ("kraken", "kraken", "Kraken"),
        ("calamari", "calamari_ocr", "Calamari"),
    )
    engines = [
        _check_engine(engine_id, module_name, label=label)
        for engine_id, module_name, label in ocr_specs
    ]

    # (provider id, display label, environment variable holding the API key)
    cloud_providers = (
        ("openai", "OpenAI (GPT-4o, GPT-4o mini)", "OPENAI_API_KEY"),
        ("anthropic", "Anthropic (Claude Sonnet, Haiku)", "ANTHROPIC_API_KEY"),
        ("mistral", "Mistral (Mistral OCR, Pixtral, Large)", "MISTRAL_API_KEY"),
    )
    llms = []
    for provider_id, label, key_env in cloud_providers:
        has_key = bool(os.environ.get(key_env))  # read once, used twice below
        llms.append({
            "id": provider_id,
            "label": label,
            "type": "llm",
            "available": has_key,
            "key_env": key_env,
            "status": "configured" if has_key else "missing_key",
        })

    # The Ollama probes are blocking HTTP calls with a 2 s timeout each; run
    # them in a worker thread so the event loop keeps serving other requests.
    ollama_available = await asyncio.to_thread(_check_ollama)
    ollama_models = (
        (await asyncio.to_thread(_list_ollama_models)) if ollama_available else []
    )
    llms.append({
        "id": "ollama",
        "label": "Ollama (Llama 3, Gemma, Phi — local)",
        "type": "llm_local",
        "available": ollama_available,
        "status": "running" if ollama_available else "not_running",
        "models": ollama_models,
        "base_url": "http://localhost:11434",
    })

    return {"engines": engines, "llms": llms}
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _check_engine(engine_id: str, module_name: str, label: str = "") -> dict:
|
| 225 |
+
label = label or engine_id.replace("_", " ").title()
|
| 226 |
+
try:
|
| 227 |
+
__import__(module_name)
|
| 228 |
+
installed = True
|
| 229 |
+
except ImportError:
|
| 230 |
+
installed = False
|
| 231 |
+
|
| 232 |
+
version = ""
|
| 233 |
+
if installed and engine_id == "tesseract":
|
| 234 |
+
try:
|
| 235 |
+
import pytesseract
|
| 236 |
+
version = pytesseract.get_tesseract_version()
|
| 237 |
+
version = str(version)
|
| 238 |
+
except Exception:
|
| 239 |
+
version = "installé"
|
| 240 |
+
elif installed:
|
| 241 |
+
try:
|
| 242 |
+
mod = __import__(module_name)
|
| 243 |
+
version = getattr(mod, "__version__", "installé")
|
| 244 |
+
except Exception:
|
| 245 |
+
version = "installé"
|
| 246 |
+
|
| 247 |
+
return {
|
| 248 |
+
"id": engine_id,
|
| 249 |
+
"label": label,
|
| 250 |
+
"type": "ocr",
|
| 251 |
+
"available": installed,
|
| 252 |
+
"version": version,
|
| 253 |
+
"status": "available" if installed else "not_installed",
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def _check_ollama() -> bool:
|
| 258 |
+
import urllib.error, urllib.request
|
| 259 |
+
try:
|
| 260 |
+
with urllib.request.urlopen("http://localhost:11434/api/tags", timeout=2) as r:
|
| 261 |
+
return r.status == 200
|
| 262 |
+
except Exception:
|
| 263 |
+
return False
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def _list_ollama_models() -> list[str]:
|
| 267 |
+
import urllib.error, urllib.request
|
| 268 |
+
try:
|
| 269 |
+
with urllib.request.urlopen("http://localhost:11434/api/tags", timeout=2) as r:
|
| 270 |
+
data = json.loads(r.read().decode())
|
| 271 |
+
return [m.get("name", "") for m in data.get("models", [])]
|
| 272 |
+
except Exception:
|
| 273 |
+
return []
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
# ---------------------------------------------------------------------------
|
| 277 |
+
# API — corpus browse
|
| 278 |
+
# ---------------------------------------------------------------------------
|
| 279 |
+
|
| 280 |
+
@app.get("/api/corpus/browse")
async def api_corpus_browse(path: str = Query(default=".", description="Chemin à explorer")) -> dict:
    # List the content of a server-side directory for the corpus picker.
    #
    # Responds 404 when `path` is not an existing directory and 403 when the
    # directory itself cannot be listed.  Every entry carries name / path /
    # is_dir; directories additionally carry `gt_count` (number of "*.gt.txt"
    # files directly inside) and `has_corpus` (gt_count > 0).
    target = Path(path).resolve()
    if not target.is_dir():
        raise HTTPException(status_code=404, detail=f"Dossier non trouvé : {path}")

    def count_gt_files(directory: Path) -> int:
        # An unreadable or vanished sub-directory must not fail the whole
        # listing: report it as containing no ground-truth files instead.
        try:
            return sum(
                1
                for f in directory.iterdir()
                if f.suffix == ".txt" and f.stem.endswith(".gt")
            )
        except OSError:
            return 0

    items = []
    try:
        for entry in sorted(target.iterdir()):
            is_dir = entry.is_dir()
            item: dict[str, Any] = {
                "name": entry.name,
                "path": str(entry),
                "is_dir": is_dir,
            }
            if is_dir:
                gt_count = count_gt_files(entry)
                item["gt_count"] = gt_count
                item["has_corpus"] = gt_count > 0
            items.append(item)
    except PermissionError as exc:
        raise HTTPException(status_code=403, detail=str(exc)) from exc

    return {
        "current_path": str(target),
        # The filesystem root is its own parent: report "no parent" there.
        "parent_path": str(target.parent) if target.parent != target else None,
        "items": items,
    }
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
# ---------------------------------------------------------------------------
|
| 311 |
+
# API — normalization profiles
|
| 312 |
+
# ---------------------------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
@app.get("/api/normalization/profiles")
async def api_normalization_profiles() -> dict:
    # Describe the built-in normalization profiles.  A profile whose lookup or
    # description fails for any reason is left out of the answer.
    from picarones.core.normalization import get_builtin_profile

    def describe(profile_id: str) -> dict:
        profile = get_builtin_profile(profile_id)
        return {
            "id": profile_id,
            "name": profile.name,
            "description": profile.description or profile.name,
            "caseless": profile.caseless,
            "diplomatic_rules": len(profile.diplomatic_table),
        }

    known_ids = (
        "nfc",
        "caseless",
        "minimal",
        "medieval_french",
        "early_modern_french",
        "medieval_latin",
    )

    described = []
    for profile_id in known_ids:
        try:
            summary = describe(profile_id)
        except Exception:
            continue
        described.append(summary)

    return {"profiles": described}
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# ---------------------------------------------------------------------------
|
| 345 |
+
# API — reports
|
| 346 |
+
# ---------------------------------------------------------------------------
|
| 347 |
+
|
| 348 |
+
@app.get("/api/reports")
async def api_reports(reports_dir: str = Query(default=".", description="Dossier rapports")) -> dict:
    # List the HTML reports found in `reports_dir`, the current working
    # directory and ./rapports (in that order, each path reported once).
    # Within one directory the reports are ordered newest first.
    search_dirs = [
        Path(reports_dir).resolve(),
        Path(".").resolve(),
        Path("./rapports").resolve(),
    ]

    reports = []
    seen: set[str] = set()

    for directory in search_dirs:
        if not directory.is_dir():
            continue

        found = []
        for f in directory.glob("*.html"):
            key = str(f)
            if key in seen:
                continue
            try:
                # Single stat per file: reused for sorting, size and mtime.
                stat = f.stat()
            except OSError:
                # The file vanished (or became unreadable) after the glob.
                continue
            seen.add(key)
            found.append((f, stat))

        found.sort(key=lambda pair: pair[1].st_mtime, reverse=True)

        for f, stat in found:
            reports.append({
                "filename": f.name,
                "path": str(f),
                "size_kb": round(stat.st_size / 1024, 1),
                "modified": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
                # NOTE(review): /reports/{filename} only serves files from "."
                # and "./rapports"; a report found only in a custom
                # `reports_dir` gets a URL that will answer 404 — confirm.
                "url": f"/reports/{f.name}",
            })

    return {"reports": reports}
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
@app.get("/reports/{filename}")
async def serve_report(filename: str) -> FileResponse:
    # Serve a generated HTML report from the current directory or ./rapports.
    # Responds 404 when no such report exists.
    #
    # `filename` comes straight from the URL.  Accept it only when it is a
    # bare file name: anything that the platform's path rules split into
    # several components (e.g. "..\\x.html" on Windows) could escape the
    # report directories and is treated as not found.
    if Path(filename).name == filename:
        for d in [Path("."), Path("./rapports")]:
            f = d / filename
            # is_file(): a directory that happens to be named "*.html" is not a report.
            if f.suffix == ".html" and f.is_file():
                return FileResponse(str(f.resolve()), media_type="text/html")
    raise HTTPException(status_code=404, detail=f"Rapport non trouvé : {filename}")
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
# ---------------------------------------------------------------------------
|
| 385 |
+
# API — HTR-United
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
@app.get("/api/htr-united/catalogue")
async def api_htr_united_catalogue(
    query: str = Query(default="", description="Recherche textuelle"),
    language: str = Query(default="", description="Filtre langue"),
    script: str = Query(default="", description="Filtre type d'écriture"),
) -> dict:
    # Search the demo HTR-United catalogue.  Empty `language` / `script`
    # filters are passed on as None.
    from picarones.importers.htr_united import HTRUnitedCatalogue

    catalogue = HTRUnitedCatalogue.from_demo()

    language_filter = language if language else None
    script_filter = script if script else None
    matches = catalogue.search(
        query=query,
        language=language_filter,
        script=script_filter,
    )

    response = {"source": catalogue.source, "total": len(matches)}
    response["entries"] = [match.as_dict() for match in matches]
    response["available_languages"] = catalogue.available_languages()
    response["available_scripts"] = catalogue.available_scripts()
    return response
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
@app.post("/api/htr-united/import")
async def api_htr_united_import(req: HTRUnitedImportRequest) -> dict:
    # Import one entry of the demo HTR-United catalogue; 404 when the entry id
    # is unknown.  The importer's result is returned unchanged.
    # NOTE(review): import_htr_united_corpus is called synchronously inside
    # this coroutine; if it performs network or disk I/O it blocks the event
    # loop for the duration of the import — confirm.
    from picarones.importers.htr_united import HTRUnitedCatalogue, import_htr_united_corpus

    selected = HTRUnitedCatalogue.from_demo().get_by_id(req.entry_id)
    if selected:
        return import_htr_united_corpus(
            entry=selected,
            output_dir=req.output_dir,
            max_samples=req.max_samples,
        )
    raise HTTPException(status_code=404, detail=f"Entrée non trouvée : {req.entry_id}")
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
# ---------------------------------------------------------------------------
|
| 429 |
+
# API — HuggingFace
|
| 430 |
+
# ---------------------------------------------------------------------------
|
| 431 |
+
|
| 432 |
+
@app.get("/api/huggingface/search")
async def api_huggingface_search(
    query: str = Query(default="", description="Requête de recherche"),
    language: str = Query(default="", description="Filtre langue"),
    tags: str = Query(default="", description="Tags séparés par des virgules"),
    limit: int = Query(default=20, ge=1, le=50),
) -> dict:
    # Search datasets through HuggingFaceImporter.search.  `tags` is a
    # comma-separated list: blank items are dropped, an empty parameter means
    # "no tag filter" (None).  An empty `language` is passed on as None.
    from picarones.importers.huggingface import HuggingFaceImporter

    if tags:
        stripped = (part.strip() for part in tags.split(","))
        tag_list = [tag for tag in stripped if tag]
    else:
        tag_list = None

    found = HuggingFaceImporter().search(
        query=query,
        tags=tag_list,
        language=language if language else None,
        limit=limit,
    )
    serialised = [dataset.as_dict() for dataset in found]
    return {"total": len(found), "datasets": serialised}
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
@app.post("/api/huggingface/import")
async def api_huggingface_import(req: HuggingFaceImportRequest) -> dict:
    # Forward the request fields to HuggingFaceImporter.import_dataset and
    # return its result unchanged.
    # NOTE(review): the import runs synchronously inside this coroutine; if it
    # downloads data it blocks the event loop meanwhile — confirm.
    from picarones.importers.huggingface import HuggingFaceImporter

    options = {
        "dataset_id": req.dataset_id,
        "output_dir": req.output_dir,
        "split": req.split,
        "max_samples": req.max_samples,
    }
    return HuggingFaceImporter().import_dataset(**options)
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
# ---------------------------------------------------------------------------
|
| 470 |
+
# API — benchmark
|
| 471 |
+
# ---------------------------------------------------------------------------
|
| 472 |
+
|
| 473 |
+
@app.post("/api/benchmark/start")
async def api_benchmark_start(req: BenchmarkRequest) -> dict:
    # Register a new job and run the benchmark in a daemon thread.  Responds
    # 400 when the corpus directory is missing; otherwise returns at once with
    # the new job id, while progress is published through the job's events.
    corpus_dir = Path(req.corpus_path)
    if not (corpus_dir.exists() and corpus_dir.is_dir()):
        raise HTTPException(status_code=400, detail=f"Corpus non trouvé : {req.corpus_path}")

    new_id = str(uuid.uuid4())
    new_job = BenchmarkJob(job_id=new_id)
    _JOBS[new_id] = new_job

    # The benchmark runs outside the event loop, in its own thread.
    worker = threading.Thread(target=_run_benchmark_thread, args=(new_job, req), daemon=True)
    worker.start()

    return {"job_id": new_id, "status": "pending"}
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
@app.get("/api/benchmark/{job_id}/status")
async def api_benchmark_status(job_id: str) -> dict:
    # Snapshot of a job's public state (BenchmarkJob.as_dict); 404 for an unknown id.
    found = _JOBS.get(job_id)
    if found:
        return found.as_dict()
    raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@app.post("/api/benchmark/{job_id}/cancel")
async def api_benchmark_cancel(job_id: str) -> dict:
    # Mark a job as cancelled and notify its subscribers; 404 for an unknown id.
    #
    # The call is idempotent: a job that already reached a terminal state
    # (complete, error or cancelled) is left untouched, so repeated cancel
    # requests do not append duplicate "cancelled" events to the job log.
    job = _JOBS.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")
    if job.status in ("complete", "error", "cancelled"):
        return {"job_id": job_id, "status": job.status, "message": "Job déjà terminé."}
    job.status = "cancelled"
    job.add_event("cancelled", {"message": "Benchmark annulé par l'utilisateur."})
    return {"job_id": job_id, "status": "cancelled"}
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
@app.get("/api/benchmark/{job_id}/stream")
async def api_benchmark_stream(job_id: str) -> StreamingResponse:
    # Server-Sent Events stream of a job's progress; 404 for an unknown id.
    #
    # The stream first replays the events already recorded, then forwards live
    # events until a terminal one ("complete", "error", "cancelled" or "done")
    # is seen.  A comment line is sent every 30 s of silence as keepalive.
    job = _JOBS.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")

    terminal_statuses = ("complete", "error", "cancelled")
    terminal_kinds = terminal_statuses + ("done",)

    async def event_generator() -> AsyncIterator[str]:
        # Subscribe BEFORE taking the backlog snapshot: the worker thread can
        # emit at any moment, and an event produced between a snapshot and a
        # later subscription would otherwise never reach this client.
        queue = job.subscribe()
        try:
            backlog = list(job.events)
            # add_event stores the very same dict object in job.events and in
            # the subscriber queues, so object identity tells us which queued
            # events were already replayed from the backlog.
            replayed = {id(event) for event in backlog}

            for event in backlog:
                yield _sse_format(event["kind"], event["data"])

            if job.status in terminal_statuses:
                # The job is over: flush whatever was logged after the
                # snapshot, then close the stream.
                for event in job.events[len(backlog):]:
                    yield _sse_format(event["kind"], event["data"])
                yield _sse_format("done", {"status": job.status})
                return

            while True:
                try:
                    event = await asyncio.wait_for(queue.get(), timeout=30.0)
                except asyncio.TimeoutError:
                    # Keepalive
                    yield ": keepalive\n\n"
                    if job.status in terminal_statuses:
                        yield _sse_format("done", {"status": job.status})
                        break
                    continue

                if id(event) in replayed:
                    continue  # already sent as part of the backlog

                yield _sse_format(event["kind"], event["data"])
                if event["kind"] in terminal_kinds:
                    break
        finally:
            job.unsubscribe(queue)

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            # Ask reverse proxies (nginx) not to buffer the stream.
            "X-Accel-Buffering": "no",
        },
    )
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
def _sse_format(event_type: str, data: Any) -> str:
    """Build one Server-Sent Events frame.

    Args:
        event_type: Value written to the ``event:`` field.
        data: JSON-serializable payload written to the ``data:`` field;
            non-ASCII characters are kept as-is.

    Returns:
        The frame text, terminated by the blank line that ends an SSE event.
    """
    body = json.dumps(data, ensure_ascii=False)
    # The two trailing empty items produce the closing "\n\n".
    frame_lines = (f"event: {event_type}", f"data: {body}", "", "")
    return "\n".join(frame_lines)
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
    """Run a benchmark in a worker thread and report it through job events.

    Steps: load the corpus, instantiate the requested engines (engines that
    fail to load are skipped with a ``warning`` event), run the benchmark,
    write the JSON results and the HTML report, then emit a ``complete``
    event carrying the ranking. Any exception is turned into an ``error``
    event and recorded on the job.

    Cancellation is cooperative: the job status is checked between stages,
    and when it is ``"cancelled"`` the worker stops and stamps
    ``job.finished_at``.

    Args:
        job: Job record updated in place (status, progress, events, paths).
        req: Benchmark parameters (corpus path, engines, language, output
            directory, optional report name).
    """
    job.status = "running"
    job.started_at = _iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})

    try:
        # Project modules are imported lazily, inside the worker.
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        # Load the corpus.
        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})

        if job.status == "cancelled":
            # Record when the worker actually stopped.
            job.finished_at = _iso_now()
            return

        # Instantiate the engines; one failing engine must not abort the run.
        from picarones.cli import _engine_from_name

        ocr_engines = []
        for engine_name in req.engines:
            try:
                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
                ocr_engines.append(eng)
                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})
            except Exception as exc:  # deliberate best-effort per engine
                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})

        if not ocr_engines:
            raise ValueError("Aucun moteur valide disponible.")

        # Output directory and file names.
        output_dir = Path(req.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(output_dir / f"{report_name}.json")
        output_html = str(output_dir / f"{report_name}.html")

        # One step per (engine, document) pair.
        total_steps = job.total_docs * len(ocr_engines)
        steps_done = 0

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            """Update the job's progress fields and emit a ``progress`` event."""
            nonlocal steps_done
            if job.status == "cancelled":
                return
            steps_done += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) avoids dividing by zero on an empty corpus.
            job.progress = steps_done / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": steps_done,
                "total": total_steps,
            })

        # Run the benchmark.
        result = run_benchmark(
            corpus=corpus,
            engines=ocr_engines,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
        )

        if job.status == "cancelled":
            job.finished_at = _iso_now()
            return

        # Generate the HTML report.
        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator
        gen = ReportGenerator(result)
        gen.generate(output_html)

        # Compute the ranking before marking the job complete, so a failure
        # here is reported as an error rather than flipping a finished job.
        ranking = result.ranking()

        job.output_path = output_html
        job.progress = 1.0
        job.status = "complete"
        job.finished_at = _iso_now()

        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": ranking,
        })

    except Exception as exc:
        # Top-level boundary of the worker thread: surface the failure on the
        # job instead of letting the thread die silently.
        job.status = "error"
        job.error = str(exc)
        job.finished_at = _iso_now()
        job.add_event("error", {"message": f"Erreur : {exc}"})
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
# ---------------------------------------------------------------------------
|
| 669 |
+
# Page principale HTML (SPA)
|
| 670 |
+
# ---------------------------------------------------------------------------
|
| 671 |
+
|
| 672 |
+
@app.get("/", response_class=HTMLResponse)
async def index() -> HTMLResponse:
    """Serve the single-page application stored in ``_HTML_TEMPLATE``."""
    page = HTMLResponse(content=_HTML_TEMPLATE)
    return page
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
# ---------------------------------------------------------------------------
|
| 678 |
+
# Helper
|
| 679 |
+
# ---------------------------------------------------------------------------
|
| 680 |
+
|
| 681 |
+
def _iso_now() -> str:
    """Return the current UTC time as an ISO 8601 string, to the second."""
    current = datetime.now(timezone.utc)
    return current.isoformat(timespec="seconds")
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
# ---------------------------------------------------------------------------
|
| 686 |
+
# HTML Template (SPA, French/English, Vanilla JS)
|
| 687 |
+
# ---------------------------------------------------------------------------
|
| 688 |
+
|
| 689 |
+
_HTML_TEMPLATE = r"""<!DOCTYPE html>
|
| 690 |
+
<html lang="fr">
|
| 691 |
+
<head>
|
| 692 |
+
<meta charset="UTF-8">
|
| 693 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 694 |
+
<title>Picarones — OCR Benchmark</title>
|
| 695 |
+
<style>
|
| 696 |
+
:root {
|
| 697 |
+
--bg: #f8f7f4;
|
| 698 |
+
--bg2: #ffffff;
|
| 699 |
+
--border: #d8d5ce;
|
| 700 |
+
--accent: #2d5a9e;
|
| 701 |
+
--accent-hover: #1e4080;
|
| 702 |
+
--success: #2a7a3b;
|
| 703 |
+
--warning: #c17b00;
|
| 704 |
+
--danger: #c0392b;
|
| 705 |
+
--text: #2c2c2c;
|
| 706 |
+
--text-muted: #6b6b6b;
|
| 707 |
+
--radius: 6px;
|
| 708 |
+
--shadow: 0 1px 4px rgba(0,0,0,0.1);
|
| 709 |
+
}
|
| 710 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 711 |
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; background: var(--bg); color: var(--text); font-size: 14px; line-height: 1.5; }
|
| 712 |
+
a { color: var(--accent); text-decoration: none; }
|
| 713 |
+
a:hover { text-decoration: underline; }
|
| 714 |
+
|
| 715 |
+
/* Layout */
|
| 716 |
+
#header { background: var(--accent); color: #fff; padding: 0 24px; display: flex; align-items: center; height: 52px; gap: 24px; position: sticky; top: 0; z-index: 100; }
|
| 717 |
+
#header h1 { font-size: 18px; font-weight: 600; letter-spacing: -0.3px; }
|
| 718 |
+
#header span.version { font-size: 11px; opacity: 0.7; margin-left: 4px; }
|
| 719 |
+
#nav { display: flex; gap: 4px; margin-left: auto; }
|
| 720 |
+
.nav-btn { background: transparent; border: 1px solid rgba(255,255,255,0.3); color: #fff; padding: 5px 12px; border-radius: var(--radius); cursor: pointer; font-size: 13px; transition: background 0.15s; }
|
| 721 |
+
.nav-btn:hover, .nav-btn.active { background: rgba(255,255,255,0.18); }
|
| 722 |
+
#lang-btn { margin-left: 12px; font-size: 12px; background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.3); color: #fff; padding: 4px 10px; border-radius: var(--radius); cursor: pointer; }
|
| 723 |
+
|
| 724 |
+
#main { max-width: 1100px; margin: 0 auto; padding: 24px 16px; }
|
| 725 |
+
.view { display: none; }
|
| 726 |
+
.view.active { display: block; }
|
| 727 |
+
|
| 728 |
+
/* Cards */
|
| 729 |
+
.card { background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); padding: 20px; margin-bottom: 16px; box-shadow: var(--shadow); }
|
| 730 |
+
.card h2 { font-size: 15px; font-weight: 600; margin-bottom: 14px; padding-bottom: 8px; border-bottom: 1px solid var(--border); color: var(--accent); }
|
| 731 |
+
.card h3 { font-size: 13px; font-weight: 600; margin-bottom: 10px; color: var(--text); }
|
| 732 |
+
|
| 733 |
+
/* Forms */
|
| 734 |
+
.form-row { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 12px; align-items: flex-start; }
|
| 735 |
+
.form-group { display: flex; flex-direction: column; gap: 4px; flex: 1; min-width: 160px; }
|
| 736 |
+
label { font-size: 12px; font-weight: 500; color: var(--text-muted); }
|
| 737 |
+
input[type=text], input[type=number], select { padding: 7px 10px; border: 1px solid var(--border); border-radius: var(--radius); font-size: 13px; color: var(--text); background: #fff; width: 100%; }
|
| 738 |
+
input:focus, select:focus { outline: 2px solid var(--accent); outline-offset: -1px; }
|
| 739 |
+
.path-input-row { display: flex; gap: 8px; }
|
| 740 |
+
.path-input-row input { flex: 1; }
|
| 741 |
+
.btn { padding: 7px 16px; border: none; border-radius: var(--radius); cursor: pointer; font-size: 13px; font-weight: 500; transition: background 0.15s; display: inline-flex; align-items: center; gap: 6px; }
|
| 742 |
+
.btn-primary { background: var(--accent); color: #fff; }
|
| 743 |
+
.btn-primary:hover { background: var(--accent-hover); }
|
| 744 |
+
.btn-secondary { background: #e8e5de; color: var(--text); }
|
| 745 |
+
.btn-secondary:hover { background: #d8d5ce; }
|
| 746 |
+
.btn-danger { background: var(--danger); color: #fff; }
|
| 747 |
+
.btn-sm { padding: 4px 10px; font-size: 12px; }
|
| 748 |
+
|
| 749 |
+
/* Checkboxes list */
|
| 750 |
+
.checkbox-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 8px; }
|
| 751 |
+
.checkbox-item { display: flex; align-items: center; gap: 8px; padding: 8px 10px; border: 1px solid var(--border); border-radius: var(--radius); cursor: pointer; transition: border-color 0.1s; }
|
| 752 |
+
.checkbox-item:hover { border-color: var(--accent); }
|
| 753 |
+
.checkbox-item input { cursor: pointer; }
|
| 754 |
+
.checkbox-item.checked { border-color: var(--accent); background: #eef2fc; }
|
| 755 |
+
.engine-status { width: 8px; height: 8px; border-radius: 50%; display: inline-block; flex-shrink: 0; }
|
| 756 |
+
.status-ok { background: var(--success); }
|
| 757 |
+
.status-warn { background: var(--warning); }
|
| 758 |
+
.status-err { background: var(--danger); }
|
| 759 |
+
|
| 760 |
+
/* Progress */
|
| 761 |
+
.progress-bar-outer { height: 10px; background: #e0ddd5; border-radius: 5px; overflow: hidden; margin: 4px 0; }
|
| 762 |
+
.progress-bar-inner { height: 100%; background: var(--accent); border-radius: 5px; transition: width 0.3s; }
|
| 763 |
+
.log-box { background: #1a1a2e; color: #c8d8f8; font-family: monospace; font-size: 12px; padding: 12px; border-radius: var(--radius); max-height: 260px; overflow-y: auto; white-space: pre-wrap; line-height: 1.6; }
|
| 764 |
+
.log-box .log-warn { color: #f0c060; }
|
| 765 |
+
.log-box .log-error { color: #ff6b6b; }
|
| 766 |
+
.log-box .log-success { color: #6bf08a; }
|
| 767 |
+
|
| 768 |
+
/* Tables */
|
| 769 |
+
table { width: 100%; border-collapse: collapse; font-size: 13px; }
|
| 770 |
+
th { text-align: left; padding: 8px 10px; border-bottom: 2px solid var(--border); color: var(--text-muted); font-weight: 600; font-size: 12px; }
|
| 771 |
+
td { padding: 8px 10px; border-bottom: 1px solid var(--border); }
|
| 772 |
+
tr:last-child td { border-bottom: none; }
|
| 773 |
+
tr:hover td { background: #f0ede6; }
|
| 774 |
+
.badge { padding: 2px 7px; border-radius: 10px; font-size: 11px; font-weight: 500; }
|
| 775 |
+
.badge-ok { background: #d4edda; color: var(--success); }
|
| 776 |
+
.badge-warn { background: #fff3cd; color: var(--warning); }
|
| 777 |
+
.badge-err { background: #fde8e8; color: var(--danger); }
|
| 778 |
+
|
| 779 |
+
/* File browser */
|
| 780 |
+
#file-browser { border: 1px solid var(--border); border-radius: var(--radius); max-height: 300px; overflow-y: auto; }
|
| 781 |
+
.fb-item { display: flex; align-items: center; gap: 8px; padding: 8px 12px; cursor: pointer; border-bottom: 1px solid var(--border); }
|
| 782 |
+
.fb-item:last-child { border-bottom: none; }
|
| 783 |
+
.fb-item:hover { background: #f0ede6; }
|
| 784 |
+
.fb-icon { font-size: 16px; flex-shrink: 0; }
|
| 785 |
+
.fb-name { flex: 1; font-size: 13px; }
|
| 786 |
+
.fb-badge { font-size: 11px; color: var(--text-muted); }
|
| 787 |
+
.fb-path { font-size: 12px; color: var(--text-muted); padding: 6px 12px; background: #f4f2ed; border-bottom: 1px solid var(--border); font-family: monospace; }
|
| 788 |
+
|
| 789 |
+
/* Notifications */
|
| 790 |
+
.alert { padding: 10px 14px; border-radius: var(--radius); margin-bottom: 12px; font-size: 13px; }
|
| 791 |
+
.alert-success { background: #d4edda; color: var(--success); border: 1px solid #b8dfc4; }
|
| 792 |
+
.alert-error { background: #fde8e8; color: var(--danger); border: 1px solid #f5c6cb; }
|
| 793 |
+
.alert-info { background: #d0e4f7; color: #1a568c; border: 1px solid #b8d4ef; }
|
| 794 |
+
|
| 795 |
+
/* Dataset cards */
|
| 796 |
+
.ds-grid { display: grid; gap: 10px; }
|
| 797 |
+
.ds-card { border: 1px solid var(--border); border-radius: var(--radius); padding: 12px; background: #fff; }
|
| 798 |
+
.ds-card h4 { font-size: 13px; font-weight: 600; margin-bottom: 4px; }
|
| 799 |
+
.ds-card p { font-size: 12px; color: var(--text-muted); margin-bottom: 6px; }
|
| 800 |
+
.ds-meta { display: flex; gap: 8px; flex-wrap: wrap; }
|
| 801 |
+
.ds-tag { font-size: 11px; background: #eef2fc; color: var(--accent); padding: 2px 7px; border-radius: 10px; }
|
| 802 |
+
|
| 803 |
+
/* Spinner */
|
| 804 |
+
.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid #ccc; border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; }
|
| 805 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 806 |
+
</style>
|
| 807 |
+
</head>
|
| 808 |
+
<body>
|
| 809 |
+
|
| 810 |
+
<div id="header">
|
| 811 |
+
<h1 data-i18n="app_title">Picarones <span class="version" id="app-version"></span></h1>
|
| 812 |
+
<nav id="nav">
|
| 813 |
+
<button class="nav-btn active" onclick="showView('benchmark')" data-i18n="nav_benchmark">Benchmark</button>
|
| 814 |
+
<button class="nav-btn" onclick="showView('reports')" data-i18n="nav_reports">Rapports</button>
|
| 815 |
+
<button class="nav-btn" onclick="showView('engines')" data-i18n="nav_engines">Moteurs</button>
|
| 816 |
+
<button class="nav-btn" onclick="showView('import')" data-i18n="nav_import">Import</button>
|
| 817 |
+
</nav>
|
| 818 |
+
<button id="lang-btn" onclick="toggleLang()">EN</button>
|
| 819 |
+
</div>
|
| 820 |
+
|
| 821 |
+
<div id="main">
|
| 822 |
+
|
| 823 |
+
<!-- ===== VUE BENCHMARK ===== -->
|
| 824 |
+
<div id="view-benchmark" class="view active">
|
| 825 |
+
|
| 826 |
+
<div class="card">
|
| 827 |
+
<h2 data-i18n="bench_corpus_title">1. Corpus</h2>
|
| 828 |
+
<div class="form-group">
|
| 829 |
+
<label data-i18n="bench_corpus_label">Chemin vers le dossier corpus (paires image/.gt.txt)</label>
|
| 830 |
+
<div class="path-input-row">
|
| 831 |
+
<input type="text" id="corpus-path" placeholder="./corpus/" value="" />
|
| 832 |
+
<button class="btn btn-secondary btn-sm" onclick="openFileBrowser()" data-i18n="bench_browse">Parcourir</button>
|
| 833 |
+
</div>
|
| 834 |
+
</div>
|
| 835 |
+
<div id="file-browser-container" style="display:none; margin-top:10px;">
|
| 836 |
+
<div class="fb-path" id="fb-current-path">.</div>
|
| 837 |
+
<div id="file-browser"></div>
|
| 838 |
+
</div>
|
| 839 |
+
<div id="corpus-info" style="margin-top:8px; font-size:12px; color: var(--text-muted);"></div>
|
| 840 |
+
</div>
|
| 841 |
+
|
| 842 |
+
<div class="card">
|
| 843 |
+
<h2 data-i18n="bench_engines_title">2. Moteurs et pipelines</h2>
|
| 844 |
+
<div id="engine-checkboxes" class="checkbox-grid">
|
| 845 |
+
<div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
|
| 846 |
+
</div>
|
| 847 |
+
</div>
|
| 848 |
+
|
| 849 |
+
<div class="card">
|
| 850 |
+
<h2 data-i18n="bench_options_title">3. Options</h2>
|
| 851 |
+
<div class="form-row">
|
| 852 |
+
<div class="form-group">
|
| 853 |
+
<label data-i18n="bench_norm_label">Profil de normalisation</label>
|
| 854 |
+
<select id="norm-profile">
|
| 855 |
+
<option value="nfc">NFC (standard)</option>
|
| 856 |
+
</select>
|
| 857 |
+
</div>
|
| 858 |
+
<div class="form-group">
|
| 859 |
+
<label data-i18n="bench_lang_label">Langue (Tesseract)</label>
|
| 860 |
+
<input type="text" id="bench-lang" value="fra" placeholder="fra" />
|
| 861 |
+
</div>
|
| 862 |
+
<div class="form-group">
|
| 863 |
+
<label data-i18n="bench_output_label">Dossier de sortie</label>
|
| 864 |
+
<input type="text" id="output-dir" value="./rapports/" />
|
| 865 |
+
</div>
|
| 866 |
+
<div class="form-group">
|
| 867 |
+
<label data-i18n="bench_name_label">Nom du rapport (optionnel)</label>
|
| 868 |
+
<input type="text" id="report-name" placeholder="rapport_2024_01_15" />
|
| 869 |
+
</div>
|
| 870 |
+
</div>
|
| 871 |
+
</div>
|
| 872 |
+
|
| 873 |
+
<div style="display:flex; gap:10px; align-items:center; margin-bottom:16px;">
|
| 874 |
+
<button class="btn btn-primary" id="start-btn" onclick="startBenchmark()" data-i18n="bench_start">▶ Lancer le benchmark</button>
|
| 875 |
+
<button class="btn btn-secondary" id="cancel-btn" style="display:none;" onclick="cancelBenchmark()" data-i18n="bench_cancel">✕ Annuler</button>
|
| 876 |
+
<span id="bench-status-text" style="font-size:12px; color: var(--text-muted);"></span>
|
| 877 |
+
</div>
|
| 878 |
+
|
| 879 |
+
<div id="bench-progress-section" style="display:none;">
|
| 880 |
+
<div class="card">
|
| 881 |
+
<h2 data-i18n="bench_progress_title">Progression</h2>
|
| 882 |
+
<div id="engine-progress-list"></div>
|
| 883 |
+
<div style="margin-top: 12px;">
|
| 884 |
+
<label style="font-size:12px; color: var(--text-muted); display:block; margin-bottom:4px;" data-i18n="bench_log">Journal</label>
|
| 885 |
+
<div class="log-box" id="bench-log"></div>
|
| 886 |
+
</div>
|
| 887 |
+
</div>
|
| 888 |
+
</div>
|
| 889 |
+
|
| 890 |
+
<div id="bench-result-section" style="display:none;">
|
| 891 |
+
<div class="card">
|
| 892 |
+
<h2 data-i18n="bench_result_title">Résultats</h2>
|
| 893 |
+
<div id="bench-ranking-table"></div>
|
| 894 |
+
<div style="margin-top:12px;">
|
| 895 |
+
<a id="bench-report-link" href="#" class="btn btn-primary" target="_blank" data-i18n="bench_open_report">Ouvrir le rapport</a>
|
| 896 |
+
</div>
|
| 897 |
+
</div>
|
| 898 |
+
</div>
|
| 899 |
+
</div>
|
| 900 |
+
|
| 901 |
+
<!-- ===== VUE RAPPORTS ===== -->
|
| 902 |
+
<div id="view-reports" class="view">
|
| 903 |
+
<div class="card">
|
| 904 |
+
<h2 data-i18n="reports_title">Rapports générés</h2>
|
| 905 |
+
<div class="form-row" style="margin-bottom:12px;">
|
| 906 |
+
<div class="form-group" style="max-width:320px;">
|
| 907 |
+
<label data-i18n="reports_dir_label">Dossier de rapports</label>
|
| 908 |
+
<div class="path-input-row">
|
| 909 |
+
<input type="text" id="reports-dir" value="." />
|
| 910 |
+
<button class="btn btn-secondary btn-sm" onclick="loadReports()" data-i18n="reports_refresh">Rafraîchir</button>
|
| 911 |
+
</div>
|
| 912 |
+
</div>
|
| 913 |
+
</div>
|
| 914 |
+
<div id="reports-list">
|
| 915 |
+
<div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
|
| 916 |
+
</div>
|
| 917 |
+
</div>
|
| 918 |
+
</div>
|
| 919 |
+
|
| 920 |
+
<!-- ===== VUE MOTEURS ===== -->
|
| 921 |
+
<div id="view-engines" class="view">
|
| 922 |
+
<div class="card">
|
| 923 |
+
<h2 data-i18n="engines_ocr_title">Moteurs OCR</h2>
|
| 924 |
+
<div id="engines-ocr-list">
|
| 925 |
+
<div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
|
| 926 |
+
</div>
|
| 927 |
+
</div>
|
| 928 |
+
<div class="card">
|
| 929 |
+
<h2 data-i18n="engines_llm_title">LLMs disponibles</h2>
|
| 930 |
+
<div id="engines-llm-list">
|
| 931 |
+
<div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
|
| 932 |
+
</div>
|
| 933 |
+
</div>
|
| 934 |
+
</div>
|
| 935 |
+
|
| 936 |
+
<!-- ===== VUE IMPORT ===== -->
|
| 937 |
+
<div id="view-import" class="view">
|
| 938 |
+
|
| 939 |
+
<!-- HTR-United -->
|
| 940 |
+
<div class="card">
|
| 941 |
+
<h2 data-i18n="import_htr_title">Import HTR-United</h2>
|
| 942 |
+
<p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_htr_desc">
|
| 943 |
+
Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux.
|
| 944 |
+
</p>
|
| 945 |
+
<div class="form-row">
|
| 946 |
+
<div class="form-group" style="flex:2;">
|
| 947 |
+
<label data-i18n="import_search_label">Recherche</label>
|
| 948 |
+
<input type="text" id="htr-search" placeholder="médiéval, latin, manuscrits…" />
|
| 949 |
+
</div>
|
| 950 |
+
<div class="form-group">
|
| 951 |
+
<label data-i18n="import_lang_filter">Langue</label>
|
| 952 |
+
<select id="htr-lang-filter">
|
| 953 |
+
<option value="" data-i18n="all">Toutes</option>
|
| 954 |
+
</select>
|
| 955 |
+
</div>
|
| 956 |
+
<div class="form-group">
|
| 957 |
+
<label data-i18n="import_script_filter">Type d'écriture</label>
|
| 958 |
+
<select id="htr-script-filter">
|
| 959 |
+
<option value="" data-i18n="all">Tous</option>
|
| 960 |
+
</select>
|
| 961 |
+
</div>
|
| 962 |
+
<div class="form-group" style="justify-content: flex-end; padding-top: 18px;">
|
| 963 |
+
<button class="btn btn-primary btn-sm" onclick="searchHTRUnited()" data-i18n="search">Rechercher</button>
|
| 964 |
+
</div>
|
| 965 |
+
</div>
|
| 966 |
+
<div id="htr-results" class="ds-grid"></div>
|
| 967 |
+
</div>
|
| 968 |
+
|
| 969 |
+
<!-- HuggingFace -->
|
| 970 |
+
<div class="card">
|
| 971 |
+
<h2 data-i18n="import_hf_title">Import HuggingFace Datasets</h2>
|
| 972 |
+
<p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_hf_desc">
|
| 973 |
+
Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).
|
| 974 |
+
</p>
|
| 975 |
+
<div class="form-row">
|
| 976 |
+
<div class="form-group" style="flex:2;">
|
| 977 |
+
<label data-i18n="import_search_label">Recherche</label>
|
| 978 |
+
<input type="text" id="hf-search" placeholder="medieval OCR, IAM, RIMES…" />
|
| 979 |
+
</div>
|
| 980 |
+
<div class="form-group">
|
| 981 |
+
<label data-i18n="import_lang_filter">Langue</label>
|
| 982 |
+
<input type="text" id="hf-lang-filter" placeholder="French, Latin…" />
|
| 983 |
+
</div>
|
| 984 |
+
<div class="form-group">
|
| 985 |
+
<label data-i18n="import_tag_filter">Tags</label>
|
| 986 |
+
<input type="text" id="hf-tags" placeholder="ocr, htr, historical…" />
|
| 987 |
+
</div>
|
| 988 |
+
<div class="form-group" style="justify-content: flex-end; padding-top: 18px;">
|
| 989 |
+
<button class="btn btn-primary btn-sm" onclick="searchHuggingFace()" data-i18n="search">Rechercher</button>
|
| 990 |
+
</div>
|
| 991 |
+
</div>
|
| 992 |
+
<div id="hf-results" class="ds-grid"></div>
|
| 993 |
+
</div>
|
| 994 |
+
|
| 995 |
+
</div><!-- end view-import -->
|
| 996 |
+
|
| 997 |
+
</div><!-- end #main -->
|
| 998 |
+
|
| 999 |
+
<!-- Import modal -->
|
| 1000 |
+
<div id="import-modal" style="display:none; position:fixed; inset:0; background:rgba(0,0,0,0.4); z-index:200; align-items:center; justify-content:center;">
|
| 1001 |
+
<div class="card" style="width: 420px; max-width: 95vw;">
|
| 1002 |
+
<h2 id="import-modal-title" data-i18n="import_modal_title">Importer le corpus</h2>
|
| 1003 |
+
<input type="hidden" id="import-modal-type" />
|
| 1004 |
+
<input type="hidden" id="import-modal-id" />
|
| 1005 |
+
<div class="form-group" style="margin-bottom:12px;">
|
| 1006 |
+
<label data-i18n="import_output_dir">Dossier de destination</label>
|
| 1007 |
+
<input type="text" id="import-modal-output" value="./corpus/" />
|
| 1008 |
+
</div>
|
| 1009 |
+
<div class="form-group" style="margin-bottom:16px;">
|
| 1010 |
+
<label data-i18n="import_max_samples">Nombre max de documents</label>
|
| 1011 |
+
<input type="number" id="import-modal-max" value="100" min="1" max="10000" />
|
| 1012 |
+
</div>
|
| 1013 |
+
<div id="import-modal-status" style="margin-bottom:12px;"></div>
|
| 1014 |
+
<div style="display:flex; gap:8px;">
|
| 1015 |
+
<button class="btn btn-primary" onclick="confirmImport()" data-i18n="import_confirm">Importer</button>
|
| 1016 |
+
<button class="btn btn-secondary" onclick="closeImportModal()" data-i18n="cancel">Annuler</button>
|
| 1017 |
+
</div>
|
| 1018 |
+
</div>
|
| 1019 |
+
</div>
|
| 1020 |
+
|
| 1021 |
+
<script>
|
| 1022 |
+
// ─── i18n ────────────────────────────────────────────────────────────────────
|
| 1023 |
+
const T = {
|
| 1024 |
+
fr: {
|
| 1025 |
+
app_title: "Picarones",
|
| 1026 |
+
nav_benchmark: "Benchmark",
|
| 1027 |
+
nav_reports: "Rapports",
|
| 1028 |
+
nav_engines: "Moteurs",
|
| 1029 |
+
nav_import: "Import",
|
| 1030 |
+
loading: "Chargement…",
|
| 1031 |
+
search: "Rechercher",
|
| 1032 |
+
all: "Tous",
|
| 1033 |
+
cancel: "Annuler",
|
| 1034 |
+
bench_corpus_title: "1. Corpus",
|
| 1035 |
+
bench_corpus_label: "Chemin vers le dossier corpus (paires image / .gt.txt)",
|
| 1036 |
+
bench_browse: "Parcourir",
|
| 1037 |
+
bench_engines_title: "2. Moteurs et pipelines",
|
| 1038 |
+
bench_options_title: "3. Options",
|
| 1039 |
+
bench_norm_label: "Profil de normalisation",
|
| 1040 |
+
bench_lang_label: "Langue (Tesseract)",
|
| 1041 |
+
bench_output_label: "Dossier de sortie",
|
| 1042 |
+
bench_name_label: "Nom du rapport (optionnel)",
|
| 1043 |
+
bench_start: "▶ Lancer le benchmark",
|
| 1044 |
+
bench_cancel: "✕ Annuler",
|
| 1045 |
+
bench_progress_title: "Progression",
|
| 1046 |
+
bench_log: "Journal",
|
| 1047 |
+
bench_result_title: "Résultats",
|
| 1048 |
+
bench_open_report: "Ouvrir le rapport",
|
| 1049 |
+
reports_title: "Rapports générés",
|
| 1050 |
+
reports_dir_label: "Dossier de rapports",
|
| 1051 |
+
reports_refresh: "Rafraîchir",
|
| 1052 |
+
engines_ocr_title: "Moteurs OCR",
|
| 1053 |
+
engines_llm_title: "LLMs disponibles",
|
| 1054 |
+
import_htr_title: "Import HTR-United",
|
| 1055 |
+
import_htr_desc: "Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux.",
|
| 1056 |
+
import_hf_title: "Import HuggingFace Datasets",
|
| 1057 |
+
import_hf_desc: "Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
|
| 1058 |
+
import_search_label: "Recherche",
|
| 1059 |
+
import_lang_filter: "Langue",
|
| 1060 |
+
import_script_filter: "Type d'écriture",
|
| 1061 |
+
import_tag_filter: "Tags",
|
| 1062 |
+
import_modal_title: "Importer le corpus",
|
| 1063 |
+
import_output_dir: "Dossier de destination",
|
| 1064 |
+
import_max_samples: "Nombre max de documents",
|
| 1065 |
+
import_confirm: "Importer",
|
| 1066 |
+
available: "disponible",
|
| 1067 |
+
not_installed: "non installé",
|
| 1068 |
+
configured: "configuré",
|
| 1069 |
+
missing_key: "clé manquante",
|
| 1070 |
+
running: "actif",
|
| 1071 |
+
not_running: "inactif",
|
| 1072 |
+
no_reports: "Aucun rapport trouvé.",
|
| 1073 |
+
lines: "lignes",
|
| 1074 |
+
centuries: "siècles",
|
| 1075 |
+
},
|
| 1076 |
+
en: {
|
| 1077 |
+
app_title: "Picarones",
|
| 1078 |
+
nav_benchmark: "Benchmark",
|
| 1079 |
+
nav_reports: "Reports",
|
| 1080 |
+
nav_engines: "Engines",
|
| 1081 |
+
nav_import: "Import",
|
| 1082 |
+
loading: "Loading…",
|
| 1083 |
+
search: "Search",
|
| 1084 |
+
all: "All",
|
| 1085 |
+
cancel: "Cancel",
|
| 1086 |
+
bench_corpus_title: "1. Corpus",
|
| 1087 |
+
bench_corpus_label: "Path to corpus directory (image / .gt.txt pairs)",
|
| 1088 |
+
bench_browse: "Browse",
|
| 1089 |
+
bench_engines_title: "2. Engines & pipelines",
|
| 1090 |
+
bench_options_title: "3. Options",
|
| 1091 |
+
bench_norm_label: "Normalization profile",
|
| 1092 |
+
bench_lang_label: "Language (Tesseract)",
|
| 1093 |
+
bench_output_label: "Output directory",
|
| 1094 |
+
bench_name_label: "Report name (optional)",
|
| 1095 |
+
bench_start: "▶ Start benchmark",
|
| 1096 |
+
bench_cancel: "✕ Cancel",
|
| 1097 |
+
bench_progress_title: "Progress",
|
| 1098 |
+
bench_log: "Log",
|
| 1099 |
+
bench_result_title: "Results",
|
| 1100 |
+
bench_open_report: "Open report",
|
| 1101 |
+
reports_title: "Generated reports",
|
| 1102 |
+
reports_dir_label: "Reports directory",
|
| 1103 |
+
reports_refresh: "Refresh",
|
| 1104 |
+
engines_ocr_title: "OCR Engines",
|
| 1105 |
+
engines_llm_title: "Available LLMs",
|
| 1106 |
+
import_htr_title: "Import from HTR-United",
|
| 1107 |
+
import_htr_desc: "Community catalogue of HTR/OCR datasets for heritage documents.",
|
| 1108 |
+
import_hf_title: "Import from HuggingFace Datasets",
|
| 1109 |
+
import_hf_desc: "Public OCR/HTR datasets from HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
|
| 1110 |
+
import_search_label: "Search",
|
| 1111 |
+
import_lang_filter: "Language",
|
| 1112 |
+
import_script_filter: "Script type",
|
| 1113 |
+
import_tag_filter: "Tags",
|
| 1114 |
+
import_modal_title: "Import corpus",
|
| 1115 |
+
import_output_dir: "Output directory",
|
| 1116 |
+
import_max_samples: "Max documents",
|
| 1117 |
+
import_confirm: "Import",
|
| 1118 |
+
available: "available",
|
| 1119 |
+
not_installed: "not installed",
|
| 1120 |
+
configured: "configured",
|
| 1121 |
+
missing_key: "key missing",
|
| 1122 |
+
running: "running",
|
| 1123 |
+
not_running: "not running",
|
| 1124 |
+
no_reports: "No reports found.",
|
| 1125 |
+
lines: "lines",
|
| 1126 |
+
centuries: "centuries",
|
| 1127 |
+
},
|
| 1128 |
+
};
|
| 1129 |
+
let lang = "fr";
|
| 1130 |
+
function t(key) { return (T[lang][key]) || key; }
|
| 1131 |
+
function toggleLang() {
|
| 1132 |
+
lang = lang === "fr" ? "en" : "fr";
|
| 1133 |
+
document.getElementById("lang-btn").textContent = lang === "fr" ? "EN" : "FR";
|
| 1134 |
+
document.querySelectorAll("[data-i18n]").forEach(el => {
|
| 1135 |
+
const k = el.getAttribute("data-i18n");
|
| 1136 |
+
if (T[lang][k]) el.textContent = T[lang][k];
|
| 1137 |
+
});
|
| 1138 |
+
}
|
| 1139 |
+
|
| 1140 |
+
// ─── Navigation ──────────────────────────────────────────────────────────────
|
| 1141 |
+
function showView(name) {
|
| 1142 |
+
document.querySelectorAll(".view").forEach(v => v.classList.remove("active"));
|
| 1143 |
+
document.querySelectorAll(".nav-btn").forEach(b => b.classList.remove("active"));
|
| 1144 |
+
const view = document.getElementById("view-" + name);
|
| 1145 |
+
if (view) view.classList.add("active");
|
| 1146 |
+
const btns = document.querySelectorAll(".nav-btn");
|
| 1147 |
+
const idx = ["benchmark","reports","engines","import"].indexOf(name);
|
| 1148 |
+
if (btns[idx]) btns[idx].classList.add("active");
|
| 1149 |
+
|
| 1150 |
+
if (name === "reports") loadReports();
|
| 1151 |
+
if (name === "engines") loadEngines();
|
| 1152 |
+
if (name === "import") { searchHTRUnited(); searchHuggingFace(); }
|
| 1153 |
+
}
|
| 1154 |
+
|
| 1155 |
+
// ─── Status / version ────────────────────────────────────────────────────────
|
| 1156 |
+
async function loadStatus() {
|
| 1157 |
+
try {
|
| 1158 |
+
const r = await fetch("/api/status");
|
| 1159 |
+
const d = await r.json();
|
| 1160 |
+
document.getElementById("app-version").textContent = "v" + d.version;
|
| 1161 |
+
} catch(e) {}
|
| 1162 |
+
}
|
| 1163 |
+
|
| 1164 |
+
// ─── Engine checkboxes ───────────────────────────────────────────────────────
|
| 1165 |
+
// Populate the "engine-checkboxes" container with one checkbox per OCR engine
// and LLM returned by /api/engines. Available engines start checked.
// The raw response is kept on window._enginesData for later use.
// On any failure the container shows a short error message instead.
async function loadEngineCheckboxes() {
  try {
    const r = await fetch("/api/engines");
    const d = await r.json();
    const container = document.getElementById("engine-checkboxes");
    container.innerHTML = "";

    [...d.engines, ...d.llms].forEach(eng => {
      const available = Boolean(eng.available);

      const item = document.createElement("label");
      item.className = "checkbox-item" + (available ? " checked" : "");

      // Built with DOM APIs rather than an HTML string so that ids and
      // labels can never be interpreted as markup or break out of value="".
      const chk = document.createElement("input");
      chk.type = "checkbox";
      chk.name = "engine";
      chk.value = eng.id;
      chk.checked = available;
      chk.addEventListener("change", e => {
        item.classList.toggle("checked", e.target.checked);
      });

      const dot = document.createElement("span");
      dot.className = "engine-status " + (available ? "status-ok" : "status-err");

      const label = document.createElement("span");
      label.textContent = eng.label;

      item.append(chk, dot, label);
      container.appendChild(item);
    });

    // Store all engine data for later
    window._enginesData = d;
  } catch(e) {
    document.getElementById("engine-checkboxes").innerHTML =
      '<span style="color: var(--danger); font-size:12px;">Erreur chargement moteurs</span>';
  }
}
|
| 1191 |
+
|
| 1192 |
+
// ─── Normalization profiles ──────────────────────────────────────────────────
|
| 1193 |
+
// Fill the "norm-profile" <select> with the profiles from
// /api/normalization/profiles, pre-selecting the one whose id is "nfc".
// Best-effort: failures are swallowed.
async function loadNormProfiles() {
  try {
    const response = await fetch("/api/normalization/profiles");
    const payload = await response.json();
    const select = document.getElementById("norm-profile");
    select.innerHTML = "";
    for (const profile of payload.profiles) {
      const option = document.createElement("option");
      option.value = profile.id;
      option.textContent = `${profile.name} — ${profile.description}`;
      if (profile.id === "nfc") {
        option.selected = true;
      }
      select.appendChild(option);
    }
  } catch (_ignored) {}
}
|
| 1208 |
+
|
| 1209 |
+
// ─── File browser ────────────────────────────────────────────────────────────
|
| 1210 |
+
// Whether the inline file browser panel is currently shown.
let _fbVisible = false;

// Toggle the file browser panel; when it opens, list the directory ".".
function openFileBrowser() {
  const panel = document.getElementById("file-browser-container");
  _fbVisible = !_fbVisible;
  if (_fbVisible) {
    panel.style.display = "block";
    browsePath(".");
  } else {
    panel.style.display = "none";
  }
}
|
| 1217 |
+
// List the sub-directories of `path` (via /api/corpus/browse) in the file
// browser. Clicking a directory flagged `has_corpus` selects it as the corpus
// path and closes the browser; clicking any other directory descends into it.
// Errors (network, HTTP, malformed payload) are shown inside the browser.
async function browsePath(path) {
  const fb = document.getElementById("file-browser");

  // Build one browser row. Names go through textContent because directory
  // names come from the file system and must not be parsed as HTML.
  const makeRow = (icon, name) => {
    const row = document.createElement("div");
    row.className = "fb-item";
    const iconEl = document.createElement("span");
    iconEl.className = "fb-icon";
    iconEl.textContent = icon;
    const nameEl = document.createElement("span");
    nameEl.className = "fb-name";
    nameEl.textContent = name;
    row.append(iconEl, nameEl);
    return row;
  };

  // Replace the browser content with a single padded message.
  const showMessage = (colorVar, text) => {
    const box = document.createElement("div");
    box.style.cssText = `padding:12px; color: var(${colorVar}); font-size:12px;`;
    box.textContent = text;
    fb.innerHTML = "";
    fb.appendChild(box);
  };

  try {
    const r = await fetch(`/api/corpus/browse?path=${encodeURIComponent(path)}`);
    const d = await r.json();
    // Surface the server's own error detail instead of failing later on a
    // missing `items` field.
    if (!r.ok) throw new Error(d.detail || "Erreur serveur");

    document.getElementById("fb-current-path").textContent = d.current_path;
    fb.innerHTML = "";

    if (d.parent_path) {
      const up = makeRow("⬆", "..");
      up.onclick = () => browsePath(d.parent_path);
      fb.appendChild(up);
    }

    d.items.filter(i => i.is_dir).forEach(item => {
      const el = makeRow("📁", item.name);
      if (item.has_corpus) {
        const badge = document.createElement("span");
        badge.className = "fb-badge";
        badge.style.color = "var(--success)";
        badge.textContent = `✓ ${item.gt_count} GT`;
        el.appendChild(badge);
      }
      el.onclick = () => {
        if (item.has_corpus) {
          document.getElementById("corpus-path").value = item.path;
          document.getElementById("corpus-info").textContent = `✓ ${item.gt_count} documents GT trouvés.`;
          _fbVisible = false;
          document.getElementById("file-browser-container").style.display = "none";
        } else {
          browsePath(item.path);
        }
      };
      fb.appendChild(el);
    });

    if (fb.children.length === 0) {
      showMessage("--text-muted", "Dossier vide");
    }
  } catch(e) {
    showMessage("--danger", `Erreur : ${e.message}`);
  }
}
|
| 1256 |
+
|
| 1257 |
+
// ─── Benchmark ───────────────────────────────────────────────────────────────
|
| 1258 |
+
// Id of the benchmark job started from this page (null until one is started).
let _currentJobId = null;
// EventSource streaming the job's progress events (null when not connected).
let _eventSource = null;

// Validate the form, POST it to /api/benchmark/start and, on success, open
// the progress stream for the returned job id. Validation failures are
// reported with alert(); request failures are written to the benchmark log
// and the start/cancel controls are restored.
async function startBenchmark() {
  const byId = id => document.getElementById(id);
  const french = lang === "fr";

  const corpusPath = byId("corpus-path").value.trim();
  if (!corpusPath) {
    alert(french ? "Veuillez sélectionner un dossier corpus." : "Please select a corpus directory.");
    return;
  }

  const checkedBoxes = document.querySelectorAll("input[name=engine]:checked");
  const engines = [...checkedBoxes].map(box => box.value);
  if (engines.length === 0) {
    alert(french ? "Veuillez sélectionner au moins un moteur." : "Please select at least one engine.");
    return;
  }

  const payload = {
    corpus_path: corpusPath,
    engines: engines,
    normalization_profile: byId("norm-profile").value,
    output_dir: byId("output-dir").value,
    report_name: byId("report-name").value,
    lang: byId("bench-lang").value,
  };

  // Put the UI into its "running" state before the request goes out.
  byId("start-btn").disabled = true;
  byId("cancel-btn").style.display = "inline-flex";
  byId("bench-progress-section").style.display = "block";
  byId("bench-result-section").style.display = "none";
  byId("bench-log").textContent = "";
  byId("engine-progress-list").innerHTML = "";
  byId("bench-status-text").textContent = french ? "Démarrage…" : "Starting…";

  try {
    const response = await fetch("/api/benchmark/start", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const failure = await response.json();
      throw new Error(failure.detail || "Erreur serveur");
    }
    const started = await response.json();
    _currentJobId = started.job_id;
    _startSSE(_currentJobId, engines);
  } catch (problem) {
    appendLog(`Erreur : ${problem.message}`, "error");
    byId("start-btn").disabled = false;
    byId("cancel-btn").style.display = "none";
    byId("bench-status-text").textContent = "";
  }
}
|
| 1310 |
+
|
| 1311 |
+
// Open the Server-Sent Events stream of job `jobId` and wire its events to
// the UI: one progress bar per entry of `engines`, log lines, the final
// ranking, and the end-of-job cleanup. Any previously open stream is closed.
function _startSSE(jobId, engines) {
  if (_eventSource) _eventSource.close();

  // Init engine progress bars. Built with DOM APIs so the engine id is never
  // parsed as HTML.
  const pl = document.getElementById("engine-progress-list");
  pl.innerHTML = "";
  engines.forEach(eng => {
    const div = document.createElement("div");
    div.id = `eng-progress-${eng}`;
    div.style.cssText = "margin-bottom: 8px;";

    const header = document.createElement("div");
    header.style.cssText = "display:flex; justify-content:space-between; font-size:12px; margin-bottom:3px;";
    const nameEl = document.createElement("span");
    nameEl.textContent = eng;
    const pctEl = document.createElement("span");
    pctEl.id = `eng-pct-${eng}`;
    pctEl.textContent = "0%";
    header.append(nameEl, pctEl);

    const outer = document.createElement("div");
    outer.className = "progress-bar-outer";
    const inner = document.createElement("div");
    inner.className = "progress-bar-inner";
    inner.id = `eng-bar-${eng}`;
    inner.style.width = "0%";
    outer.appendChild(inner);

    div.append(header, outer);
    pl.appendChild(div);
  });

  const source = new EventSource(`/api/benchmark/${jobId}/stream`);
  _eventSource = source;

  source.addEventListener("start", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "success");
    document.getElementById("bench-status-text").textContent = lang === "fr" ? "En cours…" : "Running…";
  });

  source.addEventListener("log", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message);
  });

  source.addEventListener("warning", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "warn");
  });

  source.addEventListener("progress", e => {
    const d = JSON.parse(e.data);
    const pct = Math.round(d.progress * 100);
    document.getElementById("bench-status-text").textContent =
      `${pct}% — ${d.engine} (${d.processed}/${d.total})`;
    // Only bars created above for the requested engines are updated.
    if (engines.includes(d.engine)) {
      const bar = document.getElementById(`eng-bar-${d.engine}`);
      const pctEl = document.getElementById(`eng-pct-${d.engine}`);
      if (bar && pctEl) {
        bar.style.width = pct + "%";
        pctEl.textContent = pct + "%";
      }
    }
  });

  source.addEventListener("complete", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "success");
    _showResults(d);
    _finishBenchmark();
  });

  source.addEventListener("error", e => {
    // "error" is both a server-sent event name used by this stream and the
    // native EventSource connection-failure event. The native one carries no
    // `data`; parsing it would throw, so it is left to `onerror` below.
    if (typeof e.data !== "string") return;
    const d = JSON.parse(e.data);
    appendLog(d.message, "error");
    _finishBenchmark();
  });

  source.addEventListener("cancelled", e => {
    appendLog(lang === "fr" ? "Benchmark annulé." : "Benchmark cancelled.", "warn");
    _finishBenchmark();
  });

  source.addEventListener("done", e => {
    _finishBenchmark();
  });

  // Connection-level failure: stop instead of letting EventSource reconnect.
  source.onerror = () => {
    if (_currentJobId) {
      _finishBenchmark();
    }
  };
}
|
| 1387 |
+
|
| 1388 |
+
// Reveal the result section for a finished benchmark: point the report link
// at the generated HTML file (when `data.output_html` is set) and render
// `data.ranking` (when present) as a table of mean CER / WER per engine.
function _showResults(data) {
  const section = document.getElementById("bench-result-section");
  section.style.display = "block";

  if (data.output_html) {
    const link = document.getElementById("bench-report-link");
    // Only the file name is kept; it is URL-encoded so that names containing
    // characters such as "#" or "?" still resolve to the right report.
    const fileName = data.output_html.split("/").pop();
    link.href = `/reports/${encodeURIComponent(fileName)}`;
  }

  if (data.ranking) {
    const asPercent = value => (value != null ? (value * 100).toFixed(2) + "%" : "N/A");

    const table = document.createElement("table");
    const headRow = table.createTHead().insertRow();
    ["#", lang === "fr" ? "Moteur" : "Engine", "CER", "WER", "Docs"].forEach(title => {
      const th = document.createElement("th");
      th.textContent = title;
      headRow.appendChild(th);
    });

    const body = table.createTBody();
    data.ranking.forEach((row, i) => {
      const tr = body.insertRow();
      // A count of 0 is a real value and is displayed; only a missing count
      // is left blank.
      const docs = row.total_docs != null ? row.total_docs : "";
      [i + 1, row.engine, asPercent(row.mean_cer), asPercent(row.mean_wer), docs].forEach(value => {
        // textContent keeps engine names from being parsed as HTML.
        tr.insertCell().textContent = value;
      });
    });

    const target = document.getElementById("bench-ranking-table");
    target.innerHTML = "";
    target.appendChild(table);
  }
}
|
| 1406 |
+
|
| 1407 |
+
// Return the benchmark UI to its idle state: close the progress stream,
// forget the job id, re-enable the start button, hide the cancel button and
// clear the status line. Safe to call more than once for the same job.
function _finishBenchmark() {
  if (_eventSource) { _eventSource.close(); _eventSource = null; }
  // Without this reset the "is a job active?" guards that test
  // _currentJobId would keep treating a finished job as running.
  _currentJobId = null;
  document.getElementById("start-btn").disabled = false;
  document.getElementById("cancel-btn").style.display = "none";
  document.getElementById("bench-status-text").textContent = "";
}
|
| 1413 |
+
|
| 1414 |
+
// Ask the server to cancel the current job, if any. The UI is reset later,
// when the stream reports the cancellation; a failed request is written to
// the benchmark log instead of becoming an unhandled promise rejection.
async function cancelBenchmark() {
  if (!_currentJobId) return;
  try {
    const r = await fetch(`/api/benchmark/${_currentJobId}/cancel`, {method: "POST"});
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
  } catch(e) {
    appendLog(`Erreur : ${e.message}`, "error");
  }
}
|
| 1418 |
+
|
| 1419 |
+
// Append one line to the benchmark log and scroll to the bottom.
// `cls` may be "error", "warn" or "success" to style the line; any other
// value (or none) leaves the line unstyled.
function appendLog(msg, cls) {
  const levelClasses = new Map([
    ["error", "log-error"],
    ["warn", "log-warn"],
    ["success", "log-success"],
  ]);

  const entry = document.createElement("div");
  if (levelClasses.has(cls)) {
    entry.className = levelClasses.get(cls);
  }
  entry.textContent = msg;

  const logBox = document.getElementById("bench-log");
  logBox.appendChild(entry);
  logBox.scrollTop = logBox.scrollHeight;
}
|
| 1429 |
+
|
| 1430 |
+
// ─── Reports ─────────────────────────────────────────────────────────────────
|
| 1431 |
+
// List the reports found in the directory typed in "reports-dir" (default
// ".") as a table with file name, size, modification date and an "open"
// link. Shows a loading notice first, then either the table, an empty-state
// notice, or the error message.
async function loadReports() {
  const dir = document.getElementById("reports-dir").value || ".";
  const container = document.getElementById("reports-list");

  // Replace the container content with one small coloured notice.
  const notice = (colorVar, text) => {
    const box = document.createElement("div");
    box.style.cssText = `color: var(${colorVar}); font-size:12px;`;
    box.textContent = text;
    container.innerHTML = "";
    container.appendChild(box);
  };

  notice("--text-muted", t("loading"));
  try {
    const r = await fetch(`/api/reports?reports_dir=${encodeURIComponent(dir)}`);
    const d = await r.json();
    if (!r.ok) throw new Error(d.detail || "Erreur serveur");
    if (d.reports.length === 0) {
      notice("--text-muted", t("no_reports"));
      return;
    }

    const fr = lang === "fr";
    const table = document.createElement("table");
    const headRow = table.createTHead().insertRow();
    [fr ? "Fichier" : "File", fr ? "Taille" : "Size", fr ? "Modifié" : "Modified", ""].forEach(title => {
      const th = document.createElement("th");
      th.textContent = title;
      headRow.appendChild(th);
    });

    const body = table.createTBody();
    d.reports.forEach(rep => {
      const date = new Date(rep.modified).toLocaleString(fr ? "fr-FR" : "en-US");
      const tr = body.insertRow();
      // File names come from the file system: insert them as text, not HTML.
      tr.insertCell().textContent = rep.filename;
      tr.insertCell().textContent = `${rep.size_kb} Ko`;
      tr.insertCell().textContent = date;

      const open = document.createElement("a");
      open.href = rep.url;
      open.target = "_blank";
      open.className = "btn btn-primary btn-sm";
      open.textContent = fr ? "Ouvrir" : "Open";
      tr.insertCell().appendChild(open);
    });

    container.innerHTML = "";
    container.appendChild(table);
  } catch(e) {
    notice("--danger", `Erreur : ${e.message}`);
  }
}
|
| 1454 |
+
|
| 1455 |
+
// ─── Engines status ──────────────────────────────────────────────────────────
|
| 1456 |
+
// Render the engine status page from /api/engines: one table for OCR engines
// ("engines-ocr-list") and one for LLM back-ends ("engines-llm-list").
// On failure both panels show the error message.
async function loadEngines() {
  const ocrBox = document.getElementById("engines-ocr-list");
  const llmBox = document.getElementById("engines-llm-list");

  // Create an empty table with the given header titles.
  const newTable = titles => {
    const table = document.createElement("table");
    const headRow = table.createTHead().insertRow();
    titles.forEach(title => {
      const th = document.createElement("th");
      th.textContent = title;
      headRow.appendChild(th);
    });
    return { table, body: table.createTBody() };
  };
  // Append a cell holding <code>text</code>.
  const codeCell = (tr, text) => {
    const code = document.createElement("code");
    code.textContent = text;
    tr.insertCell().appendChild(code);
  };
  // Append a cell holding a status badge.
  const badgeCell = (tr, cls, text) => {
    const badge = document.createElement("span");
    badge.className = `badge ${cls}`;
    badge.textContent = text;
    tr.insertCell().appendChild(badge);
  };
  // Replace a panel's content with a single element.
  const fill = (box, node) => {
    box.innerHTML = "";
    box.appendChild(node);
  };

  try {
    const r = await fetch("/api/engines");
    const d = await r.json();
    const fr = lang === "fr";
    // "Statut" used to be shown in French regardless of the UI language.
    const statusTitle = fr ? "Statut" : "Status";

    // OCR
    const ocr = newTable(["ID", fr ? "Nom" : "Name", "Version", statusTitle]);
    d.engines.forEach(eng => {
      const tr = ocr.body.insertRow();
      codeCell(tr, eng.id);
      tr.insertCell().textContent = eng.label;
      tr.insertCell().textContent = eng.version || "—";
      badgeCell(
        tr,
        eng.available ? "badge-ok" : "badge-err",
        eng.available ? t("available") : t("not_installed")
      );
    });
    fill(ocrBox, ocr.table);

    // LLMs
    const knownStatuses = ["configured", "running", "not_running"];
    const llm = newTable(["ID", fr ? "Nom" : "Name", statusTitle, fr ? "Détail" : "Detail"]);
    d.llms.forEach(eng => {
      const tr = llm.body.insertRow();
      codeCell(tr, eng.id);
      tr.insertCell().textContent = eng.label;
      // Any status outside the known set is reported as a missing API key.
      const statusKey = knownStatuses.includes(eng.status) ? eng.status : "missing_key";
      badgeCell(tr, eng.available ? "badge-ok" : "badge-warn", t(statusKey));

      // Detail: the first three model names when the back-end lists models,
      // otherwise the name of the environment variable holding its key.
      const detail = tr.insertCell();
      if (eng.models && eng.models.length > 0) {
        detail.textContent = eng.models.slice(0, 3).join(", ");
      } else if (eng.key_env) {
        const code = document.createElement("code");
        code.style.fontSize = "11px";
        code.textContent = eng.key_env;
        detail.appendChild(code);
      }
    });
    fill(llmBox, llm.table);
  } catch(err) {
    [ocrBox, llmBox].forEach(box => {
      const msg = document.createElement("div");
      msg.style.cssText = "color: var(--danger); font-size:12px;";
      msg.textContent = `Erreur : ${err.message}`;
      fill(box, msg);
    });
  }
}
|
| 1494 |
+
|
| 1495 |
+
// ─── HTR-United ──────────────────────────────────────────────────────────────
|
| 1496 |
+
// Fill the HTR-United language and script filter <select>s from the
// catalogue endpoint. Each list starts with an "all" entry whose value is
// the empty string. Best-effort: on failure the filters are left as they are.
async function initHTRFilters() {
  // Rebuild `select` from scratch. Options are created as elements so that
  // values are never parsed as HTML, and the list is built in one pass
  // instead of re-serialising the whole select for every entry.
  const fill = (select, values) => {
    const options = document.createDocumentFragment();
    const addOption = (value, text) => {
      const opt = document.createElement("option");
      opt.value = value;
      opt.textContent = text;
      options.appendChild(opt);
    };
    addOption("", t("all"));
    values.forEach(v => addOption(v, v));
    select.innerHTML = "";
    select.appendChild(options);
  };

  try {
    const r = await fetch("/api/htr-united/catalogue");
    const d = await r.json();
    fill(document.getElementById("htr-lang-filter"), d.available_languages);
    fill(document.getElementById("htr-script-filter"), d.available_scripts);
  } catch(e) {}
}
|
| 1512 |
+
|
| 1513 |
+
// Query the HTR-United catalogue with the current search text and filters,
// and render one card per entry in "htr-results". Each card has an import
// button that opens the import modal for that entry.
async function searchHTRUnited() {
  const q = document.getElementById("htr-search").value;
  const lang2 = document.getElementById("htr-lang-filter").value;
  const script = document.getElementById("htr-script-filter").value;
  const container = document.getElementById("htr-results");

  // Create an element with an optional class and text content. Catalogue
  // fields are third-party data, so they are always inserted as text.
  const make = (tag, className, text) => {
    const node = document.createElement(tag);
    if (className) node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };
  // Replace the results with one small coloured notice.
  const notice = (colorVar, text) => {
    const box = make("div", "", text);
    box.style.cssText = `color: var(${colorVar}); font-size:12px;`;
    container.innerHTML = "";
    container.appendChild(box);
  };

  notice("--text-muted", t("loading"));
  try {
    const url = `/api/htr-united/catalogue?query=${encodeURIComponent(q)}&language=${encodeURIComponent(lang2)}&script=${encodeURIComponent(script)}`;
    const r = await fetch(url);
    const d = await r.json();
    if (!r.ok) throw new Error(d.detail || "Erreur serveur");
    if (d.entries.length === 0) {
      notice("--text-muted", lang === "fr" ? "Aucun résultat." : "No results.");
      return;
    }

    const cards = document.createDocumentFragment();
    d.entries.forEach(entry => {
      const card = make("div", "ds-card");

      const header = make("div");
      header.style.cssText = "display:flex; justify-content:space-between; align-items:flex-start;";
      const button = make("button", "btn btn-primary btn-sm", lang === "fr" ? "Importer" : "Import");
      // A real listener replaces the former inline onclick string, which
      // broke (and allowed script injection) when a title contained quotes.
      button.addEventListener("click", () => openImportModal("htr", entry.id, entry.title));
      header.append(make("h4", "", entry.title), button);

      // Missing counts or tag lists no longer abort the whole rendering.
      const lineCount = Number(entry.lines || 0).toLocaleString();
      const summary = make("p", "", `${entry.institution} — ${lineCount} ${t("lines")} — ${entry.format}`);
      summary.style.color = "var(--text-muted)";

      const meta = make("div", "ds-meta");
      [...(entry.language || []), ...(entry.script || [])].forEach(tag => {
        meta.appendChild(make("span", "ds-tag", tag));
      });

      card.append(header, make("p", "", entry.description), summary, meta);
      cards.appendChild(card);
    });
    container.innerHTML = "";
    container.appendChild(cards);
  } catch(e) {
    notice("--danger", `Erreur : ${e.message}`);
  }
}
|
| 1545 |
+
|
| 1546 |
+
// Search Hugging Face datasets with the current text, language and tag
// filters, and render one card per dataset in "hf-results". Each card has an
// import button that opens the import modal for that dataset.
async function searchHuggingFace() {
  const q = document.getElementById("hf-search").value;
  const langFilter = document.getElementById("hf-lang-filter").value;
  const tags = document.getElementById("hf-tags").value;
  const container = document.getElementById("hf-results");

  // Create an element with an optional class and text content. Dataset
  // metadata is third-party data, so it is always inserted as text.
  const make = (tag, className, text) => {
    const node = document.createElement(tag);
    if (className) node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };
  // Replace the results with one small coloured notice.
  const notice = (colorVar, text) => {
    const box = make("div", "", text);
    box.style.cssText = `color: var(${colorVar}); font-size:12px;`;
    container.innerHTML = "";
    container.appendChild(box);
  };

  notice("--text-muted", t("loading"));
  try {
    const url = `/api/huggingface/search?query=${encodeURIComponent(q)}&language=${encodeURIComponent(langFilter)}&tags=${encodeURIComponent(tags)}`;
    const r = await fetch(url);
    const d = await r.json();
    if (!r.ok) throw new Error(d.detail || "Erreur serveur");
    if (d.datasets.length === 0) {
      notice("--text-muted", lang === "fr" ? "Aucun résultat." : "No results.");
      return;
    }

    const fr = lang === "fr";
    const cards = document.createDocumentFragment();
    d.datasets.forEach(ds => {
      const card = make("div", "ds-card");

      const header = make("div");
      header.style.cssText = "display:flex; justify-content:space-between; align-items:flex-start;";
      const button = make("button", "btn btn-primary btn-sm", fr ? "Importer" : "Import");
      // A real listener replaces the former inline onclick string, which
      // broke (and allowed script injection) when a value contained quotes.
      button.addEventListener("click", () => openImportModal("hf", ds.dataset_id, ds.title));
      header.append(make("h4", "", ds.title), button);

      // The download count label follows the UI language.
      const downloads = ds.downloads
        ? "— " + ds.downloads.toLocaleString() + (fr ? " téléchargements" : " downloads")
        : "";
      const summary = make("p", "", `${ds.institution || ds.dataset_id} ${downloads}`);
      summary.style.color = "var(--text-muted)";

      // At most five tags are shown; a missing tag list is treated as empty.
      const meta = make("div", "ds-meta");
      (ds.tags || []).slice(0, 5).forEach(tag => {
        meta.appendChild(make("span", "ds-tag", tag));
      });

      card.append(header, make("p", "", ds.description), summary, meta);
      cards.appendChild(card);
    });
    container.innerHTML = "";
    container.appendChild(cards);
  } catch(e) {
    notice("--danger", `Erreur : ${e.message}`);
  }
}
|
| 1578 |
+
|
| 1579 |
+
// ─── Import modal ─────────────────────────────────────────────────────────────
|
| 1580 |
+
// Show the import dialog for one dataset: remember its source `type` and
// `id` in the hidden fields, put `title` in the heading, and clear any
// previous status message.
function openImportModal(type, id, title) {
  const byId = key => document.getElementById(key);

  byId("import-modal-type").value = type;
  byId("import-modal-id").value = id;

  const heading = `${t("import_modal_title")} : ${title}`;
  byId("import-modal-title").textContent = heading;

  byId("import-modal-status").innerHTML = "";
  byId("import-modal").style.display = "flex";
}
|
| 1587 |
+
// Hide the import dialog.
function closeImportModal() {
  const modal = document.getElementById("import-modal");
  modal.style.display = "none";
}
|
| 1590 |
+
// Run the import chosen in the import dialog. Posts to the HTR-United or
// Hugging Face import endpoint depending on the dialog's type field, reports
// progress and outcome in the dialog's status area, and on success proposes
// the imported directory as the benchmark corpus path.
async function confirmImport() {
  const type = document.getElementById("import-modal-type").value;
  const id = document.getElementById("import-modal-id").value;
  const outputDir = document.getElementById("import-modal-output").value;
  // An empty or non-numeric field is sent explicitly as null rather than as
  // NaN, which JSON.stringify would turn into null implicitly.
  const parsedMax = parseInt(document.getElementById("import-modal-max").value, 10);
  const maxSamples = Number.isNaN(parsedMax) ? null : parsedMax;
  const statusDiv = document.getElementById("import-modal-status");

  // Replace the status area with one alert box. `parts` may be strings or
  // nodes; strings are inserted as text, never as HTML.
  const showAlert = (kind, ...parts) => {
    const box = document.createElement("div");
    box.className = `alert alert-${kind}`;
    box.append(...parts);
    statusDiv.innerHTML = "";
    statusDiv.appendChild(box);
  };

  statusDiv.innerHTML = `<div class="alert alert-info"><span class="spinner"></span> ${lang==="fr"?"Import en cours…":"Importing…"}</div>`;

  try {
    let url, body;
    if (type === "htr") {
      url = "/api/htr-united/import";
      body = {entry_id: id, output_dir: outputDir, max_samples: maxSamples};
    } else {
      url = "/api/huggingface/import";
      body = {dataset_id: id, output_dir: outputDir, max_samples: maxSamples};
    }
    const r = await fetch(url, {method:"POST", headers:{"Content-Type":"application/json"}, body: JSON.stringify(body)});
    if (!r.ok) {
      // The error body may not be JSON (e.g. a proxy error page); fall back
      // to the HTTP status instead of reporting a JSON parse error.
      let detail = "";
      try { detail = (await r.json()).detail; } catch(_parseError) {}
      throw new Error(detail || `HTTP ${r.status}`);
    }
    const d = await r.json();

    const count = d.files_imported || 0;
    const dirEl = document.createElement("code");
    dirEl.textContent = d.output_dir;
    showAlert(
      "success",
      lang === "fr" ? `✓ Import terminé. ${count} fichiers dans ` : `✓ Import done. ${count} files in `,
      dirEl
    );
    // Suggest the imported directory as the corpus path.
    if (d.output_dir) {
      document.getElementById("corpus-path").value = d.output_dir;
    }
  } catch(e) {
    showAlert("error", `Erreur : ${e.message}`);
  }
}
|
| 1620 |
+
|
| 1621 |
+
// ─── Init ────────────────────────────────────────────────────────────────────
|
| 1622 |
+
// Page start-up: load the initial data, then make a click on the import
// dialog's backdrop (but not on its content) close the dialog.
document.addEventListener("DOMContentLoaded", () => {
  const startupTasks = [loadStatus, loadEngineCheckboxes, loadNormProfiles, initHTRFilters];
  for (const task of startupTasks) {
    task();
  }

  const backdrop = document.getElementById("import-modal");
  backdrop.addEventListener("click", event => {
    const clickedBackdrop = event.target === document.getElementById("import-modal");
    if (clickedBackdrop) closeImportModal();
  });
});
|
| 1632 |
+
</script>
|
| 1633 |
+
</body>
|
| 1634 |
+
</html>"""
|
pyproject.toml
CHANGED
|
@@ -29,8 +29,10 @@ dependencies = [
|
|
| 29 |
]
|
| 30 |
|
| 31 |
[project.optional-dependencies]
|
| 32 |
-
dev = ["pytest>=7.4.0", "pytest-cov>=4.1.0"]
|
| 33 |
pero = ["pero-ocr>=0.1.0"]
|
|
|
|
|
|
|
| 34 |
|
| 35 |
[project.scripts]
|
| 36 |
picarones = "picarones.cli:cli"
|
|
|
|
| 29 |
]
|
| 30 |
|
| 31 |
[project.optional-dependencies]
|
| 32 |
+
dev = ["pytest>=7.4.0", "pytest-cov>=4.1.0", "httpx>=0.27.0"]
|
| 33 |
pero = ["pero-ocr>=0.1.0"]
|
| 34 |
+
web = ["fastapi>=0.111.0", "uvicorn[standard]>=0.29.0", "httpx>=0.27.0"]
|
| 35 |
+
hf = ["datasets>=2.19.0"]
|
| 36 |
|
| 37 |
[project.scripts]
|
| 38 |
picarones = "picarones.cli:cli"
|
rapport_demo.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/test_sprint4_normalization_iiif.py
ADDED
|
@@ -0,0 +1,834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 4 : normalisation diplomatique, import IIIF, adaptateurs API OCR."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from picarones.core.normalization import (
|
| 10 |
+
NormalizationProfile,
|
| 11 |
+
DIPLOMATIC_FR_MEDIEVAL,
|
| 12 |
+
DIPLOMATIC_FR_EARLY_MODERN,
|
| 13 |
+
DIPLOMATIC_LATIN_MEDIEVAL,
|
| 14 |
+
DIPLOMATIC_MINIMAL,
|
| 15 |
+
DEFAULT_DIPLOMATIC_PROFILE,
|
| 16 |
+
_apply_diplomatic_table,
|
| 17 |
+
get_builtin_profile,
|
| 18 |
+
)
|
| 19 |
+
from picarones.core.metrics import compute_metrics, aggregate_metrics, MetricsResult
|
| 20 |
+
from picarones.importers.iiif import (
|
| 21 |
+
IIIFManifestParser,
|
| 22 |
+
IIIFCanvas,
|
| 23 |
+
parse_page_selector,
|
| 24 |
+
_extract_label,
|
| 25 |
+
_best_image_url_v2,
|
| 26 |
+
_best_image_url_v3,
|
| 27 |
+
_guess_extension,
|
| 28 |
+
_slugify,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ===========================================================================
|
| 33 |
+
# Tests NormalizationProfile
|
| 34 |
+
# ===========================================================================
|
| 35 |
+
|
| 36 |
+
class TestNormalizationProfile:
    """Tests for ``NormalizationProfile``: defaults, ``normalize`` and (de)serialisation."""

    def test_default_nfc_only(self):
        """A profile created with only a name has NFC on, caseless off, empty table."""
        default_profile = NormalizationProfile(name="test")
        assert default_profile.nfc is True
        assert default_profile.caseless is False
        assert default_profile.diplomatic_table == {}

    def test_normalize_nfc(self):
        """A decomposed sequence comes out as the single composed code point."""
        nfc_profile = NormalizationProfile(name="nfc_only")
        # "e" + U+0301 (combining acute) must be composed into U+00E9.
        composed = nfc_profile.normalize("e\u0301")
        assert composed == "\u00e9"

    def test_normalize_caseless(self):
        """With ``caseless=True`` the normalised text is lower-cased."""
        lowered = NormalizationProfile(name="caseless", caseless=True).normalize(
            "Bonjour MONDE"
        )
        assert lowered == "bonjour monde"

    def test_normalize_diplomatic_table(self):
        """Entries of the diplomatic table are substituted in the text."""
        mapping = {"ſ": "s", "u": "v"}
        table_profile = NormalizationProfile(name="test", diplomatic_table=mapping)
        # The input has no "u", so only the long-s substitution fires.
        assert table_profile.normalize("maiſon") == "maison"
        # Archaic spelling of "vers": the leading "u" is mapped to "v".
        assert table_profile.normalize("uers") == "vers"

    def test_normalize_order_nfc_then_caseless_then_diplomatic(self):
        """The intended order is NFC, then caseless, then the diplomatic table."""
        combined = NormalizationProfile(
            name="combined",
            caseless=True,
            diplomatic_table={"ſ": "s"},
        )
        assert combined.normalize("Maiſon") == "maison"

    def test_as_dict(self):
        """``as_dict`` exposes the name, the table and the caseless flag."""
        serialised = NormalizationProfile(
            name="medieval_french",
            nfc=True,
            caseless=False,
            diplomatic_table={"ſ": "s"},
            description="Test",
        ).as_dict()
        assert serialised["name"] == "medieval_french"
        assert serialised["diplomatic_table"] == {"ſ": "s"}
        assert serialised["caseless"] is False

    def test_from_dict(self):
        """``from_dict`` reads the table from the ``"diplomatic"`` key."""
        loaded = NormalizationProfile.from_dict(
            {
                "name": "custom",
                "caseless": True,
                "diplomatic": {"ſ": "s", "u": "v"},
                "description": "Custom profile",
            }
        )
        assert loaded.name == "custom"
        assert loaded.caseless is True
        assert loaded.diplomatic_table == {"ſ": "s", "u": "v"}

    def test_from_dict_defaults(self):
        """An empty mapping yields a profile named "custom" with NFC only."""
        fallback = NormalizationProfile.from_dict({})
        assert fallback.name == "custom"
        assert fallback.nfc is True
        assert fallback.caseless is False

    def test_from_yaml(self, tmp_path):
        """A YAML file is loaded into a profile; skipped when PyYAML is reported missing."""
        yaml_file = tmp_path / "profile.yaml"
        yaml_file.write_text(
            "name: my_profile\ncaseless: false\ndiplomatic:\n \u017f: s\n u: v\n",
            encoding="utf-8",
        )
        try:
            loaded = NormalizationProfile.from_yaml(yaml_file)
        except RuntimeError as exc:
            # Only a RuntimeError mentioning "pyyaml" is treated as a missing dependency.
            if "pyyaml" in str(exc):
                pytest.skip("pyyaml non installé")
            raise
        assert loaded.name == "my_profile"
        assert loaded.diplomatic_table == {"\u017f": "s", "u": "v"}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class TestApplyDiplomaticTable:
    """Tests for the ``_apply_diplomatic_table`` substitution helper."""

    def test_simple_substitutions(self):
        """Single-character keys are replaced wherever they occur."""
        mapping = {"ſ": "s", "u": "v"}
        # No "u" in the first input, so only the long s is rewritten.
        assert _apply_diplomatic_table("maiſon", mapping) == "maison"
        # In "uers" the "u" is rewritten to "v".
        assert _apply_diplomatic_table("uers", mapping) == "vers"

    def test_multi_char_key_priority(self):
        """Multi-character keys are applied before single-character ones."""
        mapping = {"ae": "X", "a": "Y"}
        # "ae" must become "X"; applying "a" first would give "Ye".
        replaced = _apply_diplomatic_table("aeb", mapping)
        assert replaced == "Xb"

    def test_ampersand_to_et(self):
        """A one-character key may expand to a longer replacement."""
        replaced = _apply_diplomatic_table("noir & blanc", {"&": "et"})
        assert replaced == "noir et blanc"

    def test_empty_table(self):
        """An empty table leaves the text untouched."""
        assert _apply_diplomatic_table("hello", {}) == "hello"

    def test_empty_text(self):
        """An empty text stays empty whatever the table holds."""
        assert _apply_diplomatic_table("", {"a": "b"}) == ""
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class TestGetBuiltinProfile:
    """Tests for the named profiles served by ``get_builtin_profile``."""

    def test_medieval_french(self):
        """The medieval French profile keeps its name and maps long s to "s"."""
        builtin = get_builtin_profile("medieval_french")
        assert builtin.name == "medieval_french"
        assert "ſ" in builtin.diplomatic_table
        assert builtin.diplomatic_table["ſ"] == "s"

    def test_early_modern_french(self):
        """The early modern French profile has an entry for long s."""
        builtin = get_builtin_profile("early_modern_french")
        assert "ſ" in builtin.diplomatic_table

    def test_medieval_latin(self):
        """The medieval Latin profile has an entry for the "ꝑ" abbreviation sign."""
        builtin = get_builtin_profile("medieval_latin")
        assert "ꝑ" in builtin.diplomatic_table

    def test_minimal(self):
        """The minimal profile handles long s but has no "u" entry."""
        table = get_builtin_profile("minimal").diplomatic_table
        assert "ſ" in table
        assert "u" not in table

    def test_nfc(self):
        """The "nfc" profile enables NFC and carries an empty table."""
        builtin = get_builtin_profile("nfc")
        assert builtin.nfc is True
        assert builtin.diplomatic_table == {}

    def test_caseless(self):
        """The "caseless" profile has the caseless flag set."""
        assert get_builtin_profile("caseless").caseless is True

    def test_unknown_raises_key_error(self):
        """An unknown name raises ``KeyError`` whose message contains that name."""
        with pytest.raises(KeyError, match="inexistant"):
            get_builtin_profile("inexistant")

    def test_default_profile_is_medieval_french(self):
        """The module-level default profile is the medieval French one."""
        assert DEFAULT_DIPLOMATIC_PROFILE.name == "medieval_french"
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ===========================================================================
|
| 185 |
+
# Tests CER diplomatique dans compute_metrics
|
| 186 |
+
# ===========================================================================
|
| 187 |
+
|
| 188 |
+
class TestDiplomaticCER:
    """Tests for the diplomatic CER reported by ``compute_metrics`` and ``aggregate_metrics``."""

    @staticmethod
    def _flat_result(rate, cer_diplomatic, profile_name):
        """Build a ``MetricsResult`` whose plain error rates all equal ``rate``.

        Both lengths are fixed at 10; only the diplomatic fields vary per test.
        """
        return MetricsResult(
            cer=rate, cer_nfc=rate, cer_caseless=rate,
            wer=rate, wer_normalized=rate, mer=rate, wil=rate,
            reference_length=10, hypothesis_length=10,
            cer_diplomatic=cer_diplomatic, diplomatic_profile_name=profile_name,
        )

    def test_cer_diplomatic_computed_by_default(self):
        """Without an explicit profile, the diplomatic CER uses "medieval_french"."""
        result = compute_metrics("maiſon", "maison")
        assert result.cer_diplomatic is not None
        assert result.diplomatic_profile_name == "medieval_french"

    def test_cer_diplomatic_lower_than_exact_for_long_s(self):
        """With the long-s mapping, "maiſon" vs "maison" has a diplomatic CER of 0.

        The two strings differ only by "ſ" versus "s", so they are expected to
        become identical once the default diplomatic profile is applied.
        """
        result = compute_metrics("maiſon", "maison")
        # Raw CER is positive: "ſ" and "s" are different characters.
        assert result.cer > 0.0
        # Diplomatic CER is zero: the default profile treats "ſ" and "s" as equivalent.
        assert result.cer_diplomatic == pytest.approx(0.0)

    def test_cer_diplomatic_in_as_dict(self):
        """``as_dict`` exposes both the diplomatic CER and the profile name."""
        exported = compute_metrics("maiſon", "maison").as_dict()
        assert "cer_diplomatic" in exported
        assert "diplomatic_profile_name" in exported

    def test_cer_diplomatic_with_custom_profile(self):
        """A profile passed via ``normalization_profile`` is used and its name reported."""
        # NormalizationProfile comes from the module-level import; no local import needed.
        profile = NormalizationProfile(
            name="test_profile",
            diplomatic_table={"ſ": "s"},
        )
        result = compute_metrics("maiſon", "maison", normalization_profile=profile)
        assert result.cer_diplomatic == pytest.approx(0.0)
        assert result.diplomatic_profile_name == "test_profile"

    def test_cer_diplomatic_not_in_as_dict_when_none(self):
        """When the diplomatic CER is ``None`` it is left out of ``as_dict``."""
        exported = self._flat_result(0.1, None, None).as_dict()
        assert "cer_diplomatic" not in exported

    def test_aggregate_metrics_includes_diplomatic_cer(self):
        """``aggregate_metrics`` aggregates ``cer_diplomatic`` when it is available."""
        results = [
            self._flat_result(0.1, 0.05, "medieval_french"),
            self._flat_result(0.2, 0.10, "medieval_french"),
        ]
        agg = aggregate_metrics(results)
        assert "cer_diplomatic" in agg
        # Mean of 0.05 and 0.10.
        assert agg["cer_diplomatic"]["mean"] == pytest.approx(0.075)
        assert agg["cer_diplomatic"].get("profile") == "medieval_french"
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ===========================================================================
|
| 258 |
+
# Tests parse_page_selector
|
| 259 |
+
# ===========================================================================
|
| 260 |
+
|
| 261 |
+
class TestParsePageSelector:
    """Tests for ``parse_page_selector``: 1-based selectors in, 0-based indices out."""

    def test_all(self):
        """The keyword "all" selects every page."""
        assert parse_page_selector("all", 10) == list(range(10))

    def test_empty_string(self):
        """An empty selector also selects every page."""
        assert parse_page_selector("", 5) == list(range(5))

    def test_single_page(self):
        """Page "3" is returned as index 2 (0-based)."""
        assert parse_page_selector("3", 10) == [2]

    def test_range(self):
        """A dash range includes both of its bounds."""
        assert parse_page_selector("1-5", 10) == [0, 1, 2, 3, 4]

    def test_comma_list(self):
        """Comma-separated pages are each converted."""
        assert parse_page_selector("1,3,5", 10) == [0, 2, 4]

    def test_combined(self):
        """Ranges and single pages may be mixed in one selector."""
        selected = parse_page_selector("1-3,5,8-9", 10)
        assert selected == [0, 1, 2, 4, 7, 8]

    def test_deduplication(self):
        """A page listed twice appears once in the output."""
        selected = parse_page_selector("1,1,2", 5)
        assert selected == [0, 1]

    def test_sorted_output(self):
        """Indices come back in ascending order whatever the input order."""
        selected = parse_page_selector("5,1,3", 10)
        assert selected == [0, 2, 4]

    def test_page_out_of_range_raises(self):
        """A page beyond the total raises ``ValueError``."""
        with pytest.raises(ValueError):
            parse_page_selector("15", 10)

    def test_range_out_of_bounds_raises(self):
        """A range ending beyond the total raises ``ValueError``."""
        with pytest.raises(ValueError):
            parse_page_selector("1-15", 10)

    def test_invalid_syntax_raises(self):
        """A non-numeric selector raises."""
        # NOTE(review): any Exception subclass satisfies this check; consider
        # narrowing to ValueError once the parser's error type is confirmed.
        with pytest.raises(Exception):
            parse_page_selector("abc", 10)

    def test_last_page(self):
        """The last valid page number maps to index ``total - 1``."""
        assert parse_page_selector("10", 10) == [9]

    def test_first_page(self):
        """Page "1" maps to index 0."""
        assert parse_page_selector("1", 10) == [0]
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# ===========================================================================
|
| 310 |
+
# Tests IIIFManifestParser — IIIF v2
|
| 311 |
+
# ===========================================================================
|
| 312 |
+
|
| 313 |
+
def _make_v2_manifest(num_canvases: int = 3, with_service: bool = False) -> dict:
|
| 314 |
+
"""Fabrique un manifeste IIIF v2 minimal de test."""
|
| 315 |
+
canvases = []
|
| 316 |
+
for i in range(num_canvases):
|
| 317 |
+
resource: dict
|
| 318 |
+
if with_service:
|
| 319 |
+
resource = {
|
| 320 |
+
"@type": "dctypes:Image",
|
| 321 |
+
"service": {"@id": f"https://example.com/iiif/img{i+1}"},
|
| 322 |
+
}
|
| 323 |
+
else:
|
| 324 |
+
resource = {
|
| 325 |
+
"@type": "dctypes:Image",
|
| 326 |
+
"@id": f"https://example.com/images/img{i+1}.jpg",
|
| 327 |
+
}
|
| 328 |
+
canvases.append({
|
| 329 |
+
"@id": f"https://example.com/canvas/{i+1}",
|
| 330 |
+
"@type": "sc:Canvas",
|
| 331 |
+
"label": f"f. {i+1}r",
|
| 332 |
+
"width": 2000,
|
| 333 |
+
"height": 3000,
|
| 334 |
+
"images": [
|
| 335 |
+
{
|
| 336 |
+
"@type": "oa:Annotation",
|
| 337 |
+
"motivation": "sc:painting",
|
| 338 |
+
"resource": resource,
|
| 339 |
+
"on": f"https://example.com/canvas/{i+1}",
|
| 340 |
+
}
|
| 341 |
+
],
|
| 342 |
+
})
|
| 343 |
+
return {
|
| 344 |
+
"@context": "http://iiif.io/api/presentation/2/context.json",
|
| 345 |
+
"@type": "sc:Manifest",
|
| 346 |
+
"@id": "https://example.com/manifest.json",
|
| 347 |
+
"label": "Manuscript de test",
|
| 348 |
+
"sequences": [
|
| 349 |
+
{
|
| 350 |
+
"@type": "sc:Sequence",
|
| 351 |
+
"canvases": canvases,
|
| 352 |
+
}
|
| 353 |
+
],
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def _make_v3_manifest(num_canvases: int = 3) -> dict:
|
| 358 |
+
"""Fabrique un manifeste IIIF v3 minimal de test."""
|
| 359 |
+
items = []
|
| 360 |
+
for i in range(num_canvases):
|
| 361 |
+
items.append({
|
| 362 |
+
"id": f"https://example.com/canvas/{i+1}",
|
| 363 |
+
"type": "Canvas",
|
| 364 |
+
"label": {"fr": [f"Page {i+1}"]},
|
| 365 |
+
"width": 1500,
|
| 366 |
+
"height": 2200,
|
| 367 |
+
"items": [
|
| 368 |
+
{
|
| 369 |
+
"id": f"https://example.com/canvas/{i+1}/ap",
|
| 370 |
+
"type": "AnnotationPage",
|
| 371 |
+
"items": [
|
| 372 |
+
{
|
| 373 |
+
"id": f"https://example.com/canvas/{i+1}/ap/a",
|
| 374 |
+
"type": "Annotation",
|
| 375 |
+
"motivation": "painting",
|
| 376 |
+
"body": {
|
| 377 |
+
"id": f"https://example.com/images/{i+1}/full/max/0/default.jpg",
|
| 378 |
+
"type": "Image",
|
| 379 |
+
"format": "image/jpeg",
|
| 380 |
+
},
|
| 381 |
+
"target": f"https://example.com/canvas/{i+1}",
|
| 382 |
+
}
|
| 383 |
+
],
|
| 384 |
+
}
|
| 385 |
+
],
|
| 386 |
+
})
|
| 387 |
+
return {
|
| 388 |
+
"@context": "http://iiif.io/api/presentation/3/context.json",
|
| 389 |
+
"id": "https://example.com/manifest.json",
|
| 390 |
+
"type": "Manifest",
|
| 391 |
+
"label": {"fr": ["Manuscrit v3 de test"]},
|
| 392 |
+
"items": items,
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
class TestIIIFManifestParserV2:
    """Tests for ``IIIFManifestParser`` fed with IIIF v2 manifests."""

    def test_version_detection(self):
        """A manifest carrying the v2 context is reported as version 2."""
        v2_parser = IIIFManifestParser(_make_v2_manifest())
        assert v2_parser.version == 2

    def test_canvases_count(self):
        """Every canvas of the sequence is returned."""
        found = IIIFManifestParser(_make_v2_manifest(5)).canvases()
        assert len(found) == 5

    def test_canvas_label(self):
        """Canvas labels are taken from the manifest as plain strings."""
        found = IIIFManifestParser(_make_v2_manifest()).canvases()
        assert found[0].label == "f. 1r"
        assert found[1].label == "f. 2r"

    def test_canvas_image_url_direct(self):
        """Without a service, the resource ``@id`` is used as the image URL."""
        found = IIIFManifestParser(_make_v2_manifest()).canvases()
        assert found[0].image_url == "https://example.com/images/img1.jpg"

    def test_canvas_image_url_via_service(self):
        """With a service, the URL contains the full-size IIIF image request path."""
        found = IIIFManifestParser(_make_v2_manifest(with_service=True)).canvases()
        assert "/full/max/0/default.jpg" in found[0].image_url

    def test_canvas_dimensions(self):
        """Width and height are copied from the canvas."""
        first = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert first.width == 2000
        assert first.height == 3000

    def test_canvas_index(self):
        """Each canvas records its position in the sequence."""
        found = IIIFManifestParser(_make_v2_manifest(3)).canvases()
        for position, canvas in enumerate(found):
            assert canvas.index == position

    def test_label(self):
        """The manifest label is exposed on the parser."""
        v2_parser = IIIFManifestParser(_make_v2_manifest())
        assert v2_parser.label == "Manuscript de test"

    def test_empty_sequences(self):
        """A manifest with no sequences yields no canvases."""
        empty_manifest = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:Manifest",
            "label": "Empty",
            "sequences": [],
        }
        assert IIIFManifestParser(empty_manifest).canvases() == []
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
class TestIIIFManifestParserV3:
    """Tests for ``IIIFManifestParser`` fed with IIIF v3 manifests."""

    def test_version_detection(self):
        """A manifest carrying the v3 context is reported as version 3."""
        v3_parser = IIIFManifestParser(_make_v3_manifest())
        assert v3_parser.version == 3

    def test_canvases_count(self):
        """Every canvas listed in ``items`` is returned."""
        found = IIIFManifestParser(_make_v3_manifest(4)).canvases()
        assert len(found) == 4

    def test_canvas_label_from_language_map(self):
        """A language-map label is resolved to text containing the page label."""
        found = IIIFManifestParser(_make_v3_manifest()).canvases()
        assert "Page 1" in found[0].label

    def test_canvas_image_url(self):
        """The image URL is derived from the painting annotation body."""
        found = IIIFManifestParser(_make_v3_manifest()).canvases()
        assert "default.jpg" in found[0].image_url

    def test_manifest_label_language_map(self):
        """The manifest-level language-map label is resolved to its text."""
        lowered = IIIFManifestParser(_make_v3_manifest()).label.lower()
        assert "v3" in lowered or "test" in lowered

    def test_type_manifest_triggers_v3(self):
        """A manifest with ``type == "Manifest"`` is detected as v3."""
        bare_manifest = {"type": "Manifest", "items": []}
        assert IIIFManifestParser(bare_manifest).version == 3
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
class TestExtractLabel:
    """Tests for ``_extract_label`` across the label shapes it accepts."""

    def test_string(self):
        """A plain string is returned unchanged."""
        assert _extract_label("Page 1") == "Page 1"

    def test_list(self):
        """For a list, the first element is returned."""
        labels = ["Page 1", "Page 2"]
        assert _extract_label(labels) == "Page 1"

    def test_dict_fr(self):
        """A language map with a "fr" key yields its first value."""
        assert _extract_label({"fr": ["Folio 1r"]}) == "Folio 1r"

    def test_dict_en(self):
        """A language map with an "en" key yields its first value."""
        assert _extract_label({"en": ["Folio 1r"]}) == "Folio 1r"

    def test_dict_none_key(self):
        """A language map with the "none" key yields its first value."""
        assert _extract_label({"none": ["Label"]}) == "Label"

    def test_empty_string(self):
        """An empty string stays empty."""
        assert _extract_label("") == ""

    def test_none_value(self):
        """``None`` still produces a string result."""
        extracted = _extract_label(None)
        assert isinstance(extracted, str)
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
class TestBestImageUrlV2:
    """Tests for ``_best_image_url_v2`` on v2 image resources."""

    def test_direct_id(self):
        """Without a service, the resource ``@id`` is the image URL."""
        image_resource = {"@id": "https://example.com/img.jpg"}
        assert _best_image_url_v2(image_resource, {}) == "https://example.com/img.jpg"

    def test_service_id(self):
        """A service ``@id`` takes precedence and gets the full-size request suffix."""
        image_resource = {
            "@id": "https://example.com/info.json",
            "service": {"@id": "https://example.com/iiif/img1"},
        }
        resolved = _best_image_url_v2(image_resource, {})
        assert resolved == "https://example.com/iiif/img1/full/max/0/default.jpg"

    def test_service_list(self):
        """A service given as a list is handled like a single service."""
        image_resource = {
            "service": [
                {"@id": "https://example.com/iiif/img2"},
            ]
        }
        resolved = _best_image_url_v2(image_resource, {})
        assert resolved == "https://example.com/iiif/img2/full/max/0/default.jpg"
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
class TestBestImageUrlV3:
    """Tests for ``_best_image_url_v3`` on v3 canvases."""

    def test_direct_body_image(self):
        """An annotation body with an ``id`` yields that URL."""
        image_body = {
            "id": "https://example.com/img.jpg",
            "type": "Image",
        }
        painting = {
            "type": "Annotation",
            "motivation": "painting",
            "body": image_body,
        }
        v3_canvas = {
            "items": [
                {
                    "type": "AnnotationPage",
                    "items": [painting],
                }
            ]
        }
        assert _best_image_url_v3(v3_canvas) == "https://example.com/img.jpg"

    def test_body_via_service(self):
        """With an empty body ``id``, the URL is built from the body's service."""
        image_body = {
            "type": "Image",
            "id": "",
            "service": [{"id": "https://example.com/iiif/3/img1"}],
        }
        v3_canvas = {"items": [{"items": [{"body": image_body}]}]}
        resolved = _best_image_url_v3(v3_canvas)
        assert "/full/max/0/default.jpg" in resolved

    def test_empty_canvas(self):
        """A canvas without items yields an empty URL."""
        assert _best_image_url_v3({}) == ""
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
class TestGuessExtension:
    """Tests for ``_guess_extension`` on image URLs."""

    def test_jpg(self):
        """A ".jpg" URL yields ".jpg"."""
        guessed = _guess_extension("https://example.com/img.jpg")
        assert guessed == ".jpg"

    def test_png(self):
        """A ".png" URL yields ".png"."""
        guessed = _guess_extension("https://example.com/img.png")
        assert guessed == ".png"

    def test_tiff(self):
        """A ".tiff" URL yields ".tiff"."""
        guessed = _guess_extension("https://example.com/img.tiff")
        assert guessed == ".tiff"

    def test_iiif_default(self):
        """A standard IIIF image URL ending in /default.jpg yields ".jpg"."""
        iiif_url = "https://example.com/iiif/img/full/max/0/default.jpg"
        assert _guess_extension(iiif_url) == ".jpg"

    def test_unknown_defaults_to_jpg(self):
        """A URL without a recognisable extension falls back to ".jpg"."""
        guessed = _guess_extension("https://example.com/resource/123")
        assert guessed == ".jpg"
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
class TestSlugify:
    """Tests for ``_slugify``, which turns labels into slugs."""

    def test_simple(self):
        """A space is replaced by an underscore."""
        slug = _slugify("Page 1")
        assert slug == "Page_1"

    def test_special_chars_removed(self):
        """Slashes and dots never appear in the slug."""
        slug = _slugify("f. 1r (recto)")
        assert "/" not in slug
        assert "." not in slug

    def test_max_length(self):
        """The slug is capped at 60 characters."""
        slug = _slugify("x" * 100)
        assert len(slug) <= 60

    def test_empty(self):
        """An empty label gives an empty slug."""
        assert _slugify("") == ""
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
# ===========================================================================
|
| 618 |
+
# Tests structure des nouveaux moteurs OCR (sans appel réseau)
|
| 619 |
+
# ===========================================================================
|
| 620 |
+
|
| 621 |
+
class TestMistralOCREngine:
    """Structural tests for ``MistralOCREngine``; no network call is made."""

    def test_import(self):
        """The engine class can be imported from its own module."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        assert MistralOCREngine is not None

    def test_name(self):
        """The engine identifies itself as "mistral_ocr"."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        assert MistralOCREngine().name == "mistral_ocr"

    def test_version_default_model(self):
        """With no configuration, the version string mentions "pixtral"."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        reported = MistralOCREngine().version()
        assert "pixtral" in reported

    def test_version_custom_model(self):
        """A configured model name is returned verbatim as the version."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        configured = MistralOCREngine({"model": "pixtral-large-latest"})
        assert configured.version() == "pixtral-large-latest"

    def test_missing_api_key_raises(self, monkeypatch, tmp_path):
        """Running OCR without ``MISTRAL_API_KEY`` raises a ``RuntimeError`` naming it."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        # Dummy image file: just the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        engine = MistralOCREngine()
        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import MistralOCREngine

        assert MistralOCREngine is not None
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
class TestGoogleVisionEngine:
    """Structural tests for ``GoogleVisionEngine``; no network call is made."""

    def test_import(self):
        """The engine class can be imported from its own module."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine is not None

    def test_name(self):
        """The engine identifies itself as "google_vision"."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine().name == "google_vision"

    def test_version(self):
        """The reported version is "v1"."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine().version() == "v1"

    def test_missing_credentials_raises(self, monkeypatch, tmp_path):
        """Running OCR with neither credential variable set raises ``RuntimeError``."""
        from picarones.engines.google_vision import GoogleVisionEngine

        for variable in ("GOOGLE_APPLICATION_CREDENTIALS", "GOOGLE_API_KEY"):
            monkeypatch.delenv(variable, raising=False)
        # Dummy image file: just the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        engine = GoogleVisionEngine()
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import GoogleVisionEngine

        assert GoogleVisionEngine is not None
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
class TestAzureDocIntelEngine:
    """Structural tests for ``AzureDocIntelEngine``; no network call is made."""

    def test_import(self):
        """The engine class can be imported from its own module."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        assert AzureDocIntelEngine is not None

    def test_name(self):
        """The engine identifies itself as "azure_doc_intel"."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        assert AzureDocIntelEngine().name == "azure_doc_intel"

    def test_missing_key_raises(self, monkeypatch, tmp_path):
        """Running OCR with neither the key nor the endpoint set raises ``RuntimeError``."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        for variable in ("AZURE_DOC_INTEL_KEY", "AZURE_DOC_INTEL_ENDPOINT"):
            monkeypatch.delenv(variable, raising=False)
        # Dummy image file: just the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        engine = AzureDocIntelEngine()
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import AzureDocIntelEngine

        assert AzureDocIntelEngine is not None
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
# ===========================================================================
|
| 715 |
+
# Tests CLI — commande import iiif
|
| 716 |
+
# ===========================================================================
|
| 717 |
+
|
| 718 |
+
class TestCLIImportIIIF:
    """Tests for the ``import iiif`` CLI command, driven through Click's test runner."""

    @staticmethod
    def _invoke(*arguments):
        """Run the ``picarones`` CLI with ``arguments`` and return the Click result."""
        from click.testing import CliRunner
        from picarones.cli import cli

        return CliRunner().invoke(cli, list(arguments))

    def test_import_group_exists(self):
        """``import --help`` exits successfully."""
        outcome = self._invoke("import", "--help")
        assert outcome.exit_code == 0

    def test_import_iiif_command_exists(self):
        """``import iiif --help`` succeeds and mentions the manifest URL argument."""
        outcome = self._invoke("import", "iiif", "--help")
        assert outcome.exit_code == 0
        # The lower-cased comparison already covers the upper-case spelling.
        assert "manifest_url" in outcome.output.lower()

    def test_import_iiif_options(self):
        """The help text lists the ``--pages`` and ``--output`` options."""
        help_text = self._invoke("import", "iiif", "--help").output
        assert "--pages" in help_text
        assert "--output" in help_text

    def test_import_iiif_requires_url(self):
        """Calling the command without a URL must fail."""
        outcome = self._invoke("import", "iiif")
        assert outcome.exit_code != 0
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
# ===========================================================================
|
| 753 |
+
# Tests fixtures Sprint 4 (CER diplomatique dans la démo)
|
| 754 |
+
# ===========================================================================
|
| 755 |
+
|
| 756 |
+
class TestFixturesDiplomaticCER:
    """Tests that the demo fixtures exercise the diplomatic CER."""

    def test_gt_texts_contain_medieval_graphies(self):
        """The demo ground-truth texts contain at least one medieval graph."""
        from picarones.fixtures import _GT_TEXTS

        joined_gt = " ".join(_GT_TEXTS)
        # At least one of long s, ampersand or the ae/oe ligatures must occur.
        medieval_signs = ["ſ", "&", "æ", "œ"]
        has_medieval_chars = any(sign in joined_gt for sign in medieval_signs)
        assert has_medieval_chars, "Les GT de démo doivent inclure des graphies médiévales pour illustrer le CER diplomatique"

    def test_benchmark_results_have_diplomatic_cer(self):
        """The sample benchmark results carry a diplomatic CER."""
        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark()
        for report in benchmark.engine_reports:
            # One error-free document per engine is enough for this check.
            # NOTE(review): assumes the original `break` sat inside the `if`;
            # the flattened source does not show its nesting. Confirm.
            first_ok = next(
                (doc for doc in report.document_results if doc.metrics.error is None),
                None,
            )
            if first_ok is None:
                continue
            assert first_ok.metrics.cer_diplomatic is not None, (
                f"CER diplomatique manquant pour {report.engine_name}"
            )

    def test_diplomatic_cer_lower_for_medieval_graphies(self):
        """For a text with long s, the diplomatic CER does not exceed the exact CER."""
        medieval_gt = "maiſon & jardin"  # ground truth with medieval graphs
        modern_ocr = "maison et jardin"  # OCR output with modernised spelling
        result = compute_metrics(medieval_gt, modern_ocr)
        assert result.cer_diplomatic is not None
        assert result.cer_diplomatic <= result.cer
|
| 788 |
+
|
| 789 |
+
|
| 790 |
+
# ===========================================================================
|
| 791 |
+
# Tests rapport HTML Sprint 4 (CER diplomatique affiché)
|
| 792 |
+
# ===========================================================================
|
| 793 |
+
|
| 794 |
+
class TestReportDiplomaticCER:
    """Tests that the HTML report exposes the diplomatic CER."""

    @staticmethod
    def _render_sample_report(tmp_path):
        """Generate the sample benchmark report under ``tmp_path`` and return its HTML."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        benchmark = generate_sample_benchmark()
        destination = tmp_path / "report_test.html"
        ReportGenerator(benchmark).generate(destination)
        return destination.read_text(encoding="utf-8")

    def test_report_data_has_cer_diplomatic(self):
        """``_build_report_data`` includes ``cer_diplomatic`` for every engine entry."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        report_data = _build_report_data(generate_sample_benchmark(), images_b64={})

        # The key must be present for each engine; this test does not check its value.
        assert "engines" in report_data
        for engine_data in report_data["engines"]:
            assert "cer_diplomatic" in engine_data, (
                f"cer_diplomatic manquant dans {engine_data.get('name', '?')}"
            )

    def test_html_contains_cer_diplo_column(self, tmp_path):
        """The generated HTML mentions the diplomatic CER."""
        html = self._render_sample_report(tmp_path)
        # "diplomatique" contains "diplo", so one substring check covers both.
        assert "diplo" in html.lower(), (
            "Le rapport HTML doit mentionner le CER diplomatique"
        )

    def test_html_contains_medieval_graphie_indicator(self, tmp_path):
        """The report mentions the medieval graph equivalences (ſ=s or u=v)."""
        html = self._render_sample_report(tmp_path)
        # Any of the two equivalence hints or the word "diplomatique" is accepted.
        assert "ſ=s" in html or "u=v" in html or "diplomatique" in html.lower()
|
tests/test_sprint5_advanced_metrics.py
ADDED
|
@@ -0,0 +1,876 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 5 : métriques avancées patrimoniales.
|
| 2 |
+
|
| 3 |
+
Couvre :
|
| 4 |
+
- Matrice de confusion unicode (confusion.py)
|
| 5 |
+
- Scores ligatures et diacritiques (char_scores.py)
|
| 6 |
+
- Taxonomie des erreurs classes 1-9 (taxonomy.py)
|
| 7 |
+
- Analyse structurelle (structure.py)
|
| 8 |
+
- Qualité image (image_quality.py)
|
| 9 |
+
- Intégration dans les fixtures et le rapport HTML
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
# ===========================================================================
|
| 17 |
+
# Tests ConfusionMatrix
|
| 18 |
+
# ===========================================================================
|
| 19 |
+
|
| 20 |
+
from picarones.core.confusion import (
|
| 21 |
+
ConfusionMatrix,
|
| 22 |
+
EMPTY_CHAR,
|
| 23 |
+
build_confusion_matrix,
|
| 24 |
+
aggregate_confusion_matrices,
|
| 25 |
+
top_confused_chars,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TestBuildConfusionMatrix:
    """Behaviour of ``build_confusion_matrix`` on small GT/OCR string pairs."""

    def test_identical_texts(self):
        """Identical inputs yield no substitution, insertion or deletion."""
        result = build_confusion_matrix("abc", "abc")
        counters = (
            result.total_substitutions,
            result.total_insertions,
            result.total_deletions,
        )
        assert counters == (0, 0, 0)

    def test_empty_texts(self):
        """Two empty strings produce an error-free matrix."""
        assert build_confusion_matrix("", "").total_errors == 0

    def test_simple_substitution(self):
        """A single replaced character is recorded under its GT character."""
        confusions = build_confusion_matrix("abc", "axc").matrix
        # 'b' was read as 'x'
        assert "b" in confusions
        assert "x" in confusions["b"]
        assert confusions["b"]["x"] >= 1

    def test_deletion_recorded(self):
        """A character missing from the OCR maps to ``EMPTY_CHAR``."""
        confusions = build_confusion_matrix("abc", "ac").matrix
        # 'b' was dropped
        assert "b" in confusions
        assert EMPTY_CHAR in confusions["b"]

    def test_insertion_recorded(self):
        """A character added by the OCR is recorded under ``EMPTY_CHAR``."""
        confusions = build_confusion_matrix("ac", "abc").matrix
        # 'b' was inserted
        assert EMPTY_CHAR in confusions
        assert "b" in confusions[EMPTY_CHAR]

    def test_no_whitespace_recorded_by_default(self):
        """With default arguments, spaces must not appear as a GT key."""
        result = build_confusion_matrix("a b", "a x")
        assert " " not in result.matrix

    def test_as_dict_structure(self):
        """``as_dict`` exposes the matrix and the three error totals."""
        payload = build_confusion_matrix("hello", "hallo").as_dict()
        for key in (
            "matrix",
            "total_substitutions",
            "total_insertions",
            "total_deletions",
        ):
            assert key in payload

    def test_top_confusions(self):
        """The most frequent confusion is listed first with its count."""
        ranking = build_confusion_matrix("eeee", "aaaa").top_confusions(n=5)
        assert len(ranking) >= 1
        leader = ranking[0]
        assert leader["gt"] == "e"
        assert leader["ocr"] == "a"
        assert leader["count"] == 4

    def test_medieval_chars_tracked(self):
        """The long s (ſ) read as 'f' is tracked like any other character."""
        confusions = build_confusion_matrix("maiſon", "maifon").matrix
        assert "ſ" in confusions
        assert "f" in confusions["ſ"]

    def test_as_compact_dict_filters_low_count(self):
        """``min_count=2`` drops every confusion seen only once."""
        compact = build_confusion_matrix("aab", "axb").as_compact_dict(min_count=2)
        # The single 'a' -> 'x' occurrence must have been filtered out, so
        # every remaining cell has a count of at least 2.
        for row in compact["matrix"].values():
            for count in row.values():
                assert count >= 2
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class TestAggregateConfusionMatrices:
    """Behaviour of ``aggregate_confusion_matrices`` when merging matrices."""

    def test_empty_list(self):
        """Aggregating nothing gives a matrix with zero errors."""
        assert aggregate_confusion_matrices([]).total_errors == 0

    def test_single_matrix(self):
        """Aggregating one matrix reproduces its cells unchanged."""
        only = build_confusion_matrix("abc", "axc")
        merged = aggregate_confusion_matrices([only])
        assert merged.matrix == only.matrix

    def test_counts_sum(self):
        """The same confusion seen in two matrices is counted twice."""
        pair = [build_confusion_matrix("abc", "axc") for _ in range(2)]
        merged = aggregate_confusion_matrices(pair)
        # 'b' -> 'x' occurs once per input matrix
        row = merged.matrix.get("b", {})
        assert row.get("x", 0) >= 2

    def test_total_errors_sum(self):
        """The aggregated error total is at least the sum of the inputs."""
        first = build_confusion_matrix("abc", "axc")
        second = build_confusion_matrix("def", "dxf")
        merged = aggregate_confusion_matrices([first, second])
        expected_minimum = first.total_errors + second.total_errors
        assert merged.total_errors >= expected_minimum
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class TestTopConfusedChars:
    """Behaviour of ``top_confused_chars`` (ranking of error-prone characters)."""

    def test_returns_list(self):
        """The ranking is returned as a plain list."""
        matrix = build_confusion_matrix("aaabbb", "aaaxxx")
        assert isinstance(top_confused_chars(matrix, n=5), list)

    def test_sorted_by_errors_desc(self):
        """Entries are ordered by decreasing ``total_errors``."""
        merged = aggregate_confusion_matrices([
            build_confusion_matrix("bbb", "xxx"),  # three confusions
            build_confusion_matrix("a", "y"),      # one confusion
        ])
        ranking = top_confused_chars(merged, n=10)
        # The ordering is only checked when at least two entries came back.
        if len(ranking) < 2:
            return
        assert ranking[0]["total_errors"] >= ranking[1]["total_errors"]

    def test_excludes_empty_char(self):
        """``exclude_empty=True`` keeps ``EMPTY_CHAR`` out of the ranking."""
        matrix = build_confusion_matrix("abc", "ac")  # 'b' deleted
        ranking = top_confused_chars(matrix, exclude_empty=True)
        assert not any(entry["char"] == EMPTY_CHAR for entry in ranking)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ===========================================================================
|
| 147 |
+
# Tests LigatureScore
|
| 148 |
+
# ===========================================================================
|
| 149 |
+
|
| 150 |
+
from picarones.core.char_scores import (
|
| 151 |
+
LIGATURE_TABLE,
|
| 152 |
+
DIACRITIC_MAP,
|
| 153 |
+
LigatureScore,
|
| 154 |
+
DiacriticScore,
|
| 155 |
+
compute_ligature_score,
|
| 156 |
+
compute_diacritic_score,
|
| 157 |
+
aggregate_ligature_scores,
|
| 158 |
+
aggregate_diacritic_scores,
|
| 159 |
+
_ALL_LIGATURES,
|
| 160 |
+
_ALL_DIACRITICS,
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class TestLigatureTable:
    """Sanity checks on the content of ``LIGATURE_TABLE``."""

    def test_fi_ligature_present(self):
        """U+FB01 (fi ligature) is a key of the table."""
        assert "\uFB01" in LIGATURE_TABLE

    def test_fl_ligature_present(self):
        """U+FB02 (fl ligature) is a key of the table."""
        assert "\uFB02" in LIGATURE_TABLE

    def test_oe_ligature_present(self):
        """U+0153 (œ) is a key of the table."""
        assert "\u0153" in LIGATURE_TABLE

    def test_ae_ligature_present(self):
        """U+00E6 (æ) is a key of the table."""
        assert "\u00E6" in LIGATURE_TABLE

    def test_ff_ligature_present(self):
        """U+FB00 (ff ligature) is a key of the table."""
        assert "\uFB00" in LIGATURE_TABLE

    def test_equivalents_are_lists(self):
        """Every ligature maps to a non-empty list of equivalents."""
        for equivalents in LIGATURE_TABLE.values():
            assert isinstance(equivalents, list)
            assert len(equivalents) >= 1
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class TestComputeLigatureScore:
    """Behaviour of ``compute_ligature_score`` on GT/OCR string pairs.

    Ligature code points are written as ``\\uXXXX`` escapes whenever the
    test depends on them. A literal single-code-point ligature such as
    U+FB01 is easily folded to plain "fi" by Unicode compatibility
    normalisation (editors, copy/paste, web rendering), which would turn
    the ground truth into ASCII and make ``total_in_gt == 1`` impossible.
    """

    def test_no_ligatures_in_gt(self):
        """Without any ligature in the GT the score is 1.0 and nothing is counted."""
        result = compute_ligature_score("bonjour monde", "bonjour monde")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0

    def test_ligature_correctly_recognized(self):
        """A GT ligature transcribed as its decomposed letters counts as correct."""
        # GT holds the fi ligature (U+FB01); the OCR output spells it "fi".
        result = compute_ligature_score("\uFB01n", "fin")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ligature_unicode_to_unicode(self):
        """The same ligature code point on both sides scores 1.0."""
        # GT and OCR both carry U+FB01.
        result = compute_ligature_score("\uFB01n", "\uFB01n")
        assert result.score == pytest.approx(1.0)

    def test_oe_ligature(self):
        """'œ' transcribed as 'oe' is accepted as an equivalent."""
        result = compute_ligature_score("œuvre", "oeuvre")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ae_ligature(self):
        """'æ' transcribed as 'ae' is accepted as an equivalent."""
        result = compute_ligature_score("æther", "aether")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """``as_dict`` exposes the totals, the score and the per-ligature detail."""
        # Use a GT that really contains a ligature so the detail is not trivial.
        payload = compute_ligature_score("\uFB01n", "fin").as_dict()
        for key in ("total_in_gt", "correctly_recognized", "score", "per_ligature"):
            assert key in payload

    def test_empty_texts(self):
        """Two empty strings score 1.0 with no ligature counted."""
        result = compute_ligature_score("", "")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class TestComputeDiacriticScore:
    """Behaviour of ``compute_diacritic_score`` on GT/OCR string pairs."""

    def test_no_diacritics(self):
        """A GT without diacritics scores 1.0 and counts nothing."""
        outcome = compute_diacritic_score("bonjour", "bonjour")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.total_in_gt == 0

    def test_accent_preserved(self):
        """When every accent is kept, all GT diacritics are recognised."""
        outcome = compute_diacritic_score("été", "été")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.correctly_recognized == outcome.total_in_gt

    def test_accent_lost(self):
        """Dropping an accent (é -> e) lowers the score below 1.0."""
        outcome = compute_diacritic_score("étude", "etude")
        assert outcome.total_in_gt >= 1
        # The diacritic is lost, so fewer are recognised than expected.
        assert outcome.correctly_recognized < outcome.total_in_gt
        assert outcome.score < 1.0

    def test_cedille_tracked(self):
        """The cedilla is tracked like any other diacritic."""
        outcome = compute_diacritic_score("façon", "facon")
        assert outcome.total_in_gt >= 1
        assert outcome.score < 1.0

    def test_empty_texts(self):
        """Two empty strings score 1.0."""
        assert compute_diacritic_score("", "").score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """``as_dict`` exposes the totals and the score."""
        payload = compute_diacritic_score("été", "ete").as_dict()
        for key in ("total_in_gt", "correctly_recognized", "score"):
            assert key in payload
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
class TestAggregateLigatureScores:
    """Behaviour of ``aggregate_ligature_scores`` over per-document scores."""

    def test_empty_list(self):
        """No input gives a score of 1.0 and zero ligatures."""
        summary = aggregate_ligature_scores([])
        assert summary["score"] == pytest.approx(1.0)
        assert summary["total_in_gt"] == 0

    def test_aggregation(self):
        """Totals are summed and the score is recomputed from the sums."""
        per_document = [
            LigatureScore(total_in_gt=4, correctly_recognized=3, score=0.75),
            LigatureScore(total_in_gt=2, correctly_recognized=2, score=1.0),
        ]
        summary = aggregate_ligature_scores(per_document)
        assert summary["total_in_gt"] == 6
        assert summary["correctly_recognized"] == 5
        # 5 recognised out of 6, not the mean of the two per-document scores.
        assert summary["score"] == pytest.approx(5/6, abs=1e-4)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
class TestAggregateDiacriticScores:
    """Behaviour of ``aggregate_diacritic_scores`` over per-document scores."""

    def test_aggregation(self):
        """GT totals and recognised counts are summed across documents."""
        per_document = [
            DiacriticScore(total_in_gt=10, correctly_recognized=8, score=0.8),
            DiacriticScore(total_in_gt=5, correctly_recognized=5, score=1.0),
        ]
        summary = aggregate_diacritic_scores(per_document)
        assert summary["total_in_gt"] == 15
        assert summary["correctly_recognized"] == 13
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
# ===========================================================================
|
| 292 |
+
# Tests TaxonomyResult
|
| 293 |
+
# ===========================================================================
|
| 294 |
+
|
| 295 |
+
from picarones.core.taxonomy import (
|
| 296 |
+
TaxonomyResult,
|
| 297 |
+
ERROR_CLASSES,
|
| 298 |
+
classify_errors,
|
| 299 |
+
aggregate_taxonomy,
|
| 300 |
+
VISUAL_CONFUSIONS,
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
class TestErrorClasses:
    """Sanity checks on the ``ERROR_CLASSES`` taxonomy constant."""

    def test_nine_classes(self):
        """The taxonomy defines exactly nine error classes."""
        assert len(ERROR_CLASSES) == 9

    def test_class_names(self):
        """A handful of expected class identifiers are present."""
        for name in (
            "visual_confusion",
            "diacritic_error",
            "case_error",
            "ligature_error",
            "lacuna",
        ):
            assert name in ERROR_CLASSES
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
class TestClassifyErrors:
    """Behaviour of ``classify_errors`` (taxonomy of OCR errors)."""

    def test_identical_texts(self):
        """Identical GT and OCR contain no error."""
        assert classify_errors("bonjour monde", "bonjour monde").total_errors == 0

    def test_empty_texts(self):
        """Two empty strings contain no error."""
        assert classify_errors("", "").total_errors == 0

    def test_case_error_detected(self):
        """Lower-cased capitals are counted as case errors."""
        counts = classify_errors("Bonjour Monde", "bonjour monde").counts
        assert counts["case_error"] >= 1

    def test_diacritic_error_detected(self):
        """Lost accents are counted as diacritic errors."""
        counts = classify_errors("été chez nous", "ete chez nous").counts
        assert counts["diacritic_error"] >= 1

    def test_lacuna_detected(self):
        """Missing trailing words are counted as a lacuna."""
        counts = classify_errors("le chat dort paisiblement", "le chat").counts
        assert counts["lacuna"] >= 1

    def test_segmentation_detected(self):
        """Merged words ("hello world" -> "helloworld") must not crash.

        NOTE(review): the ``>= 0`` bound always holds; the merge may also be
        classified as a hapax, so this only checks that the key exists and
        that classification runs.
        """
        counts = classify_errors("hello world test", "helloworld test").counts
        assert counts["segmentation_error"] >= 0

    def test_ligature_error_detected(self):
        """A ligature spelled out as plain letters is not an error.

        NOTE(review): both inputs are identical ASCII here and the ``>= 0``
        bound always holds; presumably the GT was meant to carry the fi
        ligature (U+FB01) - confirm against the upstream file.
        """
        outcome = classify_errors("fin de siècle", "fin de siècle")
        assert outcome.total_errors >= 0

    def test_as_dict_structure(self):
        """``as_dict`` exposes counts, total, distribution and examples."""
        payload = classify_errors("test erreur ici", "test erreur là").as_dict()
        for key in ("counts", "total_errors", "class_distribution", "examples"):
            assert key in payload

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves the total and the counts."""
        original = classify_errors("bonjour monde", "Bonjour monde")
        rebuilt = TaxonomyResult.from_dict(original.as_dict())
        assert rebuilt.total_errors == original.total_errors
        assert rebuilt.counts == original.counts

    def test_class_distribution_sums_to_one(self):
        """A non-empty class distribution sums to 1."""
        distribution = classify_errors("abc def ghi", "xyz uvw rst").class_distribution
        # An empty distribution is accepted as-is.
        if not distribution:
            return
        assert abs(sum(distribution.values()) - 1.0) < 1e-6

    def test_all_classes_in_counts(self):
        """Every class of ``ERROR_CLASSES`` has an entry in ``counts``."""
        counts = classify_errors("test", "teSt").counts
        missing = [name for name in ERROR_CLASSES if name not in counts]
        assert not missing
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
class TestAggregateTaxonomy:
    """Behaviour of ``aggregate_taxonomy`` over per-document results."""

    @staticmethod
    def _counts(visual, diacritic):
        """Build a full ``counts`` mapping with every other class set to 0."""
        counts = {"visual_confusion": visual, "diacritic_error": diacritic}
        counts.update((name, 0) for name in ERROR_CLASSES if name not in counts)
        return counts

    def test_empty(self):
        """Aggregating nothing reports zero errors."""
        assert aggregate_taxonomy([])["total_errors"] == 0

    def test_sums_counts(self):
        """Per-class counts and the error total are summed across results."""
        first = TaxonomyResult(counts=self._counts(2, 1), total_errors=3)
        second = TaxonomyResult(counts=self._counts(1, 3), total_errors=4)

        summary = aggregate_taxonomy([first, second])

        assert summary["counts"]["visual_confusion"] == 3
        assert summary["counts"]["diacritic_error"] == 4
        assert summary["total_errors"] == 7
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
# ===========================================================================
|
| 400 |
+
# Tests StructureResult
|
| 401 |
+
# ===========================================================================
|
| 402 |
+
|
| 403 |
+
from picarones.core.structure import (
|
| 404 |
+
StructureResult,
|
| 405 |
+
analyze_structure,
|
| 406 |
+
aggregate_structure,
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
class TestAnalyzeStructure:
    """Behaviour of ``analyze_structure`` and of ``StructureResult`` helpers."""

    def test_identical_single_line(self):
        """One identical line: one line on each side, no fusion, no split."""
        outcome = analyze_structure("ligne unique", "ligne unique")
        assert (outcome.gt_line_count, outcome.ocr_line_count) == (1, 1)
        assert outcome.line_fusion_count == 0
        assert outcome.line_fragmentation_count == 0

    def test_empty_texts(self):
        """Empty inputs contain zero lines on both sides."""
        outcome = analyze_structure("", "")
        assert outcome.gt_line_count == 0
        assert outcome.ocr_line_count == 0

    def test_multiline_equal(self):
        """Three identical lines are counted as three on both sides."""
        text = "ligne 1\nligne 2\nligne 3"
        outcome = analyze_structure(text, text)
        assert outcome.gt_line_count == 3
        assert outcome.ocr_line_count == 3

    def test_line_fusion_detected(self):
        """Two GT lines merged by the OCR reduce the OCR line count."""
        reference = "ligne 1\nligne 2\nligne 3"
        merged = "ligne 1 ligne 2\nligne 3"  # first two lines fused into one
        outcome = analyze_structure(reference, merged)
        assert outcome.ocr_line_count < outcome.gt_line_count

    def test_reading_order_score_perfect(self):
        """Identical text gets a reading-order score above 0.9."""
        sentence = "le chat dort ici"
        assert analyze_structure(sentence, sentence).reading_order_score > 0.9

    def test_reading_order_score_low_for_scrambled(self):
        """Reversed word order gets a reading-order score below 1.0."""
        reference = "le chat dort paisiblement sur le canapé"
        scrambled = "canapé sur le paisiblement dort chat le"
        outcome = analyze_structure(reference, scrambled)
        assert outcome.reading_order_score < 1.0

    def test_line_accuracy_perfect(self):
        """Identical line layout gives a line accuracy of 1.0."""
        outcome = analyze_structure("ligne 1\nligne 2", "ligne 1\nligne 2")
        assert outcome.line_accuracy == pytest.approx(1.0)

    def test_line_accuracy_degraded(self):
        """Missing lines bring the line accuracy below 1.0."""
        reference = "ligne 1\nligne 2\nligne 3\nligne 4"
        outcome = analyze_structure(reference, "ligne 1")
        assert outcome.line_accuracy < 1.0

    def test_as_dict_structure(self):
        """``as_dict`` exposes every structural metric."""
        payload = analyze_structure("ligne 1\nligne 2", "ligne 1\nligne 2").as_dict()
        expected_keys = (
            "gt_line_count",
            "ocr_line_count",
            "line_fusion_count",
            "line_fragmentation_count",
            "reading_order_score",
            "paragraph_conservation_score",
            "line_accuracy",
        )
        for key in expected_keys:
            assert key in payload

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves both line counts."""
        original = analyze_structure("a\nb\nc", "a\nb")
        rebuilt = StructureResult.from_dict(original.as_dict())
        assert rebuilt.gt_line_count == original.gt_line_count
        assert rebuilt.ocr_line_count == original.ocr_line_count

    def test_line_fusion_rate_property(self):
        """2 fusions over 10 GT lines give a fusion rate of 0.2."""
        outcome = StructureResult(gt_line_count=10, ocr_line_count=8, line_fusion_count=2)
        assert outcome.line_fusion_rate == pytest.approx(0.2)

    def test_line_fragmentation_rate_property(self):
        """3 fragmentations over 5 GT lines give a fragmentation rate of 0.6."""
        outcome = StructureResult(gt_line_count=5, ocr_line_count=8, line_fragmentation_count=3)
        assert outcome.line_fragmentation_rate == pytest.approx(0.6)
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
class TestAggregateStructure:
    """Behaviour of ``aggregate_structure`` over per-document results."""

    def test_empty(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_structure([]) == {}

    def test_single_result(self):
        """A single result is reported as-is with a document count of 1."""
        only = StructureResult(
            gt_line_count=5,
            ocr_line_count=5,
            reading_order_score=0.9,
            paragraph_conservation_score=1.0,
        )
        summary = aggregate_structure([only])
        assert summary["mean_reading_order_score"] == pytest.approx(0.9)
        assert summary["document_count"] == 1

    def test_mean_fusion_rate(self):
        """The mean fusion rate averages the per-document rates."""
        per_document = [
            StructureResult(gt_line_count=10, ocr_line_count=8, line_fusion_count=2),
            StructureResult(gt_line_count=10, ocr_line_count=6, line_fusion_count=4),
        ]
        summary = aggregate_structure(per_document)
        # Per-document rates are 0.2 and 0.4, hence a mean of 0.3.
        assert summary["mean_line_fusion_rate"] == pytest.approx(0.3, rel=1e-3)
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
# ===========================================================================
|
| 509 |
+
# Tests ImageQualityResult
|
| 510 |
+
# ===========================================================================
|
| 511 |
+
|
| 512 |
+
from picarones.core.image_quality import (
|
| 513 |
+
ImageQualityResult,
|
| 514 |
+
generate_mock_quality_scores,
|
| 515 |
+
aggregate_image_quality,
|
| 516 |
+
_global_quality_score,
|
| 517 |
+
)
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
class TestImageQualityResult:
    """Properties and (de)serialisation of ``ImageQualityResult``."""

    def test_quality_tier_good(self):
        """A score of 0.8 falls in the "good" tier."""
        sample = ImageQualityResult(quality_score=0.8)
        assert sample.quality_tier == "good"
        assert sample.is_good_quality is True

    def test_quality_tier_medium(self):
        """A score of 0.55 falls in the "medium" tier and is not good quality."""
        sample = ImageQualityResult(quality_score=0.55)
        assert sample.quality_tier == "medium"
        assert sample.is_good_quality is False

    def test_quality_tier_poor(self):
        """A score of 0.2 falls in the "poor" tier."""
        assert ImageQualityResult(quality_score=0.2).quality_tier == "poor"

    def test_as_dict_structure(self):
        """``as_dict`` exposes every metric plus the derived tier."""
        sample = ImageQualityResult(
            sharpness_score=0.8,
            noise_level=0.1,
            rotation_degrees=0.5,
            contrast_score=0.9,
            quality_score=0.75,
            analysis_method="mock",
        )
        payload = sample.as_dict()
        expected_keys = (
            "sharpness_score",
            "noise_level",
            "rotation_degrees",
            "contrast_score",
            "quality_score",
            "quality_tier",
            "analysis_method",
        )
        for key in expected_keys:
            assert key in payload

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves scores and the analysis method."""
        original = ImageQualityResult(
            sharpness_score=0.7,
            noise_level=0.2,
            rotation_degrees=1.0,
            contrast_score=0.8,
            quality_score=0.65,
            analysis_method="pillow",
        )
        rebuilt = ImageQualityResult.from_dict(original.as_dict())
        assert rebuilt.sharpness_score == pytest.approx(original.sharpness_score, rel=1e-3)
        assert rebuilt.quality_score == pytest.approx(original.quality_score, rel=1e-3)
        assert rebuilt.analysis_method == original.analysis_method

    def test_from_dict_ignores_quality_tier(self):
        """``from_dict`` accepts a payload carrying the derived ``quality_tier``."""
        # quality_tier is a property, not an init parameter: from_dict has
        # to drop it instead of forwarding it to the constructor.
        payload = {
            "sharpness_score": 0.5,
            "noise_level": 0.3,
            "rotation_degrees": 0.0,
            "contrast_score": 0.6,
            "quality_score": 0.5,
            "analysis_method": "mock",
            "quality_tier": "medium",  # must be ignored
        }
        rebuilt = ImageQualityResult.from_dict(payload)
        assert rebuilt.quality_score == pytest.approx(0.5)
|
| 570 |
+
|
| 571 |
+
|
| 572 |
+
class TestGenerateMockQualityScores:
    """Behaviour of ``generate_mock_quality_scores`` (synthetic image metrics)."""

    def test_returns_image_quality_result(self):
        """The generator returns an ``ImageQualityResult`` instance."""
        assert isinstance(generate_mock_quality_scores("folio_001"), ImageQualityResult)

    def test_scores_in_range(self):
        """Every normalised score stays within [0, 1]."""
        sample = generate_mock_quality_scores("folio_001", seed=42)
        assert 0.0 <= sample.quality_score <= 1.0
        assert 0.0 <= sample.sharpness_score <= 1.0
        assert 0.0 <= sample.noise_level <= 1.0
        assert 0.0 <= sample.contrast_score <= 1.0

    def test_reproducible_with_seed(self):
        """The same document id and seed give the same quality score."""
        first = generate_mock_quality_scores("folio_001", seed=42)
        second = generate_mock_quality_scores("folio_001", seed=42)
        assert first.quality_score == second.quality_score

    def test_analysis_method_mock(self):
        """Generated results are tagged with the "mock" analysis method."""
        assert generate_mock_quality_scores("folio_001").analysis_method == "mock"

    def test_no_error(self):
        """Generated results carry no error."""
        assert generate_mock_quality_scores("folio_001").error is None
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
class TestGlobalQualityScore:
    """Behaviour of the ``_global_quality_score`` combination helper."""

    def test_perfect_input(self):
        """Best possible metrics combine into a score of 1.0."""
        best = _global_quality_score(
            sharpness=1.0, noise=0.0, rotation_abs=0.0, contrast=1.0
        )
        assert best == pytest.approx(1.0)

    def test_worst_input(self):
        """Worst metrics (with a 10 degree rotation) combine into 0.0."""
        worst = _global_quality_score(
            sharpness=0.0, noise=1.0, rotation_abs=10.0, contrast=0.0
        )
        assert worst == pytest.approx(0.0)

    def test_medium_input(self):
        """Mid-range metrics give a score strictly between 0 and 1."""
        middle = _global_quality_score(
            sharpness=0.5, noise=0.5, rotation_abs=0.0, contrast=0.5
        )
        assert 0.0 < middle < 1.0
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
class TestAggregateImageQuality:
    """Behaviour of ``aggregate_image_quality`` over per-document results."""

    def test_empty_list(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_image_quality([]) == {}

    def test_single_result(self):
        """A single result gives its own score as the mean, with a count of 1."""
        only = ImageQualityResult(quality_score=0.75, analysis_method="mock")
        summary = aggregate_image_quality([only])
        assert summary["mean_quality_score"] == pytest.approx(0.75)
        assert summary["document_count"] == 1

    def test_tier_distribution(self):
        """One document per tier yields a count of 1 in each tier."""
        per_document = [
            ImageQualityResult(quality_score=score, analysis_method="mock")
            for score in (0.8, 0.5, 0.2)  # good, medium, poor
        ]
        distribution = aggregate_image_quality(per_document)["quality_distribution"]
        assert distribution["good"] == 1
        assert distribution["medium"] == 1
        assert distribution["poor"] == 1

    def test_scores_list_present(self):
        """The summary lists one score per aggregated document."""
        per_document = [ImageQualityResult(quality_score=0.6, analysis_method="mock")]
        summary = aggregate_image_quality(per_document)
        assert "scores" in summary
        assert len(summary["scores"]) == 1

    def test_errors_excluded(self):
        """Results carrying an error are left out of the aggregation."""
        valid = ImageQualityResult(quality_score=0.8, analysis_method="mock")
        failed = ImageQualityResult(
            quality_score=0.0, analysis_method="none", error="file not found"
        )
        summary = aggregate_image_quality([valid, failed])
        # Only the error-free result is counted.
        assert summary["document_count"] == 1
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
# ===========================================================================
|
| 653 |
+
# Tests d'intégration Sprint 5 (fixtures + rapport)
|
| 654 |
+
# ===========================================================================
|
| 655 |
+
|
| 656 |
+
class TestFixturesSprint5:
    """Checks that the sample benchmark fixture carries the Sprint 5 metrics.

    Covers the per-document fields (confusion matrix, character scores,
    taxonomy, structure, image quality) and their per-engine aggregates.
    """

    @staticmethod
    def _engine_reports():
        """Generate a fresh sample benchmark and return its engine reports."""
        # Imported lazily, as the tests did originally, so that merely
        # collecting this module does not import the fixtures.
        from picarones.fixtures import generate_sample_benchmark

        return generate_sample_benchmark().engine_reports

    @staticmethod
    def _report_named(reports, engine_name):
        """Return the report whose ``engine_name`` matches, or fail the test clearly."""
        for report in reports:
            if report.engine_name == engine_name:
                return report
        # An explicit failure reads better than the bare StopIteration that
        # next() without a default would raise.
        pytest.fail(f"engine {engine_name!r} not found in the sample benchmark")

    def test_doc_result_has_confusion_matrix(self):
        """Every document of every engine must carry a confusion matrix."""
        for er in self._engine_reports():
            # Checked for every document (no early exit), so the per-document
            # failure message can name whichever document is missing it.
            for dr in er.document_results:
                assert dr.confusion_matrix is not None, (
                    f"confusion_matrix manquante pour {er.engine_name}/{dr.doc_id}"
                )

    def test_doc_result_has_char_scores(self):
        """The first document of each engine has ligature and diacritic scores."""
        for er in self._engine_reports():
            dr = er.document_results[0]
            assert dr.char_scores is not None
            assert "ligature" in dr.char_scores
            assert "diacritic" in dr.char_scores

    def test_doc_result_has_taxonomy(self):
        """The first document of each engine has a taxonomy with counts and a total."""
        for er in self._engine_reports():
            dr = er.document_results[0]
            assert dr.taxonomy is not None
            assert "counts" in dr.taxonomy
            assert "total_errors" in dr.taxonomy

    def test_doc_result_has_structure(self):
        """The first document of each engine has structure data with a GT line count."""
        for er in self._engine_reports():
            dr = er.document_results[0]
            assert dr.structure is not None
            assert "gt_line_count" in dr.structure

    def test_doc_result_has_image_quality(self):
        """The first document of each engine has image-quality data with a score."""
        for er in self._engine_reports():
            dr = er.document_results[0]
            assert dr.image_quality is not None
            assert "quality_score" in dr.image_quality

    def test_engine_report_has_aggregated_confusion(self):
        """Each engine report exposes an aggregated confusion matrix."""
        for er in self._engine_reports():
            assert er.aggregated_confusion is not None
            assert "matrix" in er.aggregated_confusion

    def test_engine_report_has_aggregated_char_scores(self):
        """Each engine report exposes aggregated ligature and diacritic scores."""
        for er in self._engine_reports():
            assert er.aggregated_char_scores is not None
            assert "ligature" in er.aggregated_char_scores
            assert "diacritic" in er.aggregated_char_scores

    def test_engine_report_ligature_score_property(self):
        """``ligature_score`` is set and lies within [0, 1] for every engine."""
        for er in self._engine_reports():
            score = er.ligature_score
            assert score is not None
            assert 0.0 <= score <= 1.0

    def test_engine_report_diacritic_score_property(self):
        """``diacritic_score`` is set and lies within [0, 1] for every engine."""
        for er in self._engine_reports():
            score = er.diacritic_score
            assert score is not None
            assert 0.0 <= score <= 1.0

    def test_engine_report_has_aggregated_taxonomy(self):
        """Each engine report exposes an aggregated taxonomy with a total."""
        for er in self._engine_reports():
            assert er.aggregated_taxonomy is not None
            assert "total_errors" in er.aggregated_taxonomy

    def test_engine_report_has_aggregated_structure(self):
        """Each engine report exposes aggregated structure data."""
        for er in self._engine_reports():
            assert er.aggregated_structure is not None
            assert "mean_reading_order_score" in er.aggregated_structure

    def test_engine_report_has_aggregated_image_quality(self):
        """Each engine report exposes aggregated image-quality data."""
        for er in self._engine_reports():
            assert er.aggregated_image_quality is not None
            assert "mean_quality_score" in er.aggregated_image_quality

    def test_bad_engine_has_more_errors(self):
        """The legacy engine must have more taxonomy errors than pero_ocr."""
        reports = self._engine_reports()
        pero = self._report_named(reports, "pero_ocr")
        bad = self._report_named(reports, "ancien_moteur")
        assert bad.aggregated_taxonomy["total_errors"] > pero.aggregated_taxonomy["total_errors"]
|
| 761 |
+
|
| 762 |
+
|
| 763 |
+
class TestReportSprint5:
    """Checks that the Sprint 5 metrics reach the report data, the HTML and the JSON export."""

    @staticmethod
    def _report_data():
        """Build the report data dict from a fresh sample benchmark."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        return _build_report_data(generate_sample_benchmark(), {})

    @staticmethod
    def _render_html(tmp_path):
        """Generate the HTML report under ``tmp_path`` and return its text."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        out = tmp_path / "report.html"
        ReportGenerator(generate_sample_benchmark()).generate(out)
        return out.read_text(encoding="utf-8")

    def test_report_data_has_ligature_score(self):
        """Every engine entry of the report data has a ``ligature_score`` key."""
        for eng in self._report_data()["engines"]:
            assert "ligature_score" in eng, f"ligature_score manquant pour {eng['name']}"

    def test_report_data_has_diacritic_score(self):
        """Every engine entry of the report data has a ``diacritic_score`` key."""
        for eng in self._report_data()["engines"]:
            assert "diacritic_score" in eng

    def test_report_data_has_aggregated_taxonomy(self):
        """Every engine entry of the report data has an ``aggregated_taxonomy`` key."""
        for eng in self._report_data()["engines"]:
            assert "aggregated_taxonomy" in eng

    def test_report_data_has_aggregated_image_quality(self):
        """Every engine entry of the report data has an ``aggregated_image_quality`` key."""
        for eng in self._report_data()["engines"]:
            assert "aggregated_image_quality" in eng

    def test_html_has_characters_tab(self, tmp_path):
        """The generated HTML contains the "Caractères" label."""
        assert "Caractères" in self._render_html(tmp_path)

    def test_html_has_ligatures_column(self, tmp_path):
        """The generated HTML contains the "Ligatures" label."""
        assert "Ligatures" in self._render_html(tmp_path)

    def test_html_has_diacritiques_column(self, tmp_path):
        """The generated HTML contains the "Diacritiques" label."""
        assert "Diacritiques" in self._render_html(tmp_path)

    def test_html_has_scatter_plot(self, tmp_path):
        """The generated HTML contains the ``chart-quality-cer`` identifier."""
        assert "chart-quality-cer" in self._render_html(tmp_path)

    def test_html_has_taxonomy_chart(self, tmp_path):
        """The generated HTML contains the ``chart-taxonomy`` identifier."""
        assert "chart-taxonomy" in self._render_html(tmp_path)

    def test_html_has_confusion_heatmap(self, tmp_path):
        """The generated HTML mentions the confusion heatmap, by id or by label."""
        html = self._render_html(tmp_path)
        assert "confusion-heatmap" in html or "matrice de confusion" in html.lower()

    def test_doc_results_have_image_quality_in_report(self):
        """At least one engine result of the first document has ``image_quality``."""
        doc = self._report_data()["documents"][0]
        has_iq = any("image_quality" in er for er in doc["engine_results"])
        assert has_iq, "Aucun document result n'a de données image_quality"

    def test_json_export_contains_sprint5_data(self, tmp_path):
        """The JSON export keeps the Sprint 5 keys at engine and document level."""
        from picarones.fixtures import generate_sample_benchmark
        import json

        bm = generate_sample_benchmark()
        out = tmp_path / "results.json"
        bm.to_json(out)
        # Explicit encoding keeps the read independent of the platform locale,
        # like the HTML reads in this class; assumes to_json writes UTF-8
        # (or pure ASCII) - TODO confirm against Benchmark.to_json.
        data = json.loads(out.read_text(encoding="utf-8"))
        # Engine-level aggregates.
        er = data["engine_reports"][0]
        assert "aggregated_taxonomy" in er
        assert "aggregated_char_scores" in er
        # Document-level metrics.
        dr = er["document_results"][0]
        assert "taxonomy" in dr
        assert "char_scores" in dr
        assert "structure" in dr
|
tests/test_sprint6_web_interface.py
ADDED
|
@@ -0,0 +1,982 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 6 — Interface web FastAPI, import HTR-United, HuggingFace, serve CLI.
|
| 2 |
+
|
| 3 |
+
Classes de tests
|
| 4 |
+
----------------
|
| 5 |
+
TestHTRUnitedEntry (8 tests) — dataclass, as_dict, from_dict, century_str
|
| 6 |
+
TestHTRUnitedCatalogue (10 tests) — from_demo, search, get_by_id, available_languages/scripts
|
| 7 |
+
TestHTRUnitedSearch (8 tests) — recherche textuelle, filtre langue, script, siècle
|
| 8 |
+
TestHTRUnitedImport (4 tests) — import_htr_united_corpus crée les fichiers meta
|
| 9 |
+
TestHuggingFaceDataset (7 tests) — dataclass, as_dict, from_dict, hf_url
|
| 10 |
+
TestHuggingFaceImporter (10 tests) — search référence, filtres, import
|
| 11 |
+
TestHuggingFaceReferenceData (4 tests) — datasets de référence pré-intégrés
|
| 12 |
+
TestNormalizationProfiles (8 tests) — profils disponibles via API route
|
| 13 |
+
TestFastAPIStatus (3 tests) — GET /api/status
|
| 14 |
+
TestFastAPIEngines (8 tests) — GET /api/engines
|
| 15 |
+
TestFastAPICorpusBrowse (6 tests) — GET /api/corpus/browse
|
| 16 |
+
TestFastAPIReports (5 tests) — GET /api/reports
|
| 17 |
+
TestFastAPIHTRUnited (7 tests) — GET /api/htr-united/catalogue + POST import
|
| 18 |
+
TestFastAPIHuggingFace (6 tests) — GET /api/huggingface/search + POST import
|
| 19 |
+
TestFastAPIBenchmark (8 tests) — POST start, GET status, GET stream, POST cancel
|
| 20 |
+
TestFastAPIHTML (5 tests) — GET / retourne HTML valide
|
| 21 |
+
TestFastAPIReportServe (4 tests) — GET /reports/{filename}
|
| 22 |
+
TestCLIServeCommand (5 tests) — commande picarones serve enregistrée
|
| 23 |
+
TestRunnerProgressCallback (5 tests) — progress_callback injecté dans run_benchmark
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import json
|
| 29 |
+
import os
|
| 30 |
+
import tempfile
|
| 31 |
+
import threading
|
| 32 |
+
import time
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
from unittest.mock import MagicMock, patch
|
| 35 |
+
|
| 36 |
+
import pytest
|
| 37 |
+
from click.testing import CliRunner
|
| 38 |
+
from fastapi.testclient import TestClient
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Fixtures
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
@pytest.fixture
def tmp_corpus(tmp_path):
    """Build a minimal two-document corpus in ``tmp_path`` and return that directory.

    Each document is a 100x50 light-grey JPEG named ``doc_NN.jpg`` with a
    UTF-8 ground-truth file ``doc_NN.gt.txt`` next to it.
    """
    from PIL import Image

    for index in range(2):
        stem = f"doc_{index:02d}"
        page = Image.new("RGB", (100, 50), color=(200, 200, 200))
        page.save(tmp_path / f"{stem}.jpg")
        ground_truth = tmp_path / f"{stem}.gt.txt"
        ground_truth.write_text(f"Texte vérité terrain {index}", encoding="utf-8")
    return tmp_path
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@pytest.fixture
def client():
    """Return a ``TestClient`` wrapping the Picarones web application."""
    # Imported lazily so the web app is only loaded by tests that need it.
    from picarones.web.app import app as web_app

    api_client = TestClient(web_app)
    return api_client
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@pytest.fixture
def htr_catalogue():
    """Return the HTR-United catalogue built by ``HTRUnitedCatalogue.from_demo()``."""
    from picarones.importers.htr_united import HTRUnitedCatalogue

    demo_catalogue = HTRUnitedCatalogue.from_demo()
    return demo_catalogue
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@pytest.fixture
def hf_importer():
    """Return a ``HuggingFaceImporter`` constructed with no arguments."""
    from picarones.importers.huggingface import HuggingFaceImporter

    importer = HuggingFaceImporter()
    return importer
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# ===========================================================================
|
| 74 |
+
# TestHTRUnitedEntry
|
| 75 |
+
# ===========================================================================
|
| 76 |
+
|
| 77 |
+
class TestHTRUnitedEntry:
    """Covers ``HTRUnitedEntry``: construction, dict conversion and ``century_str``."""

    def test_from_dict_basic(self):
        """``from_dict`` copies the supplied fields onto the entry."""
        from picarones.importers.htr_united import HTRUnitedEntry

        payload = {
            "id": "test-corpus",
            "title": "Test Corpus",
            "url": "https://github.com/test/corpus",
            "language": ["French"],
            "script": ["Gothic"],
            "century": [14, 15],
            "institution": "BnF",
            "description": "Un corpus de test.",
            "license": "CC-BY 4.0",
            "lines": 5000,
            "format": "ALTO",
            "tags": ["test", "médiéval"],
        }
        entry = HTRUnitedEntry.from_dict(payload)
        assert entry.id == "test-corpus"
        assert entry.title == "Test Corpus"
        assert entry.language == ["French"]
        assert entry.lines == 5000

    def test_as_dict_roundtrip(self):
        """``from_dict`` followed by ``as_dict`` keeps id, lines and format."""
        from picarones.importers.htr_united import HTRUnitedEntry

        payload = {
            "id": "rtrip",
            "title": "Round Trip",
            "url": "https://github.com/a/b",
            "language": ["Latin"],
            "script": ["Caroline"],
            "century": [9],
            "institution": "IRHT",
            "description": "Test.",
            "license": "CC0",
            "lines": 1000,
            "format": "PAGE",
            "tags": [],
        }
        exported = HTRUnitedEntry.from_dict(payload).as_dict()
        assert exported["id"] == "rtrip"
        assert exported["lines"] == 1000
        assert exported["format"] == "PAGE"

    def test_century_str_roman(self):
        """``century_str`` renders each century as a French roman ordinal."""
        from picarones.importers.htr_united import HTRUnitedEntry

        entry = HTRUnitedEntry(id="x", title="x", url="x", century=[12, 14])
        label = entry.century_str
        for numeral in ("XIIe", "XIVe"):
            assert numeral in label

    def test_century_str_single(self):
        """A single century is rendered as well."""
        from picarones.importers.htr_united import HTRUnitedEntry

        entry = HTRUnitedEntry(id="x", title="x", url="x", century=[19])
        assert "XIXe" in entry.century_str

    def test_default_fields(self):
        """Fields left out of the constructor take their defaults."""
        from picarones.importers.htr_united import HTRUnitedEntry

        entry = HTRUnitedEntry(id="minimal", title="Min", url="http://x")
        assert entry.language == []
        assert entry.lines == 0
        assert entry.format == "ALTO"
        assert entry.tags == []

    def test_from_dict_missing_fields(self):
        """``from_dict`` accepts a sparse dict and fills in the missing fields."""
        from picarones.importers.htr_united import HTRUnitedEntry

        entry = HTRUnitedEntry.from_dict({"id": "sparse", "title": "Sparse"})
        assert entry.id == "sparse"
        assert entry.institution == ""
        assert entry.lines == 0

    def test_as_dict_has_all_keys(self):
        """``as_dict`` exposes every documented key."""
        from picarones.importers.htr_united import HTRUnitedEntry

        exported = HTRUnitedEntry(id="k", title="K", url="http://k").as_dict()
        expected_keys = (
            "id", "title", "url", "language", "script", "century",
            "institution", "description", "license", "lines", "format", "tags",
        )
        for key in expected_keys:
            assert key in exported, f"Missing key: {key}"

    def test_url_preserved(self):
        """The URL given to the constructor is stored unchanged."""
        from picarones.importers.htr_united import HTRUnitedEntry

        url = "https://github.com/HTR-United/cremma-medieval"
        entry = HTRUnitedEntry(id="c", title="CREMMA", url=url)
        assert entry.url == url
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# ===========================================================================
|
| 150 |
+
# TestHTRUnitedCatalogue
|
| 151 |
+
# ===========================================================================
|
| 152 |
+
|
| 153 |
+
class TestHTRUnitedCatalogue:
    """Covers the demo ``HTRUnitedCatalogue``: size, lookup by id and facet listings."""

    def test_from_demo_length(self, htr_catalogue):
        """The demo catalogue holds at least six entries."""
        entry_count = len(htr_catalogue)
        assert entry_count >= 6

    def test_from_demo_source(self, htr_catalogue):
        """The demo catalogue reports ``"demo"`` as its source."""
        assert htr_catalogue.source == "demo"

    def test_all_entries_have_id(self, htr_catalogue):
        """No entry has an empty id."""
        for entry in htr_catalogue.entries:
            assert entry.id, f"Entry missing id: {entry}"

    def test_all_entries_have_title(self, htr_catalogue):
        """No entry has an empty title."""
        for entry in htr_catalogue.entries:
            assert entry.title

    def test_get_by_id_found(self, htr_catalogue):
        """Looking up an existing id returns the entry with that id."""
        wanted = htr_catalogue.entries[0].id
        match = htr_catalogue.get_by_id(wanted)
        assert match is not None
        assert match.id == wanted

    def test_get_by_id_not_found(self, htr_catalogue):
        """Looking up an unknown id returns ``None``."""
        assert htr_catalogue.get_by_id("nonexistent-corpus-xyz") is None

    def test_available_languages_non_empty(self, htr_catalogue):
        """``available_languages`` returns a non-empty list."""
        languages = htr_catalogue.available_languages()
        assert len(languages) > 0
        assert isinstance(languages, list)

    def test_available_languages_sorted(self, htr_catalogue):
        """``available_languages`` comes back already sorted."""
        languages = htr_catalogue.available_languages()
        assert languages == sorted(languages)

    def test_available_scripts_non_empty(self, htr_catalogue):
        """``available_scripts`` returns at least one script."""
        assert len(htr_catalogue.available_scripts()) > 0

    def test_len(self, htr_catalogue):
        """``len(catalogue)`` matches the number of entries."""
        assert len(htr_catalogue) == len(htr_catalogue.entries)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# ===========================================================================
|
| 197 |
+
# TestHTRUnitedSearch
|
| 198 |
+
# ===========================================================================
|
| 199 |
+
|
| 200 |
+
class TestHTRUnitedSearch:
    """Covers ``HTRUnitedCatalogue.search``: free text, language, script and century filters."""

    def test_search_empty_returns_all(self, htr_catalogue):
        """A search without criteria returns every entry."""
        hits = htr_catalogue.search()
        assert len(hits) == len(htr_catalogue.entries)

    def test_search_by_query(self, htr_catalogue):
        """Every hit for a text query contains it in its title, description or tags."""
        hits = htr_catalogue.search(query="médiéval")
        assert len(hits) > 0
        for hit in hits:
            searchable = "".join([hit.title, hit.description, " ".join(hit.tags)]).lower()
            assert "médiéval" in searchable

    def test_search_by_language(self, htr_catalogue):
        """Every hit for a language filter lists that language (case-insensitive)."""
        hits = htr_catalogue.search(language="French")
        assert len(hits) > 0
        for hit in hits:
            assert any("french" in lang.lower() for lang in hit.language)

    def test_search_by_language_latin(self, htr_catalogue):
        """The demo catalogue has at least one Latin corpus."""
        assert len(htr_catalogue.search(language="Latin")) > 0

    def test_search_by_script(self, htr_catalogue):
        """The demo catalogue has at least one Gothic-script corpus."""
        assert len(htr_catalogue.search(script="Gothic")) > 0

    def test_search_no_results(self, htr_catalogue):
        """A query matching nothing returns an empty list."""
        assert htr_catalogue.search(query="xyzzy_corpus_inexistant_42") == []

    def test_search_combined_filters(self, htr_catalogue):
        """Combining filters must not raise and must still return a list."""
        # Must not raise.
        hits = htr_catalogue.search(query="", language="French", script="Cursiva")
        assert isinstance(hits, list)

    def test_search_century_min(self, htr_catalogue):
        """Every hit for ``century_min`` has at least one century at or above it."""
        for hit in htr_catalogue.search(century_min=18):
            assert any(century >= 18 for century in hit.century)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ===========================================================================
|
| 243 |
+
# TestHTRUnitedImport
|
| 244 |
+
# ===========================================================================
|
| 245 |
+
|
| 246 |
+
class TestHTRUnitedImport:
    """Covers ``import_htr_united_corpus``: metadata file, returned dict and output directory."""

    @staticmethod
    def _import_first_entry(catalogue, output_dir):
        """Import the catalogue's first entry into ``output_dir``.

        Returns the ``(entry, result)`` pair, where ``result`` is whatever
        ``import_htr_united_corpus`` returned.
        """
        from picarones.importers.htr_united import import_htr_united_corpus

        entry = catalogue.entries[0]
        return entry, import_htr_united_corpus(entry, output_dir, max_samples=5)

    def test_import_creates_meta_file(self, tmp_path, htr_catalogue):
        """The metadata file named in the result exists on disk."""
        _, result = self._import_first_entry(htr_catalogue, tmp_path)
        meta_file = Path(result["metadata_file"])
        assert meta_file.exists()

    def test_import_meta_content(self, tmp_path, htr_catalogue):
        """The metadata file records the source and the id of the imported entry."""
        entry, result = self._import_first_entry(htr_catalogue, tmp_path)
        # Explicit encoding keeps the read independent of the platform locale;
        # assumes the importer writes UTF-8 (or pure ASCII) - TODO confirm.
        meta = json.loads(Path(result["metadata_file"]).read_text(encoding="utf-8"))
        assert meta["source"] == "htr-united"
        assert meta["entry_id"] == entry.id

    def test_import_returns_dict_keys(self, tmp_path, htr_catalogue):
        """The result dict exposes every documented key."""
        _, result = self._import_first_entry(htr_catalogue, tmp_path)
        for k in ["entry_id", "title", "output_dir", "files_imported", "metadata_file"]:
            assert k in result, f"Missing key: {k}"

    def test_import_creates_output_dir(self, tmp_path, htr_catalogue):
        """A missing nested output directory is created by the import."""
        new_dir = tmp_path / "new_subdir" / "corpus"
        # Only the side effect matters here, so the return value is discarded.
        self._import_first_entry(htr_catalogue, new_dir)
        assert new_dir.exists()
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
# ===========================================================================
|
| 279 |
+
# TestHuggingFaceDataset
|
| 280 |
+
# ===========================================================================
|
| 281 |
+
|
| 282 |
+
class TestHuggingFaceDataset:
    """Covers ``HuggingFaceDataset``: construction, dict conversion, URL and source helper."""

    def test_from_dict_basic(self):
        """``from_dict`` copies the supplied fields onto the dataset."""
        from picarones.importers.huggingface import HuggingFaceDataset

        payload = {
            "dataset_id": "test/dataset",
            "title": "Test Dataset",
            "description": "A test dataset.",
            "language": ["French"],
            "tags": ["ocr", "french"],
            "license": "cc-by-4.0",
            "institution": "Test Lab",
            "downloads": 500,
        }
        dataset = HuggingFaceDataset.from_dict(payload)
        assert dataset.dataset_id == "test/dataset"
        assert dataset.language == ["French"]
        assert dataset.downloads == 500

    def test_as_dict_roundtrip(self):
        """``as_dict`` reflects the values given to the constructor."""
        from picarones.importers.huggingface import HuggingFaceDataset

        dataset = HuggingFaceDataset(
            dataset_id="a/b",
            title="AB",
            description="desc",
            language=["Latin"],
            tags=["htr"],
        )
        exported = dataset.as_dict()
        assert exported["dataset_id"] == "a/b"
        assert exported["language"] == ["Latin"]

    def test_hf_url(self):
        """``hf_url`` points at the dataset page on huggingface.co."""
        from picarones.importers.huggingface import HuggingFaceDataset

        dataset = HuggingFaceDataset(dataset_id="CATMuS/medieval", title="CATMuS")
        expected_url = "https://huggingface.co/datasets/CATMuS/medieval"
        assert dataset.hf_url == expected_url

    def test_as_dict_has_all_keys(self):
        """``as_dict`` exposes every documented key."""
        from picarones.importers.huggingface import HuggingFaceDataset

        exported = HuggingFaceDataset(dataset_id="x/y", title="XY").as_dict()
        expected_keys = (
            "dataset_id", "title", "description", "language", "tags",
            "license", "size_category", "task", "institution", "downloads", "source",
        )
        for k in expected_keys:
            assert k in exported, f"Missing: {k}"

    def test_default_source(self):
        """The source defaults to ``"reference"``."""
        from picarones.importers.huggingface import HuggingFaceDataset

        dataset = HuggingFaceDataset(dataset_id="x/y", title="XY")
        assert dataset.source == "reference"

    def test_from_dict_uses_id_as_fallback_title(self):
        """Without a title, ``from_dict`` falls back to the dataset id."""
        from picarones.importers.huggingface import HuggingFaceDataset

        dataset = HuggingFaceDataset.from_dict({"dataset_id": "owner/repo"})
        assert dataset.title == "owner/repo"

    def test_replace_source_helper(self):
        """``_replace_source`` returns a changed copy and leaves the original alone."""
        from picarones.importers.huggingface import HuggingFaceDataset

        original = HuggingFaceDataset(dataset_id="x/y", title="XY", source="reference")
        retagged = original._replace_source("api")
        assert retagged.source == "api"
        # The original instance must be unchanged.
        assert original.source == "reference"
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
# ===========================================================================
|
| 339 |
+
# TestHuggingFaceImporter
|
| 340 |
+
# ===========================================================================
|
| 341 |
+
|
| 342 |
+
class TestHuggingFaceImporter:
    """Tests for the ``hf_importer`` fixture: dataset search and dataset import."""

    def test_search_returns_list(self, hf_importer):
        """A bare ``search()`` returns a non-empty list."""
        found = hf_importer.search()
        assert isinstance(found, list)
        assert len(found) > 0

    def test_search_reference_datasets(self, hf_importer):
        """Searching with ``use_reference=True`` yields at least five datasets."""
        found = hf_importer.search(use_reference=True)
        assert len(found) >= 5

    def test_search_query_filter(self, hf_importer):
        """A ``RIMES`` query returns at least one dataset mentioning RIMES."""
        found = hf_importer.search(query="RIMES", use_reference=True)
        assert len(found) >= 1
        assert any(
            "RIMES" in ds.title or "rimes" in ds.dataset_id.lower()
            for ds in found
        )

    def test_search_language_filter(self, hf_importer):
        """Every dataset returned for ``language="French"`` lists a French language."""
        found = hf_importer.search(language="French", use_reference=True)
        assert len(found) > 0
        for ds in found:
            assert any("french" in lang.lower() for lang in ds.language)

    def test_search_tag_filter(self, hf_importer):
        """Filtering by tag returns a list (possibly empty)."""
        found = hf_importer.search(tags=["historical"], use_reference=True)
        assert isinstance(found, list)

    def test_search_limit(self, hf_importer):
        """``limit`` caps the number of results."""
        found = hf_importer.search(limit=3)
        assert len(found) <= 3

    def test_search_no_api_fallback(self, hf_importer):
        """A ``medieval`` query finds at least one dataset with ``use_reference=True``."""
        # Even without network access, the reference datasets are available.
        found = hf_importer.search(query="medieval", use_reference=True)
        assert len(found) >= 1

    def test_import_creates_meta(self, tmp_path, hf_importer):
        """``import_dataset`` reports a metadata file that exists on disk."""
        outcome = hf_importer.import_dataset(
            "CATMuS/medieval", output_dir=tmp_path, max_samples=5
        )
        meta_path = Path(outcome["metadata_file"])
        assert meta_path.exists()

    def test_import_meta_content(self, tmp_path, hf_importer):
        """The metadata file is JSON carrying the dataset id and source."""
        outcome = hf_importer.import_dataset(
            "CATMuS/medieval", output_dir=tmp_path, max_samples=5
        )
        meta_text = Path(outcome["metadata_file"]).read_text()
        meta = json.loads(meta_text)
        assert meta["dataset_id"] == "CATMuS/medieval"
        assert meta["source"] == "huggingface"

    def test_import_returns_dict_keys(self, tmp_path, hf_importer):
        """The import result exposes the four documented keys."""
        outcome = hf_importer.import_dataset("x/y", output_dir=tmp_path, max_samples=5)
        for key in ("dataset_id", "output_dir", "files_imported", "metadata_file"):
            assert key in outcome
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
# ===========================================================================
|
| 394 |
+
# TestHuggingFaceReferenceData
|
| 395 |
+
# ===========================================================================
|
| 396 |
+
|
| 397 |
+
class TestHuggingFaceReferenceData:
    """Sanity checks on the built-in ``_REFERENCE_DATASETS`` table."""

    def test_reference_datasets_loaded(self):
        """The reference table holds at least five entries."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS

        assert len(_REFERENCE_DATASETS) >= 5

    def test_catmus_present(self):
        """At least one reference dataset id mentions CATMuS (case-insensitive)."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS

        dataset_ids = [entry["dataset_id"] for entry in _REFERENCE_DATASETS]
        assert any(
            "CATMuS" in dataset_id or "catmus" in dataset_id.lower()
            for dataset_id in dataset_ids
        )

    def test_all_have_required_fields(self):
        """Every reference entry carries ``dataset_id``, ``title`` and ``language``."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS

        for entry in _REFERENCE_DATASETS:
            for field in ("dataset_id", "title", "language"):
                assert field in entry

    def test_all_are_image_to_text(self):
        """Entries either omit ``task`` or declare it as ``image-to-text``."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS

        for entry in _REFERENCE_DATASETS:
            task = entry.get("task", "image-to-text")
            assert task == "image-to-text"
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
# ===========================================================================
|
| 422 |
+
# TestNormalizationProfiles
|
| 423 |
+
# ===========================================================================
|
| 424 |
+
|
| 425 |
+
class TestNormalizationProfiles:
    """Tests for the ``GET /api/normalization/profiles`` endpoint."""

    def test_api_returns_profiles(self, client):
        """The endpoint answers 200 with at least four profiles."""
        response = client.get("/api/normalization/profiles")
        assert response.status_code == 200
        payload = response.json()
        assert "profiles" in payload
        assert len(payload["profiles"]) >= 4

    def test_nfc_profile_present(self, client):
        """The ``nfc`` profile is listed."""
        response = client.get("/api/normalization/profiles")
        profile_ids = [profile["id"] for profile in response.json()["profiles"]]
        assert "nfc" in profile_ids

    def test_medieval_french_present(self, client):
        """The ``medieval_french`` profile is listed."""
        response = client.get("/api/normalization/profiles")
        profile_ids = [profile["id"] for profile in response.json()["profiles"]]
        assert "medieval_french" in profile_ids

    def test_profiles_have_required_fields(self, client):
        """Each profile exposes id, name, description, caseless and diplomatic_rules."""
        response = client.get("/api/normalization/profiles")
        required = ("id", "name", "description", "caseless", "diplomatic_rules")
        for profile in response.json()["profiles"]:
            for field in required:
                assert field in profile

    def test_caseless_profile(self, client):
        """The ``caseless`` profile exists and has its ``caseless`` flag set to True."""
        response = client.get("/api/normalization/profiles")
        by_id = {profile["id"]: profile for profile in response.json()["profiles"]}
        assert "caseless" in by_id
        assert by_id["caseless"]["caseless"] is True

    def test_medieval_french_has_diplomatic_rules(self, client):
        """``medieval_french`` reports a positive number of diplomatic rules."""
        response = client.get("/api/normalization/profiles")
        by_id = {profile["id"]: profile for profile in response.json()["profiles"]}
        assert by_id["medieval_french"]["diplomatic_rules"] > 0

    def test_nfc_no_diplomatic_rules(self, client):
        """``nfc`` reports zero diplomatic rules."""
        response = client.get("/api/normalization/profiles")
        by_id = {profile["id"]: profile for profile in response.json()["profiles"]}
        assert by_id["nfc"]["diplomatic_rules"] == 0

    def test_early_modern_french_present(self, client):
        """The ``early_modern_french`` profile is listed."""
        response = client.get("/api/normalization/profiles")
        profile_ids = [profile["id"] for profile in response.json()["profiles"]]
        assert "early_modern_french" in profile_ids
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# ===========================================================================
|
| 476 |
+
# TestFastAPIStatus
|
| 477 |
+
# ===========================================================================
|
| 478 |
+
|
| 479 |
+
class TestFastAPIStatus:
    """Tests for the ``GET /api/status`` endpoint."""

    def test_status_200(self, client):
        """The status endpoint answers HTTP 200."""
        response = client.get("/api/status")
        assert response.status_code == 200

    def test_status_has_version(self, client):
        """The payload carries a non-empty ``version``."""
        payload = client.get("/api/status").json()
        assert "version" in payload
        assert payload["version"]

    def test_status_ok(self, client):
        """The payload's ``status`` field is ``"ok"``."""
        payload = client.get("/api/status").json()
        assert payload["status"] == "ok"
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
# ===========================================================================
|
| 497 |
+
# TestFastAPIEngines
|
| 498 |
+
# ===========================================================================
|
| 499 |
+
|
| 500 |
+
class TestFastAPIEngines:
    """Tests for the ``GET /api/engines`` endpoint (OCR engines and LLMs)."""

    def test_engines_200(self, client):
        """The endpoint answers HTTP 200."""
        response = client.get("/api/engines")
        assert response.status_code == 200

    def test_engines_has_engines_key(self, client):
        """The payload has an ``engines`` key."""
        payload = client.get("/api/engines").json()
        assert "engines" in payload

    def test_engines_has_llms_key(self, client):
        """The payload has an ``llms`` key."""
        payload = client.get("/api/engines").json()
        assert "llms" in payload

    def test_engines_list_not_empty(self, client):
        """At least one engine is listed."""
        payload = client.get("/api/engines").json()
        assert len(payload["engines"]) > 0

    def test_llms_list_not_empty(self, client):
        """At least one LLM is listed."""
        payload = client.get("/api/engines").json()
        assert len(payload["llms"]) > 0

    def test_tesseract_in_engines(self, client):
        """``tesseract`` appears among the engine ids."""
        payload = client.get("/api/engines").json()
        engine_ids = [engine["id"] for engine in payload["engines"]]
        assert "tesseract" in engine_ids

    def test_ollama_in_llms(self, client):
        """``ollama`` appears among the LLM ids."""
        payload = client.get("/api/engines").json()
        llm_ids = [llm["id"] for llm in payload["llms"]]
        assert "ollama" in llm_ids

    def test_engine_has_required_fields(self, client):
        """Each engine entry exposes id, label, available and status."""
        payload = client.get("/api/engines").json()
        for engine in payload["engines"]:
            for field in ("id", "label", "available", "status"):
                assert field in engine
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
# ===========================================================================
|
| 542 |
+
# TestFastAPICorpusBrowse
|
| 543 |
+
# ===========================================================================
|
| 544 |
+
|
| 545 |
+
class TestFastAPICorpusBrowse:
    """Tests for the ``GET /api/corpus/browse`` directory-listing endpoint."""

    def test_browse_current_dir(self, client):
        """Browsing ``.`` answers HTTP 200."""
        response = client.get("/api/corpus/browse?path=.")
        assert response.status_code == 200

    def test_browse_has_required_keys(self, client):
        """The payload exposes ``current_path`` and ``items``."""
        payload = client.get("/api/corpus/browse?path=.").json()
        assert "current_path" in payload
        assert "items" in payload

    def test_browse_items_are_dirs(self, client, tmp_path):
        """An empty directory is listed with an empty ``items`` list."""
        response = client.get(f"/api/corpus/browse?path={tmp_path}")
        assert response.status_code == 200
        assert response.json()["items"] == []

    def test_browse_with_corpus(self, client, tmp_corpus):
        """The corpus directory shows up when browsing its parent."""
        response = client.get(f"/api/corpus/browse?path={tmp_corpus.parent}")
        assert response.status_code == 200
        listed_names = (item["name"] for item in response.json()["items"])
        assert any(name == tmp_corpus.name for name in listed_names)

    def test_browse_404_for_nonexistent(self, client):
        """A path that does not exist answers HTTP 404."""
        response = client.get("/api/corpus/browse?path=/nonexistent/path/xyz")
        assert response.status_code == 404

    def test_browse_corpus_gt_count(self, client, tmp_corpus):
        """When the corpus directory is listed, its ``gt_count`` is at least 2."""
        response = client.get(f"/api/corpus/browse?path={tmp_corpus.parent}")
        dirs_by_name = {
            item["name"]: item
            for item in response.json()["items"]
            if item["is_dir"]
        }
        # The check is skipped silently when the corpus is not among the listed dirs.
        corpus_entry = dirs_by_name.get(tmp_corpus.name)
        if corpus_entry is not None:
            assert corpus_entry["gt_count"] >= 2
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
# ===========================================================================
|
| 580 |
+
# TestFastAPIReports
|
| 581 |
+
# ===========================================================================
|
| 582 |
+
|
| 583 |
+
class TestFastAPIReports:
    """Tests for the ``GET /api/reports`` report-listing endpoint."""

    def test_reports_200(self, client):
        """The endpoint answers HTTP 200."""
        response = client.get("/api/reports")
        assert response.status_code == 200

    def test_reports_has_reports_key(self, client):
        """The payload has a ``reports`` key."""
        payload = client.get("/api/reports").json()
        assert "reports" in payload

    def test_reports_returns_list(self, client):
        """``reports`` is a list."""
        payload = client.get("/api/reports").json()
        assert isinstance(payload["reports"], list)

    def test_reports_finds_existing_html(self, client, tmp_path):
        """An HTML file placed in ``reports_dir`` is listed by filename."""
        # Create a dummy HTML report.
        report_file = tmp_path / "test_rapport.html"
        report_file.write_text("<html><body>Test rapport</body></html>")
        response = client.get(f"/api/reports?reports_dir={tmp_path}")
        listed = response.json()["reports"]
        assert any(entry["filename"] == "test_rapport.html" for entry in listed)

    def test_report_entry_has_fields(self, client, tmp_path):
        """A listed report exposes filename, path, size_kb, modified and url."""
        report_file = tmp_path / "my_report.html"
        report_file.write_text("<html></html>")
        response = client.get(f"/api/reports?reports_dir={tmp_path}")
        entry = next(
            candidate
            for candidate in response.json()["reports"]
            if candidate["filename"] == "my_report.html"
        )
        for field in ("filename", "path", "size_kb", "modified", "url"):
            assert field in entry
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
# ===========================================================================
|
| 618 |
+
# TestFastAPIHTRUnited
|
| 619 |
+
# ===========================================================================
|
| 620 |
+
|
| 621 |
+
class TestFastAPIHTRUnited:
    """Tests for the HTR-United catalogue and import endpoints."""

    def test_catalogue_200(self, client):
        """The catalogue endpoint answers HTTP 200."""
        response = client.get("/api/htr-united/catalogue")
        assert response.status_code == 200

    def test_catalogue_has_entries(self, client):
        """The catalogue lists at least four entries."""
        payload = client.get("/api/htr-united/catalogue").json()
        assert "entries" in payload
        assert len(payload["entries"]) >= 4

    def test_catalogue_has_filters(self, client):
        """The payload advertises the available languages and scripts."""
        payload = client.get("/api/htr-united/catalogue").json()
        assert "available_languages" in payload
        assert "available_scripts" in payload

    def test_catalogue_search_query(self, client):
        """A free-text query answers 200 even if nothing matches."""
        response = client.get("/api/htr-united/catalogue?query=médiéval")
        assert response.status_code == 200
        payload = response.json()
        assert payload["total"] >= 0  # Can be 0 if no match — no error

    def test_catalogue_search_language(self, client):
        """Every entry returned for ``language=French`` lists a French language."""
        response = client.get("/api/htr-united/catalogue?language=French")
        assert response.status_code == 200
        for entry in response.json()["entries"]:
            assert any("french" in lang.lower() for lang in entry["language"])

    def test_import_valid_entry(self, client, tmp_path):
        """Importing the first catalogue entry answers 200 and echoes ``entry_id``."""
        # Get first entry id
        catalogue = client.get("/api/htr-united/catalogue").json()
        first_id = catalogue["entries"][0]["id"]
        request_body = {
            "entry_id": first_id,
            "output_dir": str(tmp_path),
            "max_samples": 5,
        }
        import_response = client.post("/api/htr-united/import", json=request_body)
        assert import_response.status_code == 200
        assert "entry_id" in import_response.json()

    def test_import_invalid_entry(self, client, tmp_path):
        """Importing an unknown entry id answers HTTP 404."""
        request_body = {
            "entry_id": "this-does-not-exist-xyz",
            "output_dir": str(tmp_path),
            "max_samples": 5,
        }
        response = client.post("/api/htr-united/import", json=request_body)
        assert response.status_code == 404
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
# ===========================================================================
|
| 674 |
+
# TestFastAPIHuggingFace
|
| 675 |
+
# ===========================================================================
|
| 676 |
+
|
| 677 |
+
class TestFastAPIHuggingFace:
    """Tests for the Hugging Face search and import endpoints."""

    def test_search_200(self, client):
        """The search endpoint answers HTTP 200."""
        response = client.get("/api/huggingface/search")
        assert response.status_code == 200

    def test_search_has_datasets(self, client):
        """An unfiltered search reports at least one dataset."""
        payload = client.get("/api/huggingface/search").json()
        assert "datasets" in payload
        assert payload["total"] >= 1

    def test_search_with_query(self, client):
        """A query search answers 200 with a ``datasets`` list."""
        response = client.get("/api/huggingface/search?query=RIMES")
        assert response.status_code == 200
        payload = response.json()
        assert isinstance(payload["datasets"], list)

    def test_search_with_language(self, client):
        """A language-filtered search answers HTTP 200."""
        response = client.get("/api/huggingface/search?language=French")
        assert response.status_code == 200

    def test_import_creates_meta(self, client, tmp_path):
        """Importing a dataset answers 200 and the reported metadata file exists."""
        request_body = {
            "dataset_id": "CATMuS/medieval",
            "output_dir": str(tmp_path),
            "split": "train",
            "max_samples": 5,
        }
        response = client.post("/api/huggingface/import", json=request_body)
        assert response.status_code == 200
        payload = response.json()
        assert Path(payload["metadata_file"]).exists()

    def test_import_returns_keys(self, client, tmp_path):
        """The import response exposes the four documented keys."""
        request_body = {
            "dataset_id": "test/dataset",
            "output_dir": str(tmp_path),
        }
        response = client.post("/api/huggingface/import", json=request_body)
        assert response.status_code == 200
        for key in ("dataset_id", "output_dir", "files_imported", "metadata_file"):
            assert key in response.json()
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
# ===========================================================================
|
| 721 |
+
# TestFastAPIBenchmark
|
| 722 |
+
# ===========================================================================
|
| 723 |
+
|
| 724 |
+
class TestFastAPIBenchmark:
    """Tests for the benchmark job endpoints: start, status, cancel and stream."""

    @staticmethod
    def _start(client, corpus_path):
        """POST a tesseract-only benchmark request for *corpus_path*; return the response."""
        return client.post(
            "/api/benchmark/start",
            json={"corpus_path": corpus_path, "engines": ["tesseract"]},
        )

    def test_start_missing_corpus(self, client):
        """Starting a benchmark on a nonexistent corpus path answers HTTP 400."""
        response = self._start(client, "/nonexistent/path/xyz")
        assert response.status_code == 400

    def test_start_valid_corpus(self, client, tmp_corpus):
        """A valid start request returns a job id in pending or running state."""
        response = self._start(client, str(tmp_corpus))
        assert response.status_code == 200
        payload = response.json()
        assert "job_id" in payload
        assert payload["status"] in ("pending", "running")

    def test_status_nonexistent_job(self, client):
        """Asking the status of an unknown job answers HTTP 404."""
        response = client.get("/api/benchmark/nonexistent-job-id/status")
        assert response.status_code == 404

    def test_status_valid_job(self, client, tmp_corpus):
        """The status of a started job echoes its id with status and progress."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        status_response = client.get(f"/api/benchmark/{job_id}/status")
        assert status_response.status_code == 200
        payload = status_response.json()
        assert payload["job_id"] == job_id
        assert "status" in payload
        assert "progress" in payload

    def test_cancel_nonexistent_job(self, client):
        """Cancelling an unknown job answers HTTP 404."""
        response = client.post("/api/benchmark/nonexistent-id/cancel")
        assert response.status_code == 404

    def test_cancel_valid_job(self, client, tmp_corpus):
        """Cancelling a started job answers HTTP 200."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        cancel_response = client.post(f"/api/benchmark/{job_id}/cancel")
        assert cancel_response.status_code == 200

    def test_job_status_fields(self, client, tmp_corpus):
        """The job status payload exposes every expected key."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        payload = client.get(f"/api/benchmark/{job_id}/status").json()
        expected = (
            "job_id", "status", "progress",
            "total_docs", "processed_docs", "output_path",
        )
        for k in expected:
            assert k in payload, f"Missing key: {k}"

    def test_stream_nonexistent_job(self, client):
        """Streaming an unknown job answers HTTP 404."""
        response = client.get("/api/benchmark/nonexistent-id/stream")
        assert response.status_code == 404
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
# ===========================================================================
|
| 790 |
+
# TestFastAPIHTML
|
| 791 |
+
# ===========================================================================
|
| 792 |
+
|
| 793 |
+
class TestFastAPIHTML:
    """Tests for the HTML page served at ``/``."""

    def test_root_200(self, client):
        """The root page answers HTTP 200."""
        response = client.get("/")
        assert response.status_code == 200

    def test_root_is_html(self, client):
        """The root page is served with an HTML content type."""
        response = client.get("/")
        content_type = response.headers["content-type"]
        assert "text/html" in content_type

    def test_html_has_picarones_title(self, client):
        """The page text mentions ``Picarones``."""
        page = client.get("/").text
        assert "Picarones" in page

    def test_html_has_nav_sections(self, client):
        """The page mentions each main section name (case-insensitive)."""
        page_lower = client.get("/").text.lower()
        for section in ("benchmark", "reports", "engines", "import"):
            assert section in page_lower

    def test_html_has_french_content(self, client):
        """The page contains the French word ``Moteurs`` in some letter case."""
        page = client.get("/").text
        assert "Moteurs" in page or "moteurs" in page.lower()
|
| 815 |
+
|
| 816 |
+
|
| 817 |
+
# ===========================================================================
|
| 818 |
+
# TestFastAPIReportServe
|
| 819 |
+
# ===========================================================================
|
| 820 |
+
|
| 821 |
+
class TestFastAPIReportServe:
    """Tests for the ``GET /reports/<filename>`` route that serves HTML reports."""

    def test_serve_nonexistent_report(self, client):
        """Requesting a report that does not exist answers HTTP 404."""
        response = client.get("/reports/nonexistent_report.html")
        assert response.status_code == 404

    def test_serve_existing_report(self, client, tmp_path, monkeypatch):
        """An HTML file created in the current working directory is served with 200."""
        # monkeypatch.chdir restores the previous working directory at teardown,
        # replacing the hand-written os.getcwd()/os.chdir()/finally dance.
        monkeypatch.chdir(tmp_path)
        report_file = tmp_path / "test_serve.html"
        report_file.write_text("<html><body>Test</body></html>")
        response = client.get("/reports/test_serve.html")
        assert response.status_code == 200

    def test_serve_non_html_rejected(self, client):
        """Requesting a ``.py`` name under ``/reports/`` answers HTTP 404."""
        # Intended to exercise the non-HTML extension rejection.
        # NOTE(review): the file does not exist either, so a 404 here does not by
        # itself prove the extension filter — confirm against the route handler.
        response = client.get("/reports/malicious.py")
        assert response.status_code == 404

    def test_serve_report_content_type(self, client, tmp_path, monkeypatch):
        """When a report is served successfully, its content type mentions HTML."""
        monkeypatch.chdir(tmp_path)
        report_file = tmp_path / "report_ct.html"
        report_file.write_text("<html><body>Content</body></html>")
        response = client.get("/reports/report_ct.html")
        # The header is only checked on success; a non-200 answer passes silently.
        if response.status_code == 200:
            assert "html" in response.headers.get("content-type", "").lower()
|
| 857 |
+
|
| 858 |
+
|
| 859 |
+
# ===========================================================================
|
| 860 |
+
# TestCLIServeCommand
|
| 861 |
+
# ===========================================================================
|
| 862 |
+
|
| 863 |
+
class TestCLIServeCommand:
    """Tests for the ``serve`` sub-command of the Picarones CLI."""

    def test_serve_command_registered(self):
        """``serve`` is registered on the CLI group."""
        from picarones.cli import cli

        registered = cli.commands
        assert "serve" in registered

    def test_serve_help_text(self):
        """``serve --help`` exits cleanly and mentions serve or localhost."""
        from picarones.cli import cli

        outcome = CliRunner().invoke(cli, ["serve", "--help"])
        assert outcome.exit_code == 0
        help_text = outcome.output.lower()
        assert "serve" in help_text or "localhost" in help_text

    def test_serve_default_port_in_help(self):
        """The help output mentions port 8000."""
        from picarones.cli import cli

        outcome = CliRunner().invoke(cli, ["serve", "--help"])
        assert "8000" in outcome.output

    def test_serve_help_has_port_option(self):
        """The help output documents the ``--port`` option."""
        from picarones.cli import cli

        outcome = CliRunner().invoke(cli, ["serve", "--help"])
        assert "--port" in outcome.output

    def test_serve_missing_uvicorn_exits_gracefully(self):
        """``serve --help`` is invocable and exits with code 0."""
        from picarones.cli import cli

        # With uvicorn installed, running the command would start the server, so
        # this only checks that the command exists and can be invoked (not that
        # it starts the server): only the help output is requested.
        outcome = CliRunner().invoke(cli, ["serve", "--help"])
        assert outcome.exit_code == 0
|
| 897 |
+
|
| 898 |
+
|
| 899 |
+
# ===========================================================================
|
| 900 |
+
# TestRunnerProgressCallback
|
| 901 |
+
# ===========================================================================
|
| 902 |
+
|
| 903 |
+
class TestRunnerProgressCallback:
    """Tests for the ``progress_callback`` parameter of ``run_benchmark``."""

    @staticmethod
    def _make_engine(engine_name="mock"):
        """Build a stub OCR engine named *engine_name* that returns a fixed text.

        Shared by the tests below instead of redefining the same stub class in
        each one. The returned text is not inspected by any assertion here.
        """
        from picarones.engines.base import BaseOCREngine

        class MockEngine(BaseOCREngine):
            @property
            def name(self):
                return engine_name

            @property
            def version(self):
                return "0.0.1"

            def _run_ocr(self, image_path):
                return "texte mock"

        return MockEngine()

    def test_callback_signature_accepted(self):
        """run_benchmark accepts a progress_callback parameter."""
        import inspect
        from picarones.core.runner import run_benchmark

        parameters = inspect.signature(run_benchmark).parameters
        assert "progress_callback" in parameters

    def test_callback_is_optional(self):
        """progress_callback is optional (its default value is None)."""
        import inspect
        from picarones.core.runner import run_benchmark

        parameter = inspect.signature(run_benchmark).parameters["progress_callback"]
        assert parameter.default is None

    def test_callback_called_with_mock_engine(self, tmp_corpus):
        """With a single engine, the callback is called once per corpus document."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        corpus = load_corpus_from_directory(str(tmp_corpus))
        calls = []

        def my_callback(engine_name, doc_idx, doc_id):
            calls.append((engine_name, doc_idx, doc_id))

        run_benchmark(corpus, [self._make_engine()], progress_callback=my_callback)
        assert len(calls) == len(corpus), f"Expected {len(corpus)} calls, got {len(calls)}"

    def test_callback_receives_engine_name(self, tmp_corpus):
        """The callback receives the engine's name."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        corpus = load_corpus_from_directory(str(tmp_corpus))
        engine_names = []

        def my_callback(engine_name, doc_idx, doc_id):
            engine_names.append(engine_name)

        run_benchmark(
            corpus,
            [self._make_engine("test_engine_name")],
            progress_callback=my_callback,
        )
        # all() is True on an empty list, so first make sure the callback ran.
        assert engine_names
        assert all(n == "test_engine_name" for n in engine_names)

    def test_callback_exception_does_not_crash(self, tmp_corpus):
        """An exception raised inside the callback does not crash the benchmark."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        corpus = load_corpus_from_directory(str(tmp_corpus))

        def bad_callback(engine_name, doc_idx, doc_id):
            raise RuntimeError("Callback error!")

        # Must not raise.
        result = run_benchmark(corpus, [self._make_engine()], progress_callback=bad_callback)
        assert result is not None
|