Spaces:
Sleeping
Sleeping
Claude
refactor(core): faire de core/ un cercle 1 strict, déplacer cercle 2 vers measurements/
979f3c3 unverified | """Commandes workflows benchmark : run, diagnose, economics, edition, compare. | |
| Sous-module CLI extrait de l'ancien ``picarones/cli.py`` (1519 lignes) | |
| lors du chantier 5 post-Sprint 97. Les commandes ici s'enregistrent | |
| automatiquement sur le groupe ``cli`` à l'import. | |
| Comportement et signatures inchangés — uniquement de la modularisation. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| import click | |
| from picarones.cli import cli, _engine_from_name, _setup_logging | |
| # --------------------------------------------------------------------------- | |
| # picarones run | |
| # --------------------------------------------------------------------------- | |
def run_cmd(
    corpus: str,
    engines: str,
    output: str,
    lang: str,
    psm: int,
    no_progress: bool,
    verbose: bool,
    fail_if_cer_above: float | None,
    profile: str,
) -> None:
    """Lance un benchmark OCR sur un corpus de documents.

    Le corpus doit être un dossier contenant des paires
    <image>.<ext> + <image>.gt.txt (vérité terrain).
    """
    # NOTE(review): the docstring above is kept verbatim (French) because it
    # is presumably surfaced as the command's help text by click — confirm
    # against the decorators, which are not visible in this excerpt.
    #
    # Flow: load the corpus, build one engine per comma-separated name, run
    # the benchmark, print the ranking, then optionally fail the process when
    # an engine's mean CER (in percent) exceeds ``fail_if_cer_above``.
    # Every failure path terminates through ``sys.exit(1)``.
    _setup_logging(verbose)

    # Project imports are resolved at call time, not at module import.
    from picarones.core.corpus import load_corpus_from_directory
    from picarones.measurements.runner import run_benchmark

    def as_percent(rate):
        # Rates are stored as fractions; ``None`` means "no value available".
        return f"{rate * 100:.2f}%" if rate is not None else "N/A"

    # --- Corpus -----------------------------------------------------------
    try:
        corp = load_corpus_from_directory(corpus)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Erreur corpus : {exc}", err=True)
        sys.exit(1)
    click.echo(f"Corpus '{corp.name}' — {len(corp)} documents chargés.")

    # --- Engines ----------------------------------------------------------
    # Blank entries (e.g. a trailing comma) are ignored; the first name
    # rejected by ``_engine_from_name`` aborts the whole command.
    ocr_engines = []
    try:
        for raw_name in engines.split(","):
            engine_name = raw_name.strip()
            if engine_name:
                ocr_engines.append(
                    _engine_from_name(engine_name, lang=lang, psm=psm)
                )
    except click.BadParameter as exc:
        click.echo(f"Erreur moteur : {exc}", err=True)
        sys.exit(1)

    if not ocr_engines:
        click.echo("Aucun moteur valide spécifié.", err=True)
        sys.exit(1)

    click.echo(f"Moteurs : {', '.join(e.name for e in ocr_engines)}")
    click.echo(f"Profil de métriques : {profile}")

    # --- Benchmark --------------------------------------------------------
    result = run_benchmark(
        corpus=corp,
        engines=ocr_engines,
        output_json=output,
        show_progress=not no_progress,
        profile=profile,
    )

    # --- Ranking display --------------------------------------------------
    click.echo("\n── Classement ──────────────────────────────────")
    for rank, entry in enumerate(result.ranking(), 1):
        cer_pct = as_percent(entry["mean_cer"])
        wer_pct = as_percent(entry["mean_wer"])
        failed = entry["failed"]
        failed_str = f" ({failed} erreur(s))" if failed else ""
        click.echo(f" {rank}. {entry['engine']:<20} CER={cer_pct:<8} WER={wer_pct}{failed_str}")
    click.echo(f"\nRésultats écrits dans : {output}")

    # --- CI/CD gate -------------------------------------------------------
    # Without a threshold there is nothing left to check.
    if fail_if_cer_above is None:
        return
    for entry in result.ranking():
        mean_cer = entry["mean_cer"]
        if mean_cer is None:
            # Engines without a CER value cannot breach the threshold.
            continue
        if mean_cer * 100 > fail_if_cer_above:
            click.echo(
                f"\nECHEC : {entry['engine']} CER={mean_cer*100:.2f}% "
                f"> seuil {fail_if_cer_above:.2f}%",
                err=True,
            )
            # Stop at the first offending engine with a non-zero exit code.
            sys.exit(1)
| # --------------------------------------------------------------------------- | |
| # Workflows CLI dédiés (chantier 4 post-Sprint 97) | |
| # --------------------------------------------------------------------------- | |
| # | |
| # Chaque commande spécialisée fixe un profil de calcul (chantier 2) et | |
| # émet un message identifiant la famille avant de déléguer au runner. | |
| # L'option ``--profile`` reste disponible mais le défaut change pour | |
| # chaque commande. | |
def _run_workflow(
    *,
    corpus: str,
    engines: str,
    output: str,
    lang: str,
    psm: int,
    no_progress: bool,
    verbose: bool,
    profile: str,
    workflow_label: str,
) -> None:
    """Run the common benchmark skeleton for the specialised commands.

    In this file the helper is called by ``diagnose_cmd``,
    ``economics_cmd`` and ``edition_cmd``. They all follow the same steps:
    load the corpus, instantiate the engines, call
    ``run_benchmark(profile=...)`` and print the ranking. Only the metrics
    profile and the bracketed label in the first message differ.

    Args:
        corpus: Corpus location handed to ``load_corpus_from_directory``.
        engines: Comma-separated engine names; blank entries are ignored.
        output: Forwarded to ``run_benchmark`` as ``output_json`` and echoed
            as the place where results were written.
        lang: Forwarded to ``_engine_from_name``.
        psm: Forwarded to ``_engine_from_name``.
        no_progress: When true, ``run_benchmark`` gets ``show_progress=False``.
        verbose: Forwarded to ``_setup_logging``.
        profile: Metrics profile name forwarded to ``run_benchmark``.
        workflow_label: Text shown in brackets before the corpus message.

    Raises:
        SystemExit: With status 1 when the corpus cannot be loaded
            (``FileNotFoundError`` / ``ValueError``), when an engine name is
            rejected (``click.BadParameter``) or when no engine name is left
            after filtering.
    """
    _setup_logging(verbose)

    # Project imports are resolved at call time, not at module import.
    from picarones.core.corpus import load_corpus_from_directory
    from picarones.measurements.runner import run_benchmark

    try:
        corp = load_corpus_from_directory(corpus)
    except (FileNotFoundError, ValueError) as exc:
        click.echo(f"Erreur corpus : {exc}", err=True)
        sys.exit(1)
    click.echo(
        f"[{workflow_label}] Corpus '{corp.name}' — "
        f"{len(corp)} documents chargés."
    )

    # Engines are built in the order given; the first rejected name aborts.
    requested = [chunk.strip() for chunk in engines.split(",")]
    try:
        ocr_engines = [
            _engine_from_name(name, lang=lang, psm=psm)
            for name in requested
            if name
        ]
    except click.BadParameter as exc:
        click.echo(f"Erreur moteur : {exc}", err=True)
        sys.exit(1)

    if not ocr_engines:
        click.echo("Aucun moteur valide spécifié.", err=True)
        sys.exit(1)

    click.echo(f"Moteurs : {', '.join(e.name for e in ocr_engines)}")
    click.echo(f"Profil de métriques : {profile}")

    result = run_benchmark(
        corpus=corp,
        engines=ocr_engines,
        output_json=output,
        show_progress=not no_progress,
        profile=profile,
    )

    click.echo("\n── Classement ──────────────────────────────────")
    for rank, entry in enumerate(result.ranking(), 1):
        # Rates are fractions; a missing value is rendered as "N/A".
        if entry["mean_cer"] is not None:
            cer_pct = f"{entry['mean_cer'] * 100:.2f}%"
        else:
            cer_pct = "N/A"
        if entry["mean_wer"] is not None:
            wer_pct = f"{entry['mean_wer'] * 100:.2f}%"
        else:
            wer_pct = "N/A"

        failed = entry["failed"]
        # The failure count is appended only when it is truthy.
        failed_str = f" ({failed} erreur(s))" if failed else ""
        click.echo(
            f" {rank}. {entry['engine']:<20} "
            f"CER={cer_pct:<8} WER={wer_pct}{failed_str}"
        )
    click.echo(f"\nRésultats écrits dans : {output}")
def diagnose_cmd(
    corpus: str,
    engines: str,
    output: str,
    lang: str,
    psm: int,
    no_progress: bool,
    verbose: bool,
) -> None:
    """Workflow diagnostic : bench + leviers d'amélioration + image_predictive.

    Active le profil ``diagnostics`` (chantier 2) qui calcule les
    métriques nécessaires à la vue HTML « Diagnostic approfondi »
    (chantier 3) : leviers, profil d'image, baseline, longitudinal.
    Idéal pour comprendre *pourquoi* un moteur produit ces résultats
    sur ce corpus, pas seulement *quel CER*.
    """
    # NOTE(review): docstring kept verbatim — presumably used as click help
    # text; confirm against the (not visible) decorators.
    #
    # Thin wrapper: every option is forwarded unchanged to the shared
    # workflow runner; only the profile and the label are fixed here.
    _run_workflow(
        profile="diagnostics",
        workflow_label="diagnose",
        corpus=corpus,
        engines=engines,
        output=output,
        lang=lang,
        psm=psm,
        no_progress=no_progress,
        verbose=verbose,
    )
def economics_cmd(
    corpus: str,
    engines: str,
    output: str,
    lang: str,
    psm: int,
    no_progress: bool,
    verbose: bool,
) -> None:
    """Workflow économique : bench + throughput effectif + (cost projection).

    Active le profil ``economics`` (chantier 2) qui se concentre sur
    les métriques de décision budget : pages/h utilisable (intégrant
    la correction humaine HTR-United à 5 s/erreur), coût marginal par
    erreur évitée. La vue HTML « Coût et performance » (chantier 3)
    est ensuite branchée.
    """
    # NOTE(review): docstring kept verbatim — presumably used as click help
    # text; confirm against the (not visible) decorators.
    #
    # Options shared by all workflow commands, forwarded as-is.
    forwarded = {
        "corpus": corpus,
        "engines": engines,
        "output": output,
        "lang": lang,
        "psm": psm,
        "no_progress": no_progress,
        "verbose": verbose,
    }
    # Only the profile and the label are specific to this command.
    _run_workflow(profile="economics", workflow_label="economics", **forwarded)
def edition_cmd(
    corpus: str,
    engines: str,
    output: str,
    lang: str,
    psm: int,
    no_progress: bool,
    verbose: bool,
) -> None:
    """Workflow édition critique : bench + métriques philologiques.

    Active le profil ``philological`` (chantier 2) qui inclut les
    modules philologiques (unicode_blocks, abbreviations, MUFI,
    early_modern_typography, modern_archives, roman_numerals) et la
    vue HTML « Taxonomie avancée » (chantier 3) avec comparaison
    miroir leader vs runner-up. Cible : éditeurs de chartes,
    paléographes, archivistes.
    """
    # NOTE(review): docstring kept verbatim — presumably used as click help
    # text; confirm against the (not visible) decorators.
    #
    # Delegates to the shared workflow runner with the "philological"
    # profile; the bracketed label shown to the user is "edition".
    selected_profile = "philological"
    label = "edition"
    _run_workflow(
        corpus=corpus,
        engines=engines,
        output=output,
        lang=lang,
        psm=psm,
        no_progress=no_progress,
        verbose=verbose,
        profile=selected_profile,
        workflow_label=label,
    )
| # --------------------------------------------------------------------------- | |
| # picarones compare (Sprint 28) | |
| # --------------------------------------------------------------------------- | |
def compare_cmd(
    run_a: str,
    run_b: str,
    output: str,
    threshold: float,
    label_a: str,
    label_b: str,
    json_only: bool,
    verbose: bool,
) -> None:
    """Compare deux runs de benchmark JSON et signale les régressions.

    Convention : un Δ CER positif signifie que ``B`` est moins bon que
    ``A``. Un moteur dont |Δ CER| > ``--threshold`` est marqué comme
    régression ou amélioration.

    \b
    Exemples :
    picarones compare run_v1.json run_v2.json -o diff.html
    picarones compare run_v1.json run_v2.json --json
    picarones compare run_v1.json run_v2.json --threshold 0.01 --label-a v1 --label-b v2
    """
    # NOTE(review): docstring kept verbatim (including the ``\b`` marker) —
    # presumably used as click help text; confirm against the decorators.
    #
    # Exit status: 2 as soon as at least one regression is detected (in both
    # output modes), normal return otherwise.
    _setup_logging(verbose)

    # Project imports are resolved at call time, not at module import.
    from picarones.report.comparison import (
        compare_benchmarks,
        detect_regressions,
        render_comparison_html,
    )

    diff = compare_benchmarks(
        run_a,
        run_b,
        threshold=threshold,
        label_a=label_a,
        label_b=label_b,
    )
    regressions = detect_regressions(diff)

    # JSON mode: dump the diff on stdout, no HTML report is rendered.
    if json_only:
        payload = json.dumps(diff.as_dict(), ensure_ascii=False, indent=2)
        click.echo(payload)
        if regressions:
            sys.exit(2)  # exit code 2 -> regression detected (useful in CI)
        return

    # HTML mode: render the report, then print a short textual summary.
    out = render_comparison_html(diff, output)
    improvements = [d for d in diff.deltas if d.is_improvement]
    click.echo(f"Rapport de comparaison : {out}")
    click.echo(f"Moteurs comparés : {len(diff.deltas)}")
    click.echo(f"Régressions : {len(regressions)}")
    click.echo(f"Améliorations : {len(improvements)}")

    if not regressions:
        return

    click.echo("\n— Régressions détectées —")
    for reg in regressions:
        click.echo(
            f" ⚠ {reg.engine} : "
            f"{reg.cer_a:.3f} → {reg.cer_b:.3f} (Δ +{reg.delta_cer:.3f})"
        )
    sys.exit(2)