"""Générateur du rapport HTML interactif auto-contenu. Le rapport produit est un fichier HTML unique embarquant : - Toutes les données (JSON inline) - Chart.js et diff2html (depuis cdnjs) - CSS et JavaScript de l'application Vues disponibles ---------------- 1. Classement — tableau triable par colonne (CER, WER, MER, WIL) 2. Galerie — grille d'images avec badge CER coloré 3. Document — image zoomable + diff coloré GT / OCR par moteur 4. Analyses — histogramme CER + graphique radar """ from __future__ import annotations import json import math from pathlib import Path from typing import Optional from picarones.core.results import BenchmarkResult from picarones.report.diff_utils import compute_word_diff from picarones.core.statistics import ( compute_pairwise_stats, compute_reliability_curve, compute_correlation_matrix, compute_venn_data, cluster_errors, bootstrap_ci, ) from picarones.core.difficulty import compute_all_difficulties, difficulty_label, difficulty_color # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _cer_color(cer: float) -> str: """Retourne une couleur CSS pour un score CER donné (0→vert, 1→rouge).""" if cer < 0.05: return "#16a34a" # vert if cer < 0.15: return "#ca8a04" # jaune-orangé if cer < 0.30: return "#ea580c" # orange return "#dc2626" # rouge def _cer_bg(cer: float) -> str: if cer < 0.05: return "#dcfce7" if cer < 0.15: return "#fef9c3" if cer < 0.30: return "#ffedd5" return "#fee2e2" def _pct(v: Optional[float], decimals: int = 2) -> str: if v is None: return "—" return f"{v * 100:.{decimals}f} %" def _safe(v: Optional[float], decimals: int = 4) -> float: return round(v or 0.0, decimals) # --------------------------------------------------------------------------- # Préparation des données # --------------------------------------------------------------------------- def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -> dict: """Transforme un BenchmarkResult en dict JSON pour le rapport HTML.""" engines_summary = [] for report in benchmark.engine_reports: agg = report.aggregated_metrics diplo_agg = agg.get("cer_diplomatic", {}) entry: dict = { "name": report.engine_name, "version": report.engine_version, "cer": _safe(agg.get("cer", {}).get("mean")), "wer": _safe(agg.get("wer", {}).get("mean")), "mer": _safe(agg.get("mer", {}).get("mean")), "wil": _safe(agg.get("wil", {}).get("mean")), "cer_median": _safe(agg.get("cer", {}).get("median")), "cer_min": _safe(agg.get("cer", {}).get("min")), "cer_max": _safe(agg.get("cer", {}).get("max")), "doc_count": agg.get("document_count", 0), "failed": agg.get("failed_count", 0), # CER diplomatique (après normalisation historique : ſ=s, u=v, i=j…) "cer_diplomatic": _safe(diplo_agg.get("mean")) if diplo_agg else None, "cer_diplomatic_profile": diplo_agg.get("profile"), # Distribution pour l'histogramme : liste des CER individuels "cer_values": [ _safe(dr.metrics.cer) for dr in report.document_results if dr.metrics.error is None ], "cer_diplomatic_values": [ _safe(dr.metrics.cer_diplomatic) for dr in report.document_results if dr.metrics.error is None and dr.metrics.cer_diplomatic is not None ], # Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls) "is_pipeline": report.is_pipeline, "pipeline_info": report.pipeline_info, # Sprint 5 — métriques avancées patrimoniales "ligature_score": _safe(report.ligature_score) if report.ligature_score is not None else None, "diacritic_score": _safe(report.diacritic_score) if report.diacritic_score is not None else None, "aggregated_confusion": report.aggregated_confusion, "aggregated_taxonomy": report.aggregated_taxonomy, "aggregated_structure": report.aggregated_structure, "aggregated_image_quality": report.aggregated_image_quality, } engines_summary.append(entry) # Documents (vue galerie + vue détail) # On collecte tous les doc_ids depuis le premier moteur doc_ids_ordered = [] if benchmark.engine_reports: doc_ids_ordered = [dr.doc_id for dr in benchmark.engine_reports[0].document_results] # Index croisé : doc_id → {engine_name → DocumentResult} doc_engine_map: dict[str, dict] = {did: {} for did in doc_ids_ordered} for report in benchmark.engine_reports: for dr in report.document_results: doc_engine_map[dr.doc_id][report.engine_name] = dr documents = [] for doc_id in doc_ids_ordered: engine_results = [] gt = "" image_path = "" for engine_name in [r.engine_name for r in benchmark.engine_reports]: dr = doc_engine_map[doc_id].get(engine_name) if dr is None: continue gt = dr.ground_truth image_path = dr.image_path diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis) er_entry: dict = { "engine": engine_name, "hypothesis": dr.hypothesis, "cer": _safe(dr.metrics.cer), "cer_diplomatic": _safe(dr.metrics.cer_diplomatic) if dr.metrics.cer_diplomatic is not None else None, "wer": _safe(dr.metrics.wer), "duration": dr.duration_seconds, "error": dr.engine_error, "diff": diff_ops, } # Champs spécifiques aux pipelines OCR+LLM if dr.ocr_intermediate is not None: er_entry["ocr_intermediate"] = dr.ocr_intermediate er_entry["ocr_diff"] = compute_word_diff(dr.ground_truth, dr.ocr_intermediate) er_entry["llm_correction_diff"] = compute_word_diff(dr.ocr_intermediate, dr.hypothesis) if dr.pipeline_metadata: on = dr.pipeline_metadata.get("over_normalization") if on is not None: er_entry["over_normalization"] = on er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode") # Sprint 5 — métriques avancées par document if dr.char_scores is not None: er_entry["ligature_score"] = _safe(dr.char_scores.get("ligature", {}).get("score")) er_entry["diacritic_score"] = _safe(dr.char_scores.get("diacritic", {}).get("score")) if dr.taxonomy is not None: er_entry["taxonomy"] = dr.taxonomy if dr.structure is not None: er_entry["structure"] = dr.structure if dr.image_quality is not None: er_entry["image_quality"] = dr.image_quality engine_results.append(er_entry) # CER moyen sur ce document (pour le badge galerie) cer_values = [er["cer"] for er in engine_results if er["error"] is None] mean_cer = sum(cer_values) / len(cer_values) if cer_values else 1.0 best_engine = min(engine_results, key=lambda x: x["cer"], default=None) # Script type (depuis metadata par document si disponible) script_type = "" first_dr = doc_engine_map[doc_id].get( benchmark.engine_reports[0].engine_name if benchmark.engine_reports else None ) if first_dr and first_dr.image_quality: script_type = first_dr.image_quality.get("script_type", "") documents.append({ "doc_id": doc_id, "image_path": image_path, "image_b64": images_b64.get(doc_id, ""), "ground_truth": gt, "mean_cer": _safe(mean_cer), "best_engine": best_engine["engine"] if best_engine else "", "engine_results": engine_results, "script_type": script_type, }) # ── Sprint 7 — Score de difficulté intrinsèque ─────────────────────── gt_map = {d["doc_id"]: d["ground_truth"] for d in documents} cer_map: dict[str, dict[str, float]] = {d["doc_id"]: {} for d in documents} iq_map: dict[str, float] = {} for report in benchmark.engine_reports: for dr in report.document_results: cer_map.setdefault(dr.doc_id, {})[report.engine_name] = _safe(dr.metrics.cer) if dr.image_quality and "quality_score" in dr.image_quality: iq_map[dr.doc_id] = dr.image_quality["quality_score"] difficulty_scores = compute_all_difficulties( doc_ids=doc_ids_ordered, ground_truths=gt_map, cer_map=cer_map, image_quality_map=iq_map or None, ) # Ajouter difficulty_score à chaque document for doc in documents: ds = difficulty_scores.get(doc["doc_id"]) if ds: doc["difficulty_score"] = _safe(ds.score) doc["difficulty_label"] = difficulty_label(ds.score) else: doc["difficulty_score"] = 0.5 doc["difficulty_label"] = "Modéré" # ── Sprint 7 — Tests statistiques (Wilcoxon pairwise + bootstrap CI) ─ engine_cer_map_stats: dict[str, list[float]] = {} for report in benchmark.engine_reports: vals = [_safe(dr.metrics.cer) for dr in report.document_results if dr.metrics.error is None] if vals: engine_cer_map_stats[report.engine_name] = vals pairwise_stats = compute_pairwise_stats(engine_cer_map_stats) bootstrap_cis: list[dict] = [] for engine_name, vals in engine_cer_map_stats.items(): lo, hi = bootstrap_ci(vals) mean_v = sum(vals) / len(vals) if vals else 0.0 bootstrap_cis.append({ "engine": engine_name, "mean": _safe(mean_v), "ci_lower": _safe(lo), "ci_upper": _safe(hi), }) # ── Sprint 7 — Courbes de fiabilité ────────────────────────────────── reliability_curves: list[dict] = [] for report in benchmark.engine_reports: vals = [_safe(dr.metrics.cer) for dr in report.document_results if dr.metrics.error is None] curve = compute_reliability_curve(vals) reliability_curves.append({ "engine": report.engine_name, "points": curve, }) # ── Sprint 7 — Venn des erreurs communes / exclusives ──────────────── # Construire les ensembles d'erreurs par moteur : {engine → set(doc_id:gt_tok:hyp_tok)} venn_error_sets: dict[str, set[str]] = {} for report in benchmark.engine_reports: error_set: set[str] = set() for dr in report.document_results: ops = compute_word_diff(dr.ground_truth, dr.hypothesis) for op in ops: if op["op"] in ("replace", "delete", "insert"): key = f"{dr.doc_id}:{op.get('old', op.get('text',''))}:{op.get('new', op.get('text',''))}" error_set.add(key) venn_error_sets[report.engine_name] = error_set venn_data = compute_venn_data(venn_error_sets) # ── Sprint 7 — Clustering des patterns d'erreurs ───────────────────── error_data_all: list[dict] = [] for report in benchmark.engine_reports: for dr in report.document_results: error_data_all.append({ "engine": report.engine_name, "gt": dr.ground_truth, "hypothesis": dr.hypothesis, }) error_clusters_raw = cluster_errors(error_data_all, max_clusters=8) error_clusters = [c.as_dict() for c in error_clusters_raw] # ── Sprint 7 — Matrice de corrélation ──────────────────────────────── # Pour chaque moteur : une liste de dicts métriques par document correlation_per_engine: list[dict] = [] for report in benchmark.engine_reports: metrics_list = [] for dr in report.document_results: if dr.metrics.error is not None: continue entry: dict[str, float] = { "cer": _safe(dr.metrics.cer), "wer": _safe(dr.metrics.wer), "mer": _safe(dr.metrics.mer), "wil": _safe(dr.metrics.wil), } if dr.image_quality: entry["quality_score"] = _safe(dr.image_quality.get("quality_score", 0.5)) entry["sharpness"] = _safe(dr.image_quality.get("sharpness_score", 0.5)) if dr.char_scores: entry["ligature"] = _safe(dr.char_scores.get("ligature", {}).get("score", 0.5)) entry["diacritic"] = _safe(dr.char_scores.get("diacritic", {}).get("score", 0.5)) metrics_list.append(entry) if metrics_list: corr = compute_correlation_matrix(metrics_list) correlation_per_engine.append({ "engine": report.engine_name, **corr, }) return { "meta": { "corpus_name": benchmark.corpus_name, "corpus_source": benchmark.corpus_source, "document_count": benchmark.document_count, "run_date": benchmark.run_date, "picarones_version": benchmark.picarones_version, "metadata": benchmark.metadata, }, "ranking": benchmark.ranking(), "engines": engines_summary, "documents": documents, # Sprint 7 "statistics": { "pairwise_wilcoxon": pairwise_stats, "bootstrap_cis": bootstrap_cis, }, "reliability_curves": reliability_curves, "venn_data": venn_data, "error_clusters": error_clusters, "correlation_per_engine": correlation_per_engine, } # --------------------------------------------------------------------------- # Template HTML # --------------------------------------------------------------------------- _HTML_TEMPLATE = """\ Picarones — {corpus_name}

Classement des moteurs

# Concurrent CER exact CER diplo. WER MER WIL Ligatures Diacritiques CER médian CER min CER max Sur-norm. Docs
CER < 5 %
5–15 %
15–30 %
> 30 %

Sélectionner un document

Image originale

🖼 Sélectionnez un document

Vérité terrain (GT)

✓ Ground Truth

Sorties OCR — diff par moteur

Distribution du CER par moteur

Profil des moteurs (radar)

Axe radar : CER, WER, MER, WIL — valeurs inversées (plus c'est haut, meilleur est le moteur).

CER par document (tous moteurs)

Temps d'exécution moyen (secondes/document)

Qualité image ↔ CER (scatter plot)

Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.

Taxonomie des erreurs par moteur

Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).

Courbes de fiabilité

Pour les X% documents les plus faciles (triés par CER croissant), quel est le CER moyen cumulé ? Une courbe basse = moteur performant même sur les documents faciles.

Intervalles de confiance à 95 % (bootstrap)

IC à 95% sur le CER moyen par moteur (1000 itérations bootstrap).

Erreurs communes / exclusives (Venn)

Intersection des ensembles d'erreurs entre les 2 ou 3 premiers concurrents. Erreurs communes = segments partagés.

Tests de Wilcoxon — comparaisons par paires

Test signé-rangé de Wilcoxon (non-paramétrique). Seuil α = 0.05.

Clustering des patterns d'erreurs

Matrice de corrélation entre métriques

Coefficient de Pearson entre les métriques CER, WER, qualité image, ligatures, diacritiques. Vert = corrélation positive, Rouge = corrélation négative.

Analyse des caractères

Matrice de confusion unicode — substitutions les plus fréquentes (caractère GT → caractère OCR)

Reconnaissance des ligatures

Distribution taxonomique des erreurs

""" # --------------------------------------------------------------------------- # Classe principale # --------------------------------------------------------------------------- class ReportGenerator: """Génère un rapport HTML interactif depuis un BenchmarkResult. Usage ----- >>> from picarones.report import ReportGenerator >>> gen = ReportGenerator(benchmark_result) >>> path = gen.generate("rapport.html") """ def __init__( self, benchmark: BenchmarkResult, images_b64: Optional[dict[str, str]] = None, ) -> None: """ Parameters ---------- benchmark: Résultat de benchmark à visualiser. images_b64: Dictionnaire {doc_id: data-URI base64} des images. Si None, le générateur cherche dans ``benchmark.metadata["_images_b64"]``. """ self.benchmark = benchmark self.images_b64: dict[str, str] = images_b64 or {} # Récupérer les images embarquées dans les metadata (fixtures) if not self.images_b64: self.images_b64 = benchmark.metadata.get("_images_b64", {}) # type: ignore[assignment] def generate(self, output_path: str | Path) -> Path: """Génère le fichier HTML et le sauvegarde sur disque. Parameters ---------- output_path: Chemin du fichier HTML à écrire. Returns ------- Path Chemin absolu du fichier généré. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) report_data = _build_report_data(self.benchmark, self.images_b64) report_json = json.dumps(report_data, ensure_ascii=False, separators=(",", ":")) html = _HTML_TEMPLATE.format( corpus_name=self.benchmark.corpus_name, picarones_version=self.benchmark.picarones_version, report_data_json=report_json, ) output_path.write_text(html, encoding="utf-8") return output_path.resolve() @classmethod def from_json(cls, json_path: str | Path, **kwargs) -> "ReportGenerator": """Crée un générateur depuis un fichier JSON de résultats. Compatible avec les fichiers produits par ``BenchmarkResult.to_json()``. Les images base64 doivent être passées via ``kwargs["images_b64"]`` si elles ne sont pas dans le JSON. """ import json as _json data = _json.loads(Path(json_path).read_text(encoding="utf-8")) # Reconstruction minimale d'un BenchmarkResult depuis le dict from picarones.core.metrics import MetricsResult from picarones.core.results import DocumentResult, EngineReport engine_reports = [] for er_data in data.get("engine_reports", []): doc_results = [] for dr_data in er_data.get("document_results", []): m = dr_data["metrics"] metrics = MetricsResult( cer=m["cer"], cer_nfc=m["cer_nfc"], cer_caseless=m["cer_caseless"], wer=m["wer"], wer_normalized=m["wer_normalized"], mer=m["mer"], wil=m["wil"], reference_length=m["reference_length"], hypothesis_length=m["hypothesis_length"], error=m.get("error"), ) doc_results.append(DocumentResult( doc_id=dr_data["doc_id"], image_path=dr_data["image_path"], ground_truth=dr_data["ground_truth"], hypothesis=dr_data["hypothesis"], metrics=metrics, duration_seconds=dr_data.get("duration_seconds", 0.0), engine_error=dr_data.get("engine_error"), )) engine_reports.append(EngineReport( engine_name=er_data["engine_name"], engine_version=er_data.get("engine_version", "unknown"), engine_config=er_data.get("engine_config", {}), document_results=doc_results, )) corpus_info = data.get("corpus", {}) bm = BenchmarkResult( corpus_name=corpus_info.get("name", "Corpus"), corpus_source=corpus_info.get("source"), document_count=corpus_info.get("document_count", 0), engine_reports=engine_reports, run_date=data.get("run_date", ""), picarones_version=data.get("picarones_version", ""), metadata=data.get("metadata", {}), ) images_b64 = kwargs.pop("images_b64", {}) return cls(bm, images_b64=images_b64, **kwargs)