"""Générateur du rapport HTML interactif auto-contenu.
Le rapport produit est un fichier HTML unique embarquant :
- Toutes les données (JSON inline)
- Chart.js et diff2html (depuis cdnjs)
- CSS et JavaScript de l'application
Vues disponibles
----------------
1. Classement — tableau triable par colonne (CER, WER, MER, WIL)
2. Galerie — grille d'images avec badge CER coloré
3. Document — image zoomable + diff coloré GT / OCR par moteur
4. Analyses — histogramme CER + graphique radar
"""
from __future__ import annotations
import json
import math
from pathlib import Path
from typing import Optional
from picarones.core.results import BenchmarkResult
from picarones.report.diff_utils import compute_word_diff
from picarones.core.statistics import (
compute_pairwise_stats,
compute_reliability_curve,
compute_correlation_matrix,
compute_venn_data,
cluster_errors,
bootstrap_ci,
)
from picarones.core.difficulty import compute_all_difficulties, difficulty_label, difficulty_color
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _cer_color(cer: float) -> str:
"""Retourne une couleur CSS pour un score CER donné (0→vert, 1→rouge)."""
if cer < 0.05:
return "#16a34a" # vert
if cer < 0.15:
return "#ca8a04" # jaune-orangé
if cer < 0.30:
return "#ea580c" # orange
return "#dc2626" # rouge
def _cer_bg(cer: float) -> str:
if cer < 0.05:
return "#dcfce7"
if cer < 0.15:
return "#fef9c3"
if cer < 0.30:
return "#ffedd5"
return "#fee2e2"
def _pct(v: Optional[float], decimals: int = 2) -> str:
if v is None:
return "—"
return f"{v * 100:.{decimals}f} %"
def _safe(v: Optional[float], decimals: int = 4) -> float:
return round(v or 0.0, decimals)
# ---------------------------------------------------------------------------
# Préparation des données
# ---------------------------------------------------------------------------
def _successful_cer_values(report) -> list[float]:
    """Return the rounded CER of every document of ``report`` whose metrics carry no error.

    A fresh list is built on every call so that callers can hand it to
    statistics helpers without sharing state with the report payload.
    """
    return [
        _safe(dr.metrics.cer)
        for dr in report.document_results
        if dr.metrics.error is None
    ]


def _engine_summary(report) -> dict:
    """Build the per-engine summary entry (ranking table, histogram, radar)."""
    agg = report.aggregated_metrics
    cer_agg = agg.get("cer", {})
    diplo_agg = agg.get("cer_diplomatic", {})
    return {
        "name": report.engine_name,
        "version": report.engine_version,
        "cer": _safe(cer_agg.get("mean")),
        "wer": _safe(agg.get("wer", {}).get("mean")),
        "mer": _safe(agg.get("mer", {}).get("mean")),
        "wil": _safe(agg.get("wil", {}).get("mean")),
        "cer_median": _safe(cer_agg.get("median")),
        "cer_min": _safe(cer_agg.get("min")),
        "cer_max": _safe(cer_agg.get("max")),
        "doc_count": agg.get("document_count", 0),
        "failed": agg.get("failed_count", 0),
        # Diplomatic CER (after historical normalisation: ſ=s, u=v, i=j…)
        "cer_diplomatic": _safe(diplo_agg.get("mean")) if diplo_agg else None,
        "cer_diplomatic_profile": diplo_agg.get("profile"),
        # Distribution for the histogram: list of individual CER values
        "cer_values": _successful_cer_values(report),
        "cer_diplomatic_values": [
            _safe(dr.metrics.cer_diplomatic)
            for dr in report.document_results
            if dr.metrics.error is None and dr.metrics.cer_diplomatic is not None
        ],
        # OCR+LLM pipeline fields (empty for OCR-only engines)
        "is_pipeline": report.is_pipeline,
        "pipeline_info": report.pipeline_info,
        # Sprint 5 — advanced heritage-oriented metrics
        "ligature_score": _safe(report.ligature_score) if report.ligature_score is not None else None,
        "diacritic_score": _safe(report.diacritic_score) if report.diacritic_score is not None else None,
        "aggregated_confusion": report.aggregated_confusion,
        "aggregated_taxonomy": report.aggregated_taxonomy,
        "aggregated_structure": report.aggregated_structure,
        "aggregated_image_quality": report.aggregated_image_quality,
    }


def _engine_result_entry(engine_name: str, dr, diff_ops: list) -> dict:
    """Build the entry describing one engine's output for one document.

    ``diff_ops`` is the word diff between the ground truth and the hypothesis,
    computed by the caller so that it can be reused elsewhere.
    """
    metrics = dr.metrics
    entry: dict = {
        "engine": engine_name,
        "hypothesis": dr.hypothesis,
        "cer": _safe(metrics.cer),
        "cer_diplomatic": _safe(metrics.cer_diplomatic) if metrics.cer_diplomatic is not None else None,
        "wer": _safe(metrics.wer),
        "duration": dr.duration_seconds,
        "error": dr.engine_error,
        "diff": diff_ops,
    }
    # Fields specific to OCR+LLM pipelines
    if dr.ocr_intermediate is not None:
        entry["ocr_intermediate"] = dr.ocr_intermediate
        entry["ocr_diff"] = compute_word_diff(dr.ground_truth, dr.ocr_intermediate)
        entry["llm_correction_diff"] = compute_word_diff(dr.ocr_intermediate, dr.hypothesis)
    if dr.pipeline_metadata:
        over_normalization = dr.pipeline_metadata.get("over_normalization")
        if over_normalization is not None:
            entry["over_normalization"] = over_normalization
        entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
    # Sprint 5 — advanced per-document metrics
    if dr.char_scores is not None:
        entry["ligature_score"] = _safe(dr.char_scores.get("ligature", {}).get("score"))
        entry["diacritic_score"] = _safe(dr.char_scores.get("diacritic", {}).get("score"))
    if dr.taxonomy is not None:
        entry["taxonomy"] = dr.taxonomy
    if dr.structure is not None:
        entry["structure"] = dr.structure
    if dr.image_quality is not None:
        entry["image_quality"] = dr.image_quality
    return entry


def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -> dict:
    """Turn a BenchmarkResult into the JSON-serialisable dict consumed by the HTML report.

    Parameters
    ----------
    benchmark:
        Benchmark whose engine reports and document results are summarised.
    images_b64:
        Mapping ``doc_id -> image data`` embedded in each document entry;
        documents without an entry get an empty string.

    Returns
    -------
    dict
        Payload with the keys ``meta``, ``ranking``, ``engines``,
        ``documents``, ``statistics``, ``reliability_curves``, ``venn_data``,
        ``error_clusters`` and ``correlation_per_engine``.
    """
    reports = benchmark.engine_reports
    engine_names = [report.engine_name for report in reports]

    engines_summary = [_engine_summary(report) for report in reports]

    # ── Documents (gallery view + detail view) ───────────────────────────
    # Document order follows the first engine. Documents only reported by a
    # later engine are appended in first-seen order instead of raising KeyError.
    doc_ids_ordered = [dr.doc_id for dr in reports[0].document_results] if reports else []
    # Cross index: doc_id → {engine_name → DocumentResult}
    doc_engine_map: dict[str, dict] = {doc_id: {} for doc_id in doc_ids_ordered}
    for report in reports:
        for dr in report.document_results:
            if dr.doc_id not in doc_engine_map:
                doc_engine_map[dr.doc_id] = {}
                doc_ids_ordered.append(dr.doc_id)
            doc_engine_map[dr.doc_id][report.engine_name] = dr

    first_engine_name = engine_names[0] if engine_names else None
    # GT/hypothesis word diffs keyed by id() of the DocumentResult so the Venn
    # section can reuse them. The results stay alive through ``benchmark`` for
    # the whole call, so the ids are stable. Assumes compute_word_diff is a
    # pure function of its two arguments — TODO confirm.
    diff_by_result: dict[int, list] = {}

    documents = []
    for doc_id in doc_ids_ordered:
        results_for_doc = doc_engine_map[doc_id]
        engine_results = []
        gt = ""
        image_path = ""
        for engine_name in engine_names:
            dr = results_for_doc.get(engine_name)
            if dr is None:
                continue
            gt = dr.ground_truth
            image_path = dr.image_path
            diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
            diff_by_result[id(dr)] = diff_ops
            engine_results.append(_engine_result_entry(engine_name, dr, diff_ops))

        # Mean CER on this document (gallery badge), ignoring failed engines.
        succeeded = [er for er in engine_results if er["error"] is None]
        cer_values = [er["cer"] for er in succeeded]
        mean_cer = sum(cer_values) / len(cer_values) if cer_values else 1.0
        # Prefer engines that actually ran: a failed run must not win just
        # because its missing CER was rounded to 0.0 by _safe.
        best_engine = min(succeeded or engine_results, key=lambda er: er["cer"], default=None)

        # Script type (taken from the first engine's image-quality data if any)
        script_type = ""
        first_dr = results_for_doc.get(first_engine_name)
        if first_dr and first_dr.image_quality:
            script_type = first_dr.image_quality.get("script_type", "")

        documents.append({
            "doc_id": doc_id,
            "image_path": image_path,
            "image_b64": images_b64.get(doc_id, ""),
            "ground_truth": gt,
            "mean_cer": _safe(mean_cer),
            "best_engine": best_engine["engine"] if best_engine else "",
            "engine_results": engine_results,
            "script_type": script_type,
        })

    # ── Sprint 7 — Intrinsic difficulty score ────────────────────────────
    gt_map = {doc["doc_id"]: doc["ground_truth"] for doc in documents}
    cer_map: dict[str, dict[str, float]] = {doc["doc_id"]: {} for doc in documents}
    iq_map: dict[str, float] = {}
    for report in reports:
        for dr in report.document_results:
            cer_map.setdefault(dr.doc_id, {})[report.engine_name] = _safe(dr.metrics.cer)
            if dr.image_quality and "quality_score" in dr.image_quality:
                iq_map[dr.doc_id] = dr.image_quality["quality_score"]
    difficulty_scores = compute_all_difficulties(
        doc_ids=doc_ids_ordered,
        ground_truths=gt_map,
        cer_map=cer_map,
        image_quality_map=iq_map or None,
    )
    # Attach the difficulty score to each document
    for doc in documents:
        ds = difficulty_scores.get(doc["doc_id"])
        if ds:
            doc["difficulty_score"] = _safe(ds.score)
            doc["difficulty_label"] = difficulty_label(ds.score)
        else:
            doc["difficulty_score"] = 0.5
            doc["difficulty_label"] = "Modéré"

    # ── Sprint 7 — Statistical tests (pairwise Wilcoxon + bootstrap CI) ──
    engine_cer_map_stats: dict[str, list[float]] = {}
    for report in reports:
        vals = _successful_cer_values(report)
        if vals:
            engine_cer_map_stats[report.engine_name] = vals
    pairwise_stats = compute_pairwise_stats(engine_cer_map_stats)
    bootstrap_cis: list[dict] = []
    for engine_name, vals in engine_cer_map_stats.items():
        lo, hi = bootstrap_ci(vals)
        mean_v = sum(vals) / len(vals) if vals else 0.0
        bootstrap_cis.append({
            "engine": engine_name,
            "mean": _safe(mean_v),
            "ci_lower": _safe(lo),
            "ci_upper": _safe(hi),
        })

    # ── Sprint 7 — Reliability curves ────────────────────────────────────
    reliability_curves: list[dict] = [
        {
            "engine": report.engine_name,
            "points": compute_reliability_curve(_successful_cer_values(report)),
        }
        for report in reports
    ]

    # ── Sprint 7 — Venn of shared / exclusive errors ─────────────────────
    # Error sets per engine: {engine → set("doc_id:gt_tok:hyp_tok")}
    venn_error_sets: dict[str, set[str]] = {}
    for report in reports:
        error_set: set[str] = set()
        for dr in report.document_results:
            ops = diff_by_result.get(id(dr))
            if ops is None:
                # Result shadowed by a later one with the same engine/doc_id.
                ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
            for op in ops:
                if op["op"] in ("replace", "delete", "insert"):
                    old_tok = op.get("old", op.get("text", ""))
                    new_tok = op.get("new", op.get("text", ""))
                    error_set.add(f"{dr.doc_id}:{old_tok}:{new_tok}")
        venn_error_sets[report.engine_name] = error_set
    venn_data = compute_venn_data(venn_error_sets)

    # ── Sprint 7 — Clustering of error patterns ──────────────────────────
    error_data_all: list[dict] = [
        {
            "engine": report.engine_name,
            "gt": dr.ground_truth,
            "hypothesis": dr.hypothesis,
        }
        for report in reports
        for dr in report.document_results
    ]
    error_clusters_raw = cluster_errors(error_data_all, max_clusters=8)
    error_clusters = [cluster.as_dict() for cluster in error_clusters_raw]

    # ── Sprint 7 — Correlation matrix ────────────────────────────────────
    # For each engine: one dict of metrics per successfully scored document
    correlation_per_engine: list[dict] = []
    for report in reports:
        metrics_list = []
        for dr in report.document_results:
            if dr.metrics.error is not None:
                continue
            metric_row: dict[str, float] = {
                "cer": _safe(dr.metrics.cer),
                "wer": _safe(dr.metrics.wer),
                "mer": _safe(dr.metrics.mer),
                "wil": _safe(dr.metrics.wil),
            }
            if dr.image_quality:
                metric_row["quality_score"] = _safe(dr.image_quality.get("quality_score", 0.5))
                metric_row["sharpness"] = _safe(dr.image_quality.get("sharpness_score", 0.5))
            if dr.char_scores:
                metric_row["ligature"] = _safe(dr.char_scores.get("ligature", {}).get("score", 0.5))
                metric_row["diacritic"] = _safe(dr.char_scores.get("diacritic", {}).get("score", 0.5))
            metrics_list.append(metric_row)
        if metrics_list:
            corr = compute_correlation_matrix(metrics_list)
            correlation_per_engine.append({
                "engine": report.engine_name,
                **corr,
            })

    # The "_images_b64" metadata entry is the image store read by
    # ReportGenerator.__init__; every image is already embedded per document
    # ("image_b64"), so copying it again would duplicate the base64 payload.
    metadata = benchmark.metadata
    if isinstance(metadata, dict) and "_images_b64" in metadata:
        metadata = {key: value for key, value in metadata.items() if key != "_images_b64"}

    return {
        "meta": {
            "corpus_name": benchmark.corpus_name,
            "corpus_source": benchmark.corpus_source,
            "document_count": benchmark.document_count,
            "run_date": benchmark.run_date,
            "picarones_version": benchmark.picarones_version,
            "metadata": metadata,
        },
        "ranking": benchmark.ranking(),
        "engines": engines_summary,
        "documents": documents,
        # Sprint 7
        "statistics": {
            "pairwise_wilcoxon": pairwise_stats,
            "bootstrap_cis": bootstrap_cis,
        },
        "reliability_curves": reliability_curves,
        "venn_data": venn_data,
        "error_clusters": error_clusters,
        "correlation_per_engine": correlation_per_engine,
    }
# ---------------------------------------------------------------------------
# Template HTML
# ---------------------------------------------------------------------------
# Page template rendered with ``str.format`` by ``ReportGenerator.generate``,
# which supplies the ``corpus_name``, ``picarones_version`` and
# ``report_data_json`` fields. Because of ``str.format``, any literal brace
# added to this text must be doubled (``{{`` / ``}}``).
# NOTE(review): only ``{corpus_name}`` is referenced below and no markup,
# script or style is present, although the module docstring describes embedded
# CSS/JavaScript and inline JSON — confirm the template has not been truncated.
_HTML_TEMPLATE = """\
Picarones — {corpus_name}
Classement des moteurs
#↑
Concurrent↕
CER exact↕
CER diplo.↕
WER↕
MER↕
WIL↕
Ligatures↕
Diacritiques↕
CER médian
CER min
CER max
Sur-norm.
Docs
CER < 5 %
5–15 %
15–30 %
> 30 %
Galerie des documents
Aucun document ne correspond aux filtres.
Sélectionner un document
Image originale
🖼Sélectionnez un document
Vérité terrain (GT)
✓ Ground Truth
—
Sorties OCR — diff par moteur
Distribution du CER par moteur
Profil des moteurs (radar)
Axe radar : CER, WER, MER, WIL — valeurs inversées (plus c'est haut, meilleur est le moteur).
CER par document (tous moteurs)
Temps d'exécution moyen (secondes/document)
Qualité image ↔ CER (scatter plot)
Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.
Taxonomie des erreurs par moteur
Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).
Courbes de fiabilité
Pour les X% documents les plus faciles (triés par CER croissant), quel est le CER moyen cumulé ?
Une courbe basse = moteur performant même sur les documents faciles.
Intervalles de confiance à 95 % (bootstrap)
IC à 95% sur le CER moyen par moteur (1000 itérations bootstrap).
Erreurs communes / exclusives (Venn)
Intersection des ensembles d'erreurs entre les 2 ou 3 premiers concurrents.
Erreurs communes = segments partagés.
Tests de Wilcoxon — comparaisons par paires
Test signé-rangé de Wilcoxon (non-paramétrique). Seuil α = 0.05.
Clustering des patterns d'erreurs
Matrice de corrélation entre métriques
Coefficient de Pearson entre les métriques CER, WER, qualité image, ligatures, diacritiques.
Vert = corrélation positive, Rouge = corrélation négative.
Analyse des caractères
Matrice de confusion unicode
— substitutions les plus fréquentes (caractère GT → caractère OCR)
Reconnaissance des ligatures
Distribution taxonomique des erreurs
"""
# ---------------------------------------------------------------------------
# Classe principale
# ---------------------------------------------------------------------------
class ReportGenerator:
    """Generate an interactive HTML report from a BenchmarkResult.

    Usage
    -----
    >>> from picarones.report import ReportGenerator
    >>> gen = ReportGenerator(benchmark_result)
    >>> path = gen.generate("rapport.html")
    """

    # Characters replaced by their \uXXXX escape in the inlined JSON. "<", ">"
    # and "&" only occur inside JSON strings, so escaping them keeps the JSON
    # equivalent while preventing sequences such as "</script>" or "<!--" from
    # terminating or corrupting the enclosing HTML. U+2028/U+2029 are valid in
    # JSON but are line terminators in pre-ES2019 JavaScript string literals.
    _INLINE_JSON_ESCAPES = (
        ("<", "\\u003c"),
        (">", "\\u003e"),
        ("&", "\\u0026"),
        ("\u2028", "\\u2028"),
        ("\u2029", "\\u2029"),
    )

    def __init__(
        self,
        benchmark: "BenchmarkResult",
        images_b64: Optional[dict[str, str]] = None,
    ) -> None:
        """
        Parameters
        ----------
        benchmark:
            Benchmark result to visualise.
        images_b64:
            Mapping ``{doc_id: base64 data URI}`` of the images.
            If None or empty, the generator falls back to
            ``benchmark.metadata["_images_b64"]``.
        """
        self.benchmark = benchmark
        self.images_b64: dict[str, str] = images_b64 or {}
        # Fall back to images embedded in the metadata (fixtures). Missing or
        # null metadata / entry yields an empty mapping rather than an error.
        if not self.images_b64:
            metadata = benchmark.metadata or {}
            self.images_b64 = metadata.get("_images_b64") or {}

    @classmethod
    def _json_for_inline_script(cls, report_data: dict) -> str:
        """Serialise ``report_data`` to compact JSON that is safe to inline in HTML.

        The result parses back (``json.loads`` / ``JSON.parse`` / JS literal)
        to the same value as the unescaped JSON.
        """
        payload = json.dumps(report_data, ensure_ascii=False, separators=(",", ":"))
        for raw, escaped in cls._INLINE_JSON_ESCAPES:
            payload = payload.replace(raw, escaped)
        return payload

    def generate(self, output_path: str | Path) -> Path:
        """Render the HTML report and write it to disk.

        Parameters
        ----------
        output_path:
            Path of the HTML file to write; missing parent directories are
            created.

        Returns
        -------
        Path
            Resolved (absolute) path of the generated file.
        """
        from html import escape

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        report_data = _build_report_data(self.benchmark, self.images_b64)
        # Corpus name and version come from user data and land in HTML text,
        # so they are HTML-escaped; the JSON payload gets script-safe escaping.
        rendered = _HTML_TEMPLATE.format(
            corpus_name=escape(str(self.benchmark.corpus_name)),
            picarones_version=escape(str(self.benchmark.picarones_version)),
            report_data_json=self._json_for_inline_script(report_data),
        )
        output_path.write_text(rendered, encoding="utf-8")
        return output_path.resolve()

    @classmethod
    def from_json(cls, json_path: str | Path, **kwargs) -> "ReportGenerator":
        """Create a generator from a JSON results file.

        Compatible with the files produced by ``BenchmarkResult.to_json()``.
        Base64 images must be passed via ``kwargs["images_b64"]`` when they
        are not part of the JSON. Only the fields read below are restored on
        the rebuilt results.
        """
        # Local imports, as in the original implementation.
        from picarones.core.metrics import MetricsResult
        from picarones.core.results import DocumentResult, EngineReport

        data = json.loads(Path(json_path).read_text(encoding="utf-8"))

        # Minimal reconstruction of a BenchmarkResult from the dict
        engine_reports = []
        for er_data in data.get("engine_reports", []):
            doc_results = []
            for dr_data in er_data.get("document_results", []):
                m = dr_data["metrics"]
                metrics = MetricsResult(
                    cer=m["cer"], cer_nfc=m["cer_nfc"], cer_caseless=m["cer_caseless"],
                    wer=m["wer"], wer_normalized=m["wer_normalized"],
                    mer=m["mer"], wil=m["wil"],
                    reference_length=m["reference_length"],
                    hypothesis_length=m["hypothesis_length"],
                    error=m.get("error"),
                )
                doc_results.append(DocumentResult(
                    doc_id=dr_data["doc_id"],
                    image_path=dr_data["image_path"],
                    ground_truth=dr_data["ground_truth"],
                    hypothesis=dr_data["hypothesis"],
                    metrics=metrics,
                    duration_seconds=dr_data.get("duration_seconds", 0.0),
                    engine_error=dr_data.get("engine_error"),
                ))
            engine_reports.append(EngineReport(
                engine_name=er_data["engine_name"],
                engine_version=er_data.get("engine_version", "unknown"),
                engine_config=er_data.get("engine_config", {}),
                document_results=doc_results,
            ))

        corpus_info = data.get("corpus", {})
        bm = BenchmarkResult(
            corpus_name=corpus_info.get("name", "Corpus"),
            corpus_source=corpus_info.get("source"),
            document_count=corpus_info.get("document_count", 0),
            engine_reports=engine_reports,
            run_date=data.get("run_date", ""),
            picarones_version=data.get("picarones_version", ""),
            metadata=data.get("metadata", {}),
        )
        images_b64 = kwargs.pop("images_b64", {})
        return cls(bm, images_b64=images_b64, **kwargs)