Claude
refactor(report): split generator.py (1063 → 431 lines) by concern
d641f6e unverified
Raw
History Blame
8.24 kB
"""Sections statistiques du rapport (Sprint 7 + Sprint 17).
Construit les blocs :
- ``pairwise_wilcoxon`` — tests de Wilcoxon par paire de moteurs.
- ``bootstrap_cis`` — intervalles de confiance bootstrap par moteur.
- ``friedman`` + ``nemenyi`` — Sprint 17, multi-moteurs.
- ``reliability_curves`` — courbes de fiabilité par moteur.
- ``venn_data`` — diagramme de Venn des erreurs communes/exclusives.
- ``error_clusters`` — clustering des patterns d'erreurs.
- ``correlation_per_engine`` — matrice de corrélation par moteur.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Optional
from picarones.core.diff_utils import compute_word_diff
from picarones.measurements.statistics import (
bootstrap_ci,
cluster_errors,
compute_correlation_matrix,
compute_pairwise_stats,
compute_reliability_curve,
compute_venn_data,
friedman_test,
nemenyi_posthoc,
)
from picarones.report.report_data._helpers import safe_round
if TYPE_CHECKING:
from picarones.core.results import BenchmarkResult
def _engine_cer_values(benchmark: "BenchmarkResult") -> dict[str, list[float]]:
"""Map ``engine_name → [cer_individuels valides]``."""
out: dict[str, list[float]] = {}
for report in benchmark.engine_reports:
vals = [
safe_round(dr.metrics.cer)
for dr in report.document_results
if dr.metrics.error is None
]
if vals:
out[report.engine_name] = vals
return out
def build_pairwise_wilcoxon(benchmark: "BenchmarkResult") -> list[dict]:
    """Pairwise Wilcoxon tests between engines (Sprint 7).

    Gathers the valid per-document CER values of each engine and returns
    whatever ``compute_pairwise_stats`` produces for them, unchanged.
    """
    cer_by_engine = _engine_cer_values(benchmark)
    return compute_pairwise_stats(cer_by_engine)
def build_bootstrap_cis(benchmark: "BenchmarkResult") -> list[dict]:
    """Bootstrap confidence intervals of the CER, one entry per engine (Sprint 7).

    Returns
    -------
    list[dict]
        One ``{"engine", "mean", "ci_lower", "ci_upper"}`` dict per engine
        that has at least one valid CER value; every number goes through
        ``safe_round``.
    """
    rows: list[dict] = []
    for name, cers in _engine_cer_values(benchmark).items():
        lower, upper = bootstrap_ci(cers)
        # Guard against an empty list so the mean never divides by zero.
        if cers:
            average = sum(cers) / len(cers)
        else:
            average = 0.0
        rows.append(
            {
                "engine": name,
                "mean": safe_round(average),
                "ci_lower": safe_round(lower),
                "ci_upper": safe_round(upper),
            }
        )
    return rows
def build_friedman_and_nemenyi(benchmark: "BenchmarkResult") -> dict:
    """Friedman test plus Nemenyi post-hoc across engines (Sprint 17).

    The per-engine CER lists are strictly aligned on the same documents, in
    the same order: only documents for which *every* engine has a result
    without a metrics error are kept, ordered by first appearance in the
    benchmark. When no such document exists the Friedman test is not
    applicable and placeholder payloads flagged with
    ``"error": "no_common_documents"`` are returned instead.

    Returns
    -------
    dict
        ``{"friedman": {...}, "nemenyi": {...}}``, meant to be merged into
        the ``statistics`` section of the report.
    """
    reports = benchmark.engine_reports

    # Unique doc_ids in order of first appearance (dict keys keep insertion order).
    doc_ids_ordered = list(
        dict.fromkeys(
            dr.doc_id for report in reports for dr in report.document_results
        )
    )

    # One lookup per report, restricted to error-free results. Indexing only
    # valid results guarantees that a doc_id present both as a failed and as
    # a successful result in the same report resolves to a successful one,
    # so a failed run's CER can never leak into the statistical tests.
    valid_by_report = [
        {dr.doc_id: dr for dr in report.document_results if dr.metrics.error is None}
        for report in reports
    ]

    # Documents every engine processed successfully.
    common_doc_ids: set[str] = set()
    if valid_by_report:
        common_doc_ids = set(valid_by_report[0]).intersection(*valid_by_report[1:])

    engine_cer_aligned: dict[str, list[float]] = {}
    if common_doc_ids:
        ordered_common = [d for d in doc_ids_ordered if d in common_doc_ids]
        for report, valid_by_id in zip(reports, valid_by_report):
            engine_cer_aligned[report.engine_name] = [
                safe_round(valid_by_id[d].metrics.cer) for d in ordered_common
            ]

    if engine_cer_aligned:
        friedman = friedman_test(engine_cer_aligned)
        nemenyi = nemenyi_posthoc(engine_cer_aligned)
    else:
        # Neutral placeholders carrying the same "error" marker in both payloads.
        friedman = {
            "statistic": 0.0, "p_value": 1.0, "significant": False,
            "df": 0, "n_blocks": 0, "n_engines": 0, "mean_ranks": {},
            "interpretation": "Test de Friedman non calculé — aucun document commun.",
            "error": "no_common_documents",
        }
        nemenyi = {
            "alpha": 0.05, "critical_distance": 0.0, "q_alpha": 0.0,
            "n_blocks": 0, "n_engines": 0, "mean_ranks": {},
            "engines_sorted": [], "significant_matrix": [], "tied_groups": [],
            "error": "no_common_documents",
        }
    return {"friedman": friedman, "nemenyi": nemenyi}
def build_reliability_curves(benchmark: "BenchmarkResult") -> list[dict]:
    """Reliability curve of each engine (Sprint 7).

    For every engine report, the CER values of the documents without a
    metrics error are rounded with ``safe_round`` and handed to
    ``compute_reliability_curve``. An engine with no valid document still
    gets an entry, built from an empty list of values.

    Returns
    -------
    list[dict]
        One ``{"engine": name, "points": curve}`` dict per engine report.
    """
    curves: list[dict] = []
    for engine_report in benchmark.engine_reports:
        valid_results = (
            doc for doc in engine_report.document_results if doc.metrics.error is None
        )
        cers = [safe_round(doc.metrics.cer) for doc in valid_results]
        points = compute_reliability_curve(cers)
        curves.append({"engine": engine_report.engine_name, "points": points})
    return curves
def build_venn_data(benchmark: "BenchmarkResult") -> dict:
    """Venn data of the errors shared by / exclusive to each engine (Sprint 7).

    Builds one set of error keys per engine, each key shaped as
    ``"doc_id:gt_tok:hyp_tok"``, and returns the result of
    ``compute_venn_data`` on ``{engine_name: error_keys}``.
    """
    error_kinds = ("replace", "delete", "insert")
    errors_by_engine: dict[str, set[str]] = {}
    for engine_report in benchmark.engine_reports:
        keys: set[str] = set()
        for doc in engine_report.document_results:
            for op in compute_word_diff(doc.ground_truth, doc.hypothesis):
                if op["op"] not in error_kinds:
                    continue
                # "old"/"new" are preferred; "text" (else "") is the fallback
                # for either side.
                # NOTE(review): when an op only carries "text", both slots
                # receive it, so an insert and a delete of the same token in
                # the same document yield the same key — confirm intended.
                fallback = op.get("text", "")
                gt_token = op.get("old", fallback)
                hyp_token = op.get("new", fallback)
                keys.add(f"{doc.doc_id}:{gt_token}:{hyp_token}")
        errors_by_engine[engine_report.engine_name] = keys
    return compute_venn_data(errors_by_engine)
def build_error_clusters(benchmark: "BenchmarkResult") -> list[dict]:
    """Cluster the error patterns of all engines (Sprint 7).

    Every document result of every engine is turned into an
    ``{"engine", "gt", "hypothesis"}`` sample; the samples are passed to
    ``cluster_errors`` with at most 8 clusters, and each returned cluster
    is serialised through its ``as_dict()`` method.
    """
    samples: list[dict] = [
        {
            "engine": engine_report.engine_name,
            "gt": doc.ground_truth,
            "hypothesis": doc.hypothesis,
        }
        for engine_report in benchmark.engine_reports
        for doc in engine_report.document_results
    ]
    serialised: list[dict] = []
    for cluster in cluster_errors(samples, max_clusters=8):
        serialised.append(cluster.as_dict())
    return serialised
def build_correlation_per_engine(benchmark: "BenchmarkResult") -> list[dict]:
    """Per-engine correlation matrix between metrics (Sprint 7).

    For each engine, one row of metrics is built per document without a
    metrics error: ``cer``, ``wer``, ``mer`` and ``wil`` always, plus
    ``quality_score`` / ``sharpness`` when ``image_quality`` is truthy and
    ``ligature`` / ``diacritic`` when ``char_scores`` is truthy (missing
    sub-values default to 0.5). Engines without any row are skipped.

    Returns
    -------
    list[dict]
        One dict per retained engine: ``{"engine": name}`` merged with the
        mapping returned by ``compute_correlation_matrix``.
    """
    per_engine: list[dict] = []
    for engine_report in benchmark.engine_reports:
        rows: list[dict[str, float]] = []
        for doc in engine_report.document_results:
            metrics = doc.metrics
            if metrics.error is None:
                row: dict[str, float] = {
                    "cer": safe_round(metrics.cer),
                    "wer": safe_round(metrics.wer),
                    "mer": safe_round(metrics.mer),
                    "wil": safe_round(metrics.wil),
                }
                quality = doc.image_quality
                if quality:
                    row["quality_score"] = safe_round(quality.get("quality_score", 0.5))
                    row["sharpness"] = safe_round(quality.get("sharpness_score", 0.5))
                scores = doc.char_scores
                if scores:
                    ligature = scores.get("ligature", {})
                    row["ligature"] = safe_round(ligature.get("score", 0.5))
                    diacritic = scores.get("diacritic", {})
                    row["diacritic"] = safe_round(diacritic.get("score", 0.5))
                rows.append(row)
        if not rows:
            # Nothing to correlate for this engine.
            continue
        matrix = compute_correlation_matrix(rows)
        per_engine.append({"engine": engine_report.engine_name, **matrix})
    return per_engine
# Public API of this module: the section builders listed below.
__all__ = [
"build_pairwise_wilcoxon",
"build_bootstrap_cis",
"build_friedman_and_nemenyi",
"build_reliability_curves",
"build_venn_data",
"build_error_clusters",
"build_correlation_per_engine",
]