Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / picarones /report /report_data /statistics.py

Claude

refactor(report): split generator.py (1063 → 431 lines) by concern

d641f6e unverified about 2 months ago

8.24 kB

	"""Sections statistiques du rapport (Sprint 7 + Sprint 17).

	Construit les blocs :

	- ``pairwise_wilcoxon`` — tests de Wilcoxon par paire de moteurs.
	- ``bootstrap_cis`` — intervalles de confiance bootstrap par moteur.
	- ``friedman`` + ``nemenyi`` — Sprint 17, multi-moteurs.
	- ``reliability_curves`` — courbes de fiabilité par moteur.
	- ``venn_data`` — diagramme de Venn des erreurs communes/exclusives.
	- ``error_clusters`` — clustering des patterns d'erreurs.
	- ``correlation_per_engine`` — matrice de corrélation par moteur.
	"""

	from __future__ import annotations

	from typing import TYPE_CHECKING, Optional

	from picarones.core.diff_utils import compute_word_diff
	from picarones.measurements.statistics import (
	bootstrap_ci,
	cluster_errors,
	compute_correlation_matrix,
	compute_pairwise_stats,
	compute_reliability_curve,
	compute_venn_data,
	friedman_test,
	nemenyi_posthoc,
	)
	from picarones.report.report_data._helpers import safe_round

	if TYPE_CHECKING:
	from picarones.core.results import BenchmarkResult


	def _engine_cer_values(benchmark: "BenchmarkResult") -> dict[str, list[float]]:
	"""Map ``engine_name → [cer_individuels valides]``."""
	out: dict[str, list[float]] = {}
	for report in benchmark.engine_reports:
	vals = [
	safe_round(dr.metrics.cer)
	for dr in report.document_results
	if dr.metrics.error is None
	]
	if vals:
	out[report.engine_name] = vals
	return out


	def build_pairwise_wilcoxon(benchmark: "BenchmarkResult") -> list[dict]:
	"""Tests de Wilcoxon par paire de moteurs (Sprint 7)."""
	return compute_pairwise_stats(_engine_cer_values(benchmark))


	def build_bootstrap_cis(benchmark: "BenchmarkResult") -> list[dict]:
	"""Intervalles de confiance bootstrap par moteur (Sprint 7)."""
	bootstrap_cis: list[dict] = []
	for engine_name, vals in _engine_cer_values(benchmark).items():
	lo, hi = bootstrap_ci(vals)
	mean_v = sum(vals) / len(vals) if vals else 0.0
	bootstrap_cis.append({
	"engine": engine_name,
	"mean": safe_round(mean_v),
	"ci_lower": safe_round(lo),
	"ci_upper": safe_round(hi),
	})
	return bootstrap_cis


	def build_friedman_and_nemenyi(benchmark: "BenchmarkResult") -> dict:
	"""Test de Friedman + post-hoc Nemenyi (Sprint 17, multi-moteurs).

	Alignement strict sur le même ordre de documents : on reconstruit
	la map à partir des documents communs à tous les moteurs, sinon
	Friedman n'est pas applicable.

	Returns
	-------
	dict
	``{"friedman": {...}, "nemenyi": {...}}`` à fusionner dans
	la section ``statistics`` du rapport.
	"""
	# Liste ordonnée des doc_ids selon l'ordre d'apparition.
	seen: set[str] = set()
	doc_ids_ordered: list[str] = []
	for report in benchmark.engine_reports:
	for dr in report.document_results:
	if dr.doc_id not in seen:
	seen.add(dr.doc_id)
	doc_ids_ordered.append(dr.doc_id)

	common_doc_ids: Optional[set[str]] = None
	for report in benchmark.engine_reports:
	doc_ids = {dr.doc_id for dr in report.document_results if dr.metrics.error is None}
	common_doc_ids = doc_ids if common_doc_ids is None else common_doc_ids & doc_ids

	engine_cer_aligned: dict[str, list[float]] = {}
	if common_doc_ids:
	ordered_common = [d for d in doc_ids_ordered if d in common_doc_ids]
	for report in benchmark.engine_reports:
	dr_by_id = {dr.doc_id: dr for dr in report.document_results}
	engine_cer_aligned[report.engine_name] = [
	safe_round(dr_by_id[d].metrics.cer) for d in ordered_common
	]

	if engine_cer_aligned:
	friedman = friedman_test(engine_cer_aligned)
	nemenyi = nemenyi_posthoc(engine_cer_aligned)
	else:
	friedman = {
	"statistic": 0.0, "p_value": 1.0, "significant": False,
	"df": 0, "n_blocks": 0, "n_engines": 0, "mean_ranks": {},
	"interpretation": "Test de Friedman non calculé — aucun document commun.",
	"error": "no_common_documents",
	}
	nemenyi = {
	"alpha": 0.05, "critical_distance": 0.0, "q_alpha": 0.0,
	"n_blocks": 0, "n_engines": 0, "mean_ranks": {},
	"engines_sorted": [], "significant_matrix": [], "tied_groups": [],
	"error": "no_common_documents",
	}
	return {"friedman": friedman, "nemenyi": nemenyi}


	def build_reliability_curves(benchmark: "BenchmarkResult") -> list[dict]:
	"""Courbes de fiabilité par moteur (Sprint 7)."""
	reliability_curves: list[dict] = []
	for report in benchmark.engine_reports:
	vals = [
	safe_round(dr.metrics.cer)
	for dr in report.document_results
	if dr.metrics.error is None
	]
	curve = compute_reliability_curve(vals)
	reliability_curves.append({
	"engine": report.engine_name,
	"points": curve,
	})
	return reliability_curves


	def build_venn_data(benchmark: "BenchmarkResult") -> dict:
	"""Venn des erreurs communes / exclusives (Sprint 7).

	Construit les ensembles d'erreurs par moteur :
	``{engine → set("doc_id:gt_tok:hyp_tok")}``.
	"""
	venn_error_sets: dict[str, set[str]] = {}
	for report in benchmark.engine_reports:
	error_set: set[str] = set()
	for dr in report.document_results:
	ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
	for op in ops:
	if op["op"] in ("replace", "delete", "insert"):
	key = (
	f"{dr.doc_id}:"
	f"{op.get('old', op.get('text', ''))}:"
	f"{op.get('new', op.get('text', ''))}"
	)
	error_set.add(key)
	venn_error_sets[report.engine_name] = error_set
	return compute_venn_data(venn_error_sets)


	def build_error_clusters(benchmark: "BenchmarkResult") -> list[dict]:
	"""Clustering des patterns d'erreurs (Sprint 7)."""
	error_data_all: list[dict] = []
	for report in benchmark.engine_reports:
	for dr in report.document_results:
	error_data_all.append({
	"engine": report.engine_name,
	"gt": dr.ground_truth,
	"hypothesis": dr.hypothesis,
	})
	error_clusters_raw = cluster_errors(error_data_all, max_clusters=8)
	return [c.as_dict() for c in error_clusters_raw]


	def build_correlation_per_engine(benchmark: "BenchmarkResult") -> list[dict]:
	"""Matrice de corrélation par moteur entre métriques métiers (Sprint 7)."""
	correlation_per_engine: list[dict] = []
	for report in benchmark.engine_reports:
	metrics_list: list[dict[str, float]] = []
	for dr in report.document_results:
	if dr.metrics.error is not None:
	continue
	entry: dict[str, float] = {
	"cer": safe_round(dr.metrics.cer),
	"wer": safe_round(dr.metrics.wer),
	"mer": safe_round(dr.metrics.mer),
	"wil": safe_round(dr.metrics.wil),
	}
	if dr.image_quality:
	entry["quality_score"] = safe_round(dr.image_quality.get("quality_score", 0.5))
	entry["sharpness"] = safe_round(dr.image_quality.get("sharpness_score", 0.5))
	if dr.char_scores:
	entry["ligature"] = safe_round(dr.char_scores.get("ligature", {}).get("score", 0.5))
	entry["diacritic"] = safe_round(dr.char_scores.get("diacritic", {}).get("score", 0.5))
	metrics_list.append(entry)
	if metrics_list:
	corr = compute_correlation_matrix(metrics_list)
	correlation_per_engine.append({
	"engine": report.engine_name,
	**corr,
	})
	return correlation_per_engine


	__all__ = [
	"build_pairwise_wilcoxon",
	"build_bootstrap_cis",
	"build_friedman_and_nemenyi",
	"build_reliability_curves",
	"build_venn_data",
	"build_error_clusters",
	"build_correlation_per_engine",
	]