Spaces:
Sleeping
Sleeping
| """Sections statistiques du rapport (Sprint 7 + Sprint 17). | |
| Construit les blocs : | |
| - ``pairwise_wilcoxon`` — tests de Wilcoxon par paire de moteurs. | |
| - ``bootstrap_cis`` — intervalles de confiance bootstrap par moteur. | |
| - ``friedman`` + ``nemenyi`` — Sprint 17, multi-moteurs. | |
| - ``reliability_curves`` — courbes de fiabilité par moteur. | |
| - ``venn_data`` — diagramme de Venn des erreurs communes/exclusives. | |
| - ``error_clusters`` — clustering des patterns d'erreurs. | |
| - ``correlation_per_engine`` — matrice de corrélation par moteur. | |
| """ | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING, Optional | |
| from picarones.core.diff_utils import compute_word_diff | |
| from picarones.measurements.statistics import ( | |
| bootstrap_ci, | |
| cluster_errors, | |
| compute_correlation_matrix, | |
| compute_pairwise_stats, | |
| compute_reliability_curve, | |
| compute_venn_data, | |
| friedman_test, | |
| nemenyi_posthoc, | |
| ) | |
| from picarones.report.report_data._helpers import safe_round | |
| if TYPE_CHECKING: | |
| from picarones.core.results import BenchmarkResult | |
| def _engine_cer_values(benchmark: "BenchmarkResult") -> dict[str, list[float]]: | |
| """Map ``engine_name → [cer_individuels valides]``.""" | |
| out: dict[str, list[float]] = {} | |
| for report in benchmark.engine_reports: | |
| vals = [ | |
| safe_round(dr.metrics.cer) | |
| for dr in report.document_results | |
| if dr.metrics.error is None | |
| ] | |
| if vals: | |
| out[report.engine_name] = vals | |
| return out | |
def build_pairwise_wilcoxon(benchmark: "BenchmarkResult") -> list[dict]:
    """Build the pairwise Wilcoxon section of the report (Sprint 7).

    Feeds the per-engine CER values of successful documents to
    ``compute_pairwise_stats`` and returns its result unchanged.
    """
    cer_by_engine = _engine_cer_values(benchmark)
    return compute_pairwise_stats(cer_by_engine)
def build_bootstrap_cis(benchmark: "BenchmarkResult") -> list[dict]:
    """Build the per-engine bootstrap confidence intervals (Sprint 7).

    Returns one dict per engine with the keys ``engine``, ``mean``,
    ``ci_lower`` and ``ci_upper``; the three numeric values go through
    ``safe_round``.
    """

    def _entry(name: str, cers: list[float]) -> dict:
        # Interval first, then the plain arithmetic mean of the same values.
        lower, upper = bootstrap_ci(cers)
        if cers:
            average = sum(cers) / len(cers)
        else:
            average = 0.0
        return {
            "engine": name,
            "mean": safe_round(average),
            "ci_lower": safe_round(lower),
            "ci_upper": safe_round(upper),
        }

    return [
        _entry(name, cers)
        for name, cers in _engine_cer_values(benchmark).items()
    ]
def build_friedman_and_nemenyi(benchmark: "BenchmarkResult") -> dict:
    """Run the Friedman test and Nemenyi post-hoc (Sprint 17, multi-engine).

    The CER series are strictly aligned on the same document order: only
    documents that every engine processed without error are kept, listed
    in order of first appearance across the engine reports. When no such
    document exists the tests are not run and placeholder results flagged
    with ``"error": "no_common_documents"`` are returned instead.

    Returns
    -------
    dict
        ``{"friedman": {...}, "nemenyi": {...}}`` to be merged into the
        ``statistics`` section of the report.
    """
    reports = benchmark.engine_reports

    # A dict keeps insertion order, so its keys give the doc ids in order
    # of first appearance (errored documents included at this stage).
    first_seen: dict[str, None] = {}
    for report in reports:
        for result in report.document_results:
            first_seen.setdefault(result.doc_id, None)

    # Intersection of the successfully processed doc ids of every engine.
    shared: Optional[set[str]] = None
    for report in reports:
        valid_ids = {
            result.doc_id
            for result in report.document_results
            if result.metrics.error is None
        }
        shared = valid_ids if shared is None else shared & valid_ids

    if not shared:
        # No engine at all, or no document common to all engines.
        return {
            "friedman": {
                "statistic": 0.0, "p_value": 1.0, "significant": False,
                "df": 0, "n_blocks": 0, "n_engines": 0, "mean_ranks": {},
                "interpretation": "Test de Friedman non calculé — aucun document commun.",
                "error": "no_common_documents",
            },
            "nemenyi": {
                "alpha": 0.05, "critical_distance": 0.0, "q_alpha": 0.0,
                "n_blocks": 0, "n_engines": 0, "mean_ranks": {},
                "engines_sorted": [], "significant_matrix": [], "tied_groups": [],
                "error": "no_common_documents",
            },
        }

    aligned_ids = [doc_id for doc_id in first_seen if doc_id in shared]
    aligned_cers: dict[str, list[float]] = {}
    for report in reports:
        by_id = {result.doc_id: result for result in report.document_results}
        aligned_cers[report.engine_name] = [
            safe_round(by_id[doc_id].metrics.cer) for doc_id in aligned_ids
        ]

    friedman = friedman_test(aligned_cers)
    nemenyi = nemenyi_posthoc(aligned_cers)
    return {"friedman": friedman, "nemenyi": nemenyi}
def build_reliability_curves(benchmark: "BenchmarkResult") -> list[dict]:
    """Build one reliability curve per engine (Sprint 7).

    Every engine report yields an entry ``{"engine": ..., "points": ...}``
    where ``points`` is the output of ``compute_reliability_curve`` on the
    rounded CER of the documents whose ``metrics.error`` is ``None``.
    Engines with no successful document still get an entry.
    """
    curves: list[dict] = []
    for report in benchmark.engine_reports:
        successful = (
            result
            for result in report.document_results
            if result.metrics.error is None
        )
        cers = [safe_round(result.metrics.cer) for result in successful]
        points = compute_reliability_curve(cers)
        curves.append({"engine": report.engine_name, "points": points})
    return curves
def build_venn_data(benchmark: "BenchmarkResult") -> dict:
    """Build the Venn data of shared / exclusive errors (Sprint 7).

    For every engine, collects a set of error keys shaped
    ``"doc_id:gt_tok:hyp_tok"`` from the word-level diff between ground
    truth and hypothesis, then hands ``{engine -> set(keys)}`` to
    ``compute_venn_data``.

    Only diff operations tagged ``replace``, ``delete`` or ``insert`` are
    counted. The ground-truth part of a key is the operation's ``old``
    field and the hypothesis part its ``new`` field; each falls back to
    ``text`` and then to an empty string when the field is absent.
    """
    error_ops = ("replace", "delete", "insert")
    keys_by_engine: dict[str, set[str]] = {}
    for report in benchmark.engine_reports:
        keys_by_engine[report.engine_name] = {
            f"{result.doc_id}:"
            f"{op.get('old', op.get('text', ''))}:"
            f"{op.get('new', op.get('text', ''))}"
            for result in report.document_results
            for op in compute_word_diff(result.ground_truth, result.hypothesis)
            if op["op"] in error_ops
        }
    return compute_venn_data(keys_by_engine)
def build_error_clusters(benchmark: "BenchmarkResult") -> list[dict]:
    """Build the clusters of error patterns (Sprint 7).

    Every document of every engine contributes one sample with the keys
    ``engine``, ``gt`` and ``hypothesis``. The samples are clustered with
    ``cluster_errors`` (at most 8 clusters requested) and each resulting
    cluster is serialised through its ``as_dict()`` method.
    """
    samples: list[dict] = [
        {
            "engine": report.engine_name,
            "gt": result.ground_truth,
            "hypothesis": result.hypothesis,
        }
        for report in benchmark.engine_reports
        for result in report.document_results
    ]
    clusters = cluster_errors(samples, max_clusters=8)
    return [cluster.as_dict() for cluster in clusters]
def _correlation_row(result) -> dict[str, float]:
    """Collect the rounded metrics of one document for the correlation matrix."""
    metrics = result.metrics
    row: dict[str, float] = {
        "cer": safe_round(metrics.cer),
        "wer": safe_round(metrics.wer),
        "mer": safe_round(metrics.mer),
        "wil": safe_round(metrics.wil),
    }
    quality = result.image_quality
    if quality:
        # A missing score defaults to 0.5.
        row["quality_score"] = safe_round(quality.get("quality_score", 0.5))
        row["sharpness"] = safe_round(quality.get("sharpness_score", 0.5))
    scores = result.char_scores
    if scores:
        row["ligature"] = safe_round(scores.get("ligature", {}).get("score", 0.5))
        row["diacritic"] = safe_round(scores.get("diacritic", {}).get("score", 0.5))
    return row


def build_correlation_per_engine(benchmark: "BenchmarkResult") -> list[dict]:
    """Build the per-engine correlation matrices between metrics (Sprint 7).

    For each engine, one row of metrics is collected per document whose
    ``metrics.error`` is ``None``: always ``cer``, ``wer``, ``mer`` and
    ``wil``, plus image-quality and character-score entries when the
    document carries them. The rows go to ``compute_correlation_matrix``
    and its result is merged after the ``engine`` key. Engines without a
    single usable document are skipped.
    """
    matrices: list[dict] = []
    for report in benchmark.engine_reports:
        rows = [
            _correlation_row(result)
            for result in report.document_results
            if result.metrics.error is None
        ]
        if not rows:
            continue
        correlation = compute_correlation_matrix(rows)
        matrices.append({"engine": report.engine_name, **correlation})
    return matrices
# Public API: one builder per statistical section of the report.
__all__ = [
    "build_pairwise_wilcoxon",
    "build_bootstrap_cis",
    "build_friedman_and_nemenyi",
    "build_reliability_curves",
    "build_venn_data",
    "build_error_clusters",
    "build_correlation_per_engine",
]