Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / picarones /report /report_data /documents.py

Claude

refactor(report): split generator.py (1063 → 431 lines) by concern

d641f6e unverified about 2 months ago

6.98 kB

	"""Build the ``documents`` list (gallery view + detail view).

	For each document of the corpus, aggregates the hypotheses of all
	engines together with their metrics, the character-by-character diff,
	and the fields specific to OCR+LLM pipelines (intermediate output,
	mode, over-normalization).

	:func:`annotate_documents_with_difficulty` then enriches each
	document with its intrinsic difficulty score (Sprint 7).
	"""

	from __future__ import annotations

	from typing import TYPE_CHECKING

	from picarones.core.diff_utils import compute_char_diff, compute_word_diff
	from picarones.measurements.difficulty import (
	compute_all_difficulties,
	difficulty_label,
	)
	from picarones.report.report_data._helpers import safe_round

	if TYPE_CHECKING:
	from picarones.core.results import BenchmarkResult


	def build_documents(
	    benchmark: "BenchmarkResult", images_b64: dict[str, str],
	) -> list[dict]:
	    """Return the ordered list of documents ready for the report template.

	    Documents keep their order of first appearance: those of the first
	    engine come first, followed by any document that only shows up in a
	    later engine's results (not every engine necessarily covers every
	    document).

	    Each returned dict carries the keys ``doc_id``, ``image_path``,
	    ``image_b64``, ``ground_truth``, ``mean_cer``, ``best_engine``,
	    ``engine_results`` and ``script_type``.

	    ``mean_cer`` and ``best_engine`` are both computed over the engines
	    whose run did not report an error, so a failed run can never be
	    shown as the best engine. When no engine succeeded, ``mean_cer`` is
	    ``1.0`` and ``best_engine`` is ``""``.

	    ``script_type`` is taken from the first engine result (in engine
	    order) whose ``image_quality`` provides a non-empty value, so it is
	    still filled in when the first engine did not process the document.

	    :param benchmark: benchmark result exposing ``engine_reports``.
	    :param images_b64: mapping ``doc_id`` → base64-encoded image; a
	        missing entry yields an empty string.
	    :return: one dict per document, in order of first appearance.
	    """
	    # Cross index: doc_id → {engine_name → DocumentResult}. Dict insertion
	    # order doubles as the "first appearance" ordering of the documents.
	    results_by_doc: dict[str, dict] = {}
	    for report in benchmark.engine_reports:
	        for dr in report.document_results:
	            results_by_doc.setdefault(dr.doc_id, {})[report.engine_name] = dr

	    engine_names = [r.engine_name for r in benchmark.engine_reports]
	    documents: list[dict] = []
	    for doc_id, by_engine in results_by_doc.items():
	        engine_results: list[dict] = []
	        gt = ""
	        image_path = ""
	        script_type = ""
	        for engine_name in engine_names:
	            dr = by_engine.get(engine_name)
	            if dr is None:
	                continue
	            gt = dr.ground_truth
	            image_path = dr.image_path
	            # Script type comes from per-document metadata when available;
	            # keep the first non-empty value found across engines.
	            if not script_type and dr.image_quality:
	                script_type = dr.image_quality.get("script_type", "")
	            engine_results.append(_build_engine_result_entry(engine_name, dr))

	        # Only runs without an engine error take part in the gallery badge
	        # (mean CER) and in the choice of the best engine.
	        successful = [er for er in engine_results if er["error"] is None]
	        cer_values = [er["cer"] for er in successful]
	        mean_cer = sum(cer_values) / len(cer_values) if cer_values else 1.0
	        best_engine = min(successful, key=lambda er: er["cer"], default=None)

	        documents.append({
	            "doc_id": doc_id,
	            "image_path": image_path,
	            "image_b64": images_b64.get(doc_id, ""),
	            "ground_truth": gt,
	            "mean_cer": safe_round(mean_cer),
	            "best_engine": best_engine["engine"] if best_engine else "",
	            "engine_results": engine_results,
	            "script_type": script_type,
	        })
	    return documents


	def _build_engine_result_entry(engine_name: str, dr) -> dict:
	    """Assemble the per-engine entry of one document for the template.

	    The entry always holds the engine name, its hypothesis, the rounded
	    metrics (``cer``, ``cer_diplomatic``, ``wer``, ``mer``, ``wil``), the
	    run duration, the engine error and the character-level diff against
	    the ground truth. Optional fields are copied only when the document
	    result provides them.

	    :param engine_name: name of the engine that produced ``dr``.
	    :param dr: document result of that engine for one document.
	    :return: dict consumed by the report template.
	    """
	    char_diff = compute_char_diff(dr.ground_truth, dr.hypothesis)

	    entry: dict = {"engine": engine_name, "hypothesis": dr.hypothesis}
	    entry["cer"] = safe_round(dr.metrics.cer)
	    # The diplomatic CER is optional and stays ``None`` when absent.
	    if dr.metrics.cer_diplomatic is None:
	        entry["cer_diplomatic"] = None
	    else:
	        entry["cer_diplomatic"] = safe_round(dr.metrics.cer_diplomatic)
	    for metric_name in ("wer", "mer", "wil"):
	        entry[metric_name] = safe_round(getattr(dr.metrics, metric_name))
	    entry["duration"] = dr.duration_seconds
	    entry["error"] = dr.engine_error
	    entry["diff"] = char_diff

	    # Fields specific to OCR+LLM pipelines.
	    intermediate = dr.ocr_intermediate
	    if intermediate is not None:
	        entry["ocr_intermediate"] = intermediate
	        entry["ocr_diff"] = compute_word_diff(dr.ground_truth, intermediate)
	        entry["llm_correction_diff"] = compute_word_diff(intermediate, dr.hypothesis)
	    pipeline_meta = dr.pipeline_metadata
	    if pipeline_meta:
	        over_norm = pipeline_meta.get("over_normalization")
	        if over_norm is not None:
	            entry["over_normalization"] = over_norm
	        entry["pipeline_mode"] = pipeline_meta.get("pipeline_mode")

	    # Sprint 5 — advanced per-document metrics.
	    char_scores = dr.char_scores
	    if char_scores is not None:
	        ligature = char_scores.get("ligature", {})
	        entry["ligature_score"] = safe_round(ligature.get("score"))
	        diacritic = char_scores.get("diacritic", {})
	        entry["diacritic_score"] = safe_round(diacritic.get("score"))

	    # Remaining optional payloads (Sprint 5, then Sprint 10) are passed
	    # through untouched, in this order, whenever they are not ``None``.
	    for field_name in (
	        "taxonomy",
	        "structure",
	        "image_quality",
	        "line_metrics",
	        "hallucination_metrics",
	    ):
	        payload = getattr(dr, field_name)
	        if payload is not None:
	            entry[field_name] = payload
	    return entry


	def annotate_documents_with_difficulty(
	    benchmark: "BenchmarkResult", documents: list[dict],
	) -> None:
	    """Annotate each document of the list with its difficulty (Sprint 7).

	    ``documents`` is modified in place: every entry receives the keys
	    ``difficulty_score`` and ``difficulty_label``. When no difficulty is
	    available for a document (for example on a degenerate corpus), the
	    defaults ``0.5`` / ``"Modéré"`` are assigned instead.

	    :param benchmark: benchmark result exposing ``engine_reports``.
	    :param documents: document dicts carrying at least ``doc_id`` and
	        ``ground_truth``.
	    """
	    ordered_ids: list[str] = []
	    truths: dict[str, str] = {}
	    cer_by_doc: dict[str, dict[str, float]] = {}
	    for entry in documents:
	        key = entry["doc_id"]
	        ordered_ids.append(key)
	        truths[key] = entry["ground_truth"]
	        cer_by_doc[key] = {}

	    quality_by_doc: dict[str, float] = {}
	    for engine_report in benchmark.engine_reports:
	        for result in engine_report.document_results:
	            rounded_cer = safe_round(result.metrics.cer)
	            per_engine = cer_by_doc.setdefault(result.doc_id, {})
	            per_engine[engine_report.engine_name] = rounded_cer
	            quality = result.image_quality
	            if quality and "quality_score" in quality:
	                quality_by_doc[result.doc_id] = quality["quality_score"]

	    scores = compute_all_difficulties(
	        doc_ids=ordered_ids,
	        ground_truths=truths,
	        cer_map=cer_by_doc,
	        # An empty quality map is passed as ``None``.
	        image_quality_map=quality_by_doc if quality_by_doc else None,
	    )

	    for entry in documents:
	        found = scores.get(entry["doc_id"])
	        if not found:
	            # No score for this document: fall back to the defaults.
	            entry["difficulty_score"] = 0.5
	            entry["difficulty_label"] = "Modéré"
	            continue
	        entry["difficulty_score"] = safe_round(found.score)
	        entry["difficulty_label"] = difficulty_label(found.score)


	# Names exported by a star-import of this module.
	__all__ = ["build_documents", "annotate_documents_with_difficulty"]