Add eval_dashboard.py

deb35c1 verified 3 months ago

27.4 kB

	#!/usr/bin/env python3
	"""
	RetinaSense v3.0 -- Phase 1A: Rich Evaluation Dashboard
	========================================================
	Standalone script that loads the trained ViT model, runs inference on the
	full test set (1,281 images), and produces publication-quality evaluation
	plots plus a structured metrics JSON report.

	Outputs (all written to outputs_v3/evaluation/):
	- confusion_matrix.png
	- roc_curves_per_class.png
	- precision_recall_curves.png
	- calibration_reliability.png
	- confidence_histograms.png
	- error_analysis_by_source.png
	- metrics_report.json

	Usage:
	python eval_dashboard.py
	"""

	import os
	import sys
	import json
	import warnings
	import numpy as np
	import pandas as pd
	import cv2
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import matplotlib.ticker as mticker
	import seaborn as sns
	from PIL import Image
	from collections import OrderedDict

	warnings.filterwarnings('ignore')

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import Dataset, DataLoader
	from torchvision import transforms
	import timm

	from sklearn.metrics import (
	confusion_matrix,
	classification_report,
	roc_curve,
	auc,
	precision_recall_curve,
	average_precision_score,
	f1_score,
	accuracy_score,
	cohen_kappa_score,
	matthews_corrcoef,
	balanced_accuracy_score,
	log_loss,
	)

	# ================================================================
	# CONFIGURATION
	# ================================================================
	# Project root that contains data/ and outputs_v3/. The default is the
	# location this script was written for; export RETINASENSE_BASE_DIR to run
	# the dashboard from another checkout without editing the file.
	BASE_DIR = os.environ.get('RETINASENSE_BASE_DIR', '/teamspace/studios/this_studio')
	OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs_v3')
	EVAL_DIR = os.path.join(OUTPUT_DIR, 'evaluation')
	# Created up front so every later savefig()/json.dump() has a destination.
	os.makedirs(EVAL_DIR, exist_ok=True)

	# Artifacts read by this script.
	MODEL_PATH = os.path.join(OUTPUT_DIR, 'best_model.pth')
	THRESHOLDS_PATH = os.path.join(OUTPUT_DIR, 'thresholds.json')
	TEMPERATURE_PATH = os.path.join(OUTPUT_DIR, 'temperature.json')
	TEST_CSV = os.path.join(BASE_DIR, 'data', 'test_split.csv')
	NORM_STATS_PATH = os.path.join(BASE_DIR, 'data', 'fundus_norm_stats.json')

	NUM_CLASSES = 5
	IMG_SIZE = 224
	DROPOUT = 0.3
	BATCH_SIZE = 32

	# CLASS_NAMES[i] is the display name used for disease label i.
	CLASS_NAMES = ['Normal', 'Diabetes/DR', 'Glaucoma', 'Cataract', 'AMD']

	DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Publication style defaults
	plt.rcParams.update({
	    'font.size': 11,
	    'axes.titlesize': 13,
	    'axes.labelsize': 12,
	    'xtick.labelsize': 10,
	    'ytick.labelsize': 10,
	    'legend.fontsize': 10,
	    'figure.dpi': 300,
	    'savefig.dpi': 300,
	    'savefig.bbox': 'tight',
	    'savefig.pad_inches': 0.15,
	    'font.family': 'sans-serif',
	})

	print('=' * 65)
	print(' RetinaSense v3.0 -- Phase 1A: Evaluation Dashboard')
	print('=' * 65)
	print(f' Device : {DEVICE}')
	if torch.cuda.is_available():
	    print(f' GPU : {torch.cuda.get_device_name(0)}')
	print(f' Output : {EVAL_DIR}')
	print('=' * 65)


	# ================================================================
	# LOAD NORMALISATION STATS
	# ================================================================
	# Per-channel statistics come from the stats file when it exists; the
	# ImageNet constants are used only when the file is absent.
	if not os.path.exists(NORM_STATS_PATH):
	    NORM_MEAN = [0.485, 0.456, 0.406]
	    NORM_STD = [0.229, 0.224, 0.225]
	    print(' Using ImageNet normalisation fallback')
	else:
	    with open(NORM_STATS_PATH) as f:
	        norm_stats = json.load(f)
	    NORM_MEAN = norm_stats['mean_rgb']
	    NORM_STD = norm_stats['std_rgb']
	    # Rounded for display only; the unrounded values are what gets used.
	    print(f' Fundus norm stats loaded: mean={[round(v, 4) for v in NORM_MEAN]}, '
	          f'std={[round(v, 4) for v in NORM_STD]}')


	# ================================================================
	# MODEL ARCHITECTURE (mirrors retinasense_v3.py / gradcam_v3.py)
	# ================================================================
	class MultiTaskViT(nn.Module):
	    """ViT-Base-Patch16-224 backbone feeding a disease head and a severity head.

	    ``forward`` returns ``(disease_logits, severity_logits)``.

	    The attribute names and the order of layers inside each ``nn.Sequential``
	    define the ``state_dict`` keys, so they are kept exactly as the
	    checkpoint expects.
	    """

	    def __init__(self, n_disease=NUM_CLASSES, n_severity=5, drop=DROPOUT):
	        super().__init__()
	        self.backbone = timm.create_model(
	            'vit_base_patch16_224', pretrained=False, num_classes=0
	        )
	        feat = 768  # CLS token dimension
	        self.drop = nn.Dropout(drop)

	        # Two hidden layers (512 -> 256) before the disease logits.
	        disease_layers = [
	            nn.Linear(feat, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
	            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2),
	            nn.Linear(256, n_disease),
	        ]
	        self.disease_head = nn.Sequential(*disease_layers)

	        # One hidden layer (256) before the severity logits.
	        severity_layers = [
	            nn.Linear(feat, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.3),
	            nn.Linear(256, n_severity),
	        ]
	        self.severity_head = nn.Sequential(*severity_layers)

	    def forward(self, x):
	        # (B, 768) CLS token features, shared by both heads after dropout.
	        feats = self.drop(self.backbone(x))
	        disease_logits = self.disease_head(feats)
	        severity_logits = self.severity_head(feats)
	        return disease_logits, severity_logits


	# ================================================================
	# LOAD MODEL + CALIBRATION ARTIFACTS
	# ================================================================
	print('\nLoading model...')
	model = MultiTaskViT().to(DEVICE)
	# NOTE(security): weights_only=False lets torch.load unpickle arbitrary
	# Python objects. Only point MODEL_PATH at checkpoints you trust.
	ckpt = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
	model.load_state_dict(ckpt['model_state_dict'])
	model.eval()
	print(f' Loaded: {MODEL_PATH}')

	# The stored epoch is printed as value + 1 (presumably it is saved
	# 0-based -- confirm against the training script). A checkpoint without the
	# key must print "?" instead of evaluating "?" + 1, which raises TypeError.
	ckpt_epoch = ckpt.get('epoch')
	epoch_display = ckpt_epoch + 1 if ckpt_epoch is not None else '?'
	print(f' Checkpoint epoch: {epoch_display} '
	      f'val_acc={ckpt.get("val_acc", 0):.2f}%')

	# Per-class thresholds: only printed here and echoed into the JSON report.
	with open(THRESHOLDS_PATH) as f:
	    thr_data = json.load(f)
	THRESHOLDS = thr_data['thresholds']

	# Scalar temperature used to scale the logits before softmax.
	with open(TEMPERATURE_PATH) as f:
	    temp_data = json.load(f)
	TEMPERATURE = temp_data['temperature']

	print(f' Temperature T = {TEMPERATURE:.4f}')
	print(f' Thresholds = {[round(t, 3) for t in THRESHOLDS]}')


	# ================================================================
	# DATASET
	# ================================================================
	class TestDataset(Dataset):
	    """
	    Test dataset yielding ``(image_tensor, disease_label, source)``.

	    Each row of ``df`` must provide ``disease_label`` and ``image_path``;
	    ``cache_path`` and ``source`` are read when present. A readable ``.npy``
	    file at ``cache_path`` is used as the image (fast path). Otherwise the
	    file at ``image_path`` is preprocessed on the fly: Ben Graham enhancement
	    when ``source == 'APTOS'``, CLAHE for every other source.
	    """

	    def __init__(self, df, transform):
	        self.df = df.reset_index(drop=True)
	        self.transform = transform

	    def __len__(self):
	        return len(self.df)

	    def __getitem__(self, idx):
	        row = self.df.iloc[idx]

	        # Try cache path first
	        img = None
	        cache_fp = row.get('cache_path', '')
	        # pandas yields float NaN for an empty cell; NaN is truthy and makes
	        # os.path.exists() raise TypeError, so only real strings are accepted.
	        if isinstance(cache_fp, str) and cache_fp and os.path.exists(cache_fp):
	            try:
	                img = np.load(cache_fp)
	            except Exception:
	                # Deliberate best effort: an unreadable cache entry simply
	                # falls through to on-the-fly preprocessing.
	                img = None

	        # Fallback: on-the-fly preprocessing
	        if img is None:
	            image_path = self._resolve_path(row['image_path'])
	            if row.get('source', 'ODIR') == 'APTOS':
	                preprocess = self._ben_graham
	            else:
	                preprocess = self._clahe_preprocess
	            try:
	                img = preprocess(image_path)
	            except Exception as exc:
	                # Keep the batch intact with a black placeholder, but say so:
	                # a silently substituted image would distort every metric.
	                print(f' WARNING: could not load {image_path} ({exc}); '
	                      f'using a black image', file=sys.stderr)
	                img = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)

	        img_tensor = self.transform(img)
	        disease_lbl = int(row['disease_label'])
	        source = row.get('source', 'unknown')
	        return img_tensor, disease_lbl, source

	    @staticmethod
	    def _resolve_path(image_path):
	        """Return ``image_path`` unchanged if absolute, else anchored at BASE_DIR.

	        Leading ``./`` segments are removed together with any slashes that
	        follow them. Without that, ``.//x`` would be reduced to ``/x`` and
	        ``os.path.join`` would discard ``BASE_DIR``.
	        """
	        if os.path.isabs(image_path):
	            return image_path
	        clean = image_path
	        while clean.startswith('./'):
	            clean = clean[2:].lstrip('/')
	        return os.path.join(BASE_DIR, clean)

	    @staticmethod
	    def _read_bgr(path):
	        """Read ``path`` as a BGR array, using PIL when OpenCV cannot decode it."""
	        raw = cv2.imread(path)
	        if raw is None:
	            raw = np.array(Image.open(path).convert('RGB'))
	            raw = cv2.cvtColor(raw, cv2.COLOR_RGB2BGR)
	        return raw

	    @staticmethod
	    def _ben_graham(path, sz=IMG_SIZE, sigma=10):
	        """Return an ``sz`` x ``sz`` RGB image with blur subtraction and a circular mask."""
	        raw = cv2.cvtColor(TestDataset._read_bgr(path), cv2.COLOR_BGR2RGB)
	        raw = cv2.resize(raw, (sz, sz))
	        # 4 * image - 4 * blurred + 128: removes the low-frequency background.
	        raw = cv2.addWeighted(raw, 4, cv2.GaussianBlur(raw, (0, 0), sigma), -4, 128)
	        # Zero everything outside a centred disc of radius 0.48 * sz.
	        mask = np.zeros(raw.shape[:2], dtype=np.uint8)
	        cv2.circle(mask, (sz // 2, sz // 2), int(sz * 0.48), 255, -1)
	        return cv2.bitwise_and(raw, raw, mask=mask)

	    @staticmethod
	    def _clahe_preprocess(path, sz=IMG_SIZE):
	        """Return an ``sz`` x ``sz`` RGB image with CLAHE applied to the LAB L channel."""
	        raw = cv2.resize(TestDataset._read_bgr(path), (sz, sz))
	        lab = cv2.cvtColor(raw, cv2.COLOR_BGR2LAB)
	        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	        lab[:, :, 0] = clahe.apply(lab[:, :, 0])
	        raw = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
	        return cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)


	# Evaluation transform: conversion and normalisation only, no augmentation.
	val_transform = transforms.Compose([
	    transforms.ToPILImage(),
	    transforms.ToTensor(),
	    transforms.Normalize(NORM_MEAN, NORM_STD),
	])

	print('\nLoading test set...')
	test_df = pd.read_csv(TEST_CSV)
	source_names = sorted(test_df["source"].unique())
	class_dist = test_df["disease_label"].value_counts().sort_index().to_dict()
	print(f' Test samples: {len(test_df)}')
	print(f' Sources : {source_names}')
	print(f' Class dist : {class_dist}')

	# shuffle=False keeps predictions in the same order as the CSV rows.
	test_ds = TestDataset(test_df, val_transform)
	loader_kwargs = dict(
	    batch_size=BATCH_SIZE,
	    shuffle=False,
	    num_workers=4,
	    pin_memory=True,
	)
	test_loader = DataLoader(test_ds, **loader_kwargs)


	# ================================================================
	# INFERENCE
	# ================================================================
	print('\nRunning inference on full test set...')
	all_logits = []
	all_labels = []
	all_sources = []

	with torch.no_grad():
	    for imgs, labels, sources in test_loader:
	        # Only the disease head is evaluated; severity logits are discarded.
	        disease_logits, _ = model(imgs.to(DEVICE))
	        all_logits.append(disease_logits.cpu())
	        all_labels.extend(labels.numpy().tolist())
	        all_sources.extend(sources)

	all_logits = torch.cat(all_logits, dim=0)  # (N, 5)
	all_labels = np.array(all_labels)
	all_sources = np.array(all_sources)
	N = len(all_labels)
	print(f' Inference complete: {N} samples')

	# Temperature-scaled probabilities
	probs_calibrated = F.softmax(all_logits / TEMPERATURE, dim=1).numpy()  # (N, 5)
	probs_uncalibrated = F.softmax(all_logits, dim=1).numpy()

	# Predictions: argmax of calibrated probabilities
	# NOTE(review): THRESHOLDS is loaded but plays no part in the prediction
	# rule used here -- confirm that plain argmax is intended.
	preds = np.argmax(probs_calibrated, axis=1)
	confidences = np.max(probs_calibrated, axis=1)

	correct_mask = (preds == all_labels)
	acc = accuracy_score(all_labels, preds)
	# Count hits from the mask itself: int(acc * N) can truncate to one less
	# than the true count when the float product lands just below an integer.
	n_correct = int(correct_mask.sum())
	print(f' Overall accuracy: {acc:.4f} ({n_correct}/{N})')


	# ================================================================
	# 1. CONFUSION MATRIX
	# ================================================================
	print('\n[1/7] Confusion matrix...')
	cm = confusion_matrix(all_labels, preds, labels=list(range(NUM_CLASSES)))

	# Row-normalise so each true class sums to 1. A class with no test samples
	# has a zero row total; its row is left at 0 instead of becoming NaN, which
	# would blank the heatmap cells and put NaN into the JSON report.
	row_totals = cm.sum(axis=1, keepdims=True)
	cm_norm = np.divide(
	    cm.astype(float), row_totals,
	    out=np.zeros(cm.shape, dtype=float),
	    where=row_totals > 0,
	)

	fig, ax = plt.subplots(figsize=(7, 6))
	sns.heatmap(
	    cm_norm, annot=True, fmt='.2f', cmap='Blues',
	    xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES,
	    linewidths=0.5, linecolor='white',
	    cbar_kws={'label': 'Proportion', 'shrink': 0.8},
	    ax=ax, vmin=0, vmax=1,
	)
	# Overlay raw counts in smaller font, just below each proportion
	for i in range(NUM_CLASSES):
	    for j in range(NUM_CLASSES):
	        ax.text(j + 0.5, i + 0.72, f'(n={cm[i, j]})',
	                ha='center', va='center', fontsize=7, color='gray')

	ax.set_xlabel('Predicted Class')
	ax.set_ylabel('True Class')
	ax.set_title('Normalized Confusion Matrix (Test Set)')
	fig.tight_layout()
	fig.savefig(os.path.join(EVAL_DIR, 'confusion_matrix.png'))
	plt.close(fig)
	print(' Saved confusion_matrix.png')


	# ================================================================
	# 2. ROC CURVES PER CLASS
	# ================================================================
	print('[2/7] ROC curves...')
	fig, ax = plt.subplots(figsize=(7, 6))
	colors = sns.color_palette('tab10', NUM_CLASSES)
	all_fpr_tpr = {}
	macro_auc_list = []

	# One-vs-rest curve per class, scored with the calibrated probability.
	for cls_idx in range(NUM_CLASSES):
	    is_cls = (all_labels == cls_idx).astype(int)
	    fpr, tpr, _ = roc_curve(is_cls, probs_calibrated[:, cls_idx])
	    roc_auc = auc(fpr, tpr)
	    macro_auc_list.append(roc_auc)
	    all_fpr_tpr[cls_idx] = (fpr, tpr)
	    curve_label = f'{CLASS_NAMES[cls_idx]} (AUC={roc_auc:.3f})'
	    ax.plot(fpr, tpr, color=colors[cls_idx], lw=2, label=curve_label)

	# Macro average ROC: resample every curve on a shared FPR grid, then average
	mean_fpr = np.linspace(0, 1, 200)
	mean_tpr = np.zeros_like(mean_fpr)
	for cls_fpr, cls_tpr in all_fpr_tpr.values():
	    mean_tpr += np.interp(mean_fpr, cls_fpr, cls_tpr)
	mean_tpr /= NUM_CLASSES
	macro_auc = auc(mean_fpr, mean_tpr)
	ax.plot(mean_fpr, mean_tpr, 'k--', lw=2.5,
	        label=f'Macro-average (AUC={macro_auc:.3f})')
	# Chance diagonal for reference.
	ax.plot([0, 1], [0, 1], 'k:', lw=1, alpha=0.4)

	ax.set_xlim([-0.02, 1.02])
	ax.set_ylim([-0.02, 1.05])
	ax.set_xlabel('False Positive Rate')
	ax.set_ylabel('True Positive Rate')
	ax.set_title('One-vs-Rest ROC Curves (Calibrated)')
	ax.legend(loc='lower right', framealpha=0.9)
	ax.grid(True, alpha=0.3)
	fig.tight_layout()
	roc_png = os.path.join(EVAL_DIR, 'roc_curves_per_class.png')
	fig.savefig(roc_png)
	plt.close(fig)
	print(' Saved roc_curves_per_class.png')


	# ================================================================
	# 3. PRECISION-RECALL CURVES
	# ================================================================
	print('[3/7] Precision-recall curves...')
	fig, ax = plt.subplots(figsize=(7, 6))

	# One-vs-rest PR curve per class, labelled with its average precision.
	for cls_idx in range(NUM_CLASSES):
	    is_cls = (all_labels == cls_idx).astype(int)
	    cls_score = probs_calibrated[:, cls_idx]
	    prec, rec, _ = precision_recall_curve(is_cls, cls_score)
	    ap = average_precision_score(is_cls, cls_score)
	    ax.plot(rec, prec, color=colors[cls_idx], lw=2,
	            label=f'{CLASS_NAMES[cls_idx]} (AP={ap:.3f})')

	# Add prevalence baselines (fraction of test samples in each class)
	prevalences = np.bincount(all_labels, minlength=NUM_CLASSES) / N
	for cls_idx in range(NUM_CLASSES):
	    ax.axhline(y=prevalences[cls_idx], color=colors[cls_idx], ls=':', alpha=0.3)

	ax.set_xlim([-0.02, 1.02])
	ax.set_ylim([-0.02, 1.05])
	ax.set_xlabel('Recall')
	ax.set_ylabel('Precision')
	ax.set_title('Precision-Recall Curves (Calibrated)')
	ax.legend(loc='upper right', framealpha=0.9)
	ax.grid(True, alpha=0.3)
	fig.tight_layout()
	pr_png = os.path.join(EVAL_DIR, 'precision_recall_curves.png')
	fig.savefig(pr_png)
	plt.close(fig)
	print(' Saved precision_recall_curves.png')


	# ================================================================
	# 4. CALIBRATION RELIABILITY DIAGRAM
	# ================================================================
	print('[4/7] Calibration reliability diagram...')
	# Ten equal-width confidence bins on [0, 1]; centres are the bar positions.
	n_bins = 10
	bin_edges = np.linspace(0.0, 1.0, num=n_bins + 1)
	bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])

	# Compute calibration for both calibrated and uncalibrated probabilities
	def compute_calibration(confidences_arr, correct_arr, bin_edges):
	    """Compute per-bin accuracy, mean confidence and sample count.

	    Args:
	        confidences_arr: 1-D array-like of confidences.
	        correct_arr: 1-D array-like of the same length, 1.0 where the
	            prediction was correct and 0.0 otherwise.
	        bin_edges: increasing sequence of bin boundaries.

	    Returns:
	        ``(bin_accs, bin_confs, bin_counts)`` as NumPy arrays with one entry
	        per bin. Empty bins hold NaN in the first two arrays and 0 in the
	        third.

	    Bins are ``(lo, hi]``, except the first, which is ``[lo, hi]``. Without
	    that, a confidence equal to the lowest edge would fall in no bin and be
	    dropped from the counts.
	    """
	    confidences_arr = np.asarray(confidences_arr)
	    correct_arr = np.asarray(correct_arr)

	    bin_accs = []
	    bin_confs = []
	    bin_counts = []
	    for bin_idx, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
	        if bin_idx == 0:
	            above_lo = confidences_arr >= lo
	        else:
	            above_lo = confidences_arr > lo
	        mask = above_lo & (confidences_arr <= hi)
	        count = int(mask.sum())
	        if count == 0:
	            bin_accs.append(np.nan)
	            bin_confs.append(np.nan)
	        else:
	            bin_accs.append(correct_arr[mask].mean())
	            bin_confs.append(confidences_arr[mask].mean())
	        bin_counts.append(count)
	    return np.array(bin_accs), np.array(bin_confs), np.array(bin_counts)

	# Top-1 confidence under each softmax variant.
	conf_calib = probs_calibrated.max(axis=1)
	conf_uncalib = probs_uncalibrated.max(axis=1)

	# The same predictions are scored against both confidence sets.
	correct_float = correct_mask.astype(float)
	bin_accs_cal, bin_confs_cal, bin_counts_cal = compute_calibration(
	    conf_calib, correct_float, bin_edges)
	bin_accs_uncal, bin_confs_uncal, bin_counts_uncal = compute_calibration(
	    conf_uncalib, correct_float, bin_edges)


	def _expected_calibration_error(accs, confs, counts):
	    """Count-weighted mean |accuracy - confidence| over bins; empty bins add 0."""
	    return np.nansum(np.abs(accs - confs) * counts) / N


	# ECE
	ece_cal = _expected_calibration_error(bin_accs_cal, bin_confs_cal, bin_counts_cal)
	ece_uncal = _expected_calibration_error(bin_accs_uncal, bin_confs_uncal, bin_counts_uncal)

	fig, axes = plt.subplots(1, 2, figsize=(12, 5))

	panels = [
	    (axes[0], '#4C72B0', 'Calibrated',
	     bin_accs_cal, bin_confs_cal, bin_counts_cal, ece_cal),
	    (axes[1], '#DD8452', 'Uncalibrated',
	     bin_accs_uncal, bin_confs_uncal, bin_counts_uncal, ece_uncal),
	]
	for ax, bar_color, title_suffix, b_accs, b_confs, b_counts, ece_val in panels:
	    # Perfect calibration line
	    ax.plot([0, 1], [0, 1], 'k--', lw=1.5, alpha=0.5, label='Perfectly calibrated')

	    # Bar chart of bin accuracy (empty bins are skipped)
	    valid = ~np.isnan(b_accs)
	    ax.bar(bin_centers[valid], b_accs[valid], width=0.08,
	           alpha=0.7, color=bar_color, edgecolor='black', linewidth=0.5,
	           label=f'Model (ECE={ece_val:.4f})')

	    filled_bins = np.flatnonzero(valid)

	    # Gap shading between observed accuracy and mean confidence
	    for j in filled_bins:
	        lo_val = min(b_accs[j], b_confs[j])
	        hi_val = max(b_accs[j], b_confs[j])
	        ax.fill_between(
	            [bin_centers[j] - 0.04, bin_centers[j] + 0.04],
	            lo_val, hi_val, alpha=0.15, color='red')

	    # Sample counts on top
	    for j in filled_bins:
	        if b_counts[j] > 0:
	            ax.text(bin_centers[j], b_accs[j] + 0.03,
	                    str(b_counts[j]), ha='center', va='bottom', fontsize=7)

	    ax.set_xlim([0, 1])
	    ax.set_ylim([0, 1.1])
	    ax.set_xlabel('Mean Predicted Confidence')
	    ax.set_ylabel('Fraction of Correct Predictions')
	    ax.set_title(f'Reliability Diagram ({title_suffix})')
	    ax.legend(loc='upper left', framealpha=0.9)
	    ax.grid(True, alpha=0.3)

	fig.tight_layout()
	fig.savefig(os.path.join(EVAL_DIR, 'calibration_reliability.png'))
	plt.close(fig)
	print(f' Saved calibration_reliability.png (ECE_cal={ece_cal:.4f}, ECE_uncal={ece_uncal:.4f})')


	# ================================================================
	# 5. CONFIDENCE HISTOGRAMS
	# ================================================================
	print('[5/7] Confidence histograms...')
	fig, axes = plt.subplots(1, 2, figsize=(12, 5))
	outcome_ax, class_ax = axes[0], axes[1]

	# Correct vs Incorrect
	outcome_groups = [
	    (correct_mask, 'Correct', '#2ca02c'),
	    (~correct_mask, 'Incorrect', '#d62728'),
	]
	for mask, label, color in outcome_groups:
	    outcome_ax.hist(confidences[mask], bins=30, alpha=0.65, color=color,
	                    label=f'{label} (n={mask.sum()})', edgecolor='black', linewidth=0.3)

	outcome_ax.set_xlabel('Prediction Confidence')
	outcome_ax.set_ylabel('Count')
	outcome_ax.set_title('Confidence Distribution: Correct vs Incorrect')
	outcome_ax.legend(loc='upper left', framealpha=0.9)
	# Dashed median markers, added after the legend and excluded from it.
	median_correct = np.median(confidences[correct_mask])
	median_incorrect = np.median(confidences[~correct_mask])
	outcome_ax.axvline(x=median_correct, color='#2ca02c',
	                   ls='--', alpha=0.6, label='_nolegend_')
	outcome_ax.axvline(x=median_incorrect, color='#d62728',
	                   ls='--', alpha=0.6, label='_nolegend_')
	outcome_ax.grid(True, alpha=0.3, axis='y')

	# Per-class confidence, grouped by the true label
	for cls_idx in range(NUM_CLASSES):
	    cls_mask = (all_labels == cls_idx)
	    class_ax.hist(confidences[cls_mask], bins=20, alpha=0.5, color=colors[cls_idx],
	                  label=f'{CLASS_NAMES[cls_idx]} (n={cls_mask.sum()})',
	                  edgecolor='black', linewidth=0.3)

	class_ax.set_xlabel('Prediction Confidence')
	class_ax.set_ylabel('Count')
	class_ax.set_title('Confidence Distribution by True Class')
	class_ax.legend(loc='upper left', framealpha=0.9, fontsize=9)
	class_ax.grid(True, alpha=0.3, axis='y')

	fig.tight_layout()
	fig.savefig(os.path.join(EVAL_DIR, 'confidence_histograms.png'))
	plt.close(fig)
	print(' Saved confidence_histograms.png')


	# ================================================================
	# 6. ERROR ANALYSIS BY SOURCE
	# ================================================================
	print('[6/7] Error analysis by source...')
	sources_unique = sorted(np.unique(all_sources))
	n_sources = len(sources_unique)

	# Build accuracy per (source, class) pair; NaN marks pairs with no samples
	source_class_acc = {}
	source_class_n = {}
	for src in sources_unique:
	    for cls_idx in range(NUM_CLASSES):
	        mask = (all_sources == src) & (all_labels == cls_idx)
	        n_cls = mask.sum()
	        if n_cls > 0:
	            acc_sc = (preds[mask] == all_labels[mask]).mean()
	        else:
	            acc_sc = np.nan
	        source_class_acc[(src, cls_idx)] = acc_sc
	        source_class_n[(src, cls_idx)] = int(n_cls)

	# Also overall accuracy per source
	source_overall_acc = {}
	for src in sources_unique:
	    mask = (all_sources == src)
	    source_overall_acc[src] = accuracy_score(all_labels[mask], preds[mask])

	fig, axes = plt.subplots(1, 2, figsize=(14, 6))

	# Left panel: grouped bar chart of per-class accuracy by source
	x = np.arange(NUM_CLASSES)
	bar_width = 0.8 / max(n_sources, 1)
	source_colors = sns.color_palette('Set2', n_sources)
	# One fixed colour per source, shared by both panels.
	color_by_source = dict(zip(sources_unique, source_colors))

	for s_idx, src in enumerate(sources_unique):
	    accs = [source_class_acc[(src, c)] for c in range(NUM_CLASSES)]
	    counts = [source_class_n[(src, c)] for c in range(NUM_CLASSES)]
	    # Centre the group of bars on each class tick.
	    offset = (s_idx - n_sources / 2 + 0.5) * bar_width
	    bars = axes[0].bar(x + offset, accs, bar_width * 0.9,
	                       label=f'{src} (n={sum(counts)})',
	                       color=color_by_source[src], edgecolor='black', linewidth=0.5)
	    # Annotate sample counts
	    for j, (b, n_val) in enumerate(zip(bars, counts)):
	        if n_val > 0 and not np.isnan(accs[j]):
	            axes[0].text(b.get_x() + b.get_width() / 2, b.get_height() + 0.02,
	                         str(n_val), ha='center', va='bottom', fontsize=7)

	axes[0].set_xticks(x)
	axes[0].set_xticklabels(CLASS_NAMES, rotation=15, ha='right')
	axes[0].set_ylabel('Accuracy')
	axes[0].set_title('Per-Class Accuracy by Data Source')
	axes[0].set_ylim([0, 1.15])
	axes[0].legend(loc='upper right', framealpha=0.9)
	axes[0].grid(True, alpha=0.3, axis='y')
	axes[0].axhline(y=acc, color='black', ls='--', alpha=0.4, lw=1)
	axes[0].text(NUM_CLASSES - 0.5, acc + 0.02, f'Overall: {acc:.3f}',
	             ha='right', fontsize=9, alpha=0.6)

	# Right panel: confusion breakdown -- most common misclassifications per source
	error_data = []
	for src in sources_unique:
	    src_mask = (all_sources == src) & (~correct_mask)
	    if src_mask.sum() == 0:
	        continue
	    for true_cls in range(NUM_CLASSES):
	        for pred_cls in range(NUM_CLASSES):
	            if true_cls == pred_cls:
	                continue
	            pair_mask = src_mask & (all_labels == true_cls) & (preds == pred_cls)
	            cnt = pair_mask.sum()
	            if cnt > 0:
	                error_data.append({
	                    'Source': src,
	                    'Error': f'{CLASS_NAMES[true_cls][:3]}>{CLASS_NAMES[pred_cls][:3]}',
	                    'Count': int(cnt),
	                })

	if error_data:
	    err_df = pd.DataFrame(error_data)
	    # Top 10 error types
	    top_errors = (err_df.groupby('Error')['Count'].sum()
	                  .sort_values(ascending=False).head(10).index.tolist())
	    err_df_top = err_df[err_df['Error'].isin(top_errors)]
	    pivot = err_df_top.pivot_table(index='Error', columns='Source',
	                                   values='Count', aggfunc='sum', fill_value=0)
	    # Reorder by total count
	    pivot = pivot.loc[pivot.sum(axis=1).sort_values(ascending=True).index]
	    # The pivot only has columns for sources that made errors, so colours are
	    # looked up by column name. Slicing the palette by position would give a
	    # source a different colour here than in the left panel.
	    pivot_colors = [color_by_source[src] for src in pivot.columns]
	    pivot.plot(kind='barh', stacked=True, ax=axes[1],
	               color=pivot_colors, edgecolor='black', linewidth=0.5)
	    axes[1].set_xlabel('Error Count')
	    axes[1].set_title('Top Misclassification Patterns by Source')
	    axes[1].legend(loc='lower right', framealpha=0.9)
	    axes[1].grid(True, alpha=0.3, axis='x')
	else:
	    axes[1].text(0.5, 0.5, 'No errors to display', ha='center', va='center',
	                 transform=axes[1].transAxes, fontsize=14)
	    axes[1].set_title('Top Misclassification Patterns by Source')

	fig.tight_layout()
	fig.savefig(os.path.join(EVAL_DIR, 'error_analysis_by_source.png'))
	plt.close(fig)
	print(' Saved error_analysis_by_source.png')


	# ================================================================
	# 7. METRICS REPORT (JSON)
	# ================================================================
	print('[7/7] Metrics report...')

	# Fixed label set, matching the one passed to confusion_matrix(). Without it
	# scikit-learn infers the classes from the data, and a class absent from the
	# test set makes classification_report() and log_loss() fail.
	label_ids = list(range(NUM_CLASSES))

	# Classification report as dict
	cls_report = classification_report(
	    all_labels, preds, labels=label_ids, target_names=CLASS_NAMES,
	    output_dict=True, zero_division=0)

	# Per-class AUC and AP (one-vs-rest, calibrated probabilities)
	per_class_auc = {}
	per_class_ap = {}
	for i in range(NUM_CLASSES):
	    y_bin = (all_labels == i).astype(int)
	    y_score = probs_calibrated[:, i]
	    fpr_i, tpr_i, _ = roc_curve(y_bin, y_score)
	    per_class_auc[CLASS_NAMES[i]] = float(auc(fpr_i, tpr_i))
	    per_class_ap[CLASS_NAMES[i]] = float(average_precision_score(y_bin, y_score))

	# Build the full report
	try:
	    ll = float(log_loss(all_labels, probs_calibrated, labels=label_ids))
	except ValueError:
	    # Best effort: the report is still written, with log_loss left empty.
	    ll = None

	metrics_report = OrderedDict([
	    ('n_test_samples', int(N)),
	    ('overall_accuracy', float(acc)),
	    ('balanced_accuracy', float(balanced_accuracy_score(all_labels, preds))),
	    ('macro_f1', float(f1_score(all_labels, preds, average='macro', zero_division=0))),
	    ('weighted_f1', float(f1_score(all_labels, preds, average='weighted', zero_division=0))),
	    ('cohen_kappa', float(cohen_kappa_score(all_labels, preds))),
	    ('matthews_corrcoef', float(matthews_corrcoef(all_labels, preds))),
	    ('log_loss', ll),
	    ('macro_auc', float(np.mean(list(per_class_auc.values())))),
	    ('ece_calibrated', float(ece_cal)),
	    ('ece_uncalibrated', float(ece_uncal)),
	    ('temperature', float(TEMPERATURE)),
	    ('thresholds', THRESHOLDS),
	    ('per_class_metrics', {}),
	    ('per_class_auc', per_class_auc),
	    ('per_class_ap', per_class_ap),
	    ('confusion_matrix_raw', cm.tolist()),
	    ('confusion_matrix_normalized', np.round(cm_norm, 4).tolist()),
	    ('source_accuracy', {src: float(v) for src, v in source_overall_acc.items()}),
	    ('source_class_counts', {
	        src: {CLASS_NAMES[c]: source_class_n[(src, c)]
	              for c in range(NUM_CLASSES)}
	        for src in sources_unique
	    }),
	    ('class_names', CLASS_NAMES),
	])

	# Per-class from classification_report
	for i, name in enumerate(CLASS_NAMES):
	    metrics_report['per_class_metrics'][name] = {
	        'precision': float(cls_report[name]['precision']),
	        'recall': float(cls_report[name]['recall']),
	        'f1-score': float(cls_report[name]['f1-score']),
	        'support': int(cls_report[name]['support']),
	        'auc': per_class_auc[name],
	        'average_precision': per_class_ap[name],
	    }


	def _json_safe(obj):
	    """Return a copy of ``obj`` with non-finite floats replaced by None.

	    json.dump() writes NaN and Infinity as bare tokens that strict JSON
	    parsers reject; None is serialised as null instead.
	    """
	    if isinstance(obj, dict):
	        return OrderedDict((key, _json_safe(val)) for key, val in obj.items())
	    if isinstance(obj, (list, tuple)):
	        return [_json_safe(val) for val in obj]
	    if isinstance(obj, float) and not np.isfinite(obj):
	        return None
	    return obj


	report_path = os.path.join(EVAL_DIR, 'metrics_report.json')
	with open(report_path, 'w') as f:
	    # Only the serialised copy is sanitised; metrics_report keeps its floats.
	    json.dump(_json_safe(metrics_report), f, indent=2)
	print(f' Saved metrics_report.json')


	# ================================================================
	# SUMMARY
	# ================================================================
	rule = '=' * 65
	print('\n' + rule)
	print(' EVALUATION DASHBOARD COMPLETE')
	print(rule)

	# Headline metrics, one per line.
	headline_lines = [
	    f' Overall Accuracy : {acc:.4f}',
	    f' Balanced Accuracy : {metrics_report["balanced_accuracy"]:.4f}',
	    f' Macro F1 : {metrics_report["macro_f1"]:.4f}',
	    f' Cohen Kappa : {metrics_report["cohen_kappa"]:.4f}',
	    f' Macro AUC : {metrics_report["macro_auc"]:.4f}',
	    f' ECE (calibrated) : {ece_cal:.4f}',
	    f' ECE (uncalibrated) : {ece_uncal:.4f}',
	]
	for line in headline_lines:
	    print(line)

	print(f'\n Per-class AUC:')
	for name, val in per_class_auc.items():
	    print(f' {name:15s} : {val:.4f}')

	print(f'\n Source accuracy:')
	for src, val in source_overall_acc.items():
	    print(f' {src:10s} : {val:.4f}')

	# List every expected artifact with its size, or MISSING if it is absent.
	print(f'\n All outputs in: {EVAL_DIR}/')
	output_files = [
	    'confusion_matrix.png',
	    'roc_curves_per_class.png',
	    'precision_recall_curves.png',
	    'calibration_reliability.png',
	    'confidence_histograms.png',
	    'error_analysis_by_source.png',
	    'metrics_report.json',
	]
	for fname in output_files:
	    fpath = os.path.join(EVAL_DIR, fname)
	    if os.path.exists(fpath):
	        size_kb = os.path.getsize(fpath) / 1024
	        status = f'{size_kb:.0f} KB'
	    else:
	        status = 'MISSING'
	    print(f' [{status:>8s}] {fname}')
	print(rule)