tanishq74
/

retinasense-vit

+#!/usr/bin/env python3
+"""
+RetinaSense v3.0 -- Phase 1A: Rich Evaluation Dashboard
+========================================================
+Standalone script that loads the trained ViT model, runs inference on the
+full test set (1,281 images), and produces publication-quality evaluation
+plots plus a structured metrics JSON report.
+Outputs (all written to outputs_v3/evaluation/):
+  - confusion_matrix.png
+  - roc_curves_per_class.png
+  - precision_recall_curves.png
+  - calibration_reliability.png
+  - confidence_histograms.png
+  - error_analysis_by_source.png
+  - metrics_report.json
+Usage:
+  python eval_dashboard.py
+"""
+import os
+import sys
+import json
+import warnings
+import numpy as np
+import pandas as pd
+import cv2
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
+import seaborn as sns
+from PIL import Image
+from collections import OrderedDict
+warnings.filterwarnings('ignore')
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from torchvision import transforms
+import timm
+from sklearn.metrics import (
+    confusion_matrix,
+    classification_report,
+    roc_curve,
+    auc,
+    precision_recall_curve,
+    average_precision_score,
+    f1_score,
+    accuracy_score,
+    cohen_kappa_score,
+    matthews_corrcoef,
+    balanced_accuracy_score,
+    log_loss,
+)
+# ================================================================
+# CONFIGURATION
+# ================================================================
+# Workspace root. The previously hard-coded location stays the default;
+# export RETINASENSE_BASE_DIR to run the dashboard from another checkout.
+BASE_DIR    = os.environ.get('RETINASENSE_BASE_DIR', '/teamspace/studios/this_studio')
+OUTPUT_DIR  = os.path.join(BASE_DIR, 'outputs_v3')
+EVAL_DIR    = os.path.join(OUTPUT_DIR, 'evaluation')
+# Every plot and the JSON report are written to EVAL_DIR, so create it up front.
+os.makedirs(EVAL_DIR, exist_ok=True)
+# Input artifacts read by this script.
+MODEL_PATH       = os.path.join(OUTPUT_DIR, 'best_model.pth')
+THRESHOLDS_PATH  = os.path.join(OUTPUT_DIR, 'thresholds.json')
+TEMPERATURE_PATH = os.path.join(OUTPUT_DIR, 'temperature.json')
+TEST_CSV         = os.path.join(BASE_DIR, 'data', 'test_split.csv')
+NORM_STATS_PATH  = os.path.join(BASE_DIR, 'data', 'fundus_norm_stats.json')
+NUM_CLASSES = 5      # number of disease classes; must match len(CLASS_NAMES)
+IMG_SIZE    = 224    # side length used by the on-the-fly preprocessing fallback
+DROPOUT     = 0.3    # default dropout rate passed to MultiTaskViT
+BATCH_SIZE  = 32     # DataLoader batch size for inference
+CLASS_NAMES = ['Normal', 'Diabetes/DR', 'Glaucoma', 'Cataract', 'AMD']
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Publication style defaults
+plt.rcParams.update({
+    'font.size': 11,
+    'axes.titlesize': 13,
+    'axes.labelsize': 12,
+    'xtick.labelsize': 10,
+    'ytick.labelsize': 10,
+    'legend.fontsize': 10,
+    'figure.dpi': 300,
+    'savefig.dpi': 300,
+    'savefig.bbox': 'tight',
+    'savefig.pad_inches': 0.15,
+    'font.family': 'sans-serif',
+})
+# Start-up banner: device in use and where the outputs will land.
+print('=' * 65)
+print('   RetinaSense v3.0 -- Phase 1A: Evaluation Dashboard')
+print('=' * 65)
+print(f'  Device  : {DEVICE}')
+if torch.cuda.is_available():
+    print(f'  GPU     : {torch.cuda.get_device_name(0)}')
+print(f'  Output  : {EVAL_DIR}')
+print('=' * 65)
+# ================================================================
+# LOAD NORMALISATION STATS
+# ================================================================
+# Begin with the ImageNet statistics and replace them with the values stored
+# in NORM_STATS_PATH when that file is present.
+NORM_MEAN = [0.485, 0.456, 0.406]
+NORM_STD  = [0.229, 0.224, 0.225]
+if not os.path.exists(NORM_STATS_PATH):
+    print('  Using ImageNet normalisation fallback')
+else:
+    with open(NORM_STATS_PATH) as f:
+        norm_stats = json.load(f)
+    NORM_MEAN, NORM_STD = norm_stats['mean_rgb'], norm_stats['std_rgb']
+    mean_txt = [round(v, 4) for v in NORM_MEAN]
+    std_txt = [round(v, 4) for v in NORM_STD]
+    print(f'  Fundus norm stats loaded: mean={mean_txt}, std={std_txt}')
+# ================================================================
+# MODEL ARCHITECTURE (mirrors retinasense_v3.py / gradcam_v3.py)
+# ================================================================
+class MultiTaskViT(nn.Module):
+    """Two-headed classifier on a timm ``vit_base_patch16_224`` backbone.
+    Args:
+        n_disease: Number of outputs of the disease head.
+        n_severity: Number of outputs of the severity head.
+        drop: Dropout probability applied to the backbone features before
+            they are fed to both heads.
+    The layer order inside each ``nn.Sequential`` determines the parameter
+    names in the checkpoint, so it must not be rearranged.
+    """
+    def __init__(self, n_disease=NUM_CLASSES, n_severity=5, drop=DROPOUT):
+        super().__init__()
+        self.backbone = timm.create_model(
+            'vit_base_patch16_224', pretrained=False, num_classes=0)
+        feat = 768  # width of the feature vector the heads consume
+        self.drop = nn.Dropout(drop)
+        self.disease_head = nn.Sequential(
+            *self._stage(feat, 512, 0.3),
+            *self._stage(512, 256, 0.2),
+            nn.Linear(256, n_disease),
+        )
+        self.severity_head = nn.Sequential(
+            *self._stage(feat, 256, 0.3),
+            nn.Linear(256, n_severity),
+        )
+    @staticmethod
+    def _stage(n_in, n_out, p_drop):
+        """Return the Linear -> BatchNorm1d -> ReLU -> Dropout layers of one stage."""
+        return [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out),
+                nn.ReLU(), nn.Dropout(p_drop)]
+    def forward(self, x):
+        """Return ``(disease_logits, severity_logits)`` for an image batch ``x``."""
+        features = self.drop(self.backbone(x))
+        return self.disease_head(features), self.severity_head(features)
+# ================================================================
+# LOAD MODEL + CALIBRATION ARTIFACTS
+# ================================================================
+print('\nLoading model...')
+model = MultiTaskViT().to(DEVICE)
+# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
+# so MODEL_PATH must only ever point at a trusted checkpoint.
+ckpt = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
+model.load_state_dict(ckpt['model_state_dict'])
+model.eval()
+print(f'  Loaded: {MODEL_PATH}')
+# The epoch is shown 1-based. A checkpoint without an "epoch" entry prints "?"
+# (adding 1 to the "?" placeholder used to raise TypeError).
+ckpt_epoch = ckpt.get('epoch')
+epoch_label = ckpt_epoch + 1 if ckpt_epoch is not None else '?'
+print(f'  Checkpoint epoch: {epoch_label}  '
+      f'val_acc={ckpt.get("val_acc", 0):.2f}%')
+# Per-class decision thresholds; loaded and echoed into the JSON report.
+with open(THRESHOLDS_PATH) as f:
+    thr_data = json.load(f)
+THRESHOLDS = thr_data['thresholds']
+# Temperature used to scale the disease logits before softmax.
+with open(TEMPERATURE_PATH) as f:
+    temp_data = json.load(f)
+TEMPERATURE = temp_data['temperature']
+print(f'  Temperature T = {TEMPERATURE:.4f}')
+print(f'  Thresholds    = {[round(t, 3) for t in THRESHOLDS]}')
+# ================================================================
+# DATASET
+# ================================================================
+class TestDataset(Dataset):
+    """
+    Test-set dataset yielding ``(image_tensor, disease_label, source)``.
+    Images come from the preprocessed ``.npy`` cache when the row carries a
+    usable ``cache_path``; otherwise the raw image is preprocessed on the fly
+    (Ben Graham enhancement for ``APTOS`` rows, CLAHE for every other source).
+    Args:
+        df: DataFrame with ``image_path`` and ``disease_label`` columns and
+            optional ``cache_path`` / ``source`` columns.
+        transform: Callable applied to the loaded image array.
+    """
+    def __init__(self, df, transform):
+        self.df = df.reset_index(drop=True)
+        self.transform = transform
+    def __len__(self):
+        return len(self.df)
+    def __getitem__(self, idx):
+        row = self.df.iloc[idx]
+        # Fast path: preprocessed array from the cache.
+        img = self._load_cached(row.get('cache_path', ''))
+        # Fallback: on-the-fly preprocessing
+        if img is None:
+            image_path = self._resolve_path(row['image_path'])
+            source = row.get('source', 'ODIR')
+            try:
+                if source == 'APTOS':
+                    img = self._ben_graham(image_path)
+                else:
+                    img = self._clahe_preprocess(image_path)
+            except Exception as exc:
+                # Best effort: keep the run alive with a black placeholder, but
+                # report it so skewed metrics can be traced to unreadable files.
+                print(f'  [warn] could not preprocess {image_path}: {exc!r}; '
+                      f'using a blank image', file=sys.stderr)
+                img = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)
+        img_tensor = self.transform(img)
+        disease_lbl = int(row['disease_label'])
+        source = row.get('source', 'unknown')
+        return img_tensor, disease_lbl, source
+    @staticmethod
+    def _load_cached(cache_fp):
+        """Return the cached array at ``cache_fp``, or ``None`` if unusable.
+        Empty CSV cells arrive as NaN floats, which are truthy and make
+        ``os.path.exists`` raise TypeError, so non-string values are rejected.
+        """
+        if not isinstance(cache_fp, str) or not cache_fp:
+            return None
+        if not os.path.exists(cache_fp):
+            return None
+        try:
+            return np.load(cache_fp)
+        except Exception:
+            # Any unreadable cache entry falls back to on-the-fly preprocessing.
+            return None
+    @staticmethod
+    def _resolve_path(image_path):
+        """Return ``image_path`` unchanged if absolute, else anchored at BASE_DIR.
+        Leading ``./`` segments and the slashes after them are removed first;
+        a leftover leading ``/`` would make ``os.path.join`` drop BASE_DIR.
+        """
+        if os.path.isabs(image_path):
+            return image_path
+        clean = image_path
+        while clean.startswith('./'):
+            clean = clean[2:].lstrip('/')
+        return os.path.join(BASE_DIR, clean)
+    @staticmethod
+    def _read_bgr(path):
+        """Read ``path`` as a BGR array, using PIL when OpenCV cannot decode it."""
+        raw = cv2.imread(path)
+        if raw is None:
+            raw = np.array(Image.open(path).convert('RGB'))
+            raw = cv2.cvtColor(raw, cv2.COLOR_RGB2BGR)
+        return raw
+    @staticmethod
+    def _ben_graham(path, sz=IMG_SIZE, sigma=10):
+        """Return an ``sz`` x ``sz`` RGB image with Ben Graham enhancement.
+        The image is sharpened by subtracting its Gaussian blur, then masked
+        to a centred circle of radius ``0.48 * sz``.
+        """
+        raw = cv2.cvtColor(TestDataset._read_bgr(path), cv2.COLOR_BGR2RGB)
+        raw = cv2.resize(raw, (sz, sz))
+        raw = cv2.addWeighted(raw, 4, cv2.GaussianBlur(raw, (0, 0), sigma), -4, 128)
+        mask = np.zeros(raw.shape[:2], dtype=np.uint8)
+        cv2.circle(mask, (sz // 2, sz // 2), int(sz * 0.48), 255, -1)
+        return cv2.bitwise_and(raw, raw, mask=mask)
+    @staticmethod
+    def _clahe_preprocess(path, sz=IMG_SIZE):
+        """Return an ``sz`` x ``sz`` RGB image with CLAHE applied to the LAB L channel."""
+        raw = cv2.resize(TestDataset._read_bgr(path), (sz, sz))
+        lab = cv2.cvtColor(raw, cv2.COLOR_BGR2LAB)
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        lab[:, :, 0] = clahe.apply(lab[:, :, 0])
+        raw = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
+        return cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
+# Evaluation-time transform: no augmentation, only tensor conversion and
+# normalisation with the statistics selected above.
+val_transform = transforms.Compose([
+    transforms.ToPILImage(),
+    transforms.ToTensor(),
+    transforms.Normalize(NORM_MEAN, NORM_STD),
+])
+print('\nLoading test set...')
+test_df = pd.read_csv(TEST_CSV)
+print(f'  Test samples: {len(test_df)}')
+print(f'  Sources     : {sorted(test_df["source"].unique())}')
+print(f'  Class dist  : {test_df["disease_label"].value_counts().sort_index().to_dict()}')
+test_ds = TestDataset(test_df, val_transform)
+# shuffle=False keeps predictions aligned with the CSV row order.
+# Pinned host memory only helps host-to-GPU copies, so skip it on CPU-only runs.
+test_loader = DataLoader(
+    test_ds, batch_size=BATCH_SIZE, shuffle=False,
+    num_workers=4, pin_memory=(DEVICE.type == 'cuda'),
+)
+# ================================================================
+# INFERENCE
+# ================================================================
+print('\nRunning inference on full test set...')
+all_logits = []
+all_labels = []
+all_sources = []
+with torch.no_grad():
+    for imgs, labels, sources in test_loader:
+        imgs = imgs.to(DEVICE)
+        # Only the disease head is evaluated; severity logits are discarded.
+        disease_logits, _ = model(imgs)
+        all_logits.append(disease_logits.cpu())
+        all_labels.extend(labels.numpy().tolist())
+        all_sources.extend(sources)
+all_logits = torch.cat(all_logits, dim=0)  # (N, 5)
+all_labels = np.array(all_labels)
+all_sources = np.array(all_sources)
+N = len(all_labels)
+print(f'  Inference complete: {N} samples')
+# Temperature-scaled probabilities
+probs_calibrated = F.softmax(all_logits / TEMPERATURE, dim=1).numpy()  # (N, 5)
+probs_uncalibrated = F.softmax(all_logits, dim=1).numpy()
+# Predictions: argmax of calibrated probabilities
+preds = np.argmax(probs_calibrated, axis=1)
+confidences = np.max(probs_calibrated, axis=1)
+correct_mask = (preds == all_labels)
+acc = accuracy_score(all_labels, preds)
+# Count the hits directly: int(acc * N) truncates and can land one below the
+# true count when acc * N falls just under an integer in floating point.
+n_correct = int(correct_mask.sum())
+print(f'  Overall accuracy: {acc:.4f} ({n_correct}/{N})')
+# ================================================================
+# 1. CONFUSION MATRIX
+# ================================================================
+print('\n[1/7] Confusion matrix...')
+cm = confusion_matrix(all_labels, preds, labels=list(range(NUM_CLASSES)))
+# Row-normalise (each row = one true class). A class with no test samples has
+# a zero row total; its row stays all-zero instead of becoming NaN, which would
+# otherwise show as "nan" in the heatmap and as invalid JSON in the report.
+row_totals = cm.sum(axis=1, keepdims=True)
+cm_norm = np.divide(cm, row_totals,
+                    out=np.zeros(cm.shape, dtype=float),
+                    where=row_totals > 0)
+fig, ax = plt.subplots(figsize=(7, 6))
+sns.heatmap(
+    cm_norm, annot=True, fmt='.2f', cmap='Blues',
+    xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES,
+    linewidths=0.5, linecolor='white',
+    cbar_kws={'label': 'Proportion', 'shrink': 0.8},
+    ax=ax, vmin=0, vmax=1,
+)
+# Overlay raw counts in smaller font, just below each cell's proportion
+for i in range(NUM_CLASSES):
+    for j in range(NUM_CLASSES):
+        ax.text(j + 0.5, i + 0.72, f'(n={cm[i, j]})',
+                ha='center', va='center', fontsize=7, color='gray')
+ax.set_xlabel('Predicted Class')
+ax.set_ylabel('True Class')
+ax.set_title('Normalized Confusion Matrix (Test Set)')
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'confusion_matrix.png'))
+plt.close(fig)
+print('  Saved confusion_matrix.png')
+# ================================================================
+# 2. ROC CURVES PER CLASS
+# ================================================================
+print('[2/7] ROC curves...')
+fig, ax = plt.subplots(figsize=(7, 6))
+colors = sns.color_palette('tab10', NUM_CLASSES)
+all_fpr_tpr = {}
+macro_auc_list = []
+# One-vs-rest curve per class, scored with the calibrated probability.
+for cls_idx in range(NUM_CLASSES):
+    is_cls = (all_labels == cls_idx).astype(int)
+    fpr, tpr, _ = roc_curve(is_cls, probs_calibrated[:, cls_idx])
+    cls_auc = auc(fpr, tpr)
+    all_fpr_tpr[cls_idx] = (fpr, tpr)
+    macro_auc_list.append(cls_auc)
+    ax.plot(fpr, tpr, color=colors[cls_idx], lw=2,
+            label=f'{CLASS_NAMES[cls_idx]} (AUC={cls_auc:.3f})')
+# Macro average ROC: resample every class curve on a shared FPR grid and
+# average the TPR values point by point.
+mean_fpr = np.linspace(0, 1, 200)
+mean_tpr = sum(
+    (np.interp(mean_fpr, *all_fpr_tpr[c]) for c in range(NUM_CLASSES)),
+    np.zeros_like(mean_fpr),
+) / NUM_CLASSES
+macro_auc = auc(mean_fpr, mean_tpr)
+ax.plot(mean_fpr, mean_tpr, 'k--', lw=2.5,
+        label=f'Macro-average (AUC={macro_auc:.3f})')
+# Chance diagonal for reference.
+ax.plot([0, 1], [0, 1], 'k:', lw=1, alpha=0.4)
+ax.set(
+    xlim=[-0.02, 1.02],
+    ylim=[-0.02, 1.05],
+    xlabel='False Positive Rate',
+    ylabel='True Positive Rate',
+    title='One-vs-Rest ROC Curves (Calibrated)',
+)
+ax.legend(loc='lower right', framealpha=0.9)
+ax.grid(True, alpha=0.3)
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'roc_curves_per_class.png'))
+plt.close(fig)
+print('  Saved roc_curves_per_class.png')
+# ================================================================
+# 3. PRECISION-RECALL CURVES
+# ================================================================
+print('[3/7] Precision-recall curves...')
+fig, ax = plt.subplots(figsize=(7, 6))
+# One-vs-rest PR curve per class, scored with the calibrated probability.
+for cls_idx in range(NUM_CLASSES):
+    is_cls = (all_labels == cls_idx).astype(int)
+    cls_score = probs_calibrated[:, cls_idx]
+    prec, rec, _ = precision_recall_curve(is_cls, cls_score)
+    ap = average_precision_score(is_cls, cls_score)
+    ax.plot(rec, prec, color=colors[cls_idx], lw=2,
+            label=f'{CLASS_NAMES[cls_idx]} (AP={ap:.3f})')
+# Add prevalence baselines: one faint horizontal line per class at its share
+# of the test set.
+prevalences = np.bincount(all_labels, minlength=NUM_CLASSES) / N
+for cls_idx in range(NUM_CLASSES):
+    ax.axhline(y=prevalences[cls_idx], color=colors[cls_idx], ls=':', alpha=0.3)
+ax.set(
+    xlim=[-0.02, 1.02],
+    ylim=[-0.02, 1.05],
+    xlabel='Recall',
+    ylabel='Precision',
+    title='Precision-Recall Curves (Calibrated)',
+)
+ax.legend(loc='upper right', framealpha=0.9)
+ax.grid(True, alpha=0.3)
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'precision_recall_curves.png'))
+plt.close(fig)
+print('  Saved precision_recall_curves.png')
+# ================================================================
+# 4. CALIBRATION RELIABILITY DIAGRAM
+# ================================================================
+print('[4/7] Calibration reliability diagram...')
+# Ten equal-width confidence bins on [0, 1]; bars are drawn at the midpoints.
+n_bins = 10
+bin_edges = np.linspace(0, 1, n_bins + 1)
+bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2
+# Calibration is computed for both calibrated and uncalibrated probabilities
+def compute_calibration(confidences_arr, correct_arr, bin_edges):
+    """Compute per-bin accuracy, mean confidence and sample count.
+    Bins are ``(lo, hi]`` except the first, which is ``[lo, hi]`` so that a
+    confidence equal to the lowest edge is still counted (it used to be
+    dropped, while ECE kept dividing by the full sample count).
+    Args:
+        confidences_arr: Predicted confidence of each sample.
+        correct_arr: 1.0 where the prediction was correct, else 0.0.
+        bin_edges: Increasing bin boundaries; ``len(bin_edges) - 1`` bins.
+    Returns:
+        ``(bin_accs, bin_confs, bin_counts)`` arrays with one entry per bin;
+        empty bins hold NaN accuracy/confidence and a count of 0.
+    """
+    confidences_arr = np.asarray(confidences_arr, dtype=float)
+    correct_arr = np.asarray(correct_arr, dtype=float)
+    num_bins = len(bin_edges) - 1
+    bin_accs = np.full(num_bins, np.nan)
+    bin_confs = np.full(num_bins, np.nan)
+    bin_counts = np.zeros(num_bins, dtype=int)
+    for b, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
+        above_lo = confidences_arr >= lo if b == 0 else confidences_arr > lo
+        mask = above_lo & (confidences_arr <= hi)
+        count = int(mask.sum())
+        if count == 0:
+            continue
+        bin_accs[b] = correct_arr[mask].mean()
+        bin_confs[b] = confidences_arr[mask].mean()
+        bin_counts[b] = count
+    return bin_accs, bin_confs, bin_counts
+# Confidence = top-class probability, with and without temperature scaling.
+conf_calib = probs_calibrated.max(axis=1)
+conf_uncalib = probs_uncalibrated.max(axis=1)
+correct_float = correct_mask.astype(float)
+bin_accs_cal, bin_confs_cal, bin_counts_cal = compute_calibration(
+    conf_calib, correct_float, bin_edges)
+bin_accs_uncal, bin_confs_uncal, bin_counts_uncal = compute_calibration(
+    conf_uncalib, correct_float, bin_edges)
+# ECE: count-weighted mean |accuracy - confidence| gap; empty (NaN) bins are
+# skipped by nansum.
+gap_cal = np.abs(bin_accs_cal - bin_confs_cal)
+gap_uncal = np.abs(bin_accs_uncal - bin_confs_uncal)
+ece_cal = np.nansum(gap_cal * bin_counts_cal) / N
+ece_uncal = np.nansum(gap_uncal * bin_counts_uncal) / N
+fig, axes = plt.subplots(1, 2, figsize=(12, 5))
+panels = [
+    ('Calibrated', '#4C72B0', bin_accs_cal, bin_confs_cal, bin_counts_cal, ece_cal),
+    ('Uncalibrated', '#DD8452', bin_accs_uncal, bin_confs_uncal, bin_counts_uncal, ece_uncal),
+]
+for ax, (title_suffix, bar_color, b_accs, b_confs, b_counts, ece_val) in zip(axes, panels):
+    # Perfect calibration line
+    ax.plot([0, 1], [0, 1], 'k--', lw=1.5, alpha=0.5, label='Perfectly calibrated')
+    # Bar chart of bin accuracy (non-empty bins only)
+    valid = ~np.isnan(b_accs)
+    ax.bar(bin_centers[valid], b_accs[valid], width=0.08,
+           alpha=0.7, color=bar_color, edgecolor='black', linewidth=0.5,
+           label=f'Model (ECE={ece_val:.4f})')
+    filled_bins = [j for j in range(n_bins) if valid[j]]
+    # Gap shading between the bin accuracy and the bin mean confidence
+    for j in filled_bins:
+        lo_val, hi_val = sorted((b_accs[j], b_confs[j]))
+        ax.fill_between(
+            [bin_centers[j] - 0.04, bin_centers[j] + 0.04],
+            lo_val, hi_val, alpha=0.15, color='red')
+    # Sample counts on top
+    for j in filled_bins:
+        if b_counts[j] > 0:
+            ax.text(bin_centers[j], b_accs[j] + 0.03,
+                    str(b_counts[j]), ha='center', va='bottom', fontsize=7)
+    ax.set(
+        xlim=[0, 1],
+        ylim=[0, 1.1],
+        xlabel='Mean Predicted Confidence',
+        ylabel='Fraction of Correct Predictions',
+        title=f'Reliability Diagram ({title_suffix})',
+    )
+    ax.legend(loc='upper left', framealpha=0.9)
+    ax.grid(True, alpha=0.3)
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'calibration_reliability.png'))
+plt.close(fig)
+print(f'  Saved calibration_reliability.png  (ECE_cal={ece_cal:.4f}, ECE_uncal={ece_uncal:.4f})')
+# ================================================================
+# 5. CONFIDENCE HISTOGRAMS
+# ================================================================
+print('[5/7] Confidence histograms...')
+fig, axes = plt.subplots(1, 2, figsize=(12, 5))
+outcome_ax, class_ax = axes
+# Left panel: correct vs incorrect predictions
+outcome_groups = (
+    ('Correct', '#2ca02c', correct_mask),
+    ('Incorrect', '#d62728', ~correct_mask),
+)
+for label, color, mask in outcome_groups:
+    outcome_ax.hist(confidences[mask], bins=30, alpha=0.65, color=color,
+                    label=f'{label} (n={mask.sum()})', edgecolor='black', linewidth=0.3)
+outcome_ax.set_xlabel('Prediction Confidence')
+outcome_ax.set_ylabel('Count')
+outcome_ax.set_title('Confidence Distribution: Correct vs Incorrect')
+outcome_ax.legend(loc='upper left', framealpha=0.9)
+# Dashed vertical line at each group's median confidence (kept out of the legend)
+for _, color, mask in outcome_groups:
+    outcome_ax.axvline(x=np.median(confidences[mask]), color=color,
+                       ls='--', alpha=0.6, label='_nolegend_')
+outcome_ax.grid(True, alpha=0.3, axis='y')
+# Right panel: confidence grouped by the true class of each sample
+for cls_idx in range(NUM_CLASSES):
+    in_class = (all_labels == cls_idx)
+    class_ax.hist(confidences[in_class], bins=20, alpha=0.5, color=colors[cls_idx],
+                  label=f'{CLASS_NAMES[cls_idx]} (n={in_class.sum()})',
+                  edgecolor='black', linewidth=0.3)
+class_ax.set_xlabel('Prediction Confidence')
+class_ax.set_ylabel('Count')
+class_ax.set_title('Confidence Distribution by True Class')
+class_ax.legend(loc='upper left', framealpha=0.9, fontsize=9)
+class_ax.grid(True, alpha=0.3, axis='y')
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'confidence_histograms.png'))
+plt.close(fig)
+print('  Saved confidence_histograms.png')
+# ================================================================
+# 6. ERROR ANALYSIS BY SOURCE
+# ================================================================
+print('[6/7] Error analysis by source...')
+sources_unique = sorted(np.unique(all_sources))
+n_sources = len(sources_unique)
+# Build accuracy per (source, class) pair; pairs without samples get NaN
+source_class_acc = {}
+source_class_n = {}
+for src in sources_unique:
+    for cls_idx in range(NUM_CLASSES):
+        mask = (all_sources == src) & (all_labels == cls_idx)
+        n_cls = mask.sum()
+        if n_cls > 0:
+            acc_sc = (preds[mask] == all_labels[mask]).mean()
+        else:
+            acc_sc = np.nan
+        source_class_acc[(src, cls_idx)] = acc_sc
+        source_class_n[(src, cls_idx)] = int(n_cls)
+# Also overall accuracy per source
+source_overall_acc = {}
+for src in sources_unique:
+    mask = (all_sources == src)
+    source_overall_acc[src] = accuracy_score(all_labels[mask], preds[mask])
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+# Left panel: grouped bar chart of per-class accuracy by source
+x = np.arange(NUM_CLASSES)
+bar_width = 0.8 / max(n_sources, 1)
+source_colors = sns.color_palette('Set2', n_sources)
+# One fixed colour per source, shared by both panels.
+source_color_of = dict(zip(sources_unique, source_colors))
+for s_idx, src in enumerate(sources_unique):
+    accs = [source_class_acc[(src, c)] for c in range(NUM_CLASSES)]
+    counts = [source_class_n[(src, c)] for c in range(NUM_CLASSES)]
+    # Centre the group of source bars on each class tick.
+    offset = (s_idx - n_sources / 2 + 0.5) * bar_width
+    bars = axes[0].bar(x + offset, accs, bar_width * 0.9,
+                       label=f'{src} (n={sum(counts)})',
+                       color=source_color_of[src], edgecolor='black', linewidth=0.5)
+    # Annotate sample counts
+    for j, (b, n_val) in enumerate(zip(bars, counts)):
+        if n_val > 0 and not np.isnan(accs[j]):
+            axes[0].text(b.get_x() + b.get_width() / 2, b.get_height() + 0.02,
+                         str(n_val), ha='center', va='bottom', fontsize=7)
+axes[0].set_xticks(x)
+axes[0].set_xticklabels(CLASS_NAMES, rotation=15, ha='right')
+axes[0].set_ylabel('Accuracy')
+axes[0].set_title('Per-Class Accuracy by Data Source')
+axes[0].set_ylim([0, 1.15])
+axes[0].legend(loc='upper right', framealpha=0.9)
+axes[0].grid(True, alpha=0.3, axis='y')
+# Reference line at the overall test accuracy.
+axes[0].axhline(y=acc, color='black', ls='--', alpha=0.4, lw=1)
+axes[0].text(NUM_CLASSES - 0.5, acc + 0.02, f'Overall: {acc:.3f}',
+             ha='right', fontsize=9, alpha=0.6)
+# Right panel: confusion breakdown -- most common misclassifications per source
+error_data = []
+for src in sources_unique:
+    src_mask = (all_sources == src) & (~correct_mask)
+    if src_mask.sum() == 0:
+        continue
+    for true_cls in range(NUM_CLASSES):
+        for pred_cls in range(NUM_CLASSES):
+            if true_cls == pred_cls:
+                continue
+            pair_mask = src_mask & (all_labels == true_cls) & (preds == pred_cls)
+            cnt = pair_mask.sum()
+            if cnt > 0:
+                error_data.append({
+                    'Source': src,
+                    'Error': f'{CLASS_NAMES[true_cls][:3]}>{CLASS_NAMES[pred_cls][:3]}',
+                    'Count': int(cnt),
+                })
+if error_data:
+    err_df = pd.DataFrame(error_data)
+    # Top 10 error types
+    top_errors = (err_df.groupby('Error')['Count'].sum()
+                  .sort_values(ascending=False).head(10).index.tolist())
+    err_df_top = err_df[err_df['Error'].isin(top_errors)]
+    pivot = err_df_top.pivot_table(index='Error', columns='Source',
+                                   values='Count', aggfunc='sum', fill_value=0)
+    # Reorder by total count
+    pivot = pivot.loc[pivot.sum(axis=1).sort_values(ascending=True).index]
+    # The pivot only has columns for sources that appear among the top errors,
+    # so colours are looked up by name; slicing the palette by position would
+    # give a source a different colour than in the left panel.
+    pivot.plot(kind='barh', stacked=True, ax=axes[1],
+               color=[source_color_of[col] for col in pivot.columns],
+               edgecolor='black', linewidth=0.5)
+    axes[1].set_xlabel('Error Count')
+    axes[1].set_title('Top Misclassification Patterns by Source')
+    axes[1].legend(loc='lower right', framealpha=0.9)
+    axes[1].grid(True, alpha=0.3, axis='x')
+else:
+    axes[1].text(0.5, 0.5, 'No errors to display', ha='center', va='center',
+                 transform=axes[1].transAxes, fontsize=14)
+    axes[1].set_title('Top Misclassification Patterns by Source')
+fig.tight_layout()
+fig.savefig(os.path.join(EVAL_DIR, 'error_analysis_by_source.png'))
+plt.close(fig)
+print('  Saved error_analysis_by_source.png')
+# ================================================================
+# 7. METRICS REPORT (JSON)
+# ================================================================
+print('[7/7] Metrics report...')
+# Passing the full label set keeps the report aligned with CLASS_NAMES even
+# when a class is absent from both the labels and the predictions; without it
+# classification_report raises on the target_names length mismatch.
+class_ids = list(range(NUM_CLASSES))
+# Classification report as dict
+cls_report = classification_report(
+    all_labels, preds, labels=class_ids, target_names=CLASS_NAMES,
+    output_dict=True, zero_division=0)
+# Per-class AUC and AP (one-vs-rest on the calibrated probabilities)
+per_class_auc = {}
+per_class_ap = {}
+for i in range(NUM_CLASSES):
+    y_bin = (all_labels == i).astype(int)
+    y_score = probs_calibrated[:, i]
+    fpr_i, tpr_i, _ = roc_curve(y_bin, y_score)
+    per_class_auc[CLASS_NAMES[i]] = float(auc(fpr_i, tpr_i))
+    per_class_ap[CLASS_NAMES[i]] = float(average_precision_score(y_bin, y_score))
+# Build the full report
+try:
+    # labels= maps probability columns to classes even if one is missing from
+    # the test labels, so log-loss is still reported in that case.
+    ll = float(log_loss(all_labels, probs_calibrated, labels=class_ids))
+except ValueError:
+    # Best effort: the report is still written, with a null log-loss.
+    ll = None
+metrics_report = OrderedDict([
+    ('n_test_samples', int(N)),
+    ('overall_accuracy', float(acc)),
+    ('balanced_accuracy', float(balanced_accuracy_score(all_labels, preds))),
+    ('macro_f1', float(f1_score(all_labels, preds, average='macro', zero_division=0))),
+    ('weighted_f1', float(f1_score(all_labels, preds, average='weighted', zero_division=0))),
+    ('cohen_kappa', float(cohen_kappa_score(all_labels, preds))),
+    ('matthews_corrcoef', float(matthews_corrcoef(all_labels, preds))),
+    ('log_loss', ll),
+    ('macro_auc', float(np.mean(list(per_class_auc.values())))),
+    ('ece_calibrated', float(ece_cal)),
+    ('ece_uncalibrated', float(ece_uncal)),
+    ('temperature', float(TEMPERATURE)),
+    ('thresholds', THRESHOLDS),
+    ('per_class_metrics', {}),
+    ('per_class_auc', per_class_auc),
+    ('per_class_ap', per_class_ap),
+    ('confusion_matrix_raw', cm.tolist()),
+    ('confusion_matrix_normalized', np.round(cm_norm, 4).tolist()),
+    ('source_accuracy', {src: float(v) for src, v in source_overall_acc.items()}),
+    ('source_class_counts', {
+        src: {CLASS_NAMES[c]: source_class_n[(src, c)]
+              for c in range(NUM_CLASSES)}
+        for src in sources_unique
+    }),
+    ('class_names', CLASS_NAMES),
+])
+# Per-class from classification_report, merged with the AUC / AP values
+for name in CLASS_NAMES:
+    metrics_report['per_class_metrics'][name] = {
+        'precision': float(cls_report[name]['precision']),
+        'recall': float(cls_report[name]['recall']),
+        'f1-score': float(cls_report[name]['f1-score']),
+        'support': int(cls_report[name]['support']),
+        'auc': per_class_auc[name],
+        'average_precision': per_class_ap[name],
+    }
+report_path = os.path.join(EVAL_DIR, 'metrics_report.json')
+with open(report_path, 'w') as f:
+    json.dump(metrics_report, f, indent=2)
+print('  Saved metrics_report.json')
+# ================================================================
+# SUMMARY
+# ================================================================
+print('\n' + '=' * 65)
+print('  EVALUATION DASHBOARD COMPLETE')
+print('=' * 65)
+# Headline numbers, printed as a fixed-width "label : value" table.
+headline = [
+    ('Overall Accuracy', acc),
+    ('Balanced Accuracy', metrics_report["balanced_accuracy"]),
+    ('Macro F1', metrics_report["macro_f1"]),
+    ('Cohen Kappa', metrics_report["cohen_kappa"]),
+    ('Macro AUC', metrics_report["macro_auc"]),
+    ('ECE (calibrated)', ece_cal),
+    ('ECE (uncalibrated)', ece_uncal),
+]
+for label, value in headline:
+    print(f'  {label:<20s}: {value:.4f}')
+print(f'\n  Per-class AUC:')
+for name, val in per_class_auc.items():
+    print(f'    {name:15s} : {val:.4f}')
+print(f'\n  Source accuracy:')
+for src, val in source_overall_acc.items():
+    print(f'    {src:10s} : {val:.4f}')
+print(f'\n  All outputs in: {EVAL_DIR}/')
+# List every expected artifact with its size on disk, or flag it as missing.
+output_files = [
+    'confusion_matrix.png',
+    'roc_curves_per_class.png',
+    'precision_recall_curves.png',
+    'calibration_reliability.png',
+    'confidence_histograms.png',
+    'error_analysis_by_source.png',
+    'metrics_report.json',
+]
+for fname in output_files:
+    fpath = os.path.join(EVAL_DIR, fname)
+    if os.path.exists(fpath):
+        status = f'{os.path.getsize(fpath) / 1024:.0f} KB'
+    else:
+        status = 'MISSING'
+    print(f'    [{status:>8s}] {fname}')
+print('=' * 65)