tanishq74
/

retinasense-vit

+#!/usr/bin/env python3
+"""
+RetinaSense v3.0 -- MC Dropout Uncertainty Quantification (Phase 1B)
+====================================================================
+Performs Monte Carlo Dropout inference on the test set to decompose
+predictive uncertainty into aleatoric and epistemic components.
+Strategy for efficiency:
+  - Run the ViT backbone ONCE per image (deterministic, no dropout in backbone)
+  - Cache the 768-dim CLS features
+  - Run T=30 stochastic forward passes through the classification heads only
+    (where the dropout layers live: self.drop + head dropouts)
+  This is 30x faster than running the full model T times.
+For each test image, computes:
+  - Predictive entropy (total uncertainty)
+  - Expected entropy (aleatoric uncertainty)
+  - Mutual information (epistemic uncertainty)
+  - Per-class prediction variance
+Generates:
+  - uncertainty_vs_accuracy.png
+  - rejection_curve.png
+  - epistemic_vs_aleatoric.png
+  - uncertainty_by_class.png
+  - confidence_vs_uncertainty.png
+  - mc_dropout_results.json
+Usage:
+  python mc_dropout_uncertainty.py
+"""
+import os
+import sys
+import json
+import time
+import warnings
+import numpy as np
+import pandas as pd
+import cv2
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+from PIL import Image
+from tqdm import tqdm
+warnings.filterwarnings('ignore')
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from torch.utils.data import Dataset, DataLoader
+import timm
+# Maximize CPU throughput
+torch.set_num_threads(4)
+# ================================================================
+# CONFIGURATION
+# ================================================================
+BASE_DIR    = '/teamspace/studios/this_studio'
+OUTPUT_DIR  = os.path.join(BASE_DIR, 'outputs_v3')
+UNCERT_DIR  = os.path.join(OUTPUT_DIR, 'uncertainty')
+os.makedirs(UNCERT_DIR, exist_ok=True)
+MODEL_PATH       = os.path.join(OUTPUT_DIR, 'best_model.pth')
+TEMPERATURE_PATH = os.path.join(OUTPUT_DIR, 'temperature.json')
+NORM_STATS_PATH  = os.path.join(BASE_DIR, 'data', 'fundus_norm_stats.json')
+TEST_CSV         = os.path.join(BASE_DIR, 'data', 'test_split.csv')
+CLASS_NAMES = ['Normal', 'Diabetes/DR', 'Glaucoma', 'Cataract', 'AMD']
+NUM_CLASSES = 5
+IMG_SIZE    = 224
+DROPOUT     = 0.3
+T_FORWARD_PASSES = 30   # number of MC stochastic forward passes
+BATCH_SIZE       = 32    # batch size for feature extraction
+HEAD_BATCH       = 512   # batch size for head-only MC passes (very lightweight)
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print('=' * 65)
+print('  RetinaSense v3.0 -- MC Dropout Uncertainty Quantification')
+print('=' * 65)
+print(f'  Device          : {DEVICE}')
+if torch.cuda.is_available():
+    print(f'  GPU             : {torch.cuda.get_device_name(0)}')
+print(f'  MC passes (T)   : {T_FORWARD_PASSES}')
+print(f'  Output dir      : {UNCERT_DIR}')
+# ================================================================
+# LOAD NORMALISATION STATS
+# ================================================================
+if os.path.exists(NORM_STATS_PATH):
+    with open(NORM_STATS_PATH) as f:
+        norm_stats = json.load(f)
+    NORM_MEAN = norm_stats['mean_rgb']
+    NORM_STD  = norm_stats['std_rgb']
+    print(f'  Fundus norm     : mean={[round(v,4) for v in NORM_MEAN]}, '
+          f'std={[round(v,4) for v in NORM_STD]}')
+else:
+    NORM_MEAN = [0.485, 0.456, 0.406]
+    NORM_STD  = [0.229, 0.224, 0.225]
+    print('  Using ImageNet normalisation fallback')
+# Load temperature
+with open(TEMPERATURE_PATH) as f:
+    temp_data = json.load(f)
+TEMPERATURE = temp_data['temperature']
+print(f'  Temperature     : {TEMPERATURE:.4f}')
+# ================================================================
+# MODEL ARCHITECTURE (mirrors retinasense_v3.py / gradcam_v3.py)
+# ================================================================
+class MultiTaskViT(nn.Module):
+    """ViT-Base-Patch16-224 with disease + severity heads."""
+    def __init__(self, n_disease=NUM_CLASSES, n_severity=5, drop=DROPOUT):
+        super().__init__()
+        self.backbone = timm.create_model(
+            'vit_base_patch16_224', pretrained=False, num_classes=0
+        )
+        feat = 768  # CLS token dimension
+        self.drop = nn.Dropout(drop)
+        self.disease_head = nn.Sequential(
+            nn.Linear(feat, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
+            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2),
+            nn.Linear(256, n_disease),
+        )
+        self.severity_head = nn.Sequential(
+            nn.Linear(feat, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.3),
+            nn.Linear(256, n_severity),
+        )
+    def forward(self, x):
+        f = self.backbone(x)   # (B, 768)
+        f = self.drop(f)
+        return self.disease_head(f), self.severity_head(f)
+    def extract_features(self, x):
+        """Run backbone only (deterministic) to get CLS features."""
+        return self.backbone(x)   # (B, 768)
+    def forward_heads(self, features):
+        """Run dropout + disease head on pre-extracted features."""
+        f = self.drop(features)
+        return self.disease_head(f)
+# ================================================================
+# LOAD MODEL
+# ================================================================
+print('\nLoading model...')
+model = MultiTaskViT().to(DEVICE)
+ckpt = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
+model.load_state_dict(ckpt['model_state_dict'])
+print(f'  Loaded: {MODEL_PATH}')
+print(f'  Checkpoint epoch: {ckpt.get("epoch", "?") + 1}  '
+      f'val_acc={ckpt.get("val_acc", 0):.2f}%')
+# ================================================================
+# MC DROPOUT SETUP
+# ================================================================
+def enable_head_dropout(model):
+    """
+    Set model to eval mode, then enable dropout ONLY in the classification
+    heads (self.drop, disease_head dropouts). The backbone stays fully
+    deterministic (eval mode) so we only need one backbone pass per image.
+    BatchNorm layers remain in eval mode (use running stats).
+    """
+    model.eval()  # everything to eval (including backbone)
+    # Enable dropout in the drop layer and disease_head
+    model.drop.train()
+    for m in model.disease_head.modules():
+        if isinstance(m, (nn.Dropout, nn.Dropout2d)):
+            m.train()
+enable_head_dropout(model)
+# Count active dropout layers
+n_dropout_active = 0
+for name, m in model.named_modules():
+    if isinstance(m, (nn.Dropout, nn.Dropout2d)) and m.training:
+        n_dropout_active += 1
+n_dropout_total = sum(1 for m in model.modules() if isinstance(m, (nn.Dropout, nn.Dropout2d)))
+print(f'\n  MC Dropout enabled in heads: {n_dropout_active} active / {n_dropout_total} total dropout layers')
+print(f'  Backbone: deterministic (eval mode) -- single pass per image')
+print(f'  Heads: stochastic (train mode dropout) -- {T_FORWARD_PASSES} passes per image')
+# ================================================================
+# PREPROCESSING (matches gradcam_v3.py pipeline)
+# ================================================================
+def ben_graham(path, sz=IMG_SIZE, sigma=10):
+    """Ben Graham high-frequency fundus enhancement (APTOS-style)."""
+    img = cv2.imread(path)
+    if img is None:
+        img = np.array(Image.open(path).convert('RGB'))
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    img = cv2.resize(img, (sz, sz))
+    img = cv2.addWeighted(img, 4, cv2.GaussianBlur(img, (0, 0), sigma), -4, 128)
+    mask = np.zeros(img.shape[:2], dtype=np.uint8)
+    cv2.circle(mask, (sz // 2, sz // 2), int(sz * 0.48), 255, -1)
+    return cv2.bitwise_and(img, img, mask=mask)
+def clahe_preprocess(path, sz=IMG_SIZE):
+    """CLAHE-based contrast enhancement (ODIR-style)."""
+    img = cv2.imread(path)
+    if img is None:
+        img = np.array(Image.open(path).convert('RGB'))
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+    img = cv2.resize(img, (sz, sz))
+    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
+    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+    lab[:, :, 0] = clahe.apply(lab[:, :, 0])
+    img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
+    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+def resolve_path(image_path):
+    """Resolve image path relative to BASE_DIR."""
+    if os.path.isabs(image_path) and os.path.exists(image_path):
+        return image_path
+    clean = image_path
+    while clean.startswith('./'):
+        clean = clean[2:]
+    return os.path.join(BASE_DIR, clean)
+# ================================================================
+# DATASET
+# ================================================================
+class TestDataset(Dataset):
+    """Test dataset loading preprocessed images from cache or live."""
+    def __init__(self, csv_path):
+        self.df = pd.read_csv(csv_path).reset_index(drop=True)
+        self.transform = transforms.Compose([
+            transforms.ToPILImage(),
+            transforms.ToTensor(),
+            transforms.Normalize(NORM_MEAN, NORM_STD),
+        ])
+    def __len__(self):
+        return len(self.df)
+    def __getitem__(self, idx):
+        row = self.df.iloc[idx]
+        img_path = str(row['image_path'])
+        dataset  = str(row.get('source', 'auto'))
+        label    = int(row['disease_label'])
+        # Try loading from cache first
+        cache_path = str(row.get('cache_path', ''))
+        if cache_path and cache_path != 'nan':
+            cache_abs = resolve_path(cache_path)
+            if os.path.exists(cache_abs):
+                try:
+                    img_np = np.load(cache_abs)
+                    img_tensor = self.transform(img_np)
+                    return img_tensor, label, img_path
+                except Exception:
+                    pass
+        # Live preprocessing
+        abs_path = resolve_path(img_path)
+        try:
+            if dataset == 'APTOS':
+                img_np = ben_graham(abs_path)
+            else:
+                img_np = clahe_preprocess(abs_path)
+            img_tensor = self.transform(img_np)
+        except Exception:
+            img_tensor = torch.zeros(3, IMG_SIZE, IMG_SIZE)
+        return img_tensor, label, img_path
+# ================================================================
+# TWO-STAGE MC DROPOUT INFERENCE
+# ================================================================
+def extract_all_features(model, dataloader):
+    """
+    Stage 1: Run backbone once per image to get CLS features (deterministic).
+    Returns features (N, 768), labels (N,), paths list.
+    """
+    all_features = []
+    all_labels   = []
+    all_paths    = []
+    print(f'\n  Stage 1: Extracting backbone features (deterministic)...')
+    with torch.no_grad():
+        for images, labels, paths in tqdm(dataloader, desc='  Features', ncols=80):
+            images = images.to(DEVICE)
+            feats = model.extract_features(images)  # (B, 768)
+            all_features.append(feats.cpu())
+            all_labels.extend(labels.numpy().tolist())
+            all_paths.extend(paths)
+    all_features = torch.cat(all_features, dim=0)  # (N, 768)
+    all_labels   = np.array(all_labels)
+    return all_features, all_labels, all_paths
+def mc_dropout_on_heads(model, features, T=T_FORWARD_PASSES, temperature=TEMPERATURE):
+    """
+    Stage 2: Run T stochastic forward passes through heads only.
+    features: (N, 768) tensor
+    Returns: (N, T, C) numpy array of probability vectors.
+    """
+    N = features.size(0)
+    all_probs = np.zeros((N, T, NUM_CLASSES), dtype=np.float32)
+    print(f'\n  Stage 2: MC Dropout through heads ({T} passes, {N} samples)...')
+    with torch.no_grad():
+        for t in tqdm(range(T), desc='  MC Passes', ncols=80):
+            # Process in chunks to avoid memory issues
+            for start in range(0, N, HEAD_BATCH):
+                end = min(start + HEAD_BATCH, N)
+                feat_batch = features[start:end].to(DEVICE)
+                logits = model.forward_heads(feat_batch)
+                scaled = logits / temperature
+                probs = F.softmax(scaled, dim=1)
+                all_probs[start:end, t, :] = probs.cpu().numpy()
+    return all_probs
+# ================================================================
+# UNCERTAINTY METRICS
+# ================================================================
+def compute_uncertainty_metrics(mc_probs):
+    """
+    Compute uncertainty metrics from MC dropout probability samples.
+    Args:
+        mc_probs: (N, T, C) array of MC sampled probability vectors
+    Returns dict with:
+      - p_mean, predicted_class, max_confidence
+      - predictive_entropy (total), expected_entropy (aleatoric),
+        mutual_info (epistemic), class_variance
+    """
+    N, T, C = mc_probs.shape
+    eps = 1e-10
+    # Predictive mean: average over T passes
+    p_mean = mc_probs.mean(axis=1)                     # (N, C)
+    predicted_class = p_mean.argmax(axis=1)             # (N,)
+    max_confidence  = p_mean.max(axis=1)                # (N,)
+    # Predictive entropy: H[p_bar] = -sum(p_bar * log(p_bar))  -- TOTAL uncertainty
+    predictive_entropy = -np.sum(p_mean * np.log(p_mean + eps), axis=1)  # (N,)
+    # Per-pass entropies
+    per_pass_entropy = -np.sum(mc_probs * np.log(mc_probs + eps), axis=2)  # (N, T)
+    # Expected entropy: E_t[H[p_t]]  -- ALEATORIC uncertainty
+    expected_entropy = per_pass_entropy.mean(axis=1)    # (N,)
+    # Mutual information: H - E[H]  -- EPISTEMIC uncertainty
+    mutual_info = predictive_entropy - expected_entropy
+    mutual_info = np.maximum(mutual_info, 0.0)
+    # Prediction variance per class
+    class_variance = mc_probs.var(axis=1)               # (N, C)
+    return {
+        'p_mean':             p_mean,
+        'predicted_class':    predicted_class,
+        'max_confidence':     max_confidence,
+        'predictive_entropy': predictive_entropy,
+        'expected_entropy':   expected_entropy,
+        'mutual_info':        mutual_info,
+        'class_variance':     class_variance,
+    }
+# ================================================================
+# PLOTTING FUNCTIONS
+# ================================================================
+def plot_uncertainty_vs_accuracy(metrics, labels, save_path):
+    """Scatter: total uncertainty vs correctness, colored by class."""
+    correct = (metrics['predicted_class'] == labels).astype(int)
+    entropy = metrics['predictive_entropy']
+    fig, ax = plt.subplots(figsize=(10, 7))
+    colors = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
+    for cls_idx in range(NUM_CLASSES):
+        mask = labels == cls_idx
+        ax.scatter(
+            entropy[mask], correct[mask] + np.random.uniform(-0.08, 0.08, mask.sum()),
+            c=[colors[cls_idx]], alpha=0.5, s=20, label=CLASS_NAMES[cls_idx],
+            edgecolors='none'
+        )
+    ax.set_xlabel('Predictive Entropy (Total Uncertainty)', fontsize=12)
+    ax.set_ylabel('Correctness (1=correct, 0=wrong)', fontsize=12)
+    ax.set_title('MC Dropout: Uncertainty vs Prediction Correctness', fontsize=14)
+    ax.set_yticks([0, 1])
+    ax.set_yticklabels(['Incorrect', 'Correct'])
+    ax.legend(title='True Class', fontsize=9, title_fontsize=10)
+    ax.grid(True, alpha=0.3)
+    # Add vertical line at median uncertainty
+    med = np.median(entropy)
+    ax.axvline(med, color='red', linestyle='--', alpha=0.5, label=f'Median H={med:.3f}')
+    # Summary stats
+    correct_ent = entropy[correct == 1]
+    wrong_ent   = entropy[correct == 0]
+    textstr = (f'Correct: mean H={correct_ent.mean():.3f}\n'
+               f'Wrong:   mean H={wrong_ent.mean():.3f}' if len(wrong_ent) > 0
+               else f'Correct: mean H={correct_ent.mean():.3f}')
+    ax.text(0.98, 0.5, textstr, transform=ax.transAxes,
+            fontsize=9, verticalalignment='center', horizontalalignment='right',
+            bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))
+    plt.tight_layout()
+    fig.savefig(save_path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+    print(f'  Saved: {save_path}')
+def plot_rejection_curve(metrics, labels, save_path):
+    """Accuracy as a function of rejection threshold on uncertainty."""
+    entropy = metrics['predictive_entropy']
+    correct = (metrics['predicted_class'] == labels).astype(int)
+    # Sort by decreasing uncertainty
+    sorted_idx = np.argsort(entropy)[::-1]
+    sorted_correct = correct[sorted_idx]
+    N = len(labels)
+    rejection_fracs = np.linspace(0.0, 0.95, 200)
+    accuracies  = []
+    n_remaining = []
+    for frac in rejection_fracs:
+        n_reject = int(frac * N)
+        kept = sorted_correct[n_reject:]
+        if len(kept) == 0:
+            accuracies.append(np.nan)
+            n_remaining.append(0)
+        else:
+            accuracies.append(kept.mean() * 100)
+            n_remaining.append(len(kept))
+    accuracies  = np.array(accuracies)
+    n_remaining = np.array(n_remaining)
+    fig, ax1 = plt.subplots(figsize=(10, 7))
+    color1 = '#2196F3'
+    ax1.plot(rejection_fracs * 100, accuracies, color=color1, linewidth=2.0,
+             label='Accuracy')
+    ax1.set_xlabel('Rejection Rate (%)', fontsize=12)
+    ax1.set_ylabel('Accuracy (%)', fontsize=12, color=color1)
+    ax1.tick_params(axis='y', labelcolor=color1)
+    ax1.set_ylim([max(50, np.nanmin(accuracies) - 5), 101])
+    # Secondary axis: number of remaining samples
+    ax2 = ax1.twinx()
+    color2 = '#FF9800'
+    ax2.plot(rejection_fracs * 100, n_remaining, color=color2, linewidth=1.5,
+             linestyle='--', alpha=0.7, label='Remaining')
+    ax2.set_ylabel('Samples Remaining', fontsize=12, color=color2)
+    ax2.tick_params(axis='y', labelcolor=color2)
+    # Baseline accuracy (no rejection)
+    base_acc = correct.mean() * 100
+    ax1.axhline(base_acc, color='gray', linestyle=':', alpha=0.5)
+    ax1.text(2, base_acc + 0.5, f'Baseline: {base_acc:.1f}%', fontsize=9, color='gray')
+    ax1.set_title('Rejection Curve: Accuracy vs Uncertainty-Based Rejection', fontsize=14)
+    ax1.grid(True, alpha=0.3)
+    # Combined legend
+    lines1, labels1 = ax1.get_legend_handles_labels()
+    lines2, labels2 = ax2.get_legend_handles_labels()
+    ax1.legend(lines1 + lines2, labels1 + labels2, loc='lower left', fontsize=10)
+    plt.tight_layout()
+    fig.savefig(save_path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+    print(f'  Saved: {save_path}')
+def plot_epistemic_vs_aleatoric(metrics, labels, save_path):
+    """Scatter separating epistemic and aleatoric uncertainty."""
+    aleatoric = metrics['expected_entropy']
+    epistemic = metrics['mutual_info']
+    correct   = (metrics['predicted_class'] == labels).astype(int)
+    fig, ax = plt.subplots(figsize=(10, 7))
+    colors = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
+    for cls_idx in range(NUM_CLASSES):
+        mask = labels == cls_idx
+        ax.scatter(
+            aleatoric[mask], epistemic[mask],
+            c=[colors[cls_idx]], alpha=0.45, s=20, label=CLASS_NAMES[cls_idx],
+            edgecolors='none'
+        )
+    # Mark misclassified samples
+    wrong_mask = correct == 0
+    if wrong_mask.sum() > 0:
+        ax.scatter(
+            aleatoric[wrong_mask], epistemic[wrong_mask],
+            facecolors='none', edgecolors='red', s=60, linewidths=1.2,
+            label='Misclassified', zorder=5
+        )
+    ax.set_xlabel('Aleatoric Uncertainty (Expected Entropy)', fontsize=12)
+    ax.set_ylabel('Epistemic Uncertainty (Mutual Information)', fontsize=12)
+    ax.set_title('Decomposition of Uncertainty: Epistemic vs Aleatoric', fontsize=14)
+    ax.legend(fontsize=9, title='Class', title_fontsize=10)
+    ax.grid(True, alpha=0.3)
+    # Annotate quadrants
+    xlim = ax.get_xlim()
+    ylim = ax.get_ylim()
+    ax.text(xlim[0] + 0.02 * (xlim[1] - xlim[0]),
+            ylim[1] - 0.05 * (ylim[1] - ylim[0]),
+            'Low aleatoric\nHigh epistemic\n(need more data)',
+            fontsize=8, alpha=0.6, va='top')
+    ax.text(xlim[1] - 0.02 * (xlim[1] - xlim[0]),
+            ylim[1] - 0.05 * (ylim[1] - ylim[0]),
+            'High aleatoric\nHigh epistemic\n(hard + unseen)',
+            fontsize=8, alpha=0.6, va='top', ha='right')
+    ax.text(xlim[1] - 0.02 * (xlim[1] - xlim[0]),
+            ylim[0] + 0.05 * (ylim[1] - ylim[0]),
+            'High aleatoric\nLow epistemic\n(inherently noisy)',
+            fontsize=8, alpha=0.6, va='bottom', ha='right')
+    plt.tight_layout()
+    fig.savefig(save_path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+    print(f'  Saved: {save_path}')
+def plot_uncertainty_by_class(metrics, labels, save_path):
+    """Box plots of uncertainty per class."""
+    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
+    data_types = [
+        ('predictive_entropy', 'Total Uncertainty (Predictive Entropy)'),
+        ('expected_entropy',   'Aleatoric Uncertainty (Expected Entropy)'),
+        ('mutual_info',        'Epistemic Uncertainty (Mutual Information)'),
+    ]
+    for ax, (key, title) in zip(axes, data_types):
+        data = metrics[key]
+        box_data = [data[labels == c] for c in range(NUM_CLASSES)]
+        bp = ax.boxplot(box_data, labels=CLASS_NAMES, patch_artist=True,
+                        widths=0.6, showfliers=True,
+                        flierprops=dict(marker='o', markersize=3, alpha=0.3))
+        colors = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
+        for patch, color in zip(bp['boxes'], colors):
+            patch.set_facecolor(color)
+            patch.set_alpha(0.7)
+        ax.set_title(title, fontsize=11)
+        ax.set_ylabel('Uncertainty', fontsize=10)
+        ax.grid(True, axis='y', alpha=0.3)
+        ax.tick_params(axis='x', rotation=15)
+        # Add sample counts
+        for i, cls_data in enumerate(box_data):
+            ax.text(i + 1, ax.get_ylim()[1] * 0.95,
+                    f'n={len(cls_data)}', ha='center', fontsize=8, alpha=0.6)
+    plt.suptitle('Uncertainty Distribution by Disease Class', fontsize=14, y=1.02)
+    plt.tight_layout()
+    fig.savefig(save_path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+    print(f'  Saved: {save_path}')
+def plot_confidence_vs_uncertainty(metrics, labels, save_path):
+    """Scatter showing confidence vs uncertainty (should be anti-correlated)."""
+    confidence = metrics['max_confidence']
+    entropy    = metrics['predictive_entropy']
+    correct    = (metrics['predicted_class'] == labels).astype(int)
+    fig, ax = plt.subplots(figsize=(10, 7))
+    scatter_correct = ax.scatter(
+        confidence[correct == 1], entropy[correct == 1],
+        c='#4CAF50', alpha=0.4, s=15, label='Correct', edgecolors='none'
+    )
+    scatter_wrong = ax.scatter(
+        confidence[correct == 0], entropy[correct == 0],
+        c='#F44336', alpha=0.6, s=25, label='Incorrect', edgecolors='none',
+        marker='x', linewidths=1.0
+    )
+    # Compute correlation
+    from scipy import stats
+    r, p_val = stats.pearsonr(confidence, entropy)
+    ax.set_xlabel('Maximum Confidence (max p_bar)', fontsize=12)
+    ax.set_ylabel('Predictive Entropy (Total Uncertainty)', fontsize=12)
+    ax.set_title(f'Confidence vs Uncertainty (Pearson r={r:.3f}, p={p_val:.2e})', fontsize=14)
+    ax.legend(fontsize=10)
+    ax.grid(True, alpha=0.3)
+    # Add trend line
+    z = np.polyfit(confidence, entropy, 1)
+    x_line = np.linspace(confidence.min(), confidence.max(), 100)
+    ax.plot(x_line, np.polyval(z, x_line), 'k--', alpha=0.4, linewidth=1.5)
+    plt.tight_layout()
+    fig.savefig(save_path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+    print(f'  Saved: {save_path}')
+# ================================================================
+# MAIN
+# ================================================================
+def main():
+    t_start = time.time()
+    # ---- 1. Build DataLoader ----
+    print('\nLoading test set...')
+    dataset = TestDataset(TEST_CSV)
+    dataloader = DataLoader(
+        dataset, batch_size=BATCH_SIZE, shuffle=False,
+        num_workers=2, pin_memory=False
+    )
+    print(f'  Test samples: {len(dataset)}')
+    # ---- 2. Stage 1: Extract backbone features (single deterministic pass) ----
+    features, true_labels, image_paths = extract_all_features(model, dataloader)
+    print(f'  Features shape: {features.shape}')
+    t_feat = time.time() - t_start
+    print(f'  Feature extraction: {t_feat:.1f}s')
+    # ---- 3. Stage 2: MC Dropout on heads only ----
+    mc_probs = mc_dropout_on_heads(
+        model, features, T=T_FORWARD_PASSES, temperature=TEMPERATURE
+    )
+    print(f'  MC probs shape: {mc_probs.shape}  (N, T, C)')
+    t_mc = time.time() - t_start - t_feat
+    print(f'  MC head passes: {t_mc:.1f}s')
+    # ---- 4. Compute Uncertainty Metrics ----
+    print('\nComputing uncertainty metrics...')
+    metrics = compute_uncertainty_metrics(mc_probs)
+    # Print summary statistics
+    correct = (metrics['predicted_class'] == true_labels).astype(int)
+    accuracy = correct.mean() * 100
+    print(f'\n  --- Summary ---')
+    print(f'  Accuracy (MC mean):     {accuracy:.2f}%')
+    print(f'  Predictive entropy:     mean={metrics["predictive_entropy"].mean():.4f}, '
+          f'std={metrics["predictive_entropy"].std():.4f}')
+    print(f'  Aleatoric (exp. ent.):  mean={metrics["expected_entropy"].mean():.4f}, '
+          f'std={metrics["expected_entropy"].std():.4f}')
+    print(f'  Epistemic (MI):         mean={metrics["mutual_info"].mean():.4f}, '
+          f'std={metrics["mutual_info"].std():.4f}')
+    print(f'  Max confidence:         mean={metrics["max_confidence"].mean():.4f}, '
+          f'std={metrics["max_confidence"].std():.4f}')
+    # Per-class stats
+    print(f'\n  Per-class uncertainty (predictive entropy):')
+    for cls_idx in range(NUM_CLASSES):
+        mask = true_labels == cls_idx
+        n_cls = mask.sum()
+        cls_acc = correct[mask].mean() * 100 if n_cls > 0 else 0
+        cls_ent = metrics['predictive_entropy'][mask].mean() if n_cls > 0 else 0
+        cls_mi  = metrics['mutual_info'][mask].mean() if n_cls > 0 else 0
+        print(f'    {CLASS_NAMES[cls_idx]:15s}: n={n_cls:4d}, '
+              f'acc={cls_acc:5.1f}%, H={cls_ent:.4f}, MI={cls_mi:.4f}')
+    # ---- 5. Generate Plots ----
+    print('\nGenerating plots...')
+    plot_uncertainty_vs_accuracy(
+        metrics, true_labels,
+        os.path.join(UNCERT_DIR, 'uncertainty_vs_accuracy.png')
+    )
+    plot_rejection_curve(
+        metrics, true_labels,
+        os.path.join(UNCERT_DIR, 'rejection_curve.png')
+    )
+    plot_epistemic_vs_aleatoric(
+        metrics, true_labels,
+        os.path.join(UNCERT_DIR, 'epistemic_vs_aleatoric.png')
+    )
+    plot_uncertainty_by_class(
+        metrics, true_labels,
+        os.path.join(UNCERT_DIR, 'uncertainty_by_class.png')
+    )
+    plot_confidence_vs_uncertainty(
+        metrics, true_labels,
+        os.path.join(UNCERT_DIR, 'confidence_vs_uncertainty.png')
+    )
+    # ---- 6. Save JSON Results ----
+    print('\nSaving results JSON...')
+    per_image = []
+    for i in range(len(true_labels)):
+        per_image.append({
+            'image_path':         image_paths[i],
+            'true_label':         int(true_labels[i]),
+            'true_class':         CLASS_NAMES[int(true_labels[i])],
+            'predicted_label':    int(metrics['predicted_class'][i]),
+            'predicted_class':    CLASS_NAMES[int(metrics['predicted_class'][i])],
+            'correct':            bool(correct[i]),
+            'max_confidence':     round(float(metrics['max_confidence'][i]), 6),
+            'predictive_entropy': round(float(metrics['predictive_entropy'][i]), 6),
+            'expected_entropy':   round(float(metrics['expected_entropy'][i]), 6),
+            'mutual_information': round(float(metrics['mutual_info'][i]), 6),
+            'class_variance':     [round(float(v), 8) for v in metrics['class_variance'][i]],
+            'mean_probs':         [round(float(v), 6) for v in metrics['p_mean'][i]],
+        })
+    aggregate = {
+        'n_samples':     int(len(true_labels)),
+        'n_classes':     NUM_CLASSES,
+        'mc_passes':     T_FORWARD_PASSES,
+        'temperature':   TEMPERATURE,
+        'accuracy_pct':  round(float(accuracy), 4),
+        'overall': {
+            'predictive_entropy': {
+                'mean': round(float(metrics['predictive_entropy'].mean()), 6),
+                'std':  round(float(metrics['predictive_entropy'].std()), 6),
+                'min':  round(float(metrics['predictive_entropy'].min()), 6),
+                'max':  round(float(metrics['predictive_entropy'].max()), 6),
+            },
+            'expected_entropy': {
+                'mean': round(float(metrics['expected_entropy'].mean()), 6),
+                'std':  round(float(metrics['expected_entropy'].std()), 6),
+                'min':  round(float(metrics['expected_entropy'].min()), 6),
+                'max':  round(float(metrics['expected_entropy'].max()), 6),
+            },
+            'mutual_information': {
+                'mean': round(float(metrics['mutual_info'].mean()), 6),
+                'std':  round(float(metrics['mutual_info'].std()), 6),
+                'min':  round(float(metrics['mutual_info'].min()), 6),
+                'max':  round(float(metrics['mutual_info'].max()), 6),
+            },
+            'max_confidence': {
+                'mean': round(float(metrics['max_confidence'].mean()), 6),
+                'std':  round(float(metrics['max_confidence'].std()), 6),
+            },
+        },
+        'per_class': {},
+    }
+    for cls_idx in range(NUM_CLASSES):
+        mask = true_labels == cls_idx
+        n_cls = int(mask.sum())
+        if n_cls == 0:
+            continue
+        aggregate['per_class'][CLASS_NAMES[cls_idx]] = {
+            'n_samples':  n_cls,
+            'accuracy':   round(float(correct[mask].mean() * 100), 4),
+            'pred_entropy_mean': round(float(metrics['predictive_entropy'][mask].mean()), 6),
+            'pred_entropy_std':  round(float(metrics['predictive_entropy'][mask].std()), 6),
+            'aleatoric_mean':    round(float(metrics['expected_entropy'][mask].mean()), 6),
+            'epistemic_mean':    round(float(metrics['mutual_info'][mask].mean()), 6),
+            'confidence_mean':   round(float(metrics['max_confidence'][mask].mean()), 6),
+        }
+    # Rejection curve data at key thresholds
+    entropy = metrics['predictive_entropy']
+    sorted_idx = np.argsort(entropy)[::-1]
+    sorted_correct = correct[sorted_idx]
+    rejection_checkpoints = {}
+    for frac in [0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.50]:
+        n_reject = int(frac * len(true_labels))
+        kept = sorted_correct[n_reject:]
+        if len(kept) > 0:
+            rejection_checkpoints[f'reject_{int(frac*100)}pct'] = {
+                'accuracy': round(float(kept.mean() * 100), 4),
+                'n_remaining': int(len(kept)),
+            }
+    aggregate['rejection_curve'] = rejection_checkpoints
+    results = {
+        'aggregate':  aggregate,
+        'per_image':  per_image,
+    }
+    json_path = os.path.join(UNCERT_DIR, 'mc_dropout_results.json')
+    with open(json_path, 'w') as f:
+        json.dump(results, f, indent=2)
+    print(f'  Saved: {json_path}')
+    elapsed = time.time() - t_start
+    print(f'\nDone in {elapsed:.1f}s')
+    print('=' * 65)
+if __name__ == '__main__':
+    main()