retinasense-vit / mc_dropout_uncertainty.py

Add mc_dropout_uncertainty.py

c4d737f verified 3 months ago

31.3 kB

	#!/usr/bin/env python3
	"""
	RetinaSense v3.0 -- MC Dropout Uncertainty Quantification (Phase 1B)
	====================================================================
	Performs Monte Carlo Dropout inference on the test set to decompose
	predictive uncertainty into aleatoric and epistemic components.

	Strategy for efficiency:
	- Run the ViT backbone ONCE per image (deterministic, no dropout in backbone)
	- Cache the 768-dim CLS features
	- Run T=30 stochastic forward passes through the classification heads only
	(where the dropout layers live: self.drop + head dropouts)
	This is 30x faster than running the full model T times.

	For each test image, computes:
	- Predictive entropy (total uncertainty)
	- Expected entropy (aleatoric uncertainty)
	- Mutual information (epistemic uncertainty)
	- Per-class prediction variance

	Generates:
	- uncertainty_vs_accuracy.png
	- rejection_curve.png
	- epistemic_vs_aleatoric.png
	- uncertainty_by_class.png
	- confidence_vs_uncertainty.png
	- mc_dropout_results.json

	Usage:
	python mc_dropout_uncertainty.py
	"""

	import os
	import sys
	import json
	import time
	import warnings
	import numpy as np
	import pandas as pd
	import cv2
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	from PIL import Image
	from tqdm import tqdm

	warnings.filterwarnings('ignore')

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torchvision import transforms
	from torch.utils.data import Dataset, DataLoader

	import timm

	# Fix the size of PyTorch's CPU intra-op thread pool at 4 threads.
	# NOTE(review): 4 is hard-coded -- presumably chosen for the machine this
	# script was written on; confirm it suits the host before relying on it.
	torch.set_num_threads(4)

	# ================================================================
	# CONFIGURATION
	# ================================================================
	# --- Filesystem layout: everything is resolved relative to BASE_DIR. ---
	BASE_DIR = '/teamspace/studios/this_studio'
	_DATA_DIR = os.path.join(BASE_DIR, 'data')
	OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs_v3')
	UNCERT_DIR = os.path.join(OUTPUT_DIR, 'uncertainty')
	# All plots and the results JSON are written here, so create it up front.
	os.makedirs(UNCERT_DIR, exist_ok=True)

	# --- Inputs consumed by this script. ---
	MODEL_PATH = os.path.join(OUTPUT_DIR, 'best_model.pth')
	TEMPERATURE_PATH = os.path.join(OUTPUT_DIR, 'temperature.json')
	NORM_STATS_PATH = os.path.join(_DATA_DIR, 'fundus_norm_stats.json')
	TEST_CSV = os.path.join(_DATA_DIR, 'test_split.csv')

	# --- Model / data constants. ---
	CLASS_NAMES = ['Normal', 'Diabetes/DR', 'Glaucoma', 'Cataract', 'AMD']
	NUM_CLASSES = 5
	IMG_SIZE = 224
	DROPOUT = 0.3

	# --- Inference schedule. ---
	T_FORWARD_PASSES = 30  # stochastic head passes per image
	BATCH_SIZE = 32  # images per batch during backbone feature extraction
	HEAD_BATCH = 512  # cached feature vectors per batch during head-only passes

	# Use the GPU when one is visible, otherwise fall back to the CPU.
	_CUDA_OK = torch.cuda.is_available()
	DEVICE = torch.device('cuda') if _CUDA_OK else torch.device('cpu')

	# --- Start-up banner. ---
	_RULE = '=' * 65
	print(_RULE)
	print(' RetinaSense v3.0 -- MC Dropout Uncertainty Quantification')
	print(_RULE)
	print(f' Device : {DEVICE}')
	if _CUDA_OK:
	    print(f' GPU : {torch.cuda.get_device_name(0)}')
	print(f' MC passes (T) : {T_FORWARD_PASSES}')
	print(f' Output dir : {UNCERT_DIR}')

	# ================================================================
	# LOAD NORMALISATION STATS
	# ================================================================
	# Per-channel normalisation: use the fundus statistics file when it exists,
	# otherwise fall back to the standard ImageNet mean/std.
	if not os.path.exists(NORM_STATS_PATH):
	    NORM_MEAN = [0.485, 0.456, 0.406]
	    NORM_STD = [0.229, 0.224, 0.225]
	    print(' Using ImageNet normalisation fallback')
	else:
	    with open(NORM_STATS_PATH) as f:
	        norm_stats = json.load(f)
	    NORM_MEAN, NORM_STD = norm_stats['mean_rgb'], norm_stats['std_rgb']
	    _mean_4dp = [round(v, 4) for v in NORM_MEAN]
	    _std_4dp = [round(v, 4) for v in NORM_STD]
	    print(f' Fundus norm : mean={_mean_4dp}, std={_std_4dp}')

	# Scalar the disease logits are divided by before softmax.
	# Unlike the normalisation stats there is no fallback: the file must exist.
	with open(TEMPERATURE_PATH) as f:
	    temp_data = json.load(f)
	TEMPERATURE = temp_data['temperature']
	print(f' Temperature : {TEMPERATURE:.4f}')

	# ================================================================
	# MODEL ARCHITECTURE (mirrors retinasense_v3.py / gradcam_v3.py)
	# ================================================================
	class MultiTaskViT(nn.Module):
	    """ViT-Base-Patch16-224 backbone feeding a disease head and a severity head.

	    The backbone is created with ``num_classes=0`` so it returns pooled
	    features; a shared dropout layer sits between the backbone and both
	    MLP heads.
	    """

	    def __init__(self, n_disease=NUM_CLASSES, n_severity=5, drop=DROPOUT):
	        """Build the backbone, the shared dropout and the two heads.

	        Args:
	            n_disease: number of outputs of the disease head.
	            n_severity: number of outputs of the severity head.
	            drop: dropout probability applied to the backbone features.
	        """
	        super().__init__()
	        self.backbone = timm.create_model(
	            'vit_base_patch16_224', pretrained=False, num_classes=0
	        )
	        embed_dim = 768  # CLS token dimension

	        self.drop = nn.Dropout(drop)

	        # Layer order (and therefore Sequential indices) must stay fixed so
	        # that checkpoint keys such as ``disease_head.0.weight`` still match.
	        disease_layers = [
	            nn.Linear(embed_dim, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
	            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2),
	            nn.Linear(256, n_disease),
	        ]
	        self.disease_head = nn.Sequential(*disease_layers)

	        severity_layers = [
	            nn.Linear(embed_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.3),
	            nn.Linear(256, n_severity),
	        ]
	        self.severity_head = nn.Sequential(*severity_layers)

	    def forward(self, x):
	        """Return ``(disease_logits, severity_logits)`` for an image batch."""
	        shared = self.drop(self.backbone(x))  # (B, 768)
	        disease_logits = self.disease_head(shared)
	        severity_logits = self.severity_head(shared)
	        return disease_logits, severity_logits

	    def extract_features(self, x):
	        """Run only the backbone and return its features, shape (B, 768)."""
	        return self.backbone(x)

	    def forward_heads(self, features):
	        """Apply the shared dropout and the disease head to cached features."""
	        return self.disease_head(self.drop(features))


	# ================================================================
	# LOAD MODEL
	# ================================================================
	print('\nLoading model...')
	model = MultiTaskViT().to(DEVICE)
	# NOTE(security): weights_only=False lets torch.load unpickle arbitrary
	# Python objects, so MODEL_PATH must only ever point at a trusted checkpoint.
	ckpt = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
	model.load_state_dict(ckpt['model_state_dict'])
	print(f' Loaded: {MODEL_PATH}')
	# The stored epoch is 0-based; show it 1-based. A checkpoint without a
	# numeric 'epoch' entry prints '?' instead of crashing on `'?' + 1`.
	try:
	    _epoch_label = ckpt.get('epoch') + 1
	except TypeError:
	    _epoch_label = '?'
	print(f' Checkpoint epoch: {_epoch_label} '
	      f'val_acc={ckpt.get("val_acc", 0):.2f}%')


	# ================================================================
	# MC DROPOUT SETUP
	# ================================================================
	def enable_head_dropout(model):
	    """Put ``model`` in eval mode but keep the disease-path dropout stochastic.

	    After the call every module is in eval mode (so BatchNorm uses its
	    running statistics) except ``model.drop`` and the dropout layers found
	    inside ``model.disease_head``, which are switched back to train mode.
	    Dropout layers elsewhere (e.g. in the severity head) stay disabled.
	    """
	    model.eval()

	    # Collect the layers that must remain stochastic, then flip them together.
	    stochastic_layers = [model.drop]
	    stochastic_layers += [
	        layer for layer in model.disease_head.modules()
	        if isinstance(layer, (nn.Dropout, nn.Dropout2d))
	    ]
	    for layer in stochastic_layers:
	        layer.train()


	enable_head_dropout(model)

	# Report how many dropout layers are left stochastic out of all of them.
	_dropout_layers = [
	    layer for layer in model.modules()
	    if isinstance(layer, (nn.Dropout, nn.Dropout2d))
	]
	n_dropout_total = len(_dropout_layers)
	n_dropout_active = sum(1 for layer in _dropout_layers if layer.training)
	print(f'\n MC Dropout enabled in heads: {n_dropout_active} active / {n_dropout_total} total dropout layers')
	print(f' Backbone: deterministic (eval mode) -- single pass per image')
	print(f' Heads: stochastic (train mode dropout) -- {T_FORWARD_PASSES} passes per image')


	# ================================================================
	# PREPROCESSING (matches gradcam_v3.py pipeline)
	# ================================================================
	def ben_graham(path, sz=IMG_SIZE, sigma=10):
	    """Ben Graham high-frequency fundus enhancement (APTOS-style).

	    Args:
	        path: image file to read (OpenCV first, PIL as a fallback).
	        sz: side length of the square output.
	        sigma: Gaussian sigma of the blur subtracted from the image.

	    Returns:
	        An RGB array of size ``sz`` x ``sz`` with everything outside a
	        centred circle of radius ``0.48 * sz`` set to zero.
	    """
	    bgr = cv2.imread(path)
	    if bgr is not None:
	        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
	    else:
	        # OpenCV could not decode the file; PIL already yields RGB order.
	        rgb = np.array(Image.open(path).convert('RGB'))

	    rgb = cv2.resize(rgb, (sz, sz))

	    # 4 * image - 4 * blurred + 128: subtract the local average colour and
	    # re-centre the result around mid-grey.
	    blurred = cv2.GaussianBlur(rgb, (0, 0), sigma)
	    enhanced = cv2.addWeighted(rgb, 4, blurred, -4, 128)

	    # Keep only the circular field of view.
	    circle_mask = np.zeros(enhanced.shape[:2], dtype=np.uint8)
	    centre = (sz // 2, sz // 2)
	    cv2.circle(circle_mask, centre, int(sz * 0.48), 255, -1)
	    return cv2.bitwise_and(enhanced, enhanced, mask=circle_mask)


	def clahe_preprocess(path, sz=IMG_SIZE):
	    """CLAHE-based contrast enhancement (ODIR-style).

	    Reads the image (OpenCV first, PIL as a fallback), resizes it to
	    ``sz`` x ``sz``, applies CLAHE to the first channel of its LAB
	    representation and returns the result as an RGB array.
	    """
	    bgr = cv2.imread(path)
	    if bgr is None:
	        # PIL fallback: convert its RGB output to OpenCV's BGR order.
	        pil_rgb = np.array(Image.open(path).convert('RGB'))
	        bgr = cv2.cvtColor(pil_rgb, cv2.COLOR_RGB2BGR)

	    bgr = cv2.resize(bgr, (sz, sz))

	    lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
	    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	    # Equalise channel 0 only; the other two LAB channels are left as-is.
	    lab[:, :, 0] = equalizer.apply(lab[:, :, 0])

	    bgr_equalised = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
	    return cv2.cvtColor(bgr_equalised, cv2.COLOR_BGR2RGB)


	def resolve_path(image_path):
	    """Return a usable path for ``image_path``.

	    An absolute path that exists on disk is returned unchanged. Otherwise
	    every leading ``./`` is stripped and the remainder is joined onto
	    BASE_DIR.
	    """
	    already_usable = os.path.isabs(image_path) and os.path.exists(image_path)
	    if already_usable:
	        return image_path

	    relative = image_path
	    while relative[:2] == './':
	        relative = relative[2:]
	    return os.path.join(BASE_DIR, relative)


	# ================================================================
	# DATASET
	# ================================================================
	class TestDataset(Dataset):
	    """Test split backed by a CSV; images come from a cache or are preprocessed live.

	    The CSV must provide ``image_path`` and ``disease_label`` columns.
	    ``source`` and ``cache_path`` are optional. Each item is
	    ``(image_tensor, label, image_path)``.
	    """

	    def __init__(self, csv_path):
	        """Read the CSV and build the array -> normalised tensor transform."""
	        self.df = pd.read_csv(csv_path).reset_index(drop=True)
	        self.transform = transforms.Compose([
	            transforms.ToPILImage(),
	            transforms.ToTensor(),
	            transforms.Normalize(NORM_MEAN, NORM_STD),
	        ])

	    def __len__(self):
	        """Number of rows in the CSV."""
	        return len(self.df)

	    def __getitem__(self, idx):
	        """Load sample ``idx``.

	        Order of attempts: the cached ``.npy`` array (when ``cache_path`` is
	        set and the file exists), then live preprocessing of the original
	        image, and finally an all-zero tensor. Every fallback is reported on
	        stderr so that placeholder inputs do not silently skew the metrics.
	        """
	        row = self.df.iloc[idx]
	        img_path = str(row['image_path'])
	        dataset = str(row.get('source', 'auto'))
	        label = int(row['disease_label'])

	        # Try loading from cache first. A missing column yields '' and an
	        # empty CSV cell yields the string 'nan'; both mean "no cache".
	        cache_path = str(row.get('cache_path', ''))
	        if cache_path and cache_path != 'nan':
	            cache_abs = resolve_path(cache_path)
	            if os.path.exists(cache_abs):
	                try:
	                    img_np = np.load(cache_abs)
	                    img_tensor = self.transform(img_np)
	                    return img_tensor, label, img_path
	                except Exception as exc:
	                    # Best effort: an unreadable cache entry is not fatal.
	                    print(f' [warn] cache load failed for {cache_abs} ({exc!r}); '
	                          f'falling back to live preprocessing', file=sys.stderr)

	        # Live preprocessing: APTOS rows use Ben Graham, all others use CLAHE.
	        abs_path = resolve_path(img_path)
	        try:
	            if dataset == 'APTOS':
	                img_np = ben_graham(abs_path)
	            else:
	                img_np = clahe_preprocess(abs_path)
	            img_tensor = self.transform(img_np)
	        except Exception as exc:
	            # Keep the batch shape intact with a blank image, but say so.
	            print(f' [warn] could not load {abs_path} ({exc!r}); '
	                  f'substituting an all-zero image', file=sys.stderr)
	            img_tensor = torch.zeros(3, IMG_SIZE, IMG_SIZE)

	        return img_tensor, label, img_path


	# ================================================================
	# TWO-STAGE MC DROPOUT INFERENCE
	# ================================================================
	def extract_all_features(model, dataloader):
	    """Stage 1: run the backbone once per image and collect its features.

	    Args:
	        model: object exposing ``extract_features(images)``.
	        dataloader: yields ``(images, labels, paths)`` batches.

	    Returns:
	        ``(features, labels, paths)``: a CPU tensor of the concatenated
	        features (N, 768), a NumPy array of labels (N,), and the list of
	        image paths in iteration order.
	    """
	    feature_chunks = []
	    label_list = []
	    path_list = []

	    print(f'\n Stage 1: Extracting backbone features (deterministic)...')
	    with torch.no_grad():
	        for images, labels, paths in tqdm(dataloader, desc=' Features', ncols=80):
	            batch_feats = model.extract_features(images.to(DEVICE))  # (B, 768)
	            # Move results off the device right away so only one batch
	            # lives there at a time.
	            feature_chunks.append(batch_feats.cpu())
	            label_list.extend(labels.numpy().tolist())
	            path_list.extend(paths)

	    stacked = torch.cat(feature_chunks, dim=0)  # (N, 768)
	    return stacked, np.array(label_list), path_list


	def mc_dropout_on_heads(model, features, T=T_FORWARD_PASSES, temperature=TEMPERATURE):
	    """Stage 2: run ``T`` forward passes of the head over cached features.

	    Args:
	        model: object exposing ``forward_heads(features)``; its dropout
	            layers are expected to have been left in train mode already.
	        features: (N, 768) tensor of cached backbone features.
	        T: number of passes.
	        temperature: scalar the logits are divided by before softmax.

	    Returns:
	        float32 NumPy array of shape (N, T, C) holding softmax probabilities.
	    """
	    n_samples = features.size(0)
	    all_probs = np.zeros((n_samples, T, NUM_CLASSES), dtype=np.float32)

	    print(f'\n Stage 2: MC Dropout through heads ({T} passes, {n_samples} samples)...')

	    # Chunk boundaries are the same for every pass, so compute them once.
	    bounds = [
	        (lo, min(lo + HEAD_BATCH, n_samples))
	        for lo in range(0, n_samples, HEAD_BATCH)
	    ]

	    with torch.no_grad():
	        for pass_idx in tqdm(range(T), desc=' MC Passes', ncols=80):
	            # Process in chunks to avoid memory issues
	            for lo, hi in bounds:
	                chunk = features[lo:hi].to(DEVICE)
	                tempered = model.forward_heads(chunk) / temperature
	                chunk_probs = F.softmax(tempered, dim=1)
	                all_probs[lo:hi, pass_idx, :] = chunk_probs.cpu().numpy()

	    return all_probs


	# ================================================================
	# UNCERTAINTY METRICS
	# ================================================================
	def compute_uncertainty_metrics(mc_probs):
	    """Summarise MC-dropout probability samples into uncertainty measures.

	    Args:
	        mc_probs: (N, T, C) array of sampled probability vectors.

	    Returns:
	        dict with
	        - ``p_mean`` (N, C): mean probability over the T passes
	        - ``predicted_class`` (N,): argmax of ``p_mean``
	        - ``max_confidence`` (N,): max of ``p_mean``
	        - ``predictive_entropy`` (N,): entropy of ``p_mean`` (total)
	        - ``expected_entropy`` (N,): mean per-pass entropy (aleatoric)
	        - ``mutual_info`` (N,): total minus aleatoric, floored at 0 (epistemic)
	        - ``class_variance`` (N, C): variance over the T passes
	    """
	    # Unpacking doubles as a check that the input really is 3-D.
	    n_samples, n_passes, n_classes = mc_probs.shape
	    eps = 1e-10  # keeps log() finite when a probability is exactly 0

	    def _entropy(probs, axis):
	        return -np.sum(probs * np.log(probs + eps), axis=axis)

	    p_mean = mc_probs.mean(axis=1)  # (N, C)

	    total = _entropy(p_mean, axis=1)  # (N,)
	    aleatoric = _entropy(mc_probs, axis=2).mean(axis=1)  # (N, T) -> (N,)
	    # The difference is non-negative in theory; clamp numerical undershoot.
	    epistemic = np.maximum(total - aleatoric, 0.0)

	    return {
	        'p_mean': p_mean,
	        'predicted_class': p_mean.argmax(axis=1),
	        'max_confidence': p_mean.max(axis=1),
	        'predictive_entropy': total,
	        'expected_entropy': aleatoric,
	        'mutual_info': epistemic,
	        'class_variance': mc_probs.var(axis=1),
	    }


	# ================================================================
	# PLOTTING FUNCTIONS
	# ================================================================
	def plot_uncertainty_vs_accuracy(metrics, labels, save_path):
	    """Scatter predictive entropy against correctness, coloured by true class.

	    Args:
	        metrics: dict providing ``predicted_class`` and ``predictive_entropy``.
	        labels: array of true class indices, aligned with ``metrics``.
	        save_path: file the figure is written to.

	    Correctness (0 or 1) gets a small random vertical jitter so overlapping
	    points stay visible. A dashed line marks the median entropy and a text
	    box reports the mean entropy of the correct and the wrong predictions.
	    """
	    correct = (metrics['predicted_class'] == labels).astype(int)
	    entropy = metrics['predictive_entropy']

	    fig, ax = plt.subplots(figsize=(10, 7))

	    colors = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
	    for cls_idx in range(NUM_CLASSES):
	        mask = labels == cls_idx
	        jitter = np.random.uniform(-0.08, 0.08, mask.sum())
	        ax.scatter(
	            entropy[mask], correct[mask] + jitter,
	            c=[colors[cls_idx]], alpha=0.5, s=20, label=CLASS_NAMES[cls_idx],
	            edgecolors='none'
	        )

	    ax.set_xlabel('Predictive Entropy (Total Uncertainty)', fontsize=12)
	    ax.set_ylabel('Correctness (1=correct, 0=wrong)', fontsize=12)
	    ax.set_title('MC Dropout: Uncertainty vs Prediction Correctness', fontsize=14)
	    ax.set_yticks([0, 1])
	    ax.set_yticklabels(['Incorrect', 'Correct'])
	    ax.grid(True, alpha=0.3)

	    # Add vertical line at median uncertainty (skipped when there is no data).
	    if len(entropy) > 0:
	        med = np.median(entropy)
	        ax.axvline(med, color='red', linestyle='--', alpha=0.5, label=f'Median H={med:.3f}')

	    # Build the legend only after every labelled artist exists; otherwise
	    # the median line's label never makes it into the legend.
	    ax.legend(title='True Class', fontsize=9, title_fontsize=10)

	    # Summary stats: only mention a group that actually has samples, so an
	    # empty group does not show up as "mean H=nan".
	    correct_ent = entropy[correct == 1]
	    wrong_ent = entropy[correct == 0]
	    summary_lines = []
	    if len(correct_ent) > 0:
	        summary_lines.append(f'Correct: mean H={correct_ent.mean():.3f}')
	    if len(wrong_ent) > 0:
	        summary_lines.append(f'Wrong: mean H={wrong_ent.mean():.3f}')
	    if summary_lines:
	        ax.text(0.98, 0.5, '\n'.join(summary_lines), transform=ax.transAxes,
	                fontsize=9, verticalalignment='center', horizontalalignment='right',
	                bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

	    plt.tight_layout()
	    fig.savefig(save_path, dpi=300, bbox_inches='tight')
	    plt.close(fig)
	    print(f' Saved: {save_path}')


	def plot_rejection_curve(metrics, labels, save_path):
	    """Plot accuracy on the kept samples as the most uncertain ones are rejected.

	    Args:
	        metrics: dict providing ``predicted_class`` and ``predictive_entropy``.
	        labels: array of true class indices, aligned with ``metrics``.
	        save_path: file the figure is written to.

	    Samples are ranked by predictive entropy; for 200 rejection rates
	    between 0% and 95% the highest-entropy fraction is dropped and accuracy
	    is computed on the remainder. A secondary axis shows how many samples
	    remain.
	    """
	    entropy = metrics['predictive_entropy']
	    correct = (metrics['predicted_class'] == labels).astype(int)

	    # Sort by decreasing uncertainty
	    sorted_idx = np.argsort(entropy)[::-1]
	    sorted_correct = correct[sorted_idx]

	    N = len(labels)
	    rejection_fracs = np.linspace(0.0, 0.95, 200)
	    accuracies = []
	    n_remaining = []

	    for frac in rejection_fracs:
	        kept = sorted_correct[int(frac * N):]
	        n_remaining.append(len(kept))
	        accuracies.append(kept.mean() * 100 if len(kept) > 0 else np.nan)

	    accuracies = np.array(accuracies)
	    n_remaining = np.array(n_remaining)

	    fig, ax1 = plt.subplots(figsize=(10, 7))

	    color1 = '#2196F3'
	    ax1.plot(rejection_fracs * 100, accuracies, color=color1, linewidth=2.0,
	             label='Accuracy')
	    ax1.set_xlabel('Rejection Rate (%)', fontsize=12)
	    ax1.set_ylabel('Accuracy (%)', fontsize=12, color=color1)
	    ax1.tick_params(axis='y', labelcolor=color1)

	    # Leave 5 points of headroom under the lowest accuracy, but never go
	    # below 0. (A fixed floor of 50 would push the curve and the baseline
	    # off the plot whenever accuracy is under 50%.) With no finite value at
	    # all, fall back to the full 0-101 range instead of passing NaN on.
	    finite_acc = accuracies[~np.isnan(accuracies)]
	    y_floor = max(0.0, float(finite_acc.min()) - 5) if finite_acc.size else 0.0
	    ax1.set_ylim([y_floor, 101])

	    # Secondary axis: number of remaining samples
	    ax2 = ax1.twinx()
	    color2 = '#FF9800'
	    ax2.plot(rejection_fracs * 100, n_remaining, color=color2, linewidth=1.5,
	             linestyle='--', alpha=0.7, label='Remaining')
	    ax2.set_ylabel('Samples Remaining', fontsize=12, color=color2)
	    ax2.tick_params(axis='y', labelcolor=color2)

	    # Baseline accuracy (no rejection); nothing to draw without samples.
	    if N > 0:
	        base_acc = correct.mean() * 100
	        ax1.axhline(base_acc, color='gray', linestyle=':', alpha=0.5)
	        ax1.text(2, base_acc + 0.5, f'Baseline: {base_acc:.1f}%', fontsize=9, color='gray')

	    ax1.set_title('Rejection Curve: Accuracy vs Uncertainty-Based Rejection', fontsize=14)
	    ax1.grid(True, alpha=0.3)

	    # Combined legend covering both y-axes
	    lines1, labels1 = ax1.get_legend_handles_labels()
	    lines2, labels2 = ax2.get_legend_handles_labels()
	    ax1.legend(lines1 + lines2, labels1 + labels2, loc='lower left', fontsize=10)

	    plt.tight_layout()
	    fig.savefig(save_path, dpi=300, bbox_inches='tight')
	    plt.close(fig)
	    print(f' Saved: {save_path}')


	def plot_epistemic_vs_aleatoric(metrics, labels, save_path):
	    """Scatter expected entropy (x) against mutual information (y) per sample.

	    Points are coloured by true class, misclassified samples are ringed in
	    red, and three corners of the axes carry a short interpretation note.
	    """
	    x_aleatoric = metrics['expected_entropy']
	    y_epistemic = metrics['mutual_info']
	    is_wrong = (metrics['predicted_class'] == labels).astype(int) == 0

	    fig, ax = plt.subplots(figsize=(10, 7))

	    palette = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
	    for cls_idx in range(NUM_CLASSES):
	        members = labels == cls_idx
	        ax.scatter(
	            x_aleatoric[members], y_epistemic[members],
	            c=[palette[cls_idx]], alpha=0.45, s=20, label=CLASS_NAMES[cls_idx],
	            edgecolors='none'
	        )

	    # Ring the misclassified samples on top of the class-coloured points.
	    if is_wrong.sum() > 0:
	        ax.scatter(
	            x_aleatoric[is_wrong], y_epistemic[is_wrong],
	            facecolors='none', edgecolors='red', s=60, linewidths=1.2,
	            label='Misclassified', zorder=5
	        )

	    ax.set_xlabel('Aleatoric Uncertainty (Expected Entropy)', fontsize=12)
	    ax.set_ylabel('Epistemic Uncertainty (Mutual Information)', fontsize=12)
	    ax.set_title('Decomposition of Uncertainty: Epistemic vs Aleatoric', fontsize=14)
	    ax.legend(fontsize=9, title='Class', title_fontsize=10)
	    ax.grid(True, alpha=0.3)

	    # Corner notes, placed 2% / 5% inside the current axis limits.
	    x_lo, x_hi = ax.get_xlim()
	    y_lo, y_hi = ax.get_ylim()
	    x_pad = 0.02 * (x_hi - x_lo)
	    y_pad = 0.05 * (y_hi - y_lo)
	    corner_notes = [
	        (x_lo + x_pad, y_hi - y_pad,
	         'Low aleatoric\nHigh epistemic\n(need more data)',
	         {'va': 'top'}),
	        (x_hi - x_pad, y_hi - y_pad,
	         'High aleatoric\nHigh epistemic\n(hard + unseen)',
	         {'va': 'top', 'ha': 'right'}),
	        (x_hi - x_pad, y_lo + y_pad,
	         'High aleatoric\nLow epistemic\n(inherently noisy)',
	         {'va': 'bottom', 'ha': 'right'}),
	    ]
	    for x_pos, y_pos, note, align in corner_notes:
	        ax.text(x_pos, y_pos, note, fontsize=8, alpha=0.6, **align)

	    plt.tight_layout()
	    fig.savefig(save_path, dpi=300, bbox_inches='tight')
	    plt.close(fig)
	    print(f' Saved: {save_path}')


	def plot_uncertainty_by_class(metrics, labels, save_path):
	    """Draw three side-by-side box plots of uncertainty grouped by true class.

	    The panels show predictive entropy, expected entropy and mutual
	    information; each box is annotated with its sample count.
	    """
	    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

	    panels = (
	        ('predictive_entropy', 'Total Uncertainty (Predictive Entropy)'),
	        ('expected_entropy', 'Aleatoric Uncertainty (Expected Entropy)'),
	        ('mutual_info', 'Epistemic Uncertainty (Mutual Information)'),
	    )
	    # One fill colour per class, shared by all three panels.
	    palette = plt.cm.Set2(np.linspace(0, 1, NUM_CLASSES))
	    outlier_style = dict(marker='o', markersize=3, alpha=0.3)

	    for ax, (metric_key, panel_title) in zip(axes, panels):
	        values = metrics[metric_key]
	        per_class = [values[labels == c] for c in range(NUM_CLASSES)]

	        boxes = ax.boxplot(per_class, labels=CLASS_NAMES, patch_artist=True,
	                           widths=0.6, showfliers=True,
	                           flierprops=outlier_style)

	        for box, fill in zip(boxes['boxes'], palette):
	            box.set_facecolor(fill)
	            box.set_alpha(0.7)

	        ax.set_title(panel_title, fontsize=11)
	        ax.set_ylabel('Uncertainty', fontsize=10)
	        ax.grid(True, axis='y', alpha=0.3)
	        ax.tick_params(axis='x', rotation=15)

	        # Sample counts near the top of the panel (boxplot positions are 1-based).
	        for position, cls_values in enumerate(per_class, start=1):
	            ax.text(position, ax.get_ylim()[1] * 0.95,
	                    f'n={len(cls_values)}', ha='center', fontsize=8, alpha=0.6)

	    plt.suptitle('Uncertainty Distribution by Disease Class', fontsize=14, y=1.02)
	    plt.tight_layout()
	    fig.savefig(save_path, dpi=300, bbox_inches='tight')
	    plt.close(fig)
	    print(f' Saved: {save_path}')


	def plot_confidence_vs_uncertainty(metrics, labels, save_path):
	    """Scatter max mean-probability against predictive entropy.

	    Correct and incorrect predictions are drawn with different markers, the
	    Pearson correlation is shown in the title, and a dashed least-squares
	    line is overlaid.
	    """
	    from scipy import stats

	    confidence = metrics['max_confidence']
	    entropy = metrics['predictive_entropy']
	    hit = (metrics['predicted_class'] == labels).astype(int) == 1
	    miss = (metrics['predicted_class'] == labels).astype(int) == 0

	    fig, ax = plt.subplots(figsize=(10, 7))

	    ax.scatter(
	        confidence[hit], entropy[hit],
	        c='#4CAF50', alpha=0.4, s=15, label='Correct', edgecolors='none'
	    )
	    ax.scatter(
	        confidence[miss], entropy[miss],
	        c='#F44336', alpha=0.6, s=25, label='Incorrect', edgecolors='none',
	        marker='x', linewidths=1.0
	    )

	    # Linear correlation between the two quantities, reported in the title.
	    r, p_val = stats.pearsonr(confidence, entropy)

	    ax.set_xlabel('Maximum Confidence (max p_bar)', fontsize=12)
	    ax.set_ylabel('Predictive Entropy (Total Uncertainty)', fontsize=12)
	    ax.set_title(f'Confidence vs Uncertainty (Pearson r={r:.3f}, p={p_val:.2e})', fontsize=14)
	    ax.legend(fontsize=10)
	    ax.grid(True, alpha=0.3)

	    # First-degree polynomial fit, drawn across the observed confidence range.
	    trend = np.polyfit(confidence, entropy, 1)
	    grid_x = np.linspace(confidence.min(), confidence.max(), 100)
	    ax.plot(grid_x, np.polyval(trend, grid_x), 'k--', alpha=0.4, linewidth=1.5)

	    plt.tight_layout()
	    fig.savefig(save_path, dpi=300, bbox_inches='tight')
	    plt.close(fig)
	    print(f' Saved: {save_path}')


	# ================================================================
	# MAIN
	# ================================================================
	def main():
	    """Run the full pipeline: features, MC passes, metrics, plots and JSON."""
	    t_start = time.time()

	    def _r6(value):
	        # JSON values are stored as plain floats rounded to 6 decimals.
	        return round(float(value), 6)

	    def _describe(values, with_range=True):
	        # mean/std (and optionally min/max) summary of a 1-D array.
	        summary = {'mean': _r6(values.mean()), 'std': _r6(values.std())}
	        if with_range:
	            summary['min'] = _r6(values.min())
	            summary['max'] = _r6(values.max())
	        return summary

	    # ---- 1. Build DataLoader ----
	    print('\nLoading test set...')
	    dataset = TestDataset(TEST_CSV)
	    dataloader = DataLoader(
	        dataset, batch_size=BATCH_SIZE, shuffle=False,
	        num_workers=2, pin_memory=False
	    )
	    print(f' Test samples: {len(dataset)}')

	    # ---- 2. Stage 1: backbone features (one pass per image) ----
	    features, true_labels, image_paths = extract_all_features(model, dataloader)
	    print(f' Features shape: {features.shape}')

	    t_feat = time.time() - t_start
	    print(f' Feature extraction: {t_feat:.1f}s')

	    # ---- 3. Stage 2: MC Dropout on heads only ----
	    mc_probs = mc_dropout_on_heads(
	        model, features, T=T_FORWARD_PASSES, temperature=TEMPERATURE
	    )
	    print(f' MC probs shape: {mc_probs.shape} (N, T, C)')

	    t_mc = time.time() - t_start - t_feat
	    print(f' MC head passes: {t_mc:.1f}s')

	    # ---- 4. Uncertainty metrics ----
	    print('\nComputing uncertainty metrics...')
	    metrics = compute_uncertainty_metrics(mc_probs)
	    pred_entropy = metrics['predictive_entropy']
	    exp_entropy = metrics['expected_entropy']
	    mutual_info = metrics['mutual_info']
	    max_conf = metrics['max_confidence']
	    predicted = metrics['predicted_class']

	    correct = (predicted == true_labels).astype(int)
	    accuracy = correct.mean() * 100
	    print(f'\n --- Summary ---')
	    print(f' Accuracy (MC mean): {accuracy:.2f}%')
	    console_rows = (
	        (' Predictive entropy:', pred_entropy),
	        (' Aleatoric (exp. ent.):', exp_entropy),
	        (' Epistemic (MI):', mutual_info),
	        (' Max confidence:', max_conf),
	    )
	    for heading, values in console_rows:
	        print(f'{heading} mean={values.mean():.4f}, std={values.std():.4f}')

	    # Per-class console table; classes without samples are shown as zeros.
	    print(f'\n Per-class uncertainty (predictive entropy):')
	    for cls_idx in range(NUM_CLASSES):
	        mask = true_labels == cls_idx
	        n_cls = mask.sum()
	        if n_cls > 0:
	            cls_acc = correct[mask].mean() * 100
	            cls_ent = pred_entropy[mask].mean()
	            cls_mi = mutual_info[mask].mean()
	        else:
	            cls_acc = cls_ent = cls_mi = 0
	        print(f' {CLASS_NAMES[cls_idx]:15s}: n={n_cls:4d}, '
	              f'acc={cls_acc:5.1f}%, H={cls_ent:.4f}, MI={cls_mi:.4f}')

	    # ---- 5. Plots ----
	    print('\nGenerating plots...')
	    plot_jobs = (
	        (plot_uncertainty_vs_accuracy, 'uncertainty_vs_accuracy.png'),
	        (plot_rejection_curve, 'rejection_curve.png'),
	        (plot_epistemic_vs_aleatoric, 'epistemic_vs_aleatoric.png'),
	        (plot_uncertainty_by_class, 'uncertainty_by_class.png'),
	        (plot_confidence_vs_uncertainty, 'confidence_vs_uncertainty.png'),
	    )
	    for plot_fn, file_name in plot_jobs:
	        plot_fn(metrics, true_labels, os.path.join(UNCERT_DIR, file_name))

	    # ---- 6. JSON results ----
	    print('\nSaving results JSON...')

	    per_image = []
	    for i, raw_label in enumerate(true_labels):
	        true_idx = int(raw_label)
	        pred_idx = int(predicted[i])
	        per_image.append({
	            'image_path': image_paths[i],
	            'true_label': true_idx,
	            'true_class': CLASS_NAMES[true_idx],
	            'predicted_label': pred_idx,
	            'predicted_class': CLASS_NAMES[pred_idx],
	            'correct': bool(correct[i]),
	            'max_confidence': _r6(max_conf[i]),
	            'predictive_entropy': _r6(pred_entropy[i]),
	            'expected_entropy': _r6(exp_entropy[i]),
	            'mutual_information': _r6(mutual_info[i]),
	            'class_variance': [round(float(v), 8) for v in metrics['class_variance'][i]],
	            'mean_probs': [_r6(v) for v in metrics['p_mean'][i]],
	        })

	    aggregate = {
	        'n_samples': int(len(true_labels)),
	        'n_classes': NUM_CLASSES,
	        'mc_passes': T_FORWARD_PASSES,
	        'temperature': TEMPERATURE,
	        'accuracy_pct': round(float(accuracy), 4),
	        'overall': {
	            'predictive_entropy': _describe(pred_entropy),
	            'expected_entropy': _describe(exp_entropy),
	            'mutual_information': _describe(mutual_info),
	            'max_confidence': _describe(max_conf, with_range=False),
	        },
	        'per_class': {},
	    }

	    # Per-class aggregates; classes absent from the test set are left out.
	    for cls_idx in range(NUM_CLASSES):
	        mask = true_labels == cls_idx
	        n_cls = int(mask.sum())
	        if n_cls == 0:
	            continue
	        aggregate['per_class'][CLASS_NAMES[cls_idx]] = {
	            'n_samples': n_cls,
	            'accuracy': round(float(correct[mask].mean() * 100), 4),
	            'pred_entropy_mean': _r6(pred_entropy[mask].mean()),
	            'pred_entropy_std': _r6(pred_entropy[mask].std()),
	            'aleatoric_mean': _r6(exp_entropy[mask].mean()),
	            'epistemic_mean': _r6(mutual_info[mask].mean()),
	            'confidence_mean': _r6(max_conf[mask].mean()),
	        }

	    # Rejection curve at a few fixed rates: drop the highest-entropy
	    # fraction and measure accuracy on what is kept.
	    by_uncertainty_desc = np.argsort(pred_entropy)[::-1]
	    ranked_correct = correct[by_uncertainty_desc]
	    n_total = len(true_labels)
	    rejection_checkpoints = {}
	    for frac in [0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.50]:
	        kept = ranked_correct[int(frac * n_total):]
	        if len(kept) == 0:
	            continue
	        rejection_checkpoints[f'reject_{int(frac*100)}pct'] = {
	            'accuracy': round(float(kept.mean() * 100), 4),
	            'n_remaining': int(len(kept)),
	        }
	    aggregate['rejection_curve'] = rejection_checkpoints

	    results = {
	        'aggregate': aggregate,
	        'per_image': per_image,
	    }

	    json_path = os.path.join(UNCERT_DIR, 'mc_dropout_results.json')
	    with open(json_path, 'w') as f:
	        json.dump(results, f, indent=2)
	    print(f' Saved: {json_path}')

	    elapsed = time.time() - t_start
	    print(f'\nDone in {elapsed:.1f}s')
	    print('=' * 65)


	if __name__ == '__main__':
	    main()