#!/usr/bin/env python3
"""
RetinaSense v3 — Domain-Conditional Preprocessing Pipeline
===========================================================
Implements source-aware preprocessing:
  - APTOS   -> Ben Graham enhancement (high contrast DR-specific pipeline)
  - ODIR    -> CLAHE only (preserves sharpness, normalizes contrast)
  - REFUGE2 -> Resize only (images already clinical-grade high quality)

Image path resolution:
  - ODIR:  odir/preprocessed_images/<filename>
  - APTOS: aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id_code>.png
           (looked up from aptos/train.csv; aptos/train_images/ does NOT exist)

Cache format: ./preprocessed_cache_v3/<stem>_v3.npy
"""

import os
import sys
import json
import warnings
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd
import cv2
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tqdm import tqdm

warnings.filterwarnings('ignore')

# =========================================================
# PATHS
# =========================================================
BASE_DIR = '/teamspace/studios/this_studio'
CSV_PATH = os.path.join(BASE_DIR, 'data', 'combined_dataset.csv')
CACHE_DIR = os.path.join(BASE_DIR, 'preprocessed_cache_v3')
DATA_DIR = os.path.join(BASE_DIR, 'data')
ODIR_IMG_DIR = os.path.join(BASE_DIR, 'odir', 'preprocessed_images')
APTOS_CSV = os.path.join(BASE_DIR, 'aptos', 'train.csv')
APTOS_IMG_BASE = os.path.join(BASE_DIR, 'aptos', 'gaussian_filtered_images',
                              'gaussian_filtered_images')
APTOS_DIAG_MAP = {0: 'No_DR', 1: 'Mild', 2: 'Moderate', 3: 'Severe',
                  4: 'Proliferate_DR'}
ODIR_SAMPLE = os.path.join(BASE_DIR, 'ocular-disease-recognition-odir5k',
                           'preprocessed_images', '2977_left.jpg')

os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

TARGET_SIZE = 224

# Single source of truth for disease_label -> display name.
CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
               3: 'Cataract', 4: 'AMD'}

# Folder-name search targets -> the canonical dataset they indicate.
# Used by main() to decide which external datasets are still missing.
_TARGET_TO_DATASET = {
    'refuge2': 'REFUGE2',
    'refuge': 'REFUGE2',
    'ichallenge': 'ICHALLENGE-AMD',
    'rim-one': 'RIM-ONE',
    'rimone': 'RIM-ONE',
}


# =========================================================
# APTOS PATH LOOKUP TABLE
# Built once at module load; maps id_code (stem) -> abs path
# =========================================================
def _build_aptos_lookup() -> dict:
    """Return dict mapping aptos id_code -> absolute image path.

    Reads ``APTOS_CSV`` (columns ``id_code`` and ``diagnosis``). Each image
    is expected under ``APTOS_IMG_BASE/<class folder>/<id_code>.png`` where
    the class folder comes from ``APTOS_DIAG_MAP`` (unknown grades fall back
    to ``'No_DR'``). Returns an empty dict when the CSV does not exist.
    """
    if not os.path.exists(APTOS_CSV):
        return {}
    df = pd.read_csv(APTOS_CSV)
    lookup = {}
    for id_code, diagnosis in zip(df['id_code'], df['diagnosis']):
        folder = APTOS_DIAG_MAP.get(int(diagnosis), 'No_DR')
        key = str(id_code)
        lookup[key] = os.path.join(APTOS_IMG_BASE, folder, key + '.png')
    return lookup


_APTOS_LOOKUP: dict = _build_aptos_lookup()


# =========================================================
# PATH RESOLVER
# =========================================================
def resolve_image_path(raw_path: str, dataset: str = None) -> str:
    """
    Resolve CSV path entry to an absolute filesystem path.

    The CSV stores paths like:
      ODIR:  .//odir/preprocessed_images/0_left.jpg
      APTOS: .//aptos/train_images/000c1434d8d7.png (train_images doesn't exist)

    Resolution rules:
      1. If the resolved path already exists, return it.
      2. ODIR: remap to odir/preprocessed_images/<filename>
      3. APTOS: look up via _APTOS_LOOKUP by stem
      4. Otherwise probe the known image directories by filename.

    The returned path is a best guess and may not exist; callers check
    with ``os.path.exists``.
    """
    # Normalise .// and ./ prefixes
    p = raw_path.strip()
    if p.startswith('.//'):
        p = p[3:]
    elif p.startswith('./'):
        p = p[2:]

    # Try as-is (absolute or relative to BASE_DIR)
    candidate = p if os.path.isabs(p) else os.path.join(BASE_DIR, p)
    if os.path.exists(candidate):
        return candidate

    fname = os.path.basename(p)
    stem = os.path.splitext(fname)[0]
    # str() guards against a non-string dataset cell (e.g. NaN from pandas).
    src = str(dataset or '').upper().strip()
    p_lower = p.lower()

    # ODIR remap
    if src == 'ODIR' or 'odir' in p_lower:
        return os.path.join(ODIR_IMG_DIR, fname)

    # APTOS remap via lookup table
    if (src == 'APTOS' or 'aptos' in p_lower) and stem in _APTOS_LOOKUP:
        return _APTOS_LOOKUP[stem]

    # Final fallback: try all known image dirs
    for d in (ODIR_IMG_DIR, APTOS_IMG_BASE):
        probe = os.path.join(d, fname)
        if os.path.exists(probe):
            return probe

    return candidate  # return best guess even if missing


def _count_existing(df: pd.DataFrame) -> int:
    """Count rows of df whose resolved image path exists on disk."""
    return sum(
        1 for raw, src in zip(df['image_path'], df['dataset'])
        if os.path.exists(resolve_image_path(raw, src))
    )


# =========================================================
# PREPROCESSING FUNCTIONS
# =========================================================
def _load_image(image_path: str):
    """Load image as RGB numpy array (H, W, 3) uint8. Returns None on failure."""
    img = cv2.imread(image_path)
    if img is None:
        return None
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def _crop_black_borders(img: np.ndarray, tol: int = 7) -> np.ndarray:
    """Remove dark border padding common in fundus images.

    Keeps the bounding box of all pixels whose grayscale value exceeds
    ``tol``; returns the input unchanged when no pixel does.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    mask = gray > tol
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)
    if not rows.any() or not cols.any():
        return img
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return img[rmin:rmax + 1, cmin:cmax + 1]


def _apply_circular_mask(img: np.ndarray) -> np.ndarray:
    """Zero out pixels outside the circular fundus field of view."""
    h, w = img.shape[:2]
    mask = np.zeros((h, w), dtype=np.uint8)
    radius = int(min(h, w) * 0.48)
    cv2.circle(mask, (w // 2, h // 2), radius, 255, -1)
    return cv2.bitwise_and(img, img, mask=mask)


def ben_graham_preprocess(img: np.ndarray,
                          target_size: int = TARGET_SIZE,
                          sigma: float = 10.0) -> np.ndarray:
    """
    Ben Graham fundus enhancement — used for APTOS images.

    Enhances local retinal structures (vessels, lesions) by subtracting a
    Gaussian-blurred version from itself, centering intensity around 128.
    This removes low-frequency illumination variation (vignetting, uneven
    camera lighting) and amplifies high-frequency structural details.

    Formula: result = 4*img - 4*GaussianBlur(img, sigma) + 128
    Then circular mask applied to suppress black border.
    """
    img = _crop_black_borders(img)
    img = cv2.resize(img, (target_size, target_size),
                     interpolation=cv2.INTER_AREA)
    blur = cv2.GaussianBlur(img, (0, 0), sigma)
    img = cv2.addWeighted(img, 4, blur, -4, 128)
    img = _apply_circular_mask(img)
    return np.clip(img, 0, 255).astype(np.uint8)


def clahe_preprocess(img: np.ndarray,
                     target_size: int = TARGET_SIZE,
                     clip_limit: float = 2.0,
                     tile_grid: tuple = (8, 8)) -> np.ndarray:
    """
    CLAHE (Contrast Limited Adaptive Histogram Equalization) — used for ODIR.

    Preserves image sharpness while normalizing local contrast. Applied only
    to the L (luminance) channel in LAB color space to avoid hue shifts.
    ODIR is a multi-source dataset with mixed quality, so CLAHE provides
    gentle contrast normalization without destroying fine detail the way
    Ben Graham's aggressive subtraction would.

    clip_limit=2.0: moderate clipping to prevent over-amplification of noise.
    tile_grid=(8,8): 8x8 tiles for local adaptation at appropriate scale.
    """
    img = _crop_black_borders(img)
    img = cv2.resize(img, (target_size, target_size),
                     interpolation=cv2.INTER_AREA)
    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
    l_chan, a_chan, b_chan = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    lab_eq = cv2.merge([clahe.apply(l_chan), a_chan, b_chan])
    img = cv2.cvtColor(lab_eq, cv2.COLOR_LAB2RGB)
    img = _apply_circular_mask(img)
    return np.clip(img, 0, 255).astype(np.uint8)


def resize_only_preprocess(img: np.ndarray,
                           target_size: int = TARGET_SIZE) -> np.ndarray:
    """
    Minimal preprocessing — used for REFUGE2.

    REFUGE2 images are acquired with a Zeiss Visucam 500 camera under
    standardized clinical conditions. They are already high-quality with
    consistent lighting. Any additional enhancement would degrade quality.
    """
    img = cv2.resize(img, (target_size, target_size),
                     interpolation=cv2.INTER_AREA)
    return np.clip(img, 0, 255).astype(np.uint8)


def preprocess_image(image_path: str, source: str,
                     target_size: int = TARGET_SIZE):
    """
    Domain-conditional preprocessing dispatcher.

    Parameters
    ----------
    image_path : str
        Absolute path to the fundus image file.
    source : str
        Dataset source. One of: 'APTOS', 'ODIR', 'REFUGE2' (case-insensitive).
        Any other value triggers a warning and the CLAHE pipeline.
    target_size : int
        Output spatial dimension (square). Default 224.

    Returns
    -------
    np.ndarray of shape (target_size, target_size, 3), dtype uint8,
    or None if the image cannot be loaded.
    """
    img = _load_image(image_path)
    if img is None:
        return None

    # str() so a non-string source (e.g. NaN) reaches the fallback branch
    # instead of raising AttributeError.
    src = str(source).upper().strip()
    if src == 'APTOS':
        return ben_graham_preprocess(img, target_size=target_size)
    if src == 'ODIR':
        return clahe_preprocess(img, target_size=target_size)
    if src == 'REFUGE2':
        return resize_only_preprocess(img, target_size=target_size)

    # Safe fallback for unknown sources
    print(f'[WARN] Unknown source "{source}", applying CLAHE fallback.')
    return clahe_preprocess(img, target_size=target_size)


# =========================================================
# CACHE HELPERS
# =========================================================
def cache_path_for(raw_csv_path: str) -> str:
    """Return the .npy cache path for a given CSV image_path entry."""
    stem = Path(raw_csv_path).stem
    return os.path.join(CACHE_DIR, f'{stem}_v3.npy')


def is_cached(raw_csv_path: str) -> bool:
    """Return True when a cache file exists for this CSV image_path entry."""
    return os.path.exists(cache_path_for(raw_csv_path))


def save_to_cache(raw_csv_path: str, arr: np.ndarray) -> None:
    """Write a preprocessed array to its cache file."""
    np.save(cache_path_for(raw_csv_path), arr)


def load_from_cache(raw_csv_path: str):
    """Return the cached array for this entry, or None when not cached."""
    cp = cache_path_for(raw_csv_path)
    return np.load(cp) if os.path.exists(cp) else None


def cache_dataset(df: pd.DataFrame) -> dict:
    """
    Preprocess and cache all images in df using domain-conditional pipeline.

    df must provide 'image_path' and 'dataset' columns.
    Returns stats dict with keys: processed, skipped_missing,
    already_cached, errors, total.
    """
    stats = dict(processed=0, skipped_missing=0, already_cached=0,
                 errors=0, total=len(df))

    pairs = zip(df['image_path'], df['dataset'])
    for raw, src in tqdm(pairs, total=len(df), desc='Caching v3'):
        if is_cached(raw):
            stats['already_cached'] += 1
            continue

        abs_path = resolve_image_path(raw, src)
        if not os.path.exists(abs_path):
            stats['skipped_missing'] += 1
            continue

        arr = preprocess_image(abs_path, src)
        if arr is None:
            stats['errors'] += 1
            continue

        save_to_cache(raw, arr)
        stats['processed'] += 1

    return stats


# =========================================================
# PREPROCESSING COMPARISON VISUALIZATION
# =========================================================
def _pick_odir_sample(odir_raw_path):
    """Choose an ODIR sample path: CSV entry, then ODIR_SAMPLE, then any image."""
    if odir_raw_path:
        path = resolve_image_path(odir_raw_path, 'ODIR')
        if os.path.exists(path):
            return path
    if os.path.exists(ODIR_SAMPLE):
        return ODIR_SAMPLE
    # The image directory may be absent entirely; the caller then falls back
    # to a synthetic image instead of crashing in os.listdir.
    if not os.path.isdir(ODIR_IMG_DIR):
        return None
    names = sorted(f for f in os.listdir(ODIR_IMG_DIR)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png')))
    return os.path.join(ODIR_IMG_DIR, names[0]) if names else None


def _pick_aptos_sample(aptos_raw_path):
    """Choose an APTOS sample path: CSV entry, then first lookup-table entry."""
    path = None
    if aptos_raw_path:
        path = resolve_image_path(aptos_raw_path, 'APTOS')
    if (path is None or not os.path.exists(path)) and _APTOS_LOOKUP:
        path = next(iter(_APTOS_LOOKUP.values()))
    return path


def _load_or_synthetic(path, name):
    """Return (rgb_image, source_label); synthesise a fundus-like image on failure."""
    if path and os.path.exists(path):
        img = _load_image(path)
        if img is not None:
            return img, path

    print(f'[WARN] {name} sample not found, using synthetic.')
    h, w = 512, 512
    # Local generator: same stream as np.random.seed(42) without clobbering
    # the caller's global RNG state.
    rng = np.random.RandomState(42)
    base = np.zeros((h, w, 3), dtype=np.uint8)
    cx, cy = w // 2, h // 2
    r = int(min(h, w) * 0.48)
    cv2.circle(base, (cx, cy), r, (60, 40, 25), -1)
    for _ in range(30):
        pt1 = (cx + int(rng.randint(-r, r)), cy + int(rng.randint(-r, r)))
        pt2 = (cx + int(rng.randint(-r, r)), cy + int(rng.randint(-r, r)))
        cv2.line(base, pt1, pt2, (100, 60, 35), 1)
    noisy = base.astype(np.float32) + rng.normal(0, 6, base.shape)
    return np.clip(noisy, 0, 255).astype(np.uint8), '(synthetic)'


def _amplified_absdiff(a: np.ndarray, b: np.ndarray, gain: int = 3) -> np.ndarray:
    """Return |a - b| * gain, saturated to uint8.

    The difference is widened before scaling: multiplying a uint8 array by
    an int wraps modulo 256, which would make a later clip a no-op.
    """
    diff = cv2.absdiff(a, b).astype(np.int32)
    return np.clip(diff * gain, 0, 255).astype(np.uint8)


def make_preprocessing_comparison(
        save_path: str = None,
        odir_raw_path: str = None,
        aptos_raw_path: str = None) -> str:
    """
    Generate and save a side-by-side comparison PNG showing
    ODIR (CLAHE) vs APTOS (Ben Graham) preprocessing pipelines.

    Parameters
    ----------
    save_path : str, optional
        Output PNG path; defaults to DATA_DIR/preprocessing_comparison_v3.png.
    odir_raw_path, aptos_raw_path : str, optional
        CSV image_path entries to use as samples. When missing or not on
        disk, other known samples are tried, then a synthetic image is used.

    Returns the saved PNG path.
    """
    if save_path is None:
        save_path = os.path.join(DATA_DIR, 'preprocessing_comparison_v3.png')

    # --- Pick and load sample images ---
    odir_orig, odir_src = _load_or_synthetic(
        _pick_odir_sample(odir_raw_path), 'ODIR')
    aptos_orig, aptos_src = _load_or_synthetic(
        _pick_aptos_sample(aptos_raw_path), 'APTOS')

    # Resize originals for display
    size = (TARGET_SIZE, TARGET_SIZE)
    odir_disp = cv2.resize(odir_orig, size, interpolation=cv2.INTER_AREA)
    aptos_disp = cv2.resize(aptos_orig, size, interpolation=cv2.INTER_AREA)

    # Apply pipelines
    odir_clahe = clahe_preprocess(odir_orig.copy())
    aptos_graham = ben_graham_preprocess(aptos_orig.copy())

    # Difference images, amplified x3 for visibility
    diff_odir = _amplified_absdiff(odir_disp, odir_clahe)
    diff_aptos = _amplified_absdiff(aptos_disp, aptos_graham)

    # --- Build figure ---
    fig, axes = plt.subplots(2, 3, figsize=(16, 11))
    fig.patch.set_facecolor('#1a1a2e')
    fig.suptitle(
        'RetinaSense v3 — Domain-Conditional Preprocessing\n'
        'ODIR: CLAHE Pipeline | APTOS: Ben Graham Pipeline',
        fontsize=13, fontweight='bold', color='white', y=1.01
    )

    panels = [
        # row, col, image, title, bg_color
        (0, 0, odir_disp,
         f'ODIR: Original\n({os.path.basename(str(odir_src))})', '#1565C0'),
        (0, 1, odir_clahe,
         'ODIR: After CLAHE\n(L-channel equalization, circular mask)',
         '#0D47A1'),
        (0, 2, diff_odir,
         'ODIR: Difference x3\n(|original - CLAHE|, amplified)', '#263238'),
        (1, 0, aptos_disp,
         f'APTOS: Original\n({os.path.basename(str(aptos_src))})', '#BF360C'),
        (1, 1, aptos_graham,
         'APTOS: After Ben Graham\n(4*img - 4*blur(σ=10) + 128)', '#870000'),
        (1, 2, diff_aptos,
         'APTOS: Difference x3\n(|original - Ben Graham|, amplified)',
         '#1B5E20'),
    ]

    for row, col, img_arr, title, fc in panels:
        ax = axes[row, col]
        ax.imshow(img_arr)
        ax.set_title(title, fontsize=9, color='white', pad=5,
                     bbox=dict(boxstyle='round,pad=0.3', facecolor=fc,
                               alpha=0.85, edgecolor='none'))
        ax.axis('off')
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Annotation boxes
    odir_note = (
        'ODIR Pipeline\n'
        '━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. Convert RGB→LAB\n'
        '4. CLAHE on L channel\n'
        ' clip=2.0, tile=8×8\n'
        '5. LAB→RGB\n'
        '6. Circular mask (r=0.48)'
    )
    aptos_note = (
        'APTOS Pipeline (Ben Graham)\n'
        '━━━━━━━━━━━━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. blur = GaussianBlur(σ=10)\n'
        '4. out = 4×img − 4×blur + 128\n'
        '5. Circular mask (r=0.48)\n'
        '6. clip to [0, 255]'
    )
    fig.text(0.02, 0.92, odir_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#1565C0', alpha=0.6))
    fig.text(0.02, 0.48, aptos_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#870000', alpha=0.6))

    fig.tight_layout(rect=[0.18, 0, 1, 1])
    fig.savefig(save_path, dpi=150, bbox_inches='tight',
                facecolor='#1a1a2e', edgecolor='none')
    plt.close(fig)
    print(f'[OK] Comparison saved: {save_path}')
    return save_path


# =========================================================
# NORMALIZATION STATISTICS
# =========================================================
def compute_norm_stats(train_df: pd.DataFrame,
                       out_path: str = None,
                       max_images: int = None) -> dict:
    """
    Compute per-channel mean and std across all pixels of training images
    after domain-conditional preprocessing.

    Training set ONLY — no validation/test data contamination.

    Parameters
    ----------
    train_df : DataFrame with 'image_path' and 'dataset' columns.
    out_path : JSON output path; defaults to DATA_DIR/fundus_norm_stats.json.
    max_images : optional cap; a fixed-seed random sample is used when set.

    Returns dict with: mean_rgb, std_rgb, n_images, n_pixels_per_channel,
    n_missing, note, source. ImageNet statistics are stored when no image
    could be loaded.
    """
    if out_path is None:
        out_path = os.path.join(DATA_DIR, 'fundus_norm_stats.json')

    df = train_df
    if max_images is not None:
        df = df.sample(min(max_images, len(df)), random_state=42)

    ch_sum = np.zeros(3, dtype=np.float64)
    ch_sq_sum = np.zeros(3, dtype=np.float64)
    n_pixels = 0
    n_images = 0
    n_missing = 0

    pairs = zip(df['image_path'], df['dataset'])
    for raw, src in tqdm(pairs, total=len(df), desc='Norm stats'):
        # Try cache first for speed
        arr = load_from_cache(raw)
        if arr is None:
            abs_path = resolve_image_path(raw, src)
            if not os.path.exists(abs_path):
                n_missing += 1
                continue
            arr = preprocess_image(abs_path, src)
            if arr is None:
                n_missing += 1
                continue

        pixels = (arr.astype(np.float64) / 255.0).reshape(-1, 3)
        ch_sum += pixels.sum(axis=0)
        ch_sq_sum += (pixels ** 2).sum(axis=0)
        n_pixels += pixels.shape[0]
        n_images += 1

    if n_images == 0:
        print('[WARN] No images found — storing ImageNet defaults as fallback.')
        stats = {
            'mean_rgb': [0.485, 0.456, 0.406],
            'std_rgb': [0.229, 0.224, 0.225],
            'n_images': 0,
            'n_pixels_per_channel': 0,
            'n_missing': n_missing,
            'note': 'No images found — ImageNet defaults used as fallback',
            'source': 'imagenet_fallback'
        }
    else:
        mean = ch_sum / n_pixels
        # Var = E[x^2] - E[x]^2; clamp tiny negatives from rounding error.
        var = ch_sq_sum / n_pixels - mean ** 2
        std = np.sqrt(np.maximum(var, 0.0))
        stats = {
            'mean_rgb': [round(float(v), 6) for v in mean],
            'std_rgb': [round(float(v), 6) for v in std],
            'n_images': n_images,
            'n_pixels_per_channel': int(n_pixels),
            'n_missing': n_missing,
            'note': ('Computed on training split only after domain-conditional '
                     'preprocessing. Red-dominant channel expected (fundus tissue).'),
            'source': 'computed_training_split'
        }
        # float() keeps the printout as plain numbers on every NumPy version.
        print(f' mean RGB : {[round(float(v), 4) for v in mean]}')
        print(f' std RGB : {[round(float(v), 4) for v in std]}')
        print(f' images : {n_images:,} | missing: {n_missing}')

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f'[OK] Stats saved: {out_path}')
    return stats


# =========================================================
# 3-WAY STRATIFIED SPLIT
# =========================================================
def create_stratified_split(df: pd.DataFrame,
                            train_ratio: float = 0.70,
                            calib_ratio: float = 0.15,
                            test_ratio: float = 0.15,
                            random_state: int = 42) -> tuple:
    """
    Create train/calib/test split stratified by disease_label.

    Raises ValueError when the three ratios do not sum to 1.
    Returns (train_df, calib_df, test_df), each with a fresh 0..n-1 index.
    """
    from sklearn.model_selection import train_test_split as _tts

    # Explicit raise: an assert would be stripped under ``python -O``.
    if abs(train_ratio + calib_ratio + test_ratio - 1.0) >= 1e-9:
        raise ValueError('train_ratio + calib_ratio + test_ratio must equal 1.0')

    holdout = calib_ratio + test_ratio
    train_df, temp_df = _tts(
        df,
        test_size=holdout,
        stratify=df['disease_label'],
        random_state=random_state
    )
    # Second split divides the hold-out part into calibration and test.
    calib_frac = calib_ratio / holdout
    calib_df, test_df = _tts(
        temp_df,
        test_size=(1.0 - calib_frac),
        stratify=temp_df['disease_label'],
        random_state=random_state
    )
    return (train_df.reset_index(drop=True),
            calib_df.reset_index(drop=True),
            test_df.reset_index(drop=True))


def save_splits(train_df, calib_df, test_df, out_dir: str = DATA_DIR):
    """Write the three splits to train/calib/test_split.csv under out_dir."""
    train_df.to_csv(os.path.join(out_dir, 'train_split.csv'), index=False)
    calib_df.to_csv(os.path.join(out_dir, 'calib_split.csv'), index=False)
    test_df.to_csv(os.path.join(out_dir, 'test_split.csv'), index=False)
    print(f'[OK] Split CSVs saved to {out_dir}/')


def print_split_stats(train_df, calib_df, test_df,
                      class_names: dict = None) -> str:
    """Print and return a per-class count table for the three splits.

    class_names maps disease_label -> display name; defaults to CLASS_NAMES.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    total_n = len(train_df) + len(calib_df) + len(test_df)
    lines = [
        '',
        '=' * 62,
        ' STRATIFIED SPLIT — CLASS DISTRIBUTION',
        '=' * 62,
        f"{'Class':<16} {'Train':>8} {'Calib':>8} {'Test':>8} {'Total':>8}",
        '-' * 54,
    ]
    tr_tot = ca_tot = te_tot = 0
    for lbl in sorted(class_names.keys()):
        tr = int((train_df['disease_label'] == lbl).sum())
        ca = int((calib_df['disease_label'] == lbl).sum())
        te = int((test_df['disease_label'] == lbl).sum())
        tr_tot += tr
        ca_tot += ca
        te_tot += te
        lines.append(
            f"{class_names[lbl]:<16} {tr:>8,} {ca:>8,} {te:>8,} {tr + ca + te:>8,}"
        )

    denom = total_n or 1  # avoid ZeroDivisionError on empty splits
    lines += [
        '-' * 54,
        f"{'TOTAL':<16} {tr_tot:>8,} {ca_tot:>8,} {te_tot:>8,} {total_n:>8,}",
        '',
        f'Split sizes : train={len(train_df):,} calib={len(calib_df):,} '
        f'test={len(test_df):,}',
        f'Actual ratios: train={len(train_df)/denom:.1%} '
        f'calib={len(calib_df)/denom:.1%} '
        f'test={len(test_df)/denom:.1%}',
    ]
    report = '\n'.join(lines)
    print(report)
    return report


# =========================================================
# ADDITIONAL DATASET SEARCH
# =========================================================
def search_additional_datasets() -> dict:
    """
    Scan filesystem for REFUGE2, iChallenge-AMD, RIM-ONE and other
    AMD/Glaucoma-specific datasets beyond the current CSV.

    Returns a findings dict mapping directory path ->
    {'matched_target', 'img_count', 'total_files'}. For name-matched
    directories the counts cover only files directly in that directory;
    for the always-included known dataset roots they are recursive.
    """
    IMG_EXTS = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp'}
    # Order matters: the first target found in a folder name is recorded.
    TARGETS = ['refuge2', 'refuge', 'ichallenge', 'rim-one', 'rimone',
               'amd', 'glaucoma', 'odir5k', 'odir']
    SEARCH_ROOTS = [BASE_DIR, '/teamspace/uploads']
    SKIP_DIRS = {'.git', '.cache', '.claude', '.ipython', '.npm',
                 '__pycache__', 'outputs_analysis', 'outputs_ensemble',
                 'outputs_optimized', 'outputs_production', 'outputs_v2',
                 'outputs_v2_extended', 'outputs_vit'}

    def is_image(fname):
        return os.path.splitext(fname)[1].lower() in IMG_EXTS

    findings = {}
    for root_dir in SEARCH_ROOTS:
        if not os.path.exists(root_dir):
            continue
        for dirpath, dirnames, files in os.walk(root_dir):
            # Prune in place so os.walk does not descend into skipped dirs
            dirnames[:] = [d for d in dirnames
                           if d not in SKIP_DIRS and not d.startswith('.')]
            folder = os.path.basename(dirpath).lower()
            target = next((t for t in TARGETS if t in folder), None)
            if target is None:
                continue
            findings[dirpath] = {
                'matched_target': target,
                'img_count': sum(1 for f in files if is_image(f)),
                'total_files': len(files)
            }

    # Always include the known special dirs
    for special in [
        os.path.join(BASE_DIR, 'ocular-disease-recognition-odir5k'),
        os.path.join(BASE_DIR, 'odir'),
        os.path.join(BASE_DIR, 'aptos'),
    ]:
        if not os.path.exists(special) or special in findings:
            continue
        img_cnt = total = 0
        for _, _, files in os.walk(special):
            total += len(files)
            img_cnt += sum(1 for f in files if is_image(f))
        findings[special] = {
            'matched_target': 'known_dataset',
            'img_count': img_cnt,
            'total_files': total
        }

    return findings


# =========================================================
# MAIN
# =========================================================
def _first_image_path(df: pd.DataFrame, source: str):
    """Return the first CSV image_path of the given dataset, or None."""
    subset = df.loc[df['dataset'] == source, 'image_path']
    return subset.iloc[0] if len(subset) > 0 else None


def _verify_pipeline(df: pd.DataFrame, source: str, n: int = 5) -> tuple:
    """Preprocess up to n images of one source; return (n_ok, n_attempted)."""
    subset = df.loc[df['dataset'] == source, 'image_path'].head(n)
    ok = 0
    for raw in subset:
        p = resolve_image_path(raw, source)
        if not os.path.exists(p):
            continue
        arr = preprocess_image(p, source)
        if arr is not None and arr.shape == (TARGET_SIZE, TARGET_SIZE, 3):
            ok += 1
    return ok, len(subset)


def main():
    """Run the full data pipeline: audit, comparison figure, split,
    normalization stats, dataset search and the markdown report.

    Returns (df, train_df, calib_df, test_df, norm_stats).
    """
    print('=' * 65)
    print(' RetinaSense v3 — Data Pipeline')
    print('=' * 65)

    # -------------------------------------------------------
    # TASK 1: Dataset Audit
    # -------------------------------------------------------
    print('\n[TASK 1] Dataset Audit')
    print('-' * 50)

    df = pd.read_csv(CSV_PATH)
    print(f' CSV : {CSV_PATH}')
    print(f' Total rows : {len(df):,}')
    print(f' Columns : {df.columns.tolist()}')
    print()

    print(' --- Overall class distribution ---')
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        bar = '#' * int(pct / 2)
        print(f" {lbl} {CLASS_NAMES.get(lbl,'?'):<12} : {cnt:>5} ({pct:5.1f}%) {bar}")

    max_cls = class_counts.max()
    min_cls = class_counts.min()
    print(f'\n Imbalance ratio (max/min): {max_cls/min_cls:.1f}:1')
    print()

    print(' --- Per-dataset breakdown ---')
    per_ds = (df.groupby(['dataset', 'disease_label'])
                .size().reset_index(name='count'))
    print(per_ds.to_string(index=False))
    print()

    print(' --- Severity label distribution (APTOS only) ---')
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        label = 'N/A (ODIR)' if sev == -1 else f'Grade {sev}'
        print(f" {sev:>3} ({label:<14}): {cnt:>5}")
    print()

    print(' --- Image path existence check ---')
    n_found = _count_existing(df)
    n_missing = len(df) - n_found
    print(f' Total checked : {len(df):,}')
    print(f' Found on disk : {n_found:,}')
    print(f' Missing : {n_missing:,}')
    print()

    # -------------------------------------------------------
    # TASK 2: Preprocessing Comparison
    # -------------------------------------------------------
    print('[TASK 2] Domain-Conditional Preprocessing Comparison')
    print('-' * 50)

    # Get representative samples from each dataset
    comp_path = make_preprocessing_comparison(
        odir_raw_path=_first_image_path(df, 'ODIR'),
        aptos_raw_path=_first_image_path(df, 'APTOS')
    )

    # Demo: process a few images to verify pipeline
    print('\n --- Pipeline verification (5 ODIR + 5 APTOS) ---')
    ok_odir, n_odir = _verify_pipeline(df, 'ODIR')
    ok_aptos, n_aptos = _verify_pipeline(df, 'APTOS')
    print(f' ODIR (CLAHE) : {ok_odir}/{n_odir} OK')
    print(f' APTOS (Ben Graham) : {ok_aptos}/{n_aptos} OK')
    print()

    # -------------------------------------------------------
    # TASK 3: Stratified Split
    # -------------------------------------------------------
    print('[TASK 3] 3-Way Stratified Split (70 / 15 / 15)')
    print('-' * 50)

    train_df, calib_df, test_df = create_stratified_split(df)
    save_splits(train_df, calib_df, test_df)
    split_report = print_split_stats(train_df, calib_df, test_df, CLASS_NAMES)
    print()

    # -------------------------------------------------------
    # TASK 4: Normalization Statistics (training split only)
    # -------------------------------------------------------
    print('[TASK 4] Fundus Normalization Statistics (training split)')
    print('-' * 50)

    norm_stats = compute_norm_stats(train_df)
    print()

    # -------------------------------------------------------
    # TASK 5: Additional Dataset Search
    # -------------------------------------------------------
    print('[TASK 5] Additional Dataset Search')
    print('-' * 50)

    findings = search_additional_datasets()
    if findings:
        print(f' Found {len(findings)} dataset directories:')
        for path, info in findings.items():
            print(f' {path}')
            print(f' images: {info["img_count"]:,} '
                  f'files: {info["total_files"]:,} '
                  f'matched: "{info["matched_target"]}"')
    else:
        print(' No additional datasets found.')
    print()

    # Summary of what needs downloading. Search targets are folder-name
    # fragments ('refuge', 'ichallenge', 'rimone', ...), so they are mapped
    # to canonical dataset names before comparing.
    known_sets = {'REFUGE2', 'ICHALLENGE-AMD', 'RIM-ONE'}
    found_names = {
        _TARGET_TO_DATASET[info['matched_target']]
        for info in findings.values()
        if info['matched_target'] in _TARGET_TO_DATASET
    }
    missing_sets = known_sets - found_names
    if missing_sets:
        print(' Datasets NOT found (need downloading): '
              f'{", ".join(sorted(missing_sets))}')

    # -------------------------------------------------------
    # Write report
    # -------------------------------------------------------
    _write_report(df, train_df, calib_df, test_df, norm_stats,
                  findings, split_report, comp_path)

    print('\n' + '=' * 65)
    print(' All tasks complete.')
    print('=' * 65)
    return df, train_df, calib_df, test_df, norm_stats


# =========================================================
# REPORT WRITER
# =========================================================
def _write_report(df, train_df, calib_df, test_df, norm_stats,
                  dataset_findings, split_report, comp_path):
    """Save data_engineer_report.md to DATA_DIR.

    Lines ending in two spaces are Markdown hard line breaks. The
    availability table and the class counts quoted in the action items are
    a hard-coded snapshot, not derived from dataset_findings.
    """
    n_found = _count_existing(df)
    class_counts = df['disease_label'].value_counts()

    lines = [
        '# RetinaSense v3 — Data Engineer Report',
        f'Generated: {date.today().isoformat()}',
        '',
        '---',
        '',
        '## 1. Dataset Statistics',
        '',
        '**Source CSV:** `data/combined_dataset.csv`  ',
        f'**Total images in CSV:** {len(df):,}  ',
        f'**Images found on disk:** {n_found:,} / {len(df):,}  ',
        '',
        '### Source breakdown',
        '',
        '| Dataset | Count | Labels present |',
        '|---------|-------|----------------|',
    ]
    for ds, grp in df.groupby('dataset'):
        labels = sorted(grp['disease_label'].unique())
        label_str = ', '.join(
            f'{lbl}={CLASS_NAMES.get(lbl, "?")}' for lbl in labels)
        lines.append(f'| {ds} | {len(grp):,} | {label_str} |')

    lines += [
        '',
        '### Class distribution (full dataset)',
        '',
        '| Label | Class | Count | % |',
        '|-------|-------|-------|---|',
    ]
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        lines.append(
            f'| {lbl} | {CLASS_NAMES.get(lbl, "?")} | {cnt:,} | {pct:.1f}% |'
        )

    max_cls = class_counts.max()
    min_cls = class_counts.min()
    lines += [
        '',
        f'**Imbalance ratio (Diabetes:AMD):** {max_cls/min_cls:.1f}:1',
        '',
        '### Severity label distribution (APTOS DR grades, -1 = ODIR no grade)',
        '',
        '| Severity | Meaning | Count |',
        '|----------|---------|-------|',
    ]
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        meaning = 'N/A (ODIR, no grade)' if sev == -1 else f'DR Grade {sev}'
        lines.append(f'| {sev} | {meaning} | {cnt:,} |')

    lines += [
        '',
        '---',
        '',
        '## 2. Image Path Resolution',
        '',
        '| Dataset | CSV path format | Actual location |',
        '|---------|-----------------|-----------------|',
        '| ODIR | `.//odir/preprocessed_images/<name>.jpg` | `odir/preprocessed_images/<name>.jpg` |',
        '| APTOS | `.//aptos/train_images/<id_code>.png` (train_images does NOT exist) | `aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id_code>.png` |',
        '',
        '`train_images/` directory is absent; actual APTOS images are stored under',
        '`gaussian_filtered_images/gaussian_filtered_images/<class>/`. The',
        '`aptos/train.csv` maps `id_code` → `diagnosis` (0-4) enabling lookup.',
        '',
        '---',
        '',
        '## 3. Preprocessing: Domain-Conditional Pipeline',
        '',
        '**Problem:** Previous versions applied Ben Graham enhancement uniformly to',
        'ALL images. This is incorrect: ODIR images have already-enhanced or',
        'clinical-quality appearance; applying Ben Graham degrades them.',
        '',
        '**Fix:** Source-conditional dispatch in `preprocess_image(path, source)`.',
        '',
        '| Source | Method | Rationale |',
        '|--------|--------|-----------|',
        '| APTOS | Ben Graham (4×img − 4×blur(σ=10) + 128 + circular mask) | Field camera images have vignetting and low local contrast. Ben Graham removes low-frequency illumination and amplifies vessel/lesion detail. |',
        '| ODIR | CLAHE (L-channel, clip=2.0, tile=8×8, circular mask) | Multi-source clinical images. CLAHE normalizes local contrast while preserving sharpness and avoiding Ben Graham over-processing. |',
        '| REFUGE2 | Resize only (224×224) | Zeiss Visucam 500 — already standardized high-quality. |',
        '',
        f'**Comparison figure:** `{comp_path}`',
        '',
        '**Cache location:** `preprocessed_cache_v3/<stem>_v3.npy`  ',
        '**Cache key:** image filename stem (not row index)',
        '',
        '---',
        '',
        '## 4. Normalization Statistics',
        '',
        '**Method:** One pass over training split pixels (post-preprocessing).',
        'No validation or test images used.',
        '',
        '| Channel | Mean | Std |',
        '|---------|------|-----|',
        f'| R (red) | {norm_stats["mean_rgb"][0]:.4f} | {norm_stats["std_rgb"][0]:.4f} |',
        f'| G (green) | {norm_stats["mean_rgb"][1]:.4f} | {norm_stats["std_rgb"][1]:.4f} |',
        f'| B (blue) | {norm_stats["mean_rgb"][2]:.4f} | {norm_stats["std_rgb"][2]:.4f} |',
        '',
        f'**Images used:** {norm_stats["n_images"]:,}  ',
        f'**Note:** {norm_stats["note"]}  ',
        f'**Source:** `{norm_stats["source"]}`',
    ]

    if norm_stats['source'] == 'computed_training_split':
        lines += [
            '',
            'Expected pattern for fundus images: R > G > B (red-dominant)',
            'due to high hemoglobin absorption. Computed values should match',
            'expected ≈ [0.41, 0.27, 0.19] mean, [0.28, 0.19, 0.16] std.',
        ]

    lines += [
        '',
        '**Saved to:** `data/fundus_norm_stats.json`',
        '',
        '---',
        '',
        '## 5. Stratified Split (70 / 15 / 15)',
        '',
        '**Strategy:** `sklearn.model_selection.train_test_split` with',
        '`stratify=disease_label`, `random_state=42`.',
        '',
        '**Files:**',
        '- `data/train_split.csv` — 70% training',
        '- `data/calib_split.csv` — 15% calibration (temperature scaling)',
        '- `data/test_split.csv` — 15% held-out evaluation',
        '',
        # Fenced so the fixed-width table renders verbatim; unfenced, its
        # '====' rules would turn neighbouring lines into Markdown headings.
        '```text',
        split_report.strip('\n'),
        '```',
        '',
        '---',
        '',
        '## 6. Additional Dataset Search',
        '',
    ]

    if dataset_findings:
        lines += [
            '### Found directories:',
            '',
            '| Path | Images | Files | Matched |',
            '|------|--------|-------|---------|',
        ]
        for path, info in dataset_findings.items():
            lines.append(
                f'| `{path}` | {info["img_count"]:,} | '
                f'{info["total_files"]:,} | {info["matched_target"]} |'
            )
    else:
        lines.append('No additional dataset directories found.')

    n_glaucoma = int((df['disease_label'] == 2).sum())
    n_amd = int((df['disease_label'] == 4).sum())
    lines += [
        '',
        '### Availability summary',
        '',
        '| Dataset | Status | Location |',
        '|---------|--------|----------|',
        '| ODIR-5K (ODIR) | **AVAILABLE** | `odir/preprocessed_images/` (4,878 images in CSV) |',
        '| ODIR-5K raw | **AVAILABLE** | `odir/ODIR-5K/ODIR-5K/Training Images/` (7,000) + Testing (1,000) |',
        '| APTOS 2019 | **AVAILABLE** | `aptos/gaussian_filtered_images/` (3,662 images) |',
        '| ocular-disease-recognition-odir5k | Partial (1 image only) | `ocular-disease-recognition-odir5k/preprocessed_images/` |',
        '| REFUGE2 | **NOT FOUND** | Needs download |',
        '| iChallenge-AMD | **NOT FOUND** | Needs download |',
        '| RIM-ONE | **NOT FOUND** | Needs download |',
        '',
        '### AMD / Glaucoma specific images (beyond CSV)',
        '',
        f'- ODIR provides {n_glaucoma:,} Glaucoma and '
        f'{n_amd:,} AMD images from '
        f'`odir/preprocessed_images/`.',
        '- ODIR raw training set (7,000 images) may contain additional',
        '  AMD/Glaucoma cases not yet extracted — check `odir/full_df.csv`.',
        '- For specialized Glaucoma detection: REFUGE2 (400 images,',
        '  Magrabia population) and RIM-ONE (159 images) are recommended.',
        '- For AMD: iChallenge-AMD (400 images) is the standard benchmark.',
        '',
        '---',
        '',
        '## 7. Action Items',
        '',
        '1. **Download missing datasets** to improve minority class coverage:',
        '   - REFUGE2: https://refuge.grand-challenge.org/',
        '   - RIM-ONE: http://medimrg.webs.ull.es/research/retinal-imaging/rim-one/',
        '   - iChallenge-AMD: https://amd.grand-challenge.org/',
        '2. **Fix paths in combined_dataset.csv**: update `aptos/train_images/` →',
        '   actual `gaussian_filtered_images/.../` paths.',
        '3. **Run full cache build** when training: `python retinasense_v3_preprocessing.py --cache-all`',
        '4. **Use computed normalization stats** from `data/fundus_norm_stats.json`',
        '   instead of ImageNet stats.',
        '5. **Address 21:1 class imbalance**: consider weighted sampling or',
        '   oversampling minority classes (AMD=265, Glaucoma=308).',
    ]

    report_path = os.path.join(DATA_DIR, 'data_engineer_report.md')
    # The report contains non-ASCII characters (—, ×, σ, →); pin the
    # encoding so the write does not depend on the platform default.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
    print(f'[OK] Report saved: {report_path}')


if __name__ == '__main__':
    main()