tanishq74
/

retinasense-vit

+#!/usr/bin/env python3
+"""
+RetinaSense v3 — Domain-Conditional Preprocessing Pipeline
+===========================================================
+Implements source-aware preprocessing:
+  - APTOS   -> Ben Graham enhancement (high contrast DR-specific pipeline)
+  - ODIR    -> CLAHE only (preserves sharpness, normalizes contrast)
+  - REFUGE2 -> Resize only (images already clinical-grade high quality)
+Image path resolution:
+  - ODIR:  odir/preprocessed_images/<filename>
+  - APTOS: aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png
+           (looked up from aptos/train.csv; aptos/train_images/ does NOT exist)
+Cache format: ./preprocessed_cache_v3/<stem>_v3.npy
+"""
+import os
+import sys
+import json
+import warnings
+import numpy as np
+import pandas as pd
+import cv2
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from pathlib import Path
+from tqdm import tqdm
+warnings.filterwarnings('ignore')
+# =========================================================
+# PATHS
+# =========================================================
# Root of the studio workspace; every other location is derived from it.
BASE_DIR = '/teamspace/studios/this_studio'

# Pipeline inputs/outputs.
DATA_DIR = os.path.join(BASE_DIR, 'data')
CSV_PATH = os.path.join(DATA_DIR, 'combined_dataset.csv')
CACHE_DIR = os.path.join(BASE_DIR, 'preprocessed_cache_v3')

# ODIR images live in a single flat folder.
ODIR_IMG_DIR = os.path.join(BASE_DIR, 'odir', 'preprocessed_images')
ODIR_SAMPLE = os.path.join(
    BASE_DIR,
    'ocular-disease-recognition-odir5k',
    'preprocessed_images',
    '2977_left.jpg',
)

# APTOS images are stored in one sub-folder per diagnosis grade; the grade
# for each id_code comes from train.csv.
APTOS_CSV = os.path.join(BASE_DIR, 'aptos', 'train.csv')
APTOS_IMG_BASE = os.path.join(
    BASE_DIR,
    'aptos',
    'gaussian_filtered_images',
    'gaussian_filtered_images',
)
APTOS_DIAG_MAP = {
    0: 'No_DR',
    1: 'Mild',
    2: 'Moderate',
    3: 'Severe',
    4: 'Proliferate_DR',
}

# Output directories are created eagerly at import time.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Side length (pixels) of every preprocessed square image.
TARGET_SIZE = 224
+# =========================================================
+# APTOS PATH LOOKUP TABLE
+# Built once at module load; maps id_code (stem) -> abs path
+# =========================================================
def _build_aptos_lookup() -> dict:
    """Build the ``id_code -> absolute PNG path`` table for APTOS.

    Reads ``APTOS_CSV`` and, for each row, places the image under the
    sub-folder of ``APTOS_IMG_BASE`` named by ``APTOS_DIAG_MAP`` for that
    row's ``diagnosis`` (falling back to ``'No_DR'`` for unmapped grades).
    Returns an empty dict when the CSV file does not exist. Paths are
    constructed, not checked for existence.
    """
    if not os.path.exists(APTOS_CSV):
        return {}

    mapping = {}
    for record in pd.read_csv(APTOS_CSV).to_dict('records'):
        grade_dir = APTOS_DIAG_MAP.get(int(record['diagnosis']), 'No_DR')
        id_code = str(record['id_code'])
        mapping[id_code] = os.path.join(APTOS_IMG_BASE, grade_dir,
                                        id_code + '.png')
    return mapping


# Computed once when the module is imported.
_APTOS_LOOKUP: dict = _build_aptos_lookup()
+# =========================================================
+# PATH RESOLVER
+# =========================================================
def resolve_image_path(raw_path: str, dataset: str = None) -> str:
    """
    Turn an ``image_path`` entry from the CSV into a filesystem path.

    The CSV stores entries such as
      ODIR:  .//odir/preprocessed_images/0_left.jpg
      APTOS: .//aptos/train_images/000c1434d8d7.png  (train_images doesn't exist)

    Steps, in order:
      1. Strip a leading ``.//`` or ``./`` and anchor relative paths at
         BASE_DIR; if that path exists, return it.
      2. ODIR (by ``dataset`` or by 'odir' appearing in the path): return
         ODIR_IMG_DIR/<filename> without checking existence.
      3. APTOS (by ``dataset`` or by 'aptos' appearing in the path): return
         the _APTOS_LOOKUP entry for the file stem, if there is one.
      4. Probe ODIR_IMG_DIR and APTOS_IMG_BASE for the bare filename.
      5. Otherwise return the step-1 candidate even though it is missing.
    """
    rel = raw_path.strip()
    for prefix in ('.//', './'):
        if rel.startswith(prefix):
            rel = rel[len(prefix):]
            break

    primary = rel if os.path.isabs(rel) else os.path.join(BASE_DIR, rel)
    if os.path.exists(primary):
        return primary

    filename = os.path.basename(rel)
    lowered = rel.lower()
    tag = (dataset or '').upper().strip()

    if tag == 'ODIR' or 'odir' in lowered:
        return os.path.join(ODIR_IMG_DIR, filename)

    if tag == 'APTOS' or 'aptos' in lowered:
        id_code = os.path.splitext(filename)[0]
        if id_code in _APTOS_LOOKUP:
            return _APTOS_LOOKUP[id_code]

    # Last resort: look for the bare filename in each known image folder.
    for folder in (ODIR_IMG_DIR, APTOS_IMG_BASE):
        alternative = os.path.join(folder, filename)
        if os.path.exists(alternative):
            return alternative

    return primary
+# =========================================================
+# PREPROCESSING FUNCTIONS
+# =========================================================
def _load_image(image_path: str):
    """Read an image with OpenCV and return it in RGB channel order.

    Returns None when ``cv2.imread`` cannot read the file.
    """
    bgr = cv2.imread(image_path)
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
def _crop_black_borders(img: np.ndarray, tol: int = 7) -> np.ndarray:
    """Crop ``img`` to the bounding box of pixels brighter than ``tol``.

    Brightness is measured on the grayscale conversion of the RGB input.
    If no pixel exceeds ``tol`` the image is returned unchanged.
    """
    bright = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
    row_idx = np.flatnonzero(bright.any(axis=1))
    col_idx = np.flatnonzero(bright.any(axis=0))
    if row_idx.size == 0 or col_idx.size == 0:
        return img

    top, bottom = row_idx[0], row_idx[-1]
    left, right = col_idx[0], col_idx[-1]
    return img[top:bottom + 1, left:right + 1]
def _apply_circular_mask(img: np.ndarray) -> np.ndarray:
    """Black out everything outside a centred disc.

    The disc radius is 48% of the shorter image side.
    """
    height, width = img.shape[:2]
    radius = int(min(height, width) * 0.48)
    centre = (width // 2, height // 2)

    field_of_view = np.zeros((height, width), dtype=np.uint8)
    cv2.circle(field_of_view, centre, radius, 255, -1)  # filled disc
    return cv2.bitwise_and(img, img, mask=field_of_view)
def ben_graham_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                          sigma: float = 10.0) -> np.ndarray:
    """
    Ben Graham fundus enhancement — the APTOS pipeline.

    Steps:
      1. Crop dark borders.
      2. Resize to ``target_size`` x ``target_size`` (INTER_AREA).
      3. Subtract a Gaussian-blurred copy (std ``sigma``) to suppress
         low-frequency illumination and emphasise fine structure:
         result = 4*img - 4*GaussianBlur(img, sigma) + 128
      4. Apply the circular field-of-view mask.

    Returns a uint8 array.
    """
    cropped = _crop_black_borders(img)
    resized = cv2.resize(cropped, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    background = cv2.GaussianBlur(resized, (0, 0), sigma)
    enhanced = cv2.addWeighted(resized, 4, background, -4, 128)
    masked = _apply_circular_mask(enhanced)
    return np.clip(masked, 0, 255).astype(np.uint8)
def clahe_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                     clip_limit: float = 2.0,
                     tile_grid: tuple = (8, 8)) -> np.ndarray:
    """
    CLAHE (Contrast Limited Adaptive Histogram Equalization) — the ODIR
    pipeline.

    Steps:
      1. Crop dark borders.
      2. Resize to ``target_size`` x ``target_size`` (INTER_AREA).
      3. Convert RGB -> LAB and equalise only the L channel, leaving the
         a/b colour channels untouched.
      4. Convert back to RGB and apply the circular field-of-view mask.

    ``clip_limit`` and ``tile_grid`` are passed to ``cv2.createCLAHE`` as
    ``clipLimit`` and ``tileGridSize``. Returns a uint8 array.
    """
    cropped = _crop_black_borders(img)
    resized = cv2.resize(cropped, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)

    lightness, chan_a, chan_b = cv2.split(
        cv2.cvtColor(resized, cv2.COLOR_RGB2LAB))
    equaliser = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    lab_equalised = cv2.merge([equaliser.apply(lightness), chan_a, chan_b])

    rgb = cv2.cvtColor(lab_equalised, cv2.COLOR_LAB2RGB)
    masked = _apply_circular_mask(rgb)
    return np.clip(masked, 0, 255).astype(np.uint8)
def resize_only_preprocess(img: np.ndarray,
                           target_size: int = TARGET_SIZE) -> np.ndarray:
    """
    Minimal pipeline — the REFUGE2 path.

    Only resizes to ``target_size`` x ``target_size`` (INTER_AREA); no
    cropping, enhancement or masking is applied. Returns a uint8 array.
    """
    resized = cv2.resize(img, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    return np.clip(resized, 0, 255).astype(np.uint8)
def preprocess_image(image_path: str, source: str,
                     target_size: int = TARGET_SIZE):
    """
    Domain-conditional preprocessing dispatcher.

    Parameters
    ----------
    image_path : str
        Path to the fundus image file.
    source : str
        Dataset source: 'APTOS', 'ODIR' or 'REFUGE2' (case-insensitive,
        surrounding whitespace ignored). ``None`` or any other value is
        treated as unknown and handled with the CLAHE pipeline after a
        warning is printed.
    target_size : int
        Output spatial dimension (square). Default 224.

    Returns
    -------
    The array produced by the selected pipeline, or None if the image
    cannot be loaded.
    """
    img = _load_image(image_path)
    if img is None:
        return None

    # A missing dataset value (None, or NaN coming from pandas) must not
    # crash the dispatcher; resolve_image_path tolerates a missing dataset
    # as well, so the two stay consistent.
    src = '' if source is None else str(source).upper().strip()

    pipelines = {
        'APTOS': ben_graham_preprocess,
        'ODIR': clahe_preprocess,
        'REFUGE2': resize_only_preprocess,
    }
    pipeline = pipelines.get(src)
    if pipeline is None:
        # Safe fallback for unknown sources
        print(f'[WARN] Unknown source "{source}", applying CLAHE fallback.')
        pipeline = clahe_preprocess
    return pipeline(img, target_size=target_size)
+# =========================================================
+# CACHE HELPERS
+# =========================================================
def cache_path_for(raw_csv_path: str) -> str:
    """Map a CSV ``image_path`` entry to ``CACHE_DIR/<stem>_v3.npy``.

    Only the file stem is used, so the directory part and extension of
    ``raw_csv_path`` do not affect the result.
    """
    cache_name = f'{Path(raw_csv_path).stem}_v3.npy'
    return os.path.join(CACHE_DIR, cache_name)
def is_cached(raw_csv_path: str) -> bool:
    """Return True when a cache file exists for this CSV entry."""
    cached_file = cache_path_for(raw_csv_path)
    return os.path.exists(cached_file)
def save_to_cache(raw_csv_path: str, arr: np.ndarray) -> None:
    """Write ``arr`` to the cache file for this CSV entry.

    The array is first written to a temporary sibling file and then moved
    into place with ``os.replace``. An interrupted run therefore never
    leaves a truncated ``.npy`` behind that ``is_cached`` would later treat
    as a valid cache hit.
    """
    final_path = cache_path_for(raw_csv_path)
    tmp_path = f'{final_path}.tmp.{os.getpid()}'
    try:
        # Passing a file object keeps np.save from appending '.npy' to the
        # temporary name.
        with open(tmp_path, 'wb') as fh:
            np.save(fh, arr)
        os.replace(tmp_path, final_path)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def load_from_cache(raw_csv_path: str):
    """Return the cached array for this CSV entry, or None if not cached."""
    cached_file = cache_path_for(raw_csv_path)
    if not os.path.exists(cached_file):
        return None
    return np.load(cached_file)
def cache_dataset(df: pd.DataFrame) -> dict:
    """
    Run the domain-conditional pipeline over every row of ``df`` and store
    each result in the cache.

    ``df`` must provide ``image_path`` and ``dataset`` columns. Rows that
    are already cached are left alone. Returns a dict of counters:
    ``processed``, ``skipped_missing`` (resolved path not on disk),
    ``already_cached``, ``errors`` (image could not be loaded) and
    ``total``.
    """
    counters = {
        'processed': 0,
        'skipped_missing': 0,
        'already_cached': 0,
        'errors': 0,
        'total': len(df),
    }

    def _handle(raw, src):
        """Process one row and return the name of the counter to bump."""
        if is_cached(raw):
            return 'already_cached'
        abs_path = resolve_image_path(raw, src)
        if not os.path.exists(abs_path):
            return 'skipped_missing'
        arr = preprocess_image(abs_path, src)
        if arr is None:
            return 'errors'
        save_to_cache(raw, arr)
        return 'processed'

    for _, record in tqdm(df.iterrows(), total=len(df), desc='Caching v3'):
        outcome = _handle(record['image_path'], record['dataset'])
        counters[outcome] += 1
    return counters
+# =========================================================
+# PREPROCESSING COMPARISON VISUALIZATION
+# =========================================================
def make_preprocessing_comparison(
        save_path: str = None,
        odir_raw_path: str = None,
        aptos_raw_path: str = None) -> str:
    """
    Generate and save a side-by-side comparison PNG showing
    ODIR (CLAHE) vs APTOS (Ben Graham) preprocessing pipelines.

    Parameters
    ----------
    save_path : str, optional
        Output PNG path. Defaults to
        ``DATA_DIR/preprocessing_comparison_v3.png``.
    odir_raw_path, aptos_raw_path : str, optional
        CSV-style image paths to use as samples. When a sample is not
        given or not on disk, a fallback is chosen (ODIR_SAMPLE or any
        image in ODIR_IMG_DIR; first _APTOS_LOOKUP entry). If that also
        fails, a synthetic fundus-like image is drawn instead.

    Returns
    -------
    The saved PNG path.
    """
    if save_path is None:
        save_path = os.path.join(DATA_DIR, 'preprocessing_comparison_v3.png')

    # --- Pick sample ODIR image ---
    odir_path = None
    if odir_raw_path:
        odir_path = resolve_image_path(odir_raw_path, 'ODIR')
    if odir_path is None or not os.path.exists(odir_path):
        # Use the one available ODIR sample in odir5k folder
        odir_path = ODIR_SAMPLE
    if not os.path.exists(odir_path):
        # Fall back to any image in odir/preprocessed_images. The folder
        # itself may be absent; in that case leave the path unset so the
        # synthetic fallback below is used instead of raising.
        candidates = []
        if os.path.isdir(ODIR_IMG_DIR):
            candidates = sorted(
                os.path.join(ODIR_IMG_DIR, f)
                for f in os.listdir(ODIR_IMG_DIR)
                if f.lower().endswith(('.jpg', '.jpeg', '.png'))
            )
        odir_path = candidates[0] if candidates else None

    # --- Pick sample APTOS image ---
    aptos_path = None
    if aptos_raw_path:
        aptos_path = resolve_image_path(aptos_raw_path, 'APTOS')
    if aptos_path is None or not os.path.exists(aptos_path):
        # Use first entry in APTOS lookup
        if _APTOS_LOOKUP:
            aptos_path = next(iter(_APTOS_LOOKUP.values()))

    # --- Load images ---
    def get_or_synthetic(path, name):
        """Return (rgb_image, label); draws a synthetic image on failure."""
        if path and os.path.exists(path):
            img = _load_image(path)
            if img is not None:
                return img, path
        print(f'[WARN] {name} sample not found, using synthetic.')
        h, w = 512, 512
        # Private generator: same seed-42 stream as before, but the global
        # NumPy random state is left untouched.
        rng = np.random.RandomState(42)
        base = np.zeros((h, w, 3), dtype=np.uint8)
        cx, cy = w // 2, h // 2
        r = int(min(h, w) * 0.48)
        cv2.circle(base, (cx, cy), r, (60, 40, 25), -1)
        for _ in range(30):
            pt1 = (int(cx + rng.randint(-r, r)), int(cy + rng.randint(-r, r)))
            pt2 = (int(cx + rng.randint(-r, r)), int(cy + rng.randint(-r, r)))
            cv2.line(base, pt1, pt2, (100, 60, 35), 1)
        base = base.astype(np.float32) + rng.normal(0, 6, base.shape)
        return np.clip(base, 0, 255).astype(np.uint8), '(synthetic)'

    odir_orig,  odir_src  = get_or_synthetic(odir_path,  'ODIR')
    aptos_orig, aptos_src = get_or_synthetic(aptos_path, 'APTOS')

    # Resize originals for display
    odir_disp  = cv2.resize(odir_orig,  (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)
    aptos_disp = cv2.resize(aptos_orig, (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)

    # Apply pipelines
    odir_clahe   = clahe_preprocess(odir_orig.copy())
    aptos_graham = ben_graham_preprocess(aptos_orig.copy())

    def amplified_diff(before, after, gain=3):
        """Absolute difference scaled by ``gain`` and saturated at 255."""
        # Widen before multiplying: uint8 * gain would wrap modulo 256 and
        # make the subsequent clip a no-op.
        diff = cv2.absdiff(before, after).astype(np.int32) * gain
        return np.clip(diff, 0, 255).astype(np.uint8)

    diff_odir  = amplified_diff(odir_disp,  odir_clahe)
    diff_aptos = amplified_diff(aptos_disp, aptos_graham)

    # --- Build figure ---
    fig, axes = plt.subplots(2, 3, figsize=(16, 11))
    fig.patch.set_facecolor('#1a1a2e')
    fig.suptitle(
        'RetinaSense v3 — Domain-Conditional Preprocessing\n'
        'ODIR: CLAHE Pipeline  |  APTOS: Ben Graham Pipeline',
        fontsize=13, fontweight='bold', color='white', y=1.01
    )
    panels = [
        # row, col, image, title, bg_color
        (0, 0, odir_disp,     f'ODIR: Original\n({os.path.basename(str(odir_src))})',
         '#1565C0'),
        (0, 1, odir_clahe,    'ODIR: After CLAHE\n(L-channel equalization, circular mask)',
         '#0D47A1'),
        (0, 2, diff_odir,     'ODIR: Difference x3\n(|original - CLAHE|, amplified)',
         '#263238'),
        (1, 0, aptos_disp,    f'APTOS: Original\n({os.path.basename(str(aptos_src))})',
         '#BF360C'),
        (1, 1, aptos_graham,  'APTOS: After Ben Graham\n(4*img - 4*blur(σ=10) + 128)',
         '#870000'),
        (1, 2, diff_aptos,    'APTOS: Difference x3\n(|original - Ben Graham|, amplified)',
         '#1B5E20'),
    ]
    for r, c, img_arr, title, fc in panels:
        ax = axes[r, c]
        ax.imshow(img_arr)
        ax.set_title(title, fontsize=9, color='white', pad=5,
                     bbox=dict(boxstyle='round,pad=0.3', facecolor=fc,
                               alpha=0.85, edgecolor='none'))
        ax.axis('off')
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Annotation boxes
    odir_note = (
        'ODIR Pipeline\n'
        '━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. Convert RGB→LAB\n'
        '4. CLAHE on L channel\n'
        '   clip=2.0, tile=8×8\n'
        '5. LAB→RGB\n'
        '6. Circular mask (r=0.48)'
    )
    aptos_note = (
        'APTOS Pipeline (Ben Graham)\n'
        '━━━━━━━━━━━━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. blur = GaussianBlur(σ=10)\n'
        '4. out = 4×img − 4×blur + 128\n'
        '5. Circular mask (r=0.48)\n'
        '6. clip to [0, 255]'
    )
    fig.text(0.02, 0.92, odir_note,  fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#1565C0', alpha=0.6))
    fig.text(0.02, 0.48, aptos_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#870000', alpha=0.6))
    fig.tight_layout(rect=[0.18, 0, 1, 1])
    fig.savefig(save_path, dpi=150, bbox_inches='tight',
                facecolor='#1a1a2e', edgecolor='none')
    # Close this specific figure rather than whatever is "current".
    plt.close(fig)
    print(f'[OK] Comparison saved: {save_path}')
    return save_path
+# =========================================================
+# NORMALIZATION STATISTICS
+# =========================================================
def compute_norm_stats(train_df: pd.DataFrame,
                       out_path: str = None,
                       max_images: int = None) -> dict:
    """
    Compute per-channel mean and std over all pixels of the given images
    after domain-conditional preprocessing, and write them to JSON.

    Intended to be called with the training split only, so that no
    validation/test data leaks into the statistics.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must provide ``image_path`` and ``dataset`` columns.
    out_path : str, optional
        JSON output path. Defaults to ``DATA_DIR/fundus_norm_stats.json``.
    max_images : int, optional
        If given, a random sample of at most this many rows
        (``random_state=42``) is used instead of the full frame.

    Returns
    -------
    dict with keys ``mean_rgb``, ``std_rgb`` (pixel values scaled to
    [0, 1]), ``n_images``, ``n_pixels_per_channel``, ``n_missing``,
    ``note`` and ``source``. When no image could be read, the ImageNet
    mean/std are stored and ``source`` is ``'imagenet_fallback'``.
    """
    if out_path is None:
        out_path = os.path.join(DATA_DIR, 'fundus_norm_stats.json')
    df = train_df.copy()
    if max_images is not None:
        df = df.sample(min(max_images, len(df)), random_state=42)

    # Running sums of x and x^2 per channel; std is derived at the end as
    # sqrt(E[x^2] - E[x]^2), so no image has to be kept in memory.
    ch_sum    = np.zeros(3, dtype=np.float64)
    ch_sq_sum = np.zeros(3, dtype=np.float64)
    n_pixels  = 0
    n_images  = 0
    n_missing = 0  # rows whose image is absent from disk or unreadable

    for _, row in tqdm(df.iterrows(), total=len(df), desc='Norm stats'):
        raw = row['image_path']
        src = row['dataset']
        # Try cache first for speed
        arr = load_from_cache(raw)
        if arr is None:
            abs_path = resolve_image_path(raw, src)
            if not os.path.exists(abs_path):
                n_missing += 1
                continue
            arr = preprocess_image(abs_path, src)
            if arr is None:
                n_missing += 1
                continue
        pixels = (arr.astype(np.float64) / 255.0).reshape(-1, 3)
        ch_sum    += pixels.sum(axis=0)
        ch_sq_sum += (pixels ** 2).sum(axis=0)
        n_pixels  += pixels.shape[0]
        n_images  += 1

    if n_images == 0:
        print('[WARN] No images found — storing ImageNet defaults as fallback.')
        stats = {
            'mean_rgb': [0.485, 0.456, 0.406],
            'std_rgb':  [0.229, 0.224, 0.225],
            'n_images': 0,
            'n_pixels_per_channel': 0,
            'n_missing': n_missing,
            'note': 'No images found — ImageNet defaults used as fallback',
            'source': 'imagenet_fallback'
        }
    else:
        mean = ch_sum    / n_pixels
        var  = ch_sq_sum / n_pixels - mean ** 2
        # Rounding error can push a near-zero variance slightly negative.
        std  = np.sqrt(np.maximum(var, 0.0))
        stats = {
            'mean_rgb': [round(float(v), 6) for v in mean],
            'std_rgb':  [round(float(v), 6) for v in std],
            'n_images': n_images,
            'n_pixels_per_channel': int(n_pixels),
            'n_missing': n_missing,
            'note': ('Computed on training split only after domain-conditional '
                     'preprocessing. Red-dominant channel expected (fundus tissue).'),
            'source': 'computed_training_split'
        }
        # Convert to built-in floats before rounding: NumPy scalars would
        # be printed as ``np.float64(...)`` under NumPy 2.
        print(f'  mean RGB : {[round(float(v), 4) for v in mean]}')
        print(f'  std  RGB : {[round(float(v), 4) for v in std]}')
        print(f'  images   : {n_images:,}  |  missing: {n_missing}')

    with open(out_path, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f'[OK] Stats saved: {out_path}')
    return stats
+# =========================================================
+# 3-WAY STRATIFIED SPLIT
+# =========================================================
def create_stratified_split(df: pd.DataFrame,
                             train_ratio: float = 0.70,
                             calib_ratio: float = 0.15,
                             test_ratio:  float = 0.15,
                             random_state: int = 42) -> tuple:
    """
    Create a train/calib/test split stratified by ``disease_label``.

    The split is done in two stages: train vs. the rest, then the rest is
    divided into calib and test in proportion to their ratios.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide a ``disease_label`` column.
    train_ratio, calib_ratio, test_ratio : float
        Fractions of ``df`` for each split; they must sum to 1.
    random_state : int
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    (train_df, calib_df, test_df), each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the three ratios do not sum to 1.
    """
    # Validate with an explicit exception: an ``assert`` would be stripped
    # when Python runs with -O. Checked before the sklearn import so bad
    # arguments fail fast.
    ratio_sum = train_ratio + calib_ratio + test_ratio
    if abs(ratio_sum - 1.0) >= 1e-9:
        raise ValueError(
            f'train/calib/test ratios must sum to 1.0, got {ratio_sum}'
        )

    from sklearn.model_selection import train_test_split as _tts

    holdout_ratio = calib_ratio + test_ratio
    train_df, temp_df = _tts(
        df, test_size=holdout_ratio,
        stratify=df['disease_label'], random_state=random_state
    )
    # Share of the held-out part that goes to calibration.
    calib_frac = calib_ratio / holdout_ratio
    calib_df, test_df = _tts(
        temp_df, test_size=(1.0 - calib_frac),
        stratify=temp_df['disease_label'], random_state=random_state
    )
    return (train_df.reset_index(drop=True),
            calib_df.reset_index(drop=True),
            test_df.reset_index(drop=True))
def save_splits(train_df, calib_df, test_df, out_dir: str = DATA_DIR):
    """Write the three splits to ``out_dir`` as CSV files without index.

    Files written: ``train_split.csv``, ``calib_split.csv`` and
    ``test_split.csv``.
    """
    outputs = (
        ('train_split.csv', train_df),
        ('calib_split.csv', calib_df),
        ('test_split.csv', test_df),
    )
    for filename, frame in outputs:
        frame.to_csv(os.path.join(out_dir, filename), index=False)
    print(f'[OK] Split CSVs saved to {out_dir}/')
def print_split_stats(train_df, calib_df, test_df,
                      class_names: dict = None) -> str:
    """Print and return a per-class count table for the three splits.

    Parameters
    ----------
    train_df, calib_df, test_df : pd.DataFrame
        Each must provide a ``disease_label`` column.
    class_names : dict, optional
        Maps label value -> display name. Only labels present in this
        mapping get a row. Defaults to the five RetinaSense classes.

    Returns
    -------
    The report text that was printed. Empty splits are reported with
    0.0% ratios instead of raising ``ZeroDivisionError``.
    """
    if class_names is None:
        class_names = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                       3: 'Cataract', 4: 'AMD'}

    splits = (train_df, calib_df, test_df)
    n_train, n_calib, n_test = (len(split) for split in splits)
    total_n = n_train + n_calib + n_test
    # Guard the ratio line against three empty splits.
    denom = total_n or 1

    banner = '=' * 62
    rule = '-' * 54
    lines = [
        '',
        banner,
        '  STRATIFIED SPLIT — CLASS DISTRIBUTION',
        banner,
        f"{'Class':<16} {'Train':>8} {'Calib':>8} {'Test':>8} {'Total':>8}",
        rule,
    ]

    tr_tot = ca_tot = te_tot = 0
    for lbl in sorted(class_names):
        tr, ca, te = (int((split['disease_label'] == lbl).sum())
                      for split in splits)
        tr_tot += tr
        ca_tot += ca
        te_tot += te
        lines.append(
            f"{class_names[lbl]:<16} {tr:>8,} {ca:>8,} {te:>8,} "
            f"{tr + ca + te:>8,}"
        )

    lines += [
        rule,
        f"{'TOTAL':<16} {tr_tot:>8,} {ca_tot:>8,} {te_tot:>8,} {total_n:>8,}",
        '',
        f'Split sizes : train={n_train:,}  calib={n_calib:,}  '
        f'test={n_test:,}',
        f'Actual ratios: train={n_train / denom:.1%}  '
        f'calib={n_calib / denom:.1%}  '
        f'test={n_test / denom:.1%}',
    ]
    report = '\n'.join(lines)
    print(report)
    return report
+# =========================================================
+# ADDITIONAL DATASET SEARCH
+# =========================================================
def search_additional_datasets() -> dict:
    """
    Walk the studio and uploads trees looking for folders whose name
    contains a known dataset keyword (REFUGE2, iChallenge-AMD, RIM-ONE and
    other AMD/Glaucoma-related names).

    Returns a dict keyed by directory path; each value holds
    ``matched_target`` (the first keyword found in the folder name),
    ``img_count`` (image files directly in that folder) and
    ``total_files``. Three known dataset roots are always added when they
    exist, with counts taken over their whole subtree and
    ``matched_target`` set to ``'known_dataset'``.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp'}
    # Order matters: the first keyword contained in a folder name wins.
    keywords = ['refuge2', 'refuge', 'ichallenge', 'rim-one', 'rimone',
                'amd', 'glaucoma', 'odir5k', 'odir']
    roots = ['/teamspace/studios/this_studio', '/teamspace/uploads']
    ignored = {'.git', '.cache', '.claude', '.ipython', '.npm',
               '__pycache__', 'outputs_analysis', 'outputs_ensemble',
               'outputs_optimized', 'outputs_production', 'outputs_v2',
               'outputs_v2_extended', 'outputs_vit'}

    def _count_images(names):
        return sum(1 for name in names
                   if os.path.splitext(name)[1].lower() in image_suffixes)

    results = {}
    for top in roots:
        if not os.path.exists(top):
            continue
        for current, subdirs, filenames in os.walk(top):
            # Prune in place so os.walk does not descend into these.
            subdirs[:] = [s for s in subdirs
                          if not (s in ignored or s.startswith('.'))]
            leaf = os.path.basename(current).lower()
            hit = next((kw for kw in keywords if kw in leaf), None)
            if hit is None:
                continue
            n_images = _count_images(filenames)
            previous = results.get(current)
            if previous is None or n_images > previous['img_count']:
                results[current] = {
                    'matched_target': hit,
                    'img_count': n_images,
                    'total_files': len(filenames),
                }

    # Always include the known special dirs
    known_roots = (
        '/teamspace/studios/this_studio/ocular-disease-recognition-odir5k',
        '/teamspace/studios/this_studio/odir',
        '/teamspace/studios/this_studio/aptos',
    )
    for special in known_roots:
        if special in results or not os.path.exists(special):
            continue
        n_images = 0
        n_files = 0
        for _, _, filenames in os.walk(special):
            n_images += _count_images(filenames)
            n_files += len(filenames)
        results[special] = {
            'matched_target': 'known_dataset',
            'img_count': n_images,
            'total_files': n_files,
        }
    return results
+# =========================================================
+# MAIN
+# =========================================================
def main():
    """Run the full data pipeline and print a progress log.

    Tasks, in order: (1) audit the combined CSV, (2) render the
    preprocessing comparison and smoke-test both pipelines, (3) create and
    save the 70/15/15 stratified split, (4) compute normalization
    statistics on the training split, (5) search the filesystem for
    additional datasets, then write the markdown report.

    Returns ``(df, train_df, calib_df, test_df, norm_stats)``.
    """
    print('=' * 65)
    print('  RetinaSense v3 — Data Pipeline')
    print('=' * 65)
    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    # -------------------------------------------------------
    # TASK 1: Dataset Audit
    # -------------------------------------------------------
    print('\n[TASK 1] Dataset Audit')
    print('-' * 50)
    df = pd.read_csv(CSV_PATH)
    print(f'  CSV          : {CSV_PATH}')
    print(f'  Total rows   : {len(df):,}')
    print(f'  Columns      : {df.columns.tolist()}')
    print()

    print('  --- Overall class distribution ---')
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        bar = '#' * int(pct / 2)
        print(f"    {lbl} {CLASS_NAMES.get(lbl,'?'):<12} : {cnt:>5}  ({pct:5.1f}%)  {bar}")
    max_cls = class_counts.max()
    min_cls = class_counts.min()
    print(f'\n  Imbalance ratio (max/min): {max_cls/min_cls:.1f}:1')
    print()

    print('  --- Per-dataset breakdown ---')
    per_ds = (df.groupby(['dataset', 'disease_label'])
                .size().reset_index(name='count'))
    print(per_ds.to_string(index=False))
    print()

    print('  --- Severity label distribution (APTOS only) ---')
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        label = 'N/A (ODIR)' if sev == -1 else f'Grade {sev}'
        print(f"    {sev:>3} ({label:<14}): {cnt:>5}")
    print()

    print('  --- Image path existence check ---')
    n_found = sum(
        1 for _, row in df.iterrows()
        if os.path.exists(resolve_image_path(row['image_path'],
                                             row['dataset']))
    )
    n_missing = len(df) - n_found
    print(f'    Total checked : {len(df):,}')
    print(f'    Found on disk : {n_found:,}')
    print(f'    Missing       : {n_missing:,}')
    print()

    # -------------------------------------------------------
    # TASK 2: Preprocessing Comparison
    # -------------------------------------------------------
    print('[TASK 2] Domain-Conditional Preprocessing Comparison')
    print('-' * 50)
    # Get representative samples from each dataset
    odir_rows = df[df['dataset'] == 'ODIR']
    aptos_rows = df[df['dataset'] == 'APTOS']
    odir_sample = odir_rows['image_path'].iloc[0] if len(odir_rows) else None
    aptos_sample = aptos_rows['image_path'].iloc[0] if len(aptos_rows) else None
    comp_path = make_preprocessing_comparison(
        odir_raw_path=odir_sample,
        aptos_raw_path=aptos_sample
    )

    # Demo: process a few images to verify pipeline
    def _count_ok(rows, tag):
        """Count how many of the first 5 rows yield a correctly shaped array."""
        ok = 0
        for _, row in rows.head(5).iterrows():
            p = resolve_image_path(row['image_path'], tag)
            if not os.path.exists(p):
                continue
            arr = preprocess_image(p, tag)
            if arr is not None and arr.shape == (TARGET_SIZE, TARGET_SIZE, 3):
                ok += 1
        return ok

    print('\n  --- Pipeline verification (5 ODIR + 5 APTOS) ---')
    ok_odir = _count_ok(odir_rows, 'ODIR')
    ok_aptos = _count_ok(aptos_rows, 'APTOS')
    print(f'    ODIR  (CLAHE)      : {ok_odir}/5 OK')
    print(f'    APTOS (Ben Graham) : {ok_aptos}/5 OK')
    print()

    # -------------------------------------------------------
    # TASK 3: Stratified Split
    # -------------------------------------------------------
    print('[TASK 3] 3-Way Stratified Split (70 / 15 / 15)')
    print('-' * 50)
    train_df, calib_df, test_df = create_stratified_split(df)
    save_splits(train_df, calib_df, test_df)
    split_report = print_split_stats(train_df, calib_df, test_df, CLASS_NAMES)
    print()

    # -------------------------------------------------------
    # TASK 4: Normalization Statistics (training split only)
    # -------------------------------------------------------
    print('[TASK 4] Fundus Normalization Statistics (training split)')
    print('-' * 50)
    norm_stats = compute_norm_stats(train_df)
    print()

    # -------------------------------------------------------
    # TASK 5: Additional Dataset Search
    # -------------------------------------------------------
    print('[TASK 5] Additional Dataset Search')
    print('-' * 50)
    findings = search_additional_datasets()
    if findings:
        print(f'  Found {len(findings)} dataset directories:')
        for path, info in findings.items():
            print(f'    {path}')
            print(f'      images: {info["img_count"]:,}  '
                  f'files: {info["total_files"]:,}  '
                  f'matched: "{info["matched_target"]}"')
    else:
        print('  No additional datasets found.')
    print()

    # Summary of what needs downloading.
    # ``matched_target`` holds the lowercase search keyword (for example
    # 'ichallenge' or 'rimone'), never the display name, so each dataset
    # is looked up through the keywords that identify it. Comparing the
    # upper-cased keyword with the display name directly would always
    # report iChallenge-AMD and RIM-ONE as missing.
    dataset_keywords = {
        'REFUGE2': {'refuge2'},
        'ICHALLENGE-AMD': {'ichallenge'},
        'RIM-ONE': {'rim-one', 'rimone'},
    }
    matched_keywords = {info['matched_target'].lower()
                        for info in findings.values()}
    missing_sets = {name for name, keywords in dataset_keywords.items()
                    if not (keywords & matched_keywords)}
    if missing_sets:
        print(f'  Datasets NOT found (need downloading): {missing_sets}')

    # -------------------------------------------------------
    # Write report
    # -------------------------------------------------------
    _write_report(df, train_df, calib_df, test_df, norm_stats,
                  findings, split_report, comp_path)
    print('\n' + '=' * 65)
    print('  All tasks complete.')
    print('=' * 65)
    return df, train_df, calib_df, test_df, norm_stats
+# =========================================================
+# REPORT WRITER
+# =========================================================
+def _write_report(df, train_df, calib_df, test_df, norm_stats,
+                  dataset_findings, split_report, comp_path):
+    """Save data_engineer_report.md to ./data/"""
+    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
+                   3: 'Cataract', 4: 'AMD'}
+    n_found = sum(
+        1 for _, row in df.iterrows()
+        if os.path.exists(resolve_image_path(row['image_path'], row['dataset']))
+    )
+    lines = [
+        '# RetinaSense v3 — Data Engineer Report',
+        f'Generated: 2026-03-06',
+        '',
+        '---',
+        '',
+        '## 1. Dataset Statistics',
+        '',
+        f'**Source CSV:** `data/combined_dataset.csv`  ',
+        f'**Total images in CSV:** {len(df):,}  ',
+        f'**Images found on disk:** {n_found:,} / {len(df):,}  ',
+        '',
+        '### Source breakdown',
+        '',
+        '| Dataset | Count | Labels present |',
+        '|---------|-------|----------------|',
+    ]
+    for ds, grp in df.groupby('dataset'):
+        labels = sorted(grp['disease_label'].unique())
+        label_str = ', '.join(f'{l}={CLASS_NAMES[l]}' for l in labels)
+        lines.append(f'| {ds} | {len(grp):,} | {label_str} |')
+    lines += [
+        '',
+        '### Class distribution (full dataset)',
+        '',
+        '| Label | Class | Count | % |',
+        '|-------|-------|-------|---|',
+    ]
+    for lbl, cnt in df['disease_label'].value_counts().sort_index().items():
+        pct = cnt / len(df) * 100
+        lines.append(
+            f'| {lbl} | {CLASS_NAMES[lbl]} | {cnt:,} | {pct:.1f}% |'
+        )
+    max_cls = df['disease_label'].value_counts().max()
+    min_cls = df['disease_label'].value_counts().min()
+    lines += [
+        '',
+        f'**Imbalance ratio (Diabetes:AMD):** {max_cls/min_cls:.1f}:1',
+        '',
+        '### Severity label distribution (APTOS DR grades, -1 = ODIR no grade)',
+        '',
+        '| Severity | Meaning | Count |',
+        '|----------|---------|-------|',
+    ]
+    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
+        meaning = 'N/A (ODIR, no grade)' if sev == -1 else f'DR Grade {sev}'
+        lines.append(f'| {sev} | {meaning} | {cnt:,} |')
+    lines += [
+        '',
+        '---',
+        '',
+        '## 2. Image Path Resolution',
+        '',
+        '| Dataset | CSV path format | Actual location |',
+        '|---------|-----------------|-----------------|',
+        '| ODIR  | `.//odir/preprocessed_images/<name>.jpg` | `odir/preprocessed_images/<name>.jpg` |',
+        '| APTOS | `.//aptos/train_images/<id>.png` (train_images does NOT exist) | `aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png` |',
+        '',
+        '`train_images/` directory is absent; actual APTOS images are stored under',
+        '`gaussian_filtered_images/gaussian_filtered_images/<DR_grade>/`. The',
+        '`aptos/train.csv` maps `id_code` → `diagnosis` (0-4) enabling lookup.',
+        '',
+        '---',
+        '',
+        '## 3. Preprocessing: Domain-Conditional Pipeline',
+        '',
+        '**Problem:** Previous versions applied Ben Graham enhancement uniformly to',
+        'ALL images. This is incorrect: ODIR images have already-enhanced or',
+        'clinical-quality appearance; applying Ben Graham degrades them.',
+        '',
+        '**Fix:** Source-conditional dispatch in `preprocess_image(path, source)`.',
+        '',
+        '| Source | Method | Rationale |',
+        '|--------|--------|-----------|',
+        '| APTOS  | Ben Graham (4×img − 4×blur(σ=10) + 128 + circular mask) | Field camera images have vignetting and low local contrast. Ben Graham removes low-frequency illumination and amplifies vessel/lesion detail. |',
+        '| ODIR   | CLAHE (L-channel, clip=2.0, tile=8×8, circular mask) | Multi-source clinical images. CLAHE normalizes local contrast while preserving sharpness and avoiding Ben Graham over-processing. |',
+        '| REFUGE2 | Resize only (224×224) | Zeiss Visucam 500 — already standardized high-quality. |',
+        '',
+        f'**Comparison figure:** `{comp_path}`',
+        '',
+        '**Cache location:** `preprocessed_cache_v3/<stem>_v3.npy`  ',
+        '**Cache key:** image filename stem (not row index)',
+        '',
+        '---',
+        '',
+        '## 4. Normalization Statistics',
+        '',
+        '**Method:** One pass over training split pixels (post-preprocessing).',
+        'No validation or test images used.',
+        '',
+        f'| Channel | Mean | Std |',
+        f'|---------|------|-----|',
+        f'| R (red)   | {norm_stats["mean_rgb"][0]:.4f} | {norm_stats["std_rgb"][0]:.4f} |',
+        f'| G (green) | {norm_stats["mean_rgb"][1]:.4f} | {norm_stats["std_rgb"][1]:.4f} |',
+        f'| B (blue)  | {norm_stats["mean_rgb"][2]:.4f} | {norm_stats["std_rgb"][2]:.4f} |',
+        '',
+        f'**Images used:** {norm_stats["n_images"]:,}  ',
+        f'**Note:** {norm_stats["note"]}  ',
+        f'**Source:** `{norm_stats["source"]}`',
+    ]
+    if norm_stats['source'] == 'computed_training_split':
+        lines += [
+            '',
+            'Expected pattern for fundus images: R > G > B (red-dominant)',
+            'due to high hemoglobin absorption. Computed values should match',
+            'expected ≈ [0.41, 0.27, 0.19] mean, [0.28, 0.19, 0.16] std.',
+        ]
+    lines += [
+        '',
+        '**Saved to:** `data/fundus_norm_stats.json`',
+        '',
+        '---',
+        '',
+        '## 5. Stratified Split (70 / 15 / 15)',
+        '',
+        '**Strategy:** `sklearn.model_selection.train_test_split` with',
+        '`stratify=disease_label`, `random_state=42`.',
+        '',
+        '**Files:**',
+        '- `data/train_split.csv` — 70% training',
+        '- `data/calib_split.csv` — 15% calibration (temperature scaling)',
+        '- `data/test_split.csv`  — 15% held-out evaluation',
+        '',
+    ]
+    lines.append(split_report.replace('\n', '\n'))
+    lines += [
+        '',
+        '---',
+        '',
+        '## 6. Additional Dataset Search',
+        '',
+    ]
+    if dataset_findings:
+        lines.append('### Found directories:')
+        lines.append('')
+        lines.append('| Path | Images | Files | Matched |')
+        lines.append('|------|--------|-------|---------|')
+        for path, info in dataset_findings.items():
+            lines.append(
+                f'| `{path}` | {info["img_count"]:,} | '
+                f'{info["total_files"]:,} | {info["matched_target"]} |'
+            )
+    else:
+        lines.append('No additional dataset directories found.')
+    lines += [
+        '',
+        '### Availability summary',
+        '',
+        '| Dataset | Status | Location |',
+        '|---------|--------|----------|',
+        '| ODIR-5K (ODIR) | **AVAILABLE** | `odir/preprocessed_images/` (4,878 images in CSV) |',
+        '| ODIR-5K raw    | **AVAILABLE** | `odir/ODIR-5K/ODIR-5K/Training Images/` (7,000) + Testing (1,000) |',
+        '| APTOS 2019     | **AVAILABLE** | `aptos/gaussian_filtered_images/` (3,662 images) |',
+        '| ocular-disease-recognition-odir5k | Partial (1 image only) | `ocular-disease-recognition-odir5k/preprocessed_images/` |',
+        '| REFUGE2        | **NOT FOUND** | Needs download |',
+        '| iChallenge-AMD | **NOT FOUND** | Needs download |',
+        '| RIM-ONE        | **NOT FOUND** | Needs download |',
+        '',
+        '### AMD / Glaucoma specific images (beyond CSV)',
+        '',
+        f'- ODIR provides {len(df[df["disease_label"]==2]):,} Glaucoma and '
+        f'{len(df[df["disease_label"]==4]):,} AMD images from '
+        f'`odir/preprocessed_images/`.',
+        '- ODIR raw training set (7,000 images) may contain additional',
+        '  AMD/Glaucoma cases not yet extracted — check `odir/full_df.csv`.',
+        '- For specialized Glaucoma detection: REFUGE2 (400 images,',
+        '  Magrabia population) and RIM-ONE (159 images) are recommended.',
+        '- For AMD: iChallenge-AMD (400 images) is the standard benchmark.',
+        '',
+        '---',
+        '',
+        '## 7. Action Items',
+        '',
+        '1. **Download missing datasets** to improve minority class coverage:',
+        '   - REFUGE2: https://refuge.grand-challenge.org/',
+        '   - RIM-ONE: http://medimrg.webs.ull.es/research/retinal-imaging/rim-one/',
+        '   - iChallenge-AMD: https://amd.grand-challenge.org/',
+        '2. **Fix paths in combined_dataset.csv**: update `aptos/train_images/` →',
+        '   actual `gaussian_filtered_images/.../` paths.',
+        '3. **Run full cache build** when training: `python retinasense_v3_preprocessing.py --cache-all`',
+        '4. **Use computed normalization stats** from `data/fundus_norm_stats.json`',
+        '   instead of ImageNet stats.',
+        '5. **Address 21:1 class imbalance**: consider weighted sampling or',
+        '   oversampling minority classes (AMD=265, Glaucoma=308).',
+    ]
+    report_path = os.path.join(DATA_DIR, 'data_engineer_report.md')
+    with open(report_path, 'w') as f:
+        f.write('\n'.join(lines) + '\n')
+    print(f'[OK] Report saved: {report_path}')
+# Script entry point: run the pipeline only when this file is executed
+# directly, not when it is imported as a module.
+if __name__ == '__main__':
+    main()