| |
| """ |
| RetinaSense v3 — Domain-Conditional Preprocessing Pipeline |
| =========================================================== |
| Implements source-aware preprocessing: |
| - APTOS -> Ben Graham enhancement (high contrast DR-specific pipeline) |
| - ODIR -> CLAHE only (preserves sharpness, normalizes contrast) |
| - REFUGE2 -> Resize only (images already clinical-grade high quality) |
| |
| Image path resolution: |
| - ODIR: odir/preprocessed_images/<filename> |
| - APTOS: aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png |
| (looked up from aptos/train.csv; aptos/train_images/ does NOT exist) |
| |
| Cache format: ./preprocessed_cache_v3/<stem>_v3.npy |
| """ |
|
|
| import os |
| import sys |
| import json |
| import warnings |
| import numpy as np |
| import pandas as pd |
| import cv2 |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| from pathlib import Path |
| from tqdm import tqdm |
|
|
| warnings.filterwarnings('ignore') |
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Filesystem layout (everything is anchored at BASE_DIR)
# ---------------------------------------------------------------------------
BASE_DIR = '/teamspace/studios/this_studio'
DATA_DIR = os.path.join(BASE_DIR, 'data')
CSV_PATH = os.path.join(DATA_DIR, 'combined_dataset.csv')
CACHE_DIR = os.path.join(BASE_DIR, 'preprocessed_cache_v3')

# ODIR images are read from a single flat directory.
ODIR_IMG_DIR = os.path.join(BASE_DIR, 'odir', 'preprocessed_images')

# APTOS images sit in one sub-folder per diagnosis grade; train.csv supplies
# the id_code -> diagnosis mapping needed to pick the right sub-folder.
APTOS_CSV = os.path.join(BASE_DIR, 'aptos', 'train.csv')
APTOS_IMG_BASE = os.path.join(
    BASE_DIR, 'aptos', 'gaussian_filtered_images', 'gaussian_filtered_images'
)
APTOS_DIAG_MAP = dict(enumerate(
    ('No_DR', 'Mild', 'Moderate', 'Severe', 'Proliferate_DR')
))

# Fallback ODIR image used by the comparison figure when no CSV sample resolves.
ODIR_SAMPLE = os.path.join(
    BASE_DIR, 'ocular-disease-recognition-odir5k',
    'preprocessed_images', '2977_left.jpg'
)

# Output directories are created at import time so later writes cannot fail
# on a missing folder.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Side length (pixels) of the square images produced by every pipeline.
TARGET_SIZE = 224
|
|
| |
| |
| |
| |
|
|
def _build_aptos_lookup() -> dict:
    """Build the APTOS ``id_code`` -> image path table from ``APTOS_CSV``.

    Each image is expected under ``APTOS_IMG_BASE/<grade folder>/<id>.png``,
    where the grade folder comes from ``APTOS_DIAG_MAP`` (unknown grades fall
    back to ``'No_DR'``). Returns an empty dict when the CSV does not exist.
    """
    table = {}
    if os.path.exists(APTOS_CSV):
        records = pd.read_csv(APTOS_CSV)
        for _, rec in records.iterrows():
            grade_dir = APTOS_DIAG_MAP.get(int(rec['diagnosis']), 'No_DR')
            code = str(rec['id_code'])
            table[code] = os.path.join(APTOS_IMG_BASE, grade_dir, code + '.png')
    return table


# Built once at import so per-row path resolution is a plain dict lookup.
_APTOS_LOOKUP: dict = _build_aptos_lookup()
|
|
|
|
| |
| |
| |
|
|
def resolve_image_path(raw_path: str, dataset: str = None) -> str:
    """
    Turn an ``image_path`` entry from the combined CSV into a filesystem path.

    CSV entries look like:
        ODIR:  .//odir/preprocessed_images/0_left.jpg
        APTOS: .//aptos/train_images/000c1434d8d7.png  (directory is absent)

    Resolution order:
        1. Strip a leading ``.//`` or ``./``; join relative paths onto
           BASE_DIR. If that path exists, return it.
        2. ODIR (by ``dataset`` or 'odir' in the path): return
           ODIR_IMG_DIR/<filename> without checking that it exists.
        3. APTOS (by ``dataset`` or 'aptos' in the path): return the
           _APTOS_LOOKUP entry for the filename stem, if there is one.
        4. Try <filename> directly inside ODIR_IMG_DIR, then APTOS_IMG_BASE.
        5. Otherwise return the path from step 1 (which does not exist).

    Callers must therefore still test the result with ``os.path.exists``.
    """
    rel = raw_path.strip()
    for prefix in ('.//', './'):
        if rel.startswith(prefix):
            rel = rel[len(prefix):]
            break

    direct = rel if os.path.isabs(rel) else os.path.join(BASE_DIR, rel)
    if os.path.exists(direct):
        return direct

    filename = os.path.basename(rel)
    rel_lower = rel.lower()
    source = (dataset or '').upper().strip()

    if source == 'ODIR' or 'odir' in rel_lower:
        return os.path.join(ODIR_IMG_DIR, filename)

    if source == 'APTOS' or 'aptos' in rel_lower:
        stem = os.path.splitext(filename)[0]
        if stem in _APTOS_LOOKUP:
            return _APTOS_LOOKUP[stem]

    # Last resort: the bare filename inside each known image directory.
    for folder in (ODIR_IMG_DIR, APTOS_IMG_BASE):
        guess = os.path.join(folder, filename)
        if os.path.exists(guess):
            return guess

    return direct
|
|
|
|
| |
| |
| |
|
|
def _load_image(image_path: str):
    """Read ``image_path`` with OpenCV and return it in RGB channel order.

    Returns None when ``cv2.imread`` cannot read the file.
    """
    bgr = cv2.imread(image_path)
    # cv2.imread yields BGR; the rest of the pipeline works in RGB.
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
|
|
|
|
def _crop_black_borders(img: np.ndarray, tol: int = 7) -> np.ndarray:
    """Crop ``img`` to the bounding box of pixels whose grey level exceeds ``tol``.

    The input is returned unchanged when no pixel is brighter than ``tol``.
    """
    bright = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
    row_idx = np.flatnonzero(bright.any(axis=1))
    col_idx = np.flatnonzero(bright.any(axis=0))
    if row_idx.size == 0 or col_idx.size == 0:
        return img
    top, bottom = row_idx[0], row_idx[-1]
    left, right = col_idx[0], col_idx[-1]
    return img[top:bottom + 1, left:right + 1]
|
|
|
|
def _apply_circular_mask(img: np.ndarray) -> np.ndarray:
    """Black out everything outside a centred disc of radius 0.48 * min(H, W)."""
    height, width = img.shape[:2]
    centre = (width // 2, height // 2)
    radius = int(min(height, width) * 0.48)
    disc = np.zeros((height, width), dtype=np.uint8)
    cv2.circle(disc, centre, radius, 255, -1)  # thickness -1 => filled
    return cv2.bitwise_and(img, img, mask=disc)
|
|
|
|
def ben_graham_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                          sigma: float = 10.0) -> np.ndarray:
    """
    Ben Graham style enhancement (the pipeline dispatched for APTOS images).

    Steps:
        1. Crop dark borders.
        2. Resize to ``target_size`` x ``target_size`` (INTER_AREA).
        3. Subtract a Gaussian-blurred copy and re-centre around 128:
           ``result = 4*img - 4*GaussianBlur(img, sigma) + 128``.
        4. Apply the circular field-of-view mask.

    Returns a uint8 array.
    """
    cropped = _crop_black_borders(img)
    resized = cv2.resize(cropped, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    # Kernel size (0, 0) lets OpenCV derive it from sigma.
    background = cv2.GaussianBlur(resized, (0, 0), sigma)
    enhanced = cv2.addWeighted(resized, 4, background, -4, 128)
    masked = _apply_circular_mask(enhanced)
    return np.clip(masked, 0, 255).astype(np.uint8)
|
|
|
|
def clahe_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                     clip_limit: float = 2.0,
                     tile_grid: tuple = (8, 8)) -> np.ndarray:
    """
    CLAHE contrast normalisation (the pipeline dispatched for ODIR images).

    Steps:
        1. Crop dark borders.
        2. Resize to ``target_size`` x ``target_size`` (INTER_AREA).
        3. Convert RGB -> LAB and equalise only the L channel with CLAHE
           (``clip_limit``, ``tile_grid``); the A and B channels are untouched.
        4. Convert back to RGB and apply the circular field-of-view mask.

    Returns a uint8 array.
    """
    cropped = _crop_black_borders(img)
    resized = cv2.resize(cropped, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)

    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(resized,
                                                       cv2.COLOR_RGB2LAB))
    equaliser = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    merged = cv2.merge([equaliser.apply(lightness), chan_a, chan_b])

    rgb = cv2.cvtColor(merged, cv2.COLOR_LAB2RGB)
    masked = _apply_circular_mask(rgb)
    return np.clip(masked, 0, 255).astype(np.uint8)
|
|
|
|
def resize_only_preprocess(img: np.ndarray,
                           target_size: int = TARGET_SIZE) -> np.ndarray:
    """
    Minimal pipeline (dispatched for REFUGE2 images): resize only.

    The image is resized to ``target_size`` x ``target_size`` with INTER_AREA
    interpolation; no cropping, enhancement or masking is applied.
    Returns a uint8 array.
    """
    resized = cv2.resize(img, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    return np.clip(resized, 0, 255).astype(np.uint8)
|
|
|
|
def preprocess_image(image_path: str, source: str,
                     target_size: int = TARGET_SIZE):
    """
    Load an image and run the preprocessing pipeline that matches its source.

    Parameters
    ----------
    image_path : str
        Path of the fundus image file to load.
    source : str
        Dataset source, compared case-insensitively after stripping:
        'APTOS' -> ben_graham_preprocess, 'ODIR' -> clahe_preprocess,
        'REFUGE2' -> resize_only_preprocess. Any other value prints a
        warning and falls back to CLAHE.
    target_size : int
        Side length of the square output. Default TARGET_SIZE.

    Returns
    -------
    The uint8 array produced by the selected pipeline, or None if the image
    cannot be loaded.
    """
    rgb = _load_image(image_path)
    if rgb is None:
        return None

    pipelines = {
        'APTOS': ben_graham_preprocess,
        'ODIR': clahe_preprocess,
        'REFUGE2': resize_only_preprocess,
    }
    pipeline = pipelines.get(source.upper().strip())
    if pipeline is None:
        print(f'[WARN] Unknown source "{source}", applying CLAHE fallback.')
        pipeline = clahe_preprocess
    return pipeline(rgb, target_size=target_size)
|
|
|
|
| |
| |
| |
|
|
def cache_path_for(raw_csv_path: str) -> str:
    """Map a CSV ``image_path`` entry to its ``<stem>_v3.npy`` file in CACHE_DIR.

    Only the filename stem is used as the key, so two entries sharing a stem
    map to the same cache file.
    """
    cache_name = Path(raw_csv_path).stem + '_v3.npy'
    return os.path.join(CACHE_DIR, cache_name)
|
|
|
|
def is_cached(raw_csv_path: str) -> bool:
    """Return True when a cache file already exists for this CSV entry."""
    target = cache_path_for(raw_csv_path)
    return os.path.exists(target)
|
|
|
|
def save_to_cache(raw_csv_path: str, arr: np.ndarray) -> None:
    """Write ``arr`` to the cache file that belongs to this CSV entry."""
    target = cache_path_for(raw_csv_path)
    np.save(target, arr)
|
|
|
|
def load_from_cache(raw_csv_path: str):
    """Return the cached array for this CSV entry, or None if none exists."""
    target = cache_path_for(raw_csv_path)
    if not os.path.exists(target):
        return None
    return np.load(target)
|
|
|
|
def cache_dataset(df: pd.DataFrame) -> dict:
    """
    Preprocess every row of ``df`` with the source-specific pipeline and
    store the result in the cache.

    ``df`` must provide ``image_path`` and ``dataset`` columns. Returns a
    dict of counters: processed, skipped_missing, already_cached, errors,
    total.
    """
    counts = {
        'processed': 0,
        'skipped_missing': 0,
        'already_cached': 0,
        'errors': 0,
        'total': len(df),
    }

    def handle(raw, src):
        """Process one row and return the name of the counter to bump."""
        if is_cached(raw):
            return 'already_cached'
        resolved = resolve_image_path(raw, src)
        if not os.path.exists(resolved):
            return 'skipped_missing'
        result = preprocess_image(resolved, src)
        if result is None:
            return 'errors'
        save_to_cache(raw, result)
        return 'processed'

    for _, record in tqdm(df.iterrows(), total=len(df), desc='Caching v3'):
        counts[handle(record['image_path'], record['dataset'])] += 1

    return counts
|
|
|
|
| |
| |
| |
|
|
def make_preprocessing_comparison(
        save_path: str = None,
        odir_raw_path: str = None,
        aptos_raw_path: str = None) -> str:
    """
    Generate and save a side-by-side comparison PNG showing
    ODIR (CLAHE) vs APTOS (Ben Graham) preprocessing pipelines.

    Parameters
    ----------
    save_path : str, optional
        Output PNG path. Defaults to DATA_DIR/preprocessing_comparison_v3.png.
    odir_raw_path, aptos_raw_path : str, optional
        CSV-style image paths to use as samples. When a sample cannot be
        resolved, a known file is tried (ODIR_SAMPLE, then any image in
        ODIR_IMG_DIR; the first _APTOS_LOOKUP entry for APTOS). If that fails
        as well, a synthetic image is drawn so the figure is always produced.

    Returns the saved PNG path.
    """
    if save_path is None:
        save_path = os.path.join(DATA_DIR, 'preprocessing_comparison_v3.png')

    # --- pick an ODIR sample -------------------------------------------------
    odir_path = None
    if odir_raw_path:
        odir_path = resolve_image_path(odir_raw_path, 'ODIR')
    if odir_path is None or not os.path.exists(odir_path):
        odir_path = ODIR_SAMPLE
        if not os.path.exists(odir_path):
            odir_path = None
            # Only list the directory when it exists; otherwise fall through
            # to the synthetic image instead of raising FileNotFoundError.
            if os.path.isdir(ODIR_IMG_DIR):
                candidates = sorted(
                    f for f in os.listdir(ODIR_IMG_DIR)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
                )
                if candidates:
                    odir_path = os.path.join(ODIR_IMG_DIR, candidates[0])

    # --- pick an APTOS sample ------------------------------------------------
    aptos_path = None
    if aptos_raw_path:
        aptos_path = resolve_image_path(aptos_raw_path, 'APTOS')
    if aptos_path is None or not os.path.exists(aptos_path):
        if _APTOS_LOOKUP:
            aptos_path = next(iter(_APTOS_LOOKUP.values()))

    def get_or_synthetic(path, name):
        """Load ``path``; if that is impossible, draw a synthetic stand-in."""
        if path and os.path.exists(path):
            img = _load_image(path)
            if img is not None:
                return img, path
        print(f'[WARN] {name} sample not found, using synthetic.')
        h, w = 512, 512
        # Private generator: same seed-42 sequence as before, but the global
        # NumPy RNG state is left untouched.
        rng = np.random.RandomState(42)
        base = np.zeros((h, w, 3), dtype=np.uint8)
        cx, cy = w // 2, h // 2
        r = int(min(h, w) * 0.48)
        cv2.circle(base, (cx, cy), r, (60, 40, 25), -1)
        for _ in range(30):
            pt1 = (cx + rng.randint(-r, r), cy + rng.randint(-r, r))
            pt2 = (cx + rng.randint(-r, r), cy + rng.randint(-r, r))
            cv2.line(base, pt1, pt2, (100, 60, 35), 1)
        base = base.astype(np.float32) + rng.normal(0, 6, base.shape)
        return np.clip(base, 0, 255).astype(np.uint8), '(synthetic)'

    def amplified_diff(before, after, gain=3):
        """Absolute difference scaled by ``gain``, saturating at 255."""
        # Widen before multiplying: uint8 * 3 would wrap modulo 256 and the
        # clip afterwards could not recover the lost high values.
        delta = cv2.absdiff(before, after).astype(np.uint16)
        return np.clip(delta * gain, 0, 255).astype(np.uint8)

    odir_orig, odir_src = get_or_synthetic(odir_path, 'ODIR')
    aptos_orig, aptos_src = get_or_synthetic(aptos_path, 'APTOS')

    # Originals resized for display only (no cropping or enhancement).
    odir_disp = cv2.resize(odir_orig, (TARGET_SIZE, TARGET_SIZE),
                           interpolation=cv2.INTER_AREA)
    aptos_disp = cv2.resize(aptos_orig, (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)

    odir_clahe = clahe_preprocess(odir_orig.copy())
    aptos_graham = ben_graham_preprocess(aptos_orig.copy())

    diff_odir = amplified_diff(odir_disp, odir_clahe)
    diff_aptos = amplified_diff(aptos_disp, aptos_graham)

    # --- figure --------------------------------------------------------------
    fig, axes = plt.subplots(2, 3, figsize=(16, 11))
    fig.patch.set_facecolor('#1a1a2e')
    fig.suptitle(
        'RetinaSense v3 — Domain-Conditional Preprocessing\n'
        'ODIR: CLAHE Pipeline | APTOS: Ben Graham Pipeline',
        fontsize=13, fontweight='bold', color='white', y=1.01
    )

    # (row, col, image, title, title-box colour)
    panels = [
        (0, 0, odir_disp, f'ODIR: Original\n({os.path.basename(str(odir_src))})',
         '#1565C0'),
        (0, 1, odir_clahe, 'ODIR: After CLAHE\n(L-channel equalization, circular mask)',
         '#0D47A1'),
        (0, 2, diff_odir, 'ODIR: Difference x3\n(|original - CLAHE|, amplified)',
         '#263238'),
        (1, 0, aptos_disp, f'APTOS: Original\n({os.path.basename(str(aptos_src))})',
         '#BF360C'),
        (1, 1, aptos_graham, 'APTOS: After Ben Graham\n(4*img - 4*blur(σ=10) + 128)',
         '#870000'),
        (1, 2, diff_aptos, 'APTOS: Difference x3\n(|original - Ben Graham|, amplified)',
         '#1B5E20'),
    ]

    for row_i, col_i, img_arr, title, fc in panels:
        ax = axes[row_i, col_i]
        ax.imshow(img_arr)
        ax.set_title(title, fontsize=9, color='white', pad=5,
                     bbox=dict(boxstyle='round,pad=0.3', facecolor=fc,
                               alpha=0.85, edgecolor='none'))
        ax.axis('off')
        for spine in ax.spines.values():
            spine.set_visible(False)

    odir_note = (
        'ODIR Pipeline\n'
        '━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. Convert RGB→LAB\n'
        '4. CLAHE on L channel\n'
        ' clip=2.0, tile=8×8\n'
        '5. LAB→RGB\n'
        '6. Circular mask (r=0.48)'
    )
    aptos_note = (
        'APTOS Pipeline (Ben Graham)\n'
        '━━━━━━━━━━━━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. blur = GaussianBlur(σ=10)\n'
        '4. out = 4×img − 4×blur + 128\n'
        '5. Circular mask (r=0.48)\n'
        '6. clip to [0, 255]'
    )

    fig.text(0.02, 0.92, odir_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#1565C0', alpha=0.6))
    fig.text(0.02, 0.48, aptos_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#870000', alpha=0.6))

    # Leave the left 18% of the canvas free for the two pipeline notes.
    plt.tight_layout(rect=[0.18, 0, 1, 1])
    plt.savefig(save_path, dpi=150, bbox_inches='tight',
                facecolor='#1a1a2e', edgecolor='none')
    # Close this specific figure rather than whatever happens to be current.
    plt.close(fig)
    print(f'[OK] Comparison saved: {save_path}')
    return save_path
|
|
|
|
| |
| |
| |
|
|
def compute_norm_stats(train_df: pd.DataFrame,
                       out_path: str = None,
                       max_images: int = None) -> dict:
    """
    Compute per-channel mean and std over all pixels of the given images
    after domain-conditional preprocessing, and save them as JSON.

    Pass the training split only, so that no validation/test data leaks into
    the statistics.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must provide ``image_path`` and ``dataset`` columns.
    out_path : str, optional
        JSON output path. Defaults to DATA_DIR/fundus_norm_stats.json.
    max_images : int, optional
        If given, statistics are computed on a random sample of at most this
        many rows (fixed seed 42).

    Returns
    -------
    dict with mean_rgb, std_rgb, n_images, n_pixels_per_channel, n_missing,
    note and source. When no image could be loaded, ImageNet mean/std are
    stored instead and ``source`` is 'imagenet_fallback'.
    """
    if out_path is None:
        out_path = os.path.join(DATA_DIR, 'fundus_norm_stats.json')

    # Nothing below mutates the frame, so no defensive copy is required.
    df = train_df
    if max_images is not None:
        df = df.sample(min(max_images, len(df)), random_state=42)

    # Running sums of x and x^2 per channel give mean/std in a single pass.
    ch_sum = np.zeros(3, dtype=np.float64)
    ch_sq_sum = np.zeros(3, dtype=np.float64)
    n_pixels = 0
    n_images = 0
    n_missing = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc='Norm stats'):
        raw = row['image_path']
        src = row['dataset']

        # Prefer the cached array; otherwise preprocess on the fly.
        arr = load_from_cache(raw)
        if arr is None:
            abs_path = resolve_image_path(raw, src)
            if not os.path.exists(abs_path):
                n_missing += 1
                continue
            arr = preprocess_image(abs_path, src)
            if arr is None:
                n_missing += 1
                continue

        pixels = (arr.astype(np.float64) / 255.0).reshape(-1, 3)
        ch_sum += pixels.sum(axis=0)
        ch_sq_sum += (pixels ** 2).sum(axis=0)
        n_pixels += pixels.shape[0]
        n_images += 1

    if n_images == 0:
        print('[WARN] No images found — storing ImageNet defaults as fallback.')
        stats = {
            'mean_rgb': [0.485, 0.456, 0.406],
            'std_rgb': [0.229, 0.224, 0.225],
            'n_images': 0,
            'n_pixels_per_channel': 0,
            'n_missing': n_missing,
            'note': 'No images found — ImageNet defaults used as fallback',
            'source': 'imagenet_fallback'
        }
    else:
        mean = ch_sum / n_pixels
        var = ch_sq_sum / n_pixels - mean ** 2
        # Rounding error can push a tiny variance below zero; clamp first.
        std = np.sqrt(np.maximum(var, 0.0))
        stats = {
            'mean_rgb': [round(float(v), 6) for v in mean],
            'std_rgb': [round(float(v), 6) for v in std],
            'n_images': n_images,
            'n_pixels_per_channel': int(n_pixels),
            'n_missing': n_missing,
            'note': ('Computed on training split only after domain-conditional '
                     'preprocessing. Red-dominant channel expected (fundus tissue).'),
            'source': 'computed_training_split'
        }
        # Convert to plain floats before printing: NumPy >= 2 scalars would
        # otherwise show up as "np.float64(...)" inside the list.
        print(f' mean RGB : {[round(float(v), 4) for v in mean]}')
        print(f' std RGB : {[round(float(v), 4) for v in std]}')
        print(f' images : {n_images:,} | missing: {n_missing}')

    with open(out_path, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f'[OK] Stats saved: {out_path}')
    return stats
|
|
|
|
| |
| |
| |
|
|
def create_stratified_split(df: pd.DataFrame,
                            train_ratio: float = 0.70,
                            calib_ratio: float = 0.15,
                            test_ratio: float = 0.15,
                            random_state: int = 42) -> tuple:
    """
    Split ``df`` into train / calibration / test parts, stratified by the
    ``disease_label`` column.

    The split is done in two stages: train vs. hold-out first, then the
    hold-out is divided into calibration and test. The three ratios must sum
    to 1. Returns (train_df, calib_df, test_df), each with a fresh index.
    """
    from sklearn.model_selection import train_test_split as _tts
    assert abs(train_ratio + calib_ratio + test_ratio - 1.0) < 1e-9

    holdout_ratio = calib_ratio + test_ratio
    train_part, holdout_part = _tts(
        df,
        test_size=holdout_ratio,
        stratify=df['disease_label'],
        random_state=random_state,
    )

    # Share of the hold-out that goes to calibration; the rest is test.
    calib_frac = calib_ratio / holdout_ratio
    calib_part, test_part = _tts(
        holdout_part,
        test_size=(1.0 - calib_frac),
        stratify=holdout_part['disease_label'],
        random_state=random_state,
    )

    parts = (train_part, calib_part, test_part)
    return tuple(part.reset_index(drop=True) for part in parts)
|
|
|
|
def save_splits(train_df, calib_df, test_df, out_dir: str = DATA_DIR):
    """Write the three splits to train/calib/test ``*_split.csv`` in ``out_dir``."""
    outputs = (
        ('train_split.csv', train_df),
        ('calib_split.csv', calib_df),
        ('test_split.csv', test_df),
    )
    for filename, frame in outputs:
        frame.to_csv(os.path.join(out_dir, filename), index=False)
    print(f'[OK] Split CSVs saved to {out_dir}/')
|
|
|
|
def print_split_stats(train_df, calib_df, test_df,
                      class_names: dict = None) -> str:
    """
    Print and return a per-class table of the train/calib/test split sizes.

    Parameters
    ----------
    train_df, calib_df, test_df : pd.DataFrame
        The three splits; each must have a ``disease_label`` column.
    class_names : dict, optional
        Mapping label -> display name. Only labels present in this mapping
        get a row. Defaults to the five RetinaSense classes.

    Returns
    -------
    str
        The report text that was printed.

    Notes
    -----
    The per-split totals in the TOTAL row sum the listed classes only, while
    the grand total counts every row of the three frames. When all splits
    are empty the ratios are reported as 0.0% instead of raising
    ZeroDivisionError.
    """
    if class_names is None:
        class_names = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                       3: 'Cataract', 4: 'AMD'}

    n_train, n_calib, n_test = len(train_df), len(calib_df), len(test_df)
    total_n = n_train + n_calib + n_test

    def share(count):
        """Fraction of all rows, defined as 0 for an empty dataset."""
        return count / total_n if total_n else 0.0

    lines = [
        '',
        '=' * 62,
        ' STRATIFIED SPLIT — CLASS DISTRIBUTION',
        '=' * 62,
        f"{'Class':<16} {'Train':>8} {'Calib':>8} {'Test':>8} {'Total':>8}",
        '-' * 54,
    ]
    tr_tot = ca_tot = te_tot = 0
    for lbl in sorted(class_names):
        tr = int((train_df['disease_label'] == lbl).sum())
        ca = int((calib_df['disease_label'] == lbl).sum())
        te = int((test_df['disease_label'] == lbl).sum())
        tr_tot += tr
        ca_tot += ca
        te_tot += te
        lines.append(
            f"{class_names[lbl]:<16} {tr:>8,} {ca:>8,} {te:>8,} {tr + ca + te:>8,}"
        )
    lines += [
        '-' * 54,
        f"{'TOTAL':<16} {tr_tot:>8,} {ca_tot:>8,} {te_tot:>8,} {total_n:>8,}",
        '',
        f'Split sizes : train={n_train:,} calib={n_calib:,} '
        f'test={n_test:,}',
        f'Actual ratios: train={share(n_train):.1%} '
        f'calib={share(n_calib):.1%} '
        f'test={share(n_test):.1%}',
    ]
    report = '\n'.join(lines)
    print(report)
    return report
|
|
|
|
| |
| |
| |
|
|
def search_additional_datasets(search_roots=None, known_dirs=None) -> dict:
    """
    Scan the filesystem for REFUGE2, iChallenge-AMD, RIM-ONE and other
    AMD/Glaucoma-specific dataset folders.

    Parameters
    ----------
    search_roots : iterable of str, optional
        Directories to walk. A folder is reported when its lower-cased name
        contains one of the target tokens. Defaults to the studio workspace
        and the uploads directory.
    known_dirs : iterable of str, optional
        Directories that are always reported when they exist (with recursive
        file counts), unless the walk already reported them. Defaults to the
        three known dataset folders of the studio workspace.

    Returns
    -------
    dict
        ``{path: {'matched_target', 'img_count', 'total_files'}}``. For walked
        folders the counts cover the files directly inside the folder; for
        ``known_dirs`` they cover the whole subtree.
    """
    img_exts = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp'}
    # Order matters: the first token contained in the folder name wins, so
    # the more specific tokens ('refuge2', 'odir5k') come first.
    targets = ['refuge2', 'refuge', 'ichallenge', 'rim-one', 'rimone',
               'amd', 'glaucoma', 'odir5k', 'odir']
    skip_dirs = {'.git', '.cache', '.claude', '.ipython', '.npm',
                 '__pycache__', 'outputs_analysis', 'outputs_ensemble',
                 'outputs_optimized', 'outputs_production', 'outputs_v2',
                 'outputs_v2_extended', 'outputs_vit'}
    if search_roots is None:
        search_roots = ['/teamspace/studios/this_studio', '/teamspace/uploads']
    if known_dirs is None:
        known_dirs = [
            '/teamspace/studios/this_studio/ocular-disease-recognition-odir5k',
            '/teamspace/studios/this_studio/odir',
            '/teamspace/studios/this_studio/aptos',
        ]

    def count_images(filenames):
        """Number of names whose extension is a known image extension."""
        return sum(1 for name in filenames
                   if os.path.splitext(name)[1].lower() in img_exts)

    findings = {}

    for root_dir in search_roots:
        if not os.path.exists(root_dir):
            continue
        for dirpath, dirnames, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into skipped/hidden dirs.
            dirnames[:] = [d for d in dirnames
                           if d not in skip_dirs and not d.startswith('.')]
            folder = os.path.basename(dirpath).lower()
            matched = next((t for t in targets if t in folder), None)
            if matched is None:
                continue
            img_cnt = count_images(files)
            # Overlapping roots may visit a folder twice; keep the larger count.
            if (dirpath not in findings
                    or img_cnt > findings[dirpath]['img_count']):
                findings[dirpath] = {
                    'matched_target': matched,
                    'img_count': img_cnt,
                    'total_files': len(files)
                }

    for special in known_dirs:
        if os.path.exists(special) and special not in findings:
            # One walk yields both the image count and the total file count.
            img_cnt = 0
            total_files = 0
            for _, _, files in os.walk(special):
                img_cnt += count_images(files)
                total_files += len(files)
            findings[special] = {
                'matched_target': 'known_dataset',
                'img_count': img_cnt,
                'total_files': total_files
            }

    return findings
|
|
|
|
| |
| |
| |
|
|
def main():
    """
    Run the full data pipeline end to end.

    Tasks, in order:
        1. Audit the combined CSV (class / dataset / severity distributions,
           on-disk existence of every image).
        2. Render the preprocessing comparison figure and smoke-test both
           pipelines on up to five images per dataset.
        3. Create and save the 70/15/15 stratified split.
        4. Compute normalization statistics on the training split.
        5. Search the filesystem for additional datasets.
        6. Write the markdown report.

    Returns (df, train_df, calib_df, test_df, norm_stats).
    """
    print('=' * 65)
    print(' RetinaSense v3 — Data Pipeline')
    print('=' * 65)

    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    # ------------------------------------------------------------------
    # Task 1: dataset audit
    # ------------------------------------------------------------------
    print('\n[TASK 1] Dataset Audit')
    print('-' * 50)
    df = pd.read_csv(CSV_PATH)
    print(f' CSV : {CSV_PATH}')
    print(f' Total rows : {len(df):,}')
    print(f' Columns : {df.columns.tolist()}')
    print()

    print(' --- Overall class distribution ---')
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        bar = '#' * int(pct / 2)
        print(f" {lbl} {CLASS_NAMES.get(lbl,'?'):<12} : {cnt:>5} ({pct:5.1f}%) {bar}")

    max_cls = class_counts.max()
    min_cls = class_counts.min()
    print(f'\n Imbalance ratio (max/min): {max_cls/min_cls:.1f}:1')
    print()

    print(' --- Per-dataset breakdown ---')
    per_ds = (df.groupby(['dataset', 'disease_label'])
              .size().reset_index(name='count'))
    print(per_ds.to_string(index=False))
    print()

    print(' --- Severity label distribution (APTOS only) ---')
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        label = 'N/A (ODIR)' if sev == -1 else f'Grade {sev}'
        print(f" {sev:>3} ({label:<14}): {cnt:>5}")
    print()

    print(' --- Image path existence check ---')
    n_found = sum(
        1 for _, row in df.iterrows()
        if os.path.exists(resolve_image_path(row['image_path'], row['dataset']))
    )
    n_missing = len(df) - n_found
    print(f' Total checked : {len(df):,}')
    print(f' Found on disk : {n_found:,}')
    print(f' Missing : {n_missing:,}')
    print()

    # ------------------------------------------------------------------
    # Task 2: preprocessing comparison + smoke test
    # ------------------------------------------------------------------
    print('[TASK 2] Domain-Conditional Preprocessing Comparison')
    print('-' * 50)

    odir_rows = df[df['dataset'] == 'ODIR']
    aptos_rows = df[df['dataset'] == 'APTOS']
    odir_sample = odir_rows['image_path'].iloc[0] if len(odir_rows) > 0 else None
    aptos_sample = aptos_rows['image_path'].iloc[0] if len(aptos_rows) > 0 else None

    comp_path = make_preprocessing_comparison(
        odir_raw_path=odir_sample,
        aptos_raw_path=aptos_sample
    )

    def count_ok(rows, source):
        """How many of the first five rows preprocess to the expected shape."""
        ok = 0
        for _, row in rows.head(5).iterrows():
            p = resolve_image_path(row['image_path'], source)
            if os.path.exists(p):
                arr = preprocess_image(p, source)
                if arr is not None and arr.shape == (TARGET_SIZE, TARGET_SIZE, 3):
                    ok += 1
        return ok

    print('\n --- Pipeline verification (5 ODIR + 5 APTOS) ---')
    ok_odir = count_ok(odir_rows, 'ODIR')
    ok_aptos = count_ok(aptos_rows, 'APTOS')
    print(f' ODIR (CLAHE) : {ok_odir}/5 OK')
    print(f' APTOS (Ben Graham) : {ok_aptos}/5 OK')
    print()

    # ------------------------------------------------------------------
    # Task 3: stratified split
    # ------------------------------------------------------------------
    print('[TASK 3] 3-Way Stratified Split (70 / 15 / 15)')
    print('-' * 50)
    train_df, calib_df, test_df = create_stratified_split(df)
    save_splits(train_df, calib_df, test_df)
    split_report = print_split_stats(train_df, calib_df, test_df, CLASS_NAMES)
    print()

    # ------------------------------------------------------------------
    # Task 4: normalization statistics
    # ------------------------------------------------------------------
    print('[TASK 4] Fundus Normalization Statistics (training split)')
    print('-' * 50)
    norm_stats = compute_norm_stats(train_df)
    print()

    # ------------------------------------------------------------------
    # Task 5: additional dataset search
    # ------------------------------------------------------------------
    print('[TASK 5] Additional Dataset Search')
    print('-' * 50)
    findings = search_additional_datasets()
    if findings:
        print(f' Found {len(findings)} dataset directories:')
        for path, info in findings.items():
            print(f' {path}')
            print(f' images: {info["img_count"]:,} '
                  f'files: {info["total_files"]:,} '
                  f'matched: "{info["matched_target"]}"')
    else:
        print(' No additional datasets found.')
    print()

    # The search reports the lower-case token that matched the folder name
    # (e.g. 'ichallenge', 'rimone'), not the canonical dataset name, so each
    # canonical name is mapped to the tokens that count as "found".
    dataset_tokens = {
        'REFUGE2': {'refuge2'},
        'ICHALLENGE-AMD': {'ichallenge'},
        'RIM-ONE': {'rim-one', 'rimone'},
    }
    matched_tokens = {info['matched_target'] for info in findings.values()}
    missing_sets = {name for name, tokens in dataset_tokens.items()
                    if not tokens & matched_tokens}
    if missing_sets:
        print(f' Datasets NOT found (need downloading): {missing_sets}')

    # ------------------------------------------------------------------
    # Report
    # ------------------------------------------------------------------
    _write_report(df, train_df, calib_df, test_df, norm_stats,
                  findings, split_report, comp_path)

    print('\n' + '=' * 65)
    print(' All tasks complete.')
    print('=' * 65)
    return df, train_df, calib_df, test_df, norm_stats
|
|
|
|
| |
| |
| |
|
|
def _write_report(df, train_df, calib_df, test_df, norm_stats,
                  dataset_findings, split_report, comp_path):
    """
    Save ``data_engineer_report.md`` to DATA_DIR.

    Parameters
    ----------
    df : pd.DataFrame
        Full combined dataset (``image_path``, ``dataset``, ``disease_label``
        and ``severity_label`` columns are read).
    train_df, calib_df, test_df : pd.DataFrame
        The three splits (accepted for interface symmetry; not read here).
    norm_stats : dict
        Result of ``compute_norm_stats``.
    dataset_findings : dict
        Result of ``search_additional_datasets``.
    split_report : str
        Text table returned by ``print_split_stats``.
    comp_path : str
        Path of the preprocessing comparison figure.

    Notes
    -----
    Much of the report (availability table, action items) is static text;
    only the statistics sections are computed from the arguments.
    """
    from datetime import date

    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    n_found = sum(
        1 for _, row in df.iterrows()
        if os.path.exists(resolve_image_path(row['image_path'], row['dataset']))
    )

    lines = [
        '# RetinaSense v3 — Data Engineer Report',
        # Stamp the report with the day it was actually generated.
        f'Generated: {date.today().isoformat()}',
        '',
        '---',
        '',
        '## 1. Dataset Statistics',
        '',
        f'**Source CSV:** `data/combined_dataset.csv` ',
        f'**Total images in CSV:** {len(df):,} ',
        f'**Images found on disk:** {n_found:,} / {len(df):,} ',
        '',
        '### Source breakdown',
        '',
        '| Dataset | Count | Labels present |',
        '|---------|-------|----------------|',
    ]
    for ds, grp in df.groupby('dataset'):
        labels = sorted(grp['disease_label'].unique())
        # Unmapped labels are shown as '?' instead of aborting the report.
        label_str = ', '.join(
            f"{lbl}={CLASS_NAMES.get(lbl, '?')}" for lbl in labels
        )
        lines.append(f'| {ds} | {len(grp):,} | {label_str} |')

    lines += [
        '',
        '### Class distribution (full dataset)',
        '',
        '| Label | Class | Count | % |',
        '|-------|-------|-------|---|',
    ]
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        lines.append(
            f"| {lbl} | {CLASS_NAMES.get(lbl, '?')} | {cnt:,} | {pct:.1f}% |"
        )
    max_cls = class_counts.max()
    min_cls = class_counts.min()
    lines += [
        '',
        f'**Imbalance ratio (Diabetes:AMD):** {max_cls/min_cls:.1f}:1',
        '',
        '### Severity label distribution (APTOS DR grades, -1 = ODIR no grade)',
        '',
        '| Severity | Meaning | Count |',
        '|----------|---------|-------|',
    ]
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        meaning = 'N/A (ODIR, no grade)' if sev == -1 else f'DR Grade {sev}'
        lines.append(f'| {sev} | {meaning} | {cnt:,} |')

    lines += [
        '',
        '---',
        '',
        '## 2. Image Path Resolution',
        '',
        '| Dataset | CSV path format | Actual location |',
        '|---------|-----------------|-----------------|',
        '| ODIR | `.//odir/preprocessed_images/<name>.jpg` | `odir/preprocessed_images/<name>.jpg` |',
        '| APTOS | `.//aptos/train_images/<id>.png` (train_images does NOT exist) | `aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png` |',
        '',
        '`train_images/` directory is absent; actual APTOS images are stored under',
        '`gaussian_filtered_images/gaussian_filtered_images/<DR_grade>/`. The',
        '`aptos/train.csv` maps `id_code` → `diagnosis` (0-4) enabling lookup.',
        '',
        '---',
        '',
        '## 3. Preprocessing: Domain-Conditional Pipeline',
        '',
        '**Problem:** Previous versions applied Ben Graham enhancement uniformly to',
        'ALL images. This is incorrect: ODIR images have already-enhanced or',
        'clinical-quality appearance; applying Ben Graham degrades them.',
        '',
        '**Fix:** Source-conditional dispatch in `preprocess_image(path, source)`.',
        '',
        '| Source | Method | Rationale |',
        '|--------|--------|-----------|',
        '| APTOS | Ben Graham (4×img − 4×blur(σ=10) + 128 + circular mask) | Field camera images have vignetting and low local contrast. Ben Graham removes low-frequency illumination and amplifies vessel/lesion detail. |',
        '| ODIR | CLAHE (L-channel, clip=2.0, tile=8×8, circular mask) | Multi-source clinical images. CLAHE normalizes local contrast while preserving sharpness and avoiding Ben Graham over-processing. |',
        '| REFUGE2 | Resize only (224×224) | Zeiss Visucam 500 — already standardized high-quality. |',
        '',
        f'**Comparison figure:** `{comp_path}`',
        '',
        '**Cache location:** `preprocessed_cache_v3/<stem>_v3.npy` ',
        '**Cache key:** image filename stem (not row index)',
        '',
        '---',
        '',
        '## 4. Normalization Statistics',
        '',
        '**Method:** One pass over training split pixels (post-preprocessing).',
        'No validation or test images used.',
        '',
        f'| Channel | Mean | Std |',
        f'|---------|------|-----|',
        f'| R (red) | {norm_stats["mean_rgb"][0]:.4f} | {norm_stats["std_rgb"][0]:.4f} |',
        f'| G (green) | {norm_stats["mean_rgb"][1]:.4f} | {norm_stats["std_rgb"][1]:.4f} |',
        f'| B (blue) | {norm_stats["mean_rgb"][2]:.4f} | {norm_stats["std_rgb"][2]:.4f} |',
        '',
        f'**Images used:** {norm_stats["n_images"]:,} ',
        f'**Note:** {norm_stats["note"]} ',
        f'**Source:** `{norm_stats["source"]}`',
    ]

    if norm_stats['source'] == 'computed_training_split':
        lines += [
            '',
            'Expected pattern for fundus images: R > G > B (red-dominant)',
            'due to high hemoglobin absorption. Computed values should match',
            'expected ≈ [0.41, 0.27, 0.19] mean, [0.28, 0.19, 0.16] std.',
        ]

    lines += [
        '',
        '**Saved to:** `data/fundus_norm_stats.json`',
        '',
        '---',
        '',
        '## 5. Stratified Split (70 / 15 / 15)',
        '',
        '**Strategy:** `sklearn.model_selection.train_test_split` with',
        '`stratify=disease_label`, `random_state=42`.',
        '',
        '**Files:**',
        '- `data/train_split.csv` — 70% training',
        '- `data/calib_split.csv` — 15% calibration (temperature scaling)',
        '- `data/test_split.csv` — 15% held-out evaluation',
        '',
    ]
    lines.append(split_report)
    lines += [
        '',
        '---',
        '',
        '## 6. Additional Dataset Search',
        '',
    ]
    if dataset_findings:
        lines.append('### Found directories:')
        lines.append('')
        lines.append('| Path | Images | Files | Matched |')
        lines.append('|------|--------|-------|---------|')
        for path, info in dataset_findings.items():
            lines.append(
                f'| `{path}` | {info["img_count"]:,} | '
                f'{info["total_files"]:,} | {info["matched_target"]} |'
            )
    else:
        lines.append('No additional dataset directories found.')

    lines += [
        '',
        '### Availability summary',
        '',
        '| Dataset | Status | Location |',
        '|---------|--------|----------|',
        '| ODIR-5K (ODIR) | **AVAILABLE** | `odir/preprocessed_images/` (4,878 images in CSV) |',
        '| ODIR-5K raw | **AVAILABLE** | `odir/ODIR-5K/ODIR-5K/Training Images/` (7,000) + Testing (1,000) |',
        '| APTOS 2019 | **AVAILABLE** | `aptos/gaussian_filtered_images/` (3,662 images) |',
        '| ocular-disease-recognition-odir5k | Partial (1 image only) | `ocular-disease-recognition-odir5k/preprocessed_images/` |',
        '| REFUGE2 | **NOT FOUND** | Needs download |',
        '| iChallenge-AMD | **NOT FOUND** | Needs download |',
        '| RIM-ONE | **NOT FOUND** | Needs download |',
        '',
        '### AMD / Glaucoma specific images (beyond CSV)',
        '',
        f'- ODIR provides {len(df[df["disease_label"]==2]):,} Glaucoma and '
        f'{len(df[df["disease_label"]==4]):,} AMD images from '
        f'`odir/preprocessed_images/`.',
        '- ODIR raw training set (7,000 images) may contain additional',
        ' AMD/Glaucoma cases not yet extracted — check `odir/full_df.csv`.',
        '- For specialized Glaucoma detection: REFUGE2 (400 images,',
        ' Magrabia population) and RIM-ONE (159 images) are recommended.',
        '- For AMD: iChallenge-AMD (400 images) is the standard benchmark.',
        '',
        '---',
        '',
        '## 7. Action Items',
        '',
        '1. **Download missing datasets** to improve minority class coverage:',
        ' - REFUGE2: https://refuge.grand-challenge.org/',
        ' - RIM-ONE: http://medimrg.webs.ull.es/research/retinal-imaging/rim-one/',
        ' - iChallenge-AMD: https://amd.grand-challenge.org/',
        '2. **Fix paths in combined_dataset.csv**: update `aptos/train_images/` →',
        ' actual `gaussian_filtered_images/.../` paths.',
        '3. **Run full cache build** when training: `python retinasense_v3_preprocessing.py --cache-all`',
        '4. **Use computed normalization stats** from `data/fundus_norm_stats.json`',
        ' instead of ImageNet stats.',
        '5. **Address 21:1 class imbalance**: consider weighted sampling or',
        ' oversampling minority classes (AMD=265, Glaucoma=308).',
    ]

    report_path = os.path.join(DATA_DIR, 'data_engineer_report.md')
    # The report contains non-ASCII characters (—, ×, →, ≈, −); an explicit
    # encoding avoids UnicodeEncodeError on non-UTF-8 default locales.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
    print(f'[OK] Report saved: {report_path}')
|
|
|
|
# Run the full pipeline only when the file is executed as a script;
# importing the module must not trigger it.
if __name__ == '__main__':
    main()
|
|