tanishq74 commited on
Commit
c3e6a8f
·
verified ·
1 Parent(s): 5b13cad

Add retinasense_v3_preprocessing.py

Browse files
Files changed (1) hide show
  1. retinasense_v3_preprocessing.py +1064 -0
retinasense_v3_preprocessing.py ADDED
@@ -0,0 +1,1064 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ RetinaSense v3 — Domain-Conditional Preprocessing Pipeline
4
+ ===========================================================
5
+ Implements source-aware preprocessing:
6
+ - APTOS -> Ben Graham enhancement (high contrast DR-specific pipeline)
7
+ - ODIR -> CLAHE only (preserves sharpness, normalizes contrast)
8
+ - REFUGE2 -> Resize only (images already clinical-grade high quality)
9
+
10
+ Image path resolution:
11
+ - ODIR: odir/preprocessed_images/<filename>
12
+ - APTOS: aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png
13
+ (looked up from aptos/train.csv; aptos/train_images/ does NOT exist)
14
+
15
+ Cache format: ./preprocessed_cache_v3/<stem>_v3.npy
16
+ """
17
+
18
+ import os
19
+ import sys
20
+ import json
21
+ import warnings
22
+ import numpy as np
23
+ import pandas as pd
24
+ import cv2
25
+ import matplotlib
26
+ matplotlib.use('Agg')
27
+ import matplotlib.pyplot as plt
28
+ from pathlib import Path
29
+ from tqdm import tqdm
30
+
31
+ warnings.filterwarnings('ignore')
32
+
33
+ # =========================================================
34
+ # PATHS
35
+ # =========================================================
36
+ BASE_DIR = '/teamspace/studios/this_studio'
37
+ CSV_PATH = os.path.join(BASE_DIR, 'data', 'combined_dataset.csv')
38
+ CACHE_DIR = os.path.join(BASE_DIR, 'preprocessed_cache_v3')
39
+ DATA_DIR = os.path.join(BASE_DIR, 'data')
40
+
41
+ ODIR_IMG_DIR = os.path.join(BASE_DIR, 'odir', 'preprocessed_images')
42
+ APTOS_CSV = os.path.join(BASE_DIR, 'aptos', 'train.csv')
43
+ APTOS_IMG_BASE = os.path.join(BASE_DIR, 'aptos',
44
+ 'gaussian_filtered_images',
45
+ 'gaussian_filtered_images')
46
+ APTOS_DIAG_MAP = {0: 'No_DR', 1: 'Mild', 2: 'Moderate',
47
+ 3: 'Severe', 4: 'Proliferate_DR'}
48
+
49
+ ODIR_SAMPLE = os.path.join(BASE_DIR, 'ocular-disease-recognition-odir5k',
50
+ 'preprocessed_images', '2977_left.jpg')
51
+
52
+ os.makedirs(CACHE_DIR, exist_ok=True)
53
+ os.makedirs(DATA_DIR, exist_ok=True)
54
+
55
+ TARGET_SIZE = 224
56
+
57
+ # =========================================================
58
+ # APTOS PATH LOOKUP TABLE
59
+ # Built once at module load; maps id_code (stem) -> abs path
60
+ # =========================================================
61
+
62
def _build_aptos_lookup() -> dict:
    """Build the APTOS ``id_code`` -> absolute image path table.

    Reads ``APTOS_CSV`` and, for each row, places the image under the class
    folder that ``APTOS_DIAG_MAP`` assigns to the row's ``diagnosis`` value
    (``'No_DR'`` when the grade is not in the map).  Paths are only
    constructed here; they are not checked for existence.

    Returns an empty dict when the CSV file is absent.
    """
    if not os.path.exists(APTOS_CSV):
        return {}

    table = {}
    for _, record in pd.read_csv(APTOS_CSV).iterrows():
        class_dir = APTOS_DIAG_MAP.get(int(record['diagnosis']), 'No_DR')
        image_id = str(record['id_code'])
        table[image_id] = os.path.join(APTOS_IMG_BASE, class_dir,
                                       image_id + '.png')
    return table
74
+
75
+
76
+ _APTOS_LOOKUP: dict = _build_aptos_lookup()
77
+
78
+
79
+ # =========================================================
80
+ # PATH RESOLVER
81
+ # =========================================================
82
+
83
def resolve_image_path(raw_path: str, dataset: str = None) -> str:
    """Turn an ``image_path`` entry from the CSV into a filesystem path.

    Steps, in order:

    1. Strip surrounding whitespace and a leading ``.//`` or ``./``.
    2. Join relative paths onto ``BASE_DIR``; if that path exists, return it.
    3. If ``dataset`` is ``'ODIR'`` (case-insensitive) or the path text
       contains ``odir``, return ``ODIR_IMG_DIR/<filename>``.
    4. If ``dataset`` is ``'APTOS'`` or the path text contains ``aptos``,
       return the ``_APTOS_LOOKUP`` entry for the filename stem when present.
    5. Otherwise probe ``ODIR_IMG_DIR`` and ``APTOS_IMG_BASE`` for the
       filename and return the first hit.
    6. As a last resort return the path from step 2 even though it does not
       exist, so callers can report it.
    """
    rel = raw_path.strip()
    for prefix in ('.//', './'):
        if rel.startswith(prefix):
            rel = rel[len(prefix):]
            break

    primary = rel if os.path.isabs(rel) else os.path.join(BASE_DIR, rel)
    if os.path.exists(primary):
        return primary

    file_name = os.path.basename(rel)
    source = (dataset or '').upper().strip()
    lowered = rel.lower()

    # ODIR entries are remapped unconditionally (no existence check).
    if source == 'ODIR' or 'odir' in lowered:
        return os.path.join(ODIR_IMG_DIR, file_name)

    # APTOS entries are remapped only when the stem is in the lookup table.
    if source == 'APTOS' or 'aptos' in lowered:
        key = os.path.splitext(file_name)[0]
        if key in _APTOS_LOOKUP:
            return _APTOS_LOOKUP[key]

    for folder in (ODIR_IMG_DIR, APTOS_IMG_BASE):
        alternative = os.path.join(folder, file_name)
        if os.path.exists(alternative):
            return alternative

    return primary  # best guess, may not exist
132
+
133
+
134
+ # =========================================================
135
+ # PREPROCESSING FUNCTIONS
136
+ # =========================================================
137
+
138
def _load_image(image_path: str):
    """Read ``image_path`` with OpenCV and return it in RGB channel order.

    Returns ``None`` when ``cv2.imread`` cannot produce an image.
    """
    bgr = cv2.imread(image_path)
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
144
+
145
+
146
def _crop_black_borders(img: np.ndarray, tol: int = 7) -> np.ndarray:
    """Crop ``img`` to the bounding box of pixels brighter than ``tol``.

    Brightness is measured on the grayscale conversion of the RGB input.
    When no pixel exceeds ``tol`` the image is returned unchanged.
    """
    bright = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
    row_idx = np.flatnonzero(bright.any(axis=1))
    col_idx = np.flatnonzero(bright.any(axis=0))
    if row_idx.size == 0 or col_idx.size == 0:
        return img

    top, bottom = row_idx[0], row_idx[-1]
    left, right = col_idx[0], col_idx[-1]
    return img[top:bottom + 1, left:right + 1]
157
+
158
+
159
def _apply_circular_mask(img: np.ndarray) -> np.ndarray:
    """Zero every pixel outside a centred disc.

    The disc is centred on the image midpoint with radius
    ``0.48 * min(height, width)`` (truncated to an integer).
    """
    height, width = img.shape[:2]
    radius = int(min(height, width) * 0.48)
    disc = np.zeros((height, width), dtype=np.uint8)
    cv2.circle(disc, (width // 2, height // 2), radius, 255, -1)
    return cv2.bitwise_and(img, img, mask=disc)
167
+
168
+
169
def ben_graham_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                          sigma: float = 10.0) -> np.ndarray:
    """Ben Graham style enhancement, the pipeline chosen for APTOS images.

    Pipeline:
      1. crop dark borders,
      2. resize to ``target_size`` x ``target_size`` (area interpolation),
      3. compute ``4*img - 4*GaussianBlur(img, sigma) + 128`` with
         ``cv2.addWeighted``, which subtracts the low-frequency component
         and re-centres intensities around 128,
      4. zero everything outside the circular field of view.

    Returns a uint8 array.
    """
    cropped = _crop_black_borders(img)
    resized = cv2.resize(cropped, (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    low_freq = cv2.GaussianBlur(resized, (0, 0), sigma)
    enhanced = cv2.addWeighted(resized, 4, low_freq, -4, 128)
    masked = _apply_circular_mask(enhanced)
    return np.clip(masked, 0, 255).astype(np.uint8)
189
+
190
+
191
def clahe_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                     clip_limit: float = 2.0,
                     tile_grid: tuple = (8, 8)) -> np.ndarray:
    """CLAHE contrast normalisation, the pipeline chosen for ODIR images.

    Pipeline:
      1. crop dark borders,
      2. resize to ``target_size`` x ``target_size`` (area interpolation),
      3. convert RGB -> LAB and equalise only the L channel with CLAHE
         (``clip_limit`` and ``tile_grid`` are passed straight to
         ``cv2.createCLAHE``), leaving the colour channels untouched,
      4. convert back to RGB and zero everything outside the circular
         field of view.

    Returns a uint8 array.
    """
    resized = cv2.resize(_crop_black_borders(img),
                         (target_size, target_size),
                         interpolation=cv2.INTER_AREA)
    lightness, chan_a, chan_b = cv2.split(
        cv2.cvtColor(resized, cv2.COLOR_RGB2LAB))

    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    lab_out = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])

    rgb_out = _apply_circular_mask(cv2.cvtColor(lab_out, cv2.COLOR_LAB2RGB))
    return np.clip(rgb_out, 0, 255).astype(np.uint8)
217
+
218
+
219
def resize_only_preprocess(img: np.ndarray,
                           target_size: int = TARGET_SIZE) -> np.ndarray:
    """Minimal pipeline used for REFUGE2: resize and nothing else.

    No border crop, enhancement or circular mask is applied; the image is
    only resampled to ``target_size`` x ``target_size`` with area
    interpolation and returned as uint8.
    (Per the original author's note, REFUGE2 images are treated as already
    being of consistent clinical quality.)
    """
    dims = (target_size, target_size)
    scaled = cv2.resize(img, dims, interpolation=cv2.INTER_AREA)
    return np.clip(scaled, 0, 255).astype(np.uint8)
231
+
232
+
233
def preprocess_image(image_path: str, source: str,
                     target_size: int = TARGET_SIZE):
    """
    Load an image and run the pipeline that matches its dataset of origin.

    Parameters
    ----------
    image_path : str
        Path of the fundus image file to read.
    source : str
        Dataset name; compared after upper-casing and stripping.
        'APTOS' -> Ben Graham, 'ODIR' -> CLAHE, 'REFUGE2' -> resize only.
        Any other value prints a warning and uses the CLAHE pipeline.
    target_size : int
        Side length of the square output.

    Returns
    -------
    The array produced by the selected pipeline, or None when the image
    cannot be loaded.
    """
    rgb = _load_image(image_path)
    if rgb is None:
        return None

    pipelines = {
        'APTOS': ben_graham_preprocess,
        'ODIR': clahe_preprocess,
        'REFUGE2': resize_only_preprocess,
    }
    handler = pipelines.get(source.upper().strip())
    if handler is None:
        # Safe fallback for unknown sources
        print(f'[WARN] Unknown source "{source}", applying CLAHE fallback.')
        handler = clahe_preprocess
    return handler(rgb, target_size=target_size)
267
+
268
+
269
+ # =========================================================
270
+ # CACHE HELPERS
271
+ # =========================================================
272
+
273
def cache_path_for(raw_csv_path: str) -> str:
    """Map a CSV ``image_path`` entry to its ``<stem>_v3.npy`` file in CACHE_DIR.

    Only the filename stem is used, so entries that share a stem share a
    cache file.
    """
    file_name = Path(raw_csv_path).stem + '_v3.npy'
    return os.path.join(CACHE_DIR, file_name)
277
+
278
+
279
def is_cached(raw_csv_path: str) -> bool:
    """Return True when the cache file for this CSV entry exists on disk."""
    target = cache_path_for(raw_csv_path)
    return os.path.exists(target)
281
+
282
+
283
def save_to_cache(raw_csv_path: str, arr: np.ndarray) -> None:
    """Write ``arr`` to the cache file for ``raw_csv_path`` atomically.

    The array is first written to a temporary sibling file and then moved
    into place with ``os.replace``.  Writing directly to the final path
    would leave a truncated ``.npy`` behind if the process were interrupted
    mid-write; ``is_cached`` only checks for existence, so such a file would
    be treated as valid and later fail to load.
    """
    final_path = cache_path_for(raw_csv_path)
    # PID in the name keeps concurrent writers from sharing a temp file.
    tmp_path = f'{final_path}.{os.getpid()}.tmp'
    try:
        # Pass a file object: np.save would append '.npy' to a bare name.
        with open(tmp_path, 'wb') as fh:
            np.save(fh, arr)
        os.replace(tmp_path, final_path)
    finally:
        # Still present only if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
285
+
286
+
287
def load_from_cache(raw_csv_path: str):
    """Return the cached array for this CSV entry, or None if not cached."""
    cached_file = cache_path_for(raw_csv_path)
    if not os.path.exists(cached_file):
        return None
    return np.load(cached_file)
290
+
291
+
292
def cache_dataset(df: pd.DataFrame) -> dict:
    """
    Run the domain-conditional pipeline over every row of ``df`` and store
    each result in the cache.

    ``df`` must provide ``image_path`` and ``dataset`` columns.  Each row
    ends up in exactly one counter of the returned dict:

    - ``already_cached``  : a cache file already existed, nothing done
    - ``skipped_missing`` : the resolved path does not exist
    - ``errors``          : the file exists but could not be preprocessed
    - ``processed``       : preprocessed and written to the cache

    ``total`` holds ``len(df)``.
    """
    counters = {'processed': 0, 'skipped_missing': 0, 'already_cached': 0,
                'errors': 0, 'total': len(df)}

    def _handle(raw, src) -> str:
        """Process one row and return the name of the counter to bump."""
        if is_cached(raw):
            return 'already_cached'
        located = resolve_image_path(raw, src)
        if not os.path.exists(located):
            return 'skipped_missing'
        result = preprocess_image(located, src)
        if result is None:
            return 'errors'
        save_to_cache(raw, result)
        return 'processed'

    for _, record in tqdm(df.iterrows(), total=len(df), desc='Caching v3'):
        outcome = _handle(record['image_path'], record['dataset'])
        counters[outcome] += 1

    return counters
322
+
323
+
324
+ # =========================================================
325
+ # PREPROCESSING COMPARISON VISUALIZATION
326
+ # =========================================================
327
+
328
def make_preprocessing_comparison(
        save_path: str = None,
        odir_raw_path: str = None,
        aptos_raw_path: str = None) -> str:
    """
    Render a 2x3 comparison figure (original / processed / amplified
    difference) for one ODIR image (CLAHE) and one APTOS image (Ben Graham)
    and save it as a PNG.

    Parameters
    ----------
    save_path : str, optional
        Output PNG path; defaults to
        ``DATA_DIR/preprocessing_comparison_v3.png``.
    odir_raw_path, aptos_raw_path : str, optional
        CSV-style path entries for the samples to show.  When a sample is
        not given or cannot be found, a fallback is tried (``ODIR_SAMPLE``
        and then any image in ``ODIR_IMG_DIR`` for ODIR; the first
        ``_APTOS_LOOKUP`` entry for APTOS).  If that also fails, a synthetic
        fundus-like image is generated.

    Returns
    -------
    str
        The path the PNG was written to.
    """
    if save_path is None:
        save_path = os.path.join(DATA_DIR, 'preprocessing_comparison_v3.png')

    # --- Pick sample ODIR image ---
    odir_path = None
    if odir_raw_path:
        odir_path = resolve_image_path(odir_raw_path, 'ODIR')
    if odir_path is None or not os.path.exists(odir_path):
        # Use the one available ODIR sample in odir5k folder
        odir_path = ODIR_SAMPLE
        if not os.path.exists(odir_path):
            # Fall back to any image in odir/preprocessed_images.  The
            # directory itself may be absent; in that case leave the path
            # unset so the synthetic image is used instead of crashing.
            odir_path = None
            if os.path.isdir(ODIR_IMG_DIR):
                # Sorted so the chosen sample is reproducible across runs.
                names = sorted(
                    f for f in os.listdir(ODIR_IMG_DIR)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png')))
                if names:
                    odir_path = os.path.join(ODIR_IMG_DIR, names[0])

    # --- Pick sample APTOS image ---
    aptos_path = None
    if aptos_raw_path:
        aptos_path = resolve_image_path(aptos_raw_path, 'APTOS')
    if aptos_path is None or not os.path.exists(aptos_path):
        # Use first entry in APTOS lookup
        if _APTOS_LOOKUP:
            aptos_path = next(iter(_APTOS_LOOKUP.values()))

    # --- Load images ---
    def get_or_synthetic(path, name):
        """Return (rgb_image, label); label is the path or '(synthetic)'."""
        if path and os.path.exists(path):
            img = _load_image(path)
            if img is not None:
                return img, path
        print(f'[WARN] {name} sample not found, using synthetic.')
        h, w = 512, 512
        # Local generator: same sequence as seeding the global RNG with 42,
        # without disturbing random state used elsewhere in the process.
        rng = np.random.RandomState(42)
        base = np.zeros((h, w, 3), dtype=np.uint8)
        cx, cy = w // 2, h // 2
        r = int(min(h, w) * 0.48)
        cv2.circle(base, (cx, cy), r, (60, 40, 25), -1)
        for _ in range(30):
            pt1 = (int(cx + rng.randint(-r, r)), int(cy + rng.randint(-r, r)))
            pt2 = (int(cx + rng.randint(-r, r)), int(cy + rng.randint(-r, r)))
            cv2.line(base, pt1, pt2, (100, 60, 35), 1)
        base = base.astype(np.float32) + rng.normal(0, 6, base.shape)
        return np.clip(base, 0, 255).astype(np.uint8), '(synthetic)'

    def amplified_diff(before, after, gain=3):
        """|before - after| scaled by ``gain`` and saturated to uint8."""
        # Widen before multiplying: uint8 * 3 would wrap modulo 256 and the
        # subsequent clip could not recover the lost values.
        delta = cv2.absdiff(before, after).astype(np.uint16) * gain
        return np.clip(delta, 0, 255).astype(np.uint8)

    odir_orig, odir_src = get_or_synthetic(odir_path, 'ODIR')
    aptos_orig, aptos_src = get_or_synthetic(aptos_path, 'APTOS')

    # Resize originals for display
    odir_disp = cv2.resize(odir_orig, (TARGET_SIZE, TARGET_SIZE),
                           interpolation=cv2.INTER_AREA)
    aptos_disp = cv2.resize(aptos_orig, (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)

    # Apply pipelines
    odir_clahe = clahe_preprocess(odir_orig.copy())
    aptos_graham = ben_graham_preprocess(aptos_orig.copy())

    # Difference images, amplified x3 for visibility
    diff_odir = amplified_diff(odir_disp, odir_clahe)
    diff_aptos = amplified_diff(aptos_disp, aptos_graham)

    # --- Build figure ---
    fig, axes = plt.subplots(2, 3, figsize=(16, 11))
    fig.patch.set_facecolor('#1a1a2e')
    fig.suptitle(
        'RetinaSense v3 — Domain-Conditional Preprocessing\n'
        'ODIR: CLAHE Pipeline | APTOS: Ben Graham Pipeline',
        fontsize=13, fontweight='bold', color='white', y=1.01
    )

    panels = [
        # row, col, image, title, bg_color
        (0, 0, odir_disp,
         f'ODIR: Original\n({os.path.basename(str(odir_src))})',
         '#1565C0'),
        (0, 1, odir_clahe,
         'ODIR: After CLAHE\n(L-channel equalization, circular mask)',
         '#0D47A1'),
        (0, 2, diff_odir,
         'ODIR: Difference x3\n(|original - CLAHE|, amplified)',
         '#263238'),
        (1, 0, aptos_disp,
         f'APTOS: Original\n({os.path.basename(str(aptos_src))})',
         '#BF360C'),
        (1, 1, aptos_graham,
         'APTOS: After Ben Graham\n(4*img - 4*blur(σ=10) + 128)',
         '#870000'),
        (1, 2, diff_aptos,
         'APTOS: Difference x3\n(|original - Ben Graham|, amplified)',
         '#1B5E20'),
    ]

    for r, c, img_arr, title, fc in panels:
        ax = axes[r, c]
        ax.imshow(img_arr)
        ax.set_title(title, fontsize=9, color='white', pad=5,
                     bbox=dict(boxstyle='round,pad=0.3', facecolor=fc,
                               alpha=0.85, edgecolor='none'))
        ax.axis('off')
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Annotation boxes
    odir_note = (
        'ODIR Pipeline\n'
        '━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. Convert RGB→LAB\n'
        '4. CLAHE on L channel\n'
        ' clip=2.0, tile=8×8\n'
        '5. LAB→RGB\n'
        '6. Circular mask (r=0.48)'
    )
    aptos_note = (
        'APTOS Pipeline (Ben Graham)\n'
        '━━━━━━━━━━━━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. blur = GaussianBlur(σ=10)\n'
        '4. out = 4×img − 4×blur + 128\n'
        '5. Circular mask (r=0.48)\n'
        '6. clip to [0, 255]'
    )

    fig.text(0.02, 0.92, odir_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#1565C0', alpha=0.6))
    fig.text(0.02, 0.48, aptos_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#870000', alpha=0.6))

    # Leave the left 18% of the canvas free for the annotation boxes.
    fig.tight_layout(rect=[0.18, 0, 1, 1])
    fig.savefig(save_path, dpi=150, bbox_inches='tight',
                facecolor='#1a1a2e', edgecolor='none')
    # Close this figure explicitly rather than "whichever is current".
    plt.close(fig)
    print(f'[OK] Comparison saved: {save_path}')
    return save_path
476
+
477
+
478
+ # =========================================================
479
+ # NORMALIZATION STATISTICS
480
+ # =========================================================
481
+
482
def compute_norm_stats(train_df: pd.DataFrame,
                       out_path: str = None,
                       max_images: int = None) -> dict:
    """
    Compute per-channel mean and std over all pixels of the given images
    after domain-conditional preprocessing, and write them to a JSON file.

    Pass only the training split so the statistics are not contaminated by
    calibration/test data.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must provide ``image_path`` and ``dataset`` columns.
    out_path : str, optional
        JSON output path; defaults to ``DATA_DIR/fundus_norm_stats.json``.
    max_images : int, optional
        When given, a random sample of at most this many rows
        (``random_state=42``) is used instead of the full frame.

    Returns
    -------
    dict
        Keys: mean_rgb, std_rgb, n_images, n_pixels_per_channel, n_missing,
        note, source.  Pixel values are scaled to [0, 1] before
        accumulation.  When no image could be read, the ImageNet mean/std
        are stored as a fallback.
    """
    if out_path is None:
        out_path = os.path.join(DATA_DIR, 'fundus_norm_stats.json')

    df = train_df.copy()
    if max_images is not None:
        df = df.sample(min(max_images, len(df)), random_state=42)

    # Running sums in float64 so the one-pass variance stays accurate.
    ch_sum = np.zeros(3, dtype=np.float64)
    ch_sq_sum = np.zeros(3, dtype=np.float64)
    n_pixels = 0
    n_images = 0
    n_missing = 0  # counts both absent files and files that failed to load

    for _, row in tqdm(df.iterrows(), total=len(df), desc='Norm stats'):
        raw = row['image_path']
        src = row['dataset']

        # Try cache first for speed
        arr = load_from_cache(raw)
        if arr is None:
            abs_path = resolve_image_path(raw, src)
            if not os.path.exists(abs_path):
                n_missing += 1
                continue
            arr = preprocess_image(abs_path, src)
            if arr is None:
                n_missing += 1
                continue

        pixels = (arr.astype(np.float64) / 255.0).reshape(-1, 3)
        ch_sum += pixels.sum(axis=0)
        ch_sq_sum += (pixels ** 2).sum(axis=0)
        n_pixels += pixels.shape[0]
        n_images += 1

    if n_images == 0:
        print('[WARN] No images found — storing ImageNet defaults as fallback.')
        stats = {
            'mean_rgb': [0.485, 0.456, 0.406],
            'std_rgb': [0.229, 0.224, 0.225],
            'n_images': 0,
            'n_pixels_per_channel': 0,
            'n_missing': n_missing,
            'note': 'No images found — ImageNet defaults used as fallback',
            'source': 'imagenet_fallback'
        }
    else:
        mean = ch_sum / n_pixels
        # Var = E[x^2] - E[x]^2; clamp tiny negatives from rounding.
        var = ch_sq_sum / n_pixels - mean ** 2
        std = np.sqrt(np.maximum(var, 0.0))
        stats = {
            'mean_rgb': [round(float(v), 6) for v in mean],
            'std_rgb': [round(float(v), 6) for v in std],
            'n_images': n_images,
            'n_pixels_per_channel': int(n_pixels),
            'n_missing': n_missing,
            'note': ('Computed on training split only after domain-conditional '
                     'preprocessing. Red-dominant channel expected (fundus tissue).'),
            'source': 'computed_training_split'
        }
        # Convert to plain floats before printing: NumPy >= 2 renders
        # np.float64 list items as "np.float64(...)".
        print(f' mean RGB : {[round(float(v), 4) for v in mean]}')
        print(f' std RGB : {[round(float(v), 4) for v in std]}')
        print(f' images : {n_images:,} | missing: {n_missing}')

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f'[OK] Stats saved: {out_path}')
    return stats
561
+
562
+
563
+ # =========================================================
564
+ # 3-WAY STRATIFIED SPLIT
565
+ # =========================================================
566
+
567
def create_stratified_split(df: pd.DataFrame,
                            train_ratio: float = 0.70,
                            calib_ratio: float = 0.15,
                            test_ratio: float = 0.15,
                            random_state: int = 42) -> tuple:
    """
    Split ``df`` into train / calibration / test parts, stratified on the
    ``disease_label`` column.

    The split is done in two stages: first train vs. the rest, then the
    rest into calibration vs. test.  Both stages use the same
    ``random_state``.  The three ratios must sum to 1 (checked by assert).

    Returns (train_df, calib_df, test_df), each with a fresh 0..n-1 index.
    """
    from sklearn.model_selection import train_test_split as _tts
    assert abs(train_ratio + calib_ratio + test_ratio - 1.0) < 1e-9

    # Stage 1: carve the held-out share (calib + test) off the training data.
    train_part, holdout = _tts(
        df,
        test_size=(calib_ratio + test_ratio),
        stratify=df['disease_label'],
        random_state=random_state,
    )

    # Stage 2: divide the held-out rows between calibration and test.
    calib_share = calib_ratio / (calib_ratio + test_ratio)
    calib_part, test_part = _tts(
        holdout,
        test_size=(1.0 - calib_share),
        stratify=holdout['disease_label'],
        random_state=random_state,
    )

    return tuple(part.reset_index(drop=True)
                 for part in (train_part, calib_part, test_part))
591
+
592
+
593
def save_splits(train_df, calib_df, test_df, out_dir: str = DATA_DIR):
    """Write the three splits to ``out_dir`` as CSV files (no index column).

    Files written: ``train_split.csv``, ``calib_split.csv`` and
    ``test_split.csv``.  ``out_dir`` is created if it does not exist, so a
    caller-supplied directory no longer has to be prepared beforehand.
    """
    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        ('train_split.csv', train_df),
        ('calib_split.csv', calib_df),
        ('test_split.csv', test_df),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(out_dir, file_name), index=False)
    print(f'[OK] Split CSVs saved to {out_dir}/')
598
+
599
+
600
def print_split_stats(train_df, calib_df, test_df,
                      class_names: dict = None) -> str:
    """Print and return a per-class count table for the three splits.

    Parameters
    ----------
    train_df, calib_df, test_df : pd.DataFrame
        Each must provide a ``disease_label`` column.
    class_names : dict, optional
        Maps label value -> display name; only labels listed here get a
        row.  Defaults to the five RetinaSense classes.

    Returns
    -------
    str
        The report text (also printed).  The per-split TOTAL columns sum
        only the listed classes, while the grand total and the ratios use
        the full frame lengths.  Ratios are reported as 0.0% when all
        splits are empty instead of raising ZeroDivisionError.
    """
    if class_names is None:
        class_names = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                       3: 'Cataract', 4: 'AMD'}

    splits = (train_df, calib_df, test_df)
    total_n = sum(len(part) for part in splits)

    def _share(part) -> float:
        """Fraction of all rows held by ``part`` (0.0 when there are none)."""
        return len(part) / total_n if total_n else 0.0

    lines = [
        '',
        '=' * 62,
        ' STRATIFIED SPLIT — CLASS DISTRIBUTION',
        '=' * 62,
        f"{'Class':<16} {'Train':>8} {'Calib':>8} {'Test':>8} {'Total':>8}",
        '-' * 54,
    ]

    tr_tot = ca_tot = te_tot = 0
    for lbl in sorted(class_names):
        tr, ca, te = (int((part['disease_label'] == lbl).sum())
                      for part in splits)
        tr_tot += tr
        ca_tot += ca
        te_tot += te
        lines.append(
            f"{class_names[lbl]:<16} {tr:>8,} {ca:>8,} {te:>8,} {tr + ca + te:>8,}"
        )

    lines += [
        '-' * 54,
        f"{'TOTAL':<16} {tr_tot:>8,} {ca_tot:>8,} {te_tot:>8,} {total_n:>8,}",
        '',
        f'Split sizes : train={len(train_df):,} calib={len(calib_df):,} '
        f'test={len(test_df):,}',
        f'Actual ratios: train={_share(train_df):.1%} '
        f'calib={_share(calib_df):.1%} '
        f'test={_share(test_df):.1%}',
    ]
    report = '\n'.join(lines)
    print(report)
    return report
638
+
639
+
640
+ # =========================================================
641
+ # ADDITIONAL DATASET SEARCH
642
+ # =========================================================
643
+
644
def search_additional_datasets() -> dict:
    """
    Walk the known storage roots looking for directories whose name
    contains a dataset keyword (REFUGE2, iChallenge, RIM-ONE, AMD,
    glaucoma, ODIR ...).

    Returns a dict keyed by directory path.  Each value holds:

    - ``matched_target`` : the first keyword found in the directory name,
      or ``'known_dataset'`` for the always-listed special directories
    - ``img_count``      : image files directly inside the directory
      (whole subtree for the special directories)
    - ``total_files``    : all files, counted the same way

    Hidden directories and the output/cache directories in the skip list
    are not descended into.
    """
    image_exts = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp'}
    keywords = ['refuge2', 'refuge', 'ichallenge', 'rim-one', 'rimone',
                'amd', 'glaucoma', 'odir5k', 'odir']
    search_roots = ['/teamspace/studios/this_studio', '/teamspace/uploads']
    skip_dirs = {'.git', '.cache', '.claude', '.ipython', '.npm',
                 '__pycache__', 'outputs_analysis', 'outputs_ensemble',
                 'outputs_optimized', 'outputs_production', 'outputs_v2',
                 'outputs_v2_extended', 'outputs_vit'}

    def _is_image(file_name) -> bool:
        return os.path.splitext(file_name)[1].lower() in image_exts

    results = {}

    for top in search_roots:
        if not os.path.exists(top):
            continue
        for current, subdirs, file_names in os.walk(top):
            # Prune in place so os.walk does not descend into these.
            subdirs[:] = [d for d in subdirs
                          if d not in skip_dirs and not d.startswith('.')]
            leaf = os.path.basename(current).lower()
            hit = next((kw for kw in keywords if kw in leaf), None)
            if hit is None:
                continue
            n_images = sum(1 for name in file_names if _is_image(name))
            previous = results.get(current)
            if previous is None or n_images > previous['img_count']:
                results[current] = {
                    'matched_target': hit,
                    'img_count': n_images,
                    'total_files': len(file_names)
                }

    # Always include the known special dirs
    special_dirs = [
        '/teamspace/studios/this_studio/ocular-disease-recognition-odir5k',
        '/teamspace/studios/this_studio/odir',
        '/teamspace/studios/this_studio/aptos',
    ]
    for special in special_dirs:
        if not os.path.exists(special) or special in results:
            continue
        n_images = 0
        n_files = 0
        for _, _, file_names in os.walk(special):
            n_files += len(file_names)
            n_images += sum(1 for name in file_names if _is_image(name))
        results[special] = {
            'matched_target': 'known_dataset',
            'img_count': n_images,
            'total_files': n_files
        }

    return results
700
+
701
+
702
+ # =========================================================
703
+ # MAIN
704
+ # =========================================================
705
+
706
def main():
    """Run the full v3 data pipeline and print a report for each task.

    Tasks, in order:
      1. audit ``CSV_PATH`` (class distribution, per-dataset breakdown,
         severity labels, image existence),
      2. render the preprocessing comparison figure and smoke-test both
         pipelines on up to five images per dataset,
      3. create and save the 70/15/15 stratified split,
      4. compute normalisation statistics on the training split,
      5. scan the filesystem for additional datasets,
    then hand everything to ``_write_report``.

    Returns (df, train_df, calib_df, test_df, norm_stats).
    """
    print('=' * 65)
    print(' RetinaSense v3 — Data Pipeline')
    print('=' * 65)

    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    # -------------------------------------------------------
    # TASK 1: Dataset Audit
    # -------------------------------------------------------
    print('\n[TASK 1] Dataset Audit')
    print('-' * 50)
    df = pd.read_csv(CSV_PATH)
    print(f' CSV : {CSV_PATH}')
    print(f' Total rows : {len(df):,}')
    print(f' Columns : {df.columns.tolist()}')
    print()

    print(' --- Overall class distribution ---')
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        bar = '#' * int(pct / 2)  # one '#' per 2 percentage points
        print(f" {lbl} {CLASS_NAMES.get(lbl,'?'):<12} : {cnt:>5} ({pct:5.1f}%) {bar}")

    max_cls = class_counts.max()
    min_cls = class_counts.min()
    print(f'\n Imbalance ratio (max/min): {max_cls/min_cls:.1f}:1')
    print()

    print(' --- Per-dataset breakdown ---')
    per_ds = (df.groupby(['dataset', 'disease_label'])
              .size().reset_index(name='count'))
    print(per_ds.to_string(index=False))
    print()

    print(' --- Severity label distribution (APTOS only) ---')
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        # -1 is the placeholder used for rows without a severity grade.
        label = 'N/A (ODIR)' if sev == -1 else f'Grade {sev}'
        print(f" {sev:>3} ({label:<14}): {cnt:>5}")
    print()

    print(' --- Image path existence check ---')
    n_found = sum(
        1 for _, row in df.iterrows()
        if os.path.exists(resolve_image_path(row['image_path'],
                                             row['dataset']))
    )
    n_missing = len(df) - n_found
    print(f' Total checked : {len(df):,}')
    print(f' Found on disk : {n_found:,}')
    print(f' Missing : {n_missing:,}')
    print()

    # -------------------------------------------------------
    # TASK 2: Preprocessing Comparison
    # -------------------------------------------------------
    print('[TASK 2] Domain-Conditional Preprocessing Comparison')
    print('-' * 50)

    # Get representative samples from each dataset
    odir_rows = df[df['dataset'] == 'ODIR']
    aptos_rows = df[df['dataset'] == 'APTOS']
    odir_sample = odir_rows['image_path'].iloc[0] if len(odir_rows) > 0 else None
    aptos_sample = aptos_rows['image_path'].iloc[0] if len(aptos_rows) > 0 else None

    comp_path = make_preprocessing_comparison(
        odir_raw_path=odir_sample,
        aptos_raw_path=aptos_sample
    )

    def _count_ok(rows, source) -> int:
        """Count rows (first five) whose image preprocesses to the expected shape."""
        ok = 0
        for _, row in rows.head(5).iterrows():
            p = resolve_image_path(row['image_path'], source)
            if not os.path.exists(p):
                continue
            arr = preprocess_image(p, source)
            if arr is not None and arr.shape == (TARGET_SIZE, TARGET_SIZE, 3):
                ok += 1
        return ok

    # Demo: process a few images to verify pipeline
    print('\n --- Pipeline verification (5 ODIR + 5 APTOS) ---')
    ok_odir = _count_ok(odir_rows, 'ODIR')
    ok_aptos = _count_ok(aptos_rows, 'APTOS')
    print(f' ODIR (CLAHE) : {ok_odir}/5 OK')
    print(f' APTOS (Ben Graham) : {ok_aptos}/5 OK')
    print()

    # -------------------------------------------------------
    # TASK 3: Stratified Split
    # -------------------------------------------------------
    print('[TASK 3] 3-Way Stratified Split (70 / 15 / 15)')
    print('-' * 50)
    train_df, calib_df, test_df = create_stratified_split(df)
    save_splits(train_df, calib_df, test_df)
    split_report = print_split_stats(train_df, calib_df, test_df, CLASS_NAMES)
    print()

    # -------------------------------------------------------
    # TASK 4: Normalization Statistics (training split only)
    # -------------------------------------------------------
    print('[TASK 4] Fundus Normalization Statistics (training split)')
    print('-' * 50)
    norm_stats = compute_norm_stats(train_df)
    print()

    # -------------------------------------------------------
    # TASK 5: Additional Dataset Search
    # -------------------------------------------------------
    print('[TASK 5] Additional Dataset Search')
    print('-' * 50)
    findings = search_additional_datasets()
    if findings:
        print(f' Found {len(findings)} dataset directories:')
        for path, info in findings.items():
            print(f' {path}')
            print(f' images: {info["img_count"]:,} '
                  f'files: {info["total_files"]:,} '
                  f'matched: "{info["matched_target"]}"')
    else:
        print(' No additional datasets found.')
    print()

    # Summary of what needs downloading.  The search reports lower-case
    # keywords ('ichallenge', 'rimone', ...), which never equal the display
    # names when merely upper-cased, so map each dataset to the keywords
    # that indicate its presence.
    known_sets = {
        'REFUGE2': ('refuge2',),
        'ICHALLENGE-AMD': ('ichallenge',),
        'RIM-ONE': ('rim-one', 'rimone'),
    }
    found_targets = {info['matched_target'] for info in findings.values()}
    missing_sets = {name for name, aliases in known_sets.items()
                    if not found_targets.intersection(aliases)}
    if missing_sets:
        print(f' Datasets NOT found (need downloading): {missing_sets}')

    # -------------------------------------------------------
    # Write report
    # -------------------------------------------------------
    _write_report(df, train_df, calib_df, test_df, norm_stats,
                  findings, split_report, comp_path)

    print('\n' + '=' * 65)
    print(' All tasks complete.')
    print('=' * 65)
    return df, train_df, calib_df, test_df, norm_stats
849
+
850
+
851
+ # =========================================================
852
+ # REPORT WRITER
853
+ # =========================================================
854
+
855
def _write_report(df, train_df, calib_df, test_df, norm_stats,
                  dataset_findings, split_report, comp_path):
    """Write the Markdown summary ``data_engineer_report.md`` into ``DATA_DIR``.

    Args:
        df: Full dataset table. The columns read here are ``image_path``,
            ``dataset``, ``disease_label`` and ``severity_label``.
        train_df, calib_df, test_df: The three splits. They are accepted for
            interface compatibility but are not read by this function; the
            per-split numbers reach the report through ``split_report``.
        norm_stats: Mapping with the keys ``mean_rgb``, ``std_rgb``,
            ``n_images``, ``note`` and ``source``.
        dataset_findings: Mapping of directory path -> info dict with the keys
            ``img_count``, ``total_files`` and ``matched_target``. May be
            empty or ``None``.
        split_report: Pre-rendered text describing the splits; inserted
            verbatim. ``None`` is treated as an empty string.
        comp_path: Path of the preprocessing comparison figure, quoted in the
            report.

    Returns:
        None. The report is written to disk and its path is printed.
    """
    # Local import so the block does not depend on the file's import header.
    from datetime import date

    class_names = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    def _class_name(label):
        # Unknown labels are reported instead of aborting the whole report.
        return class_names.get(label, f'Unknown({label})')

    total = len(df)
    n_found = sum(
        os.path.exists(resolve_image_path(img_path, dataset))
        for img_path, dataset in zip(df['image_path'], df['dataset'])
    )

    # Derive every imbalance figure from the data so the text cannot drift
    # away from the tables printed in the same report.
    class_counts = df['disease_label'].value_counts()
    if class_counts.empty:
        imbalance_pair = ratio_long = ratio_short = minority_text = 'n/a'
    else:
        ratio = class_counts.max() / class_counts.min()
        imbalance_pair = (f'{_class_name(class_counts.idxmax())}:'
                          f'{_class_name(class_counts.idxmin())}')
        ratio_long = f'{ratio:.1f}:1'
        ratio_short = f'{ratio:.0f}:1'
        minority_text = ', '.join(
            f'{_class_name(lbl)}={cnt:,}'
            for lbl, cnt in class_counts.nsmallest(2).items()
        )

    # Optional external datasets, keyed the same way main() compares them:
    # upper-cased ``matched_target``.
    # NOTE(review): assumes the search targets are spelled like these names.
    optional_sets = (
        ('REFUGE2', 'https://refuge.grand-challenge.org/'),
        ('iChallenge-AMD', 'https://amd.grand-challenge.org/'),
        ('RIM-ONE',
         'http://medimrg.webs.ull.es/research/retinal-imaging/rim-one/'),
    )
    found_by_target = {
        str(info['matched_target']).upper(): path
        for path, info in (dataset_findings or {}).items()
    }
    missing_sets = [(name, url) for name, url in optional_sets
                    if name.upper() not in found_by_target]

    # Lines ending in two spaces are Markdown hard line breaks.
    lines = [
        '# RetinaSense v3 — Data Engineer Report',
        f'Generated: {date.today().isoformat()}',
        '',
        '---',
        '',
        '## 1. Dataset Statistics',
        '',
        '**Source CSV:** `data/combined_dataset.csv`  ',
        f'**Total images in CSV:** {total:,}  ',
        f'**Images found on disk:** {n_found:,} / {total:,}  ',
        '',
        '### Source breakdown',
        '',
        '| Dataset | Count | Labels present |',
        '|---------|-------|----------------|',
    ]
    for ds, grp in df.groupby('dataset'):
        label_str = ', '.join(
            f'{lbl}={_class_name(lbl)}'
            for lbl in sorted(grp['disease_label'].unique())
        )
        lines.append(f'| {ds} | {len(grp):,} | {label_str} |')

    lines += [
        '',
        '### Class distribution (full dataset)',
        '',
        '| Label | Class | Count | % |',
        '|-------|-------|-------|---|',
    ]
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / total * 100
        lines.append(
            f'| {lbl} | {_class_name(lbl)} | {cnt:,} | {pct:.1f}% |'
        )

    lines += [
        '',
        f'**Imbalance ratio ({imbalance_pair}):** {ratio_long}',
        '',
        '### Severity label distribution (APTOS DR grades, -1 = ODIR no grade)',
        '',
        '| Severity | Meaning | Count |',
        '|----------|---------|-------|',
    ]
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        meaning = 'N/A (ODIR, no grade)' if sev == -1 else f'DR Grade {sev}'
        lines.append(f'| {sev} | {meaning} | {cnt:,} |')

    lines += [
        '',
        '---',
        '',
        '## 2. Image Path Resolution',
        '',
        '| Dataset | CSV path format | Actual location |',
        '|---------|-----------------|-----------------|',
        '| ODIR | `.//odir/preprocessed_images/<name>.jpg` | `odir/preprocessed_images/<name>.jpg` |',
        '| APTOS | `.//aptos/train_images/<id>.png` (train_images does NOT exist) | `aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png` |',
        '',
        '`train_images/` directory is absent; actual APTOS images are stored under',
        '`gaussian_filtered_images/gaussian_filtered_images/<DR_grade>/`. The',
        '`aptos/train.csv` maps `id_code` → `diagnosis` (0-4) enabling lookup.',
        '',
        '---',
        '',
        '## 3. Preprocessing: Domain-Conditional Pipeline',
        '',
        '**Problem:** Previous versions applied Ben Graham enhancement uniformly to',
        'ALL images. This is incorrect: ODIR images have already-enhanced or',
        'clinical-quality appearance; applying Ben Graham degrades them.',
        '',
        '**Fix:** Source-conditional dispatch in `preprocess_image(path, source)`.',
        '',
        '| Source | Method | Rationale |',
        '|--------|--------|-----------|',
        '| APTOS | Ben Graham (4×img − 4×blur(σ=10) + 128 + circular mask) | Field camera images have vignetting and low local contrast. Ben Graham removes low-frequency illumination and amplifies vessel/lesion detail. |',
        '| ODIR | CLAHE (L-channel, clip=2.0, tile=8×8, circular mask) | Multi-source clinical images. CLAHE normalizes local contrast while preserving sharpness and avoiding Ben Graham over-processing. |',
        '| REFUGE2 | Resize only (224×224) | Zeiss Visucam 500 — already standardized high-quality. |',
        '',
        f'**Comparison figure:** `{comp_path}`',
        '',
        '**Cache location:** `preprocessed_cache_v3/<stem>_v3.npy`  ',
        '**Cache key:** image filename stem (not row index)',
        '',
        '---',
        '',
        '## 4. Normalization Statistics',
        '',
        '**Method:** One pass over training split pixels (post-preprocessing).',
        'No validation or test images used.',
        '',
        '| Channel | Mean | Std |',
        '|---------|------|-----|',
        f'| R (red) | {norm_stats["mean_rgb"][0]:.4f} | {norm_stats["std_rgb"][0]:.4f} |',
        f'| G (green) | {norm_stats["mean_rgb"][1]:.4f} | {norm_stats["std_rgb"][1]:.4f} |',
        f'| B (blue) | {norm_stats["mean_rgb"][2]:.4f} | {norm_stats["std_rgb"][2]:.4f} |',
        '',
        f'**Images used:** {norm_stats["n_images"]:,}  ',
        f'**Note:** {norm_stats["note"]}  ',
        f'**Source:** `{norm_stats["source"]}`',
    ]

    if norm_stats['source'] == 'computed_training_split':
        lines += [
            '',
            'Expected pattern for fundus images: R > G > B (red-dominant)',
            'due to high hemoglobin absorption. Computed values should match',
            'expected ≈ [0.41, 0.27, 0.19] mean, [0.28, 0.19, 0.16] std.',
        ]

    lines += [
        '',
        '**Saved to:** `data/fundus_norm_stats.json`',
        '',
        '---',
        '',
        '## 5. Stratified Split (70 / 15 / 15)',
        '',
        '**Strategy:** `sklearn.model_selection.train_test_split` with',
        '`stratify=disease_label`, `random_state=42`.',
        '',
        '**Files:**',
        '- `data/train_split.csv` — 70% training',
        '- `data/calib_split.csv` — 15% calibration (temperature scaling)',
        '- `data/test_split.csv` — 15% held-out evaluation',
        '',
    ]
    # Inserted verbatim; a missing report becomes an empty line, not a crash.
    lines.append(split_report or '')
    lines += [
        '',
        '---',
        '',
        '## 6. Additional Dataset Search',
        '',
    ]
    if dataset_findings:
        lines += [
            '### Found directories:',
            '',
            '| Path | Images | Files | Matched |',
            '|------|--------|-------|---------|',
        ]
        for path, info in dataset_findings.items():
            lines.append(
                f'| `{path}` | {info["img_count"]:,} | '
                f'{info["total_files"]:,} | {info["matched_target"]} |'
            )
    else:
        lines.append('No additional dataset directories found.')

    lines += [
        '',
        '### Availability summary',
        '',
        '| Dataset | Status | Location |',
        '|---------|--------|----------|',
        '| ODIR-5K (ODIR) | **AVAILABLE** | `odir/preprocessed_images/` (4,878 images in CSV) |',
        '| ODIR-5K raw | **AVAILABLE** | `odir/ODIR-5K/ODIR-5K/Training Images/` (7,000) + Testing (1,000) |',
        '| APTOS 2019 | **AVAILABLE** | `aptos/gaussian_filtered_images/` (3,662 images) |',
        '| ocular-disease-recognition-odir5k | Partial (1 image only) | `ocular-disease-recognition-odir5k/preprocessed_images/` |',
    ]
    # Status of the optional datasets follows the actual search result.
    for name, _url in optional_sets:
        location = found_by_target.get(name.upper())
        if location is None:
            lines.append(f'| {name} | **NOT FOUND** | Needs download |')
        else:
            lines.append(f'| {name} | **AVAILABLE** | `{location}` |')

    lines += [
        '',
        '### AMD / Glaucoma specific images (beyond CSV)',
        '',
        f'- ODIR provides {int(class_counts.get(2, 0)):,} Glaucoma and '
        f'{int(class_counts.get(4, 0)):,} AMD images from '
        f'`odir/preprocessed_images/`.',
        '- ODIR raw training set (7,000 images) may contain additional',
        '  AMD/Glaucoma cases not yet extracted — check `odir/full_df.csv`.',
        '- For specialized Glaucoma detection: REFUGE2 (400 images,',
        '  Magrabia population) and RIM-ONE (159 images) are recommended.',
        '- For AMD: iChallenge-AMD (400 images) is the standard benchmark.',
        '',
        '---',
        '',
        '## 7. Action Items',
        '',
    ]
    if missing_sets:
        lines.append(
            '1. **Download missing datasets** to improve minority class coverage:'
        )
        # Three-space indent keeps the bullets nested under item 1.
        lines += [f'   - {name}: {url}' for name, url in missing_sets]
    else:
        lines.append(
            '1. **Integrate the additional datasets** found on disk to improve'
            ' minority class coverage.'
        )
    lines += [
        '2. **Fix paths in combined_dataset.csv**: update `aptos/train_images/` →',
        '   actual `gaussian_filtered_images/.../` paths.',
        '3. **Run full cache build** when training: `python retinasense_v3_preprocessing.py --cache-all`',
        '4. **Use computed normalization stats** from `data/fundus_norm_stats.json`',
        '   instead of ImageNet stats.',
        f'5. **Address {ratio_short} class imbalance**: consider weighted sampling or',
        f'   oversampling minority classes ({minority_text}).',
    ]

    report_path = os.path.join(DATA_DIR, 'data_engineer_report.md')
    # Explicit UTF-8: the report contains non-ASCII characters that a legacy
    # default locale encoding cannot represent.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
    print(f'[OK] Report saved: {report_path}')
1061
+
1062
+
1063
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()