#!/usr/bin/env python3
"""
RetinaSense v3 — Domain-Conditional Preprocessing Pipeline
===========================================================
Implements source-aware preprocessing:
  - APTOS   -> Ben Graham enhancement (high contrast DR-specific pipeline)
  - ODIR    -> CLAHE only (preserves sharpness, normalizes contrast)
  - REFUGE2 -> Resize only (images already clinical-grade high quality)

Image path resolution:
  - ODIR:  odir/preprocessed_images/<filename>
  - APTOS: aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png
           (looked up from aptos/train.csv; aptos/train_images/ does NOT exist)

Cache format: ./preprocessed_cache_v3/<stem>_v3.npy
"""

import os
import sys
import json
import warnings
import numpy as np
import pandas as pd
import cv2
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

# Install a global 'ignore' filter: no category/module is given, so every
# warning raised after this point is suppressed for the whole process.
warnings.filterwarnings('ignore')

# =========================================================
# PATHS
# =========================================================
# Root directory from which every other path in this module is derived.
BASE_DIR   = '/teamspace/studios/this_studio'
# Combined dataset CSV that main() loads.
CSV_PATH   = os.path.join(BASE_DIR, 'data', 'combined_dataset.csv')
# Directory holding the preprocessed .npy arrays (see cache_path_for).
CACHE_DIR  = os.path.join(BASE_DIR, 'preprocessed_cache_v3')
# Default output directory for split CSVs, norm stats JSON and comparison PNG.
DATA_DIR   = os.path.join(BASE_DIR, 'data')

# Directory that ODIR file names are remapped into by resolve_image_path.
ODIR_IMG_DIR   = os.path.join(BASE_DIR, 'odir', 'preprocessed_images')
# APTOS label CSV; read for its 'id_code' and 'diagnosis' columns.
APTOS_CSV      = os.path.join(BASE_DIR, 'aptos', 'train.csv')
# Parent of the per-grade APTOS image folders named in APTOS_DIAG_MAP.
APTOS_IMG_BASE = os.path.join(BASE_DIR, 'aptos',
                               'gaussian_filtered_images',
                               'gaussian_filtered_images')
# APTOS diagnosis grade -> sub-folder name under APTOS_IMG_BASE.
APTOS_DIAG_MAP = {0: 'No_DR', 1: 'Mild', 2: 'Moderate',
                  3: 'Severe', 4: 'Proliferate_DR'}

# Fixed ODIR image used as a fallback sample by make_preprocessing_comparison.
ODIR_SAMPLE    = os.path.join(BASE_DIR, 'ocular-disease-recognition-odir5k',
                               'preprocessed_images', '2977_left.jpg')

# Import-time side effect: make sure the cache and output directories exist.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(DATA_DIR,  exist_ok=True)

# Default edge length (pixels) of the square output of every pipeline.
TARGET_SIZE = 224

# =========================================================
# APTOS PATH LOOKUP TABLE
# Built once at module load; maps id_code (stem) -> abs path
# =========================================================

def _build_aptos_lookup() -> dict:
    """
    Build the APTOS id_code -> absolute image path table.

    Reads APTOS_CSV and, for every row, places the image in the grade
    folder named by APTOS_DIAG_MAP (falling back to 'No_DR' for a grade
    that is not in the map). Returns an empty dict when the CSV is absent.
    """
    if not os.path.exists(APTOS_CSV):
        return {}

    table = {}
    for _idx, record in pd.read_csv(APTOS_CSV).iterrows():
        grade_dir = APTOS_DIAG_MAP.get(int(record['diagnosis']), 'No_DR')
        image_file = str(record['id_code']) + '.png'
        table[str(record['id_code'])] = os.path.join(
            APTOS_IMG_BASE, grade_dir, image_file)
    return table


# id_code -> absolute path table, populated once when this module is imported
# (empty if APTOS_CSV does not exist at that moment).
_APTOS_LOOKUP: dict = _build_aptos_lookup()


# =========================================================
# PATH RESOLVER
# =========================================================

def resolve_image_path(raw_path: str, dataset: str = None) -> str:
    """
    Turn an image path as stored in the CSV into an absolute path.

    CSV entries look like:
      ODIR:  .//odir/preprocessed_images/0_left.jpg
      APTOS: .//aptos/train_images/000c1434d8d7.png  (train_images doesn't exist)

    Lookup order:
      1. The entry itself (joined onto BASE_DIR when relative), if it exists.
      2. ODIR entries -> ODIR_IMG_DIR/<filename> (returned without an
         existence check).
      3. APTOS entries -> _APTOS_LOOKUP[<stem>] when the stem is known.
      4. <filename> inside each known image directory, if it exists there.
      5. Otherwise the candidate from step 1, even though it is missing.

    An entry counts as ODIR/APTOS when `dataset` says so (case-insensitive)
    or when the dataset name appears anywhere in the path.
    """
    rel = raw_path.strip()
    # Remove at most one leading './/' or './' marker (longest form first).
    for marker in ('.//', './'):
        if rel.startswith(marker):
            rel = rel[len(marker):]
            break

    direct = rel if os.path.isabs(rel) else os.path.join(BASE_DIR, rel)
    if os.path.exists(direct):
        return direct

    file_name = os.path.basename(rel)
    source_tag = (dataset or '').upper().strip()
    lowered = rel.lower()

    if source_tag == 'ODIR' or 'odir' in lowered:
        return os.path.join(ODIR_IMG_DIR, file_name)

    if source_tag == 'APTOS' or 'aptos' in lowered:
        image_id = os.path.splitext(file_name)[0]
        if image_id in _APTOS_LOOKUP:
            return _APTOS_LOOKUP[image_id]

    # Last resort: look for the bare file name in each known image folder.
    for folder in (ODIR_IMG_DIR, APTOS_IMG_BASE):
        alternative = os.path.join(folder, file_name)
        if os.path.exists(alternative):
            return alternative

    # Nothing matched; hand back the first guess so callers can report it.
    return direct


# =========================================================
# PREPROCESSING FUNCTIONS
# =========================================================

def _load_image(image_path: str):
    """
    Read an image file and return it in RGB channel order.

    Returns None when cv2.imread cannot read the file; otherwise the
    BGR result of cv2.imread converted to RGB.
    """
    bgr = cv2.imread(image_path)
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def _crop_black_borders(img: np.ndarray, tol: int = 7) -> np.ndarray:
    """
    Crop the image to the bounding box of pixels brighter than `tol`.

    Brightness is measured on the grayscale conversion of `img`. If no
    pixel exceeds `tol`, the input is returned unchanged.
    """
    bright = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
    row_idx = np.flatnonzero(bright.any(axis=1))
    col_idx = np.flatnonzero(bright.any(axis=0))
    if row_idx.size == 0 or col_idx.size == 0:
        return img

    top, bottom = row_idx[0], row_idx[-1]
    left, right = col_idx[0], col_idx[-1]
    # Bounds are inclusive, hence the +1 on the slice ends.
    return img[top:bottom + 1, left:right + 1]


def _apply_circular_mask(img: np.ndarray) -> np.ndarray:
    """
    Keep only the pixels inside a centred disc; everything else becomes 0.

    The disc is centred at (w // 2, h // 2) and its radius is 48% of the
    shorter image side.
    """
    height, width = img.shape[:2]
    centre = (width // 2, height // 2)
    radius = int(min(height, width) * 0.48)

    fov = np.zeros((height, width), dtype=np.uint8)
    cv2.circle(fov, centre, radius, 255, -1)  # thickness -1 => filled disc
    return cv2.bitwise_and(img, img, mask=fov)


def ben_graham_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                           sigma: float = 10.0) -> np.ndarray:
    """
    Ben Graham fundus enhancement, the pipeline used for APTOS images.

    Steps, in order:
      1. Crop dark borders.
      2. Resize to (target_size, target_size) with INTER_AREA.
      3. Gaussian-blur the resized image with standard deviation `sigma`.
      4. Combine: 4*img - 4*blur + 128, which subtracts the smooth
         (low-frequency) component and re-centres intensities around 128.
      5. Apply the circular field-of-view mask.

    Returns a uint8 array of shape (target_size, target_size, 3).
    """
    cropped = _crop_black_borders(img)
    square = cv2.resize(cropped, (target_size, target_size),
                        interpolation=cv2.INTER_AREA)
    low_freq = cv2.GaussianBlur(square, (0, 0), sigma)
    enhanced = cv2.addWeighted(square, 4, low_freq, -4, 128)
    masked = _apply_circular_mask(enhanced)
    return np.clip(masked, 0, 255).astype(np.uint8)


def clahe_preprocess(img: np.ndarray, target_size: int = TARGET_SIZE,
                     clip_limit: float = 2.0,
                     tile_grid: tuple = (8, 8)) -> np.ndarray:
    """
    CLAHE contrast normalisation, the pipeline used for ODIR images.

    Steps, in order:
      1. Crop dark borders.
      2. Resize to (target_size, target_size) with INTER_AREA.
      3. Convert RGB -> LAB and equalise only the L channel with CLAHE
         (`clip_limit`, `tile_grid`), leaving the a/b channels untouched.
      4. Convert back to RGB.
      5. Apply the circular field-of-view mask.

    Returns a uint8 array of shape (target_size, target_size, 3).
    """
    square = cv2.resize(_crop_black_borders(img), (target_size, target_size),
                        interpolation=cv2.INTER_AREA)
    lum, chan_a, chan_b = cv2.split(cv2.cvtColor(square, cv2.COLOR_RGB2LAB))

    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    merged = cv2.merge([equalizer.apply(lum), chan_a, chan_b])

    rgb = cv2.cvtColor(merged, cv2.COLOR_LAB2RGB)
    return np.clip(_apply_circular_mask(rgb), 0, 255).astype(np.uint8)


def resize_only_preprocess(img: np.ndarray,
                            target_size: int = TARGET_SIZE) -> np.ndarray:
    """
    Resize-only pipeline, used for REFUGE2 images.

    The image is resized to (target_size, target_size) with INTER_AREA
    and returned as uint8; no cropping, enhancement or masking is applied.
    """
    dims = (target_size, target_size)
    resized = cv2.resize(img, dims, interpolation=cv2.INTER_AREA)
    return np.clip(resized, 0, 255).astype(np.uint8)


def preprocess_image(image_path: str, source: str,
                     target_size: int = TARGET_SIZE):
    """
    Load an image and run the pipeline that belongs to its dataset.

    Parameters
    ----------
    image_path : str
        Path of the fundus image file to load.
    source : str
        Dataset name, matched case-insensitively after stripping:
        'APTOS' -> Ben Graham, 'ODIR' -> CLAHE, 'REFUGE2' -> resize only.
        Any other value prints a warning and uses CLAHE.
    target_size : int
        Edge length of the square output.

    Returns
    -------
    The uint8 array produced by the selected pipeline, or None if the
    image cannot be loaded.
    """
    rgb = _load_image(image_path)
    if rgb is None:
        return None

    pipelines = {
        'APTOS': ben_graham_preprocess,
        'ODIR': clahe_preprocess,
        'REFUGE2': resize_only_preprocess,
    }
    handler = pipelines.get(source.upper().strip())
    if handler is None:
        # Unrecognised dataset name: warn and fall back to CLAHE.
        print(f'[WARN] Unknown source "{source}", applying CLAHE fallback.')
        handler = clahe_preprocess
    return handler(rgb, target_size=target_size)


# =========================================================
# CACHE HELPERS
# =========================================================

def cache_path_for(raw_csv_path: str) -> str:
    """
    Map a CSV image_path entry to its cache file.

    Only the file stem of the entry is used, so the result is
    CACHE_DIR/<stem>_v3.npy regardless of the entry's directory part.
    """
    file_name = Path(raw_csv_path).stem + '_v3.npy'
    return os.path.join(CACHE_DIR, file_name)


def is_cached(raw_csv_path: str) -> bool:
    """Return True when the cache file for this CSV entry exists on disk."""
    cache_file = cache_path_for(raw_csv_path)
    return os.path.exists(cache_file)


def save_to_cache(raw_csv_path: str, arr: np.ndarray) -> None:
    """
    Store `arr` as the cache entry for a CSV image_path entry.

    The array is first written to a temporary file next to the final
    location and then moved into place with os.replace. This way an
    interrupted run cannot leave a truncated .npy at the final path,
    which is_cached() would otherwise treat as a finished entry.

    Any exception raised while writing is propagated after the temporary
    file has been removed.
    """
    final_path = cache_path_for(raw_csv_path)
    # Include the PID so concurrent processes never share a temp file.
    tmp_path = f'{final_path}.{os.getpid()}.tmp'
    try:
        # Passing a file object keeps np.save from appending '.npy'.
        with open(tmp_path, 'wb') as handle:
            np.save(handle, arr)
        os.replace(tmp_path, final_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave partial files behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def load_from_cache(raw_csv_path: str):
    """
    Return the cached array for a CSV image_path entry.

    Returns None when no cache file exists for the entry.
    """
    cache_file = cache_path_for(raw_csv_path)
    if not os.path.exists(cache_file):
        return None
    return np.load(cache_file)


def cache_dataset(df: pd.DataFrame) -> dict:
    """
    Preprocess every row of `df` and write the result to the cache.

    Each row must provide 'image_path' and 'dataset'. Every row ends up in
    exactly one counter of the returned dict:
      already_cached  - a cache file was already present
      skipped_missing - the resolved image path does not exist
      errors          - the image could not be loaded
      processed       - preprocessed and saved
    'total' is the number of rows in `df`.
    """
    stats = {
        'processed': 0,
        'skipped_missing': 0,
        'already_cached': 0,
        'errors': 0,
        'total': len(df),
    }

    for _idx, record in tqdm(df.iterrows(), total=len(df), desc='Caching v3'):
        csv_entry, origin = record['image_path'], record['dataset']

        if is_cached(csv_entry):
            outcome = 'already_cached'
        else:
            located = resolve_image_path(csv_entry, origin)
            if not os.path.exists(located):
                outcome = 'skipped_missing'
            else:
                pixels = preprocess_image(located, origin)
                if pixels is None:
                    outcome = 'errors'
                else:
                    save_to_cache(csv_entry, pixels)
                    outcome = 'processed'

        stats[outcome] += 1

    return stats


# =========================================================
# PREPROCESSING COMPARISON VISUALIZATION
# =========================================================

def make_preprocessing_comparison(
        save_path: str = None,
        odir_raw_path: str = None,
        aptos_raw_path: str = None) -> str:
    """
    Generate and save a side-by-side comparison PNG showing
    ODIR (CLAHE) vs APTOS (Ben Graham) preprocessing pipelines.

    Parameters
    ----------
    save_path : str, optional
        Output PNG path. Defaults to DATA_DIR/preprocessing_comparison_v3.png.
    odir_raw_path : str, optional
        CSV-style path of the ODIR sample to show. If it cannot be resolved,
        ODIR_SAMPLE is tried, then the first image (by sorted file name) in
        ODIR_IMG_DIR.
    aptos_raw_path : str, optional
        CSV-style path of the APTOS sample to show. If it cannot be resolved,
        the first entry of the APTOS lookup table is used.

    When no usable image is found for a dataset, a synthetic placeholder
    image is drawn instead and a warning is printed.

    Returns the saved PNG path.
    """
    if save_path is None:
        save_path = os.path.join(DATA_DIR, 'preprocessing_comparison_v3.png')

    # --- Pick sample ODIR image ---
    odir_path = None
    if odir_raw_path:
        odir_path = resolve_image_path(odir_raw_path, 'ODIR')
    if odir_path is None or not os.path.exists(odir_path):
        # Use the fixed ODIR sample path.
        odir_path = ODIR_SAMPLE
    if not os.path.exists(odir_path):
        # Fall back to any image in odir/preprocessed_images. The directory
        # may itself be missing, in which case the synthetic image is used.
        imgs = []
        if os.path.isdir(ODIR_IMG_DIR):
            # Sorted so the chosen sample does not depend on listing order.
            imgs = sorted(
                os.path.join(ODIR_IMG_DIR, f)
                for f in os.listdir(ODIR_IMG_DIR)
                if f.lower().endswith(('.jpg', '.jpeg', '.png')))
        odir_path = imgs[0] if imgs else None

    # --- Pick sample APTOS image ---
    aptos_path = None
    if aptos_raw_path:
        aptos_path = resolve_image_path(aptos_raw_path, 'APTOS')
    if aptos_path is None or not os.path.exists(aptos_path):
        # Use first entry in APTOS lookup
        if _APTOS_LOOKUP:
            aptos_path = next(iter(_APTOS_LOOKUP.values()))

    # --- Load images ---
    def get_or_synthetic(path, name):
        """Return (rgb_image, label); draws a placeholder if loading fails."""
        if path and os.path.exists(path):
            img = _load_image(path)
            if img is not None:
                return img, path
        print(f'[WARN] {name} sample not found, using synthetic.')
        h, w = 512, 512
        # Private, fixed-seed generator: deterministic placeholder without
        # reseeding the process-wide NumPy random state.
        rng = np.random.RandomState(42)
        base = np.zeros((h, w, 3), dtype=np.uint8)
        cx, cy = w // 2, h // 2
        r = int(min(h, w) * 0.48)
        cv2.circle(base, (cx, cy), r, (60, 40, 25), -1)
        for _ in range(30):
            # int(...) so OpenCV receives plain Python integers.
            pt1 = (cx + int(rng.randint(-r, r)), cy + int(rng.randint(-r, r)))
            pt2 = (cx + int(rng.randint(-r, r)), cy + int(rng.randint(-r, r)))
            cv2.line(base, pt1, pt2, (100, 60, 35), 1)
        base = base.astype(np.float32) + rng.normal(0, 6, base.shape)
        return np.clip(base, 0, 255).astype(np.uint8), '(synthetic)'

    def amplified_diff(before, after, gain=3):
        """Return |before - after| * gain, saturated to the uint8 range."""
        # Widen before multiplying: uint8 * gain would wrap modulo 256 and
        # make the subsequent clip a no-op.
        delta = cv2.absdiff(before, after).astype(np.uint16)
        return np.clip(delta * gain, 0, 255).astype(np.uint8)

    odir_orig,  odir_src  = get_or_synthetic(odir_path,  'ODIR')
    aptos_orig, aptos_src = get_or_synthetic(aptos_path, 'APTOS')

    # Resize originals for display
    odir_disp  = cv2.resize(odir_orig,  (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)
    aptos_disp = cv2.resize(aptos_orig, (TARGET_SIZE, TARGET_SIZE),
                            interpolation=cv2.INTER_AREA)

    # Apply pipelines
    odir_clahe   = clahe_preprocess(odir_orig.copy())
    aptos_graham = ben_graham_preprocess(aptos_orig.copy())

    # Difference images, amplified x3 for visibility
    diff_odir  = amplified_diff(odir_disp,  odir_clahe)
    diff_aptos = amplified_diff(aptos_disp, aptos_graham)

    # --- Build figure ---
    fig, axes = plt.subplots(2, 3, figsize=(16, 11))
    fig.patch.set_facecolor('#1a1a2e')
    fig.suptitle(
        'RetinaSense v3 — Domain-Conditional Preprocessing\n'
        'ODIR: CLAHE Pipeline  |  APTOS: Ben Graham Pipeline',
        fontsize=13, fontweight='bold', color='white', y=1.01
    )

    panels = [
        # row, col, image, title, bg_color
        (0, 0, odir_disp,     f'ODIR: Original\n({os.path.basename(str(odir_src))})',
         '#1565C0'),
        (0, 1, odir_clahe,    'ODIR: After CLAHE\n(L-channel equalization, circular mask)',
         '#0D47A1'),
        (0, 2, diff_odir,     'ODIR: Difference x3\n(|original - CLAHE|, amplified)',
         '#263238'),
        (1, 0, aptos_disp,    f'APTOS: Original\n({os.path.basename(str(aptos_src))})',
         '#BF360C'),
        (1, 1, aptos_graham,  'APTOS: After Ben Graham\n(4*img - 4*blur(σ=10) + 128)',
         '#870000'),
        (1, 2, diff_aptos,    'APTOS: Difference x3\n(|original - Ben Graham|, amplified)',
         '#1B5E20'),
    ]

    for r, c, img_arr, title, fc in panels:
        ax = axes[r, c]
        ax.imshow(img_arr)
        ax.set_title(title, fontsize=9, color='white', pad=5,
                     bbox=dict(boxstyle='round,pad=0.3', facecolor=fc,
                               alpha=0.85, edgecolor='none'))
        ax.axis('off')
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Annotation boxes
    odir_note = (
        'ODIR Pipeline\n'
        '━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. Convert RGB→LAB\n'
        '4. CLAHE on L channel\n'
        '   clip=2.0, tile=8×8\n'
        '5. LAB→RGB\n'
        '6. Circular mask (r=0.48)'
    )
    aptos_note = (
        'APTOS Pipeline (Ben Graham)\n'
        '━━━━━━━━━━━━━━━━━━━━━━━━━━\n'
        '1. Crop black borders\n'
        '2. Resize → 224×224\n'
        '3. blur = GaussianBlur(σ=10)\n'
        '4. out = 4×img − 4×blur + 128\n'
        '5. Circular mask (r=0.48)\n'
        '6. clip to [0, 255]'
    )

    fig.text(0.02, 0.92, odir_note,  fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#1565C0', alpha=0.6))
    fig.text(0.02, 0.48, aptos_note, fontsize=8.5, va='top', ha='left',
             color='white', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='#870000', alpha=0.6))

    fig.tight_layout(rect=[0.18, 0, 1, 1])
    fig.savefig(save_path, dpi=150, bbox_inches='tight',
                facecolor='#1a1a2e', edgecolor='none')
    # Close this specific figure so it is released even if another figure
    # has become the "current" one.
    plt.close(fig)
    print(f'[OK] Comparison saved: {save_path}')
    return save_path


# =========================================================
# NORMALIZATION STATISTICS
# =========================================================

def compute_norm_stats(train_df: pd.DataFrame,
                       out_path: str = None,
                       max_images: int = None) -> dict:
    """
    Compute per-channel mean and std over all pixels of the given images
    after domain-conditional preprocessing, and write them to a JSON file.

    Parameters
    ----------
    train_df : DataFrame with 'image_path' and 'dataset' columns. Pass the
        training split only, so no validation/test data leaks into the stats.
    out_path : destination JSON; defaults to DATA_DIR/fundus_norm_stats.json.
    max_images : if given, a fixed-seed random sample of at most this many
        rows is used instead of every row.

    Pixel values are scaled to [0, 1] before accumulation. Cached arrays are
    used when available; otherwise the image is preprocessed on the fly.
    Rows whose image is missing or unreadable are counted in 'n_missing'.

    Returns the stats dict that was written. When no image could be used,
    the ImageNet mean/std are stored as a fallback.
    """
    target_json = (out_path if out_path is not None
                   else os.path.join(DATA_DIR, 'fundus_norm_stats.json'))

    frame = train_df.copy()
    if max_images is not None:
        frame = frame.sample(min(max_images, len(frame)), random_state=42)

    # Running sums of x and x^2 per channel; enough to derive mean and std.
    totals      = np.zeros(3, dtype=np.float64)
    totals_sq   = np.zeros(3, dtype=np.float64)
    pixel_count = 0
    used_count  = 0
    lost_count  = 0

    for _idx, record in tqdm(frame.iterrows(), total=len(frame),
                             desc='Norm stats'):
        csv_entry = record['image_path']
        origin = record['dataset']

        # Cached array first; preprocess from disk only on a cache miss.
        image = load_from_cache(csv_entry)
        if image is None:
            located = resolve_image_path(csv_entry, origin)
            if os.path.exists(located):
                image = preprocess_image(located, origin)
            if image is None:
                lost_count += 1
                continue

        flat = (image.astype(np.float64) / 255.0).reshape(-1, 3)
        totals      += flat.sum(axis=0)
        totals_sq   += (flat ** 2).sum(axis=0)
        pixel_count += flat.shape[0]
        used_count  += 1

    if used_count:
        mean = totals / pixel_count
        # Var[x] = E[x^2] - E[x]^2; clamp at 0 against rounding error.
        var  = totals_sq / pixel_count - mean ** 2
        std  = np.sqrt(np.maximum(var, 0.0))
        stats = {
            'mean_rgb': [round(float(v), 6) for v in mean],
            'std_rgb':  [round(float(v), 6) for v in std],
            'n_images': used_count,
            'n_pixels_per_channel': int(pixel_count),
            'n_missing': lost_count,
            'note': ('Computed on training split only after domain-conditional '
                     'preprocessing. Red-dominant channel expected (fundus tissue).'),
            'source': 'computed_training_split'
        }
        print(f'  mean RGB : {[round(v,4) for v in mean]}')
        print(f'  std  RGB : {[round(v,4) for v in std]}')
        print(f'  images   : {used_count:,}  |  missing: {lost_count}')
    else:
        print('[WARN] No images found — storing ImageNet defaults as fallback.')
        stats = {
            'mean_rgb': [0.485, 0.456, 0.406],
            'std_rgb':  [0.229, 0.224, 0.225],
            'n_images': 0,
            'n_pixels_per_channel': 0,
            'n_missing': lost_count,
            'note': 'No images found — ImageNet defaults used as fallback',
            'source': 'imagenet_fallback'
        }

    with open(target_json, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f'[OK] Stats saved: {target_json}')
    return stats


# =========================================================
# 3-WAY STRATIFIED SPLIT
# =========================================================

def create_stratified_split(df: pd.DataFrame,
                             train_ratio: float = 0.70,
                             calib_ratio: float = 0.15,
                             test_ratio:  float = 0.15,
                             random_state: int = 42) -> tuple:
    """
    Split `df` into train / calibration / test parts, stratified on the
    'disease_label' column.

    The split is done in two steps: first train vs. the rest, then the
    rest into calibration vs. test. The three ratios must sum to 1.

    Returns (train_df, calib_df, test_df), each with a fresh 0..n-1 index.
    """
    from sklearn.model_selection import train_test_split as _tts
    assert abs(train_ratio + calib_ratio + test_ratio - 1.0) < 1e-9

    holdout_ratio = calib_ratio + test_ratio
    train_part, holdout = _tts(
        df, test_size=holdout_ratio,
        stratify=df['disease_label'], random_state=random_state
    )

    # Share of the hold-out set that goes to the test part.
    test_share = 1.0 - calib_ratio / holdout_ratio
    calib_part, test_part = _tts(
        holdout, test_size=test_share,
        stratify=holdout['disease_label'], random_state=random_state
    )

    return tuple(part.reset_index(drop=True)
                 for part in (train_part, calib_part, test_part))


def save_splits(train_df, calib_df, test_df, out_dir: str = DATA_DIR):
    """
    Write the three splits to out_dir as train_split.csv, calib_split.csv
    and test_split.csv (no index column).

    `out_dir` is created if it does not exist yet, so a caller-supplied
    directory works the same way as the default one.
    """
    os.makedirs(out_dir, exist_ok=True)
    for file_name, frame in (('train_split.csv', train_df),
                             ('calib_split.csv', calib_df),
                             ('test_split.csv',  test_df)):
        frame.to_csv(os.path.join(out_dir, file_name), index=False)
    print(f'[OK] Split CSVs saved to {out_dir}/')


def print_split_stats(train_df, calib_df, test_df,
                      class_names: dict = None) -> str:
    """
    Print and return a text table of per-class counts for the three splits.

    Parameters
    ----------
    train_df, calib_df, test_df : DataFrames with a 'disease_label' column.
    class_names : mapping label -> display name; one table row is emitted
        per key, in sorted key order. Defaults to the five RetinaSense
        classes.

    The per-split column totals in the TOTAL row are summed over the listed
    classes only, while the final 'Total' cell and the ratios use the full
    split lengths. Ratios are reported as 0.0% when all splits are empty.

    Returns the report as a single newline-joined string.
    """
    if class_names is None:
        class_names = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                       3: 'Cataract', 4: 'AMD'}

    splits = (train_df, calib_df, test_df)
    n_train, n_calib, n_test = (len(part) for part in splits)
    total_n = n_train + n_calib + n_test

    def share(count):
        # Avoid ZeroDivisionError when every split is empty.
        return count / total_n if total_n else 0.0

    lines = [
        '',
        '=' * 62,
        '  STRATIFIED SPLIT — CLASS DISTRIBUTION',
        '=' * 62,
        f"{'Class':<16} {'Train':>8} {'Calib':>8} {'Test':>8} {'Total':>8}",
        '-' * 54,
    ]

    col_totals = [0, 0, 0]
    for lbl in sorted(class_names.keys()):
        counts = [int((part['disease_label'] == lbl).sum()) for part in splits]
        col_totals = [acc + cnt for acc, cnt in zip(col_totals, counts)]
        tr, ca, te = counts
        lines.append(
            f"{class_names[lbl]:<16} {tr:>8,} {ca:>8,} {te:>8,} {sum(counts):>8,}"
        )

    tr_tot, ca_tot, te_tot = col_totals
    lines += [
        '-' * 54,
        f"{'TOTAL':<16} {tr_tot:>8,} {ca_tot:>8,} {te_tot:>8,} {total_n:>8,}",
        '',
        f'Split sizes : train={n_train:,}  calib={n_calib:,}  '
        f'test={n_test:,}',
        f'Actual ratios: train={share(n_train):.1%}  '
        f'calib={share(n_calib):.1%}  '
        f'test={share(n_test):.1%}',
    ]
    report = '\n'.join(lines)
    print(report)
    return report


# =========================================================
# ADDITIONAL DATASET SEARCH
# =========================================================

def search_additional_datasets() -> dict:
    """
    Scan the filesystem for directories whose name suggests a REFUGE2,
    iChallenge-AMD, RIM-ONE or other AMD/Glaucoma/ODIR dataset.

    Every directory under the search roots whose (lower-cased) name contains
    one of the target keywords is recorded, tagged with the first keyword
    that matches, together with the number of image files and of all files
    found directly inside it. Hidden directories and known output folders
    are not descended into.

    Three known dataset directories are always reported when they exist and
    were not already recorded; for those the counts cover the whole tree.

    Returns a dict: directory path -> {'matched_target', 'img_count',
    'total_files'}.
    """
    IMG_EXTS = {'.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp'}
    TARGETS  = ['refuge2', 'refuge', 'ichallenge', 'rim-one', 'rimone',
                'amd', 'glaucoma', 'odir5k', 'odir']
    SEARCH_ROOTS = ['/teamspace/studios/this_studio', '/teamspace/uploads']
    SKIP_DIRS    = {'.git', '.cache', '.claude', '.ipython', '.npm',
                    '__pycache__', 'outputs_analysis', 'outputs_ensemble',
                    'outputs_optimized', 'outputs_production', 'outputs_v2',
                    'outputs_v2_extended', 'outputs_vit'}

    def count_images(names):
        return sum(1 for name in names
                   if os.path.splitext(name)[1].lower() in IMG_EXTS)

    findings = {}

    for root_dir in SEARCH_ROOTS:
        if not os.path.exists(root_dir):
            continue
        for dirpath, dirnames, files in os.walk(root_dir):
            # Prune in place so os.walk does not descend into these.
            dirnames[:] = [d for d in dirnames
                           if d not in SKIP_DIRS and not d.startswith('.')]
            folder = os.path.basename(dirpath).lower()
            # Only the first matching keyword is ever recorded, so there is
            # no need to test the remaining ones.
            target = next((t for t in TARGETS if t in folder), None)
            if target is None:
                continue
            img_cnt = count_images(files)
            if (dirpath not in findings
                    or img_cnt > findings[dirpath]['img_count']):
                findings[dirpath] = {
                    'matched_target': target,
                    'img_count': img_cnt,
                    'total_files': len(files)
                }

    # Always include the known special dirs
    for special in [
        '/teamspace/studios/this_studio/ocular-disease-recognition-odir5k',
        '/teamspace/studios/this_studio/odir',
        '/teamspace/studios/this_studio/aptos',
    ]:
        if not os.path.exists(special) or special in findings:
            continue
        # Single pass over the tree for both counters.
        img_cnt = 0
        file_cnt = 0
        for _root, _dirs, names in os.walk(special):
            file_cnt += len(names)
            img_cnt += count_images(names)
        findings[special] = {
            'matched_target': 'known_dataset',
            'img_count': img_cnt,
            'total_files': file_cnt
        }

    return findings


# =========================================================
# MAIN
# =========================================================

def main():
    """
    Run the full data pipeline and print a progress report.

    Tasks, in order:
      1. Dataset audit (class / dataset / severity distributions and an
         on-disk existence check for every image path).
      2. Preprocessing comparison figure plus a small pipeline smoke test.
      3. 70/15/15 stratified split, saved as CSVs.
      4. Normalization statistics on the training split.
      5. Search for additional datasets on the filesystem.
    Finally the markdown report is written.

    Returns (df, train_df, calib_df, test_df, norm_stats).
    """
    print('=' * 65)
    print('  RetinaSense v3 — Data Pipeline')
    print('=' * 65)

    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    # -------------------------------------------------------
    # TASK 1: Dataset Audit
    # -------------------------------------------------------
    print('\n[TASK 1] Dataset Audit')
    print('-' * 50)
    df = pd.read_csv(CSV_PATH)
    print(f'  CSV          : {CSV_PATH}')
    print(f'  Total rows   : {len(df):,}')
    print(f'  Columns      : {df.columns.tolist()}')
    print()

    print('  --- Overall class distribution ---')
    class_counts = df['disease_label'].value_counts()
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        bar = '#' * int(pct / 2)
        print(f"    {lbl} {CLASS_NAMES.get(lbl,'?'):<12} : {cnt:>5}  ({pct:5.1f}%)  {bar}")

    max_cls = class_counts.max()
    min_cls = class_counts.min()
    print(f'\n  Imbalance ratio (max/min): {max_cls/min_cls:.1f}:1')
    print()

    print('  --- Per-dataset breakdown ---')
    per_ds = (df.groupby(['dataset', 'disease_label'])
               .size().reset_index(name='count'))
    print(per_ds.to_string(index=False))
    print()

    print('  --- Severity label distribution (APTOS only) ---')
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        label = 'N/A (ODIR)' if sev == -1 else f'Grade {sev}'
        print(f"    {sev:>3} ({label:<14}): {cnt:>5}")
    print()

    print('  --- Image path existence check ---')
    n_found = sum(
        1 for _, row in df.iterrows()
        if os.path.exists(resolve_image_path(row['image_path'],
                                             row['dataset']))
    )
    n_missing = len(df) - n_found
    print(f'    Total checked : {len(df):,}')
    print(f'    Found on disk : {n_found:,}')
    print(f'    Missing       : {n_missing:,}')
    print()

    # -------------------------------------------------------
    # TASK 2: Preprocessing Comparison
    # -------------------------------------------------------
    print('[TASK 2] Domain-Conditional Preprocessing Comparison')
    print('-' * 50)

    # Get representative samples from each dataset
    odir_rows  = df[df['dataset'] == 'ODIR']
    aptos_rows = df[df['dataset'] == 'APTOS']
    odir_sample  = odir_rows['image_path'].iloc[0] \
                   if len(odir_rows) > 0 else None
    aptos_sample = aptos_rows['image_path'].iloc[0] \
                   if len(aptos_rows) > 0 else None

    comp_path = make_preprocessing_comparison(
        odir_raw_path=odir_sample,
        aptos_raw_path=aptos_sample
    )

    # Demo: process a few images to verify pipeline
    def count_ok(rows, source_name):
        """Count how many of the first 5 rows preprocess to the right shape."""
        ok = 0
        for _, row in rows.head(5).iterrows():
            p = resolve_image_path(row['image_path'], source_name)
            if os.path.exists(p):
                arr = preprocess_image(p, source_name)
                if arr is not None and arr.shape == (TARGET_SIZE, TARGET_SIZE, 3):
                    ok += 1
        return ok

    print('\n  --- Pipeline verification (5 ODIR + 5 APTOS) ---')
    ok_odir  = count_ok(odir_rows,  'ODIR')
    ok_aptos = count_ok(aptos_rows, 'APTOS')
    print(f'    ODIR  (CLAHE)      : {ok_odir}/5 OK')
    print(f'    APTOS (Ben Graham) : {ok_aptos}/5 OK')
    print()

    # -------------------------------------------------------
    # TASK 3: Stratified Split
    # -------------------------------------------------------
    print('[TASK 3] 3-Way Stratified Split (70 / 15 / 15)')
    print('-' * 50)
    train_df, calib_df, test_df = create_stratified_split(df)
    save_splits(train_df, calib_df, test_df)
    split_report = print_split_stats(train_df, calib_df, test_df, CLASS_NAMES)
    print()

    # -------------------------------------------------------
    # TASK 4: Normalization Statistics (training split only)
    # -------------------------------------------------------
    print('[TASK 4] Fundus Normalization Statistics (training split)')
    print('-' * 50)
    norm_stats = compute_norm_stats(train_df)
    print()

    # -------------------------------------------------------
    # TASK 5: Additional Dataset Search
    # -------------------------------------------------------
    print('[TASK 5] Additional Dataset Search')
    print('-' * 50)
    findings = search_additional_datasets()
    if findings:
        print(f'  Found {len(findings)} dataset directories:')
        for path, info in findings.items():
            print(f'    {path}')
            print(f'      images: {info["img_count"]:,}  '
                  f'files: {info["total_files"]:,}  '
                  f'matched: "{info["matched_target"]}"')
    else:
        print('  No additional datasets found.')
    print()

    # Summary of what needs downloading.
    # 'matched_target' holds the search keyword (e.g. 'ichallenge', 'rimone'),
    # not the dataset name, so translate keywords to dataset names before
    # comparing. Upper-casing the keyword alone never yields 'ICHALLENGE-AMD'.
    # Keywords such as 'amd', 'glaucoma' or 'odir' do not identify one of the
    # three datasets and are deliberately left out of the map.
    target_to_dataset = {
        'refuge2':    'REFUGE2',
        'refuge':     'REFUGE2',
        'ichallenge': 'ICHALLENGE-AMD',
        'rim-one':    'RIM-ONE',
        'rimone':     'RIM-ONE',
    }
    known_sets = {'REFUGE2', 'ICHALLENGE-AMD', 'RIM-ONE'}
    found_names = {target_to_dataset[info['matched_target']]
                   for info in findings.values()
                   if info['matched_target'] in target_to_dataset}
    missing_sets = known_sets - found_names
    if missing_sets:
        print(f'  Datasets NOT found (need downloading): {missing_sets}')

    # -------------------------------------------------------
    # Write report
    # -------------------------------------------------------
    _write_report(df, train_df, calib_df, test_df, norm_stats,
                  findings, split_report, comp_path)

    print('\n' + '=' * 65)
    print('  All tasks complete.')
    print('=' * 65)
    return df, train_df, calib_df, test_df, norm_stats


# =========================================================
# REPORT WRITER
# =========================================================

def _write_report(df, train_df, calib_df, test_df, norm_stats,
                  dataset_findings, split_report, comp_path):
    """Write ``data_engineer_report.md`` into ``DATA_DIR``.

    The report is assembled as a list of Markdown lines and written in a
    single call. Most sections are static text; the dataset statistics,
    normalization table, split summary and dataset-search table are filled
    in from the arguments.

    Args:
        df: Full dataset frame. Columns read here: ``image_path``,
            ``dataset``, ``disease_label`` and ``severity_label``.
        train_df, calib_df, test_df: Split frames. Accepted so the call
            signature mirrors the pipeline stages; not read in this function.
        norm_stats: Mapping with keys ``mean_rgb``, ``std_rgb`` (indexable
            by 0..2), ``n_images``, ``note`` and ``source``.
        dataset_findings: Mapping of directory path -> info dict with keys
            ``img_count``, ``total_files`` and ``matched_target``. May be
            empty.
        split_report: Pre-formatted text describing the splits; inserted
            verbatim. ``None`` is tolerated and treated as empty text.
        comp_path: Path of the comparison figure, quoted in the report.

    Returns:
        None. The report is written to disk and its path is printed.
    """
    # Local import: the date is only needed for the report header.
    from datetime import date

    CLASS_NAMES = {0: 'Normal', 1: 'Diabetes/DR', 2: 'Glaucoma',
                   3: 'Cataract', 4: 'AMD'}

    def class_name(label):
        # Labels outside the known map must not abort report generation.
        return CLASS_NAMES.get(label, f'Unknown({label})')

    # Only two columns are needed, so iterate them directly instead of
    # materialising a Series per row with iterrows().
    n_found = sum(
        1 for img_path, source in zip(df['image_path'], df['dataset'])
        if os.path.exists(resolve_image_path(img_path, source))
    )

    # Computed once and reused for the distribution table and the ratio.
    class_counts = df['disease_label'].value_counts()

    lines = [
        '# RetinaSense v3 — Data Engineer Report',
        # Stamp the actual generation date rather than a frozen literal.
        f'Generated: {date.today().isoformat()}',
        '',
        '---',
        '',
        '## 1. Dataset Statistics',
        '',
        '**Source CSV:** `data/combined_dataset.csv`  ',
        f'**Total images in CSV:** {len(df):,}  ',
        f'**Images found on disk:** {n_found:,} / {len(df):,}  ',
        '',
        '### Source breakdown',
        '',
        '| Dataset | Count | Labels present |',
        '|---------|-------|----------------|',
    ]
    for ds, grp in df.groupby('dataset'):
        labels = sorted(grp['disease_label'].unique())
        label_str = ', '.join(f'{lbl}={class_name(lbl)}' for lbl in labels)
        lines.append(f'| {ds} | {len(grp):,} | {label_str} |')

    lines += [
        '',
        '### Class distribution (full dataset)',
        '',
        '| Label | Class | Count | % |',
        '|-------|-------|-------|---|',
    ]
    for lbl, cnt in class_counts.sort_index().items():
        pct = cnt / len(df) * 100
        lines.append(
            f'| {lbl} | {class_name(lbl)} | {cnt:,} | {pct:.1f}% |'
        )
    max_cls = class_counts.max()
    min_cls = class_counts.min()
    lines += [
        '',
        f'**Imbalance ratio (Diabetes:AMD):** {max_cls/min_cls:.1f}:1',
        '',
        '### Severity label distribution (APTOS DR grades, -1 = ODIR no grade)',
        '',
        '| Severity | Meaning | Count |',
        '|----------|---------|-------|',
    ]
    for sev, cnt in df['severity_label'].value_counts().sort_index().items():
        meaning = 'N/A (ODIR, no grade)' if sev == -1 else f'DR Grade {sev}'
        lines.append(f'| {sev} | {meaning} | {cnt:,} |')

    lines += [
        '',
        '---',
        '',
        '## 2. Image Path Resolution',
        '',
        '| Dataset | CSV path format | Actual location |',
        '|---------|-----------------|-----------------|',
        '| ODIR  | `.//odir/preprocessed_images/<name>.jpg` | `odir/preprocessed_images/<name>.jpg` |',
        '| APTOS | `.//aptos/train_images/<id>.png` (train_images does NOT exist) | `aptos/gaussian_filtered_images/gaussian_filtered_images/<class>/<id>.png` |',
        '',
        '`train_images/` directory is absent; actual APTOS images are stored under',
        '`gaussian_filtered_images/gaussian_filtered_images/<DR_grade>/`. The',
        '`aptos/train.csv` maps `id_code` → `diagnosis` (0-4) enabling lookup.',
        '',
        '---',
        '',
        '## 3. Preprocessing: Domain-Conditional Pipeline',
        '',
        '**Problem:** Previous versions applied Ben Graham enhancement uniformly to',
        'ALL images. This is incorrect: ODIR images have already-enhanced or',
        'clinical-quality appearance; applying Ben Graham degrades them.',
        '',
        '**Fix:** Source-conditional dispatch in `preprocess_image(path, source)`.',
        '',
        '| Source | Method | Rationale |',
        '|--------|--------|-----------|',
        '| APTOS  | Ben Graham (4×img − 4×blur(σ=10) + 128 + circular mask) | Field camera images have vignetting and low local contrast. Ben Graham removes low-frequency illumination and amplifies vessel/lesion detail. |',
        '| ODIR   | CLAHE (L-channel, clip=2.0, tile=8×8, circular mask) | Multi-source clinical images. CLAHE normalizes local contrast while preserving sharpness and avoiding Ben Graham over-processing. |',
        '| REFUGE2 | Resize only (224×224) | Zeiss Visucam 500 — already standardized high-quality. |',
        '',
        f'**Comparison figure:** `{comp_path}`',
        '',
        '**Cache location:** `preprocessed_cache_v3/<stem>_v3.npy`  ',
        '**Cache key:** image filename stem (not row index)',
        '',
        '---',
        '',
        '## 4. Normalization Statistics',
        '',
        '**Method:** One pass over training split pixels (post-preprocessing).',
        'No validation or test images used.',
        '',
        '| Channel | Mean | Std |',
        '|---------|------|-----|',
        f'| R (red)   | {norm_stats["mean_rgb"][0]:.4f} | {norm_stats["std_rgb"][0]:.4f} |',
        f'| G (green) | {norm_stats["mean_rgb"][1]:.4f} | {norm_stats["std_rgb"][1]:.4f} |',
        f'| B (blue)  | {norm_stats["mean_rgb"][2]:.4f} | {norm_stats["std_rgb"][2]:.4f} |',
        '',
        f'**Images used:** {norm_stats["n_images"]:,}  ',
        f'**Note:** {norm_stats["note"]}  ',
        f'**Source:** `{norm_stats["source"]}`',
    ]

    # The sanity-check paragraph only applies to freshly computed statistics.
    if norm_stats['source'] == 'computed_training_split':
        lines += [
            '',
            'Expected pattern for fundus images: R > G > B (red-dominant)',
            'due to high hemoglobin absorption. Computed values should match',
            'expected ≈ [0.41, 0.27, 0.19] mean, [0.28, 0.19, 0.16] std.',
        ]

    lines += [
        '',
        '**Saved to:** `data/fundus_norm_stats.json`',
        '',
        '---',
        '',
        '## 5. Stratified Split (70 / 15 / 15)',
        '',
        '**Strategy:** `sklearn.model_selection.train_test_split` with',
        '`stratify=disease_label`, `random_state=42`.',
        '',
        '**Files:**',
        '- `data/train_split.csv` — 70% training',
        '- `data/calib_split.csv` — 15% calibration (temperature scaling)',
        '- `data/test_split.csv`  — 15% held-out evaluation',
        '',
    ]
    # Inserted verbatim; a missing report must not break the final join.
    lines.append(split_report or '')
    lines += [
        '',
        '---',
        '',
        '## 6. Additional Dataset Search',
        '',
    ]
    if dataset_findings:
        lines.append('### Found directories:')
        lines.append('')
        lines.append('| Path | Images | Files | Matched |')
        lines.append('|------|--------|-------|---------|')
        for path, info in dataset_findings.items():
            lines.append(
                f'| `{path}` | {info["img_count"]:,} | '
                f'{info["total_files"]:,} | {info["matched_target"]} |'
            )
    else:
        lines.append('No additional dataset directories found.')

    n_glaucoma = int((df['disease_label'] == 2).sum())
    n_amd = int((df['disease_label'] == 4).sum())

    # NOTE(review): the availability table and the figures quoted in the
    # action items below are static text, not derived from the arguments.
    lines += [
        '',
        '### Availability summary',
        '',
        '| Dataset | Status | Location |',
        '|---------|--------|----------|',
        '| ODIR-5K (ODIR) | **AVAILABLE** | `odir/preprocessed_images/` (4,878 images in CSV) |',
        '| ODIR-5K raw    | **AVAILABLE** | `odir/ODIR-5K/ODIR-5K/Training Images/` (7,000) + Testing (1,000) |',
        '| APTOS 2019     | **AVAILABLE** | `aptos/gaussian_filtered_images/` (3,662 images) |',
        '| ocular-disease-recognition-odir5k | Partial (1 image only) | `ocular-disease-recognition-odir5k/preprocessed_images/` |',
        '| REFUGE2        | **NOT FOUND** | Needs download |',
        '| iChallenge-AMD | **NOT FOUND** | Needs download |',
        '| RIM-ONE        | **NOT FOUND** | Needs download |',
        '',
        '### AMD / Glaucoma specific images (beyond CSV)',
        '',
        f'- ODIR provides {n_glaucoma:,} Glaucoma and '
        f'{n_amd:,} AMD images from '
        f'`odir/preprocessed_images/`.',
        '- ODIR raw training set (7,000 images) may contain additional',
        '  AMD/Glaucoma cases not yet extracted — check `odir/full_df.csv`.',
        '- For specialized Glaucoma detection: REFUGE2 (400 images,',
        '  Magrabia population) and RIM-ONE (159 images) are recommended.',
        '- For AMD: iChallenge-AMD (400 images) is the standard benchmark.',
        '',
        '---',
        '',
        '## 7. Action Items',
        '',
        '1. **Download missing datasets** to improve minority class coverage:',
        '   - REFUGE2: https://refuge.grand-challenge.org/',
        '   - RIM-ONE: http://medimrg.webs.ull.es/research/retinal-imaging/rim-one/',
        '   - iChallenge-AMD: https://amd.grand-challenge.org/',
        '2. **Fix paths in combined_dataset.csv**: update `aptos/train_images/` →',
        '   actual `gaussian_filtered_images/.../` paths.',
        '3. **Run full cache build** when training: `python retinasense_v3_preprocessing.py --cache-all`',
        '4. **Use computed normalization stats** from `data/fundus_norm_stats.json`',
        '   instead of ImageNet stats.',
        '5. **Address 21:1 class imbalance**: consider weighted sampling or',
        '   oversampling minority classes (AMD=265, Glaucoma=308).',
    ]

    report_path = os.path.join(DATA_DIR, 'data_engineer_report.md')
    # The report contains non-ASCII characters (—, ×, σ, →, ≈); an explicit
    # UTF-8 encoding avoids UnicodeEncodeError under non-UTF-8 locale defaults.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
    print(f'[OK] Report saved: {report_path}')


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()