"""RamanBench Leaderboard — Gradio Space.

Data is pre-computed by scripts/export_leaderboard.py in the raman_bench_paper repo
and stored as CSVs in data/ and figures in figures/.  The space only reads
and displays them.
"""
import os
import gradio as gr
import pandas as pd

# Directories that sit next to this file: CSV tables and figure images.
DATA_DIR    = os.path.join(os.path.dirname(__file__), "data")
FIGURES_DIR = os.path.join(os.path.dirname(__file__), "figures")

# Version string interpolated at the end of DESCRIPTION.
VERSION = "0.1.0"

# Page heading (Markdown).
TITLE = "# 🔬 RamanBench Leaderboard"

# HTML header row: maintainer credits on the left, paper / GitHub / PyPI links on the right.
MAINTAINERS = """
<div style="display:flex; justify-content:space-between; align-items:center; flex-wrap:wrap; gap:0.75em; margin-bottom:1em; font-size:0.9em; max-width:100%;">
  <span style="flex:1; min-width:200px;">Maintained by
    <a href="https://kiwerkstatt.f2.htw-berlin.de/" target="_blank"><strong>KI-Werkstatt HTW Berlin</strong></a>
    (Mario Koddenbrock) &amp;
    <a href="https://www.tu.berlin/en/bioprocess" target="_blank"><strong>Dept. of Biotechnology TU Berlin</strong></a>
    (Christoph Lange)
  </span>
  <span style="display:flex; gap:1.25em; flex-shrink:0;">
    <a href="https://arxiv.org/abs/2605.02003" target="_blank">📄 Paper</a>
    <a href="https://github.com/ml-lab-htw/RamanBench" target="_blank">💻 GitHub</a>
    <a href="https://pypi.org/project/raman-bench/" target="_blank">📦 PyPI</a>
  </span>
</div>
"""

# Markdown introduction. This is an f-string: the only substitution is {VERSION}
# on the last line, so any literal braces added here must be doubled.
DESCRIPTION = f"""
**RamanBench** is a large-scale benchmark for machine learning on Raman spectroscopy,
unifying **74 publicly available datasets** (163 prediction targets) across four application domains —
Material Science, Biological, Medical, and Chemical — covering both classification and regression tasks
and spanning diverse instruments and experimental conditions.
We compare **28 models**, from the domain standard PLS to modern tabular foundation models
(TabPFN, TabICL) and time-series classifiers (ROCKET, ARSENAL).

**Elo** is calibrated so Random Forest = 1000. &nbsp;
**Score** is a combined normalized performance index based on F1 (classification) and RMSE (regression) — best model per target = 1, median = 0; higher is better. &nbsp;
**Improvability %** is the mean relative gap to the best model per target (lower is better). &nbsp;
`v{VERSION}`
"""

# HTML callout crediting TabArena; styled by the `.tabarena-note` rules in the custom CSS.
TABARENA_NOTE = """
<div class="tabarena-note">
🔗 RamanBench is a Raman-specific adaptation of <a href="https://huggingface.co/spaces/TabArena/leaderboard" target="_blank"><strong>TabArena</strong></a>, a general benchmark for tabular ML.
</div>
"""

# BibTeX entry for the RamanBench paper.
# This must be a raw string: the LaTeX umlaut escapes (\"a, \"o) need their
# backslash, and in a normal string literal Python turns \" into a bare
# double quote, which leaves J{"a}ger / K{"o}gler in the citation.
CITATION = r"""@article{koddenbrock2026ramanbench,
  title={RamanBench: A Large-Scale Benchmark for Machine Learning on Raman Spectroscopy},
  author={Koddenbrock, Mario and Lange, Christoph and Legner, Robin and J{\"a}ger, Martin and K{\"o}gler, Martin and Bournazou, Mariano N Cruz and Neubauer, Peter and Biessmann, Felix and Rodner, Erik},
  journal={arXiv preprint arXiv:2605.02003},
  year={2026}
}"""

# ---------------------------------------------------------------------------
# Paper URLs keyed by display name
# ---------------------------------------------------------------------------

_D = " (default)"  # suffix added by export script to all standard models

MODEL_URLS = {
    f"Logistic Reg.{_D}":    "https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression",
    f"Random Forest{_D}":    "https://doi.org/10.1023/A:1010933404324",
    f"Extra Trees{_D}":      "https://doi.org/10.1007/s10994-006-6226-1",
    f"XGBoost{_D}":          "https://arxiv.org/abs/1603.02754",
    f"LightGBM{_D}":         "https://papers.nips.cc/paper_files/paper/2017/hash/6449f44a102fde848669bdd9eb6b76fa-Abstract.html",
    f"CatBoost{_D}":         "https://arxiv.org/abs/1706.09516",
    f"RealMLP{_D}":          "https://arxiv.org/abs/2407.04491",
    f"CoAtNet{_D}":          "https://arxiv.org/abs/2106.04803",
    f"FC-ResNeXt{_D}":       "https://arxiv.org/abs/2402.03970",
    f"MITRA{_D}":            "https://arxiv.org/abs/2510.21204",
    f"TabM{_D}":             "https://doi.org/10.48550/arXiv.2410.24210",
    f"TabDPT{_D}":           "https://arxiv.org/abs/2405.13396",
    f"TabPFN v2{_D}":        "https://doi.org/10.1038/s41586-024-08328-6",
    f"TabPFN v2.5{_D}":      "https://doi.org/10.48550/arXiv.2511.08667",
    f"TabICL{_D}":           "https://arxiv.org/abs/2602.11139",
    f"DeepCNN{_D}":          "https://doi.org/10.1039/C7AN01371J",
    f"SANet{_D}":            "https://doi.org/10.1109/JBHI.2021.3113700",
    f"RamanNet{_D}":         "https://doi.org/10.1038/s41598-023-44358-2",
    f"RamanTransformer{_D}": "https://doi.org/10.1038/s41598-023-44358-2",
    f"RamanFormer{_D}":      "https://doi.org/10.1021/acsomega.3c09247",
    f"ReZeroNet{_D}":        "https://arxiv.org/abs/2003.04887",
    f"ROCKET{_D}":           "https://arxiv.org/abs/1910.13051",
    f"ARSENAL{_D}":          "https://doi.org/10.1007/s10994-021-06057-9",
    "AutoGluon 1.5 (extreme, 4h)": "https://arxiv.org/abs/2003.06505",
}

# Emoji marker prefixed to each model category in the leaderboard tables.
CATEGORY_BADGE = {
    "Traditional ML": "🟦",
    "Tree-based": "🟩",
    "Gradient Boosting": "🟧",
    "Deep Learning": "🟥",
    "Tabular Foundation": "🟪",
    "Raman-Specific": "🩵",
    "TS Classification": "🟠",
    "AutoML": "⬛",
}

# Known categories, in the order the badges are declared.
ALL_CATEGORIES = list(CATEGORY_BADGE)

# Columns that are always displayed, with their fixed CSS widths.
FIXED_COLS = ["Rank", "Model", "Category"]
COLUMN_WIDTHS = dict(zip(FIXED_COLS, ("55px", "260px", "200px")))

# Optional columns per leaderboard tab; every tab starts with the same
# three ranking columns.
_RANKING_COLS = ["Elo", "Score", "Avg Rank"]
OVERALL_OPT_COLS = [*_RANKING_COLS, "Improvability %", "Train Time s", "Infer. s/1K"]
CLF_OPT_COLS = [*_RANKING_COLS, "F1", "Bal. Acc.", "Improvability %", "# Datasets"]
REG_OPT_COLS = [*_RANKING_COLS, "RMSE", "R²", "Improvability %", "# Datasets"]

# ---------------------------------------------------------------------------
# Theme + CSS
# ---------------------------------------------------------------------------

def _build_css() -> str:
    """Assemble the custom stylesheet for the app.

    One fixed-width rule is generated for every entry of ``FIXED_COLS`` that
    has a width in ``COLUMN_WIDTHS`` (addressed by its 1-based column
    position) and spliced into the static CSS ahead of the mobile
    ``@media`` section.

    Returns:
        The complete CSS text.
    """
    def _width_rule(position, width):
        # Pin both header and body cells of one column to a single width.
        return f"""
.leaderboard th:nth-child({position}),
.leaderboard td:nth-child({position}) {{
    width: {width} !important;
    min-width: {width} !important;
    max-width: {width} !important;
}}
"""

    # Desktop column widths (generated first so mobile @media can override them)
    positioned_widths = (
        (position, COLUMN_WIDTHS.get(column))
        for position, column in enumerate(FIXED_COLS, start=1)
    )
    col_rules = "".join(
        _width_rule(position, width)
        for position, width in positioned_widths
        if width
    )

    return f"""
/* TabArena attribution callout */
.tabarena-note {{
    background: #eef4fb !important;
    border-left: 4px solid #6aa3d9 !important;
    padding: 10px 14px !important;
    margin: 8px 0 16px 0 !important;
    border-radius: 4px !important;
    font-size: 0.92em !important;
    color: #222 !important;
}}
.tabarena-note a {{
    color: #1a5fa8 !important;
}}

/* Caption text below figures */
.fig-caption {{
    font-size: 0.85em;
    color: #555;
    margin-top: 4px;
    margin-bottom: 16px;
    line-height: 1.4;
}}

/* Images: let the container shrink to the image's natural height */
[data-testid="image"],
[data-testid="image"] > *,
[data-testid="image"] > * > * {{
    height: auto !important;
    min-height: 0 !important;
    max-height: none !important;
}}
[data-testid="image"] img {{
    display: block;
    width: 100%;
    height: auto;
    max-width: 100%;
}}

/* Scrollable table containers */
.leaderboard,
.ds-table {{
    overflow-x: auto !important;
    -webkit-overflow-scrolling: touch;
    width: 100%;
}}

/* Keep column widths stable across rerenders */
.leaderboard table,
.ds-table table {{
    table-layout: fixed !important;
    min-width: 520px;
    width: max-content !important;
}}

.leaderboard td:nth-child(2) a,
.leaderboard td:nth-child(2) p {{
    display: block;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}}

/* Markdown tables in accordions */
.prose table,
.md-table-wrap table {{
    display: block;
    overflow-x: auto;
    -webkit-overflow-scrolling: touch;
    max-width: 100%;
}}

/* Desktop column widths */
{col_rules}

/* ── Mobile overrides (must come last to win the cascade) ── */
@media (max-width: 768px) {{
    .leaderboard td,
    .leaderboard th,
    .ds-table td,
    .ds-table th {{
        padding: 4px 6px !important;
        font-size: 0.8em !important;
    }}

    /* Clip cell content so text never bleeds into adjacent cells */
    .leaderboard td,
    .ds-table td {{
        overflow: hidden !important;
        text-overflow: ellipsis !important;
        white-space: nowrap !important;
        max-width: 0;
    }}

    /* Adjusted fixed columns on small screens */
    .leaderboard th:nth-child(1),
    .leaderboard td:nth-child(1) {{
        width: 42px !important;
        min-width: 42px !important;
        max-width: 42px !important;
    }}
    .leaderboard th:nth-child(2),
    .leaderboard td:nth-child(2) {{
        width: 190px !important;
        min-width: 190px !important;
        max-width: 190px !important;
    }}
    .leaderboard th:nth-child(3),
    .leaderboard td:nth-child(3) {{
        width: 200px !important;
        min-width: 200px !important;
        max-width: 200px !important;
    }}

    .fig-caption {{
        font-size: 0.78em;
    }}
}}
"""

# Stylesheet text, built once at import time.
CSS = _build_css()

# JavaScript function source for a click-to-zoom image lightbox.
# It injects its own <style>, appends a hidden full-screen overlay (#lb-overlay)
# holding one <img>, and wires every `[data-testid="image"] img` so that a
# click copies its src into the overlay and shows it.  The overlay closes on
# click or Escape.  A MutationObserver on document.body re-runs the wiring
# whenever the DOM changes; `dataset.lb` marks images that are already wired
# so each one gets a single handler.
LIGHTBOX_JS = """
function() {
    const style = document.createElement('style');
    style.textContent = `
        #lb-overlay {
            display: none;
            position: fixed;
            z-index: 9999;
            inset: 0;
            background: rgba(0,0,0,0.88);
            cursor: zoom-out;
            align-items: center;
            justify-content: center;
        }
        #lb-overlay.active { display: flex; }
        #lb-overlay img {
            max-width: 95vw;
            max-height: 95vh;
            object-fit: contain;
            border-radius: 4px;
            box-shadow: 0 8px 48px rgba(0,0,0,0.6);
        }
    `;
    document.head.appendChild(style);

    const overlay = document.createElement('div');
    overlay.id = 'lb-overlay';
    const lbImg = document.createElement('img');
    overlay.appendChild(lbImg);
    document.body.appendChild(overlay);

    overlay.addEventListener('click', () => overlay.classList.remove('active'));
    document.addEventListener('keydown', e => {
        if (e.key === 'Escape') overlay.classList.remove('active');
    });

    function attachTo(img) {
        if (img.dataset.lb) return;
        img.dataset.lb = '1';
        img.style.cursor = 'zoom-in';
        img.addEventListener('click', e => {
            e.stopPropagation();
            lbImg.src = img.src;
            overlay.classList.add('active');
        });
    }

    new MutationObserver(() => {
        document.querySelectorAll('[data-testid="image"] img').forEach(attachTo);
    }).observe(document.body, { childList: true, subtree: true });

    document.querySelectorAll('[data-testid="image"] img').forEach(attachTo);
}
"""

# Gradio theme: Base theme with a blue primary hue, slate secondary/neutral
# hues, and Google fonts (Inter for text, JetBrains Mono for monospace) that
# fall back to generic system font families.
theme = gr.themes.Base(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.slate,
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"],
)

# ---------------------------------------------------------------------------
# Loading & formatting
# ---------------------------------------------------------------------------

def _load_csv(filename: str) -> pd.DataFrame:
    """Read one CSV from ``DATA_DIR``, using its first column as the index.

    Args:
        filename: File name relative to ``DATA_DIR``.

    Returns:
        The parsed table, or an empty ``DataFrame`` when the file does not
        exist.
    """
    csv_path = os.path.join(DATA_DIR, filename)
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, index_col=0)
    return pd.DataFrame()


def _apply_links(df: pd.DataFrame) -> pd.DataFrame:
    if "Model" not in df.columns:
        return df
    df = df.copy()
    def _linkify(n):
        display = n.replace(" (default)", "")
        url = MODEL_URLS.get(n)
        return f"[{display}]({url})" if url else display
    df["Model"] = df["Model"].apply(_linkify)
    return df


def _add_badge(df: pd.DataFrame) -> pd.DataFrame:
    if "Category" not in df.columns:
        return df
    df = df.copy()
    df["Category"] = df["Category"].apply(
        lambda c: f"{CATEGORY_BADGE.get(c, '⬜')} {c}"
    )
    return df


def _fmt_overall(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "Improvability" in df.columns:
        df = df.rename(columns={"Improvability": "Improvability %"})
    fmts = {
        "Elo":             lambda v: f"{v:.0f}",
        "Score":           lambda v: f"{v:.2f}",
        "Avg Rank":        lambda v: f"{v:.1f}",
        "Improvability %": lambda v: f"{v:.1f}",
        "Train Time s":    lambda v: f"{v:.1f}",
        "Infer. s/1K":     lambda v: f"{v:.1f}",
    }
    for col, fn in fmts.items():
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].apply(lambda v, f=fn: f(v) if pd.notna(v) else "—")
    return df


def _fmt_clf(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    pct = {"F1", "Bal. Acc.", "Accuracy", "MCC"}
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        if col in pct:
            df[col] = df[col].apply(lambda v: f"{v*100:.1f}%" if pd.notna(v) else "—")
        elif col == "Elo":
            df[col] = df[col].apply(lambda v: f"{v:.0f}" if pd.notna(v) else "—")
        elif col == "Score":
            df[col] = df[col].apply(lambda v: f"{v:.2f}" if pd.notna(v) else "—")
        elif col in {"Avg Rank", "Improvability %"}:
            df[col] = df[col].apply(lambda v: f"{v:.1f}" if pd.notna(v) else "—")
    return df


def _fmt_reg(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        if col == "Elo":
            df[col] = df[col].apply(lambda v: f"{v:.0f}" if pd.notna(v) else "—")
        elif col in {"Score", "R²"}:
            df[col] = df[col].apply(lambda v: f"{v:.2f}" if pd.notna(v) else "—")
        elif col in {"Avg Rank", "RMSE", "MAE", "Improvability %"}:
            df[col] = df[col].apply(lambda v: f"{v:.2f}" if pd.notna(v) else "—")
    return df


def _build_base(raw: pd.DataFrame, fmt_fn) -> pd.DataFrame:
    if raw.empty:
        return raw
    df = fmt_fn(raw)
    df["_cat"] = df["Category"]
    df = _apply_links(df)
    df = _add_badge(df)
    return df.reset_index(drop=True)


# Display-ready leaderboard tables, built once at import time.
_overall_base, _clf_base, _reg_base = (
    _build_base(_load_csv(csv_name), formatter)
    for csv_name, formatter in (
        ("leaderboard_overall.csv", _fmt_overall),
        ("leaderboard_clf.csv", _fmt_clf),
        ("leaderboard_reg.csv", _fmt_reg),
    )
)

# Dataset catalogue; a one-cell placeholder table stands in when the CSV is absent.
_datasets_csv = os.path.join(DATA_DIR, "datasets.csv")
if os.path.exists(_datasets_csv):
    _datasets_raw = pd.read_csv(_datasets_csv)
else:
    _datasets_raw = pd.DataFrame({"Info": ["Run export_leaderboard.py first."]})

# ---------------------------------------------------------------------------
# Filter + render helpers
# ---------------------------------------------------------------------------

def _render(base, sel_cats, sel_cols, opt_cols):
    if base.empty:
        return pd.DataFrame({"Info": ["Data not found. Run export_leaderboard.py first."]}), ["str"], []
    df = base.copy()
    if sel_cats and set(sel_cats) != set(ALL_CATEGORIES):
        df = df[df["_cat"].isin(sel_cats)]
    df = df.drop(columns=["_cat"], errors="ignore")
    visible = FIXED_COLS + [c for c in opt_cols if c in sel_cols and c in df.columns]
    df = df[[c for c in visible if c in df.columns]]
    if "Rank" in df.columns:
        df = df.copy()
        df["Rank"] = range(1, len(df) + 1)
    datatypes = ["markdown" if c == "Model" else "str" for c in df.columns]
    col_widths = [COLUMN_WIDTHS.get(c) for c in df.columns]
    return df, datatypes, col_widths


def _present_cats(base):
    """List the model categories that occur in a prepared leaderboard table.

    Known categories come first, in ``ALL_CATEGORIES`` order. Categories
    found in the data but missing from ``ALL_CATEGORIES`` are appended in
    order of first appearance: such rows still get a fallback badge, so they
    must be selectable, otherwise an active category filter would silently
    drop them.

    Args:
        base: Prepared leaderboard table; expected to carry the ``_cat``
            helper column.

    Returns:
        ``ALL_CATEGORIES`` when the table is empty or has no ``_cat`` column,
        otherwise the list described above. Missing (NaN) categories are
        ignored.
    """
    if base.empty or "_cat" not in base.columns:
        return ALL_CATEGORIES
    found = base["_cat"].dropna().unique().tolist()
    found_set = set(found)
    known_set = set(ALL_CATEGORIES)
    known = [c for c in ALL_CATEGORIES if c in found_set]
    extras = [c for c in found if c not in known_set]
    return known + extras


# Category choices offered on each leaderboard tab.
overall_cats, clf_cats, reg_cats = (
    _present_cats(table) for table in (_overall_base, _clf_base, _reg_base)
)


# Columns of the datasets table that are displayed, in order.
_DS_DISPLAY_COLS = [
    "Dataset",
    "Domain",
    "Task",
    "Samples",
    "Features",
    "Classes / Targets",
    "License",
]


def _filter_datasets(sel_domains, sel_tasks) -> pd.DataFrame:
    """Return the datasets table restricted to the selected domains and tasks.

    All columns of the raw table are kept (including ``dataset_id``, which the
    row-select handler reads), so this is the frame for internal use rather
    than for display.

    Args:
        sel_domains: ``Domain`` values to keep; an empty selection disables
            the domain filter.
        sel_tasks: ``Task`` values to keep; an empty selection disables the
            task filter.

    Returns:
        A new frame with a fresh 0-based index whose count columns
        ("Samples", "Features", "Classes / Targets") are rendered as integer
        strings ("" where the value is missing or not numeric). When only
        the placeholder "Info" table is loaded, a copy of it is returned
        unchanged.
    """
    if "Info" in _datasets_raw.columns:
        return _datasets_raw.copy()
    df = _datasets_raw
    if sel_domains:
        df = df[df["Domain"].isin(sel_domains)]
    if sel_tasks:
        df = df[df["Task"].isin(sel_tasks)]
    # Copy only after filtering: assigning columns to the result of a
    # boolean-mask selection triggers SettingWithCopyWarning on pandas without
    # copy-on-write, and the module-level table must never be modified.
    df = df.copy()
    for col in ["Samples", "Features", "Classes / Targets"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").apply(
                lambda v: str(int(v)) if pd.notna(v) else ""
            )
    return df.reset_index(drop=True)


def _render_datasets(sel_domains, sel_tasks):
    """Return the filtered datasets table reduced to its display columns.

    Args:
        sel_domains: Selected ``Domain`` values.
        sel_tasks: Selected ``Task`` values.

    Returns:
        The filtered table limited to the ``_DS_DISPLAY_COLS`` that exist in
        it, or the whole filtered table when none of them exist.
    """
    filtered = _filter_datasets(sel_domains, sel_tasks)
    shown = [name for name in _DS_DISPLAY_COLS if name in filtered.columns]
    if not shown:
        return filtered
    return filtered[shown]


def _fig(name):
    """Resolve a figure file name inside ``FIGURES_DIR``.

    Args:
        name: File name relative to ``FIGURES_DIR``.

    Returns:
        The joined path when the file exists, otherwise ``None``.
    """
    candidate = os.path.join(FIGURES_DIR, name)
    if not os.path.exists(candidate):
        return None
    return candidate


# Sorted distinct Domain / Task values offered as dataset filters; empty when
# the column is missing from the loaded table.
_ds_domains, _ds_tasks = (
    sorted(_datasets_raw[column].dropna().unique().tolist())
    if column in _datasets_raw.columns
    else []
    for column in ("Domain", "Task")
)

# ---------------------------------------------------------------------------
# Layout
# ---------------------------------------------------------------------------

with gr.Blocks(title="RamanBench Leaderboard", theme=theme, css=CSS, js=LIGHTBOX_JS) as demo:
    gr.Markdown(TITLE)
    gr.HTML(MAINTAINERS)
    gr.HTML(TABARENA_NOTE, elem_classes=["tabarena-note-wrap"])
    gr.Markdown(DESCRIPTION)

    # ── Figure 1 ─────────────────────────────────────────────────────────────
    with gr.Accordion("📊 Figure 1 — RamanBench Overview", open=True):
        if _fig("figure1.png"):
            gr.Image(_fig("figure1.png"), show_label=False)
        gr.Markdown(
            "*__Left:__ RamanBench datasets (pink) occupy a high-dimensional, low-sample regime "
            "distinct from tabular benchmarks (TabArena, TALENT) and time-series benchmarks (UCR, UEA). "
            "__Right:__ Model performance (Elo) vs. release year. "
            "PLS, the long-standing domain standard, held its leading position for decades; "
            "only in the last decade have modern methods begun to clearly surpass it. "
            "The state-of-the-art frontier is still advancing — no single model dominates "
            "across all tasks and domains.*",
            elem_classes=["fig-caption"],
        )

    # ── Raman examples ────────────────────────────────────────────────────────
    with gr.Accordion("🔬 Raman Spectroscopy Examples", open=True):
        if _fig("raman_examples.png"):
            gr.Image(_fig("raman_examples.png"), show_label=False)
        gr.Markdown(
            "*Representative Raman spectra from the four application domains in RamanBench. "
            "Each panel shows spectra from one domain, colored by class (classification) or by the target "
            "analyte value (regression, gradient from low to high). "
            "The thick line is the mean spectrum; shaded bands show ±1 standard deviation. "
            "Spectral ranges, sample sizes, noise levels, and analytical tasks differ substantially across domains, "
            "illustrating the breadth and heterogeneity of RamanBench.*",
            elem_classes=["fig-caption"],
        )

    # ── Benchmark composition ─────────────────────────────────────────────────
    with gr.Accordion("📊 Benchmark Composition", open=True):
        if _fig("summary_grid_6.png"):
            gr.Image(_fig("summary_grid_6.png"), show_label=False)
        gr.Markdown(
            "*Benchmark composition overview. "
            "__Domain distribution (left two donuts):__ Chemical & Industrial contributes the most datasets (33 of 74), "
            "while Material Science dominates by raw spectrum count due to the large RRUFF mineral library and MLROD. "
            "__Task distribution (centre two donuts):__ Regression datasets outnumber classification (53 vs. 21), "
            "yet classification datasets account for over 91% of all spectra. "
            "__Data sources (fifth donut):__ Datasets originate from eight hosting platforms "
            "with HuggingFace and Kaggle being the two largest. "
            "__New vs. existing (sixth donut):__ 16 of the 74 datasets (★) are released for the first time with this paper.*",
            elem_classes=["fig-caption"],
        )

    # ── Elo ranking ───────────────────────────────────────────────────────────
    with gr.Accordion("🏆 Elo Ranking", open=True):
        if _fig("elo_ranking.png"):
            gr.Image(_fig("elo_ranking.png"), show_label=False)
        gr.Markdown(
            "*__RamanBench Leaderboard.__ Elo ratings for all models (RF = 1000), sorted by performance. "
            "Error bars show 95% bootstrap confidence intervals (200 resampling rounds over the target pool). "
            "Models are colour-coded by algorithmic family. "
            "A higher Elo indicates consistently better performance across all classification and regression targets. "
            "Models marked with \\* are evaluated on classification tasks only and not imputed on regression tasks.*",
            elem_classes=["fig-caption"],
        )

    # ── Performance–efficiency trade-off ─────────────────────────────────────
    with gr.Accordion("⚡ Performance–Efficiency Trade-off", open=False):
        if _fig("metrics_vs_time.png"):
            gr.Image(_fig("metrics_vs_time.png"), show_label=False)
        gr.Markdown(
            "*__TFMs define the high-performance end of the Pareto frontier; ReZeroNet is the only "
            "non-TFM contender, while KNN qualifies through speed alone.__ "
            "Normalised F1 (classification, left) and normalised RMSE (regression, right) "
            "vs. mean total runtime (train + predict, log scale). "
            "Each metric is normalised per target: best model = 1, median = 0, clipped at 0. "
            "Upper-left is Pareto-optimal for F1 (high score, low cost); lower-left for RMSE. "
            "Runtime excludes the one-time pretraining cost of foundation models (TabPFN, TabICL, MITRA, TabDPT).*",
            elem_classes=["fig-caption"],
        )

    # ── Improvability vs. total time ──────────────────────────────────────────
    with gr.Accordion("📉 Improvability vs. Total Time", open=False):
        if _fig("improvability_vs_time.png"):
            gr.Image(_fig("improvability_vs_time.png"), show_label=False)
        gr.Markdown(
            "*__TFMs anchor the low-improvability end of the Pareto frontier; ReZeroNet is the only "
            "Raman-specific model near it, while KNN qualifies through speed alone.__ "
            "Mean improvability (%) vs. mean total time (train + predict, s, log scale), "
            "shown separately for classification (left) and regression (right). "
            "Improvability of 0% means the model is already optimal within the evaluated pool; "
            "higher values indicate larger room for improvement. "
            "ROCKET and Arsenal appear in the classification panel only (no regression support). "
            "The dashed line shows the Pareto frontier (lower-left is optimal).*",
            elem_classes=["fig-caption"],
        )

    # ── Dataset characteristics (initially collapsed) ─────────────────────────
    with gr.Accordion("📐 Dataset Characteristics", open=False):
        if _fig("datasets_by_size_trio.png"):
            gr.Image(_fig("datasets_by_size_trio.png"), show_label=False)
        gr.Markdown(
            "*RamanBench: 74 datasets across 4 application domains, sorted by size (largest top) "
            "and split into two halves. "
            "Within each half, four panels are shown: "
            "__Instances__ (spectrum count, log scale, 12 to 130,061), "
            "__Features__ (number of wavenumber points), "
            "__Spectral Range__ (Raman shift window in cm⁻¹), "
            "__Targets__ (regression targets, or 1 for classification). "
            "Dataset name colours indicate application domain; "
            "bar colours encode task type (sky blue = Classification, orange = Regression).*",
            elem_classes=["fig-caption"],
        )

    # ── Pairwise win rates (initially collapsed) ──────────────────────────────
    with gr.Accordion("🔢 Pairwise Win Rates", open=False):
        if _fig("pairwise_win_rates.png"):
            with gr.Row():
                with gr.Column(scale=3):
                    gr.Image(_fig("pairwise_win_rates.png"), show_label=False)
                gr.Column(scale=1)
        gr.Markdown(
            "*__Top-ranked models win broadly across the benchmark; lower-ranked models show consistent "
            "losses against most competitors.__ "
            "Pairwise win counts across all prediction targets. "
            "Each cell shows the number of targets on which the __y-axis model__ outperforms the __x-axis model__ "
            "(ties count as 0.5). "
            "Green cells indicate a high win rate for the row model; red cells indicate a low win rate. "
            "Models are sorted by combined Elo rating, best at top-left. "
            "Only targets for which both models produce a valid prediction are counted.*",
            elem_classes=["fig-caption"],
        )

    # ── Efficiency overview (initially collapsed) ─────────────────────────────
    with gr.Accordion("🖥️ Computational Efficiency", open=False):
        if _fig("efficiency_overview.png"):
            gr.Image(_fig("efficiency_overview.png"), show_label=False)
        gr.Markdown(
            "*__Arsenal and RealMLP are the slowest models by training time; XGBoost and tree-based "
            "methods offer the lowest inference latency.__ "
            "Computational efficiency across three dimensions: "
            "__Training time__ (left, log scale), "
            "__Peak memory__ (centre, GB), "
            "__Inference latency__ (right, log scale, seconds per 1,000 samples). "
            "Models are sorted by training time and colour-coded by algorithmic family.*",
            elem_classes=["fig-caption"],
        )

    # ── Leaderboard ───────────────────────────────────────────────────────────
    with gr.Tabs():

        with gr.Tab("🏆 Overall"):
            gr.Markdown(
                "Combined ranking across all tasks. **Score** is normalized performance "
                "averaged across classification (F1) and regression (RMSE) datasets — "
                "best model per dataset = 1, median = 0."
            )
            with gr.Row():
                ov_cat = gr.CheckboxGroup(choices=overall_cats, value=overall_cats,
                                          label="Model category", scale=3)
                ov_col = gr.CheckboxGroup(choices=OVERALL_OPT_COLS, value=OVERALL_OPT_COLS,
                                          label="Columns", scale=3)
            init_ov, init_ov_dt, init_ov_cw = _render(_overall_base, overall_cats, OVERALL_OPT_COLS, OVERALL_OPT_COLS)
            ov_table = gr.Dataframe(value=init_ov, datatype=init_ov_dt, column_widths=init_ov_cw,
                                    interactive=False, wrap=False, elem_classes=["leaderboard"], elem_id="overall_lb")

            def update_overall(cats, cols):
                df, dt, cw = _render(_overall_base, cats, cols, OVERALL_OPT_COLS)
                return gr.Dataframe(value=df, datatype=dt, column_widths=cw)

            ov_cat.change(update_overall, [ov_cat, ov_col], ov_table)
            ov_col.change(update_overall, [ov_cat, ov_col], ov_table)

        with gr.Tab("📊 Classification"):
            gr.Markdown(
                "Per-model averages across all **classification** datasets, sorted by Avg Rank.  \n"
                "**Score** = normalized F1 (best model per dataset = 1, median = 0)."
            )
            with gr.Row():
                clf_cat = gr.CheckboxGroup(choices=clf_cats, value=clf_cats,
                                           label="Model category", scale=3)
                clf_col = gr.CheckboxGroup(choices=CLF_OPT_COLS, value=CLF_OPT_COLS,
                                           label="Columns", scale=3)
            init_clf, init_clf_dt, init_clf_cw = _render(_clf_base, clf_cats, CLF_OPT_COLS, CLF_OPT_COLS)
            clf_table = gr.Dataframe(value=init_clf, datatype=init_clf_dt, column_widths=init_clf_cw,
                                     interactive=False, wrap=False, elem_classes=["leaderboard"], elem_id="clf_lb")

            def update_clf(cats, cols):
                df, dt, cw = _render(_clf_base, cats, cols, CLF_OPT_COLS)
                return gr.Dataframe(value=df, datatype=dt, column_widths=cw)

            clf_cat.change(update_clf, [clf_cat, clf_col], clf_table)
            clf_col.change(update_clf, [clf_cat, clf_col], clf_table)

        with gr.Tab("📈 Regression"):
            gr.Markdown(
                "Per-model averages across all **regression** datasets, sorted by Avg Rank.  \n"
                "**Score** = normalized RMSE (best model per dataset = 1, median = 0)."
            )
            with gr.Row():
                reg_cat = gr.CheckboxGroup(choices=reg_cats, value=reg_cats,
                                           label="Model category", scale=3)
                reg_col = gr.CheckboxGroup(choices=REG_OPT_COLS, value=REG_OPT_COLS,
                                           label="Columns", scale=3)
            init_reg, init_reg_dt, init_reg_cw = _render(_reg_base, reg_cats, REG_OPT_COLS, REG_OPT_COLS)
            reg_table = gr.Dataframe(value=init_reg, datatype=init_reg_dt, column_widths=init_reg_cw,
                                     interactive=False, wrap=False, elem_classes=["leaderboard"], elem_id="reg_lb")

            def update_reg(cats, cols):
                df, dt, cw = _render(_reg_base, cats, cols, REG_OPT_COLS)
                return gr.Dataframe(value=df, datatype=dt, column_widths=cw)

            reg_cat.change(update_reg, [reg_cat, reg_col], reg_table)
            reg_col.change(update_reg, [reg_cat, reg_col], reg_table)

        with gr.Tab("🗃️ Datasets"):
            gr.Markdown("All 74 datasets included in RamanBench (163 prediction targets), grouped by application domain.  \n"
                        "*Click a row to see an example spectrum.*")
            with gr.Row():
                ds_domain = gr.CheckboxGroup(choices=_ds_domains, value=_ds_domains,
                                             label="Domain", scale=2)
                ds_task   = gr.CheckboxGroup(choices=_ds_tasks,   value=_ds_tasks,
                                             label="Task", scale=1)
            init_ds_full = _filter_datasets(_ds_domains, _ds_tasks)
            init_ds      = _render_datasets(_ds_domains, _ds_tasks)
            ds_state = gr.State(value=init_ds_full)
            ds_table = gr.Dataframe(value=init_ds, interactive=False, wrap=False,
                                    elem_classes=["ds-table"])

            with gr.Row():
                ds_image = gr.Image(label="Example spectrum", visible=False,
                                    show_label=True, height=300)

            def update_datasets(domains, tasks):
                full = _filter_datasets(domains, tasks)
                display = _render_datasets(domains, tasks)
                return display, full, gr.Image(visible=False)

            def on_ds_select(evt: gr.SelectData, full_df: pd.DataFrame):
                """Show the example spectrum belonging to the clicked table row.

                Args:
                    evt: Selection event; ``evt.index[0]`` is used as the row
                        position (presumably ``[row, col]`` for a Dataframe
                        select — confirm against the Gradio docs).
                    full_df: Frame held in ``ds_state``; its ``dataset_id``
                        column names the PNG under ``FIGURES_DIR/examples``.

                Returns:
                    A visible ``gr.Image`` pointing at the PNG when it exists,
                    otherwise a hidden ``gr.Image``.
                """
                example_png = None
                if full_df is not None and not full_df.empty:
                    clicked_row = evt.index[0]
                    if "dataset_id" in full_df.columns and clicked_row < len(full_df):
                        dataset_key = full_df.iloc[clicked_row]["dataset_id"]
                        candidate = os.path.join(FIGURES_DIR, "examples", f"{dataset_key}.png")
                        if os.path.exists(candidate):
                            example_png = candidate

                # Single exit: anything that did not resolve to an existing file
                # hides the image component.
                if example_png is None:
                    return gr.Image(visible=False)
                return gr.Image(value=example_png, visible=True)

            # Changing either filter refreshes the table and the backing state and
            # hides the image; selecting in the table looks up the example spectrum
            # from the backing state.
            ds_domain.change(update_datasets, [ds_domain, ds_task], [ds_table, ds_state, ds_image])
            ds_task.change(update_datasets,   [ds_domain, ds_task], [ds_table, ds_state, ds_image])
            ds_table.select(on_ds_select, [ds_state], ds_image)

    # ── Info accordions ───────────────────────────────────────────────────────
    # Accordion (created with open=False) holding a static Markdown overview:
    # application domains, dataset statistics, and the training setup.
    # NOTE: the counts in this text are hard-coded literals, not computed here.
    with gr.Accordion("ℹ️ About RamanBench", open=False):
        gr.Markdown("""
RamanBench evaluates 28 machine learning models on 74 Raman spectroscopy datasets
spanning four application domains:

| Domain | Description |
|---|---|
| 🔬 Material Science | Mineral identification, pigment classification |
| 🧬 Biological | Bioprocess monitoring, fermentation analytics |
| 🏥 Medical | Disease diagnosis from serum/tissue spectra |
| ⚗️ Chemical | Concentration prediction, reaction monitoring |

Together the datasets contain **325,000+ spectra** and define **163 independent benchmark tasks**.
Dataset sizes span over four orders of magnitude (12 to 130,061 spectra, median 235),
and feature dimensionality ranges from 114 to 11,689 wavenumber points (median 1,951).

All models are trained through **AutoGluon 1.5** with a **fixed hyperparameter configuration** across **3 random seeds**, ensuring reproducibility.

RamanBench is described in detail at [github.com/ml-lab-htw/RamanBench](https://github.com/ml-lab-htw/RamanBench).
""")

    # Accordion (created with open=False) with static Markdown that defines the
    # per-dataset, aggregation, and efficiency metrics named in the leaderboard.
    with gr.Accordion("📐 Metrics", open=False):
        gr.Markdown("""
### Per-dataset performance metrics

| Metric | Task | Description |
|---|---|---|
| **F1** | Classification | Macro-averaged F1 score — primary ranking metric for classification |
| **Bal. Acc.** | Classification | Balanced accuracy (average per-class recall) |
| **RMSE** | Regression | Root Mean Squared Error — primary ranking metric for regression |
| **R²** | Regression | Coefficient of determination |

### Aggregation metrics (all tabs)

- **Elo**: Pairwise Elo calibrated to Random Forest = 1000. Averaged over 200 shuffled target orderings for stability; 95% confidence intervals from 200 bootstrap resamples of the target pool. F1 used for classification, RMSE for regression.
- **Score**: Per-target normalization (Salinas & Erickson, TabRepo 2024): best model = 1,
  median = 0, clipped at 0. Averaged across all applicable targets.
  The Overall Score averages across both classification (norm. F1) and regression (norm. RMSE) targets.
- **Avg Rank**: Mean rank across all prediction targets (lower = better, rank 1 = best).
- **Improvability**: `(err − best_err) / err × 100%` averaged across all targets,
  where `err = 1 − F1` (clf) or `err = RMSE` (reg). 0% = already optimal within the evaluated pool.

### Efficiency metrics (Overall tab only)

- **Train Time (s)**: Mean wall-clock training time, averaged across all datasets and seeds (single A100 GPU).
- **Infer. s/1K**: Mean inference time per 1,000 samples in seconds.
""")

    # Accordion (created with open=False) with a static Markdown table mapping
    # each model category to the models listed under it.
    with gr.Accordion("🤖 Model Categories", open=False):
        gr.Markdown("""
| Category | Models |
|---|---|
| 🟦 Traditional ML | KNN, Logistic Reg., PLS |
| 🟩 Tree-based | Random Forest, Extra Trees |
| 🟧 Gradient Boosting | XGBoost, LightGBM, CatBoost |
| 🟥 Deep Learning | FastAI, NN (Torch), RealMLP, CoAtNet, FC-ResNeXt, TabM |
| 🟪 Tabular Foundation | TabPFN v2, TabPFN v2.5, TabICL, TabDPT, MITRA |
| 🩵 Raman-Specific | RamanNet, RamanTransformer, RamanFormer, DeepCNN, SANet, ReZeroNet |
| 🟠 TS Classification | ROCKET, ARSENAL (classification only) |
| ⬛ AutoML | AutoGluon 1.5 (extreme, 4h) |

AutoGluon is evaluated with a 4-hour compute budget (AutoGluon 1.5 "extreme" preset)
and is included as an upper reference baseline only.
""")

    # Accordion (created with open=False) with static Markdown contribution
    # guidelines: submitting datasets and models, reporting issues, contacts.
    # The string is not raw, so "\\Delta" below reaches Markdown as "\Delta".
    with gr.Accordion("🤝 How to Contribute", open=False):
        gr.Markdown("""
We welcome contributions to RamanBench! There are several ways to get involved:

### 📂 Submit a new dataset

Adding a dataset follows a two-step process:

**Step 1 — Add the dataset to [`raman-data`](https://pypi.org/project/raman-data/)**
Implement a loader for your dataset in the
[raman-data](https://github.com/ml-lab-htw/raman_data) package and open a pull request there.
Once merged, a new release of `raman-data` is published to PyPI.

Your dataset must meet the inclusion criteria:
- Real measured Raman spectra (no synthetic data)
- Publicly accessible (Kaggle, Hugging Face, Zenodo, institutional repository, etc.)
- At least 10 labelled spectra; classification datasets need ≥ 2 classes with ≥ 9 spectra each
- At least one model must outperform the dummy baseline ($R^2 > 0.05$ for regression; $\\Delta$F1 > 0.05 for classification)

**Step 2 — Add the dataset to RamanBench**
After the `raman-data` release, open a pull request at
[github.com/ml-lab-htw/RamanBench](https://github.com/ml-lab-htw/RamanBench)
that adds the dataset key to the benchmark config file.
RamanBench will then evaluate all models on the new dataset and update the leaderboard.

### 🤖 Submit a new model
Implement your model as an AutoGluon-compatible wrapper following the existing model implementations
in the [RamanBench repository](https://github.com/ml-lab-htw/RamanBench).
It will be evaluated under the same standardized protocol (fixed configuration, 3 seeds) across all 74 datasets.

### 🐛 Report issues
Found a bug, a mislabeled dataset, or an incorrect metric?
Please open an issue at [github.com/ml-lab-htw/RamanBench/issues](https://github.com/ml-lab-htw/RamanBench/issues).

### 📬 Contact
- Mario Koddenbrock — [mario.koddenbrock@htw-berlin.de](mailto:mario.koddenbrock@htw-berlin.de)
- Christoph Lange — [christoph.lange@tu-berlin.de](mailto:christoph.lange@tu-berlin.de)
""")

    # Accordion (created with open=False) with the changelog. The Markdown is an
    # f-string: only the "Current Version" heading interpolates the module-level
    # VERSION constant; the dated entries below it are literal text.
    with gr.Accordion("🕓 Version History", open=False):
        gr.Markdown(f"""
**Current Version: RamanBench-v{VERSION}**

The following details updates to the leaderboard (date format YYYY/MM/DD):

**2026/04/30 — v0.1.0 (stable)**
Official stable release — all results are final for RamanBench v0.1.

**2026/04/14 — v0.1.0-alpha (pre-release)**
Initial public pre-release of the RamanBench leaderboard.
- 74 datasets, 163 prediction targets across four application domains
- 28 models evaluated: classical chemometrics, gradient boosting, deep learning, tabular foundation models, Raman-specific architectures, and time-series classifiers
- Metrics: Elo (RF = 1000), normalized Score, Avg Rank, Improvability, Train Time, Infer. s/1K
- Interactive filters by model category and column selection
- Overall, Classification, and Regression leaderboard views
- Dataset overview table with domain and task filters
- Paper figures with captions
""")

    # Accordion (created with open=False) showing the CITATION constant in a
    # non-interactive code box labelled "BibTeX"; language=None is passed, so no
    # specific language is requested for highlighting.
    with gr.Accordion("📄 Citation", open=False):
        gr.Code(value=CITATION, label="BibTeX", language=None, interactive=False)

    # DFG funding logo at the bottom — light/dark variants swap via Gradio's
    # .dark theme class. Encoded inline so we don't need a /file= URL.
    import base64 as _b64
    _dfg_light = _b64.b64encode(open(os.path.join(FIGURES_DIR, "dfg_logo_light.gif"), "rb").read()).decode()
    _dfg_dark  = _b64.b64encode(open(os.path.join(FIGURES_DIR, "dfg_logo_dark.gif"),  "rb").read()).decode()
    gr.HTML(f"""
<style>
.dfg-footer {{
  display: flex;
  justify-content: center;
  align-items: center;
  width: 100%;
  margin: 2.5rem 0 1rem 0;
}}
.dfg-footer img {{ max-width: 280px; height: auto; display: block; margin: 0 auto; }}
.dfg-footer .dfg-dark {{ display: none; }}
.dark .dfg-footer .dfg-light {{ display: none; }}
.dark .dfg-footer .dfg-dark  {{ display: block; margin: 0 auto; }}
</style>
<div class="dfg-footer">
  <img class="dfg-light" src="data:image/gif;base64,{_dfg_light}" alt="DFG">
  <img class="dfg-dark"  src="data:image/gif;base64,{_dfg_dark}"  alt="DFG">
</div>
""")

# Start the app. FIGURES_DIR is passed as an allowed path so Gradio may serve
# files from it, e.g. the example-spectrum PNGs that on_ds_select returns by path.
demo.launch(allowed_paths=[FIGURES_DIR])