"""
Metadata Hierarchy Explorer — TFM 2026
Pre-built results viewer for Baseline, Approach 1, and Approach 2.

Rendering faithfully replicates each app's display pipeline:
  - Baseline    : raw tree, Greens, Sunburst + Treemap
  - Approach 1  : raw tree, Blues,  Sunburst + Treemap + Node-link + Facets
  - Approach 2  : compress one-child chains, Viridis, Sunburst + Treemap + Node-link

Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
match the controls in the individual apps.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import plotly.graph_objects as go
import streamlit as st

# ─────────────────────────────────────────────────────────────────────────────
# PAGE CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# Configure the Streamlit page (browser title, icon, wide layout).
st.set_page_config(
    page_title="Metadata Hierarchy Explorer",
    page_icon="🌿",
    layout="wide",
)

# Folder with the bundled pre-built JSON results, located next to this file.
ROOT = Path(__file__).parent / "outputs"

# Initial value of the depth slider and default `max_depth` of the
# sunburst/treemap builders.
DEFAULT_DEPTH = 7

# ─────────────────────────────────────────────────────────────────────────────
# PRE-BUILT OUTPUT PATHS
# ─────────────────────────────────────────────────────────────────────────────
# Lookup table: approach name -> dataset name -> {"hierarchy": Path[, "facets": Path]}.
# Only the Approach 1 entries carry a "facets" file; the main script reads
# both keys with .get(), so a missing "facets" key simply disables that section.
PREBUILT = {
    "Baseline": {
        "AI-MIND": {"hierarchy": ROOT / "baseline" / "ai-mind-variable-descriptions_in__baseline_hierarchy.json"},
        "HCP":     {"hierarchy": ROOT / "baseline" / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json"},
    },
    "Approach 1": {
        "AI-MIND": {
            "hierarchy": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
            "facets":    ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
        },
        "HCP": {
            "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
            "facets":    ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
        },
    },
    "Approach 2": {
        "AI-MIND": {"hierarchy": ROOT / "approach_2" / "ai-mind-variable-descriptions_in__approach2_lod.json"},
        "HCP":     {"hierarchy": ROOT / "approach_2" / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json"},
    },
}

# Per-approach rendering config (matches each source app).
#   "color"     : colorscale name handed to the sunburst/treemap builders
#   "compress"  : whether the "Compress chains" checkbox is offered
#   "node_link" : whether the "Node-link tree" view mode is offered
CONFIG = {
    "Baseline":   {"color": "Greens",  "compress": False, "node_link": False},
    "Approach 1": {"color": "Blues",   "compress": False, "node_link": True},
    "Approach 2": {"color": "Viridis", "compress": True,  "node_link": True},
}

# One-paragraph description per approach; rendered as a block quote under the
# page title for whichever approach is selected in the sidebar.
APPROACH_DESC = {
    "Baseline": (
        "Pure clustering baseline — TF-IDF representation + recursive agglomerative "
        "(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
        "no neural embeddings. Node labels are the most discriminative terms per cluster."
    ),
    "Approach 1": (
        "Global embedding pipeline — SBERT + N×M concept-table alignment (Gonçalves 2019) "
        "+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
        "retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
    ),
    "Approach 2": (
        "Dataset-constrained multi-aspect hierarchy — group-anchored L1/L2 → phrase-slot "
        "mining → FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) → GMM/KMeans "
        "clustering → deterministic 5-stage label generation. Optional local-LLM refinement."
    ),
}

# ─────────────────────────────────────────────────────────────────────────────
# TREE TRANSFORMS  (copied from approach_2.py — display-only, exact behaviour)
# ─────────────────────────────────────────────────────────────────────────────
def _filter_dissolved(nodes: list) -> list:
    drop_ids = {int(n["id"]) for n in nodes
                if n.get("type") == "dissolved" or n.get("isShown") is False}
    if not drop_ids:
        return nodes
    out = []
    for n in nodes:
        if int(n["id"]) in drop_ids:
            continue
        m = dict(n)
        m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
        out.append(m)
    return out

def compress_one_child_chains(nodes: list) -> list:
    """Fold single-child aggregation chains into one node for display.

    Whenever an aggregation node's only child is itself an aggregation node,
    the pair is replaced by a copy of the child that keeps the parent's id and
    carries a joined name ("parent / child") and description ("parent | child").
    The search restarts after every merge until no such pair is left.
    Dissolved/hidden nodes are removed first; the input dicts are not modified.
    """
    merged = {int(node["id"]): dict(node) for node in _filter_dissolved(nodes)}

    def _links_to_single_aggregation(node: dict) -> bool:
        if node.get("type") != "aggregation":
            return False
        refs = node.get("related", [])
        if len(refs) != 1:
            return False
        return merged.get(int(refs[0]), {}).get("type") == "aggregation"

    while True:
        # First chain link in current dict order, or None once fully compressed.
        link_id = next((node_id for node_id, node in merged.items()
                        if _links_to_single_aggregation(node)), None)
        if link_id is None:
            break

        upper = merged[link_id]
        lower_id = int(upper["related"][0])
        lower = merged[lower_id]

        # The merged node is the child's record living under the parent's id.
        merged[link_id] = {
            **lower,
            "id": link_id,
            "name": f"{upper['name']} / {lower['name']}",
            "desc": f"{upper.get('desc', '')} | {lower.get('desc', '')}",
        }
        merged.pop(lower_id, None)

        # Any remaining reference to the absorbed child now points at the merged node.
        for entry in merged.values():
            entry["related"] = [link_id if int(ref) == lower_id else int(ref)
                                for ref in entry.get("related", [])]

    return list(merged.values())

# ─────────────────────────────────────────────────────────────────────────────
# RENDER HELPERS  (DAG-safe value map — copied from approach_2.py)
# ─────────────────────────────────────────────────────────────────────────────
def _leaf_ids(nodes: list, nid: int) -> list:
    m = {int(n["id"]): n for n in nodes}
    out = []
    def rec(x):
        n = m.get(int(x))
        if not n:
            return
        if n.get("type") == "attribute":
            out.append(int(x)); return
        for c in n.get("related", []):
            rec(int(c))
    rec(nid)
    return list(dict.fromkeys(out))

def _parent_map(nodes: list) -> dict:
    pm = {}
    for n in nodes:
        for c in n.get("related", []):
            if int(c) not in pm:
                pm[int(c)] = int(n["id"])
    return pm

def _tree_value_map(nodes: list, pm: dict) -> dict:
    kids = {}
    for child, par in pm.items():
        kids.setdefault(int(par), []).append(int(child))
    nodemap = {int(n["id"]): n for n in nodes}
    memo = {}
    def count(nid: int) -> int:
        if nid in memo:
            return memo[nid]
        memo[nid] = 1
        n = nodemap.get(nid)
        if n is not None and n.get("type") == "attribute":
            memo[nid] = 1
            return 1
        ch = kids.get(nid, [])
        v = sum(count(c) for c in ch) if ch else 1
        memo[nid] = max(1, v)
        return memo[nid]
    return {nid: count(nid) for nid in nodemap}

def _wrap_hover(text: str, width: int = 80) -> str:
    import textwrap as _tw
    s = str(text or "")
    if not s:
        return ""
    lines = []
    for raw_line in s.split("\n"):
        lines.extend(_tw.wrap(raw_line, width=width) or [""])
    return "<br>".join(lines)

def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly sunburst figure from a flat node list.

    Args:
        nodes: Node dicts ("id", "name", "type", "desc", "related" child ids).
            Dissolved/hidden nodes are filtered out before plotting.
        color: Colorscale name placed in ``marker.colorscale``.
        max_depth: Number of levels drawn at once (Plotly ``maxdepth``).

    Returns:
        A ``go.Figure`` with a single Sunburst trace.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    # Nodes with no recorded parent are drawn under the root (id 0). Register
    # that same fallback before computing values: with branchvalues="total" a
    # parent's value has to cover the sum of its children, otherwise the
    # children outgrow the root and the trace is not drawn.
    render_pm = dict(pm)
    for n in nodes:
        nid = int(n["id"])
        if nid != 0:
            render_pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, render_pm)

    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n["id"])
        lc = len(_leaf_ids(nodes, nid))  # attribute descendants, shown in hover
        ids.append(str(nid))
        labels.append(str(n.get("name", ""))[:40])
        parents.append("" if nid == 0 else str(render_pm[nid]))
        values.append(vm.get(nid, 1))
        hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
    # NOTE(review): Plotly documents marker.colorscale as taking effect only
    # when marker.colors is a numeric array; none is set here — confirm the
    # per-approach colorscale is actually applied.
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues="total", hovertext=hover, hoverinfo="text",
        maxdepth=max_depth, insidetextorientation="radial",
        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title=dict(text="Click sector to drill down — click centre to go back",
                                 font=dict(size=13), x=0.5))
    return fig

def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly treemap figure from a flat node list.

    Args:
        nodes: Node dicts ("id", "name", "type", "desc", "related" child ids).
            Dissolved/hidden nodes are filtered out before plotting.
        color: Colorscale name placed in ``marker.colorscale``.
        max_depth: Number of levels drawn at once (Plotly ``maxdepth``).

    Returns:
        A ``go.Figure`` with a single Treemap trace.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    # Nodes with no recorded parent are drawn under the root (id 0). Register
    # that same fallback before computing values: with branchvalues="total" a
    # parent's value has to cover the sum of its children, otherwise the
    # children outgrow the root and the trace is not drawn.
    render_pm = dict(pm)
    for n in nodes:
        nid = int(n["id"])
        if nid != 0:
            render_pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, render_pm)

    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n["id"])
        lc = len(_leaf_ids(nodes, nid))  # attribute descendants, shown in hover
        ids.append(str(nid))
        labels.append(str(n.get("name", ""))[:40])
        parents.append("" if nid == 0 else str(render_pm[nid]))
        values.append(vm.get(nid, 1))
        hover.append(f"<b>{n.get('name', '')}</b><br>Variables: {lc}<br>"
                     f"{_wrap_hover(n.get('desc', ''))}")
    # NOTE(review): Plotly documents marker.colorscale as taking effect only
    # when marker.colors is a numeric array; none is set here — confirm the
    # per-approach colorscale is actually applied.
    fig = go.Figure(go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues="total", hovertext=hover, hoverinfo="text",
        textinfo="label+value", maxdepth=max_depth,
        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig

# ─────────────────────────────────────────────────────────────────────────────
# NODE-LINK TREE  (Reingold-Tilford layout — copied from approach_2.py)
# ─────────────────────────────────────────────────────────────────────────────
def _node_color(n: dict) -> str:
    t = n.get("type", "")
    if t == "root":      return "#c44e52"
    if t == "attribute": return "#4C72B0"
    if t == "collapsed": return "#bbbbbb"
    return "#8C8C8C"

def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
    m = {int(n["id"]): n for n in nodes}
    dnodes: dict = {}
    edges: list = []
    counter = 10 ** 9

    def rec(nid, depth):
        nonlocal counter
        n = m.get(int(nid))
        if not n:
            return
        if not show_hidden and n.get("isShown") is False and depth > 0:
            return
        dnodes[int(nid)] = n
        if depth >= max_depth and n.get("related"):
            counter += 1
            cid = counter
            n_leaves = len(_leaf_ids(nodes, nid))
            dnodes[cid] = {"id": cid, "name": f"… {n_leaves} variables",
                           "type": "collapsed", "related": [],
                           "desc": f"Collapsed: {n.get('name')}"}
            edges.append((int(nid), cid))
            return
        for c in n.get("related", []):
            ch = m.get(int(c))
            if not ch:
                continue
            if not show_hidden and ch.get("isShown") is False:
                continue
            edges.append((int(nid), int(c)))
            rec(int(c), depth + 1)

    rec(0, 0)
    return list(dnodes.values()), edges

def _positions(edges: list):
    H_SCALE, V_SPACE = 3.0, 1.8
    children: dict = defaultdict(list)
    for p, c in edges:
        children[p].append(c)
    pos: dict = {}
    counter = {"v": 0}

    def rec(nid, depth):
        ch = children.get(nid, [])
        if not ch:
            y_pos = counter["v"] * V_SPACE
            counter["v"] += 1
            pos[nid] = (depth * H_SCALE, y_pos)
            return y_pos
        child_ys = [rec(c, depth + 1) for c in ch]
        y_pos = float(np.mean(child_ys))
        pos[nid] = (depth * H_SCALE, y_pos)
        return y_pos

    rec(0, 0)
    return pos

def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
    """Draw the hierarchy as a left-to-right node-link tree.

    Args:
        nodes: Flat list of node dicts.
        max_depth: Depth at which sub-trees are replaced by a "collapsed" stub.
        show_hidden: When True, nodes flagged ``isShown=False`` are drawn too.
            Nodes the caller has already removed from ``nodes`` cannot be
            brought back here.
        show_leaf_labels: When True, attribute nodes get a text label.

    Returns:
        A ``go.Figure`` with three scatter traces: elbow edges, attribute
        markers and markers for all other nodes.
    """
    if show_hidden:
        # _filter_dissolved() also strips every isShown=False node, which would
        # leave nothing for show_hidden to reveal. Drop only dissolved nodes
        # here; child ids that no longer resolve are skipped by the traversal
        # helpers.
        nodes = [n for n in nodes if n.get("type") != "dissolved"]
    else:
        nodes = _filter_dissolved(nodes)
    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
    pos = _positions(edges)

    # Edges are drawn as right-angle elbows; None separates the segments.
    ex, ey = [], []
    for p, c in edges:
        if p not in pos or c not in pos:
            continue
        x0, y0 = pos[p]
        x1, y1 = pos[c]
        xm = (x0 + x1) / 2
        ex += [x0, xm, xm, x1, None]
        ey += [y0, y0, y1, y1, None]
    traces = [go.Scatter(x=ex, y=ey, mode="lines",
                         line=dict(width=1, color="#c8c8c8"),
                         hoverinfo="skip", showlegend=False)]

    # Attribute (leaf) nodes and all other nodes go into separate traces so
    # they can use different marker sizes and label settings.
    agg_x, agg_y, agg_lab, agg_col, agg_hov = [], [], [], [], []
    lf_x, lf_y, lf_lab, lf_col, lf_hov = [], [], [], [], []
    for n in dnodes:
        nid = int(n["id"])
        if nid not in pos:
            continue
        x, y = pos[nid]
        lc = len(_leaf_ids(nodes, nid))
        lab = str(n.get("name", ""))[:32]
        hov = (f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
               f"Variables: {lc}")
        if n.get("type") == "attribute":
            lf_x.append(x); lf_y.append(y); lf_col.append(_node_color(n))
            lf_lab.append(lab if show_leaf_labels else "")
            lf_hov.append(hov)
        else:
            agg_x.append(x); agg_y.append(y); agg_col.append(_node_color(n))
            agg_lab.append(lab); agg_hov.append(hov)

    traces.append(go.Scatter(
        x=lf_x, y=lf_y, mode="markers+text" if show_leaf_labels else "markers",
        text=lf_lab, textposition="middle right", textfont=dict(size=9),
        marker=dict(size=7, color=lf_col, line=dict(width=0.5, color="white")),
        hovertext=lf_hov, hoverinfo="text", showlegend=False))
    traces.append(go.Scatter(
        x=agg_x, y=agg_y, mode="markers+text", text=agg_lab,
        textposition="middle right", textfont=dict(size=10),
        marker=dict(size=13, color=agg_col, line=dict(width=1, color="white")),
        hovertext=agg_hov, hoverinfo="text", showlegend=False))

    # Figure height scales with the larger of the two node groups (min 600 px).
    n_rows = max(len(lf_y), len(agg_y), 1)
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(600, n_rows * 16),
        margin=dict(l=10, r=140, t=10, b=10),
        xaxis=dict(visible=False), yaxis=dict(visible=False),
        plot_bgcolor="white",
    )
    return fig

# ─────────────────────────────────────────────────────────────────────────────
# STATS / SAFE RENDERING
# ─────────────────────────────────────────────────────────────────────────────
def _tree_depth(nodes: list) -> int:
    """Return the deepest level reachable from node id 0 (root = depth 0).

    Dissolved/hidden nodes are removed first; "related" ids that do not
    resolve to a remaining node are ignored. The walk is plain recursion with
    no visited set.
    """
    visible = _filter_dissolved(nodes)
    by_id = {int(node["id"]): node for node in visible}
    deepest = 0

    def walk(node_id, level):
        nonlocal deepest
        if level > deepest:
            deepest = level
        for ref in by_id.get(int(node_id), {}).get("related", []):
            child_id = int(ref)
            if child_id in by_id:
                walk(child_id, level + 1)

    walk(0, 0)
    return deepest

def safe_render_depth(nodes: list, requested: int) -> int:
    """Limit the initial sunburst/treemap depth for large hierarchies.

    Counts the nodes left after dissolved/hidden filtering: above 400 the
    depth is capped at 3, above 150 at 4, otherwise ``requested`` is returned
    unchanged. Only the initial render depth is limited; deeper levels remain
    reachable by clicking into the chart.
    """
    visible = len(_filter_dissolved(nodes))
    for threshold, cap in ((400, 3), (150, 4)):
        if visible > threshold:
            return min(requested, cap)
    return requested

# ─────────────────────────────────────────────────────────────────────────────
# IO
# ─────────────────────────────────────────────────────────────────────────────
@st.cache_data(show_spinner=False)
def _load_json(path_str: str):
    """Read and parse a UTF-8 JSON file; results are cached by Streamlit per path."""
    raw_text = Path(path_str).read_text(encoding="utf-8")
    return json.loads(raw_text)

def _read_bytes(path_str: str) -> bytes:
    with open(path_str, "rb") as f:
        return f.read()

@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Return a deflate-compressed ZIP of every file under ``root_str``.

    Archive member names are relative to the parent of ``root_str``, so they
    keep the root folder's own name as a prefix. Files are added in sorted
    path order. The result is cached by Streamlit per ``root_str``.
    """
    import io
    import zipfile

    base = Path(root_str)
    archive_root = base.parent
    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(base.rglob("*")):
            if not entry.is_file():
                continue
            member_name = entry.relative_to(archive_root).as_posix()
            archive.write(entry, arcname=member_name)
    return payload.getvalue()

def count_nodes(nodes: list) -> tuple[int, int]:
    """Count attribute and aggregation nodes after dissolved/hidden filtering.

    Returns:
        (number of "attribute" nodes, number of "aggregation" nodes). Nodes of
        any other type are not counted in either figure.
    """
    leaf_total = 0
    agg_total = 0
    for node in _filter_dissolved(nodes):
        kind = node.get("type")
        if kind == "attribute":
            leaf_total += 1
        elif kind == "aggregation":
            agg_total += 1
    return leaf_total, agg_total

def concept_aligned_pct(nodes: list) -> float | None:
    """Percentage of aggregation nodes carrying a concept/provenance field.

    An aggregation node counts as aligned when any of "provenance", "concept"
    or "source_evidence" is present and truthy.

    Returns:
        The percentage (0-100], or ``None`` when there are no aggregation
        nodes or when none of them is aligned.
    """
    evidence_keys = ("provenance", "concept", "source_evidence")
    total = 0
    hits = 0
    for node in _filter_dissolved(nodes):
        if node.get("type") != "aggregation":
            continue
        total += 1
        if any(node.get(key) for key in evidence_keys):
            hits += 1
    if total == 0 or hits == 0:
        return None
    return 100.0 * hits / total

# ─────────────────────────────────────────────────────────────────────────────
# SIDEBAR
# ─────────────────────────────────────────────────────────────────────────────
# Sidebar: the two selectors below (`approach`, `dataset`) drive everything
# rendered in the main area.
with st.sidebar:
    st.title("🌿 Hierarchy Explorer")
    st.caption("TFM 2026 — Metadata hierarchy construction")
    st.markdown("---")

    # Option labels double as keys into CONFIG, APPROACH_DESC and PREBUILT.
    approach = st.radio("**Select Approach**",
                        ["Baseline", "Approach 1", "Approach 2"], index=0)
    # Option labels double as the second-level keys of PREBUILT.
    dataset = st.radio("**Select Dataset**", ["AI-MIND", "HCP"], index=0)

    st.markdown("---")
    st.caption("Results are pre-built from the thesis experiments. To run on your "
               "own data, clone the repository and run the individual apps.")
    st.markdown("[📦 GitHub Repository]"
                "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")

# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────
# Rendering settings for the selected approach.
cfg = CONFIG[approach]
color = cfg["color"]

st.title(f"📊 {approach} — {dataset} Dataset")
st.markdown(f"> {APPROACH_DESC[approach]}")

# Resolve the pre-built hierarchy file; stop the script run if it is missing.
paths = PREBUILT[approach][dataset]
hier_path = paths.get("hierarchy")
if hier_path is None or not hier_path.exists():
    st.error(f"Pre-built result not found: `{hier_path}`")
    st.stop()

raw_nodes = _load_json(str(hier_path))

# Headline metrics. "Total Nodes" is attribute + aggregation nodes only;
# nodes of any other type are not included in count_nodes().
leaves, aggs = count_nodes(raw_nodes)
c1, c2, c3 = st.columns(3)
c1.metric("Leaf Variables", leaves)
c2.metric("Aggregation Nodes", aggs)
c3.metric("Total Nodes", leaves + aggs)

# ── Build summary (collapsed) ────────────────────────────────────────────────
# Facet count is best-effort: the top-level length of the facets JSON, or
# None when the approach has no facets file or it cannot be loaded/measured.
facet_path = paths.get("facets")
n_facets = None
if facet_path is not None and facet_path.exists():
    try:
        n_facets = len(_load_json(str(facet_path)))
    except Exception:
        # Any failure just leaves the "Facets" metric showing a dash.
        n_facets = None

with st.expander("ℹ️ Build summary", expanded=False):
    bs1, bs2, bs3, bs4 = st.columns(4)
    bs1.metric("Variables", leaves)
    bs2.metric("Internal nodes", aggs)
    bs3.metric("Tree depth", _tree_depth(raw_nodes))
    bs4.metric("Facets", n_facets if n_facets is not None else "—")
    # concept_aligned_pct() returns None when no aggregation node is aligned,
    # in which case the caption is omitted entirely.
    pct = concept_aligned_pct(raw_nodes)
    if pct is not None:
        st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
    st.caption(
        f"Source file: `{hier_path.name}` · "
        f"Approach: **{approach}** · Dataset: **{dataset}**. "
        "Tree topology and labels are reproduced exactly from the pre-built "
        "thesis output (the algorithms are not re-run in this viewer)."
    )

# ── Downloads ────────────────────────────────────────────────────────────────
d1, d2, d3 = st.columns(3)
with d1:
    st.download_button("⬇️ Hierarchy JSON", data=_read_bytes(str(hier_path)),
                       file_name=hier_path.name, mime="application/json",
                       use_container_width=True)
with d2:
    if facet_path is not None and facet_path.exists():
        st.download_button("⬇️ Facets JSON", data=_read_bytes(str(facet_path)),
                           file_name=facet_path.name, mime="application/json",
                           use_container_width=True)
    else:
        st.button("⬇️ Facets JSON", disabled=True, use_container_width=True,
                  help="This approach/dataset has no facet tree.")
with d3:
    st.download_button("⬇️ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
                       file_name="metadata_hierarchy_outputs.zip",
                       mime="application/zip", use_container_width=True)

st.markdown("---")

# ── Level-of-Detail controls (above chart — matches the apps) ────────────────
# The node-link view is only offered for approaches whose config enables it.
view_options = ["Sunburst (drill-down)", "Treemap"]
if cfg["node_link"]:
    view_options.append("Node-link tree")

# A fifth column (for the "Compress chains" checkbox) is only laid out for
# approaches whose config enables compression.
if cfg["compress"]:
    vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
else:
    vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
    vc5 = None

with vc1:
    viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
                        help="Sunburst best for large hierarchies [Taxonomizer]. "
                             "Node-link best for moderate-depth structure inspection.")
with vc2:
    depth = st.slider("Depth (Level of Detail)", 1, 8, DEFAULT_DEPTH, 1)
with vc3:
    # Only passed to the node-link view.
    show_leaf_labels = st.checkbox("Leaf labels", value=False)
with vc4:
    # Only passed to the node-link view.
    show_hidden = st.checkbox("Hidden nodes", value=False)
if vc5 is not None:
    with vc5:
        compress_chains = st.checkbox("Compress chains", value=True,
                                      help="Merge one-child aggregation chains "
                                           '(e.g. "DMS → DMS Recommended Standard") for '
                                           "display. Export JSON keeps original structure.")
else:
    # No checkbox shown: chains are never compressed for this approach.
    compress_chains = False

st.divider()

# Compression is display-only; raw_nodes (and the downloadable JSON) stay untouched.
display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes

# Sunburst and treemap go through safe_render_depth(), which may lower the
# slider value for large hierarchies; the node-link view uses the slider as is.
if viz_mode == "Sunburst (drill-down)":
    eff = safe_render_depth(display_nodes, depth)
    if eff < depth:
        st.caption(f"Large hierarchy — showing {eff} levels initially to render "
                   "reliably. **Click any sector to drill deeper.**")
    st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
elif viz_mode == "Treemap":
    eff = safe_render_depth(display_nodes, depth)
    if eff < depth:
        st.caption(f"Large hierarchy — showing {eff} levels initially to render "
                   "reliably. **Click a tile to drill deeper.**")
    st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
else:
    st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
                    use_container_width=True)

# ── Facets (Approach 1 only) ─────────────────────────────────────────────────
# Shown whenever the selected approach/dataset has a facets file on disk.
if facet_path is not None and facet_path.exists():
    st.markdown("---")
    st.subheader("🔀 Parallel facets")
    facets = _load_json(str(facet_path))
    # The code below indexes the payload by facet name, so anything that is
    # not a non-empty mapping is reported instead of failing on .keys().
    names = list(facets.keys()) if isinstance(facets, dict) else []
    if not names:
        st.info("No facets available for this dataset.")
    else:
        sel = st.selectbox("Select facet", names)
        fnodes = facets[sel]
        # Same depth handling as the main charts, applied to BOTH tabs: honour
        # the depth slider, capped for large trees.
        facet_depth = safe_render_depth(fnodes, depth)
        if facet_depth < depth:
            st.caption(f"Large facet — showing {facet_depth} levels initially to render "
                       "reliably. **Click to drill deeper.**")
        ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
        with ft1:
            st.plotly_chart(plot_sunburst(fnodes, color, facet_depth),
                            use_container_width=True)
        with ft2:
            st.plotly_chart(plot_treemap(fnodes, color, facet_depth),
                            use_container_width=True)