"""Generate a per-paradigm Algorithm Lab.

This page follows the interaction pattern of the local DiT / cross-attention
explainers: a stepper controls animation, formula rows, code lines, and a plain
English trace at the same time.  The content is generated for every node in the
main robot-learning landscape, using specific templates for the method family.
"""

from __future__ import annotations

import json

import gen_landscape
import gen_vlm
import gen_worldmodel


# External references as (display label, URL) pairs.  The consumer of this
# list is not visible in this part of the file; presumably it is rendered as
# a "sources" section of the generated page - TODO confirm.
SOURCE_LINKS = [
    ("Diffusion Explainer", "https://poloclub.github.io/diffusion-explainer/"),
    ("Diffusion Explainer paper", "https://arxiv.org/abs/2305.03509"),
    ("Distill explorable explanations", "https://distill.pub/"),
    ("The Illustrated Transformer", "https://jalammar.github.io/illustrated-transformer/"),
    ("BertViz attention visualization", "https://arxiv.org/abs/1906.05714"),
    ("Attention Flow", "https://arxiv.org/abs/2005.00928"),
    ("Spinning Up RL docs", "https://spinningup.openai.com/"),
    ("Sutton & Barto RL book", "http://incompleteideas.net/book/the-book-2nd.html"),
    ("Lilian Weng policy gradients", "https://lilianweng.github.io/posts/2018-04-08-policy-gradient/"),
    ("Diffusion Policy", "https://diffusion-policy.cs.columbia.edu/"),
    ("Decision Transformer", "https://sites.google.com/berkeley.edu/decision-transformer"),
    ("World Models", "https://worldmodels.github.io/"),
]


# Family-level lesson templates, keyed by archetype name (the same names that
# appear as values in ID_TO_ARCHETYPE).  Every entry carries the same keys:
#   "title"     - heading text for the method family
#   "intuition" - one-sentence plain-language summary (written in Chinese)
#   "steps"     - 5-tuples of (step key, short label, English trace text,
#                 list of ints, list of ints).  The two int lists presumably
#                 select the rows of "formulas" and the lines of "code" to
#                 highlight for that step - TODO confirm against the renderer
#   "formulas"  - (LaTeX source, caption) pairs
#   "code"      - pseudocode, one string per displayed line
#   "vis"       - visualisation name; presumably picks the animation used by
#                 the page (consumer not visible here)
ARCHETYPES = {
    "diffusion": {
        "title": "Generative action model",
        "intuition": "把动作或轨迹当成要生成的对象：从噪声出发，逐步变成可执行动作。",
        "steps": [
            ("context", "读状态/指令", "observation + language become the conditioning context", [0, 1], [0]),
            ("noise", "采样噪声", "start from noise or a noisy trajectory/action", [1, 2], [1, 2]),
            ("score", "预测修正", "network predicts denoising score / velocity / action correction", [2], [3, 4]),
            ("iterate", "迭代生成", "repeat a few denoising or flow integration steps", [2, 3], [5, 6]),
            ("execute", "执行动作", "send the generated action chunk or first trajectory segment to the controller", [3], [7]),
        ],
        "formulas": [
            (r"c=\phi(o,\ell)", "状态/图像/语言先编码成条件 c。"),
            (r"a^0\sim\mathcal{N}(0,I)", "从随机动作开始。"),
            (r"\epsilon_\theta(a^k,c,k)\;\text{or}\;v_\theta(a^t,c,t)", "网络预测去噪方向或 flow velocity。"),
            (r"a^{k-1}=a^k-\alpha\,\epsilon_\theta(a^k,c,k)+\sigma_k z", "diffusion: 一步步去掉噪声。"),
            (r"a^1=a^0+\int_0^1 v_\theta(a^t,c,t)\,dt", "flow: 沿速度场积分到动作。"),
            (r"\pi(a\mid o,\ell)\approx p_\theta(a\mid c)", "生成分布就是策略。"),
        ],
        "code": [
            "# generative action head",
            "c = encoder(obs, instruction)",
            "a = normal(shape=action_chunk)",
            "for k in schedule:",
            "    delta = denoiser(a, c, k)",
            "    a = update(a, delta, k)",
            "return execute(a[0])",
            "# flow replaces update with an ODE step",
        ],
        "vis": "denoise",
    },
    "token": {
        "title": "Token / sequence model",
        "intuition": "把状态、目标、语言、动作都变成序列 token，然后像语言模型一样预测下一个动作 token。",
        "steps": [
            ("context", "构造上下文", "state, language, return, or goal tokens define the context", [0, 1], [0, 1]),
            ("quantize", "动作离散化", "continuous action becomes bins, VQ codes, or compressed action tokens", [1], [2]),
            ("decode", "自回归预测", "predict one action token at a time", [2], [3, 4]),
            ("detok", "还原动作", "decode tokens back to continuous robot commands", [3], [5]),
            ("control", "闭环执行", "execute a short chunk, observe again, and re-plan", [3, 4], [6]),
        ],
        "formulas": [
            (r"x_{1:T}=[c_1,\ldots,c_m,a^{(1)},\ldots,a^{(K)}]", "把条件和动作放进同一序列。"),
            (r"a\rightarrow(a^{(1)},\ldots,a^{(K)}),\quad a^{(j)}\in\mathcal{V}", "连续动作被 tokenized。"),
            (r"p_\theta(a^{(j)}\mid c,a^{(<j)})", "逐 token 预测。"),
            (r"\mathcal{L}=-\sum_j\log p_\theta(a^{(j)}\mid c,a^{(<j)})", "交叉熵训练。"),
            (r"\hat a=\mathrm{decode}(\hat a^{(1:K)})", "把 token 还原成机器人动作。"),
        ],
        "code": [
            "# tokenized robot policy",
            "ctx = tokenize(obs, instruction, history)",
            "tok = []",
            "for j in range(K):",
            "    logits = transformer(ctx + tok)",
            "    tok.append(sample(logits[-1]))",
            "action = detokenize(tok)",
        ],
        "vis": "tokens",
    },
    "value": {
        "title": "Value backup",
        "intuition": "从奖励往回传播价值，最后在每个状态选择 Q 最大的动作。",
        "steps": [
            ("collect", "收集转移", "observe transitions (s,a,r,s')", [0], [0, 1]),
            ("target", "构造 Bellman 目标", "reward plus discounted best next value", [0, 1], [2, 3]),
            ("fit", "拟合 Q", "regress Q toward the target", [1, 2], [4]),
            ("act", "贪心动作", "choose the action with highest Q", [2], [5]),
        ],
        "formulas": [
            (r"y=r+\gamma\max_{a'}Q_{\bar\theta}(s',a')", "Bellman target。"),
            (r"\mathcal{L}=(Q_\theta(s,a)-y)^2", "把当前 Q 拉向目标。"),
            (r"\pi(s)=\arg\max_a Q_\theta(s,a)", "执行价值最高的动作。"),
        ],
        "code": [
            "# Q-learning / DQN style update",
            "s, a, r, sp = replay.sample()",
            "target = r + gamma * max_a(Q_target(sp, a))",
            "loss = mse(Q(s, a), target)",
            "theta = optimizer.step(loss)",
            "action = argmax_a(Q(obs, a))",
        ],
        "vis": "grid",
    },
    "policy": {
        "title": "Policy gradient",
        "intuition": "做很多次任务，把高回报动作的概率推高，把低回报动作的概率推低。",
        "steps": [
            ("rollout", "采样轨迹", "run current policy in the environment", [0], [0, 1]),
            ("return", "计算优势", "estimate return or advantage for each action", [0, 1], [2, 3]),
            ("gradient", "加权梯度", "log-prob gradient weighted by advantage", [1, 2], [4]),
            ("update", "更新策略", "move probability mass toward good actions", [2], [5]),
        ],
        "formulas": [
            (r"J(\theta)=\mathbb{E}_{\tau\sim\pi_\theta}\sum_t\gamma^t r_t", "目标是最大化期望回报。"),
            (r"\nabla_\theta J=\mathbb{E}\sum_t\nabla_\theta\log\pi_\theta(a_t|s_t)\hat A_t", "好动作的 log-prob 被推高。"),
            (r"L^{PPO}=\mathbb{E}\min(r_t\hat A_t,\mathrm{clip}(r_t,1-\epsilon,1+\epsilon)\hat A_t)", "PPO 限制更新幅度。"),
        ],
        "code": [
            "# policy gradient / PPO style",
            "traj = rollout(policy)",
            "adv = estimate_advantage(traj)",
            "ratio = prob_new / prob_old",
            "loss = -min(ratio * adv, clip(ratio) * adv)",
            "theta = optimizer.step(loss)",
        ],
        "vis": "prob",
    },
    "actorcritic": {
        "title": "Actor-Critic",
        "intuition": "actor 决定怎么做，critic 给动作打分；critic 学得越准，actor 更新越有方向。",
        "steps": [
            ("act", "Actor 出动作", "policy proposes an action", [0], [0, 1]),
            ("critic", "Critic 打分", "critic estimates Q or V", [1], [2, 3]),
            ("td", "TD 更新 critic", "fit critic to bootstrapped target", [1, 2], [4]),
            # NOTE(review): the 3 below has no matching entry in "formulas"
            # (only rows 0-2 exist) - if these are row indices, confirm the
            # renderer tolerates an out-of-range value.
            ("actor", "更新 actor", "move action toward high critic value", [2, 3], [5]),
        ],
        "formulas": [
            (r"y=r+\gamma Q_{\bar\phi}(s',\pi_{\bar\theta}(s'))", "critic 的 bootstrap target。"),
            (r"\mathcal{L}_Q=(Q_\phi(s,a)-y)^2", "训练 critic。"),
            (r"\max_\theta\mathbb{E}_{s\sim\mathcal{D}}Q_\phi(s,\pi_\theta(s))+\alpha\mathcal{H}", "训练 actor。"),
        ],
        "code": [
            "# off-policy actor-critic",
            "a = actor(s)",
            "y = r + gamma * critic_targ(sp, actor_targ(sp))",
            "critic_loss = mse(critic(s, a_data), y)",
            "actor_loss = -critic(s, actor(s)).mean()",
            "update(critic_loss, actor_loss)",
        ],
        "vis": "actorcritic",
    },
    "world": {
        "title": "World model / planning",
        "intuition": "先学一个“如果这样做会怎样”的模型，再在模型里想象、评估、规划或训练。",
        "steps": [
            ("encode", "编码世界状态", "compress observation into latent state", [0], [0, 1]),
            ("predict", "预测未来", "roll the latent/video model forward", [1], [2, 3]),
            ("score", "评估候选动作", "reward/value/cost scores imagined futures", [2], [4]),
            ("plan", "选择计划", "pick best action sequence or train inside imagination", [2, 3], [5]),
            ("execute", "执行再重规划", "execute one step and close the loop", [3], [6]),
        ],
        "formulas": [
            (r"z_t=\phi(o_t)", "把观测压成 latent。"),
            (r"p_\theta(z_{t+1}\mid z_t,a_t)", "action-conditioned dynamics。"),
            (r"\hat a_{t:t+H}=\arg\max_a\sum_k\gamma^k\hat r(\hat z_{t+k},a_{t+k})", "在模型中规划。"),
            (r"\nabla_\psi J=\nabla_\psi \mathbb{E}_{\hat\tau\sim p_\theta,\pi_\psi}\sum_t\hat r_t", "也可以在 imagined rollouts 中训练 actor。"),
        ],
        "code": [
            "# model-based control",
            "z = encoder(obs)",
            "candidates = sample_action_sequences()",
            "for A in candidates:",
            "    future = world_model.rollout(z, A)",
            "    score[A] = reward_or_value(future)",
            "execute(argmax(score)[0])",
        ],
        "vis": "world",
    },
    "planner": {
        "title": "Planner / constraint solver",
        "intuition": "把任务变成几何路径、子目标、代码或约束，再由控制器/优化器执行。",
        "steps": [
            ("goal", "解析目标", "language, goal, or geometry defines what should happen", [0], [0, 1]),
            ("constraints", "生成约束/代价", "planner constructs cost, graph, or skill sequence", [1], [2, 3]),
            ("search", "搜索/优化", "search over paths, subgoals, or programs", [2], [4]),
            # NOTE(review): the 3 below has no matching entry in "formulas"
            # (only rows 0-2 exist) - if these are row indices, confirm the
            # renderer tolerates an out-of-range value.
            ("execute", "执行与反馈", "low-level controller follows the plan and replans if needed", [3], [5]),
        ],
        "formulas": [
            (r"\tau^*=\arg\min_\tau\int_0^T\mathcal{C}(\tau(t))dt", "trajectory optimization。"),
            (r"\text{plan}=\mathrm{LLM/VLM}(\ell,\mathrm{scene},\mathrm{API})", "LLM/VLM planner。"),
            (r"\tau(s)\in\mathcal{C}_{free}", "motion planning 必须满足 collision-free。"),
        ],
        "code": [
            "# planner / orchestration",
            "goal = parse(instruction, scene)",
            "cost_or_skills = propose(goal)",
            "plan = search_or_optimize(cost_or_skills)",
            "for step in plan:",
            "    controller.execute(step)",
            "    if blocked: replan()",
        ],
        "vis": "planner",
    },
    "imitation": {
        "title": "Imitation / inverse objective",
        "intuition": "从专家行为中学习：直接复制动作，或反推出专家在优化什么奖励。",
        "steps": [
            ("demo", "读取示范", "expert trajectories define behavior support", [0], [0, 1]),
            ("match", "匹配专家", "policy, energy, discriminator, or reward tries to explain expert actions", [1], [2, 3]),
            ("opt", "优化策略", "train policy by supervised loss or RL on recovered reward", [2], [4]),
            # NOTE(review): the 3 below has no matching entry in "formulas"
            # (only rows 0-2 exist) - if these are row indices, confirm the
            # renderer tolerates an out-of-range value.
            ("deploy", "部署并反馈", "execute policy, aggregate or compare new behavior", [3], [5]),
        ],
        "formulas": [
            (r"\min_\theta D[\pi_\theta(a|o)\|\pi_{\mathcal{D}}(a|o)]", "BC: 匹配专家动作分布。"),
            (r"\min_\pi\max_D\mathbb{E}_{\pi^*}\log D+\mathbb{E}_{\pi}\log(1-D)", "GAIL: 专家和学习者对抗匹配。"),
            (r"\text{recover }r\;\text{s.t.}\;\pi_r\approx\pi^*", "IRL: 反推出奖励。"),
        ],
        "code": [
            "# imitation-style update",
            "demo = sample_expert_batch()",
            "pred = policy(demo.obs)",
            "loss = behavior_matching(pred, demo.action)",
            "theta = optimizer.step(loss)",
            "# IRL/GAIL inserts reward/discriminator before policy update",
        ],
        "vis": "imitation",
    },
}


# Maps a node id (string slug) to the key of its method-family template in
# ARCHETYPES.  Every value used here is an existing ARCHETYPES key.
# Presumably the ids match nodes produced by the landscape generator and this
# table is the fallback for nodes without a node-specific lesson - the lookup
# code is not visible in this part of the file, so confirm before relying on it.
ID_TO_ARCHETYPE = {
    "diffusion-policy": "diffusion",
    "flow-matching-policy": "diffusion",
    "trajectory-diffusion": "diffusion",
    "world-action-model": "world",
    "generative-video-wm": "world",
    "action-conditioned-wm": "world",
    "latent-imagination": "world",
    "forward-dynamics-mpc": "world",
    "occupancy-latent-wm": "world",
    "tokenized-bc": "token",
    "decision-transformer": "token",
    "value-based-rl": "value",
    "policy-gradient-rl": "policy",
    "off-policy-ac": "actorcritic",
    "offline-rl": "actorcritic",
    "energy-based-bc": "imitation",
    "maxent-irl": "imitation",
    "gail": "imitation",
    "goal-conditioned": "planner",
    "hrl": "planner",
    "meta-learning": "policy",
    "llm-planner": "planner",
    "vlm-affordance": "planner",
    "pid-control": "planner",
    "lqr": "planner",
    "classical-mpc": "planner",
    "motion-planning": "planner",
}


def steps(items):
    """Normalise step declarations into a list of 5-tuples.

    Each element of ``items`` must unpack into exactly five fields; in the
    tables of this module those are a step key, a short label, an English
    description, and two lists of integer indices.  The fields are repacked
    unchanged (same objects, same order) into a fresh tuple, so rows given
    as lists come back as tuples.

    Raises:
        ValueError: if an element does not unpack into exactly five fields.
    """
    normalised = []
    for row in items:
        # Unpacking doubles as an arity check on every declaration.
        key, label, caption, formula_rows, code_rows = row
        normalised.append((key, label, caption, formula_rows, code_rows))
    return normalised


NODE_LESSONS = {
    "flow-matching-policy": {
        "title": "Flow Matching action head",
        "intuition": "不是一点点去噪，而是学习一条从噪声直达专家动作的速度场；推理时沿 ODE 走几步。",
        "vis": "denoise",
        "steps": steps([
            ("pair", "配对噪声与专家动作", "sample a noise action and an expert action for the same observation", [0, 1], [0, 1]),
            ("interpolate", "走中间点", "draw a random time t and interpolate between noise and expert action", [1, 2], [2, 3]),
            ("velocity", "预测速度场", "train v_theta to point from current point toward the expert action", [2, 3], [4]),
            ("integrate", "ODE 积分", "at inference, integrate the learned velocity field from noise to action", [3, 4], [5, 6]),
            ("chunk", "执行 action chunk", "execute the generated action chunk with receding-horizon feedback", [4], [7]),
        ]),
        "formulas": [
            (r"a^0\sim\mathcal{N}(0,I),\quad a^1\sim\pi_{\mathcal{D}}(\cdot|o)", "同一观测下配一个噪声动作和专家动作。"),
            (r"a^t=(1-t)a^0+t a^1,\quad t\sim U[0,1]", "训练时看中间点。"),
            (r"u_t=a^1-a^0", "直线路径的目标速度。"),
            (r"\mathcal{L}=\mathbb{E}\|v_\theta(a^t,o,t)-u_t\|^2", "速度场回归。"),
            (r"\frac{da}{dt}=v_\theta(a,o,t)", "推理时解 ODE。"),
        ],
        "code": ["# flow matching policy", "a0 = normal_like(action)", "a1 = expert_action", "t = uniform(0, 1)", "at = (1-t)*a0 + t*a1", "loss = mse(v_theta(at, obs, t), a1-a0)", "a = ode_solve(v_theta, normal(), obs)", "execute(a[:horizon])"],
    },
    "diffusion-policy": {
        "title": "Diffusion Policy denoising",
        "intuition": "把机器人动作序列当成要生成的信号：训练时加噪，推理时从噪声一步步还原成多峰动作。",
        "vis": "denoise",
        "steps": steps([
            ("chunk", "取专家动作块", "use an H-step action chunk from demonstration", [0], [0, 1]),
            ("noise", "前向加噪", "corrupt the chunk with Gaussian noise at diffusion step k", [1, 2], [2, 3]),
            ("score", "预测噪声", "conditioned on observation, predict the injected noise", [2], [4]),
            ("sample", "反向采样", "start from random action chunk and denoise repeatedly", [3], [5, 6]),
            ("recede", "receding horizon", "execute only the first actions, then observe and denoise again", [4], [7]),
        ]),
        "formulas": [
            (r"a^k=\sqrt{\bar\alpha_k}a+\sqrt{1-\bar\alpha_k}\epsilon", "训练时把专家动作加噪。"),
            (r"\mathcal{L}=\mathbb{E}_{k,a,\epsilon}\|\epsilon-\epsilon_\theta(a^k,o,k)\|^2", "网络预测加进去的噪声。"),
            (r"a^{k-1}=a^k-\alpha_k\epsilon_\theta(a^k,o,k)+\sigma_k z", "推理时逐步去噪。"),
            (r"a_{t:t+H}\sim p_\theta(\cdot|o_t)", "输出的是 action chunk 分布。"),
        ],
        "code": ["# diffusion policy", "chunk = demo.actions[t:t+H]", "k = randint(K)", "eps = normal_like(chunk)", "noisy = sqrt(ab[k])*chunk + sqrt(1-ab[k])*eps", "loss = mse(model(noisy, obs, k), eps)", "a = normal(shape=chunk.shape)", "for k in reversed(schedule): a = denoise(a, obs, k)"],
    },
    "tokenized-bc": {
        "title": "Tokenized BC / VLA action tokens",
        "intuition": "把连续动作压成离散 token，让 VLM 像续写文字一样续写机器人动作。",
        "vis": "tokens",
        "steps": steps([
            ("encode", "视觉语言上下文", "encode image, instruction, and history into tokens", [0], [0, 1]),
            ("quant", "动作量化", "turn continuous robot commands into bins / VQ / FAST tokens", [1], [2]),
            ("lm", "next-token 训练", "train with cross entropy over action tokens", [2, 3], [3, 4]),
            ("decode", "反量化动作", "decode predicted tokens back to continuous commands", [4], [5]),
            ("loop", "闭环滚动", "execute a short chunk and continue autoregressive control", [4], [6]),
        ]),
        "formulas": [
            (r"a\mapsto (z_1,\ldots,z_K),\quad z_i\in\mathcal{V}", "动作先离散化。"),
            (r"p_\theta(z_i|o,\ell,z_{<i})=\mathrm{softmax}(h_iW)", "像语言模型一样预测下一个 token。"),
            (r"\mathcal{L}=-\sum_i\log p_\theta(z_i^*|o,\ell,z_{<i}^*)", "交叉熵训练。"),
            (r"\hat a=\mathrm{dequantize}(\hat z_{1:K})", "token 还原为控制命令。"),
        ],
        "code": ["# tokenized behavioral cloning", "ctx = vlm_tokens(image, instruction)", "z = action_tokenizer.encode(action_chunk)", "logits = transformer(ctx + z[:-1])", "loss = cross_entropy(logits, z)", "pred = autoregressive_decode(transformer, ctx)", "action = action_tokenizer.decode(pred)"],
    },
    "energy-based-bc": {
        "title": "Energy-Based / Implicit BC",
        "intuition": "不直接输出动作，而是给每个候选动作打能量分；执行时在动作空间里找最低能量。",
        "vis": "imitation",
        "steps": steps([
            ("positive", "专家正样本", "expert action should have low energy", [0], [0, 1]),
            ("negative", "采样负动作", "random or optimized negative actions should have higher energy", [1], [2]),
            ("contrast", "对比训练", "push positive energy down and negative energy up", [2], [3, 4]),
            ("argmin", "测试时优化动作", "search in action space for the minimum-energy action", [4], [5]),
        ]),
        "formulas": [(r"\pi_\theta(a|s)\propto \exp(-E_\theta(s,a))", "低能量代表高概率。"), (r"\mathcal{L}=-\log\frac{\exp(-E(s,a^+))}{\sum_j\exp(-E(s,a_j))}", "InfoNCE 风格对比损失。"), (r"a^*=\arg\min_a E_\theta(s,a)", "推理时求能量最低动作。")],
        "code": ["# implicit behavioral cloning", "pos = expert_action", "neg = sample_actions()", "E_pos = E(obs, pos)", "E_neg = E(obs, neg)", "loss = -log_softmax(-concat(E_pos, E_neg))[0]", "a = optimize_action(lambda a: E(obs, a))"],
    },
    "value-based-rl": {
        "title": "Q-learning value backup",
        "intuition": "把终点奖励沿 Bellman 方程一格格往回传，最后每个状态选 Q 最大的动作。",
        "vis": "grid",
        "steps": steps([("sample", "采样转移", "store (s,a,r,s') in replay", [0], [0, 1]), ("backup", "Bellman backup", "target is immediate reward plus best next value", [0], [2]), ("fit", "拟合 Q 网络", "regress Q(s,a) toward the target", [1], [3, 4]), ("act", "argmax 执行", "choose action with largest Q", [2], [5])]),
        "formulas": [(r"y=r+\gamma\max_{a'}Q_{\bar\theta}(s',a')", "目标来自下一状态的最大价值。"), (r"\mathcal{L}=(Q_\theta(s,a)-y)^2", "TD regression。"), (r"\pi(s)=\arg\max_a Q_\theta(s,a)", "贪心策略。")],
        "code": ["# Q-learning", "batch = replay.sample()", "y = r + gamma * max(Q_target(sp))", "loss = mse(Q(s)[a], y)", "optimizer.step(loss)", "action = argmax(Q(obs))"],
    },
    "policy-gradient-rl": {
        "title": "Policy Gradient / PPO",
        "intuition": "直接调策略参数：高优势动作概率上升，但 PPO 用 clip 防止一步改太猛。",
        "vis": "prob",
        "steps": steps([("rollout", "on-policy rollout", "collect trajectories using current policy", [0], [0, 1]), ("adv", "估计优势", "compare returns against value baseline", [0, 1], [2]), ("ratio", "概率比", "measure how much new policy changed action probability", [1, 2], [3]), ("clip", "PPO clipped update", "optimize only inside a conservative trust region", [2], [4, 5])]),
        "formulas": [(r"\hat A_t=\hat R_t-V_\phi(s_t)", "优势：比 baseline 好多少。"), (r"r_t(\theta)=\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{old}}(a_t|s_t)}", "新旧策略概率比。"), (r"L=\mathbb{E}\min(r_t\hat A_t,\mathrm{clip}(r_t,1-\epsilon,1+\epsilon)\hat A_t)", "PPO clipped objective。")],
        "code": ["# PPO update", "traj = rollout(pi_old)", "adv = returns - value(states)", "ratio = pi(a|s) / pi_old(a|s)", "obj = min(ratio*adv, clip(ratio)*adv)", "loss = -obj.mean()", "optimizer.step(loss)"],
    },
    "off-policy-ac": {
        "title": "SAC / TD3 Actor-Critic",
        "intuition": "critic 用 replay 学 Q，actor 沿 critic 的梯度找高价值动作；SAC 额外奖励高熵探索。",
        "vis": "actorcritic",
        "steps": steps([("replay", "从 replay 取样", "reuse off-policy transitions", [0], [0, 1]), ("critic", "训练 twin critic", "fit Bellman target, often with target networks", [0, 1], [2, 3]), ("actor", "actor 最大化 Q", "differentiate through critic into action", [2], [4]), ("entropy", "SAC 熵正则", "keep policy stochastic and exploratory", [2], [5])]),
        "formulas": [(r"y=r+\gamma(\min_i Q_{\bar\phi_i}(s',a')-\alpha\log\pi(a'|s'))", "SAC target。"), (r"\mathcal{L}_Q=\sum_i(Q_{\phi_i}(s,a)-y)^2", "训练 critic。"), (r"\mathcal{L}_\pi=\mathbb{E}[\alpha\log\pi_\theta(a|s)-\min_i Q_{\phi_i}(s,a)]", "训练 actor。")],
        "code": ["# SAC-style actor critic", "batch = replay.sample()", "ap = actor.sample(sp)", "y = r + gamma*(min(Q_targ(sp, ap)) - alpha*logp(ap))", "critic_loss = mse(Q1(s,a), y) + mse(Q2(s,a), y)", "actor_loss = alpha*logp(actor(s)) - min(Q(s, actor(s)))", "update(critic_loss, actor_loss)"],
    },
    "offline-rl": {
        "title": "Offline RL with pessimism",
        "intuition": "不能在线试错，所以必须让策略留在数据支持内，并对没见过的动作保持悲观。",
        "vis": "actorcritic",
        "steps": steps([("dataset", "固定数据集", "only train from logged trajectories", [0], [0]), ("support", "估计数据支持", "avoid actions absent from the dataset", [1], [1, 2]), ("critic", "悲观 Q", "penalize overestimated OOD actions", [2], [3, 4]), ("extract", "提取策略", "BC, advantage-weighting, or constrained actor update", [3], [5])]),
        "formulas": [(r"\mathcal{D}=\{(s,a,r,s')\}\;\text{fixed}", "训练期间没有新交互。"), (r"\max_\pi \mathbb{E}_{s\sim\mathcal{D}}Q(s,\pi(s))-\lambda D(\pi||\pi_{\mathcal{D}})", "行为约束。"), (r"\mathcal{L}_{CQL}=\alpha(\log\sum_a e^{Q(s,a)}-\mathbb{E}_{a\sim\mathcal{D}}Q(s,a))", "CQL 压低 OOD 动作。")],
        "code": ["# offline RL", "batch = dataset.sample()", "critic_loss = bellman_error(batch)", "ood_actions = actor(batch.s)", "pessimism = logsumexp(Q(batch.s, all_actions)) - Q(batch.s, batch.a)", "actor_loss = -Q(s, actor(s)) + lambda_ * behavior_penalty", "update(critic_loss + alpha*pessimism, actor_loss)"],
    },
    "maxent-irl": {
        "title": "Maximum-Entropy IRL",
        "intuition": "不是直接复制动作，而是找一个奖励函数，使专家轨迹在最大熵策略下最合理。",
        "vis": "imitation",
        "steps": steps([("feature", "统计专家特征", "count what states/actions experts visit", [0], [0, 1]), ("reward", "拟合奖励", "assign reward so expert trajectories become likely", [1], [2, 3]), ("rl", "用奖励跑 RL", "optimize a policy under the recovered reward", [2], [4]), ("compare", "匹配 occupancy", "update reward if learner visits different regions", [2], [5])]),
        "formulas": [(r"p(\tau|r)\propto \exp(R(\tau))", "高奖励轨迹概率更高。"), (r"\max_r\;\mathbb{E}_{\tau_E}R(\tau)-\log Z(r)", "最大化专家轨迹似然。"), (r"\rho_{\pi_r}(s,a)\approx\rho_E(s,a)", "最终匹配 occupancy。")],
        "code": ["# max-ent IRL sketch", "expert_feat = feature_counts(expert_traj)", "for it in range(K):", "    policy = run_rl(reward)", "    learner_feat = feature_counts(policy.rollouts())", "    reward += lr * (expert_feat - learner_feat)", "return policy"],
    },
    "gail": {
        "title": "GAIL / AIRL adversarial imitation",
        "intuition": "判别器努力分辨专家和学习者，策略把“骗过判别器”当奖励来优化。",
        "vis": "imitation",
        "steps": steps([("rollout", "生成学习者轨迹", "policy produces current behavior", [0], [0, 1]), ("disc", "训练判别器", "D classifies expert vs learner state-action pairs", [1], [2, 3]), ("reward", "转成 shaped reward", "policy receives reward for looking expert-like", [2], [4]), ("rl", "内层 RL 更新", "use PPO/TRPO/SAC to optimize the imitation reward", [2, 3], [5])]),
        "formulas": [(r"\max_D\;\mathbb{E}_{\pi_E}\log D(s,a)+\mathbb{E}_{\pi}\log(1-D(s,a))", "判别器分真假。"), (r"r_D(s,a)=-\log(1-D(s,a))", "骗过判别器变成奖励。"), (r"\min_\pi\max_D\;\mathcal{L}_{GAN}(\pi,D)-\lambda H(\pi)", "策略和判别器对抗。")],
        "code": ["# GAIL loop", "learner = policy.rollout()", "disc_loss = bce(D(expert), 1) + bce(D(learner), 0)", "update(D, disc_loss)", "reward = -log(1 - D(learner.s, learner.a))", "policy = rl_update(policy, reward)"],
    },
    "forward-dynamics-mpc": {
        "title": "Learned dynamics + MPC",
        "intuition": "学一个前向模型，然后每一步采样多条动作计划，预测未来，执行评分最高计划的第一步。",
        "vis": "world",
        "steps": steps([("learn", "学习动力学", "fit f_hat(s,a)->s' from interaction", [0], [0, 1]), ("sample", "采样候选计划", "draw many action sequences over horizon H", [1], [2]), ("rollout", "模型内 rollout", "predict future states for each sequence", [1, 2], [3]), ("select", "选最高分", "score by reward/cost and take first action", [2], [4, 5]), ("replan", "下一步重规划", "observe again and repeat", [2, 3], [6])]),
        "formulas": [(r"\hat s_{t+1}=\hat f_\phi(s_t,a_t)", "学习前向动力学。"), (r"J(a_{t:t+H})=\sum_{k=0}^H \gamma^k\hat r(\hat s_{t+k},a_{t+k})", "给候选动作序列打分。"), (r"a_t=\left[\arg\max_{a_{t:t+H}}J\right]_0", "只执行第一步。")],
        "code": ["# learned MPC", "model.fit(replay)", "candidates = sample_sequences(N, H)", "for A in candidates:", "    S = rollout(model, s, A)", "    score[A] = reward(S, A).sum()", "execute(best(candidates)[0])"],
    },
    "latent-imagination": {
        "title": "Dreamer latent imagination",
        "intuition": "把像素压进 latent world model，在想象轨迹里训练 actor 和 value，而不是每次都用真实机器人试。",
        "vis": "world",
        "steps": steps([("rssm", "学习 latent dynamics", "RSSM predicts next latent, reward, continuation", [0, 1], [0, 1]), ("imagine", "latent imagination", "roll actor forward inside learned latent model", [1], [2]), ("value", "lambda-return value", "estimate long-horizon imagined returns", [2], [3]), ("actor", "反传训练 actor", "differentiate imagined returns through latent trajectories", [2, 3], [4, 5])]),
        "formulas": [(r"z_t\sim q_\phi(z_t|z_{t-1},a_{t-1},o_t)", "posterior latent。"), (r"\hat z_{t+1}\sim p_\phi(\hat z_{t+1}|\hat z_t,a_t)", "想象中的 dynamics。"), (r"V_\lambda(\hat z_t)=\hat r_t+\gamma((1-\lambda)V(\hat z_{t+1})+\lambda V_\lambda(\hat z_{t+1}))", "lambda return。"), (r"\max_\psi \mathbb{E}_{\hat\tau\sim p_\phi,\pi_\psi}\sum_t V_\lambda(\hat z_t)", "actor 在梦里优化。")],
        "code": ["# Dreamer-style update", "posterior = encoder(obs, actions)", "model_loss = recon + reward_loss + kl", "z = stopgrad(posterior[-1])", "imagined = world_model.imagine(actor, z, horizon)", "actor_loss = -lambda_return(imagined).mean()", "value_loss = mse(value(z), target_return)"],
    },
    "generative-video-wm": {
        "title": "Generative video world model",
        "intuition": "预测未来视频本身：它可以是 planner、simulator 或 data generator，关键在条件里有没有动作。",
        "vis": "world",
        "steps": steps([("context", "输入上下文帧", "past frames and text/goal/actions condition the video model", [0], [0, 1]), ("latent", "压缩视频 latent", "operate in latent video/token space", [1], [2]), ("generate", "生成未来帧", "autoregressive or diffusion decoder predicts future video", [2], [3, 4]), ("use", "用于规划/数据", "future videos can guide policy, generate data, or evaluate outcomes", [3], [5])]),
        "formulas": [(r"p_\theta(o_{t+1:t+H}|o_{\le t},c)", "未来视频条件生成。"), (r"c\in\{\ell,g,a_{t:t+H},\text{mask}\}", "条件决定它是 planner 还是 simulator。"), (r"\hat o_{t+1:t+H}\sim p_\theta(\cdot|o_{\le t},c)", "采样未来。")],
        "code": ["# video world model", "ctx = encode_video(past_frames, condition)", "z = sample_latent_noise()", "future = video_decoder.generate(z, ctx)", "if has_reward: score = reward_model(future)", "use(future, score)"],
    },
    "action-conditioned-wm": {
        "title": "Action-Conditioned World Model",
        "intuition": "动作是输入：问模型“如果我这样做，会看到什么未来？”所以它能做 counterfactual evaluation。",
        "vis": "world",
        "steps": steps([("state", "编码当前状态", "encode observation/history", [0], [0]), ("candidate", "输入候选动作", "condition the model on proposed action sequence", [1], [1, 2]), ("predict", "预测对应未来", "different actions lead to different futures", [2], [3]), ("evaluate", "评估与选择", "score predicted futures for planning or policy improvement", [3], [4, 5])]),
        "formulas": [(r"p_\theta(o_{t+1:t+H}|o_{\le t},a_{t:t+H})", "动作作为条件输入。"), (r"\hat o^A\neq \hat o^B\quad\text{for}\quad A\neq B", "不同动作序列产生不同 counterfactual future。"), (r"A^*=\arg\max_A R(\hat o^A_{t+1:t+H})", "用预测未来选动作。")],
        "code": ["# action-conditioned WM planning", "state = encode(obs_history)", "for A in candidate_actions:", "    future[A] = wm.predict(state, A)", "    score[A] = task_score(future[A])", "execute(best(score)[0])"],
    },
    "world-action-model": {
        "title": "World Action Model",
        "intuition": "动作是输出：先想象一个成功的视频，再从视频中读出能实现它的动作。",
        "vis": "world",
        "steps": steps([("prompt", "任务条件", "scene and instruction specify desired success", [0], [0]), ("imagine", "生成成功未来", "video model imagines the task being completed", [1], [1, 2]), ("decode", "从视频读动作", "inverse dynamics or joint decoder extracts actions", [2], [3, 4]), ("execute", "执行并反馈", "execute proposed actions, optionally re-imagine", [3], [5])]),
        "formulas": [(r"p_\theta(o_{t+1:T},a_{t:T}|o_t,\ell)", "联合生成未来和动作。"), (r"p_\theta(o_{t+1:T}|o_t,\ell)p_\psi(a_{t:T}|o_{t:T})", "两阶段：先视频，再 inverse dynamics。"), (r"\hat a_{t:T}\sim p(\cdot|\hat o_{t:T})", "动作来自 imagined future。")],
        "code": ["# world-action model", "future = video_model.generate(obs, instruction)", "actions = inverse_dynamics(obs, future)", "for a in actions[:chunk]:", "    robot.step(a)", "    if off_track: future = reimagine()"],
    },
    "occupancy-latent-wm": {
        "title": "Occupancy / latent state world model",
        "intuition": "不预测每个 RGB 像素，而是预测规划有用的状态：占据、接触、cost、latent dynamics。",
        "vis": "world",
        "steps": steps([("encode", "抽象状态", "map pixels/history to occupancy or latent state", [0], [0, 1]), ("transition", "状态转移", "predict compact next state under action", [1], [2]), ("cost", "读出可规划量", "derive collision/contact/cost maps", [2], [3, 4]), ("plan", "在抽象状态规划", "use MPC/search on compact prediction", [3], [5])]),
        "formulas": [(r"z_t=\phi(o_{\le t})", "抽象 latent。"), (r"p_\theta(z_{t+1}|z_t,a_t)", "latent dynamics。"), (r"\hat c_t=\psi(z_t)", "从 latent 读出 occupancy/contact/cost。"), (r"\min_{a_{0:H}}\sum_t \hat c_t", "规划时优化 cost。")],
        "code": ["# occupancy latent WM", "z = encoder(obs_history)", "for a in plan:", "    z = latent_dynamics(z, a)", "    cost += occupancy_head(z) + contact_head(z)", "plan = optimize(cost)", "execute(plan[0])"],
    },
    "decision-transformer": {
        "title": "Decision Transformer",
        "intuition": "不做 Bellman backup；把 return-to-go、state、action 排成序列，让 Transformer 预测下一个动作。",
        "vis": "tokens",
        "steps": steps([("tokens", "构造轨迹 token", "interleave return, state, action tokens", [0], [0, 1]), ("rtg", "条件目标回报", "desired return steers behavior", [1], [2]), ("causal", "因果 Transformer", "predict action from past tokens only", [2], [3, 4]), ("roll", "执行并更新 RTG", "subtract received reward and continue", [3], [5, 6])]),
        "formulas": [(r"x=(\hat R_1,s_1,a_1,\ldots,\hat R_t,s_t)", "轨迹序列。"), (r"p_\theta(a_t|\hat R_{\le t},s_{\le t},a_{<t})", "条件动作预测。"), (r"\hat R_{t+1}=\hat R_t-r_t", "执行后更新目标 return。")],
        "code": ["# Decision Transformer", "tokens = [rtg, state, prev_action, ...]", "a = transformer.predict_action(tokens)", "obs, r = env.step(a)", "rtg = rtg - r", "tokens.extend([rtg, obs, a])"],
    },
    "trajectory-diffusion": {
        "title": "Trajectory Diffusion / Diffuser",
        "intuition": "不是只生成下一步动作，而是把整条状态-动作轨迹当成一个可被 goal/reward 引导的样本。",
        "vis": "denoise",
        "steps": steps([("traj", "轨迹作为样本", "model full trajectory tau=(s,a,...)", [0], [0, 1]), ("noise", "轨迹加噪", "diffuse the whole trajectory", [1], [2]), ("guide", "目标/奖励引导", "guide denoising toward goal or high reward", [2], [3, 4]), ("execute", "执行前缀", "execute first action segment and replan", [3], [5])]),
        "formulas": [(r"\tau=(s_1,a_1,\ldots,s_T,a_T)", "样本是整条轨迹。"), (r"p_\theta(\tau)", "学习轨迹分布。"), (r"\nabla_\tau\log p(g|\tau)\;\text{or}\;\nabla_\tau R(\tau)", "用目标/奖励引导采样。"), (r"\tau^*\sim p_\theta(\tau|g)", "采样出计划。")],
        "code": ["# Diffuser planning", "traj = normal(shape=(T, state_action_dim))", "for k in reversed(schedule):", "    score = model.score(traj, k)", "    guide = grad(goal_or_reward(traj))", "    traj = denoise(traj, score + guide)", "execute(traj.actions[:chunk])"],
    },
    "goal-conditioned": {
        "title": "Goal-conditioned policy + HER",
        "intuition": "策略多输入一个 goal；失败轨迹也能重标成“到达了实际终点”的成功样本。",
        "vis": "planner",
        "steps": steps([("goal", "输入目标", "policy receives state and desired goal", [0], [0, 1]), ("try", "尝试执行", "rollout may miss the original goal", [1], [2]), ("relabel", "hindsight relabel", "replace goal with achieved final state", [2], [3, 4]), ("learn", "用重标数据学习", "off-policy RL or GCSL trains on relabeled success", [2, 3], [5])]),
        "formulas": [(r"\pi(a|s,g)", "goal-conditioned policy。"), (r"(s_t,a_t,s_{t+1},g)\rightarrow(s_t,a_t,s_{t+1},g'=s_T)", "HER relabel。"), (r"r_g(s)=\mathbb{1}[d(s,g)<\epsilon]", "目标奖励。")],
        "code": ["# HER", "episode = rollout(pi, goal=g)", "for transition in episode:", "    replay.add(transition, goal=g)", "    g2 = episode.final_state", "    replay.add(relabel(transition, goal=g2))", "update_goal_conditioned_policy(replay)"],
    },
    "hrl": {
        "title": "Hierarchical RL / options",
        "intuition": "高层少量决策子目标或 skill，低层在多个时间步内执行细动作。",
        "vis": "planner",
        "steps": steps([("option", "选择 option", "high-level policy chooses skill/subgoal", [0], [0, 1]), ("worker", "低层执行", "low-level policy acts conditioned on option", [1], [2]), ("terminate", "终止条件", "option ends when beta(s) fires or horizon expires", [2], [3]), ("credit", "高层更新", "assign long-horizon credit to options", [2, 3], [4, 5])]),
        "formulas": [(r"z_k\sim\pi_{hi}(z|s_{kT})", "高层选择子目标/skill。"), (r"a_t\sim\pi_{lo}(a|s_t,z_k)", "低层执行。"), (r"\beta_z(s)\in[0,1]", "option termination。")],
        "code": ["# hierarchical policy", "z = high_level(state)", "while not terminate(z, state):", "    action = low_level(state, z)", "    state = env.step(action)", "update_low_level()", "update_high_level()"],
    },
    "meta-learning": {
        "title": "Meta-learning policies",
        "intuition": "训练目标不是某个任务最高分，而是学一个初始化/隐变量，让新任务少量数据就能适应。",
        "vis": "prob",
        "steps": steps([("tasks", "采样任务分布", "train over many related tasks", [0], [0]), ("inner", "内循环适应", "adapt parameters or infer context latent on support data", [1], [1, 2]), ("outer", "外循环优化", "optimize initial parameters for post-adaptation performance", [2], [3, 4]), ("test", "新任务 few-shot", "quickly specialize on a new task", [2, 3], [5])]),
        "formulas": [(r"\theta'_T=\theta-\alpha\nabla_\theta\mathcal{L}_T(\theta)", "MAML 内循环。"), (r"\min_\theta\mathbb{E}_{T\sim p(T)}\mathcal{L}_T(\theta'_T)", "外循环优化适应后的 loss。"), (r"z\sim q_\phi(z|c_{1:n})", "PEARL/RL2 风格任务隐变量。")],
        "code": ["# MAML-style meta RL", "for task in tasks:", "    theta_fast = theta - alpha*grad(loss(task.support, theta))", "    outer_loss += loss(task.query, theta_fast)", "theta = theta - beta*grad(outer_loss)", "adapt_to_new_task(few_shots)"],
    },
    "llm-planner": {
        "title": "LLM planner / Code-as-Policies",
        "intuition": "LLM 不直接输出电机动作，而是把语言任务分解成可调用 skill/API/代码。",
        "vis": "planner",
        "steps": steps([("prompt", "构造 prompt", "instruction, scene, and API docs enter the LLM", [0], [0, 1]), ("program", "生成计划/代码", "LLM outputs symbolic plan or Python", [1], [2]), ("ground", "技能可行性检查", "affordance/value functions ground the plan", [2], [3, 4]), ("execute", "调用 skill", "robot executes verified primitives and replans on feedback", [2, 3], [5])]),
        "formulas": [(r"\text{plan}=\mathrm{LLM}(\ell,\mathrm{scene},\mathrm{API})", "LLM 生成计划。"), (r"\arg\max_i p_{LLM}(s_i|\ell)\cdot V_i(o)", "SayCan: 语言概率 × 可执行价值。"), (r"\mathrm{exec}(\text{program})\rightarrow a_{1:T}", "程序调用底层技能。")],
        "code": ["# LLM planner", "prompt = build_prompt(instruction, scene, skill_api)", "program = llm.generate(prompt)", "for call in parse_calls(program):", "    if affordance(call, scene) > tau:", "        skills[call.name](call.args)", "    else: replan()"],
    },
    "vlm-affordance": {
        "title": "VLM affordance / spatial programs",
        "intuition": "让 VLM 输出哪里能抓、哪里该放、哪些 3D 约束成立，再交给轨迹优化器。",
        "vis": "planner",
        "steps": steps([("scene", "读图像/点云", "VLM grounds language in the scene", [0], [0]), ("cost", "生成空间代价", "produce keypoints, voxel costs, or constraints", [1], [1, 2]), ("opt", "轨迹优化", "classical solver finds a feasible path", [2], [3, 4]), ("feedback", "视觉反馈", "re-run grounding if scene changes", [3], [5])]),
        "formulas": [(r"\mathcal{C}(x)=\mathrm{VLM}(\ell,o,x)", "VLM 定义空间代价。"), (r"\tau^*=\arg\min_\tau\int\mathcal{C}(\tau(t))dt", "轨迹优化。"), (r"g_i(q_t)\le 0,\quad h_j(q_t)=0", "几何/关系约束。")],
        "code": ["# VLM affordance planner", "objects = vlm.detect(scene, instruction)", "cost = vlm.value_map(scene, instruction)", "constraints = keypoint_constraints(objects)", "traj = trajopt(cost, constraints, robot_model)", "controller.follow(traj)"],
    },
    "pid-control": {
        "title": "PID feedback controller",
        "intuition": "只看目标误差：当前误差、累计误差、误差变化率三项合成控制量。",
        "vis": "planner",
        "steps": steps([("error", "测误差", "compare target and current value", [0], [0]), ("p", "P 项", "push proportional to current error", [1], [1]), ("i", "I 项", "integrate persistent bias", [2], [2]), ("d", "D 项", "damp fast changes", [3], [3]), ("apply", "施加控制", "send control to actuator and repeat", [0, 1, 2, 3], [4])]),
        "formulas": [(r"e(t)=r(t)-y(t)", "误差。"), (r"u_P=K_pe(t)", "比例项。"), (r"u_I=K_i\int_0^t e(\tau)d\tau", "积分项。"), (r"u_D=K_d\dot e(t)", "微分项。"), (r"u=u_P+u_I+u_D", "总控制量。")],
        "code": ["# PID loop", "e = target - measured", "integral += e * dt", "derivative = (e - prev_e) / dt", "u = Kp*e + Ki*integral + Kd*derivative", "actuator.send(u)", "prev_e = e"],
    },
    "lqr": {
        "title": "LQR optimal linear feedback",
        "intuition": "在线性动力学和二次代价下，最优策略就是一个固定反馈矩阵 u=-Kx。",
        "vis": "planner",
        "steps": steps([("linear", "线性模型", "assume x_{t+1}=Ax_t+Bu_t", [0], [0]), ("cost", "二次代价", "penalize state error and control effort", [1], [1]), ("riccati", "解 Riccati", "solve backward / algebraic Riccati equation", [2], [2, 3]), ("feedback", "反馈控制", "apply u=-Kx", [3], [4])]),
        "formulas": [(r"x_{t+1}=Ax_t+Bu_t", "线性动力学。"), (r"J=\sum_t x_t^\top Qx_t+u_t^\top Ru_t", "二次代价。"), (r"P=A^\top PA-A^\top PB(R+B^\top PB)^{-1}B^\top PA+Q", "Riccati equation。"), (r"u=-Kx,\quad K=(R+B^\top PB)^{-1}B^\top PA", "最优反馈。")],
        "code": ["# LQR", "A, B = linearize(dynamics)", "P = solve_riccati(A, B, Q, R)", "K = inv(R + B.T@P@B) @ B.T@P@A", "while control:", "    u = -K @ x", "    x = step(u)"],
    },
    "classical-mpc": {
        "title": "Classical MPC / trajectory optimization",
        "intuition": "用已知模型在约束下优化未来 H 步，只执行第一步，然后滚动重算。",
        "vis": "planner",
        "steps": steps([("model", "已知模型", "use physics/kinematics model", [0], [0]), ("horizon", "有限时域优化", "optimize controls over a receding horizon", [1], [1, 2]), ("constraints", "处理约束", "respect obstacles, torque limits, contacts", [2], [3]), ("recede", "执行第一步", "execute first control and solve again next cycle", [3], [4, 5])]),
        "formulas": [(r"\min_{u_{0:H}}\sum_{t=0}^H\ell(x_t,u_t)", "优化时域代价。"), (r"x_{t+1}=f(x_t,u_t)", "模型约束。"), (r"g(x_t,u_t)\le 0", "安全/物理约束。"), (r"u_t=[u^*_{0:H}]_0", "只执行第一步。")],
        "code": ["# MPC", "while running:", "    problem = build_optimization(x, model, constraints)", "    U = solve(problem, horizon=H)", "    execute(U[0])", "    x = observe()"],
    },
    "motion-planning": {
        "title": "Motion planning search",
        "intuition": "在 configuration space 里搜索一条 collision-free path，之后再由控制器跟踪。",
        "vis": "planner",
        "steps": steps([("space", "构造 C-space", "represent robot configuration and obstacles", [0], [0]), ("sample", "采样/扩展", "RRT/PRM samples feasible configurations", [1], [1, 2]), ("connect", "碰撞检测连接", "connect nodes only through free space", [2], [3]), ("path", "输出路径", "extract and smooth path to goal", [3], [4, 5])]),
        "formulas": [(r"q\in\mathcal{C},\quad q\in\mathcal{C}_{free}", "配置空间与自由空间。"), (r"\text{find }\tau:q_s\rightarrow q_g,\;\tau(t)\in\mathcal{C}_{free}", "路径可行性。"), (r"q_{new}=\mathrm{steer}(q_{near},q_{rand})", "RRT 扩展。")],
        "code": ["# RRT sketch", "tree = [q_start]", "for i in range(N):", "    q_rand = sample_free()", "    q_near = nearest(tree, q_rand)", "    q_new = steer(q_near, q_rand)", "    if collision_free(q_near, q_new): tree.add(q_new)"],
    },
    "vla-foundation": {
        "title": "VLA foundation model",
        "intuition": "把 VLM 的视觉语言语义 trunk 接上 robot action head，再用跨任务/跨 embodiment 数据微调。",
        "vis": "tokens",
        "steps": steps([("pretrain", "VLM 预训练", "web-scale image-text/video-text gives semantic priors", [0], [0]), ("robot", "机器人数据对齐", "teleop and multi-embodiment data teach actions", [1], [1, 2]), ("head", "动作头", "token/diffusion/flow head maps hidden state to robot commands", [2], [3, 4]), ("deploy", "语言控制", "instruction-conditioned policy acts in closed loop", [3], [5])]),
        "formulas": [(r"h=\mathrm{VLM}_\theta(o,\ell)", "语义 trunk。"), (r"a\sim p_\psi(a|h)", "动作头。"), (r"\mathcal{D}=\bigcup_e\mathcal{D}_e", "跨 embodiment 数据。")],
        "code": ["# VLA policy", "h = vlm(image, instruction)", "if action_head == 'token': action = decode_tokens(h)", "elif action_head == 'flow': action = flow_sample(h)", "elif action_head == 'diffusion': action = diffusion_sample(h)", "robot.execute(action)"],
    },
    "vla-rl": {
        "title": "RL-finetuned VLA",
        "intuition": "先用大规模 BC 得到可用 VLA，再用奖励/偏好/环境反馈修正它的行为。",
        "vis": "actorcritic",
        "steps": steps([("init", "BC 初始化", "start from a capable imitation VLA", [0], [0]), ("reward", "收集反馈", "task reward, preference, success detector, or critic", [1], [1, 2]), ("update", "受约束微调", "improve reward while staying close to base policy", [2], [3, 4]), ("eval", "真实评测", "deploy cautiously with safety and rollback", [3], [5])]),
        "formulas": [(r"\pi_0=\mathrm{BC}(\mathcal{D}_{robot})", "先模仿。"), (r"\max_\pi \mathbb{E}R(\tau)-\beta D_{KL}(\pi||\pi_0)", "带 KL/行为约束的强化。"), (r"A(s,a)=Q(s,a)-V(s)", "用优势指导更新。")],
        "code": ["# RL fine-tune VLA", "pi = load_bc_vla()", "rollouts = collect(pi)", "reward = success_or_preference_model(rollouts)", "loss = rl_objective(pi, reward) + beta*kl(pi, pi_base)", "safe_update(pi, loss)"],
    },
    "domain-randomization": {
        "title": "Domain randomization",
        "intuition": "不是让 sim 完全真实，而是把 sim 随机到足够宽，让真实世界只是训练分布中的一个样本。",
        "vis": "prob",
        "steps": steps([("range", "设随机范围", "choose physics, texture, lighting, latency ranges", [0], [0]), ("sample", "每集采样域", "randomize simulator parameters per rollout", [1], [1, 2]), ("train", "训练鲁棒策略", "policy maximizes expected return over domains", [2], [3]), ("real", "真实迁移", "deploy zero-shot or with small adaptation", [3], [4])]),
        "formulas": [(r"\xi\sim p(\xi)", "随机物理/视觉参数。"), (r"\max_\pi\mathbb{E}_{\xi,\tau\sim\pi,\mathrm{sim}_\xi}\sum_t r_t", "跨域期望回报。"), (r"\xi_{real}\in\mathrm{support}(p(\xi))", "希望真实域落在随机范围内。")],
        "code": ["# domain randomization", "for episode in train:", "    xi = sample(domain_ranges)", "    env.set_params(xi)", "    traj = rollout(policy, env)", "    update(policy, traj.reward)", "deploy(policy, real_robot)"],
    },
    "sim2real-adapt": {
        "title": "Sim-to-real adaptation / RMA",
        "intuition": "策略在线估计真实环境的隐变量，比如摩擦、负载、地形，再据此调整动作。",
        "vis": "actorcritic",
        "steps": steps([("base", "sim 训练 base policy", "train with privileged simulator parameters", [0], [0, 1]), ("adapt", "训练 adaptation module", "infer latent environment from recent history", [1], [2, 3]), ("real", "真实在线估计", "estimate context without privileged variables", [2], [4]), ("control", "条件控制", "policy acts conditioned on inferred context", [3], [5])]),
        "formulas": [(r"z=\phi_\eta(o_{t-k:t},a_{t-k:t})", "从历史估计环境 latent。"), (r"a_t=\pi_\theta(o_t,z)", "策略条件化到 latent。"), (r"\min_\eta\|z-\xi_{priv}\|^2", "用 sim privileged labels 监督 adaptation。")],
        "code": ["# RMA-style adaptation", "xi = sim.privileged_params()", "z_target = encoder_privileged(xi)", "z = adaptation(history)", "action = policy(obs, z)", "loss = rl_loss + mse(z, z_target)", "deploy: z = adaptation(real_history)"],
    },
    "visual-pretrain": {
        "title": "Visual representation pretraining",
        "intuition": "先从视频/图像学一个通用视觉 encoder，再把它冻结或微调用于小数据机器人策略。",
        "vis": "tokens",
        "steps": steps([("video", "收集无标签视频", "human/ego/web videos provide visual priors", [0], [0]), ("ssl", "自监督目标", "contrastive, masked, temporal, or value-aware pretraining", [1], [1, 2]), ("freeze", "接机器人头", "policy head trains on top of representation", [2], [3, 4]), ("transfer", "迁移到任务", "better sample efficiency and generalization", [3], [5])]),
        "formulas": [(r"\phi^*=\arg\min_\phi\mathcal{L}_{SSL}(\phi;\mathcal{D}_{video})", "视觉预训练。"), (r"a\sim\pi_\psi(a|\phi(o))", "机器人策略使用 encoder。"), (r"\mathcal{L}_{BC}(\psi)=\|a-\pi_\psi(\phi(o))\|^2", "小数据行为克隆。")],
        "code": ["# visual pretraining", "phi = train_ssl(video_dataset)", "for batch in robot_demos:", "    feat = phi(batch.obs).detach()", "    pred = policy_head(feat)", "    loss = bc_loss(pred, batch.action)", "update(policy_head)"],
    },
    "latent-action": {
        "title": "Latent action pretraining",
        "intuition": "先从无动作视频里离散化“画面怎么变”的 latent action，再用少量机器人动作把 latent 解码成电机命令。",
        "vis": "world",
        "steps": steps([("pairs", "视频帧对", "use adjacent frames without robot action labels", [0], [0]), ("infer", "推断 latent action", "VQ/inverse model compresses transition into latent code", [1], [1, 2]), ("predict", "用 latent 预测下一帧", "latent action must explain visual change", [2], [3]), ("decode", "少量标注解码", "map latent actions to real robot commands", [3], [4, 5])]),
        "formulas": [(r"z_t=\mathrm{VQ}(o_t,o_{t+1})", "从帧变化得到 latent action。"), (r"p_\theta(o_{t+1}|o_t,z_t)", "latent 必须能预测下一帧。"), (r"a_t=h_\psi(z_t,o_t)", "少量动作标注解码。")],
        "code": ["# latent action pretraining", "z = vq_inverse(frame_t, frame_tp1)", "pred_next = video_model(frame_t, z)", "loss = recon(pred_next, frame_tp1) + vq_loss(z)", "decoder = fit_action_decoder(z_labeled, robot_action)", "robot_action = decoder(z, obs)"],
    },
}


def build_data():
    """Collect lab records for every node of the three landscapes.

    The robot, VLM and world-model landscapes are loaded from their generator
    modules.  Robot nodes use their hand-written entry in ``NODE_LESSONS``;
    nodes from the other two landscapes get a generated lesson and an id
    prefixed with the landscape key.

    Returns:
        A dict with ``"labs"`` (one record per node, in landscape order) and
        ``"sources"`` (``SOURCE_LINKS`` as ``{"title", "url"}`` dicts).

    Raises:
        RuntimeError: if a robot paradigm id has no ``NODE_LESSONS`` entry.
    """
    robot_data = gen_landscape.build_data()
    vlm_data = gen_vlm.build_data()
    world_data = gen_worldmodel.build_data()

    # Fail before building anything when a robot node lacks a lesson.
    uncovered = []
    for node in robot_data["paradigms"]:
        if node["id"] not in NODE_LESSONS:
            uncovered.append(node["id"])
    if uncovered:
        raise RuntimeError("Missing node-specific Algorithm Lab lessons: %s" % ", ".join(uncovered))

    sections = (
        ("robot", "Robot Learning", robot_data),
        ("vlm", "VLM", vlm_data),
        ("world", "World Models", world_data),
    )
    records = []
    for key, label, data in sections:
        families = {entry["key"]: entry for entry in data["families"]}
        is_robot = key == "robot"
        for source_node in data["paradigms"]:
            # Work on a shallow copy so the landscape's own data is untouched.
            node = dict(source_node)
            original_id = node["id"]
            node.update(
                id=original_id if is_robot else f"{key}-{original_id}",
                raw_id=original_id,
                landscape=key,
                landscapeLabel=label,
            )
            if is_robot:
                lesson = NODE_LESSONS[original_id]
            else:
                lesson = make_external_lesson(node, key)
            records.append(build_lab_record(node, families, lesson))

    sources = [{"title": title, "url": url} for title, url in SOURCE_LINKS]
    return {"labs": records, "sources": sources}


def build_lab_record(p, fam, lesson):
    """Assemble the JSON-ready lab record for a single landscape node.

    Args:
        p: Node dict.  ``"id"``, ``"name"`` and ``"family"`` are required;
            ``"landscape"``, ``"landscapeLabel"``, ``"raw_id"``, ``"short"``,
            ``"math"``, ``"tagline"``, ``"simple"``, ``"when"``, ``"papers"``
            and ``"learn"`` are read when present.
        fam: Mapping from family key to a family dict; its ``"equation"``,
            ``"label"`` and ``"color"`` entries are used when available.
        lesson: Lesson dict.  ``"steps"`` (5-tuples), ``"formulas"``
            (``(tex, gloss)`` pairs) and ``"code"`` are required; ``"title"``,
            ``"intuition"`` and ``"vis"`` fall back to the archetype's values.

    Returns:
        A dict of plain values describing the node, its steps, formulas and
        code lines.

    Raises:
        KeyError: if a required key is missing from ``p``, ``lesson`` or the
            selected archetype, or the archetype key is not in ``ARCHETYPES``.
    """
    arch_key = ID_TO_ARCHETYPE.get(p["id"], "planner")
    # Nodes from the non-robot landscapes always use a fixed archetype.
    landscape = p.get("landscape")
    if landscape == "vlm":
        arch_key = "token"
    elif landscape == "world":
        arch_key = "world"
    arch = ARCHETYPES[arch_key]

    # Copy so the inserts below never mutate the lesson's own list.
    formulas = list(lesson["formulas"])
    if p.get("math"):
        formulas.insert(0, (p["math"], "这个节点在 landscape 中的原始目标/方程。"))
    family_info = fam.get(p["family"], {})
    family_eq = family_info.get("equation")
    if family_eq:
        # Index 1 puts the family equation right after the node's math row
        # when there is one; otherwise it lands after the first lesson formula.
        formulas.insert(1, (family_eq, "所属 family 的共享目标。"))
    # NOTE(review): the rows prepended above shift the positions of the
    # lesson's own formulas, while the step ``fx`` index lists are passed
    # through unchanged below — confirm they are authored against the final list.

    return {
        "id": p["id"],
        "rawId": p.get("raw_id", p["id"]),
        "landscape": p.get("landscape", "robot"),
        "landscapeLabel": p.get("landscapeLabel", "Robot Learning"),
        "name": p["name"],
        "short": p.get("short", p["name"]),
        "family": p["family"],
        "familyLabel": family_info.get("label", p["family"]),
        "color": family_info.get("color", "#8b5cf6"),
        "tagline": p.get("tagline", ""),
        "simple": p.get("simple", ""),
        "when": p.get("when", ""),
        "papers": p.get("papers", []),
        "learn": p.get("learn"),
        "arch": arch_key,
        "archTitle": lesson.get("title", arch["title"]),
        "intuition": lesson.get("intuition", arch["intuition"]),
        "steps": [
            {"key": k, "t": t, "cap": c, "fx": fx, "code": code}
            for (k, t, c, fx, code) in lesson["steps"]
        ],
        "formulas": [{"tex": tex, "gl": gl} for tex, gl in formulas],
        "code": lesson["code"],
        "vis": lesson.get("vis", arch["vis"]),
    }


def make_external_lesson(p, landscape_key):
    """Return a generated lesson for a node outside the robot landscape.

    ``"vlm"`` selects the VLM lesson builder; every other landscape key is
    handled by the world-model lesson builder.
    """
    builder = make_vlm_lesson if landscape_key == "vlm" else make_world_lesson
    return builder(p)


def make_vlm_lesson(p):
    """Build the generic five-step lesson used for a VLM landscape node.

    Args:
        p: Node dict.  ``"name"`` is required; ``"short"``, ``"family"``,
            ``"mapping"`` and ``"math"`` are used when present.

    Returns:
        A lesson dict with ``"title"``, ``"intuition"``, ``"vis"``,
        ``"steps"``, ``"formulas"`` and ``"code"``.
    """
    full_name = p["name"]
    label = p.get("short", full_name)

    # Two families get their own visual; everything else uses "tokens".
    vis_by_family = {"Grounding": "planner", "Video": "world"}
    vis = vis_by_family.get(p.get("family", "VLM"), "tokens")

    interface = p.get("mapping", "image + text")
    # Each entry: key, step title, caption, then two index lists
    # (presumably formula rows and code lines — see the `steps` helper).
    lesson_steps = steps([
        ("input", "输入图像/文本", f"use the node's input interface: {interface}", [0], [0, 1]),
        ("encode", "视觉编码/切块", "turn pixels, regions, frames, or patches into visual tokens/features", [1], [2]),
        ("align", "对齐或桥接", "contrastive loss, projector, Q-Former, cross-attention, or native fusion connects vision to language", [2], [3, 4]),
        ("reason", "语言侧推理/生成", "the language model predicts answer tokens, grounded boxes, or multimodal tokens", [3], [5]),
        ("output", "输出与部署", "use the result for retrieval, chat, grounding, OCR, video QA, or agent perception", [4], [6]),
    ])

    # The node's own equation leads when it has one; otherwise a generic form.
    lead_formula = p.get("math") or r"H_v=\mathrm{VisionEncoder}(I),\quad y\sim p_\theta(y|H_v,x)"
    formulas = [
        (lead_formula, "该 VLM 节点的核心训练/推理方程。"),
        (r"H_v=\phi_v(I)\quad\text{or}\quad H_v=\phi_v(f_{1:T})", "图像/视频先变成视觉 token。"),
        (r"H'_v=\mathrm{Bridge}(H_v)\in\mathbb{R}^{m\times d_{LLM}}", "桥接层把视觉特征变成 LLM 可读 token。"),
        (r"p_\theta(y_t|y_{<t},H'_v,x)", "语言模型条件生成答案。"),
        (r"\text{output}\in\{\text{text},\text{box},\text{caption},\text{retrieval},\text{action context}\}", "不同 VLM 分支输出不同对象。"),
    ]

    code_lines = [
        f"# {label}: VLM mechanism",
        "visual = vision_encoder(image_or_video)",
        "tokens = bridge_or_projector(visual)",
        "prompt = tokenize(instruction)",
        "hidden = llm(prefix=tokens, text=prompt)",
        "output = decode(hidden)",
        "# inspect: where do visual tokens enter, and what is supervised?",
    ]

    return {
        "title": f"{label} visual-language mechanism",
        "intuition": f"{full_name} 的核心是把视觉信号和语言 token 对齐、桥接或融合；关键要看视觉信息如何进入语言模型，以及输出是文本、区域还是多模态 token。",
        "vis": vis,
        "steps": lesson_steps,
        "formulas": formulas,
        "code": code_lines,
    }


def make_world_lesson(p):
    """Build the generic five-step lesson used for a world-model landscape node.

    Args:
        p: Node dict.  ``"name"`` is required; ``"short"``, ``"mapping"`` and
            ``"math"`` are used when present.

    Returns:
        A lesson dict with ``"title"``, ``"intuition"``, ``"vis"``,
        ``"steps"``, ``"formulas"`` and ``"code"``.
    """
    full_name = p["name"]
    label = p.get("short", full_name)

    context = p.get("mapping", "past observations and actions")
    # Each entry: key, step title, caption, then two index lists
    # (presumably formula rows and code lines — see the `steps` helper).
    lesson_steps = steps([
        ("observe", "观测历史", f"use context: {context}", [0], [0, 1]),
        ("state", "状态/Token 表示", "compress the world into latent states, discrete tokens, features, or 3D state", [1], [2]),
        ("predict", "预测未来", "roll forward dynamics, tokens, video, value-equivalent state, or JEPA features", [2], [3, 4]),
        ("condition", "条件与控制", "actions, text, goals, masks, or rewards determine controllability", [3], [5]),
        ("use", "用于规划/训练/生成", "evaluate candidate actions, train in imagination, generate data, or simulate deployment", [4], [6]),
    ])

    # The node's own equation leads when it has one; otherwise a generic form.
    lead_formula = p.get("math") or r"p_\theta(z_{t+1:t+H}|z_{\le t},a_{t:t+H},c)"
    formulas = [
        (lead_formula, "该 world-model 节点的核心方程。"),
        (r"z_t=\phi(o_{\le t})", "观测被压成状态或 token。"),
        (r"\hat z_{t+1}\sim p_\theta(\cdot|z_t,a_t,c)", "模型预测下一步。"),
        (r"\hat o_{t+1:t+H}=\psi(\hat z_{t+1:t+H})", "有些模型再解码成视频/图像。"),
        (r"a^*=\arg\max_a R(\hat z_{t+1:t+H})", "如果可控，就能用于规划。"),
    ]

    code_lines = [
        f"# {label}: world-model mechanism",
        "state = encode(observation_history)",
        "condition = build_condition(actions, text, goal)",
        "future = world_model.predict(state, condition)",
        "score = evaluator(future) if planning else None",
        "action = select_action(score) if planning else None",
        "# inspect: are actions inputs, outputs, or absent?",
    ]

    return {
        "title": f"{label} world-model mechanism",
        "intuition": f"{full_name} 的核心不是一个静态识别器，而是学习世界如何随时间、动作或条件变化；关键要分清它预测 latent、token、video、value 还是 action。",
        "vis": "world",
        "steps": lesson_steps,
        "formulas": formulas,
        "code": code_lines,
    }


TEMPLATE = r"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<link rel="icon" href="data:,"/>
<title>Robot Learning Algorithm Lab</title>
<script>
window.MathJax = { tex: { displayMath: [['\\[','\\]']], inlineMath: [['\\(','\\)']] },
  options: { skipHtmlTags: ['script','noscript','style','textarea','pre'] } };
</script>
<script async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<style>
  :root{ --bg:#070b16; --card:#0b1326; --card2:#101a31; --ink:#e7ecf6; --dim:#93a0bd;
    --line:#243554; --gold:#fbbf24; --blue:#58a6ff; --green:#34d399; --rose:#fb7185; }
  *{ box-sizing:border-box; }
  html,body{ margin:0; min-height:100%; overflow-x:hidden; background:var(--bg); color:var(--ink);
    font-family:Inter,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif; letter-spacing:0; }
  button,select,input{ font:inherit; }
  a{ color:#7dd3fc; text-decoration:none; } a:hover{ text-decoration:underline; }
  .wrap{ max-width:1440px; margin:0 auto; padding:16px 18px 34px; }
  header{ display:flex; align-items:flex-end; gap:14px; flex-wrap:wrap; margin-bottom:12px; }
  h1{ margin:0; font-size:22px; line-height:1.12; font-weight:850;
    background:linear-gradient(90deg,#a78bfa,#67e8f9); -webkit-background-clip:text; background-clip:text; color:transparent; }
  .sub{ color:var(--dim); font-size:13px; max-width:820px; }
  .toolbar{ display:grid; grid-template-columns:minmax(260px,420px) 1fr; gap:12px; margin-bottom:12px; align-items:start; }
  .picker{ display:flex; gap:8px; align-items:center; }
  .picker input,.picker select{ height:38px; border:1px solid var(--line); border-radius:9px; background:#081021; color:var(--ink); padding:7px 9px; }
  .picker input{ flex:1; min-width:0; }
  .picker select{ flex:1.2; min-width:0; }
  .pills{ display:flex; gap:6px; flex-wrap:wrap; justify-content:flex-end; }
  .pill{ border:1px solid #2b3f68; border-radius:999px; background:#0d1730; color:#dbeafe; padding:6px 9px; font-size:12px; cursor:pointer; }
  .pill.on{ background:linear-gradient(90deg,#7c3aed,#0ea5e9); color:white; border-color:transparent; }
  .stepper{ display:grid; grid-template-columns:repeat(5,1fr); gap:6px; margin:10px 0 14px; }
  .stepper button{ min-width:0; border:1px solid var(--line); border-radius:9px; background:var(--card2); color:var(--dim);
    padding:8px 6px; cursor:pointer; white-space:nowrap; overflow:hidden; text-overflow:ellipsis; transition:.18s; }
  .stepper button.on{ background:var(--gold); border-color:var(--gold); color:#07111f; font-weight:800; }
  .main{ display:grid; grid-template-columns:minmax(0,1.08fr) minmax(360px,.92fr); gap:16px; align-items:start; }
  .left,.right{ display:flex; flex-direction:column; gap:14px; }
  .panel{ background:linear-gradient(180deg,#0c1326,#091020); border:1px solid #16203a; border-radius:12px; padding:14px 16px; }
  .panel h2{ margin:0 0 8px; color:var(--gold); font-size:12px; letter-spacing:.12em; text-transform:uppercase; }
  .titlebar{ display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
  .name{ font-size:22px; font-weight:850; margin:0 0 3px; }
  .fam{ color:var(--c); font-size:12px; font-weight:800; letter-spacing:.08em; text-transform:uppercase; }
  .tagline{ color:#cbd5e1; font-size:13.5px; line-height:1.45; }
  .arch{ color:var(--gold); font-size:12px; font-weight:800; border:1px solid #5a4617; background:#171407; border-radius:999px; padding:5px 9px; white-space:nowrap; }
  canvas{ width:100%; border-radius:10px; border:1px solid var(--line); background:#07101f; cursor:pointer; display:block; }
  .cap{ color:#dbe4f7; min-height:52px; line-height:1.5; font-size:13.5px; margin-top:9px; }
  .trace{ color:#cbd5e1; font-size:13px; line-height:1.55; }
  .trace b{ color:var(--gold); }
  .fx{ display:flex; flex-direction:column; gap:7px; }
  .fx-row{ display:grid; grid-template-columns:24px 1fr; gap:9px; align-items:start; border:1px solid transparent;
    background:#0f1729; border-radius:8px; padding:8px 10px; opacity:.48; cursor:pointer; transition:.18s; }
  .fx-row.on{ opacity:1; border-color:var(--gold); background:#171407; }
  .idx{ color:var(--dim); font-size:11px; padding-top:3px; }
  .math{ overflow-x:auto; font-size:12.5px; }
  .gl{ color:var(--dim); font-size:12px; line-height:1.45; margin-top:4px; }
  pre.code{ margin:0; background:#07101f; border:1px solid var(--line); border-radius:10px; padding:9px 0;
    overflow:auto; font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace; font-size:12.4px; line-height:1.75; }
  .cl{ display:block; white-space:pre; padding:0 12px 0 14px; border-left:3px solid transparent; color:#dbeafe; }
  .cl.on{ background:rgba(251,191,36,.12); border-left-color:var(--gold); }
  .cm{ color:#64748b; font-style:italic; } .kw{ color:#f59e0b; } .fn{ color:#7dd3fc; }
  .refs{ display:grid; grid-template-columns:repeat(2,minmax(0,1fr)); gap:7px; }
  .refs a,.paper{ border:1px solid #223252; border-radius:8px; padding:7px 8px; background:#0d1730; font-size:12px; color:#cbd5e1; }
  .paper{ color:#a8b4cf; }
  .ctrl{ display:flex; justify-content:center; gap:10px; margin-top:12px; }
  .ctrl button{ border:1px solid var(--line); border-radius:8px; background:var(--card2); color:var(--ink); padding:7px 18px; cursor:pointer; }
  .ctrl button:hover{ border-color:var(--gold); color:var(--gold); }
  @media(max-width:980px){
    .toolbar,.main{ grid-template-columns:1fr; }
    .pills{ justify-content:flex-start; }
    .stepper{ grid-template-columns:repeat(2,1fr); }
    .refs{ grid-template-columns:1fr; }
  }
</style>
</head>
<body>
<div class="wrap">
  <header>
    <h1>Robot Learning Algorithm Lab</h1>
    <div class="sub">每个 landscape 节点都有同步解释器：动画、公式、代码、直觉说明一起随步骤切换。第一版覆盖全部节点，后续可以逐个节点继续精修到论文级别。</div>
  </header>
  <div class="toolbar">
    <div class="picker">
      <input id="q" placeholder="Search paradigm / family / paper"/>
      <select id="sel"></select>
    </div>
    <div class="pills" id="families"></div>
  </div>
  <section class="panel" id="intro"></section>
  <div class="stepper" id="stepper"></div>
  <main class="main">
    <div class="left">
      <div class="panel">
        <h2>Animation</h2>
        <canvas id="cv" width="1200" height="760"></canvas>
        <div class="cap" id="cap"></div>
        <div class="ctrl">
          <button id="prev">← Prev</button>
          <button id="play">Pause</button>
          <button id="next">Next →</button>
        </div>
      </div>
      <div class="panel">
        <h2>Trace</h2>
        <div class="trace" id="trace"></div>
      </div>
    </div>
    <div class="right">
      <div class="panel">
        <h2>Formula</h2>
        <div class="fx" id="fx"></div>
      </div>
      <div class="panel">
        <h2>Code</h2>
        <pre class="code" id="code"></pre>
      </div>
      <div class="panel">
        <h2>Evidence / tutorials</h2>
        <div class="refs" id="refs"></div>
      </div>
    </div>
  </main>
</div>
<script>
const D = __DATA_JSON__;
let labs=D.labs, lab=labs[0], step=0, playing=true, clock=0, last=0, family='all';
const byId={}; labs.forEach(x=>byId[x.id]=x);
const cv=document.getElementById('cv'), ctx=cv.getContext('2d');
const W=600,H=380;
function esc(s){return String(s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');}
function texEsc(s){return esc(s).replace(/`/g,'&#96;');}
function fitText(text,max){ text=String(text||''); return text.length>max?text.slice(0,max-1)+'…':text; }
function init(){
  const fams=['all',...new Set(labs.map(x=>x.familyLabel))];
  document.getElementById('families').innerHTML=fams.map(f=>`<button class="pill ${f==='all'?'on':''}" data-f="${esc(f)}">${esc(f)}</button>`).join('');
  document.querySelectorAll('.pill').forEach(b=>b.onclick=()=>{family=b.dataset.f; document.querySelectorAll('.pill').forEach(x=>x.classList.toggle('on',x===b)); renderSelect();});
  document.getElementById('q').oninput=renderSelect;
  document.getElementById('sel').onchange=e=>setLab(e.target.value);
  document.getElementById('prev').onclick=()=>go(step-1,false);
  document.getElementById('next').onclick=()=>go(step+1,false);
  document.getElementById('play').onclick=()=>{playing=!playing; document.getElementById('play').textContent=playing?'Pause':'Play';};
  renderSelect(); setLab(lab.id);
  requestAnimationFrame(loop);
}
function filtered(){
  const q=document.getElementById('q').value.toLowerCase().trim();
  return labs.filter(x=>(family==='all'||x.familyLabel===family) && (!q || [x.name,x.short,x.familyLabel,x.archTitle,(x.papers||[]).join(' ')].join(' ').toLowerCase().includes(q)));
}
function renderSelect(){
  const arr=filtered();
  const sel=document.getElementById('sel');
  sel.innerHTML=arr.map(x=>`<option value="${x.id}">${esc(x.familyLabel)} · ${esc(x.short)}</option>`).join('');
  if(!arr.find(x=>x.id===lab.id) && arr[0]) setLab(arr[0].id);
  else sel.value=lab.id;
}
function setLab(id){
  lab=byId[id]||labs[0]; step=0; clock=0;
  document.documentElement.style.setProperty('--c', lab.color);
  document.getElementById('sel').value=lab.id;
  renderLab();
  try{ history.replaceState(null,'','#'+lab.id); }
  catch(_){}
}
function renderLab(){
  document.getElementById('intro').innerHTML=
    `<div class="titlebar"><div><div class="fam" style="color:${lab.color}">${esc(lab.familyLabel)}</div><div class="name">${esc(lab.name)}</div><div class="tagline">${esc(lab.tagline)}</div></div><div class="arch">${esc(lab.archTitle)}</div></div>`;
  document.getElementById('stepper').innerHTML=lab.steps.map((s,i)=>`<button data-i="${i}">${i} · ${esc(s.t)}</button>`).join('');
  document.querySelectorAll('#stepper button').forEach(b=>b.onclick=()=>go(Number(b.dataset.i),false));
  document.getElementById('fx').innerHTML=lab.formulas.map((f,i)=>`<div class="fx-row" data-i="${i}"><div class="idx">${i}</div><div><div class="math">\\[${f.tex}\\]</div><div class="gl">${esc(f.gl)}</div></div></div>`).join('');
  document.querySelectorAll('.fx-row').forEach(r=>r.onclick=()=>{ const idx=Number(r.dataset.i); const s=lab.steps.findIndex(x=>x.fx.includes(idx)); if(s>=0) go(s,false); });
  document.getElementById('code').innerHTML=lab.code.map((ln,i)=>`<span class="cl" data-i="${i}">${highlight(ln)||'&nbsp;'}</span>`).join('');
  document.querySelectorAll('.cl').forEach(r=>r.onclick=()=>{ const idx=Number(r.dataset.i); const s=lab.steps.findIndex(x=>x.code.includes(idx)); if(s>=0) go(s,false); });
  const papers=(lab.papers||[]).slice(0,5).map(p=>`<div class="paper">${esc(p)}</div>`).join('');
  const learn=lab.learn?`<a href="${lab.learn.url}" target="_blank" rel="noopener noreferrer">${esc(lab.learn.title)} ↗</a>`:'';
  document.getElementById('refs').innerHTML=learn+papers+D.sources.slice(0,6).map(s=>`<a href="${s.url}" target="_blank" rel="noopener noreferrer">${esc(s.title)} ↗</a>`).join('');
  go(0,true);
  typeset();
}
// Turn one pseudo-code line into HTML: everything from the first '#' onward is
// wrapped as a comment span; in the part before it, a fixed keyword list gets
// class "kw" and a fixed list of call names gets class "fn".
function highlight(ln){
  const asComment=text=>`<span class="cm">${esc(text)}</span>`;
  // A line that is nothing but a comment is wrapped whole, indentation included.
  if(ln.trim().startsWith('#')) return asComment(ln);
  const hashAt=ln.indexOf('#');
  const hasTail=hashAt>=0;
  const codePart=hasTail?ln.slice(0,hashAt):ln;
  const tail=hasTail?ln.slice(hashAt):'';
  // Escape first, then inject spans, so the injected markup is not escaped itself.
  const marked=esc(codePart)
    .replace(/\b(for|in|return|if|else|range|def|while)\b/g,'<span class="kw">$1</span>')
    .replace(/(encoder|denoiser|transformer|softmax|rollout|sample|execute|update|argmax|optimizer|search_or_optimize|detokenize|parse|propose)\b/g,'<span class="fn">$1</span>');
  return marked+(tail?asComment(tail):'');
}
// Show step `i` of the current lab (wrapping around at the ends), restart the
// animation clock and refresh every panel that depends on the step.
// keep=false additionally stops auto-play.
function go(i,keep=true){
  const count=lab.steps.length;
  step=(i+count)%count;
  clock=0;
  if(!keep) playing=false;
  const s=lab.steps[step];
  // Toggle the "on" class across a node list according to a per-index predicate.
  const mark=(selector,isOn)=>document.querySelectorAll(selector).forEach((node,j)=>node.classList.toggle('on',isOn(j)));
  mark('#stepper button',j=>j===step);
  document.getElementById('cap').innerHTML=`<b>${esc(s.t)}</b> · ${esc(s.cap)}<br><span style="color:var(--dim)">${esc(lab.intuition)}</span>`;
  mark('.fx-row',j=>s.fx.includes(j));
  mark('.cl',j=>s.code.includes(j));
  document.getElementById('trace').innerHTML=traceText(s);
  document.getElementById('play').textContent=playing?'Pause':'Play';
}
// Build the HTML for the trace panel: lab short name plus step title, the
// step caption, and a dimmed usage hint (lab.when, or lab.simple as fallback).
function traceText(s){
  const headline=`<b>${esc(lab.short)}</b> 当前步骤是 <b>${esc(s.t)}</b>。`;
  return [
    headline,
    esc(s.cap),
    `<span style="color:var(--dim)">什么时候用：${esc(lab.when||lab.simple)}</span>`
  ].join('<br>');
}
// Ask MathJax to typeset the page when it is loaded and exposes
// typesetPromise; rejections are deliberately ignored.
function typeset(){
  const mj=window.MathJax;
  if(!mj||!mj.typesetPromise) return;
  mj.typesetPromise().catch(()=>{});
}
// Animation frame callback: advance the clock while playing, move to the next
// step once the clock passes 6200 ms, redraw, and schedule the next frame.
function loop(now){
  last=last||now;
  // Cap the frame delta at 48 ms so a long gap between frames cannot skip far ahead.
  const elapsed=Math.min(48,now-last);
  last=now;
  if(playing){
    clock+=elapsed;
    if(clock>6200) go(step+1,true);
  }
  draw();
  requestAnimationFrame(loop);
}
// Cubic ease-in-out: 4t^3 on the first half, mirrored cubic on the second.
function ease(t){
  if(t<.5) return 4*t*t*t;
  const u=-2*t+2;
  return 1-Math.pow(u,3)/2;
}
// Smoothstep polynomial 3t^2 - 2t^3.
function smooth(t){
  const squared=t*t;
  return squared*(3-2*t);
}
// Limit t to the closed interval [a, b]; defaults to [0, 1].
function clamp(t,a=0,b=1){
  const capped=Math.min(b,t);
  return Math.max(a,capped);
}
// Progress in [0, 1] of a sub-interval of the repeating 2600 ms cycle: the
// interval starts at fraction `offset` of the cycle and lasts `span` of it.
function phase(offset=0,span=1){
  const cycle=(clock%2600)/2600;
  const local=(cycle-offset)/span;
  return clamp(local);
}
// Paint the canvas backdrop: reset the transform to a 2x scale, fill with a
// radial gradient, then overlay a faint 40-unit grid.
function bg(){
  ctx.setTransform(2,0,0,2,0,0);
  const glow=ctx.createRadialGradient(300,185,20,300,185,420);
  [[0,'#0b1832'],[.55,'#07101f'],[1,'#050914']].forEach(([stop,colour])=>glow.addColorStop(stop,colour));
  ctx.fillStyle=glow;
  ctx.fillRect(0,0,W,H);
  ctx.strokeStyle='rgba(148,163,184,.045)';
  ctx.lineWidth=1;
  // One straight grid rule, stroked as its own path.
  const rule=(x1,y1,x2,y2)=>{
    ctx.beginPath();
    ctx.moveTo(x1,y1);
    ctx.lineTo(x2,y2);
    ctx.stroke();
  };
  for(let gx=40;gx<W;gx+=40) rule(gx,68,gx,340);
  for(let gy=80;gy<340;gy+=40) rule(34,gy,W-34,gy);
}
// Draw the string `s` at (x, y) with the given colour, pixel size, alignment
// and font weight, using the Inter / sans-serif stack.
function txt(s,x,y,c='#93a0bd',size=12,align='center',weight='500'){
  Object.assign(ctx,{
    fillStyle:c,
    font:`${weight} ${size}px Inter, sans-serif`,
    textAlign:align
  });
  ctx.fillText(s,x,y);
}
// Fill and stroke a rounded rectangle at (x, y) of size w x h with corner
// radius r. `alpha` is applied as globalAlpha for this shape only: the canvas
// state is saved before and restored after, so the caller's alpha, fill and
// stroke settings are left untouched.
function roundRect(x,y,w,h,r,fill,stroke,alpha=1){
  ctx.save();
  ctx.globalAlpha=alpha;
  ctx.beginPath();
  if(typeof ctx.roundRect==='function'){
    ctx.roundRect(x,y,w,h,r);
  }else{
    // Older browsers have no native roundRect; without this fallback the call
    // would throw on every frame. Build the same outline from four arcTo
    // corners, keeping the radius within half of either side.
    const rr=Math.max(0,Math.min(r,w/2,h/2));
    ctx.moveTo(x+rr,y);
    ctx.arcTo(x+w,y,x+w,y+h,rr);
    ctx.arcTo(x+w,y+h,x,y+h,rr);
    ctx.arcTo(x,y+h,x,y,rr);
    ctx.arcTo(x,y,x+w,y,rr);
    ctx.closePath();
  }
  ctx.fillStyle=fill;
  ctx.fill();
  ctx.strokeStyle=stroke;
  ctx.lineWidth=1.4;
  ctx.stroke();
  ctx.restore();
}
// Stroke the first `progress` fraction of the segment (x1,y1) -> (x2,y2).
// The arrowhead at (x2,y2) is only added once progress exceeds 0.92.
function arrow(x1,y1,x2,y2,c='#334155',w=1.5,progress=1){
  const tipX=x1+(x2-x1)*progress;
  const tipY=y1+(y2-y1)*progress;
  ctx.strokeStyle=c;
  ctx.lineWidth=w;
  ctx.lineCap='round';
  ctx.beginPath();
  ctx.moveTo(x1,y1);
  ctx.lineTo(tipX,tipY);
  ctx.stroke();
  if(!(progress>.92)) return;
  // Head: a filled triangle whose two back corners sit 7 units behind the
  // end point, 0.35 rad to either side of the line direction.
  const heading=Math.atan2(y2-y1,x2-x1);
  ctx.fillStyle=c;
  ctx.beginPath();
  ctx.moveTo(x2,y2);
  for(const spread of [-.35,.35]){
    const wing=heading+spread;
    ctx.lineTo(x2-7*Math.cos(wing),y2-7*Math.sin(wing));
  }
  ctx.fill();
}
// Stroke the whole polyline `points` and return the [x, y] position that lies
// `progress` (0..1) of the way along it, measured in equal shares per segment
// rather than by segment length.
function pathArrow(points,c,w=2,progress=1){
  ctx.strokeStyle=c;
  ctx.lineWidth=w;
  ctx.lineCap='round';
  ctx.beginPath();
  const [first,...rest]=points;
  ctx.moveTo(first[0],first[1]);
  rest.forEach(pt=>ctx.lineTo(pt[0],pt[1]));
  ctx.stroke();
  const segments=points.length-1;
  const along=progress*segments;
  // Clamp to the last segment so progress = 1 lands exactly on the final point.
  const seg=Math.min(points.length-2,Math.floor(along));
  const from=points[seg], to=points[seg+1];
  const t=along-seg;
  return [from[0]+(to[0]-from[0])*t,from[1]+(to[1]-from[1])*t];
}
// Draw a labelled rounded box. When `on` is true the box glows and its border
// and label take the accent colour `c`.
function box(x,y,w,h,label,on=false,c='#fbbf24'){
  const look=on
    ? {fill:'#171407',edge:c,ink:c,weight:'750'}
    : {fill:'#0d1730',edge:'#2b3f68',ink:'#dbeafe',weight:'550'};
  if(on){
    ctx.shadowColor=c;
    ctx.shadowBlur=14;
  }
  roundRect(x,y,w,h,8,look.fill,look.edge);
  // Switch the glow off again so the label and later drawing have no shadow.
  ctx.shadowBlur=0;
  txt(label,x+w/2,y+h/2+4,look.ink,12,'center',look.weight);
}
// Draw a solid dot of radius r at (x, y) with an expanding, fading halo
// behind it; the halo repeats every 1300 ms of the animation clock.
function pulse(x,y,c,r=7){
  const t=(clock%1300)/1300;
  // An end angle of 7 rad is past a full turn, so each arc is a complete circle.
  const disc=radius=>{
    ctx.fillStyle=c;
    ctx.beginPath();
    ctx.arc(x,y,radius,0,7);
    ctx.fill();
  };
  ctx.globalAlpha=.32*(1-t);
  disc(r+22*ease(t));
  ctx.globalAlpha=1;
  disc(r);
}
// Draw a fixed-size 58 x 28 token chip with a centred label; `on` switches to
// the highlighted fill, border colour `c` and heavier text.
function chip(x,y,label,c,on){
  const fill=on?'rgba(251,191,36,.14)':'#0d1730';
  const edge=on?c:'#2b3f68';
  roundRect(x,y,58,28,7,fill,edge);
  const ink=on?c:'#cbd5e1';
  const weight=on?'750':'550';
  txt(label,x+29,y+18,ink,11,'center',weight);
}
// Draw a two-link arm anchored at (baseX, baseY): a1 is the first link's
// angle, a2 the second link's angle relative to the first. The arm is stroked
// twice (wide dark casing, then a thinner line in colour c), the tip is
// marked with a pulse, and the tip position [x, y] is returned.
function robotArm(baseX,baseY,a1,a2,c){
  const upper=58, fore=48;
  const elbowX=baseX+Math.cos(a1)*upper;
  const elbowY=baseY+Math.sin(a1)*upper;
  const tipX=elbowX+Math.cos(a1+a2)*fore;
  const tipY=elbowY+Math.sin(a1+a2)*fore;
  const limb=(colour,width)=>{
    ctx.strokeStyle=colour;
    ctx.lineWidth=width;
    ctx.beginPath();
    ctx.moveTo(baseX,baseY);
    ctx.lineTo(elbowX,elbowY);
    ctx.lineTo(tipX,tipY);
    ctx.stroke();
  };
  ctx.lineCap='round';
  limb('#334155',13);
  limb(c,5);
  pulse(tipX,tipY,c,5);
  return [tipX,tipY];
}
// Render one frame: backdrop, lab name and archetype title, then the
// visualisation selected by lab.vis (the planner view is the fallback).
function draw(){
  bg();
  const s=lab.steps[step];
  // Position within a repeating 2400 ms cycle, as a 0..1 fraction.
  const p=(clock%2400)/2400;
  const c=lab.color;
  txt(lab.name,300,28,'#e7ecf6',16);
  txt(lab.archTitle,300,48,c,12);
  switch(lab.vis){
    case 'denoise': drawDenoise(s,p,c); break;
    case 'tokens': drawTokens(s,p,c); break;
    case 'grid': drawGrid(s,p,c); break;
    case 'prob': drawProb(s,p,c); break;
    case 'actorcritic': drawAC(s,p,c); break;
    case 'world': drawWorld(s,p,c); break;
    case 'imitation': drawImitation(s,p,c); break;
    default: drawPlanner(s,p,c);
  }
}
// Visualisation for lab.vis === 'denoise': a context -> denoiser -> action
// pipeline on top, a trajectory with settling sample dots below, a robot arm
// and a goal marker. s is the current step, p the 0..1 cycle position, c the
// lab colour.
function drawDenoise(s,p,c){
  const e=ease(p);
  const active=s.key;
  const is=(...keys)=>keys.includes(active);

  // Pipeline boxes; the middle one is relabelled for the 'velocity' step.
  box(34,76,92,44,'context',is('context','pair'),c);
  box(246,76,104,44,active==='velocity'?'velocity':'denoiser',is('score','velocity'),c);
  box(474,76,92,44,'action',is('execute','chunk'),c);
  arrow(126,98,246,98,c,2,smooth(phase(.05,.25)));
  arrow(350,98,474,98,c,2,smooth(phase(.35,.25)));

  // The same polyline is stroked twice: first dashed and faint, then solid in the lab colour.
  const target=[[70,276],[130,232],[202,212],[280,218],[358,190],[438,165],[510,128]];
  const traceTarget=()=>{
    ctx.beginPath();
    ctx.moveTo(target[0][0],target[0][1]);
    for(const [tx,ty] of target.slice(1)) ctx.lineTo(tx,ty);
    ctx.stroke();
  };
  ctx.strokeStyle='rgba(148,163,184,.35)';
  ctx.lineWidth=2;
  ctx.setLineDash([5,6]);
  traceTarget();
  ctx.setLineDash([]);
  ctx.strokeStyle=c;
  ctx.lineWidth=3.2;
  traceTarget();

  // Eleven sample dots: each starts off the y = 260 line and settles onto it
  // (while growing) as the eased progress passes its staggered start.
  for(let i=0;i<11;i++){
    const t=clamp(e-i*.045);
    const dotX=70+i*42;
    const dotY=260+Math.sin(i*1.8)*48*(1-t);
    pulse(dotX,dotY,c,3+4*t);
  }

  robotArm(110,330,-1.18+.65*e,-.92+.35*Math.sin(e*Math.PI),c);
  roundRect(495,118,28,28,6,'#172033','#475569');
  txt('goal',510,112,'#fbbf24',10);
  const caption=is('integrate','sample')
    ? 'few smooth generation steps'
    : 'noise collapses into a multimodal action chunk';
  txt(caption,300,352,'#93a0bd');
}
// Visualisation for lab.vis === 'tokens': a context box feeding a row of
// eight action-token chips, a controller / dequantize box, a robot arm and a
// short chain of dots that light up in sequence.
function drawTokens(s,p,c){
  const e=ease(p);
  const is=(...keys)=>keys.includes(s.key);

  box(26,78,110,42,'VLM ctx',is('context','encode'),c);
  arrow(136,99,188,99,c,2,smooth(phase(.05,.2)));

  // Chips light up left to right, but only on the token-producing steps.
  const decoding=is('decode','lm','quant','tokens','rtg','causal');
  for(let i=0;i<8;i++){
    const lit=decoding && e>i/9;
    chip(188+i*45,84,'a'+(i+1),c,lit);
  }

  arrow(548,99,548,165,c,2,smooth(phase(.55,.2)));
  box(494,166,108,42,s.key==='detok'?'dequantize':'controller',is('detok','control','roll'),c);

  robotArm(132,318,-.95+.45*e,-.78+.55*Math.sin(e*Math.PI),c);

  // Four dots, each followed by a short arrow that stays a stub until its dot is reached.
  for(let i=0;i<4;i++){
    const cx=210+i*36;
    const reached=e>i*.18;
    ctx.fillStyle=reached?c:'#1e293b';
    ctx.beginPath();
    ctx.arc(cx,244,6,0,7);
    ctx.fill();
    arrow(cx+8,244,cx+28,244,c,1.4,reached?1:.15);
  }

  txt('tokens are not text only: they can be robot actions, returns, regions, or video codes',300,352,'#93a0bd');
}
// Visualisation for lab.vis === 'grid': a 4 x 8 grid of value cells that fade
// with Manhattan distance from the goal cell (row 0, column 7, marked with a
// star), plus a path whose dot moves toward the goal.
// s is the current step, p the 0..1 cycle position, c the lab colour.
function drawGrid(s,p,c){
  const e=ease(p), x0=128, y0=82, cell=43;
  // On these steps the values are shown at full strength; otherwise they grow with the eased progress.
  const settled=s.key==='backup'||s.key==='target'||s.key==='fit';
  // Labels, the star and the caption are drawn fully opaque.
  ctx.globalAlpha=1;
  for(let r=0;r<4;r++){
    for(let col=0;col<8;col++){
      const d=Math.abs(7-col)+Math.abs(0-r);
      const v=clamp((9-d)/9*(settled?1:e));
      const cx=x0+col*cell, cy=y0+r*cell;
      // The opacity must be passed as roundRect's alpha argument: roundRect
      // sets globalAlpha itself, so assigning ctx.globalAlpha beforehand is
      // overwritten and every cell would render fully opaque.
      roundRect(cx,cy,35,35,5,c,'#2b3f68',.18+.62*v);
      if(v>.46) txt(v.toFixed(1),cx+17,cy+22,'#07101f',9);
    }
  }
  txt('★',x0+7*cell+17,y0+22,'#fbbf24',20);
  const path=[[x0+17,y0+3*cell+17],[x0+2*cell+17,y0+3*cell+17],[x0+2*cell+17,y0+cell+17],[x0+5*cell+17,y0+cell+17],[x0+7*cell+17,y0+17]];
  const dot=pathArrow(path,c,3,e);
  pulse(dot[0],dot[1],c,6);
  txt('value propagates backward; action follows the steepest value rise',300,315,'#93a0bd');
}
// Visualisation for lab.vis === 'prob': five action arrows fan out from a
// policy node; the one labelled 'high A' thickens with the eased progress
// while the others thin out.
function drawProb(s,p,c){
  const e=ease(p);
  const x=92, y=262;
  pulse(x,y,c,7);
  txt('π(a|s)',x,y+28,c,12);
  const ends=[[190,116],[270,82],[365,125],[455,205],[522,285]];
  for(const [i,[endX,endY]] of ends.entries()){
    // Index 1 is the favoured action.
    const best=i===1;
    const width=best?2+6*e:5-3.2*e;
    arrow(x,y,endX,endY,best?c:'#64748b',width,1);
    ctx.globalAlpha=best?.95:.35;
    txt(best?'high A':'low A',endX,endY-10,best?'#fbbf24':'#94a3b8',10);
    ctx.globalAlpha=1;
  }
  ctx.strokeStyle='#334155';
  ctx.lineWidth=1;
  ctx.beginPath();
  ctx.arc(x,y,36,0,Math.PI*2);
  ctx.stroke();
  txt('policy gradient reshapes the action distribution, not a value table',300,330,'#93a0bd');
}
// Visualisation for lab.vis === 'actorcritic': Actor and Critic boxes joined
// by an action arrow and a return arrow, a dot oscillating between them, and
// a five-bar chart in which bar 3 grows while the others shrink.
function drawAC(s,p,c){
  const e=ease(p);
  const is=(...keys)=>keys.includes(s.key);
  box(62,86,124,52,'Actor π',is('act','actor'),c);
  box(414,86,124,52,'Critic Q',is('critic','td'),c);
  arrow(186,112,414,112,c,2.4,smooth(phase(.02,.28)));
  arrow(414,138,186,138,'#34d399',2.4,smooth(phase(.42,.28)));

  const dotX=300+Math.sin(e*Math.PI*2)*18;
  pulse(dotX,112,c,5);
  txt('action',300,101,c,10);
  txt('Q grade / gradient',300,160,'#34d399',11);

  // Bars share the baseline y = 262 and grow upward.
  for(let i=0;i<5;i++){
    const chosen=i===3;
    const h=20+42*(chosen?e:1-e*.45);
    roundRect(228+i*30,262-h,18,h,3,chosen?c:'#334155',chosen?c:'#475569');
  }
  txt('critic evaluates; actor moves toward high-value continuous actions',300,322,'#93a0bd');
}
// Visualisation for lab.vis === 'world': an obs -> latent world -> score ->
// plan pipeline above three candidate future paths; the first path is drawn
// prominently and carries the moving dot.
function drawWorld(s,p,c){
  const e=ease(p);
  const is=(...keys)=>keys.includes(s.key);
  box(34,78,84,42,'obs',is('encode','observe'),c);
  box(156,78,122,42,'latent world',is('predict','state'),c);
  box(326,78,92,42,'score',is('score','condition'),c);
  box(482,78,82,42,'plan',is('plan','use'),c);
  arrow(118,99,156,99,c,2,smooth(phase(.02,.18)));
  arrow(278,99,326,99,c,2,smooth(phase(.22,.18)));
  arrow(418,99,482,99,c,2,smooth(phase(.42,.18)));

  const futures=[
    [[150,260],[230,222],[315,205],[420,188]],
    [[150,260],[235,270],[334,256],[442,238]],
    [[150,260],[210,205],[290,174],[386,154]]
  ];
  for(const [i,path] of futures.entries()){
    const primary=i===0;
    ctx.globalAlpha=primary?.95:.35;
    const dot=pathArrow(path,primary?c:'#64748b',primary?3:1.5,clamp(e+i*.05));
    if(primary) pulse(dot[0],dot[1],c,5);
    ctx.globalAlpha=1;
  }
  txt('parallel imagined futures → evaluator chooses one → execute first action',300,324,'#93a0bd');
}
// Fallback visualisation (any lab.vis not handled elsewhere): a goal ->
// constraints -> execute pipeline above a path that carries a moving dot,
// with a dashed 'avoid' rectangle drawn over the scene.
function drawPlanner(s,p,c){
  const e=ease(p);
  const is=(...keys)=>keys.includes(s.key);
  box(38,82,104,42,'goal',is('goal','prompt','scene'),c);
  box(218,82,134,42,'constraints',is('constraints','cost','program'),c);
  box(454,82,104,42,'execute',is('execute','feedback'),c);
  arrow(142,103,218,103,c,2,smooth(phase(.04,.2)));
  arrow(352,103,454,103,c,2,smooth(phase(.32,.2)));

  // Wide faint underlay first, then pathArrow strokes the same route in the lab colour.
  const path=[[70,285],[142,216],[220,260],[310,190],[392,214],[520,126]];
  ctx.strokeStyle='rgba(148,163,184,.26)';
  ctx.lineWidth=7;
  ctx.lineCap='round';
  ctx.beginPath();
  path.forEach(([px,py],i)=>{
    if(i===0) ctx.moveTo(px,py);
    else ctx.lineTo(px,py);
  });
  ctx.stroke();
  const dot=pathArrow(path,c,3,e);
  pulse(dot[0],dot[1],c,5);

  ctx.strokeStyle='#fb7185';
  ctx.lineWidth=2;
  ctx.setLineDash([4,4]);
  ctx.strokeRect(256,216,58,42);
  ctx.setLineDash([]);
  txt('avoid',285,210,'#fb7185',10);
  txt('planner turns semantics or geometry into a feasible path/skill sequence',300,330,'#93a0bd');
}
// Visualisation for lab.vis === 'imitation': an expert demos -> match loss /
// discriminator -> policy pipeline above a fixed dashed curve and a solid
// curve in the lab colour whose control points rise with the eased progress.
function drawImitation(s,p,c){
  const e=ease(p);
  const is=(...keys)=>keys.includes(s.key);
  box(48,82,120,48,'expert demos',is('demo','positive'),c);
  box(238,82,130,48,s.key==='disc'?'discriminator':'match loss',is('match','disc','contrast'),c);
  box(438,82,108,48,'policy',is('opt','deploy','rl'),c);
  arrow(168,106,238,106,c,2,smooth(phase(.05,.2)));
  arrow(368,106,438,106,c,2,smooth(phase(.35,.2)));

  // Fixed dashed reference curve.
  ctx.strokeStyle='#e7ecf6';
  ctx.lineWidth=2;
  ctx.setLineDash([6,5]);
  ctx.beginPath();
  ctx.moveTo(70,260);
  ctx.bezierCurveTo(170,165,310,180,470,228);
  ctx.stroke();
  ctx.setLineDash([]);

  // Animated solid curve plus a dot travelling along a straight approximation of it.
  ctx.strokeStyle=c;
  ctx.lineWidth=3;
  ctx.beginPath();
  ctx.moveTo(70,292);
  ctx.bezierCurveTo(170,235-54*e,310,235-46*e,470,260-32*e);
  ctx.stroke();
  pulse(70+400*e,292-64*e,c,5);

  txt('learner trajectory is pulled toward expert occupancy or low-energy actions',300,330,'#93a0bd');
}
// Follow the URL hash: when it changes to a known lab id, switch to that lab.
window.addEventListener('hashchange',()=>{
  const wanted=location.hash.slice(1);
  if(!byId[wanted]) return;
  setLab(wanted);
});
// On page load, preselect the lab named in the hash (if it is known) before init() runs.
window.addEventListener('load',()=>{
  const wanted=location.hash.slice(1);
  const preset=byId[wanted];
  if(preset) lab=preset;
  init();
});
</script>
</body>
</html>
"""


def render() -> str:
    """Return the full Algorithm Lab page as an HTML string.

    The data from ``build_data()`` is serialised to JSON (non-ASCII text is
    kept verbatim) and substituted for the ``__DATA_JSON__`` placeholder in
    ``TEMPLATE``.
    """
    payload = json.dumps(build_data(), ensure_ascii=False)
    # In JSON, "<" can only occur inside string values, so rewriting it as a
    # \u003c escape keeps the decoded data identical while making sure a value
    # containing "</script" or "<!--" cannot cut the inline script short.
    # NOTE(review): assumes the placeholder sits inside a <script> element
    # (the page's script reads the data as `D`) — confirm against TEMPLATE.
    payload = payload.replace("<", "\\u003c")
    return TEMPLATE.replace("__DATA_JSON__", payload)


if __name__ == "__main__":
    # Script entry point: write the rendered page next to the current working
    # directory and report the file name on stdout.
    output_name = "robot_algorithm_lab.html"
    with open(output_name, mode="w", encoding="utf-8") as handle:
        handle.write(render())
    print(f"Wrote {output_name}")