"""Generate a World Model landscape page (current as of mid-2026).

Reuses the robot-landscape engine (gen_landscape.TEMPLATE: constellation map,
Magic-Move detail panels, typed edges, MathJax) but injects a
world-model-specific set of animations and data. Output is self-contained HTML.

Built from a 5-agent research sweep (June 2026): latent/control WMs, video-gen
& interactive WMs, JEPA latent prediction, embodied/driving WMs, surveys.
Families are organized equation-first (by training/inference mechanism).

Run:  .venv_robot_paradigms/bin/python gen_worldmodel.py   -> robot_worldmodel.html
"""
import json

import gen_landscape
import my_papers

# ---------------------------------------------------------------------------
# Families (11) — organized by the training / inference equation, not fame.
# ---------------------------------------------------------------------------
# Each row is a 3-tuple: (family key, family title, hex colour string).
# The key is the value entries in P carry in their `family=` field.
# NOTE(review): presumably the title and colour are consumed by the
# gen_landscape template for legend/node styling — confirm there.
FAMILIES = [
    ("Latent", "Latent Imagination (RSSM)", "#2563eb"),
    ("Value", "Value-Equivalent Planning", "#0891b2"),
    ("Token", "Autoregressive Token WMs", "#7c3aed"),
    ("JEPA", "JEPA / Latent Prediction", "#16a34a"),
    ("VideoDiff", "Video Diffusion Simulators", "#db2777"),
    ("Interactive", "Interactive / Playable", "#ea580c"),
    ("Spatial3D", "Persistent 3D Worlds", "#ca8a04"),
    ("Driving", "Driving World Models", "#e11d48"),
    ("RobotWM", "Robot WM Infrastructure", "#0ea5e9"),
    ("WMVLA", "World Models in VLAs", "#65a30d"),
    ("Position", "Positions & Debates", "#64748b"),
]

# Keys per entry: id, name, short, family, anim, tagline, simple, mapping, math, when, pros, cons, papers, learn=(title, url)
P = [
 # ---------------- Latent Imagination ----------------
 dict(id="world-models", name="World Models (Ha & Schmidhuber)", short="World Models", family="Latent", anim="latentroll",
   tagline="The proof-of-concept: compress with a VAE, dream with an RNN, train a controller inside the dream.",
   simple=("Learn to drive a racing game by first building a 'mental movie player' of it: a VAE squeezes each frame "
           "into a small code, an RNN predicts the next code, and a tiny controller is trained entirely inside this "
           "hallucinated game — then transferred back to the real one. The 2018 paper that named the genre."),
   mapping="frames → latent z (VAE) → next-z prediction (RNN) → controller",
   math=r"z_t\sim q_{\text{VAE}}(z_t\mid o_t),\quad P(z_{t+1}\mid a_t,z_t,h_t)\;\text{(MDN-RNN)}",
   when="As the conceptual starting point: what 'training inside a learned simulator' means.",
   pros=["First clean train-inside-the-dream demo", "Simple modular V / M / C pipeline", "Beautiful interactive paper"],
   cons=["Not end-to-end; latents not shaped for control", "Dreams drift and get exploited", "Only simple environments"],
   papers=["World Models (Ha & Schmidhuber 2018)"],
   learn=("World Models — interactive paper", "https://worldmodels.github.io/")),
 dict(id="planet", name="PlaNet (RSSM)", short="PlaNet (RSSM)", family="Latent", anim="latentroll",
   tagline="The RSSM backbone: deterministic + stochastic latent dynamics, planned with CEM purely in latent space.",
   simple=("PlaNet learns a latent 'physics engine' from pixels where each state has a reliable deterministic memory "
           "plus a stochastic guess for what's uncertain. To act it learns no policy at all — it imagines thousands of "
           "action sequences in latent space and picks the best, like playing out chess lines in your head."),
   mapping="pixels → RSSM state (h_t, s_t) → CEM over imagined rollouts → action",
   math=r"\mathcal{L}=\sum_t\mathbb{E}\big[\ln p(o_t\mid s_t)\big]-\mathbb{E}\,\mathrm{KL}\big[q(s_t\mid s_{t-1},a_{t-1},o_t)\,\|\,p(s_t\mid s_{t-1},a_{t-1})\big]",
   when="Online planning from pixels with high sample efficiency; the origin of every Dreamer.",
   pros=["~50x more sample-efficient than model-free", "RSSM became the standard latent dynamics", "No policy network at all"],
   cons=["CEM at every step is slow", "Struggles with long horizons / sparse rewards", "Reconstruction wastes capacity on irrelevant pixels"],
   papers=["PlaNet (Hafner et al. 2019)"],
   learn=("Introducing PlaNet — Google Research blog", "https://research.google/blog/introducing-planet-a-deep-planning-network-for-reinforcement-learning/")),
 dict(id="dreamer", name="Dreamer v1–v3", short="Dreamer v1–3", family="Latent", anim="dream",
   tagline="Actor-critic trained by backpropagating value gradients through imagined latent rollouts.",
   simple=("Dreamer keeps PlaNet's latent physics engine but trains a policy and value function inside the dream — "
           "backpropagating 'how to get more reward' through imagined futures. V2 added discrete latents (human-level "
           "Atari); V3's symlog tricks made one hyperparameter set master 150+ tasks and mine Minecraft diamonds from scratch."),
   mapping="pixels → RSSM latent → imagined rollouts → actor-critic update",
   math=r"\max_\phi\;\mathbb{E}_{q_\phi}\Big[\textstyle\sum_{\tau=t}^{t+H}V_\lambda(s_\tau)\Big]\;\;(\text{gradients through the model};\;\mathrm{symlog}(x)=\mathrm{sign}(x)\ln(|x|{+}1))",
   when="The default sample-efficient RL agent on a new domain with no tuning budget.",
   pros=["One config across 150+ tasks (Nature 2025)", "First Minecraft diamonds from scratch", "Fast amortized inference"],
   cons=["Recurrent RSSM trains sequentially (poor GPU use)", "Reconstruction limits rich real-world video", "Policy can exploit model errors"],
   papers=["Dreamer (Hafner 2020)", "DreamerV2 (2021)", "DreamerV3 (2023; Nature 2025)"],
   learn=("DreamerV3 — project page", "https://danijar.com/project/dreamerv3/")),
 dict(id="daydreamer", name="DayDreamer", short="DayDreamer", family="Latent", anim="dream",
   tagline="Dreamer on physical robots: a quadruped learns to walk in 1 hour, no simulator.",
   simple=("If imagination training is so sample-efficient, can a real robot afford it? Yes — by dreaming thousands of "
           "practice runs for every real step, an A1 quadruped learned to walk from scratch in about an hour and "
           "adapted online to being pushed. The existence proof that latent world models work outside simulation."),
   mapping="real robot sensors → Dreamer world model → imagined practice → motor commands",
   math=r"\max_\phi\,\mathbb{E}_q\Big[\textstyle\sum_\tau V_\lambda(s_\tau)\Big]\;\;\text{trained from real-world replay only}",
   when="Real-robot RL without a simulator, when every minute of hardware time is precious.",
   pros=["Walking in ~1 hour wall-clock on hardware", "Shown on 4 different robots", "No simulator, no demos"],
   cons=["Short-horizon, simple tasks", "Exploration safety not addressed", "Inherits reconstruction limits in clutter"],
   papers=["DayDreamer (Wu, Escontrela, Hafner, Abbeel, Goldberg 2022)"],
   learn=("DayDreamer — project page", "https://danijar.com/project/daydreamer/")),
 dict(id="dreamer4", name="Dreamer 4 (Shortcut Forcing)", short="Dreamer 4", family="Latent", anim="shortcutf",
   tagline="Scalable transformer WM, real-time on one GPU; Minecraft diamonds from purely offline data.",
   simple=("Dreamer 4 replaces the small recurrent dream with a video-scale transformer that stays fast enough to "
           "interact with live. Its 'shortcut forcing' objective teaches one big denoising step to match the result of "
           "two half-steps, so a dream frame needs ~4 network calls instead of 64. The agent practices inside this "
           "dream from logged videos only — and still mines diamonds."),
   mapping="offline video (mostly action-free) → shortcut-forcing transformer WM → imagination RL → policy",
   math=r"f_\theta(z_t,d)\;\approx\;\mathrm{sg}\big[f_\theta\big(f_\theta(z_t,\tfrac{d}{2}),\tfrac{d}{2}\big)\big]\;\;\text{(shortcut forcing)}",
   when="Offline-to-control at scale: rich visual domains where real interaction is expensive or impossible.",
   pros=["First diamonds purely offline; ~100x less data than VPT", "~16x faster generation → real-time on 1 GPU", "Only ~100h of action labels needed"],
   cons=["Absolute success rate still low (~0.7%)", "No online correction of model bias", "Compute-heavy pretraining"],
   papers=["Training Agents Inside of Scalable World Models (Hafner, Yan & Lillicrap 2025)"],
   learn=("Dreamer 4 — project page", "https://danijar.com/project/dreamer4/")),

 # ---------------- Value-Equivalent Planning ----------------
 dict(id="muzero", name="MuZero", short="MuZero", family="Value", anim="mcts",
   tagline="AlphaZero without the rulebook: latents trained only to predict reward, value, policy — searched with MCTS.",
   simple=("MuZero plays Go, chess, and Atari at superhuman level without being told the rules. Its learned model never "
           "simulates boards or pixels — its abstract state only answers the three questions search cares about: what's "
           "the reward, who's winning, what looks promising? A world model only needs to be accurate about decisions."),
   mapping="observations → abstract latent → MCTS over (reward, value, policy) heads → action",
   math=r"\mathcal{L}=\sum_{k=0}^{K}\Big[\ell^r(u_{t+k},\hat r_t^k)+\ell^v(z_{t+k},\hat v_t^k)+\ell^p(\pi_{t+k},\hat p_t^k)\Big]",
   when="Discrete-action domains with strategic depth where tree search pays off.",
   pros=["Superhuman with no rules given", "Value-equivalent: accurate about decisions, not pixels", "Deployed in real products (video compression)"],
   cons=["Extremely sample- and compute-hungry", "Continuous control needs nontrivial extensions", "Latents uninterpretable and task-locked"],
   papers=["MuZero (Schrittwieser et al. 2020)"],
   learn=("MuZero — DeepMind blog", "https://deepmind.google/discover/blog/muzero-mastering-go-chess-shogi-and-atari-without-rules/")),
 dict(id="efficientzero", name="EfficientZero (V1/V2)", short="EfficientZero", family="Value", anim="mcts",
   tagline="MuZero made sample-efficient: a SimSiam consistency loss → superhuman Atari in 2 hours of play.",
   simple=("MuZero needed billions of frames; EfficientZero gets superhuman Atari scores from two hours of gameplay. "
           "Rewards alone are too weak a signal from so little data, so it adds a self-supervised loss forcing the "
           "model's imagined next state to match the encoder's actual next state. V2 extended the recipe to continuous control."),
   mapping="limited replay → latent state → MCTS, + next-state consistency as extra signal",
   math=r"\mathcal{L}_{\text{consist}}=-\,\mathrm{sim}\big(P(g_\theta(z_t,a_t)),\;\mathrm{sg}[h_\theta(o_{t+1})]\big)",
   when="When environment samples are the bottleneck — the sample-efficiency record-holder lineage.",
   pros=["First superhuman Atari-100k (~2h experience)", "Consistency loss is simple and transferable", "V2 spans discrete/continuous"],
   cons=["Heavy machinery (MCTS + reanalysis)", "High wall-clock compute per step", "Less plug-and-play than Dreamer/TD-MPC"],
   papers=["EfficientZero (Ye et al. 2021)", "EfficientZero V2 (Wang et al. 2024)"],
   learn=("EfficientZero: How It Works — illustrated", "https://www.lesswrong.com/posts/mRwJce3npmzbKfxws/efficientzero-how-it-works")),
 dict(id="tdmpc", name="TD-MPC / TD-MPC2", short="TD-MPC2", family="Value", anim="plan",
   tagline="Decoder-free latents shaped by TD-learning; short-horizon MPPI search + a learned terminal value.",
   simple=("TD-MPC never reconstructs pixels — its latent space only has to predict rewards and values. At decision "
           "time it searches a few steps ahead in latent space and lets a value function judge everything beyond, like "
           "a chess player calculating 5 moves deep then trusting intuition. TD-MPC2: one config, one 317M agent, 80+ tasks."),
   mapping="observation → task-oriented latent → MPPI + terminal value Q → action",
   math=r"a^*=\arg\max_{a_{t:t+H}}\mathbb{E}\Big[\sum_{h=0}^{H-1}\gamma^h R_\theta(z_h,a_h)+\gamma^H Q_\theta(z_H,a_H)\Big]",
   when="The strong default for continuous-control benchmarks and multi-task robot learning.",
   pros=["104 tasks, one hyperparameter set", "SOTA on hard locomotion/manipulation", "Open code; the practical workhorse"],
   cons=["Can't generate observations or transfer without rewards", "Planning overhead every control step", "Novel-embodiment transfer limited"],
   papers=["TD-MPC (Hansen, Wang, Su 2022)", "TD-MPC2 (2024)"],
   learn=("TD-MPC2 — project page", "https://www.tdmpc2.com/")),

 # ---------------- Autoregressive Token WMs ----------------
 dict(id="iris", name="IRIS (Transformer WM)", short="IRIS", family="Token", anim="tokenfilm",
   tagline="The world model as a language model: VQ frame tokens + a GPT that predicts the next frame's tokens.",
   simple=("IRIS treats experience like text: a VQ-VAE turns each frame into discrete tokens, and a GPT-like "
           "transformer predicts the next frame's tokens given past tokens and actions. The agent learns inside this "
           "'GPT of the game' — 2 hours of Atari experience beat humans on many games, kicking off the transformer-WM wave."),
   mapping="frames → VQ tokens → next-token transformer → imagined rollouts → policy",
   math=r"\max_\theta\;\sum_t\log p_\theta\big(z_{t+1}\mid z_{\le t},a_{\le t}\big)\quad(z=\text{discrete frame tokens})",
   when="Sample-limited visual RL; importing the LLM toolbox (tokenizers, scaling) into world modeling.",
   pros=["Sequence-modeling machinery transfers directly", "Interpretable token rollouts you can watch", "Influential (STORM, TWM, Genie lineage)"],
   cons=["Many tokens per frame → slow imagination", "VQ loses fine visual detail", "Limited long-horizon memory"],
   papers=["IRIS (Micheli, Alonso, Fleuret 2023)", "TWM (Robine 2023)", "STORM (Zhang 2023)"],
   learn=("IRIS — official repo with rollout videos", "https://github.com/eloialonso/iris")),
 dict(id="gaia1", name="GAIA-1 (Driving Tokens)", short="GAIA-1", family="Token", anim="tokenfilm",
   tagline="9B-param driving world model: next-token prediction over video tokens, conditioned on action and text.",
   simple=("GAIA-1 treats driving video like language: chop frames into discrete tokens and predict the next one, the "
           "way GPT predicts the next word. Tell it 'it starts raining' or feed a steering command and it dreams a "
           "coherent continuation of the drive. The proof that world models scale like LLMs."),
   mapping="past driving video + ego actions + text → future driving video",
   math=r"\mathcal{L}=-\sum_t\log p_\theta\big(z_t\mid z_{<t},a_{<t},c_{\text{text}}\big)\;\;\text{(+ diffusion decoder to pixels)}",
   when="Generating rare/dangerous driving scenarios; historically, the LLM-recipe-on-video proof.",
   pros=["Clean autoregressive formulation, LLM-style scaling laws", "Text + action controllability", "Kickstarted the driving-WM subfield"],
   cons=["Discrete tokens cap fidelity", "Single camera, low FPS, offline", "Superseded by GAIA-2/3"],
   papers=["GAIA-1 (Wayve 2023)"],
   learn=("Introducing GAIA-1 — Wayve blog", "https://wayve.ai/thinking/introducing-gaia1/")),
 dict(id="wham", name="WHAM / Muse (Game WM)", short="WHAM / Muse", family="Token", anim="interleave",
   tagline="Nature-published game world model: one token stream jointly predicts frames AND controller actions.",
   simple=("Muse trained on seven years of human matches from one Xbox game, autoregressively generating both the next "
           "frames and plausible controller inputs. It stays consistent for minutes and keeps user edits persistent in "
           "the world. The cleanest demo that one sequence model can jointly model a world and the actions taken in it."),
   mapping="interleaved past frames + controller actions → next frames and/or next actions",
   math=r"\max_\theta\sum_t\log p_\theta(x_t\mid x_{<t}),\quad x=(\text{frame tokens},\text{action tokens})\;\text{interleaved}",
   when="Gameplay ideation; the 'unified obs+action token stream' blueprint now visible inside robot models.",
   pros=["Joint world + behavior model", "Strong persistency of user edits", "Open weights + demonstrator (rare)"],
   cons=["One game, ~300×180 resolution", "~10 fps — not a shippable engine", "Robotics transfer is conceptual only"],
   papers=["WHAM / Muse (Kanervisto et al., Nature 2025)"],
   learn=("Introducing Muse — Microsoft Research blog", "https://www.microsoft.com/en-us/research/blog/introducing-muse-our-first-generative-ai-model-designed-for-gameplay-ideation/")),

 # ---------------- JEPA / Latent Prediction ----------------
 dict(id="ijepa", name="I-JEPA", short="I-JEPA", family="JEPA", anim="jepapred",
   tagline="Predict embeddings of masked image blocks from a context block — features, not pixels.",
   simple=("Cover part of a photo and ask the model to describe what's hidden — not paint it. I-JEPA sees one chunk of "
           "an image and predicts the abstract features of other chunks, using a slowly-updated 'teacher' copy of "
           "itself as the answer key. No pixel rendering means ~10x less compute than reconstruction for similar quality."),
   mapping="visible context patches + target locations → embeddings of masked blocks",
   math=r"\mathcal{L}=\sum_i\big\|g_\phi(f_\theta(x_{ctx}),m_i)-\bar f_{\bar\theta}(y_i)\big\|_2^2,\quad\bar\theta\leftarrow\tau\bar\theta+(1{-}\tau)\theta",
   when="Self-supervised image pretraining without augmentation engineering or pixel decoders.",
   pros=["No hand-crafted augmentations, no decoder", "Far cheaper than generative pretraining", "Strong transfer (classification, depth, counting)"],
   cons=["Static images — no dynamics, no actions", "Needs EMA teacher + masking heuristics vs collapse", "A representation learner, not yet a usable WM"],
   papers=["I-JEPA (Assran et al. 2023)"],
   learn=("I-JEPA — Meta AI blog", "https://ai.meta.com/blog/yann-lecun-ai-model-i-jepa/")),
 dict(id="vjepa2", name="V-JEPA 2", short="V-JEPA 2", family="JEPA", anim="jepapred",
   tagline="1B-param video JEPA pretrained on ~1M hours: physics intuition by predicting masked latent features.",
   simple=("V-JEPA 2 watched over a million hours of internet video, learning how the world unfolds purely by "
           "predicting masked-out latent features — never generating a frame. One encoder then does video QA, "
           "anticipates what happens next, and serves as the physics-savvy backbone for robot control (V-JEPA 2-AC)."),
   mapping="masked internet-scale video → latent features of missing content",
   math=r"\min_{\theta,\phi}\;\big\|P_\phi(E_\theta(x_{vis}))-\mathrm{sg}\big(\bar E_{\bar\theta}(x)\big)_{mask}\big\|_1\quad\text{at }10^6\text{ video-hours}",
   when="One pretrained video backbone for understanding, anticipation, and downstream control.",
   pros=["SOTA motion understanding & anticipation at release", "Open weights; LLM-aligned for video QA", "The JEPA scaling story made real"],
   cons=["Still fails chunks of intuitive-physics benchmarks", "No actions in pretraining", "Latent rollouts degrade with horizon"],
   papers=["V-JEPA (Bardes 2024)", "V-JEPA 2 (Assran et al. 2025)"],
   learn=("Introducing V-JEPA 2 — Meta AI", "https://ai.meta.com/vjepa/")),
 dict(id="vjepa2ac", name="V-JEPA 2-AC (Action-Conditioned)", short="V-JEPA 2-AC", family="JEPA", anim="latentmpc",
   tagline="Frozen V-JEPA 2 + a thin action-conditioned predictor (62h robot data) → zero-shot manipulation via latent MPC.",
   simple=("Take V-JEPA 2's frozen 'eyes', then teach a small add-on the if-I-do-this-then-that of a robot arm from "
           "under 62 hours of robot data. To act, the robot imagines latent futures for candidate action sequences and "
           "picks the one whose imagined end state looks most like the goal image — 65–80% zero-shot pick-and-place in labs it never saw."),
   mapping="frame embedding + candidate actions → predicted future embeddings → argmin distance-to-goal",
   math=r"a^*_{1:T}=\arg\min_{a}\big\|P_\phi\big(E(o_t),a_{1:T}\big)-E(o_{\text{goal}})\big\|_1\;\;\text{(CEM/MPC)}",
   when="Robot manipulation with goal images and almost no robot-specific data; the flagship of the non-generative camp.",
   pros=["Web-scale passive pretraining + thin action layer", "Zero-shot to unseen labs/objects", "~15x faster planning than Cosmos latent diffusion"],
   cons=["Goal must be an image (no language/reward)", "Seconds per planning step — not reactive", "Camera-pose sensitive"],
   papers=["V-JEPA 2 / V-JEPA 2-AC (Assran et al. 2025)"],
   learn=("V-JEPA 2 robot planning — Meta AI", "https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks/")),
 dict(id="dinowm", name="DINO-WM / DINO-world", short="DINO-WM", family="JEPA", anim="latentmpc",
   tagline="Learn dynamics directly on frozen DINOv2 features; plan zero-shot to goal images with MPC.",
   simple=("Why learn vision from scratch for every robot? DINO-WM keeps a frozen off-the-shelf DINOv2 encoder and "
           "only learns how its patch features evolve under actions. At test time it just searches for actions whose "
           "predicted features match the goal image — no rewards, no demos. DINO-world scales the same recipe to ~60M web videos."),
   mapping="frozen DINOv2 features + actions → next features; plan to a goal image",
   math=r"\min_\phi\big\|p_\phi(z_{t-k:t},a_{t-k:t})-z_{t+1}\big\|^2,\quad z=\mathrm{DINOv2}(o)\;\text{(frozen)}",
   when="Reward-free offline data + goal-image tasks; the cleanest 'plan in pretrained feature space' testbed.",
   pros=["No encoder training, no reconstruction, no reward", "Zero-shot planning to new goals", "Beat generative WMs on maze/push suites (ICML 2025)"],
   cons=["Frozen features may miss precise contacts", "Test-time MPC is slow", "Mostly tabletop/sim-scale so far"],
   papers=["DINO-WM (Zhou, Pan, LeCun, Pinto 2024)", "DINO-world (Baldassarre et al. 2025)", "PLDM (Sobal et al. 2025)"],
   learn=("DINO-WM — project page", "https://dino-wm.github.io/")),
 dict(id="lejepa", name="LeJEPA", short="LeJEPA", family="JEPA", anim="gausreg",
   tagline="A theory-first JEPA: prove the optimal embedding distribution is isotropic Gaussian, enforce it, delete the heuristics.",
   simple=("Classic JEPAs are held together with duct tape — EMA teachers, stop-gradients — all to stop embeddings "
           "collapsing to a point. LeJEPA proves the ideal embedding distribution is an isotropic Gaussian and adds a "
           "cheap statistical test (SIGReg) as a regularizer, making collapse impossible by construction. One loss, one "
           "hyperparameter; LeCun's last Meta paper before founding AMI Labs ($1.03B seed, 2026)."),
   mapping="two views → embeddings; predict one from the other while forcing Z toward N(0, I)",
   math=r"\mathcal{L}=\mathcal{L}_{\text{pred}}+\lambda\,\mathrm{SIGReg}(Z),\quad\mathrm{SIGReg}=\textstyle\sum_{u\sim\mathbb{S}^{d-1}}T_{\text{EP}}\big(u^\top Z,\mathcal{N}(0,1)\big)",
   when="Training JEPA-style encoders from scratch without teacher-student tricks or tuning sweeps.",
   pros=["Provable anti-collapse", "Linear-cost regularizer; 60+ architectures tested", "Training loss tracks downstream quality"],
   cons=["Not yet an action-conditioned world model", "Optimality argument rests on task assumptions", "Young — replications accumulating"],
   papers=["LeJEPA (Balestriero & LeCun 2025)", "SPR (Schwarzer 2021)", "BYOL-Explore (Guo 2022)"],
   learn=("LeJEPA explained — Turing Post", "https://www.turingpost.com/p/lejepa")),

 # ---------------- Video Diffusion Simulators ----------------
 dict(id="sora", name="Sora / Sora 2 (Video as Simulator)", short="Sora / Sora 2", family="VideoDiff", anim="viddiffuse",
   tagline="Text-to-video diffusion transformer that OpenAI framed as a 'world simulator' — igniting the central debate.",
   simple=("Sora turns noise into realistic video guided by a text prompt. Because plausible video forces the model to "
           "get gravity, occlusion, and object permanence roughly right, OpenAI argued it implicitly simulates the "
           "world. Sora 2's basketball now bounces off the rim instead of teleporting — but critics say it's "
           "pattern-matching pixels, not understanding physics."),
   mapping="text (or image/video) → photorealistic video clip",
   math=r"\mathcal{L}=\mathbb{E}_{x_0,\epsilon,t}\big[\|\epsilon-\epsilon_\theta(x_t,t,c_{\text{text}})\|^2\big]\;\;\text{(DiT over latent spacetime patches)}",
   when="Offline high-fidelity synthesis; the reference point for 'do video generators learn physics?'.",
   pros=["Highest visual fidelity in the lane", "Emergent 3D consistency and rough physics from scale", "Sora 2 adds synced audio + better physics"],
   cons=["No action interface — can't be stepped like a simulator", "Physics failures undermine the claim", "Closed, offline, not real-time"],
   papers=["Video generation models as world simulators (OpenAI 2024)", "Sora 2 (OpenAI 2025)"],
   learn=("Sora's spacetime patches — illustrated", "https://towardsdatascience.com/explaining-openai-soras-spacetime-patches-the-key-ingredient-e14e0703ec5b/")),
 dict(id="cosmos", name="NVIDIA Cosmos (Predict / Transfer / Reason)", short="NVIDIA Cosmos", family="VideoDiff", anim="viddiffuse",
   tagline="Open world-foundation-model platform for Physical AI: predict futures, repaint sims, reason about physics.",
   simple=("Cosmos is NVIDIA's toolkit for robots and AVs to 'imagine' the world: Predict dreams future video, "
           "Transfer repaints rough simulator renders into photoreal training data, Reason judges whether what it sees "
           "is physically sensible. Predict 2.5 unified everything into one flow-matching model trained on 200M clips; "
           "2M+ downloads made it the de-facto open WFM platform."),
   mapping="text / image / video / control maps → physically plausible future video + reasoning",
   math=r"\mathcal{L}_{\text{flow}}=\mathbb{E}_{x_0,x_1,t}\big[\|v_\theta(x_t,t,c)-(x_1-x_0)\|^2\big],\;\;x_t=(1{-}t)x_0+t\,x_1",
   when="Synthetic data generation and policy evaluation for robotics/AV; post-train open weights on your embodiment.",
   pros=["Open weights, permissive license", "Multi-control conditioning ideal for sim-to-real data", "Rapid cadence (Predict 2.5 → Cosmos 3)"],
   cons=["Not real-time interactive — batch simulation", "Quality below frontier closed video models", "Sprawling product family"],
   papers=["Cosmos WFM Platform (NVIDIA 2025)", "Cosmos-Predict 2.5 (2025)", "Cosmos Reason (2025)"],
   learn=("NVIDIA Cosmos — platform page", "https://www.nvidia.com/en-us/ai/cosmos/")),

 # ---------------- Interactive / Playable ----------------
 dict(id="genie", name="Genie 1/2 (Latent Actions)", short="Genie 1/2", family="Interactive", anim="lam",
   tagline="World models from unlabeled video: infer the hidden 'buttons' that explain how frames change.",
   simple=("Genie watched 200k hours of platformer videos with no controller data and figured out, on its own, a small "
           "vocabulary of latent actions that explain how each frame becomes the next — deducing a game's controls "
           "just by watching streams. Genie 2 scaled it to image→3D worlds with real keyboard/mouse control and "
           "up-to-a-minute memory."),
   mapping="image + (inferred) latent action sequence → playable video world",
   math=r"\max_\theta\;\log p_\theta\big(x_{t+1}\mid x_{\le t},\tilde a_t\big),\quad\tilde a_t=\mathrm{VQ}\big(f_\phi(x_{\le t+1})\big)\;\text{(no action labels)}",
   when="When action labels don't exist: extracting control interfaces from raw video at internet scale.",
   pros=["Latent actions emerge unsupervised", "Turns any image into an environment", "Founded the playable-video line (ICML best paper)"],
   cons=["Genie 1: 1 FPS, 2D only", "Tiny action vocabulary", "Short horizons, drift"],
   papers=["Genie (Bruce et al. 2024)", "Genie 2 (Parker-Holder et al. 2024)"],
   learn=("Genie 2 — DeepMind blog", "https://deepmind.google/discover/blog/genie-2-a-large-scale-foundation-world-model/")),
 dict(id="genie3", name="Genie 3 (Real-Time Worlds)", short="Genie 3", family="Interactive", anim="playable",
   tagline="First real-time general world model: text → navigable 720p world at 24 FPS with minutes of memory.",
   simple=("Type 'a volcanic island at dusk' and Genie 3 builds a world you walk through live at 24 frames per second "
           "— no game engine, every frame dreamed on the fly. It remembers what it generated for minutes (paint a "
           "wall, walk away, come back — still painted) and accepts promptable world events like weather changes. "
           "Shipped to consumers as Project Genie in Jan 2026."),
   mapping="text/image prompt + live navigation + event prompts → real-time interactive world stream",
   math=r"\max_\theta\sum_t\log p_\theta\big(x_t\mid x_{<t},a_{<t},c\big)\;\;\text{(frame-causal generation; internals undisclosed)}",
   when="Embodied-agent training grounds, environment prototyping; DeepMind's stepping stone to AGI.",
   pros=["Real-time 24 FPS / 720p — a qualitative jump", "Visual memory over minutes; promptable events", "General-purpose worlds"],
   cons=["Few-minute horizon; limited action space", "Weak multi-agent interactions", "Closed; architecture undisclosed"],
   papers=["Genie 3 (Google DeepMind 2025)", "Project Genie (2026)"],
   learn=("Genie 3 — DeepMind blog with live demos", "https://deepmind.google/blog/genie-3-a-new-frontier-for-world-models/")),
 dict(id="oasis", name="Oasis / Oasis 3 (Decart)", short="Oasis (Decart)", family="Interactive", anim="playable",
   tagline="Minecraft with no game engine: a transformer hallucinates every frame at 20 FPS — now photoreal driving.",
   simple=("Oasis is Minecraft dreamed frame-by-frame from your keyboard and mouse, 20 times per second — no code for "
           "gravity or trees, only a neural net that watched enough gameplay to fake them. Decart then shipped "
           "MirageLSD (zero-latency live video restyling) and Oasis 3 (June 2026), which streams hours of "
           "photorealistic interactive driving as an API."),
   mapping="current frame + keyboard/mouse → next frame, in a real-time loop",
   math=r"\mathcal{L}=\mathbb{E}_{t_{1:T}}\sum_i\big\|\epsilon_i-\epsilon_\theta(z_i^{(t_i)},t_i,z_{<i},a_{<i})\big\|^2\;\;\text{(diffusion forcing)}",
   when="Real-time neural game engines; live video transformation; interactive driving simulation via API.",
   pros=["First playable generation in the public's hands", "Extreme inference optimization (zero-latency streaming)", "Rapid productization ($4B valuation by May 2026)"],
   cons=["Oasis 1: severe drift and amnesia", "Low fidelity vs offline models", "Driving realism still has physics caveats"],
   papers=["Oasis (Decart/Etched 2024)", "MirageLSD (2025)", "Oasis 3 (2026)"],
   learn=("Oasis: A Universe in a Transformer — demo", "https://oasis-model.github.io/")),
 dict(id="matrixgame", name="Matrix-Game 2.0 / GameCraft (Open)", short="Matrix-Game 2.0", family="Interactive", anim="playable",
   tagline="The open-source answer to Genie 3: real-time 25 FPS, minutes-long, keyboard-and-mouse worlds.",
   simple=("A week after Genie 3 wowed everyone behind closed doors, Skywork open-sourced Matrix-Game 2.0: a "
           "downloadable model that streams playable worlds at 25 FPS for minutes. The trick is distilling a slow "
           "many-step video diffuser into a few denoising steps per frame — fast enough for a live loop. Tencent's "
           "Hunyuan-GameCraft adds free-form language control ('open the door, then it starts to rain')."),
   mapping="image/scene + frame-level keyboard & mouse → real-time 25 FPS interactive video",
   math=r"\mathcal{L}_{\text{DMD}}=\mathbb{E}\big[D_{KL}\big(p_{\text{teacher}}(x_t\mid x_{<t},a)\,\|\,p_{G_\theta}(x_t\mid x_{<t},a)\big)\big]\;\;\text{(few-step distillation)}",
   when="Open research/deployment of real-time interactive generation; embodied-agent training data.",
   pros=["Fully open weights + code", "25 FPS streaming with frame-level action accuracy", "Strong scene diversity (~2,700h gameplay)"],
   cons=["Visual quality below Genie 3", "Game-like domains, limited real physics", "Consistency degrades over minutes"],
   papers=["Matrix-Game 2.0 (Skywork 2025)", "Hunyuan-GameCraft 1/2 (Tencent 2025)"],
   learn=("Matrix-Game 2.0 — project page with demos", "https://matrix-game-v2.github.io/")),

 # ---------------- Persistent 3D Worlds ----------------
 dict(id="marble", name="Marble (World Labs)", short="Marble", family="Spatial3D", anim="splat3d",
   tagline="Spatial intelligence productized: text/image → persistent, editable, downloadable 3D worlds.",
   simple=("Where Genie-style models dream each frame as you move (and can forget what's behind you), Marble builds "
           "the whole 3D place first — then you walk through it forever, because the geometry actually exists. From a "
           "photo or prompt it outputs an explorable scene you can edit and export as Gaussian splats or meshes into "
           "Unreal, Blender, or VR. Fei-Fei Li's bet that spatial intelligence needs persistent 3D, not video."),
   mapping="text / image / video / coarse layout → persistent explorable 3D scene (splats, mesh)",
   math=r"S=G_\theta(c),\;S=\{(\mu_i,\Sigma_i,c_i,\alpha_i)\};\quad\min_\theta\sum_k\|\mathcal{R}(S,\pi_k)-I_k\|",
   when="Game/VFX/VR assets and robotics sim scenes — when the world must persist and export.",
   pros=["Persistent geometry: zero drift, true memory", "Exports to standard pipelines; VR-ready", "First commercial product in the lane ($1B+ valuation)"],
   cons=["Worlds are largely static — little physics", "Scene-scale, not open-world", "Closed, credit-based"],
   papers=["Marble (World Labs 2025)", "RTFM: Real-Time Frame Model (World Labs 2025)"],
   learn=("Marble launch post — World Labs", "https://www.worldlabs.ai/blog/marble-world-model")),
 dict(id="hyworld", name="HY-World 2.0 (Open 3D Stack)", short="HY-World 2.0", family="Spatial3D", anim="splat3d",
   tagline="Tencent's open-source counterweight: reconstruct, generate, and simulate 3D worlds (mesh, 3DGS, points).",
   simple=("HY-World 2.0 is the open-weights answer to Marble: a full stack for turning images and text into 3D "
           "worlds you can reconstruct, edit, and simulate, with meshes, Gaussian splats, and point clouds as "
           "first-class outputs. It anchors the open ecosystem for persistent-3D world modeling."),
   mapping="images / text → reconstructed or generated 3D world (mesh / 3DGS / points)",
   math=r"\min_\theta\sum_v\big\|\mathcal{R}(S_\theta,v)-I_v\big\|\;\;\text{(render-consistency over generated scene }S_\theta\text{)}",
   when="Open research and products on 3D world generation without a closed-API dependency.",
   pros=["Open source, multi-representation output", "Reconstruction + generation + simulation in one stack", "Active OSS ecosystem"],
   cons=["Fidelity below the closed frontier", "Static-scene bias like all 3D-gen", "Heavy pipeline to run"],
   papers=["HY-World 2.0 (Tencent Hunyuan 2026)"],
   learn=("HY-World 2.0 — GitHub", "https://github.com/Tencent-Hunyuan/HY-World-2.0")),

 # ---------------- Driving World Models ----------------
 dict(id="gaia2", name="GAIA-2 / GAIA-3 (Wayve)", short="GAIA-2/3", family="Driving", anim="viddiffuse",
   tagline="Latent-diffusion driving WM: multi-camera, structured control over ego, agents, weather, geography.",
   simple=("GAIA-2 generates realistic surround-camera driving video with knobs for nearly everything: weather, "
           "country, other agents' behavior, and the ego car's own maneuvers. Wayve uses it as a scenario factory for "
           "near-collisions too dangerous to collect. GAIA-3 (15B, 10x data) moves world models from synthetic data "
           "into safety evaluation."),
   mapping="scene/agent/action/weather conditioning → coherent multi-camera driving video",
   math=r"\mathcal{L}=\mathbb{E}_{z,\epsilon,\tau}\big[\|\epsilon-\epsilon_\theta(z_\tau,\tau\mid c_{\text{ego}},c_{\text{agents}},c_{\text{env}})\|^2\big]",
   when="Training and stress-testing driving stacks with controllable synthetic corner cases.",
   pros=["Fine-grained documented conditioning", "Multi-camera spatial/temporal coherence", "Most documented industrial driving WM"],
   cons=["Pixel realism ≠ physical fidelity; no lidar", "No public weights", "Generation cost limits closed-loop training"],
   papers=["GAIA-2 (Russell et al. 2025)", "GAIA-3 (Wayve 2025)"],
   learn=("GAIA-2 — Wayve blog", "https://wayve.ai/thinking/gaia-2/")),
 dict(id="vista", name="Vista (Open Driving WM)", short="Vista", family="Driving", anim="videoaction",
   tagline="Open-source generalizable driving WM: high-fidelity prediction + reward-free action evaluation.",
   simple=("Vista fine-tunes a video diffusion model into a driving crystal ball: show it the road now and a candidate "
           "maneuver, and it renders what happens next. Because it can imagine the outcome of any action, it can also "
           "score actions — a what-if engine that doubles as a reward function, generalizing to unseen cities."),
   mapping="current frames + action (trajectory / steering / command) → future video + action score",
   math=r"\mathcal{L}=\mathbb{E}\big[\|\epsilon-\epsilon_\theta(z_t,t,a,c)\|^2\big]+\lambda\,\mathcal{L}_{\text{dyn}}",
   when="Open research on action-conditioned driving prediction; evaluating planners by imagined rollouts.",
   pros=["Open source with weights — the academic reference", "Multiple action formats", "Zero-shot to new environments"],
   cons=["Slow diffusion sampling", "Short practical horizons", "Backbone limits resolution/length"],
   papers=["Vista (OpenDriveLab, NeurIPS 2024)"],
   learn=("Vista — project page", "https://opendrivelab.com/Vista/")),
 dict(id="occworld", name="OccWorld", short="OccWorld", family="Driving", anim="occupancy",
   tagline="GPT for 3D occupancy: predict how the voxelized scene and the ego car move next.",
   simple=("Instead of predicting pixels, OccWorld represents the driving scene as a 3D grid of occupied/free voxels, "
           "compresses it into tokens, and runs a GPT-style transformer to predict the next scene-tokens and the car's "
           "own next pose. Pixels are pretty, but occupancy is what actually matters for not hitting things."),
   mapping="past 3D occupancy + ego poses → future occupancy + ego trajectory",
   math=r"\max_\theta\sum_t\log p_\theta\big(z_{t+1},p_{t+1}\mid z_{\le t},p_{\le t}\big),\quad z=\text{VQ occupancy tokens}",
   when="Driving stacks that want a label-light, geometry-native world model for forecasting and planning.",
   pros=["Occupancy is directly safety-relevant", "No box/map annotation needed", "Spawned a whole sub-family"],
   cons=["Coarse voxels miss thin structures", "Benchmark-scale (nuScenes)", "Open-loop metrics flatter it"],
   papers=["OccWorld (Zheng et al., ECCV 2024)"],
   learn=("OccWorld — project page", "https://wzzheng.net/OccWorld/")),
 dict(id="driveocc", name="Drive-OccWorld", short="Drive-OccWorld", family="Driving", anim="occupancy",
   tagline="Action-conditioned 4D occupancy forecasting as the cost model for end-to-end planning.",
   simple=("Drive-OccWorld forecasts future 3D occupancy and motion flow conditioned on candidate driving actions, "
           "then scores each candidate trajectory by how badly it collides with the forecasted occupied space and "
           "picks the safest. The cleanest example of 'world model as the planner's imagination' in driving."),
   mapping="camera history + candidate action → future occupancy & flow → best trajectory",
   math=r"\tau^*=\arg\min_\tau\sum_t C_{\text{occ}}\big(\tau_t,\hat O_t\big),\quad\hat O_t\sim p_\theta(O_t\mid \mathrm{BEV}_{\le 0},a_\tau)",
   when="Vision-only AV stacks wanting continuous forecast-then-plan with what-if rollouts.",
   pros=["Planning grounded in forecasted geometry", "Flexible action conditioning", "Beat prior end-to-end planners (AAAI 2025)"],
   cons=["Hand-designed occupancy costs", "Compute-heavy per candidate", "No fleet deployment evidence"],
   papers=["Drive-OccWorld (Yang et al., AAAI 2025)"],
   learn=("Drive-OccWorld — project page", "https://drive-occworld.github.io/")),
 dict(id="fleetsim", name="Fleet Neural Simulators (Tesla / Waymo)", short="Fleet Simulators", family="Driving", anim="evalwm",
   tagline="Industrial neural simulators trained on fleet video, used for closed-loop evaluation and RL.",
   simple=("Tesla trains a generative simulator on video from its 8-camera fleet and drives new FSD builds inside it "
           "in closed loop — including adversarial edge cases — and showed the same simulator running Optimus in a "
           "virtual gigafactory. Waymo built its World Model on Genie 3, generating camera AND lidar together so "
           "simulated data matches its real sensor suite, steered by language prompts."),
   mapping="fleet camera history + ego/robot actions → simulated multi-sensor future (closed loop)",
   math=r"\pi^*=\arg\max_\pi\,\mathbb{E}_{\hat o_{1:T}\sim p_\theta(\cdot\mid o_0,\pi)}\Big[\textstyle\sum_t\gamma^t r(\hat o_t,a_t)\Big]",
   when="Fleet-scale AV/humanoid development where real-world testing is the bottleneck.",
   pros=["Trained on the largest real ego-video corpora", "Closed-loop + adversarial scenario generation", "Cross-embodiment (FSD → Optimus; Genie 3 → driving)"],
   cons=["Almost nothing published; claims unverifiable", "RL-in-neural-sim at safety scale unproven", "Eval circularity risk"],
   papers=["Tesla world simulator (ICCV 2025 keynote)", "The Waymo World Model (2026)"],
   learn=("The Waymo World Model — blog", "https://waymo.com/blog/2026/02/the-waymo-world-model-a-new-frontier-for-autonomous-driving-simulation/")),

 # ---------------- Robot WM Infrastructure ----------------
 dict(id="unisim", name="UniSim (Universal Simulator)", short="UniSim", family="RobotWM", anim="videoaction",
   tagline="One action-conditioned video model trained on everything, used as a learned simulator of real interaction.",
   simple=("UniSim learned, from a huge mix of robot data, human videos, and navigation data, to answer 'what would "
           "the camera see next if this action were taken?' — a video game engine for the real world. Policies trained "
           "inside it transferred to real robots zero-shot, seeding the whole 'WM as data engine' lane."),
   mapping="history of frames + action (text or control) → next video segment",
   math=r"\mathcal{L}=\mathbb{E}_{o,a,\epsilon,\tau}\big[\|\epsilon-\epsilon_\theta(o_{t+1:t+k}^{(\tau)},\tau\mid o_{\le t},a_t)\|^2\big]",
   when="One learned simulator to train/evaluate embodied policies without hand-built sim assets.",
   pros=["Simulates manipulation, navigation, human activity in one model", "Zero-shot policy transfer demonstrated", "ICLR 2024 Outstanding Paper"],
   cons=["Slow diffusion rollouts", "Drift over long horizons", "Needs enormous merged datasets"],
   papers=["UniSim (Yang et al. 2023)"],
   learn=("UniSim — project page", "https://universal-simulator.github.io/unisim/")),
 dict(id="dreamgen", name="DreamGen / GR00T-Dreams (Data Engine)", short="DreamGen", family="RobotWM", anim="dataengine",
   tagline="Dream robot videos from a few demos, extract actions, train policies — months of teleop in ~36 hours.",
   simple=("NVIDIA's recipe: fine-tune a video world model on a handful of teleoperated demos, then prompt it to "
           "'dream' thousands of videos of the robot doing task variations it never actually did. A reasoning VLM "
           "filters bad dreams, an inverse dynamics model converts the survivors into action labels, and the policy "
           "trains on the result."),
   mapping="few real demos + image + instruction → dreamed videos → extracted (video, action) pairs → policy",
   math=r"\hat a_t=f_{\text{IDM}}(\hat o_t,\hat o_{t+1}),\quad\hat o_{1:T}\sim p_{\text{WM}}(\cdot\mid o_0,\ell);\quad\mathcal{L}_\pi=\mathbb{E}\big[\|\pi(\hat o_t,\ell)-\hat a_t\|^2\big]",
   when="When robot data collection is the bottleneck and you need generalization beyond your teleop dataset.",
   pros=["Orders-of-magnitude cheaper data scaling", "Generalizes to unseen verbs/environments", "Shipped as an open pipeline"],
   cons=["Dreamed physics can be wrong; gating imperfect", "IDM action labels are noisy", "Benefits shrink with plentiful real data"],
   papers=["DreamGen (Jang et al. 2025)", "Cosmos WFM Platform (NVIDIA 2025)"],
   learn=("Synthetic trajectories from WFMs — NVIDIA blog", "https://developer.nvidia.com/blog/enhance-robot-learning-with-synthetic-trajectory-data-generated-by-world-foundation-models/")),
 dict(id="onexwm", name="1X World Model (Redwood)", short="1X World Model", family="RobotWM", anim="evalwm",
   tagline="A fleet-data digital twin used to grade humanoid policies instead of endless physical tests.",
   simple=("1X trains its world model on thousands of hours of video from its home humanoids. Given a starting scene "
           "and a candidate policy's actions, it generates what would happen — including laundry, curtains, doors, "
           "and drawers that classical sims can't model — and predicts the policy's real-world success rate without "
           "touching a robot."),
   mapping="past robot observations + candidate actions → predicted future video (+ task outcome)",
   math=r"\hat S(\pi)=\mathbb{E}_{o_0\sim\mathcal{D}}\big[r(\hat o_{1:T})\big],\quad\hat o_{1:T}\sim p_\theta(\cdot\mid o_0,a^{\pi}_{1:T})",
   when="Shipping robots into homes: reproducible, scalable evaluation of policy updates across hundreds of tasks.",
   pros=["Learns deformable/articulated dynamics", "Reproducible counterfactual rollouts", "Open data release + public challenge"],
   cons=["Object coherence breaks in long rollouts", "A flattering simulator inflates scores", "Production details thin"],
   papers=["1X World Model / Redwood AI (1X 2024–2025)"],
   learn=("1X World Model — announcement", "https://www.1x.tech/discover/1x-world-model")),
 dict(id="worldeval", name="WorldEval (Policy2Vec)", short="WorldEval", family="RobotWM", anim="evalwm",
   tagline="Grade real-robot policies inside a video WM — with actions encoded so the rollout actually obeys them.",
   simple=("Testing every checkpoint on a real robot is slow and risky, so WorldEval runs the policy inside a video "
           "world model and scores the generated rollout. The key fix is Policy2Vec: raw actions fed to a video "
           "generator get ignored, so the policy's actions are encoded into a latent the generator reliably follows. "
           "Rankings correlate strongly with real-world success."),
   mapping="initial scene + policy (via latent action encoding) → generated rollout → predicted success",
   math=r"\hat o_{1:T}\sim p_\theta(\cdot\mid o_0,z_{1:T}),\;z=\text{Policy2Vec}(\pi);\quad\max\;\rho\big(\hat S_{\text{WM}}(\pi),S_{\text{real}}(\pi)\big)",
   when="Checkpoint selection, policy ranking, and pre-deployment safety screening.",
   pros=["Strong rank correlation with reality", "Doubles as a safety filter", "Cheaper than real-to-sim or physical eval"],
   cons=["Only as trustworthy as the WM's dynamics", "Needs a reliable success detector", "Limited task/robot distribution so far"],
   papers=["WorldEval (Li, Zhu et al. 2025)", "Ctrl-World (2025)"],
   learn=("WorldEval — project page", "https://worldeval.github.io/")),
 dict(id="nwm", name="Navigation World Models", short="Nav World Model", family="RobotWM", anim="plan",
   tagline="A 1B conditional diffusion transformer lets a robot 'think before walking' by simulating candidate routes.",
   simple=("NWM generates what the robot would see next given a candidate motion command. To navigate, it simulates "
           "several candidate trajectories, checks which imagined future actually reaches the goal, and picks the best "
           "— mentally rehearsing routes, absorbing new constraints at plan time ('avoid that side'). Notably "
           "generative — from LeCun's own collaborators. CVPR 2025 Best Paper Honorable Mention."),
   mapping="past egocentric frames + navigation actions → imagined futures → scored trajectory",
   math=r"a^*_{1:T}=\arg\min_{a_{1:T}}\mathbb{E}\big[\mathcal{C}(\hat s_T,g)\big],\quad\hat s_{t+1}\sim\mathrm{CDiT}_\theta(\cdot\mid\hat s_t,a_t)",
   when="Visual navigation with goal images, especially when constraints change at deployment.",
   pros=["New constraints without retraining", "Trained on human + robot egocentric video", "Open code and weights"],
   cons=["Diffusion rollouts make planning slow", "Hallucinates geometry in unknown places", "Navigation-only action space"],
   papers=["Navigation World Models (Bar et al., CVPR 2025)"],
   learn=("NWM — project page", "https://www.amirbar.net/nwm/")),

 # ---------------- World Models in VLAs ----------------
 dict(id="gr2", name="GR-1 / GR-2 (Video Pretraining)", short="GR-1 / GR-2", family="WMVLA", anim="worldaction",
   tagline="Pretrain on millions of web videos to predict the future, then fine-tune the same model to act.",
   simple=("GR-1 showed that a GPT-style model pretrained simply to predict future video frames learns a lot about how "
           "the world moves — and fine-tuning it on robot data makes a much better manipulation policy. GR-2 scaled to "
           "38M clips, predicting action AND future video together: ~98% success across 100+ tasks. The canonical "
           "'world model hiding inside a VLA' result."),
   mapping="language + image history → (predicted future video, robot action trajectory)",
   math=r"\mathcal{L}_{\text{ft}}=\mathbb{E}\big[\|\hat a_t-a_t\|^2+\lambda\|\hat x_{t+1}-x_{t+1}\|^2\big]\;\;\text{after web-video next-frame pretraining}",
   when="When robot demos are scarce but web video is endless — buy generalization with video-prediction pretraining.",
   pros=["Web video measurably transfers to manipulation", "Video head doubles as an inspectable plan", "100+ task scale"],
   cons=["Pixel prediction adds inference latency", "Closed weights", "Action quality tied to video quality"],
   papers=["GR-1 (Wu et al., ICLR 2024)", "GR-2 (Cheang et al. 2024)"],
   learn=("GR-2 — project page", "https://gr2-manipulation.github.io/")),
 dict(id="uva", name="Unified Video Action Model (UVA)", short="UVA", family="WMVLA", anim="worldaction",
   tagline="One joint latent for video and action; train as a world model, run as a real-time policy.",
   simple=("UVA learns a single latent from which two light diffusion heads decode either future video or future "
           "actions; masked training makes it simultaneously a policy, video predictor, forward- and inverse-dynamics "
           "model. The punchline: at inference you skip the expensive video decoding and just decode actions — "
           "world-model-grade representations at real-time policy speed."),
   mapping="observations (+ masked video/action targets) → joint latent → actions and/or future video",
   math=r"\mathcal{L}=\mathbb{E}\big[\|\epsilon_v-\epsilon^v_\theta(z)\|^2\big]+\mathbb{E}\big[\|\epsilon_a-\epsilon^a_\theta(z)\|^2\big],\quad z=f_\theta(o,\text{masked}(v,a))",
   when="Video-prediction pretraining benefits in a manipulation policy without video-generation latency.",
   pros=["Real-time inference despite generative training", "Policy + forward + inverse dynamics in one", "Strong multi-task results (RSS 2025)"],
   cons=["Joint latent trades video fidelity vs action precision", "Tabletop scale so far", "Masked-training engineering complexity"],
   papers=["Unified Video Action Model (Li, Gao, Sadigh, Song 2025)"],
   learn=("UVA — project page", "https://unified-video-action-model.github.io/")),
 dict(id="worldvla", name="WorldVLA", short="WorldVLA", family="WMVLA", anim="interleave",
   tagline="One autoregressive token stream for images, text, and actions — the VLA and the WM train each other.",
   simple=("WorldVLA tokenizes everything — images, text, robot actions — into one sequence and trains a single "
           "autoregressive model on two interleaved jobs: act (predict next action) and imagine (predict next image "
           "given the action). Each improves the other: the world-model half teaches physics, the action half grounds "
           "it. Successor RynnVLA-002 hits 97.4% on LIBERO."),
   mapping="interleaved image/text/action tokens → next action tokens (policy) or next image tokens (WM)",
   math=r"\max_\theta\sum_t\log p_\theta(u_t\mid u_{<t}),\quad u\in\{\text{img},\text{txt},\text{act}\};\quad\mathcal{L}=\mathcal{L}_{\text{VLA}}+\mathcal{L}_{\text{WM}}",
   when="One unified model that both controls the robot and predicts consequences, with mutual-enhancement gains.",
   pros=["WM objective measurably improves grasping (+4% LIBERO)", "One transformer, one loss family", "Open code/weights"],
   cons=["Discrete action tokens limit precision", "AR image generation slow for real-time imagination", "Needed a masking fix vs error snowballing"],
   papers=["WorldVLA (Cen et al. 2025)", "RynnVLA-002 (Alibaba DAMO 2025)"],
   learn=("WorldVLA — paper page", "https://huggingface.co/papers/2506.21539")),

 # ---------------- Positions & Debates ----------------
 dict(id="jepa-manifesto", name="LeCun's JEPA Manifesto", short="JEPA Manifesto", family="Position", anim="jepapred",
   tagline="The blueprint: agents learn world models by predicting in abstract representation space, not pixels.",
   simple=("LeCun's 2022 position paper argues an agent needs an internal world model to imagine outcomes before "
           "acting — and that predicting a compressed summary of the future (not the future itself) lets the model "
           "discard unpredictable detail like leaves in wind. A hierarchy of JEPAs would plan at multiple time scales. "
           "In 2026 the bet went all-in: AMI Labs raised a record $1.03B seed to build exactly this."),
   mapping="context x (+ latent z for uncertainty) → predicted embedding of target y",
   math=r"\mathcal{L}=D\big(\mathrm{Pred}_\phi(\mathrm{Enc}_\theta(x),z),\;\overline{\mathrm{Enc}}_\theta(y)\big)\;\;\text{(energy in embedding space)}",
   when="The conceptual frame whenever you're deciding between generating futures vs predicting their representations.",
   pros=["Latent prediction discards unpredictable detail", "Hierarchy promises multi-scale planning", "Unifies SSL, WMs, and planning"],
   cons=["A manifesto — hierarchy/configurator still unrealized", "Collapse must be fought with extra machinery", "Hard to evaluate (no samples)"],
   papers=["A Path Towards Autonomous Machine Intelligence (LeCun 2022)", "AMI Labs (2026)"],
   learn=("LeCun on AI that learns like animals — Meta AI", "https://ai.meta.com/blog/yann-lecun-advances-in-ai-research/")),
 dict(id="pan", name="PAN / Critiques of World Models", short="PAN (Critiques)", family="Position", anim="hierarchy",
   tagline="The CMU rebuttal: defend generative, hierarchical world models — an LLM backbone over latent levels.",
   simple=("The 'Critiques of World Models' paper pushes back on the JEPA camp: it defends generative loss grounded in "
           "observations and proposes PAN — a Physical, Agentic, Nested world model with an LLM at the top reasoning "
           "in language while generative latent levels below predict the physical world at multiple scales. The other "
           "pole of the field's central debate."),
   mapping="hierarchical state z^(ℓ): LLM at top, generative latent prediction below",
   math=r"p_\theta\big(z^{(\ell)}_{t+1}\mid z^{(\ell)}_{\le t},z^{(\ell+1)}\big),\;\;\ell=0..L",
   when="Surfacing the debates: pixels vs latents, generation vs prediction, is an LLM already a world model?",
   pros=["Sharpens the field's definitional debates", "Hierarchy reconciles symbols and physics", "Grounded in observable data"],
   cons=["Full technical system still pending", "Position paper, not a benchmark result", "Hierarchy is hard to train end-to-end"],
   papers=["Critiques of World Models (CMU 2025)", "Physics-IQ: do video models learn physics? (2025)"],
   learn=("Critiques of World Models — arXiv", "https://arxiv.org/abs/2507.05169")),
]

# Typed edges between paradigms, one 4-tuple per edge: (a, b, why, kind).
#   a, b : paradigm ids (the `id` field of the entries in P)
#   why  : one-line text describing how the two paradigms relate
#   kind : "v" = variant (dashed), "b" = builds-on (arrow a → b: b builds on a)
# build_data() keeps an edge only when both a and b are ids present in P;
# an edge naming an unknown id is dropped without any error.
EDGES = [
 # Latent lineage
 ("world-models", "planet", "end-to-end RSSM + latent-space planning", "b"),
 ("planet", "dreamer", "replace CEM planning with imagination actor-critic", "b"),
 ("dreamer", "daydreamer", "the same agent run on physical robots", "v"),
 ("dreamer", "dreamer4", "RSSM → scalable shortcut-forcing transformer", "b"),
 ("dreamer4", "genie3", "real-time interactive generation: for-policy vs as-product", "v"),
 # Value-equivalent
 ("muzero", "efficientzero", "+ self-supervised consistency for ~100x sample efficiency", "b"),
 ("muzero", "tdmpc", "decoder-free value-equivalent latents for continuous control", "b"),
 ("planet", "tdmpc", "latent-space MPC, with vs without reconstruction", "v"),
 # Token lineage
 ("world-models", "iris", "VAE→VQ tokens, RNN→transformer", "b"),
 ("iris", "gaia1", "next-token video world model scaled to driving", "v"),
 ("gaia1", "wham", "joint frame + action token streams", "v"),
 ("iris", "genie", "tokenized dynamics scaled to internet video + latent actions", "b"),
 # JEPA lineage
 ("jepa-manifesto", "ijepa", "first concrete realization of the JEPA recipe", "b"),
 ("ijepa", "vjepa2", "masked latent prediction: images → video at scale", "b"),
 ("vjepa2", "vjepa2ac", "an action-conditioned head makes it a controllable WM", "b"),
 ("dinowm", "vjepa2ac", "the plan-in-SSL-feature-space thesis", "v"),
 ("planet", "dinowm", "latent MPC revived in frozen pretrained features", "b"),
 ("ijepa", "lejepa", "provable anti-collapse replaces the EMA heuristics", "b"),
 ("jepa-manifesto", "pan", "the field's two poles: latents-only vs generative hierarchy", "v"),
 # Video diffusion
 ("sora", "cosmos", "the video DiT recipe pointed at Physical AI", "b"),
 ("sora", "gaia2", "latent video diffusion applied to driving", "b"),
 ("cosmos", "dreamgen", "the robotics data engine runs on Cosmos", "b"),
 # Interactive
 ("genie", "genie3", "latent actions → real-time promptable worlds", "b"),
 ("genie3", "matrixgame", "the open-source real-time counterpart", "v"),
 ("genie", "oasis", "the playable world model realized in real time", "b"),
 ("oasis", "matrixgame", "real-time AR diffusion: single game → general worlds", "b"),
 ("genie3", "marble", "streamed frames vs persistent 3D — rival answers to memory", "v"),
 # 3D
 ("marble", "hyworld", "closed product vs open-source 3D world stack", "v"),
 # Driving
 ("gaia1", "gaia2", "discrete tokens → multi-camera latent diffusion", "b"),
 ("gaia2", "vista", "the open-source action-conditioned driving WM", "v"),
 ("occworld", "driveocc", "occupancy WM wired into end-to-end planning", "b"),
 ("gaia2", "occworld", "pixel-space vs occupancy-space camps", "v"),
 ("genie3", "fleetsim", "Waymo's simulator is built on Genie 3", "b"),
 ("gaia2", "fleetsim", "generative scenario factories for AV", "v"),
 # Robot WM infra
 ("unisim", "dreamgen", "learned-simulator-as-data-engine, industrialized", "b"),
 ("unisim", "worldeval", "WM rollouts repurposed as policy evaluation", "b"),
 ("onexwm", "worldeval", "industry + academic WM evaluators", "v"),
 ("onexwm", "fleetsim", "fleet-video neural simulators for embodied eval", "v"),
 ("nwm", "driveocc", "simulate-candidates-then-score: pixels vs occupancy", "v"),
 ("vjepa2ac", "nwm", "goal-reaching by simulation: latents vs pixels", "v"),
 # WM in VLAs
 ("unisim", "gr2", "video generation as manipulation knowledge", "v"),
 ("gr2", "uva", "keep the video pretraining, remove the inference latency", "b"),
 ("gr2", "worldvla", "video pretraining → fully unified action world model", "b"),
 ("wham", "worldvla", "the interleaved obs+action blueprint, on robots", "b"),
 ("uva", "worldvla", "shared diffusion latent vs one discrete token stream", "v"),
]


def build_data():
    """Assemble the JSON-serialisable payload injected into the page template.

    Returns:
        A dict with four keys, in this order:

        * ``families``  - one record per FAMILIES entry (key / label / color
          plus empty ``desc``, ``equation`` and ``relations`` placeholders).
        * ``paradigms`` - one record per entry of P; ``learn`` is expanded
          from an indexable (title, url) pair into a ``{"title", "url"}`` dict.
        * ``edges``     - ``[src, dst, text, code]`` lists taken from EDGES,
          keeping only edges whose two endpoints are both paradigm ids.
        * ``mypapers``  - entries of ``my_papers.MY_PAPERS`` whose ``"node"``
          is either a paradigm id or a family key.
    """
    # Fields copied verbatim from each paradigm entry, in output order.
    copied_fields = (
        "id", "name", "short", "family", "anim",
        "tagline", "simple", "mapping", "math",
        "when", "pros", "cons", "papers",
    )

    families = []
    for fam_key, fam_label, fam_color in FAMILIES:
        families.append({
            "key": fam_key,
            "label": fam_label,
            "color": fam_color,
            "desc": "",
            "equation": "",
            "relations": [],
        })

    paradigms = []
    for entry in P:
        record = {field: entry[field] for field in copied_fields}
        record["learn"] = {"title": entry["learn"][0], "url": entry["learn"][1]}
        paradigms.append(record)

    known_ids = {record["id"] for record in paradigms}

    # Edges pointing at an id that has no paradigm entry are dropped.
    edges = []
    for src, dst, text, code in EDGES:
        if src in known_ids and dst in known_ids:
            edges.append([src, dst, text, code])

    # A paper may be attached either to a paradigm node or to a family node.
    attachable = known_ids | {fam["key"] for fam in families}
    mypapers = []
    for paper in my_papers.MY_PAPERS:
        if paper.get("node") in attachable:
            mypapers.append(paper)

    return {
        "families": families,
        "paradigms": paradigms,
        "edges": edges,
        "mypapers": mypapers,
    }


# ---------------------------------------------------------------------------
# World-model animation library (injected into the shared template's ANIM).
#
# The string holds JavaScript object-literal members (`name(c){...},`), so it
# is only meaningful once render() splices it in right after `const ANIM = {`.
# Each member builds and returns an SVG markup string; its argument `c` is
# interpolated into fill/stroke attributes. The member names are the same
# keys used for the captions in WM_CAP_JS.
#
# Elements whose position is animated with a delayed `begin` either carry
# explicit start coordinates or stay hidden until their motion starts, so
# nothing is drawn at the SVG origin while the animation is still pending.
# ---------------------------------------------------------------------------
WM_ANIM_JS = r"""
  latentroll(c){
    let s='<text x="14" y="18" font-size="9" fill="#93a0bd">frames</text>';
    for(let i=0;i<3;i++)s+='<rect x="'+(14+i*8)+'" y="'+(26+i*6)+'" width="40" height="30" rx="4" fill="#172036" stroke="#27406b"/>';
    s+='<path d="M62,50 L86,50" stroke="'+c+'" stroke-width="2"/><path d="M80,45 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<text x="92" y="24" font-size="9" fill="'+c+'">latent z rolls forward</text>';
    for(let i=0;i<5;i++){const x=98+i*34,seen=i<2;
      s+='<circle cx="'+x+'" cy="50" r="8" fill="'+(seen?c:'none')+'" stroke="'+c+'" stroke-width="2" opacity="'+(seen?'1':'0')+'">'+
        (seen?'':'<animate attributeName="opacity" values="0;0.9" dur="0.4s" begin="'+(0.6+i*0.45)+'s" fill="freeze"/>')+'</circle>';
      if(i<4)s+='<line x1="'+(x+8)+'" y1="50" x2="'+(x+26)+'" y2="50" stroke="'+c+'" stroke-width="1.4" stroke-dasharray="3 3" opacity="0.5"/>';}
    s+='<text x="98" y="96" font-size="9" fill="#93a0bd">tiny state · imagined future</text>';
    s+='<circle r="4" fill="#fff" style="filter:drop-shadow(0 0 5px '+c+')"><animateMotion dur="2.6s" repeatCount="indefinite" path="M98,50 L234,50"/></circle>';
    return s;
  },
  shortcutf(c){
    let s='<text x="14" y="20" font-size="9" fill="#93a0bd">two half-steps</text>';
    s+='<path d="M30,64 Q65,34 100,64" fill="none" stroke="#475569" stroke-width="2"/><path d="M100,64 Q135,34 170,64" fill="none" stroke="#475569" stroke-width="2"/>';
    [30,100,170].forEach(x=>s+='<circle cx="'+x+'" cy="64" r="5" fill="#475569"/>');
    s+='<text x="190" y="50" font-size="13" fill="'+c+'" font-weight="800">=</text>';
    s+='<text x="14" y="100" font-size="9" fill="'+c+'">one big step (shortcut)</text>';
    s+='<path d="M30,128 Q100,84 170,128" fill="none" stroke="'+c+'" stroke-width="3" stroke-dasharray="240" stroke-dashoffset="240"><animate attributeName="stroke-dashoffset" values="240;0;0" keyTimes="0;0.5;1" dur="2.6s" repeatCount="indefinite"/></path>';
    s+='<circle cx="30" cy="128" r="5" fill="'+c+'"/><circle cx="170" cy="128" r="5" fill="'+c+'"/>';
    s+='<text x="206" y="110" font-size="10" fill="#93a0bd">64 calls</text><text x="206" y="126" font-size="12" font-weight="800" fill="'+c+'">→ ~4</text><text x="206" y="142" font-size="9" fill="#93a0bd">real-time</text>';
    return s;
  },
  mcts(c){
    const root=[140,26];
    const L1=[[70,66],[140,66],[210,66]], L2=[[44,112],[96,112],[186,112],[234,112]];
    let s='<circle cx="'+root[0]+'" cy="'+root[1]+'" r="9" fill="'+c+'"/>';
    const segs=[[root,L1[0],0],[root,L1[1],0],[root,L1[2],1],[L1[0],L2[0],0],[L1[0],L2[1],0],[L1[2],L2[2],1],[L1[2],L2[3],0]];
    segs.forEach(([a,b,best],i)=>{s+='<line x1="'+a[0]+'" y1="'+a[1]+'" x2="'+b[0]+'" y2="'+b[1]+'" stroke="'+(best?c:'#475569')+'" stroke-width="'+(best?3:1.4)+'" opacity="0"><animate attributeName="opacity" values="0;'+(best?1:0.6)+'" dur="0.3s" begin="'+(0.3+i*0.22)+'s" fill="freeze"/></line>';});
    L1.concat(L2).forEach((p,i)=>{const best=(p===L1[2]||p===L2[2]);
      s+='<circle cx="'+p[0]+'" cy="'+p[1]+'" r="7" fill="'+(best?c:'#172036')+'" stroke="'+(best?c:'#475569')+'" stroke-width="1.5" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.3s" begin="'+(0.4+i*0.2)+'s" fill="freeze"/></circle>';});
    s+='<text x="186" y="138" text-anchor="middle" font-size="10" fill="'+c+'">v=0.9 ✓<animate attributeName="opacity" values="0;0;1" keyTimes="0;0.6;1" dur="2.6s" fill="freeze"/></text>';
    s+='<text x="44" y="138" text-anchor="middle" font-size="9" fill="#64748b">v=0.2</text>';
    return s;
  },
  tokenfilm(c){
    let s='<rect x="16" y="30" width="74" height="56" rx="5" fill="#172036" stroke="#27406b"/><text x="53" y="100" text-anchor="middle" font-size="9" fill="#93a0bd">frame t</text>';
    s+='<path d="M96,58 L116,58" stroke="'+c+'" stroke-width="2"/><path d="M110,53 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="122" y="30" width="74" height="56" rx="5" fill="#0a1426" stroke="'+c+'" stroke-width="1.5"/><text x="159" y="100" text-anchor="middle" font-size="9" fill="'+c+'">frame t+1, token by token</text>';
    for(let r=0;r<3;r++)for(let q=0;q<4;q++){const i=r*4+q;
      s+='<rect x="'+(126+q*17)+'" y="'+(34+r*16)+'" width="14" height="13" rx="2" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.85;0.85;0" keyTimes="0;0.08;0.9;1" dur="3.4s" begin="'+(i*0.18)+'s" repeatCount="indefinite"/></rect>';}
    s+='<rect x="206" y="44" width="58" height="28" rx="6" fill="#0c1326" stroke="'+c+'" stroke-width="1.5"/><text x="235" y="62" text-anchor="middle" font-size="9" fill="'+c+'">GPT</text>';
    return s;
  },
  jepapred(c){
    let s='<text x="16" y="18" font-size="9" fill="#93a0bd">context</text>';
    for(let r=0;r<3;r++)for(let q=0;q<3;q++){const m=(r===1&&q===2)||(r===2&&q===2);
      s+='<rect x="'+(16+q*24)+'" y="'+(26+r*24)+'" width="21" height="21" rx="3" fill="'+(m?'#0a1426':'#334155')+'" stroke="'+(m?c:'#27406b')+'" stroke-width="'+(m?1.6:1)+'" '+(m?'stroke-dasharray="3 3"':'')+'/>';}
    s+='<path d="M96,62 L128,62" stroke="'+c+'" stroke-width="2"/><path d="M122,57 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="134" y="44" width="64" height="36" rx="8" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="166" y="66" text-anchor="middle" font-size="10" fill="'+c+'">predict ẑ</text>';
    s+='<text x="216" y="40" font-size="9" fill="#93a0bd">target features</text>';
    for(let i=0;i<4;i++)s+='<rect x="'+(216+i*12)+'" y="48" width="9" height="'+(14+(i%3)*8)+'" rx="2" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.9" dur="0.4s" begin="'+(0.8+i*0.2)+'s" fill="freeze"/></rect>';
    s+='<text x="216" y="106" font-size="9" fill="'+c+'">not pixels — features</text>';
    s+='<text x="204" y="66" font-size="11" fill="'+c+'">≈</text>';
    return s;
  },
  latentmpc(c){
    let s='<circle cx="30" cy="74" r="8" fill="'+c+'"/><text x="30" y="98" text-anchor="middle" font-size="9" fill="#93a0bd">now (z)</text>';
    s+='<rect x="226" y="56" width="38" height="36" rx="5" fill="#172036" stroke="#fbbf24" stroke-width="2"/><text x="245" y="78" text-anchor="middle" font-size="11" fill="#fbbf24">★</text><text x="245" y="108" text-anchor="middle" font-size="9" fill="#fbbf24">goal img</text>';
    const cands=[['M30,74 C90,30 150,28 222,46','#475569','a₁'],['M30,74 C90,74 150,72 222,72',c,'a₂'],['M30,74 C90,120 150,120 222,104','#475569','a₃']];
    cands.forEach(([d,col,lab],i)=>{const best=col===c;
      s+='<path d="'+d+'" fill="none" stroke="'+col+'" stroke-width="'+(best?2.6:1.4)+'" stroke-dasharray="4 4" opacity="'+(best?1:0.55)+'"/>';
      // Dots have no base position, so keep each one hidden until its own motion begins (otherwise it shows at the SVG origin during the delay).
      for(let k=1;k<=3;k++){const t0=(k*0.3+i*0.1)+'s';
        s+='<circle r="3.5" fill="'+col+'" opacity="0"><set attributeName="opacity" to="'+(best?0.95:0.4)+'" begin="'+t0+'"/><animateMotion dur="2.2s" begin="'+t0+'" repeatCount="indefinite" path="'+d+'"/></circle>';}});
    s+='<text x="196" y="64" font-size="11" fill="'+c+'">✓ nearest</text>';
    return s;
  },
  gausreg(c){
    let s='<text x="20" y="18" font-size="9" fill="#93a0bd">embeddings</text>';
    const pts=[[40,40],[68,110],[52,78],[210,36],[238,96],[224,64],[90,52],[180,108],[120,34],[160,118],[104,98],[196,78]];
    // Explicit cx/cy show the scattered layout before the 0.5s begin; without them every dot sits at the SVG origin until the animation starts.
    pts.forEach((p,i)=>{s+='<circle cx="'+p[0]+'" cy="'+p[1]+'" r="4" fill="'+c+'"><animate attributeName="cx" values="'+p[0]+';'+(140+(p[0]-140)*0.28)+'" dur="1.6s" begin="0.5s" fill="freeze"/><animate attributeName="cy" values="'+p[1]+';'+(76+(p[1]-76)*0.28)+'" dur="1.6s" begin="0.5s" fill="freeze"/></circle>';});
    s+='<circle cx="140" cy="76" r="34" fill="none" stroke="'+c+'" stroke-width="1.6" stroke-dasharray="5 5" opacity="0"><animate attributeName="opacity" values="0;0.9" dur="0.5s" begin="1.9s" fill="freeze"/></circle>';
    s+='<text x="140" y="132" text-anchor="middle" font-size="10" fill="'+c+'" opacity="0">N(0, I) — collapse impossible<animate attributeName="opacity" values="0;1" dur="0.5s" begin="2.1s" fill="freeze"/></text>';
    return s;
  },
  viddiffuse(c){
    let s='';
    for(let i=0;i<4;i++){const x=18+i*58;
      s+='<rect x="'+x+'" y="36" width="50" height="56" rx="5" fill="#172036" stroke="#27406b"/>';
      for(let n=0;n<10;n++){const nx=x+5+((n*17)%40),ny=42+((n*13)%44);
        s+='<rect x="'+nx+'" y="'+ny+'" width="4" height="4" fill="#64748b"><animate attributeName="opacity" values="1;0" dur="0.6s" begin="'+(0.4+i*0.5)+'s" fill="freeze"/></rect>';}
      s+='<circle cx="'+(x+25)+'" cy="64" r="'+(8+i*3)+'" fill="none" stroke="'+c+'" stroke-width="2.4" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.5s" begin="'+(0.5+i*0.5)+'s" fill="freeze"/></circle>';}
    s+='<text x="18" y="24" font-size="9" fill="#93a0bd">noise → video, frame by frame</text>';
    s+='<rect x="18" y="108" width="118" height="20" rx="6" fill="#0c1326" stroke="'+c+'"/><text x="77" y="122" text-anchor="middle" font-size="9" fill="'+c+'">"rainy street" + action</text>';
    return s;
  },
  lam(c){
    let s='<rect x="16" y="30" width="64" height="48" rx="5" fill="#172036" stroke="#27406b"/><circle cx="36" cy="58" r="7" fill="#475569"/>';
    s+='<rect x="200" y="30" width="64" height="48" rx="5" fill="#172036" stroke="#27406b"/><circle cx="244" cy="50" r="7" fill="#475569"/>';
    s+='<text x="48" y="94" text-anchor="middle" font-size="9" fill="#93a0bd">frame t</text><text x="232" y="94" text-anchor="middle" font-size="9" fill="#93a0bd">frame t+1</text>';
    s+='<text x="140" y="48" text-anchor="middle" font-size="14" fill="'+c+'">?<animate attributeName="opacity" values="1;1;0;0" keyTimes="0;0.4;0.5;1" dur="3s" repeatCount="indefinite"/></text>';
    s+='<rect x="118" y="36" width="44" height="24" rx="6" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0;1;1" keyTimes="0;0.5;0.6;1" dur="3s" repeatCount="indefinite"/></rect>';
    s+='<text x="140" y="52" text-anchor="middle" font-size="10" font-weight="700" fill="#06121f" opacity="0">ã = →<animate attributeName="opacity" values="0;0;1;1" keyTimes="0;0.5;0.6;1" dur="3s" repeatCount="indefinite"/></text>';
    s+='<path d="M84,54 L114,48" stroke="'+c+'" stroke-width="1.4" stroke-dasharray="3 3" opacity="0.6"/><path d="M166,48 L196,54" stroke="'+c+'" stroke-width="1.4" stroke-dasharray="3 3" opacity="0.6"/>';
    s+='<text x="140" y="120" text-anchor="middle" font-size="9" fill="'+c+'">hidden action inferred — no labels</text>';
    return s;
  },
  playable(c){
    let s='<rect x="86" y="22" width="120" height="76" rx="7" fill="#0a1426" stroke="'+c+'" stroke-width="2"/>';
    s+='<circle cx="126" cy="70" r="9" fill="'+c+'"><animate attributeName="cx" values="126;166;126" dur="2.4s" repeatCount="indefinite"/></circle>';
    s+='<path d="M96,88 L196,88" stroke="#27406b" stroke-width="2"/>';
    s+='<rect x="210" y="34" width="42" height="16" rx="4" fill="#172036" stroke="#27406b"/><text x="231" y="46" text-anchor="middle" font-size="8.5" fill="'+c+'">24 fps</text>';
    const keys=[['◀',104],['▼',128],['▶',152]];
    keys.forEach(([k,x],i)=>{const on=i===2;
      s+='<rect x="'+x+'" y="112" width="22" height="20" rx="4" fill="'+(on?c:'#172036')+'" stroke="'+(on?c:'#475569')+'">'+(on?'<animate attributeName="opacity" values="1;0.4;1" dur="1.2s" repeatCount="indefinite"/>':'')+'</rect>'+
        '<text x="'+(x+11)+'" y="126" text-anchor="middle" font-size="10" fill="'+(on?'#06121f':'#93a0bd')+'">'+k+'</text>';});
    s+='<path d="M150,108 L150,100" stroke="'+c+'" stroke-width="1.6"/><path d="M146,104 l4,-6 l4,6" fill="none" stroke="'+c+'" stroke-width="1.6"/>';
    s+='<text x="36" y="66" font-size="9" fill="#93a0bd">every frame</text><text x="36" y="78" font-size="9" fill="#93a0bd">generated live</text>';
    return s;
  },
  splat3d(c){
    let s='<text x="20" y="18" font-size="9" fill="#93a0bd">persistent 3D scene</text>';
    const pts=[[120,60,6],[150,52,5],[176,66,7],[136,82,5],[162,88,6],[110,76,4],[188,84,4],[148,70,8]];
    pts.forEach(p=>{s+='<ellipse cx="'+p[0]+'" cy="'+p[1]+'" rx="'+(p[2]*1.6)+'" ry="'+p[2]+'" fill="'+c+'" opacity="0.5"/>';});
    s+='<ellipse cx="148" cy="100" rx="58" ry="10" fill="none" stroke="#27406b"/>';
    s+='<g><circle r="6" fill="#fff" style="filter:drop-shadow(0 0 5px '+c+')"><animateMotion dur="4s" repeatCount="indefinite" path="M148,128 C210,128 230,72 148,44 C66,72 86,128 148,128"/></circle></g>';
    s+='<text x="148" y="144" text-anchor="middle" font-size="9" fill="'+c+'">camera orbits — the world never changes</text>';
    return s;
  },
  dataengine(c){
    let s='<rect x="16" y="48" width="48" height="36" rx="5" fill="#172036" stroke="'+c+'" stroke-width="1.6"/><text x="40" y="98" text-anchor="middle" font-size="9" fill="#93a0bd">few demos</text>';
    for(let i=0;i<5;i++){const y=20+i*24;
      s+='<rect x="112" y="'+y+'" width="40" height="18" rx="4" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.75" dur="0.4s" begin="'+(0.4+i*0.25)+'s" fill="freeze"/></rect>'+
        '<line x1="66" y1="66" x2="110" y2="'+(y+9)+'" stroke="'+c+'" stroke-width="1" opacity="0.35"/>';}
    s+='<text x="132" y="148" text-anchor="middle" font-size="9" fill="'+c+'">dreamed variations</text>';
    s+='<path d="M156,70 L186,70" stroke="'+c+'" stroke-width="2"/><path d="M180,65 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<text x="196" y="56" font-size="8.5" fill="#93a0bd">IDM → actions</text>';
    s+='<circle cx="226" cy="84" r="17" fill="#0b1326" stroke="'+c+'" stroke-width="2.5"/><text x="226" y="89" text-anchor="middle" font-size="11" fill="'+c+'">π</text>';
    return s;
  },
  evalwm(c){
    let s='<circle cx="34" cy="50" r="14" fill="#0b1326" stroke="'+c+'" stroke-width="2.4"/><text x="34" y="55" text-anchor="middle" font-size="10" fill="'+c+'">π</text>';
    s+='<path d="M52,50 L74,50" stroke="'+c+'" stroke-width="2"/><path d="M68,45 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="80" y="28" width="100" height="46" rx="10" fill="#0f1c33" stroke="'+c+'" stroke-width="1.6"/><text x="130" y="50" text-anchor="middle" font-size="9.5" fill="'+c+'">world model</text><text x="130" y="64" text-anchor="middle" font-size="8.5" fill="#93a0bd">imagined rollout</text>';
    s+='<path d="M184,50 L206,50" stroke="'+c+'" stroke-width="2"/><path d="M200,45 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    const bars=[['A',46,'#34d399'],['B',30,c],['C',14,'#fb7185']];
    bars.forEach(([t,h,col],i)=>{const x=214+i*20;
      s+='<rect x="'+x+'" y="'+(76-h)+'" width="14" height="0" fill="'+col+'" rx="2"><animate attributeName="height" values="0;'+h+'" dur="0.6s" begin="'+(0.8+i*0.2)+'s" fill="freeze"/><animate attributeName="y" values="76;'+(76-h)+'" dur="0.6s" begin="'+(0.8+i*0.2)+'s" fill="freeze"/></rect>'+
        '<text x="'+(x+7)+'" y="88" text-anchor="middle" font-size="8.5" fill="#93a0bd">'+t+'</text>';});
    s+='<text x="140" y="120" text-anchor="middle" font-size="9" fill="'+c+'">rank checkpoints before touching a robot</text>';
    return s;
  },
  interleave(c){
    let s='<text x="20" y="22" font-size="9" fill="#93a0bd">one token stream</text>';
    const seq=['img','act','img','act','img','act'];
    seq.forEach((t,i)=>{const x=18+i*42,isA=t==='act';
      s+='<rect x="'+x+'" y="44" width="36" height="30" rx="6" fill="'+(isA?c:'#172036')+'" stroke="'+(isA?c:'#475569')+'" stroke-width="1.5" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.3s" begin="'+(i*0.35)+'s" fill="freeze"/></rect>'+
        '<text x="'+(x+18)+'" y="63" text-anchor="middle" font-size="9" font-weight="700" fill="'+(isA?'#06121f':'#93a0bd')+'" opacity="0">'+t+'<animate attributeName="opacity" values="0;1" dur="0.3s" begin="'+(i*0.35)+'s" fill="freeze"/></text>';});
    s+='<path d="M54,96 C90,116 120,116 140,98" fill="none" stroke="'+c+'" stroke-width="1.4" stroke-dasharray="4 4" opacity="0.7"/>';
    s+='<text x="100" y="132" text-anchor="middle" font-size="9" fill="'+c+'">act ↔ imagine teach each other</text>';
    return s;
  },
"""

# One-line caption per animation key, rendered as consecutive `key:"caption",`
# members (no separators, trailing comma kept) for splicing into ANIM_CAP.
WM_CAP_JS = "".join(
    '%s:"%s",' % pair
    for pair in (
        ("latentroll", "Frames compress to a tiny latent that rolls the future forward."),
        ("shortcutf", "One big denoising step learns to match two half-steps."),
        ("mcts", "A search tree grows; the best-scoring branch is chosen."),
        ("tokenfilm", "The next frame is predicted token by token, like words."),
        ("jepapred", "Predict the missing piece's features — never its pixels."),
        ("latentmpc", "Candidate actions roll latent futures; pick the one nearest the goal."),
        ("gausreg", "Embeddings are shaped into a clean Gaussian ball — collapse impossible."),
        ("viddiffuse", "Noise sharpens into a controllable video of the future."),
        ("lam", "Hidden 'buttons' are inferred from raw video — actions without labels."),
        ("playable", "Every frame is generated live in response to your keys."),
        ("splat3d", "The world is built as persistent 3D — look away, it's still there."),
        ("dataengine", "A few real demos are dreamed into thousands of training clips."),
        ("evalwm", "Run a policy inside the dream; rank it before touching a robot."),
        ("interleave", "One token stream carries frames AND actions; each helps the other."),
    )
)


def render():
    """Return the complete world-model landscape page as an HTML string.

    Starts from ``gen_landscape.TEMPLATE`` and, in order:
      1. splices WM_ANIM_JS in right after the first ``const ANIM = {``,
      2. splices WM_CAP_JS in right after the first ``const ANIM_CAP = {``,
      3. swaps the robot-landscape title, heading and map labels for the
         world-model ones (best-effort: absent strings are simply left alone),
      4. substitutes ``__DATA_JSON__`` with the JSON dump of build_data().

    The data is substituted last so the retitling replacements above can never
    rewrite text that happens to appear inside the injected JSON.

    Raises:
        ValueError: if the template lacks one of the three structural
            injection points. ``str.replace`` would otherwise do nothing and
            the generator would quietly emit a page without its animations,
            captions or data.
    """
    t = gen_landscape.TEMPLATE

    # Fail loudly on template drift instead of writing a silently broken page.
    required_anchors = ("const ANIM = {", "const ANIM_CAP = {", "__DATA_JSON__")
    missing = [anchor for anchor in required_anchors if anchor not in t]
    if missing:
        raise ValueError(
            "gen_landscape.TEMPLATE is missing injection point(s): "
            + ", ".join(repr(anchor) for anchor in missing)
        )

    t = t.replace("const ANIM = {", "const ANIM = {\n" + WM_ANIM_JS, 1)
    t = t.replace("const ANIM_CAP = {", "const ANIM_CAP = {\n  " + WM_CAP_JS + "\n", 1)
    # Cosmetic retitling; these anchors are optional.
    t = t.replace("<title>Robot Learning Landscape</title>", "<title>World Model Landscape</title>")
    t = t.replace("🤖 Robot Learning Landscape", "🌍 World Model Landscape")
    t = t.replace(">🤖</text>", ">🌍</text>")
    t = t.replace(">POLICY</text>", ">WORLD</text>")
    t = t.replace("__DATA_JSON__", json.dumps(build_data()))
    return t


def main():
    """Render the page, write it to robot_worldmodel.html, and print a summary.

    The summary line reports the size of the written HTML in characters and
    the number of families, paradigms and edges in a fresh build_data() result.
    """
    page = render()
    with open("robot_worldmodel.html", "w", encoding="utf-8") as out:
        out.write(page)

    payload = build_data()
    section_sizes = tuple(len(payload[name]) for name in ("families", "paradigms", "edges"))
    summary = ("wrote robot_worldmodel.html (%d chars) — %d families, %d paradigms, %d edges"
               % ((len(page),) + section_sizes))
    print(summary)


if __name__ == "__main__":
    main()