"""Generate a self-contained interactive 'Robot Learning Landscape' page. Pulls the researched paradigm data straight out of robot_paradigms_app.py so the visualization stays in sync, then injects it into a static HTML template that provides: a constellation map, Keynote-style Magic-Move detail panels (View Transitions API), and a small looping SMIL animation per paradigm. Run: .venv_robot_paradigms/bin/python gen_landscape.py Out: robot_landscape.html """ import json import html as html_lib import robot_paradigms_app as app import my_papers # --------------------------------------------------------------------------- # 1. Family display labels (nice names for the 10 hubs) # --------------------------------------------------------------------------- FAMILY_LABEL = { "BC": "Imitation Learning", "Reinforcement": "Reinforcement Learning", "Offline RL": "Offline RL", "Inverse RL": "Inverse RL (Imitation)", "Model-Based": "World Models", "Sequence": "Sequence Models", "Goal-Cond.": "Goal-Conditioned", "Hierarchical": "Hierarchical", "Meta-Learning": "Meta-Learning", "LLM-Orchestration": "LLM / VLM", } # --------------------------------------------------------------------------- # 2. Per-paradigm enrichment authored for a beginner audience: # short = label shown on the map # simple = plain-English "what the robot is doing" (high-school level) # anim = which mini-animation archetype to play # --------------------------------------------------------------------------- ENRICH = { "flow-matching-policy": dict(short="Flow Matching", anim="flow", simple=( "Imagine smoothly steering a dot from a random scribble to the exact move an " "expert made, following the straightest possible path. 
The robot learns that " "“steering field,” so it can turn noise into a precise action in just a " "few steps.")), "diffusion-policy": dict(short="Diffusion", anim="denoise", simple=( "Start with pure static (random noise) and clean it up step by step until a " "smooth, sensible action appears — like sharpening a blurry photo. Because it " "imagines many possibilities, it can pick left OR right around an obstacle instead " "of averaging into a crash.")), "tokenized-bc": dict(short="Tokenized BC", anim="tokens", simple=( "Chop each action into little pieces and predict them one-by-one, exactly like a " "chatbot predicts the next word. This lets a robot reuse all the machinery of a " "language model to act.")), "energy-based-bc": dict(short="Energy / Implicit", anim="energy", simple=( "Give every possible action a “score,” then roll downhill to the " "best-scoring one — like a marble settling into the lowest dip of a hilly " "landscape.")), "value-based-rl": dict(short="Q-Learning", anim="qlearn", simple=( "Try things, keep a running estimate of how good each move turns out, and always " "pick the move with the best expected payoff. It learns purely from rewards and " "mistakes — no teacher needed.")), "policy-gradient-rl": dict(short="Policy Gradient", anim="pgrad", simple=( "Do the task many times; nudge the behavior a little toward whatever earned more " "reward and away from what earned less. Slowly the robot’s habits get " "better.")), "off-policy-ac": dict(short="Actor-Critic", anim="actorcritic", simple=( "Two parts team up: an “actor” that acts and a “critic” that " "grades each action. The critic’s grades teach the actor to act better — " "and it can re-use old replayed experience.")), "offline-rl": dict(short="Offline RL", anim="offline", simple=( "Learn only from a fixed recording of past behavior — no live robot, no new " "tries. 
The trick is to stay close to what’s in the recording so the robot " "doesn’t bet on moves it never saw work.")), "maxent-irl": dict(short="MaxEnt IRL", anim="rewardmap", simple=( "Watch an expert and figure out the hidden “reward” that would explain " "why they did what they did — reverse-engineering their goal from their " "behavior.")), "gail": dict(short="GAIL", anim="adversarial", simple=( "Two networks play a game: one tries to act like the expert, the other tries to " "spot the fake. As the spotter gets sharper, the imitator is forced to become " "indistinguishable from the real expert.")), "forward-dynamics-mpc": dict(short="MPC", anim="plan", simple=( "Learn a “what happens if…” simulator, imagine several action plans " "a few steps ahead, and execute the one that looks best — then re-plan at the " "next moment.")), "latent-imagination": dict(short="Dreamer", anim="dream", simple=( "Build a compact “mental world,” then practice thousands of times inside " "that daydream instead of the real world — fast and safe — and carry the " "learned skill back to reality.")), "generative-video-wm": dict(short="Video WM", anim="videopred", simple=( "Predict the future as a short video: given what it sees now, the robot pictures " "what will happen next, frame by frame.")), "action-conditioned-wm": dict(short="Action-Cond WM", anim="videoaction", simple=( "Ask “if I move like this, what will I see?” The robot predicts the " "future video that each candidate action would cause, then picks the best one.")), "world-action-model": dict(short="World-Action", anim="worldaction", simple=( "Imagine the future video of the task AND read off the actions needed to make that " "future happen — dreaming the plan and the moves together.")), "occupancy-latent-wm": dict(short="Occupancy WM", anim="occupancy", simple=( "Instead of a full video, predict a simple map of where stuff will be (occupied vs " "free space) so the robot can plan safe motions.")), "decision-transformer": 
dict(short="Decision Transf.", anim="returncond", simple=( "Tell the robot the score you want (“get 100 points”) and it writes out " "the sequence of actions likely to hit that score — like autocompleting a " "winning playthrough.")), "trajectory-diffusion": dict(short="Traj. Diffusion", anim="denoise", simple=( "Sketch a whole path out of noise and clean it up all at once into a smooth plan, " "then gently steer that plan toward a goal.")), "goal-conditioned": dict(short="Goal + HER", anim="goalrelabel", simple=( "Tell the robot where to end up and it aims for that goal. When it misses, it " "pretends “wherever I landed” was the goal all along — so even " "failures become useful lessons.")), "hrl": dict(short="Hierarchical", anim="hierarchy", simple=( "A “manager” sets mini-goals (go here, then there) and a “worker” " "figures out the small moves to reach each one — splitting a big task into " "easy chunks.")), "meta-learning": dict(short="Meta-Learning", anim="meta", simple=( "Practice on many different tasks so the robot learns HOW to learn — then it " "can pick up a brand-new task after just a few tries.")), "llm-planner": dict(short="LLM Planner", anim="llmplan", simple=( "A language model reads the instruction, breaks it into a step-by-step plan (or " "even writes code), and calls ready-made skills to carry it out — no " "trial-and-error training.")), "vlm-affordance": dict(short="VLM Affordance", anim="affordance", simple=( "A vision-language model looks at the scene and marks WHERE and HOW to act (grab " "here, push there), turning a picture into a usable plan.")), } # --------------------------------------------------------------------------- # 3. Interconnections between paradigms (the bridges). 
#    Each entry is a 4-tuple: (idA, idB, why, kind).
# ---------------------------------------------------------------------------
# (a, b, why, kind):
#   "v" = variant   — same core idea, different flavor (dashed, undirected)
#   "b" = builds-on — arrow a → b means "b builds on / uses a" (a underlies b)
# NOTE(review): the 4th field, not the section heading an entry sits under,
# carries the kind — a few entries below differ from their heading.
EDGES = [
    # ---- variants (same core idea, different flavor) ----
    ("diffusion-policy", "trajectory-diffusion", "same denoising idea (action vs. whole path)", "v"),
    ("diffusion-policy", "flow-matching-policy", "generative action heads", "v"),
    ("flow-matching-policy", "tokenized-bc", "the three VLA action heads", "v"),
    ("diffusion-policy", "tokenized-bc", "the three VLA action heads", "v"),
    ("tokenized-bc", "llm-planner", "reuse the language-model / token stack", "v"),
    ("offline-rl", "decision-transformer", "return-conditioned offline RL (sequence modeling)", "b"),
    ("decision-transformer", "goal-conditioned", "condition on a target", "v"),
    ("decision-transformer", "tokenized-bc", "sequence models of actions", "v"),
    ("offline-rl", "diffusion-policy", "stay close to the data = imitate it", "v"),
    ("gail", "maxent-irl", "recover / use a reward signal", "v"),
    ("latent-imagination", "forward-dynamics-mpc", "plan inside a learned model", "v"),
    ("forward-dynamics-mpc", "occupancy-latent-wm", "a model used for planning", "v"),
    ("generative-video-wm", "action-conditioned-wm", "adds action-controllable prediction", "b"),
    ("action-conditioned-wm", "world-action-model", "actions + video together", "v"),
    ("vlm-affordance", "llm-planner", "language / vision planning, no policy gradient", "v"),
    ("hrl", "goal-conditioned", "sub-goals are just goals", "v"),
    ("hrl", "llm-planner", "high-level decomposition", "v"),
    ("classical-mpc", "forward-dynamics-mpc", "same optimizer — known vs. learned model", "v"),
    # ---- builds-on / enables (arrow a → b: "b builds on a") ----
    ("off-policy-ac", "offline-rl", "many offline methods extend off-policy AC (+ conservatism)", "b"),
    ("policy-gradient-rl", "gail", "GAIL trains its imitator with RL", "b"),
    ("maxent-irl", "value-based-rl", "infer the reward, then run RL on it", "b"),
    ("policy-gradient-rl", "latent-imagination", "trains the policy inside the imagined model", "b"),
    ("diffusion-policy", "world-action-model", "share diffusion/flow generative machinery", "v"),
    ("policy-gradient-rl", "meta-learning", "adapts with a few gradient steps", "b"),
    ("off-policy-ac", "goal-conditioned", "HER rides on off-policy RL", "b"),
    # ---- classical control underlies the learning methods ----
    ("lqr", "value-based-rl", "classical optimal-control precursor to RL", "b"),
    ("pid-control", "off-policy-ac", "residual RL learns on top of a controller", "b"),
    ("motion-planning", "llm-planner", "the LLM calls a classical planner", "b"),
    ("classical-mpc", "vlm-affordance", "VLM cost + a classical trajectory optimizer", "b"),
]

# ---------------------------------------------------------------------------
# 4. Classical / traditional control — NOT in the learning app, added here so
#    the map shows the full robot-control landscape (the non-learning bedrock).
# ---------------------------------------------------------------------------
CLASSICAL_FAMILY = dict(
    key="Classical",
    label="Classical Control",
    color="#64748b",
    desc=("Model-based control & planning with no learning — the engineering "
          "bedrock robots still run on, and what most learned methods sit on top of."),
)

# Every entry carries the same field set (id, name, short, family, anim, tagline,
# simple, mapping, math, when, pros, cons, papers); `math` is a raw LaTeX string.
CLASSICAL_PARADIGMS = [
    dict(
        id="pid-control",
        name="PID / Feedback Control",
        short="PID",
        family="Classical",
        anim="pid",
        tagline="Push proportionally to the error — the workhorse of control.",
        simple=("Measure how far you are from the target, and correct in proportion to that "
                "error — plus a bit for accumulated error (I) and how fast it’s changing (D). "
                "No model, no learning, just feedback. It’s the inner loop under almost everything."),
        mapping="error e(t) → control u(t)",
        math=r"u(t)=K_p\,e(t)+K_i\!\int_0^t\! e(\tau)\,d\tau+K_d\,\dot e(t)",
        when="Low-level motor/joint control, and the inner loop beneath higher-level planners.",
        pros=["Dead simple, needs no model", "Ubiquitous and robust", "Easy to tune"],
        cons=["No foresight or constraints", "Struggles with nonlinear / coupled / delayed systems",
              "Gains are hand-tuned"],
        papers=["Ziegler–Nichols tuning (1942)", "classical control theory"],
    ),
    dict(
        id="lqr",
        name="LQR / Optimal Control",
        short="LQR",
        family="Classical",
        anim="lqr",
        tagline="The provably optimal linear feedback gain.",
        simple=("If your system is roughly linear, you can solve for the single best feedback "
                "gain that minimizes a cost trading off staying on target vs. control effort. "
                "It’s the optimal-control ancestor of value-based RL."),
        mapping="state x → control u = −Kx",
        math=r"u=-Kx,\quad K=R^{-1}B^\top P,\quad A^\top P+PA-PBR^{-1}B^\top P+Q=0",
        when="Stabilization/tracking for systems you can linearize; a baseline and building block (iLQR, LQG).",
        pros=["Provably optimal for linear-quadratic problems", "Closed-form and fast",
              "Foundation for iLQR / LQG / MPC"],
        cons=["Assumes linear dynamics + quadratic cost", "No hard constraints", "Needs a model"],
        papers=["Kalman (1960), optimal control / LQG"],
    ),
    dict(
        id="classical-mpc",
        name="Model-Predictive Control / Trajectory Optimization",
        short="MPC / TrajOpt",
        family="Classical",
        anim="trajopt",
        tagline="Optimize controls over a horizon with a known model; re-plan each step.",
        simple=("Using known physics, optimize a short sequence of future controls to minimize "
                "cost while respecting constraints (limits, obstacles), execute the first one, then "
                "re-optimize at the next step. This is the same machinery learned-model MPC uses — "
                "here the model is hand-derived physics."),
        mapping="model f + cost ℓ → optimal u_{0:H}",
        math=r"\min_{u_{0:H}}\sum_{t=0}^{H}\ell(x_t,u_t)\quad\text{s.t.}\quad x_{t+1}=f(x_t,u_t),\;\;g(x_t,u_t)\le 0",
        when="When you have a decent model and need constraint handling + foresight (legged locomotion, driving, arms).",
        pros=["Handles constraints and foresight", "Uses known physics", "Re-planning adds robustness"],
        cons=["Needs an accurate model", "Online optimization is expensive",
              "Hard for contact-rich / uncertain dynamics"],
        papers=["iLQR (Todorov 2005)", "CHOMP / TrajOpt", "MPC literature"],
    ),
    dict(
        id="motion-planning",
        name="Motion Planning (Sampling / Search)",
        short="Motion Planning",
        family="Classical",
        anim="planning",
        tagline="Search the free space for a collision-free path.",
        simple=("Find a collision-free path from start to goal by sampling random configurations "
                "and connecting them (RRT/PRM) or searching a grid/graph (A*). It’s about geometry "
                "and feasibility, not learning — and it’s often the executor under an LLM planner."),
        mapping="start, goal, obstacles → collision-free path τ",
        math=r"\text{find }\tau: q_{\text{start}}\to q_{\text{goal}}\;\;\text{s.t.}\;\;\tau(s)\in\mathcal{C}_{\text{free}}\;\;\forall s",
        when="Navigation and arm motion through known obstacle fields.",
        pros=["Completeness / optimality guarantees (A*, RRT*)", "No training data needed",
              "Mature and reliable"],
        cons=["Needs a known map / geometry", "Struggles with high-dim contact + dynamics",
              "Replanning cost in clutter"],
        papers=["RRT (LaValle 1998)", "PRM (Kavraki 1996)", "A* (Hart 1968)"],
    ),
]

# ---------------------------------------------------------------------------
# 5. Best "learn more" explainer per branch (verified reachable June 2026).
#    (title, url) — canonical project pages / respected blogs / standard texts.
# --------------------------------------------------------------------------- LEARN = { "flow-matching-policy": ("π₀ flow-matching VLA — HuggingFace", "https://huggingface.co/blog/pi0"), "diffusion-policy": ("Diffusion Policy — project page", "https://diffusion-policy.cs.columbia.edu/"), "tokenized-bc": ("OpenVLA — project page", "https://openvla.github.io/"), "energy-based-bc": ("Implicit BC — project page", "https://implicitbc.github.io/"), "decision-transformer": ("Decision Transformer — project page", "https://sites.google.com/berkeley.edu/decision-transformer"), "trajectory-diffusion": ("Diffuser: Planning with Diffusion", "https://diffusion-planning.github.io/"), "value-based-rl": ("Deep Q-Learning — HF Deep RL Course", "https://huggingface.co/learn/deep-rl-course/unit3/introduction"), "policy-gradient-rl": ("Policy Gradients & PPO — Arxiv Insights (video)", "https://www.youtube.com/watch?v=5P7I-xPq8u8"), "off-policy-ac": ("DDPG & SAC — Pieter Abbeel (video)", "https://www.youtube.com/watch?v=pg-lKy7JIRk"), "offline-rl": ("Offline RL — BAIR blog", "https://bair.berkeley.edu/blog/2020/12/07/offline/"), "maxent-irl": ("What Is Inverse RL? 
— The Gradient", "https://thegradient.pub/learning-from-humans-what-is-inverse-reinforcement-learning/"), "gail": ("GAIL Imitation Learning (video)", "https://www.youtube.com/watch?v=E-lfhLiXiBc"), "forward-dynamics-mpc": ("Model-Based RL + MPC — BAIR", "https://bair.berkeley.edu/blog/2017/11/30/model-based-rl/"), "latent-imagination": ("Dreamer — Danijar Hafner", "https://danijar.com/project/dreamer/"), "generative-video-wm": ("World Models — interactive", "https://worldmodels.github.io/"), "action-conditioned-wm": ("V-JEPA 2 World Model — Meta", "https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks/"), "world-action-model": ("Unified Video Action Model", "https://unified-video-action-model.github.io/"), "occupancy-latent-wm": ("Drive-OccWorld — occupancy WM", "https://drive-occworld.github.io/"), "goal-conditioned": ("Hindsight Experience Replay — Two Minute Papers (video)", "https://www.youtube.com/watch?v=Dvd1jQe3pq0"), "hrl": ("Hierarchical RL — The Gradient", "https://thegradient.pub/the-promise-of-hierarchical-reinforcement-learning/"), "meta-learning": ("Interactive Intro to MAML", "https://interactive-maml.github.io/maml.html"), "llm-planner": ("Code as Policies — project page", "https://code-as-policies.github.io/"), "vlm-affordance": ("VoxPoser — project page", "https://voxposer.github.io/"), "pid-control": ("PID Control — Brian Douglas (video)", "https://www.youtube.com/watch?v=wkfEZmsQqiA"), "lqr": ("LQR Optimal Control — Brian Douglas (video)", "https://www.youtube.com/watch?v=E_RDCFOlJx4"), "classical-mpc": ("Model Predictive Control — MATLAB (video)", "https://www.youtube.com/watch?v=cEWnixjNdzs"), "motion-planning": ("A* Pathfinding — Red Blob Games (interactive)", "https://www.redblobgames.com/pathfinding/a-star/introduction.html"), } def build_data(): families = [] for k in app.FAMILY: families.append(dict( key=k, label=FAMILY_LABEL.get(k, k), color=app.FAMILY[k][0], desc=app.FAMILY[k][1], equation=app.FAMILY_EQUATIONS.get(k, ""), 
relations=[dict(name=n, eq=e) for (n, e, _note) in app.FAMILY_RELATIONS.get(k, [])], )) cf = dict(CLASSICAL_FAMILY); cf["equation"] = ""; cf["relations"] = [] families.append(cf) for f in EXTRA_FAMILIES: families.append(dict(f, desc="", equation="", relations=[])) paradigms = [] for p in app.PARADIGMS: e = ENRICH.get(p["id"], {}) paradigms.append(dict( id=p["id"], name=p["name"], short=e.get("short", p["name"]), family=p["family"], tagline=p["tagline"], simple=e.get("simple", p["intuition"]), anim=e.get("anim", "denoise"), mapping=p.get("mapping", ""), math=p.get("math", ""), when=p.get("when", ""), pros=p.get("pros", []), cons=p.get("cons", []), papers=p.get("key_papers", []), learn=({"title": LEARN[p["id"]][0], "url": LEARN[p["id"]][1]} if p["id"] in LEARN else None), )) for p in CLASSICAL_PARADIGMS: paradigms.append(dict( id=p["id"], name=p["name"], short=p["short"], family=p["family"], tagline=p["tagline"], simple=p["simple"], anim=p["anim"], mapping=p["mapping"], math=p["math"], when=p["when"], pros=p["pros"], cons=p["cons"], papers=p["papers"], learn=({"title": LEARN[p["id"]][0], "url": LEARN[p["id"]][1]} if p["id"] in LEARN else None), )) for p in EXTRA_PARADIGMS: paradigms.append(dict( id=p["id"], name=p["name"], short=p["short"], family=p["family"], anim=p["anim"], tagline=p["tagline"], simple=p["simple"], mapping=p["mapping"], math=p["math"], when=p["when"], pros=p["pros"], cons=p["cons"], papers=p["papers"], learn={"title": p["learn"][0], "url": p["learn"][1]}, )) # only keep edges whose endpoints exist ids = {p["id"] for p in paradigms} edges = [[a, b, w, k] for (a, b, w, k) in (EDGES + EXTRA_EDGES) if a in ids and b in ids] nodeset = ids | {f["key"] for f in families} mypapers = [m for m in my_papers.MY_PAPERS if m.get("node") in nodeset] return dict(families=families, paradigms=paradigms, edges=edges, mypapers=mypapers) TEMPLATE = r"""