"""Generate a contribution-focused evolution atlas.

The page answers one question per milestone: compared with the prior
reference point, what was the smallest conceptual/algorithmic contribution
that mattered? It avoids ranking by benchmark numbers and instead visualizes
mechanism deltas.
"""

from __future__ import annotations

import json

# One row per milestone. Field order must match the unpacking in build_data():
#   (track, year, name, prev, delta, meaning, anim)
# - track:   group label; the rows below use "robot", "vlm", and "world".
# - year:    integer year attached to the milestone.
# - name:    milestone name.
# - prev:    the prior reference point the milestone is compared against.
# - delta:   the contribution relative to `prev`.
# - meaning: short statement of what that delta changed.
# - anim:    short identifier string; presumably selects the visualization
#            used for the entry -- TODO confirm against the page template.
MILESTONES = [
    # Robot learning
    ("robot", 1989, "Q-Learning", "Dynamic programming", "learn Q from samples instead of a known model", "Bellman backup becomes data-driven control", "value"),
    ("robot", 1992, "REINFORCE", "Value iteration", "optimize policy probabilities directly", "gradient pushes high-return actions up", "policy"),
    ("robot", 2016, "GAIL", "MaxEnt IRL", "replace explicit reward recovery with adversarial occupancy matching", "discriminator becomes imitation reward", "adversarial"),
    ("robot", 2018, "SAC", "DDPG / TD3", "add maximum-entropy objective to off-policy actor-critic", "exploration is part of the objective", "actorcritic"),
    ("robot", 2020, "CQL", "Naive offline actor-critic", "penalize unsupported high Q values", "offline RL becomes pessimistic", "offline"),
    ("robot", 2021, "Decision Transformer", "Offline RL critics", "cast return-conditioned control as sequence modeling", "RL log becomes a language-model dataset", "tokens"),
    ("robot", 2023, "Diffusion Policy", "MSE behavioral cloning", "model multimodal action distributions by denoising action chunks", "avoid averaging left/right demonstrations", "denoise"),
    ("robot", 2024, "OpenVLA", "RT-2-style closed VLAs", "open VLM-to-action recipe with tokenized actions", "VLA becomes reproducible", "vla"),
    ("robot", 2024, "pi0", "Tokenized action VLAs", "use flow matching for continuous high-precision action chunks", "VLA action head becomes generative continuous control", "flow"),
    ("robot", 2025, "GR00T / Helix / Gemini Robotics", "single-arm VLA demos", "scale VLA recipes toward humanoid, dual-system, and broader embodied deployment", "foundation policies become systems", "stack"),
    # VLM
    ("vlm", 2021, "CLIP", "supervised vision labels", "align images and text contrastively at web scale", "image classifiers become text-queryable encoders", "contrast"),
    ("vlm", 2022, "Flamingo", "CLIP retrieval/captioning", "insert gated cross-attention into frozen LLMs", "LLMs can read interleaved images", "cross"),
    ("vlm", 2023, "BLIP-2", "full VLM finetuning", "use Q-Former as a small trainable visual bottleneck", "frozen vision + frozen LLM become cheap to connect", "qformer"),
    ("vlm", 2023, "LLaVA", "heavy bridge modules", "simple MLP projector plus visual instruction tuning", "open VLM assistant recipe becomes simple", "projector"),
    ("vlm", 2023, "Grounding DINO", "fixed-label detection", "ground arbitrary text phrases to boxes", "VLM perception becomes open-vocabulary and spatial", "ground"),
    ("vlm", 2024, "LLaVA-NeXT / Qwen2-VL", "fixed low-res image tokens", "process high-resolution images by tiling or dynamic resolution", "small text and dense charts become visible", "anyres"),
    ("vlm", 2024, "Chameleon / GPT-4o style native multimodal", "late-fusion VLMs", "train a model on mixed-modal token streams from the start", "multimodal shifts from adapter to native model", "native"),
    ("vlm", 2025, "Efficient / MoE VLMs", "single dense giant VLMs", "route visual-language computation selectively", "quality-cost tradeoff becomes architectural", "moe"),
    # World models
    ("world", 2018, "World Models", "model-free policy search", "train controller inside learned VAE+RNN dreams", "world model becomes a training environment", "world"),
    ("world", 2019, "PlaNet", "modular VAE/RNN dreams", "RSSM latent dynamics plus CEM planning", "latent planning from pixels becomes practical", "latent"),
    ("world", 2020, "Dreamer", "PlaNet online CEM", "amortize behavior learning with actor-critic in imagination", "dreams train policies, not just evaluate plans", "dream"),
    ("world", 2020, "MuZero", "pixel/world reconstruction", "learn value-equivalent latent dynamics for search", "model only needs to predict what planning uses", "mcts"),
    ("world", 2022, "IRIS", "continuous latent RSSM", "tokenize frames and train a transformer next-token world model", "world modeling adopts the language-model recipe", "tokens"),
    ("world", 2023, "GAIA-1", "game/Atari token WMs", "scale autoregressive world models to driving video with action/text conditioning", "driving simulation becomes generative", "video"),
    ("world", 2023, "I-JEPA", "pixel reconstruction", "predict abstract latent features instead of pixels", "representation prediction competes with generation", "jepa"),
    ("world", 2025, "V-JEPA 2-AC", "passive video JEPA", "add a thin action-conditioned predictor for robot planning", "passive video features become controllable for robotics", "jepa_action"),
    ("world", 2025, "World Action Models", "action-conditioned simulators", "generate successful future plus actions", "world model also becomes policy proposal", "wam"),
    ("world", 2025, "Cosmos / Genie / interactive worlds", "offline video prediction", "make video worlds controllable, interactive, and useful for data generation", "world models become infrastructure", "interactive"),
]

# Reference links as (title, url) pairs; build_data() exposes them under the
# "sources" key. Only a subset of the milestones above has an entry here.
SOURCES = [
    ("CLIP", "https://arxiv.org/abs/2103.00020"),
    ("Flamingo", "https://arxiv.org/abs/2204.14198"),
    ("BLIP-2", "https://arxiv.org/abs/2301.12597"),
    ("LLaVA", "https://arxiv.org/abs/2304.08485"),
    ("Grounding DINO", "https://arxiv.org/abs/2303.05499"),
    ("Diffusion Policy", "https://arxiv.org/abs/2303.04137"),
    ("Decision Transformer", "https://arxiv.org/abs/2106.01345"),
    ("Dreamer", "https://arxiv.org/abs/1912.01603"),
    ("MuZero", "https://arxiv.org/abs/1911.08265"),
    ("V-JEPA 2", "https://ai.meta.com/vjepa/"),
    ("World Models", "https://worldmodels.github.io/"),
]


def build_data() -> dict[str, list[dict[str, str | int]]]:
    """Convert the module-level tables into plain dicts and lists.

    Returns:
        A dict with two keys:

        * ``"milestones"`` -- one dict per ``MILESTONES`` row, in table
          order, with keys ``track``, ``year``, ``name``, ``prev``,
          ``delta``, ``meaning`` and ``anim``.
        * ``"sources"`` -- one dict per ``SOURCES`` row, in table order,
          with keys ``title`` and ``url``.

    Raises:
        ValueError: If a ``MILESTONES`` row does not have exactly seven
            fields or a ``SOURCES`` row does not have exactly two, because
            the tuple unpacking below fails.
    """
    # NOTE(review): the result holds only str/int values, so it looks intended
    # for JSON serialization into the page -- confirm where it is consumed.
    return {
        "milestones": [
            {
                "track": tr,
                "year": year,
                "name": name,
                "prev": prev,
                "delta": delta,
                "meaning": meaning,
                "anim": anim,
            }
            # Positional unpacking: the order here is the row schema.
            for tr, year, name, prev, delta, meaning, anim in MILESTONES
        ],
        "sources": [{"title": t, "url": u} for t, u in SOURCES],
    }


TEMPLATE = r"""