"""Generate a Vision-Language-Model (VLM) landscape page.

Reuses the robot-landscape engine (gen_landscape.TEMPLATE: constellation map,
Magic-Move detail panels, typed edges, MathJax) but injects a VLM-specific set
of animations and data. Self-contained HTML.

Run:  .venv_robot_paradigms/bin/python gen_vlm.py   -> robot_vlm.html
"""
import json

import gen_landscape
import my_papers

# ---------------------------------------------------------------------------
# Families (15)
# ---------------------------------------------------------------------------
# Each entry is (key, display label, hex colour). The key is what every
# paradigm's ``family`` field refers to.
FAMILIES = [
    ("Contrastive", "Contrastive Alignment", "#2563eb"),
    ("Generative", "Generative Pretraining", "#0891b2"),
    ("Bridge", "Vision→LLM Bridges", "#7c3aed"),
    ("Native", "Native Multimodal", "#ea580c"),
    ("Grounding", "Grounding / Regions", "#16a34a"),
    ("AnyRes", "High-Resolution", "#ca8a04"),
    ("Video", "Video VLMs", "#db2777"),
    ("Document", "Document / OCR", "#92400e"),
    ("Backbone", "Vision Backbones", "#64748b"),
    ("Posttrain", "Post-Training", "#e11d48"),
    ("Unified", "Any-to-Any", "#4f46e5"),
    ("Masked", "Masked Pretraining", "#38bdf8"),
    ("Efficient", "Efficient / Small", "#34d399"),
    ("MoE", "Mixture-of-Experts", "#fbbf24"),
    ("Agentic", "Agentic / RAG", "#fb7185"),
]

# Paradigm table: one dict per node of the landscape. Every entry carries the
# same fields:
#   id       unique node id (referenced by EDGES)
#   name     full display name;  short = compact label
#   family   one of the keys in FAMILIES
#   anim     animation name (presumably a method of VLM_ANIM_JS — confirm there)
#   tagline  one-line summary;   simple = plain-language explanation
#   mapping  input → output sketch
#   math     LaTeX source (raw string, no surrounding delimiters)
#   when     typical use case
#   pros / cons / papers   lists of short strings
#   learn    (title, url) of one recommended introduction
P = [
 dict(id="clip", name="CLIP (Contrastive Image-Text)", short="CLIP", family="Contrastive", anim="contrastive",
   tagline="Pull matching image–text pairs together, push the rest apart.",
   simple=("Show the model millions of image+caption pairs. Train it so each image's vector lands "
           "next to its true caption's vector and far from all the wrong captions. The result: you can "
           "compare any image to any sentence — the basis of almost every modern VLM."),
   mapping="image, text → shared embedding space",
   math=r"\mathcal{L}=-\tfrac12\sum_i\Big[\log\tfrac{e^{\langle I_i,T_i\rangle/\tau}}{\sum_j e^{\langle I_i,T_j\rangle/\tau}}+\log\tfrac{e^{\langle I_i,T_i\rangle/\tau}}{\sum_j e^{\langle I_j,T_i\rangle/\tau}}\Big]",
   when="Zero-shot classification/retrieval, and as the vision encoder feeding most VLMs.",
   pros=["Zero-shot transfer", "Web-scale, no labels needed", "Encoder reused everywhere"],
   cons=["Weak at fine spatial detail / counting", "Bag-of-words text understanding", "Needs huge batches"],
   papers=["CLIP (Radford 2021)", "ALIGN (Jia 2021)", "OpenCLIP"],
   learn=("What is CLIP? — Roboflow (illustrated)", "https://blog.roboflow.com/openai-clip/")),
 dict(id="siglip", name="SigLIP (Sigmoid Loss)", short="SigLIP", family="Contrastive", anim="contrastive",
   tagline="Same idea as CLIP, but a simpler pairwise sigmoid loss.",
   simple=("CLIP needs a giant batch so every image can be compared against many captions at once. "
           "SigLIP swaps the softmax for an independent yes/no (sigmoid) on each image–text pair, so it "
           "trains well even with smaller batches — and tends to give a stronger encoder."),
   mapping="image, text → matched? (per-pair sigmoid)",
   # Negative log-likelihood: the leading minus is what makes this a loss to
   # minimise (log-sigmoid is <= 0 and should be pushed towards 0).
   math=r"\mathcal{L}=-\sum_{i,j}\log\sigma\!\big(z_{ij}\,(t\,\langle I_i,T_j\rangle+b)\big),\;\;z_{ij}\in\{+1,-1\}",
   when="A drop-in, batch-friendly upgrade to CLIP; encoder for newer VLMs (e.g. Qwen-VL, PaliGemma).",
   pros=["Works with smaller batches", "Often stronger than CLIP", "Simple loss"],
   cons=["Still global (not region-level)", "Needs careful bias init"],
   papers=["SigLIP (Zhai 2023)", "SigLIP 2 (2025)"],
   learn=("Google's SigLIP explained — Analytics Vidhya", "https://www.analyticsvidhya.com/blog/2024/10/googles-siglip/")),
 dict(id="coca", name="CoCa (Contrastive Captioner)", short="CoCa", family="Generative", anim="caption",
   tagline="One model trained on BOTH a contrastive loss and a captioning loss.",
   simple=("Why choose between matching (CLIP) and describing (captioning)? CoCa does both at once: the "
           "first half of the model learns CLIP-style alignment, the second half learns to write the "
           "caption. You get a strong encoder AND a generator from one training run."),
   mapping="image → embedding + generated caption",
   math=r"\mathcal{L}=\lambda_{\text{con}}\,\mathcal{L}_{\text{contrastive}}+\lambda_{\text{cap}}\,\mathcal{L}_{\text{caption}}",
   when="When you want one backbone good at both retrieval and captioning/VQA.",
   pros=["Encoder + decoder in one", "Strong transfer", "Single pretraining"],
   cons=["Heavier to train", "Closed weights (original)"],
   papers=["CoCa (Yu 2022)"],
   learn=("CoCa visually explained — DataCamp", "https://www.datacamp.com/tutorial/coca-contrastive-captioners-are-image-text-foundation-models-visually-explained")),
 dict(id="blip", name="BLIP (Bootstrapped Captions)", short="BLIP", family="Generative", anim="caption",
   tagline="Generate captions, filter the noisy ones, retrain on the clean set.",
   simple=("Web captions are noisy. BLIP trains a captioner to write new captions for images, a filter to "
           "throw out bad ones, and then re-trains on this cleaned-up data (CapFilt). It can both "
           "understand (match) and describe (generate)."),
   mapping="image → caption (bootstrapped, CapFilt)",
   math=r"\mathcal{L}=\mathcal{L}_{\text{ITC}}+\mathcal{L}_{\text{ITM}}+\mathcal{L}_{\text{LM}}\;\;(\text{caption}+\text{filter})",
   when="Caption/VQA when your web data is noisy; predecessor of BLIP-2.",
   pros=["Cleans noisy web data", "Understand + generate", "Open"],
   cons=["Multi-stage pipeline", "Smaller than LLM-bridged VLMs"],
   papers=["BLIP (Li 2022)"],
   learn=("BLIP walkthrough — Yannic Kilcher (video)", "https://www.youtube.com/watch?v=X2k7n4FuI7c")),
 dict(id="git", name="GIT / SimVLM (Generative)", short="GIT / SimVLM", family="Generative", anim="caption",
   tagline="Just predict the caption tokens, end to end.",
   simple=("The simplest recipe: feed the image into an encoder and train one decoder to emit the caption "
           "word by word — like a language model that can see. No contrastive loss, no fancy pipeline."),
   mapping="image → text (autoregressive)",
   math=r"\mathcal{L}=-\sum_t \log p_\theta(w_t\mid w_{<t},\,\mathrm{Enc}(\text{image}))",
   when="Strong, simple captioning/VQA baselines.",
   pros=["Very simple objective", "Scales with data", "Strong captioning"],
   cons=["No explicit alignment", "Weaker zero-shot retrieval"],
   papers=["GIT (Wang 2022)", "SimVLM (Wang 2021)"],
   learn=("GIT model — Hugging Face docs (figures + demo)", "https://huggingface.co/docs/transformers/model_doc/git")),
 dict(id="flamingo", name="Flamingo (Gated Cross-Attention)", short="Flamingo", family="Bridge", anim="crossattn",
   tagline="Keep the LLM frozen; let text tokens cross-attend to image features.",
   simple=("Take a pretrained, frozen language model and insert new 'gated cross-attention' layers so the "
           "text can look at the image's features. Train only those new layers. This unlocked few-shot, "
           "interleaved image-and-text prompting."),
   mapping="image + text → text (few-shot, interleaved)",
   math=r"h\leftarrow h+\tanh(\alpha)\cdot\mathrm{CrossAttn}\big(h,\;\mathrm{Enc}(\text{image})\big)",
   when="Few-shot multimodal prompting; interleaved image-text documents.",
   pros=["Few-shot in-context learning", "Frozen LLM reused", "Handles many images"],
   cons=["Adds heavy cross-attn layers", "Original closed (see IDEFICS)"],
   papers=["Flamingo (Alayrac 2022)", "IDEFICS (HF 2023)"],
   learn=("Flamingo walkthrough — Yannic Kilcher (video)", "https://www.youtube.com/watch?v=wkeA2oND-pg")),
 dict(id="blip2", name="BLIP-2 (Q-Former)", short="BLIP-2 (Q-Former)", family="Bridge", anim="qformer",
   tagline="A tiny 'Q-Former' squeezes the image into ~32 tokens for a frozen LLM.",
   simple=("Both the vision encoder and the LLM stay frozen. A small module of ~32 learned 'query' tokens "
           "attends to the image and distills it into just 32 tokens the LLM can read — cheap to train, "
           "since only the little Q-Former learns."),
   mapping="image → 32 query tokens → frozen LLM",
   math=r"Q'=\mathrm{QFormer}(Q,\;\mathrm{Enc}(I)),\quad \mathrm{LLM}\big([\,W_pQ';\,\text{text}\,]\big)",
   when="Cheaply connect an existing encoder + existing LLM.",
   pros=["Very parameter-efficient", "Both backbones frozen", "Few image tokens"],
   cons=["Q-Former is fiddly to train", "Compresses away fine detail"],
   papers=["BLIP-2 (Li 2023)", "InstructBLIP (2023)"],
   learn=("Zero-shot with BLIP-2 — Hugging Face blog", "https://huggingface.co/blog/blip-2")),
 dict(id="llava", name="LLaVA (MLP Projector)", short="LLaVA", family="Bridge", anim="projector",
   tagline="Just a 2-layer MLP maps CLIP features into the LLM's token space.",
   simple=("The surprisingly simple recipe that took over: run the image through CLIP, push the features "
           "through a small MLP so they look like word-embeddings, prepend them to the prompt, and "
           "fine-tune on image+instruction data. Cheap, open, and very strong."),
   mapping="image → projected tokens → LLM",
   math=r"H_v=W_2\,\phi(W_1\,\mathrm{CLIP}(I)),\quad \mathrm{LLM}\big([\,H_v;\,\text{text}\,]\big)",
   when="The default open recipe for building a capable VLM assistant.",
   pros=["Dead simple + open", "Cheap to train", "Strong instruction following"],
   cons=["Fixed low resolution (base)", "Projector throws away some detail"],
   papers=["LLaVA (Liu 2023)", "LLaVA-1.5 (2023)"],
   learn=("LLaVA — official project page (demo + figures)", "https://llava-vl.github.io/")),
 dict(id="fuyu", name="Fuyu (Patch-as-Token)", short="Fuyu", family="Native", anim="patchify",
   tagline="No separate vision encoder — feed raw image patches straight into the decoder.",
   simple=("Skip the whole vision-encoder step. Cut the image into patches, run each through one linear "
           "layer, and feed those directly into a decoder-only transformer alongside text. Simple, and "
           "naturally handles any image size."),
   mapping="image patches → decoder-only LLM",
   math=r"\text{tokens}=[\,\mathrm{Linear}(\text{patch}_1),\dots,\text{text tokens}\,]\to\text{decoder LLM}",
   when="When you want one clean architecture and arbitrary resolutions.",
   pros=["No separate encoder", "Any resolution", "Simple"],
   cons=["Needs lots of data to learn vision", "Less off-the-shelf reuse"],
   papers=["Fuyu-8B (Adept 2023)"],
   learn=("Fuyu-8B — Adept blog (patches-in design)", "https://www.adept.ai/blog/fuyu-8b/")),
 dict(id="chameleon", name="Chameleon (Early-Fusion)", short="Chameleon", family="Native", anim="earlyfusion",
   tagline="One transformer, one vocabulary of mixed image+text tokens.",
   simple=("Turn images into discrete tokens and mix them into the SAME token stream as text, with one "
           "shared vocabulary. A single transformer is trained from scratch on this interleaved soup — so "
           "it can read and write images and text together."),
   mapping="interleaved image+text tokens → one model",
   math=r"p_\theta(x_1,\dots,x_n),\quad x_k\in\mathcal{V}_{\text{text}}\cup\mathcal{V}_{\text{image}}",
   when="Truly mixed-modal in/out; foundation for any-to-any models.",
   pros=["Unified in/out", "No grafting modules", "Mixed-modal reasoning"],
   cons=["Expensive from-scratch training", "Image tokenizer limits fidelity"],
   papers=["Chameleon (Meta 2024)"],
   learn=("Meta's Chameleon — VentureBeat (early vs late fusion)", "https://venturebeat.com/ai/meta-introduces-chameleon-a-state-of-the-art-multimodal-model")),
 dict(id="frontier", name="GPT-4o / Gemini (Frontier)", short="GPT-4o / Gemini", family="Native", anim="earlyfusion",
   tagline="Single natively-multimodal models over image, text, audio, video.",
   simple=("The frontier proprietary models are trained from the start on many modalities at once, so a "
           "single network sees images (and audio/video) and talks about them fluently, in real time."),
   mapping="image / audio / video / text → text",
   math=r"p_\theta(\text{tokens}\mid \text{image, audio, video, text})",
   when="State-of-the-art general multimodal assistants.",
   pros=["Best general quality", "Real-time, many modalities", "Strong reasoning"],
   cons=["Closed weights", "Opaque training", "API cost"],
   papers=["GPT-4o (OpenAI 2024)", "Gemini (Google 2023–25)"],
   learn=("GPT-4o & our multimodal future — AI Coffee Break (video)", "https://www.youtube.com/watch?v=T6DGGHlkYa0")),
 dict(id="glip", name="GLIP / Grounding DINO (Open-Vocab Detection)", short="Grounding DINO", family="Grounding", anim="grounding",
   tagline="Detect ANY object you name in text — no fixed label set.",
   simple=("Classic detectors only know a fixed list of classes. Phrase-grounded detectors align image "
           "regions to words, so you can ask for 'the red mug' or 'a person wearing a hat' and get boxes — "
           "open-vocabulary detection driven by language."),
   mapping="image + text prompt → bounding boxes",
   math=r"S=\mathrm{Enc}_{\text{img}}(I)\,\mathrm{Enc}_{\text{txt}}(\text{prompt})^\top\;\Rightarrow\;\text{boxes}+\text{alignment}",
   when="Open-vocabulary detection; auto-labeling; grounding for agents.",
   pros=["Detect arbitrary phrases", "No retraining per class", "Great for labeling"],
   cons=["Prompt-sensitive", "Detection, not full chat"],
   papers=["GLIP (Li 2022)", "Grounding DINO (Liu 2023)"],
   learn=("Grounding DINO — Roboflow (visual guide)", "https://blog.roboflow.com/grounding-dino-zero-shot-object-detection/")),
 dict(id="kosmos2", name="Kosmos-2 / Ferret (Grounded VLM)", short="Kosmos-2", family="Grounding", anim="grounding",
   tagline="A chat VLM that can point — its words carry image coordinates.",
   simple=("A normal VLM describes an image; a grounded one can also say WHERE. Kosmos-2/Ferret emit "
           "special location tokens, so an answer like 'the cat [box] is on the mat [box]' links phrases to "
           "actual regions you can draw."),
   mapping="image → text interleaved with region boxes",
   math=r"p_\theta\big(\text{text}+\langle\text{box}\rangle\mid I\big)\;\;(\text{grounded tokens})",
   when="Referring expressions, pointing, region-level QA for agents.",
   pros=["Links words to pixels", "Better spatial grounding", "Agent-friendly"],
   cons=["Needs grounded training data", "Coordinate tokens add complexity"],
   papers=["Kosmos-2 (Peng 2023)", "Ferret (Apple 2023)"],
   learn=("Kosmos-2 explained — Labellerr (illustrated)", "https://www.labellerr.com/blog/kosmos-2-explained/")),
 dict(id="llavanext", name="LLaVA-NeXT (AnyRes Tiling)", short="LLaVA-NeXT", family="AnyRes", anim="anyres",
   tagline="Slice a high-res image into tiles + a thumbnail, encode each.",
   simple=("Small fixed resolution blurs text and tiny objects. AnyRes cuts the picture into several tiles "
           "(plus a low-res overview), encodes them all, and feeds the lot to the LLM — so it can finally "
           "read small text and see fine detail."),
   mapping="hi-res image → tiles + thumbnail → LLM",
   math=r"\{\text{tiles}\}\cup\{\text{thumbnail}\}\xrightarrow{\mathrm{Enc}}[\,H_v;\,\text{text}\,]",
   when="Documents, charts, dense scenes — anything needing detail.",
   pros=["Reads small text", "Reuses a low-res encoder", "Big quality jump"],
   cons=["Many more image tokens", "Slower / costlier"],
   papers=["LLaVA-NeXT (2024)"],
   learn=("LLaVA-NeXT — official blog (AnyRes figure)", "https://llava-vl.github.io/blog/2024-01-30-llava-next/")),
 dict(id="qwenvl", name="Qwen-VL (Dynamic Resolution)", short="Qwen-VL", family="Bridge", anim="anyres",
   tagline="Encode images at their native resolution — token count scales with size.",
   simple=("Instead of squashing every image to one size, process it at (near) its real resolution, so a "
           "big detailed image simply becomes more tokens. Pairs a native dynamic-resolution ViT with an LLM."),
   mapping="native-res image → variable #tokens → LLM",
   math=r"\text{tokens}=\mathrm{Enc}(I_{\text{native}}),\quad |\text{tokens}|\propto \text{resolution}",
   when="Strong open general-purpose VLM; OCR, charts, grounding, video.",
   pros=["Native resolution", "Strong open model", "Broad skills"],
   cons=["Variable cost per image", "Big at high res"],
   papers=["Qwen-VL (2023)", "Qwen2-VL (2024)"],
   learn=("Qwen2-VL — official blog (dynamic resolution)", "https://qwenlm.github.io/blog/qwen2-vl/")),
 dict(id="videollava", name="Video-LLaVA / VideoChat", short="Video-LLaVA", family="Video", anim="videoframes",
   tagline="Sample a few frames, turn them into tokens, let the LLM reason over time.",
   simple=("A video is too many frames to feed in full. Sample a handful, encode each into tokens, and the "
           "LLM reasons over the sequence — so it can answer 'what happened?' about a clip."),
   mapping="video frames → tokens → LLM",
   math=r"H=\mathrm{Enc}(\{f_1,\dots,f_T\}),\quad \mathrm{LLM}\big([\,H;\,\text{text}\,]\big)",
   when="Video QA, captioning, temporal reasoning.",
   pros=["Reuses image-VLM recipe", "Temporal reasoning", "Open"],
   cons=["Sparse frames miss fast events", "Long videos = many tokens"],
   papers=["Video-LLaVA (2023)", "VideoChat (2023)"],
   learn=("Video-LLaVA — official repo (demos + figures)", "https://github.com/PKU-YuanGroup/Video-LLaVA")),
 dict(id="donut", name="Donut / Pix2Struct (OCR-free Docs)", short="Donut", family="Document", anim="ocr",
   tagline="Read a document image straight to structured text — no separate OCR.",
   simple=("Older pipelines run OCR, then parse the text. Donut skips OCR entirely: it reads the document "
           "image and directly generates the answer or a structured JSON (totals, dates, fields) — "
           "end to end."),
   mapping="document image → JSON / answer",
   math=r"p_\theta\big(\text{JSON tokens}\mid \mathrm{Enc}(\text{document image})\big)\;\;(\text{OCR-free})",
   when="Receipts, forms, invoices, charts → structured output.",
   pros=["No OCR errors to inherit", "End-to-end", "Outputs structure"],
   cons=["Needs document-specific finetuning", "Struggles with very long pages"],
   papers=["Donut (Kim 2022)", "Pix2Struct (2023)"],
   learn=("Fine-tuning Donut — Phil Schmid (clear intro)", "https://www.philschmid.de/fine-tuning-donut")),
 dict(id="vit", name="Vision Transformer (ViT)", short="ViT", family="Backbone", anim="patchify",
   tagline="Treat an image as a sequence of patches — a transformer for pixels.",
   simple=("The backbone under almost every VLM. Cut the image into a grid of 16×16 patches, turn each "
           "into a token, add position info, and run a plain transformer. No convolutions needed at scale."),
   mapping="image patches → feature tokens",
   math=r"z_0=[\,x_{\text{cls}};\,Ex_1;\dots;Ex_N\,]+E_{\text{pos}},\quad z=\mathrm{Transformer}(z_0)",
   when="The image encoder that CLIP/SigLIP/DINO and VLMs are built on.",
   pros=["Scales with data", "Unifies vision with NLP tooling", "Flexible"],
   cons=["Data-hungry without SSL", "Quadratic in #patches"],
   papers=["ViT (Dosovitskiy 2020)"],
   learn=("How the Vision Transformer works — AI Summer", "https://theaisummer.com/vision-transformer/")),
 dict(id="ssl", name="DINOv2 / MAE (Self-Supervised)", short="DINOv2 / MAE", family="Backbone", anim="ssl",
   tagline="Learn powerful vision features from images alone — no labels.",
   simple=("Labels are expensive. Self-supervised methods invent their own task: mask out patches and "
           "reconstruct them (MAE), or make two crops of an image agree (DINO). The encoder learns rich "
           "features that transfer everywhere."),
   mapping="unlabeled images → strong encoder",
   math=r"\min_\theta\;\|x_{\text{masked}}-\hat x_\theta\|^2\;\;\text{or}\;\;\text{align two augmented views}",
   when="Pretrain an encoder when labels are scarce; dense features.",
   pros=["No labels", "Great dense/spatial features", "Transfers broadly"],
   cons=["Heavy pretraining", "Not language-aligned by itself"],
   papers=["MAE (He 2021)", "DINO/DINOv2 (Caron 2021 / 2023)"],
   learn=("Masked Autoencoders animated — AI Coffee Break (video)", "https://www.youtube.com/watch?v=Dp6iICL2dVI")),
 dict(id="instructtune", name="Visual Instruction Tuning", short="Visual Instruct Tuning", family="Posttrain", anim="instructtune",
   tagline="Fine-tune on image+question→answer so the VLM becomes a chat assistant.",
   simple=("A pretrained VLM can match or caption, but it isn't a helpful assistant yet. Fine-tune it on "
           "lots of (image, instruction, answer) examples — often generated by GPT-4 — and it learns to "
           "follow visual instructions and chat about images."),
   mapping="image + instruction → answer (supervised)",
   math=r"\min_\theta-\!\sum \log p_\theta\big(\text{answer}\mid \text{image},\,\text{instruction}\big)",
   when="The step that turns a raw VLM into LLaVA-style assistants.",
   pros=["Big usefulness jump", "Cheap vs pretraining", "Data can be auto-generated"],
   cons=["Quality bounded by the data", "Can amplify hallucination"],
   papers=["LLaVA visual instruction tuning (2023)"],
   learn=("Visual Instruction Tuning with LLaVA (video)", "https://www.youtube.com/watch?v=ZHhuPYgbVzw")),
 dict(id="mmrlhf", name="Multimodal RLHF / DPO", short="Multimodal RLHF", family="Posttrain", anim="rlhf",
   tagline="Use human/AI preferences to make answers truthful, not hallucinated.",
   simple=("VLMs love to make things up ('a dog' when it's a cat). Collect preferences — which of two "
           "answers is more faithful to the image — and nudge the model toward the truthful one with RLHF "
           "or DPO. Hallucinations drop."),
   mapping="answers + preference → less-hallucinating VLM",
   math=r"\max_\theta\;\mathbb{E}[\,r(\text{answer})\,]\quad\text{or}\quad \mathrm{DPO}:\;\text{preferred}\succ\text{rejected}",
   when="Reduce hallucination; align tone/safety after instruction tuning.",
   pros=["Less hallucination", "Better faithfulness", "Aligns behavior"],
   cons=["Needs preference data", "Reward hacking risk"],
   papers=["RLHF-V (2023)", "LLaVA-RLHF (2023)", "multimodal DPO"],
   learn=("RLHF-V — official page (hallucination figures)", "https://rlhf-v.github.io/")),
 dict(id="unified", name="Unified-IO / Emu / Janus (Any-to-Any)", short="Unified / Janus", family="Unified", anim="anytoany",
   tagline="One model that both understands images AND generates them.",
   simple=("Most VLMs only read images. Any-to-any models also draw them: a single model takes text or "
           "images in, and produces text or images out — understanding and generation unified in one "
           "network."),
   mapping="image ↔ text (understand and generate)",
   math=r"p_\theta(y\mid x),\quad x,y\in\{\text{text},\,\text{image}\}",
   when="When you want one model for VQA, captioning AND image generation.",
   pros=["One model, both directions", "Shared multimodal reasoning", "Flexible I/O"],
   cons=["Generation quality trails specialists", "Hard to train/balance"],
   papers=["Unified-IO (2022/23)", "Emu (2023/24)", "Janus / Janus-Pro (2025)"],
   learn=("DeepSeek Janus-Pro explained — DataCamp", "https://www.datacamp.com/blog/janus-pro")),
 dict(id="flava", name="FLAVA (Masked Multimodal)", short="FLAVA", family="Masked", anim="ssl",
   tagline="The third classic objective: mask parts of image AND text, reconstruct them.",
   simple=("Besides contrastive (CLIP) and captioning, there's a third foundational recipe: hide patches "
           "of the image and words of the text, and train the model to fill them back in. FLAVA combines "
           "masked-image, masked-language, masked-multimodal AND contrastive objectives in one model."),
   mapping="masked image + text → reconstruct",
   math=r"\mathcal{L}=\mathcal{L}_{\text{MIM}}+\mathcal{L}_{\text{MLM}}+\mathcal{L}_{\text{MMM}}+\mathcal{L}_{\text{contrastive}}",
   when="One foundation model strong at vision, language, AND multimodal tasks.",
   pros=["Unifies the masked objectives", "One model, many task types", "Strong representations"],
   cons=["Complex multi-objective training", "Heavier than CLIP"],
   papers=["FLAVA (Singh 2022)"],
   learn=("FLAVA — Hugging Face docs", "https://huggingface.co/docs/transformers/model_doc/flava")),
 dict(id="beit3", name="BEiT-3 (Image as a Language)", short="BEiT-3", family="Masked", anim="ssl",
   tagline="Treat an image as 'Imglish' and mask-model images, text, and pairs alike.",
   simple=("BEiT-3 views an image as just another language ('Imglish') and trains one Multiway transformer "
           "with a single masked-modeling objective over images, text, and image-text pairs — topping both "
           "vision and vision-language benchmarks with one backbone."),
   mapping="masked images-as-language + text",
   math=r"\min_\theta\;\mathbb{E}\big[-\log p_\theta(x_{\text{masked}}\mid x_{\text{visible}})\big]\;\;(\text{image as ‘Imglish’})",
   when="A single backbone for detection, segmentation, VQA, retrieval, captioning.",
   pros=["SOTA across many tasks", "One unified masked objective", "Multiway experts"],
   cons=["Large", "Complex architecture"],
   papers=["BEiT-3 (Wang 2022)"],
   learn=("BEiT-3 — Microsoft unilm (repo)", "https://github.com/microsoft/unilm/tree/master/beit3")),
 dict(id="small-vlm", name="Small / Efficient VLMs", short="Small VLMs", family="Efficient", anim="efficient",
   tagline="Phone-sized VLMs via a small LLM, distillation, and token compression.",
   simple=("Frontier VLMs are huge. Efficient VLMs (MobileVLM, TinyLLaVA, MiniCPM-V) use a small LLM, "
           "distill from a big teacher, and compress vision tokens — so a capable VLM runs on a laptop or "
           "even a phone."),
   mapping="big VLM → small, on-device VLM",
   math=r"\min_\theta\;\mathrm{KL}\big(p_{\text{teacher}}\,\|\,p_\theta\big)+\mathcal{L}_{\text{task}}\;\;(\text{distill + compress})",
   when="On-device, low-latency, or cost-sensitive deployment.",
   pros=["Runs on phones / laptops", "Cheap inference", "Surprisingly capable"],
   cons=["Quality gap vs frontier", "Weaker long context", "Compression artifacts"],
   papers=["MobileVLM (2023)", "TinyLLaVA (2024)", "MiniCPM-V (2024)"],
   learn=("MiniCPM-V — project repo", "https://github.com/OpenBMB/MiniCPM-V")),
 dict(id="moe-vlm", name="Mixture-of-Experts VLMs", short="MoE VLMs", family="MoE", anim="moe",
   tagline="Many expert sub-nets; a router fires only a few per token.",
   simple=("Scale capacity without scaling compute: keep many 'expert' subnetworks and a router that sends "
           "each token to just a couple of them. The model has huge total capacity but stays cheap to run — "
           "MoE-LLaVA."),
   mapping="token → router → top-k experts",
   math=r"y=\sum_{i\in\mathrm{TopK}} g_i(x)\,E_i(x),\quad g=\mathrm{softmax}(\mathrm{router}(x))",
   when="Scale capacity across many tasks/modalities under a compute budget.",
   pros=["Big capacity, low active compute", "Specialized experts", "Scales well"],
   cons=["Routing instability", "Memory holds all experts", "Load balancing"],
   papers=["MoE-LLaVA (2024)"],
   learn=("MoE-LLaVA — project repo", "https://github.com/PKU-YuanGroup/MoE-LLaVA")),
 dict(id="agentic-vlm", name="Agentic / Tool-Use VLM", short="Agentic VLM", family="Agentic", anim="agentic",
   tagline="A VLM that reasons, calls tools, and acts in a loop.",
   simple=("Instead of one-shot answers, the VLM plans: look, think, call a tool (search, code, zoom, "
           "click), observe the result, and repeat. That turns it into an agent that can operate GUIs or "
           "solve multi-step visual tasks (MM-ReAct, computer-use agents)."),
   mapping="see → think → call tool → observe → repeat",
   math=r"a_t=\mathrm{VLM}(o_{\le t},\,\text{tools}),\quad o_{t+1}=\mathrm{Env/Tool}(a_t)\;\;(\text{ReAct loop})",
   when="GUI automation, multi-step visual reasoning, computer-use / embodied agents.",
   pros=["Multi-step reasoning", "Extends abilities via tools", "Acts, not just answers"],
   cons=["Errors compound over steps", "Latency", "Needs guardrails"],
   papers=["MM-ReAct (2023)", "GUI / computer-use agents (2024–25)"],
   learn=("MM-ReAct — project repo", "https://github.com/microsoft/MM-REACT")),
 dict(id="mm-rag", name="Multimodal RAG", short="Multimodal RAG", family="Agentic", anim="rag",
   tagline="Retrieve relevant images/docs first, then answer grounded in them.",
   simple=("Don't rely only on what's baked into the weights. Retrieve relevant images, pages, or passages "
           "from a knowledge store and feed them to the VLM, so answers are grounded in real, up-to-date "
           "sources — with far fewer hallucinations."),
   mapping="query → retrieve images/docs → grounded answer",
   math=r"\hat y=\mathrm{VLM}\big(q,\;\mathrm{Retrieve}(q,\mathcal{D})\big)",
   when="Document QA, knowledge-grounded VQA, up-to-date or private corpora.",
   pros=["Grounded, current answers", "Less hallucination", "No retraining for new facts"],
   cons=["Bounded by retrieval quality", "More moving parts", "Latency"],
   papers=["Multimodal RAG (2023–25)"],
   learn=("Multimodal RAG — Hugging Face cookbook", "https://huggingface.co/learn/cookbook/multimodal_rag_using_document_retrieval_and_vlms")),
]

# Typed edges between paradigm ids: (a, b, why, kind).
#   kind "v" = variant
#   kind "b" = builds-on; the arrow a→b means "b builds on a"
EDGES = [
    ("clip", "siglip", "contrastive pretraining: softmax vs sigmoid", "v"),
    ("clip", "coca", "contrastive image-text alignment", "v"),
    ("blip", "coca", "caption + contrastive pretraining", "v"),
    ("blip", "git", "generative image-to-text", "v"),
    ("flamingo", "blip2", "two ways to bridge vision into an LLM", "v"),
    ("blip2", "llava", "two ways to bridge vision into an LLM", "v"),
    ("flamingo", "llava", "two ways to bridge vision into an LLM", "v"),
    ("fuyu", "chameleon", "early-fusion: image patches as tokens", "v"),
    ("chameleon", "frontier", "natively multimodal", "v"),
    ("glip", "kosmos2", "grounding language to image regions", "v"),
    ("llavanext", "qwenvl", "high-resolution / any-res input", "v"),
    ("vit", "ssl", "a ViT trained with self-supervision", "v"),
    ("donut", "llavanext", "high resolution helps read documents", "v"),
    ("vit", "clip", "CLIP's image encoder is a ViT", "b"),
    ("clip", "blip2", "a frozen CLIP-style encoder feeds the Q-Former", "b"),
    ("clip", "llava", "CLIP-ViT is LLaVA's vision encoder", "b"),
    ("clip", "flamingo", "both pretrain the vision encoder contrastively (CLIP-style)", "v"),
    ("clip", "glip", "CLIP-style image–text alignment, applied to detection", "v"),
    ("vit", "qwenvl", "a native dynamic-resolution ViT encoder", "b"),
    ("instructtune", "llava", "instruction tuning makes LLaVA an assistant", "b"),
    ("llava", "llavanext", "LLaVA-NeXT extends LLaVA to high-res", "b"),
    ("llava", "videollava", "extend the image VLM to video", "b"),
    ("instructtune", "mmrlhf", "RLHF/DPO post-trains the tuned VLM", "b"),
    ("chameleon", "unified", "early-fusion, unified token-space any-to-any", "v"),
    ("blip2", "instructtune", "instruction data tunes bridged VLMs", "b"),
    # survey-driven additions
    ("clip", "flava", "adds masked modeling to contrastive", "v"),
    ("flava", "beit3", "masked multimodal pretraining", "v"),
    ("ssl", "flava", "masked-image-modeling lineage", "b"),
    ("llava", "small-vlm", "shrink the LLaVA recipe for on-device", "b"),
    ("llava", "moe-vlm", "swap the LLM for a mixture-of-experts", "b"),
    ("instructtune", "agentic-vlm", "tool-use on an instruction-tuned VLM", "b"),
    ("agentic-vlm", "mm-rag", "retrieval is one tool the agent calls", "v"),
    ("frontier", "agentic-vlm", "frontier VLMs power the best agents", "v"),
]


def build_data():
    """Assemble the data payload that is embedded in the generated page.

    Returns a dict with four entries:

    * ``families``  - one record per ``FAMILIES`` tuple (key, label, color),
      with empty ``desc`` / ``equation`` / ``relations`` placeholders.
    * ``paradigms`` - one record per entry of ``P``; ``learn`` is reshaped
      from a (title, url) sequence into a ``{"title", "url"}`` dict.
    * ``edges``     - ``EDGES`` rows as 4-element lists, restricted to rows
      whose two endpoint ids both name a paradigm.
    * ``mypapers``  - entries of ``my_papers.MY_PAPERS`` whose ``node`` is
      either a paradigm id or a family key.
    """
    families = [
        {
            "key": key,
            "label": label,
            "color": color,
            "desc": "",
            "equation": "",
            "relations": [],
        }
        for key, label, color in FAMILIES
    ]

    # Fields carried over verbatim, in the order they appear in the output.
    copied_fields = (
        "id", "name", "short", "family", "anim",
        "tagline", "simple", "mapping", "math",
        "when", "pros", "cons", "papers",
    )
    paradigms = []
    for spec in P:
        record = {field: spec[field] for field in copied_fields}
        record["learn"] = {"title": spec["learn"][0], "url": spec["learn"][1]}
        paradigms.append(record)

    paradigm_ids = {record["id"] for record in paradigms}

    # Drop edges that point at a paradigm id which is not defined in P.
    edges = []
    for src, dst, note, kind in EDGES:
        if src in paradigm_ids and dst in paradigm_ids:
            edges.append([src, dst, note, kind])

    # A paper may be attached to a paradigm or to a whole family.
    known_nodes = paradigm_ids.union(family["key"] for family in families)
    mypapers = [
        paper for paper in my_papers.MY_PAPERS
        if paper.get("node") in known_nodes
    ]

    return {
        "families": families,
        "paradigms": paradigms,
        "edges": edges,
        "mypapers": mypapers,
    }


# ---------------------------------------------------------------------------
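# VLM animation library (injected into the shared template's ANIM object).
# Each entry is a method that takes the family color `c` and returns an SVG
# markup string. Arrowheads are drawn as explicit <path> elements inside that
# markup rather than via a shared <marker> (the map's marker isn't in scope).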
# ---------------------------------------------------------------------------
VLM_ANIM_JS = r"""
  contrastive(c){
    let s='';
    const n=4,x0=86,y0=30,cell=22;
    for(let r=0;r<n;r++)for(let q=0;q<n;q++){const x=x0+q*cell,y=y0+r*cell,d=r===q;
      s+='<rect x="'+x+'" y="'+y+'" width="'+(cell-3)+'" height="'+(cell-3)+'" rx="3" fill="'+(d?c:'#172036')+'" opacity="'+(d?'0':'0.5')+'">'+(d?'<animate attributeName="opacity" values="0;1;1;0.3" dur="2.6s" begin="'+(r*0.25)+'s" repeatCount="indefinite"/>':'')+'</rect>';}
    for(let i=0;i<n;i++){s+='<rect x="'+(x0-20)+'" y="'+(y0+i*cell)+'" width="14" height="'+(cell-3)+'" rx="2" fill="#334155"/>';
      s+='<rect x="'+(x0+i*cell)+'" y="'+(y0-16)+'" width="'+(cell-3)+'" height="11" rx="2" fill="'+c+'" opacity="0.6"/>';}
    s+='';
    return s;
  },
  caption(c){
    let s='<rect x="18" y="44" width="60" height="58" rx="7" fill="#172036" stroke="'+c+'" stroke-width="1.5"/>'+
      '<circle cx="48" cy="73" r="11" fill="none" stroke="'+c+'" stroke-width="2"/><path d="M82,73 L104,73" stroke="'+c+'" stroke-width="2"/>';
    const w=['a','cat','on','a','mat'];
    w.forEach((t,i)=>{const x=110+i*30;
      s+='<rect x="'+x+'" y="62" width="27" height="22" rx="5" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;1;1" keyTimes="0;0.12;1" dur="3s" begin="'+(i*0.4)+'s" repeatCount="indefinite"/></rect>'+
        '<text x="'+(x+13)+'" y="77" text-anchor="middle" font-size="9" fill="#06121f" opacity="0">'+t+'<animate attributeName="opacity" values="0;1;1" keyTimes="0;0.12;1" dur="3s" begin="'+(i*0.4)+'s" repeatCount="indefinite"/></text>';});
    s+='';return s;
  },
  projector(c){
    let s='<text x="12" y="20" font-size="9" fill="#93a0bd">vision patches</text>';
    for(let i=0;i<4;i++)s+='<rect x="12" y="'+(30+i*20)+'" width="18" height="16" rx="3" fill="'+c+'" opacity="0.8"/>';
    s+='<path d="M46,30 L82,46 L82,94 L46,110 Z" fill="#172036" stroke="'+c+'" stroke-width="1.5"/><text x="64" y="74" text-anchor="middle" font-size="8" fill="'+c+'">proj</text>';
    for(let i=0;i<4;i++)s+='<circle r="4" fill="'+c+'"><animateMotion dur="1.8s" begin="'+(i*0.3)+'s" repeatCount="indefinite" path="M86,70 L150,70"/><animate attributeName="opacity" values="0;1;0" dur="1.8s" begin="'+(i*0.3)+'s" repeatCount="indefinite"/></circle>';
    s+='<rect x="152" y="44" width="62" height="52" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="183" y="74" text-anchor="middle" font-size="12" fill="'+c+'">LLM</text>';
    s+='';return s;
  },
  qformer(c){
    let s='';
    for(let r=0;r<4;r++)for(let q=0;q<3;q++)s+='<rect x="'+(12+q*14)+'" y="'+(28+r*16)+'" width="11" height="13" rx="2" fill="#334155"/>';
    s+='<text x="86" y="18" font-size="9" fill="'+c+'">few queries</text>';
    for(let i=0;i<3;i++){s+='<circle cx="100" cy="'+(40+i*22)+'" r="7" fill="'+c+'"/>';s+='<line x1="58" y1="'+(50+i*14)+'" x2="93" y2="'+(40+i*22)+'" stroke="'+c+'" stroke-width="1" opacity="0.4"/>';}
    s+='<path d="M108,62 L150,62" stroke="'+c+'" stroke-width="2"/><rect x="152" y="42" width="62" height="44" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="183" y="68" text-anchor="middle" font-size="11" fill="'+c+'">LLM</text>';
    s+='';return s;
  },
  crossattn(c){
    let s='<text x="20" y="16" font-size="9" fill="#93a0bd">image features</text>';
    for(let q=0;q<6;q++)s+='<rect x="'+(20+q*30)+'" y="22" width="24" height="20" rx="3" fill="#334155"/>';
    const t=['the','cat','is'];
    t.forEach((w,i)=>{const x=36+i*72;
      s+='<rect x="'+x+'" y="98" width="52" height="22" rx="6" fill="'+c+'"/><text x="'+(x+26)+'" y="113" text-anchor="middle" font-size="10" fill="#06121f">'+w+'</text>'+
        '<path d="M'+(x+26)+',98 C'+(x+26)+',70 '+(56+i*54)+',60 '+(56+i*54)+',44" fill="none" stroke="'+c+'" stroke-width="1.6" stroke-dasharray="4 4" opacity="0"><animate attributeName="opacity" values="0;0.9;0" dur="2.2s" begin="'+(i*0.4)+'s" repeatCount="indefinite"/></path>';});
    s+='';return s;
  },
  patchify(c){
    let s='';
    const gx=16,gy=24,ps=20;
    for(let r=0;r<3;r++)for(let q=0;q<3;q++)s+='<rect x="'+(gx+q*ps)+'" y="'+(gy+r*ps)+'" width="'+(ps-2)+'" height="'+(ps-2)+'" rx="2" fill="'+c+'" opacity="'+(0.35+0.07*(r*3+q))+'"/>';
    for(let i=0;i<9;i++)s+='<rect x="'+(98+i*18)+'" y="98" width="15" height="15" rx="3" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.9" dur="0.3s" begin="'+(0.6+i*0.12)+'s" fill="freeze"/></rect>';
    s+='<path d="M80,52 L96,100" stroke="#93a0bd" stroke-width="1" stroke-dasharray="3 3" opacity="0.5"/>';return s;
  },
  earlyfusion(c){
    let s='';
    const seq=[1,1,0,1,0,0,1];
    seq.forEach((m,i)=>{const x=16+i*30;
      s+='<rect x="'+x+'" y="32" width="24" height="22" rx="4" fill="'+(m?c:'#475569')+'" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.25s" begin="'+(i*0.12)+'s" fill="freeze"/></rect>'+
        '<circle r="3" fill="#06121f"><animateMotion dur="1.6s" begin="'+(1.2+i*0.06)+'s" repeatCount="indefinite" path="M'+(x+12)+',54 L'+(x+12)+',88"/></circle>';});
    s+='<rect x="40" y="92" width="200" height="32" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="140" y="112" text-anchor="middle" font-size="11" fill="'+c+'">one Transformer</text>';return s;
  },
  grounding(c){
    let s='<rect x="14" y="26" width="184" height="98" rx="8" fill="#0a1426" stroke="#27406b"/><text x="22" y="40" font-size="9" fill="#93a0bd">scene</text>';
    s+='<circle cx="68" cy="82" r="14" fill="#334155"/><rect x="118" y="62" width="34" height="40" rx="4" fill="#334155"/>';
    s+='<text x="204" y="58" font-size="10" fill="'+c+'">"the box"</text>';
    s+='<rect x="114" y="58" width="42" height="48" rx="3" fill="none" stroke="'+c+'" stroke-width="2.5" stroke-dasharray="180" stroke-dashoffset="180"><animate attributeName="stroke-dashoffset" values="180;0" dur="1.2s" begin="0.6s" fill="freeze"/></rect>';
    s+='<text x="204" y="78" font-size="9" fill="#93a0bd">→ box on the</text><text x="204" y="90" font-size="9" fill="#93a0bd">named object</text>';return s;
  },
  anyres(c){
    let s='';
    for(let r=0;r<2;r++)for(let q=0;q<2;q++)s+='<rect x="'+(14+q*42)+'" y="'+(26+r*42)+'" width="38" height="38" rx="4" fill="'+c+'" opacity="'+(0.4+0.12*(r*2+q))+'"/>';
    s+='<rect x="14" y="116" width="34" height="20" rx="3" fill="'+c+'" opacity="0.7"/><text x="54" y="130" font-size="8" fill="#93a0bd">thumbnail</text>';
    for(let i=0;i<4;i++)s+='<circle r="4" fill="'+c+'"><animateMotion dur="1.6s" begin="'+(i*0.3)+'s" repeatCount="indefinite" path="M104,'+(46+i*8)+' L168,70"/><animate attributeName="opacity" values="0;1;0" dur="1.6s" begin="'+(i*0.3)+'s" repeatCount="indefinite"/></circle>';
    s+='<rect x="170" y="50" width="60" height="40" rx="8" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="200" y="74" text-anchor="middle" font-size="10" fill="'+c+'">VLM</text>';return s;
  },
  videoframes(c){
    let s='';
    for(let i=0;i<6;i++){const x=16+i*40,sp=i%2===0;
      s+='<rect x="'+x+'" y="28" width="34" height="44" rx="4" fill="#172036" stroke="'+(sp?c:'#27406b')+'" stroke-width="'+(sp?2:1)+'"/>';
      if(sp)s+='<rect x="'+x+'" y="28" width="34" height="44" rx="4" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.5;0" dur="2.4s" begin="'+(i*0.2)+'s" repeatCount="indefinite"/></rect>';}
    for(let i=0;i<3;i++)s+='<rect x="'+(60+i*50)+'" y="98" width="40" height="20" rx="5" fill="'+c+'" opacity="0.8"/>';
    s+='';return s;
  },
  ocr(c){
    let s='<rect x="14" y="24" width="86" height="102" rx="6" fill="#0a1426" stroke="#27406b"/>';
    for(let i=0;i<6;i++)s+='<rect x="24" y="'+(36+i*15)+'" width="'+(66-(i%3)*14)+'" height="6" rx="3" fill="#334155"/>';
    s+='<text x="22" y="138" font-size="8" fill="#93a0bd">document image</text>';
    s+='<path d="M104,74 L138,74" stroke="'+c+'" stroke-width="2"/><path d="M132,69 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="146" y="32" width="120" height="88" rx="6" fill="#0c1326" stroke="'+c+'" stroke-width="1.5"/>';
    const rows=['{ "total":','   "$42.00",','   "date":','   "6/5" }'];
    rows.forEach((t,i)=>s+='<text x="156" y="'+(52+i*18)+'" font-size="10" fill="'+c+'" font-family="ui-monospace,monospace" opacity="0">'+t+'<animate attributeName="opacity" values="0;1;1" keyTimes="0;0.15;1" dur="3s" begin="'+(0.4+i*0.4)+'s" repeatCount="indefinite"/></text>');
    return s;
  },
  ssl(c){
    let s='';
    const gx=86,gy=26,ps=24,mask=[1,2,4,7,8,5];
    for(let i=0;i<9;i++){const r=Math.floor(i/3),q=i%3,x=gx+q*ps,y=gy+r*ps,m=mask.indexOf(i)>=0;
      s+='<rect x="'+x+'" y="'+y+'" width="'+(ps-3)+'" height="'+(ps-3)+'" rx="3" fill="'+(m?'#0a1426':c)+'" stroke="#27406b"/>';
      if(m)s+='<rect x="'+x+'" y="'+y+'" width="'+(ps-3)+'" height="'+(ps-3)+'" rx="3" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.85" dur="0.5s" begin="'+(1.2+(i%3)*0.4)+'s" fill="freeze"/></rect>';}
    s+='';return s;
  },
  instructtune(c){
    let s='<rect x="16" y="30" width="44" height="40" rx="6" fill="#172036" stroke="'+c+'" stroke-width="1.5"/><circle cx="38" cy="50" r="9" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="70" y="32" width="166" height="24" rx="9" fill="#172036" stroke="#334155"/><text x="80" y="48" font-size="10" fill="#cbd5e1">"What is in the image?"</text>';
    s+='<rect x="70" y="78" width="190" height="40" rx="9" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.4s" begin="1s" fill="freeze"/></rect>'+
       '<text x="80" y="95" font-size="10" fill="#06121f" opacity="0">"A cat sitting on a<animate attributeName="opacity" values="0;1" dur="0.4s" begin="1.1s" fill="freeze"/></text>'+
       '<text x="80" y="110" font-size="10" fill="#06121f" opacity="0">windowsill in the sun."<animate attributeName="opacity" values="0;1" dur="0.4s" begin="1.3s" fill="freeze"/></text>';
    s+='';return s;
  },
  rlhf(c){
    let s='';
    s+='<rect x="16" y="26" width="116" height="40" rx="8" fill="#172036" stroke="#fb7185" stroke-width="1.5"/><text x="24" y="43" font-size="10" fill="#fb7185">A: "a dog" ✗</text><text x="24" y="58" font-size="8" fill="#93a0bd">(hallucinated)</text>';
    s+='<rect x="16" y="76" width="116" height="40" rx="8" fill="#172036" stroke="#34d399" stroke-width="1.5"/><text x="24" y="93" font-size="10" fill="#34d399">B: "a cat" ✓</text>';
    s+='<text x="150" y="103" font-size="19">👍<animate attributeName="opacity" values="0.3;1;0.3" dur="1.6s" repeatCount="indefinite"/></text><path d="M134,96 L150,96" stroke="#34d399" stroke-width="2"/>';
    s+='<rect x="188" y="76" width="64" height="44" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="220" y="102" text-anchor="middle" font-size="10" fill="'+c+'">VLM</text>';
    s+='<path d="M172,114 C192,138 212,128 220,122" fill="none" stroke="'+c+'" stroke-width="1.4" stroke-dasharray="4 4" opacity="0.6"/>';return s;
  },
  anytoany(c){
    let s='<rect x="20" y="48" width="56" height="52" rx="7" fill="#172036" stroke="'+c+'" stroke-width="1.5"/><circle cx="48" cy="74" r="11" fill="none" stroke="'+c+'" stroke-width="2"/><text x="48" y="40" text-anchor="middle" font-size="9" fill="#93a0bd">image</text>';
    s+='<rect x="204" y="48" width="56" height="52" rx="7" fill="#172036" stroke="'+c+'" stroke-width="1.5"/><text x="232" y="40" text-anchor="middle" font-size="9" fill="#93a0bd">text</text><text x="232" y="78" text-anchor="middle" font-size="10" fill="'+c+'">"a cat"</text>';
    s+='<path d="M78,62 L198,62" stroke="'+c+'" stroke-width="2"/><path d="M198,58 l8,4 l-8,4 z" fill="'+c+'"/><text x="140" y="56" text-anchor="middle" font-size="8" fill="#93a0bd">understand</text>';
    s+='<path d="M202,88 L82,88" stroke="'+c+'" stroke-width="2"/><path d="M82,84 l-8,4 l8,4 z" fill="'+c+'"/><text x="140" y="102" text-anchor="middle" font-size="8" fill="#93a0bd">generate</text>';
    s+='';return s;
  },
  efficient(c){
    let s='<rect x="16" y="34" width="70" height="60" rx="10" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="51" y="68" text-anchor="middle" font-size="11" fill="'+c+'">big VLM</text>';
    s+='<path d="M90,64 L138,64" stroke="'+c+'" stroke-width="2"/><path d="M132,59 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="146" y="48" width="40" height="34" rx="7" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="166" y="68" text-anchor="middle" font-size="9" fill="'+c+'">small</text>';
    s+='<rect x="196" y="40" width="34" height="56" rx="6" fill="#172036" stroke="#475569"/><text x="213" y="73" text-anchor="middle" font-size="15">📱</text>';
    s+='';return s;
  },
  moe(c){
    let s='';
    s+='<circle cx="38" cy="74" r="10" fill="'+c+'"/><text x="38" y="78" text-anchor="middle" font-size="8" fill="#06121f">tok</text>';
    s+='<rect x="68" y="60" width="42" height="28" rx="7" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="89" y="78" text-anchor="middle" font-size="9" fill="'+c+'">router</text>';
    const ys=[30,60,90,118];
    ys.forEach((y,i)=>{const on=i===1||i===2;
      s+='<rect x="150" y="'+(y-12)+'" width="64" height="24" rx="6" fill="#172036" stroke="'+(on?c:'#334155')+'" stroke-width="'+(on?2:1)+'"/><text x="182" y="'+(y+4)+'" text-anchor="middle" font-size="9" fill="'+(on?c:'#64748b')+'">expert '+(i+1)+'</text>';
      s+='<line x1="110" y1="74" x2="148" y2="'+y+'" stroke="'+c+'" stroke-width="'+(on?2:1)+'" opacity="'+(on?0.9:0.2)+'"/>';});
    s+='';return s;
  },
  agentic(c){
    let s='';
    s+='<rect x="104" y="58" width="72" height="34" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="140" y="80" text-anchor="middle" font-size="11" fill="'+c+'">VLM agent</text>';
    const tools=[['🔍 search',26,30],['🧮 run code',196,30],['🖱️ act',110,112]];
    tools.forEach(t=>{s+='<rect x="'+t[1]+'" y="'+t[2]+'" width="66" height="22" rx="6" fill="#172036" stroke="#475569"/><text x="'+(t[1]+33)+'" y="'+(t[2]+15)+'" text-anchor="middle" font-size="9" fill="#cbd5e1">'+t[0]+'</text>';});
    s+='<line x1="118" y1="58" x2="66" y2="52" stroke="'+c+'" stroke-width="1.5" stroke-dasharray="4 4" opacity="0.5"/>';
    s+='<line x1="162" y1="58" x2="214" y2="52" stroke="'+c+'" stroke-width="1.5" stroke-dasharray="4 4" opacity="0.5"/>';
    s+='<line x1="140" y1="92" x2="143" y2="112" stroke="'+c+'" stroke-width="1.5" stroke-dasharray="4 4" opacity="0.5"/>';
    s+='<circle r="4" fill="'+c+'"><animateMotion dur="3s" repeatCount="indefinite" path="M140,58 L66,52"/><animate attributeName="opacity" values="0;1;0" dur="3s" repeatCount="indefinite"/></circle>';
    s+='<circle r="4" fill="'+c+'"><animateMotion dur="3s" begin="1s" repeatCount="indefinite" path="M140,58 L214,52"/><animate attributeName="opacity" values="0;1;0" dur="3s" begin="1s" repeatCount="indefinite"/></circle>';
    s+='<circle r="4" fill="'+c+'"><animateMotion dur="3s" begin="2s" repeatCount="indefinite" path="M140,92 L143,112"/><animate attributeName="opacity" values="0;1;0" dur="3s" begin="2s" repeatCount="indefinite"/></circle>';
    return s;
  },
  rag(c){
    let s='';
    for(let i=0;i<4;i++)s+='<rect x="14" y="'+(26+i*20)+'" width="40" height="16" rx="3" fill="#172036" stroke="#27406b"/>';
    for(let i=0;i<2;i++)s+='<rect x="14" y="'+(26+i*20)+'" width="40" height="16" rx="3" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;0.8;0.8" keyTimes="0;0.3;1" dur="3s" begin="'+(i*0.3)+'s" repeatCount="indefinite"/></rect>';
    s+='';
    s+='<path d="M58,40 L96,64" stroke="'+c+'" stroke-width="1.5" stroke-dasharray="4 4" opacity="0.6"/>';
    s+='<rect x="98" y="50" width="62" height="34" rx="9" fill="#0c1326" stroke="'+c+'" stroke-width="2"/><text x="129" y="72" text-anchor="middle" font-size="11" fill="'+c+'">VLM</text>';
    s+='<path d="M162,67 L196,67" stroke="'+c+'" stroke-width="2"/><path d="M190,62 l8,5 l-8,5" fill="none" stroke="'+c+'" stroke-width="2"/>';
    s+='<rect x="200" y="52" width="66" height="30" rx="8" fill="'+c+'" opacity="0"><animate attributeName="opacity" values="0;1" dur="0.4s" begin="1.2s" fill="freeze"/></rect>'+
       '<text x="233" y="71" text-anchor="middle" font-size="9" fill="#06121f" opacity="0">grounded answer<animate attributeName="opacity" values="0;1" dur="0.4s" begin="1.3s" fill="freeze"/></text>';
    return s;
  },
"""

# One-line caption per animation, keyed by the ANIM method name it describes.
_VLM_CAPTIONS = {
    "contrastive": "Match the true image-text pair; push the rest apart.",
    "caption": "The image is described one word at a time.",
    "projector": "A small MLP turns image patches into LLM tokens.",
    "qformer": "A few query tokens compress the image for the LLM.",
    "crossattn": "A frozen LLM's text cross-attends to image features.",
    "patchify": "The image becomes a sequence of patch tokens.",
    "earlyfusion": "Image and text tokens share one transformer.",
    "grounding": "Text names an object; a box is drawn on it.",
    "anyres": "A high-res image is split into tiles + a thumbnail.",
    "videoframes": "A few sampled frames stand in for the video.",
    "ocr": "A document image is read straight into structured text.",
    "ssl": "Hidden patches are reconstructed — learning with no labels.",
    "instructtune": "Tuned on image+question to answer like an assistant.",
    "rlhf": "Preference for the truthful answer reduces hallucination.",
    "anytoany": "One model both understands and generates images.",
    "efficient": "Shrink a big VLM (distill + compress) to run on-device.",
    "moe": "A router fires only a few expert sub-nets per token.",
    "agentic": "A VLM that calls tools in a loop to solve tasks.",
    "rag": "Retrieve relevant images/docs, then answer grounded in them.",
}

# Serialised as the inside of a JS object literal: `key:"text",` entries run
# together with no separators, ready to be spliced in after `const ANIM_CAP = {`.
# Captions must not contain a double quote, since nothing here escapes them.
VLM_CAP_JS = "".join('%s:"%s",' % entry for entry in _VLM_CAPTIONS.items())


def render():
    """Return the complete page text for the VLM landscape.

    Starts from ``gen_landscape.TEMPLATE`` and applies, in order:

    1. one-shot injections of the VLM animation methods and captions right
       after the opening brace of ``ANIM`` and ``ANIM_CAP``;
    2. branding swaps (title, icon, centre label) wherever they occur;
    3. substitution of the ``__DATA_JSON__`` placeholder with the serialised
       output of ``build_data()``.

    ``str.replace`` leaves the text unchanged when an anchor is absent, so a
    template that drifts shows up as missing content, not as an error.
    """
    page = gen_landscape.TEMPLATE

    # Only the first occurrence of each anchor is extended.
    injections = (
        ("const ANIM = {", "\n" + VLM_ANIM_JS),
        ("const ANIM_CAP = {", "\n  " + VLM_CAP_JS + "\n"),
    )
    for anchor, payload in injections:
        page = page.replace(anchor, anchor + payload, 1)

    # Robot-specific labels become VLM-specific ones everywhere.
    rebranding = (
        ("🤖 Robot Learning Landscape", "👁️ Vision-Language Model Landscape"),
        (">🤖</text>", ">👁️</text>"),
        (">POLICY</text>", ">VLM</text>"),
    )
    for old_text, new_text in rebranding:
        page = page.replace(old_text, new_text)

    # Data goes in last so the swaps above never touch the payload text.
    data_json = json.dumps(build_data())
    return page.replace("__DATA_JSON__", data_json)


def main():
    """Write the rendered page to ``robot_vlm.html`` and print a summary.

    The file is created in the current working directory as UTF-8. The
    summary line reports the page length in characters and the number of
    families, paradigms and edges returned by ``build_data()``.
    """
    page = render()
    with open("robot_vlm.html", "w", encoding="utf-8") as out_file:
        out_file.write(page)

    # Rebuilt here only for the counts; render() does not expose its payload.
    stats = build_data()
    counts = tuple(len(stats[section]) for section in ("families", "paradigms", "edges"))
    print("wrote robot_vlm.html (%d chars) — %d families, %d paradigms, %d edges"
          % ((len(page),) + counts))


# Script entry point: generate the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()