"""Generate a Vision-Language-Model (VLM) landscape page. Reuses the robot-landscape engine (gen_landscape.TEMPLATE: constellation map, Magic-Move detail panels, typed edges, MathJax) but injects a VLM-specific set of animations and data. Self-contained HTML. Run: .venv_robot_paradigms/bin/python gen_vlm.py -> robot_vlm.html """ import json import gen_landscape import my_papers # --------------------------------------------------------------------------- # Families (11) # --------------------------------------------------------------------------- FAMILIES = [ ("Contrastive", "Contrastive Alignment", "#2563eb"), ("Generative", "Generative Pretraining", "#0891b2"), ("Bridge", "Vision→LLM Bridges", "#7c3aed"), ("Native", "Native Multimodal", "#ea580c"), ("Grounding", "Grounding / Regions", "#16a34a"), ("AnyRes", "High-Resolution", "#ca8a04"), ("Video", "Video VLMs", "#db2777"), ("Document", "Document / OCR", "#92400e"), ("Backbone", "Vision Backbones", "#64748b"), ("Posttrain", "Post-Training", "#e11d48"), ("Unified", "Any-to-Any", "#4f46e5"), ("Masked", "Masked Pretraining", "#38bdf8"), ("Efficient", "Efficient / Small", "#34d399"), ("MoE", "Mixture-of-Experts", "#fbbf24"), ("Agentic", "Agentic / RAG", "#fb7185"), ] # id, name, short, family, anim, tagline, simple, mapping, math, when, pros, cons, papers, (learn title,url) P = [ dict(id="clip", name="CLIP (Contrastive Image-Text)", short="CLIP", family="Contrastive", anim="contrastive", tagline="Pull matching image–text pairs together, push the rest apart.", simple=("Show the model millions of image+caption pairs. Train it so each image's vector lands " "next to its true caption's vector and far from all the wrong captions. The result: you can " "compare any image to any sentence — the basis of almost every modern VLM."), mapping="image, text → shared embedding space", math=r"\mathcal{L}=-\tfrac12\sum_i\Big[\log\tfrac{e^{\langle I_i,T_i\rangle/\tau}}{\sum_j e^{\langle I_i,T_j\rangle/\tau}}+\log\tfrac{e^{\langle I_i,T_i\rangle/\tau}}{\sum_j e^{\langle I_j,T_i\rangle/\tau}}\Big]", when="Zero-shot classification/retrieval, and as the vision encoder feeding most VLMs.", pros=["Zero-shot transfer", "Web-scale, no labels needed", "Encoder reused everywhere"], cons=["Weak at fine spatial detail / counting", "Bag-of-words text understanding", "Needs huge batches"], papers=["CLIP (Radford 2021)", "ALIGN (Jia 2021)", "OpenCLIP"], learn=("What is CLIP? — Roboflow (illustrated)", "https://blog.roboflow.com/openai-clip/")), dict(id="siglip", name="SigLIP (Sigmoid Loss)", short="SigLIP", family="Contrastive", anim="contrastive", tagline="Same idea as CLIP, but a simpler pairwise sigmoid loss.", simple=("CLIP needs a giant batch so every image can be compared against many captions at once. " "SigLIP swaps the softmax for an independent yes/no (sigmoid) on each image–text pair, so it " "trains well even with smaller batches — and tends to give a stronger encoder."), mapping="image, text → matched? (per-pair sigmoid)", math=r"\mathcal{L}=\sum_{i,j}\log\sigma\!\big(z_{ij}\,(t\,\langle I_i,T_j\rangle+b)\big),\;\;z_{ij}\in\{+1,-1\}", when="A drop-in, batch-friendly upgrade to CLIP; encoder for newer VLMs (e.g. Qwen-VL, PaliGemma).", pros=["Works with smaller batches", "Often stronger than CLIP", "Simple loss"], cons=["Still global (not region-level)", "Needs careful bias init"], papers=["SigLIP (Zhai 2023)", "SigLIP 2 (2025)"], learn=("Google's SigLIP explained — Analytics Vidhya", "https://www.analyticsvidhya.com/blog/2024/10/googles-siglip/")), dict(id="coca", name="CoCa (Contrastive Captioner)", short="CoCa", family="Generative", anim="caption", tagline="One model trained on BOTH a contrastive loss and a captioning loss.", simple=("Why choose between matching (CLIP) and describing (captioning)? CoCa does both at once: the " "first half of the model learns CLIP-style alignment, the second half learns to write the " "caption. You get a strong encoder AND a generator from one training run."), mapping="image → embedding + generated caption", math=r"\mathcal{L}=\lambda_{\text{con}}\,\mathcal{L}_{\text{contrastive}}+\lambda_{\text{cap}}\,\mathcal{L}_{\text{caption}}", when="When you want one backbone good at both retrieval and captioning/VQA.", pros=["Encoder + decoder in one", "Strong transfer", "Single pretraining"], cons=["Heavier to train", "Closed weights (original)"], papers=["CoCa (Yu 2022)"], learn=("CoCa visually explained — DataCamp", "https://www.datacamp.com/tutorial/coca-contrastive-captioners-are-image-text-foundation-models-visually-explained")), dict(id="blip", name="BLIP (Bootstrapped Captions)", short="BLIP", family="Generative", anim="caption", tagline="Generate captions, filter the noisy ones, retrain on the clean set.", simple=("Web captions are noisy. BLIP trains a captioner to write new captions for images, a filter to " "throw out bad ones, and then re-trains on this cleaned-up data (CapFilt). It can both " "understand (match) and describe (generate)."), mapping="image → caption (bootstrapped, CapFilt)", math=r"\mathcal{L}=\mathcal{L}_{\text{ITC}}+\mathcal{L}_{\text{ITM}}+\mathcal{L}_{\text{LM}}\;\;(\text{caption}+\text{filter})", when="Caption/VQA when your web data is noisy; predecessor of BLIP-2.", pros=["Cleans noisy web data", "Understand + generate", "Open"], cons=["Multi-stage pipeline", "Smaller than LLM-bridged VLMs"], papers=["BLIP (Li 2022)"], learn=("BLIP walkthrough — Yannic Kilcher (video)", "https://www.youtube.com/watch?v=X2k7n4FuI7c")), dict(id="git", name="GIT / SimVLM (Generative)", short="GIT / SimVLM", family="Generative", anim="caption", tagline="Just predict the caption tokens, end to end.", simple=("The simplest recipe: feed the image into an encoder and train one decoder to emit the caption " "word by word — like a language model that can see. No contrastive loss, no fancy pipeline."), mapping="image → text (autoregressive)", math=r"\mathcal{L}=-\sum_t \log p_\theta(w_t\mid w_{'+(d?'':'')+'';} for(let i=0;i'; s+='';} s+=''; return s; }, caption(c){ let s=''+ ''; const w=['a','cat','on','a','mat']; w.forEach((t,i)=>{const x=110+i*30; s+=''+ ''+t+'';}); s+='';return s; }, projector(c){ let s='vision patches'; for(let i=0;i<4;i++)s+=''; s+='proj'; for(let i=0;i<4;i++)s+=''; s+='LLM'; s+='';return s; }, qformer(c){ let s=''; for(let r=0;r<4;r++)for(let q=0;q<3;q++)s+=''; s+='few queries'; for(let i=0;i<3;i++){s+='';s+='';} s+='LLM'; s+='';return s; }, crossattn(c){ let s='image features'; for(let q=0;q<6;q++)s+=''; const t=['the','cat','is']; t.forEach((w,i)=>{const x=36+i*72; s+=''+w+''+ '';}); s+='';return s; }, patchify(c){ let s=''; const gx=16,gy=24,ps=20; for(let r=0;r<3;r++)for(let q=0;q<3;q++)s+=''; for(let i=0;i<9;i++)s+=''; s+='';return s; }, earlyfusion(c){ let s=''; const seq=[1,1,0,1,0,0,1]; seq.forEach((m,i)=>{const x=16+i*30; s+=''+ '';}); s+='one Transformer';return s; }, grounding(c){ let s='scene'; s+=''; s+='"the box"'; s+=''; s+='→ box on thenamed object';return s; }, anyres(c){ let s=''; for(let r=0;r<2;r++)for(let q=0;q<2;q++)s+=''; s+='thumbnail'; for(let i=0;i<4;i++)s+=''; s+='VLM';return s; }, videoframes(c){ let s=''; for(let i=0;i<6;i++){const x=16+i*40,sp=i%2===0; s+=''; if(sp)s+='';} for(let i=0;i<3;i++)s+=''; s+='';return s; }, ocr(c){ let s=''; for(let i=0;i<6;i++)s+=''; s+='document image'; s+=''; s+=''; const rows=['{ "total":',' "$42.00",',' "date":',' "6/5" }']; rows.forEach((t,i)=>s+=''+t+''); return s; }, ssl(c){ let s=''; const gx=86,gy=26,ps=24,mask=[1,2,4,7,8,5]; for(let i=0;i<9;i++){const r=Math.floor(i/3),q=i%3,x=gx+q*ps,y=gy+r*ps,m=mask.indexOf(i)>=0; s+=''; if(m)s+='';} s+='';return s; }, instructtune(c){ let s=''; s+='"What is in the image?"'; s+=''+ '"A cat sitting on a'+ 'windowsill in the sun."'; s+='';return s; }, rlhf(c){ let s=''; s+='A: "a dog" ✗(hallucinated)'; s+='B: "a cat" ✓'; s+='👍'; s+='VLM'; s+='';return s; }, anytoany(c){ let s='image'; s+='text"a cat"'; s+='understand'; s+='generate'; s+='';return s; }, efficient(c){ let s='big VLM'; s+=''; s+='small'; s+='📱'; s+='';return s; }, moe(c){ let s=''; s+='tok'; s+='router'; const ys=[30,60,90,118]; ys.forEach((y,i)=>{const on=i===1||i===2; s+='expert '+(i+1)+''; s+='';}); s+='';return s; }, agentic(c){ let s=''; s+='VLM agent'; const tools=[['🔍 search',26,30],['🧮 run code',196,30],['🖱️ act',110,112]]; tools.forEach(t=>{s+=''+t[0]+'';}); s+=''; s+=''; s+=''; s+=''; s+=''; s+=''; return s; }, rag(c){ let s=''; for(let i=0;i<4;i++)s+=''; for(let i=0;i<2;i++)s+=''; s+=''; s+=''; s+='VLM'; s+=''; s+=''+ 'grounded answer'; return s; }, """ VLM_CAP_JS = ( 'contrastive:"Match the true image-text pair; push the rest apart.",' 'caption:"The image is described one word at a time.",' 'projector:"A small MLP turns image patches into LLM tokens.",' 'qformer:"A few query tokens compress the image for the LLM.",' 'crossattn:"A frozen LLM\'s text cross-attends to image features.",' 'patchify:"The image becomes a sequence of patch tokens.",' 'earlyfusion:"Image and text tokens share one transformer.",' 'grounding:"Text names an object; a box is drawn on it.",' 'anyres:"A high-res image is split into tiles + a thumbnail.",' 'videoframes:"A few sampled frames stand in for the video.",' 'ocr:"A document image is read straight into structured text.",' 'ssl:"Hidden patches are reconstructed — learning with no labels.",' 'instructtune:"Tuned on image+question to answer like an assistant.",' 'rlhf:"Preference for the truthful answer reduces hallucination.",' 'anytoany:"One model both understands and generates images.",' 'efficient:"Shrink a big VLM (distill + compress) to run on-device.",' 'moe:"A router fires only a few expert sub-nets per token.",' 'agentic:"A VLM that calls tools in a loop to solve tasks.",' 'rag:"Retrieve relevant images/docs, then answer grounded in them.",' ) def render(): t = gen_landscape.TEMPLATE t = t.replace("const ANIM = {", "const ANIM = {\n" + VLM_ANIM_JS, 1) t = t.replace("const ANIM_CAP = {", "const ANIM_CAP = {\n " + VLM_CAP_JS + "\n", 1) t = t.replace("🤖 Robot Learning Landscape", "👁️ Vision-Language Model Landscape") t = t.replace(">🤖", ">👁️") t = t.replace(">POLICY", ">VLM") t = t.replace("__DATA_JSON__", json.dumps(build_data())) return t def main(): html = render() with open("robot_vlm.html", "w", encoding="utf-8") as fh: fh.write(html) d = build_data() print("wrote robot_vlm.html (%d chars) — %d families, %d paradigms, %d edges" % (len(html), len(d["families"]), len(d["paradigms"]), len(d["edges"]))) if __name__ == "__main__": main()