"""A 3-axis overview of the VLM landscape (the survey-standard organization): ① Architecture / objective axis — HOW vision meets language ② Capability / extension axis — WHAT it can do ③ Training-stage axis — HOW it is built (a left-to-right pipeline) Reuses gen_vlm.build_data() for names / colors / taglines / learn links, and renders a static (but linked + tooltip'd) layered diagram. Self-contained HTML. Run: .venv_robot_paradigms/bin/python gen_vlm_axes.py -> robot_vlm_axes.html """ import html as H import gen_vlm AXIS_A = ["Backbone", "Contrastive", "Masked", "Generative", "Bridge", "Native"] AXIS_B = ["Grounding", "AnyRes", "Video", "Document", "Unified", "Efficient", "MoE", "Agentic"] STAGES = [ ("① Vision pretraining", "learn to see (often label-free)", ["vit", "ssl"]), ("② VL pretraining / alignment", "align or generate over image+text", ["clip", "siglip", "flava", "beit3", "blip", "coca", "git", "chameleon"]), ("③ Connector / bridge", "wire a vision encoder into an LLM", ["flamingo", "blip2", "llava", "qwenvl"]), ("④ Instruction tuning", "become a helpful assistant", ["instructtune"]), ("⑤ Preference (RLHF / DPO)", "be truthful, not hallucinated", ["mmrlhf"]), ("⑥ Inference-time", "no weight change: tools & retrieval", ["agentic-vlm", "mm-rag", "frontier"]), ] def esc(t): return H.escape(str(t), quote=True) def render(): d = gen_vlm.build_data() byid = {p["id"]: p for p in d["paradigms"]} fam = {f["key"]: f for f in d["families"]} def chip(pid): p = byid.get(pid) if not p: return "" c = fam[p["family"]]["color"] url = (p.get("learn") or {}).get("url", "") a_open = '' % ( c, esc(url), esc(p["tagline"])) return a_open + esc(p["short"]) + "" def fam_card(fkey): f = fam[fkey] kids = [p for p in d["paradigms"] if p["family"] == fkey] chips = "".join(chip(p["id"]) for p in kids) return ('

' '

') % (f["color"], esc(f["label"]), chips) def stage_card(label, sub, ids): chips = "".join(chip(i) for i in ids) return ('

' '

') % ( esc(label), esc(sub), chips) axis_a = "".join(fam_card(k) for k in AXIS_A if k in fam) axis_b = "".join(fam_card(k) for k in AXIS_B if k in fam) stages = ('

→

').join(stage_card(l, s, ids) for (l, s, ids) in STAGES) return """ VLM — Three Axes

👁️ Vision-Language Models — the three axes

The same models, organized the way recent surveys do: by architecture (how vision meets language), by capability (what it can do), and by training stage (how it's built). Hover a chip for its one-line idea; click to read.

① Architecture axis— how vision meets language (the training objective / fusion)

__AXIS_A__

② Capability axis— what the model can additionally do (extensions)

__AXIS_B__

③ Training-stage axis— how a modern VLM is built, left to right

__STAGES__

Read it as a grid: a model picks one cell from the architecture axis (e.g. Bridge), gains capability cells (e.g. High-Res + Agentic), and is produced by walking the training-stage pipeline (pretrain → align → connect → instruction-tune → preference → inference-time). Vision Backbones are the foundations every column builds on.

""".replace("__AXIS_A__", axis_a).replace("__AXIS_B__", axis_b).replace("__STAGES__", stages) def main(): html = render() with open("robot_vlm_axes.html", "w", encoding="utf-8") as fh: fh.write(html) print("wrote robot_vlm_axes.html (%d chars)" % len(html)) if __name__ == "__main__": main()