import os # The Boogu transformer/pipeline select their attention + norm kernels based on # this env var at construction time, so it must be set before importing torch. os.environ.setdefault("device", "cuda:0") # Use the pure-torch RMSNorm path (not the triton fused kernel) so the block # parameter layout matches the AoTI graph compiled in the companion Space. import boogu.utils.import_utils as _import_utils _import_utils._triton_available = False import base64 import csv import io import json import sys # Example caching writes the cached output (which embeds the base64 before/after # data URIs) through the csv module; bump the field limit so large frames don't # trip "_csv.Error: field larger than field limit". csv.field_size_limit(sys.maxsize) import spaces import torch import gradio as gr from PIL import Image from boogu.pipelines.boogu.pipeline_boogu import BooguImagePipeline from boogu.pipelines.boogu.pipeline_boogu_turbo import BooguImageTurboPipeline MODEL_ID = "Boogu/Boogu-Image-0.1-Edit" TURBO_ID = "Boogu/Boogu-Image-0.1-Turbo" AOTI_REPO = "multimodalart/Boogu-Image-0.1-Edit-aoti" # Set to a Turbo AoTI repo to patch the Turbo single-stream blocks (None = eager). # Flip between "...-Turbo-aoti" (default compile) and "...-Turbo-aoti-mat" (max_autotune) # to A/B the compiled variants. Leave None to keep the eager 3.3s baseline. TURBO_AOTI_REPO = os.environ.get("TURBO_AOTI_REPO") or None pipe = BooguImagePipeline.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True, ) pipe.to("cuda") # Turbo shares the (byte-identical) mllm / vae / processor / scheduler with Edit; # only the transformer differs. Load just the Turbo transformer and build a Turbo # pipeline reusing the already-resident components — no duplicate 17.5GB mllm. 
# Only the transformer weights are Turbo-specific, so instantiate them with the
# same class as the Edit transformer and load them from the Turbo repo.
turbo_transformer = type(pipe.transformer).from_pretrained(
    TURBO_ID,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)

# Copy of the Edit pipeline's component mapping with the transformer swapped;
# every other component object is shared with `pipe`, not duplicated.
_turbo_components = dict(pipe.components, transformer=turbo_transformer)
turbo_pipe = BooguImageTurboPipeline(**_turbo_components)

# The instruction-rewriter pair is not passed through the constructor kwargs
# above, so hand it over from the Edit pipeline explicitly.
turbo_pipe.text_instruction_rewriter = pipe.text_instruction_rewriter
turbo_pipe.instruction_rewriter_processor = pipe.instruction_rewriter_processor
turbo_pipe.to("cuda")

# Replace the 24 repeated single-stream blocks of the Edit transformer with their
# AoTI-compiled graph (one shared compiled graph, per-block weights). This is
# best-effort: any failure is logged and the blocks keep running eager.
# Only the Edit transformer is compiled for now; Turbo runs eager (baseline).
try:
    from pathlib import Path

    from huggingface_hub import snapshot_download
    from spaces.zero.torch.aoti import aoti_load_from_module_dir

    _block_dir = Path(snapshot_download(AOTI_REPO), "BooguImageTransformerBlock")
    if not (_block_dir / "package.pt2").exists():
        print("AoTI: Edit package.pt2 not found, running eager")
    else:
        aoti_load_from_module_dir(pipe.transformer.single_stream_layers, _block_dir)
        print(f"AoTI: patched {len(pipe.transformer.single_stream_layers)} Edit single-stream blocks")
except Exception as exc:  # noqa: BLE001 - deliberate catch-all, eager is the fallback
    print(f"AoTI (Edit) load failed ({exc!r}); running eager")

# Optionally patch the Turbo single-stream blocks too (off by default = eager baseline).
def _patch_turbo_aoti(layers_attr, repo, block_dirname, label):
    """Best-effort swap of a group of Turbo transformer blocks for an AoTI graph.

    Downloads ``repo``, looks for ``<block_dirname>/package.pt2`` inside the
    snapshot and, when present, patches ``turbo_pipe.transformer.<layers_attr>``
    with it. Nothing is raised: a missing package or any exception is printed
    and the blocks are left running eager.

    Args:
        layers_attr: Attribute name on ``turbo_pipe.transformer`` holding the
            blocks to patch. It is resolved inside the ``try`` so a missing
            attribute is reported like any other load failure.
        repo: Hub repo id holding the compiled package.
        block_dirname: Sub-directory of the snapshot containing ``package.pt2``.
        label: Human-readable name used in the log messages.
    """
    try:
        from pathlib import Path

        from huggingface_hub import snapshot_download
        from spaces.zero.torch.aoti import aoti_load_from_module_dir

        block_dir = Path(snapshot_download(repo)) / block_dirname
        if (block_dir / "package.pt2").exists():
            aoti_load_from_module_dir(getattr(turbo_pipe.transformer, layers_attr), block_dir)
            patched = len(getattr(turbo_pipe.transformer, layers_attr))
            print(f"AoTI: patched {patched} {label} blocks from {repo}")
        else:
            print(f"AoTI: {label} package.pt2 not found in {repo}, running eager")
    except Exception as exc:  # noqa: BLE001 - deliberate catch-all, eager is the fallback
        print(f"AoTI ({label}) load failed ({exc!r}); running eager")


# Turbo single-stream blocks: only patched when TURBO_AOTI_REPO is configured.
if TURBO_AOTI_REPO:
    _patch_turbo_aoti(
        "single_stream_layers",
        TURBO_AOTI_REPO,
        "BooguImageTransformerBlock",
        "Turbo",
    )

# EXPERIMENT (#10): optionally patch the 2 Turbo double-stream blocks with a second
# AoTI graph. WARNING: that graph bakes the captured per-sample seq lengths as
# constants (the block takes them as python int lists, not dynamic tensors), so it
# is only correct for prompts whose instruction tokenizes to the captured length.
DS_TURBO_AOTI_REPO = os.environ.get("DS_TURBO_AOTI_REPO") or None
if DS_TURBO_AOTI_REPO:
    _patch_turbo_aoti(
        "double_stream_layers",
        DS_TURBO_AOTI_REPO,
        "BooguImageDoubleStreamTransformerBlock",
        "Turbo double-stream",
    )

# Largest seed value used by the UI slider and by seed randomisation.
MAX_SEED = 2**31 - 1


def _data_uri(img):
    """Return ``img`` encoded as a base64 WebP ``data:`` URI.

    Args:
        img: Object exposing ``save(fileobj, format=..., quality=...)``;
            presumably a ``PIL.Image.Image`` — verify against callers.

    Returns:
        A string starting with ``data:image/webp;base64,`` followed by the
        base64 text of the image saved as WebP at quality 92.
    """
    buf = io.BytesIO()
    img.save(buf, format="WEBP", quality=92)
    return "data:image/webp;base64," + base64.b64encode(buf.getvalue()).decode()


# Custom before/after comparison built on gr.HTML
# (gr.ImageSlider is broken with gr.Examples caching on this Gradio build and
# doesn't keep the two sides aligned).
# Markup/CSS mirror Gradio's native ImageSlider: both images fill the same box with
# object-fit:contain so they line up regardless of native size; the edited ("after")
# image is revealed by a clip-path driven by an accent-pill handle on a 1px divider.
# NOTE: Gradio evaluates html_template via `new Function(..., "return `" + tpl + "`")`,
# i.e. it wraps the whole template in backticks. So the template must NOT contain any
# backticks of its own (nested template literals terminate the wrapper and silently
# blank the component) — build the markup with single-quote string concatenation.
#
# NOTE(review): in this copy of the file every markup fragment below ('' literals in
# _BA_LABEL, _BA_DOWNLOAD_ICON and BA_HTML) is empty, which does not match the
# comments describing them — presumably the HTML/SVG was stripped in transit.
# Confirm against the deployed source before shipping these constants.

# Native-style floating block label (icon + text), mirroring Gradio's block-label.
_BA_LABEL = (
    ''
)
_BA_DOWNLOAD_ICON = (
    ''
    ''
    ''
    ''
)
# value arrives as a JSON string (see edit()); parse it defensively. An IIFE keeps
# this a single ${...} expression with no backticks.
BA_HTML = (
    "${(function(){\n"
    " var d = {};\n"
    " try { d = value ? JSON.parse(value) : {}; } catch (e) { d = {}; }\n"
    " return (d && d.after)\n"
    " ? ''\n"
    " + '" + _BA_LABEL + "'\n"
    " + ''\n"
    " + '" + _BA_DOWNLOAD_ICON + "'\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " + ''\n"
    " : ''\n"
    " + '" + _BA_LABEL + "'\n"
    " + 'Result will appear here'\n"
    " + '';\n"
    "})()}"
)

# Styles for the comparison widget; one rule per line.
BA_CSS = """.ba{position:relative;width:100%;height:360px;background:var(--block-background-fill);border:var(--block-border-width) solid var(--block-border-color);border-radius:var(--block-radius);box-shadow:var(--block-shadow);overflow:hidden;touch-action:none;user-select:none}
.ba-img{position:absolute;inset:0;width:100%;height:100%;object-fit:contain;background:var(--block-background-fill);-webkit-user-drag:none;user-select:none;transform-origin:0 0;will-change:transform}
.ba-after{clip-path:inset(0 0 0 var(--pos,50%))}
.ba-line{position:absolute;top:0;height:100%;left:var(--pos,50%);width:20px;transform:translateX(-50%);cursor:grab;z-index:2}
.ba.dragging .ba-line{cursor:grabbing}
.ba-inner{position:absolute;left:50%;top:0;width:1px;height:100%;transform:translateX(-50%);background:var(--border-color-primary)}
.ba-handle{position:absolute;top:50%;left:50%;transform:translate(-50%,-50%);width:40px;height:30px;border-radius:5px;background:var(--color-accent);color:var(--body-text-color);display:flex;align-items:center;justify-content:center;box-shadow:0 0 5px 2px #0000004d;font-size:12px;transition:opacity .2s}
.ba.dragging .ba-handle{opacity:0}
.ba-arrow{text-shadow:-1px -1px 1px rgba(0,0,0,.1)}
.ba-arrow-l{transform:rotate(135deg)}
.ba-arrow-r{transform:rotate(-45deg)}
.ba-center{display:block;width:1px;height:100%;margin:0 3px;background:var(--border-color-primary);opacity:.1}
.ba-empty{display:flex;align-items:center;justify-content:center}
.ba-empty-text{color:var(--body-text-color-subdued)}
.ba-label{position:absolute;top:var(--block-label-margin);left:var(--block-label-margin);z-index:4;display:inline-flex;align-items:center;box-shadow:var(--block-label-shadow);border:var(--block-label-border-width) solid var(--block-label-border-color);border-top:none;border-left:none;border-radius:var(--block-label-radius);background:var(--block-label-background-fill);padding:var(--block-label-padding);pointer-events:none;color:var(--block-label-text-color);font-weight:var(--block-label-text-weight);font-size:var(--block-label-text-size);line-height:var(--line-sm)}
.ba-label-icon{opacity:.8;margin-right:var(--size-2);width:calc(var(--block-label-text-size) - 1px);height:calc(var(--block-label-text-size) - 1px)}
.ba-download{position:absolute;top:var(--block-label-margin);right:var(--block-label-margin);z-index:5;display:flex;align-items:center;justify-content:center;box-sizing:border-box;width:var(--size-7);height:var(--size-7);padding:var(--size-1-5);color:var(--block-label-text-color);background:var(--block-background-fill);border:1px solid var(--border-color-primary);border-radius:var(--radius-sm);box-shadow:var(--shadow-drop);opacity:.85;transition:opacity .15s,color .15s}
.ba-download:hover{opacity:1;color:var(--color-accent)}"""

# Client-side behaviour of the comparison widget: divider drag, wheel/pinch zoom,
# pan while zoomed, double-click to reset. The `//` comments inside the script need
# the line breaks below — collapsed onto one line they would swallow the code after them.
BA_JS = """
let scale = 1, tx = 0, ty = 0;
let mode = null; // 'slider' | 'pan'
let lastX = 0, lastY = 0, pinch = 0;
let curBa = null; // detect re-render to reset zoom state
function ba(){ return element.querySelector('.ba'); }
function fresh(){
  const el = ba();
  if(el !== curBa){ curBa = el; scale = 1; tx = 0; ty = 0; }
  return el;
}
function dividerFrac(el){
  const v = getComputedStyle(el).getPropertyValue('--pos').trim();
  let f = parseFloat(v);
  if(v.indexOf('%') >= 0) f = f / 100;
  else f = f / el.getBoundingClientRect().width;
  return isNaN(f) ? 0.5 : Math.max(0, Math.min(1, f));
}
function realRect(el){
  const r = el.getBoundingClientRect();
  const im = el.querySelector('.ba-after');
  const nw = (im && im.naturalWidth) || r.width;
  const nh = (im && im.naturalHeight) || r.height;
  const A = nw / nh, B = r.width / r.height;
  let dw, dh;
  if(A > B){ dw = r.width; dh = r.width / A; }
  else { dh = r.height; dw = r.height * A; }
  return {left:(r.width - dw) / 2, top:(r.height - dh) / 2, width:dw, height:dh, W:r.width, H:r.height};
}
function constrain(el){
  if(scale <= 1){ tx = 0; ty = 0; return; }
  const rr = realRect(el);
  tx = Math.max(rr.W - scale * (rr.left + rr.width), Math.min(-scale * rr.left, tx));
  ty = Math.max(rr.H - scale * (rr.top + rr.height), Math.min(-scale * rr.top, ty));
}
function apply(){
  const el = ba();
  if(!el) return;
  const t = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
  el.querySelectorAll('.ba-img').forEach(im => { im.style.transform = t; });
  const r = el.getBoundingClientRect();
  let f = (dividerFrac(el) * r.width - tx) / (scale * r.width);
  f = Math.max(0, Math.min(1, f));
  const af = el.querySelector('.ba-after');
  if(af) af.style.clipPath = 'inset(0 0 0 ' + (f * 100) + '%)';
  el.style.cursor = scale > 1 ? (mode === 'pan' ? 'grabbing' : 'grab') : 'default';
}
function setDivider(clientX){
  const el = ba();
  if(!el) return;
  const r = el.getBoundingClientRect();
  let p = ((clientX - r.left) / r.width) * 100;
  p = Math.max(0, Math.min(100, p));
  el.style.setProperty('--pos', p + '%');
  apply();
}
function zoomAt(cx, cy, factor){
  const el = ba();
  if(!el) return;
  const r = el.getBoundingClientRect();
  const px = cx - r.left, py = cy - r.top;
  const old = scale;
  const ns = Math.max(1, Math.min(15, scale * factor));
  if(ns === old) return;
  tx = px - (ns / old) * (px - tx);
  ty = py - (ns / old) * (py - ty);
  scale = ns;
  constrain(el);
  apply();
}
element.addEventListener('wheel', e => {
  if(!fresh()) return;
  e.preventDefault();
  zoomAt(e.clientX, e.clientY, e.deltaY < 0 ? 1.08 : 1 / 1.08);
}, {passive:false});
element.addEventListener('pointerdown', e => {
  if(e.button !== 0) return;
  if(e.target.closest('.ba-download')) return;
  const el = fresh();
  if(!el) return;
  const onLine = !!e.target.closest('.ba-line');
  mode = (scale > 1 && !onLine) ? 'pan' : 'slider';
  lastX = e.clientX; lastY = e.clientY;
  el.classList.add('dragging');
  if(mode === 'slider') setDivider(e.clientX); else apply();
  e.preventDefault();
});
window.addEventListener('pointermove', e => {
  if(!mode) return;
  if(mode === 'pan'){
    tx += e.clientX - lastX; ty += e.clientY - lastY;
    lastX = e.clientX; lastY = e.clientY;
    const el = ba();
    if(el) constrain(el);
    apply();
  } else setDivider(e.clientX);
});
window.addEventListener('pointerup', () => {
  if(!mode) return;
  mode = null;
  const el = ba();
  if(el) el.classList.remove('dragging');
  apply();
});
element.addEventListener('dblclick', () => {
  if(!fresh()) return;
  scale = 1; tx = 0; ty = 0;
  apply();
});
element.addEventListener('touchstart', e => {
  if(e.target.closest('.ba-download')) return;
  if(!fresh()) return;
  if(e.touches.length === 2){
    const a = e.touches[0], b = e.touches[1];
    pinch = Math.hypot(b.clientX - a.clientX, b.clientY - a.clientY);
  } else if(e.touches.length === 1 && scale > 1){
    mode = 'pan';
    lastX = e.touches[0].clientX; lastY = e.touches[0].clientY;
  }
}, {passive:true});
element.addEventListener('touchmove', e => {
  if(e.touches.length === 2){
    e.preventDefault();
    const a = e.touches[0], b = e.touches[1];
    const d = Math.hypot(b.clientX - a.clientX, b.clientY - a.clientY);
    if(pinch > 0) zoomAt((a.clientX + b.clientX) / 2, (a.clientY + b.clientY) / 2, d / pinch);
    pinch = d;
  } else if(e.touches.length === 1 && mode === 'pan'){
    e.preventDefault();
    tx += e.touches[0].clientX - lastX; ty += e.touches[0].clientY - lastY;
    lastX = e.touches[0].clientX; lastY = e.touches[0].clientY;
    const el = ba();
    if(el) constrain(el);
    apply();
  }
}, {passive:false});
element.addEventListener('touchend', e => {
  if(e.touches.length === 0){ pinch = 0; mode = null; }
});
"""

# Per-resolution limits handed to the pipelines:
#   pixels   -> max_input_image_pixels
#   side     -> max_input_image_side_length
#   t2i_side -> explicit square output side used when there is no reference image
RESOLUTIONS = {
    "1K": {"pixels": 1024 * 1024, "side": 2048, "t2i_side": 1024},
    "2K": {"pixels": 2048 * 2048, "side": 4096, "t2i_side": 2048},
}

# Budget charged per inference step; currently the same for both models.
_PER_STEP_BUDGET = 4


def _duration(
    image,
    instruction,
    model_choice="Edit",
    resolution="1K",
    num_inference_steps=32,
    *args,
    **kwargs,
):
    """Compute the ZeroGPU time budget for one ``edit()`` call.

    Passed as ``duration=`` to ``spaces.GPU``, which is expected to call it with
    the same arguments as ``edit`` (values are presumably seconds — confirm
    against the ZeroGPU docs). The defaults mirror ``edit``'s so the estimate
    also works when only ``image`` and ``instruction`` are supplied.

    Args:
        image: Unused; accepted to match ``edit``'s signature.
        instruction: Unused; accepted to match ``edit``'s signature.
        model_choice: ``"Turbo"`` gets a fixed overhead of 40, anything else 60.
        resolution: ``"2K"`` doubles the budget.
        num_inference_steps: Step count; each step adds ``_PER_STEP_BUDGET``.
        *args: Remaining ``edit`` positional arguments, ignored.
        **kwargs: Remaining ``edit`` keyword arguments, ignored.

    Returns:
        The integer budget.
    """
    overhead = 40 if model_choice == "Turbo" else 60
    base = int(num_inference_steps) * _PER_STEP_BUDGET + overhead
    return base * 2 if resolution == "2K" else base


@spaces.GPU(duration=_duration)
def edit(
    image,
    instruction,
    model_choice="Edit",
    resolution="1K",
    num_inference_steps=32,
    text_guidance_scale=4,
    image_guidance_scale=1,
    seed=42,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Run one generation or edit and return the values for the three outputs.

    Three modes, chosen in this order:
      * ``model_choice == "Turbo"``: Turbo pipeline, text-to-image only
        (``image`` is ignored).
      * ``image is None``: Edit pipeline used as text-to-image at an explicit
        square size.
      * otherwise: Edit pipeline with ``image`` as the reference; output size is
        left to the pipeline (``height=None, width=None, align_res=True``).

    Args:
        image: Path of the reference image, or ``None``.
        instruction: Prompt text; must contain non-whitespace characters.
        model_choice: ``"Edit"`` or ``"Turbo"``.
        resolution: Key into ``RESOLUTIONS`` (``"1K"`` or ``"2K"``).
        num_inference_steps: Number of sampling steps.
        text_guidance_scale: Text guidance (Edit pipeline only).
        image_guidance_scale: Image guidance (reference-image mode only).
        seed: Seed used unless ``randomize_seed`` is set.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED)`` when true.
        progress: Gradio progress tracker (tqdm-driven).

    Returns:
        ``(comparison_json, result_image, seed)``. ``comparison_json`` is a JSON
        string with ``before``/``after`` data URIs when a reference image was
        used, else ``""``.

    Raises:
        gr.Error: If ``instruction`` is empty or whitespace-only.
        KeyError: If ``resolution`` is not a key of ``RESOLUTIONS``.
    """
    if not instruction or not instruction.strip():
        raise gr.Error("Please enter a prompt.")

    if randomize_seed:
        seed = int(torch.randint(0, MAX_SEED, (1,)).item())
    seed = int(seed)

    res = RESOLUTIONS[resolution]
    generator = torch.Generator("cuda").manual_seed(seed)
    input_pil = None

    if model_choice == "Turbo":
        # DMD few-step text-to-image: no reference image, no CFG (text/image
        # guidance scales == 1.0, empty-instruction guidance off).
        size = res["t2i_side"]
        result = turbo_pipe(
            instruction=[instruction.strip()],
            negative_instruction="",
            empty_instruction="",
            height=size,
            width=size,
            max_input_image_pixels=res["pixels"],
            max_input_image_side_length=res["side"],
            num_inference_steps=int(num_inference_steps),
            text_guidance_scale=1.0,
            image_guidance_scale=1.0,
            empty_instruction_guidance_scale=0.0,
            use_dmd_student_inference=True,
            dmd_conditioning_sigma=0.001,
            generator=generator,
            device="cuda",
        ).images[0]
    elif image is None:
        # Text-to-image: no reference image, output size is set explicitly.
        size = res["t2i_side"]
        result = pipe(
            instruction=[instruction.strip()],
            negative_instruction="",
            height=size,
            width=size,
            max_input_image_pixels=res["pixels"],
            max_input_image_side_length=res["side"],
            num_inference_steps=int(num_inference_steps),
            text_guidance_scale=float(text_guidance_scale),
            generator=generator,
            device="cuda",
        ).images[0]
    else:
        # Edit: the reference image is passed both as a path and as a decoded RGB image.
        input_pil = Image.open(image).convert("RGB")
        result = pipe(
            instruction=[instruction.strip()],
            input_image_paths=[[image]],
            input_images=[[input_pil]],
            negative_instruction="",
            height=None,
            width=None,
            max_input_image_pixels=res["pixels"],
            max_input_image_side_length=res["side"],
            align_res=True,
            num_inference_steps=int(num_inference_steps),
            text_guidance_scale=float(text_guidance_scale),
            image_guidance_scale=float(image_guidance_scale),
            generator=generator,
            device="cuda",
        ).images[0]

    # Only a genuine edit has a "before" side to compare against.
    if input_pil is not None:
        return json.dumps({"before": _data_uri(input_pil), "after": _data_uri(result)}), result, seed
    return "", result, seed


CSS = """
#col-container { max-width: 1100px; margin: 0 auto; }
#result-ba .html-container { padding: 0 !important; }
"""

# NOTE(review): the nesting of the layout containers below was reconstructed from a
# whitespace-collapsed copy of this file — confirm it against the deployed Space.
with gr.Blocks(theme=gr.themes.Citrus(), css=CSS) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
# 🍊 Boogu-Image-0.1
Unified generation/editing with [Boogu-Image-0.1](https://huggingface.co/Boogu) - a 10B model (Qwen3-VL + FLUX VAE)
"""
        )
        with gr.Row():
            with gr.Column():
                image = gr.Image(
                    label="Input image (leave empty for text-to-image)",
                    type="filepath",
                    height=360,
                )
                model_choice = gr.Radio(
                    choices=["Edit", "Turbo"],
                    value="Edit",
                    label="Model",
                )
                instruction = gr.Textbox(
                    label="Prompt",
                    placeholder="e.g. A street photography portrait of an elderly man, or 把背景替换到沙滩",
                    lines=2,
                )
                run_button = gr.Button("Generate", variant="primary")
                with gr.Accordion("Advanced settings", open=False):
                    resolution = gr.Radio(
                        choices=["1K", "2K"], value="1K", label="Output resolution"
                    )
                    num_inference_steps = gr.Slider(
                        minimum=1,
                        maximum=50,
                        step=1,
                        value=32,
                        label="Inference steps",
                    )
                    text_guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=7.0,
                        step=0.1,
                        value=4.0,
                        label="Text guidance scale",
                    )
                    image_guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=3.0,
                        step=0.1,
                        value=1.0,
                        label="Image guidance scale",
                    )
                    with gr.Row():
                        seed = gr.Slider(
                            minimum=0, maximum=MAX_SEED, step=1, value=0, label="Seed"
                        )
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            with gr.Column():
                # Two alternative result views; exactly one is visible at a time.
                with gr.Column(visible=True) as slider_col:
                    result_ba = gr.HTML(
                        value="",
                        elem_id="result-ba",
                        html_template=BA_HTML,
                        css_template=BA_CSS,
                        js_on_load=BA_JS,
                        apply_default_css=False,
                    )
                with gr.Column(visible=False) as image_col:
                    result_image = gr.Image(label="Result", height=360)
        # Examples only supply (image, instruction); edit()'s remaining
        # parameters fall back to their defaults.
        gr.Examples(
            examples=[
                ["examples/03.jpg", "Remove the dog and seamlessly blend the background."],
                ["examples/01.png", "帮我在这幅画右下角加上三个带叶子的柿子。"],
                ["examples/02.png", "Make it look like a watercolor painting."],
                ["examples/04.jpg", "Change the season to winter with snow."],
            ],
            fn=edit,
            inputs=[image, instruction],
            outputs=[result_ba, result_image, seed],
            cache_examples=True,
            cache_mode="lazy",
        )

    def _on_model_change(choice):
        """Return UI updates for a model switch.

        The tuple order matches the ``outputs`` list of ``model_choice.change``:
        image, num_inference_steps, text_guidance_scale, image_guidance_scale,
        slider_col, image_col.
        """
        if choice == "Turbo":
            return (
                gr.update(visible=False),  # image (Turbo is T2I only)
                gr.update(value=4, minimum=1, maximum=8, label="Inference steps (Turbo)"),
                gr.update(value=1.0, interactive=False),  # text guidance (CFG off for DMD)
                gr.update(value=1.0, interactive=False),  # image guidance (unused)
                gr.update(visible=False),  # slider_col (T2I has no before image)
                gr.update(visible=True),  # image_col
            )
        return (
            gr.update(visible=True),
            gr.update(value=32, minimum=1, maximum=50, label="Inference steps"),
            gr.update(value=4.0, interactive=True),
            gr.update(value=1.0, interactive=True),
            gr.update(visible=True),  # slider_col (Edit shows before/after)
            gr.update(visible=False),  # image_col
        )

    model_choice.change(
        _on_model_change,
        inputs=[model_choice],
        outputs=[
            image,
            num_inference_steps,
            text_guidance_scale,
            image_guidance_scale,
            slider_col,
            image_col,
        ],
    )

    def _result_visibility(model_choice, image):
        """Return visibility updates for ``(slider_col, image_col)`` after a run."""
        # Comparison only when there is a genuine before/after (Edit + reference image).
        is_compare = model_choice != "Turbo" and image is not None
        return gr.update(visible=is_compare), gr.update(visible=not is_compare)

    inputs = [
        image,
        instruction,
        model_choice,
        resolution,
        num_inference_steps,
        text_guidance_scale,
        image_guidance_scale,
        seed,
        randomize_seed,
    ]
    outputs = [result_ba, result_image, seed]

    # Both triggers run the same chain: generate, then pick the result view.
    run_button.click(fn=edit, inputs=inputs, outputs=outputs).then(
        _result_visibility, inputs=[model_choice, image], outputs=[slider_col, image_col]
    )
    instruction.submit(fn=edit, inputs=inputs, outputs=outputs).then(
        _result_visibility, inputs=[model_choice, image], outputs=[slider_col, image_col]
    )

demo.queue().launch()