"""Self-contained inference example for GUI-G2-3B + CCF. Usage: python predict.py --image screenshot.png --instruction "click the settings icon" python predict.py --image screenshot.png --instruction "type your password" --no-ccf python predict.py --image screenshot.png --instruction "click X" --no-type-gate Loads GUI-G2-3B from inclusionAI/GUI-G2-3B (downloads on first run, ~6GB), applies the CCF wrapper (with optional type-aware gating), and prints the predicted (x, y) click point in original-image pixel coordinates. The cursor_ccf.py module bundled in this repo is the only piece of "ours"; everything else is just glue around the Hugging Face model. """ import argparse import os import re import sys import time import torch from PIL import Image # Local module bundled with this release sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from cursor_ccf import ( # noqa: E402 CCFConfig, ccf_predict_bbox, classify_instruction, ) GUI_G2_PROMPT = ( "Outline the position corresponding to the instruction: {}. " "The output should be only [x1,y1,x2,y2]." ) def parse_bbox(response: str): """Extract (cx, cy) from a "[x1,y1,x2,y2]" model response. Returns (None, None) if the model output didn't contain a parseable bbox.""" m = re.search(r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]", response) if not m: return None, None x1, y1, x2, y2 = map(int, m.groups()) return (x1 + x2) / 2.0, (y1 + y2) / 2.0 def load_model(model_id: str, attn_impl: str = "flash_attention_2"): """Load GUI-G2-3B and its processor with the parameters that match the published 89.2% ScreenSpot-v2 baseline.""" from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration print(f"Loading {model_id} ...") processor = AutoProcessor.from_pretrained( model_id, min_pixels=3136, max_pixels=12_845_056, ) model = Qwen2_5_VLForConditionalGeneration.from_pretrained( model_id, torch_dtype=torch.bfloat16, attn_implementation=attn_impl, device_map="auto", ) model.eval() return model, processor def predict_gui_g2(model, processor, image: Image.Image, instruction: str): """One greedy forward pass. Returns ((cx, cy), raw_text) where the coords are in the ORIGINAL image's pixel space (rescaled from the processor's smart_resize space, where the model emits).""" from qwen_vl_utils import process_vision_info orig_w, orig_h = image.size messages = [{"role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": GUI_G2_PROMPT.format(instruction)}, ]}] text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, ) image_inputs, video_inputs = process_vision_info(messages) inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ).to(model.device) with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=32, do_sample=False, ) trimmed = output[0][inputs.input_ids.shape[1]:] response = processor.batch_decode( [trimmed], skip_special_tokens=True, )[0] abs_cx, abs_cy = parse_bbox(response) if abs_cx is None: return (None, None), response # Model outputs coords in processed-pixel space (post smart_resize). # Rescale to original-image pixels. proc_h = inputs["image_grid_thw"][0][1].item() * 14 proc_w = inputs["image_grid_thw"][0][2].item() * 14 return (abs_cx * orig_w / proc_w, abs_cy * orig_h / proc_h), response def predict_with_ccf( model, processor, image: Image.Image, instruction: str, use_type_gate: bool = True, ): """CCF-wrapped prediction. Returns (x, y, stage) or (None, None, "fail").""" cfg = CCFConfig( zoom_factor=2.0, coarse_max_pixels=1_500_000, instruction_classifier_fn=( classify_instruction if use_type_gate else None ), ) def inner(img, instr): (x, y), raw = predict_gui_g2(model, processor, img, instr) if x is None: return None, raw return (float(x), float(y)), raw result = ccf_predict_bbox(inner, image, instruction, cfg) if result is None: return None, None, "fail" return result.x, result.y, result.stage def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--image", required=True, help="Path to a PNG/JPG screenshot") parser.add_argument( "--instruction", required=True, help='Natural-language target description, e.g. "click the settings icon"', ) parser.add_argument( "--model-id", default="inclusionAI/GUI-G2-3B", help="Hugging Face model ID for the base GUI-G2-3B (or a local path)", ) parser.add_argument( "--no-ccf", action="store_true", help="Disable CCF; just run the base model once. Faster but loses " "the +2.2pp icon improvement.", ) parser.add_argument( "--no-type-gate", action="store_true", help="Disable the type-aware gate. Recommended when most of your " "instructions target icons; slight web boost vs gated.", ) parser.add_argument( "--attn-impl", default="flash_attention_2", choices=["sdpa", "flash_attention_2", "eager"], help="Attention backend. flash_attention_2 is fastest but requires " "the flash-attn package compiled for your torch version. " "sdpa works everywhere but is ~3x slower on big images.", ) args = parser.parse_args() if not os.path.exists(args.image): sys.exit(f"ERROR: image not found at {args.image}") image = Image.open(args.image).convert("RGB") print(f"Image: {image.size[0]}x{image.size[1]} pixels") print(f"Instruction: {args.instruction!r}") model, processor = load_model(args.model_id, attn_impl=args.attn_impl) t0 = time.time() if args.no_ccf: (x, y), raw = predict_gui_g2(model, processor, image, args.instruction) stage = "no_ccf" else: x, y, stage = predict_with_ccf( model, processor, image, args.instruction, use_type_gate=not args.no_type_gate, ) elapsed = time.time() - t0 if x is None: print(f"\nFAILED to parse a bbox from the model output.") sys.exit(1) print(f"\nPrediction: click at ({x:.1f}, {y:.1f})") print(f" Stage: {stage}") print(f" Wall: {elapsed:.2f}s") if __name__ == "__main__": main()