# Source: gui-g2-3b-ccf / cursor_ccf.py
# Committed by: luisf-mc
# Commit message: Initial release: GUI-G2-3B + CCF inference wrapper (+2.2pp icon over base)
# Commit: e97c820 (verified)
"""Cursor-Centric Focusing (CCF) inference wrapper.
Generalizes the GUI-Cursor paper's CCF technique to any grounding model:
run once on the full screenshot to get a coarse prediction, crop a
window centered on that prediction, run again on the crop, and map the
refined coordinate back to the original image's pixel space.
Why this helps: a small icon that occupies ~30 pixels in a 1920x1080
screenshot becomes ~60 pixels in a 2x crop. Qwen2.5-VL sees more
per-element tokens, making small-target grounding noticeably easier.
Design choices locked in:
  - Greedy-only on both passes. Our earlier stochastic-sampling zoom
    regressed because temperature noise corrupted already-correct
    predictions. Never do that again.
  - Single refinement pass. Iterative narrowing (> 1 refinement) gives
    diminishing returns and compounds coordinate-mapping errors.
  - Fall back to coarse if the refined pass fails to parse -- we never
    return None if the coarse pass succeeded.
  - Crop shifts near the boundary instead of shrinking, so the model
    always sees a fixed-size region at its native resolution budget.
"""
import re
from dataclasses import dataclass
from typing import Callable, Optional, Tuple

from PIL import Image
# Pixel rectangle, ordered (left, top, right, bottom).
Bbox = Tuple[int, int, int, int]
# Pixel coordinate, ordered (x, y).
Point = Tuple[float, float]

# Defaults consumed by CCFConfig and compute_crop_window.
DEFAULT_ZOOM_FACTOR = 2.0
# ~450x450; below this, crop won't help.
DEFAULT_MIN_PIXELS_FOR_CCF = 200_000
# 4 * 28 = 112, Qwen2.5-VL's smallest patch grid.
DEFAULT_MIN_CROP_SIDE = 4 * 28
# None == run the coarse pass on the original image.
DEFAULT_COARSE_MAX_PIXELS: Optional[int] = None
# Phase 9: keyword lists for the type-aware classifier. They were tuned by
# inspecting the Phase 4 ScreenSpot-v2 results: text-targeted instructions
# that regressed when refined had patterns like "address bar",
# "search field", "page heading" and quoted strings, while icon-targeted
# ones that benefitted had "icon", "button", "menu" and short verb-noun
# patterns.
#
# Every keyword carries a leading space so that it only matches at the
# start of a word once the instruction has been space-padded; the space is
# added here rather than typed into each literal.
ICON_KEYWORDS = tuple(
    f" {term}"
    for term in (
        "icon", "button", "logo", "image", "menu", "arrow",
        "plus", "minus", "back", "home", "settings", "trash",
        "delete", "edit", "send", "download", "upload", "share",
        "filter", "sort", "heart", "star", "bell", "camera",
        "phone", "mail", "calendar", "clock", "refresh", "play",
        "pause", "stop", "volume", "mute", "expand", "collapse",
        "hamburger", "kebab", "dots", "more", "plus icon",
        "gear", "cog", "checkbox", "toggle", "switch",
    )
)
TEXT_KEYWORDS = tuple(
    f" {term}"
    for term in (
        "link", "label", "heading", "title", "field", "input",
        "text", "bar", "address bar", "search bar", "search field",
        "text box", "textbox", "textarea", "text area", "paragraph",
        "caption", "sentence", "phrase",
        "word ",  # trailing space is deliberate: matches the whole word only
    )
)
# A single-quoted phrase such as 'Submit'. Neither quote may touch a word
# character on its outer side, so apostrophes in possessives and
# contractions ("user's", "don't") are not mistaken for quotation marks.
_SINGLE_QUOTED_RE = re.compile(r"(?<!\w)'[^']+'(?!\w)")


def classify_instruction(instruction: str) -> str:
    """Classify a grounding instruction as targeting an icon, text, or ambiguous.

    Returns one of "icon" | "text" | "ambiguous". Used by the Phase 9
    type-aware CCF gate to decide whether to run the refined pass --
    text targets are usually wide enough that the coarse model already
    nails them, while CCF refinement introduces small drift that misses.

    Heuristic, in order:
      1. An empty or whitespace-only instruction -> "ambiguous".
      2. Quoted strings ("Submit", 'Login') almost always reference a
         text label by exact match -> "text". Any double-quote character
         counts; a single quote only counts as part of a paired
         'quoted phrase', so a bare apostrophe does not trigger this rule.
      3. Keyword counts: the lower-cased, space-padded instruction is
         searched for each entry of ICON_KEYWORDS and TEXT_KEYWORDS
         (substring match). More text hits -> "text"; more icon hits ->
         "icon".
      4. Tie or no hits: an instruction shorter than 25 characters (after
         stripping) is treated as icon-like ("the X icon") -> "icon";
         anything longer -> "ambiguous".
    """
    if not instruction or not instruction.strip():
        return "ambiguous"

    if '"' in instruction or _SINGLE_QUOTED_RE.search(instruction):
        return "text"

    # Pad with spaces so that " icon"-style keywords also match at the edges.
    padded = f" {instruction.lower()} "
    icon_hits = sum(1 for keyword in ICON_KEYWORDS if keyword in padded)
    text_hits = sum(1 for keyword in TEXT_KEYWORDS if keyword in padded)
    if text_hits > icon_hits:
        return "text"
    if icon_hits > text_hits:
        return "icon"

    # Tie / no hits: short instructions lean icon, longer ones stay undecided.
    if len(instruction.strip()) < 25:
        return "icon"
    return "ambiguous"
@dataclass
class CCFConfig:
    """Tuning knobs for CCF.

    Field order is part of the constructor signature; do not reorder.
    """

    # Each image dimension is divided by this to get the crop size.
    zoom_factor: float = DEFAULT_ZOOM_FACTOR

    # Images with fewer pixels than this skip the refinement pass.
    min_pixels_for_ccf: int = DEFAULT_MIN_PIXELS_FOR_CCF

    # Lower bound for each crop dimension (still capped at the image size).
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE

    # When the refined pass yields no point: True returns the coarse
    # prediction with stage="fallback", False returns None.
    fallback_to_coarse_on_invalid: bool = True

    # When set, the coarse pass downsizes any image whose pixel count
    # exceeds this value, and the predicted (x, y) is scaled back to
    # original-image space before the crop window is computed. This is how
    # the GUI-Cursor paper actually runs CCF -- the coarse pass just needs
    # to localize; the refined pass uses native resolution on the cropped
    # region. Saves ~50% wall time on 1920x1080 inputs.
    coarse_max_pixels: Optional[int] = DEFAULT_COARSE_MAX_PIXELS

    # Phase 9: when set, called with the instruction string. A return value
    # of "text" skips the refinement pass and yields the coarse prediction
    # (stage="coarse_text_gate"). The Phase 4 result showed CCF refinement
    # helps icons (+2.2pp) but hurts text (-2.3pp); gating by instruction
    # type recovers the text loss without giving up the icon win.
    # None disables gating (Phase 4 behavior).
    instruction_classifier_fn: Optional[Callable[[str], str]] = None
@dataclass
class CCFResult:
    """What CCF returns. Never None if the coarse pass succeeded."""

    # Final prediction, in original-image pixel space.
    x: float
    y: float

    # Which path produced (x, y):
    #   "coarse"           - image too small, refinement skipped
    #   "coarse_text_gate" - classifier said "text", refinement skipped
    #   "refined"          - refined pass succeeded
    #   "fallback"         - refined pass failed, coarse prediction reused
    stage: str

    # Coarse-pass prediction, already scaled to original-image pixel space.
    coarse_xy: Point

    # Window used for the refined pass; None when we skipped cropping.
    crop_window: Optional[Bbox]
# Signature of the caller-provided prediction function. It receives an
# image and an instruction and returns a pair: the predicted (x, y) point
# (or None when no point could be parsed) and the raw text, kept for
# logging.
PredictFn = Callable[
    [Image.Image, str],
    Tuple[Optional[Point], str],
]
def compute_crop_window(
    center: Point,
    image_size: Tuple[int, int],
    zoom_factor: float = DEFAULT_ZOOM_FACTOR,
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE,
) -> Bbox:
    """Return the (left, top, right, bottom) crop window around `center`.

    Each crop dimension is the image dimension divided by `zoom_factor`
    (truncated), raised to at least `min_crop_side`, and then capped at the
    image dimension itself. The window is centered on `center` when it
    fits; near an edge it is shifted back inside the image instead of
    being shrunk, so the crop always has the full target size.

    Raises:
        ValueError: if either image dimension or `zoom_factor` is not
            positive.
    """
    img_w, img_h = image_size
    if img_w <= 0 or img_h <= 0:
        raise ValueError(f"invalid image size {image_size}")
    if zoom_factor <= 0:
        raise ValueError(f"zoom_factor must be positive, got {zoom_factor}")

    # Target extent per axis: zoomed size, floored at min_crop_side, never
    # larger than the image.
    crop_w, crop_h = (
        min(max(min_crop_side, int(extent / zoom_factor)), extent)
        for extent in (img_w, img_h)
    )

    center_x = int(round(center[0]))
    center_y = int(round(center[1]))

    # Center the window, then slide it back inside [0, extent - crop].
    left = min(max(center_x - crop_w // 2, 0), img_w - crop_w)
    top = min(max(center_y - crop_h // 2, 0), img_h - crop_h)

    return left, top, left + crop_w, top + crop_h
def map_crop_to_orig(
    x_on_crop: float,
    y_on_crop: float,
    crop_window: Bbox,
) -> Point:
    """Translate crop-pixel coords to original-image pixel coords.

    The crop's top-left corner is added to the point; the window's right
    and bottom edges are not used.
    """
    offset_x, offset_y, _, _ = crop_window
    orig_x = offset_x + float(x_on_crop)
    orig_y = offset_y + float(y_on_crop)
    return orig_x, orig_y
def _should_skip_ccf(image_size: Tuple[int, int], config: CCFConfig) -> bool:
    """Return True when the image has fewer pixels than `config.min_pixels_for_ccf`."""
    width, height = image_size
    pixel_count = width * height
    return pixel_count < config.min_pixels_for_ccf
def _maybe_downsize_for_coarse(
    image: Image.Image, max_pixels: Optional[int]
) -> Tuple[Image.Image, float]:
    """Shrink `image` to roughly `max_pixels` pixels for the coarse pass.

    Returns `(image_for_coarse_pass, scale_up)`. `scale_up` is the factor
    that converts a coordinate in the returned image's pixel space back
    into the original image's pixel space. When `max_pixels` is None, or
    the image already has at most `max_pixels` pixels, the image is
    returned untouched with a scale of 1.0.

    The factor is derived from the widths only; the height ratio can differ
    slightly because both new dimensions are rounded independently.
    """
    import math

    if max_pixels is None:
        return image, 1.0

    orig_w, orig_h = image.size
    pixel_count = orig_w * orig_h
    if pixel_count <= max_pixels:
        return image, 1.0

    # Uniform shrink factor (< 1.0) that preserves the aspect ratio; each
    # side is kept at 1 pixel or more.
    shrink = math.sqrt(max_pixels / pixel_count)
    small_w = max(1, int(round(orig_w * shrink)))
    small_h = max(1, int(round(orig_h * shrink)))
    shrunk = image.resize((small_w, small_h), Image.BILINEAR)

    # Coarse coords live in the shrunken space; multiplying by this factor
    # recovers original-space coords.
    return shrunk, orig_w / small_w
def ccf_predict_bbox(
    predict_fn: PredictFn,
    image: Image.Image,
    instruction: str,
    config: Optional[CCFConfig] = None,
) -> Optional[CCFResult]:
    """Run CCF around a bbox/point predictor.

    `predict_fn` must return ((x, y) or None, raw_text). Coordinates are
    in the passed image's pixel space.

    Flow:
      1. Coarse pass on the (optionally downsized) full image.
      2. If the image is below `config.min_pixels_for_ccf`, return the
         coarse point with stage="coarse".
      3. If `config.instruction_classifier_fn` is set and returns "text",
         return the coarse point with stage="coarse_text_gate".
      4. Otherwise crop around the coarse point, predict again on the crop
         and map the result back: stage="refined".

    Returns a `CCFResult`, or `None` if even the coarse pass failed to
    parse. When the refined pass fails, the result is the coarse
    prediction with stage="fallback" if
    `config.fallback_to_coarse_on_invalid` is True (the default), and
    `None` otherwise.
    """
    if config is None:
        config = CCFConfig()

    # Coarse pass: optionally downsize the image so we don't spend
    # 10+ seconds on a 1920x1080 forward pass when all we need is a
    # rough localization.
    coarse_image, _ = _maybe_downsize_for_coarse(image, config.coarse_max_pixels)
    coarse_point, _coarse_raw = predict_fn(coarse_image, instruction)
    if coarse_point is None:
        return None

    # Map the coarse point back to original-image space with one scale per
    # axis. The downsized width and height are rounded independently, so a
    # single width-derived factor would put a small error on y.
    if coarse_image is image:
        scale_x = scale_y = 1.0
    else:
        full_w, full_h = image.size
        coarse_w, coarse_h = coarse_image.size
        scale_x = full_w / coarse_w
        scale_y = full_h / coarse_h
    coarse_xy = (
        float(coarse_point[0]) * scale_x,
        float(coarse_point[1]) * scale_y,
    )

    def coarse_result(stage: str, window: Optional[Bbox]) -> CCFResult:
        # Every non-refined outcome reports the coarse point as the answer.
        return CCFResult(
            x=coarse_xy[0],
            y=coarse_xy[1],
            stage=stage,
            coarse_xy=coarse_xy,
            crop_window=window,
        )

    # Skip refinement on already-small images -- CCF adds latency without
    # helping when the model is already seeing the target at near-native res.
    if _should_skip_ccf(image.size, config):
        return coarse_result("coarse", None)

    # Phase 9 type-aware gate: if the instruction looks like a text
    # target, skip refinement -- text bboxes are wide and the coarse
    # prediction lands inside them; CCF refinement introduces sub-bbox
    # drift that misses (Phase 4 saw text -2.3pp from this).
    if config.instruction_classifier_fn is not None:
        if config.instruction_classifier_fn(instruction) == "text":
            return coarse_result("coarse_text_gate", None)

    crop_window = compute_crop_window(
        center=coarse_xy,
        image_size=image.size,
        zoom_factor=config.zoom_factor,
        min_crop_side=config.min_crop_side,
    )
    refined_xy, _refined_raw = predict_fn(image.crop(crop_window), instruction)
    if refined_xy is None:
        if not config.fallback_to_coarse_on_invalid:
            return None
        return coarse_result("fallback", crop_window)

    # The refined point is in crop space; shift it by the crop origin.
    orig_x, orig_y = map_crop_to_orig(refined_xy[0], refined_xy[1], crop_window)
    return CCFResult(
        x=orig_x,
        y=orig_y,
        stage="refined",
        coarse_xy=coarse_xy,
        crop_window=crop_window,
    )
def make_ccf_eval_adapter(model, processor, base_predict_fn, config: Optional[CCFConfig] = None):
    """Wrap an eval-style predictor with CCF, keeping its call signature.

    `base_predict_fn(model, processor, image, instruction)` must return
    `(point, raw_text)`, where `point` is `(x, y)`. A failed parse may be
    reported as `None`, or as a pair in which either coordinate is `None`.

    Used by src/eval.py's --ccf flag. Extracted here so tests can exercise
    the wrapper without importing torch / transformers.

    The returned function takes `(model, processor, image, instruction)`
    but ignores its first two arguments: the `model` and `processor` given
    to this factory are the ones passed to `base_predict_fn`. It returns
    `((x, y), tag)`, where tag is "[ccf:<stage>]" for the stage reported by
    `ccf_predict_bbox` ("[ccf:coarse]", "[ccf:coarse_text_gate]",
    "[ccf:refined]" or "[ccf:fallback]"), or `((None, None),
    "[ccf:parse_fail]")` when CCF produced no result.

    When `config` is None a default `CCFConfig()` is created once and
    reused for every call of the returned function.
    """
    if config is None:
        config = CCFConfig()

    def predict_point(img, instr):
        # Adapt the eval-style predictor to the PredictFn contract used by
        # ccf_predict_bbox: a usable (float, float) point, or None.
        point, raw = base_predict_fn(model, processor, img, instr)
        if point is None:
            return None, raw
        x, y = point
        if x is None or y is None:
            return None, raw
        return (float(x), float(y)), raw

    def wrapped(model_unused, processor_unused, image: Image.Image, instruction: str):
        result = ccf_predict_bbox(predict_point, image, instruction, config)
        if result is None:
            return (None, None), "[ccf:parse_fail]"
        return (result.x, result.y), f"[ccf:{result.stage}]"

    return wrapped