Initial release: GUI-G2-3B + CCF inference wrapper (+2.2pp icon over base)

e97c820 verified about 2 months ago

12.8 kB

	"""Cursor-Centric Focusing (CCF) inference wrapper.

	Generalizes the GUI-Cursor paper's CCF technique to any grounding model:
	run once on the full screenshot to get a coarse prediction, crop a
	window centered on that prediction, run again on the crop, and map the
	refined coordinate back to the original image's pixel space.

	Why this helps: a small icon that occupies ~30 pixels in a 1920x1080
	screenshot becomes ~60 pixels in a 2x crop. Qwen2.5-VL sees more
	per-element tokens, making small-target grounding noticeably easier.

	Design choices locked in:
	- Greedy-only on both passes. Our earlier stochastic-sampling zoom
	regressed because temperature noise corrupted already-correct
	predictions. Never do that again.
	- Single refinement pass. Iterative narrowing (> 1 refinement) gives
	diminishing returns and compounds coordinate-mapping errors.
	- Fall back to coarse if the refined pass fails to parse -- with the
	default config we never return None if the coarse pass succeeded
	(setting fallback_to_coarse_on_invalid=False opts out of this).
	- Crop shifts near the boundary instead of shrinking, so the model
	always sees a fixed-size region at its native resolution budget.
	"""

	from dataclasses import dataclass
	from typing import Callable, Optional, Tuple

	from PIL import Image

	# Pixel rectangle in (left, top, right, bottom) order; this is the shape
	# compute_crop_window returns and ccf_predict_bbox hands to Image.crop.
	Bbox = Tuple[int, int, int, int] # (left, top, right, bottom)
	# An (x, y) pixel coordinate.
	Point = Tuple[float, float]

	# The crop window spans 1 / zoom_factor of each image dimension.
	DEFAULT_ZOOM_FACTOR = 2.0
	# Images whose width * height is below this skip the refinement pass.
	DEFAULT_MIN_PIXELS_FOR_CCF = 200_000 # ~450x450; below this, crop won't help
	# Lower bound, in pixels, on each side of the crop window.
	DEFAULT_MIN_CROP_SIDE = 112 # 4 * 28, Qwen2.5-VL's smallest patch grid
	# Pixel budget for the coarse pass; images above it are downsized first.
	DEFAULT_COARSE_MAX_PIXELS: Optional[int] = None # None == use original


	# Phase 9: type-aware classifier keyword lists. Tuned by inspecting the
	# Phase 4 ScreenSpot-v2 results -- text-targeted instructions that
	# regressed when refined had patterns like "address bar", "search field",
	# "page heading", quoted strings; icon-targeted ones that benefitted had
	# "icon", "button", "menu", short verb-noun patterns.
	# Every keyword carries a leading space so it can only match at the start
	# of a word in the space-padded, lower-cased instruction. Matching is a
	# plain substring test, so a keyword also matches longer words that begin
	# with it (" star" matches " start", " edit" matches " editor"), and an
	# instruction can score several hits from overlapping entries
	# (" search bar" also hits " bar").
	ICON_KEYWORDS = (
	    " icon", " button", " logo", " image", " menu", " arrow",
	    " plus", " minus", " back", " home", " settings", " trash",
	    " delete", " edit", " send", " download", " upload", " share",
	    " filter", " sort", " heart", " star", " bell", " camera",
	    " phone", " mail", " calendar", " clock", " refresh", " play",
	    " pause", " stop", " volume", " mute", " expand", " collapse",
	    " hamburger", " kebab", " dots", " more", " plus icon",
	    " gear", " cog", " checkbox", " toggle", " switch",
	)
	TEXT_KEYWORDS = (
	    " link", " label", " heading", " title", " field", " input",
	    " text", " bar", " address bar", " search bar", " search field",
	    " text box", " textbox", " textarea", " text area", " paragraph",
	    " caption", " sentence", " phrase", " word ",
	)


	def classify_instruction(instruction: str) -> str:
	    """Classify a grounding instruction as targeting an icon, text, or neither.

	    Returns one of "icon" | "text" | "ambiguous". Used by the Phase 9
	    type-aware CCF gate to decide whether to run the refined pass --
	    text targets are usually wide enough that the coarse model already
	    nails them, while CCF refinement introduces small drift that misses.

	    Heuristic in three stages:
	    1. Quoted strings ("Submit", 'Login') almost always reference a
	       text label by exact match -> "text". Any double quote counts; a
	       single quote only counts when it wraps a span ('OK'), so an
	       apostrophe in a possessive or contraction ("user's", "don't")
	       does not force "text".
	    2. Keyword counts: more text hits than icon hits -> "text", and
	       vice versa.
	    3. Tie or no hits: instructions shorter than 25 characters (after
	       stripping) are treated as icon-like ("the X icon"); anything
	       longer is reported as "ambiguous".

	    An empty instruction is "ambiguous".
	    """
	    import re  # local import, matching the file's existing function-scope import style

	    if not instruction:
	        return "ambiguous"

	    # Stage 1: explicit quoting. A double quote is never an apostrophe,
	    # so its mere presence is enough.
	    if '"' in instruction:
	        return "text"
	    # A single-quoted span must open after a non-word character (or at
	    # the start) and close before one (or at the end); this is what
	    # separates 'Submit' from the apostrophe in "user's".
	    if re.search(r"(?<!\w)'[^']+'(?!\w)", instruction):
	        return "text"

	    # Stage 2: keyword vote. Pad so " icon" matches at the edges too.
	    padded = " " + instruction.lower() + " "
	    icon_hits = sum(keyword in padded for keyword in ICON_KEYWORDS)
	    text_hits = sum(keyword in padded for keyword in TEXT_KEYWORDS)
	    if text_hits > icon_hits:
	        return "text"
	    if icon_hits > text_hits:
	        return "icon"

	    # Stage 3: tie / no hits. Short = icon, long = undecided.
	    if len(instruction.strip()) < 25:
	        return "icon"
	    return "ambiguous"


	@dataclass
	class CCFConfig:
	    """Settings that control how ccf_predict_bbox runs its two passes."""

	    # The crop window spans 1 / zoom_factor of each image dimension
	    # (see compute_crop_window).
	    zoom_factor: float = DEFAULT_ZOOM_FACTOR
	    # Images with width * height below this value skip the refined
	    # pass and return the coarse prediction.
	    min_pixels_for_ccf: int = DEFAULT_MIN_PIXELS_FOR_CCF
	    # Lower bound on each crop side, in pixels; the crop is still
	    # clamped so it never exceeds the image itself.
	    min_crop_side: int = DEFAULT_MIN_CROP_SIDE
	    # What to do when the refined pass yields no point: True returns
	    # the coarse prediction with stage="fallback", False returns None.
	    fallback_to_coarse_on_invalid: bool = True
	    # Pixel budget for the coarse pass. When set, any image whose
	    # pixel count exceeds it is downsized before the coarse forward
	    # pass, and the predicted (x, y) is scaled back to original-image
	    # space before the crop window is computed. This is how the
	    # GUI-Cursor paper actually runs CCF -- the coarse pass just needs
	    # to localize; the refined pass uses native resolution on the
	    # cropped region. Saves ~50% wall time on 1920x1080 inputs.
	    # None leaves the coarse image untouched.
	    coarse_max_pixels: Optional[int] = DEFAULT_COARSE_MAX_PIXELS
	    # Phase 9: optional gate, called with the instruction string. If
	    # it returns "text", the refinement pass is skipped and the coarse
	    # prediction is returned with stage="coarse_text_gate". The Phase 4
	    # result showed CCF refinement helps icons (+2.2pp) but hurts text
	    # (-2.3pp); gating by instruction type recovers the text loss
	    # without giving up the icon win. None disables gating (Phase 4
	    # behavior).
	    instruction_classifier_fn: Optional[Callable[[str], str]] = None


	@dataclass
	class CCFResult:
	    """Outcome of one ccf_predict_bbox call, in original-image pixels.

	    ccf_predict_bbox returns None instead of a CCFResult when the
	    coarse pass yields no point, or when the refined pass yields no
	    point and fallback_to_coarse_on_invalid is False.
	    """

	    # Final predicted coordinate.
	    x: float
	    y: float
	    # Which path produced (x, y):
	    # "coarse" | "coarse_text_gate" | "refined" | "fallback"
	    stage: str
	    # Coarse-pass prediction, already scaled to original-image space.
	    coarse_xy: Point
	    # Window used for the refined pass; None when we skipped cropping.
	    crop_window: Optional[Bbox]


	# Type alias for the caller-provided prediction function. It receives an
	# image and an instruction and returns a pair: the predicted (x, y) in the
	# pixel space of the image it was given (or None when no point could be
	# parsed), followed by the raw model text for logging.
	PredictFn = Callable[[Image.Image, str], Tuple[Optional[Point], str]]


	def compute_crop_window(
	    center: Point,
	    image_size: Tuple[int, int],
	    zoom_factor: float = DEFAULT_ZOOM_FACTOR,
	    min_crop_side: int = DEFAULT_MIN_CROP_SIDE,
	) -> Bbox:
	    """Compute the (left, top, right, bottom) crop window around `center`.

	    Each side of the window is the image dimension divided by
	    `zoom_factor`, raised to at least `min_crop_side` and then capped
	    at the image dimension itself. The window is centered on `center`
	    when it fits; near an edge it slides inward instead of shrinking,
	    so the returned box always has the full target size.

	    Raises:
	        ValueError: if either image dimension or `zoom_factor` is not
	            positive.
	    """
	    width, height = image_size
	    if width <= 0 or height <= 0:
	        raise ValueError(f"invalid image size {image_size}")
	    if zoom_factor <= 0:
	        raise ValueError(f"zoom_factor must be positive, got {zoom_factor}")

	    def _span(full):
	        # Zoomed extent, floored at min_crop_side, never larger than the image.
	        return min(full, max(min_crop_side, int(full / zoom_factor)))

	    def _origin(mid, span, full):
	        # Center the span on `mid`, then slide it back inside [0, full].
	        start = int(round(mid)) - span // 2
	        return min(max(start, 0), full - span)

	    crop_w = _span(width)
	    crop_h = _span(height)
	    left = _origin(center[0], crop_w, width)
	    top = _origin(center[1], crop_h, height)
	    return left, top, left + crop_w, top + crop_h


	def map_crop_to_orig(
	    x_on_crop: float,
	    y_on_crop: float,
	    crop_window: Bbox,
	) -> Point:
	    """Shift a coordinate measured inside the crop into original-image pixels.

	    Only the window's left/top corner matters: the crop is not rescaled,
	    so the mapping is a pure translation.
	    """
	    origin_x, origin_y, _, _ = crop_window
	    mapped_x = origin_x + float(x_on_crop)
	    mapped_y = origin_y + float(y_on_crop)
	    return mapped_x, mapped_y


	def _should_skip_ccf(image_size: Tuple[int, int], config: CCFConfig) -> bool:
	    """Return True when the image area is below `config.min_pixels_for_ccf`."""
	    width, height = image_size
	    pixel_count = width * height
	    return pixel_count < config.min_pixels_for_ccf


	def _maybe_downsize_for_coarse(
	    image: Image.Image, max_pixels: Optional[int]
	) -> Tuple[Image.Image, float]:
	    """Shrink `image` to roughly `max_pixels` for the coarse pass.

	    Returns `(image_for_coarse_pass, scale_factor)`. Multiplying a
	    coordinate from the returned image by `scale_factor` converts it
	    back into the ORIGINAL image's pixel space. When `max_pixels` is
	    None, or the image already fits the budget, the input image is
	    returned unchanged with a factor of 1.0.
	    """
	    import math

	    if max_pixels is not None:
	        orig_w, orig_h = image.size
	        area = orig_w * orig_h
	        if area > max_pixels:
	            # Same shrink ratio on both axes keeps the aspect ratio.
	            shrink = math.sqrt(max_pixels / area)
	            small_w = max(1, int(round(orig_w * shrink)))
	            small_h = max(1, int(round(orig_h * shrink)))
	            shrunk = image.resize((small_w, small_h), Image.BILINEAR)
	            # The factor comes from the width; the height ratio can
	            # differ slightly because each side is rounded separately.
	            return shrunk, orig_w / small_w
	    return image, 1.0


	def ccf_predict_bbox(
	    predict_fn: PredictFn,
	    image: Image.Image,
	    instruction: str,
	    config: Optional[CCFConfig] = None,
	) -> Optional[CCFResult]:
	    """Run CCF around a bbox/point predictor.

	    Args:
	        predict_fn: returns ((x, y) or None, raw_text). Coordinates are
	            in the pixel space of the image it was handed.
	        image: the full screenshot.
	        instruction: grounding instruction forwarded to `predict_fn`
	            (and to `config.instruction_classifier_fn` when set).
	        config: tuning knobs; a default `CCFConfig()` is used when None.

	    Returns:
	        A `CCFResult` in original-image pixel space, or `None` if even
	        the coarse pass failed to parse. `stage` tells which path was
	        taken:
	        - "coarse": the image is below `min_pixels_for_ccf`.
	        - "coarse_text_gate": the classifier returned "text".
	        - "refined": the refined pass produced a point.
	        - "fallback": the refined pass failed to parse and
	          `fallback_to_coarse_on_invalid` is True (the default); with
	          False, `None` is returned instead.
	    """
	    if config is None:
	        config = CCFConfig()

	    # Coarse pass: optionally downsize the image so we don't spend
	    # 10+ seconds on a 1920x1080 forward pass when all we need is a
	    # rough localization. Predicted coords come back in downsized
	    # pixel space and we scale them up.
	    coarse_image, _uniform_scale = _maybe_downsize_for_coarse(
	        image, config.coarse_max_pixels
	    )
	    coarse_xy_raw, _coarse_raw = predict_fn(coarse_image, instruction)
	    if coarse_xy_raw is None:
	        return None

	    # Width and height are rounded independently when downsizing, so the
	    # exact mapping back to original space needs one factor per axis;
	    # reusing the horizontal factor for y skews the vertical coordinate.
	    # The size check keeps the no-downsize path at exactly 1.0 and
	    # avoids dividing by the dimensions of an untouched image.
	    scale_x = scale_y = 1.0
	    if coarse_image.size != image.size:
	        orig_w, orig_h = image.size
	        coarse_w, coarse_h = coarse_image.size
	        scale_x = orig_w / coarse_w
	        scale_y = orig_h / coarse_h
	    coarse_xy = (
	        float(coarse_xy_raw[0]) * scale_x,
	        float(coarse_xy_raw[1]) * scale_y,
	    )

	    def _coarse_result(stage: str, window: Optional[Bbox]) -> CCFResult:
	        # Every non-refined outcome reports the coarse point as the answer.
	        return CCFResult(
	            x=coarse_xy[0],
	            y=coarse_xy[1],
	            stage=stage,
	            coarse_xy=coarse_xy,
	            crop_window=window,
	        )

	    # Skip refinement on already-small images -- CCF adds latency without
	    # helping when the model is already seeing the target at near-native res.
	    if _should_skip_ccf(image.size, config):
	        return _coarse_result("coarse", None)

	    # Phase 9 type-aware gate: if the instruction looks like a text
	    # target, skip refinement -- text bboxes are wide and the coarse
	    # prediction lands inside them; CCF refinement introduces sub-bbox
	    # drift that misses (Phase 4 saw text -2.3pp from this).
	    if config.instruction_classifier_fn is not None:
	        target_type = config.instruction_classifier_fn(instruction)
	        if target_type == "text":
	            return _coarse_result("coarse_text_gate", None)

	    crop_window = compute_crop_window(
	        center=coarse_xy,
	        image_size=image.size,
	        zoom_factor=config.zoom_factor,
	        min_crop_side=config.min_crop_side,
	    )
	    cropped = image.crop(crop_window)

	    # Refined pass on the crop; its coordinates are relative to the crop.
	    refined_xy, _refined_raw = predict_fn(cropped, instruction)
	    if refined_xy is None:
	        if not config.fallback_to_coarse_on_invalid:
	            return None
	        return _coarse_result("fallback", crop_window)

	    orig_x, orig_y = map_crop_to_orig(refined_xy[0], refined_xy[1], crop_window)
	    return CCFResult(
	        x=orig_x,
	        y=orig_y,
	        stage="refined",
	        coarse_xy=coarse_xy,
	        crop_window=crop_window,
	    )


	def make_ccf_eval_adapter(model, processor, base_predict_fn, config: Optional[CCFConfig] = None):
	    """Adapt an eval-style predictor (takes model + processor + image + instr,
	    returns ((x, y), raw_text)) into a CCF-wrapped version with the same signature.

	    Used by src/eval.py's --ccf flag. Extracted here so tests can exercise the
	    wrapper without importing torch / transformers.

	    The wrapped function ignores the model and processor it is called with
	    and always uses the ones captured here. It returns ((x, y), tag) where
	    tag is "[ccf:<stage>]" for the stage reported by `ccf_predict_bbox` --
	    "[ccf:coarse]", "[ccf:coarse_text_gate]", "[ccf:refined]" or
	    "[ccf:fallback]" -- or ((None, None), "[ccf:parse_fail]") when CCF
	    produced no result.

	    A base prediction counts as a parse failure when the point is None or
	    when either coordinate is None.
	    """
	    if config is None:
	        config = CCFConfig()

	    def wrapped(model_unused, processor_unused, image: Image.Image, instruction: str):
	        def inner(img, instr):
	            point, raw = base_predict_fn(model, processor, img, instr)
	            # Accept every "no prediction" shape a predictor might use --
	            # None, (None, None), or a pair with one missing coordinate --
	            # instead of crashing on unpacking or on float(None).
	            if point is None:
	                return None, raw
	            x, y = point
	            if x is None or y is None:
	                return None, raw
	            return (float(x), float(y)), raw

	        result = ccf_predict_bbox(inner, image, instruction, config)
	        if result is None:
	            return (None, None), "[ccf:parse_fail]"
	        return (result.x, result.y), f"[ccf:{result.stage}]"

	    return wrapped