File size: 12,830 Bytes

e97c820

"""Cursor-Centric Focusing (CCF) inference wrapper.

Generalizes the GUI-Cursor paper's CCF technique to any grounding model:
run once on the full screenshot to get a coarse prediction, crop a
window centered on that prediction, run again on the crop, and map the
refined coordinate back to the original image's pixel space.

Why this helps: a small icon that occupies ~30 pixels in a 1920x1080
screenshot becomes ~60 pixels in a 2x crop. Qwen2.5-VL sees more
per-element tokens, making small-target grounding noticeably easier.

Design choices locked in:
- Greedy-only on both passes. Our earlier stochastic-sampling zoom
  regressed because temperature noise corrupted already-correct
  predictions. Never do that again.
- Single refinement pass. Iterative narrowing (> 1 refinement) gives
  diminishing returns and compounds coordinate-mapping errors.
- Fall back to coarse if the refined pass fails to parse -- with the
  default config (`fallback_to_coarse_on_invalid=True`) we never return
  None if the coarse pass succeeded.
- Crop shifts near the boundary instead of shrinking, so the model
  always sees a fixed-size region at its native resolution budget.
"""

from dataclasses import dataclass
from typing import Callable, Optional, Tuple

from PIL import Image

# Crop rectangle in pixel coordinates.
Bbox = Tuple[int, int, int, int]  # (left, top, right, bottom)
# An (x, y) coordinate in pixels.
Point = Tuple[float, float]

# Each crop side is the corresponding image side divided by this factor.
DEFAULT_ZOOM_FACTOR = 2.0
# Images whose width * height is below this skip the refinement pass.
DEFAULT_MIN_PIXELS_FOR_CCF = 200_000  # ~450x450; below this, crop won't help
# Lower bound on each crop side (the crop is still clamped to the image size).
DEFAULT_MIN_CROP_SIDE = 112  # 4 * 28, Qwen2.5-VL's smallest patch grid
# Pixel budget for the coarse pass; images above it are downsized first.
DEFAULT_COARSE_MAX_PIXELS: Optional[int] = None  # None == use original


# Phase 9: type-aware classifier keyword lists. Tuned by inspecting the
# Phase 4 ScreenSpot-v2 results -- text-targeted instructions that
# regressed when refined had patterns like "address bar", "search field",
# "page heading", quoted strings; icon-targeted ones that benefitted had
# "icon", "button", "menu", short verb-noun patterns.
#
# Matching notes (see classify_instruction):
# - Keywords are matched as plain substrings of the lower-cased,
#   space-padded instruction. The leading space anchors a keyword to the
#   start of a word only, so " star" also matches "start"; " word " is the
#   one entry anchored on both sides.
# - Overlapping entries each count as a separate hit: an instruction
#   containing "plus icon" scores " plus", " icon" and " plus icon";
#   "search bar" scores both " bar" and " search bar".
ICON_KEYWORDS = (
    " icon", " button", " logo", " image", " menu", " arrow",
    " plus", " minus", " back", " home", " settings", " trash",
    " delete", " edit", " send", " download", " upload", " share",
    " filter", " sort", " heart", " star", " bell", " camera",
    " phone", " mail", " calendar", " clock", " refresh", " play",
    " pause", " stop", " volume", " mute", " expand", " collapse",
    " hamburger", " kebab", " dots", " more", " plus icon",
    " gear", " cog", " checkbox", " toggle", " switch",
)
TEXT_KEYWORDS = (
    " link", " label", " heading", " title", " field", " input",
    " text", " bar", " address bar", " search bar", " search field",
    " text box", " textbox", " textarea", " text area", " paragraph",
    " caption", " sentence", " phrase", " word ",
)


def classify_instruction(instruction: str) -> str:
    """Label a grounding instruction as "icon", "text", or "ambiguous".

    The label feeds the Phase 9 type-aware CCF gate: instructions judged
    to target text skip the refinement pass.

    Decision order:
      1. An empty instruction is "ambiguous".
      2. Any double or single quote character in the instruction yields
         "text" (quoted labels such as "Submit"). Note that a lone
         apostrophe, as in "user's", also triggers this rule.
      3. Count how many ICON_KEYWORDS and TEXT_KEYWORDS occur as
         substrings of the lower-cased, space-padded instruction; the
         list with more hits wins.
      4. On a tie (including zero hits), instructions shorter than 25
         characters after stripping are "icon"; anything longer is
         "ambiguous".
    """
    if not instruction:
        return "ambiguous"
    if any(quote in instruction for quote in ('"', "'")):
        return "text"

    # Pad both ends so a keyword like " icon" can match the first word.
    padded = f" {instruction.lower()} "
    icon_score = len([kw for kw in ICON_KEYWORDS if kw in padded])
    text_score = len([kw for kw in TEXT_KEYWORDS if kw in padded])
    if icon_score != text_score:
        return "icon" if icon_score > text_score else "text"

    # Tie-break on length: short instructions tend to name an icon.
    return "icon" if len(instruction.strip()) < 25 else "ambiguous"


@dataclass
class CCFConfig:
    """Tuning knobs for CCF.

    All fields have defaults, so `CCFConfig()` gives the standard
    behavior: 2x zoom, no coarse downsizing, no instruction gating, and
    fallback to the coarse prediction when the refined pass fails.
    """
    # Each crop side is the image side divided by this factor.
    zoom_factor: float = DEFAULT_ZOOM_FACTOR
    # Images with fewer total pixels (width * height) skip refinement.
    min_pixels_for_ccf: int = DEFAULT_MIN_PIXELS_FOR_CCF
    # Lower bound on each crop side, before clamping to the image size.
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE
    # If True, a refined pass that fails to parse yields the coarse
    # prediction with stage="fallback"; if False, it yields None.
    fallback_to_coarse_on_invalid: bool = True
    # When set, the coarse pass downsizes any image whose pixel count
    # exceeds this value. Predicted (x, y) is scaled back to the
    # original-image space before the crop window is computed. This is
    # how the GUI-Cursor paper actually runs CCF -- the coarse pass
    # just needs to localize; the refined pass uses native resolution
    # on the cropped region. Saves ~50% wall time on 1920x1080 inputs.
    coarse_max_pixels: Optional[int] = DEFAULT_COARSE_MAX_PIXELS
    # Phase 9: when set, called with the instruction string. If it
    # returns "text", we skip the refinement pass and return the coarse
    # prediction (with stage="coarse_text_gate"). The Phase 4 result
    # showed CCF refinement helps icons (+2.2pp) but hurts text (-2.3pp);
    # gating by instruction type recovers the text loss without giving
    # up the icon win. None disables gating (Phase 4 behavior).
    instruction_classifier_fn: Optional[Callable[[str], str]] = None


@dataclass
class CCFResult:
    """What CCF returns.

    `ccf_predict_bbox` produces one of these whenever the coarse pass
    parsed, except when the refined pass fails and
    `fallback_to_coarse_on_invalid` is False (it then returns None).
    """
    # Final prediction in original-image pixel coordinates.
    x: float
    y: float
    # Which path produced (x, y):
    # "coarse" | "coarse_text_gate" | "refined" | "fallback"
    stage: str
    # Coarse-pass prediction, already scaled to original-image pixels.
    coarse_xy: Point
    crop_window: Optional[Bbox]  # None when we skipped cropping


# Type alias for the caller-provided prediction function. Takes an image
# and an instruction, returns ((x, y) or None, raw_text_for_logging).
# The (x, y) point is expected in the pixel space of the image that was
# passed in; None signals that the model output could not be parsed.
PredictFn = Callable[[Image.Image, str], Tuple[Optional[Point], str]]


def compute_crop_window(
    center: Point,
    image_size: Tuple[int, int],
    zoom_factor: float = DEFAULT_ZOOM_FACTOR,
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE,
) -> Bbox:
    """Compute the fixed-size crop rectangle around a predicted point.

    Each crop side is `image side / zoom_factor`, raised to at least
    `min_crop_side` and then capped at the image side. The rectangle is
    centered on `center` (rounded to whole pixels); when that would push
    it past an image edge it is slid back inside rather than shrunk.

    Args:
        center: (x, y) point to center on, in image pixels.
        image_size: (width, height) of the image being cropped.
        zoom_factor: divisor applied to each image side.
        min_crop_side: lower bound on each crop side before capping.

    Returns:
        (left, top, right, bottom) of the crop window.

    Raises:
        ValueError: if either image dimension or `zoom_factor` is not
            positive.
    """
    width, height = image_size
    if width <= 0 or height <= 0:
        raise ValueError(f"invalid image size {image_size}")
    if zoom_factor <= 0:
        raise ValueError(f"zoom_factor must be positive, got {zoom_factor}")

    def _clamped_start(start, span, limit):
        # Slide the interval [start, start + span) back inside [0, limit).
        if start < 0:
            return 0
        if start + span > limit:
            return limit - span
        return start

    # Desired side lengths, never larger than the image itself.
    crop_w = min(max(min_crop_side, int(width / zoom_factor)), width)
    crop_h = min(max(min_crop_side, int(height / zoom_factor)), height)

    center_x, center_y = int(round(center[0])), int(round(center[1]))
    x0 = _clamped_start(center_x - crop_w // 2, crop_w, width)
    y0 = _clamped_start(center_y - crop_h // 2, crop_h, height)

    return x0, y0, x0 + crop_w, y0 + crop_h


def map_crop_to_orig(
    x_on_crop: float,
    y_on_crop: float,
    crop_window: Bbox,
) -> Point:
    """Convert a point measured inside a crop into original-image pixels.

    The crop's (left, top) corner is added to the crop-local coordinates;
    the right/bottom edges of `crop_window` are not used.
    """
    x_offset, y_offset, _, _ = crop_window
    mapped_x = x_offset + float(x_on_crop)
    mapped_y = y_offset + float(y_on_crop)
    return mapped_x, mapped_y


def _should_skip_ccf(image_size: Tuple[int, int], config: CCFConfig) -> bool:
    """Return True when the image has too few pixels to bother refining.

    The pixel count (width * height) is compared against
    `config.min_pixels_for_ccf`; strictly smaller images skip CCF.
    """
    width, height = image_size
    pixel_count = width * height
    return pixel_count < config.min_pixels_for_ccf


def _maybe_downsize_for_coarse(
    image: Image.Image, max_pixels: Optional[int]
) -> Tuple[Image.Image, float]:
    """Shrink `image` to roughly `max_pixels` pixels for the coarse pass.

    Returns `(image_for_coarse, scale)`. `scale` multiplies a coordinate
    measured on the returned image to get the matching coordinate on the
    original. It is computed from the width ratio; because width and
    height are rounded to whole pixels independently, the height ratio
    can differ from it slightly.

    The input image is returned unchanged with scale 1.0 when
    `max_pixels` is None or the image is already within the budget.
    """
    if max_pixels is not None:
        orig_w, orig_h = image.size
        total_pixels = orig_w * orig_h
        if total_pixels > max_pixels:
            import math

            # Same shrink factor on both axes keeps the aspect ratio.
            shrink = math.sqrt(max_pixels / total_pixels)
            small_w = max(1, int(round(orig_w * shrink)))
            small_h = max(1, int(round(orig_h * shrink)))
            resized = image.resize((small_w, small_h), Image.BILINEAR)
            # Ratio that maps downsized-space coords back to the original.
            return resized, orig_w / small_w
    return image, 1.0


def ccf_predict_bbox(
    predict_fn: PredictFn,
    image: Image.Image,
    instruction: str,
    config: Optional[CCFConfig] = None,
) -> Optional[CCFResult]:
    """Run CCF around a bbox/point predictor.

    `predict_fn` must return ((x, y) or None, raw_text). Coordinates are
    in the passed image's pixel space.

    Flow:
      1. Coarse pass on the (optionally downsized) full image.
      2. If the image is below `config.min_pixels_for_ccf`, return the
         coarse point with stage="coarse".
      3. If `config.instruction_classifier_fn` is set and returns "text",
         return the coarse point with stage="coarse_text_gate".
      4. Otherwise crop around the coarse point, predict again on the
         crop, and map the result back (stage="refined").

    Returns:
        A `CCFResult`, or `None` if the coarse pass failed to parse.
        When the refined pass fails to parse, the result is the coarse
        prediction with stage="fallback" if
        `config.fallback_to_coarse_on_invalid` is True (the default),
        and `None` otherwise.
    """
    if config is None:
        config = CCFConfig()

    # Coarse pass: optionally downsize the image so we don't spend
    # 10+ seconds on a 1920x1080 forward pass when all we need is a
    # rough localization. Predicted coords come back in downsized
    # pixel space and we scale them up.
    coarse_image, coarse_scale = _maybe_downsize_for_coarse(
        image, config.coarse_max_pixels
    )
    coarse_xy_raw, _coarse_raw = predict_fn(coarse_image, instruction)
    if coarse_xy_raw is None:
        return None

    # The helper's scale is the width ratio. Width and height are rounded
    # to whole pixels independently when downsizing, so the height ratio
    # can differ; derive it from the actual sizes to keep y accurate.
    scale_x = scale_y = coarse_scale
    if coarse_image.size != image.size:
        scale_y = image.size[1] / coarse_image.size[1]
    coarse_xy = (
        float(coarse_xy_raw[0]) * scale_x,
        float(coarse_xy_raw[1]) * scale_y,
    )

    def _coarse_result(stage: str, window: Optional[Bbox]) -> CCFResult:
        # Every non-refined outcome reports the coarse point as the answer.
        return CCFResult(
            x=coarse_xy[0],
            y=coarse_xy[1],
            stage=stage,
            coarse_xy=coarse_xy,
            crop_window=window,
        )

    # Skip refinement on already-small images -- CCF adds latency without
    # helping when the model is already seeing the target at near-native res.
    if _should_skip_ccf(image.size, config):
        return _coarse_result("coarse", None)

    # Phase 9 type-aware gate: if the instruction looks like a text
    # target, skip refinement -- text bboxes are wide and the coarse
    # prediction lands inside them; CCF refinement introduces sub-bbox
    # drift that misses (Phase 4 saw text -2.3pp from this).
    if config.instruction_classifier_fn is not None:
        if config.instruction_classifier_fn(instruction) == "text":
            return _coarse_result("coarse_text_gate", None)

    crop_window = compute_crop_window(
        center=coarse_xy,
        image_size=image.size,
        zoom_factor=config.zoom_factor,
        min_crop_side=config.min_crop_side,
    )
    cropped = image.crop(crop_window)

    refined_xy, _refined_raw = predict_fn(cropped, instruction)
    if refined_xy is None:
        if not config.fallback_to_coarse_on_invalid:
            return None
        return _coarse_result("fallback", crop_window)

    # Refined coords are relative to the crop; shift back to the original.
    orig_x, orig_y = map_crop_to_orig(refined_xy[0], refined_xy[1], crop_window)
    return CCFResult(
        x=orig_x,
        y=orig_y,
        stage="refined",
        coarse_xy=coarse_xy,
        crop_window=crop_window,
    )


def make_ccf_eval_adapter(model, processor, base_predict_fn, config: Optional[CCFConfig] = None):
    """Adapt an eval-style predictor (takes model + processor + image + instr,
    returns ((x, y), raw_text)) into a CCF-wrapped version with the same signature.

    Used by src/eval.py's --ccf flag. Extracted here so tests can exercise the
    wrapper without importing torch / transformers.

    The wrapped function ignores its own model/processor arguments and always
    uses the `model` and `processor` given here. It returns ((x, y), tag) where
    tag is one of "[ccf:coarse]", "[ccf:coarse_text_gate]", "[ccf:refined]",
    "[ccf:fallback]", or "[ccf:parse_fail]". On parse failure the point is
    (None, None).

    `base_predict_fn` may signal a parse failure either with a point of
    `None` or with a point whose x or y is `None`; both are handled.
    """
    if config is None:
        config = CCFConfig()

    def inner(img, instr):
        # Bridge the eval-style signature to the PredictFn shape CCF expects.
        point, raw = base_predict_fn(model, processor, img, instr)
        if point is None:
            return None, raw
        x, y = point
        if x is None or y is None:
            return None, raw
        return (float(x), float(y)), raw

    def wrapped(model_unused, processor_unused, image: Image.Image, instruction: str):
        result = ccf_predict_bbox(inner, image, instruction, config)
        if result is None:
            return (None, None), "[ccf:parse_fail]"
        return (result.x, result.y), f"[ccf:{result.stage}]"

    return wrapped