Image-to-Text
Transformers
English
gui-grounding
screen-understanding
vision-language-model
icon-detection
screenspot
visual-search
Instructions to use luisf-mc/gui-g2-3b-ccf with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use luisf-mc/gui-g2-3b-ccf with Transformers:
# Use a pipeline as a high-level helper
# Warning: Pipeline type "image-to-text" is no longer supported in transformers v5.
# You must load the model directly (see below) or downgrade to v4.x with:
# 'pip install "transformers<5.0.0"'
from transformers import pipeline
pipe = pipeline("image-to-text", model="luisf-mc/gui-g2-3b-ccf")

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("luisf-mc/gui-g2-3b-ccf", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """Cursor-Centric Focusing (CCF) inference wrapper. | |
| Generalizes the GUI-Cursor paper's CCF technique to any grounding model: | |
| run once on the full screenshot to get a coarse prediction, crop a | |
| window centered on that prediction, run again on the crop, and map the | |
| refined coordinate back to the original image's pixel space. | |
| Why this helps: a small icon that occupies ~30 pixels in a 1920x1080 | |
| screenshot becomes ~60 pixels in a 2x crop. Qwen2.5-VL sees more | |
| per-element tokens, making small-target grounding noticeably easier. | |
| Design choices locked in: | |
| - Greedy-only on both passes. Our earlier stochastic-sampling zoom | |
| regressed because temperature noise corrupted already-correct | |
| predictions. Never do that again. | |
| - Single refinement pass. Iterative narrowing (> 1 refinement) gives | |
| diminishing returns and compounds coordinate-mapping errors. | |
| - Fall back to coarse if the refined pass fails to parse -- we never | |
| return None if the coarse pass succeeded. | |
| - Crop shifts near the boundary instead of shrinking, so the model | |
| always sees a fixed-size region at its native resolution budget. | |
| """ | |
| from dataclasses import dataclass | |
| from typing import Callable, Optional, Tuple | |
| from PIL import Image | |
# Geometry aliases used throughout this module.
Bbox = Tuple[int, int, int, int]  # crop rectangle as (left, top, right, bottom)
Point = Tuple[float, float]  # pixel coordinate as (x, y)

# Default tuning values for the crop-and-refine pass.
DEFAULT_ZOOM_FACTOR = 2.0
DEFAULT_MIN_PIXELS_FOR_CCF = 200_000  # ~450x450; below this, crop won't help
DEFAULT_MIN_CROP_SIDE = 4 * 28  # 112 px: 4 * 28, Qwen2.5-VL's smallest patch grid
DEFAULT_COARSE_MAX_PIXELS: Optional[int] = None  # None == use original
# Phase 9: type-aware classifier keyword lists. Tuned by inspecting the
# Phase 4 ScreenSpot-v2 results -- text-targeted instructions that
# regressed when refined had patterns like "address bar", "search field",
# "page heading", quoted strings; icon-targeted ones that benefitted had
# "icon", "button", "menu", short verb-noun patterns.
#
# Every keyword starts with a space so it only matches at the start of a
# word; matching is by substring, so overlapping entries (" bar" and
# " search bar") each count as a separate hit.
ICON_KEYWORDS = (
    " icon", " button", " logo", " image", " menu", " arrow",
    " plus", " minus", " back", " home", " settings", " trash",
    " delete", " edit", " send", " download", " upload", " share",
    " filter", " sort", " heart", " star", " bell", " camera",
    " phone", " mail", " calendar", " clock", " refresh", " play",
    " pause", " stop", " volume", " mute", " expand", " collapse",
    " hamburger", " kebab", " dots", " more", " plus icon",
    " gear", " cog", " checkbox", " toggle", " switch",
)
TEXT_KEYWORDS = (
    " link", " label", " heading", " title", " field", " input",
    " text", " bar", " address bar", " search bar", " search field",
    " text box", " textbox", " textarea", " text area", " paragraph",
    " caption", " sentence", " phrase", " word ",
)


def _has_quoted_span(text: str) -> bool:
    """Return True when `text` appears to contain a quoted label.

    Any double quote counts. Single quotes count only when at least two
    of them are not intra-word apostrophes, so possessives and
    contractions ("user's", "don't") are not mistaken for quotations.
    """
    if '"' in text:
        return True
    last_index = len(text) - 1
    quote_marks = 0
    for i, ch in enumerate(text):
        if ch != "'":
            continue
        # An apostrophe has an alphanumeric character on both sides.
        is_apostrophe = (
            0 < i < last_index
            and text[i - 1].isalnum()
            and text[i + 1].isalnum()
        )
        if not is_apostrophe:
            quote_marks += 1
    # A quoted label needs both an opening and a closing mark.
    return quote_marks >= 2


def classify_instruction(instruction: str) -> str:
    """Classify a grounding instruction as targeting an icon, text, or ambiguous.

    Returns one of "icon" | "text" | "ambiguous". Used by the Phase 9
    type-aware CCF gate to decide whether to run the refined pass --
    text targets are usually wide enough that the coarse model already
    nails them, while CCF refinement introduces small drift that misses.

    Heuristic in three stages:
    1. Quoted strings ("Submit", 'Login') almost always reference a
       text label by exact match -> "text". Apostrophes inside a word
       (possessives, contractions) are not treated as quotes.
    2. Keyword counts: more icon hits than text hits -> "icon", and
       vice versa.
    3. Tie-break by length: instructions shorter than 25 characters
       are usually icon-like ("the X icon") -> "icon"; anything longer
       is left as "ambiguous".

    An empty instruction is "ambiguous".
    """
    if not instruction:
        return "ambiguous"
    if _has_quoted_span(instruction):
        return "text"

    padded = " " + instruction.lower() + " "  # pad so " icon" matches at edges
    icon_hits = sum(1 for keyword in ICON_KEYWORDS if keyword in padded)
    text_hits = sum(1 for keyword in TEXT_KEYWORDS if keyword in padded)
    if text_hits > icon_hits:
        return "text"
    if icon_hits > text_hits:
        return "icon"

    # Tie / no hits: short = icon, otherwise we cannot tell.
    if len(instruction.strip()) < 25:
        return "icon"
    return "ambiguous"
@dataclass
class CCFConfig:
    """Tuning knobs for CCF.

    Declared as a dataclass so each knob can be overridden by keyword,
    e.g. ``CCFConfig(zoom_factor=3.0)``; unspecified knobs keep the
    module-level defaults.
    """

    # Crop side = image side / zoom_factor (before flooring and clamping).
    zoom_factor: float = DEFAULT_ZOOM_FACTOR
    # Images with fewer total pixels than this skip the refinement pass.
    min_pixels_for_ccf: int = DEFAULT_MIN_PIXELS_FOR_CCF
    # Lower bound on each side of the crop window, in pixels.
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE
    # When True, a refined pass that fails to parse yields the coarse
    # prediction (stage="fallback"); when False it yields None.
    fallback_to_coarse_on_invalid: bool = True
    # When set, the coarse pass downsizes any image whose pixel count
    # exceeds this value. Predicted (x, y) is scaled back to the
    # original-image space before the crop window is computed. This is
    # how the GUI-Cursor paper actually runs CCF -- the coarse pass
    # just needs to localize; the refined pass uses native resolution
    # on the cropped region. Saves ~50% wall time on 1920x1080 inputs.
    coarse_max_pixels: Optional[int] = DEFAULT_COARSE_MAX_PIXELS
    # Phase 9: when set, called with the instruction string. If it
    # returns "text", we skip the refinement pass and return the coarse
    # prediction (with stage="coarse_text_gate"). The Phase 4 result
    # showed CCF refinement helps icons (+2.2pp) but hurts text (-2.3pp);
    # gating by instruction type recovers the text loss without giving
    # up the icon win. None disables gating (Phase 4 behavior).
    instruction_classifier_fn: Optional[Callable[[str], str]] = None
@dataclass
class CCFResult:
    """What CCF returns: the final point plus how it was obtained.

    Declared as a dataclass because callers construct it by keyword
    (``CCFResult(x=..., y=..., stage=..., ...)``).
    """

    # Final prediction in the original image's pixel space.
    x: float
    y: float
    # Which path produced (x, y):
    # "coarse" | "coarse_text_gate" | "refined" | "fallback"
    stage: str
    # Coarse-pass prediction, already mapped to original-image pixels.
    coarse_xy: Point
    crop_window: Optional[Bbox]  # None when we skipped cropping
# Type alias for the caller-provided prediction function. Takes an image
# and an instruction, returns ((x, y) or None, raw_text_for_logging).
# The point is None when the model output could not be parsed; the
# coordinates are expressed in the pixel space of the image passed in.
PredictFn = Callable[[Image.Image, str], Tuple[Optional[Point], str]]
def compute_crop_window(
    center: Point,
    image_size: Tuple[int, int],
    zoom_factor: float = DEFAULT_ZOOM_FACTOR,
    min_crop_side: int = DEFAULT_MIN_CROP_SIDE,
) -> Bbox:
    """Compute the (left, top, right, bottom) crop rectangle around `center`.

    Each side of the rectangle is the image side divided by
    `zoom_factor`, floored at `min_crop_side` and capped at the image
    side. The rectangle is centered on `center` where it fits; when it
    would spill past an edge it is shifted back inside rather than
    shrunk, so its size does not depend on where `center` lies.

    Raises:
        ValueError: if either image dimension or `zoom_factor` is not
            positive.
    """
    img_w, img_h = image_size
    if img_w <= 0 or img_h <= 0:
        raise ValueError(f"invalid image size {image_size}")
    if zoom_factor <= 0:
        raise ValueError(f"zoom_factor must be positive, got {zoom_factor}")

    def _axis_span(coord: float, extent: int) -> Tuple[int, int]:
        # Window length along this axis: floored, then never larger than the image.
        side = min(max(min_crop_side, int(extent / zoom_factor)), extent)
        start = int(round(coord)) - side // 2
        # Slide the window back inside [0, extent] without changing its length.
        start = min(max(start, 0), extent - side)
        return start, start + side

    left, right = _axis_span(center[0], img_w)
    top, bottom = _axis_span(center[1], img_h)
    return left, top, right, bottom
def map_crop_to_orig(
    x_on_crop: float,
    y_on_crop: float,
    crop_window: Bbox,
) -> Point:
    """Convert a point measured inside the crop into original-image pixels.

    The crop's top-left corner is added to the crop-local coordinates;
    the right and bottom edges of `crop_window` are not used.
    """
    origin_x, origin_y, _, _ = crop_window
    orig_x = origin_x + float(x_on_crop)
    orig_y = origin_y + float(y_on_crop)
    return orig_x, orig_y
def _should_skip_ccf(image_size: Tuple[int, int], config: CCFConfig) -> bool:
    """Return True when the image has fewer pixels than `config.min_pixels_for_ccf`."""
    width, height = image_size
    pixel_count = width * height
    return pixel_count < config.min_pixels_for_ccf
def _maybe_downsize_for_coarse(
    image: Image.Image, max_pixels: Optional[int]
) -> Tuple[Image.Image, float]:
    """Shrink `image` for the coarse pass when it exceeds `max_pixels`.

    Returns `(image_for_coarse_pass, scale_factor)`. Multiplying a
    coordinate from the returned image by `scale_factor` maps it back to
    the ORIGINAL image's pixel space. When `max_pixels` is None or the
    image already fits the budget, the input image itself is returned
    with a factor of 1.0.

    The factor is derived from the width ratio only; the height ratio
    can differ slightly because each side is rounded independently.
    """
    if max_pixels is None:
        return image, 1.0

    orig_w, orig_h = image.size
    total_pixels = orig_w * orig_h
    if total_pixels <= max_pixels:
        return image, 1.0

    import math

    # Same shrink ratio on both sides keeps the aspect ratio; each side
    # is rounded and kept at least 1 px.
    shrink = math.sqrt(max_pixels / total_pixels)
    small_w = max(1, int(round(orig_w * shrink)))
    small_h = max(1, int(round(orig_h * shrink)))
    resized = image.resize((small_w, small_h), Image.BILINEAR)

    # Coarse coords live in the resized image; this ratio (> 1) brings
    # them back to original-space coords.
    return resized, orig_w / small_w
def ccf_predict_bbox(
    predict_fn: PredictFn,
    image: Image.Image,
    instruction: str,
    config: Optional[CCFConfig] = None,
) -> Optional[CCFResult]:
    """Run CCF around a bbox/point predictor.

    `predict_fn` must return ((x, y) or None, raw_text). Coordinates are
    in the passed image's pixel space.

    Args:
        predict_fn: the grounding predictor; called once on the
            (possibly downsized) full image and at most once on the crop.
        image: full screenshot.
        instruction: grounding instruction forwarded to `predict_fn`.
        config: tuning knobs; a default `CCFConfig()` is used when None.

    Returns:
        A `CCFResult` whose `stage` records which path produced the
        point: "coarse" (image too small to refine), "coarse_text_gate"
        (the configured classifier returned "text"), "refined", or
        "fallback" (refined pass failed to parse and
        `fallback_to_coarse_on_invalid` is True, the default).
        `None` if the coarse pass failed to parse, or if the refined
        pass failed to parse and `fallback_to_coarse_on_invalid` is
        False.
    """
    if config is None:
        config = CCFConfig()

    # Coarse pass: optionally downsize the image so we don't spend
    # 10+ seconds on a 1920x1080 forward pass when all we need is a
    # rough localization. Predicted coords come back in downsized
    # pixel space and we scale them up.
    coarse_image, _uniform_scale = _maybe_downsize_for_coarse(
        image, config.coarse_max_pixels
    )
    coarse_xy_raw, _coarse_raw = predict_fn(coarse_image, instruction)
    if coarse_xy_raw is None:
        return None

    # Scale each axis by its own ratio. The downsized width and height
    # are rounded independently, so reusing the width ratio for y would
    # shift the coarse point vertically by up to about a pixel.
    scale_x = scale_y = 1.0
    if coarse_image.size != image.size:
        scale_x = image.size[0] / coarse_image.size[0]
        scale_y = image.size[1] / coarse_image.size[1]
    coarse_xy = (
        float(coarse_xy_raw[0]) * scale_x,
        float(coarse_xy_raw[1]) * scale_y,
    )

    # Skip refinement on already-small images -- CCF adds latency without
    # helping when the model is already seeing the target at near-native res.
    if _should_skip_ccf(image.size, config):
        return CCFResult(
            x=coarse_xy[0],
            y=coarse_xy[1],
            stage="coarse",
            coarse_xy=coarse_xy,
            crop_window=None,
        )

    # Phase 9 type-aware gate: if the instruction looks like a text
    # target, skip refinement -- text bboxes are wide and the coarse
    # prediction lands inside them; CCF refinement introduces sub-bbox
    # drift that misses (Phase 4 saw text -2.3pp from this).
    if config.instruction_classifier_fn is not None:
        target_type = config.instruction_classifier_fn(instruction)
        if target_type == "text":
            return CCFResult(
                x=coarse_xy[0],
                y=coarse_xy[1],
                stage="coarse_text_gate",
                coarse_xy=coarse_xy,
                crop_window=None,
            )

    # Refined pass: crop around the coarse point at native resolution.
    crop_window = compute_crop_window(
        center=coarse_xy,
        image_size=image.size,
        zoom_factor=config.zoom_factor,
        min_crop_side=config.min_crop_side,
    )
    cropped = image.crop(crop_window)
    refined_xy, _refined_raw = predict_fn(cropped, instruction)

    if refined_xy is None:
        if not config.fallback_to_coarse_on_invalid:
            return None
        return CCFResult(
            x=coarse_xy[0],
            y=coarse_xy[1],
            stage="fallback",
            coarse_xy=coarse_xy,
            crop_window=crop_window,
        )

    # Refined coords are crop-local; translate them back to the full image.
    orig_x, orig_y = map_crop_to_orig(refined_xy[0], refined_xy[1], crop_window)
    return CCFResult(
        x=orig_x,
        y=orig_y,
        stage="refined",
        coarse_xy=coarse_xy,
        crop_window=crop_window,
    )
def make_ccf_eval_adapter(model, processor, base_predict_fn, config: Optional[CCFConfig] = None):
    """Adapt an eval-style predictor (takes model + processor + image + instr,
    returns ((x, y), raw_text)) into a CCF-wrapped version with the same signature.

    Used by src/eval.py's --ccf flag. Extracted here so tests can exercise the
    wrapper without importing torch / transformers.

    The wrapped function returns ((x, y), tag) where tag is one of
    "[ccf:coarse]", "[ccf:coarse_text_gate]", "[ccf:refined]",
    "[ccf:fallback]", or "[ccf:parse_fail]". On "[ccf:parse_fail]" the
    point is (None, None).

    The `model` and `processor` given here are the ones actually used;
    the first two arguments of the wrapped function exist only to keep
    the eval-style signature and are ignored.

    A failed base prediction may be reported either as `(None, None)` or
    as a bare `None` in place of the point; both are treated as a parse
    failure, as is a point with only one coordinate missing.
    """
    if config is None:
        config = CCFConfig()

    def inner(img, instr):
        # Bridge the eval-style predictor to the (image, instruction)
        # signature that ccf_predict_bbox expects.
        point, raw = base_predict_fn(model, processor, img, instr)
        if point is None:
            return None, raw
        x, y = point
        if x is None or y is None:
            return None, raw
        return (float(x), float(y)), raw

    def wrapped(model_unused, processor_unused, image: Image.Image, instruction: str):
        result = ccf_predict_bbox(inner, image, instruction, config)
        if result is None:
            return (None, None), "[ccf:parse_fail]"
        return (result.x, result.y), f"[ccf:{result.stage}]"

    return wrapped