"""Self-contained inference example for GUI-G2-3B + CCF.

Usage:
    python predict.py --image screenshot.png --instruction "click the settings icon"
    python predict.py --image screenshot.png --instruction "type your password" --no-ccf
    python predict.py --image screenshot.png --instruction "click X" --no-type-gate

Loads GUI-G2-3B from inclusionAI/GUI-G2-3B (downloads on first run, ~6GB),
applies the CCF wrapper (with optional type-aware gating), and prints the
predicted (x, y) click point in original-image pixel coordinates.

The cursor_ccf.py module bundled in this repo is the only piece of "ours";
everything else is just glue around the Hugging Face model.
"""

import argparse
import os
import re
import sys
import time

import torch
from PIL import Image

# Local module bundled with this release
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from cursor_ccf import (  # noqa: E402
    CCFConfig,
    ccf_predict_bbox,
    classify_instruction,
)


# Prompt template used as the text part of the user message sent to the model.
# The "{}" placeholder is filled with the caller's instruction via
# str.format() in predict_gui_g2(); the requested "[x1,y1,x2,y2]" answer
# format is the one parse_bbox() searches for in the model's reply.
GUI_G2_PROMPT = (
    "Outline the position corresponding to the instruction: {}. "
    "The output should be only [x1,y1,x2,y2]."
)


# One non-negative coordinate: an integer, optionally with a decimal part.
_BBOX_NUMBER = r"(\d+(?:\.\d+)?)"

# "[x1,y1,x2,y2]" with optional whitespace around every coordinate. Compiled
# once at import time because parse_bbox() runs on every model call.
_BBOX_RE = re.compile(
    r"\[\s*" + r"\s*,\s*".join([_BBOX_NUMBER] * 4) + r"\s*\]"
)


def parse_bbox(response: str):
    """Extract the box centre (cx, cy) from a "[x1,y1,x2,y2]" model response.

    The first bracketed group of four non-negative numbers found anywhere in
    ``response`` is used. Whitespace around the numbers and decimal
    coordinates are both tolerated, so replies such as "[ 10 , 20 , 30 , 40 ]"
    or "[10.5, 20, 30.5, 40]" are parsed rather than reported as failures.

    Args:
        response: Decoded text produced by the model.

    Returns:
        ``(cx, cy)`` as floats, i.e. the midpoint of the two corners, or
        ``(None, None)`` when ``response`` is empty or holds no parseable box.
    """
    # An empty (or missing) reply cannot contain a box; avoid handing a
    # non-string to the regex engine.
    if not response:
        return None, None

    match = _BBOX_RE.search(response)
    if match is None:
        return None, None

    x1, y1, x2, y2 = (float(group) for group in match.groups())
    return (x1 + x2) / 2.0, (y1 + y2) / 2.0


def load_model(model_id: str, attn_impl: str = "flash_attention_2"):
    """Instantiate GUI-G2-3B together with its processor.

    The settings used here are the ones that match the published 89.2%
    ScreenSpot-v2 baseline.

    Args:
        model_id: Hugging Face model ID or local path to load from.
        attn_impl: Value forwarded as ``attn_implementation`` to the model
            loader.

    Returns:
        ``(model, processor)``, with the model already switched to eval mode.
    """
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

    print(f"Loading {model_id} ...")

    # Pixel budget handed to the processor (lower and upper bound).
    processor_options = {"min_pixels": 3136, "max_pixels": 12_845_056}
    processor = AutoProcessor.from_pretrained(model_id, **processor_options)

    # bfloat16 weights, caller-selected attention backend, automatic
    # device placement.
    model_options = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": attn_impl,
        "device_map": "auto",
    }
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, **model_options
    )
    model.eval()

    return model, processor


def predict_gui_g2(model, processor, image: Image.Image, instruction: str):
    """Run the base model once (no sampling) and locate the target.

    Builds a single-turn chat message holding the image and the formatted
    prompt, generates at most 32 new tokens, decodes only the newly generated
    part, and parses a bounding box out of it.

    Args:
        model: Loaded generation model; must expose ``device`` and
            ``generate``.
        processor: Matching processor used for the chat template, input
            tensors and decoding.
        image: Screenshot to ground the instruction in.
        instruction: Natural-language description of the target.

    Returns:
        ``((cx, cy), raw_text)``. The coordinates are expressed in the
        ORIGINAL image's pixel space (the model emits them in the processor's
        smart_resize space, so they are rescaled here). When no box can be
        parsed the first element is ``(None, None)``.
    """
    from qwen_vl_utils import process_vision_info

    orig_w, orig_h = image.size

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": GUI_G2_PROMPT.format(instruction)},
        ],
    }
    messages = [user_turn]

    chat_text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(messages)
    batch = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = batch.to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=32,
            do_sample=False,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs.input_ids.shape[1]
    new_tokens = generated[0][prompt_len:]
    decoded = processor.batch_decode([new_tokens], skip_special_tokens=True)
    response = decoded[0]

    box_cx, box_cy = parse_bbox(response)
    if box_cx is None:
        return (None, None), response

    # The model answers in processed-pixel space (post smart_resize), so the
    # point has to be mapped back onto the original image. The processed size
    # is recovered from the grid entry of the first image: rows and columns
    # times 14 (presumably the vision patch size -- confirm against the
    # processor config if the backbone changes).
    grid = inputs["image_grid_thw"][0]
    proc_h = grid[1].item() * 14
    proc_w = grid[2].item() * 14

    scaled_x = box_cx * orig_w / proc_w
    scaled_y = box_cy * orig_h / proc_h
    return (scaled_x, scaled_y), response


def predict_with_ccf(
    model,
    processor,
    image: Image.Image,
    instruction: str,
    use_type_gate: bool = True,
):
    """Predict a click point through the CCF wrapper.

    Args:
        model: Loaded generation model, forwarded to ``predict_gui_g2``.
        processor: Matching processor, forwarded to ``predict_gui_g2``.
        image: Screenshot to ground the instruction in.
        instruction: Natural-language description of the target.
        use_type_gate: When true, ``classify_instruction`` is installed as the
            config's instruction classifier; otherwise no classifier is set.

    Returns:
        ``(x, y, stage)`` taken from the CCF result, or
        ``(None, None, "fail")`` when CCF returns no result.
    """
    if use_type_gate:
        classifier = classify_instruction
    else:
        classifier = None

    cfg = CCFConfig(
        zoom_factor=2.0,
        coarse_max_pixels=1_500_000,
        instruction_classifier_fn=classifier,
    )

    def run_base_model(img, instr):
        # Adapter handed to CCF: one base-model call that yields either
        # ((x, y), raw) with plain floats, or (None, raw) on a parse failure.
        point, raw = predict_gui_g2(model, processor, img, instr)
        px, py = point
        if px is None:
            return None, raw
        return (float(px), float(py)), raw

    outcome = ccf_predict_bbox(run_base_model, image, instruction, cfg)
    if outcome is None:
        return None, None, "fail"
    return outcome.x, outcome.y, outcome.stage


def main():
    """Command-line entry point: parse arguments, predict, print the result.

    Loads the screenshot and the model, runs either the plain base model
    (``--no-ccf``) or the CCF-wrapped prediction, and prints the predicted
    click point, the stage that produced it, and the prediction wall time
    (model loading is not included in the timing).

    Exits with an error message when ``--image`` is not a readable image
    file, and with status 1 when no bounding box could be parsed from the
    model output.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage examples in the module docstring laid out as
        # written instead of re-wrapping them into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--image", required=True, help="Path to a PNG/JPG screenshot")
    parser.add_argument(
        "--instruction", required=True,
        help='Natural-language target description, e.g. "click the settings icon"',
    )
    parser.add_argument(
        "--model-id", default="inclusionAI/GUI-G2-3B",
        help="Hugging Face model ID for the base GUI-G2-3B (or a local path)",
    )
    parser.add_argument(
        "--no-ccf", action="store_true",
        help="Disable CCF; just run the base model once. Faster but loses "
             "the +2.2pp icon improvement.",
    )
    parser.add_argument(
        "--no-type-gate", action="store_true",
        help="Disable the type-aware gate. Recommended when most of your "
             "instructions target icons; slight web boost vs gated.",
    )
    parser.add_argument(
        "--attn-impl", default="flash_attention_2",
        choices=["sdpa", "flash_attention_2", "eager"],
        help="Attention backend. flash_attention_2 is fastest but requires "
             "the flash-attn package compiled for your torch version. "
             "sdpa works everywhere but is ~3x slower on big images.",
    )
    args = parser.parse_args()

    # isfile (not exists) so that a directory path is rejected here rather
    # than crashing later inside the image loader.
    if not os.path.isfile(args.image):
        sys.exit(f"ERROR: image not found at {args.image}")

    # convert() returns an independent in-memory copy, so the file handle can
    # be released as soon as the conversion is done.
    try:
        with Image.open(args.image) as source:
            image = source.convert("RGB")
    except OSError as exc:
        sys.exit(f"ERROR: could not read image {args.image}: {exc}")

    print(f"Image: {image.size[0]}x{image.size[1]} pixels")
    print(f"Instruction: {args.instruction!r}")

    model, processor = load_model(args.model_id, attn_impl=args.attn_impl)

    # Only the base-model path exposes the raw text; the CCF path leaves
    # this as None.
    raw = None
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    t0 = time.perf_counter()
    if args.no_ccf:
        (x, y), raw = predict_gui_g2(model, processor, image, args.instruction)
        stage = "no_ccf"
    else:
        x, y, stage = predict_with_ccf(
            model, processor, image, args.instruction,
            use_type_gate=not args.no_type_gate,
        )
    elapsed = time.perf_counter() - t0

    if x is None:
        print("\nFAILED to parse a bbox from the model output.")
        if raw is not None:
            # Show what the model actually said so the failure is debuggable.
            print(f"  Raw model output: {raw!r}")
        sys.exit(1)

    print(f"\nPrediction: click at ({x:.1f}, {y:.1f})")
    print(f"  Stage:   {stage}")
    print(f"  Wall:    {elapsed:.2f}s")


if __name__ == "__main__":
    main()