import os
import sys
import time
from threading import Lock, Thread

import gradio as gr
import spaces
from PIL import Image

from tools.config import (
    ADD_VLM_BOUNDING_BOX_RULES,
    CLOUD_VLM_MODEL_CHOICE,
    DEFAULT_INFERENCE_SERVER_VLM_MODEL,
    LOAD_PADDLE_AT_STARTUP,
    LOAD_TRANSFORMERS_VLM_MODEL_AT_START,
    MAX_INPUT_TOKEN_LENGTH,
    MAX_NEW_TOKENS,
    MAX_SPACES_GPU_RUN_TIME,
    MAX_WORKERS,
    PADDLE_DET_DB_UNCLIP_RATIO,
    PADDLE_FONT_PATH,
    PADDLE_MODEL_PATH,
    PADDLE_USE_TEXTLINE_ORIENTATION,
    QUANTISE_VLM_MODELS,
    REPORT_VLM_OUTPUTS_TO_GUI,
    SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
    SHOW_BEDROCK_VLM_MODELS,
    SHOW_INFERENCE_SERVER_VLM_OPTIONS,
    SHOW_VLM_MODEL_OPTIONS,
    VLM_DEFAULT_DO_SAMPLE,
    VLM_DEFAULT_MIN_P,
    VLM_DEFAULT_PRESENCE_PENALTY,
    VLM_DEFAULT_REPETITION_PENALTY,
    VLM_DEFAULT_STREAM,
    VLM_DEFAULT_TEMPERATURE,
    VLM_DEFAULT_TOP_K,
    VLM_DEFAULT_TOP_P,
    VLM_DISABLE_QWEN3_5_THINKING,
    VLM_MAX_IMAGE_SIZE,
    VLM_MIN_IMAGE_SIZE,
    VLM_QWEN3_5_NOTHINK_SUFFIX,
    VLM_SEED,
)
from tools.helper_functions import get_system_font_path, strip_vlm_thinking_tags
from tools.inference_attention import (
    log_attn_implementation_choice,
    resolve_attn_implementation,
)

# Default prompt for reading text from an image: it asks for the main line of
# text and a JSON reply with "text" and "conf" keys. Used further down in this
# module as the initial value of model_default_prompt.
text_read_default_prompt = """Read the main line of text in the image, and return JSON with keys "text" (string) and "conf" (number 0–1) for confidence in your identification, e.g. {"text": "read text", "conf": 0.95}. Do not include any other keys in the JSON. Ignore any words that are not part of the main line of text closest to the center of the image. Ensure that spaces between words and upper/lower cases are preserved. If you can't read the text, return an empty string ""."""

if LOAD_PADDLE_AT_STARTUP:
    # The PaddleOCR environment variables below MUST be exported before the
    # package is imported, so that the model/font settings are in place when
    # it loads.

    # Model directory: only overridden when a non-blank path is configured.
    if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
        os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
        print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
    else:
        print("Using default PaddleOCR model storage location")

    # Font path: point PaddleOCR at a local font so it does not download
    # simfang.ttf / PingFang-SC-Regular.ttf. The configured font wins when it
    # exists on disk; otherwise fall back to a system font if one is found.
    if (
        PADDLE_FONT_PATH
        and PADDLE_FONT_PATH.strip()
        and os.path.exists(PADDLE_FONT_PATH)
    ):
        os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
        print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
    else:
        system_font_path = get_system_font_path()
        if system_font_path:
            os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
            print(f"Setting PaddleOCR font path to system font: {system_font_path}")
        else:
            print(
                "Warning: No suitable system font found. PaddleOCR may download default fonts."
            )

    try:
        from paddleocr import PaddleOCR

        print("PaddleOCR imported successfully")

        # Startup configuration for PaddleOCR (English language by default).
        paddle_kwargs = {
            "text_detection_model_name": "PP-OCRv6_medium_det",
            "text_recognition_model_name": "PP-OCRv6_medium_rec",
            "engine": "transformers",
            "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
            "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "en",
        }

        try:
            # The instance is discarded; presumably this is only meant to
            # initialise/download the models at startup - TODO confirm.
            PaddleOCR(**paddle_kwargs)
        except Exception as e:
            _init_error_text = str(e)
            _init_error_lower = _init_error_text.lower()
            # DLL loading errors are common on Windows with the GPU build.
            if (
                "WinError 127" in _init_error_text
                or "could not be found" in _init_error_lower
                or "dll" in _init_error_lower
            ):
                print(
                    f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
                )
                print("PaddleOCR will not be available. To fix GPU issues:")
                print("1. Install Visual C++ Redistributables (latest version)")
                print("2. Ensure CUDA runtime libraries are in your PATH")
                print(
                    "3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
                )
                # Translated to ImportError so the outer handler disables
                # PaddleOCR; the original failure is kept as the cause.
                raise ImportError(
                    f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
                ) from e
            # Any other initialisation failure propagates unchanged.
            raise

    except ImportError:
        # Either the package is missing or initialisation was translated above.
        PaddleOCR = None
        print(
            "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
        )


# References to the loaded VLM model/processor, populated once a model has been
# loaded with SHOW_VLM_MODEL_OPTIONS enabled. Used by LLM entity detection when
# USE_TRANSFORMERS_VLM_MODEL_AS_LLM.
_loaded_vlm_model = None
_loaded_vlm_processor = None

# Generation defaults that are always importable from this module. When
# SHOW_VLM_MODEL_OPTIONS is enabled they are reassigned further down.
model_default_prompt = text_read_default_prompt
# Sampling is on unless the config explicitly provides a value.
model_default_do_sample = (
    True if VLM_DEFAULT_DO_SAMPLE is None else VLM_DEFAULT_DO_SAMPLE
)
# The remaining settings pass the config value straight through (None included).
model_default_top_p = VLM_DEFAULT_TOP_P
model_default_min_p = VLM_DEFAULT_MIN_P
model_default_top_k = VLM_DEFAULT_TOP_K
model_default_temperature = VLM_DEFAULT_TEMPERATURE
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
model_default_max_new_tokens = int(MAX_NEW_TOKENS)
model_default_seed = VLM_SEED

# State for the one-time weight load performed by ensure_transformers_vlm_loaded():
# the loader callback, the lock serialising the load, and the "done" flag.
_load_vlm_weights_fn = None
_vlm_load_lock = Lock()
_transformers_vlm_weights_loaded = False


def ensure_transformers_vlm_loaded():
    """
    Make sure the local transformers VLM weights have been loaded.

    Returns immediately when VLM options are disabled, when the weights are
    already marked as loaded, or when no loader callback is registered in
    ``_load_vlm_weights_fn``. Otherwise the loader runs while holding
    ``_vlm_load_lock``; the loaded flag is re-checked under the lock and is
    set only after the loader has returned.
    """
    global _transformers_vlm_weights_loaded

    if not SHOW_VLM_MODEL_OPTIONS or _transformers_vlm_weights_loaded:
        return

    loader = _load_vlm_weights_fn
    if loader is None:
        return

    with _vlm_load_lock:
        # Another caller may have completed the load while we waited for the lock.
        if not _transformers_vlm_weights_loaded:
            loader()
            _transformers_vlm_weights_loaded = True


if SHOW_VLM_MODEL_OPTIONS is True:
    # torch is an optional dependency: try to import it, and switch the VLM
    # options off for this module when it is not installed.
    try:
        import torch  # type: ignore
    except ModuleNotFoundError:
        # Base installs without the heavy optional deps must stay usable.
        torch = None  # type: ignore
        SHOW_VLM_MODEL_OPTIONS = False
        print(
            "VLM options disabled because 'torch' is not installed. "
            'Install with the extra: pip install "doc_redaction[vlm]"'
        )

if SHOW_VLM_MODEL_OPTIONS is True:
    from huggingface_hub import snapshot_download
    from transformers import AutoConfig, BitsAndBytesConfig, TextIteratorStreamer

    from tools.config import (
        MAX_INPUT_TOKEN_LENGTH,
        MAX_NEW_TOKENS,
        MODEL_CACHE_PATH,
        OVERRIDE_VLM_REPO_ID,
        QUANTISE_VLM_MODELS,
        SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL,
        VLM_DEFAULT_DO_SAMPLE,
        VLM_DEFAULT_MIN_P,
        VLM_DEFAULT_PRESENCE_PENALTY,
        VLM_DEFAULT_REPETITION_PENALTY,
        VLM_DEFAULT_TEMPERATURE,
        VLM_DEFAULT_TOP_K,
        VLM_DEFAULT_TOP_P,
        VLM_SEED,
    )

    # Use the first CUDA device when one is available; otherwise run on CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Startup diagnostics for the torch / CUDA environment.
    print("torch.__version__ =", torch.__version__)
    print("cuda available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

    # Make sure the model cache directory exists. exist_ok=True closes the race
    # where another process creates the directory between the existence check
    # and makedirs; the check itself is kept so an already-existing path is
    # still left untouched.
    CACHE_PATH = MODEL_CACHE_PATH
    if not os.path.exists(CACHE_PATH):
        os.makedirs(CACHE_PATH, exist_ok=True)

    # Model and processor start unset; _load_vlm_weights_and_finalize assigns
    # them through its global declarations.
    model = None
    processor = None

    # Generation parameters: take the config value as-is. A value of None means
    # nothing was configured, leaving the choice to the model's own defaults.
    model_default_prompt = text_read_default_prompt
    model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    model_default_top_p = VLM_DEFAULT_TOP_P
    model_default_min_p = VLM_DEFAULT_MIN_P
    model_default_top_k = VLM_DEFAULT_TOP_K
    model_default_temperature = VLM_DEFAULT_TEMPERATURE
    model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    model_default_max_new_tokens = int(MAX_NEW_TOKENS)
    model_default_seed = VLM_SEED
    # Whether the loaded model supports presence_penalty; model-family defaults
    # applied during loading may overwrite this.
    model_supports_presence_penalty = False

    # Resolve the attention implementation once and log the choice.
    attn_implementation = resolve_attn_implementation()
    log_attn_implementation_choice()

    # Optional 4-bit bitsandbytes quantisation. quantization_config stays None
    # (load without quantisation) unless quantisation is requested, CUDA is
    # available, and the config object can be built.
    quantization_config = None
    if QUANTISE_VLM_MODELS is True:
        if torch.cuda.is_available():
            try:
                quantization_config = BitsAndBytesConfig(
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    load_in_4bit=True,
                )
                print("Created quantization config for bitsandbytes")
            except Exception as e:
                # Any failure here falls back to an unquantised load.
                quantization_config = None
                print(f"Warning: Could not setup bitsandbytes quantization: {e}")
                print("Falling back to loading models without quantization")
        else:
            print(
                "Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
            )
            print("Falling back to loading models without quantisation")

    def _get_vlm_config_capped_length(model_id):
        """
        Load the config for ``model_id`` and clamp its maximum context length.

        ``max_position_embeddings`` is lowered to MAX_INPUT_TOKEN_LENGTH on the
        top-level config and, when present, on its nested ``text_config``.
        Values that are missing, None, or already at or below the cap are left
        untouched. The intent is to reduce VRAM used by the KV cache.

        Returns the (possibly modified) config from AutoConfig.from_pretrained.
        """
        vlm_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        limit = MAX_INPUT_TOKEN_LENGTH

        def _clamp_positions(cfg):
            # Only ever lower the value; never raise it or create the attribute.
            current = getattr(cfg, "max_position_embeddings", None)
            if current is not None and current > limit:
                cfg.max_position_embeddings = limit

        _clamp_positions(vlm_config)

        nested_text_config = getattr(vlm_config, "text_config", None)
        if nested_text_config is not None:
            _clamp_positions(nested_text_config)

        return vlm_config

    def _load_vlm_weights_and_finalize():
        global model, processor, _loaded_vlm_model, _loaded_vlm_processor
        global model_default_prompt, model_default_do_sample, model_default_top_p, model_default_min_p, model_default_top_k
        global model_default_temperature, model_default_repetition_penalty, model_default_presence_penalty
        global model_default_max_new_tokens, model_default_seed, model_supports_presence_penalty

        def _apply_generation_family_defaults(defaults: dict):
            """
            Apply a shared set of generation defaults for a model family.
            These are *model defaults* (later overridden by config/env VLM_DEFAULT_* if set).
            """
            # Note: we intentionally assign to the module-level "model_default_*" globals declared above.
            global model_default_do_sample, model_default_top_p, model_default_min_p, model_default_top_k
            global model_default_temperature, model_default_repetition_penalty, model_default_presence_penalty
            global model_default_max_new_tokens, model_supports_presence_penalty

            if not defaults:
                return
            if "do_sample" in defaults:
                model_default_do_sample = defaults["do_sample"]
            if "top_p" in defaults:
                model_default_top_p = defaults["top_p"]
            if "min_p" in defaults:
                model_default_min_p = defaults["min_p"]
            if "top_k" in defaults:
                model_default_top_k = defaults["top_k"]
            if "temperature" in defaults:
                model_default_temperature = defaults["temperature"]
            if "repetition_penalty" in defaults:
                model_default_repetition_penalty = defaults["repetition_penalty"]
            if "presence_penalty" in defaults:
                model_default_presence_penalty = defaults["presence_penalty"]
            if "max_new_tokens" in defaults:
                model_default_max_new_tokens = defaults["max_new_tokens"]
            if "supports_presence_penalty" in defaults:
                model_supports_presence_penalty = defaults["supports_presence_penalty"]

        # Shared generation defaults (top_p/top_k/etc.) by model family to avoid repeating values in each model block.
        # These are applied as "model defaults" and can still be overridden by VLM_DEFAULT_* config later.
        _QWEN3_VL_FAMILY_DEFAULTS = {
            "top_p": 0.8,
            "min_p": 0.0,
            "top_k": 20,
            "temperature": 0.7,
            "repetition_penalty": 1.0,
            "presence_penalty": 1.0,
            "max_new_tokens": MAX_NEW_TOKENS,
            # I found that this doesn't work when using transformers
            "supports_presence_penalty": False,
        }
        _QWEN3_5_FAMILY_DEFAULTS = dict(_QWEN3_VL_FAMILY_DEFAULTS)
        _GEMMA4_FAMILY_DEFAULTS = {
            "top_p": 0.95,
            "top_k": 64,
            "temperature": 1.0,
            "max_new_tokens": MAX_NEW_TOKENS,
            # I found that this doesn't work when using transformers
            "supports_presence_penalty": False,
        }
        # print(f"Loading vision model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}")

        # Load only the selected model based on configuration
        if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Nanonets-OCR2-3B":
            MODEL_ID = "nanonets/Nanonets-OCR2-3B"
            from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
                load_kwargs["device_map"] = "auto"
            else:
                load_kwargs["torch_dtype"] = torch.float16
            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()
            if quantization_config is None:
                model = model.to(device)

            model_default_prompt = text_read_default_prompt

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Dots.OCR":
            # Download and patch Dots.OCR model
            model_path_d_local = snapshot_download(
                repo_id="rednote-hilab/dots.ocr",
                local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
                max_workers=MAX_WORKERS,
                local_dir_use_symlinks=False,
            )

            config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")

            if os.path.exists(config_file_path):
                with open(config_file_path, "r") as f:
                    input_code = f.read()

                lines = input_code.splitlines()
                if "class DotsVLProcessor" in input_code and not any(
                    "attributes = " in line for line in lines
                ):
                    output_lines = []
                    for line in lines:
                        output_lines.append(line)
                        if line.strip().startswith("class DotsVLProcessor"):
                            output_lines.append(
                                '    attributes = ["image_processor", "tokenizer"]'
                            )

                    with open(config_file_path, "w") as f:
                        f.write("\n".join(output_lines))
                    print("Patched configuration_dots.py successfully.")

            sys.path.append(model_path_d_local)

            MODEL_ID = model_path_d_local
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            from transformers import AutoModelForCausalLM, AutoProcessor

            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["torch_dtype"] = torch.bfloat16
            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()

            model_default_prompt = text_read_default_prompt
            model_default_max_new_tokens = MAX_NEW_TOKENS

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "PaddleOCR-VL":
            MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
            from transformers import AutoModelForCausalLM, AutoProcessor

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            load_kwargs = {
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
                load_kwargs["device_map"] = "auto"
            else:
                load_kwargs["torch_dtype"] = torch.bfloat16
            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
            if quantization_config is None:
                model = model.to(device)
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

            model_default_prompt = """OCR:"""
            model_default_max_new_tokens = MAX_NEW_TOKENS

        ###
        # QWEN 3-VL MODELS
        ###
        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-2B-Instruct":
            MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"
            from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-4B-Instruct":
            MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
            from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)
        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-8B-Instruct":
            MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
            from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-32B-Instruct":
            MODEL_ID = "Qwen/Qwen3-VL-32B-Instruct"
            from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-30B-A3B-Instruct":
            MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }

            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-235B-A22B-Instruct-FP8":
            MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct-FP8"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }

            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            ).eval()

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS)

        ###
        # QWEN 3.5 MODELS
        ###
        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-0.8B":
            from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration

            MODEL_ID = "Qwen/Qwen3.5-0.8B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"

            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-2B":
            from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration

            MODEL_ID = "Qwen/Qwen3.5-2B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"

            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-4B":
            from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration

            MODEL_ID = "Qwen/Qwen3.5-4B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-9B":
            from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration

            MODEL_ID = "Qwen/Qwen3.5-9B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B":
            from transformers import (
                AutoProcessor,
                Qwen3_5ForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.5-27B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B-bnb-4bit":
            from transformers import (
                AutoProcessor,
                Qwen3_5ForConditionalGeneration,
            )

            MODEL_ID = "bertbobson/Qwen3.5-27B-bnb-4bit"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-35B-A3B":
            from transformers import (
                AutoProcessor,
                Qwen3_5MoeForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.5-35B-A3B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-27B":
            from transformers import (
                AutoProcessor,
                Qwen3_5ForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.6-27B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-27B-bnb-4bit":
            from transformers import (
                AutoProcessor,
                Qwen3_5ForConditionalGeneration,
            )

            MODEL_ID = "samajlouis/Qwen3.6-27B-bnb-nf4"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5ForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-35B-A3B":
            from transformers import (
                AutoProcessor,
                Qwen3_5MoeForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.6-35B-A3B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-122B-A10B":
            from transformers import (
                AutoProcessor,
                Qwen3_5MoeForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.5-122B-A10B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-397B-A17B":
            from transformers import (
                AutoProcessor,
                Qwen3_5MoeForConditionalGeneration,
            )

            MODEL_ID = "Qwen/Qwen3.5-397B-A17B"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
                MODEL_ID, **load_kwargs
            )

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Gemma 4 31B bnb":
            from transformers import AutoModelForCausalLM, AutoProcessor

            MODEL_ID = "unsloth/gemma-4-31B-it-unsloth-bnb-4bit"
            if OVERRIDE_VLM_REPO_ID:
                MODEL_ID = OVERRIDE_VLM_REPO_ID
            processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
            load_kwargs = {
                "attn_implementation": attn_implementation,
                "device_map": "auto",
                "trust_remote_code": True,
                "config": _get_vlm_config_capped_length(MODEL_ID),
            }
            if quantization_config is not None:
                load_kwargs["quantization_config"] = quantization_config
            else:
                load_kwargs["dtype"] = "auto"
            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

            model_default_prompt = text_read_default_prompt
            _apply_generation_family_defaults(_GEMMA4_FAMILY_DEFAULTS)

        elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "None":
            model = None
            processor = None

        else:
            raise ValueError(
                f"Invalid model selected: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}. Valid options are shown in tools/run_vlm.py, or None"
            )

        # Override model defaults with user-provided config values if they are set
        # Priority: user config value > model default
        if VLM_DEFAULT_DO_SAMPLE is not None:
            model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
        if VLM_DEFAULT_TOP_P is not None:
            model_default_top_p = VLM_DEFAULT_TOP_P
        if VLM_DEFAULT_MIN_P is not None:
            model_default_min_p = VLM_DEFAULT_MIN_P
        if VLM_DEFAULT_TOP_K is not None:
            model_default_top_k = VLM_DEFAULT_TOP_K
        if VLM_DEFAULT_TEMPERATURE is not None:
            model_default_temperature = VLM_DEFAULT_TEMPERATURE
        if VLM_DEFAULT_REPETITION_PENALTY is not None:
            model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
        if VLM_DEFAULT_PRESENCE_PENALTY is not None:
            model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
        if VLM_SEED is not None:
            model_default_seed = VLM_SEED

        # Cap processor tokenizer to config max context length so all tokenization respects MAX_INPUT_TOKEN_LENGTH
        if processor is not None:
            tokenizer = getattr(processor, "tokenizer", None)
            if tokenizer is not None and hasattr(tokenizer, "model_max_length"):
                current_max = tokenizer.model_max_length
                if current_max is None or current_max == float("inf"):
                    tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH
                elif current_max > MAX_INPUT_TOKEN_LENGTH:
                    tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH
        # Log effective VLM context cap so env (e.g. MAX_INPUT_TOKEN_LENGTH=4096) can be verified
        _ref_ctx = 32768
        _reserve = 1024
        _eff_max = min(
            VLM_MAX_IMAGE_SIZE,
            (VLM_MAX_IMAGE_SIZE * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve) // _ref_ctx)
            // 1024
            * 1024,
        )
        _abs_min = 65536
        effective_max_pixels_at_load = max(_abs_min, _eff_max)
        effective_min_pixels_at_load = min(
            VLM_MIN_IMAGE_SIZE, effective_max_pixels_at_load
        )

        if SHOW_VLM_MODEL_OPTIONS:
            print(
                f"VLM context cap: MAX_INPUT_TOKEN_LENGTH={MAX_INPUT_TOKEN_LENGTH}, "
                f"effective max_pixels={effective_max_pixels_at_load}, min_pixels={effective_min_pixels_at_load} "
                f"(VLM_MAX_IMAGE_SIZE={VLM_MAX_IMAGE_SIZE}, VLM_MIN_IMAGE_SIZE={VLM_MIN_IMAGE_SIZE})"
            )

        # Store at module level for USE_TRANSFORMERS_VLM_MODEL_AS_LLM (no global needed at module level)
        _loaded_vlm_model = model
        _loaded_vlm_processor = processor

    _load_vlm_weights_fn = _load_vlm_weights_and_finalize

    # print(f"Successfully loaded {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}")


# Eagerly load the local transformers VLM at import time when both config flags
# are set. A failed load is reported on the console but is not fatal.
if LOAD_TRANSFORMERS_VLM_MODEL_AT_START and SHOW_VLM_MODEL_OPTIONS:
    try:
        ensure_transformers_vlm_loaded()
    except Exception as e:
        print(
            f"Warning: Could not load transformers VLM model at startup: {e}",
            "The VLM will be loaded on first use when a VLM OCR task runs.",
            sep="\n",
        )


def get_loaded_vlm_model_and_tokenizer():
    """
    Return the loaded VLM model and a tokenizer for it, for reuse by LLM tasks
    (e.g. entity detection) when USE_TRANSFORMERS_VLM_MODEL_AS_LLM is True.

    ``ensure_transformers_vlm_loaded()`` is called first. If either the stored
    model or the stored processor is still None afterwards, ``(None, None)`` is
    returned. Otherwise the second element is the processor's ``tokenizer``
    attribute when it has one, or the processor object itself when it does not.

    Returns:
        tuple: ``(model, tokenizer)`` or ``(None, None)``.
    """
    ensure_transformers_vlm_loaded()
    if _loaded_vlm_model is not None and _loaded_vlm_processor is not None:
        # Some processors are themselves tokenizer-like; fall back to the
        # processor object when there is no ``tokenizer`` attribute.
        tokenizer = getattr(
            _loaded_vlm_processor, "tokenizer", _loaded_vlm_processor
        )
        return _loaded_vlm_model, tokenizer
    return None, None


@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def extract_text_from_image_vlm(
    text: str,
    image: Image.Image,
    max_new_tokens: int = None,
    temperature: float = None,
    top_p: float = None,
    min_p: float = None,
    top_k: int = None,
    repetition_penalty: float = None,
    do_sample: bool = None,
    presence_penalty: float = None,
    seed: int = None,
    model_default_prompt: str = None,
):
    """
    Generate a text response from the local transformers VLM for a single image.

    When ``VLM_DEFAULT_STREAM`` is True (or None), generation runs in a
    background thread and the text is streamed to the console as it is produced;
    the full string is returned once generation finishes. When
    ``VLM_DEFAULT_STREAM`` is False, a single ``generate`` call is made (no
    console streaming). Both modes return the same
    ``(text, input_tokens, output_tokens)`` tuple.

    Every generation parameter is resolved with the same priority: the function
    argument if it is not None, otherwise the module-level ``model_default_*``
    value if that is not None, otherwise the hard-coded fallback listed below.

    Args:
        text (str): The prompt to send to the vision model. If empty or
            whitespace-only, ``model_default_prompt`` is used instead.
        image (Image.Image): The PIL image to process. If None, an explanatory
            message is returned instead of running the model.
        max_new_tokens (int, optional): Maximum number of new tokens to generate.
            Fallback: ``MAX_NEW_TOKENS`` from config.
        temperature (float, optional): Sampling temperature. Fallback: 0.1.
        top_p (float, optional): Nucleus sampling parameter. Fallback: 0.8.
        min_p (float, optional): Minimum probability threshold for token
            sampling. Fallback: 0.0.
        top_k (int, optional): Top-k sampling parameter. Fallback: 20.
        repetition_penalty (float, optional): Penalty for token repetition.
            Fallback: 1.0.
        do_sample (bool, optional): Passed to ``generate`` as ``do_sample``;
            True enables sampling, False disables it. Fallback: True.
        presence_penalty (float, optional): Penalty for token presence. Only
            forwarded to ``generate`` when a value is resolved and the
            module-level ``model_supports_presence_penalty`` flag is truthy.
            Fallback: None (not sent).
        seed (int, optional): Random seed applied to torch (and CUDA when
            available) before generation. Fallback: None (no seeding).
        model_default_prompt (str, optional): Prompt used when ``text`` is
            empty. If None, "Read all the text in the image." is used.
            NOTE(review): this parameter shadows any module-level name
            ``model_default_prompt``, so a prompt set at model-load time is not
            picked up here unless the caller passes it - confirm this is intended.

    Returns:
        Tuple[str, int, int]: The generated text (after
        ``strip_vlm_thinking_tags``), estimated input tokens and estimated
        output tokens. If ``image`` is None, ``SHOW_VLM_MODEL_OPTIONS`` is
        falsy, or no model/processor is loaded, an explanatory message is
        returned with both token counts set to 0.

    Raises:
        Exception: In streaming mode, any exception raised by
            ``model.generate`` in the background thread is re-raised in the
            calling thread once the stream has ended.
    """
    if image is None:
        return "Please upload an image.", 0, 0

    if not SHOW_VLM_MODEL_OPTIONS:
        return (
            "Local transformers VLM is not enabled (SHOW_VLM_MODEL_OPTIONS=False).",
            0,
            0,
        )

    ensure_transformers_vlm_loaded()
    if model is None or processor is None:
        return (
            "No local transformers VLM is loaded. Check SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL "
            "or prior load errors (e.g. set LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True to load at startup).",
            0,
            0,
        )

    # Resolve every parameter with priority:
    # function argument (if not None) > model default > general fallback.

    # Text/prompt handling
    if text and text.strip():
        actual_text = text
    elif model_default_prompt is not None:
        actual_text = model_default_prompt
    else:
        actual_text = "Read all the text in the image."  # General default

    # max_new_tokens
    if max_new_tokens is not None:
        actual_max_new_tokens = max_new_tokens
    elif model_default_max_new_tokens is not None:
        actual_max_new_tokens = model_default_max_new_tokens
    else:
        actual_max_new_tokens = MAX_NEW_TOKENS  # General default (from config)

    # temperature
    if temperature is not None:
        actual_temperature = temperature
    elif model_default_temperature is not None:
        actual_temperature = model_default_temperature
    else:
        actual_temperature = 0.1

    # top_p
    if top_p is not None:
        actual_top_p = top_p
    elif model_default_top_p is not None:
        actual_top_p = model_default_top_p
    else:
        actual_top_p = 0.8

    # min_p
    if min_p is not None:
        actual_min_p = min_p
    elif model_default_min_p is not None:
        actual_min_p = model_default_min_p
    else:
        actual_min_p = 0.0

    # top_k
    if top_k is not None:
        actual_top_k = top_k
    elif model_default_top_k is not None:
        actual_top_k = model_default_top_k
    else:
        actual_top_k = 20

    # repetition_penalty
    if repetition_penalty is not None:
        actual_repetition_penalty = repetition_penalty
    elif model_default_repetition_penalty is not None:
        actual_repetition_penalty = model_default_repetition_penalty
    else:
        actual_repetition_penalty = 1.0

    # do_sample
    if do_sample is not None:
        actual_do_sample = do_sample
    elif model_default_do_sample is not None:
        actual_do_sample = model_default_do_sample
    else:
        actual_do_sample = True

    # presence_penalty: no general fallback, stays None when nothing is set
    actual_presence_penalty = None
    if presence_penalty is not None:
        actual_presence_penalty = presence_penalty
    elif model_default_presence_penalty is not None:
        actual_presence_penalty = model_default_presence_penalty

    # seed: no general fallback, stays None (no seeding) when nothing is set
    actual_seed = None
    if seed is not None:
        actual_seed = seed
    elif model_default_seed is not None:
        actual_seed = model_default_seed

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": actual_text},
            ],
        }
    ]
    # Build prompt: when disabling Qwen3.5 thinking we append the configured no-think suffix
    # after the generation prompt so the model sees it and continues with the answer (avoids
    # continue_final_message which can fail when the chat template does not include the final
    # assistant message in the rendered string).
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    if VLM_DISABLE_QWEN3_5_THINKING:
        prompt_full = prompt_full + VLM_QWEN3_5_NOTHINK_SUFFIX

    # Cap max_pixels so image tokens + text fit within MAX_INPUT_TOKEN_LENGTH (image token count scales with resolution).
    # Reserve ~1k tokens for prompt; allow max_pixels below VLM_MIN_IMAGE_SIZE when context is small to avoid VRAM spike.
    _ref_context = 32768
    _reserve_text = 1024
    _effective_max_pixels = min(
        VLM_MAX_IMAGE_SIZE,
        (
            VLM_MAX_IMAGE_SIZE
            * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve_text)
            // _ref_context
        )
        // 1024
        * 1024,
    )
    _absolute_min_pixels = 65536  # 256*256 so image remains usable
    effective_max_pixels = max(_absolute_min_pixels, _effective_max_pixels)
    # Don't force upscaling above our cap: min_pixels must not exceed max_pixels
    effective_min_pixels = min(VLM_MIN_IMAGE_SIZE, effective_max_pixels)

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        min_pixels=effective_min_pixels,
        max_pixels=effective_max_pixels,
        truncation=True,
        max_length=MAX_INPUT_TOKEN_LENGTH,
    ).to(device)

    # Stream unless streaming has been explicitly disabled in config
    use_stream = VLM_DEFAULT_STREAM if VLM_DEFAULT_STREAM is not None else True

    # Set random seed if specified
    if actual_seed is not None:
        torch.manual_seed(actual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(actual_seed)

    # Build generation kwargs with resolved parameters
    generation_kwargs = {
        **inputs,
        "max_new_tokens": actual_max_new_tokens,
        "do_sample": actual_do_sample,
        "temperature": actual_temperature,
        "top_p": actual_top_p,
        "min_p": actual_min_p,
        "top_k": actual_top_k,
        "repetition_penalty": actual_repetition_penalty,
    }

    # Only forward presence_penalty when a value was resolved and the module-level
    # model_supports_presence_penalty flag says the loaded model accepts it.
    if actual_presence_penalty is not None and model_supports_presence_penalty:
        generation_kwargs["presence_penalty"] = actual_presence_penalty

    start_time = time.time()
    buffer = ""

    if use_stream:
        streamer = TextIteratorStreamer(
            processor, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs["streamer"] = streamer
        generation_errors = []

        def _generate_in_background():
            # If generate() raises, nothing signals end-of-stream and the
            # consumer loop below would block forever. Record the error and
            # end the stream so the caller can re-raise it.
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                generation_errors.append(exc)
                streamer.end()

        thread = Thread(target=_generate_in_background)
        thread.start()

        line_buffer = ""  # Accumulate text for the current line
        for new_text in streamer:
            buffer += new_text
            buffer = buffer.replace("<|im_end|>", "")
            line_buffer += new_text

            # Print to console as it streams
            print(new_text, end="", flush=True)

            # If we hit a newline, report the entire accumulated line to GUI
            if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
                # Split by newline to handle the line(s) we just completed
                parts = line_buffer.split("\n")
                # Report all complete lines (everything except the last part which may be incomplete)
                for line in parts[:-1]:
                    if line.strip():  # Only report non-empty lines
                        gr.Info(line, duration=2)
                # Keep the last part (after the last newline) for the next line
                line_buffer = parts[-1] if parts else ""

        # Print final newline after streaming is complete
        print()  # Add newline at the end

        # Wait for the worker to finish, then surface any generation failure
        # in the calling thread instead of silently losing it.
        thread.join()
        if generation_errors:
            raise generation_errors[0]
    else:
        with torch.inference_mode():
            output_ids = model.generate(**generation_kwargs)
        # generate() returns prompt + completion; keep only the new tokens
        prompt_len = inputs["input_ids"].shape[1]
        new_token_ids = output_ids[:, prompt_len:]
        buffer = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0]
        buffer = buffer.replace("<|im_end|>", "")
        if REPORT_VLM_OUTPUTS_TO_GUI and buffer.strip():
            for line in buffer.split("\n"):
                if line.strip():
                    gr.Info(line, duration=2)

    end_time = time.time()

    # Estimate token usage for local models
    # For local transformers models, we can estimate using the tokenizer if available
    input_tokens = 0
    output_tokens = 0
    try:
        if (
            processor
            and hasattr(processor, "tokenizer")
            and processor.tokenizer is not None
        ):
            # Estimate input tokens from prompt and image
            # Note: Vision models encode images differently, so this is an approximation
            prompt_tokens = len(
                processor.tokenizer.encode(actual_text, add_special_tokens=False)
            )
            # Rough estimate: assume image tokens are proportional to image size
            # This is a rough approximation - actual vision tokenization is more complex
            image_tokens_estimate = (
                image.size[0] * image.size[1]
            ) // 1000  # Rough estimate
            input_tokens = prompt_tokens + image_tokens_estimate

            # Estimate output tokens from generated text
            output_tokens = len(
                processor.tokenizer.encode(buffer, add_special_tokens=False)
            )
    except Exception:
        # Best-effort: if token counting fails, use rough word-based estimates
        input_tokens = len(actual_text.split()) * 2  # Rough estimate
        output_tokens = len(buffer.split()) * 2  # Rough estimate

    duration = end_time - start_time
    tokens_per_second = output_tokens / duration if duration > 0 else 0

    print("\n--- Performance ---")
    print(f"Time taken: {duration:.2f} seconds")
    print(f"Generated tokens: {output_tokens}")
    print(f"Tokens per second: {tokens_per_second:.2f}")

    buffer = strip_vlm_thinking_tags(buffer)

    # Return the complete text and token estimates
    return buffer, input_tokens, output_tokens


# Optional extra prompt rule describing the bounding box coordinate range.
# It is only added when ADD_VLM_BOUNDING_BOX_RULES is set and none of the enabled
# VLM backends (local transformers, inference server, Bedrock) is a Qwen model:
# Qwen models don't need the additional guidance as they have already been
# trained in this coordinate system.
additional_bounding_box_rules = ""
if ADD_VLM_BOUNDING_BOX_RULES and not any(
    backend_enabled and "qwen" in str(model_choice).lower()
    for model_choice, backend_enabled in (
        (SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL, SHOW_VLM_MODEL_OPTIONS),
        (DEFAULT_INFERENCE_SERVER_VLM_MODEL, SHOW_INFERENCE_SERVER_VLM_OPTIONS),
        (CLOUD_VLM_MODEL_CHOICE, SHOW_BEDROCK_VLM_MODELS),
    )
):
    additional_bounding_box_rules = "\n- Bounding boxes should fit within the coordinate extents of the image: 0, 0 is the top left corner of the image, and 999, 999 is the bottom right corner of the image"

# Prompt for full-page, line-level OCR. Asks for a JSON list of entries with
# keys bbox, text and conf, one entry per horizontal line of text.
# additional_bounding_box_rules is interpolated at the end of the bounding-box
# rule and is an empty string when no extra guidance is configured.
full_page_ocr_vlm_prompt = f"""Spot all the text in the image at line-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified text', 'conf': 'confidence score 0-1.0'}}, ...].

IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.

Rules:
- Each line must be on a separate horizontal row in the image
- Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line)
- If text spans multiple horizontal lines, split it into separate entries (one per line)
- The text should not contain any formatting tags unless they are explicitly written in the text (e.g. the text is html or markdown)
- Do NOT combine lines that appear on different horizontal rows
- Each bounding box should tightly fit around a single horizontal line of text{additional_bounding_box_rules}
- Empty lines should be skipped
- Use keys bbox, text, and conf; 'conf' must be a numeric confidence from 0-1


# Only return valid JSON, no additional text or explanation."""

# Prompt for detecting photos of people's faces on a page. Asks for a JSON list
# of entries with keys bbox, text and conf, where text is always the literal
# '[FACE]', or an empty list when no faces are found.
# additional_bounding_box_rules is interpolated at the end of the bounding-box
# rule and is an empty string when no extra guidance is configured.
full_page_ocr_people_vlm_prompt = f"""Spot all photos of people's faces in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[FACE]', 'conf': 'confidence score 0-1.0'}}, ...].

Rules:
- If there are no photos of people's faces in the image, return an empty JSON array []
- If you are not confident that the detected object is a photo of a person's face, do not include it in the results. Only return results for objects that are clearly photos of people's faces. If in doubt, do not include it in the results.
- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
- Each identified photo of a person's face with high confidence should be a separate JSON entry
- Only include photos of people's faces in the results, not a drawing or sketch
- Bounding boxes around an identified person's face should completely cover the person's face{additional_bounding_box_rules}
- 'text' must be exactly the string '[FACE]' (no other wording)
- 'conf' should be a numeric confidence from 0-1
- Do NOT include any other text or information in the JSON


# Only return valid JSON, no additional text or explanation."""

# Prompt for detecting handwritten signatures on a page. Asks for a JSON list of
# entries with keys bbox, text and conf, where text is always the literal
# '[SIGNATURE]', or an empty list when no signatures are found.
# additional_bounding_box_rules is interpolated at the end of the bounding-box
# rule and is an empty string when no extra guidance is configured.
full_page_ocr_signature_vlm_prompt = f"""Spot all handwritten signatures in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[SIGNATURE]', 'conf': 'confidence score 0-1.0'}}, ...].

Rules:
- If there are no handwritten signatures in the image, return an empty JSON array []
- If you are not confident that the detected object is a handwritten signature, do not include it in the results. Only return results for objects that are clearly handwritten signatures. If in doubt, do not include it in the results.
- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
- Each identified handwritten signature with high confidence should be a separate JSON entry
- Bounding boxes around an identified handwritten signature should completely cover the signature{additional_bounding_box_rules}
- 'text' must be exactly the string '[SIGNATURE]' (no other wording)
- 'conf' should be a numeric confidence from 0-1
- Do NOT include any other text or information in the JSON.

# Only return valid JSON, no additional text or explanation."""

# Test for word-level OCR with VLMs - makes some mistakes but not bad.
# Asks for a JSON list of entries with keys bbox, text and conf, one entry per
# individual word, ordered top to bottom and left to right.
# additional_bounding_box_rules is interpolated at the end of the bounding-box
# rule and is an empty string when no extra guidance is configured.
full_page_ocr_vlm_words_prompt = f"""Spot all the text in the image at word-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified word', 'conf': 'confidence score 0-1.0'}}, ...].

IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.

Rules:
- Each entry should correspond to a single distinct word (not groups of words, not whole lines)
- For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word{additional_bounding_box_rules}
- For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON.
- Do not merge words. Do not split words into letters. Only return one entry per word
- Maintain the order of words as they appear spatially from top to bottom, left to right
- Skip any empty or whitespace-only entries
- Do not include extraneous text, explanations, or formatting beyond the required JSON

Only return valid JSON, no additional text or explanation."""