import os import sys import time from threading import Lock, Thread import gradio as gr import spaces from PIL import Image from tools.config import ( ADD_VLM_BOUNDING_BOX_RULES, CLOUD_VLM_MODEL_CHOICE, DEFAULT_INFERENCE_SERVER_VLM_MODEL, LOAD_PADDLE_AT_STARTUP, LOAD_TRANSFORMERS_VLM_MODEL_AT_START, MAX_INPUT_TOKEN_LENGTH, MAX_NEW_TOKENS, MAX_SPACES_GPU_RUN_TIME, MAX_WORKERS, PADDLE_DET_DB_UNCLIP_RATIO, PADDLE_FONT_PATH, PADDLE_MODEL_PATH, PADDLE_USE_TEXTLINE_ORIENTATION, QUANTISE_VLM_MODELS, REPORT_VLM_OUTPUTS_TO_GUI, SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL, SHOW_BEDROCK_VLM_MODELS, SHOW_INFERENCE_SERVER_VLM_OPTIONS, SHOW_VLM_MODEL_OPTIONS, VLM_DEFAULT_DO_SAMPLE, VLM_DEFAULT_MIN_P, VLM_DEFAULT_PRESENCE_PENALTY, VLM_DEFAULT_REPETITION_PENALTY, VLM_DEFAULT_STREAM, VLM_DEFAULT_TEMPERATURE, VLM_DEFAULT_TOP_K, VLM_DEFAULT_TOP_P, VLM_DISABLE_QWEN3_5_THINKING, VLM_MAX_IMAGE_SIZE, VLM_MIN_IMAGE_SIZE, VLM_QWEN3_5_NOTHINK_SUFFIX, VLM_SEED, ) from tools.helper_functions import get_system_font_path, strip_vlm_thinking_tags from tools.inference_attention import ( log_attn_implementation_choice, resolve_attn_implementation, ) text_read_default_prompt = """Read the main line of text in the image, and return JSON with keys "text" (string) and "conf" (number 0–1) for confidence in your identification, e.g. {"text": "read text", "conf": 0.95}. Do not include any other keys in the JSON. Ignore any words that are not part of the main line of text closest to the center of the image. Ensure that spaces between words and upper/lower cases are preserved. If you can't read the text, return an empty string "".""" if LOAD_PADDLE_AT_STARTUP: # Set PaddleOCR environment variables BEFORE importing PaddleOCR # This ensures fonts are configured before the package loads # Set PaddleOCR model directory environment variable (only if specified). if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip(): os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}") else: print("Using default PaddleOCR model storage location") # Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf # This MUST be set before importing PaddleOCR to prevent font downloads if ( PADDLE_FONT_PATH and PADDLE_FONT_PATH.strip() and os.path.exists(PADDLE_FONT_PATH) ): os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}") else: system_font_path = get_system_font_path() if system_font_path: os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path print(f"Setting PaddleOCR font path to system font: {system_font_path}") else: print( "Warning: No suitable system font found. PaddleOCR may download default fonts." ) try: from paddleocr import PaddleOCR print("PaddleOCR imported successfully") paddle_kwargs = None # Default paddle configuration if none provided if paddle_kwargs is None: paddle_kwargs = { "text_detection_model_name": "PP-OCRv6_medium_det", "text_recognition_model_name": "PP-OCRv6_medium_rec", "engine": "transformers", "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO, "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION, "use_doc_orientation_classify": False, "use_doc_unwarping": False, "lang": "en", } else: # Enforce language if not explicitly provided paddle_kwargs.setdefault("lang", "en") try: PaddleOCR(**paddle_kwargs) except Exception as e: # Handle DLL loading errors (common on Windows with GPU version) if ( "WinError 127" in str(e) or "could not be found" in str(e).lower() or "dll" in str(e).lower() ): print( f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}" ) print("PaddleOCR will not be available. To fix GPU issues:") print("1. Install Visual C++ Redistributables (latest version)") print("2. Ensure CUDA runtime libraries are in your PATH") print( "3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle" ) raise ImportError( f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry." ) else: raise e except ImportError: PaddleOCR = None print( "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry." ) # Module-level refs to loaded VLM model/processor (set when SHOW_VLM_MODEL_OPTIONS and model is loaded). Used by LLM entity detection when USE_TRANSFORMERS_VLM_MODEL_AS_LLM. _loaded_vlm_model = None _loaded_vlm_processor = None # Define module-level defaults for model parameters (always available for import) # These will be overridden inside the SHOW_VLM_MODEL_OPTIONS block if enabled model_default_prompt = text_read_default_prompt model_default_do_sample = ( VLM_DEFAULT_DO_SAMPLE if VLM_DEFAULT_DO_SAMPLE is not None else True ) model_default_top_p = VLM_DEFAULT_TOP_P if VLM_DEFAULT_TOP_P is not None else None model_default_min_p = VLM_DEFAULT_MIN_P if VLM_DEFAULT_MIN_P is not None else None model_default_top_k = VLM_DEFAULT_TOP_K if VLM_DEFAULT_TOP_K is not None else None model_default_temperature = ( VLM_DEFAULT_TEMPERATURE if VLM_DEFAULT_TEMPERATURE is not None else None ) model_default_repetition_penalty = ( VLM_DEFAULT_REPETITION_PENALTY if VLM_DEFAULT_REPETITION_PENALTY is not None else None ) model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY model_default_max_new_tokens = int(MAX_NEW_TOKENS) model_default_seed = VLM_SEED if VLM_SEED is not None else None _load_vlm_weights_fn = None _vlm_load_lock = Lock() _transformers_vlm_weights_loaded = False def ensure_transformers_vlm_loaded(): """Load local transformers VLM weights once (thread-safe). No-op if VLM options disabled.""" global _transformers_vlm_weights_loaded if not SHOW_VLM_MODEL_OPTIONS: return if _transformers_vlm_weights_loaded: return fn = _load_vlm_weights_fn if fn is None: return with _vlm_load_lock: if _transformers_vlm_weights_loaded: return fn() _transformers_vlm_weights_loaded = True if SHOW_VLM_MODEL_OPTIONS is True: try: import torch # type: ignore except ModuleNotFoundError: # Keep base installs usable without heavy optional deps. SHOW_VLM_MODEL_OPTIONS = False torch = None # type: ignore print( "VLM options disabled because 'torch' is not installed. " 'Install with the extra: pip install "doc_redaction[vlm]"' ) if SHOW_VLM_MODEL_OPTIONS is True: from huggingface_hub import snapshot_download from transformers import AutoConfig, BitsAndBytesConfig, TextIteratorStreamer from tools.config import ( MAX_INPUT_TOKEN_LENGTH, MAX_NEW_TOKENS, MODEL_CACHE_PATH, OVERRIDE_VLM_REPO_ID, QUANTISE_VLM_MODELS, SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL, VLM_DEFAULT_DO_SAMPLE, VLM_DEFAULT_MIN_P, VLM_DEFAULT_PRESENCE_PENALTY, VLM_DEFAULT_REPETITION_PENALTY, VLM_DEFAULT_TEMPERATURE, VLM_DEFAULT_TOP_K, VLM_DEFAULT_TOP_P, VLM_SEED, ) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("torch.__version__ =", torch.__version__) # print("torch.version.cuda =", torch.version.cuda) print("cuda available:", torch.cuda.is_available()) # print("cuda device count:", torch.cuda.device_count()) if torch.cuda.is_available(): # print("current device:", torch.cuda.current_device()) print("device name:", torch.cuda.get_device_name(torch.cuda.current_device())) # print("Using device:", device) CACHE_PATH = MODEL_CACHE_PATH if not os.path.exists(CACHE_PATH): os.makedirs(CACHE_PATH) # Initialize model and processor variables processor = None model = None # Initialize model-specific generation parameters (will be set by specific models if needed) # If config values are provided, use them; otherwise leave as None to use model defaults model_default_prompt = text_read_default_prompt model_default_do_sample = ( VLM_DEFAULT_DO_SAMPLE if VLM_DEFAULT_DO_SAMPLE is not None else None ) model_default_top_p = VLM_DEFAULT_TOP_P if VLM_DEFAULT_TOP_P is not None else None model_default_min_p = VLM_DEFAULT_MIN_P if VLM_DEFAULT_MIN_P is not None else None model_default_top_k = VLM_DEFAULT_TOP_K if VLM_DEFAULT_TOP_K is not None else None model_default_temperature = ( VLM_DEFAULT_TEMPERATURE if VLM_DEFAULT_TEMPERATURE is not None else None ) model_default_repetition_penalty = ( VLM_DEFAULT_REPETITION_PENALTY if VLM_DEFAULT_REPETITION_PENALTY is not None else None ) model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY model_default_max_new_tokens = int(MAX_NEW_TOKENS) # Track which models support presence_penalty (only Qwen3-VL models currently) model_supports_presence_penalty = False model_default_seed = VLM_SEED if VLM_SEED is not None else None attn_implementation = resolve_attn_implementation() log_attn_implementation_choice() # Setup quantisation config if enabled quantization_config = None if QUANTISE_VLM_MODELS is True: if not torch.cuda.is_available(): print( "Warning: 4-bit quantisation requires CUDA, but CUDA is not available." ) print("Falling back to loading models without quantisation") quantization_config = None else: try: quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", ) print("Created quantization config for bitsandbytes") except Exception as e: print(f"Warning: Could not setup bitsandbytes quantization: {e}") print("Falling back to loading models without quantization") quantization_config = None def _get_vlm_config_capped_length(model_id): """Load model config with max_position_embeddings capped to MAX_INPUT_TOKEN_LENGTH to reduce VRAM (KV cache).""" config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) cap = MAX_INPUT_TOKEN_LENGTH if getattr(config, "max_position_embeddings", None) is not None: if config.max_position_embeddings > cap: config.max_position_embeddings = cap if getattr(config, "text_config", None) is not None: tc = config.text_config if ( getattr(tc, "max_position_embeddings", None) is not None and tc.max_position_embeddings > cap ): tc.max_position_embeddings = cap return config def _load_vlm_weights_and_finalize(): global model, processor, _loaded_vlm_model, _loaded_vlm_processor global model_default_prompt, model_default_do_sample, model_default_top_p, model_default_min_p, model_default_top_k global model_default_temperature, model_default_repetition_penalty, model_default_presence_penalty global model_default_max_new_tokens, model_default_seed, model_supports_presence_penalty def _apply_generation_family_defaults(defaults: dict): """ Apply a shared set of generation defaults for a model family. These are *model defaults* (later overridden by config/env VLM_DEFAULT_* if set). """ # Note: we intentionally assign to the module-level "model_default_*" globals declared above. global model_default_do_sample, model_default_top_p, model_default_min_p, model_default_top_k global model_default_temperature, model_default_repetition_penalty, model_default_presence_penalty global model_default_max_new_tokens, model_supports_presence_penalty if not defaults: return if "do_sample" in defaults: model_default_do_sample = defaults["do_sample"] if "top_p" in defaults: model_default_top_p = defaults["top_p"] if "min_p" in defaults: model_default_min_p = defaults["min_p"] if "top_k" in defaults: model_default_top_k = defaults["top_k"] if "temperature" in defaults: model_default_temperature = defaults["temperature"] if "repetition_penalty" in defaults: model_default_repetition_penalty = defaults["repetition_penalty"] if "presence_penalty" in defaults: model_default_presence_penalty = defaults["presence_penalty"] if "max_new_tokens" in defaults: model_default_max_new_tokens = defaults["max_new_tokens"] if "supports_presence_penalty" in defaults: model_supports_presence_penalty = defaults["supports_presence_penalty"] # Shared generation defaults (top_p/top_k/etc.) by model family to avoid repeating values in each model block. # These are applied as "model defaults" and can still be overridden by VLM_DEFAULT_* config later. _QWEN3_VL_FAMILY_DEFAULTS = { "top_p": 0.8, "min_p": 0.0, "top_k": 20, "temperature": 0.7, "repetition_penalty": 1.0, "presence_penalty": 1.0, "max_new_tokens": MAX_NEW_TOKENS, # I found that this doesn't work when using transformers "supports_presence_penalty": False, } _QWEN3_5_FAMILY_DEFAULTS = dict(_QWEN3_VL_FAMILY_DEFAULTS) _GEMMA4_FAMILY_DEFAULTS = { "top_p": 0.95, "top_k": 64, "temperature": 1.0, "max_new_tokens": MAX_NEW_TOKENS, # I found that this doesn't work when using transformers "supports_presence_penalty": False, } # print(f"Loading vision model: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}") # Load only the selected model based on configuration if SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Nanonets-OCR2-3B": MODEL_ID = "nanonets/Nanonets-OCR2-3B" from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config load_kwargs["device_map"] = "auto" else: load_kwargs["torch_dtype"] = torch.float16 model = Qwen2_5_VLForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() if quantization_config is None: model = model.to(device) model_default_prompt = text_read_default_prompt elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Dots.OCR": # Download and patch Dots.OCR model model_path_d_local = snapshot_download( repo_id="rednote-hilab/dots.ocr", local_dir=os.path.join(CACHE_PATH, "dots.ocr"), max_workers=MAX_WORKERS, local_dir_use_symlinks=False, ) config_file_path = os.path.join(model_path_d_local, "configuration_dots.py") if os.path.exists(config_file_path): with open(config_file_path, "r") as f: input_code = f.read() lines = input_code.splitlines() if "class DotsVLProcessor" in input_code and not any( "attributes = " in line for line in lines ): output_lines = [] for line in lines: output_lines.append(line) if line.strip().startswith("class DotsVLProcessor"): output_lines.append( ' attributes = ["image_processor", "tokenizer"]' ) with open(config_file_path, "w") as f: f.write("\n".join(output_lines)) print("Patched configuration_dots.py successfully.") sys.path.append(model_path_d_local) MODEL_ID = model_path_d_local if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID from transformers import AutoModelForCausalLM, AutoProcessor processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["torch_dtype"] = torch.bfloat16 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval() model_default_prompt = text_read_default_prompt model_default_max_new_tokens = MAX_NEW_TOKENS elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "PaddleOCR-VL": MODEL_ID = "PaddlePaddle/PaddleOCR-VL" from transformers import AutoModelForCausalLM, AutoProcessor if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID load_kwargs = { "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config load_kwargs["device_map"] = "auto" else: load_kwargs["torch_dtype"] = torch.bfloat16 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval() if quantization_config is None: model = model.to(device) processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) model_default_prompt = """OCR:""" model_default_max_new_tokens = MAX_NEW_TOKENS ### # QWEN 3-VL MODELS ### elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-2B-Instruct": MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct" from transformers import AutoProcessor, Qwen3VLForConditionalGeneration if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-4B-Instruct": MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct" from transformers import AutoProcessor, Qwen3VLForConditionalGeneration if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-8B-Instruct": MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct" from transformers import AutoProcessor, Qwen3VLForConditionalGeneration if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-32B-Instruct": MODEL_ID = "Qwen/Qwen3-VL-32B-Instruct" from transformers import AutoProcessor, Qwen3VLForConditionalGeneration if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-30B-A3B-Instruct": MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLMoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3-VL-235B-A22B-Instruct-FP8": MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct-FP8" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3VLMoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ).eval() model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_VL_FAMILY_DEFAULTS) ### # QWEN 3.5 MODELS ### elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-0.8B": from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration MODEL_ID = "Qwen/Qwen3.5-0.8B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-2B": from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration MODEL_ID = "Qwen/Qwen3.5-2B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-4B": from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration MODEL_ID = "Qwen/Qwen3.5-4B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-9B": from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration MODEL_ID = "Qwen/Qwen3.5-9B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B": from transformers import ( AutoProcessor, Qwen3_5ForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.5-27B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-27B-bnb-4bit": from transformers import ( AutoProcessor, Qwen3_5ForConditionalGeneration, ) MODEL_ID = "bertbobson/Qwen3.5-27B-bnb-4bit" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-35B-A3B": from transformers import ( AutoProcessor, Qwen3_5MoeForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.5-35B-A3B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5MoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-27B": from transformers import ( AutoProcessor, Qwen3_5ForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.6-27B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-27B-bnb-4bit": from transformers import ( AutoProcessor, Qwen3_5ForConditionalGeneration, ) MODEL_ID = "samajlouis/Qwen3.6-27B-bnb-nf4" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5ForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.6-35B-A3B": from transformers import ( AutoProcessor, Qwen3_5MoeForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.6-35B-A3B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5MoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-122B-A10B": from transformers import ( AutoProcessor, Qwen3_5MoeForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.5-122B-A10B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5MoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Qwen3.5-397B-A17B": from transformers import ( AutoProcessor, Qwen3_5MoeForConditionalGeneration, ) MODEL_ID = "Qwen/Qwen3.5-397B-A17B" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = Qwen3_5MoeForConditionalGeneration.from_pretrained( MODEL_ID, **load_kwargs ) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_QWEN3_5_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "Gemma 4 31B bnb": from transformers import AutoModelForCausalLM, AutoProcessor MODEL_ID = "unsloth/gemma-4-31B-it-unsloth-bnb-4bit" if OVERRIDE_VLM_REPO_ID: MODEL_ID = OVERRIDE_VLM_REPO_ID processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) load_kwargs = { "attn_implementation": attn_implementation, "device_map": "auto", "trust_remote_code": True, "config": _get_vlm_config_capped_length(MODEL_ID), } if quantization_config is not None: load_kwargs["quantization_config"] = quantization_config else: load_kwargs["dtype"] = "auto" model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs) model_default_prompt = text_read_default_prompt _apply_generation_family_defaults(_GEMMA4_FAMILY_DEFAULTS) elif SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL == "None": model = None processor = None else: raise ValueError( f"Invalid model selected: {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}. Valid options are shown in tools/run_vlm.py, or None" ) # Override model defaults with user-provided config values if they are set # Priority: user config value > model default if VLM_DEFAULT_DO_SAMPLE is not None: model_default_do_sample = VLM_DEFAULT_DO_SAMPLE if VLM_DEFAULT_TOP_P is not None: model_default_top_p = VLM_DEFAULT_TOP_P if VLM_DEFAULT_MIN_P is not None: model_default_min_p = VLM_DEFAULT_MIN_P if VLM_DEFAULT_TOP_K is not None: model_default_top_k = VLM_DEFAULT_TOP_K if VLM_DEFAULT_TEMPERATURE is not None: model_default_temperature = VLM_DEFAULT_TEMPERATURE if VLM_DEFAULT_REPETITION_PENALTY is not None: model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY if VLM_DEFAULT_PRESENCE_PENALTY is not None: model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY if VLM_SEED is not None: model_default_seed = VLM_SEED # Cap processor tokenizer to config max context length so all tokenization respects MAX_INPUT_TOKEN_LENGTH if processor is not None: tokenizer = getattr(processor, "tokenizer", None) if tokenizer is not None and hasattr(tokenizer, "model_max_length"): current_max = tokenizer.model_max_length if current_max is None or current_max == float("inf"): tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH elif current_max > MAX_INPUT_TOKEN_LENGTH: tokenizer.model_max_length = MAX_INPUT_TOKEN_LENGTH # Log effective VLM context cap so env (e.g. MAX_INPUT_TOKEN_LENGTH=4096) can be verified _ref_ctx = 32768 _reserve = 1024 _eff_max = min( VLM_MAX_IMAGE_SIZE, (VLM_MAX_IMAGE_SIZE * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve) // _ref_ctx) // 1024 * 1024, ) _abs_min = 65536 effective_max_pixels_at_load = max(_abs_min, _eff_max) effective_min_pixels_at_load = min( VLM_MIN_IMAGE_SIZE, effective_max_pixels_at_load ) if SHOW_VLM_MODEL_OPTIONS: print( f"VLM context cap: MAX_INPUT_TOKEN_LENGTH={MAX_INPUT_TOKEN_LENGTH}, " f"effective max_pixels={effective_max_pixels_at_load}, min_pixels={effective_min_pixels_at_load} " f"(VLM_MAX_IMAGE_SIZE={VLM_MAX_IMAGE_SIZE}, VLM_MIN_IMAGE_SIZE={VLM_MIN_IMAGE_SIZE})" ) # Store at module level for USE_TRANSFORMERS_VLM_MODEL_AS_LLM (no global needed at module level) _loaded_vlm_model = model _loaded_vlm_processor = processor _load_vlm_weights_fn = _load_vlm_weights_and_finalize # print(f"Successfully loaded {SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL}") if SHOW_VLM_MODEL_OPTIONS and LOAD_TRANSFORMERS_VLM_MODEL_AT_START: try: ensure_transformers_vlm_loaded() except Exception as e: print(f"Warning: Could not load transformers VLM model at startup: {e}") print("The VLM will be loaded on first use when a VLM OCR task runs.") def get_loaded_vlm_model_and_tokenizer(): """ Return the currently loaded VLM model and its tokenizer for use by LLM tasks (e.g. entity detection) when USE_TRANSFORMERS_VLM_MODEL_AS_LLM is True. Returns (model, tokenizer) or (None, None) if the VLM has not been loaded yet. """ global _loaded_vlm_model, _loaded_vlm_processor ensure_transformers_vlm_loaded() if _loaded_vlm_model is None or _loaded_vlm_processor is None: return None, None tokenizer = getattr(_loaded_vlm_processor, "tokenizer", _loaded_vlm_processor) return _loaded_vlm_model, tokenizer @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME) def extract_text_from_image_vlm( text: str, image: Image.Image, max_new_tokens: int = None, temperature: float = None, top_p: float = None, min_p: float = None, top_k: int = None, repetition_penalty: float = None, do_sample: bool = None, presence_penalty: float = None, seed: int = None, model_default_prompt: str = None, ): """ Generates responses using the configured vision model for image input. When ``VLM_DEFAULT_STREAM`` is True (default), streams text to the console and returns the full string when generation finishes. When ``VLM_DEFAULT_STREAM`` is False, runs a single batched ``generate`` call (no console streaming) and returns the same ``(text, input_tokens, output_tokens)`` tuple. Uses model-specific defaults if they were set during model initialization, falling back to function argument defaults if provided, and finally to sensible general defaults if neither are available. Args: text (str): The text prompt to send to the vision model. If empty and model has a default prompt, the model default will be used. image (Image.Image): The PIL Image to process. Must not be None. max_new_tokens (int, optional): Maximum number of new tokens to generate. Defaults to model-specific value (MAX_NEW_TOKENS for models with defaults) or MAX_NEW_TOKENS from config. temperature (float, optional): Sampling temperature for generation. Defaults to model-specific value (0.7 for Qwen3-VL models) or 0.7. top_p (float, optional): Nucleus sampling parameter (top-p). Defaults to model-specific value (0.8 for Qwen3-VL models) or 0.9. min_p (float, optional): Minimum probability threshold for token sampling. Defaults to model-specific value or 0.0. top_k (int, optional): Top-k sampling parameter. Defaults to model-specific value (20 for Qwen3-VL models) or 50. repetition_penalty (float, optional): Penalty for token repetition. Defaults to model-specific value (1.0 for Qwen3-VL models) or 1.3. do_sample (bool, optional): If True, use sampling (do_sample=True). If False, use sampling (do_sample=True). If None, defaults to False (sampling) for Qwen3-VL models, or True (sampling) for other models. presence_penalty (float, optional): Penalty for token presence. Defaults to model-specific value (1.5 for Qwen3-VL models) or None. Note: Not all models support this parameter. seed (int, optional): Random seed for generation. If None, uses VLM_SEED from config if set, otherwise no seed is set (non-deterministic). model_default_prompt (str, optional): The default prompt to use if no text is provided. Defaults to model-specific value (None for Dots.OCR, "Read all the text in the image." for Qwen3-VL models) or "Read all the text in the image." Returns: Tuple[str, int, int]: The complete generated text response, input tokens (estimated), output tokens (estimated). """ if image is None: return "Please upload an image.", 0, 0 if not SHOW_VLM_MODEL_OPTIONS: return ( "Local transformers VLM is not enabled (SHOW_VLM_MODEL_OPTIONS=False).", 0, 0, ) ensure_transformers_vlm_loaded() if model is None or processor is None: return ( "No local transformers VLM is loaded. Check SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL " "or prior load errors (e.g. set LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True to load at startup).", 0, 0, ) # Determine parameter values with priority: function args > model defaults > general defaults # Priority order: function argument (if not None) > model default > general default # Text/prompt handling if text and text.strip(): actual_text = text elif model_default_prompt is not None: actual_text = model_default_prompt else: actual_text = "Read all the text in the image." # General default # max_new_tokens: function arg > model default > general default if max_new_tokens is not None: actual_max_new_tokens = max_new_tokens elif model_default_max_new_tokens is not None: actual_max_new_tokens = model_default_max_new_tokens else: actual_max_new_tokens = MAX_NEW_TOKENS # General default (from config) # temperature: function arg > model default (which may include config override) if temperature is not None: actual_temperature = temperature elif model_default_temperature is not None: actual_temperature = model_default_temperature else: # Fallback to a sensible default if neither function arg nor model default is set actual_temperature = 0.1 # top_p: function arg > model default (which may include config override) if top_p is not None: actual_top_p = top_p elif model_default_top_p is not None: actual_top_p = model_default_top_p else: # Fallback to a sensible default if neither function arg nor model default is set actual_top_p = 0.8 # min_p: function arg > model default (which may include config override) if min_p is not None: actual_min_p = min_p elif model_default_min_p is not None: actual_min_p = model_default_min_p else: # Fallback to a sensible default if neither function arg nor model default is set actual_min_p = 0.0 # top_k: function arg > model default (which may include config override) if top_k is not None: actual_top_k = top_k elif model_default_top_k is not None: actual_top_k = model_default_top_k else: # Fallback to a sensible default if neither function arg nor model default is set actual_top_k = 20 # repetition_penalty: function arg > model default (which may include config override) if repetition_penalty is not None: actual_repetition_penalty = repetition_penalty elif model_default_repetition_penalty is not None: actual_repetition_penalty = model_default_repetition_penalty else: # Fallback to a sensible default if neither function arg nor model default is set actual_repetition_penalty = 1.0 # do_sample: function arg > model default (which may include config override) if do_sample is not None: actual_do_sample = do_sample elif model_default_do_sample is not None: actual_do_sample = model_default_do_sample else: # Fallback to a sensible default if neither function arg nor model default is set actual_do_sample = True # presence_penalty: function arg > model default (which may include config override) > None actual_presence_penalty = None if presence_penalty is not None: actual_presence_penalty = presence_penalty elif model_default_presence_penalty is not None: actual_presence_penalty = model_default_presence_penalty # seed: function arg > model default (which may include config override) actual_seed = None if seed is not None: actual_seed = seed elif model_default_seed is not None: actual_seed = model_default_seed messages = [ { "role": "user", "content": [ {"type": "image"}, {"type": "text", "text": actual_text}, ], } ] # Build prompt: when disabling Qwen3.5 thinking we append after the generation # prompt so the model sees it and continues with the answer (avoids continue_final_message # which can fail when the chat template does not include the final assistant message in the # rendered string). prompt_full = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) if VLM_DISABLE_QWEN3_5_THINKING: prompt_full = prompt_full + VLM_QWEN3_5_NOTHINK_SUFFIX # Cap max_pixels so image tokens + text fit within MAX_INPUT_TOKEN_LENGTH (image token count scales with resolution). # Reserve ~1k tokens for prompt; allow max_pixels below VLM_MIN_IMAGE_SIZE when context is small to avoid VRAM spike. _ref_context = 32768 _reserve_text = 1024 _effective_max_pixels = min( VLM_MAX_IMAGE_SIZE, ( VLM_MAX_IMAGE_SIZE * max(0, MAX_INPUT_TOKEN_LENGTH - _reserve_text) // _ref_context ) // 1024 * 1024, ) _absolute_min_pixels = 65536 # 256*256 so image remains usable effective_max_pixels = max(_absolute_min_pixels, _effective_max_pixels) # Don't force upscaling above our cap: min_pixels must not exceed max_pixels effective_min_pixels = min(VLM_MIN_IMAGE_SIZE, effective_max_pixels) inputs = processor( text=[prompt_full], images=[image], return_tensors="pt", padding=True, min_pixels=effective_min_pixels, max_pixels=effective_max_pixels, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH, ).to(device) use_stream = VLM_DEFAULT_STREAM if VLM_DEFAULT_STREAM is not None else True # Set random seed if specified if actual_seed is not None: torch.manual_seed(actual_seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(actual_seed) # Build generation kwargs with resolved parameters generation_kwargs = { **inputs, "max_new_tokens": actual_max_new_tokens, "do_sample": actual_do_sample, "temperature": actual_temperature, "top_p": actual_top_p, "min_p": actual_min_p, "top_k": actual_top_k, "repetition_penalty": actual_repetition_penalty, } # Add presence_penalty if it's set and the model supports it # Only Qwen3-VL models currently support presence_penalty if actual_presence_penalty is not None and model_supports_presence_penalty: generation_kwargs["presence_penalty"] = actual_presence_penalty start_time = time.time() buffer = "" if use_stream: streamer = TextIteratorStreamer( processor, skip_prompt=True, skip_special_tokens=True ) generation_kwargs["streamer"] = streamer thread = Thread(target=model.generate, kwargs=generation_kwargs) thread.start() line_buffer = "" # Accumulate text for the current line for new_text in streamer: buffer += new_text buffer = buffer.replace("<|im_end|>", "") line_buffer += new_text # Print to console as it streams print(new_text, end="", flush=True) # If we hit a newline, report the entire accumulated line to GUI if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text: # Split by newline to handle the line(s) we just completed parts = line_buffer.split("\n") # Report all complete lines (everything except the last part which may be incomplete) for line in parts[:-1]: if line.strip(): # Only report non-empty lines gr.Info(line, duration=2) # Keep the last part (after the last newline) for the next line line_buffer = parts[-1] if parts else "" # Print final newline after streaming is complete print() # Add newline at the end else: with torch.inference_mode(): output_ids = model.generate(**generation_kwargs) prompt_len = inputs["input_ids"].shape[1] new_token_ids = output_ids[:, prompt_len:] buffer = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0] buffer = buffer.replace("<|im_end|>", "") if REPORT_VLM_OUTPUTS_TO_GUI and buffer.strip(): for line in buffer.split("\n"): if line.strip(): gr.Info(line, duration=2) end_time = time.time() # Estimate token usage for local models # For local transformers models, we can estimate using the tokenizer if available input_tokens = 0 output_tokens = 0 try: if ( processor and hasattr(processor, "tokenizer") and processor.tokenizer is not None ): # Estimate input tokens from prompt and image # Note: Vision models encode images differently, so this is an approximation prompt_tokens = len( processor.tokenizer.encode(actual_text, add_special_tokens=False) ) # Rough estimate: assume image tokens are proportional to image size # This is a rough approximation - actual vision tokenization is more complex image_tokens_estimate = ( image.size[0] * image.size[1] ) // 1000 # Rough estimate input_tokens = prompt_tokens + image_tokens_estimate # Estimate output tokens from generated text output_tokens = len( processor.tokenizer.encode(buffer, add_special_tokens=False) ) except Exception: # If token counting fails, use rough word-based estimates input_tokens = len(actual_text.split()) * 2 # Rough estimate output_tokens = len(buffer.split()) * 2 # Rough estimate duration = end_time - start_time tokens_per_second = output_tokens / duration if duration > 0 else 0 print("\n--- Performance ---") print(f"Time taken: {duration:.2f} seconds") print(f"Generated tokens: {output_tokens}") print(f"Tokens per second: {tokens_per_second:.2f}") buffer = strip_vlm_thinking_tags(buffer) # Return the complete text and token estimates return buffer, input_tokens, output_tokens # Optionally, give some more guidance on bounding box coordinates if ADD_VLM_BOUNDING_BOX_RULES: # Qwen models don't need the additional bounding box guidance as they have already been trained in this coordinate system if ( ( "qwen" in str(SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL).lower() and SHOW_VLM_MODEL_OPTIONS ) or ( "qwen" in str(DEFAULT_INFERENCE_SERVER_VLM_MODEL).lower() and SHOW_INFERENCE_SERVER_VLM_OPTIONS ) or ("qwen" in str(CLOUD_VLM_MODEL_CHOICE).lower() and SHOW_BEDROCK_VLM_MODELS) ): additional_bounding_box_rules = "" else: additional_bounding_box_rules = "\n- Bounding boxes should fit within the coordinate extents of the image: 0, 0 is the top left corner of the image, and 999, 999 is the bottom right corner of the image" else: additional_bounding_box_rules = "" full_page_ocr_vlm_prompt = f"""Spot all the text in the image at line-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified text', 'conf': 'confidence score 0-1.0'}}, ...]. IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry. Rules: - Each line must be on a separate horizontal row in the image - Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line) - If text spans multiple horizontal lines, split it into separate entries (one per line) - The text should not contain any formatting tags unless they are explicitly written in the text (e.g. the text is html or markdown) - Do NOT combine lines that appear on different horizontal rows - Each bounding box should tightly fit around a single horizontal line of text{additional_bounding_box_rules} - Empty lines should be skipped - Use keys bbox, text, and conf; 'conf' must be a numeric confidence from 0-1 # Only return valid JSON, no additional text or explanation.""" full_page_ocr_people_vlm_prompt = f"""Spot all photos of people's faces in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[FACE]', 'conf': 'confidence score 0-1.0'}}, ...]. Rules: - If there are no photos of people's faces in the image, return an empty JSON array [] - If you are not confident that the detected object is a photo of a person's face, do not include it in the results. Only return results for objects that are clearly photos of people's faces. If in doubt, do not include it in the results. - For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON. - Each identified photo of a person's face with high confidence should be a separate JSON entry - Only include photos of people's faces in the results, not a drawing or sketch - Bounding boxes around an identified person's face should completely cover the person's face{additional_bounding_box_rules} - 'text' must be exactly the string '[FACE]' (no other wording) - 'conf' should be a numeric confidence from 0-1 - Do NOT include any other text or information in the JSON # Only return valid JSON, no additional text or explanation.""" full_page_ocr_signature_vlm_prompt = f"""Spot all handwritten signatures in the image, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': '[SIGNATURE]', 'conf': 'confidence score 0-1.0'}}, ...]. Rules: - If there are no handwritten signatures in the image, return an empty JSON array [] - If you are not confident that the detected object is a handwritten signature, do not include it in the results. Only return results for objects that are clearly handwritten signatures. If in doubt, do not include it in the results. - For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON. - Each identified handwritten signature with high confidence should be a separate JSON entry - Bounding boxes around an identified handwritten signature should completely cover the signature{additional_bounding_box_rules} - 'text' must be exactly the string '[SIGNATURE]' (no other wording) - 'conf' should be a numeric confidence from 0-1 - Do NOT include any other text or information in the JSON. # Only return valid JSON, no additional text or explanation.""" # Test for word-level OCR with VLMs - makes some mistakes but not bad full_page_ocr_vlm_words_prompt = f"""Spot all the text in the image at word-level, and output in JSON format as [{{'bbox': [x1, y1, x2, y2], 'text': 'identified word', 'conf': 'confidence score 0-1.0'}}, ...]. IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image. Rules: - Each entry should correspond to a single distinct word (not groups of words, not whole lines) - For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word{additional_bounding_box_rules} - For successful results, only return bbox, text, and conf keys. Do not include any other keys in the JSON. - Do not merge words. Do not split words into letters. Only return one entry per word - Maintain the order of words as they appear spatially from top to bottom, left to right - Skip any empty or whitespace-only entries - Do not include extraneous text, explanations, or formatting beyond the required JSON Only return valid JSON, no additional text or explanation."""