"""Quantize the language-model part of Qwen2.5-VL with Quark and evaluate it.

Pipeline, in order:

1. Load Qwen2.5-VL from ``model_dir`` and delete its ``visual`` submodule.
2. Register a Quark ``LLMTemplate`` for the ``qwen2_5_vl`` model type.
3. Quantize with the ``uint4_wo_128`` scheme using a ``pileval_for_awq_benchmark``
   calibration dataloader, freeze, and export safetensors to ``model_out_dir``.
4. Re-import the exported checkpoint (falling back to the in-memory model if
   that fails) and report perplexity on the wikitext-2 test split.
"""
import sys
import os
import torch
import copy

# Extend the import path with the parent directory before the remaining imports.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from transformers import AutoConfig, AutoTokenizer, AutoProcessor, PreTrainedTokenizer
from quark.torch import (
    LLMTemplate,
    ModelQuantizer,
    export_safetensors,
    import_model_from_safetensors,
)
from quark.torch.utils.llm import (
    get_calib_dataloader,
)

# Import the correct model class
try:
    from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
    model_class = Qwen2_5_VLForConditionalGeneration
    print("Using Qwen2_5_VLForConditionalGeneration")
except ImportError:
    print("Failed to load the model using Qwen2_5_VLForConditionalGeneration")
    # Without the class `model_class` is undefined; stop here instead of
    # failing later with an unrelated NameError.
    raise

# Setup
model_dir = "/scratch/dwchenna/github/hf-models/Qwen2.5-VL-3B-Instruct"
model_out_dir = "quantized_models/Qwen2.5-VL-3B-Instruct-per-grp-quant"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Device: {device}")
print("Loading model...")

# Load config, tokenizer, processor
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)

# Load model on CPU first, then move it to the selected device.
model = model_class.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
).to(device)
print(f"Model loaded: {model.__class__.__name__}")
model.eval()

# Filter out visual part
del model.visual
print(model)

# Create output directory
os.makedirs(model_out_dir, exist_ok=True)
tokenizer.save_pretrained(model_out_dir)

# Get model type, falling back to the first architecture name.
model_config_type = (
    model.config.model_type
    if hasattr(model.config, "model_type")
    else model.config.architectures[0]
)
print(f"Model config type: {model_config_type}")

# Layer-name patterns excluded from quantization; shared by the template
# registration and the quantization config below.
exclude_layers = ["visual*", "*vision*"]

# Register template for qwen2_5_vl if needed
if model_config_type == "qwen2_5_vl":
    # NOTE(review): kv_layers_name also lists q_proj; that looks unintended
    # for a "kv" list, but it is kept as-is -- confirm against the Quark docs.
    qwen2_5_vl_template = LLMTemplate(
        model_type="qwen2_5_vl",
        kv_layers_name=[
            "model.layers.*.self_attn.q_proj",
            "model.layers.*.self_attn.k_proj",
            "model.layers.*.self_attn.v_proj",
        ],
        q_layer_name="model.layers.*.self_attn.q_proj",
        exclude_layers_name=list(exclude_layers),
    )
    LLMTemplate.register_template(qwen2_5_vl_template)
    print(f"Registered template for '{qwen2_5_vl_template.model_type}'")

# Check if model type is supported
if model_config_type not in LLMTemplate.list_available():
    print(f"Available templates: {LLMTemplate.list_available()}")
    raise ValueError(f"Model type '{model_config_type}' is not supported.")

template = LLMTemplate.get(model_config_type)
print(f"Using template: {model_config_type}")

# Quantization configuration
quant_scheme = "uint4_wo_128"
quant_algo = None  # "awq"

quant_config = template.get_config(
    scheme=quant_scheme,
    algorithm=quant_algo,
    exclude_layers=exclude_layers,
)
print(f"Quantization config: {quant_config}")

# Create calibration dataloader
print("Loading calibration dataset...")
main_device = model.device
dataset = "pileval_for_awq_benchmark"
batch_size = 1
num_calib_data = 128
seq_len = 512

calib_dataloader = get_calib_dataloader(
    dataset_name=dataset,
    processor=None,  # Set to None to avoid multimodal issues
    tokenizer=tokenizer,
    batch_size=batch_size,
    num_calib_data=num_calib_data,
    seqlen=seq_len,
    device=main_device,
)

# Quantize model
print("Starting quantization...")
try:
    quantizer = ModelQuantizer(quant_config)
    model = quantizer.quantize_model(model, calib_dataloader)

    # Freeze model
    model = quantizer.freeze(model)
    print("✓ Quantization completed successfully!")

    # Export quantized model
    print("Exporting quantized model...")
    with torch.no_grad():
        export_safetensors(
            model=model,
            output_dir=model_out_dir,
            custom_mode="quark",
            weight_format="real_quantized",
            pack_method="reorder",
        )
    print(f"✓ Model exported to: {model_out_dir}")
except Exception as e:
    print(f"✗ Quantization failed: {e}")
    # The AWQ hint only applies when an algorithm is actually configured.
    if quant_algo is not None:
        print("This is likely due to AWQ not being compatible with multimodal models.")
        print("Try using a simpler quantization scheme without AWQ.")
    # Stop here: continuing would evaluate a model that was never (or only
    # partially) quantized and report its perplexity as the quantized result.
    raise

print("Quantization script completed!")

# Load the exported checkpoint with quark's import function instead of
# model_class.from_pretrained.
# NOTE(review): `model` at this point is the quantized, frozen model rather
# than a freshly loaded one -- confirm import_model_from_safetensors accepts it.
print("Loading quantized model...")
try:
    model_quant = import_model_from_safetensors(
        model=model,
        model_dir=model_out_dir,  # Directory with safetensors files
    )
    model_quant = model_quant.to(device)
    print("✓ Successfully loaded quantized model from safetensors")
    print(f"Model type: {type(model_quant)}")
except Exception as e:
    # Best-effort reload: the in-memory model has been quantized successfully
    # by this point, so it is evaluated directly if the import fails.
    print(f"✗ Failed to load with import_model_from_safetensors: {e}")
    print("Using the quantized model directly from memory...")
    model_quant = model

# Evaluation settings (from quantize_quark.py).
# Not passed to ppl_eval below; kept as a record of the reference settings.
eval_args = {
    'dataset_name': 'wikitext2',
    'num_eval_samples': 128,
    'num_eval_data': -1,
    'seqlen': 512,
    'limit': None,
    'eval_bs': 1,
    'save_result_path': None,
}

# Load the evaluation dataset.
# Late import: the quantization and export steps above run before this
# evaluation-only dependency is needed.
from datasets import Dataset, load_dataset

testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(  # type: ignore
    model_out_dir,
    trust_remote_code=True,
)

# Tokenize the whole test split as one document.
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

# evaluation metrics
metrics = []
main_device = model_quant.device
print('main_device:', main_device)

from quark.contrib.llm_eval import ppl_eval

print("[INFO]: Evaluation of quantized model ... ")
ppl = ppl_eval(model_quant, testenc, main_device)
print(f"\n[INFO] Perplexity: {ppl.item()}")
metrics.append(["Perplexity", ppl.cpu().numpy()])