import sys
import os
import torch
import copy

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from transformers import AutoConfig, AutoTokenizer, AutoProcessor, PreTrainedTokenizer

from quark.torch import (
    LLMTemplate,
    ModelQuantizer,
    export_safetensors,
    import_model_from_safetensors,
)

from quark.torch.utils.llm import (
    get_calib_dataloader,
)

# Resolve the model class used to load the checkpoint.
#
# Every later step needs `model_class`, so a missing class is fatal: report it
# and abort here with a chained ImportError instead of continuing and hitting
# an unrelated NameError when the model is loaded.
try:
    from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
except ImportError as err:
    print("Failed to load the model using Qwen2_5_VLForConditionalGeneration")
    raise ImportError(
        "Qwen2_5_VLForConditionalGeneration could not be imported from "
        "transformers.models.qwen2_5_vl; check the installed transformers version."
    ) from err
else:
    model_class = Qwen2_5_VLForConditionalGeneration
    print("Using Qwen2_5_VLForConditionalGeneration")

# Input checkpoint, output directory, and compute device.
# NOTE: model_dir is an absolute, machine-specific path; adjust it per host.
model_dir = "/scratch/dwchenna/github/hf-models/Qwen2.5-VL-3B-Instruct"
model_out_dir = "quantized_models/Qwen2.5-VL-3B-Instruct-per-grp-quant"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Device: {device}")
print("Loading model...")

# Config, tokenizer and processor are all loaded from the same checkpoint
# directory with identical options, in that order.
config, tokenizer, processor = (
    auto_cls.from_pretrained(model_dir, trust_remote_code=True)
    for auto_cls in (AutoConfig, AutoTokenizer, AutoProcessor)
)

# Load the checkpoint as bfloat16 with device_map="cpu", then move the whole
# model to the selected device.
model = model_class.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
)
model = model.to(device)

print(f"Model loaded: {model.__class__.__name__}")
model.eval()

# Remove the `visual` attribute so only the remaining modules are processed.
# NOTE(review): assumes `visual` is registered directly on the top-level model
# object -- confirm for the installed transformers version.
delattr(model, "visual")
print(model)

# Make sure the export directory exists and store the tokenizer files in it.
os.makedirs(model_out_dir, exist_ok=True)
tokenizer.save_pretrained(model_out_dir)

# Identify the model: use config.model_type when the attribute exists,
# otherwise fall back to the first entry of config.architectures.
if hasattr(model.config, "model_type"):
    model_config_type = model.config.model_type
else:
    model_config_type = model.config.architectures[0]
print(f"Model config type: {model_config_type}")

# Register a Quark template for qwen2_5_vl whenever the loaded model reports
# that type.
if model_config_type == "qwen2_5_vl":
    qwen2_5_vl_template = LLMTemplate(
        model_type="qwen2_5_vl",
        # Only the key/value projections are listed here; the query projection
        # has its own dedicated argument (q_layer_name) below.
        kv_layers_name=[
            "model.layers.*.self_attn.k_proj",
            "model.layers.*.self_attn.v_proj",
        ],
        q_layer_name="model.layers.*.self_attn.q_proj",
        # NOTE(review): lm_head is not excluded here -- confirm that quantizing
        # it is intended.
        exclude_layers_name=["visual*", "*vision*"],
    )
    LLMTemplate.register_template(qwen2_5_vl_template)
    print(f"Registered template for '{qwen2_5_vl_template.model_type}'")

# Abort early when no template is available for this model type.
if model_config_type not in LLMTemplate.list_available():
    print(f"Available templates: {LLMTemplate.list_available()}")
    raise ValueError(f"Model type '{model_config_type}' is not supported.")

template = LLMTemplate.get(model_config_type)
print(f"Using template: {model_config_type}")

# Quantization recipe handed to the template: layer-name globs to skip,
# an optional algorithm, and the scheme name.
exclude_layers = ["visual*", "*vision*"]
quant_algo = None  # alternative value noted by the author: "awq"
# NOTE(review): presumably 4-bit unsigned weight-only with group size 128 --
# confirm against Quark's list of scheme names.
quant_scheme = "uint4_wo_128"

quant_config = template.get_config(scheme=quant_scheme, algorithm=quant_algo, exclude_layers=exclude_layers)

print(f"Quantization config: {quant_config}")

# Build the calibration dataloader on the device the model currently lives on.
print("Loading calibration dataset...")
dataset = "pileval_for_awq_benchmark"
batch_size, num_calib_data, seq_len = 1, 128, 512
main_device = model.device

calib_dataloader = get_calib_dataloader(
    dataset_name=dataset,
    tokenizer=tokenizer,
    processor=None,  # deliberately None to avoid multimodal issues
    num_calib_data=num_calib_data,
    batch_size=batch_size,
    seqlen=seq_len,
    device=main_device,
)

# Quantize the model, then export it to safetensors.
#
# Both phases report their failure and then re-raise. Swallowing the error
# here would let the rest of the script reload and evaluate a model that was
# never (or only partially) quantized and report its perplexity as if it were
# the quantized result.
print("Starting quantization...")
try:
    quantizer = ModelQuantizer(quant_config)
    model = quantizer.quantize_model(model, calib_dataloader)

    # Freeze model
    model = quantizer.freeze(model)
except Exception as e:
    print(f"✗ Quantization failed: {e}")
    # The AWQ hint is only relevant when AWQ was actually selected.
    if quant_algo == "awq":
        print("This is likely due to AWQ not being compatible with multimodal models.")
        print("Try using a simpler quantization scheme without AWQ.")
    raise

print("✓ Quantization completed successfully!")

# Export quantized model
print("Exporting quantized model...")
try:
    with torch.no_grad():
        export_safetensors(
            model=model,
            output_dir=model_out_dir,
            custom_mode="quark",
            weight_format="real_quantized",
            pack_method="reorder",
        )
except Exception as e:
    # A failed export means the output directory may hold no weights, or
    # stale weights from an earlier run, so do not continue.
    print(f"✗ Export failed: {e}")
    raise

print(f"✓ Model exported to: {model_out_dir}")

print("Quantization script completed!")

# Reload the exported checkpoint through Quark's safetensors importer (rather
# than model_class.from_pretrained) and place it on the target device.
# Best effort: if the import fails for any reason, evaluation falls back to
# the model object that is already in memory.
# NOTE(review): `model` at this point is the instance returned by the
# quantization step, not a freshly loaded float model -- confirm this is what
# import_model_from_safetensors expects as its structure argument.
print("Loading quantized model...")
try:
    model_quant = import_model_from_safetensors(
        model=model,
        model_dir=model_out_dir,
    ).to(device)
    print("✓ Successfully loaded quantized model from safetensors")
    print(f"Model type: {type(model_quant)}")
except Exception as err:
    print(f"✗ Failed to load with import_model_from_safetensors: {err}")
    print("Using the quantized model directly from memory...")
    model_quant = model

# Evaluation settings (from quantize_quark.py).
# NOTE(review): nothing in the visible part of this script reads eval_args;
# the values are not passed to ppl_eval below.
eval_args = dict(
    dataset_name='wikitext2',
    num_eval_samples=128,
    num_eval_data=-1,
    seqlen=512,
    limit=None,
    eval_bs=1,
    save_result_path=None,
)

# Fetch the WikiText-2 (raw) test split.
# NOTE(review): `Dataset` is imported but not used in the visible script.
from datasets import Dataset, load_dataset
testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Re-load the tokenizer from the export directory, where it was saved earlier.
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_out_dir, trust_remote_code=True)  # type: ignore

# Tokenize the whole test split as one text, records joined by blank lines.
joined_text = "\n\n".join(testdata["text"])
testenc = tokenizer(joined_text, return_tensors="pt")

main_device = model_quant.device
print(f"main_device: {main_device}")

from quark.contrib.llm_eval import ppl_eval
print("[INFO]: Evaluation of quantized model ... ")
ppl = ppl_eval(model_quant, testenc, main_device)
print(f"\n[INFO] Perplexity: {ppl.item()}")

# Collected evaluation metrics as [name, value] pairs.
metrics = [["Perplexity", ppl.cpu().numpy()]]