# Qwen3.5-9B-NVFP4 / Qwen3.5-9B_nvfp4.py
# Uploaded by ig1sa
# Commit message: Add files using upload-large-folder tool
# Commit: 391ee72 (verified)
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, Qwen3_5ForConditionalGeneration
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# NOTE: This example requires transformers >= v5
MODEL_ID = "Qwen/Qwen3.5-9B"

# Load the checkpoint and its tokenizer. The tokenizer is bound to the name
# ``processor`` because the rest of the script refers to it that way.
model = Qwen3_5ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
)
processor = AutoTokenizer.from_pretrained(MODEL_ID)
# Quantization recipe: apply the NVFP4 scheme to "Linear" targets.
# Entries in ``ignore`` are excluded; the ``re:`` prefix presumably marks a
# regular-expression match on module names (confirm against llmcompressor docs).
recipe = QuantizationModifier(
    scheme="NVFP4",
    targets="Linear",
    ignore=[
        "lm_head",
        "re:.*visual.*",
        "re:.*linear_attn.*",
    ],
)
# Calibration budget: maximum tokens per sample and total number of samples.
MAX_SEQUENCE_LENGTH = 8192
NUM_CALIBRATION_SAMPLES = 1024

# The samples are divided evenly across the four calibration domains
# (chat, math, code, multilingual): 256 per domain.
samples_per_split = NUM_CALIBRATION_SAMPLES // 4
# ============================================================
# 1. General conversation (English)
# ============================================================
# Take the first ``samples_per_split`` rows of the ``train_sft`` split.
ds_chat = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split=f"train_sft[:{samples_per_split}]",
)
def preprocess_chat(example):
    """Render the example's ``messages`` with the tokenizer chat template.

    Args:
        example: A dataset row that carries a ``"messages"`` entry.

    Returns:
        A dict with a single ``"text"`` key holding the rendered
        (untokenized) conversation.
    """
    rendered = processor.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}
# Render every conversation, then keep only the rendered "text" column.
ds_chat = ds_chat.map(preprocess_chat)
ds_chat = ds_chat.select_columns(["text"])
# ============================================================
# 2. Math / reasoning
# ============================================================
# Take the first ``samples_per_split`` rows of the "main" config's train split.
ds_math = load_dataset(
    "openai/gsm8k",
    "main",
    split=f"train[:{samples_per_split}]",
)
def preprocess_math(example):
    """Turn a question/answer row into a rendered two-turn conversation.

    Args:
        example: A dataset row with ``"question"`` and ``"answer"`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the chat-template
        rendering (untokenized) of the user question followed by the
        assistant answer.
    """
    user_turn = {"role": "user", "content": example["question"]}
    assistant_turn = {"role": "assistant", "content": example["answer"]}
    rendered = processor.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}
# Render every problem, then keep only the rendered "text" column.
ds_math = ds_math.map(preprocess_math)
ds_math = ds_math.select_columns(["text"])
# ============================================================
# 3. Code
# ============================================================
# Take the first ``samples_per_split`` rows of the train split.
ds_code = load_dataset(
    "sahil2801/CodeAlpaca-20k",
    split=f"train[:{samples_per_split}]",
)
def preprocess_code(example):
    """Turn an instruction/input/output row into a rendered conversation.

    The user turn is the ``"instruction"``; when the row has a non-empty
    ``"input"``, it is appended after a blank line.

    Args:
        example: A dataset row with ``"instruction"`` and ``"output"``
            entries and an optional ``"input"`` entry.

    Returns:
        A dict with a single ``"text"`` key holding the chat-template
        rendering (untokenized) of the conversation.
    """
    instruction = example["instruction"]
    extra_input = example.get("input")
    # Only append the separator and input when there is something to append.
    prompt = instruction + "\n\n" + extra_input if extra_input else instruction
    conversation = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": example["output"]},
    ]
    return {"text": processor.apply_chat_template(conversation, tokenize=False)}
# Render every coding example, then keep only the rendered "text" column.
ds_code = ds_code.map(preprocess_code)
ds_code = ds_code.select_columns(["text"])
# ============================================================
# 4. Multilingual
# ============================================================
# Take the first ``samples_per_split`` rows of the train split.
ds_multi = load_dataset(
    "CohereForAI/aya_dataset",
    split=f"train[:{samples_per_split}]",
)
def preprocess_multi(example):
    """Turn an inputs/targets row into a rendered two-turn conversation.

    Args:
        example: A dataset row with ``"inputs"`` and ``"targets"`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the chat-template
        rendering (untokenized) of the user input followed by the
        assistant target.
    """
    user_turn = {"role": "user", "content": example["inputs"]}
    assistant_turn = {"role": "assistant", "content": example["targets"]}
    rendered = processor.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}
# Render every multilingual example, then keep only the rendered "text" column.
ds_multi = ds_multi.map(preprocess_multi)
ds_multi = ds_multi.select_columns(["text"])
# ============================================================
# Combine all datasets and shuffle
# ============================================================
ds = concatenate_datasets([ds_chat, ds_math, ds_code, ds_multi]).shuffle(seed=42)

# Drop rows whose text is empty or whitespace-only, just in case.
ds = ds.filter(lambda row: bool(row["text"].strip()))
# Tokenize inputs.
def tokenize(sample):
    """Tokenize the ``"text"`` field of a sample.

    The text is truncated to ``MAX_SEQUENCE_LENGTH``, is not padded, and no
    special tokens are added by the tokenizer call.

    Args:
        sample: A dataset row with a ``"text"`` entry.

    Returns:
        Whatever ``processor(...)`` returns for the given text.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return processor(sample["text"], **encode_options)
# Tokenize every row and drop the columns that existed before tokenization.
ds = ds.map(
    tokenize,
    remove_columns=ds.column_names,
)
# ============================================================
# Patch: llmcompressor reads attention config from top-level,
# but for this multimodal model it lives in text_config
# ============================================================
text_cfg = model.config.text_config
for attr in (
    "num_attention_heads",
    "num_key_value_heads",
    "hidden_size",
    "head_dim",
):
    # Never overwrite a value the top-level config already has, and skip
    # attributes the text config does not define.
    if hasattr(model.config, attr) or not hasattr(text_cfg, attr):
        continue
    setattr(model.config, attr, getattr(text_cfg, attr))
# Apply quantization: run the recipe over the model in one shot, using the
# tokenized dataset for calibration.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    # NOTE(review): MoE-specific option; presumably has no effect on a model
    # without expert layers. Confirm against llmcompressor docs.
    moe_calibrate_all_experts=True,
)
# Save to disk in compressed-tensors format.
# The output directory is the last path component of MODEL_ID plus "-NVFP4".
SAVE_DIR = f'{MODEL_ID.rstrip("/").rsplit("/", 1)[-1]}-NVFP4'
model.save_pretrained(SAVE_DIR, safe_serialization=True)
# Store the tokenizer alongside the weights.
processor.save_pretrained(SAVE_DIR)