Instructions to use ig1/Qwen3.5-9B-NVFP4 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use ig1/Qwen3.5-9B-NVFP4 with Transformers:

# High-level route: let a pipeline handle preprocessing and generation.
from transformers import pipeline

pipe = pipeline(task="image-text-to-text", model="ig1/Qwen3.5-9B-NVFP4")

# One user turn made of an image reference followed by a question about it.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
question_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, question_part]}]

pipe(text=messages)

# Lower-level route: load the processor and the model yourself.
from transformers import AutoProcessor, AutoModelForMultimodalLM

processor = AutoProcessor.from_pretrained("ig1/Qwen3.5-9B-NVFP4")
model = AutoModelForMultimodalLM.from_pretrained("ig1/Qwen3.5-9B-NVFP4")

# One user turn made of an image reference followed by a question about it.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
question_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, question_part]}]

# Render the chat template straight to tensors, then move them to the
# device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the prompt tokens so only the newly generated part is decoded.
prompt_length = inputs["input_ids"].shape[-1]
generated_ids = outputs[0][prompt_length:]
print(processor.decode(generated_ids))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use ig1/Qwen3.5-9B-NVFP4 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
# (no port is passed here; the curl call below targets localhost:8000)
vllm serve "ig1/Qwen3.5-9B-NVFP4"
# Call the server using curl (OpenAI-compatible API):
# The JSON body carries a single user message made of a text part and an
# image_url part, posted to the /v1/chat/completions endpoint.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ig1/Qwen3.5-9B-NVFP4",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

# Run the model via `docker model run`, referencing it by its hf.co path:
docker model run hf.co/ig1/Qwen3.5-9B-NVFP4

SGLang

How to use ig1/Qwen3.5-9B-NVFP4 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# (binds to all interfaces on port 30000, the port the curl call below uses)
python3 -m sglang.launch_server \
    --model-path "ig1/Qwen3.5-9B-NVFP4" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# The JSON body carries a single user message made of a text part and an
# image_url part, posted to the /v1/chat/completions endpoint.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ig1/Qwen3.5-9B-NVFP4",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image:
# all GPUs are exposed, port 30000 is published, and the local Hugging Face
# cache directory is mounted into the container.
# Replace <secret> with your own Hugging Face token.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ig1/Qwen3.5-9B-NVFP4" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# The JSON body carries a single user message made of a text part and an
# image_url part, posted to the /v1/chat/completions endpoint.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ig1/Qwen3.5-9B-NVFP4",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use ig1/Qwen3.5-9B-NVFP4 with Docker Model Runner:
```
# Run the model with Docker Model Runner, referencing it by its hf.co path:
docker model run hf.co/ig1/Qwen3.5-9B-NVFP4
```

Qwen3.5-9B-NVFP4 / Qwen3.5-9B_nvfp4.py

ig1sa

Add files using upload-large-folder tool

391ee72 verified 3 months ago

raw

history blame contribute delete

4.64 kB

	from datasets import load_dataset, concatenate_datasets
	from transformers import AutoTokenizer, Qwen3_5ForConditionalGeneration

	from llmcompressor import oneshot
	from llmcompressor.modifiers.quantization import QuantizationModifier

	# NOTE: transformers v5 or newer is needed to run this example.

	MODEL_ID = "Qwen/Qwen3.5-9B"

	# Instantiate the checkpoint first, then the tokenizer published under the
	# same repository id.
	model = Qwen3_5ForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    dtype="auto",
	)
	processor = AutoTokenizer.from_pretrained(MODEL_ID)

	# Quantization recipe: apply the NVFP4 scheme to Linear modules while
	# leaving the modules listed in `ignore` untouched.
	#
	# The "re:" entries are regular expressions. They are wrapped in ".*" so
	# they hit any module whose name merely *contains* "visual" or
	# "linear_attn" (for example when nested under a "model." prefix). The
	# previous patterns ("re:.visual." / "re:.linear_attn.") allowed exactly
	# one character before the keyword, so with start-anchored matching they
	# excluded nothing.
	# NOTE(review): assumes "re:" patterns are anchored at the start of the
	# module name (re.match semantics) - confirm against the installed
	# llmcompressor / compressed-tensors version. The ".*" form is also
	# correct if unanchored search is used.
	recipe = QuantizationModifier(
	    targets="Linear",
	    scheme="NVFP4",
	    ignore=[
	        "lm_head",
	        "re:.*visual.*",
	        "re:.*linear_attn.*",
	    ],
	)

	# Calibration budget: total sample count, per-sample length cap, and the
	# equal share taken from each of the four source datasets.
	NUM_CALIBRATION_SAMPLES = 1024
	MAX_SEQUENCE_LENGTH = 8192
	samples_per_split = NUM_CALIBRATION_SAMPLES // 4  # 1024 // 4 = 256

	# ------------------------------------------------------------
	# Domain 1 of 4: general conversation (English)
	# ------------------------------------------------------------
	ds_chat = load_dataset(
	    "HuggingFaceH4/ultrachat_200k",
	    split=f"train_sft[:{samples_per_split}]",
	)


	def preprocess_chat(example):
	    """Render one chat record as a single prompt string.

	    Reads ``example["messages"]`` and runs it through the chat template of
	    the module-level ``processor`` without tokenizing.

	    Returns:
	        A dict holding the rendered string under the ``"text"`` key.
	    """
	    text = processor.apply_chat_template(
	        example["messages"],
	        tokenize=False,
	    )
	    return {"text": text}


	# Render every chat record, then keep only the rendered "text" column.
	ds_chat = ds_chat.map(preprocess_chat)
	ds_chat = ds_chat.select_columns(["text"])

	# ------------------------------------------------------------
	# Domain 2 of 4: math / reasoning
	# ------------------------------------------------------------
	ds_math = load_dataset(
	    "openai/gsm8k",
	    "main",
	    split=f"train[:{samples_per_split}]",
	)


	def preprocess_math(example):
	    """Turn one question/answer record into a rendered two-turn chat.

	    ``example["question"]`` becomes the user turn and ``example["answer"]``
	    the assistant turn; the pair is rendered with the chat template of the
	    module-level ``processor`` without tokenizing.

	    Returns:
	        A dict holding the rendered string under the ``"text"`` key.
	    """
	    messages = [
	        {"role": "user", "content": example["question"]},
	        {"role": "assistant", "content": example["answer"]},
	    ]
	    text = processor.apply_chat_template(messages, tokenize=False)
	    return {"text": text}


	# Render every math record, then keep only the rendered "text" column.
	ds_math = ds_math.map(preprocess_math)
	ds_math = ds_math.select_columns(["text"])

	# ------------------------------------------------------------
	# Domain 3 of 4: code
	# ------------------------------------------------------------
	ds_code = load_dataset(
	    "sahil2801/CodeAlpaca-20k",
	    split=f"train[:{samples_per_split}]",
	)


	def preprocess_code(example):
	    """Turn one instruction record into a rendered two-turn chat.

	    The user turn is ``example["instruction"]``; when the record has a
	    non-empty ``"input"`` field it is appended after a blank line. The
	    assistant turn is ``example["output"]``. The pair is rendered with the
	    chat template of the module-level ``processor`` without tokenizing.

	    Returns:
	        A dict holding the rendered string under the ``"text"`` key.
	    """
	    user_content = example["instruction"]
	    # Only the append belongs to the conditional: records without extra
	    # input keep the bare instruction as the user turn.
	    if example.get("input"):
	        user_content += "\n\n" + example["input"]
	    messages = [
	        {"role": "user", "content": user_content},
	        {"role": "assistant", "content": example["output"]},
	    ]
	    text = processor.apply_chat_template(messages, tokenize=False)
	    return {"text": text}


	# Render every code record, then keep only the rendered "text" column.
	ds_code = ds_code.map(preprocess_code)
	ds_code = ds_code.select_columns(["text"])

	# ------------------------------------------------------------
	# Domain 4 of 4: multilingual
	# ------------------------------------------------------------
	ds_multi = load_dataset(
	    "CohereForAI/aya_dataset",
	    split=f"train[:{samples_per_split}]",
	)


	def preprocess_multi(example):
	    """Turn one inputs/targets record into a rendered two-turn chat.

	    ``example["inputs"]`` becomes the user turn and ``example["targets"]``
	    the assistant turn; the pair is rendered with the chat template of the
	    module-level ``processor`` without tokenizing.

	    Returns:
	        A dict holding the rendered string under the ``"text"`` key.
	    """
	    messages = [
	        {"role": "user", "content": example["inputs"]},
	        {"role": "assistant", "content": example["targets"]},
	    ]
	    text = processor.apply_chat_template(messages, tokenize=False)
	    return {"text": text}


	# Render every multilingual record, then keep only the "text" column.
	ds_multi = ds_multi.map(preprocess_multi)
	ds_multi = ds_multi.select_columns(["text"])

	# ------------------------------------------------------------
	# Merge the four domains into one dataset and shuffle it
	# ------------------------------------------------------------
	ds = concatenate_datasets([ds_chat, ds_math, ds_code, ds_multi])
	ds = ds.shuffle(seed=42)

	# Drop rows whose text is empty or whitespace-only, as a safeguard.
	ds = ds.filter(lambda row: len(row["text"].strip()) > 0)


	# Tokenize inputs.
	def tokenize(sample):
	    """Tokenize ``sample["text"]`` with the module-level ``processor``.

	    No padding is applied, truncation is enabled with ``MAX_SEQUENCE_LENGTH``
	    as the maximum length, and no special tokens are added.

	    Returns:
	        Whatever the ``processor`` call returns for the given text.
	    """
	    return processor(
	        sample["text"],
	        padding=False,
	        max_length=MAX_SEQUENCE_LENGTH,
	        truncation=True,
	        add_special_tokens=False,
	    )


	# Tokenize every row and drop the columns that existed before the map.
	ds = ds.map(tokenize, remove_columns=ds.column_names)


	# ============================================================
	# Patch: llmcompressor reads attention config from top-level,
	# but for this multimodal model it lives in text_config
	# ============================================================
	text_cfg = model.config.text_config

	# Copy each attribute up to the top-level config only when it is missing
	# there and present on the text config; existing values are never replaced.
	for attr in [
	    "num_attention_heads",
	    "num_key_value_heads",
	    "hidden_size",
	    "head_dim",
	]:
	    if not hasattr(model.config, attr) and hasattr(text_cfg, attr):
	        setattr(model.config, attr, getattr(text_cfg, attr))


	# Run one-shot quantization of the model over the calibration dataset.
	oneshot(
	    model=model,
	    recipe=recipe,
	    dataset=ds,
	    max_seq_length=MAX_SEQUENCE_LENGTH,
	    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
	    moe_calibrate_all_experts=True,
	)

	# Write the result to disk in compressed-tensors format. The directory is
	# named after the last path component of MODEL_ID plus an "-NVFP4" suffix.
	SAVE_DIR = MODEL_ID.rstrip("/").rsplit("/", 1)[-1] + "-NVFP4"
	model.save_pretrained(SAVE_DIR, safe_serialization=True)
	processor.save_pretrained(SAVE_DIR)