Instructions to use lainlives/Qwen3.5-9B-bnb-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lainlives/Qwen3.5-9B-bnb-4bit with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="lainlives/Qwen3.5-9B-bnb-4bit")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForMultimodalLM

processor = AutoProcessor.from_pretrained("lainlives/Qwen3.5-9B-bnb-4bit")
model = AutoModelForMultimodalLM.from_pretrained("lainlives/Qwen3.5-9B-bnb-4bit")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use lainlives/Qwen3.5-9B-bnb-4bit with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lainlives/Qwen3.5-9B-bnb-4bit"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lainlives/Qwen3.5-9B-bnb-4bit",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/lainlives/Qwen3.5-9B-bnb-4bit
- SGLang
How to use lainlives/Qwen3.5-9B-bnb-4bit with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "lainlives/Qwen3.5-9B-bnb-4bit" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lainlives/Qwen3.5-9B-bnb-4bit",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lainlives/Qwen3.5-9B-bnb-4bit" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lainlives/Qwen3.5-9B-bnb-4bit",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use lainlives/Qwen3.5-9B-bnb-4bit with Docker Model Runner:
docker model run hf.co/lainlives/Qwen3.5-9B-bnb-4bit
vllm==0.18.0 RuntimeError: Failed to load the tokenizer. If the tokenizer is a custom tokenizer not yet available in the HuggingFace transformers library, consider setting `trust_remote_code=True`
CUDA_VISIBLE_DEVICES=0 vllm serve lainlives/Qwen3.5-9B-bnb-4bit --kv-cache-dtype fp8 --mm-processor-cache-gb 0 --limit-mm-per-prompt.video 0 --limit-mm-per-prompt.image 1 -dp 1 --port 10014 --gpu-memory-utilization 0.976 --served-model-name qwen3.5-9b --dtype half --max_num_seqs 4 --max-model-len 8192 --mamba-cache-dtype float16 --mamba-ssm-cache-dtype float16 --kv-cache-memory-bytes 100M --async-scheduling --default-chat-template-kwargs '{"enable_thinking": false}'
It should work with Qwen3_5Model just fine; I can try re-uploading it later.
CUDA_VISIBLE_DEVICES=0 vllm serve lainlives/Qwen3.5-9B-bnb-4bit --kv-cache-dtype fp8 --mm-processor-cache-gb 0 --limit-mm-per-prompt.video 0 --limit-mm-per-prompt.image 1 -dp 1 --port 10014 --gpu-memory-utilization 0.976 --served-model-name qwen3.5-9b --dtype half --max_num_seqs 4 --max-model-len 8192 --mamba-cache-dtype float16 --mamba-ssm-cache-dtype float16 --kv-cache-memory-bytes 100M --async-scheduling --default-chat-template-kwargs '{"enable_thinking": false}'
Sorry, I forgot that a function had to be wrapped. Here's the wrapper I used to get bnb to load it.
import contextlib
import torch
from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration
class AbliteratedQwen3_5ForCausalLM(Qwen3_5ForConditionalGeneration):
    """Qwen3.5 conditional-generation model with a direction projected out.

    The config must carry a ``refusal_direction`` vector. During every
    ``forward`` call, pre-forward hooks are temporarily attached to the decoder
    layers listed in ``ablation_layers``; each hook subtracts
    ``ablation_strength`` times the projection of the layer's input hidden
    states onto the (unit-normalised) direction.
    """

    def __init__(self, config):
        """Build the base model and read the ablation settings from ``config``.

        Args:
            config: Model config. Read attributes: ``text_config`` (optional),
                ``refusal_direction`` (required), ``ablation_layers``
                (optional, defaults to no layers) and ``ablation_strength``
                (optional, defaults to 1.0).

        Raises:
            ValueError: If ``config.refusal_direction`` is missing or ``None``.
        """
        # Copy every text_config field that the top-level config does not
        # already define, so both nesting styles expose the same attributes.
        text_cfg = getattr(config, "text_config", None)
        if text_cfg is not None:
            for key, value in text_cfg.to_dict().items():
                if not hasattr(config, key):
                    setattr(config, key, value)

        super().__init__(config)

        direction = getattr(config, "refusal_direction", None)
        if direction is None:
            raise ValueError(
                "Missing `refusal_direction` in config. "
                "This checkpoint expects an embedded refusal direction."
            )

        # Non-persistent: the direction lives in the config, not the state dict.
        direction_t = torch.tensor(direction, dtype=torch.float32)
        self.register_buffer("_refusal_direction", direction_t, persistent=False)

        # An explicit ``None`` in the config is treated the same as "not set".
        layer_ids = getattr(config, "ablation_layers", None)
        self.ablation_layers = [int(x) for x in (layer_ids or [])]
        strength = getattr(config, "ablation_strength", None)
        self.ablation_strength = 1.0 if strength is None else float(strength)

    def _decoder_layers(self):
        """Return the decoder layer container of the wrapped model.

        Looks at ``self.model.language_model.layers`` first and falls back to
        ``self.model.layers``.

        Raises:
            AttributeError: If neither attribute path exists.
        """
        if hasattr(self.model, "language_model") and hasattr(self.model.language_model, "layers"):
            return self.model.language_model.layers
        if hasattr(self.model, "layers"):
            return self.model.layers
        raise AttributeError("Could not find decoder layers in model.")

    def _make_hook(self):
        """Create the forward pre-hook that removes the direction component.

        Returns:
            A callable ``hook_fn(module, inputs)`` suitable for
            ``register_forward_pre_hook``; it returns the positional inputs
            with the first element (the hidden states) replaced.
        """
        # Normalise once per hook creation and in float32: in float16 the
        # 1e-8 epsilon rounds to zero, so normalising after the cast would
        # neither guard against a zero vector nor keep full precision.
        unit = self._refusal_direction.to(torch.float32)
        unit = unit / (unit.norm() + 1e-8)

        def hook_fn(module, inputs):
            hidden = inputs[0]
            d = unit.to(hidden.device, hidden.dtype)
            # matmul with a 1-D vector contracts the last axis, so this works
            # for (batch, seq, dim) as well as any other leading shape.
            projection = torch.matmul(hidden, d).unsqueeze(-1) * d
            hidden = hidden - float(self.ablation_strength) * projection
            return (hidden, *inputs[1:])

        return hook_fn

    @contextlib.contextmanager
    def _temporary_hooks(self):
        """Attach the ablation hooks for the duration of the ``with`` body.

        Layer indices outside ``[0, len(layers))`` are ignored. When no valid
        layer remains, or ``ablation_strength`` is not positive, nothing is
        registered. All registered hooks are removed on exit, including when
        the body raises.
        """
        layers = self._decoder_layers()
        layer_ids = [i for i in self.ablation_layers if 0 <= i < len(layers)]
        if not layer_ids or self.ablation_strength <= 0:
            yield
            return

        handle_list = []
        try:
            hook_fn = self._make_hook()
            for layer_id in layer_ids:
                handle_list.append(layers[layer_id].register_forward_pre_hook(hook_fn))
            yield
        finally:
            for h in handle_list:
                h.remove()

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` with the ablation hooks active.

        ``ablation_strength`` and ``ablation_layers`` are re-read from
        ``self.config`` on every call, so edits to the config take effect
        immediately; a missing or ``None`` value keeps the current setting.
        """
        strength = getattr(self.config, "ablation_strength", None)
        if strength is not None:
            self.ablation_strength = float(strength)

        layer_ids = getattr(self.config, "ablation_layers", None)
        if layer_ids is not None:
            self.ablation_layers = [int(x) for x in layer_ids]

        with self._temporary_hooks():
            return super().forward(*args, **kwargs)
It's worth mentioning that, depending on what's loading it, layers might be missing. You will have to work around that yourself. Ask Gemini or a similar assistant; it's pretty good at that.