Text Generation
Transformers
Safetensors
PyTorch
English
qwen3
qwen
qwen3-1.7b
qwen3-8b
quintus
quintus-1.7b
causal-lm
language-model
chat
assistant
compact-llm
small-language-model
knowledge-distillation
online-kd
full-vocabulary-kd
supervised-fine-tuning
sft
reasoning
code-generation
english
vllm
conversational
text-generation-inference
Instructions to use iamrahulreddy/Quintus with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use iamrahulreddy/Quintus with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="iamrahulreddy/Quintus")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForMultimodalLM

tokenizer = AutoTokenizer.from_pretrained("iamrahulreddy/Quintus")
model = AutoModelForMultimodalLM.from_pretrained("iamrahulreddy/Quintus")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use iamrahulreddy/Quintus with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "iamrahulreddy/Quintus"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "iamrahulreddy/Quintus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/iamrahulreddy/Quintus
- SGLang
How to use iamrahulreddy/Quintus with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "iamrahulreddy/Quintus" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "iamrahulreddy/Quintus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "iamrahulreddy/Quintus" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "iamrahulreddy/Quintus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use iamrahulreddy/Quintus with Docker Model Runner:
docker model run hf.co/iamrahulreddy/Quintus
| from __future__ import annotations | |
| from bisect import bisect_left, insort | |
| import torch | |
| import torch.nn.functional as F | |
| from torch.utils.data import Dataset | |
| from src.training_data import DistillationDataset | |
class SequencePackedDataset(Dataset):
    """Pack variable-length tokenized samples into fixed-length rows.

    Each dataset item is one "bin": one or more source samples concatenated
    with a single ``eos_token_id`` separator between neighbours and then
    right-padded with ``pad_token_id`` up to ``pack_length``. The assignment
    of samples to bins is computed once, in the constructor, from the
    ``sample_lengths`` metadata of the source dataset; the token tensors
    themselves are only read from ``source`` inside ``__getitem__``.

    Args:
        source: Dataset exposing ``sample_lengths`` (indexable by source row)
            and returning, per row, a mapping with 1-D ``"input_ids"`` and
            ``"loss_mask"`` tensors.
        source_indices: Rows of ``source`` to pack. Must be non-empty, free of
            duplicates and non-negative.
        pack_length: Length of every packed row; must be positive.
        eos_token_id: Token inserted between two samples that share a bin.
        pad_token_id: Token used to fill a bin up to ``pack_length``.
        mask_first_after_separator: When true, the loss mask of the first
            token of every sample that follows a separator is forced to 0.

    Raises:
        ValueError: On a non-positive ``pack_length``, a source without
            ``sample_lengths``, empty or duplicated ``source_indices``, a
            sample longer than ``pack_length``, or an inconsistent plan.
        IndexError: When a source index is negative or past the end of
            ``source.sample_lengths``.
    """

    def __init__(
        self,
        source: "DistillationDataset",
        source_indices: list[int],
        pack_length: int,
        eos_token_id: int,
        pad_token_id: int,
        mask_first_after_separator: bool = True,
    ):
        if pack_length <= 0:
            raise ValueError(f"pack_length must be positive, got {pack_length}.")
        if not hasattr(source, "sample_lengths"):
            raise ValueError("Packed training requires a source dataset with sample_lengths metadata.")
        if not source_indices:
            raise ValueError("Packed training requires at least one source row.")

        self.source = source
        self.source_indices = [int(index) for index in source_indices]
        self.source_index_set = set(self.source_indices)
        if len(self.source_index_set) != len(self.source_indices):
            raise ValueError("Packed training source indices contain duplicates.")

        self.pack_length = int(pack_length)
        self.eos_token_id = int(eos_token_id)
        self.pad_token_id = int(pad_token_id)
        self.mask_first_after_separator = bool(mask_first_after_separator)

        # source row -> token count, taken from the source's metadata.
        self._length_by_index: dict[int, int] = {}
        # plan[bin_id] is the ordered list of source rows packed into that bin.
        self.plan: list[list[int]] = []

        for source_index in self.source_indices:
            # Negative indices wrap around on sequence lookups, so -1 and
            # len-1 would name the same row while passing the duplicate check
            # above. Reject them instead of packing a sample twice.
            if source_index < 0:
                raise IndexError(f"Source index {source_index} is outside the tokenized dataset.")
            try:
                length = int(source.sample_lengths[source_index])
            except IndexError as exc:
                raise IndexError(f"Source index {source_index} is outside the tokenized dataset.") from exc
            if length > self.pack_length:
                raise ValueError(
                    f"Tokenized sample #{source_index} has length {length}, "
                    f"which exceeds pack_length={self.pack_length}."
                )
            self._length_by_index[source_index] = length

        self._build_plan()
        self._validate_plan()

        # Summary statistics of the finished plan.
        self.source_sample_count = len(self.source_indices)
        self.bin_count = len(self.plan)
        self.original_token_count = sum(self._length_by_index.values())
        # One separator between each pair of neighbours inside a bin.
        self.separator_token_count = sum(max(0, len(bin_indices) - 1) for bin_indices in self.plan)
        self.packed_token_count = self.original_token_count + self.separator_token_count
        self.total_capacity = self.bin_count * self.pack_length
        self.pad_token_count = self.total_capacity - self.packed_token_count
        self.average_samples_per_bin = self.source_sample_count / max(self.bin_count, 1)
        self.utilization = self.packed_token_count / max(self.total_capacity, 1)

    def _build_plan(self) -> None:
        """Assign every source row to a bin and store the result in ``self.plan``.

        Samples are visited longest first (ties broken by ascending source
        index). Each one goes into the open bin with the least remaining room
        that can still hold it plus one separator token; if no bin qualifies,
        a new bin is opened.

        Raises:
            ValueError: If a bin's remaining capacity would become negative.
        """
        # Start from scratch so calling this again cannot duplicate bins.
        self.plan = []

        items = sorted(
            ((self._length_by_index[source_index], source_index) for source_index in self.source_indices),
            key=lambda item: (-item[0], item[1]),
        )
        # Sorted list of (remaining_capacity, bin_id) for every bin so far.
        available: list[tuple[int, int]] = []

        for length, source_index in items:
            # Joining an existing bin costs the sample plus one separator.
            required_existing = length + 1
            # bin_id is never negative, so (required, -1) sorts before every
            # real entry with the same capacity: this finds the leftmost bin
            # whose remaining capacity is >= required_existing.
            insert_at = bisect_left(available, (required_existing, -1))
            if insert_at == len(available):
                # Nothing fits: open a new bin (no separator needed yet).
                bin_id = len(self.plan)
                self.plan.append([source_index])
                remaining = self.pack_length - length
                insort(available, (remaining, bin_id))
                continue

            remaining, bin_id = available.pop(insert_at)
            next_remaining = remaining - required_existing
            if next_remaining < 0:
                raise ValueError("Internal packing error: bin capacity became negative.")
            self.plan[bin_id].append(source_index)
            insort(available, (next_remaining, bin_id))

    def _validate_plan(self) -> None:
        """Check that the plan is complete, disjoint and within capacity.

        Raises:
            ValueError: If a bin is empty, a bin (samples plus separators)
                exceeds ``pack_length``, a sample appears in more than one
                bin, or a requested sample was not assigned to any bin.
        """
        seen: set[int] = set()
        for bin_id, bin_indices in enumerate(self.plan):
            if not bin_indices:
                raise ValueError(f"Packed bin #{bin_id} is empty.")
            real_length = sum(self._length_by_index[source_index] for source_index in bin_indices)
            real_length += max(0, len(bin_indices) - 1)
            if real_length > self.pack_length:
                raise ValueError(
                    f"Packed bin #{bin_id} has real_length={real_length}, "
                    f"which exceeds pack_length={self.pack_length}."
                )
            for source_index in bin_indices:
                if source_index in seen:
                    raise ValueError(f"Source sample #{source_index} appears in more than one packed bin.")
                seen.add(source_index)

        missing = self.source_index_set - seen
        if missing:
            first_missing = min(missing)
            raise ValueError(f"Source sample #{first_missing} was not assigned to a packed bin.")

    def __len__(self) -> int:
        """Return the number of packed bins."""
        return len(self.plan)

    def __getitem__(self, bin_idx: int) -> dict[str, torch.Tensor]:
        """Materialise one packed bin.

        Returns:
            A dict with ``"input_ids"`` and ``"loss_mask"`` (both of length
            ``pack_length``) and the scalar tensors ``"real_length"``
            (tokens before padding), ``"source_samples"``,
            ``"original_tokens"`` and ``"separator_tokens"``.

        Raises:
            ValueError: If the concatenated tokens exceed ``pack_length``,
                i.e. the source returned more tokens than its
                ``sample_lengths`` metadata promised.
        """
        bin_indices = self.plan[bin_idx]
        input_parts: list[torch.Tensor] = []
        mask_parts: list[torch.Tensor] = []
        original_tokens = 0
        separator_tokens = 0

        for sample_offset, source_index in enumerate(bin_indices):
            item = self.source[source_index]
            input_ids = item["input_ids"].long()
            loss_mask = item["loss_mask"].long()
            original_tokens += int(input_ids.size(0))

            if sample_offset > 0:
                # Separator between neighbours; it never contributes to the loss.
                input_parts.append(torch.tensor([self.eos_token_id], dtype=torch.long))
                mask_parts.append(torch.zeros(1, dtype=torch.long))
                separator_tokens += 1
                if self.mask_first_after_separator and loss_mask.numel() > 0:
                    # Clone before writing so the source's tensor is not mutated.
                    loss_mask = loss_mask.clone()
                    loss_mask[0] = 0

            input_parts.append(input_ids)
            mask_parts.append(loss_mask)

        input_ids = torch.cat(input_parts)
        loss_mask = torch.cat(mask_parts)
        real_length = int(input_ids.size(0))
        if real_length > self.pack_length:
            raise ValueError(
                f"Packed bin #{bin_idx} has real_length={real_length}, "
                f"which exceeds pack_length={self.pack_length}."
            )

        pad_len = self.pack_length - real_length
        if pad_len:
            input_ids = F.pad(input_ids, (0, pad_len), value=self.pad_token_id)
            loss_mask = F.pad(loss_mask, (0, pad_len), value=0)

        return {
            "input_ids": input_ids,
            "loss_mask": loss_mask,
            "real_length": torch.tensor(real_length, dtype=torch.long),
            "source_samples": torch.tensor(len(bin_indices), dtype=torch.long),
            "original_tokens": torch.tensor(original_tokens, dtype=torch.long),
            "separator_tokens": torch.tensor(separator_tokens, dtype=torch.long),
        }
def collate_packed_fn(batch: list[dict], pad_token_id: int) -> dict:
    """Stack packed rows into a batch and derive attention mask and labels.

    Args:
        batch: Items carrying ``"input_ids"``, ``"loss_mask"``,
            ``"real_length"``, ``"source_samples"``, ``"original_tokens"``
            and ``"separator_tokens"`` tensors. ``input_ids`` and
            ``loss_mask`` must have the same length in every item because
            they are stacked, not padded, here.
        pad_token_id: Unused; the argument is accepted and discarded.

    Returns:
        A dict with the stacked inputs plus:
        ``"attention_mask"`` - 1 for positions before each row's
        ``real_length``, 0 for the padding after it;
        ``"labels"`` - a copy of ``input_ids`` with every position whose
        ``loss_mask`` is 0 replaced by -100.
    """
    # No padding happens in this function, so the pad id is not needed.
    # NOTE(review): presumably kept for call-site compatibility - confirm.
    del pad_token_id

    input_ids = torch.stack([item["input_ids"] for item in batch])
    loss_mask = torch.stack([item["loss_mask"] for item in batch]).long()
    real_lengths = torch.stack([item["real_length"] for item in batch]).long()

    seq_len = input_ids.size(1)
    # Build the position index on the same device as the lengths it is
    # compared against, so the comparison works wherever the batch lives.
    positions = torch.arange(seq_len, dtype=torch.long, device=real_lengths.device).unsqueeze(0)
    attention_mask = (positions < real_lengths.unsqueeze(1)).long()

    # masked_fill is out-of-place, so input_ids itself is left untouched.
    # -100 is the default ignore_index of PyTorch's cross-entropy loss.
    labels = input_ids.masked_fill(loss_mask == 0, -100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "loss_mask": loss_mask,
        "labels": labels,
        "real_length": real_lengths,
        "source_samples": torch.stack([item["source_samples"] for item in batch]).long(),
        "original_tokens": torch.stack([item["original_tokens"] for item in batch]).long(),
        "separator_tokens": torch.stack([item["separator_tokens"] for item in batch]).long(),
    }