Text Generation
Safetensors
PyTorch
Italian
gpt_neox
pretrained
causal-lm
autoround
intel-autoround
woq
gptq
autogptq
auto-gptq
intel
italia
italiano
italian
conversational
custom_code
8-bit precision
Instructions to use fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Local Apps Settings
- vLLM
How to use fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker
docker model run hf.co/fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym
- SGLang
How to use fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
- Docker Model Runner
How to use fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym with Docker Model Runner:
docker model run hf.co/fbaldassarri/iGeniusAI_Italia-9B-Instruct-v0.1-autogptq-int8-gs128-auto-asym
| from typing import Optional, Tuple | |
| import torch | |
| from torch import nn | |
| from .configuration_italia import ItaliaConfig | |
| from transformers.models.gpt_neox import modeling_gpt_neox | |
| # Inject a GPTNeoXLayer variant that has no post-attention layer norm | |
class GPTNeoXLayer(nn.Module):
    """GPT-NeoX transformer block that uses a single LayerNorm.

    The block owns only ``input_layernorm``; there is no separate
    post-attention LayerNorm. Attention and the MLP both consume the same
    normalized input, and their outputs are added to the residual stream:

        x = x + attn(ln1(x)) + mlp(ln1(x))
    """

    def __init__(self, config):
        """Build the sub-modules from ``config``.

        Reads ``use_parallel_residual``, ``hidden_size``, ``layer_norm_eps``,
        ``hidden_dropout`` and ``_attn_implementation`` from ``config``.
        """
        super().__init__()
        # Kept as an attribute, but ``forward`` below always takes the
        # parallel-residual path and never consults this flag.
        self.use_parallel_residual = config.use_parallel_residual
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
        self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
        # The attention implementation is selected by the config's
        # ``_attn_implementation`` key.
        self.attention = modeling_gpt_neox.GPT_NEOX_ATTENTION_CLASSES[config._attn_implementation](config)
        self.mlp = modeling_gpt_neox.GPTNeoXMLP(config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
    ):
        """Run attention and MLP in parallel on the normalized input.

        Args:
            hidden_states: Input to the block (the residual stream).
            attention_mask: Forwarded unchanged to the attention module.
            position_ids: Forwarded unchanged to the attention module.
            head_mask: Forwarded unchanged to the attention module.
            use_cache: When truthy, the attention module's second output
                (``present``) is kept in the returned tuple.
            layer_past: Forwarded unchanged to the attention module.
            output_attentions: Forwarded unchanged to the attention module.

        Returns:
            ``(hidden_states, present, (attn_weights))`` when ``use_cache`` is
            truthy, otherwise ``(hidden_states, (attn_weights))``.
        """
        # Normalize once: attention and the MLP share the same LayerNorm
        # output, so it does not need to be computed a second time.
        normed_states = self.input_layernorm(hidden_states)

        attention_layer_outputs = self.attention(
            normed_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            layer_past=layer_past,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # Attention returns: attn_output, present, (attn_weights)
        attn_output = self.post_attention_dropout(attention_layer_outputs[0])
        outputs = attention_layer_outputs[1:]

        # Parallel residual: x = x + attn(ln1(x)) + mlp(ln1(x))
        mlp_output = self.post_mlp_dropout(self.mlp(normed_states))
        hidden_states = mlp_output + attn_output + hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs  # hidden_states, present, (attn_weights)
        else:
            outputs = (hidden_states,) + outputs[1:]  # hidden_states, (attn_weights)
        return outputs
# Monkey-patch: rebind the GPTNeoXLayer name inside transformers' gpt_neox
# module to the class defined above. This runs at import time, before the
# model classes below are imported and instantiated.
modeling_gpt_neox.GPTNeoXLayer = GPTNeoXLayer
| from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXModel | |
class ItaliaForCausalLM(GPTNeoXForCausalLM):
    """Causal language model for Italia, built on ``GPTNeoXForCausalLM``.

    Uses ``ItaliaConfig`` as its configuration class and gives the output
    projection ``embed_out`` a bias term.
    """

    config_class = ItaliaConfig

    def __init__(self, config):
        """Create the backbone and the biased output head from ``config``."""
        super().__init__(config)
        # NOTE(review): the parent constructor presumably already builds
        # ``gpt_neox`` and ``embed_out``; both are replaced here — confirm
        # whether rebuilding the backbone is actually required.
        backbone = GPTNeoXModel(config)
        output_head = nn.Linear(
            in_features=config.hidden_size,
            out_features=config.vocab_size,
            bias=True,
        )
        self.gpt_neox = backbone
        self.embed_out = output_head
        # Initialize weights and apply final processing.
        self.post_init()