DeL-TaiseiOzaki/Tengentoppa-sft-reasoning-v3.0
Viewer • Updated • 198k • 11
How to use DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8")
model = AutoModelForCausalLM.from_pretrained("DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the conversation with the tokenizer's chat template and move the
# resulting tensors to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
How to use DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8 with vLLM:
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

# Alternatively, run the model through Docker:
docker model run hf.co/DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8
How to use DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8 with SGLang:
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

# Alternatively, start the SGLang server inside Docker.
# Replace <secret> with your own Hugging Face token.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
How to use DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8 with Docker Model Runner:
# Run the model with Docker Model Runner:
docker model run hf.co/DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8
このモデルはllm-jp-13Bに特殊トークン拡張とLoRA事後学習を施したモデルです.構造化された会話と高度なプロンプティングのための追加の特殊トークンを含む拡張トークナイザーを備えています.
以下に推論用のノートブックを示します.こちらからColabで動かせます.
1.ライブラリのインストールとインポート
!pip install -U vllm
!pip install -U triton
!pip install -U transformers
!pip install -U datasets
!pip install -U llmcompressor
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import torch
import json
import re
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Placeholder Hugging Face access token; replace it with your own before running.
HF_TOKEN = "your-token"
2.vllm用のモデルの準備
model_id = "DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8"
# Initialize the vLLM model
llm = LLM(
    model=model_id,
    trust_remote_code=True,
    tensor_parallel_size=1,
    max_model_len=1536,
    gpu_memory_utilization=0.8,
    quantization="compressed-tensors"
)
# Load the tokenizer for the same model id, authenticating with HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN)
3.データセットの読み込みと推論
# Load the dataset.
# A record may span several physical lines, so stripped lines are buffered
# until the buffer ends with "}" and is then parsed as one JSON object.
# NOTE(review): a nested object whose closing brace ends a line in the middle
# of a record would trigger an early parse; confirm the file never has that.
datasets = []
# Explicit UTF-8 so Japanese text decodes the same way on every platform.
with open("./elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""
# System prompt
SYSTEM_PROMPT = """あなたは親切で有能なアシスタントです。\nユーザーからの質問に対して、正しい回答を提供します。\n与えられた情報を正確に整理し,論理的に説明し,簡潔に回答します.<|REASONING|>,</|REASONING|>の間で思考の過程を抜けがないように記載します."""
# Inference
results = []
# Batch size
BATCH_SIZE = 4
# Process the dataset in batches.
# The sampling settings are the same for every batch, so build them once.
sampling_params = SamplingParams(
    max_tokens=1536,
    temperature=0.7,
    repetition_penalty=1.2,
    skip_special_tokens=False,
)
for i in tqdm(range(0, len(datasets), BATCH_SIZE)):
    batch = datasets[i:i + BATCH_SIZE]
    # Each prompt is the system block, the user block, then an opening
    # reasoning marker, joined by newlines.
    prompts = [
        f"<|SYSTEM|>{SYSTEM_PROMPT}</|SYSTEM|>\n"
        f"<|USER|>{data['input']}</|USER|>\n"
        "<|REASONING|>"
        for data in batch
    ]
    try:
        # Run batched inference
        outputs = llm.generate(prompts, sampling_params)
        # Collect one result record per input item
        for data, output in zip(batch, outputs):
            generated_text = output.outputs[0].text
            results.append({
                "task_id": data["task_id"],
                "input": data["input"],
                "output": generated_text
            })
    except Exception as e:
        # Best effort: report the failing batch and continue with the next.
        # Identify the batch by its first item, as the message states.
        print(f"Error processing batch starting with item {batch[0]['task_id']}: {str(e)}")
4.出力の処理と保存
def process_output(output_text):
    """Extract the assistant answer from raw generated text.

    If the text contains ``<|ASSISTANT|>``, return the segment that follows
    its first occurrence (up to the next ``<|ASSISTANT|>``, if any), with
    every ``</|ASSISTANT|>`` removed and surrounding whitespace stripped.
    Otherwise return the whole text stripped.

    Args:
        output_text: Generated text, possibly containing role markers.

    Returns:
        The cleaned answer string.
    """
    # No assistant marker: fall back to the stripped original text.
    if '<|ASSISTANT|>' not in output_text:
        return output_text.strip()
    # Keep only the segment right after the first opening marker.
    assistant_text = output_text.split('<|ASSISTANT|>')[1]
    # Drop the closing marker if the model emitted it.
    return assistant_text.replace('</|ASSISTANT|>', '').strip()
# Process the list of dictionaries
def process_data(data_list):
    """Clean the ``output`` field of every record in place.

    Args:
        data_list: List of dicts, each with an ``'output'`` key holding raw
            generated text.

    Returns:
        The same list object, with each ``'output'`` replaced by the result
        of ``process_output``.
    """
    for item in data_list:
        item['output'] = process_output(item['output'])
    return data_list
data_list = process_data(results)
# Save the results as JSON Lines, one record per line.
# The file is named after the last path component of the model id.
model_name = re.sub(r".*/", "", model_id)
with open(f"./{model_name}-outputs.jsonl", "w", encoding="utf-8") as f:
    for result in data_list:
        # ensure_ascii=False writes Japanese text as-is instead of \u escapes.
        json.dump(result, f, ensure_ascii=False)
        f.write("\n")
{token_config.bos_token}
{token_config.eos_token}
{token_config.pad_token}
<|SYSTEM|> and </|SYSTEM|>
<|USER|> and </|USER|>
<|ASSISTANT|> and </|ASSISTANT|>
<|REASONING|> and </|REASONING|>
このモデルは以下の役割(roles)をサポートしています:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8")
tokenizer = AutoTokenizer.from_pretrained("DeL-TaiseiOzaki/Tengentoppa-llm-jp-13B-reasoning-it-fp8")
# Example of chat-style usage
messages = [
    {
        "role": "system",
        "content": "あなたは親切で有能なAIアシスタントです。"
    },
    {
        "role": "user",
        "content": "次の数学の問題を解いてください:2x + 3 = 7"
    },
    {
        "role": "reasoning",
        "content": "この方程式を解くために以下のステップで考えます:\n1. 3を両辺から引く\n2. 両辺を2で割る"
    },
    {
        "role": "assistant",
        "content": "x = 2 が方程式の解です。"
    }
]
# Format the messages with the chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print("\nGenerated prompt:\n", prompt)
# Tokenize and run inference; the inputs must be on the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(model.device)
# temperature only takes effect when sampling is enabled.
outputs = model.generate(**inputs, max_length=2048, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0])
print("\nModel response:\n", response)
モデルのチャットテンプレートは以下の要素を含みます:
特徴:
Base model
llm-jp/llm-jp-3-13b