Instructions to use remiai3/gpt_oss_20b_GGUF_project_guide with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use remiai3/gpt_oss_20b_GGUF_project_guide with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="remiai3/gpt_oss_20b_GGUF_project_guide")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("remiai3/gpt_oss_20b_GGUF_project_guide", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use remiai3/gpt_oss_20b_GGUF_project_guide with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "remiai3/gpt_oss_20b_GGUF_project_guide"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "remiai3/gpt_oss_20b_GGUF_project_guide",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/remiai3/gpt_oss_20b_GGUF_project_guide
- SGLang
How to use remiai3/gpt_oss_20b_GGUF_project_guide with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "remiai3/gpt_oss_20b_GGUF_project_guide" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "remiai3/gpt_oss_20b_GGUF_project_guide",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "remiai3/gpt_oss_20b_GGUF_project_guide" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "remiai3/gpt_oss_20b_GGUF_project_guide",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use remiai3/gpt_oss_20b_GGUF_project_guide with Docker Model Runner:
docker model run hf.co/remiai3/gpt_oss_20b_GGUF_project_guide
Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,28 +1,39 @@
|
|
| 1 |
from flask import Flask, render_template, request, jsonify
|
| 2 |
from llama_cpp import Llama
|
| 3 |
-
import
|
| 4 |
|
| 5 |
app = Flask(__name__)
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
# Build adaptive prompt
|
| 18 |
def build_prompt(history, user_text):
|
| 19 |
system_prompt = (
|
| 20 |
-
"You are a helpful
|
| 21 |
-
"-
|
| 22 |
-
"-
|
| 23 |
-
"-
|
| 24 |
-
"-
|
| 25 |
-
"- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
|
| 26 |
)
|
| 27 |
prompt = system_prompt + "\n\n"
|
| 28 |
for turn in history:
|
|
@@ -42,7 +53,6 @@ def chat():
|
|
| 42 |
|
| 43 |
prompt = build_prompt(history, user_message)
|
| 44 |
|
| 45 |
-
# Adjust max_tokens dynamically
|
| 46 |
if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
|
| 47 |
max_out = 800
|
| 48 |
elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
|
|
@@ -57,10 +67,7 @@ def chat():
|
|
| 57 |
stop=["\nUser:", "\nAssistant:"]
|
| 58 |
)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# Wrap fenced code blocks with copy button (handled in JS)
|
| 63 |
-
return jsonify({"response": text})
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
app.run(host="0.0.0.0", port=5000, debug=True)
|
|
|
|
| 1 |
from flask import Flask, render_template, request, jsonify
|
| 2 |
from llama_cpp import Llama
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
app = Flask(__name__)
|
| 6 |
|
| 7 |
+
# Update this path to your downloaded model weight
|
| 8 |
+
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
|
| 9 |
|
| 10 |
+
# Detect GPU automatically: if llama-cpp-python was compiled with CUDA/Metal and GPU layers can be offloaded
|
| 11 |
+
# Adjust n_gpu_layers for your GPU memory; 20-40 for mid GPUs, 60-100 for higher VRAM, 0 = CPU only
|
| 12 |
+
try:
|
| 13 |
+
print("Trying GPU offload...")
|
| 14 |
+
llm = Llama(
|
| 15 |
+
model_path=MODEL_PATH,
|
| 16 |
+
n_ctx=2048,
|
| 17 |
+
n_threads=os.cpu_count(),
|
| 18 |
+
n_gpu_layers=40 # increase or decrease based on your GPU memory
|
| 19 |
+
)
|
| 20 |
+
print("GPU initialized successfully.")
|
| 21 |
+
except Exception as e:
|
| 22 |
+
print(f"GPU failed: {e}\nFalling back to CPU.")
|
| 23 |
+
llm = Llama(
|
| 24 |
+
model_path=MODEL_PATH,
|
| 25 |
+
n_ctx=2048,
|
| 26 |
+
n_threads=os.cpu_count(),
|
| 27 |
+
n_gpu_layers=0 # CPU only
|
| 28 |
+
)
|
| 29 |
|
|
|
|
| 30 |
def build_prompt(history, user_text):
|
| 31 |
system_prompt = (
|
| 32 |
+
"You are a helpful assistant. Follow these:\n"
|
| 33 |
+
"- Simple Q: Short, precise.\n"
|
| 34 |
+
"- Story/letter/essay: Longer answer.\n"
|
| 35 |
+
"- Code: Complete, neat, Markdown fenced code with language tag.\n"
|
| 36 |
+
"- Use points when helpful.\n"
|
|
|
|
| 37 |
)
|
| 38 |
prompt = system_prompt + "\n\n"
|
| 39 |
for turn in history:
|
|
|
|
| 53 |
|
| 54 |
prompt = build_prompt(history, user_message)
|
| 55 |
|
|
|
|
| 56 |
if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
|
| 57 |
max_out = 800
|
| 58 |
elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
|
|
|
|
| 67 |
stop=["\nUser:", "\nAssistant:"]
|
| 68 |
)
|
| 69 |
|
| 70 |
+
return jsonify({"response": resp["choices"][0]["text"].strip()})
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
if __name__ == "__main__":
|
| 73 |
app.run(host="0.0.0.0", port=5000, debug=True)
|