"""
Unsloth Training Hub - Comprehensive LLM Fine-tuning & RL Platform
Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO
Models: All Unsloth-optimized models (LLM, VLM, Embedding, Multimodal)
"""
import gradio as gr
import os
import json
from datetime import datetime
# ============================================================================
# MODEL CATALOG - All Unsloth Pre-optimized Models
# ============================================================================
UNSLOTH_MODELS = {
"text_llm": {
"Qwen3": [
"unsloth/Qwen3-0.6B",
"unsloth/Qwen3-1.7B",
"unsloth/Qwen3-4B",
"unsloth/Qwen3-8B",
"unsloth/Qwen3-14B",
"unsloth/Qwen3-32B",
"unsloth/Qwen3-30B-A3B",
"unsloth/Qwen3-235B-A22B",
],
"Qwen2.5": [
"unsloth/Qwen2.5-0.5B-Instruct",
"unsloth/Qwen2.5-1.5B-Instruct",
"unsloth/Qwen2.5-3B-Instruct",
"unsloth/Qwen2.5-7B-Instruct",
"unsloth/Qwen2.5-14B-Instruct",
"unsloth/Qwen2.5-32B-Instruct",
"unsloth/Qwen2.5-72B-Instruct",
],
"Qwen2.5-Coder": [
"unsloth/Qwen2.5-Coder-0.5B-Instruct",
"unsloth/Qwen2.5-Coder-1.5B-Instruct",
"unsloth/Qwen2.5-Coder-3B-Instruct",
"unsloth/Qwen2.5-Coder-7B-Instruct",
"unsloth/Qwen2.5-Coder-14B-Instruct",
"unsloth/Qwen2.5-Coder-32B-Instruct",
],
"Llama-4": [
"unsloth/Llama-4-Scout-17B-16E-Instruct",
"unsloth/Llama-4-Maverick-17B-128E-Instruct",
],
"Llama-3.3": [
"unsloth/Llama-3.3-70B-Instruct",
],
"Llama-3.1": [
"unsloth/Meta-Llama-3.1-8B-Instruct",
"unsloth/Meta-Llama-3.1-70B-Instruct",
"unsloth/Meta-Llama-3.1-405B-Instruct",
],
"Llama-3.2": [
"unsloth/Llama-3.2-1B-Instruct",
"unsloth/Llama-3.2-3B-Instruct",
],
"DeepSeek-R1": [
"unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
"unsloth/DeepSeek-R1-Distill-Qwen-7B",
"unsloth/DeepSeek-R1-Distill-Qwen-14B",
"unsloth/DeepSeek-R1-Distill-Qwen-32B",
"unsloth/DeepSeek-R1-Distill-Llama-8B",
"unsloth/DeepSeek-R1-Distill-Llama-70B",
],
"Gemma-3": [
"unsloth/gemma-3-1b-it",
"unsloth/gemma-3-4b-it",
"unsloth/gemma-3-12b-it",
"unsloth/gemma-3-27b-it",
],
"Mistral": [
"unsloth/Mistral-Small-3.2-24B-Instruct-2506",
"unsloth/mistral-7b-instruct-v0.3",
"unsloth/Mistral-Nemo-Instruct-2407",
],
"Phi-4": [
"unsloth/Phi-4-mini-instruct",
"unsloth/Phi-4-Instruct",
],
"GLM": [
"unsloth/GLM-4.7-Flash",
"unsloth/GLM-4.5-Air",
],
"Nemotron": [
"unsloth/Nemotron-3-Nano-30B-A3B",
],
},
"vision_vlm": {
"Qwen3-VL": [
"unsloth/Qwen3-VL-2B-Instruct",
"unsloth/Qwen3-VL-4B-Instruct",
"unsloth/Qwen3-VL-8B-Instruct",
"unsloth/Qwen3-VL-32B-Instruct",
],
"Qwen2.5-VL": [
"unsloth/Qwen2.5-VL-3B-Instruct",
"unsloth/Qwen2.5-VL-7B-Instruct",
"unsloth/Qwen2.5-VL-32B-Instruct",
"unsloth/Qwen2.5-VL-72B-Instruct",
],
"Llama-Vision": [
"unsloth/Llama-3.2-11B-Vision-Instruct",
"unsloth/Llama-3.2-90B-Vision-Instruct",
],
"Pixtral": [
"unsloth/Pixtral-12B-2409",
],
"Gemma-3-Vision": [
"unsloth/gemma-3-4b-it", # Vision capable
"unsloth/gemma-3-12b-it",
"unsloth/gemma-3-27b-it",
],
},
"embedding": [
"unsloth/Qwen3-Embedding-0.6B",
"unsloth/Qwen3-Embedding-4B",
"unsloth/Qwen3-Embedding-8B",
"unsloth/embeddinggemma-300m",
"unsloth/bge-m3",
"unsloth/ModernBERT-base",
"unsloth/ModernBERT-large",
],
"multimodal_omni": [
"unsloth/Qwen2.5-Omni-3B",
"unsloth/Qwen2.5-Omni-7B",
],
}
# ============================================================================
# RL METHODS CONFIGURATION
# ============================================================================
RL_METHODS = {
"grpo": {
"name": "GRPO (Group Relative Policy Optimization)",
"description": "Token-level importance sampling. Default DeepSeek method.",
"config": {"loss_type": "grpo", "importance_sampling_level": "token"},
},
"gspo": {
"name": "GSPO (Group Sequence Policy Optimization)",
"description": "Sequence-level importance sampling. Qwen team variant.",
"config": {"loss_type": "grpo", "importance_sampling_level": "sequence"},
},
"dr_grpo": {
"name": "Dr-GRPO (Difficulty-Resilient GRPO)",
"description": "Avoids difficulty bias in training.",
"config": {"loss_type": "dr_grpo", "scale_rewards": False},
},
"dapo": {
"name": "DAPO (Direct Advantage Policy Optimization)",
"description": "Token-level normalization for long chain-of-thought.",
"config": {"loss_type": "dapo", "mask_truncated_completions": True},
},
"bnpo": {
"name": "BNPO (Bounded Natural Policy Optimization)",
"description": "Asymmetric clipping for better exploration.",
"config": {"loss_type": "bnpo", "epsilon": 0.2, "epsilon_high": 0.28, "delta": 1.5},
},
"dpo": {
"name": "DPO (Direct Preference Optimization)",
"description": "Preference-based training without reward model.",
"config": {"method": "dpo"},
},
}
# ============================================================================
# SAMPLE SIZE PRESETS
# ============================================================================
SAMPLE_PRESETS = {
"test_run": {"samples": 100, "max_steps": 50, "description": "Quick test (5-10 min)"},
"small_run": {"samples": 1000, "max_steps": 250, "description": "Small training (30-60 min)"},
"medium_run": {"samples": 5000, "max_steps": 1000, "description": "Medium training (2-4 hours)"},
"large_run": {"samples": 25000, "max_steps": 5000, "description": "Large training (8-12 hours)"},
"grokking_run": {"samples": 100000, "max_steps": 50000, "description": "Grokking/extended (24+ hours)"},
}
# ============================================================================
# REWARD FUNCTION PACKS
# ============================================================================
REWARD_PACKS = {
"reasoning_xml": {
"name": "XML Reasoning Format",
"description": "Rewards ...... format",
"functions": ["xmlcount_reward", "soft_format_reward", "strict_format_reward"],
},
"code_quality": {
"name": "Code Quality",
"description": "Rewards syntactically correct, well-formatted code",
"functions": ["syntax_reward", "docstring_reward", "type_hint_reward"],
},
"math_accuracy": {
"name": "Math Accuracy",
"description": "Rewards correct numerical answers with step verification",
"functions": ["correctness_reward", "int_reward", "step_count_reward"],
},
"instruction_following": {
"name": "Instruction Following",
"description": "Rewards adherence to specific output formats",
"functions": ["format_reward", "length_reward", "keyword_reward"],
},
"safety_alignment": {
"name": "Safety & Alignment",
"description": "Rewards helpful, harmless, honest outputs",
"functions": ["helpfulness_reward", "safety_reward", "factuality_reward"],
},
}
def get_environment_status():
"""Check environment and return status."""
import subprocess
status = {
"cuda_available": False,
"gpu_name": "Not detected",
"gpu_memory": "Unknown",
"unsloth_installed": False,
"vllm_installed": False,
"trl_installed": False,
"anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")),
"hf_token": bool(os.environ.get("HF_TOKEN")),
}
try:
import torch
status["cuda_available"] = torch.cuda.is_available()
if status["cuda_available"]:
status["gpu_name"] = torch.cuda.get_device_name(0)
status["gpu_memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
except:
pass
try:
import unsloth
status["unsloth_installed"] = True
except:
pass
try:
import vllm
status["vllm_installed"] = True
except:
pass
try:
import trl
status["trl_installed"] = True
except:
pass
return status
def format_status_markdown(status):
"""Format status as markdown."""
lines = [
"## Environment Status\n",
f"- **CUDA**: {'Available' if status['cuda_available'] else 'Not available'}",
f"- **GPU**: {status['gpu_name']} ({status['gpu_memory']})",
f"- **Unsloth**: {'Installed' if status['unsloth_installed'] else 'Not installed'}",
f"- **vLLM**: {'Installed' if status['vllm_installed'] else 'Not installed'}",
f"- **TRL**: {'Installed' if status['trl_installed'] else 'Not installed'}",
f"- **ANTHROPIC_API_KEY**: {'Set' if status['anthropic_key'] else 'Not set'}",
f"- **HF_TOKEN**: {'Set' if status['hf_token'] else 'Not set'}",
]
return "\n".join(lines)
def get_model_list(model_type):
"""Get flat list of models for given type."""
if model_type == "text_llm":
models = []
for family, family_models in UNSLOTH_MODELS["text_llm"].items():
models.extend(family_models)
return models
elif model_type == "vision_vlm":
models = []
for family, family_models in UNSLOTH_MODELS["vision_vlm"].items():
models.extend(family_models)
return models
elif model_type == "embedding":
return UNSLOTH_MODELS["embedding"]
elif model_type == "multimodal":
return UNSLOTH_MODELS["multimodal_omni"]
return []
def start_training(
model_name,
model_type,
training_mode,
rl_method,
sample_preset,
reward_pack,
custom_reward_code,
lora_rank,
learning_rate,
num_generations,
temperature,
max_seq_length,
hub_model_id,
push_to_hub,
):
"""Start training with selected configuration."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = f"/app/runs/{training_mode}_{timestamp}"
config = {
"model_name": model_name,
"model_type": model_type,
"training_mode": training_mode,
"rl_method": rl_method if training_mode == "rl" else None,
"sample_preset": sample_preset,
"reward_pack": reward_pack if training_mode == "rl" else None,
"lora_rank": lora_rank,
"learning_rate": learning_rate,
"num_generations": num_generations if training_mode == "rl" else None,
"temperature": temperature,
"max_seq_length": max_seq_length,
"hub_model_id": hub_model_id,
"push_to_hub": push_to_hub,
"run_dir": run_dir,
"timestamp": timestamp,
}
# Generate training script
if training_mode == "sft":
script = generate_sft_script(config)
else:
script = generate_rl_script(config)
return f"""
## Training Configuration Saved
**Run Directory**: `{run_dir}`
**Timestamp**: {timestamp}
### Configuration:
```json
{json.dumps(config, indent=2)}
```
### Generated Training Script:
```python
{script[:2000]}...
```
**Status**: Ready to execute. Click 'Execute Training' to start.
"""
def generate_sft_script(config):
"""Generate SFT training script."""
preset = SAMPLE_PRESETS[config["sample_preset"]]
return f'''
# Unsloth SFT Training Script
# Generated: {config["timestamp"]}
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
max_seq_length = {config["max_seq_length"]}
lora_rank = {config["lora_rank"]}
# Load model with Unsloth optimizations
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="{config["model_name"]}",
max_seq_length=max_seq_length,
load_in_4bit=True,
dtype=None,
)
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
model,
r=lora_rank,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha=lora_rank,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
)
# Load and prepare dataset
dataset = load_dataset("your_dataset", split="train")
# Configure trainer
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
args=SFTConfig(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=10,
max_steps={preset["max_steps"]},
learning_rate={config["learning_rate"]},
optim="adamw_8bit",
packing=True,
max_length=max_seq_length,
output_dir="{config["run_dir"]}",
report_to="none",
),
)
# Train
trainer.train()
# Save
model.save_pretrained_merged("{config["run_dir"]}/merged", tokenizer, save_method="merged_16bit")
'''
def generate_rl_script(config):
"""Generate RL training script."""
preset = SAMPLE_PRESETS[config["sample_preset"]]
rl_config = RL_METHODS[config["rl_method"]]["config"]
return f'''
# Unsloth RL Training Script ({config["rl_method"].upper()})
# Generated: {config["timestamp"]}
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel) # CRITICAL: Must be BEFORE trl import
from trl import GRPOConfig, GRPOTrainer
from datasets import load_dataset
max_seq_length = {config["max_seq_length"]}
lora_rank = {config["lora_rank"]}
# Load model with Unsloth optimizations + vLLM fast inference
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="{config["model_name"]}",
max_seq_length=max_seq_length,
load_in_4bit=True,
fast_inference=True,
max_lora_rank=lora_rank,
gpu_memory_utilization=0.6,
)
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
model,
r=lora_rank,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha=lora_rank,
use_gradient_checkpointing="unsloth",
random_state=3407,
)
# Reward functions from pack: {config["reward_pack"]}
def xmlcount_reward_func(completions, **kwargs):
def count_xml(text):
count = 0.0
if text.count("\\n") == 1: count += 0.125
if text.count("\\n\\n") == 1: count += 0.125
if text.count("\\n\\n") == 1: count += 0.125
if text.count("\\n") == 1: count += 0.125
return count
return [count_xml(c[0]["content"]) for c in completions]
def correctness_reward_func(prompts, completions, answer, **kwargs):
def extract_answer(text):
if "" in text and "" in text:
return text.split("")[-1].split("")[0].strip()
return text.strip()
responses = [c[0]["content"] for c in completions]
extracted = [extract_answer(r) for r in responses]
return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)]
# Load dataset
dataset = load_dataset("openai/gsm8k", "main", split="train")
# Configure GRPO trainer
training_args = GRPOConfig(
output_dir="{config["run_dir"]}",
learning_rate={config["learning_rate"]},
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_generations={config["num_generations"]},
max_prompt_length=256,
max_completion_length={config["max_seq_length"]} - 256,
max_steps={preset["max_steps"]},
temperature={config["temperature"]},
loss_type="{rl_config.get("loss_type", "grpo")}",
importance_sampling_level="{rl_config.get("importance_sampling_level", "token")}",
optim="adamw_8bit",
warmup_ratio=0.1,
lr_scheduler_type="cosine",
max_grad_norm=0.1,
report_to="none",
)
# Initialize trainer
trainer = GRPOTrainer(
model=model,
processing_class=tokenizer,
reward_funcs=[xmlcount_reward_func, correctness_reward_func],
args=training_args,
train_dataset=dataset,
)
# Train
trainer.train()
# Save
model.save_pretrained("{config["run_dir"]}/lora")
tokenizer.save_pretrained("{config["run_dir"]}/lora")
'''
# ============================================================================
# GRADIO UI
# ============================================================================
def create_ui():
"""Create Gradio interface."""
with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo:
gr.Markdown("# Unsloth Training Hub")
gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform")
# Status
status = get_environment_status()
gr.Markdown(format_status_markdown(status))
with gr.Tabs():
# Tab 1: Model Selection
with gr.Tab("1. Model Selection"):
model_type = gr.Radio(
choices=["text_llm", "vision_vlm", "embedding", "multimodal"],
value="text_llm",
label="Model Type",
)
model_dropdown = gr.Dropdown(
choices=get_model_list("text_llm"),
value="unsloth/Qwen2.5-7B-Instruct",
label="Select Model",
filterable=True,
)
def update_models(model_type):
models = get_model_list(model_type)
return gr.Dropdown(choices=models, value=models[0] if models else None)
model_type.change(update_models, model_type, model_dropdown)
# Tab 2: Training Mode
with gr.Tab("2. Training Mode"):
training_mode = gr.Radio(
choices=["sft", "rl"],
value="sft",
label="Training Mode",
info="SFT: Supervised Fine-Tuning | RL: Reinforcement Learning"
)
with gr.Group(visible=False) as rl_options:
rl_method = gr.Dropdown(
choices=list(RL_METHODS.keys()),
value="grpo",
label="RL Method",
)
rl_info = gr.Markdown(RL_METHODS["grpo"]["description"])
def update_rl_info(method):
return RL_METHODS[method]["description"]
rl_method.change(update_rl_info, rl_method, rl_info)
reward_pack = gr.Dropdown(
choices=list(REWARD_PACKS.keys()),
value="reasoning_xml",
label="Reward Pack",
)
custom_reward = gr.Code(
label="Custom Reward Function (Optional)",
language="python",
value="# def custom_reward(completions, **kwargs):\n# return [1.0 for _ in completions]",
)
num_generations = gr.Slider(
minimum=2, maximum=16, value=8, step=2,
label="Generations per Prompt",
)
temperature = gr.Slider(
minimum=0.1, maximum=2.0, value=1.0, step=0.1,
label="Generation Temperature",
)
def toggle_rl_options(mode):
return gr.Group(visible=(mode == "rl"))
training_mode.change(toggle_rl_options, training_mode, rl_options)
# Tab 3: Training Config
with gr.Tab("3. Training Config"):
sample_preset = gr.Radio(
choices=list(SAMPLE_PRESETS.keys()),
value="small_run",
label="Sample Size Preset",
)
preset_info = gr.Markdown(
f"**{SAMPLE_PRESETS['small_run']['description']}** - "
f"{SAMPLE_PRESETS['small_run']['samples']} samples, "
f"{SAMPLE_PRESETS['small_run']['max_steps']} steps"
)
def update_preset_info(preset):
p = SAMPLE_PRESETS[preset]
return f"**{p['description']}** - {p['samples']} samples, {p['max_steps']} steps"
sample_preset.change(update_preset_info, sample_preset, preset_info)
with gr.Row():
lora_rank = gr.Dropdown(
choices=[8, 16, 32, 64, 128],
value=32,
label="LoRA Rank",
)
learning_rate = gr.Number(
value=5e-6,
label="Learning Rate",
)
max_seq_length = gr.Dropdown(
choices=[512, 1024, 2048, 4096, 8192, 16384, 32768],
value=2048,
label="Max Sequence Length",
)
# Tab 4: Output & Hub
with gr.Tab("4. Output & Hub"):
hub_model_id = gr.Textbox(
value="wheattoast11/unsloth-trained-model",
label="HuggingFace Hub Model ID",
)
push_to_hub = gr.Checkbox(
value=True,
label="Push to HuggingFace Hub after training",
)
output_format = gr.CheckboxGroup(
choices=["merged_16bit", "merged_4bit", "lora", "gguf_q4_k_m", "gguf_q8_0"],
value=["merged_16bit", "lora"],
label="Output Formats",
)
# Start Training
gr.Markdown("---")
with gr.Row():
start_btn = gr.Button("Generate Training Config", variant="primary", scale=2)
execute_btn = gr.Button("Execute Training", variant="secondary", scale=1)
output = gr.Markdown("Configure your training and click 'Generate Training Config'")
start_btn.click(
start_training,
inputs=[
model_dropdown,
model_type,
training_mode,
rl_method,
sample_preset,
reward_pack,
custom_reward,
lora_rank,
learning_rate,
num_generations,
temperature,
max_seq_length,
hub_model_id,
push_to_hub,
],
outputs=output,
)
gr.Markdown("---")
gr.Markdown("**Intuition Labs** | Unsloth Training Hub | L40S ~$1.80/hr - PAUSE when not training!")
return demo
if __name__ == "__main__":
demo = create_ui()
demo.launch(server_name="0.0.0.0", server_port=7860)