Spaces:

wheattoast11
/

unsloth-training-hub

Paused

App Files Files Community

unsloth-training-hub / app.py

wheattoast11

Update app.py

1b9ca25 verified 4 months ago

raw

history blame contribute delete

23.5 kB

	"""
	Unsloth Training Hub - Comprehensive LLM Fine-tuning & RL Platform
	Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO
	Models: All Unsloth-optimized models (LLM, VLM, Embedding, Multimodal)
	"""

	import gradio as gr
	import os
	import json
	from datetime import datetime

	# ============================================================================
	# MODEL CATALOG - All Unsloth Pre-optimized Models
	# ============================================================================

	UNSLOTH_MODELS = {
	"text_llm": {
	"Qwen3": [
	"unsloth/Qwen3-0.6B",
	"unsloth/Qwen3-1.7B",
	"unsloth/Qwen3-4B",
	"unsloth/Qwen3-8B",
	"unsloth/Qwen3-14B",
	"unsloth/Qwen3-32B",
	"unsloth/Qwen3-30B-A3B",
	"unsloth/Qwen3-235B-A22B",
	],
	"Qwen2.5": [
	"unsloth/Qwen2.5-0.5B-Instruct",
	"unsloth/Qwen2.5-1.5B-Instruct",
	"unsloth/Qwen2.5-3B-Instruct",
	"unsloth/Qwen2.5-7B-Instruct",
	"unsloth/Qwen2.5-14B-Instruct",
	"unsloth/Qwen2.5-32B-Instruct",
	"unsloth/Qwen2.5-72B-Instruct",
	],
	"Qwen2.5-Coder": [
	"unsloth/Qwen2.5-Coder-0.5B-Instruct",
	"unsloth/Qwen2.5-Coder-1.5B-Instruct",
	"unsloth/Qwen2.5-Coder-3B-Instruct",
	"unsloth/Qwen2.5-Coder-7B-Instruct",
	"unsloth/Qwen2.5-Coder-14B-Instruct",
	"unsloth/Qwen2.5-Coder-32B-Instruct",
	],
	"Llama-4": [
	"unsloth/Llama-4-Scout-17B-16E-Instruct",
	"unsloth/Llama-4-Maverick-17B-128E-Instruct",
	],
	"Llama-3.3": [
	"unsloth/Llama-3.3-70B-Instruct",
	],
	"Llama-3.1": [
	"unsloth/Meta-Llama-3.1-8B-Instruct",
	"unsloth/Meta-Llama-3.1-70B-Instruct",
	"unsloth/Meta-Llama-3.1-405B-Instruct",
	],
	"Llama-3.2": [
	"unsloth/Llama-3.2-1B-Instruct",
	"unsloth/Llama-3.2-3B-Instruct",
	],
	"DeepSeek-R1": [
	"unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
	"unsloth/DeepSeek-R1-Distill-Qwen-7B",
	"unsloth/DeepSeek-R1-Distill-Qwen-14B",
	"unsloth/DeepSeek-R1-Distill-Qwen-32B",
	"unsloth/DeepSeek-R1-Distill-Llama-8B",
	"unsloth/DeepSeek-R1-Distill-Llama-70B",
	],
	"Gemma-3": [
	"unsloth/gemma-3-1b-it",
	"unsloth/gemma-3-4b-it",
	"unsloth/gemma-3-12b-it",
	"unsloth/gemma-3-27b-it",
	],
	"Mistral": [
	"unsloth/Mistral-Small-3.2-24B-Instruct-2506",
	"unsloth/mistral-7b-instruct-v0.3",
	"unsloth/Mistral-Nemo-Instruct-2407",
	],
	"Phi-4": [
	"unsloth/Phi-4-mini-instruct",
	"unsloth/Phi-4-Instruct",
	],
	"GLM": [
	"unsloth/GLM-4.7-Flash",
	"unsloth/GLM-4.5-Air",
	],
	"Nemotron": [
	"unsloth/Nemotron-3-Nano-30B-A3B",
	],
	},
	"vision_vlm": {
	"Qwen3-VL": [
	"unsloth/Qwen3-VL-2B-Instruct",
	"unsloth/Qwen3-VL-4B-Instruct",
	"unsloth/Qwen3-VL-8B-Instruct",
	"unsloth/Qwen3-VL-32B-Instruct",
	],
	"Qwen2.5-VL": [
	"unsloth/Qwen2.5-VL-3B-Instruct",
	"unsloth/Qwen2.5-VL-7B-Instruct",
	"unsloth/Qwen2.5-VL-32B-Instruct",
	"unsloth/Qwen2.5-VL-72B-Instruct",
	],
	"Llama-Vision": [
	"unsloth/Llama-3.2-11B-Vision-Instruct",
	"unsloth/Llama-3.2-90B-Vision-Instruct",
	],
	"Pixtral": [
	"unsloth/Pixtral-12B-2409",
	],
	"Gemma-3-Vision": [
	"unsloth/gemma-3-4b-it", # Vision capable
	"unsloth/gemma-3-12b-it",
	"unsloth/gemma-3-27b-it",
	],
	},
	"embedding": [
	"unsloth/Qwen3-Embedding-0.6B",
	"unsloth/Qwen3-Embedding-4B",
	"unsloth/Qwen3-Embedding-8B",
	"unsloth/embeddinggemma-300m",
	"unsloth/bge-m3",
	"unsloth/ModernBERT-base",
	"unsloth/ModernBERT-large",
	],
	"multimodal_omni": [
	"unsloth/Qwen2.5-Omni-3B",
	"unsloth/Qwen2.5-Omni-7B",
	],
	}

	# ============================================================================
	# RL METHODS CONFIGURATION
	# ============================================================================

	RL_METHODS = {
	"grpo": {
	"name": "GRPO (Group Relative Policy Optimization)",
	"description": "Token-level importance sampling. Default DeepSeek method.",
	"config": {"loss_type": "grpo", "importance_sampling_level": "token"},
	},
	"gspo": {
	"name": "GSPO (Group Sequence Policy Optimization)",
	"description": "Sequence-level importance sampling. Qwen team variant.",
	"config": {"loss_type": "grpo", "importance_sampling_level": "sequence"},
	},
	"dr_grpo": {
	"name": "Dr-GRPO (Difficulty-Resilient GRPO)",
	"description": "Avoids difficulty bias in training.",
	"config": {"loss_type": "dr_grpo", "scale_rewards": False},
	},
	"dapo": {
	"name": "DAPO (Direct Advantage Policy Optimization)",
	"description": "Token-level normalization for long chain-of-thought.",
	"config": {"loss_type": "dapo", "mask_truncated_completions": True},
	},
	"bnpo": {
	"name": "BNPO (Bounded Natural Policy Optimization)",
	"description": "Asymmetric clipping for better exploration.",
	"config": {"loss_type": "bnpo", "epsilon": 0.2, "epsilon_high": 0.28, "delta": 1.5},
	},
	"dpo": {
	"name": "DPO (Direct Preference Optimization)",
	"description": "Preference-based training without reward model.",
	"config": {"method": "dpo"},
	},
	}

	# ============================================================================
	# SAMPLE SIZE PRESETS
	# ============================================================================

	SAMPLE_PRESETS = {
	"test_run": {"samples": 100, "max_steps": 50, "description": "Quick test (5-10 min)"},
	"small_run": {"samples": 1000, "max_steps": 250, "description": "Small training (30-60 min)"},
	"medium_run": {"samples": 5000, "max_steps": 1000, "description": "Medium training (2-4 hours)"},
	"large_run": {"samples": 25000, "max_steps": 5000, "description": "Large training (8-12 hours)"},
	"grokking_run": {"samples": 100000, "max_steps": 50000, "description": "Grokking/extended (24+ hours)"},
	}

	# ============================================================================
	# REWARD FUNCTION PACKS
	# ============================================================================

	REWARD_PACKS = {
	"reasoning_xml": {
	"name": "XML Reasoning Format",
	"description": "Rewards <reasoning>...</reasoning><answer>...</answer> format",
	"functions": ["xmlcount_reward", "soft_format_reward", "strict_format_reward"],
	},
	"code_quality": {
	"name": "Code Quality",
	"description": "Rewards syntactically correct, well-formatted code",
	"functions": ["syntax_reward", "docstring_reward", "type_hint_reward"],
	},
	"math_accuracy": {
	"name": "Math Accuracy",
	"description": "Rewards correct numerical answers with step verification",
	"functions": ["correctness_reward", "int_reward", "step_count_reward"],
	},
	"instruction_following": {
	"name": "Instruction Following",
	"description": "Rewards adherence to specific output formats",
	"functions": ["format_reward", "length_reward", "keyword_reward"],
	},
	"safety_alignment": {
	"name": "Safety & Alignment",
	"description": "Rewards helpful, harmless, honest outputs",
	"functions": ["helpfulness_reward", "safety_reward", "factuality_reward"],
	},
	}


	def get_environment_status():
	"""Check environment and return status."""
	import subprocess

	status = {
	"cuda_available": False,
	"gpu_name": "Not detected",
	"gpu_memory": "Unknown",
	"unsloth_installed": False,
	"vllm_installed": False,
	"trl_installed": False,
	"anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")),
	"hf_token": bool(os.environ.get("HF_TOKEN")),
	}

	try:
	import torch
	status["cuda_available"] = torch.cuda.is_available()
	if status["cuda_available"]:
	status["gpu_name"] = torch.cuda.get_device_name(0)
	status["gpu_memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
	except:
	pass

	try:
	import unsloth
	status["unsloth_installed"] = True
	except:
	pass

	try:
	import vllm
	status["vllm_installed"] = True
	except:
	pass

	try:
	import trl
	status["trl_installed"] = True
	except:
	pass

	return status


	def format_status_markdown(status):
	"""Format status as markdown."""
	lines = [
	"## Environment Status\n",
	f"- CUDA: {'Available' if status['cuda_available'] else 'Not available'}",
	f"- GPU: {status['gpu_name']} ({status['gpu_memory']})",
	f"- Unsloth: {'Installed' if status['unsloth_installed'] else 'Not installed'}",
	f"- vLLM: {'Installed' if status['vllm_installed'] else 'Not installed'}",
	f"- TRL: {'Installed' if status['trl_installed'] else 'Not installed'}",
	f"- ANTHROPIC_API_KEY: {'Set' if status['anthropic_key'] else 'Not set'}",
	f"- HF_TOKEN: {'Set' if status['hf_token'] else 'Not set'}",
	]
	return "\n".join(lines)


	def get_model_list(model_type):
	"""Get flat list of models for given type."""
	if model_type == "text_llm":
	models = []
	for family, family_models in UNSLOTH_MODELS["text_llm"].items():
	models.extend(family_models)
	return models
	elif model_type == "vision_vlm":
	models = []
	for family, family_models in UNSLOTH_MODELS["vision_vlm"].items():
	models.extend(family_models)
	return models
	elif model_type == "embedding":
	return UNSLOTH_MODELS["embedding"]
	elif model_type == "multimodal":
	return UNSLOTH_MODELS["multimodal_omni"]
	return []


	def start_training(
	model_name,
	model_type,
	training_mode,
	rl_method,
	sample_preset,
	reward_pack,
	custom_reward_code,
	lora_rank,
	learning_rate,
	num_generations,
	temperature,
	max_seq_length,
	hub_model_id,
	push_to_hub,
	):
	"""Start training with selected configuration."""

	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	run_dir = f"/app/runs/{training_mode}_{timestamp}"

	config = {
	"model_name": model_name,
	"model_type": model_type,
	"training_mode": training_mode,
	"rl_method": rl_method if training_mode == "rl" else None,
	"sample_preset": sample_preset,
	"reward_pack": reward_pack if training_mode == "rl" else None,
	"lora_rank": lora_rank,
	"learning_rate": learning_rate,
	"num_generations": num_generations if training_mode == "rl" else None,
	"temperature": temperature,
	"max_seq_length": max_seq_length,
	"hub_model_id": hub_model_id,
	"push_to_hub": push_to_hub,
	"run_dir": run_dir,
	"timestamp": timestamp,
	}

	# Generate training script
	if training_mode == "sft":
	script = generate_sft_script(config)
	else:
	script = generate_rl_script(config)

	return f"""
	## Training Configuration Saved

	Run Directory: `{run_dir}`
	Timestamp: {timestamp}

	### Configuration:
	```json
	{json.dumps(config, indent=2)}
	```

	### Generated Training Script:
	```python
	{script[:2000]}...
	```

	Status: Ready to execute. Click 'Execute Training' to start.
	"""


	def generate_sft_script(config):
	"""Generate SFT training script."""
	preset = SAMPLE_PRESETS[config["sample_preset"]]

	return f'''
	# Unsloth SFT Training Script
	# Generated: {config["timestamp"]}

	from unsloth import FastLanguageModel
	from trl import SFTTrainer, SFTConfig
	from datasets import load_dataset

	max_seq_length = {config["max_seq_length"]}
	lora_rank = {config["lora_rank"]}

	# Load model with Unsloth optimizations
	model, tokenizer = FastLanguageModel.from_pretrained(
	model_name="{config["model_name"]}",
	max_seq_length=max_seq_length,
	load_in_4bit=True,
	dtype=None,
	)

	# Add LoRA adapters
	model = FastLanguageModel.get_peft_model(
	model,
	r=lora_rank,
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
	"gate_proj", "up_proj", "down_proj"],
	lora_alpha=lora_rank,
	lora_dropout=0,
	bias="none",
	use_gradient_checkpointing="unsloth",
	random_state=3407,
	)

	# Load and prepare dataset
	dataset = load_dataset("your_dataset", split="train")

	# Configure trainer
	trainer = SFTTrainer(
	model=model,
	tokenizer=tokenizer,
	train_dataset=dataset,
	args=SFTConfig(
	per_device_train_batch_size=2,
	gradient_accumulation_steps=4,
	warmup_steps=10,
	max_steps={preset["max_steps"]},
	learning_rate={config["learning_rate"]},
	optim="adamw_8bit",
	packing=True,
	max_length=max_seq_length,
	output_dir="{config["run_dir"]}",
	report_to="none",
	),
	)

	# Train
	trainer.train()

	# Save
	model.save_pretrained_merged("{config["run_dir"]}/merged", tokenizer, save_method="merged_16bit")
	'''


	def generate_rl_script(config):
	"""Generate RL training script."""
	preset = SAMPLE_PRESETS[config["sample_preset"]]
	rl_config = RL_METHODS[config["rl_method"]]["config"]

	return f'''
	# Unsloth RL Training Script ({config["rl_method"].upper()})
	# Generated: {config["timestamp"]}

	from unsloth import FastLanguageModel, PatchFastRL
	PatchFastRL("GRPO", FastLanguageModel) # CRITICAL: Must be BEFORE trl import

	from trl import GRPOConfig, GRPOTrainer
	from datasets import load_dataset

	max_seq_length = {config["max_seq_length"]}
	lora_rank = {config["lora_rank"]}

	# Load model with Unsloth optimizations + vLLM fast inference
	model, tokenizer = FastLanguageModel.from_pretrained(
	model_name="{config["model_name"]}",
	max_seq_length=max_seq_length,
	load_in_4bit=True,
	fast_inference=True,
	max_lora_rank=lora_rank,
	gpu_memory_utilization=0.6,
	)

	# Add LoRA adapters
	model = FastLanguageModel.get_peft_model(
	model,
	r=lora_rank,
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
	"gate_proj", "up_proj", "down_proj"],
	lora_alpha=lora_rank,
	use_gradient_checkpointing="unsloth",
	random_state=3407,
	)

	# Reward functions from pack: {config["reward_pack"]}
	def xmlcount_reward_func(completions, **kwargs):
	def count_xml(text):
	count = 0.0
	if text.count("<reasoning>\\n") == 1: count += 0.125
	if text.count("\\n</reasoning>\\n") == 1: count += 0.125
	if text.count("\\n<answer>\\n") == 1: count += 0.125
	if text.count("\\n</answer>") == 1: count += 0.125
	return count
	return [count_xml(c[0]["content"]) for c in completions]

	def correctness_reward_func(prompts, completions, answer, **kwargs):
	def extract_answer(text):
	if "<answer>" in text and "</answer>" in text:
	return text.split("<answer>")[-1].split("</answer>")[0].strip()
	return text.strip()
	responses = [c[0]["content"] for c in completions]
	extracted = [extract_answer(r) for r in responses]
	return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)]

	# Load dataset
	dataset = load_dataset("openai/gsm8k", "main", split="train")

	# Configure GRPO trainer
	training_args = GRPOConfig(
	output_dir="{config["run_dir"]}",
	learning_rate={config["learning_rate"]},
	per_device_train_batch_size=1,
	gradient_accumulation_steps=4,
	num_generations={config["num_generations"]},
	max_prompt_length=256,
	max_completion_length={config["max_seq_length"]} - 256,
	max_steps={preset["max_steps"]},
	temperature={config["temperature"]},
	loss_type="{rl_config.get("loss_type", "grpo")}",
	importance_sampling_level="{rl_config.get("importance_sampling_level", "token")}",
	optim="adamw_8bit",
	warmup_ratio=0.1,
	lr_scheduler_type="cosine",
	max_grad_norm=0.1,
	report_to="none",
	)

	# Initialize trainer
	trainer = GRPOTrainer(
	model=model,
	processing_class=tokenizer,
	reward_funcs=[xmlcount_reward_func, correctness_reward_func],
	args=training_args,
	train_dataset=dataset,
	)

	# Train
	trainer.train()

	# Save
	model.save_pretrained("{config["run_dir"]}/lora")
	tokenizer.save_pretrained("{config["run_dir"]}/lora")
	'''


	# ============================================================================
	# GRADIO UI
	# ============================================================================

	def create_ui():
	"""Create Gradio interface."""

	with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# Unsloth Training Hub")
	gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform")

	# Status
	status = get_environment_status()
	gr.Markdown(format_status_markdown(status))

	with gr.Tabs():
	# Tab 1: Model Selection
	with gr.Tab("1. Model Selection"):
	model_type = gr.Radio(
	choices=["text_llm", "vision_vlm", "embedding", "multimodal"],
	value="text_llm",
	label="Model Type",
	)

	model_dropdown = gr.Dropdown(
	choices=get_model_list("text_llm"),
	value="unsloth/Qwen2.5-7B-Instruct",
	label="Select Model",
	filterable=True,
	)

	def update_models(model_type):
	models = get_model_list(model_type)
	return gr.Dropdown(choices=models, value=models[0] if models else None)

	model_type.change(update_models, model_type, model_dropdown)

	# Tab 2: Training Mode
	with gr.Tab("2. Training Mode"):
	training_mode = gr.Radio(
	choices=["sft", "rl"],
	value="sft",
	label="Training Mode",
	info="SFT: Supervised Fine-Tuning \| RL: Reinforcement Learning"
	)

	with gr.Group(visible=False) as rl_options:
	rl_method = gr.Dropdown(
	choices=list(RL_METHODS.keys()),
	value="grpo",
	label="RL Method",
	)

	rl_info = gr.Markdown(RL_METHODS["grpo"]["description"])

	def update_rl_info(method):
	return RL_METHODS[method]["description"]

	rl_method.change(update_rl_info, rl_method, rl_info)

	reward_pack = gr.Dropdown(
	choices=list(REWARD_PACKS.keys()),
	value="reasoning_xml",
	label="Reward Pack",
	)

	custom_reward = gr.Code(
	label="Custom Reward Function (Optional)",
	language="python",
	value="# def custom_reward(completions, **kwargs):\n# return [1.0 for _ in completions]",
	)

	num_generations = gr.Slider(
	minimum=2, maximum=16, value=8, step=2,
	label="Generations per Prompt",
	)

	temperature = gr.Slider(
	minimum=0.1, maximum=2.0, value=1.0, step=0.1,
	label="Generation Temperature",
	)

	def toggle_rl_options(mode):
	return gr.Group(visible=(mode == "rl"))

	training_mode.change(toggle_rl_options, training_mode, rl_options)

	# Tab 3: Training Config
	with gr.Tab("3. Training Config"):
	sample_preset = gr.Radio(
	choices=list(SAMPLE_PRESETS.keys()),
	value="small_run",
	label="Sample Size Preset",
	)

	preset_info = gr.Markdown(
	f"{SAMPLE_PRESETS['small_run']['description']} - "
	f"{SAMPLE_PRESETS['small_run']['samples']} samples, "
	f"{SAMPLE_PRESETS['small_run']['max_steps']} steps"
	)

	def update_preset_info(preset):
	p = SAMPLE_PRESETS[preset]
	return f"{p['description']} - {p['samples']} samples, {p['max_steps']} steps"

	sample_preset.change(update_preset_info, sample_preset, preset_info)

	with gr.Row():
	lora_rank = gr.Dropdown(
	choices=[8, 16, 32, 64, 128],
	value=32,
	label="LoRA Rank",
	)

	learning_rate = gr.Number(
	value=5e-6,
	label="Learning Rate",
	)

	max_seq_length = gr.Dropdown(
	choices=[512, 1024, 2048, 4096, 8192, 16384, 32768],
	value=2048,
	label="Max Sequence Length",
	)

	# Tab 4: Output & Hub
	with gr.Tab("4. Output & Hub"):
	hub_model_id = gr.Textbox(
	value="wheattoast11/unsloth-trained-model",
	label="HuggingFace Hub Model ID",
	)

	push_to_hub = gr.Checkbox(
	value=True,
	label="Push to HuggingFace Hub after training",
	)

	output_format = gr.CheckboxGroup(
	choices=["merged_16bit", "merged_4bit", "lora", "gguf_q4_k_m", "gguf_q8_0"],
	value=["merged_16bit", "lora"],
	label="Output Formats",
	)

	# Start Training
	gr.Markdown("---")

	with gr.Row():
	start_btn = gr.Button("Generate Training Config", variant="primary", scale=2)
	execute_btn = gr.Button("Execute Training", variant="secondary", scale=1)

	output = gr.Markdown("Configure your training and click 'Generate Training Config'")

	start_btn.click(
	start_training,
	inputs=[
	model_dropdown,
	model_type,
	training_mode,
	rl_method,
	sample_preset,
	reward_pack,
	custom_reward,
	lora_rank,
	learning_rate,
	num_generations,
	temperature,
	max_seq_length,
	hub_model_id,
	push_to_hub,
	],
	outputs=output,
	)

	gr.Markdown("---")
	gr.Markdown("Intuition Labs \| Unsloth Training Hub \| L40S ~$1.80/hr - PAUSE when not training!")

	return demo


	if __name__ == "__main__":
	demo = create_ui()
	demo.launch(server_name="0.0.0.0", server_port=7860)