""" Unsloth Training Hub - Comprehensive LLM Fine-tuning & RL Platform Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO Models: All Unsloth-optimized models (LLM, VLM, Embedding, Multimodal) """ import gradio as gr import os import json from datetime import datetime # ============================================================================ # MODEL CATALOG - All Unsloth Pre-optimized Models # ============================================================================ UNSLOTH_MODELS = { "text_llm": { "Qwen3": [ "unsloth/Qwen3-0.6B", "unsloth/Qwen3-1.7B", "unsloth/Qwen3-4B", "unsloth/Qwen3-8B", "unsloth/Qwen3-14B", "unsloth/Qwen3-32B", "unsloth/Qwen3-30B-A3B", "unsloth/Qwen3-235B-A22B", ], "Qwen2.5": [ "unsloth/Qwen2.5-0.5B-Instruct", "unsloth/Qwen2.5-1.5B-Instruct", "unsloth/Qwen2.5-3B-Instruct", "unsloth/Qwen2.5-7B-Instruct", "unsloth/Qwen2.5-14B-Instruct", "unsloth/Qwen2.5-32B-Instruct", "unsloth/Qwen2.5-72B-Instruct", ], "Qwen2.5-Coder": [ "unsloth/Qwen2.5-Coder-0.5B-Instruct", "unsloth/Qwen2.5-Coder-1.5B-Instruct", "unsloth/Qwen2.5-Coder-3B-Instruct", "unsloth/Qwen2.5-Coder-7B-Instruct", "unsloth/Qwen2.5-Coder-14B-Instruct", "unsloth/Qwen2.5-Coder-32B-Instruct", ], "Llama-4": [ "unsloth/Llama-4-Scout-17B-16E-Instruct", "unsloth/Llama-4-Maverick-17B-128E-Instruct", ], "Llama-3.3": [ "unsloth/Llama-3.3-70B-Instruct", ], "Llama-3.1": [ "unsloth/Meta-Llama-3.1-8B-Instruct", "unsloth/Meta-Llama-3.1-70B-Instruct", "unsloth/Meta-Llama-3.1-405B-Instruct", ], "Llama-3.2": [ "unsloth/Llama-3.2-1B-Instruct", "unsloth/Llama-3.2-3B-Instruct", ], "DeepSeek-R1": [ "unsloth/DeepSeek-R1-Distill-Qwen-1.5B", "unsloth/DeepSeek-R1-Distill-Qwen-7B", "unsloth/DeepSeek-R1-Distill-Qwen-14B", "unsloth/DeepSeek-R1-Distill-Qwen-32B", "unsloth/DeepSeek-R1-Distill-Llama-8B", "unsloth/DeepSeek-R1-Distill-Llama-70B", ], "Gemma-3": [ "unsloth/gemma-3-1b-it", "unsloth/gemma-3-4b-it", "unsloth/gemma-3-12b-it", "unsloth/gemma-3-27b-it", ], "Mistral": [ "unsloth/Mistral-Small-3.2-24B-Instruct-2506", "unsloth/mistral-7b-instruct-v0.3", "unsloth/Mistral-Nemo-Instruct-2407", ], "Phi-4": [ "unsloth/Phi-4-mini-instruct", "unsloth/Phi-4-Instruct", ], "GLM": [ "unsloth/GLM-4.7-Flash", "unsloth/GLM-4.5-Air", ], "Nemotron": [ "unsloth/Nemotron-3-Nano-30B-A3B", ], }, "vision_vlm": { "Qwen3-VL": [ "unsloth/Qwen3-VL-2B-Instruct", "unsloth/Qwen3-VL-4B-Instruct", "unsloth/Qwen3-VL-8B-Instruct", "unsloth/Qwen3-VL-32B-Instruct", ], "Qwen2.5-VL": [ "unsloth/Qwen2.5-VL-3B-Instruct", "unsloth/Qwen2.5-VL-7B-Instruct", "unsloth/Qwen2.5-VL-32B-Instruct", "unsloth/Qwen2.5-VL-72B-Instruct", ], "Llama-Vision": [ "unsloth/Llama-3.2-11B-Vision-Instruct", "unsloth/Llama-3.2-90B-Vision-Instruct", ], "Pixtral": [ "unsloth/Pixtral-12B-2409", ], "Gemma-3-Vision": [ "unsloth/gemma-3-4b-it", # Vision capable "unsloth/gemma-3-12b-it", "unsloth/gemma-3-27b-it", ], }, "embedding": [ "unsloth/Qwen3-Embedding-0.6B", "unsloth/Qwen3-Embedding-4B", "unsloth/Qwen3-Embedding-8B", "unsloth/embeddinggemma-300m", "unsloth/bge-m3", "unsloth/ModernBERT-base", "unsloth/ModernBERT-large", ], "multimodal_omni": [ "unsloth/Qwen2.5-Omni-3B", "unsloth/Qwen2.5-Omni-7B", ], } # ============================================================================ # RL METHODS CONFIGURATION # ============================================================================ RL_METHODS = { "grpo": { "name": "GRPO (Group Relative Policy Optimization)", "description": "Token-level importance sampling. Default DeepSeek method.", "config": {"loss_type": "grpo", "importance_sampling_level": "token"}, }, "gspo": { "name": "GSPO (Group Sequence Policy Optimization)", "description": "Sequence-level importance sampling. Qwen team variant.", "config": {"loss_type": "grpo", "importance_sampling_level": "sequence"}, }, "dr_grpo": { "name": "Dr-GRPO (Difficulty-Resilient GRPO)", "description": "Avoids difficulty bias in training.", "config": {"loss_type": "dr_grpo", "scale_rewards": False}, }, "dapo": { "name": "DAPO (Direct Advantage Policy Optimization)", "description": "Token-level normalization for long chain-of-thought.", "config": {"loss_type": "dapo", "mask_truncated_completions": True}, }, "bnpo": { "name": "BNPO (Bounded Natural Policy Optimization)", "description": "Asymmetric clipping for better exploration.", "config": {"loss_type": "bnpo", "epsilon": 0.2, "epsilon_high": 0.28, "delta": 1.5}, }, "dpo": { "name": "DPO (Direct Preference Optimization)", "description": "Preference-based training without reward model.", "config": {"method": "dpo"}, }, } # ============================================================================ # SAMPLE SIZE PRESETS # ============================================================================ SAMPLE_PRESETS = { "test_run": {"samples": 100, "max_steps": 50, "description": "Quick test (5-10 min)"}, "small_run": {"samples": 1000, "max_steps": 250, "description": "Small training (30-60 min)"}, "medium_run": {"samples": 5000, "max_steps": 1000, "description": "Medium training (2-4 hours)"}, "large_run": {"samples": 25000, "max_steps": 5000, "description": "Large training (8-12 hours)"}, "grokking_run": {"samples": 100000, "max_steps": 50000, "description": "Grokking/extended (24+ hours)"}, } # ============================================================================ # REWARD FUNCTION PACKS # ============================================================================ REWARD_PACKS = { "reasoning_xml": { "name": "XML Reasoning Format", "description": "Rewards ...... format", "functions": ["xmlcount_reward", "soft_format_reward", "strict_format_reward"], }, "code_quality": { "name": "Code Quality", "description": "Rewards syntactically correct, well-formatted code", "functions": ["syntax_reward", "docstring_reward", "type_hint_reward"], }, "math_accuracy": { "name": "Math Accuracy", "description": "Rewards correct numerical answers with step verification", "functions": ["correctness_reward", "int_reward", "step_count_reward"], }, "instruction_following": { "name": "Instruction Following", "description": "Rewards adherence to specific output formats", "functions": ["format_reward", "length_reward", "keyword_reward"], }, "safety_alignment": { "name": "Safety & Alignment", "description": "Rewards helpful, harmless, honest outputs", "functions": ["helpfulness_reward", "safety_reward", "factuality_reward"], }, } def get_environment_status(): """Check environment and return status.""" import subprocess status = { "cuda_available": False, "gpu_name": "Not detected", "gpu_memory": "Unknown", "unsloth_installed": False, "vllm_installed": False, "trl_installed": False, "anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")), "hf_token": bool(os.environ.get("HF_TOKEN")), } try: import torch status["cuda_available"] = torch.cuda.is_available() if status["cuda_available"]: status["gpu_name"] = torch.cuda.get_device_name(0) status["gpu_memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB" except: pass try: import unsloth status["unsloth_installed"] = True except: pass try: import vllm status["vllm_installed"] = True except: pass try: import trl status["trl_installed"] = True except: pass return status def format_status_markdown(status): """Format status as markdown.""" lines = [ "## Environment Status\n", f"- **CUDA**: {'Available' if status['cuda_available'] else 'Not available'}", f"- **GPU**: {status['gpu_name']} ({status['gpu_memory']})", f"- **Unsloth**: {'Installed' if status['unsloth_installed'] else 'Not installed'}", f"- **vLLM**: {'Installed' if status['vllm_installed'] else 'Not installed'}", f"- **TRL**: {'Installed' if status['trl_installed'] else 'Not installed'}", f"- **ANTHROPIC_API_KEY**: {'Set' if status['anthropic_key'] else 'Not set'}", f"- **HF_TOKEN**: {'Set' if status['hf_token'] else 'Not set'}", ] return "\n".join(lines) def get_model_list(model_type): """Get flat list of models for given type.""" if model_type == "text_llm": models = [] for family, family_models in UNSLOTH_MODELS["text_llm"].items(): models.extend(family_models) return models elif model_type == "vision_vlm": models = [] for family, family_models in UNSLOTH_MODELS["vision_vlm"].items(): models.extend(family_models) return models elif model_type == "embedding": return UNSLOTH_MODELS["embedding"] elif model_type == "multimodal": return UNSLOTH_MODELS["multimodal_omni"] return [] def start_training( model_name, model_type, training_mode, rl_method, sample_preset, reward_pack, custom_reward_code, lora_rank, learning_rate, num_generations, temperature, max_seq_length, hub_model_id, push_to_hub, ): """Start training with selected configuration.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") run_dir = f"/app/runs/{training_mode}_{timestamp}" config = { "model_name": model_name, "model_type": model_type, "training_mode": training_mode, "rl_method": rl_method if training_mode == "rl" else None, "sample_preset": sample_preset, "reward_pack": reward_pack if training_mode == "rl" else None, "lora_rank": lora_rank, "learning_rate": learning_rate, "num_generations": num_generations if training_mode == "rl" else None, "temperature": temperature, "max_seq_length": max_seq_length, "hub_model_id": hub_model_id, "push_to_hub": push_to_hub, "run_dir": run_dir, "timestamp": timestamp, } # Generate training script if training_mode == "sft": script = generate_sft_script(config) else: script = generate_rl_script(config) return f""" ## Training Configuration Saved **Run Directory**: `{run_dir}` **Timestamp**: {timestamp} ### Configuration: ```json {json.dumps(config, indent=2)} ``` ### Generated Training Script: ```python {script[:2000]}... ``` **Status**: Ready to execute. Click 'Execute Training' to start. """ def generate_sft_script(config): """Generate SFT training script.""" preset = SAMPLE_PRESETS[config["sample_preset"]] return f''' # Unsloth SFT Training Script # Generated: {config["timestamp"]} from unsloth import FastLanguageModel from trl import SFTTrainer, SFTConfig from datasets import load_dataset max_seq_length = {config["max_seq_length"]} lora_rank = {config["lora_rank"]} # Load model with Unsloth optimizations model, tokenizer = FastLanguageModel.from_pretrained( model_name="{config["model_name"]}", max_seq_length=max_seq_length, load_in_4bit=True, dtype=None, ) # Add LoRA adapters model = FastLanguageModel.get_peft_model( model, r=lora_rank, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha=lora_rank, lora_dropout=0, bias="none", use_gradient_checkpointing="unsloth", random_state=3407, ) # Load and prepare dataset dataset = load_dataset("your_dataset", split="train") # Configure trainer trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=dataset, args=SFTConfig( per_device_train_batch_size=2, gradient_accumulation_steps=4, warmup_steps=10, max_steps={preset["max_steps"]}, learning_rate={config["learning_rate"]}, optim="adamw_8bit", packing=True, max_length=max_seq_length, output_dir="{config["run_dir"]}", report_to="none", ), ) # Train trainer.train() # Save model.save_pretrained_merged("{config["run_dir"]}/merged", tokenizer, save_method="merged_16bit") ''' def generate_rl_script(config): """Generate RL training script.""" preset = SAMPLE_PRESETS[config["sample_preset"]] rl_config = RL_METHODS[config["rl_method"]]["config"] return f''' # Unsloth RL Training Script ({config["rl_method"].upper()}) # Generated: {config["timestamp"]} from unsloth import FastLanguageModel, PatchFastRL PatchFastRL("GRPO", FastLanguageModel) # CRITICAL: Must be BEFORE trl import from trl import GRPOConfig, GRPOTrainer from datasets import load_dataset max_seq_length = {config["max_seq_length"]} lora_rank = {config["lora_rank"]} # Load model with Unsloth optimizations + vLLM fast inference model, tokenizer = FastLanguageModel.from_pretrained( model_name="{config["model_name"]}", max_seq_length=max_seq_length, load_in_4bit=True, fast_inference=True, max_lora_rank=lora_rank, gpu_memory_utilization=0.6, ) # Add LoRA adapters model = FastLanguageModel.get_peft_model( model, r=lora_rank, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha=lora_rank, use_gradient_checkpointing="unsloth", random_state=3407, ) # Reward functions from pack: {config["reward_pack"]} def xmlcount_reward_func(completions, **kwargs): def count_xml(text): count = 0.0 if text.count("\\n") == 1: count += 0.125 if text.count("\\n\\n") == 1: count += 0.125 if text.count("\\n\\n") == 1: count += 0.125 if text.count("\\n") == 1: count += 0.125 return count return [count_xml(c[0]["content"]) for c in completions] def correctness_reward_func(prompts, completions, answer, **kwargs): def extract_answer(text): if "" in text and "" in text: return text.split("")[-1].split("")[0].strip() return text.strip() responses = [c[0]["content"] for c in completions] extracted = [extract_answer(r) for r in responses] return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)] # Load dataset dataset = load_dataset("openai/gsm8k", "main", split="train") # Configure GRPO trainer training_args = GRPOConfig( output_dir="{config["run_dir"]}", learning_rate={config["learning_rate"]}, per_device_train_batch_size=1, gradient_accumulation_steps=4, num_generations={config["num_generations"]}, max_prompt_length=256, max_completion_length={config["max_seq_length"]} - 256, max_steps={preset["max_steps"]}, temperature={config["temperature"]}, loss_type="{rl_config.get("loss_type", "grpo")}", importance_sampling_level="{rl_config.get("importance_sampling_level", "token")}", optim="adamw_8bit", warmup_ratio=0.1, lr_scheduler_type="cosine", max_grad_norm=0.1, report_to="none", ) # Initialize trainer trainer = GRPOTrainer( model=model, processing_class=tokenizer, reward_funcs=[xmlcount_reward_func, correctness_reward_func], args=training_args, train_dataset=dataset, ) # Train trainer.train() # Save model.save_pretrained("{config["run_dir"]}/lora") tokenizer.save_pretrained("{config["run_dir"]}/lora") ''' # ============================================================================ # GRADIO UI # ============================================================================ def create_ui(): """Create Gradio interface.""" with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo: gr.Markdown("# Unsloth Training Hub") gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform") # Status status = get_environment_status() gr.Markdown(format_status_markdown(status)) with gr.Tabs(): # Tab 1: Model Selection with gr.Tab("1. Model Selection"): model_type = gr.Radio( choices=["text_llm", "vision_vlm", "embedding", "multimodal"], value="text_llm", label="Model Type", ) model_dropdown = gr.Dropdown( choices=get_model_list("text_llm"), value="unsloth/Qwen2.5-7B-Instruct", label="Select Model", filterable=True, ) def update_models(model_type): models = get_model_list(model_type) return gr.Dropdown(choices=models, value=models[0] if models else None) model_type.change(update_models, model_type, model_dropdown) # Tab 2: Training Mode with gr.Tab("2. Training Mode"): training_mode = gr.Radio( choices=["sft", "rl"], value="sft", label="Training Mode", info="SFT: Supervised Fine-Tuning | RL: Reinforcement Learning" ) with gr.Group(visible=False) as rl_options: rl_method = gr.Dropdown( choices=list(RL_METHODS.keys()), value="grpo", label="RL Method", ) rl_info = gr.Markdown(RL_METHODS["grpo"]["description"]) def update_rl_info(method): return RL_METHODS[method]["description"] rl_method.change(update_rl_info, rl_method, rl_info) reward_pack = gr.Dropdown( choices=list(REWARD_PACKS.keys()), value="reasoning_xml", label="Reward Pack", ) custom_reward = gr.Code( label="Custom Reward Function (Optional)", language="python", value="# def custom_reward(completions, **kwargs):\n# return [1.0 for _ in completions]", ) num_generations = gr.Slider( minimum=2, maximum=16, value=8, step=2, label="Generations per Prompt", ) temperature = gr.Slider( minimum=0.1, maximum=2.0, value=1.0, step=0.1, label="Generation Temperature", ) def toggle_rl_options(mode): return gr.Group(visible=(mode == "rl")) training_mode.change(toggle_rl_options, training_mode, rl_options) # Tab 3: Training Config with gr.Tab("3. Training Config"): sample_preset = gr.Radio( choices=list(SAMPLE_PRESETS.keys()), value="small_run", label="Sample Size Preset", ) preset_info = gr.Markdown( f"**{SAMPLE_PRESETS['small_run']['description']}** - " f"{SAMPLE_PRESETS['small_run']['samples']} samples, " f"{SAMPLE_PRESETS['small_run']['max_steps']} steps" ) def update_preset_info(preset): p = SAMPLE_PRESETS[preset] return f"**{p['description']}** - {p['samples']} samples, {p['max_steps']} steps" sample_preset.change(update_preset_info, sample_preset, preset_info) with gr.Row(): lora_rank = gr.Dropdown( choices=[8, 16, 32, 64, 128], value=32, label="LoRA Rank", ) learning_rate = gr.Number( value=5e-6, label="Learning Rate", ) max_seq_length = gr.Dropdown( choices=[512, 1024, 2048, 4096, 8192, 16384, 32768], value=2048, label="Max Sequence Length", ) # Tab 4: Output & Hub with gr.Tab("4. Output & Hub"): hub_model_id = gr.Textbox( value="wheattoast11/unsloth-trained-model", label="HuggingFace Hub Model ID", ) push_to_hub = gr.Checkbox( value=True, label="Push to HuggingFace Hub after training", ) output_format = gr.CheckboxGroup( choices=["merged_16bit", "merged_4bit", "lora", "gguf_q4_k_m", "gguf_q8_0"], value=["merged_16bit", "lora"], label="Output Formats", ) # Start Training gr.Markdown("---") with gr.Row(): start_btn = gr.Button("Generate Training Config", variant="primary", scale=2) execute_btn = gr.Button("Execute Training", variant="secondary", scale=1) output = gr.Markdown("Configure your training and click 'Generate Training Config'") start_btn.click( start_training, inputs=[ model_dropdown, model_type, training_mode, rl_method, sample_preset, reward_pack, custom_reward, lora_rank, learning_rate, num_generations, temperature, max_seq_length, hub_model_id, push_to_hub, ], outputs=output, ) gr.Markdown("---") gr.Markdown("**Intuition Labs** | Unsloth Training Hub | L40S ~$1.80/hr - PAUSE when not training!") return demo if __name__ == "__main__": demo = create_ui() demo.launch(server_name="0.0.0.0", server_port=7860)