Spaces:

wheattoast11
/

unsloth-training-hub

Paused

App Files Files Community

wheattoast11 commited on Jan 25

Commit

1b9ca25

verified ·

1 Parent(s): 1482784

Update app.py

Browse files

Files changed (1) hide show

app.py +681 -56

app.py CHANGED Viewed

@@ -1,81 +1,706 @@
 """
-Unsloth Training Hub - LLM Fine-tuning & RL Platform
 Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO
 """
 import gradio as gr
 import os
 import json
 from datetime import datetime
-MODELS = [
-    "unsloth/Qwen2.5-7B-Instruct",
-    "unsloth/Qwen2.5-3B-Instruct",
-    "unsloth/Qwen2.5-14B-Instruct",
-    "unsloth/Meta-Llama-3.1-8B-Instruct",
-    "unsloth/DeepSeek-R1-Distill-Qwen-7B",
-    "unsloth/gemma-3-4b-it",
-    "unsloth/Phi-4-mini-instruct",
-]
-RL_METHODS = ["grpo", "gspo", "dr_grpo", "dapo", "bnpo", "dpo"]
-PRESETS = ["test_run", "small_run", "medium_run", "large_run", "grokking_run"]
-def get_status():
-    s = {"cuda": False, "gpu": "None", "unsloth": False, "vllm": False}
     try:
         import torch
-        s["cuda"] = torch.cuda.is_available()
-        if s["cuda"]: s["gpu"] = torch.cuda.get_device_name(0)
-    except: pass
     try:
         import unsloth
-        s["unsloth"] = True
-    except: pass
     try:
         import vllm
-        s["vllm"] = True
-    except: pass
-    return s
 def create_ui():
     with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# Unsloth Training Hub")
         gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform")
-        status = get_status()
-        gr.Markdown(f"**CUDA**: {status['cuda']} | **GPU**: {status['gpu']} | **Unsloth**: {status['unsloth']} | **vLLM**: {status['vllm']}")
         with gr.Tabs():
-            with gr.Tab("Model & Mode"):
-                model = gr.Dropdown(choices=MODELS, value=MODELS[0], label="Model")
-                mode = gr.Radio(choices=["sft", "rl"], value="sft", label="Training Mode")
-                rl_method = gr.Dropdown(choices=RL_METHODS, value="grpo", label="RL Method", visible=False)
-                mode.change(lambda m: gr.Dropdown(visible=m=="rl"), mode, rl_method)
-            with gr.Tab("Training Config"):
-                preset = gr.Radio(choices=PRESETS, value="small_run", label="Preset")
-                lora_rank = gr.Dropdown(choices=[8,16,32,64,128], value=32, label="LoRA Rank")
-                lr = gr.Number(value=5e-6, label="Learning Rate")
-            with gr.Tab("Output"):
-                hub_id = gr.Textbox(value="wheattoast11/trained-model", label="Hub Model ID")
-                push = gr.Checkbox(value=True, label="Push to Hub")
-        output = gr.Markdown("Configure and click Generate")
-        btn = gr.Button("Generate Training Script", variant="primary")
-        def generate(model, mode, rl_method, preset, lora_rank, lr, hub_id, push):
-            return f"**Model**: {model}
-**Mode**: {mode}
-**Preset**: {preset}
-**LoRA**: {lora_rank}
-**LR**: {lr}"
-        btn.click(generate, [model, mode, rl_method, preset, lora_rank, lr, hub_id, push], output)
-        gr.Markdown("---
-**Intuition Labs** | L40S ~$1.80/hr - PAUSE when not training!")
     return demo
 if __name__ == "__main__":
     demo = create_ui()
     demo.launch(server_name="0.0.0.0", server_port=7860)

 """
+Unsloth Training Hub - Comprehensive LLM Fine-tuning & RL Platform
 Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO
+Models: All Unsloth-optimized models (LLM, VLM, Embedding, Multimodal)
 """
 import gradio as gr
 import os
 import json
 from datetime import datetime
+# ============================================================================
+# MODEL CATALOG - All Unsloth Pre-optimized Models
+# ============================================================================
+UNSLOTH_MODELS = {
+    "text_llm": {
+        "Qwen3": [
+            "unsloth/Qwen3-0.6B",
+            "unsloth/Qwen3-1.7B",
+            "unsloth/Qwen3-4B",
+            "unsloth/Qwen3-8B",
+            "unsloth/Qwen3-14B",
+            "unsloth/Qwen3-32B",
+            "unsloth/Qwen3-30B-A3B",
+            "unsloth/Qwen3-235B-A22B",
+        ],
+        "Qwen2.5": [
+            "unsloth/Qwen2.5-0.5B-Instruct",
+            "unsloth/Qwen2.5-1.5B-Instruct",
+            "unsloth/Qwen2.5-3B-Instruct",
+            "unsloth/Qwen2.5-7B-Instruct",
+            "unsloth/Qwen2.5-14B-Instruct",
+            "unsloth/Qwen2.5-32B-Instruct",
+            "unsloth/Qwen2.5-72B-Instruct",
+        ],
+        "Qwen2.5-Coder": [
+            "unsloth/Qwen2.5-Coder-0.5B-Instruct",
+            "unsloth/Qwen2.5-Coder-1.5B-Instruct",
+            "unsloth/Qwen2.5-Coder-3B-Instruct",
+            "unsloth/Qwen2.5-Coder-7B-Instruct",
+            "unsloth/Qwen2.5-Coder-14B-Instruct",
+            "unsloth/Qwen2.5-Coder-32B-Instruct",
+        ],
+        "Llama-4": [
+            "unsloth/Llama-4-Scout-17B-16E-Instruct",
+            "unsloth/Llama-4-Maverick-17B-128E-Instruct",
+        ],
+        "Llama-3.3": [
+            "unsloth/Llama-3.3-70B-Instruct",
+        ],
+        "Llama-3.1": [
+            "unsloth/Meta-Llama-3.1-8B-Instruct",
+            "unsloth/Meta-Llama-3.1-70B-Instruct",
+            "unsloth/Meta-Llama-3.1-405B-Instruct",
+        ],
+        "Llama-3.2": [
+            "unsloth/Llama-3.2-1B-Instruct",
+            "unsloth/Llama-3.2-3B-Instruct",
+        ],
+        "DeepSeek-R1": [
+            "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
+            "unsloth/DeepSeek-R1-Distill-Qwen-7B",
+            "unsloth/DeepSeek-R1-Distill-Qwen-14B",
+            "unsloth/DeepSeek-R1-Distill-Qwen-32B",
+            "unsloth/DeepSeek-R1-Distill-Llama-8B",
+            "unsloth/DeepSeek-R1-Distill-Llama-70B",
+        ],
+        "Gemma-3": [
+            "unsloth/gemma-3-1b-it",
+            "unsloth/gemma-3-4b-it",
+            "unsloth/gemma-3-12b-it",
+            "unsloth/gemma-3-27b-it",
+        ],
+        "Mistral": [
+            "unsloth/Mistral-Small-3.2-24B-Instruct-2506",
+            "unsloth/mistral-7b-instruct-v0.3",
+            "unsloth/Mistral-Nemo-Instruct-2407",
+        ],
+        "Phi-4": [
+            "unsloth/Phi-4-mini-instruct",
+            "unsloth/Phi-4-Instruct",
+        ],
+        "GLM": [
+            "unsloth/GLM-4.7-Flash",
+            "unsloth/GLM-4.5-Air",
+        ],
+        "Nemotron": [
+            "unsloth/Nemotron-3-Nano-30B-A3B",
+        ],
+    },
+    "vision_vlm": {
+        "Qwen3-VL": [
+            "unsloth/Qwen3-VL-2B-Instruct",
+            "unsloth/Qwen3-VL-4B-Instruct",
+            "unsloth/Qwen3-VL-8B-Instruct",
+            "unsloth/Qwen3-VL-32B-Instruct",
+        ],
+        "Qwen2.5-VL": [
+            "unsloth/Qwen2.5-VL-3B-Instruct",
+            "unsloth/Qwen2.5-VL-7B-Instruct",
+            "unsloth/Qwen2.5-VL-32B-Instruct",
+            "unsloth/Qwen2.5-VL-72B-Instruct",
+        ],
+        "Llama-Vision": [
+            "unsloth/Llama-3.2-11B-Vision-Instruct",
+            "unsloth/Llama-3.2-90B-Vision-Instruct",
+        ],
+        "Pixtral": [
+            "unsloth/Pixtral-12B-2409",
+        ],
+        "Gemma-3-Vision": [
+            "unsloth/gemma-3-4b-it",  # Vision capable
+            "unsloth/gemma-3-12b-it",
+            "unsloth/gemma-3-27b-it",
+        ],
+    },
+    "embedding": [
+        "unsloth/Qwen3-Embedding-0.6B",
+        "unsloth/Qwen3-Embedding-4B",
+        "unsloth/Qwen3-Embedding-8B",
+        "unsloth/embeddinggemma-300m",
+        "unsloth/bge-m3",
+        "unsloth/ModernBERT-base",
+        "unsloth/ModernBERT-large",
+    ],
+    "multimodal_omni": [
+        "unsloth/Qwen2.5-Omni-3B",
+        "unsloth/Qwen2.5-Omni-7B",
+    ],
+}
+# ============================================================================
+# RL METHODS CONFIGURATION
+# ============================================================================
+RL_METHODS = {
+    "grpo": {
+        "name": "GRPO (Group Relative Policy Optimization)",
+        "description": "Token-level importance sampling. Default DeepSeek method.",
+        "config": {"loss_type": "grpo", "importance_sampling_level": "token"},
+    },
+    "gspo": {
+        "name": "GSPO (Group Sequence Policy Optimization)",
+        "description": "Sequence-level importance sampling. Qwen team variant.",
+        "config": {"loss_type": "grpo", "importance_sampling_level": "sequence"},
+    },
+    "dr_grpo": {
+        "name": "Dr-GRPO (Difficulty-Resilient GRPO)",
+        "description": "Avoids difficulty bias in training.",
+        "config": {"loss_type": "dr_grpo", "scale_rewards": False},
+    },
+    "dapo": {
+        "name": "DAPO (Direct Advantage Policy Optimization)",
+        "description": "Token-level normalization for long chain-of-thought.",
+        "config": {"loss_type": "dapo", "mask_truncated_completions": True},
+    },
+    "bnpo": {
+        "name": "BNPO (Bounded Natural Policy Optimization)",
+        "description": "Asymmetric clipping for better exploration.",
+        "config": {"loss_type": "bnpo", "epsilon": 0.2, "epsilon_high": 0.28, "delta": 1.5},
+    },
+    "dpo": {
+        "name": "DPO (Direct Preference Optimization)",
+        "description": "Preference-based training without reward model.",
+        "config": {"method": "dpo"},
+    },
+}
+# ============================================================================
+# SAMPLE SIZE PRESETS
+# ============================================================================
+SAMPLE_PRESETS = {
+    "test_run": {"samples": 100, "max_steps": 50, "description": "Quick test (5-10 min)"},
+    "small_run": {"samples": 1000, "max_steps": 250, "description": "Small training (30-60 min)"},
+    "medium_run": {"samples": 5000, "max_steps": 1000, "description": "Medium training (2-4 hours)"},
+    "large_run": {"samples": 25000, "max_steps": 5000, "description": "Large training (8-12 hours)"},
+    "grokking_run": {"samples": 100000, "max_steps": 50000, "description": "Grokking/extended (24+ hours)"},
+}
+# ============================================================================
+# REWARD FUNCTION PACKS
+# ============================================================================
+REWARD_PACKS = {
+    "reasoning_xml": {
+        "name": "XML Reasoning Format",
+        "description": "Rewards <reasoning>...</reasoning><answer>...</answer> format",
+        "functions": ["xmlcount_reward", "soft_format_reward", "strict_format_reward"],
+    },
+    "code_quality": {
+        "name": "Code Quality",
+        "description": "Rewards syntactically correct, well-formatted code",
+        "functions": ["syntax_reward", "docstring_reward", "type_hint_reward"],
+    },
+    "math_accuracy": {
+        "name": "Math Accuracy",
+        "description": "Rewards correct numerical answers with step verification",
+        "functions": ["correctness_reward", "int_reward", "step_count_reward"],
+    },
+    "instruction_following": {
+        "name": "Instruction Following",
+        "description": "Rewards adherence to specific output formats",
+        "functions": ["format_reward", "length_reward", "keyword_reward"],
+    },
+    "safety_alignment": {
+        "name": "Safety & Alignment",
+        "description": "Rewards helpful, harmless, honest outputs",
+        "functions": ["helpfulness_reward", "safety_reward", "factuality_reward"],
+    },
+}
+def get_environment_status():
+    """Check environment and return status."""
+    import subprocess
+    status = {
+        "cuda_available": False,
+        "gpu_name": "Not detected",
+        "gpu_memory": "Unknown",
+        "unsloth_installed": False,
+        "vllm_installed": False,
+        "trl_installed": False,
+        "anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")),
+        "hf_token": bool(os.environ.get("HF_TOKEN")),
+    }
     try:
         import torch
+        status["cuda_available"] = torch.cuda.is_available()
+        if status["cuda_available"]:
+            status["gpu_name"] = torch.cuda.get_device_name(0)
+            status["gpu_memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
+    except:
+        pass
     try:
         import unsloth
+        status["unsloth_installed"] = True
+    except:
+        pass
     try:
         import vllm
+        status["vllm_installed"] = True
+    except:
+        pass
+    try:
+        import trl
+        status["trl_installed"] = True
+    except:
+        pass
+    return status
+def format_status_markdown(status):
+    """Format status as markdown."""
+    lines = [
+        "## Environment Status\n",
+        f"- **CUDA**: {'Available' if status['cuda_available'] else 'Not available'}",
+        f"- **GPU**: {status['gpu_name']} ({status['gpu_memory']})",
+        f"- **Unsloth**: {'Installed' if status['unsloth_installed'] else 'Not installed'}",
+        f"- **vLLM**: {'Installed' if status['vllm_installed'] else 'Not installed'}",
+        f"- **TRL**: {'Installed' if status['trl_installed'] else 'Not installed'}",
+        f"- **ANTHROPIC_API_KEY**: {'Set' if status['anthropic_key'] else 'Not set'}",
+        f"- **HF_TOKEN**: {'Set' if status['hf_token'] else 'Not set'}",
+    ]
+    return "\n".join(lines)
+def get_model_list(model_type):
+    """Get flat list of models for given type."""
+    if model_type == "text_llm":
+        models = []
+        for family, family_models in UNSLOTH_MODELS["text_llm"].items():
+            models.extend(family_models)
+        return models
+    elif model_type == "vision_vlm":
+        models = []
+        for family, family_models in UNSLOTH_MODELS["vision_vlm"].items():
+            models.extend(family_models)
+        return models
+    elif model_type == "embedding":
+        return UNSLOTH_MODELS["embedding"]
+    elif model_type == "multimodal":
+        return UNSLOTH_MODELS["multimodal_omni"]
+    return []
+def start_training(
+    model_name,
+    model_type,
+    training_mode,
+    rl_method,
+    sample_preset,
+    reward_pack,
+    custom_reward_code,
+    lora_rank,
+    learning_rate,
+    num_generations,
+    temperature,
+    max_seq_length,
+    hub_model_id,
+    push_to_hub,
+):
+    """Start training with selected configuration."""
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    run_dir = f"/app/runs/{training_mode}_{timestamp}"
+    config = {
+        "model_name": model_name,
+        "model_type": model_type,
+        "training_mode": training_mode,
+        "rl_method": rl_method if training_mode == "rl" else None,
+        "sample_preset": sample_preset,
+        "reward_pack": reward_pack if training_mode == "rl" else None,
+        "lora_rank": lora_rank,
+        "learning_rate": learning_rate,
+        "num_generations": num_generations if training_mode == "rl" else None,
+        "temperature": temperature,
+        "max_seq_length": max_seq_length,
+        "hub_model_id": hub_model_id,
+        "push_to_hub": push_to_hub,
+        "run_dir": run_dir,
+        "timestamp": timestamp,
+    }
+    # Generate training script
+    if training_mode == "sft":
+        script = generate_sft_script(config)
+    else:
+        script = generate_rl_script(config)
+    return f"""
+## Training Configuration Saved
+**Run Directory**: `{run_dir}`
+**Timestamp**: {timestamp}
+### Configuration:
+```json
+{json.dumps(config, indent=2)}
+```
+### Generated Training Script:
+```python
+{script[:2000]}...
+```
+**Status**: Ready to execute. Click 'Execute Training' to start.
+"""
+def generate_sft_script(config):
+    """Generate SFT training script."""
+    preset = SAMPLE_PRESETS[config["sample_preset"]]
+    return f'''
+# Unsloth SFT Training Script
+# Generated: {config["timestamp"]}
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+max_seq_length = {config["max_seq_length"]}
+lora_rank = {config["lora_rank"]}
+# Load model with Unsloth optimizations
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="{config["model_name"]}",
+    max_seq_length=max_seq_length,
+    load_in_4bit=True,
+    dtype=None,
+)
+# Add LoRA adapters
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=lora_rank,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=lora_rank,
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+)
+# Load and prepare dataset
+dataset = load_dataset("your_dataset", split="train")
+# Configure trainer
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    args=SFTConfig(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        warmup_steps=10,
+        max_steps={preset["max_steps"]},
+        learning_rate={config["learning_rate"]},
+        optim="adamw_8bit",
+        packing=True,
+        max_length=max_seq_length,
+        output_dir="{config["run_dir"]}",
+        report_to="none",
+    ),
+)
+# Train
+trainer.train()
+# Save
+model.save_pretrained_merged("{config["run_dir"]}/merged", tokenizer, save_method="merged_16bit")
+'''
+def generate_rl_script(config):
+    """Generate RL training script."""
+    preset = SAMPLE_PRESETS[config["sample_preset"]]
+    rl_config = RL_METHODS[config["rl_method"]]["config"]
+    return f'''
+# Unsloth RL Training Script ({config["rl_method"].upper()})
+# Generated: {config["timestamp"]}
+from unsloth import FastLanguageModel, PatchFastRL
+PatchFastRL("GRPO", FastLanguageModel)  # CRITICAL: Must be BEFORE trl import
+from trl import GRPOConfig, GRPOTrainer
+from datasets import load_dataset
+max_seq_length = {config["max_seq_length"]}
+lora_rank = {config["lora_rank"]}
+# Load model with Unsloth optimizations + vLLM fast inference
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="{config["model_name"]}",
+    max_seq_length=max_seq_length,
+    load_in_4bit=True,
+    fast_inference=True,
+    max_lora_rank=lora_rank,
+    gpu_memory_utilization=0.6,
+)
+# Add LoRA adapters
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=lora_rank,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=lora_rank,
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+)
+# Reward functions from pack: {config["reward_pack"]}
+def xmlcount_reward_func(completions, **kwargs):
+    def count_xml(text):
+        count = 0.0
+        if text.count("<reasoning>\\n") == 1: count += 0.125
+        if text.count("\\n</reasoning>\\n") == 1: count += 0.125
+        if text.count("\\n<answer>\\n") == 1: count += 0.125
+        if text.count("\\n</answer>") == 1: count += 0.125
+        return count
+    return [count_xml(c[0]["content"]) for c in completions]
+def correctness_reward_func(prompts, completions, answer, **kwargs):
+    def extract_answer(text):
+        if "<answer>" in text and "</answer>" in text:
+            return text.split("<answer>")[-1].split("</answer>")[0].strip()
+        return text.strip()
+    responses = [c[0]["content"] for c in completions]
+    extracted = [extract_answer(r) for r in responses]
+    return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)]
+# Load dataset
+dataset = load_dataset("openai/gsm8k", "main", split="train")
+# Configure GRPO trainer
+training_args = GRPOConfig(
+    output_dir="{config["run_dir"]}",
+    learning_rate={config["learning_rate"]},
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    num_generations={config["num_generations"]},
+    max_prompt_length=256,
+    max_completion_length={config["max_seq_length"]} - 256,
+    max_steps={preset["max_steps"]},
+    temperature={config["temperature"]},
+    loss_type="{rl_config.get("loss_type", "grpo")}",
+    importance_sampling_level="{rl_config.get("importance_sampling_level", "token")}",
+    optim="adamw_8bit",
+    warmup_ratio=0.1,
+    lr_scheduler_type="cosine",
+    max_grad_norm=0.1,
+    report_to="none",
+)
+# Initialize trainer
+trainer = GRPOTrainer(
+    model=model,
+    processing_class=tokenizer,
+    reward_funcs=[xmlcount_reward_func, correctness_reward_func],
+    args=training_args,
+    train_dataset=dataset,
+)
+# Train
+trainer.train()
+# Save
+model.save_pretrained("{config["run_dir"]}/lora")
+tokenizer.save_pretrained("{config["run_dir"]}/lora")
+'''
+# ============================================================================
+# GRADIO UI
+# ============================================================================
 def create_ui():
+    """Create Gradio interface."""
     with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# Unsloth Training Hub")
         gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform")
+        # Status
+        status = get_environment_status()
+        gr.Markdown(format_status_markdown(status))
         with gr.Tabs():
+            # Tab 1: Model Selection
+            with gr.Tab("1. Model Selection"):
+                model_type = gr.Radio(
+                    choices=["text_llm", "vision_vlm", "embedding", "multimodal"],
+                    value="text_llm",
+                    label="Model Type",
+                )
+                model_dropdown = gr.Dropdown(
+                    choices=get_model_list("text_llm"),
+                    value="unsloth/Qwen2.5-7B-Instruct",
+                    label="Select Model",
+                    filterable=True,
+                )
+                def update_models(model_type):
+                    models = get_model_list(model_type)
+                    return gr.Dropdown(choices=models, value=models[0] if models else None)
+                model_type.change(update_models, model_type, model_dropdown)
+            # Tab 2: Training Mode
+            with gr.Tab("2. Training Mode"):
+                training_mode = gr.Radio(
+                    choices=["sft", "rl"],
+                    value="sft",
+                    label="Training Mode",
+                    info="SFT: Supervised Fine-Tuning | RL: Reinforcement Learning"
+                )
+                with gr.Group(visible=False) as rl_options:
+                    rl_method = gr.Dropdown(
+                        choices=list(RL_METHODS.keys()),
+                        value="grpo",
+                        label="RL Method",
+                    )
+                    rl_info = gr.Markdown(RL_METHODS["grpo"]["description"])
+                    def update_rl_info(method):
+                        return RL_METHODS[method]["description"]
+                    rl_method.change(update_rl_info, rl_method, rl_info)
+                    reward_pack = gr.Dropdown(
+                        choices=list(REWARD_PACKS.keys()),
+                        value="reasoning_xml",
+                        label="Reward Pack",
+                    )
+                    custom_reward = gr.Code(
+                        label="Custom Reward Function (Optional)",
+                        language="python",
+                        value="# def custom_reward(completions, **kwargs):\n#     return [1.0 for _ in completions]",
+                    )
+                    num_generations = gr.Slider(
+                        minimum=2, maximum=16, value=8, step=2,
+                        label="Generations per Prompt",
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1, maximum=2.0, value=1.0, step=0.1,
+                        label="Generation Temperature",
+                    )
+                def toggle_rl_options(mode):
+                    return gr.Group(visible=(mode == "rl"))
+                training_mode.change(toggle_rl_options, training_mode, rl_options)
+            # Tab 3: Training Config
+            with gr.Tab("3. Training Config"):
+                sample_preset = gr.Radio(
+                    choices=list(SAMPLE_PRESETS.keys()),
+                    value="small_run",
+                    label="Sample Size Preset",
+                )
+                preset_info = gr.Markdown(
+                    f"**{SAMPLE_PRESETS['small_run']['description']}** - "
+                    f"{SAMPLE_PRESETS['small_run']['samples']} samples, "
+                    f"{SAMPLE_PRESETS['small_run']['max_steps']} steps"
+                )
+                def update_preset_info(preset):
+                    p = SAMPLE_PRESETS[preset]
+                    return f"**{p['description']}** - {p['samples']} samples, {p['max_steps']} steps"
+                sample_preset.change(update_preset_info, sample_preset, preset_info)
+                with gr.Row():
+                    lora_rank = gr.Dropdown(
+                        choices=[8, 16, 32, 64, 128],
+                        value=32,
+                        label="LoRA Rank",
+                    )
+                    learning_rate = gr.Number(
+                        value=5e-6,
+                        label="Learning Rate",
+                    )
+                    max_seq_length = gr.Dropdown(
+                        choices=[512, 1024, 2048, 4096, 8192, 16384, 32768],
+                        value=2048,
+                        label="Max Sequence Length",
+                    )
+            # Tab 4: Output & Hub
+            with gr.Tab("4. Output & Hub"):
+                hub_model_id = gr.Textbox(
+                    value="wheattoast11/unsloth-trained-model",
+                    label="HuggingFace Hub Model ID",
+                )
+                push_to_hub = gr.Checkbox(
+                    value=True,
+                    label="Push to HuggingFace Hub after training",
+                )
+                output_format = gr.CheckboxGroup(
+                    choices=["merged_16bit", "merged_4bit", "lora", "gguf_q4_k_m", "gguf_q8_0"],
+                    value=["merged_16bit", "lora"],
+                    label="Output Formats",
+                )
+        # Start Training
+        gr.Markdown("---")
+        with gr.Row():
+            start_btn = gr.Button("Generate Training Config", variant="primary", scale=2)
+            execute_btn = gr.Button("Execute Training", variant="secondary", scale=1)
+        output = gr.Markdown("Configure your training and click 'Generate Training Config'")
+        start_btn.click(
+            start_training,
+            inputs=[
+                model_dropdown,
+                model_type,
+                training_mode,
+                rl_method,
+                sample_preset,
+                reward_pack,
+                custom_reward,
+                lora_rank,
+                learning_rate,
+                num_generations,
+                temperature,
+                max_seq_length,
+                hub_model_id,
+                push_to_hub,
+            ],
+            outputs=output,
+        )
+        gr.Markdown("---")
+        gr.Markdown("**Intuition Labs** | Unsloth Training Hub | L40S ~$1.80/hr - PAUSE when not training!")
     return demo
 if __name__ == "__main__":
     demo = create_ui()
     demo.launch(server_name="0.0.0.0", server_port=7860)