"""
Unsloth Training Hub - Comprehensive LLM Fine-tuning & RL Platform
Supports: SFT, GRPO, GSPO, DPO, Dr-GRPO, DAPO, BNPO
Models: All Unsloth-optimized models (LLM, VLM, Embedding, Multimodal)
"""

import gradio as gr
import os
import json
from datetime import datetime

# ============================================================================
# MODEL CATALOG - All Unsloth Pre-optimized Models
# ============================================================================

UNSLOTH_MODELS = {
    "text_llm": {
        "Qwen3": [
            "unsloth/Qwen3-0.6B",
            "unsloth/Qwen3-1.7B",
            "unsloth/Qwen3-4B",
            "unsloth/Qwen3-8B",
            "unsloth/Qwen3-14B",
            "unsloth/Qwen3-32B",
            "unsloth/Qwen3-30B-A3B",
            "unsloth/Qwen3-235B-A22B",
        ],
        "Qwen2.5": [
            "unsloth/Qwen2.5-0.5B-Instruct",
            "unsloth/Qwen2.5-1.5B-Instruct",
            "unsloth/Qwen2.5-3B-Instruct",
            "unsloth/Qwen2.5-7B-Instruct",
            "unsloth/Qwen2.5-14B-Instruct",
            "unsloth/Qwen2.5-32B-Instruct",
            "unsloth/Qwen2.5-72B-Instruct",
        ],
        "Qwen2.5-Coder": [
            "unsloth/Qwen2.5-Coder-0.5B-Instruct",
            "unsloth/Qwen2.5-Coder-1.5B-Instruct",
            "unsloth/Qwen2.5-Coder-3B-Instruct",
            "unsloth/Qwen2.5-Coder-7B-Instruct",
            "unsloth/Qwen2.5-Coder-14B-Instruct",
            "unsloth/Qwen2.5-Coder-32B-Instruct",
        ],
        "Llama-4": [
            "unsloth/Llama-4-Scout-17B-16E-Instruct",
            "unsloth/Llama-4-Maverick-17B-128E-Instruct",
        ],
        "Llama-3.3": [
            "unsloth/Llama-3.3-70B-Instruct",
        ],
        "Llama-3.1": [
            "unsloth/Meta-Llama-3.1-8B-Instruct",
            "unsloth/Meta-Llama-3.1-70B-Instruct",
            "unsloth/Meta-Llama-3.1-405B-Instruct",
        ],
        "Llama-3.2": [
            "unsloth/Llama-3.2-1B-Instruct",
            "unsloth/Llama-3.2-3B-Instruct",
        ],
        "DeepSeek-R1": [
            "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
            "unsloth/DeepSeek-R1-Distill-Qwen-7B",
            "unsloth/DeepSeek-R1-Distill-Qwen-14B",
            "unsloth/DeepSeek-R1-Distill-Qwen-32B",
            "unsloth/DeepSeek-R1-Distill-Llama-8B",
            "unsloth/DeepSeek-R1-Distill-Llama-70B",
        ],
        "Gemma-3": [
            "unsloth/gemma-3-1b-it",
            "unsloth/gemma-3-4b-it",
            "unsloth/gemma-3-12b-it",
            "unsloth/gemma-3-27b-it",
        ],
        "Mistral": [
            "unsloth/Mistral-Small-3.2-24B-Instruct-2506",
            "unsloth/mistral-7b-instruct-v0.3",
            "unsloth/Mistral-Nemo-Instruct-2407",
        ],
        "Phi-4": [
            "unsloth/Phi-4-mini-instruct",
            "unsloth/Phi-4-Instruct",
        ],
        "GLM": [
            "unsloth/GLM-4.7-Flash",
            "unsloth/GLM-4.5-Air",
        ],
        "Nemotron": [
            "unsloth/Nemotron-3-Nano-30B-A3B",
        ],
    },
    "vision_vlm": {
        "Qwen3-VL": [
            "unsloth/Qwen3-VL-2B-Instruct",
            "unsloth/Qwen3-VL-4B-Instruct",
            "unsloth/Qwen3-VL-8B-Instruct",
            "unsloth/Qwen3-VL-32B-Instruct",
        ],
        "Qwen2.5-VL": [
            "unsloth/Qwen2.5-VL-3B-Instruct",
            "unsloth/Qwen2.5-VL-7B-Instruct",
            "unsloth/Qwen2.5-VL-32B-Instruct",
            "unsloth/Qwen2.5-VL-72B-Instruct",
        ],
        "Llama-Vision": [
            "unsloth/Llama-3.2-11B-Vision-Instruct",
            "unsloth/Llama-3.2-90B-Vision-Instruct",
        ],
        "Pixtral": [
            "unsloth/Pixtral-12B-2409",
        ],
        "Gemma-3-Vision": [
            "unsloth/gemma-3-4b-it",  # Vision capable
            "unsloth/gemma-3-12b-it",
            "unsloth/gemma-3-27b-it",
        ],
    },
    "embedding": [
        "unsloth/Qwen3-Embedding-0.6B",
        "unsloth/Qwen3-Embedding-4B",
        "unsloth/Qwen3-Embedding-8B",
        "unsloth/embeddinggemma-300m",
        "unsloth/bge-m3",
        "unsloth/ModernBERT-base",
        "unsloth/ModernBERT-large",
    ],
    "multimodal_omni": [
        "unsloth/Qwen2.5-Omni-3B",
        "unsloth/Qwen2.5-Omni-7B",
    ],
}

# ============================================================================
# RL METHODS CONFIGURATION
# ============================================================================

RL_METHODS = {
    "grpo": {
        "name": "GRPO (Group Relative Policy Optimization)",
        "description": "Token-level importance sampling. Default DeepSeek method.",
        "config": {"loss_type": "grpo", "importance_sampling_level": "token"},
    },
    "gspo": {
        "name": "GSPO (Group Sequence Policy Optimization)",
        "description": "Sequence-level importance sampling. Qwen team variant.",
        "config": {"loss_type": "grpo", "importance_sampling_level": "sequence"},
    },
    "dr_grpo": {
        "name": "Dr-GRPO (Difficulty-Resilient GRPO)",
        "description": "Avoids difficulty bias in training.",
        "config": {"loss_type": "dr_grpo", "scale_rewards": False},
    },
    "dapo": {
        "name": "DAPO (Direct Advantage Policy Optimization)",
        "description": "Token-level normalization for long chain-of-thought.",
        "config": {"loss_type": "dapo", "mask_truncated_completions": True},
    },
    "bnpo": {
        "name": "BNPO (Bounded Natural Policy Optimization)",
        "description": "Asymmetric clipping for better exploration.",
        "config": {"loss_type": "bnpo", "epsilon": 0.2, "epsilon_high": 0.28, "delta": 1.5},
    },
    "dpo": {
        "name": "DPO (Direct Preference Optimization)",
        "description": "Preference-based training without reward model.",
        "config": {"method": "dpo"},
    },
}

# ============================================================================
# SAMPLE SIZE PRESETS
# ============================================================================

SAMPLE_PRESETS = {
    "test_run": {"samples": 100, "max_steps": 50, "description": "Quick test (5-10 min)"},
    "small_run": {"samples": 1000, "max_steps": 250, "description": "Small training (30-60 min)"},
    "medium_run": {"samples": 5000, "max_steps": 1000, "description": "Medium training (2-4 hours)"},
    "large_run": {"samples": 25000, "max_steps": 5000, "description": "Large training (8-12 hours)"},
    "grokking_run": {"samples": 100000, "max_steps": 50000, "description": "Grokking/extended (24+ hours)"},
}

# ============================================================================
# REWARD FUNCTION PACKS
# ============================================================================

REWARD_PACKS = {
    "reasoning_xml": {
        "name": "XML Reasoning Format",
        "description": "Rewards <reasoning>...</reasoning><answer>...</answer> format",
        "functions": ["xmlcount_reward", "soft_format_reward", "strict_format_reward"],
    },
    "code_quality": {
        "name": "Code Quality",
        "description": "Rewards syntactically correct, well-formatted code",
        "functions": ["syntax_reward", "docstring_reward", "type_hint_reward"],
    },
    "math_accuracy": {
        "name": "Math Accuracy",
        "description": "Rewards correct numerical answers with step verification",
        "functions": ["correctness_reward", "int_reward", "step_count_reward"],
    },
    "instruction_following": {
        "name": "Instruction Following",
        "description": "Rewards adherence to specific output formats",
        "functions": ["format_reward", "length_reward", "keyword_reward"],
    },
    "safety_alignment": {
        "name": "Safety & Alignment",
        "description": "Rewards helpful, harmless, honest outputs",
        "functions": ["helpfulness_reward", "safety_reward", "factuality_reward"],
    },
}


def get_environment_status():
    """Check environment and return status."""
    import subprocess

    status = {
        "cuda_available": False,
        "gpu_name": "Not detected",
        "gpu_memory": "Unknown",
        "unsloth_installed": False,
        "vllm_installed": False,
        "trl_installed": False,
        "anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")),
        "hf_token": bool(os.environ.get("HF_TOKEN")),
    }

    try:
        import torch
        status["cuda_available"] = torch.cuda.is_available()
        if status["cuda_available"]:
            status["gpu_name"] = torch.cuda.get_device_name(0)
            status["gpu_memory"] = f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
    except:
        pass

    try:
        import unsloth
        status["unsloth_installed"] = True
    except:
        pass

    try:
        import vllm
        status["vllm_installed"] = True
    except:
        pass

    try:
        import trl
        status["trl_installed"] = True
    except:
        pass

    return status


def format_status_markdown(status):
    """Format status as markdown."""
    lines = [
        "## Environment Status\n",
        f"- **CUDA**: {'Available' if status['cuda_available'] else 'Not available'}",
        f"- **GPU**: {status['gpu_name']} ({status['gpu_memory']})",
        f"- **Unsloth**: {'Installed' if status['unsloth_installed'] else 'Not installed'}",
        f"- **vLLM**: {'Installed' if status['vllm_installed'] else 'Not installed'}",
        f"- **TRL**: {'Installed' if status['trl_installed'] else 'Not installed'}",
        f"- **ANTHROPIC_API_KEY**: {'Set' if status['anthropic_key'] else 'Not set'}",
        f"- **HF_TOKEN**: {'Set' if status['hf_token'] else 'Not set'}",
    ]
    return "\n".join(lines)


def get_model_list(model_type):
    """Get flat list of models for given type."""
    if model_type == "text_llm":
        models = []
        for family, family_models in UNSLOTH_MODELS["text_llm"].items():
            models.extend(family_models)
        return models
    elif model_type == "vision_vlm":
        models = []
        for family, family_models in UNSLOTH_MODELS["vision_vlm"].items():
            models.extend(family_models)
        return models
    elif model_type == "embedding":
        return UNSLOTH_MODELS["embedding"]
    elif model_type == "multimodal":
        return UNSLOTH_MODELS["multimodal_omni"]
    return []


def start_training(
    model_name,
    model_type,
    training_mode,
    rl_method,
    sample_preset,
    reward_pack,
    custom_reward_code,
    lora_rank,
    learning_rate,
    num_generations,
    temperature,
    max_seq_length,
    hub_model_id,
    push_to_hub,
):
    """Start training with selected configuration."""

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = f"/app/runs/{training_mode}_{timestamp}"

    config = {
        "model_name": model_name,
        "model_type": model_type,
        "training_mode": training_mode,
        "rl_method": rl_method if training_mode == "rl" else None,
        "sample_preset": sample_preset,
        "reward_pack": reward_pack if training_mode == "rl" else None,
        "lora_rank": lora_rank,
        "learning_rate": learning_rate,
        "num_generations": num_generations if training_mode == "rl" else None,
        "temperature": temperature,
        "max_seq_length": max_seq_length,
        "hub_model_id": hub_model_id,
        "push_to_hub": push_to_hub,
        "run_dir": run_dir,
        "timestamp": timestamp,
    }

    # Generate training script
    if training_mode == "sft":
        script = generate_sft_script(config)
    else:
        script = generate_rl_script(config)

    return f"""
## Training Configuration Saved

**Run Directory**: `{run_dir}`
**Timestamp**: {timestamp}

### Configuration:
```json
{json.dumps(config, indent=2)}
```

### Generated Training Script:
```python
{script[:2000]}...
```

**Status**: Ready to execute. Click 'Execute Training' to start.
"""


def generate_sft_script(config):
    """Generate SFT training script."""
    preset = SAMPLE_PRESETS[config["sample_preset"]]

    return f'''
# Unsloth SFT Training Script
# Generated: {config["timestamp"]}

from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

max_seq_length = {config["max_seq_length"]}
lora_rank = {config["lora_rank"]}

# Load model with Unsloth optimizations
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="{config["model_name"]}",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Load and prepare dataset
dataset = load_dataset("your_dataset", split="train")

# Configure trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        max_steps={preset["max_steps"]},
        learning_rate={config["learning_rate"]},
        optim="adamw_8bit",
        packing=True,
        max_length=max_seq_length,
        output_dir="{config["run_dir"]}",
        report_to="none",
    ),
)

# Train
trainer.train()

# Save
model.save_pretrained_merged("{config["run_dir"]}/merged", tokenizer, save_method="merged_16bit")
'''


def generate_rl_script(config):
    """Generate RL training script."""
    preset = SAMPLE_PRESETS[config["sample_preset"]]
    rl_config = RL_METHODS[config["rl_method"]]["config"]

    return f'''
# Unsloth RL Training Script ({config["rl_method"].upper()})
# Generated: {config["timestamp"]}

from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)  # CRITICAL: Must be BEFORE trl import

from trl import GRPOConfig, GRPOTrainer
from datasets import load_dataset

max_seq_length = {config["max_seq_length"]}
lora_rank = {config["lora_rank"]}

# Load model with Unsloth optimizations + vLLM fast inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="{config["model_name"]}",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Reward functions from pack: {config["reward_pack"]}
def xmlcount_reward_func(completions, **kwargs):
    def count_xml(text):
        count = 0.0
        if text.count("<reasoning>\\n") == 1: count += 0.125
        if text.count("\\n</reasoning>\\n") == 1: count += 0.125
        if text.count("\\n<answer>\\n") == 1: count += 0.125
        if text.count("\\n</answer>") == 1: count += 0.125
        return count
    return [count_xml(c[0]["content"]) for c in completions]

def correctness_reward_func(prompts, completions, answer, **kwargs):
    def extract_answer(text):
        if "<answer>" in text and "</answer>" in text:
            return text.split("<answer>")[-1].split("</answer>")[0].strip()
        return text.strip()
    responses = [c[0]["content"] for c in completions]
    extracted = [extract_answer(r) for r in responses]
    return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)]

# Load dataset
dataset = load_dataset("openai/gsm8k", "main", split="train")

# Configure GRPO trainer
training_args = GRPOConfig(
    output_dir="{config["run_dir"]}",
    learning_rate={config["learning_rate"]},
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations={config["num_generations"]},
    max_prompt_length=256,
    max_completion_length={config["max_seq_length"]} - 256,
    max_steps={preset["max_steps"]},
    temperature={config["temperature"]},
    loss_type="{rl_config.get("loss_type", "grpo")}",
    importance_sampling_level="{rl_config.get("importance_sampling_level", "token")}",
    optim="adamw_8bit",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    max_grad_norm=0.1,
    report_to="none",
)

# Initialize trainer
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[xmlcount_reward_func, correctness_reward_func],
    args=training_args,
    train_dataset=dataset,
)

# Train
trainer.train()

# Save
model.save_pretrained("{config["run_dir"]}/lora")
tokenizer.save_pretrained("{config["run_dir"]}/lora")
'''


# ============================================================================
# GRADIO UI
# ============================================================================

def create_ui():
    """Create Gradio interface."""

    with gr.Blocks(title="Unsloth Training Hub", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Unsloth Training Hub")
        gr.Markdown("Comprehensive LLM Fine-tuning & RL Platform")

        # Status
        status = get_environment_status()
        gr.Markdown(format_status_markdown(status))

        with gr.Tabs():
            # Tab 1: Model Selection
            with gr.Tab("1. Model Selection"):
                model_type = gr.Radio(
                    choices=["text_llm", "vision_vlm", "embedding", "multimodal"],
                    value="text_llm",
                    label="Model Type",
                )

                model_dropdown = gr.Dropdown(
                    choices=get_model_list("text_llm"),
                    value="unsloth/Qwen2.5-7B-Instruct",
                    label="Select Model",
                    filterable=True,
                )

                def update_models(model_type):
                    models = get_model_list(model_type)
                    return gr.Dropdown(choices=models, value=models[0] if models else None)

                model_type.change(update_models, model_type, model_dropdown)

            # Tab 2: Training Mode
            with gr.Tab("2. Training Mode"):
                training_mode = gr.Radio(
                    choices=["sft", "rl"],
                    value="sft",
                    label="Training Mode",
                    info="SFT: Supervised Fine-Tuning | RL: Reinforcement Learning"
                )

                with gr.Group(visible=False) as rl_options:
                    rl_method = gr.Dropdown(
                        choices=list(RL_METHODS.keys()),
                        value="grpo",
                        label="RL Method",
                    )

                    rl_info = gr.Markdown(RL_METHODS["grpo"]["description"])

                    def update_rl_info(method):
                        return RL_METHODS[method]["description"]

                    rl_method.change(update_rl_info, rl_method, rl_info)

                    reward_pack = gr.Dropdown(
                        choices=list(REWARD_PACKS.keys()),
                        value="reasoning_xml",
                        label="Reward Pack",
                    )

                    custom_reward = gr.Code(
                        label="Custom Reward Function (Optional)",
                        language="python",
                        value="# def custom_reward(completions, **kwargs):\n#     return [1.0 for _ in completions]",
                    )

                    num_generations = gr.Slider(
                        minimum=2, maximum=16, value=8, step=2,
                        label="Generations per Prompt",
                    )

                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=1.0, step=0.1,
                        label="Generation Temperature",
                    )

                def toggle_rl_options(mode):
                    return gr.Group(visible=(mode == "rl"))

                training_mode.change(toggle_rl_options, training_mode, rl_options)

            # Tab 3: Training Config
            with gr.Tab("3. Training Config"):
                sample_preset = gr.Radio(
                    choices=list(SAMPLE_PRESETS.keys()),
                    value="small_run",
                    label="Sample Size Preset",
                )

                preset_info = gr.Markdown(
                    f"**{SAMPLE_PRESETS['small_run']['description']}** - "
                    f"{SAMPLE_PRESETS['small_run']['samples']} samples, "
                    f"{SAMPLE_PRESETS['small_run']['max_steps']} steps"
                )

                def update_preset_info(preset):
                    p = SAMPLE_PRESETS[preset]
                    return f"**{p['description']}** - {p['samples']} samples, {p['max_steps']} steps"

                sample_preset.change(update_preset_info, sample_preset, preset_info)

                with gr.Row():
                    lora_rank = gr.Dropdown(
                        choices=[8, 16, 32, 64, 128],
                        value=32,
                        label="LoRA Rank",
                    )

                    learning_rate = gr.Number(
                        value=5e-6,
                        label="Learning Rate",
                    )

                    max_seq_length = gr.Dropdown(
                        choices=[512, 1024, 2048, 4096, 8192, 16384, 32768],
                        value=2048,
                        label="Max Sequence Length",
                    )

            # Tab 4: Output & Hub
            with gr.Tab("4. Output & Hub"):
                hub_model_id = gr.Textbox(
                    value="wheattoast11/unsloth-trained-model",
                    label="HuggingFace Hub Model ID",
                )

                push_to_hub = gr.Checkbox(
                    value=True,
                    label="Push to HuggingFace Hub after training",
                )

                output_format = gr.CheckboxGroup(
                    choices=["merged_16bit", "merged_4bit", "lora", "gguf_q4_k_m", "gguf_q8_0"],
                    value=["merged_16bit", "lora"],
                    label="Output Formats",
                )

        # Start Training
        gr.Markdown("---")

        with gr.Row():
            start_btn = gr.Button("Generate Training Config", variant="primary", scale=2)
            execute_btn = gr.Button("Execute Training", variant="secondary", scale=1)

        output = gr.Markdown("Configure your training and click 'Generate Training Config'")

        start_btn.click(
            start_training,
            inputs=[
                model_dropdown,
                model_type,
                training_mode,
                rl_method,
                sample_preset,
                reward_pack,
                custom_reward,
                lora_rank,
                learning_rate,
                num_generations,
                temperature,
                max_seq_length,
                hub_model_id,
                push_to_hub,
            ],
            outputs=output,
        )

        gr.Markdown("---")
        gr.Markdown("**Intuition Labs** | Unsloth Training Hub | L40S ~$1.80/hr - PAUSE when not training!")

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860)