# Source: Hugging Face Space "VoltageVagabond/spam-classifier-liquid"
# (Space status when captured: Sleeping; file size: 16,131 bytes; ref: 8ab16ab)

"""
fine_tune.py — Fine-tune Liquid AI's LFM2.5-1.2B model for spam classification using LoRA.

This script uses Liquid AI's officially recommended fine-tuning approach:
  - TRL's SFTTrainer for supervised fine-tuning
  - PEFT's LoRA for memory-efficient adapter training

It does three things:
  1. Trains LoRA adapters on our spam/ham training data
  2. Evaluates the trained model on the test set
  3. Runs a quick generation test to verify the adapter works

Prerequisites:
  - Training data in ../shared_training_data/train.jsonl (shared with the spam-classifier-mlx project)
  - Internet connection to download the model (first run only)

Usage:
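    python3 fine_tune.py              # train, evaluate, then run the generation test
    python3 fine_tune.py --eval-only  # skip training and load the saved adapter instead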

Hardware: Requires Apple Silicon Mac (M1/M2/M3/M4). Targets ~8-12 GB unified memory.
"""

import json
import os
import sys

import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Hugging Face model ID of the Liquid AI base model. It is passed to
# from_pretrained(), which downloads it on first use.
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"

# Local directory passed as cache_dir when loading the model and tokenizer
MODEL_CACHE_DIR = "models"

# Path to the shared training data (shared with spam-classifier-mlx).
# NOTE: this is a relative path, so it is resolved against the current working
# directory — run the script from the directory that contains it.
DATA_DIR = "../shared_training_data"
TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
TEST_FILE = os.path.join(DATA_DIR, "test.jsonl")

# Where the trained LoRA adapter weights (and training checkpoints) are saved
ADAPTER_DIR = "adapters"

# LoRA hyperparameters — from Liquid AI's official cookbook
# (Source: https://github.com/Liquid4All/cookbook/blob/main/finetuning/notebooks/sft_with_trl.ipynb)
LORA_RANK = 8           # Rank of the low-rank adapter matrices (LoraConfig r)
LORA_ALPHA = 16         # Scaling factor (2x the rank)
LORA_DROPOUT = 0.1      # Dropout applied in the LoRA layers to limit overfitting
LORA_TARGET_MODULES = [  # Names of the modules inside the model that get LoRA adapters
    # Attention layers (Multi-Head Attention)
    "q_proj",            # Query projection
    "k_proj",            # Key projection
    "v_proj",            # Value projection
    "out_proj",          # Output projection (Liquid AI uses out_proj, not o_proj)
    # Feed-forward layers (GLU — Gated Linear Unit)
    "w1",                # First feed-forward weight
    "w2",                # Second feed-forward weight
    "w3",                # Gate weight
    # Conv layers (unique to Liquid AI's architecture)
    "in_proj",           # Input projection (conv block)
]

# Training hyperparameters
NUM_EPOCHS = 3                    # Number of full passes over the training data
BATCH_SIZE = 1                    # Per-device batch size; smaller = less memory per step
GRADIENT_ACCUMULATION_STEPS = 4   # Accumulate gradients over 4 steps (effective batch = BATCH_SIZE * 4 = 4)
LEARNING_RATE = 2e-4              # Optimizer learning rate (Liquid AI's SFT recommendation)
MAX_LENGTH = 256                  # Max sequence length given to SFTConfig; author's estimate is that 256 tokens covers most emails
LOGGING_STEPS = 10                # Print training loss every 10 steps


# ---------------------------------------------------------------------------
# 1. Check prerequisites
# ---------------------------------------------------------------------------

def check_prerequisites():
    """Verify the input files and compute backend before any heavy work starts.

    - Missing training file: prints an error and exits with status 1.
    - Missing test file: prints a warning only (evaluation is skipped later).
    - Missing MPS backend: prints a warning only.
    """
    divider = "=" * 60
    print(divider)
    print("STEP 1: Checking prerequisites")
    print(divider)

    # The training file is mandatory — bail out immediately without it.
    if os.path.isfile(TRAIN_FILE):
        print(f"  Training data: {TRAIN_FILE} ... OK")
    else:
        print(f"  ERROR: Training data not found: {TRAIN_FILE}")
        print("  Training data lives in the shared folder:")
        print(f"    {os.path.abspath(DATA_DIR)}")
        sys.exit(1)

    # The test file is optional.
    if os.path.isfile(TEST_FILE):
        print(f"  Test data:     {TEST_FILE} ... OK")
    else:
        print(f"  WARNING: Test data not found: {TEST_FILE}")
        print("  Evaluation step will be skipped.")

    # Report whether the Apple Silicon GPU backend can be used.
    backend_status = (
        "  Apple Silicon:  MPS backend available ... OK"
        if torch.backends.mps.is_available()
        else "  WARNING: MPS not available. Training will use CPU (much slower)."
    )
    print(backend_status)

    print()


# ---------------------------------------------------------------------------
# 2. Load training data
# ---------------------------------------------------------------------------

def load_jsonl_as_dataset(file_path):
    """Load a JSONL file into a HuggingFace Dataset.

    Each non-blank line in the JSONL file looks like:
    {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}

    SFTTrainer expects a Dataset with a "messages" column in this exact format.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A Dataset with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding. Blank lines (e.g. a trailing newline at the end of the
    # file) are skipped instead of crashing json.loads("").
    with open(file_path, "r", encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]
    return Dataset.from_list(examples)


# ---------------------------------------------------------------------------
# 3. Run LoRA fine-tuning
# ---------------------------------------------------------------------------

def run_training():
    """
    Fine-tune the Liquid AI model using TRL's SFTTrainer with LoRA.

    Steps performed here:
      1. Load the base model and tokenizer (MODEL_ID, cached in MODEL_CACHE_DIR)
      2. Build the LoRA configuration from the LORA_* constants
      3. Load the training set from TRAIN_FILE
      4. Train with SFTTrainer
      5. Save the adapter and the tokenizer to ADAPTER_DIR

    This is Liquid AI's officially recommended fine-tuning approach.
    SFTTrainer handles:
      - Applying the chat template to format messages correctly
      - Tokenizing the text into numbers the model understands
      - Running the training loop (forward pass, loss, backward pass, optimizer)

    NOTE(review): no assistant-only / completion-only loss option is set in the
    SFTConfig below, so whether prompt tokens are excluded from the loss depends
    on the installed TRL version's defaults — verify before relying on it.

    Source: https://docs.liquid.ai/customization/finetuning-frameworks/trl

    Returns:
        (model, tokenizer): the model object that was handed to SFTTrainer and
        the tokenizer. NOTE(review): this returns `model`, not `trainer.model`;
        it presumably relies on PEFT injecting the LoRA layers into that object
        in place — confirm.
    """
    print("=" * 60)
    print("STEP 2: LoRA fine-tuning with TRL SFTTrainer")
    print("=" * 60)

    # --- Load the base model ---
    print(f"  Loading model: {MODEL_ID}")
    print("  (First run downloads ~2.4 GB from HuggingFace — be patient)")
    print()

    # Pick the device and precision. On Apple Silicon we use the MPS GPU in
    # bfloat16. Without MPS we fall back to CPU in float32, which is what
    # check_prerequisites() tells the user will happen (previously the device
    # was hard-coded to "mps" and loading crashed on machines without it).
    # NOTE: bf16 is enabled whenever MPS is available; there is no separate
    # probe for bf16 support, so older macOS versions may still fail here.
    mps_available = torch.backends.mps.is_available()
    device = "mps" if mps_available else "cpu"
    use_bf16 = mps_available
    model_dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,       # Save to local models/ directory
        device_map=device,               # Pin everything to one device (avoids CPU spill on MPS)
        torch_dtype=model_dtype,         # bfloat16 on MPS halves memory vs float32
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token (needed for batched training)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("  Model loaded successfully!")
    print()

    # --- Configure LoRA ---
    # LoRA adds small trainable adapter layers to the modules listed in
    # LORA_TARGET_MODULES (attention, feed-forward and conv projections).
    # Only these adapters are trained — the original model weights stay frozen.
    print("  Configuring LoRA adapters...")
    print(f"    Rank:           {LORA_RANK}")
    print(f"    Alpha:          {LORA_ALPHA}")
    print(f"    Target modules: {LORA_TARGET_MODULES}")
    print(f"    Dropout:        {LORA_DROPOUT}")

    peft_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        task_type="CAUSAL_LM",
    )

    # --- Load training data ---
    print()
    print(f"  Loading training data from {TRAIN_FILE}...")
    train_dataset = load_jsonl_as_dataset(TRAIN_FILE)
    print(f"  Loaded {len(train_dataset)} training examples")

    # --- Configure the trainer ---
    # SFTConfig holds all the training settings.
    # SFTTrainer is the engine that runs the actual training loop.
    print()
    print("  Training configuration:")
    print(f"    Epochs:              {NUM_EPOCHS}")
    print(f"    Batch size:          {BATCH_SIZE}")
    print(f"    Grad accumulation:   {GRADIENT_ACCUMULATION_STEPS} (effective batch = {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
    print(f"    Learning rate:       {LEARNING_RATE}")
    print(f"    Max sequence length: {MAX_LENGTH}")
    print(f"    Logging every:       {LOGGING_STEPS} steps")
    print()

    training_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        max_length=MAX_LENGTH,
        logging_steps=LOGGING_STEPS,
        save_strategy="epoch",           # Save a checkpoint after each epoch
        bf16=use_bf16,                   # bfloat16 training only when running on MPS
        gradient_checkpointing=True,     # Save memory by recomputing activations in the backward pass
        gradient_checkpointing_kwargs={"use_reentrant": False},  # More stable on MPS
        optim="adamw_torch",             # Standard PyTorch AdamW (bitsandbytes not supported on MPS)
        max_grad_norm=0.3,               # Clip gradients to prevent instability
        report_to="none",               # Don't log to wandb or other services
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
        peft_config=peft_config,
    )

    # --- Run training ---
    print("  Training started — watch the loss decrease over time!")
    print("  (This will take ~1-1.5 hours on Apple Silicon for 3 epochs)")
    print()

    trainer.train()

    # --- Save the trained adapter ---
    trainer.save_model(ADAPTER_DIR)
    tokenizer.save_pretrained(ADAPTER_DIR)

    print()
    print(f"  Training complete! Adapter saved to: {ADAPTER_DIR}/")
    print()

    return model, tokenizer


# ---------------------------------------------------------------------------
# 4. Evaluate on test set
# ---------------------------------------------------------------------------

def run_evaluation(model, tokenizer):
    """
    Evaluate the trained model on the test set.

    Computes the average loss (and perplexity) on held-out test data.
    Lower perplexity = the model is better at predicting the test data.

    Args:
        model: The fine-tuned model to evaluate.
        tokenizer: The tokenizer matching the model.

    Returns:
        None. Results are printed. If TEST_FILE does not exist the step is
        skipped with a message.
    """
    import math

    print("=" * 60)
    print("STEP 3: Evaluating on test set")
    print("=" * 60)

    if not os.path.isfile(TEST_FILE):
        print("  Skipping — no test data found.")
        print()
        return

    test_dataset = load_jsonl_as_dataset(TEST_FILE)
    print(f"  Loaded {len(test_dataset)} test examples")

    # Use the same config but for evaluation only
    eval_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        per_device_eval_batch_size=1,
        max_length=MAX_LENGTH,
        bf16=torch.backends.mps.is_available(),
        report_to="none",
    )

    trainer = SFTTrainer(
        model=model,
        args=eval_args,
        train_dataset=test_dataset,   # Required by SFTTrainer, not used for eval
        eval_dataset=test_dataset,
        processing_class=tokenizer,
    )

    metrics = trainer.evaluate()

    # Fetch the loss once and branch on it. Formatting the "N/A" fallback
    # string with ":.4f" (as the old code did) raises ValueError.
    eval_loss = metrics.get("eval_loss")
    if eval_loss is None:
        print("  Test loss:       N/A")
    else:
        print(f"  Test loss:       {eval_loss:.4f}")
        try:
            perplexity = math.exp(eval_loss)
        except OverflowError:
            # exp() overflows for very large losses; report infinity instead
            # of crashing after a long training run.
            perplexity = float("inf")
        print(f"  Test perplexity: {perplexity:.2f}")

    print()


# ---------------------------------------------------------------------------
# 5. Quick generation test
# ---------------------------------------------------------------------------

def run_generation_test(model, tokenizer):
    """
    Classify a sample spam email to verify the adapter actually works.

    Uses the model and tokenizer passed in (nothing is loaded from disk here),
    generates a classification response for a clearly spammy test email and
    prints it.

    Args:
        model: The fine-tuned model. It is switched to eval mode by this call.
        tokenizer: The tokenizer matching the model; its chat template is used
            to build the prompt.

    Returns:
        None. The response is printed.
    """
    print("=" * 60)
    print("STEP 4: Quick generation test")
    print("=" * 60)
    print("  Classifying a sample spam email to verify the adapter works...")
    print()

    # A clearly spammy test email
    test_email = (
        "Congratulations!!! You have been selected as the WINNER of our "
        "$1,000,000 prize draw! Click here NOW to claim your reward. "
        "This offer expires in 24 hours. Act fast! Send your bank details "
        "to claim your prize immediately."
    )

    # Build the chat messages — same format used during training
    messages = [
        {
            "role": "system",
            "content": (
                "You are an email spam classifier. Analyze the email and classify it "
                "as SPAM or HAM. Explain your reasoning."
            ),
        },
        {
            "role": "user",
            "content": (
                "Classify this email as SPAM or HAM. Give your classification on the "
                "first line, then explain your reasoning in 2-3 sentences. Be specific "
                "about what words, patterns, or signals you noticed.\n\n"
                f"Email:\n{test_email}"
            ),
        },
    ]

    # Apply the chat template — converts messages into the format the model expects
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the model's device.
    # The templated prompt is expected to already contain the special tokens
    # the model needs, so do not let the tokenizer add them a second time
    # (that can produce a duplicated BOS token).
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    # Make sure dropout (including LoRA dropout) is off. When the evaluation
    # step was skipped, the model is still in training mode after train().
    model.eval()

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
        )

    # Decode only the NEW tokens (skip the input prompt)
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(f"  Test email: {test_email[:80]}...")
    print()
    print("  Model response:")
    print("  " + "-" * 40)
    print(f"  {response}")
    print("  " + "-" * 40)
    print()


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def load_saved_adapter():
    """Load the base model with the previously trained LoRA adapter.

    Used by the --eval-only mode in place of training. Exits with status 1 if
    no saved adapter is found in ADAPTER_DIR.

    Returns:
        (model, tokenizer): the base model wrapped with the saved adapter, and
        the base model's tokenizer.
    """
    from peft import PeftModel

    print("=" * 60)
    print("STEP 2: Loading saved adapter (skipping training)")
    print("=" * 60)

    # Look for the adapter's config file rather than just the directory.
    # ADAPTER_DIR is also the trainer's output_dir, so after an interrupted run
    # it can exist while holding only checkpoints and no finished adapter.
    adapter_config_path = os.path.join(ADAPTER_DIR, "adapter_config.json")
    if not os.path.isfile(adapter_config_path):
        print(f"  ERROR: No saved adapter found at {ADAPTER_DIR}/")
        print("  Run without --eval-only first to train the model.")
        sys.exit(1)

    # Use the MPS GPU in bfloat16 when available; otherwise fall back to CPU in
    # float32 instead of crashing on the hard-coded "mps" device.
    mps_available = torch.backends.mps.is_available()
    device = "mps" if mps_available else "cpu"
    model_dtype = torch.bfloat16 if mps_available else torch.float32

    print(f"  Loading base model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,
        device_map=device,
        torch_dtype=model_dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)
    # Same padding-token fallback as used for training
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"  Loading LoRA adapter from {ADAPTER_DIR}/")
    model = PeftModel.from_pretrained(model, ADAPTER_DIR)
    print("  Adapter loaded successfully!")
    print()

    return model, tokenizer


def main():
    """Run the whole pipeline: checks, train (or load), evaluate, sample.

    Passing --eval-only on the command line skips training and loads the
    adapter saved by a previous run instead.
    """
    skip_training = "--eval-only" in sys.argv
    star_rule = "*" * 60
    equals_rule = "=" * 60

    # Opening banner
    banner = [
        "",
        star_rule,
        "  Spam Classifier — Liquid AI LoRA Fine-Tuning",
        "  Model: LFM2.5-1.2B-Instruct",
        "  Method: TRL SFTTrainer + PEFT LoRA",
    ]
    if skip_training:
        banner.append("  Mode: Evaluation only (skipping training)")
    banner.extend([star_rule, ""])
    for text in banner:
        print(text)

    # Step 1: make sure the data files and backend are in place
    check_prerequisites()

    # Step 2: obtain a fine-tuned model, either by training or from disk
    obtain_model = load_saved_adapter if skip_training else run_training
    model, tokenizer = obtain_model()

    # Step 3: loss / perplexity on the held-out test set
    run_evaluation(model, tokenizer)

    # Step 4: sanity check on one sample email
    run_generation_test(model, tokenizer)

    # Closing summary
    summary = [equals_rule, "  All done!"]
    if not skip_training:
        summary.append(f"  Adapter weights saved to: {ADAPTER_DIR}/")
    summary.extend(["", "  Next step: python3 app.py", equals_rule, ""])
    for text in summary:
        print(text)


if __name__ == "__main__":
    main()