spam-classifier-liquid / fine_tune.py
VoltageVagabond's picture
Upload folder using huggingface_hub
8ab16ab verified
Raw
History Blame
16.1 kB
"""
fine_tune.py — Fine-tune Liquid AI's LFM2.5-1.2B model for spam classification using LoRA.
This script uses Liquid AI's officially recommended fine-tuning approach:
- TRL's SFTTrainer for supervised fine-tuning
- PEFT's LoRA for memory-efficient adapter training
It does three things:
1. Trains LoRA adapters on our spam/ham training data
2. Evaluates the trained model on the test set
3. Runs a quick generation test to verify the adapter works
Prerequisites:
- Training data in ../shared_training_data/train.jsonl (shared with spam-classifier-mlx)
- Internet connection to download the model (first run only)
Usage:
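python3 fine_tune.py              (train, evaluate, then run a generation test)
python3 fine_tune.py --eval-only  (skip training and load the adapter saved in adapters/)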
Hardware: Requires Apple Silicon Mac (M1/M2/M3/M4). Targets ~8-12 GB unified memory.
"""
import json
import os
import sys
import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Hugging Face Hub ID of the base model; it is downloaded on first use by from_pretrained().
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
# Relative directory passed as cache_dir when loading the model and tokenizer.
MODEL_CACHE_DIR = "models"
# Relative path to the training data shared with spam-classifier-mlx.
# Being relative, it is resolved against the directory the script is run from.
DATA_DIR = "../shared_training_data"
TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
TEST_FILE = os.path.join(DATA_DIR, "test.jsonl")
# Where the final LoRA adapter and tokenizer are saved. Also used as the trainer's
# output_dir, so per-epoch checkpoints are written here as well.
ADAPTER_DIR = "adapters"
# LoRA hyperparameters — from Liquid AI's official cookbook
# (Source: https://github.com/Liquid4All/cookbook/blob/main/finetuning/notebooks/sft_with_trl.ipynb)
LORA_RANK = 8 # Rank r of the low-rank update matrices (LoraConfig r)
LORA_ALPHA = 16 # LoRA scaling factor (LoraConfig lora_alpha); set to 2x the rank
LORA_DROPOUT = 0.1 # Dropout applied inside the LoRA layers to reduce overfitting
LORA_TARGET_MODULES = [ # Names of the modules inside the model that get LoRA adapters
# Attention layers (Multi-Head Attention)
"q_proj", # Query projection
"k_proj", # Key projection
"v_proj", # Value projection
"out_proj", # Output projection (Liquid AI uses out_proj, not o_proj)
# Feed-forward layers (GLU — Gated Linear Unit)
"w1", # First feed-forward weight
"w2", # Second feed-forward weight
"w3", # Gate weight
# Conv layers (unique to Liquid AI's architecture)
"in_proj", # Input projection (conv block)
]
# Training hyperparameters (all passed to SFTConfig in run_training)
NUM_EPOCHS = 3 # Number of full passes over the training data
BATCH_SIZE = 1 # Per-device batch size; kept at 1 to minimise memory per step
GRADIENT_ACCUMULATION_STEPS = 4 # Effective batch size = BATCH_SIZE * this value = 4
LEARNING_RATE = 2e-4 # Optimizer learning rate (Liquid AI's SFT recommendation)
MAX_LENGTH = 256 # Max tokens per example (SFTConfig max_length); author's note: 256 covers most spam emails
LOGGING_STEPS = 10 # Log the training loss every 10 optimizer steps
# ---------------------------------------------------------------------------
# 1. Check prerequisites
# ---------------------------------------------------------------------------
def check_prerequisites():
    """Verify the inputs and hardware this run depends on.

    Terminates the process with exit status 1 when the training file is
    missing. A missing test file or an unavailable MPS backend only
    produces a warning.
    """
    rule = "=" * 60
    print(rule)
    print("STEP 1: Checking prerequisites")
    print(rule)

    # The training file is mandatory — bail out early when it is absent.
    have_train = os.path.isfile(TRAIN_FILE)
    if not have_train:
        print(f" ERROR: Training data not found: {TRAIN_FILE}")
        print(" Training data lives in the shared folder:")
        print(f" {os.path.abspath(DATA_DIR)}")
        sys.exit(1)
    print(f" Training data: {TRAIN_FILE} ... OK")

    # The test file is optional; without it the evaluation step is skipped.
    if os.path.isfile(TEST_FILE):
        print(f" Test data: {TEST_FILE} ... OK")
    else:
        print(f" WARNING: Test data not found: {TEST_FILE}")
        print(" Evaluation step will be skipped.")

    # Report whether PyTorch's Metal (MPS) backend can be used.
    mps_ready = torch.backends.mps.is_available()
    if not mps_ready:
        print(" WARNING: MPS not available. Training will use CPU (much slower).")
    else:
        print(" Apple Silicon: MPS backend available ... OK")
    print()
# ---------------------------------------------------------------------------
# 2. Load training data
# ---------------------------------------------------------------------------
def load_jsonl_as_dataset(file_path):
    """Load a JSONL file into a HuggingFace Dataset.

    Each non-blank line in the JSONL file looks like:
    {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}
    The parsed objects are handed to Dataset.from_list() unchanged.

    Args:
        file_path: Path to the JSONL file to read.

    Returns:
        A Dataset with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    examples = []
    # Read as UTF-8 explicitly: email text is frequently non-ASCII and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type, adding where it happened.
                raise json.JSONDecodeError(
                    f"{err.msg} (in {file_path}, line {line_number})",
                    err.doc,
                    err.pos,
                ) from err
    return Dataset.from_list(examples)
# ---------------------------------------------------------------------------
# 3. Run LoRA fine-tuning
# ---------------------------------------------------------------------------
def run_training():
    """
    Fine-tune the Liquid AI model using TRL's SFTTrainer with LoRA.

    Loads the base model and tokenizer, builds a LoRA configuration from the
    module-level LORA_* constants, trains on TRAIN_FILE, and saves the
    adapter and tokenizer to ADAPTER_DIR.

    According to the TRL documentation linked below, SFTTrainer handles:
    - Applying the chat template to format messages correctly
    - Tokenizing the text into numbers the model understands
    - Running the training loop (forward pass, loss, backward pass, optimizer)
    NOTE(review): whether the loss is restricted to the assistant's response
    depends on SFTConfig's loss-masking options, none of which are set here —
    confirm against the installed TRL version.
    Source: https://docs.liquid.ai/customization/finetuning-frameworks/trl

    Returns:
        A (model, tokenizer) tuple: the model object that was passed to the
        trainer, and the tokenizer used for training.
    """
    print("=" * 60)
    print("STEP 2: LoRA fine-tuning with TRL SFTTrainer")
    print("=" * 60)

    # --- Load the base model ---
    print(f" Loading model: {MODEL_ID}")
    print(" (First run downloads ~2.4 GB from HuggingFace — be patient)")
    print()

    # Choose device and precision together so they cannot disagree.
    # With MPS available: run on the GPU in bfloat16 (halves weight memory).
    # Without MPS: fall back to CPU in float32, matching the "Training will
    # use CPU" warning printed by check_prerequisites().
    use_mps = torch.backends.mps.is_available()
    device = "mps" if use_mps else "cpu"
    use_bf16 = use_mps
    dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,  # Save to local models/ directory
        device_map=device,  # Pin everything to one device (avoids CPU spill on MPS)
        torch_dtype=dtype,  # Load directly in the training precision
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token (needed for batched training)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(" Model loaded successfully!")
    print()

    # --- Configure LoRA ---
    # LoRA adds small trainable adapter layers to the modules named in
    # LORA_TARGET_MODULES. Only these adapters are trained — the original
    # model weights stay frozen.
    print(" Configuring LoRA adapters...")
    print(f" Rank: {LORA_RANK}")
    print(f" Alpha: {LORA_ALPHA}")
    print(f" Target modules: {LORA_TARGET_MODULES}")
    print(f" Dropout: {LORA_DROPOUT}")
    peft_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        task_type="CAUSAL_LM",
    )

    # --- Load training data ---
    print()
    print(f" Loading training data from {TRAIN_FILE}...")
    train_dataset = load_jsonl_as_dataset(TRAIN_FILE)
    print(f" Loaded {len(train_dataset)} training examples")

    # --- Configure the trainer ---
    # SFTConfig holds all the training settings.
    # SFTTrainer is the engine that runs the actual training loop.
    print()
    print(" Training configuration:")
    print(f" Epochs: {NUM_EPOCHS}")
    print(f" Batch size: {BATCH_SIZE}")
    print(f" Grad accumulation: {GRADIENT_ACCUMULATION_STEPS} (effective batch = {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
    print(f" Learning rate: {LEARNING_RATE}")
    print(f" Max sequence length: {MAX_LENGTH}")
    print(f" Logging every: {LOGGING_STEPS} steps")
    print()
    training_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        max_length=MAX_LENGTH,
        logging_steps=LOGGING_STEPS,
        save_strategy="epoch",  # Save a checkpoint after each epoch
        bf16=use_bf16,  # Same precision decision as the model load above
        gradient_checkpointing=True,  # Trade compute for memory: recompute activations in backward
        gradient_checkpointing_kwargs={"use_reentrant": False},  # More stable on MPS
        optim="adamw_torch",  # Standard PyTorch AdamW (bitsandbytes not supported on MPS)
        max_grad_norm=0.3,  # Clip gradients to prevent instability
        report_to="none",  # Don't log to wandb or other services
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
        peft_config=peft_config,
    )

    # --- Run training ---
    print(" Training started — watch the loss decrease over time!")
    print(" (This will take ~1-1.5 hours on Apple Silicon for 3 epochs)")
    print()
    trainer.train()

    # --- Save the trained adapter ---
    trainer.save_model(ADAPTER_DIR)
    tokenizer.save_pretrained(ADAPTER_DIR)
    print()
    print(f" Training complete! Adapter saved to: {ADAPTER_DIR}/")
    print()
    return model, tokenizer
# ---------------------------------------------------------------------------
# 4. Evaluate on test set
# ---------------------------------------------------------------------------
def run_evaluation(model, tokenizer):
    """
    Evaluate the trained model on the test set.

    Computes the average loss (and perplexity = exp(loss)) on held-out test
    data. Lower perplexity = the model is better at predicting the test data.
    Returns early, printing a notice, when TEST_FILE does not exist.

    Args:
        model: The model to evaluate (as returned by run_training() or
            load_saved_adapter()).
        tokenizer: The tokenizer matching the model.

    Returns:
        None. Results are printed to stdout.
    """
    import math

    print("=" * 60)
    print("STEP 3: Evaluating on test set")
    print("=" * 60)
    if not os.path.isfile(TEST_FILE):
        print(" Skipping — no test data found.")
        print()
        return
    test_dataset = load_jsonl_as_dataset(TEST_FILE)
    print(f" Loaded {len(test_dataset)} test examples")

    # Use the same config but for evaluation only
    eval_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        per_device_eval_batch_size=1,
        max_length=MAX_LENGTH,
        bf16=torch.backends.mps.is_available(),
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        args=eval_args,
        train_dataset=test_dataset,  # Required by SFTTrainer, not used for eval
        eval_dataset=test_dataset,
        processing_class=tokenizer,
    )
    metrics = trainer.evaluate()

    # Fetch the loss once and branch on it. Applying a float format spec to
    # the "N/A" placeholder would raise ValueError when the metric is missing.
    eval_loss = metrics.get("eval_loss")
    if eval_loss is None:
        print(" Test loss: N/A")
    else:
        print(f" Test loss: {eval_loss:.4f}")
        perplexity = math.exp(eval_loss)
        print(f" Test perplexity: {perplexity:.2f}")
    print()
# ---------------------------------------------------------------------------
# 5. Quick generation test
# ---------------------------------------------------------------------------
def run_generation_test(model, tokenizer):
    """
    Classify a sample spam email to verify the adapter actually works.

    Uses the model and tokenizer passed in (nothing is loaded here), builds
    a chat prompt for a clearly spammy test email, generates a response,
    and prints it.

    Args:
        model: The fine-tuned model; must provide generate() and a device
            attribute.
        tokenizer: The tokenizer matching the model; must provide
            apply_chat_template() and decode().

    Returns:
        None. The response is printed to stdout.
    """
    print("=" * 60)
    print("STEP 4: Quick generation test")
    print("=" * 60)
    print(" Classifying a sample spam email to verify the adapter works...")
    print()

    # A clearly spammy test email
    test_email = (
        "Congratulations!!! You have been selected as the WINNER of our "
        "$1,000,000 prize draw! Click here NOW to claim your reward. "
        "This offer expires in 24 hours. Act fast! Send your bank details "
        "to claim your prize immediately."
    )

    # Build the chat messages — same format used during training
    messages = [
        {
            "role": "system",
            "content": (
                "You are an email spam classifier. Analyze the email and classify it "
                "as SPAM or HAM. Explain your reasoning."
            ),
        },
        {
            "role": "user",
            "content": (
                "Classify this email as SPAM or HAM. Give your classification on the "
                "first line, then explain your reasoning in 2-3 sentences. Be specific "
                "about what words, patterns, or signals you noticed.\n\n"
                f"Email:\n{test_email}"
            ),
        },
    ]

    # Apply the chat template — converts messages into the format the model expects
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Switch to inference mode. If evaluation was skipped right after
    # training, the model is still in training mode and LoRA dropout would
    # randomly perturb the generated output.
    model.eval()

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
        )

    # Decode only the NEW tokens (skip the input prompt)
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(f" Test email: {test_email[:80]}...")
    print()
    print(" Model response:")
    print(" " + "-" * 40)
    print(f" {response}")
    print(" " + "-" * 40)
    print()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def load_saved_adapter():
    """Load the base model with the previously trained LoRA adapter.

    Exits the process with status 1 when ADAPTER_DIR does not contain a
    saved adapter.

    Returns:
        A (model, tokenizer) tuple: the base model wrapped with the adapter
        from ADAPTER_DIR, and the base model's tokenizer.
    """
    from peft import PeftModel

    print("=" * 60)
    print("STEP 2: Loading saved adapter (skipping training)")
    print("=" * 60)

    # Check for the adapter's config file, not just the directory. The
    # directory is also the trainer's output_dir, so an interrupted run can
    # leave it holding only per-epoch checkpoints. Failing here avoids
    # loading the base model just to hit an error afterwards.
    adapter_config = os.path.join(ADAPTER_DIR, "adapter_config.json")
    if not os.path.isfile(adapter_config):
        print(f" ERROR: No saved adapter found at {ADAPTER_DIR}/")
        print(" Run without --eval-only first to train the model.")
        sys.exit(1)

    # Use MPS in bfloat16 when available; otherwise fall back to CPU in
    # float32, matching the CPU warning printed by check_prerequisites().
    use_mps = torch.backends.mps.is_available()
    device = "mps" if use_mps else "cpu"
    dtype = torch.bfloat16 if use_mps else torch.float32

    print(f" Loading base model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,
        device_map=device,
        torch_dtype=dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token, as run_training() does.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f" Loading LoRA adapter from {ADAPTER_DIR}/")
    model = PeftModel.from_pretrained(model, ADAPTER_DIR)
    print(" Adapter loaded successfully!")
    print()
    return model, tokenizer
def main():
    """Run the full pipeline from the command line.

    Steps: check prerequisites, then either train (default) or load the
    saved adapter (when ``--eval-only`` is present in sys.argv), evaluate on
    the test set, and classify one sample email.
    """
    eval_only = "--eval-only" in sys.argv

    # Opening banner; the mode line appears only for evaluation-only runs.
    star_rule = "*" * 60
    header_lines = [
        " Spam Classifier — Liquid AI LoRA Fine-Tuning",
        " Model: LFM2.5-1.2B-Instruct",
        " Method: TRL SFTTrainer + PEFT LoRA",
    ]
    if eval_only:
        header_lines.append(" Mode: Evaluation only (skipping training)")
    print()
    print(star_rule)
    for header_line in header_lines:
        print(header_line)
    print(star_rule)
    print()

    # Step 1: Check that everything is in place
    check_prerequisites()

    # Step 2: Train or load saved adapter
    obtain_model = load_saved_adapter if eval_only else run_training
    model, tokenizer = obtain_model()

    # Step 3: Evaluate on test set
    run_evaluation(model, tokenizer)

    # Step 4: Quick sanity check — classify a sample email
    run_generation_test(model, tokenizer)

    # Closing summary
    closing_rule = "=" * 60
    print(closing_rule)
    print(" All done!")
    if not eval_only:
        print(f" Adapter weights saved to: {ADAPTER_DIR}/")
    print()
    print(" Next step: python3 app.py")
    print(closing_rule)
    print()


if __name__ == "__main__":
    main()