Spaces:

VoltageVagabond
/

spam-classifier-liquid

Sleeping

App Files Files

spam-classifier-liquid / fine_tune.py

VoltageVagabond

Upload folder using huggingface_hub

8ab16ab verified 2 months ago

Raw

History Blame

16.1 kB

	"""
	fine_tune.py — Fine-tune Liquid AI's LFM2.5-1.2B model for spam classification using LoRA.

	This script uses Liquid AI's officially recommended fine-tuning approach:
	- TRL's SFTTrainer for supervised fine-tuning
	- PEFT's LoRA for memory-efficient adapter training

	It does three things:
	1. Trains LoRA adapters on our spam/ham training data
	2. Evaluates the trained model on the test set
	3. Runs a quick generation test to verify the adapter works

	Prerequisites:
	- Training data in ../shared_training_data/train.jsonl (shared with spam-classifier-mlx)
	- Internet connection to download the model (first run only)

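	Usage:
	python3 fine_tune.py              # train, evaluate, then run a generation test
	python3 fine_tune.py --eval-only  # skip training and load the saved adapter instead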

	Hardware: Requires Apple Silicon Mac (M1/M2/M3/M4). Targets ~8-12 GB unified memory.
	"""

	import json
	import os
	import sys

	import torch
	from datasets import Dataset
	from peft import LoraConfig
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from trl import SFTConfig, SFTTrainer

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# The base model from Liquid AI (downloaded automatically from HuggingFace)
	MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"

	# Where to cache the downloaded model locally.
	# Relative path: resolved against the current working directory at run time.
	MODEL_CACHE_DIR = "models"

	# Path to the shared training data (shared with spam-classifier-mlx).
	# Relative path: the script must be started from the directory whose parent
	# contains shared_training_data/, otherwise the files will not be found.
	DATA_DIR = "../shared_training_data"
	TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
	TEST_FILE = os.path.join(DATA_DIR, "test.jsonl")

	# Where the trained LoRA adapter weights (and per-epoch checkpoints) are saved.
	# Used both as the trainer's output_dir and as the location --eval-only loads from.
	ADAPTER_DIR = "adapters"

	# LoRA hyperparameters — from Liquid AI's official cookbook
	# (Source: https://github.com/Liquid4All/cookbook/blob/main/finetuning/notebooks/sft_with_trl.ipynb)
	LORA_RANK = 8 # Rank of the low-rank update matrices (passed to LoraConfig as r)
	LORA_ALPHA = 16 # Scaling factor (2x the rank)
	LORA_DROPOUT = 0.1 # Dropout to prevent overfitting
	LORA_TARGET_MODULES = [ # Names of the modules inside the model that get LoRA adapters
	# Attention layers (Multi-Head Attention)
	"q_proj", # Query projection
	"k_proj", # Key projection
	"v_proj", # Value projection
	"out_proj", # Output projection (Liquid AI uses out_proj, not o_proj)
	# Feed-forward layers (GLU — Gated Linear Unit)
	"w1", # First feed-forward weight
	"w2", # Second feed-forward weight
	"w3", # Gate weight
	# Conv layers (unique to Liquid AI's architecture)
	"in_proj", # Input projection (conv block)
	]

	# Training hyperparameters
	NUM_EPOCHS = 3 # How many times to loop through all training data
	BATCH_SIZE = 1 # Per-device batch size; smaller batch = much less memory per step
	GRADIENT_ACCUMULATION_STEPS = 4 # Accumulate 4 steps, giving an effective batch size of BATCH_SIZE * 4 = 4
	LEARNING_RATE = 2e-4 # How fast the model learns (Liquid AI's SFT recommendation)
	MAX_LENGTH = 256 # Passed to SFTConfig as max_length; spam emails rarely need 512 tokens — 256 covers most cases
	LOGGING_STEPS = 10 # Print training loss every 10 steps


	# ---------------------------------------------------------------------------
	# 1. Check prerequisites
	# ---------------------------------------------------------------------------

	def check_prerequisites():
	    """Verify the inputs the pipeline needs and report what was found.

	    - Missing training file: prints an error and exits with status 1.
	    - Missing test file: prints a warning only (evaluation is skipped later).
	    - Reports whether the PyTorch MPS backend is available.
	    """
	    banner = "=" * 60
	    print(banner)
	    print("STEP 1: Checking prerequisites")
	    print(banner)

	    # Training data is mandatory — abort immediately when it is absent.
	    if os.path.isfile(TRAIN_FILE):
	        print(f" Training data: {TRAIN_FILE} ... OK")
	    else:
	        print(f" ERROR: Training data not found: {TRAIN_FILE}")
	        print(" Training data lives in the shared folder:")
	        print(f" {os.path.abspath(DATA_DIR)}")
	        sys.exit(1)

	    # Test data is optional.
	    if os.path.isfile(TEST_FILE):
	        print(f" Test data: {TEST_FILE} ... OK")
	    else:
	        print(f" WARNING: Test data not found: {TEST_FILE}")
	        print(" Evaluation step will be skipped.")

	    # Report the accelerator situation (Apple Silicon GPU via MPS, or not).
	    mps_status = (
	        " Apple Silicon: MPS backend available ... OK"
	        if torch.backends.mps.is_available()
	        else " WARNING: MPS not available. Training will use CPU (much slower)."
	    )
	    print(mps_status)

	    print()


	# ---------------------------------------------------------------------------
	# 2. Load training data
	# ---------------------------------------------------------------------------

	def load_jsonl_as_dataset(file_path):
	    """Load a JSONL file into a HuggingFace Dataset.

	    Each non-empty line in the JSONL file is expected to look like:
	    {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}

	    SFTTrainer expects a Dataset with a "messages" column in this format.

	    Args:
	        file_path: Path to the JSONL file to read.

	    Returns:
	        A Dataset built from the parsed lines, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-empty line is not valid JSON; the
	            message includes the file path and line number.
	    """
	    examples = []
	    # Read explicitly as UTF-8 so results do not depend on the platform's
	    # default locale encoding.
	    with open(file_path, "r", encoding="utf-8") as f:
	        for line_number, raw_line in enumerate(f, start=1):
	            line = raw_line.strip()
	            # Blank lines (commonly a trailing newline at end of file) are
	            # not records; json.loads("") would raise on them.
	            if not line:
	                continue
	            try:
	                examples.append(json.loads(line))
	            except json.JSONDecodeError as err:
	                # Re-raise the same exception type with the location added.
	                raise json.JSONDecodeError(
	                    f"{err.msg} (in {file_path}, line {line_number})",
	                    err.doc,
	                    err.pos,
	                ) from err
	    return Dataset.from_list(examples)


	# ---------------------------------------------------------------------------
	# 3. Run LoRA fine-tuning
	# ---------------------------------------------------------------------------

	def run_training():
	    """
	    Fine-tune the Liquid AI model using TRL's SFTTrainer with LoRA.

	    This is Liquid AI's officially recommended fine-tuning approach.
	    SFTTrainer handles:
	    - Applying the chat template to format messages correctly
	    - Tokenizing the text into numbers the model understands
	    - Running the training loop (forward pass, loss, backward pass, optimizer)

	    NOTE(review): nothing in the SFTConfig below enables assistant-only loss,
	    so the loss is presumably computed over the whole formatted conversation
	    rather than only the assistant's reply — confirm against the installed
	    TRL version.

	    The model is placed on the MPS device in bfloat16 when MPS is available;
	    otherwise it falls back to CPU in float32 (slow, but it runs instead of
	    crashing on a missing device).

	    Source: https://docs.liquid.ai/customization/finetuning-frameworks/trl

	    Returns:
	        (model, tokenizer): the model object that was trained and the
	        tokenizer that was saved alongside the adapter in ADAPTER_DIR.
	    """
	    print("=" * 60)
	    print("STEP 2: LoRA fine-tuning with TRL SFTTrainer")
	    print("=" * 60)

	    # --- Load the base model ---
	    print(f" Loading model: {MODEL_ID}")
	    print(" (First run downloads ~2.4 GB from HuggingFace — be patient)")
	    print()

	    # Pick device and dtype together: bfloat16 on the MPS GPU, float32 on CPU.
	    # Hard-coding "mps" here would crash on machines without the MPS backend
	    # even though check_prerequisites() announces a CPU fallback.
	    if torch.backends.mps.is_available():
	        device = "mps"
	        dtype = torch.bfloat16
	    else:
	        device = "cpu"
	        dtype = torch.float32
	        print(" MPS not available — loading the model on CPU in float32.")
	    use_bf16 = device == "mps"

	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        cache_dir=MODEL_CACHE_DIR,  # Save to local models/ directory
	        device_map=device,  # Pin everything to one device (avoids CPU spill on MPS)
	        torch_dtype=dtype,  # bfloat16 on MPS cuts memory in half
	    )
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

	    # Make sure the tokenizer has a padding token (needed for batched training)
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    print(" Model loaded successfully!")
	    print()

	    # --- Configure LoRA ---
	    # LoRA adds small trainable adapter layers to the modules listed in
	    # LORA_TARGET_MODULES (attention, feed-forward and conv projections).
	    # Only these adapters are trained — the original model weights stay frozen.
	    print(" Configuring LoRA adapters...")
	    print(f" Rank: {LORA_RANK}")
	    print(f" Alpha: {LORA_ALPHA}")
	    print(f" Target modules: {LORA_TARGET_MODULES}")
	    print(f" Dropout: {LORA_DROPOUT}")

	    peft_config = LoraConfig(
	        r=LORA_RANK,
	        lora_alpha=LORA_ALPHA,
	        lora_dropout=LORA_DROPOUT,
	        target_modules=LORA_TARGET_MODULES,
	        task_type="CAUSAL_LM",
	    )

	    # --- Load training data ---
	    print()
	    print(f" Loading training data from {TRAIN_FILE}...")
	    train_dataset = load_jsonl_as_dataset(TRAIN_FILE)
	    print(f" Loaded {len(train_dataset)} training examples")

	    # --- Configure the trainer ---
	    # SFTConfig holds all the training settings.
	    # SFTTrainer is the engine that runs the actual training loop.
	    print()
	    print(" Training configuration:")
	    print(f" Epochs: {NUM_EPOCHS}")
	    print(f" Batch size: {BATCH_SIZE}")
	    print(f" Grad accumulation: {GRADIENT_ACCUMULATION_STEPS} (effective batch = {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
	    print(f" Learning rate: {LEARNING_RATE}")
	    print(f" Max sequence length: {MAX_LENGTH}")
	    print(f" Logging every: {LOGGING_STEPS} steps")
	    print()

	    training_args = SFTConfig(
	        output_dir=ADAPTER_DIR,
	        num_train_epochs=NUM_EPOCHS,
	        per_device_train_batch_size=BATCH_SIZE,
	        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
	        learning_rate=LEARNING_RATE,
	        max_length=MAX_LENGTH,
	        logging_steps=LOGGING_STEPS,
	        save_strategy="epoch",  # Save a checkpoint after each epoch
	        bf16=use_bf16,  # bfloat16 mixed precision only when running on MPS
	        gradient_checkpointing=True,  # Save memory by recomputing activations in the backward pass
	        gradient_checkpointing_kwargs={"use_reentrant": False},  # More stable on MPS
	        optim="adamw_torch",  # Standard PyTorch AdamW (bitsandbytes not supported on MPS)
	        max_grad_norm=0.3,  # Clip gradients to prevent instability
	        report_to="none",  # Don't log to wandb or other services
	    )

	    trainer = SFTTrainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	        processing_class=tokenizer,
	        peft_config=peft_config,
	    )

	    # --- Run training ---
	    print(" Training started — watch the loss decrease over time!")
	    print(" (This will take ~1-1.5 hours on Apple Silicon for 3 epochs)")
	    print()

	    trainer.train()

	    # --- Save the trained adapter ---
	    trainer.save_model(ADAPTER_DIR)
	    tokenizer.save_pretrained(ADAPTER_DIR)

	    print()
	    print(f" Training complete! Adapter saved to: {ADAPTER_DIR}/")
	    print()

	    return model, tokenizer


	# ---------------------------------------------------------------------------
	# 4. Evaluate on test set
	# ---------------------------------------------------------------------------

	def run_evaluation(model, tokenizer):
	    """
	    Evaluate the trained model on the test set.

	    Computes the average loss (and perplexity) on held-out test data.
	    Lower perplexity = the model is better at predicting the test data.

	    Args:
	        model: The model to evaluate (as returned by run_training() or
	            load_saved_adapter()).
	        tokenizer: The tokenizer matching the model.

	    Returns:
	        None. Results are printed. If TEST_FILE does not exist, the step is
	        skipped with a message.
	    """
	    import math

	    print("=" * 60)
	    print("STEP 3: Evaluating on test set")
	    print("=" * 60)

	    if not os.path.isfile(TEST_FILE):
	        print(" Skipping — no test data found.")
	        print()
	        return

	    test_dataset = load_jsonl_as_dataset(TEST_FILE)
	    print(f" Loaded {len(test_dataset)} test examples")

	    # Use the same config but for evaluation only
	    eval_args = SFTConfig(
	        output_dir=ADAPTER_DIR,
	        per_device_eval_batch_size=1,
	        max_length=MAX_LENGTH,
	        bf16=torch.backends.mps.is_available(),
	        report_to="none",
	    )

	    trainer = SFTTrainer(
	        model=model,
	        args=eval_args,
	        train_dataset=test_dataset,  # Required by SFTTrainer, not used for eval
	        eval_dataset=test_dataset,
	        processing_class=tokenizer,
	    )

	    metrics = trainer.evaluate()

	    # Look the loss up once. Formatting the 'N/A' fallback string with
	    # ':.4f' would raise ValueError, so the missing case is handled
	    # separately instead of inside the f-string.
	    eval_loss = metrics.get("eval_loss")
	    if eval_loss is None:
	        print(" Test loss: N/A")
	    else:
	        print(f" Test loss: {eval_loss:.4f}")
	        try:
	            perplexity = math.exp(eval_loss)
	        except OverflowError:
	            # exp() overflows for very large losses; report infinity instead
	            # of crashing after the evaluation has already completed.
	            perplexity = float("inf")
	        print(f" Test perplexity: {perplexity:.2f}")

	    print()


	# ---------------------------------------------------------------------------
	# 5. Quick generation test
	# ---------------------------------------------------------------------------

	def run_generation_test(model, tokenizer):
	    """
	    Classify a sample spam email to verify the adapter actually works.

	    Uses the model and tokenizer passed in (nothing is loaded here), builds a
	    chat prompt around a clearly spammy test email, generates up to 200 new
	    tokens and prints the decoded response.

	    Args:
	        model: The fine-tuned model to generate with.
	        tokenizer: The tokenizer matching the model; must provide a chat
	            template.

	    Returns:
	        None. The response is printed.
	    """
	    print("=" * 60)
	    print("STEP 4: Quick generation test")
	    print("=" * 60)
	    print(" Classifying a sample spam email to verify the adapter works...")
	    print()

	    # A clearly spammy test email
	    test_email = (
	        "Congratulations!!! You have been selected as the WINNER of our "
	        "$1,000,000 prize draw! Click here NOW to claim your reward. "
	        "This offer expires in 24 hours. Act fast! Send your bank details "
	        "to claim your prize immediately."
	    )

	    # Build the chat messages — same format used during training
	    messages = [
	        {
	            "role": "system",
	            "content": (
	                "You are an email spam classifier. Analyze the email and classify it "
	                "as SPAM or HAM. Explain your reasoning."
	            ),
	        },
	        {
	            "role": "user",
	            "content": (
	                "Classify this email as SPAM or HAM. Give your classification on the "
	                "first line, then explain your reasoning in 2-3 sentences. Be specific "
	                "about what words, patterns, or signals you noticed.\n\n"
	                f"Email:\n{test_email}"
	            ),
	        },
	    ]

	    # Apply the chat template — converts messages into the format the model expects
	    prompt = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )

	    # Tokenize and move to the model's device
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

	    # Switch to inference mode explicitly. When the evaluation step is skipped
	    # (no test file), the model is still in training mode after
	    # trainer.train(), which would leave LoRA dropout active while generating.
	    model.eval()

	    # Generate the response
	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=200,
	            do_sample=True,
	            temperature=0.1,
	        )

	    # Decode only the NEW tokens (skip the input prompt)
	    prompt_length = inputs["input_ids"].shape[1]
	    new_tokens = output_ids[0][prompt_length:]
	    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

	    print(f" Test email: {test_email[:80]}...")
	    print()
	    print(" Model response:")
	    print(" " + "-" * 40)
	    print(f" {response}")
	    print(" " + "-" * 40)
	    print()


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def load_saved_adapter():
	    """Load the base model with the previously trained LoRA adapter.

	    Exits with status 1 when ADAPTER_DIR does not contain a saved adapter.
	    The base model is placed on the MPS device in bfloat16 when MPS is
	    available, otherwise on CPU in float32.

	    Returns:
	        (model, tokenizer): the base model wrapped with the saved adapter,
	        and the base model's tokenizer.
	    """
	    from peft import PeftModel

	    print("=" * 60)
	    print("STEP 2: Loading saved adapter (skipping training)")
	    print("=" * 60)

	    # A bare directory is not enough: the trainer creates its output
	    # directory before any adapter is written, so an interrupted run leaves
	    # a directory with no adapter in it. Require the adapter config file.
	    adapter_config_path = os.path.join(ADAPTER_DIR, "adapter_config.json")
	    if not os.path.isfile(adapter_config_path):
	        print(f" ERROR: No saved adapter found at {ADAPTER_DIR}/")
	        print(" Run without --eval-only first to train the model.")
	        sys.exit(1)

	    # Pick device and dtype together so the script still runs (slowly) on
	    # machines without the MPS backend instead of crashing on "mps".
	    if torch.backends.mps.is_available():
	        device = "mps"
	        dtype = torch.bfloat16
	    else:
	        device = "cpu"
	        dtype = torch.float32
	        print(" MPS not available — loading the model on CPU in float32.")

	    print(f" Loading base model: {MODEL_ID}")
	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        cache_dir=MODEL_CACHE_DIR,
	        device_map=device,
	        torch_dtype=dtype,
	    )
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)
	    # Same padding-token fallback as used for training.
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    print(f" Loading LoRA adapter from {ADAPTER_DIR}/")
	    model = PeftModel.from_pretrained(model, ADAPTER_DIR)
	    print(" Adapter loaded successfully!")
	    print()

	    return model, tokenizer


	def main():
	    """Run the full pipeline: prerequisites, train (or load), evaluate, generate.

	    Command-line flags (read from sys.argv):
	        --eval-only   Skip training and load the adapter saved in ADAPTER_DIR.
	    """
	    eval_only = "--eval-only" in sys.argv

	    # Banner rule. The separator lines previously read `print("" 60)`, which
	    # is a syntax error; the intended expression is a repeated character.
	    rule = "*" * 60

	    print()
	    print(rule)
	    print(" Spam Classifier — Liquid AI LoRA Fine-Tuning")
	    print(" Model: LFM2.5-1.2B-Instruct")
	    print(" Method: TRL SFTTrainer + PEFT LoRA")
	    if eval_only:
	        print(" Mode: Evaluation only (skipping training)")
	    print(rule)
	    print()

	    # Step 1: Check that everything is in place
	    check_prerequisites()

	    # Step 2: Train or load saved adapter
	    if eval_only:
	        model, tokenizer = load_saved_adapter()
	    else:
	        model, tokenizer = run_training()

	    # Step 3: Evaluate on test set
	    run_evaluation(model, tokenizer)

	    # Step 4: Quick sanity check — classify a sample email
	    run_generation_test(model, tokenizer)

	    # Done!
	    print("=" * 60)
	    print(" All done!")
	    if not eval_only:
	        print(f" Adapter weights saved to: {ADAPTER_DIR}/")
	    print()
	    print(" Next step: python3 app.py")
	    print("=" * 60)
	    print()


	if __name__ == "__main__":
	    main()