"""
fine_tune.py — Fine-tune Liquid AI's LFM2.5-1.2B model for spam classification using LoRA.
This script uses Liquid AI's officially recommended fine-tuning approach:
- TRL's SFTTrainer for supervised fine-tuning
- PEFT's LoRA for memory-efficient adapter training
It does three things:
1. Trains LoRA adapters on our spam/ham training data
2. Evaluates the trained model on the test set
3. Runs a quick generation test to verify the adapter works
Prerequisites:
- Training data in ../shared_training_data/train.jsonl (folder shared with spam-classifier-mlx)
- Internet connection to download the model (first run only)
Usage:
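python3 fine_tune.py              # train, evaluate, then run the generation test
python3 fine_tune.py --eval-only  # skip training and load the adapter saved in adapters/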
Hardware: Requires Apple Silicon Mac (M1/M2/M3/M4). Targets ~8-12 GB unified memory.
"""
import json
import os
import sys
import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# --- Model ------------------------------------------------------------------
# Hugging Face Hub identifier of the Liquid AI base model (downloaded on demand).
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
# Local directory passed as ``cache_dir`` when the model/tokenizer are loaded.
MODEL_CACHE_DIR = "models"

# --- Data -------------------------------------------------------------------
# Training/test JSONL files live in a folder shared with spam-classifier-mlx.
DATA_DIR = "../shared_training_data"
TRAIN_FILE, TEST_FILE = (
    os.path.join(DATA_DIR, split_file) for split_file in ("train.jsonl", "test.jsonl")
)

# --- Output -----------------------------------------------------------------
# Directory that receives checkpoints and the final LoRA adapter weights.
ADAPTER_DIR = "adapters"

# --- LoRA hyperparameters ---------------------------------------------------
# Values taken from Liquid AI's official cookbook:
# https://github.com/Liquid4All/cookbook/blob/main/finetuning/notebooks/sft_with_trl.ipynb
LORA_RANK = 8                 # Rank of the low-rank adapter matrices
LORA_ALPHA = 2 * LORA_RANK    # Scaling factor, kept at twice the rank (= 16)
LORA_DROPOUT = 0.1            # Dropout applied inside the adapters against overfitting

# Names of the projection modules that receive adapters, grouped by block type.
_ATTENTION_MODULES = ["q_proj", "k_proj", "v_proj", "out_proj"]  # Liquid AI uses out_proj, not o_proj
_FEED_FORWARD_MODULES = ["w1", "w2", "w3"]                      # GLU feed-forward: two weights + gate
_CONV_MODULES = ["in_proj"]                                      # Input projection of the conv block
LORA_TARGET_MODULES = _ATTENTION_MODULES + _FEED_FORWARD_MODULES + _CONV_MODULES

# --- Training hyperparameters -----------------------------------------------
NUM_EPOCHS = 3                   # Full passes over the training set
BATCH_SIZE = 1                   # Per-device batch; kept small to limit memory per step
GRADIENT_ACCUMULATION_STEPS = 4  # Accumulate 4 steps -> effective batch size of 4
LEARNING_RATE = 2e-4             # Liquid AI's recommended SFT learning rate
MAX_LENGTH = 256                 # Token budget per example; 256 covers most spam emails
LOGGING_STEPS = 10               # Report the training loss every 10 optimizer steps
# ---------------------------------------------------------------------------
# 1. Check prerequisites
# ---------------------------------------------------------------------------
def check_prerequisites():
    """Verify the script's inputs and report which compute backend is available.

    The training file is mandatory: when it is missing the process exits with
    status 1. A missing test file or an unavailable MPS backend only produces
    a warning on stdout.
    """
    rule = "=" * 60
    print(rule)
    print("STEP 1: Checking prerequisites")
    print(rule)

    # Mandatory input: without training data there is nothing to do.
    if os.path.isfile(TRAIN_FILE):
        print(f" Training data: {TRAIN_FILE} ... OK")
    else:
        print(f" ERROR: Training data not found: {TRAIN_FILE}")
        print(" Training data lives in the shared folder:")
        print(f" {os.path.abspath(DATA_DIR)}")
        sys.exit(1)

    # Optional input: the evaluation step is skipped when it is absent.
    if os.path.isfile(TEST_FILE):
        print(f" Test data: {TEST_FILE} ... OK")
    else:
        print(f" WARNING: Test data not found: {TEST_FILE}")
        print(" Evaluation step will be skipped.")

    # Report whether PyTorch can see the Apple Silicon (MPS) backend.
    has_mps = torch.backends.mps.is_available()
    if not has_mps:
        print(" WARNING: MPS not available. Training will use CPU (much slower).")
    else:
        print(" Apple Silicon: MPS backend available ... OK")
    print()
# ---------------------------------------------------------------------------
# 2. Load training data
# ---------------------------------------------------------------------------
def load_jsonl_as_dataset(file_path):
    """Load a JSONL file into a HuggingFace Dataset.

    Each non-blank line is parsed as one JSON object. The training files are
    expected to look like:
        {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}
    The parsed objects are handed to ``Dataset.from_list`` unchanged and in
    file order; this function does not validate the "messages" schema itself.

    Args:
        file_path: Path to the JSONL file to read (decoded as UTF-8).

    Returns:
        A ``Dataset`` containing one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and line number.
    """
    examples = []
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            record = raw_line.strip()
            # Blank lines (e.g. extra trailing newlines) are not records;
            # json.loads("") would raise, so skip them.
            if not record:
                continue
            try:
                examples.append(json.loads(record))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type with file/line context.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_number}: {err.msg}", err.doc, err.pos
                ) from err
    return Dataset.from_list(examples)
# ---------------------------------------------------------------------------
# 3. Run LoRA fine-tuning
# ---------------------------------------------------------------------------
def run_training():
    """Fine-tune the Liquid AI model using TRL's SFTTrainer with LoRA.

    Steps performed here:
      1. Load the base model and tokenizer (cached under MODEL_CACHE_DIR).
      2. Build a LoRA configuration from the LORA_* constants.
      3. Load the training JSONL file into a Dataset.
      4. Train with SFTTrainer and save the adapter and tokenizer to ADAPTER_DIR.

    Chat-template formatting, tokenization and the training loop are delegated
    to SFTTrainer.
    NOTE(review): an earlier version of this docstring said the prompt is
    masked so the loss only covers the assistant's reply. No option requesting
    that (e.g. ``assistant_only_loss``) is set in the SFTConfig below —
    confirm against the TRL documentation for the installed version.

    Source: https://docs.liquid.ai/customization/finetuning-frameworks/trl

    Returns:
        A ``(model, tokenizer)`` tuple: the model object that was handed to
        SFTTrainer, and the tokenizer used for training.
    """
    print("=" * 60)
    print("STEP 2: LoRA fine-tuning with TRL SFTTrainer")
    print("=" * 60)

    # --- Load the base model ---
    print(f" Loading model: {MODEL_ID}")
    print(" (First run downloads ~2.4 GB from HuggingFace — be patient)")
    print()

    # Pick the device from what is actually available. check_prerequisites()
    # tells the user that training falls back to the CPU when MPS is missing,
    # so the device must not be hard-coded to "mps".
    mps_available = torch.backends.mps.is_available()
    device = "mps" if mps_available else "cpu"
    # bfloat16 is only used together with MPS (this does not probe bf16
    # support itself); the CPU fallback stays in float32 so the weight dtype
    # and the trainer's bf16 flag always agree.
    use_bf16 = mps_available
    model_dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,  # Save to local models/ directory
        device_map=device,          # Pin the whole model to one device
        torch_dtype=model_dtype,    # bfloat16 on MPS halves the weight memory
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token (needed for batched training)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(" Model loaded successfully!")
    print()

    # --- Configure LoRA ---
    # LoRA adds small trainable adapters to the modules listed in
    # LORA_TARGET_MODULES (attention, feed-forward and conv projections).
    # Only these adapters are trained — the original model weights stay frozen.
    print(" Configuring LoRA adapters...")
    print(f" Rank: {LORA_RANK}")
    print(f" Alpha: {LORA_ALPHA}")
    print(f" Target modules: {LORA_TARGET_MODULES}")
    print(f" Dropout: {LORA_DROPOUT}")
    peft_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        task_type="CAUSAL_LM",
    )

    # --- Load training data ---
    print()
    print(f" Loading training data from {TRAIN_FILE}...")
    train_dataset = load_jsonl_as_dataset(TRAIN_FILE)
    print(f" Loaded {len(train_dataset)} training examples")

    # --- Configure the trainer ---
    # SFTConfig holds all the training settings.
    # SFTTrainer is the engine that runs the actual training loop.
    print()
    print(" Training configuration:")
    print(f" Epochs: {NUM_EPOCHS}")
    print(f" Batch size: {BATCH_SIZE}")
    print(f" Grad accumulation: {GRADIENT_ACCUMULATION_STEPS} (effective batch = {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
    print(f" Learning rate: {LEARNING_RATE}")
    print(f" Max sequence length: {MAX_LENGTH}")
    print(f" Logging every: {LOGGING_STEPS} steps")
    print()
    training_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        max_length=MAX_LENGTH,
        logging_steps=LOGGING_STEPS,
        save_strategy="epoch",        # Save a checkpoint after each epoch
        bf16=use_bf16,                # Matches the dtype the model was loaded in
        gradient_checkpointing=True,  # Trade compute for memory during backprop
        gradient_checkpointing_kwargs={"use_reentrant": False},  # More stable on MPS
        optim="adamw_torch",          # Standard PyTorch AdamW (bitsandbytes not supported on MPS)
        max_grad_norm=0.3,            # Clip gradients to prevent instability
        report_to="none",             # Don't log to wandb or other services
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
        peft_config=peft_config,
    )

    # --- Run training ---
    print(" Training started — watch the loss decrease over time!")
    print(" (This will take ~1-1.5 hours on Apple Silicon for 3 epochs)")
    print()
    trainer.train()

    # --- Save the trained adapter ---
    trainer.save_model(ADAPTER_DIR)
    tokenizer.save_pretrained(ADAPTER_DIR)
    print()
    print(f" Training complete! Adapter saved to: {ADAPTER_DIR}/")
    print()
    return model, tokenizer
# ---------------------------------------------------------------------------
# 4. Evaluate on test set
# ---------------------------------------------------------------------------
def run_evaluation(model, tokenizer):
    """Evaluate the trained model on the held-out test set.

    Runs ``SFTTrainer.evaluate()`` over TEST_FILE and prints the average loss
    and the corresponding perplexity (``exp(loss)``). Lower perplexity means
    the model predicts the test data better. When TEST_FILE does not exist
    the step is skipped with a message.

    Args:
        model: The fine-tuned model to evaluate.
        tokenizer: The tokenizer that belongs to ``model``.

    Returns:
        None. Results are printed to stdout.
    """
    import math

    print("=" * 60)
    print("STEP 3: Evaluating on test set")
    print("=" * 60)
    if not os.path.isfile(TEST_FILE):
        print(" Skipping — no test data found.")
        print()
        return

    test_dataset = load_jsonl_as_dataset(TEST_FILE)
    print(f" Loaded {len(test_dataset)} test examples")

    # Use the same config but for evaluation only
    eval_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        per_device_eval_batch_size=1,
        max_length=MAX_LENGTH,
        bf16=torch.backends.mps.is_available(),
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        args=eval_args,
        train_dataset=test_dataset,  # Required by SFTTrainer, not used for eval
        eval_dataset=test_dataset,
        processing_class=tokenizer,
    )
    metrics = trainer.evaluate()

    # Look the metric up once. Formatting the "N/A" placeholder with ":.4f"
    # would raise ValueError, so the missing case is handled separately.
    eval_loss = metrics.get("eval_loss")
    if eval_loss is None:
        print(" Test loss: N/A")
    else:
        print(f" Test loss: {eval_loss:.4f}")
        perplexity = math.exp(eval_loss)
        print(f" Test perplexity: {perplexity:.2f}")
    print()
# ---------------------------------------------------------------------------
# 5. Quick generation test
# ---------------------------------------------------------------------------
def run_generation_test(model, tokenizer):
    """Classify a sample spam email to verify the adapter actually works.

    Builds a system + user chat prompt around a clearly spammy email, lets the
    given model generate up to 200 new tokens, and prints the decoded reply.
    The model and tokenizer are supplied by the caller; nothing is loaded here.

    Args:
        model: The fine-tuned model. It is switched to eval mode as a side
            effect.
        tokenizer: The tokenizer that belongs to ``model``; must provide a
            chat template.

    Returns:
        None. The prompt excerpt and the model's reply are printed to stdout.
    """
    print("=" * 60)
    print("STEP 4: Quick generation test")
    print("=" * 60)
    print(" Classifying a sample spam email to verify the adapter works...")
    print()

    # A clearly spammy test email
    test_email = (
        "Congratulations!!! You have been selected as the WINNER of our "
        "$1,000,000 prize draw! Click here NOW to claim your reward. "
        "This offer expires in 24 hours. Act fast! Send your bank details "
        "to claim your prize immediately."
    )

    # Build the chat messages — same format used during training
    messages = [
        {
            "role": "system",
            "content": (
                "You are an email spam classifier. Analyze the email and classify it "
                "as SPAM or HAM. Explain your reasoning."
            ),
        },
        {
            "role": "user",
            "content": (
                "Classify this email as SPAM or HAM. Give your classification on the "
                "first line, then explain your reasoning in 2-3 sentences. Be specific "
                "about what words, patterns, or signals you noticed.\n\n"
                f"Email:\n{test_email}"
            ),
        },
    ]

    # Apply the chat template — converts messages into the format the model expects
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generation must run in inference mode so dropout (including the LoRA
    # dropout) is disabled. The model may still be in training mode here:
    # run_evaluation() can return early without touching it.
    model.eval()

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
        )

    # Decode only the NEW tokens (skip the input prompt)
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(f" Test email: {test_email[:80]}...")
    print()
    print(" Model response:")
    print(" " + "-" * 40)
    print(f" {response}")
    print(" " + "-" * 40)
    print()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def load_saved_adapter():
    """Load the base model with the previously trained LoRA adapter.

    Used by the ``--eval-only`` mode. Exits with status 1 when ADAPTER_DIR
    does not exist.

    Returns:
        A ``(model, tokenizer)`` tuple: the base model wrapped by
        ``PeftModel.from_pretrained`` with the adapter from ADAPTER_DIR, and
        the base model's tokenizer.
    """
    from peft import PeftModel

    print("=" * 60)
    print("STEP 2: Loading saved adapter (skipping training)")
    print("=" * 60)
    if not os.path.isdir(ADAPTER_DIR):
        print(f" ERROR: No saved adapter found at {ADAPTER_DIR}/")
        print(" Run without --eval-only first to train the model.")
        sys.exit(1)

    # Pick the device from what is actually available. check_prerequisites()
    # advertises a CPU fallback when MPS is missing, so the device must not be
    # hard-coded to "mps". bfloat16 is only used together with MPS; the CPU
    # fallback stays in float32.
    mps_available = torch.backends.mps.is_available()
    device = "mps" if mps_available else "cpu"
    model_dtype = torch.bfloat16 if mps_available else torch.float32

    print(f" Loading base model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,
        device_map=device,
        torch_dtype=model_dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)
    # Same padding-token fallback that is applied before training.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f" Loading LoRA adapter from {ADAPTER_DIR}/")
    model = PeftModel.from_pretrained(model, ADAPTER_DIR)
    print(" Adapter loaded successfully!")
    print()
    return model, tokenizer
def main():
    """Run the full pipeline: prerequisites, train (or load), evaluate, generate.

    Passing ``--eval-only`` on the command line skips training and loads the
    adapter previously saved in ADAPTER_DIR instead.
    """
    eval_only = "--eval-only" in sys.argv
    star_rule = "*" * 60
    rule = "=" * 60

    # Opening banner; the mode line only appears in evaluation-only runs.
    banner = [
        "",
        star_rule,
        " Spam Classifier — Liquid AI LoRA Fine-Tuning",
        " Model: LFM2.5-1.2B-Instruct",
        " Method: TRL SFTTrainer + PEFT LoRA",
    ]
    if eval_only:
        banner.append(" Mode: Evaluation only (skipping training)")
    banner.extend([star_rule, ""])
    for banner_line in banner:
        print(banner_line)

    # Step 1: make sure the inputs are in place.
    check_prerequisites()

    # Step 2: obtain a model, either by training or from the saved adapter.
    obtain_model = load_saved_adapter if eval_only else run_training
    model, tokenizer = obtain_model()

    # Step 3: measure loss/perplexity on the test set.
    run_evaluation(model, tokenizer)

    # Step 4: sanity check by classifying one sample email.
    run_generation_test(model, tokenizer)

    # Closing summary.
    print(rule)
    print(" All done!")
    if not eval_only:
        print(f" Adapter weights saved to: {ADAPTER_DIR}/")
    print()
    print(" Next step: python3 app.py")
    print(rule)
    print()


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|