Spaces:
Sleeping
Sleeping
| """ | |
| fine_tune.py — Fine-tune Liquid AI's LFM2.5-1.2B model for spam classification using LoRA. | |
| This script uses Liquid AI's officially recommended fine-tuning approach: | |
| - TRL's SFTTrainer for supervised fine-tuning | |
| - PEFT's LoRA for memory-efficient adapter training | |
| It does three things: | |
| 1. Trains LoRA adapters on our spam/ham training data | |
| 2. Evaluates the trained model on the test set | |
| 3. Runs a quick generation test to verify the adapter works | |
| Prerequisites: | |
| - Training data in ../shared_training_data/train.jsonl (folder shared with spam-classifier-mlx) | |
| - Internet connection to download the model (first run only) | |
| Usage: | |
| python3 fine_tune.py [--eval-only] (--eval-only skips training and loads the saved adapter) | |
| Hardware: Requires Apple Silicon Mac (M1/M2/M3/M4). Targets ~8-12 GB unified memory. | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import torch | |
| from datasets import Dataset | |
| from peft import LoraConfig | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from trl import SFTConfig, SFTTrainer | |
| # --------------------------------------------------------------------------- | |
| # Configuration | |
| # --------------------------------------------------------------------------- | |
# Hugging Face Hub id of the Liquid AI base model (fetched automatically).
MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"

# Local directory used as the download cache for the base model.
MODEL_CACHE_DIR = "models"

# Training/test data live in a folder shared with spam-classifier-mlx.
DATA_DIR = "../shared_training_data"
TRAIN_FILE = os.path.join(DATA_DIR, "train.jsonl")
TEST_FILE = os.path.join(DATA_DIR, "test.jsonl")

# Output directory for the trained LoRA adapter weights.
ADAPTER_DIR = "adapters"

# LoRA hyperparameters — taken from Liquid AI's official cookbook
# (https://github.com/Liquid4All/cookbook/blob/main/finetuning/notebooks/sft_with_trl.ipynb)
LORA_RANK = 8                  # Rank of the low-rank adapter matrices
LORA_ALPHA = 2 * LORA_RANK     # Scaling factor, kept at twice the rank (= 16)
LORA_DROPOUT = 0.1             # Dropout on the adapter path to limit overfitting

# Names of the layers that receive LoRA adapters, grouped by the part of the
# network they belong to (grouping as described by the original author).
LORA_TARGET_MODULES = (
    # Attention projections (Liquid AI names the output "out_proj", not "o_proj")
    ["q_proj", "k_proj", "v_proj", "out_proj"]
    # Gated feed-forward (GLU) weights: first, second, and gate
    + ["w1", "w2", "w3"]
    # Input projection of the conv block
    + ["in_proj"]
)

# Training hyperparameters
NUM_EPOCHS = 3                    # Full passes over the training data
BATCH_SIZE = 1                    # Per-step batch; small to keep memory low
GRADIENT_ACCUMULATION_STEPS = 4   # Steps accumulated per update (effective batch of 4)
LEARNING_RATE = 2e-4              # Liquid AI's recommended SFT learning rate
MAX_LENGTH = 256                  # Token limit per example; most spam emails fit
LOGGING_STEPS = 10                # Report the training loss every 10 steps
| # --------------------------------------------------------------------------- | |
| # 1. Check prerequisites | |
| # --------------------------------------------------------------------------- | |
def check_prerequisites():
    """Verify the run's inputs and report which compute backend is available.

    Exits the process with status 1 when the training file is missing.
    A missing test file or an unavailable MPS backend only prints a warning.
    """
    rule = "=" * 60
    print(rule)
    print("STEP 1: Checking prerequisites")
    print(rule)

    # The training file is mandatory — abort if it is absent.
    if os.path.isfile(TRAIN_FILE):
        print(f" Training data: {TRAIN_FILE} ... OK")
    else:
        print(f" ERROR: Training data not found: {TRAIN_FILE}")
        print(" Training data lives in the shared folder:")
        print(f" {os.path.abspath(DATA_DIR)}")
        sys.exit(1)

    # The test file is optional — without it the evaluation step is skipped.
    if os.path.isfile(TEST_FILE):
        print(f" Test data: {TEST_FILE} ... OK")
    else:
        print(f" WARNING: Test data not found: {TEST_FILE}")
        print(" Evaluation step will be skipped.")

    # Report whether the Apple Silicon GPU backend can be used.
    mps_ready = torch.backends.mps.is_available()
    print(
        " Apple Silicon: MPS backend available ... OK"
        if mps_ready
        else " WARNING: MPS not available. Training will use CPU (much slower)."
    )
    print()
| # --------------------------------------------------------------------------- | |
| # 2. Load training data | |
| # --------------------------------------------------------------------------- | |
def load_jsonl_as_dataset(file_path):
    """Load a JSONL file into a HuggingFace Dataset.

    Each non-blank line of the file is parsed as one JSON object, e.g.:
        {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}
    The parsed objects are handed to ``Dataset.from_list`` unchanged.

    Args:
        file_path: Path to the ``.jsonl`` file to read.

    Returns:
        A ``Dataset`` with one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly: email text frequently contains non-ASCII
    # characters and the platform default encoding is not guaranteed.
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at end of file) — passing
        # an empty string to json.loads would raise.
        examples = [json.loads(line) for line in f if line.strip()]
    return Dataset.from_list(examples)
| # --------------------------------------------------------------------------- | |
| # 3. Run LoRA fine-tuning | |
| # --------------------------------------------------------------------------- | |
def run_training():
    """Fine-tune the Liquid AI model using TRL's SFTTrainer with LoRA.

    Steps performed:
      1. Load the base model and tokenizer (MPS when available, otherwise CPU).
      2. Build the LoRA configuration from the module-level constants.
      3. Load the training set from TRAIN_FILE.
      4. Train with SFTTrainer and save adapter + tokenizer to ADAPTER_DIR.

    The dataset is passed in the conversational "messages" format and the
    tokenizer is passed as ``processing_class``; chat templating and
    tokenization are left to SFTTrainer.

    NOTE(review): no ``assistant_only_loss`` / ``completion_only_loss`` option
    is set below, so the loss is presumably computed over the whole templated
    sequence rather than only the assistant reply — confirm against the
    installed TRL version before relying on prompt masking.

    Source: https://docs.liquid.ai/customization/finetuning-frameworks/trl

    Returns:
        (model, tokenizer): the trainer's model after training (with the LoRA
        adapter attached) and the tokenizer used for training.
    """
    print("=" * 60)
    print("STEP 2: LoRA fine-tuning with TRL SFTTrainer")
    print("=" * 60)

    # --- Load the base model ---
    print(f" Loading model: {MODEL_ID}")
    print(" (First run downloads ~2.4 GB from HuggingFace — be patient)")
    print()

    # Use the Apple Silicon GPU when present; otherwise fall back to the CPU
    # so the "will use CPU" warning from the prerequisite check holds true
    # instead of the model load failing on a hard-coded "mps" device.
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    # bfloat16 is only used on the MPS path (as before); the CPU fallback
    # stays in float32.
    use_bf16 = device == "mps"
    model_dtype = torch.bfloat16 if use_bf16 else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,  # Save to local models/ directory
        device_map=device,          # Pin the whole model to one device
        torch_dtype=model_dtype,    # bfloat16 on MPS halves the weight memory
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token (needed for batched training)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(" Model loaded successfully!")
    print()

    # --- Configure LoRA ---
    # LoRA adds small trainable adapters to every module listed in
    # LORA_TARGET_MODULES (attention, feed-forward, and conv input
    # projections). Only the adapters are trained — base weights stay frozen.
    print(" Configuring LoRA adapters...")
    print(f" Rank: {LORA_RANK}")
    print(f" Alpha: {LORA_ALPHA}")
    print(f" Target modules: {LORA_TARGET_MODULES}")
    print(f" Dropout: {LORA_DROPOUT}")
    peft_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        task_type="CAUSAL_LM",
    )

    # --- Load training data ---
    print()
    print(f" Loading training data from {TRAIN_FILE}...")
    train_dataset = load_jsonl_as_dataset(TRAIN_FILE)
    print(f" Loaded {len(train_dataset)} training examples")

    # --- Configure the trainer ---
    # SFTConfig holds all the training settings.
    # SFTTrainer is the engine that runs the actual training loop.
    print()
    print(" Training configuration:")
    print(f" Epochs: {NUM_EPOCHS}")
    print(f" Batch size: {BATCH_SIZE}")
    print(f" Grad accumulation: {GRADIENT_ACCUMULATION_STEPS} (effective batch = {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
    print(f" Learning rate: {LEARNING_RATE}")
    print(f" Max sequence length: {MAX_LENGTH}")
    print(f" Logging every: {LOGGING_STEPS} steps")
    print()
    training_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        max_length=MAX_LENGTH,
        logging_steps=LOGGING_STEPS,
        save_strategy="epoch",        # Save a checkpoint after each epoch
        bf16=use_bf16,                # bfloat16 training on the MPS path only
        gradient_checkpointing=True,  # Save memory by recomputing activations in the backward pass
        gradient_checkpointing_kwargs={"use_reentrant": False},  # More stable on MPS
        optim="adamw_torch",          # Standard PyTorch AdamW (bitsandbytes not supported on MPS)
        max_grad_norm=0.3,            # Clip gradients to prevent instability
        report_to="none",             # Don't log to wandb or other services
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
        peft_config=peft_config,
    )

    # --- Run training ---
    print(" Training started — watch the loss decrease over time!")
    print(" (This will take ~1-1.5 hours on Apple Silicon for 3 epochs)")
    print()
    trainer.train()

    # --- Save the trained adapter ---
    trainer.save_model(ADAPTER_DIR)
    tokenizer.save_pretrained(ADAPTER_DIR)
    print()
    print(f" Training complete! Adapter saved to: {ADAPTER_DIR}/")
    print()

    # Return the trainer's own model object: it is the one that was trained
    # and saved, rather than the pre-wrap reference passed to the trainer.
    return trainer.model, tokenizer
| # --------------------------------------------------------------------------- | |
| # 4. Evaluate on test set | |
| # --------------------------------------------------------------------------- | |
def run_evaluation(model, tokenizer):
    """Evaluate the model on the test set and print loss and perplexity.

    Builds an evaluation-only SFTTrainer over TEST_FILE, calls
    ``trainer.evaluate()``, and reports ``eval_loss`` together with
    ``exp(eval_loss)`` as perplexity (lower is better). If TEST_FILE does not
    exist the step is skipped with a message.

    Args:
        model: The model to evaluate.
        tokenizer: The tokenizer passed to the trainer as ``processing_class``.

    Returns:
        None. Results are printed.
    """
    import math

    print("=" * 60)
    print("STEP 3: Evaluating on test set")
    print("=" * 60)

    if not os.path.isfile(TEST_FILE):
        print(" Skipping — no test data found.")
        print()
        return

    test_dataset = load_jsonl_as_dataset(TEST_FILE)
    print(f" Loaded {len(test_dataset)} test examples")

    # Use the same config but for evaluation only
    eval_args = SFTConfig(
        output_dir=ADAPTER_DIR,
        per_device_eval_batch_size=1,
        max_length=MAX_LENGTH,
        bf16=torch.backends.mps.is_available(),
        report_to="none",
    )
    trainer = SFTTrainer(
        model=model,
        args=eval_args,
        train_dataset=test_dataset,  # Required by SFTTrainer, not used for eval
        eval_dataset=test_dataset,
        processing_class=tokenizer,
    )
    metrics = trainer.evaluate()

    # Fetch the metric once and branch on it: applying a float format spec
    # (":.4f") to a missing-value placeholder string would raise ValueError.
    eval_loss = metrics.get("eval_loss")
    if eval_loss is None:
        print(" Test loss: N/A")
    else:
        print(f" Test loss: {eval_loss:.4f}")
        try:
            perplexity = math.exp(eval_loss)
        except OverflowError:
            # exp() overflows for very large losses; report infinity instead
            # of crashing after the evaluation has already completed.
            perplexity = float("inf")
        print(f" Test perplexity: {perplexity:.2f}")
    print()
| # --------------------------------------------------------------------------- | |
| # 5. Quick generation test | |
| # --------------------------------------------------------------------------- | |
def run_generation_test(model, tokenizer):
    """Classify one obviously spammy email as a sanity check of the model.

    Builds a system + user chat prompt around a fixed sample email, applies
    the tokenizer's chat template, generates up to 200 new tokens with the
    given model, and prints the decoded response.

    Args:
        model: The model to generate with (expected to already carry the
            trained adapter; nothing is loaded here).
        tokenizer: The tokenizer providing the chat template and decoding.

    Returns:
        None. The response is printed.
    """
    print("=" * 60)
    print("STEP 4: Quick generation test")
    print("=" * 60)
    print(" Classifying a sample spam email to verify the adapter works...")
    print()

    # A clearly spammy test email
    test_email = (
        "Congratulations!!! You have been selected as the WINNER of our "
        "$1,000,000 prize draw! Click here NOW to claim your reward. "
        "This offer expires in 24 hours. Act fast! Send your bank details "
        "to claim your prize immediately."
    )

    # Build the chat messages — same format used during training
    messages = [
        {
            "role": "system",
            "content": (
                "You are an email spam classifier. Analyze the email and classify it "
                "as SPAM or HAM. Explain your reasoning."
            ),
        },
        {
            "role": "user",
            "content": (
                "Classify this email as SPAM or HAM. Give your classification on the "
                "first line, then explain your reasoning in 2-3 sentences. Be specific "
                "about what words, patterns, or signals you noticed.\n\n"
                f"Email:\n{test_email}"
            ),
        },
    ]

    # Apply the chat template — converts messages into the format the model expects
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Switch to inference mode before generating. If evaluation was skipped
    # right after training, the model may still be in training mode, which
    # would leave dropout active during generation.
    model.eval()

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
        )

    # Decode only the NEW tokens (skip the input prompt)
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(f" Test email: {test_email[:80]}...")
    print()
    print(" Model response:")
    print(" " + "-" * 40)
    print(f" {response}")
    print(" " + "-" * 40)
    print()
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def load_saved_adapter():
    """Load the base model with the previously trained LoRA adapter.

    Used by the ``--eval-only`` path instead of training. Exits the process
    with status 1 when no saved adapter is present in ADAPTER_DIR.

    Returns:
        (model, tokenizer): the base model wrapped with the saved adapter,
        and the base model's tokenizer.
    """
    from peft import PeftModel

    print("=" * 60)
    print("STEP 2: Loading saved adapter (skipping training)")
    print("=" * 60)

    # Look for the adapter's config file rather than just the directory: the
    # directory is also the training output dir, so it can exist (e.g. with
    # only intermediate checkpoints) without a final saved adapter in it.
    adapter_config = os.path.join(ADAPTER_DIR, "adapter_config.json")
    if not os.path.isfile(adapter_config):
        print(f" ERROR: No saved adapter found at {ADAPTER_DIR}/")
        print(" Run without --eval-only first to train the model.")
        sys.exit(1)

    # Use the Apple Silicon GPU when present, otherwise fall back to the CPU
    # instead of failing on a hard-coded "mps" device.
    on_mps = torch.backends.mps.is_available()
    device = "mps" if on_mps else "cpu"
    model_dtype = torch.bfloat16 if on_mps else torch.float32

    print(f" Loading base model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=MODEL_CACHE_DIR,
        device_map=device,
        torch_dtype=model_dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=MODEL_CACHE_DIR)

    # Make sure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f" Loading LoRA adapter from {ADAPTER_DIR}/")
    model = PeftModel.from_pretrained(model, ADAPTER_DIR)
    print(" Adapter loaded successfully!")
    print()
    return model, tokenizer
def main():
    """Run the full pipeline: prerequisites, train or load, evaluate, generate.

    Passing ``--eval-only`` on the command line skips training and loads the
    previously saved adapter instead.
    """
    skip_training = "--eval-only" in sys.argv
    stars = "*" * 60
    rule = "=" * 60

    # Banner
    print()
    print(stars)
    print(" Spam Classifier — Liquid AI LoRA Fine-Tuning")
    print(" Model: LFM2.5-1.2B-Instruct")
    print(" Method: TRL SFTTrainer + PEFT LoRA")
    if skip_training:
        print(" Mode: Evaluation only (skipping training)")
    print(stars)
    print()

    # Step 1: make sure the required inputs are in place
    check_prerequisites()

    # Step 2: obtain the model — either by training or from the saved adapter
    obtain_model = load_saved_adapter if skip_training else run_training
    model, tokenizer = obtain_model()

    # Step 3: measure loss/perplexity on the test set
    run_evaluation(model, tokenizer)

    # Step 4: sanity check — classify one sample email
    run_generation_test(model, tokenizer)

    # Closing summary
    print(rule)
    print(" All done!")
    if not skip_training:
        print(f" Adapter weights saved to: {ADAPTER_DIR}/")
    print()
    print(" Next step: python3 app.py")
    print(rule)
    print()


if __name__ == "__main__":
    main()