# Training pipeline for the spam classifier
# ENGT 375 Project - Spring 2026 - ODU
#
# This script loads two email datasets, engineers features, compares three
# models (Random Forest, Logistic Regression, SVM), builds a VotingClassifier
# ensemble, and saves all artifacts needed by the Gradio app.

import json
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, precision_recall_curve
from scipy.sparse import hstack, csr_matrix
import joblib

# Import our shared utility functions (preprocess_text, compute_metadata_features)
from utils import preprocess_text, compute_metadata_features, META_FEATURE_NAMES

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
project_dir = Path(__file__).parent
data_dir = project_dir / 'data'
models_dir = project_dir / 'models'
random_state = 42
KAGGLE_CAP = 100_000  # max rows to keep from the Kaggle CSV (stratified)


# ---------------------------------------------------------------------------
# Helper: collect accuracy, precision, recall, f1 into a dict
# ---------------------------------------------------------------------------
def get_metrics(y_true, y_pred):
    """Return accuracy plus precision/recall/f1 for the positive class.

    The values are read from ``classification_report(..., output_dict=True)``
    so they are computed once, and each is rounded to 4 decimals. The
    positive class is spam, encoded as label 1, which the report stores
    under the string key ``'1'``.

    Args:
        y_true: Ground-truth labels (0 = ham, 1 = spam).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``.

    Raises:
        KeyError: If the report has no ``'1'`` entry (label 1 appears in
            neither ``y_true`` nor ``y_pred``).
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    spam_row = report['1']
    return {
        'accuracy': round(report['accuracy'], 4),
        'precision': round(spam_row['precision'], 4),
        'recall': round(spam_row['recall'], 4),
        'f1': round(spam_row['f1-score'], 4),
    }


# ===================================================================
# 1. DATA LOADING
# ===================================================================
print('=' * 60)
print('STEP 1 — Loading datasets')
print('=' * 60)

# --- 1a. Kaggle spam CSV ---------------------------------------------------
kaggle_csv = data_dir / 'spam_Emails_data.csv'
if kaggle_csv.exists():
    print('Loading Kaggle CSV ...')
    df_kaggle = pd.read_csv(kaggle_csv)
    # Normalize labels to lowercase ('ham' / 'spam')
    df_kaggle['label'] = df_kaggle['label'].str.strip().str.lower()
    # Keep only rows where label is ham or spam (drop anything weird)
    df_kaggle = df_kaggle[df_kaggle['label'].isin(['ham', 'spam'])]
    df_kaggle = df_kaggle[['text', 'label']].dropna()
    print(' Kaggle total: %d emails' % len(df_kaggle))

    # Stratified sample so training stays fast (keeps class balance).
    # Each label group is sampled explicitly and the pieces are concatenated.
    # This avoids groupby(...).apply(), which is deprecated for functions
    # that need the grouping column and would otherwise drop 'label' from
    # the result on pandas versions that exclude grouping columns.
    if len(df_kaggle) > KAGGLE_CAP:
        kaggle_total = len(df_kaggle)
        capped_parts = [
            group.sample(
                # Proportional share of the cap; int() truncates, so the
                # total may land slightly below KAGGLE_CAP.
                n=int(KAGGLE_CAP * len(group) / kaggle_total),
                random_state=random_state,
            )
            for _, group in df_kaggle.groupby('label')
        ]
        df_kaggle = pd.concat(capped_parts)
        print(' After stratified cap: %d emails' % len(df_kaggle))
else:
    print('WARNING: Kaggle CSV not found at %s' % kaggle_csv)
    df_kaggle = pd.DataFrame(columns=['text', 'label'])

# --- 1b.
# GitHub email-dataset (txt files) ----------------------------------
github_base = data_dir / 'email-dataset-main' / 'email-dataset-main' / 'dataset'
github_rows = []
github_skipped = 0  # files that could not be read from disk
if github_base.exists():
    print('Loading GitHub email-dataset ...')
    # Folder 1 = ham, folder 2 = spam
    for folder_name, label in [('1', 'ham'), ('2', 'spam')]:
        folder = github_base / folder_name
        if not folder.exists():
            continue
        # sorted(): iterdir() yields entries in filesystem-dependent order.
        # Row order feeds the seeded train/test split, so a stable order is
        # needed for runs to be reproducible across machines.
        for fpath in sorted(folder.iterdir()):
            if not fpath.is_file():
                continue
            try:
                # errors='replace' means bad bytes never raise; only
                # OS-level failures (permissions, I/O errors) can.
                content = fpath.read_text(encoding='utf-8', errors='replace')
            except OSError:
                github_skipped += 1  # skip unreadable files, but count them
                continue
            if content.strip():
                github_rows.append({'text': content, 'label': label})
    print(' GitHub dataset: %d emails' % len(github_rows))
    if github_skipped:
        print(' Skipped %d unreadable files' % github_skipped)
else:
    print('WARNING: GitHub dataset not found at %s' % github_base)

# --- 1c. Combine and deduplicate -------------------------------------------
df = pd.concat(
    [df_kaggle, pd.DataFrame(github_rows)],
    ignore_index=True,
)
print('Combined: %d emails' % len(df))

before_dedup = len(df)
# Exact-text duplicates are dropped (first occurrence kept).
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
print('After dedup: %d emails (removed %d duplicates)' % (len(df), before_dedup - len(df)))

ham_count = (df['label'] == 'ham').sum()
spam_count = (df['label'] == 'spam').sum()
print(' Ham: %d | Spam: %d' % (ham_count, spam_count))

if len(df) == 0:
    raise RuntimeError('No training data found! Check the data/ folder.')

# ===================================================================
# 2. FEATURE ENGINEERING
# ===================================================================
print()
print('=' * 60)
print('STEP 2 — Feature engineering')
print('=' * 60)

# 2a. Preprocess (stem, remove stopwords, etc.)
print('Preprocessing text ...')
df['clean_text'] = df['text'].apply(preprocess_text)

# 2b.
# TF-IDF on the cleaned text
#
# Leakage guard: the train/test row assignment is decided BEFORE any
# transformer is fitted. TF-IDF (vocabulary, IDF weights, min_df/max_df
# filtering) and the MinMaxScaler (per-feature min/max) are fitted on the
# training rows only and then applied to every row. Fitting them on all rows
# would let test-set statistics shape the features and inflate the metrics.

# 2e. Encode labels: 1 = spam, 0 = ham (needed up front to stratify the split)
y = (df['label'] == 'spam').astype(int)

# Positional row indices for a stratified 70/30 split.
train_rows, test_rows = train_test_split(
    np.arange(len(df)),
    test_size=0.30,
    random_state=random_state,
    stratify=y,
)

print('Fitting TF-IDF (max 3000 features, 1-3 grams) ...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True,
)
tfidf.fit(df['clean_text'].iloc[train_rows])   # learn from training rows only
X_tfidf = tfidf.transform(df['clean_text'])    # then vectorize every row
print(' TF-IDF shape: %s' % str(X_tfidf.shape))

# 2c. 24 metadata features (things like exclamation density, caps ratio, ...)
print('Computing 24 metadata features ...')
X_meta_raw = compute_metadata_features(df['text'].values)

# Scale metadata to 0-1 so they match the TF-IDF scale. The min/max come from
# the training rows, so test rows can fall slightly outside [0, 1].
meta_scaler = MinMaxScaler()
# compute_metadata_features is defined in utils; its return type is not
# visible here, so both a DataFrame and an array-like are supported.
if hasattr(X_meta_raw, 'iloc'):
    meta_scaler.fit(X_meta_raw.iloc[train_rows])
else:
    meta_scaler.fit(np.asarray(X_meta_raw)[train_rows])
X_meta_scaled = meta_scaler.transform(X_meta_raw)

# 2d. Combine TF-IDF (sparse) + metadata (dense -> sparse) into one matrix.
# format='csr' makes the result row-indexable for the split below.
X_combined = hstack([X_tfidf, csr_matrix(X_meta_scaled)], format='csr')

feature_names = list(tfidf.get_feature_names_out()) + META_FEATURE_NAMES
print('Total features: %d (%d TF-IDF + %d metadata)' % (
    len(feature_names), X_tfidf.shape[1], len(META_FEATURE_NAMES)
))

# ===================================================================
# 3. MODEL COMPARISON
# ===================================================================
print()
print('=' * 60)
print('STEP 3 — Train / test split + model comparison')
print('=' * 60)

# Materialize the split using the row indices chosen before fitting.
X_train = X_combined[train_rows]
X_test = X_combined[test_rows]
y_train = y.iloc[train_rows]
y_test = y.iloc[test_rows]
print('Train: %d | Test: %d' % (X_train.shape[0], X_test.shape[0]))

# We'll store metrics for each model here
all_metrics = {}

# --- Random Forest ----------------------------------------------------------
print('\nTraining Random Forest ...')
rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    class_weight='balanced',
    random_state=random_state,
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred, target_names=['Ham', 'Spam']))
all_metrics['RandomForest'] = get_metrics(y_test, rf_pred)

# --- Logistic Regression ----------------------------------------------------
print('Training Logistic Regression ...')
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=random_state,
)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print(classification_report(y_test, lr_pred, target_names=['Ham', 'Spam']))
all_metrics['LogisticRegression'] = get_metrics(y_test, lr_pred)

# --- SVM (linear kernel) ----------------------------------------------------
# We use LinearSVC + CalibratedClassifierCV instead of SVC(probability=True)
# because SVC is O(n^2) and would take hours on 100K emails.
# CalibratedClassifierCV wraps LinearSVC to add predict_proba() support,
# which is needed for the soft-voting ensemble.
print('Training SVM (LinearSVC + calibration for probabilities) ...')
base_svm = LinearSVC(
    class_weight='balanced',
    max_iter=2000,
    random_state=random_state,
)
# fit() returns the estimator itself, so construction and fitting can chain.
svm = CalibratedClassifierCV(base_svm, cv=5).fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(classification_report(y_test, svm_pred, target_names=['Ham', 'Spam']))
all_metrics['SVM'] = get_metrics(y_test, svm_pred)

# ===================================================================
# 4. ENSEMBLE (VotingClassifier)
# ===================================================================
for banner_line in ('', '=' * 60, 'STEP 4 — Voting ensemble (soft voting)', '=' * 60):
    print(banner_line)

# The ensemble gets brand-new, unfitted copies of the three models (the ones
# trained for the comparison are not reused); VotingClassifier fits them itself.
shared_kwargs = {'class_weight': 'balanced', 'random_state': random_state}
rf_member = RandomForestClassifier(n_estimators=200, n_jobs=-1, **shared_kwargs)
lr_member = LogisticRegression(max_iter=1000, **shared_kwargs)
svm_member = CalibratedClassifierCV(
    LinearSVC(max_iter=2000, **shared_kwargs),
    cv=5,
)
ensemble = VotingClassifier(
    estimators=[('rf', rf_member), ('lr', lr_member), ('svm', svm_member)],
    voting='soft',  # average predicted probabilities across the members
    n_jobs=-1,
)

print('Fitting ensemble (all three models from scratch) ...')
ensemble.fit(X_train, y_train)
ens_pred = ensemble.predict(X_test)
print(classification_report(y_test, ens_pred, target_names=['Ham', 'Spam']))
all_metrics['VotingEnsemble'] = get_metrics(y_test, ens_pred)

# --- Optimal threshold via precision-recall curve ---
# We want to find the threshold where ham precision >= 99%.
# That means: when the model says "ham", it's right at least 99% of the time.
def _largest_ham_threshold(y_true, spam_proba, candidates, target=0.99, fallback=0.50):
    """Return the largest candidate threshold whose ham precision meets target.

    An email is classified as ham when ``P(spam) < t``. Ham precision at
    ``t`` is the fraction of those predicted-ham emails whose true label
    is 0. All candidates are scored at once (sort + cumulative sum +
    ``searchsorted``) instead of re-masking the whole array per candidate.

    Args:
        y_true: True labels (0 = ham, 1 = spam).
        spam_proba: Predicted P(spam), aligned with ``y_true``.
        candidates: Candidate thresholds to evaluate.
        target: Minimum required ham precision.
        fallback: Value returned when no candidate meets ``target``.

    Returns:
        The chosen threshold as a plain Python float, or ``fallback``.
    """
    y_true = np.asarray(y_true)
    spam_proba = np.asarray(spam_proba, dtype=float)
    candidates = np.asarray(candidates, dtype=float)
    if candidates.size == 0 or spam_proba.size == 0:
        return fallback

    order = np.argsort(spam_proba)
    sorted_proba = spam_proba[order]
    # ham_running[k] = number of true-ham emails among the k lowest scores.
    ham_running = np.concatenate(([0], np.cumsum(y_true[order] == 0)))

    # side='left' counts scores strictly below each candidate, i.e. how many
    # emails would be predicted ham at that threshold.
    n_pred_ham = np.searchsorted(sorted_proba, candidates, side='left')
    n_true_ham = ham_running[n_pred_ham]

    has_preds = n_pred_ham > 0  # thresholds predicting no ham are skipped
    ham_precision = np.zeros(candidates.shape, dtype=float)
    np.divide(n_true_ham, n_pred_ham, out=ham_precision, where=has_preds)

    meets_target = has_preds & (ham_precision >= target)
    if not meets_target.any():
        return fallback
    return float(candidates[meets_target].max())


print('Computing optimal threshold (target: 99% ham precision) ...')
y_test_proba = ensemble.predict_proba(X_test)[:, 1]  # P(spam)
precision_arr, recall_arr, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

# NOTE(review): the threshold is picked on the same test split that the
# reported metrics use; a separate validation split would be cleaner.
optimal_threshold = _largest_ham_threshold(
    y_test, y_test_proba, thresholds_pr, target=0.99, fallback=0.50,
)
print('Optimal threshold: %.4f' % optimal_threshold)

# ===================================================================
# 5. SAVE ARTIFACTS
# ===================================================================
print()
print('=' * 60)
print('STEP 5 — Saving artifacts to models/')
print('=' * 60)

models_dir.mkdir(parents=True, exist_ok=True)

# 5a. Voting model
joblib.dump(ensemble, models_dir / 'voting_model.joblib')
print(' Saved voting_model.joblib')

# 5b. TF-IDF vectorizer
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
print(' Saved tfidf_vectorizer.joblib')

# 5c. Metadata scaler
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')
print(' Saved meta_scaler.joblib')

# 5d. Feature names list (TF-IDF names + META_FEATURE_NAMES)
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
print(' Saved feature_names.joblib (%d names)' % len(feature_names))

# 5e. Optimal threshold
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
print(' Saved optimal_threshold.joblib (%.4f)' % optimal_threshold)

# 5f.
# Training sample (200 dense rows for LIME / SHAP explanations)
rng = np.random.RandomState(random_state)
n_train_rows = X_train.shape[0]
sample_size = min(200, n_train_rows)  # fewer than 200 rows -> take them all
sample_idx = rng.choice(n_train_rows, size=sample_size, replace=False)
sampled_sparse = X_train[sample_idx]
training_sample = sampled_sparse.toarray()  # convert sparse -> dense
joblib.dump(training_sample, models_dir / 'training_sample.joblib')
print(' Saved training_sample.joblib shape=%s' % str(training_sample.shape))

# 5g. Training report (JSON with metrics for every model)
report = {
    'random_state': random_state,
    'train_size': int(n_train_rows),
    'test_size': int(X_test.shape[0]),
    'total_features': len(feature_names),
    'optimal_threshold': round(optimal_threshold, 4),
    'models': all_metrics,
}
report_path = models_dir / 'training_report.json'
# Serialize first, then write the finished text in one call.
report_json = json.dumps(report, indent=2)
report_path.write_text(report_json)
print(' Saved training_report.json')

for closing_line in ('', '=' * 60, 'DONE — all 7 artifacts saved to models/', '=' * 60):
    print(closing_line)