# Batch retrain script for the spam classifier
# Reads feedback corrections from feedback/feedback_log.csv,
# merges them into the original training data, and retrains
# the VotingClassifier ensemble.
#
# Usage:
#   python3 retrain.py                 # retrain with feedback
#   python3 retrain.py --no-feedback   # retrain with original data only

import sys
import csv
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, precision_recall_curve
from scipy.sparse import hstack, csr_matrix
import joblib

from utils import preprocess_text, compute_metadata_features, META_FEATURE_NAMES

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

project_dir = Path(__file__).parent
data_dir = project_dir / 'data'
models_dir = project_dir / 'models'
feedback_csv = project_dir / 'feedback' / 'feedback_log.csv'
random_state = 42
KAGGLE_CAP = 100_000


def load_feedback_corrections() -> pd.DataFrame:
    """Read the feedback CSV and return a DataFrame of corrections.

    A row is used only when its ``feedback`` column equals ``'wrong'`` and
    both ``correct_label`` and ``email_text`` are non-empty. A corrected
    label of ``spam`` (case-insensitive) becomes 1; any other value
    becomes 0.

    Returns:
        DataFrame with columns ``text`` and ``label``. It is empty (but
        still has both columns) when the file is missing or contains no
        usable corrections.
    """
    if not feedback_csv.exists():
        print("No feedback file found.")
        return pd.DataFrame(columns=['text', 'label'])

    corrections = []
    # newline='' lets the csv module handle line breaks embedded in quoted
    # fields, which multi-line email bodies rely on.
    with open(feedback_csv, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            if row.get('feedback') != 'wrong':
                continue
            correct_label = (row.get('correct_label') or '').strip().lower()
            text = row.get('email_text')
            if not correct_label or not text:
                # Nothing to learn from a correction without label or text.
                continue
            corrections.append({
                'text': text,
                'label': 1 if correct_label == 'spam' else 0,
            })

    # Passing columns= keeps the schema stable even with zero corrections.
    df = pd.DataFrame(corrections, columns=['text', 'label'])
    print(f"Found {len(df)} corrections in feedback log.")
    return df


def _encode_labels(labels):
    """Map 'spam'/'ham' strings (any case) to 1/0; other values become NaN."""
    normalized = labels.astype(str).str.strip().str.lower()
    return normalized.map({'spam': 1, 'ham': 0})


def _load_kaggle_frame():
    """Load data/spam_Emails_data.csv as a text/label frame, or None if absent."""
    kaggle_path = data_dir / 'spam_Emails_data.csv'
    if not kaggle_path.exists():
        return None

    raw = pd.read_csv(kaggle_path)
    columns = set(raw.columns)
    if {'label', 'text'} <= columns:
        label_col, text_col = 'label', 'text'
    elif {'v1', 'v2'} <= columns:
        label_col, text_col = 'v1', 'v2'
    else:
        raise ValueError(
            f"{kaggle_path} must contain either 'label'/'text' or 'v1'/'v2' "
            f"columns; found {sorted(map(str, raw.columns))}"
        )

    kaggle_df = pd.DataFrame({
        'text': raw[text_col],
        'label': _encode_labels(raw[label_col]),
    }).dropna(subset=['text', 'label'])
    kaggle_df['label'] = kaggle_df['label'].astype(int)

    if len(kaggle_df) > KAGGLE_CAP:
        # Cap each class at half the budget. Sampling each group explicitly
        # avoids relying on groupby.apply keeping the grouping column.
        per_class = KAGGLE_CAP // 2
        kaggle_df = pd.concat([
            group.sample(min(len(group), per_class), random_state=random_state)
            for _, group in kaggle_df.groupby('label')
        ])

    print(f"  Kaggle: {len(kaggle_df)} emails")
    return kaggle_df[['text', 'label']]


def _load_github_frame():
    """Load emails from the spam/ and ham/ folders, or None if none are found."""
    github_dir = data_dir / 'email-dataset-main' / 'email-dataset-main'
    if not github_dir.exists():
        return None

    rows = []
    # Sorted iteration keeps the row order (and hence the seeded split)
    # independent of filesystem listing order.
    for label_dir in sorted(github_dir.iterdir()):
        if not (label_dir.is_dir() and label_dir.name in ('spam', 'ham')):
            continue
        lbl = 1 if label_dir.name == 'spam' else 0
        for path in sorted(label_dir.iterdir()):
            if not path.is_file():
                continue
            try:
                text = path.read_text(encoding='utf-8', errors='ignore')
            except OSError as exc:
                # Best effort: one unreadable file should not abort the run,
                # but it should not disappear silently either.
                print(f"  WARNING: skipping unreadable file {path}: {exc}")
                continue
            rows.append({'text': text, 'label': lbl})

    if not rows:
        return None
    print(f"  GitHub: {len(rows)} emails")
    return pd.DataFrame(rows, columns=['text', 'label'])


def main() -> None:
    """Retrain the ensemble and write all model artifacts to ``models/``.

    Steps: load the original datasets, optionally merge feedback
    corrections (skipped when ``--no-feedback`` is on the command line),
    preprocess, fit TF-IDF and the metadata scaler, train the soft-voting
    ensemble, pick the F1-optimal decision threshold on the held-out split,
    and dump the fitted objects with joblib.

    Exits with status 1 when no training data is available.
    """
    use_feedback = '--no-feedback' not in sys.argv

    print("Loading training data...")
    frames = [
        frame for frame in (_load_kaggle_frame(), _load_github_frame())
        if frame is not None
    ]
    if not frames:
        print("ERROR: No training data found in data/ directory.")
        sys.exit(1)

    df = pd.concat(frames, ignore_index=True)
    print(f"  Total original: {len(df)} emails")

    n_corrections = 0
    if use_feedback:
        feedback_df = load_feedback_corrections()
        n_corrections = len(feedback_df)
        if n_corrections > 0:
            df = pd.concat([df, feedback_df], ignore_index=True)
            print(f"  After feedback merge: {len(df)} emails")

    print("Preprocessing...")
    df['clean'] = df['text'].apply(preprocess_text)
    df = df[df['clean'].str.len() > 0]
    if df.empty:
        print("ERROR: No usable training emails after preprocessing.")
        sys.exit(1)

    # Cast once so the class labels are ints regardless of which sources
    # contributed rows.
    y = df['label'].astype(int).values

    # Split cleaned and raw text in the same call so every row of the
    # metadata matrix describes the same email as the matching TF-IDF row.
    (X_train_text, X_test_text,
     train_orig, test_orig,
     y_train, y_test) = train_test_split(
        df['clean'].values, df['text'].values, y,
        test_size=0.2, random_state=random_state, stratify=y,
    )

    print("Fitting TF-IDF...")
    tfidf = TfidfVectorizer(
        max_features=3000, ngram_range=(1, 3), min_df=2, max_df=0.95)
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_test_tfidf = tfidf.transform(X_test_text)

    print("Computing metadata features...")
    # Metadata features are computed from the raw (uncleaned) text.
    X_train_meta = compute_metadata_features(train_orig.tolist())
    X_test_meta = compute_metadata_features(test_orig.tolist())

    scaler = MinMaxScaler()
    X_train_meta_scaled = scaler.fit_transform(X_train_meta)
    X_test_meta_scaled = scaler.transform(X_test_meta)

    # CSR is requested explicitly because X_train is row-indexed below.
    X_train = hstack(
        [X_train_tfidf, csr_matrix(X_train_meta_scaled)], format='csr')
    X_test = hstack(
        [X_test_tfidf, csr_matrix(X_test_meta_scaled)], format='csr')

    feature_names_list = (
        tfidf.get_feature_names_out().tolist() + META_FEATURE_NAMES)

    print("Training VotingClassifier ensemble...")
    ensemble = VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(
                n_estimators=200, n_jobs=-1,
                class_weight='balanced', random_state=random_state)),
            ('lr', LogisticRegression(
                max_iter=1000, class_weight='balanced',
                random_state=random_state)),
            ('svm', CalibratedClassifierCV(
                LinearSVC(class_weight='balanced', max_iter=2000,
                          random_state=random_state))),
        ],
        voting='soft',
    )
    ensemble.fit(X_train, y_train)

    y_pred = ensemble.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

    y_scores = ensemble.predict_proba(X_test)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    # precision/recall carry one trailing point with no matching threshold;
    # drop it so the arg-max always indexes a real threshold.
    best_idx = int(np.argmax(f1_scores[:-1]))
    optimal_threshold = float(thresholds[best_idx])
    print(f"Optimal threshold: {optimal_threshold:.4f}")

    # Small dense sample of the training matrix, saved with the models.
    sample_size = min(200, X_train.shape[0])
    sample_idx = np.random.RandomState(random_state).choice(
        X_train.shape[0], sample_size, replace=False)
    training_sample = X_train[sample_idx].toarray()

    models_dir.mkdir(parents=True, exist_ok=True)
    artifacts = {
        'voting_model.joblib': ensemble,
        'tfidf_vectorizer.joblib': tfidf,
        'meta_scaler.joblib': scaler,
        'feature_names.joblib': feature_names_list,
        'optimal_threshold.joblib': optimal_threshold,
        'training_sample.joblib': training_sample,
    }
    for filename, obj in artifacts.items():
        joblib.dump(obj, models_dir / filename)

    print(f"\nAll models saved to {models_dir}/")
    if use_feedback:
        # Reuse the count from the merge step instead of re-reading the CSV.
        print(f"Feedback corrections incorporated: {n_corrections}")


if __name__ == '__main__':
    main()