"""Tier-1 real-time card-fraud scorer (LightGBM) on Sparkov.

Engineers behavioral/contextual features (velocity, geo, time, age) and trains a
gradient-boosted scorer. Evaluated at NATURAL imbalance with PR-AUC / ROC-AUC.
"""
import json
import math
import time

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import (average_precision_score, roc_auc_score,
                             precision_recall_curve, precision_score,
                             recall_score, f1_score, confusion_matrix)

# Sphere radius used by haversine(); 6371.0 is Earth's mean radius in km,
# so distances come out in kilometres.
R = 6371.0


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, scaled by ``R``.

    All four arguments are latitude/longitude in degrees (they are converted
    with ``np.radians``) and may be scalars or equally-shaped NumPy arrays;
    the computation is element-wise.
    """
    p1, p2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dl = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(p1) * np.cos(p2) * np.sin(dl / 2) ** 2
    # Clip guards against tiny floating-point excursions outside [0, 1]
    # that would make sqrt/arcsin return NaN.
    return 2 * R * np.arcsin(np.sqrt(np.clip(a, 0, 1)))


CAT_COLS = ["category", "gender", "state"]


def _card_velocity(ut, am):
    """Trailing-window activity for ONE card's time-ordered transactions.

    ``ut`` holds the card's unix timestamps in non-decreasing order and ``am``
    the matching amounts. For each transaction ``i`` the function returns the
    number of *earlier* transactions within the preceding 24h and 1h, and the
    sum of the earlier amounts within 24h. The current transaction itself is
    never counted.
    """
    n = len(ut)
    tx24 = np.zeros(n, int)
    amt24 = np.zeros(n)
    tx1 = np.zeros(n, int)
    j24 = j1 = 0  # left edges of the 24h / 1h windows (two-pointer sweep)
    for i in range(n):
        # Advance each left edge until it is inside the window ending at i.
        # The loops stop at j == i at the latest because ut[i] - ut[i] == 0.
        while ut[i] - ut[j24] > 86400:
            j24 += 1
        while ut[i] - ut[j1] > 3600:
            j1 += 1
        tx24[i] = i - j24
        amt24[i] = am[j24:i].sum()
        tx1[i] = i - j1
    return tx24, amt24, tx1


def featurize(df, fit_maps=None, cat_p95=None, cat_rate=None):
    """Build the model features for a frame of raw transactions.

    Parameters
    ----------
    df : DataFrame with at least the columns ``cc_num``, ``unix_time``,
        ``trans_date_trans_time``, ``dob``, ``lat``, ``long``, ``merch_lat``,
        ``merch_long``, ``amt``, ``city_pop`` and the columns in ``CAT_COLS``.
        ``is_fraud`` is additionally required when the category maps are
        fitted (``cat_p95`` is None).
    fit_maps : unused; kept only so existing callers keep working.
    cat_p95 : optional ``{category: 95th-percentile amount}`` map. When None,
        both maps are fitted from ``df`` (training mode) and any ``cat_rate``
        passed in is ignored.
    cat_rate : optional ``{category: fraud rate}`` map; required whenever
        ``cat_p95`` is supplied.

    Returns
    -------
    ``(features_df, cat_p95, cat_rate)``. ``features_df`` is a copy of ``df``
    sorted by ``(cc_num, unix_time)`` with the feature columns added, so row
    order differs from the input.

    Raises
    ------
    ValueError
        If ``cat_p95`` is supplied without ``cat_rate``.
    """
    if cat_p95 is not None and cat_rate is None:
        # Previously this fell through to Series.map(None) and failed with an
        # unrelated-looking error; fail early with an actionable message.
        raise ValueError("cat_rate must be provided when cat_p95 is provided")

    # Per-card chronological order is required by the velocity features below.
    df = df.sort_values(["cc_num", "unix_time"]).copy()

    # time features
    dt = pd.to_datetime(df["trans_date_trans_time"])
    df["hour"] = dt.dt.hour
    df["dow"] = dt.dt.dayofweek
    df["is_night"] = ((df["hour"] >= 22) | (df["hour"] <= 4)).astype(int)

    # age (in years) at transaction time
    df["age"] = (dt - pd.to_datetime(df["dob"])).dt.days / 365.25

    # geo distance home->merchant
    df["geo_km"] = haversine(df["lat"].values, df["long"].values,
                             df["merch_lat"].values, df["merch_long"].values)
    df["log_amt"] = np.log1p(df["amt"])
    df["log_city_pop"] = np.log1p(df["city_pop"])

    # minutes since the card's previous transaction; a card's first
    # transaction has no predecessor and gets the 99999 sentinel
    df["prev_unix"] = df.groupby("cc_num")["unix_time"].shift(1)
    df["mins_since_last"] = (df["unix_time"] - df["prev_unix"]) / 60.0
    df["mins_since_last"] = df["mins_since_last"].fillna(99999).clip(upper=99999)

    # rolling 24h / 1h velocity per card
    n_rows = len(df)
    tx24 = np.zeros(n_rows, int)
    amt24 = np.zeros(n_rows)
    tx1 = np.zeros(n_rows, int)
    unix_vals = df["unix_time"].values
    amt_vals = df["amt"].values
    # `.indices` maps each card to the positional row numbers of its
    # transactions, so results are scattered back by position instead of
    # relying on group iteration order matching row order.
    for pos in df.groupby("cc_num", sort=False).indices.values():
        card_tx24, card_amt24, card_tx1 = _card_velocity(unix_vals[pos], amt_vals[pos])
        tx24[pos] = card_tx24
        amt24[pos] = card_amt24
        tx1[pos] = card_tx1
    df["tx_24h"] = tx24
    df["amt_24h"] = amt24
    df["tx_1h"] = tx1

    # category amount anomaly vs train norms
    if cat_p95 is None:
        cat_p95 = df.groupby("category")["amt"].quantile(0.95).to_dict()
        # NOTE(review): this rate is computed from the labels of the same rows
        # it is later mapped onto (in-sample target encoding).
        cat_rate = df.groupby("category")["is_fraud"].mean().to_dict()
    # Categories missing from the fitted maps fall back to the median p95
    # and a fraud rate of 0.0.
    df["cat_p95"] = df["category"].map(cat_p95).fillna(np.median(list(cat_p95.values())))
    df["amt_over_p95"] = (df["amt"] > df["cat_p95"]).astype(int)
    df["amt_to_p95"] = df["amt"] / (df["cat_p95"] + 1e-6)
    df["cat_fraud_rate"] = df["category"].map(cat_rate).fillna(0.0)

    # categorical encodings
    for c in CAT_COLS:
        df[c] = df[c].astype("category")
    return df, cat_p95, cat_rate


FEATURES = ["log_amt", "amt", "hour", "dow", "is_night", "age", "geo_km", "log_city_pop",
            "mins_since_last", "tx_24h", "amt_24h", "tx_1h", "amt_over_p95", "amt_to_p95",
            "cat_fraud_rate"] + CAT_COLS


def main():
    """Train the scorer, evaluate it on the test split, and save artifacts.

    Writes ``cc_lgbm_model.txt`` (LightGBM model), ``cc_lgbm_preproc.joblib``
    (fitted category maps and feature lists) and ``cc_lgbm_metrics.json``
    (metrics and routing thresholds) to the current working directory.
    """
    t0 = time.time()
    print("Loading Sparkov splits...")
    tr = load_dataset("pointe77/credit-card-transaction", split="train").to_pandas()
    te = load_dataset("pointe77/credit-card-transaction", split="test").to_pandas()
    print(f"train {len(tr):,} (fraud {tr.is_fraud.mean():.4%}) | test {len(te):,} (fraud {te.is_fraud.mean():.4%})")

    # Fit the category maps on train only, then reuse them for test.
    tr, cat_p95, cat_rate = featurize(tr)
    te, _, _ = featurize(te, cat_p95=cat_p95, cat_rate=cat_rate)
    Xtr, ytr = tr[FEATURES], tr["is_fraud"].values
    Xte, yte = te[FEATURES], te["is_fraud"].values

    # negatives / positives, computed over the full training frame
    spw = (ytr == 0).sum() / (ytr == 1).sum()
    print(f"scale_pos_weight = {spw:.1f}")

    # hold out a validation slice from train for early stopping.
    # featurize() returns rows sorted by (cc_num, unix_time), so this tail
    # slice is the last 10% of rows in that ordering, not a random sample.
    n = len(Xtr)
    cut = int(n * 0.9)
    Xt, yt = Xtr.iloc[:cut], ytr[:cut]
    Xv, yv = Xtr.iloc[cut:], ytr[cut:]

    params = dict(objective="binary", metric="average_precision", learning_rate=0.05,
                  num_leaves=64, max_depth=-1, min_child_samples=100,
                  subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
                  reg_lambda=5.0, scale_pos_weight=spw, n_jobs=-1, verbose=-1)
    dtr = lgb.Dataset(Xt, yt, categorical_feature=CAT_COLS)
    dvl = lgb.Dataset(Xv, yv, categorical_feature=CAT_COLS, reference=dtr)
    print("Training card-fraud LightGBM...")
    model = lgb.train(params, dtr, num_boost_round=2000, valid_sets=[dvl],
                      callbacks=[lgb.early_stopping(80), lgb.log_evaluation(100)])

    # evaluate at natural imbalance on official test split
    p = model.predict(Xte, num_iteration=model.best_iteration)
    prauc = average_precision_score(yte, p)
    rocauc = roc_auc_score(yte, p)
    print(f"\n=== CARD TEST (natural imbalance {yte.mean():.4%}) ===")
    print(f"PR-AUC={prauc:.4f} ROC-AUC={rocauc:.4f}")

    # choose routing threshold: high-recall to NOT miss fraud (Tier-1 must catch, LLM filters FPs)
    # NOTE(review): thresholds are selected on the test split itself, so the
    # precision/recall reported at these thresholds is optimistic.
    prec, rec, thr = precision_recall_curve(yte, p)
    results = {}
    for target_recall in [0.80, 0.85, 0.90, 0.95]:
        # thresholds are increasing and recall is non-increasing along the
        # curve, so the LAST index meeting the target is the largest
        # threshold that still achieves >= target recall
        ok = np.where(rec[:-1] >= target_recall)[0]
        if len(ok):
            ti = ok[-1]
            t = float(thr[ti])
            flags = p >= t
            # cast to plain floats so the values are JSON-serialisable
            pr_ = float(precision_score(yte, flags))
            re_ = float(recall_score(yte, flags))
            flagged = float(flags.mean())
            results[f"recall_{target_recall}"] = dict(threshold=t, precision=pr_, recall=re_, flagged_frac=flagged)
            print(f"recall>={target_recall}: thr={t:.4f} P={pr_:.3f} R={re_:.3f} flagged={flagged:.2%}")

    # default routing threshold = target recall 0.90; the key is built with
    # the same format as above so the two cannot drift apart
    route_recall = 0.90
    route_t = results[f"recall_{route_recall}"]["threshold"]
    yhat = (p >= route_t).astype(int)
    print("\nConfusion @ routing threshold (recall~0.90):")
    print(confusion_matrix(yte, yhat))
    print("F1:", f1_score(yte, yhat))

    # feature importance, highest gain first
    imp = dict(sorted(zip(FEATURES, model.feature_importance(importance_type="gain").tolist()),
                      key=lambda x: -x[1]))
    print("\nTop features:", list(imp.items())[:8])

    model.save_model("cc_lgbm_model.txt")
    joblib.dump({"cat_p95": cat_p95, "cat_rate": cat_rate, "features": FEATURES, "cat_cols": CAT_COLS},
                "cc_lgbm_preproc.joblib")
    meta = dict(domain="card_fraud", source="pointe77/credit-card-transaction",
                train_rows=int(len(tr)), test_rows=int(len(te)),
                test_fraud_rate=float(yte.mean()), pr_auc=float(prauc), roc_auc=float(rocauc),
                routing_threshold=float(route_t), thresholds=results,
                top_features=list(imp.items())[:12], n_features=len(FEATURES),
                best_iteration=int(model.best_iteration), scale_pos_weight=float(spw))
    # context manager guarantees the metrics file is flushed and closed
    with open("cc_lgbm_metrics.json", "w") as fh:
        json.dump(meta, fh, indent=2)
    print(f"\nDone in {time.time()-t0:.0f}s. Saved model + preproc + metrics.")


if __name__ == "__main__":
    main()