#!/usr/bin/env python3 """app_inference.py — UI de inferencia de fraude. Descarga modelo + preprocessor desde HF Hub y expone: - Predicción individual con campos del dominio (amt, category, hour, ...). - Predicción por lotes desde CSV con las mismas columnas que el dataset crudo. Uso local: uv run python app_inference/app_inference.py HF Spaces : sube esta carpeta con scripts/08_deploy_hf.py """ from __future__ import annotations import json import os from datetime import datetime, timezone from pathlib import Path import gradio as gr import joblib import numpy as np import pandas as pd from huggingface_hub import hf_hub_download # --- Modelo / preprocessor desde HF Hub --- MODEL_REPO = os.environ.get("HF_MODEL_REPO", "gusdelact/credit-card-fraud-bagging-boosting") THRESHOLD_DEFAULT = float(os.environ.get("FRAUD_THRESHOLD", "0.5")) def _safe_download(repo_id: str, filename: str): try: return hf_hub_download(repo_id=repo_id, filename=filename) except Exception as exc: # noqa: BLE001 print(f"⚠️ No se pudo descargar {filename} de {repo_id}: {exc}") return None print(f"Descargando artefactos desde HF Hub: {MODEL_REPO}") model_path = _safe_download(MODEL_REPO, "model.joblib") preprocessor_path = _safe_download(MODEL_REPO, "preprocessor.joblib") metrics_path = _safe_download(MODEL_REPO, "metrics.json") # Fallback a artefactos locales (útil al testear antes de publicar) if model_path is None: local_model = Path(__file__).resolve().parents[1] / "models" / "best_model.joblib" if local_model.exists(): model_path = str(local_model) print(f"Usando modelo local: {model_path}") else: raise RuntimeError(f"No hay modelo en HF Hub ({MODEL_REPO}) ni local.") if preprocessor_path is None: local_pre = Path(__file__).resolve().parents[1] / "models" / "preprocessor.joblib" if local_pre.exists(): preprocessor_path = str(local_pre) model = joblib.load(model_path) preprocessor = joblib.load(preprocessor_path) # Threshold calibrado: si publicamos metrics.json úsalo threshold = THRESHOLD_DEFAULT if metrics_path is not None: try: with open(metrics_path) as f: mj = json.load(f) winner = mj.get("winner") if winner and winner in mj: threshold = float(mj[winner]["metrics_calibrated_threshold"]["threshold"]) print(f"Threshold calibrado cargado: {threshold:.4f} ({winner})") except Exception as exc: # noqa: BLE001 print(f"⚠️ No se pudo leer metrics.json: {exc}") # --- Reproducir el feature engineering del script 03 --- def haversine_km(lat1, lon1, lat2, lon2): R = 6371.0088 lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) dlat = lat2 - lat1 dlon = lon2 - lon1 a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2 return 2 * R * np.arcsin(np.sqrt(a)) HIGH_CARD_COLS = ["merchant", "city", "job", "state"] LOW_CARD_COLS = ["category", "gender"] RAW_INPUT_COLS = [ "trans_date_trans_time", "merchant", "category", "amt", "gender", "city", "state", "lat", "long", "city_pop", "job", "dob", "merch_lat", "merch_long", ] def engineer(df_raw: pd.DataFrame) -> pd.DataFrame: out = df_raw.copy() out["trans_date_trans_time"] = pd.to_datetime(out["trans_date_trans_time"], errors="coerce") out["dob"] = pd.to_datetime(out["dob"], errors="coerce") out["hour"] = out["trans_date_trans_time"].dt.hour.astype("int16") out["dayofweek"] = out["trans_date_trans_time"].dt.dayofweek.astype("int8") out["month"] = out["trans_date_trans_time"].dt.month.astype("int8") age_days = (out["trans_date_trans_time"] - out["dob"]).dt.days out["age"] = (age_days / 365.25).clip(lower=0, upper=110).astype("float32") out["distance_km"] = haversine_km(out["lat"], out["long"], out["merch_lat"], out["merch_long"]).astype("float32") out["amt_log1p"] = np.log1p(out["amt"].astype("float64")).astype("float32") out = out.drop(columns=["trans_date_trans_time", "dob", "merch_lat", "merch_long"], errors="ignore") for c in HIGH_CARD_COLS + LOW_CARD_COLS: if c in out.columns: out[c] = out[c].astype(str) return out CATEGORIES = [ "entertainment", "food_dining", "gas_transport", "grocery_net", "grocery_pos", "health_fitness", "home", "kids_pets", "misc_net", "misc_pos", "personal_care", "shopping_net", "shopping_pos", "travel", ] def predict_single( trans_datetime, amt, category, gender, merchant, city, state, job, lat, long_, city_pop, dob, merch_lat, merch_long, ): try: raw = pd.DataFrame([{ "trans_date_trans_time": trans_datetime, "merchant": merchant or "fraud_unknown", "category": category, "amt": float(amt), "gender": gender, "city": city or "Unknown", "state": state or "NY", "lat": float(lat), "long": float(long_), "city_pop": int(city_pop), "job": job or "Unknown", "dob": dob, "merch_lat": float(merch_lat), "merch_long": float(merch_long), }]) eng = engineer(raw) X = preprocessor.transform(eng) proba = float(model.predict_proba(X)[0, 1]) pred = int(proba >= threshold) return ( {"FRAUD (1)": proba, "LEGIT (0)": 1.0 - proba}, f"### Predicción: {'🚨 FRAUDE' if pred else '✅ LEGÍTIMA'}\n" f"- P(fraud) = **{proba:.4f}**\n" f"- Umbral aplicado = `{threshold:.4f}`", ) except Exception as exc: # noqa: BLE001 return {}, f"❌ Error: {exc}" def predict_batch(file): if file is None: return "Sube un CSV", None df = pd.read_csv(file.name) missing = [c for c in RAW_INPUT_COLS if c not in df.columns] if missing: return f"❌ Faltan columnas: {missing}", None eng = engineer(df[RAW_INPUT_COLS]) X = preprocessor.transform(eng) proba = model.predict_proba(X)[:, 1] pred = (proba >= threshold).astype(int) out = df.copy() out["fraud_proba"] = proba out["fraud_prediction"] = pred return f"✅ {len(out)} filas evaluadas. Umbral={threshold:.4f}", out.head(50) # --- UI --- with gr.Blocks(title="Credit Card Fraud Detector", theme=gr.themes.Soft()) as demo: gr.Markdown("# 💳 Credit Card Fraud Detector") gr.Markdown( f"Modelo: `{MODEL_REPO}` · Umbral: `{threshold:.4f}`\n\n" "Detección binaria de fraude en transacciones. Pipeline ensemble (Random Forest + XGBoost) " "entrenado sobre [`alenc123/credit-card-fraud`](https://huggingface.co/datasets/alenc123/credit-card-fraud)." ) with gr.Tab("Predicción individual"): with gr.Row(): with gr.Column(): gr.Markdown("**Transacción**") trans_dt = gr.Textbox( label="Fecha y hora de la transacción (YYYY-MM-DD HH:MM:SS)", value=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), ) amt = gr.Number(label="Monto (USD)", value=120.0) category = gr.Dropdown(label="Categoría", choices=CATEGORIES, value="shopping_net") merchant = gr.Textbox(label="Merchant", value="fraud_Heller, Gutmann and Zieme") gr.Markdown("**Comercio (geolocalización)**") merch_lat = gr.Number(label="Merchant latitude", value=40.7128) merch_long = gr.Number(label="Merchant longitude", value=-74.0060) with gr.Column(): gr.Markdown("**Titular de la tarjeta**") gender = gr.Dropdown(label="Gender", choices=["M", "F"], value="M") dob = gr.Textbox(label="Fecha de nacimiento (YYYY-MM-DD)", value="1985-05-15") job = gr.Textbox(label="Ocupación", value="Engineer") city = gr.Textbox(label="Ciudad", value="New York") state = gr.Textbox(label="Estado (2 letras)", value="NY") city_pop = gr.Number(label="Población de la ciudad", value=8000000) lat = gr.Number(label="Latitude del titular", value=40.7128) long_ = gr.Number(label="Longitude del titular", value=-74.0060) predict_btn = gr.Button("Predecir", variant="primary") proba_out = gr.Label(num_top_classes=2, label="Probabilidades") verdict_md = gr.Markdown() predict_btn.click( predict_single, inputs=[trans_dt, amt, category, gender, merchant, city, state, job, lat, long_, city_pop, dob, merch_lat, merch_long], outputs=[proba_out, verdict_md], ) with gr.Tab("Batch (CSV)"): gr.Markdown( "Sube un CSV con las mismas columnas crudas del dataset original " f"(`{', '.join(RAW_INPUT_COLS)}`). Devuelve `fraud_proba` y `fraud_prediction`." ) f = gr.File(label="CSV", file_types=[".csv"]) b_btn = gr.Button("Procesar batch", variant="primary") b_status = gr.Textbox(label="Estado") b_table = gr.DataFrame(label="Primeras 50 filas") b_btn.click(predict_batch, inputs=f, outputs=[b_status, b_table]) with gr.Tab("Información del modelo"): gr.Markdown( f""" **Modelo**: `{type(model).__name__}` **Repositorio HF Hub**: [`{MODEL_REPO}`](https://huggingface.co/{MODEL_REPO}) **Umbral activo**: `{threshold:.4f}` (calibrado por F1) **Pipeline**: 1. Feature engineering (`age`, `distance_km`, `hour`, `dayofweek`, `month`, `amt_log1p`). 2. Frequency encoding para `merchant`, `city`, `job`, `state`. 3. One-Hot encoding para `category`, `gender`. 4. Modelo ganador entre Random Forest y XGBoost (selección por F1 calibrado). """ ) port = 7860 if os.environ.get("SPACE_ID") else 7861 demo.launch(server_name="0.0.0.0", server_port=port, ssr_mode=False)