AMP-Classifier2

Sleeping

App Files Community

nonzeroexit committed on May 21

Commit

1dcb272

verified ·

1 Parent(s): 4222f98

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -54

app.py CHANGED Viewed

@@ -1,12 +1,9 @@
 import os
 # --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
-# TF and torch each bundle their own OpenMP/MKL; loaded together they can collide
-# and crash at the C level. These settings make them coexist and reduce memory.
 os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
-# Quiet TensorFlow logs (must be set before importing tensorflow)
 os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
 os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
@@ -15,6 +12,7 @@ import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 import sys
 import json
 import subprocess
@@ -39,9 +37,7 @@ def get_amp_model():
 # ---------------------------------------------------------------------------
-# The EXACT 343 features the scaler was fit on, IN THE EXACT TRAINING ORDER.
-# The scaler was fit on a numpy array (no stored names), so order is critical:
-# we must select these columns in this order BEFORE calling scaler.transform().
 # ---------------------------------------------------------------------------
 selected_features = [
     "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
@@ -137,18 +133,41 @@ selected_features = [
 ]
 assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
 def keras_predict_proba(X):
-    """Return probabilities as [P(Non-AMP), P(AMP)] for LIME (X already scaled)."""
     amp_model, _ = get_amp_model()
     preds = amp_model.predict(X, verbose=0)
     if preds.ndim == 1 or preds.shape[1] == 1:
-        preds = preds.reshape(-1, 1)
-        return np.hstack([1 - preds, preds])  # sigmoid output assumed = P(AMP)
     return preds
 def extract_features(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
@@ -156,64 +175,65 @@ def extract_features(sequence):
     try:
         _, amp_scaler = get_amp_model()
-        # --- Replicate the EXACT same feature pool the scaler was fit on ---
-        # 1. Dipeptide + AAC (filtered to first 420 keys, same as old script)
-        dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
-        filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
-        # 2. CTD features
         ctd_features = CTD.CalculateCTD(sequence)
         # 3. Autocorrelation features
         auto_features = Autocorrelation.CalculateAutoTotal(sequence)
         # 4. Pseudo-AAC features
         pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
-        # Merge in the same order as the old script
         all_features_dict = {}
         all_features_dict.update(ctd_features)
         all_features_dict.update(filtered_dipeptide)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
-        # Build full-pool DataFrame (should be 1325 columns)
         feature_df_all = pd.DataFrame([all_features_dict])
-        # Scale the FULL pool (scaler expects 1325 features in training order)
         scaled_array = amp_scaler.transform(feature_df_all.values)
         scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
-        # Verify all 343 selected features are present
         missing = [f for f in selected_features if f not in scaled_df.columns]
         if missing:
             return f"Error: Missing features after scaling: {missing[:5]}..."
-        # Select the 343 features IN TRAINING ORDER from the scaled DataFrame
         selected_df = scaled_df[selected_features].fillna(0)
         return selected_df.values.astype(np.float32)
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
 def predictmic(sequence):
     """Run MIC prediction in a SEPARATE process (mic_worker.py).
-    This isolates PyTorch/ProtBert from TensorFlow, preventing the native-library
-    crash (exit 139) and keeping peak memory low. The worker prints a JSON dict on
-    its last stdout line; we parse and return it.
     """
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
     try:
-        # First run downloads ProtBert (~1.6GB), so allow a generous timeout.
         proc = subprocess.run(
             [sys.executable, "mic_worker.py", sequence],
             capture_output=True,
             text=True,
-            timeout=900  # 15 minutes; mostly for the one-time model download
         )
     except subprocess.TimeoutExpired:
         return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
@@ -221,11 +241,9 @@ def predictmic(sequence):
         return {"Error": f"Failed to start MIC worker: {str(e)}"}
     if proc.returncode != 0:
-        # Worker crashed; surface stderr tail for debugging
         tail = (proc.stderr or "").strip().splitlines()[-3:]
         return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
-    # Parse the last non-empty stdout line as JSON
     out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
     if not out_lines:
         return {"Error": "MIC worker produced no output."}
@@ -240,33 +258,35 @@ def full_prediction(sequence):
     print("[CHECKPOINT] full_prediction called", flush=True)
     features = extract_features(sequence)
     if isinstance(features, str):
-        print("[CHECKPOINT] extract_features returned error:", features, flush=True)
         return features
     print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
     amp_model, _ = get_amp_model()
-    print("[CHECKPOINT] AMP model loaded, running predict...", flush=True)
     raw_pred = amp_model.predict(features, verbose=0)
-    print("[CHECKPOINT] AMP predict done:", raw_pred, flush=True)
-    if raw_pred.ndim == 1 or raw_pred.shape[1] == 1:
-        prob_amp = float(raw_pred.flatten()[0])  # sigmoid output assumed = P(AMP)
-        if prob_amp >= 0.5:
-            prediction = 1
-            confidence = round(prob_amp * 100, 2)
-        else:
-            prediction = 0
-            confidence = round((1 - prob_amp) * 100, 2)
     else:
-        class_idx = int(np.argmax(raw_pred[0]))
-        prediction = class_idx
-        confidence = round(float(raw_pred[0][class_idx]) * 100, 2)
-    # Label convention: 1 = AMP, 0 = Non-AMP (swap if your model is reversed)
-    amp_result = "Antimicrobial Peptide (AMP)" if prediction == 1 else "Non-AMP"
-    result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
-    if prediction == 1:
         print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
         mic_values = predictmic(sequence)
         print("[CHECKPOINT] MIC done:", mic_values, flush=True)
@@ -276,16 +296,9 @@ def full_prediction(sequence):
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
     try:
-        from lime.lime_tabular import LimeTabularExplainer
-        sample_data = np.random.rand(100, len(selected_features))
-        explainer = LimeTabularExplainer(
-            training_data=sample_data,
-            feature_names=selected_features,
-            class_names=["Non-AMP", "AMP"],
-            mode="classification"
-        )
-        explanation = explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
             num_features=10

 import os
 # --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
 os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
 os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
 os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
+from lime.lime_tabular import LimeTabularExplainer
 import sys
 import json
 import subprocess
 # ---------------------------------------------------------------------------
+# The EXACT 343 features the model was trained on, IN THE EXACT TRAINING ORDER.
 # ---------------------------------------------------------------------------
 selected_features = [
     "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
 ]
 assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
+# ---------------------------------------------------------------------------
+# LIME explainer — built ONCE at startup with uniform [0,1] background data.
+# This is valid because all features are MinMax-scaled to [0,1], so uniform
+# noise is a reasonable approximation of the feature distribution.
+# Building it here avoids rebuilding on every prediction call.
+# ---------------------------------------------------------------------------
+_lime_background = np.random.rand(100, len(selected_features))
+_explainer = LimeTabularExplainer(
+    training_data=_lime_background,
+    feature_names=selected_features,
+    # FIX: label convention matches training — AMP=0, Non-AMP=1
+    # (same as the working old RF script: prediction==0 → AMP)
+    class_names=["AMP", "Non-AMP"],
+    mode="classification"
+)
 def keras_predict_proba(X):
+    """Return [P(AMP), P(Non-AMP)] for LIME.
+    The model was trained with AMP=0, Non-AMP=1.
+    A sigmoid output therefore represents P(Non-AMP=1).
+    So P(AMP) = 1 - sigmoid_output.
+    Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
+    """
     amp_model, _ = get_amp_model()
     preds = amp_model.predict(X, verbose=0)
     if preds.ndim == 1 or preds.shape[1] == 1:
+        preds = preds.reshape(-1, 1)          # preds = P(Non-AMP)
+        return np.hstack([1 - preds, preds])  # [P(AMP), P(Non-AMP)]
     return preds
 def extract_features(sequence):
+    """Compute the full 1325-feature pool, scale it, then select the 343 model features."""
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
         _, amp_scaler = get_amp_model()
+        # Replicate the EXACT feature pool the scaler was fit on (1325 features):
+        # Order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
+        # 1. CTD features
         ctd_features = CTD.CalculateCTD(sequence)
+        # 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
+        dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
+        filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
         # 3. Autocorrelation features
         auto_features = Autocorrelation.CalculateAutoTotal(sequence)
         # 4. Pseudo-AAC features
         pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
+        # Merge in training order
         all_features_dict = {}
         all_features_dict.update(ctd_features)
         all_features_dict.update(filtered_dipeptide)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
+        # Build full-pool DataFrame (~1325 columns)
         feature_df_all = pd.DataFrame([all_features_dict])
+        # Scale the FULL pool — scaler expects 1325 features in training column order
         scaled_array = amp_scaler.transform(feature_df_all.values)
         scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
+        # Verify all 343 selected features survived
         missing = [f for f in selected_features if f not in scaled_df.columns]
         if missing:
             return f"Error: Missing features after scaling: {missing[:5]}..."
+        # Select the 343 features in model training order
         selected_df = scaled_df[selected_features].fillna(0)
         return selected_df.values.astype(np.float32)
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
 def predictmic(sequence):
     """Run MIC prediction in a SEPARATE process (mic_worker.py).
+    Isolates PyTorch/ProtBert from TensorFlow to prevent SIGSEGV (exit 139).
+    The worker prints a JSON dict on its last stdout line.
     """
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
     try:
         proc = subprocess.run(
             [sys.executable, "mic_worker.py", sequence],
             capture_output=True,
             text=True,
+            timeout=900  # 15 min — generous for one-time ProtBert download
         )
     except subprocess.TimeoutExpired:
         return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
         return {"Error": f"Failed to start MIC worker: {str(e)}"}
     if proc.returncode != 0:
         tail = (proc.stderr or "").strip().splitlines()[-3:]
         return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
     out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
     if not out_lines:
         return {"Error": "MIC worker produced no output."}
     print("[CHECKPOINT] full_prediction called", flush=True)
     features = extract_features(sequence)
     if isinstance(features, str):
+        print("[CHECKPOINT] extract_features error:", features, flush=True)
         return features
     print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
     amp_model, _ = get_amp_model()
     raw_pred = amp_model.predict(features, verbose=0)
+    print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
+    # ------------------------------------------------------------------
+    # FIX: sigmoid output = P(Non-AMP) because training labels were
+    # AMP=0, Non-AMP=1  (same convention as the working old RF script).
+    # ------------------------------------------------------------------
+    prob_non_amp = float(raw_pred.flatten()[0])
+    prob_amp     = 1.0 - prob_non_amp
+    if prob_amp >= 0.5:
+        prediction = 0   # AMP  (class 0, same as old script)
+        confidence = round(prob_amp * 100, 2)
     else:
+        prediction = 1   # Non-AMP (class 1)
+        confidence = round(prob_non_amp * 100, 2)
+    amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
+    result  = f"Prediction: {amp_result}\n"
+    result += f"Confidence: {confidence}%\n"
+    # Debug line — remove once you've verified on known sequences
+    result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
+    if prediction == 0:  # AMP → run MIC
         print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
         mic_values = predictmic(sequence)
         print("[CHECKPOINT] MIC done:", mic_values, flush=True)
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
+    # LIME — uses the explainer built once at startup
     try:
+        explanation = _explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
             num_features=10