AMP-Classifier2

Sleeping

App Files Community

nonzeroexit committed on May 21

Commit

89e5a22

verified ·

1 Parent(s): 1dcb272

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -45

app.py CHANGED Viewed

@@ -135,17 +135,16 @@ assert len(selected_features) == 343, f"Expected 343 features, got {len(selected
 # ---------------------------------------------------------------------------
 # LIME explainer — built ONCE at startup with uniform [0,1] background data.
-# This is valid because all features are MinMax-scaled to [0,1], so uniform
-# noise is a reasonable approximation of the feature distribution.
-# Building it here avoids rebuilding on every prediction call.
 # ---------------------------------------------------------------------------
 _lime_background = np.random.rand(100, len(selected_features))
 _explainer = LimeTabularExplainer(
     training_data=_lime_background,
     feature_names=selected_features,
-    # FIX: label convention matches training — AMP=0, Non-AMP=1
-    # (same as the working old RF script: prediction==0 → AMP)
-    class_names=["AMP", "Non-AMP"],
     mode="classification"
 )
@@ -153,10 +152,9 @@ _explainer = LimeTabularExplainer(
 def keras_predict_proba(X):
     """Return [P(AMP), P(Non-AMP)] for LIME.
-    The model was trained with AMP=0, Non-AMP=1.
-    A sigmoid output therefore represents P(Non-AMP=1).
-    So P(AMP) = 1 - sigmoid_output.
-    Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
     """
     amp_model, _ = get_amp_model()
     preds = amp_model.predict(X, verbose=0)
@@ -175,42 +173,32 @@ def extract_features(sequence):
     try:
         _, amp_scaler = get_amp_model()
-        # Replicate the EXACT feature pool the scaler was fit on (1325 features):
-        # Order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
-        # 1. CTD features
-        ctd_features = CTD.CalculateCTD(sequence)
-        # 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
-        # 3. Autocorrelation features
-        auto_features = Autocorrelation.CalculateAutoTotal(sequence)
-        # 4. Pseudo-AAC features
-        pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
-        # Merge in training order
         all_features_dict = {}
         all_features_dict.update(ctd_features)
         all_features_dict.update(filtered_dipeptide)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
-        # Build full-pool DataFrame (~1325 columns)
         feature_df_all = pd.DataFrame([all_features_dict])
-        # Scale the FULL pool — scaler expects 1325 features in training column order
-        scaled_array = amp_scaler.transform(feature_df_all.values)
-        scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
-        # Verify all 343 selected features survived
         missing = [f for f in selected_features if f not in scaled_df.columns]
         if missing:
             return f"Error: Missing features after scaling: {missing[:5]}..."
-        # Select the 343 features in model training order
         selected_df = scaled_df[selected_features].fillna(0)
         return selected_df.values.astype(np.float32)
@@ -233,7 +221,7 @@ def predictmic(sequence):
             [sys.executable, "mic_worker.py", sequence],
             capture_output=True,
             text=True,
-            timeout=900  # 15 min — generous for one-time ProtBert download
         )
     except subprocess.TimeoutExpired:
         return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
@@ -266,27 +254,22 @@ def full_prediction(sequence):
     raw_pred = amp_model.predict(features, verbose=0)
     print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
-    # ------------------------------------------------------------------
-    # FIX: sigmoid output = P(Non-AMP) because training labels were
-    # AMP=0, Non-AMP=1  (same convention as the working old RF script).
-    # ------------------------------------------------------------------
     prob_non_amp = float(raw_pred.flatten()[0])
     prob_amp     = 1.0 - prob_non_amp
     if prob_amp >= 0.5:
-        prediction = 0   # AMP  (class 0, same as old script)
         confidence = round(prob_amp * 100, 2)
     else:
-        prediction = 1   # Non-AMP (class 1)
         confidence = round(prob_non_amp * 100, 2)
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result  = f"Prediction: {amp_result}\n"
     result += f"Confidence: {confidence}%\n"
-    # Debug line — remove once you've verified on known sequences
-    result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
-    if prediction == 0:  # AMP → run MIC
         print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
         mic_values = predictmic(sequence)
         print("[CHECKPOINT] MIC done:", mic_values, flush=True)
@@ -296,16 +279,25 @@ def full_prediction(sequence):
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    # LIME — uses the explainer built once at startup
     try:
         explanation = _explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
-            num_features=10
         )
-        result += "\nTop Features Influencing Prediction:\n"
-        for feat, weight in explanation.as_list():
-            result += f"- {feat}: {round(weight, 4)}\n"
     except Exception as e:
         result += f"\nLIME explanation failed: {str(e)}\n"

 # ---------------------------------------------------------------------------
 # LIME explainer — built ONCE at startup with uniform [0,1] background data.
+# Valid because all features are MinMax-scaled to [0,1].
+# class_names: index 0 = AMP, index 1 = Non-AMP  (matches training: AMP=0, Non-AMP=1)
+# We always explain label=0 (AMP class) so weights are consistent across all
+# sequences — positive weight = pushes TOWARD AMP, negative = pushes AWAY.
 # ---------------------------------------------------------------------------
 _lime_background = np.random.rand(100, len(selected_features))
 _explainer = LimeTabularExplainer(
     training_data=_lime_background,
     feature_names=selected_features,
+    class_names=["AMP", "Non-AMP"],   # index 0=AMP, index 1=Non-AMP
     mode="classification"
 )
 def keras_predict_proba(X):
     """Return [P(AMP), P(Non-AMP)] for LIME.
+    Training labels: AMP=0, Non-AMP=1.
+    Sigmoid output = P(Non-AMP=1), so P(AMP) = 1 - sigmoid.
+    Column order must match class_names: col0=P(AMP), col1=P(Non-AMP).
     """
     amp_model, _ = get_amp_model()
     preds = amp_model.predict(X, verbose=0)
     try:
         _, amp_scaler = get_amp_model()
+        # Replicate the EXACT feature pool the scaler was fit on (1325 features).
+        # Merge order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
+        ctd_features      = CTD.CalculateCTD(sequence)
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
+        auto_features     = Autocorrelation.CalculateAutoTotal(sequence)
+        pseudo_features   = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
         all_features_dict = {}
         all_features_dict.update(ctd_features)
         all_features_dict.update(filtered_dipeptide)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
+        # Build full-pool DataFrame (~1325 columns) and scale
         feature_df_all = pd.DataFrame([all_features_dict])
+        scaled_array   = amp_scaler.transform(feature_df_all.values)
+        scaled_df      = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
+        # Verify all 343 selected features are present
         missing = [f for f in selected_features if f not in scaled_df.columns]
         if missing:
             return f"Error: Missing features after scaling: {missing[:5]}..."
+        # Select 343 features in model training order
         selected_df = scaled_df[selected_features].fillna(0)
         return selected_df.values.astype(np.float32)
             [sys.executable, "mic_worker.py", sequence],
             capture_output=True,
             text=True,
+            timeout=900
         )
     except subprocess.TimeoutExpired:
         return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
     raw_pred = amp_model.predict(features, verbose=0)
     print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
+    # sigmoid output = P(Non-AMP) — training labels were AMP=0, Non-AMP=1
     prob_non_amp = float(raw_pred.flatten()[0])
     prob_amp     = 1.0 - prob_non_amp
     if prob_amp >= 0.5:
+        prediction = 0                  # AMP  (class 0)
         confidence = round(prob_amp * 100, 2)
     else:
+        prediction = 1                  # Non-AMP (class 1)
         confidence = round(prob_non_amp * 100, 2)
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result  = f"Prediction: {amp_result}\n"
     result += f"Confidence: {confidence}%\n"
+    if prediction == 0:                 # AMP → run MIC
         print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
         mic_values = predictmic(sequence)
         print("[CHECKPOINT] MIC done:", mic_values, flush=True)
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
+    # ------------------------------------------------------------------
+    # LIME — always explains class 0 (AMP) so weights are consistent:
+    #   weight > 0  →  feature pushes TOWARD AMP classification
+    #   weight < 0  →  feature pushes AWAY from AMP classification
+    # This is meaningful for both AMPs and Non-AMPs:
+    #   AMP sequence    → top positive weights explain why it's an AMP
+    #   Non-AMP sequence → top negative weights explain why it's NOT an AMP
+    # ------------------------------------------------------------------
     try:
         explanation = _explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
+            num_features=10,
+            labels=(0,)                 # always explain AMP class (index 0)
         )
+        result += "\nTop Features Influencing AMP Classification:\n"
+        for feat, weight in explanation.as_list(label=0):
+            direction = "↑ AMP" if weight > 0 else "↓ AMP"
+            result += f"- {feat}: {round(weight, 4)} ({direction})\n"
     except Exception as e:
         result += f"\nLIME explanation failed: {str(e)}\n"