Spaces:

nonzeroexit
/

AMP-Classifier

Running

App Files Files Community

nonzeroexit committed 7 days ago

Commit

59d7aab

verified ·

1 Parent(s): 0ff9972

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -20

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
-from sklearn.preprocessing import MinMaxScaler
 import torch
 from transformers import BertTokenizer, BertModel
 from lime.lime_tabular import LimeTabularExplainer
@@ -19,7 +18,7 @@ protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 protbert_model = protbert_model.to(device).eval()
-# Define selected features (put your complete list here)
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
 "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
 "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
@@ -47,13 +46,17 @@ selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondarySt
 "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
 "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
-# Dummy data for LIME
-sample_data = np.random.rand(100, len(selected_features))
 explainer = LimeTabularExplainer(
     training_data=sample_data,
     feature_names=selected_features,
     class_names=["AMP", "Non-AMP"],
-    mode="classification"
 )
 # Feature extraction function
@@ -111,11 +114,16 @@ def predictmic(sequence):
     mic_results = {}
     for bacterium, cfg in bacteria_config.items():
         try:
-            scaler = joblib.load(cfg["scaler"])
-            scaled = scaler.transform(embedding)
             transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
-            model = joblib.load(cfg["model"])
-            mic_log = model.predict(transformed)[0]
             mic = round(expm1(mic_log), 3)
             mic_results[bacterium] = mic
         except Exception as e:
@@ -141,6 +149,22 @@ def full_prediction(sequence):
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
     if prediction == 0:
         mic_values = predictmic(sequence)
         result += "\nPredicted MIC Values (μM):\n"
@@ -149,16 +173,6 @@ def full_prediction(sequence):
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    explanation = explainer.explain_instance(
-        data_row=features[0],
-        predict_fn=model.predict_proba,
-        num_features=10
-    )
-    result += "\nTop Features Influencing Prediction:\n"
-    for feat, weight in explanation.as_list():
-        result += f"- {feat}: {round(weight, 4)}\n"
     return result
 # Gradio UI
@@ -170,4 +184,6 @@ iface = gr.Interface(
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
-iface.launch(share=True)

 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 import torch
 from transformers import BertTokenizer, BertModel
 from lime.lime_tabular import LimeTabularExplainer
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 protbert_model = protbert_model.to(device).eval()
+# Define selected features (146 RFE-selected features)
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
 "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
 "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
 "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
 "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
+# --- FIX (LIME): seed the random background so explanations are reproducible
+# across Space restarts. (Loading a real saved training sample here would
+# produce more faithful weights; see build_lime_background.py for that path.)
+np.random.seed(42)
+sample_data = np.random.rand(500, len(selected_features))
 explainer = LimeTabularExplainer(
     training_data=sample_data,
     feature_names=selected_features,
     class_names=["AMP", "Non-AMP"],
+    mode="classification",
+    random_state=42,
 )
 # Feature extraction function
     mic_results = {}
     for bacterium, cfg in bacteria_config.items():
         try:
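+            # --- FIX (variable shadowing): renamed locals to `mic_scaler` and
+            # `mic_model` so they no longer share names with the module-level
+            # `scaler` and `model` (the AMP RF + its MinMax scaler).
+            # NOTE(review): assignment inside a function binds a local name unless
+            # it is declared `global`, so the original `scaler`/`model` here would
+            # only have shadowed the AMP objects within this function. The claim
+            # that it broke the AMP classifier on every prediction after the first
+            # MIC run holds only if a `global` declaration exists -- confirm.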
+            mic_scaler = joblib.load(cfg["scaler"])
+            scaled = mic_scaler.transform(embedding)
             transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
+            mic_model = joblib.load(cfg["model"])
+            mic_log = mic_model.predict(transformed)[0]
             mic = round(expm1(mic_log), 3)
             mic_results[bacterium] = mic
         except Exception as e:
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
+    # --- LIME first (per spec: LIME before SHAP in the HTML report).
+    # explain_instance perturbs THIS single input sequence's feature row 2000
+    # times and fits a local linear model; weights describe this specific input.
+    try:
+        explanation = explainer.explain_instance(
+            data_row=features[0],
+            predict_fn=model.predict_proba,
+            num_features=10,
+            num_samples=2000,
+        )
+        result += "\nTop Features Influencing Prediction (LIME):\n"
+        for feat, weight in explanation.as_list():
+            result += f"- {feat}: {round(weight, 4)}\n"
+    except Exception as e:
+        result += f"\nLIME explanation failed: {str(e)}\n"
     if prediction == 0:
         mic_values = predictmic(sequence)
         result += "\nPredicted MIC Values (μM):\n"
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
     return result
 # Gradio UI
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
+# --- FIX (launch): removed share=True. On Hugging Face Spaces the public URL
+# is provided by the platform; share=True is for local dev only.
+iface.launch()