Spaces:

nonzeroexit
/

AMP-Classifier

Running

App Files Files Community

nonzeroexit committed on May 21

Commit

2739a59

verified ·

1 Parent(s): 98f9e87

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -39

app.py CHANGED Viewed

@@ -1,30 +1,60 @@
 import os
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, CTD
-import tensorflow as tf
-from tensorflow.keras.models import load_model
-import torch
-from transformers import BertTokenizer, BertModel
-from lime.lime_tabular import LimeTabularExplainer
 from math import expm1
-# Load AMP Classifier (Keras) and Scaler
-model = load_model("Comb1_aac_ctd_RFE_selected_features_model.keras")
-scaler = joblib.load("Comb1_aac_ctd_RFE_selected_features_scaler.joblib")
-# Load ProtBert (for MIC prediction)
-tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
-protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-protbert_model = protbert_model.to(device).eval()
-# Define selected features (AAC + CTD, RFE-selected)
-# Note: 'Activity' is the target label and is excluded from input features
 selected_features = [
     '_PolarizabilityC1', '_PolarizabilityC2', '_PolarizabilityC3',
     '_SolventAccessibilityC1', '_SolventAccessibilityC2', '_SolventAccessibilityC3',
@@ -84,32 +114,27 @@ selected_features = [
     'VL', 'VK', 'VM', 'VF', 'VP', 'VS', 'VT', 'VW', 'VY', 'VV'
 ]
-# Wrapper to make Keras model behave like a sklearn classifier for LIME
 def keras_predict_proba(X):
-    """Return probabilities for both classes as [P(Non-AMP), P(AMP)]."""
-    preds = model.predict(X, verbose=0)
     if preds.ndim == 1 or preds.shape[1] == 1:
         preds = preds.reshape(-1, 1)
         # Assuming sigmoid output = P(AMP); adjust if your model is reversed.
         return np.hstack([1 - preds, preds])
     return preds
-# Dummy data for LIME
-sample_data = np.random.rand(100, len(selected_features))
-explainer = LimeTabularExplainer(
-    training_data=sample_data,
-    feature_names=selected_features,
-    class_names=["Non-AMP", "AMP"],
-    mode="classification"
-)
-# Feature extraction function (AAC + CTD only)
 def extract_features(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
         # AAC: 20 single AAs + 400 dipeptides = 420 features
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_aac = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
@@ -122,7 +147,7 @@ def extract_features(sequence):
         all_features_dict.update(filtered_aac)
         feature_df_all = pd.DataFrame([all_features_dict])
-        normalized_array = scaler.transform(feature_df_all.values)
         normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
         if not set(selected_features).issubset(normalized_df.columns):
@@ -134,14 +159,18 @@ def extract_features(sequence):
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
-# MIC prediction function (unchanged)
 def predictmic(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
     seq_spaced = ' '.join(list(sequence))
-    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
     tokens = {k: v.to(device) for k, v in tokens.items()}
     with torch.no_grad():
@@ -170,14 +199,14 @@ def predictmic(sequence):
     return mic_results
-# Main prediction function
 def full_prediction(sequence):
     features = extract_features(sequence)
     if isinstance(features, str):
         return features
-    # Keras prediction
-    raw_pred = model.predict(features, verbose=0)
     # Handle sigmoid (1 output) vs softmax (>=2 outputs)
     if raw_pred.ndim == 1 or raw_pred.shape[1] == 1:
@@ -193,7 +222,7 @@ def full_prediction(sequence):
         prediction = class_idx
         confidence = round(float(raw_pred[0][class_idx]) * 100, 2)
-    # Label convention: 1 = AMP, 0 = Non-AMP (swap if your model uses the opposite)
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 1 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
@@ -205,14 +234,21 @@ def full_prediction(sequence):
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    # LIME explanation
     try:
         explanation = explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
             num_features=10
         )
         result += "\nTop Features Influencing Prediction:\n"
         for feat, weight in explanation.as_list():
             result += f"- {feat}: {round(weight, 4)}\n"
@@ -221,6 +257,7 @@ def full_prediction(sequence):
     return result
 # Gradio UI
 iface = gr.Interface(
     fn=full_prediction,
@@ -230,4 +267,4 @@ iface = gr.Interface(
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
-iface.launch(share=True)

 import os
+# Quiet TensorFlow logs and disable oneDNN nondeterminism notice
+os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
+os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
 import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, CTD
 from math import expm1
+# ---------------------------------------------------------------------------
+# LAZY LOADING
+# On the free 16GB Space, loading TensorFlow + PyTorch + ProtBert all at once
+# at import time causes an out-of-memory crash. We therefore load each heavy
+# component only when it is first needed, and cache it after that.
+# ---------------------------------------------------------------------------
+_amp_model = None          # Keras AMP classifier
+_amp_scaler = None         # joblib scaler for AMP features
+_protbert_tokenizer = None  # ProtBert tokenizer, filled in by get_protbert()
+_protbert_model = None  # ProtBert model, filled in by get_protbert()
+_torch = None              # torch module, imported lazily
+_device = None  # torch.device picked in get_protbert(): "cuda" if available, else "cpu"
+def get_amp_model():
+    """Load the Keras AMP classifier + scaler on first use."""
+    global _amp_model, _amp_scaler
+    if _amp_model is None:
+        from tensorflow.keras.models import load_model
+        _amp_model = load_model("Comb1_aac_ctd_RFE_selected_features_model.keras")
+        _amp_scaler = joblib.load("Comb1_aac_ctd_RFE_selected_features_scaler.joblib")
+    return _amp_model, _amp_scaler
+def get_protbert():
+    """Load ProtBert tokenizer + model on first use (only needed for MIC)."""
+    global _protbert_tokenizer, _protbert_model, _torch, _device
+    if _protbert_model is None:
+        import torch
+        from transformers import BertTokenizer, BertModel
+        _torch = torch
+        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        _protbert_tokenizer = BertTokenizer.from_pretrained(
+            "Rostlab/prot_bert", do_lower_case=False
+        )
+        _protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
+        _protbert_model = _protbert_model.to(_device).eval()
+    return _protbert_tokenizer, _protbert_model, _torch, _device
+# ---------------------------------------------------------------------------
+# Selected features (AAC + CTD, RFE-selected). 'Activity' is the target label
+# and is intentionally excluded from the input features.
+# ---------------------------------------------------------------------------
 selected_features = [
     '_PolarizabilityC1', '_PolarizabilityC2', '_PolarizabilityC3',
     '_SolventAccessibilityC1', '_SolventAccessibilityC2', '_SolventAccessibilityC3',
     'VL', 'VK', 'VM', 'VF', 'VP', 'VS', 'VT', 'VW', 'VY', 'VV'
 ]
 def keras_predict_proba(X):
+    """Return probabilities as [P(Non-AMP), P(AMP)] for LIME."""
+    amp_model, _ = get_amp_model()  # the scaler half of the pair is not used here
+    preds = amp_model.predict(X, verbose=0)
     if preds.ndim == 1 or preds.shape[1] == 1:  # model emitted a single value per row
         preds = preds.reshape(-1, 1)  # force one column so hstack below yields two
         # Assuming sigmoid output = P(AMP); adjust if your model is reversed.
         return np.hstack([1 - preds, preds])
     return preds  # not a single-column output: returned unchanged
 def extract_features(sequence):
+    """Compute AAC (420) + CTD features, scale, and select RFE features."""
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
+        _, amp_scaler = get_amp_model()
         # AAC: 20 single AAs + 400 dipeptides = 420 features
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_aac = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
         all_features_dict.update(filtered_aac)
         feature_df_all = pd.DataFrame([all_features_dict])
+        normalized_array = amp_scaler.transform(feature_df_all.values)
         normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
         if not set(selected_features).issubset(normalized_df.columns):
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
 def predictmic(sequence):
+    """Predict MIC values using ProtBert embeddings + per-bacterium models."""
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
+    tokenizer, protbert_model, torch, device = get_protbert()
     seq_spaced = ' '.join(list(sequence))
+    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length',
+                       truncation=True, max_length=512)
     tokens = {k: v.to(device) for k, v in tokens.items()}
     with torch.no_grad():
     return mic_results
 def full_prediction(sequence):
     features = extract_features(sequence)
     if isinstance(features, str):
         return features
+    amp_model, _ = get_amp_model()
+    raw_pred = amp_model.predict(features, verbose=0)
     # Handle sigmoid (1 output) vs softmax (>=2 outputs)
     if raw_pred.ndim == 1 or raw_pred.shape[1] == 1:
         prediction = class_idx
         confidence = round(float(raw_pred[0][class_idx]) * 100, 2)
+    # Label convention: 1 = AMP, 0 = Non-AMP (swap if your model is opposite)
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 1 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
+    # LIME explanation (lazy import keeps startup light)
     try:
+        from lime.lime_tabular import LimeTabularExplainer
+        sample_data = np.random.rand(100, len(selected_features))
+        explainer = LimeTabularExplainer(
+            training_data=sample_data,
+            feature_names=selected_features,
+            class_names=["Non-AMP", "AMP"],
+            mode="classification"
+        )
         explanation = explainer.explain_instance(
             data_row=features[0],
             predict_fn=keras_predict_proba,
             num_features=10
         )
         result += "\nTop Features Influencing Prediction:\n"
         for feat, weight in explanation.as_list():
             result += f"- {feat}: {round(weight, 4)}\n"
     return result
 # Gradio UI
 iface = gr.Interface(
     fn=full_prediction,
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
+iface.launch()