Spaces:

nonzeroexit
/

AMP-Classifier

Running

App Files Files Community

nonzeroexit committed 7 days ago

Commit

0ff9972

verified ·

1 Parent(s): 8a9cc7c

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -102

app.py CHANGED Viewed

@@ -1,31 +1,25 @@
-import os
-# Native-lib hygiene (prevents TF/PyTorch SIGSEGV when both load; harmless for RF)
-os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
-os.environ.setdefault("OMP_NUM_THREADS", "1")
-os.environ.setdefault("MKL_NUM_THREADS", "1")
-os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
-os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-import sys
-import json
-import subprocess
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 from lime.lime_tabular import LimeTabularExplainer
-import gradio as gr
-# ---------------------------------------------------------------------------
-# Load Random Forest AMP classifier + MinMax scaler (original files)
-# ---------------------------------------------------------------------------
 model = joblib.load("RF.joblib")
 scaler = joblib.load("norm (4).joblib")
-# ---------------------------------------------------------------------------
-# Original 138 RFE-selected features (CTD + AAC + Autocorrelation + APAAC)
-# ---------------------------------------------------------------------------
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
 "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
 "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
@@ -53,49 +47,22 @@ selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondarySt
 "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
 "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
-assert len(selected_features) == 138, f"Expected 138 features, got {len(selected_features)}"
-# ---------------------------------------------------------------------------
-# LIME explainer
-# Built ONCE at startup so explanations are reproducible across requests.
-# Prefers a real normalized training sample (lime_background.joblib). Falls
-# back to seeded uniform noise if that file isn't present (still stable, but
-# less faithful to the true feature distribution).
-# ---------------------------------------------------------------------------
-try:
-    _lime_background = joblib.load("lime_background.joblib")
-    if _lime_background.shape[1] != len(selected_features):
-        raise ValueError(
-            f"lime_background.joblib has {_lime_background.shape[1]} cols, "
-            f"expected {len(selected_features)}"
-        )
-    print(f"[LIME] Using real training sample: {_lime_background.shape}", flush=True)
-except Exception as e:
-    print(f"[LIME] No usable lime_background.joblib ({e}); falling back to uniform noise.", flush=True)
-    _rng = np.random.default_rng(seed=42)
-    _lime_background = _rng.uniform(low=0.0, high=1.0, size=(500, len(selected_features)))
 explainer = LimeTabularExplainer(
-    training_data=_lime_background,
     feature_names=selected_features,
     class_names=["AMP", "Non-AMP"],
-    mode="classification",
-    discretize_continuous=True,
-    random_state=42,  # stable explanations
 )
-# ---------------------------------------------------------------------------
-# Feature extraction — produces the full propy feature pool, scales it with
-# the saved MinMax scaler, then selects the 138 features the RF was trained on.
-# ---------------------------------------------------------------------------
 def extract_features(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
-        # Original full pool: CTD + AAC(first 420) + Autocorrelation + PseudoAAC
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_dipeptide_features = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
         ctd_features = CTD.CalculateCTD(sequence)
@@ -113,51 +80,50 @@ def extract_features(sequence):
         normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
         if not set(selected_features).issubset(normalized_df.columns):
-            missing = set(selected_features) - set(normalized_df.columns)
-            return f"Error: Missing features: {list(missing)[:5]}..."
         selected_df = normalized_df[selected_features].fillna(0)
         return selected_df.values
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
-# ---------------------------------------------------------------------------
-# MIC prediction — runs in a SEPARATE process (mic_worker.py).
-# This isolates PyTorch/ProtBert from the main process and prevents the
-# native-library crash (exit 139) plus the OOM spike on the free tier.
-# ---------------------------------------------------------------------------
 def predictmic(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
-    try:
-        proc = subprocess.run(
-            [sys.executable, "mic_worker.py", sequence],
-            capture_output=True, text=True, timeout=900
-        )
-    except subprocess.TimeoutExpired:
-        return {"Error": "MIC prediction timed out (ProtBert may still be downloading; try again shortly)."}
-    except Exception as e:
-        return {"Error": f"Failed to start MIC worker: {str(e)}"}
-    if proc.returncode != 0:
-        tail = (proc.stderr or "").strip().splitlines()[-3:]
-        return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
-    out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
-    if not out_lines:
-        return {"Error": "MIC worker produced no output."}
-    try:
-        return json.loads(out_lines[-1])
-    except Exception:
-        return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
-# ---------------------------------------------------------------------------
-# Main prediction pipeline
-# ---------------------------------------------------------------------------
 def full_prediction(sequence):
     features = extract_features(sequence)
     if isinstance(features, str):
@@ -175,21 +141,6 @@ def full_prediction(sequence):
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
-    # ---- LIME first (per your spec: LIME before SHAP in the report) ----
-    try:
-        explanation = explainer.explain_instance(
-            data_row=features[0],          # <-- explicitly the single input sequence
-            predict_fn=model.predict_proba,
-            num_features=10,
-            num_samples=2000,              # perturbations around this single input
-        )
-        result += "\nTop Features Influencing Prediction (LIME):\n"
-        for feat, weight in explanation.as_list():
-            result += f"- {feat}: {round(weight, 4)}\n"
-    except Exception as e:
-        result += f"\nLIME explanation failed: {str(e)}\n"
-    # ---- MIC (only for AMPs) ----
     if prediction == 0:
         mic_values = predictmic(sequence)
         result += "\nPredicted MIC Values (μM):\n"
@@ -198,8 +149,17 @@ def full_prediction(sequence):
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    return result
 # Gradio UI
 iface = gr.Interface(
@@ -210,4 +170,4 @@ iface = gr.Interface(
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
-iface.launch()

+import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
+from sklearn.preprocessing import MinMaxScaler
+import torch
+from transformers import BertTokenizer, BertModel
 from lime.lime_tabular import LimeTabularExplainer
+from math import expm1
+# Load AMP Classifier and Scaler
 model = joblib.load("RF.joblib")
 scaler = joblib.load("norm (4).joblib")
+# Load ProtBert
+tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
+protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+protbert_model = protbert_model.to(device).eval()
+# Define selected features (put your complete list here)
 selected_features = ["_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
 "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
 "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
 "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
 "APAAC15", "APAAC18", "APAAC19", "APAAC24"]
+# Dummy data for LIME
+sample_data = np.random.rand(100, len(selected_features))
 explainer = LimeTabularExplainer(
+    training_data=sample_data,
     feature_names=selected_features,
     class_names=["AMP", "Non-AMP"],
+    mode="classification"
 )
+# Feature extraction function
 def extract_features(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         filtered_dipeptide_features = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
         ctd_features = CTD.CalculateCTD(sequence)
         normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
         if not set(selected_features).issubset(normalized_df.columns):
+            return "Error: Some selected features are missing."
         selected_df = normalized_df[selected_features].fillna(0)
         return selected_df.values
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
+# MIC prediction function
 def predictmic(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
+    seq_spaced = ' '.join(list(sequence))
+    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
+    tokens = {k: v.to(device) for k, v in tokens.items()}
+    with torch.no_grad():
+        outputs = protbert_model(**tokens)
+        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().reshape(1, -1)
+    bacteria_config = {
+        "E.coli": {"model": "coli_xgboost_model.pkl", "scaler": "coli_scaler.pkl", "pca": None},
+        "S.aureus": {"model": "aur_xgboost_model.pkl", "scaler": "aur_scaler.pkl", "pca": None},
+        "P.aeruginosa": {"model": "arg_xgboost_model.pkl", "scaler": "arg_scaler.pkl", "pca": None},
+        "K.Pneumonia": {"model": "pne_mlp_model.pkl", "scaler": "pne_scaler.pkl", "pca": "pne_pca.pkl"}
+    }
+    mic_results = {}
+    for bacterium, cfg in bacteria_config.items():
+        try:
+            scaler = joblib.load(cfg["scaler"])
+            scaled = scaler.transform(embedding)
+            transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
+            model = joblib.load(cfg["model"])
+            mic_log = model.predict(transformed)[0]
+            mic = round(expm1(mic_log), 3)
+            mic_results[bacterium] = mic
+        except Exception as e:
+            mic_results[bacterium] = f"Error: {str(e)}"
+    return mic_results
+# Main prediction function
 def full_prediction(sequence):
     features = extract_features(sequence)
     if isinstance(features, str):
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
     if prediction == 0:
         mic_values = predictmic(sequence)
         result += "\nPredicted MIC Values (μM):\n"
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
+    explanation = explainer.explain_instance(
+        data_row=features[0],
+        predict_fn=model.predict_proba,
+        num_features=10
+    )
+    result += "\nTop Features Influencing Prediction:\n"
+    for feat, weight in explanation.as_list():
+        result += f"- {feat}: {round(weight, 4)}\n"
+    return result
 # Gradio UI
 iface = gr.Interface(
     description="Paste an amino acid sequence (≥10 characters). Get AMP classification, MIC predictions, and LIME interpretability insights."
 )
+iface.launch(share=True)