AMP-Classifier2

Sleeping

App Files Files Community

nonzeroexit committed 9 days ago

Commit

bd01e5d

verified ·

1 Parent(s): 5745f40

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -212

app.py CHANGED Viewed

@@ -1,217 +1,126 @@
 import os
-# --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
 os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
-os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
-os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
-import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 from lime.lime_tabular import LimeTabularExplainer
-import sys
-import json
-import subprocess
 # ---------------------------------------------------------------------------
-# LAZY LOADING — keeps the free 16GB Space from OOM-ing at startup.
-# Only the TensorFlow AMP model is loaded in THIS process. ProtBert/PyTorch
-# run in a SEPARATE process (mic_worker.py) to avoid a native-library clash
-# between TensorFlow and PyTorch that caused SIGSEGV (exit 139).
 # ---------------------------------------------------------------------------
-_amp_model = None
-_amp_scaler = None
-def get_amp_model():
-    global _amp_model, _amp_scaler
-    if _amp_model is None:
-        from tensorflow.keras.models import load_model
-        _amp_model = load_model("Comb1_aac_ctd_RFE_selected_features_model (1).keras")
-        _amp_scaler = joblib.load("norm (4).joblib")
-    return _amp_model, _amp_scaler
 # ---------------------------------------------------------------------------
-# The EXACT 343 features the model was trained on, IN THE EXACT TRAINING ORDER.
 # ---------------------------------------------------------------------------
 selected_features = [
-    "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
-    "_SolventAccessibilityC1", "_SolventAccessibilityC2", "_SolventAccessibilityC3",
-    "_SecondaryStrC1", "_SecondaryStrC2", "_SecondaryStrC3",
-    "_ChargeC1", "_ChargeC2", "_ChargeC3",
-    "_PolarityC1", "_PolarityC2", "_PolarityC3",
-    "_NormalizedVDWVC1", "_NormalizedVDWVC2", "_NormalizedVDWVC3",
-    "_HydrophobicityC1", "_HydrophobicityC2", "_HydrophobicityC3",
-    "_PolarizabilityT12", "_PolarizabilityT13", "_PolarizabilityT23",
-    "_SolventAccessibilityT12", "_SolventAccessibilityT13", "_SolventAccessibilityT23",
-    "_SecondaryStrT12", "_SecondaryStrT13", "_SecondaryStrT23",
-    "_ChargeT12", "_ChargeT13", "_ChargeT23",
-    "_PolarityT12", "_PolarityT13", "_PolarityT23",
-    "_NormalizedVDWVT12", "_NormalizedVDWVT13", "_NormalizedVDWVT23",
-    "_HydrophobicityT12", "_HydrophobicityT13", "_HydrophobicityT23",
-    "_PolarizabilityD1001", "_PolarizabilityD1025", "_PolarizabilityD1050",
-    "_PolarizabilityD1075", "_PolarizabilityD1100",
-    "_PolarizabilityD2001", "_PolarizabilityD2025", "_PolarizabilityD2050",
-    "_PolarizabilityD2075", "_PolarizabilityD2100",
-    "_PolarizabilityD3001", "_PolarizabilityD3025", "_PolarizabilityD3050",
-    "_PolarizabilityD3075", "_PolarizabilityD3100",
-    "_SolventAccessibilityD1001", "_SolventAccessibilityD1025",
-    "_SolventAccessibilityD1050", "_SolventAccessibilityD1075",
-    "_SolventAccessibilityD1100",
-    "_SolventAccessibilityD2001", "_SolventAccessibilityD2025",
-    "_SolventAccessibilityD2050", "_SolventAccessibilityD2075",
-    "_SolventAccessibilityD2100",
-    "_SolventAccessibilityD3001", "_SolventAccessibilityD3025",
-    "_SolventAccessibilityD3050", "_SolventAccessibilityD3075",
-    "_SolventAccessibilityD3100",
-    "_SecondaryStrD1001", "_SecondaryStrD1025", "_SecondaryStrD1050",
-    "_SecondaryStrD1075", "_SecondaryStrD1100",
-    "_SecondaryStrD2001", "_SecondaryStrD2025", "_SecondaryStrD2050",
-    "_SecondaryStrD2075", "_SecondaryStrD2100",
-    "_SecondaryStrD3001", "_SecondaryStrD3025", "_SecondaryStrD3050",
-    "_SecondaryStrD3075", "_SecondaryStrD3100",
-    "_ChargeD1001", "_ChargeD1025", "_ChargeD1050",
-    "_ChargeD1075", "_ChargeD1100",
-    "_ChargeD2001", "_ChargeD2025", "_ChargeD2050",
-    "_ChargeD2075",
-    "_ChargeD3001", "_ChargeD3025", "_ChargeD3050",
-    "_ChargeD3075", "_ChargeD3100",
-    "_PolarityD1001", "_PolarityD1025", "_PolarityD1050",
-    "_PolarityD1075", "_PolarityD1100",
-    "_PolarityD2001", "_PolarityD2025", "_PolarityD2050",
-    "_PolarityD2075", "_PolarityD2100",
-    "_PolarityD3001", "_PolarityD3025", "_PolarityD3050",
-    "_PolarityD3075", "_PolarityD3100",
-    "_NormalizedVDWVD1001", "_NormalizedVDWVD1025",
-    "_NormalizedVDWVD1050", "_NormalizedVDWVD1075",
-    "_NormalizedVDWVD1100",
-    "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
-    "_NormalizedVDWVD2050", "_NormalizedVDWVD2075",
-    "_NormalizedVDWVD2100",
-    "_NormalizedVDWVD3001", "_NormalizedVDWVD3025",
-    "_NormalizedVDWVD3050", "_NormalizedVDWVD3075",
-    "_NormalizedVDWVD3100",
-    "_HydrophobicityD1001", "_HydrophobicityD1025",
-    "_HydrophobicityD1050", "_HydrophobicityD1075",
-    "_HydrophobicityD1100",
-    "_HydrophobicityD2001", "_HydrophobicityD2025",
-    "_HydrophobicityD2050", "_HydrophobicityD2075",
-    "_HydrophobicityD2100",
-    "_HydrophobicityD3001", "_HydrophobicityD3025",
-    "_HydrophobicityD3050", "_HydrophobicityD3075",
-    "_HydrophobicityD3100",
-    "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I",
-    "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
-    "AR", "AD", "AQ", "AG", "AL", "AK", "AF", "AP", "AT", "AV",
-    "RA", "RC", "RE", "RG", "RI", "RL", "RS", "RT", "RV",
-    "NR", "NC", "NG", "NI", "NP", "NS", "NY", "NV",
-    "DR", "DN", "DC", "DE", "DG", "DF", "DS", "DT", "DY",
-    "CR", "CN", "CD", "CC", "CI", "CL", "CK", "CT", "CY", "CV",
-    "EA", "ER", "ED", "EC", "EE", "EG", "EI", "EL", "EK",
-    "EF", "EP", "ET", "EV",
-    "QN", "QF", "QV",
-    "GA", "GR", "GC", "GE", "GG", "GI", "GL", "GK", "GF", "GP", "GY",
-    "HA", "HP", "HT",
-    "IA", "IR", "ID", "II", "IL", "IF", "IP", "IS", "IV",
-    "LA", "LR", "LD", "LC", "LG", "LI", "LK", "LM", "LF",
-    "LS", "LT", "LY", "LV",
-    "KA", "KN", "KC", "KG", "KI", "KL", "KK", "KP", "KY",
-    "MA", "MD", "ME", "MI", "MK", "MF", "MP", "MS", "MV",
-    "FR", "FE", "FQ", "FG", "FL", "FF", "FS", "FT", "FY", "FV",
-    "PA", "PR", "PC", "PE", "PL", "PK", "PP", "PS", "PV",
-    "SA", "SR", "SD", "SC", "SG", "SH", "SI", "SL", "SP", "ST", "SY",
-    "TA", "TR", "TC", "TE", "TQ", "TG", "TI", "TL", "TP", "TS", "TV",
-    "WA",
-    "YN", "YD", "YC", "YQ", "YG", "YP",
-    "VA", "VR", "VD", "VC", "VE", "VG", "VI", "VL", "VK",
-    "VS", "VT", "VY", "VV"
 ]
-assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
 # ---------------------------------------------------------------------------
-# LIME explainer — built ONCE at startup with uniform [0,1] background data.
-# Valid because all features are MinMax-scaled to [0,1].
-# class_names: index 0 = AMP, index 1 = Non-AMP  (matches training: AMP=0, Non-AMP=1)
-# We always explain label=0 (AMP class) so weights are consistent across all
-# sequences — positive weight = pushes TOWARD AMP, negative = pushes AWAY.
 # ---------------------------------------------------------------------------
-_lime_background = np.random.rand(100, len(selected_features))
-_explainer = LimeTabularExplainer(
     training_data=_lime_background,
     feature_names=selected_features,
-    class_names=["AMP", "Non-AMP"],   # index 0=AMP, index 1=Non-AMP
-    mode="classification"
 )
-def keras_predict_proba(X):
-    """Return [P(AMP), P(Non-AMP)] for LIME.
-    Training labels: AMP=0, Non-AMP=1.
-    Sigmoid output = P(Non-AMP=1), so P(AMP) = 1 - sigmoid.
-    Column order must match class_names: col0=P(AMP), col1=P(Non-AMP).
-    """
-    amp_model, _ = get_amp_model()
-    preds = amp_model.predict(X, verbose=0)
-    if preds.ndim == 1 or preds.shape[1] == 1:
-        preds = preds.reshape(-1, 1)          # preds = P(Non-AMP)
-        return np.hstack([1 - preds, preds])  # [P(AMP), P(Non-AMP)]
-    return preds
 def extract_features(sequence):
-    """Compute the full 1325-feature pool, scale it, then select the 343 model features."""
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
-        _, amp_scaler = get_amp_model()
-        # Replicate the EXACT feature pool the scaler was fit on (1325 features).
-        # Merge order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
-        ctd_features      = CTD.CalculateCTD(sequence)
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
-        filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
-        auto_features     = Autocorrelation.CalculateAutoTotal(sequence)
-        pseudo_features   = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
         all_features_dict = {}
         all_features_dict.update(ctd_features)
-        all_features_dict.update(filtered_dipeptide)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
-        # Build full-pool DataFrame (~1325 columns) and scale
         feature_df_all = pd.DataFrame([all_features_dict])
-        scaled_array   = amp_scaler.transform(feature_df_all.values)
-        scaled_df      = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
-        # Verify all 343 selected features are present
-        missing = [f for f in selected_features if f not in scaled_df.columns]
-        if missing:
-            return f"Error: Missing features after scaling: {missing[:5]}..."
-        # Select 343 features in model training order
-        selected_df = scaled_df[selected_features].fillna(0)
-        return selected_df.values.astype(np.float32)
     except Exception as e:
         return f"Error in feature extraction: {str(e)}"
 def predictmic(sequence):
-    """Run MIC prediction in a SEPARATE process (mic_worker.py).
-    Isolates PyTorch/ProtBert from TensorFlow to prevent SIGSEGV (exit 139).
-    The worker prints a JSON dict on its last stdout line.
-    """
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
@@ -219,12 +128,10 @@ def predictmic(sequence):
     try:
         proc = subprocess.run(
             [sys.executable, "mic_worker.py", sequence],
-            capture_output=True,
-            text=True,
-            timeout=900
         )
     except subprocess.TimeoutExpired:
-        return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
     except Exception as e:
         return {"Error": f"Failed to start MIC worker: {str(e)}"}
@@ -235,74 +142,59 @@ def predictmic(sequence):
     out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
     if not out_lines:
         return {"Error": "MIC worker produced no output."}
     try:
         return json.loads(out_lines[-1])
     except Exception:
         return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
 def full_prediction(sequence):
-    print("[CHECKPOINT] full_prediction called", flush=True)
     features = extract_features(sequence)
     if isinstance(features, str):
-        print("[CHECKPOINT] extract_features error:", features, flush=True)
         return features
-    print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
-    amp_model, _ = get_amp_model()
-    raw_pred = amp_model.predict(features, verbose=0)
-    print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
-    prob_non_amp = float(raw_pred.flatten()[0])
-    prob_amp     = 1.0 - prob_non_amp
-    if prob_amp >= 0.5:
-        prediction = 0
-        confidence = round(prob_amp * 100, 2)
-    else:
-        prediction = 1
-        confidence = round(prob_non_amp * 100, 2)
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
-    result  = f"Prediction: {amp_result}\n"
-    result += f"Confidence: {confidence}%\n"
     if prediction == 0:
-        print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
         mic_values = predictmic(sequence)
-        print("[CHECKPOINT] MIC done:", mic_values, flush=True)
         result += "\nPredicted MIC Values (μM):\n"
         for org, mic in mic_values.items():
             result += f"- {org}: {mic}\n"
     else:
         result += "\nMIC prediction skipped for Non-AMP sequences.\n"
-    # ------------------------------------------------------------------
-    # LIME
-    # ------------------------------------------------------------------
-    print("[CHECKPOINT] Starting LIME...", flush=True)
-    try:
-        explanation = _explainer.explain_instance(
-            data_row=features[0],
-            predict_fn=keras_predict_proba,
-            num_features=10,
-            labels=(0,)
-        )
-        lime_list = explanation.as_list(label=0)
-        print("[CHECKPOINT] LIME done:", lime_list, flush=True)
-        result += "\nTop Features Influencing AMP Classification:\n"
-        for feat, weight in lime_list:
-            direction = "↑ AMP" if weight > 0 else "↓ AMP"
-            result += f"- {feat}: {round(weight, 4)} ({direction})\n"
-    except Exception as e:
-        print("[CHECKPOINT] LIME FAILED:", str(e), flush=True)
-        result += f"\nLIME explanation failed: {str(e)}\n"
-    print("[CHECKPOINT] full_prediction complete", flush=True)
     return result
 iface = gr.Interface(
     fn=full_prediction,
     inputs=gr.Textbox(label="Enter Protein Sequence"),

 import os
+# Native-lib hygiene (prevents TF/PyTorch SIGSEGV when both load; harmless for RF)
 os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
 os.environ.setdefault("OMP_NUM_THREADS", "1")
 os.environ.setdefault("MKL_NUM_THREADS", "1")
 os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+import sys
+import json
+import subprocess
 import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 from lime.lime_tabular import LimeTabularExplainer
+import gradio as gr
 # ---------------------------------------------------------------------------
+# Load Random Forest AMP classifier + MinMax scaler (original files)
 # ---------------------------------------------------------------------------
+model = joblib.load("RF.joblib")
+scaler = joblib.load("norm (4).joblib")
 # ---------------------------------------------------------------------------
+# Original 138 RFE-selected features (CTD + AAC + Autocorrelation + APAAC)
 # ---------------------------------------------------------------------------
 selected_features = [
+    "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
+    "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
+    "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
+    "_SecondaryStrD1001", "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
+    "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001", "_PolarityD1050",
+    "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
+    "_NormalizedVDWVD2050", "_NormalizedVDWVD3001", "_HydrophobicityD1001", "_HydrophobicityD2001",
+    "_HydrophobicityD3001", "_HydrophobicityD3025", "A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
+    "AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL", "HC", "IA", "IL", "IV", "LA", "LC", "LE",
+    "LI", "LT", "LV", "KC", "MA", "MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
+    "MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
+    "GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
+    "GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
+    "GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
+    "GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29", "GearyAuto_AvFlexibility30",
+    "GearyAuto_Polarizability22", "GearyAuto_Polarizability24", "GearyAuto_Polarizability25",
+    "GearyAuto_Polarizability27", "GearyAuto_Polarizability28", "GearyAuto_Polarizability29",
+    "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24", "GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30",
+    "GearyAuto_ResidueASA21", "GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
+    "GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24", "GearyAuto_ResidueVol25",
+    "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28", "GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30",
+    "GearyAuto_Steric18", "GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
+    "GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
+    "GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28", "GearyAuto_Mutability29",
+    "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
+    "APAAC15", "APAAC18", "APAAC19", "APAAC24"
 ]
+assert len(selected_features) == 138, f"Expected 138 features, got {len(selected_features)}"
 # ---------------------------------------------------------------------------
+# LIME explainer
+# Built ONCE at startup so explanations are reproducible across requests.
+# The training-data argument controls how LIME perturbs features around the
+# input. After MinMax scaling each feature lives in [0,1], so we use a small
+# uniform sample with a FIXED seed — that gives stable, repeatable weights.
+# (If you have a saved sample of real normalized training rows, swap it in
+# here and explanations will reflect the true feature distribution.)
 # ---------------------------------------------------------------------------
+_rng = np.random.default_rng(seed=42)
+_lime_background = _rng.uniform(low=0.0, high=1.0, size=(500, len(selected_features)))
+explainer = LimeTabularExplainer(
     training_data=_lime_background,
     feature_names=selected_features,
+    class_names=["AMP", "Non-AMP"],
+    mode="classification",
+    discretize_continuous=True,
+    random_state=42,  # stable explanations
 )
+# ---------------------------------------------------------------------------
+# Feature extraction — produces the full propy feature pool, scales it with
+# the saved MinMax scaler, then selects the 138 features the RF was trained on.
+# ---------------------------------------------------------------------------
 def extract_features(sequence):
     """Build the scaled, selected feature row for one peptide sequence.

     Characters outside the 20 standard amino-acid letters are discarded
     (case-insensitively) before any descriptor is computed.

     Args:
         sequence: Raw peptide string; ``None`` is treated as an empty string.

     Returns:
         A 2-D array with a single row holding the ``selected_features``
         columns in list order (NaN replaced by 0), or a ``str`` starting with
         ``"Error"`` when the cleaned sequence has fewer than 10 residues, a
         selected feature is missing, or any descriptor/scaling step raises.
     """
     valid_residues = "ACDEFGHIKLMNPQRSTVWY"
     # `or ""` keeps a None input on the normal "too short" path instead of
     # raising AttributeError before the error-string contract can apply.
     sequence = "".join(aa for aa in (sequence or "").upper() if aa in valid_residues)
     if len(sequence) < 10:
         return "Error: Sequence too short."
     try:
         dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
         # Only the first 420 entries of the propy composition dict are kept.
         filtered_dipeptide_features = {
             key: dipeptide_features[key] for key in list(dipeptide_features)[:420]
         }
         ctd_features = CTD.CalculateCTD(sequence)
         auto_features = Autocorrelation.CalculateAutoTotal(sequence)
         pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
         # The update order fixes the column order of the array handed to the
         # scaler (it receives bare values, not names).
         # NOTE(review): presumably this must match the order the scaler was
         # fit on - confirm against the training script.
         all_features_dict = {}
         all_features_dict.update(ctd_features)
         all_features_dict.update(filtered_dipeptide_features)
         all_features_dict.update(auto_features)
         all_features_dict.update(pseudo_features)
         feature_df_all = pd.DataFrame([all_features_dict])
         normalized_array = scaler.transform(feature_df_all.values)
         normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)
         # A list comprehension (not a set difference) keeps the reported names
         # in selected_features order, so the message is the same on every run.
         missing = [name for name in selected_features if name not in normalized_df.columns]
         if missing:
             return f"Error: Missing features: {missing[:5]}..."
         return normalized_df[selected_features].fillna(0).values
     except Exception as e:
         # Boundary for the UI: failures are reported as text, not raised.
         return f"Error in feature extraction: {str(e)}"
+# ---------------------------------------------------------------------------
+# MIC prediction — runs in a SEPARATE process (mic_worker.py).
+# This isolates PyTorch/ProtBert from the main process and prevents the
+# native-library crash (exit 139) plus the OOM spike on the free tier.
+# ---------------------------------------------------------------------------
 def predictmic(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
     try:
         proc = subprocess.run(
             [sys.executable, "mic_worker.py", sequence],
+            capture_output=True, text=True, timeout=900
         )
     except subprocess.TimeoutExpired:
+        return {"Error": "MIC prediction timed out (ProtBert may still be downloading; try again shortly)."}
     except Exception as e:
         return {"Error": f"Failed to start MIC worker: {str(e)}"}
     out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
     if not out_lines:
         return {"Error": "MIC worker produced no output."}
     try:
         return json.loads(out_lines[-1])
     except Exception:
         return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
+# ---------------------------------------------------------------------------
+# Main prediction pipeline
+# ---------------------------------------------------------------------------
 def full_prediction(sequence):
     """Classify a peptide and assemble the plain-text report for the UI.

     The report holds the AMP / Non-AMP call with its confidence, the top LIME
     feature weights for the predicted class, and - for AMP calls only - the
     MIC values returned by ``predictmic``.

     Args:
         sequence: Raw peptide string as entered by the user.

     Returns:
         The report text, or the error string produced by
         ``extract_features`` when feature extraction fails.
     """
     features = extract_features(sequence)
     if isinstance(features, str):
         return features
     prediction = model.predict(features)[0]
     probabilities = model.predict_proba(features)[0]
     # Position of the predicted class among the predict_proba columns.
     try:
         class_index = list(model.classes_).index(prediction)
     except (AttributeError, ValueError):
         class_index = None
     if class_index is None:
         # A percent sign only makes sense next to a numeric confidence.
         confidence_text = "Unknown"
     else:
         confidence_text = f"{round(probabilities[class_index] * 100, 2)}%"
     amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
     report = [f"Prediction: {amp_result}\nConfidence: {confidence_text}\n"]
     # LIME's explain_instance() and as_list() both default to label 1, which
     # would always report weights toward the second class. Explaining the
     # predicted class makes a positive weight mean "supports this prediction".
     lime_label = 1 if class_index is None else class_index
     try:
         explanation = explainer.explain_instance(
             data_row=features[0],
             predict_fn=model.predict_proba,
             labels=(lime_label,),
             num_features=10,
             num_samples=2000,
         )
         lime_rows = [
             f"- {feat}: {round(weight, 4)}\n"
             for feat, weight in explanation.as_list(label=lime_label)
         ]
         report.append("\nTop Features Influencing Prediction (LIME):\n")
         report.extend(lime_rows)
     except Exception as e:
         # The explanation is optional: note the failure and keep the report.
         report.append(f"\nLIME explanation failed: {str(e)}\n")
     # MIC values are only requested for sequences classified as AMP.
     if prediction == 0:
         mic_values = predictmic(sequence)
         report.append("\nPredicted MIC Values (μM):\n")
         if isinstance(mic_values, dict):
             report.extend(f"- {org}: {mic}\n" for org, mic in mic_values.items())
         else:
             # predictmic returns whatever JSON value the worker printed, so a
             # non-object payload is shown as text instead of crashing on .items().
             report.append(f"- Error: Unexpected MIC worker output: {mic_values}\n")
     else:
         report.append("\nMIC prediction skipped for Non-AMP sequences.\n")
     return "".join(report)
+# Gradio UI
 iface = gr.Interface(
     fn=full_prediction,
     inputs=gr.Textbox(label="Enter Protein Sequence"),