Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
| 1 |
import os
|
| 2 |
# --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
|
| 3 |
-
# TF and torch each bundle their own OpenMP/MKL; loaded together they can collide
|
| 4 |
-
# and crash at the C level. These settings make them coexist and reduce memory.
|
| 5 |
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
|
| 6 |
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
| 7 |
os.environ.setdefault("MKL_NUM_THREADS", "1")
|
| 8 |
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
|
| 9 |
-
# Quiet TensorFlow logs (must be set before importing tensorflow)
|
| 10 |
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
|
| 11 |
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
|
| 12 |
|
|
@@ -15,6 +12,7 @@ import joblib
|
|
| 15 |
import numpy as np
|
| 16 |
import pandas as pd
|
| 17 |
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
|
|
|
|
| 18 |
import sys
|
| 19 |
import json
|
| 20 |
import subprocess
|
|
@@ -39,9 +37,7 @@ def get_amp_model():
|
|
| 39 |
|
| 40 |
|
| 41 |
# ---------------------------------------------------------------------------
|
| 42 |
-
# The EXACT 343 features the
|
| 43 |
-
# The scaler was fit on a numpy array (no stored names), so order is critical:
|
| 44 |
-
# we must select these columns in this order BEFORE calling scaler.transform().
|
| 45 |
# ---------------------------------------------------------------------------
|
| 46 |
selected_features = [
|
| 47 |
"_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
|
|
@@ -137,18 +133,41 @@ selected_features = [
|
|
| 137 |
]
|
| 138 |
assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
def keras_predict_proba(X):
|
| 142 |
-
"""Return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
amp_model, _ = get_amp_model()
|
| 144 |
preds = amp_model.predict(X, verbose=0)
|
| 145 |
if preds.ndim == 1 or preds.shape[1] == 1:
|
| 146 |
-
preds = preds.reshape(-1, 1)
|
| 147 |
-
return np.hstack([1 - preds, preds]) #
|
| 148 |
return preds
|
| 149 |
|
| 150 |
|
| 151 |
def extract_features(sequence):
|
|
|
|
| 152 |
sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
|
| 153 |
if len(sequence) < 10:
|
| 154 |
return "Error: Sequence too short."
|
|
@@ -156,64 +175,65 @@ def extract_features(sequence):
|
|
| 156 |
try:
|
| 157 |
_, amp_scaler = get_amp_model()
|
| 158 |
|
| 159 |
-
#
|
| 160 |
-
#
|
| 161 |
-
dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
|
| 162 |
-
filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
|
| 163 |
|
| 164 |
-
#
|
| 165 |
ctd_features = CTD.CalculateCTD(sequence)
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
# 3. Autocorrelation features
|
| 168 |
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
|
| 169 |
|
| 170 |
# 4. Pseudo-AAC features
|
| 171 |
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
|
| 172 |
|
| 173 |
-
# Merge in
|
| 174 |
all_features_dict = {}
|
| 175 |
all_features_dict.update(ctd_features)
|
| 176 |
all_features_dict.update(filtered_dipeptide)
|
| 177 |
all_features_dict.update(auto_features)
|
| 178 |
all_features_dict.update(pseudo_features)
|
| 179 |
|
| 180 |
-
# Build full-pool DataFrame (
|
| 181 |
feature_df_all = pd.DataFrame([all_features_dict])
|
| 182 |
|
| 183 |
-
# Scale the FULL pool
|
| 184 |
scaled_array = amp_scaler.transform(feature_df_all.values)
|
| 185 |
scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
|
| 186 |
|
| 187 |
-
# Verify all 343 selected features
|
| 188 |
missing = [f for f in selected_features if f not in scaled_df.columns]
|
| 189 |
if missing:
|
| 190 |
return f"Error: Missing features after scaling: {missing[:5]}..."
|
| 191 |
|
| 192 |
-
# Select the 343 features
|
| 193 |
selected_df = scaled_df[selected_features].fillna(0)
|
| 194 |
return selected_df.values.astype(np.float32)
|
| 195 |
|
| 196 |
except Exception as e:
|
| 197 |
return f"Error in feature extraction: {str(e)}"
|
| 198 |
|
|
|
|
| 199 |
def predictmic(sequence):
|
| 200 |
"""Run MIC prediction in a SEPARATE process (mic_worker.py).
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
its last stdout line; we parse and return it.
|
| 205 |
"""
|
| 206 |
sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
|
| 207 |
if len(sequence) < 10:
|
| 208 |
return {"Error": "Sequence too short or invalid."}
|
| 209 |
|
| 210 |
try:
|
| 211 |
-
# First run downloads ProtBert (~1.6GB), so allow a generous timeout.
|
| 212 |
proc = subprocess.run(
|
| 213 |
[sys.executable, "mic_worker.py", sequence],
|
| 214 |
capture_output=True,
|
| 215 |
text=True,
|
| 216 |
-
timeout=900 # 15
|
| 217 |
)
|
| 218 |
except subprocess.TimeoutExpired:
|
| 219 |
return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
|
|
@@ -221,11 +241,9 @@ def predictmic(sequence):
|
|
| 221 |
return {"Error": f"Failed to start MIC worker: {str(e)}"}
|
| 222 |
|
| 223 |
if proc.returncode != 0:
|
| 224 |
-
# Worker crashed; surface stderr tail for debugging
|
| 225 |
tail = (proc.stderr or "").strip().splitlines()[-3:]
|
| 226 |
return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
|
| 227 |
|
| 228 |
-
# Parse the last non-empty stdout line as JSON
|
| 229 |
out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
|
| 230 |
if not out_lines:
|
| 231 |
return {"Error": "MIC worker produced no output."}
|
|
@@ -240,33 +258,35 @@ def full_prediction(sequence):
|
|
| 240 |
print("[CHECKPOINT] full_prediction called", flush=True)
|
| 241 |
features = extract_features(sequence)
|
| 242 |
if isinstance(features, str):
|
| 243 |
-
print("[CHECKPOINT] extract_features
|
| 244 |
return features
|
| 245 |
print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
|
| 246 |
|
| 247 |
amp_model, _ = get_amp_model()
|
| 248 |
-
print("[CHECKPOINT] AMP model loaded, running predict...", flush=True)
|
| 249 |
raw_pred = amp_model.predict(features, verbose=0)
|
| 250 |
-
print("[CHECKPOINT]
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
| 260 |
else:
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
confidence = round(float(raw_pred[0][class_idx]) * 100, 2)
|
| 264 |
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
result = f"
|
|
|
|
|
|
|
| 268 |
|
| 269 |
-
if prediction ==
|
| 270 |
print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
|
| 271 |
mic_values = predictmic(sequence)
|
| 272 |
print("[CHECKPOINT] MIC done:", mic_values, flush=True)
|
|
@@ -276,16 +296,9 @@ def full_prediction(sequence):
|
|
| 276 |
else:
|
| 277 |
result += "\nMIC prediction skipped for Non-AMP sequences.\n"
|
| 278 |
|
|
|
|
| 279 |
try:
|
| 280 |
-
|
| 281 |
-
sample_data = np.random.rand(100, len(selected_features))
|
| 282 |
-
explainer = LimeTabularExplainer(
|
| 283 |
-
training_data=sample_data,
|
| 284 |
-
feature_names=selected_features,
|
| 285 |
-
class_names=["Non-AMP", "AMP"],
|
| 286 |
-
mode="classification"
|
| 287 |
-
)
|
| 288 |
-
explanation = explainer.explain_instance(
|
| 289 |
data_row=features[0],
|
| 290 |
predict_fn=keras_predict_proba,
|
| 291 |
num_features=10
|
|
|
|
| 1 |
import os
|
| 2 |
# --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
|
|
|
|
|
|
|
| 3 |
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
|
| 4 |
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
| 5 |
os.environ.setdefault("MKL_NUM_THREADS", "1")
|
| 6 |
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
|
|
|
|
| 7 |
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
|
| 8 |
os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
|
| 9 |
|
|
|
|
| 12 |
import numpy as np
|
| 13 |
import pandas as pd
|
| 14 |
from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
|
| 15 |
+
from lime.lime_tabular import LimeTabularExplainer
|
| 16 |
import sys
|
| 17 |
import json
|
| 18 |
import subprocess
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
# ---------------------------------------------------------------------------
|
| 40 |
+
# The EXACT 343 features the model was trained on, IN THE EXACT TRAINING ORDER.
|
|
|
|
|
|
|
| 41 |
# ---------------------------------------------------------------------------
|
| 42 |
selected_features = [
|
| 43 |
"_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
|
|
|
|
| 133 |
]
|
| 134 |
assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
|
| 135 |
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
# LIME explainer — built ONCE at startup with uniform [0,1] background data.
|
| 138 |
+
# This is valid because all features are MinMax-scaled to [0,1], so uniform
|
| 139 |
+
# noise is a reasonable approximation of the feature distribution.
|
| 140 |
+
# Building it here avoids rebuilding on every prediction call.
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
_lime_background = np.random.rand(100, len(selected_features))
|
| 143 |
+
_explainer = LimeTabularExplainer(
|
| 144 |
+
training_data=_lime_background,
|
| 145 |
+
feature_names=selected_features,
|
| 146 |
+
# FIX: label convention matches training — AMP=0, Non-AMP=1
|
| 147 |
+
# (same as the working old RF script: prediction==0 → AMP)
|
| 148 |
+
class_names=["AMP", "Non-AMP"],
|
| 149 |
+
mode="classification"
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
|
| 153 |
def keras_predict_proba(X):
|
| 154 |
+
"""Return [P(AMP), P(Non-AMP)] for LIME.
|
| 155 |
+
|
| 156 |
+
The model was trained with AMP=0, Non-AMP=1.
|
| 157 |
+
A sigmoid output therefore represents P(Non-AMP=1).
|
| 158 |
+
So P(AMP) = 1 - sigmoid_output.
|
| 159 |
+
Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
|
| 160 |
+
"""
|
| 161 |
amp_model, _ = get_amp_model()
|
| 162 |
preds = amp_model.predict(X, verbose=0)
|
| 163 |
if preds.ndim == 1 or preds.shape[1] == 1:
|
| 164 |
+
preds = preds.reshape(-1, 1) # preds = P(Non-AMP)
|
| 165 |
+
return np.hstack([1 - preds, preds]) # [P(AMP), P(Non-AMP)]
|
| 166 |
return preds
|
| 167 |
|
| 168 |
|
| 169 |
def extract_features(sequence):
|
| 170 |
+
"""Compute the full 1325-feature pool, scale it, then select the 343 model features."""
|
| 171 |
sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
|
| 172 |
if len(sequence) < 10:
|
| 173 |
return "Error: Sequence too short."
|
|
|
|
| 175 |
try:
|
| 176 |
_, amp_scaler = get_amp_model()
|
| 177 |
|
| 178 |
+
# Replicate the EXACT feature pool the scaler was fit on (1325 features):
|
| 179 |
+
# Order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
|
|
|
|
|
|
|
| 180 |
|
| 181 |
+
# 1. CTD features
|
| 182 |
ctd_features = CTD.CalculateCTD(sequence)
|
| 183 |
|
| 184 |
+
# 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
|
| 185 |
+
dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
|
| 186 |
+
filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
|
| 187 |
+
|
| 188 |
# 3. Autocorrelation features
|
| 189 |
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
|
| 190 |
|
| 191 |
# 4. Pseudo-AAC features
|
| 192 |
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
|
| 193 |
|
| 194 |
+
# Merge in training order
|
| 195 |
all_features_dict = {}
|
| 196 |
all_features_dict.update(ctd_features)
|
| 197 |
all_features_dict.update(filtered_dipeptide)
|
| 198 |
all_features_dict.update(auto_features)
|
| 199 |
all_features_dict.update(pseudo_features)
|
| 200 |
|
| 201 |
+
# Build full-pool DataFrame (~1325 columns)
|
| 202 |
feature_df_all = pd.DataFrame([all_features_dict])
|
| 203 |
|
| 204 |
+
# Scale the FULL pool — scaler expects 1325 features in training column order
|
| 205 |
scaled_array = amp_scaler.transform(feature_df_all.values)
|
| 206 |
scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
|
| 207 |
|
| 208 |
+
# Verify all 343 selected features survived
|
| 209 |
missing = [f for f in selected_features if f not in scaled_df.columns]
|
| 210 |
if missing:
|
| 211 |
return f"Error: Missing features after scaling: {missing[:5]}..."
|
| 212 |
|
| 213 |
+
# Select the 343 features in model training order
|
| 214 |
selected_df = scaled_df[selected_features].fillna(0)
|
| 215 |
return selected_df.values.astype(np.float32)
|
| 216 |
|
| 217 |
except Exception as e:
|
| 218 |
return f"Error in feature extraction: {str(e)}"
|
| 219 |
|
| 220 |
+
|
| 221 |
def predictmic(sequence):
|
| 222 |
"""Run MIC prediction in a SEPARATE process (mic_worker.py).
|
| 223 |
|
| 224 |
+
Isolates PyTorch/ProtBert from TensorFlow to prevent SIGSEGV (exit 139).
|
| 225 |
+
The worker prints a JSON dict on its last stdout line.
|
|
|
|
| 226 |
"""
|
| 227 |
sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
|
| 228 |
if len(sequence) < 10:
|
| 229 |
return {"Error": "Sequence too short or invalid."}
|
| 230 |
|
| 231 |
try:
|
|
|
|
| 232 |
proc = subprocess.run(
|
| 233 |
[sys.executable, "mic_worker.py", sequence],
|
| 234 |
capture_output=True,
|
| 235 |
text=True,
|
| 236 |
+
timeout=900 # 15 min — generous for one-time ProtBert download
|
| 237 |
)
|
| 238 |
except subprocess.TimeoutExpired:
|
| 239 |
return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
|
|
|
|
| 241 |
return {"Error": f"Failed to start MIC worker: {str(e)}"}
|
| 242 |
|
| 243 |
if proc.returncode != 0:
|
|
|
|
| 244 |
tail = (proc.stderr or "").strip().splitlines()[-3:]
|
| 245 |
return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
|
| 246 |
|
|
|
|
| 247 |
out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
|
| 248 |
if not out_lines:
|
| 249 |
return {"Error": "MIC worker produced no output."}
|
|
|
|
| 258 |
print("[CHECKPOINT] full_prediction called", flush=True)
|
| 259 |
features = extract_features(sequence)
|
| 260 |
if isinstance(features, str):
|
| 261 |
+
print("[CHECKPOINT] extract_features error:", features, flush=True)
|
| 262 |
return features
|
| 263 |
print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
|
| 264 |
|
| 265 |
amp_model, _ = get_amp_model()
|
|
|
|
| 266 |
raw_pred = amp_model.predict(features, verbose=0)
|
| 267 |
+
print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
|
| 268 |
+
|
| 269 |
+
# ------------------------------------------------------------------
|
| 270 |
+
# FIX: sigmoid output = P(Non-AMP) because training labels were
|
| 271 |
+
# AMP=0, Non-AMP=1 (same convention as the working old RF script).
|
| 272 |
+
# ------------------------------------------------------------------
|
| 273 |
+
prob_non_amp = float(raw_pred.flatten()[0])
|
| 274 |
+
prob_amp = 1.0 - prob_non_amp
|
| 275 |
+
|
| 276 |
+
if prob_amp >= 0.5:
|
| 277 |
+
prediction = 0 # AMP (class 0, same as old script)
|
| 278 |
+
confidence = round(prob_amp * 100, 2)
|
| 279 |
else:
|
| 280 |
+
prediction = 1 # Non-AMP (class 1)
|
| 281 |
+
confidence = round(prob_non_amp * 100, 2)
|
|
|
|
| 282 |
|
| 283 |
+
amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
|
| 284 |
+
result = f"Prediction: {amp_result}\n"
|
| 285 |
+
result += f"Confidence: {confidence}%\n"
|
| 286 |
+
# Debug line — remove once you've verified on known sequences
|
| 287 |
+
result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
|
| 288 |
|
| 289 |
+
if prediction == 0: # AMP → run MIC
|
| 290 |
print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
|
| 291 |
mic_values = predictmic(sequence)
|
| 292 |
print("[CHECKPOINT] MIC done:", mic_values, flush=True)
|
|
|
|
| 296 |
else:
|
| 297 |
result += "\nMIC prediction skipped for Non-AMP sequences.\n"
|
| 298 |
|
| 299 |
+
# LIME — uses the explainer built once at startup
|
| 300 |
try:
|
| 301 |
+
explanation = _explainer.explain_instance(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
data_row=features[0],
|
| 303 |
predict_fn=keras_predict_proba,
|
| 304 |
num_features=10
|