nonzeroexit commited on
Commit
1dcb272
·
verified ·
1 Parent(s): 4222f98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -54
app.py CHANGED
@@ -1,12 +1,9 @@
1
  import os
2
  # --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
3
- # TF and torch each bundle their own OpenMP/MKL; loaded together they can collide
4
- # and crash at the C level. These settings make them coexist and reduce memory.
5
  os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
6
  os.environ.setdefault("OMP_NUM_THREADS", "1")
7
  os.environ.setdefault("MKL_NUM_THREADS", "1")
8
  os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
9
- # Quiet TensorFlow logs (must be set before importing tensorflow)
10
  os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
11
  os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
12
 
@@ -15,6 +12,7 @@ import joblib
15
  import numpy as np
16
  import pandas as pd
17
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
 
18
  import sys
19
  import json
20
  import subprocess
@@ -39,9 +37,7 @@ def get_amp_model():
39
 
40
 
41
  # ---------------------------------------------------------------------------
42
- # The EXACT 343 features the scaler was fit on, IN THE EXACT TRAINING ORDER.
43
- # The scaler was fit on a numpy array (no stored names), so order is critical:
44
- # we must select these columns in this order BEFORE calling scaler.transform().
45
  # ---------------------------------------------------------------------------
46
  selected_features = [
47
  "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
@@ -137,18 +133,41 @@ selected_features = [
137
  ]
138
  assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  def keras_predict_proba(X):
142
- """Return probabilities as [P(Non-AMP), P(AMP)] for LIME (X already scaled)."""
 
 
 
 
 
 
143
  amp_model, _ = get_amp_model()
144
  preds = amp_model.predict(X, verbose=0)
145
  if preds.ndim == 1 or preds.shape[1] == 1:
146
- preds = preds.reshape(-1, 1)
147
- return np.hstack([1 - preds, preds]) # sigmoid output assumed = P(AMP)
148
  return preds
149
 
150
 
151
  def extract_features(sequence):
 
152
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
153
  if len(sequence) < 10:
154
  return "Error: Sequence too short."
@@ -156,64 +175,65 @@ def extract_features(sequence):
156
  try:
157
  _, amp_scaler = get_amp_model()
158
 
159
- # --- Replicate the EXACT same feature pool the scaler was fit on ---
160
- # 1. Dipeptide + AAC (filtered to first 420 keys, same as old script)
161
- dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
162
- filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
163
 
164
- # 2. CTD features
165
  ctd_features = CTD.CalculateCTD(sequence)
166
 
 
 
 
 
167
  # 3. Autocorrelation features
168
  auto_features = Autocorrelation.CalculateAutoTotal(sequence)
169
 
170
  # 4. Pseudo-AAC features
171
  pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
172
 
173
- # Merge in the same order as the old script
174
  all_features_dict = {}
175
  all_features_dict.update(ctd_features)
176
  all_features_dict.update(filtered_dipeptide)
177
  all_features_dict.update(auto_features)
178
  all_features_dict.update(pseudo_features)
179
 
180
- # Build full-pool DataFrame (should be 1325 columns)
181
  feature_df_all = pd.DataFrame([all_features_dict])
182
 
183
- # Scale the FULL pool (scaler expects 1325 features in training order)
184
  scaled_array = amp_scaler.transform(feature_df_all.values)
185
  scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
186
 
187
- # Verify all 343 selected features are present
188
  missing = [f for f in selected_features if f not in scaled_df.columns]
189
  if missing:
190
  return f"Error: Missing features after scaling: {missing[:5]}..."
191
 
192
- # Select the 343 features IN TRAINING ORDER from the scaled DataFrame
193
  selected_df = scaled_df[selected_features].fillna(0)
194
  return selected_df.values.astype(np.float32)
195
 
196
  except Exception as e:
197
  return f"Error in feature extraction: {str(e)}"
198
 
 
199
  def predictmic(sequence):
200
  """Run MIC prediction in a SEPARATE process (mic_worker.py).
201
 
202
- This isolates PyTorch/ProtBert from TensorFlow, preventing the native-library
203
- crash (exit 139) and keeping peak memory low. The worker prints a JSON dict on
204
- its last stdout line; we parse and return it.
205
  """
206
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
207
  if len(sequence) < 10:
208
  return {"Error": "Sequence too short or invalid."}
209
 
210
  try:
211
- # First run downloads ProtBert (~1.6GB), so allow a generous timeout.
212
  proc = subprocess.run(
213
  [sys.executable, "mic_worker.py", sequence],
214
  capture_output=True,
215
  text=True,
216
- timeout=900 # 15 minutes; mostly for the one-time model download
217
  )
218
  except subprocess.TimeoutExpired:
219
  return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
@@ -221,11 +241,9 @@ def predictmic(sequence):
221
  return {"Error": f"Failed to start MIC worker: {str(e)}"}
222
 
223
  if proc.returncode != 0:
224
- # Worker crashed; surface stderr tail for debugging
225
  tail = (proc.stderr or "").strip().splitlines()[-3:]
226
  return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
227
 
228
- # Parse the last non-empty stdout line as JSON
229
  out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
230
  if not out_lines:
231
  return {"Error": "MIC worker produced no output."}
@@ -240,33 +258,35 @@ def full_prediction(sequence):
240
  print("[CHECKPOINT] full_prediction called", flush=True)
241
  features = extract_features(sequence)
242
  if isinstance(features, str):
243
- print("[CHECKPOINT] extract_features returned error:", features, flush=True)
244
  return features
245
  print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
246
 
247
  amp_model, _ = get_amp_model()
248
- print("[CHECKPOINT] AMP model loaded, running predict...", flush=True)
249
  raw_pred = amp_model.predict(features, verbose=0)
250
- print("[CHECKPOINT] AMP predict done:", raw_pred, flush=True)
251
-
252
- if raw_pred.ndim == 1 or raw_pred.shape[1] == 1:
253
- prob_amp = float(raw_pred.flatten()[0]) # sigmoid output assumed = P(AMP)
254
- if prob_amp >= 0.5:
255
- prediction = 1
256
- confidence = round(prob_amp * 100, 2)
257
- else:
258
- prediction = 0
259
- confidence = round((1 - prob_amp) * 100, 2)
 
 
260
  else:
261
- class_idx = int(np.argmax(raw_pred[0]))
262
- prediction = class_idx
263
- confidence = round(float(raw_pred[0][class_idx]) * 100, 2)
264
 
265
- # Label convention: 1 = AMP, 0 = Non-AMP (swap if your model is reversed)
266
- amp_result = "Antimicrobial Peptide (AMP)" if prediction == 1 else "Non-AMP"
267
- result = f"Prediction: {amp_result}\nConfidence: {confidence}%\n"
 
 
268
 
269
- if prediction == 1:
270
  print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
271
  mic_values = predictmic(sequence)
272
  print("[CHECKPOINT] MIC done:", mic_values, flush=True)
@@ -276,16 +296,9 @@ def full_prediction(sequence):
276
  else:
277
  result += "\nMIC prediction skipped for Non-AMP sequences.\n"
278
 
 
279
  try:
280
- from lime.lime_tabular import LimeTabularExplainer
281
- sample_data = np.random.rand(100, len(selected_features))
282
- explainer = LimeTabularExplainer(
283
- training_data=sample_data,
284
- feature_names=selected_features,
285
- class_names=["Non-AMP", "AMP"],
286
- mode="classification"
287
- )
288
- explanation = explainer.explain_instance(
289
  data_row=features[0],
290
  predict_fn=keras_predict_proba,
291
  num_features=10
 
1
  import os
2
  # --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
 
 
3
  os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
4
  os.environ.setdefault("OMP_NUM_THREADS", "1")
5
  os.environ.setdefault("MKL_NUM_THREADS", "1")
6
  os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
 
7
  os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
8
  os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
9
 
 
12
  import numpy as np
13
  import pandas as pd
14
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
15
+ from lime.lime_tabular import LimeTabularExplainer
16
  import sys
17
  import json
18
  import subprocess
 
37
 
38
 
39
  # ---------------------------------------------------------------------------
40
+ # The EXACT 343 features the model was trained on, IN THE EXACT TRAINING ORDER.
 
 
41
  # ---------------------------------------------------------------------------
42
  selected_features = [
43
  "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
 
133
  ]
134
  assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
135
 
136
+ # ---------------------------------------------------------------------------
137
+ # LIME explainer — built ONCE at startup with uniform [0,1] background data.
138
+ # This is valid because all features are MinMax-scaled to [0,1], so uniform
139
+ # noise is a reasonable approximation of the feature distribution.
140
+ # Building it here avoids rebuilding on every prediction call.
141
+ # ---------------------------------------------------------------------------
142
+ _lime_background = np.random.rand(100, len(selected_features))
143
+ _explainer = LimeTabularExplainer(
144
+ training_data=_lime_background,
145
+ feature_names=selected_features,
146
+ # FIX: label convention matches training — AMP=0, Non-AMP=1
147
+ # (same as the working old RF script: prediction==0 → AMP)
148
+ class_names=["AMP", "Non-AMP"],
149
+ mode="classification"
150
+ )
151
+
152
 
153
  def keras_predict_proba(X):
154
+ """Return [P(AMP), P(Non-AMP)] for LIME.
155
+
156
+ The model was trained with AMP=0, Non-AMP=1.
157
+ A sigmoid output therefore represents P(Non-AMP=1).
158
+ So P(AMP) = 1 - sigmoid_output.
159
+ Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
160
+ """
161
  amp_model, _ = get_amp_model()
162
  preds = amp_model.predict(X, verbose=0)
163
  if preds.ndim == 1 or preds.shape[1] == 1:
164
+ preds = preds.reshape(-1, 1) # preds = P(Non-AMP)
165
+ return np.hstack([1 - preds, preds]) # [P(AMP), P(Non-AMP)]
166
  return preds
167
 
168
 
169
  def extract_features(sequence):
170
+ """Compute the full 1325-feature pool, scale it, then select the 343 model features."""
171
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
172
  if len(sequence) < 10:
173
  return "Error: Sequence too short."
 
175
  try:
176
  _, amp_scaler = get_amp_model()
177
 
178
+ # Replicate the EXACT feature pool the scaler was fit on (1325 features):
179
+ # Order must match training: CTD dipeptide(420) autocorr pseudoAAC
 
 
180
 
181
+ # 1. CTD features
182
  ctd_features = CTD.CalculateCTD(sequence)
183
 
184
+ # 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
185
+ dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
186
+ filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
187
+
188
  # 3. Autocorrelation features
189
  auto_features = Autocorrelation.CalculateAutoTotal(sequence)
190
 
191
  # 4. Pseudo-AAC features
192
  pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
193
 
194
+ # Merge in training order
195
  all_features_dict = {}
196
  all_features_dict.update(ctd_features)
197
  all_features_dict.update(filtered_dipeptide)
198
  all_features_dict.update(auto_features)
199
  all_features_dict.update(pseudo_features)
200
 
201
+ # Build full-pool DataFrame (~1325 columns)
202
  feature_df_all = pd.DataFrame([all_features_dict])
203
 
204
+ # Scale the FULL pool scaler expects 1325 features in training column order
205
  scaled_array = amp_scaler.transform(feature_df_all.values)
206
  scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
207
 
208
+ # Verify all 343 selected features survived
209
  missing = [f for f in selected_features if f not in scaled_df.columns]
210
  if missing:
211
  return f"Error: Missing features after scaling: {missing[:5]}..."
212
 
213
+ # Select the 343 features in model training order
214
  selected_df = scaled_df[selected_features].fillna(0)
215
  return selected_df.values.astype(np.float32)
216
 
217
  except Exception as e:
218
  return f"Error in feature extraction: {str(e)}"
219
 
220
+
221
  def predictmic(sequence):
222
  """Run MIC prediction in a SEPARATE process (mic_worker.py).
223
 
224
+ Isolates PyTorch/ProtBert from TensorFlow to prevent SIGSEGV (exit 139).
225
+ The worker prints a JSON dict on its last stdout line.
 
226
  """
227
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
228
  if len(sequence) < 10:
229
  return {"Error": "Sequence too short or invalid."}
230
 
231
  try:
 
232
  proc = subprocess.run(
233
  [sys.executable, "mic_worker.py", sequence],
234
  capture_output=True,
235
  text=True,
236
+ timeout=900 # 15 min generous for one-time ProtBert download
237
  )
238
  except subprocess.TimeoutExpired:
239
  return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
 
241
  return {"Error": f"Failed to start MIC worker: {str(e)}"}
242
 
243
  if proc.returncode != 0:
 
244
  tail = (proc.stderr or "").strip().splitlines()[-3:]
245
  return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
246
 
 
247
  out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
248
  if not out_lines:
249
  return {"Error": "MIC worker produced no output."}
 
258
  print("[CHECKPOINT] full_prediction called", flush=True)
259
  features = extract_features(sequence)
260
  if isinstance(features, str):
261
+ print("[CHECKPOINT] extract_features error:", features, flush=True)
262
  return features
263
  print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
264
 
265
  amp_model, _ = get_amp_model()
 
266
  raw_pred = amp_model.predict(features, verbose=0)
267
+ print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
268
+
269
+ # ------------------------------------------------------------------
270
+ # FIX: sigmoid output = P(Non-AMP) because training labels were
271
+ # AMP=0, Non-AMP=1 (same convention as the working old RF script).
272
+ # ------------------------------------------------------------------
273
+ prob_non_amp = float(raw_pred.flatten()[0])
274
+ prob_amp = 1.0 - prob_non_amp
275
+
276
+ if prob_amp >= 0.5:
277
+ prediction = 0 # AMP (class 0, same as old script)
278
+ confidence = round(prob_amp * 100, 2)
279
  else:
280
+ prediction = 1 # Non-AMP (class 1)
281
+ confidence = round(prob_non_amp * 100, 2)
 
282
 
283
+ amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
284
+ result = f"Prediction: {amp_result}\n"
285
+ result += f"Confidence: {confidence}%\n"
286
+ # Debug line — remove once you've verified on known sequences
287
+ result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
288
 
289
+ if prediction == 0: # AMP → run MIC
290
  print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
291
  mic_values = predictmic(sequence)
292
  print("[CHECKPOINT] MIC done:", mic_values, flush=True)
 
296
  else:
297
  result += "\nMIC prediction skipped for Non-AMP sequences.\n"
298
 
299
+ # LIME — uses the explainer built once at startup
300
  try:
301
+ explanation = _explainer.explain_instance(
 
 
 
 
 
 
 
 
302
  data_row=features[0],
303
  predict_fn=keras_predict_proba,
304
  num_features=10