nonzeroexit committed on
Commit
89e5a22
·
verified ·
1 Parent(s): 1dcb272

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -45
app.py CHANGED
@@ -135,17 +135,16 @@ assert len(selected_features) == 343, f"Expected 343 features, got {len(selected
135
 
136
  # ---------------------------------------------------------------------------
137
  # LIME explainer — built ONCE at startup with uniform [0,1] background data.
138
- # This is valid because all features are MinMax-scaled to [0,1], so uniform
139
- # noise is a reasonable approximation of the feature distribution.
140
- # Building it here avoids rebuilding on every prediction call.
 
141
  # ---------------------------------------------------------------------------
142
  _lime_background = np.random.rand(100, len(selected_features))
143
  _explainer = LimeTabularExplainer(
144
  training_data=_lime_background,
145
  feature_names=selected_features,
146
- # FIX: label convention matches training — AMP=0, Non-AMP=1
147
- # (same as the working old RF script: prediction==0 → AMP)
148
- class_names=["AMP", "Non-AMP"],
149
  mode="classification"
150
  )
151
 
@@ -153,10 +152,9 @@ _explainer = LimeTabularExplainer(
153
  def keras_predict_proba(X):
154
  """Return [P(AMP), P(Non-AMP)] for LIME.
155
 
156
- The model was trained with AMP=0, Non-AMP=1.
157
- A sigmoid output therefore represents P(Non-AMP=1).
158
- So P(AMP) = 1 - sigmoid_output.
159
- Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
160
  """
161
  amp_model, _ = get_amp_model()
162
  preds = amp_model.predict(X, verbose=0)
@@ -175,42 +173,32 @@ def extract_features(sequence):
175
  try:
176
  _, amp_scaler = get_amp_model()
177
 
178
- # Replicate the EXACT feature pool the scaler was fit on (1325 features):
179
- # Order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
180
 
181
- # 1. CTD features
182
- ctd_features = CTD.CalculateCTD(sequence)
183
-
184
- # 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
185
  dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
186
  filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
 
 
187
 
188
- # 3. Autocorrelation features
189
- auto_features = Autocorrelation.CalculateAutoTotal(sequence)
190
-
191
- # 4. Pseudo-AAC features
192
- pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
193
-
194
- # Merge in training order
195
  all_features_dict = {}
196
  all_features_dict.update(ctd_features)
197
  all_features_dict.update(filtered_dipeptide)
198
  all_features_dict.update(auto_features)
199
  all_features_dict.update(pseudo_features)
200
 
201
- # Build full-pool DataFrame (~1325 columns)
202
  feature_df_all = pd.DataFrame([all_features_dict])
 
 
203
 
204
- # Scale the FULL pool — scaler expects 1325 features in training column order
205
- scaled_array = amp_scaler.transform(feature_df_all.values)
206
- scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
207
-
208
- # Verify all 343 selected features survived
209
  missing = [f for f in selected_features if f not in scaled_df.columns]
210
  if missing:
211
  return f"Error: Missing features after scaling: {missing[:5]}..."
212
 
213
- # Select the 343 features in model training order
214
  selected_df = scaled_df[selected_features].fillna(0)
215
  return selected_df.values.astype(np.float32)
216
 
@@ -233,7 +221,7 @@ def predictmic(sequence):
233
  [sys.executable, "mic_worker.py", sequence],
234
  capture_output=True,
235
  text=True,
236
- timeout=900 # 15 min — generous for one-time ProtBert download
237
  )
238
  except subprocess.TimeoutExpired:
239
  return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
@@ -266,27 +254,22 @@ def full_prediction(sequence):
266
  raw_pred = amp_model.predict(features, verbose=0)
267
  print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
268
 
269
- # ------------------------------------------------------------------
270
- # FIX: sigmoid output = P(Non-AMP) because training labels were
271
- # AMP=0, Non-AMP=1 (same convention as the working old RF script).
272
- # ------------------------------------------------------------------
273
  prob_non_amp = float(raw_pred.flatten()[0])
274
  prob_amp = 1.0 - prob_non_amp
275
 
276
  if prob_amp >= 0.5:
277
- prediction = 0 # AMP (class 0, same as old script)
278
  confidence = round(prob_amp * 100, 2)
279
  else:
280
- prediction = 1 # Non-AMP (class 1)
281
  confidence = round(prob_non_amp * 100, 2)
282
 
283
  amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
284
  result = f"Prediction: {amp_result}\n"
285
  result += f"Confidence: {confidence}%\n"
286
- # Debug line — remove once you've verified on known sequences
287
- result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
288
 
289
- if prediction == 0: # AMP → run MIC
290
  print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
291
  mic_values = predictmic(sequence)
292
  print("[CHECKPOINT] MIC done:", mic_values, flush=True)
@@ -296,16 +279,25 @@ def full_prediction(sequence):
296
  else:
297
  result += "\nMIC prediction skipped for Non-AMP sequences.\n"
298
 
299
- # LIME — uses the explainer built once at startup
 
 
 
 
 
 
 
300
  try:
301
  explanation = _explainer.explain_instance(
302
  data_row=features[0],
303
  predict_fn=keras_predict_proba,
304
- num_features=10
 
305
  )
306
- result += "\nTop Features Influencing Prediction:\n"
307
- for feat, weight in explanation.as_list():
308
- result += f"- {feat}: {round(weight, 4)}\n"
 
309
  except Exception as e:
310
  result += f"\nLIME explanation failed: {str(e)}\n"
311
 
 
135
 
136
  # ---------------------------------------------------------------------------
137
  # LIME explainer — built ONCE at startup with uniform [0,1] background data.
138
+ # Valid because all features are MinMax-scaled to [0,1].
139
+ # class_names: index 0 = AMP, index 1 = Non-AMP (matches training: AMP=0, Non-AMP=1)
140
+ # We always explain label=0 (AMP class) so weights are consistent across all
141
+ # sequences — positive weight = pushes TOWARD AMP, negative = pushes AWAY.
142
  # ---------------------------------------------------------------------------
143
  _lime_background = np.random.rand(100, len(selected_features))
144
  _explainer = LimeTabularExplainer(
145
  training_data=_lime_background,
146
  feature_names=selected_features,
147
+ class_names=["AMP", "Non-AMP"], # index 0=AMP, index 1=Non-AMP
 
 
148
  mode="classification"
149
  )
150
 
 
152
  def keras_predict_proba(X):
153
  """Return [P(AMP), P(Non-AMP)] for LIME.
154
 
155
+ Training labels: AMP=0, Non-AMP=1.
156
+ Sigmoid output = P(Non-AMP=1), so P(AMP) = 1 - sigmoid.
157
+ Column order must match class_names: col0=P(AMP), col1=P(Non-AMP).
 
158
  """
159
  amp_model, _ = get_amp_model()
160
  preds = amp_model.predict(X, verbose=0)
 
173
  try:
174
  _, amp_scaler = get_amp_model()
175
 
176
+ # Replicate the EXACT feature pool the scaler was fit on (1325 features).
177
+ # Merge order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
178
 
179
+ ctd_features = CTD.CalculateCTD(sequence)
 
 
 
180
  dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
181
  filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
182
+ auto_features = Autocorrelation.CalculateAutoTotal(sequence)
183
+ pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
184
 
 
 
 
 
 
 
 
185
  all_features_dict = {}
186
  all_features_dict.update(ctd_features)
187
  all_features_dict.update(filtered_dipeptide)
188
  all_features_dict.update(auto_features)
189
  all_features_dict.update(pseudo_features)
190
 
191
+ # Build full-pool DataFrame (~1325 columns) and scale
192
  feature_df_all = pd.DataFrame([all_features_dict])
193
+ scaled_array = amp_scaler.transform(feature_df_all.values)
194
+ scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
195
 
196
+ # Verify all 343 selected features are present
 
 
 
 
197
  missing = [f for f in selected_features if f not in scaled_df.columns]
198
  if missing:
199
  return f"Error: Missing features after scaling: {missing[:5]}..."
200
 
201
+ # Select 343 features in model training order
202
  selected_df = scaled_df[selected_features].fillna(0)
203
  return selected_df.values.astype(np.float32)
204
 
 
221
  [sys.executable, "mic_worker.py", sequence],
222
  capture_output=True,
223
  text=True,
224
+ timeout=900
225
  )
226
  except subprocess.TimeoutExpired:
227
  return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
 
254
  raw_pred = amp_model.predict(features, verbose=0)
255
  print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
256
 
257
+ # sigmoid output = P(Non-AMP) — training labels were AMP=0, Non-AMP=1
 
 
 
258
  prob_non_amp = float(raw_pred.flatten()[0])
259
  prob_amp = 1.0 - prob_non_amp
260
 
261
  if prob_amp >= 0.5:
262
+ prediction = 0 # AMP (class 0)
263
  confidence = round(prob_amp * 100, 2)
264
  else:
265
+ prediction = 1 # Non-AMP (class 1)
266
  confidence = round(prob_non_amp * 100, 2)
267
 
268
  amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
269
  result = f"Prediction: {amp_result}\n"
270
  result += f"Confidence: {confidence}%\n"
 
 
271
 
272
+ if prediction == 0: # AMP → run MIC
273
  print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
274
  mic_values = predictmic(sequence)
275
  print("[CHECKPOINT] MIC done:", mic_values, flush=True)
 
279
  else:
280
  result += "\nMIC prediction skipped for Non-AMP sequences.\n"
281
 
282
+ # ------------------------------------------------------------------
283
+ # LIME — always explains class 0 (AMP) so weights are consistent:
284
+ # weight > 0 → feature pushes TOWARD AMP classification
285
+ # weight < 0 → feature pushes AWAY from AMP classification
286
+ # This is meaningful for both AMPs and Non-AMPs:
287
+ # AMP sequence → top positive weights explain why it's an AMP
288
+ # Non-AMP sequence → top negative weights explain why it's NOT an AMP
289
+ # ------------------------------------------------------------------
290
  try:
291
  explanation = _explainer.explain_instance(
292
  data_row=features[0],
293
  predict_fn=keras_predict_proba,
294
+ num_features=10,
295
+ labels=(0,) # always explain AMP class (index 0)
296
  )
297
+ result += "\nTop Features Influencing AMP Classification:\n"
298
+ for feat, weight in explanation.as_list(label=0):
299
+ direction = "↑ AMP" if weight > 0 else "↓ AMP"
300
+ result += f"- {feat}: {round(weight, 4)} ({direction})\n"
301
  except Exception as e:
302
  result += f"\nLIME explanation failed: {str(e)}\n"
303