nonzeroexit committed on
Commit
bd01e5d
·
verified ·
1 Parent(s): 5745f40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -212
app.py CHANGED
@@ -1,217 +1,126 @@
1
  import os
2
- # --- Prevent SIGSEGV (exit 139) from TensorFlow + PyTorch native lib clashes ---
3
  os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
4
  os.environ.setdefault("OMP_NUM_THREADS", "1")
5
  os.environ.setdefault("MKL_NUM_THREADS", "1")
6
  os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
7
- os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
8
- os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")
9
 
10
- import gradio as gr
 
 
11
  import joblib
12
  import numpy as np
13
  import pandas as pd
14
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
15
  from lime.lime_tabular import LimeTabularExplainer
16
- import sys
17
- import json
18
- import subprocess
19
 
20
  # ---------------------------------------------------------------------------
21
- # LAZY LOADING keeps the free 16GB Space from OOM-ing at startup.
22
- # Only the TensorFlow AMP model is loaded in THIS process. ProtBert/PyTorch
23
- # run in a SEPARATE process (mic_worker.py) to avoid a native-library clash
24
- # between TensorFlow and PyTorch that caused SIGSEGV (exit 139).
25
  # ---------------------------------------------------------------------------
26
- _amp_model = None
27
- _amp_scaler = None
28
-
29
-
30
- def get_amp_model():
31
- global _amp_model, _amp_scaler
32
- if _amp_model is None:
33
- from tensorflow.keras.models import load_model
34
- _amp_model = load_model("Comb1_aac_ctd_RFE_selected_features_model (1).keras")
35
- _amp_scaler = joblib.load("norm (4).joblib")
36
- return _amp_model, _amp_scaler
37
-
38
 
39
  # ---------------------------------------------------------------------------
40
- # The EXACT 343 features the model was trained on, IN THE EXACT TRAINING ORDER.
41
  # ---------------------------------------------------------------------------
42
  selected_features = [
43
- "_PolarizabilityC1", "_PolarizabilityC2", "_PolarizabilityC3",
44
- "_SolventAccessibilityC1", "_SolventAccessibilityC2", "_SolventAccessibilityC3",
45
- "_SecondaryStrC1", "_SecondaryStrC2", "_SecondaryStrC3",
46
- "_ChargeC1", "_ChargeC2", "_ChargeC3",
47
- "_PolarityC1", "_PolarityC2", "_PolarityC3",
48
- "_NormalizedVDWVC1", "_NormalizedVDWVC2", "_NormalizedVDWVC3",
49
- "_HydrophobicityC1", "_HydrophobicityC2", "_HydrophobicityC3",
50
- "_PolarizabilityT12", "_PolarizabilityT13", "_PolarizabilityT23",
51
- "_SolventAccessibilityT12", "_SolventAccessibilityT13", "_SolventAccessibilityT23",
52
- "_SecondaryStrT12", "_SecondaryStrT13", "_SecondaryStrT23",
53
- "_ChargeT12", "_ChargeT13", "_ChargeT23",
54
- "_PolarityT12", "_PolarityT13", "_PolarityT23",
55
- "_NormalizedVDWVT12", "_NormalizedVDWVT13", "_NormalizedVDWVT23",
56
- "_HydrophobicityT12", "_HydrophobicityT13", "_HydrophobicityT23",
57
- "_PolarizabilityD1001", "_PolarizabilityD1025", "_PolarizabilityD1050",
58
- "_PolarizabilityD1075", "_PolarizabilityD1100",
59
- "_PolarizabilityD2001", "_PolarizabilityD2025", "_PolarizabilityD2050",
60
- "_PolarizabilityD2075", "_PolarizabilityD2100",
61
- "_PolarizabilityD3001", "_PolarizabilityD3025", "_PolarizabilityD3050",
62
- "_PolarizabilityD3075", "_PolarizabilityD3100",
63
- "_SolventAccessibilityD1001", "_SolventAccessibilityD1025",
64
- "_SolventAccessibilityD1050", "_SolventAccessibilityD1075",
65
- "_SolventAccessibilityD1100",
66
- "_SolventAccessibilityD2001", "_SolventAccessibilityD2025",
67
- "_SolventAccessibilityD2050", "_SolventAccessibilityD2075",
68
- "_SolventAccessibilityD2100",
69
- "_SolventAccessibilityD3001", "_SolventAccessibilityD3025",
70
- "_SolventAccessibilityD3050", "_SolventAccessibilityD3075",
71
- "_SolventAccessibilityD3100",
72
- "_SecondaryStrD1001", "_SecondaryStrD1025", "_SecondaryStrD1050",
73
- "_SecondaryStrD1075", "_SecondaryStrD1100",
74
- "_SecondaryStrD2001", "_SecondaryStrD2025", "_SecondaryStrD2050",
75
- "_SecondaryStrD2075", "_SecondaryStrD2100",
76
- "_SecondaryStrD3001", "_SecondaryStrD3025", "_SecondaryStrD3050",
77
- "_SecondaryStrD3075", "_SecondaryStrD3100",
78
- "_ChargeD1001", "_ChargeD1025", "_ChargeD1050",
79
- "_ChargeD1075", "_ChargeD1100",
80
- "_ChargeD2001", "_ChargeD2025", "_ChargeD2050",
81
- "_ChargeD2075",
82
- "_ChargeD3001", "_ChargeD3025", "_ChargeD3050",
83
- "_ChargeD3075", "_ChargeD3100",
84
- "_PolarityD1001", "_PolarityD1025", "_PolarityD1050",
85
- "_PolarityD1075", "_PolarityD1100",
86
- "_PolarityD2001", "_PolarityD2025", "_PolarityD2050",
87
- "_PolarityD2075", "_PolarityD2100",
88
- "_PolarityD3001", "_PolarityD3025", "_PolarityD3050",
89
- "_PolarityD3075", "_PolarityD3100",
90
- "_NormalizedVDWVD1001", "_NormalizedVDWVD1025",
91
- "_NormalizedVDWVD1050", "_NormalizedVDWVD1075",
92
- "_NormalizedVDWVD1100",
93
- "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
94
- "_NormalizedVDWVD2050", "_NormalizedVDWVD2075",
95
- "_NormalizedVDWVD2100",
96
- "_NormalizedVDWVD3001", "_NormalizedVDWVD3025",
97
- "_NormalizedVDWVD3050", "_NormalizedVDWVD3075",
98
- "_NormalizedVDWVD3100",
99
- "_HydrophobicityD1001", "_HydrophobicityD1025",
100
- "_HydrophobicityD1050", "_HydrophobicityD1075",
101
- "_HydrophobicityD1100",
102
- "_HydrophobicityD2001", "_HydrophobicityD2025",
103
- "_HydrophobicityD2050", "_HydrophobicityD2075",
104
- "_HydrophobicityD2100",
105
- "_HydrophobicityD3001", "_HydrophobicityD3025",
106
- "_HydrophobicityD3050", "_HydrophobicityD3075",
107
- "_HydrophobicityD3100",
108
- "A", "R", "N", "D", "C", "E", "Q", "G", "H", "I",
109
- "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
110
- "AR", "AD", "AQ", "AG", "AL", "AK", "AF", "AP", "AT", "AV",
111
- "RA", "RC", "RE", "RG", "RI", "RL", "RS", "RT", "RV",
112
- "NR", "NC", "NG", "NI", "NP", "NS", "NY", "NV",
113
- "DR", "DN", "DC", "DE", "DG", "DF", "DS", "DT", "DY",
114
- "CR", "CN", "CD", "CC", "CI", "CL", "CK", "CT", "CY", "CV",
115
- "EA", "ER", "ED", "EC", "EE", "EG", "EI", "EL", "EK",
116
- "EF", "EP", "ET", "EV",
117
- "QN", "QF", "QV",
118
- "GA", "GR", "GC", "GE", "GG", "GI", "GL", "GK", "GF", "GP", "GY",
119
- "HA", "HP", "HT",
120
- "IA", "IR", "ID", "II", "IL", "IF", "IP", "IS", "IV",
121
- "LA", "LR", "LD", "LC", "LG", "LI", "LK", "LM", "LF",
122
- "LS", "LT", "LY", "LV",
123
- "KA", "KN", "KC", "KG", "KI", "KL", "KK", "KP", "KY",
124
- "MA", "MD", "ME", "MI", "MK", "MF", "MP", "MS", "MV",
125
- "FR", "FE", "FQ", "FG", "FL", "FF", "FS", "FT", "FY", "FV",
126
- "PA", "PR", "PC", "PE", "PL", "PK", "PP", "PS", "PV",
127
- "SA", "SR", "SD", "SC", "SG", "SH", "SI", "SL", "SP", "ST", "SY",
128
- "TA", "TR", "TC", "TE", "TQ", "TG", "TI", "TL", "TP", "TS", "TV",
129
- "WA",
130
- "YN", "YD", "YC", "YQ", "YG", "YP",
131
- "VA", "VR", "VD", "VC", "VE", "VG", "VI", "VL", "VK",
132
- "VS", "VT", "VY", "VV"
133
  ]
134
- assert len(selected_features) == 343, f"Expected 343 features, got {len(selected_features)}"
135
 
136
  # ---------------------------------------------------------------------------
137
- # LIME explainer — built ONCE at startup with uniform [0,1] background data.
138
- # Valid because all features are MinMax-scaled to [0,1].
139
- # class_names: index 0 = AMP, index 1 = Non-AMP (matches training: AMP=0, Non-AMP=1)
140
- # We always explain label=0 (AMP class) so weights are consistent across all
141
- # sequences positive weight = pushes TOWARD AMP, negative = pushes AWAY.
 
 
142
  # ---------------------------------------------------------------------------
143
- _lime_background = np.random.rand(100, len(selected_features))
144
- _explainer = LimeTabularExplainer(
 
 
145
  training_data=_lime_background,
146
  feature_names=selected_features,
147
- class_names=["AMP", "Non-AMP"], # index 0=AMP, index 1=Non-AMP
148
- mode="classification"
 
 
149
  )
150
 
151
 
152
- def keras_predict_proba(X):
153
- """Return [P(AMP), P(Non-AMP)] for LIME.
154
-
155
- Training labels: AMP=0, Non-AMP=1.
156
- Sigmoid output = P(Non-AMP=1), so P(AMP) = 1 - sigmoid.
157
- Column order must match class_names: col0=P(AMP), col1=P(Non-AMP).
158
- """
159
- amp_model, _ = get_amp_model()
160
- preds = amp_model.predict(X, verbose=0)
161
- if preds.ndim == 1 or preds.shape[1] == 1:
162
- preds = preds.reshape(-1, 1) # preds = P(Non-AMP)
163
- return np.hstack([1 - preds, preds]) # [P(AMP), P(Non-AMP)]
164
- return preds
165
-
166
-
167
  def extract_features(sequence):
168
- """Compute the full 1325-feature pool, scale it, then select the 343 model features."""
169
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
170
  if len(sequence) < 10:
171
  return "Error: Sequence too short."
172
 
173
  try:
174
- _, amp_scaler = get_amp_model()
175
-
176
- # Replicate the EXACT feature pool the scaler was fit on (1325 features).
177
- # Merge order must match training: CTD → dipeptide(420) → autocorr → pseudoAAC
178
-
179
- ctd_features = CTD.CalculateCTD(sequence)
180
  dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
181
- filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
182
- auto_features = Autocorrelation.CalculateAutoTotal(sequence)
183
- pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
 
184
 
185
  all_features_dict = {}
186
  all_features_dict.update(ctd_features)
187
- all_features_dict.update(filtered_dipeptide)
188
  all_features_dict.update(auto_features)
189
  all_features_dict.update(pseudo_features)
190
 
191
- # Build full-pool DataFrame (~1325 columns) and scale
192
  feature_df_all = pd.DataFrame([all_features_dict])
193
- scaled_array = amp_scaler.transform(feature_df_all.values)
194
- scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
195
 
196
- # Verify all 343 selected features are present
197
- missing = [f for f in selected_features if f not in scaled_df.columns]
198
- if missing:
199
- return f"Error: Missing features after scaling: {missing[:5]}..."
200
-
201
- # Select 343 features in model training order
202
- selected_df = scaled_df[selected_features].fillna(0)
203
- return selected_df.values.astype(np.float32)
204
 
 
 
205
  except Exception as e:
206
  return f"Error in feature extraction: {str(e)}"
207
 
208
 
 
 
 
 
 
209
  def predictmic(sequence):
210
- """Run MIC prediction in a SEPARATE process (mic_worker.py).
211
-
212
- Isolates PyTorch/ProtBert from TensorFlow to prevent SIGSEGV (exit 139).
213
- The worker prints a JSON dict on its last stdout line.
214
- """
215
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
216
  if len(sequence) < 10:
217
  return {"Error": "Sequence too short or invalid."}
@@ -219,12 +128,10 @@ def predictmic(sequence):
219
  try:
220
  proc = subprocess.run(
221
  [sys.executable, "mic_worker.py", sequence],
222
- capture_output=True,
223
- text=True,
224
- timeout=900
225
  )
226
  except subprocess.TimeoutExpired:
227
- return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
228
  except Exception as e:
229
  return {"Error": f"Failed to start MIC worker: {str(e)}"}
230
 
@@ -235,74 +142,59 @@ def predictmic(sequence):
235
  out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
236
  if not out_lines:
237
  return {"Error": "MIC worker produced no output."}
238
-
239
  try:
240
  return json.loads(out_lines[-1])
241
  except Exception:
242
  return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
243
 
244
 
 
 
 
245
  def full_prediction(sequence):
246
- print("[CHECKPOINT] full_prediction called", flush=True)
247
  features = extract_features(sequence)
248
  if isinstance(features, str):
249
- print("[CHECKPOINT] extract_features error:", features, flush=True)
250
  return features
251
- print("[CHECKPOINT] features extracted OK, shape:", features.shape, flush=True)
252
-
253
- amp_model, _ = get_amp_model()
254
- raw_pred = amp_model.predict(features, verbose=0)
255
- print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
256
 
257
- prob_non_amp = float(raw_pred.flatten()[0])
258
- prob_amp = 1.0 - prob_non_amp
259
 
260
- if prob_amp >= 0.5:
261
- prediction = 0
262
- confidence = round(prob_amp * 100, 2)
263
- else:
264
- prediction = 1
265
- confidence = round(prob_non_amp * 100, 2)
266
 
267
  amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
268
- result = f"Prediction: {amp_result}\n"
269
- result += f"Confidence: {confidence}%\n"
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  if prediction == 0:
272
- print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
273
  mic_values = predictmic(sequence)
274
- print("[CHECKPOINT] MIC done:", mic_values, flush=True)
275
  result += "\nPredicted MIC Values (μM):\n"
276
  for org, mic in mic_values.items():
277
  result += f"- {org}: {mic}\n"
278
  else:
279
  result += "\nMIC prediction skipped for Non-AMP sequences.\n"
280
 
281
- # ------------------------------------------------------------------
282
- # LIME
283
- # ------------------------------------------------------------------
284
- print("[CHECKPOINT] Starting LIME...", flush=True)
285
- try:
286
- explanation = _explainer.explain_instance(
287
- data_row=features[0],
288
- predict_fn=keras_predict_proba,
289
- num_features=10,
290
- labels=(0,)
291
- )
292
- lime_list = explanation.as_list(label=0)
293
- print("[CHECKPOINT] LIME done:", lime_list, flush=True)
294
- result += "\nTop Features Influencing AMP Classification:\n"
295
- for feat, weight in lime_list:
296
- direction = "↑ AMP" if weight > 0 else "↓ AMP"
297
- result += f"- {feat}: {round(weight, 4)} ({direction})\n"
298
- except Exception as e:
299
- print("[CHECKPOINT] LIME FAILED:", str(e), flush=True)
300
- result += f"\nLIME explanation failed: {str(e)}\n"
301
-
302
- print("[CHECKPOINT] full_prediction complete", flush=True)
303
  return result
304
 
305
 
 
306
  iface = gr.Interface(
307
  fn=full_prediction,
308
  inputs=gr.Textbox(label="Enter Protein Sequence"),
 
1
  import os
2
+ # Native-lib hygiene (prevents TF/PyTorch SIGSEGV when both load; harmless for RF)
3
  os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
4
  os.environ.setdefault("OMP_NUM_THREADS", "1")
5
  os.environ.setdefault("MKL_NUM_THREADS", "1")
6
  os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
7
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
8
 
9
+ import sys
10
+ import json
11
+ import subprocess
12
  import joblib
13
  import numpy as np
14
  import pandas as pd
15
  from propy import AAComposition, Autocorrelation, CTD, PseudoAAC
16
  from lime.lime_tabular import LimeTabularExplainer
17
+
18
+ import gradio as gr
 
19
 
20
  # ---------------------------------------------------------------------------
21
+ # Load Random Forest AMP classifier + MinMax scaler (original files)
 
 
 
22
  # ---------------------------------------------------------------------------
23
+ model = joblib.load("RF.joblib")
24
+ scaler = joblib.load("norm (4).joblib")
 
 
 
 
 
 
 
 
 
 
25
 
26
  # ---------------------------------------------------------------------------
27
+ # Original 138 RFE-selected features (CTD + AAC + Autocorrelation + APAAC)
28
  # ---------------------------------------------------------------------------
29
  selected_features = [
30
+ "_SolventAccessibilityC3", "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_PolarityC1",
31
+ "_NormalizedVDWVC1", "_HydrophobicityC3", "_SecondaryStrT23", "_PolarizabilityD1001", "_PolarizabilityD2001",
32
+ "_PolarizabilityD3001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", "_SolventAccessibilityD3001",
33
+ "_SecondaryStrD1001", "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD3001", "_ChargeD1001",
34
+ "_ChargeD1025", "_ChargeD2001", "_ChargeD3075", "_ChargeD3100", "_PolarityD1001", "_PolarityD1050",
35
+ "_PolarityD2001", "_PolarityD3001", "_NormalizedVDWVD1001", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025",
36
+ "_NormalizedVDWVD2050", "_NormalizedVDWVD3001", "_HydrophobicityD1001", "_HydrophobicityD2001",
37
+ "_HydrophobicityD3001", "_HydrophobicityD3025", "A", "R", "D", "C", "E", "Q", "H", "I", "M", "P", "Y", "V",
38
+ "AR", "AV", "RC", "RL", "RV", "CR", "CC", "CL", "CK", "EE", "EI", "EL", "HC", "IA", "IL", "IV", "LA", "LC", "LE",
39
+ "LI", "LT", "LV", "KC", "MA", "MS", "SC", "TC", "TV", "YC", "VC", "VE", "VL", "VK", "VV",
40
+ "MoreauBrotoAuto_FreeEnergy30", "MoranAuto_Hydrophobicity2", "MoranAuto_Hydrophobicity4",
41
+ "GearyAuto_Hydrophobicity20", "GearyAuto_Hydrophobicity24", "GearyAuto_Hydrophobicity26",
42
+ "GearyAuto_Hydrophobicity27", "GearyAuto_Hydrophobicity28", "GearyAuto_Hydrophobicity29",
43
+ "GearyAuto_Hydrophobicity30", "GearyAuto_AvFlexibility22", "GearyAuto_AvFlexibility26",
44
+ "GearyAuto_AvFlexibility27", "GearyAuto_AvFlexibility28", "GearyAuto_AvFlexibility29", "GearyAuto_AvFlexibility30",
45
+ "GearyAuto_Polarizability22", "GearyAuto_Polarizability24", "GearyAuto_Polarizability25",
46
+ "GearyAuto_Polarizability27", "GearyAuto_Polarizability28", "GearyAuto_Polarizability29",
47
+ "GearyAuto_Polarizability30", "GearyAuto_FreeEnergy24", "GearyAuto_FreeEnergy25", "GearyAuto_FreeEnergy30",
48
+ "GearyAuto_ResidueASA21", "GearyAuto_ResidueASA22", "GearyAuto_ResidueASA23", "GearyAuto_ResidueASA24",
49
+ "GearyAuto_ResidueASA30", "GearyAuto_ResidueVol21", "GearyAuto_ResidueVol24", "GearyAuto_ResidueVol25",
50
+ "GearyAuto_ResidueVol26", "GearyAuto_ResidueVol28", "GearyAuto_ResidueVol29", "GearyAuto_ResidueVol30",
51
+ "GearyAuto_Steric18", "GearyAuto_Steric21", "GearyAuto_Steric26", "GearyAuto_Steric27", "GearyAuto_Steric28",
52
+ "GearyAuto_Steric29", "GearyAuto_Steric30", "GearyAuto_Mutability23", "GearyAuto_Mutability25",
53
+ "GearyAuto_Mutability26", "GearyAuto_Mutability27", "GearyAuto_Mutability28", "GearyAuto_Mutability29",
54
+ "GearyAuto_Mutability30", "APAAC1", "APAAC4", "APAAC5", "APAAC6", "APAAC8", "APAAC9", "APAAC12", "APAAC13",
55
+ "APAAC15", "APAAC18", "APAAC19", "APAAC24"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  ]
57
+ assert len(selected_features) == 138, f"Expected 138 features, got {len(selected_features)}"
58
 
59
  # ---------------------------------------------------------------------------
60
+ # LIME explainer
61
+ # Built ONCE at startup so explanations are reproducible across requests.
62
+ # The training-data argument controls how LIME perturbs features around the
63
+ # input. After MinMax scaling each feature lives in [0,1], so we use a small
64
+ # uniform sample with a FIXED seed that gives stable, repeatable weights.
65
+ # (If you have a saved sample of real normalized training rows, swap it in
66
+ # here and explanations will reflect the true feature distribution.)
67
  # ---------------------------------------------------------------------------
68
+ _rng = np.random.default_rng(seed=42)
69
+ _lime_background = _rng.uniform(low=0.0, high=1.0, size=(500, len(selected_features)))
70
+
71
+ explainer = LimeTabularExplainer(
72
  training_data=_lime_background,
73
  feature_names=selected_features,
74
+ class_names=["AMP", "Non-AMP"],
75
+ mode="classification",
76
+ discretize_continuous=True,
77
+ random_state=42, # stable explanations
78
  )
79
 
80
 
81
+ # ---------------------------------------------------------------------------
82
+ # Feature extraction — produces the full propy feature pool, scales it with
83
+ # the saved MinMax scaler, then selects the 138 features the RF was trained on.
84
+ # ---------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
85
def extract_features(sequence):
    """Turn a peptide sequence into the scaled feature row the classifier expects.

    The input is upper-cased and every character that is not one of the 20
    standard amino-acid letters is dropped. The propy descriptor pool is then
    computed, scaled with the module-level ``scaler``, and reduced to the
    columns named in ``selected_features`` (in that order).

    Args:
        sequence: Raw sequence text. ``None`` is treated as an empty string so
            the function keeps its "return an error string" contract instead
            of raising.

    Returns:
        On success, a 2-D array with a single row containing the selected,
        scaled features. On failure, a ``str`` beginning with ``"Error"``;
        callers tell the two apart with ``isinstance(result, str)``.
    """
    sequence = ''.join(
        aa for aa in (sequence or "").upper() if aa in "ACDEFGHIKLMNPQRSTVWY"
    )
    if len(sequence) < 10:
        return "Error: Sequence too short."

    try:
        # Full pool: CTD + first 420 composition entries + autocorrelation + APAAC.
        # NOTE(review): the first 420 keys are presumably the 20 single-residue
        # and 400 dipeptide compositions - confirm against the propy version used.
        dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
        filtered_dipeptide_features = {
            key: dipeptide_features[key] for key in list(dipeptide_features)[:420]
        }
        ctd_features = CTD.CalculateCTD(sequence)
        auto_features = Autocorrelation.CalculateAutoTotal(sequence)
        pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)

        # Merge order fixes the column order handed to the scaler, so it must
        # stay CTD -> composition -> autocorrelation -> pseudo-AAC.
        all_features_dict = {}
        all_features_dict.update(ctd_features)
        all_features_dict.update(filtered_dipeptide_features)
        all_features_dict.update(auto_features)
        all_features_dict.update(pseudo_features)

        feature_df_all = pd.DataFrame([all_features_dict])
        normalized_array = scaler.transform(feature_df_all.values)
        normalized_df = pd.DataFrame(normalized_array, columns=feature_df_all.columns)

        # Report missing columns in selected_features order so the message is
        # deterministic from run to run.
        available = set(normalized_df.columns)
        missing = [name for name in selected_features if name not in available]
        if missing:
            return f"Error: Missing features: {missing[:5]}..."

        selected_df = normalized_df[selected_features].fillna(0)
        return selected_df.values
    except Exception as e:
        # Top-level boundary for the UI: surface any descriptor/scaler failure
        # as a message instead of crashing the request.
        return f"Error in feature extraction: {str(e)}"
116
 
117
 
118
+ # ---------------------------------------------------------------------------
119
+ # MIC prediction — runs in a SEPARATE process (mic_worker.py).
120
+ # This isolates PyTorch/ProtBert from the main process and prevents the
121
+ # native-library crash (exit 139) plus the OOM spike on the free tier.
122
+ # ---------------------------------------------------------------------------
123
  def predictmic(sequence):
 
 
 
 
 
124
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
125
  if len(sequence) < 10:
126
  return {"Error": "Sequence too short or invalid."}
 
128
  try:
129
  proc = subprocess.run(
130
  [sys.executable, "mic_worker.py", sequence],
131
+ capture_output=True, text=True, timeout=900
 
 
132
  )
133
  except subprocess.TimeoutExpired:
134
+ return {"Error": "MIC prediction timed out (ProtBert may still be downloading; try again shortly)."}
135
  except Exception as e:
136
  return {"Error": f"Failed to start MIC worker: {str(e)}"}
137
 
 
142
  out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
143
  if not out_lines:
144
  return {"Error": "MIC worker produced no output."}
 
145
  try:
146
  return json.loads(out_lines[-1])
147
  except Exception:
148
  return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
149
 
150
 
151
+ # ---------------------------------------------------------------------------
152
+ # Main prediction pipeline
153
+ # ---------------------------------------------------------------------------
154
def full_prediction(sequence):
    """Run the full pipeline for one sequence and return a text report.

    Steps: extract features, classify with the module-level ``model``, explain
    the classification with the module-level LIME ``explainer``, and, when the
    predicted class is 0 (reported as AMP), append MIC values obtained from
    ``predictmic``.

    Args:
        sequence: Raw sequence text as entered by the user.

    Returns:
        A multi-line ``str`` report. If feature extraction fails, the error
        message returned by ``extract_features`` is passed through unchanged.
    """
    features = extract_features(sequence)
    if isinstance(features, str):
        # extract_features signals failure by returning a message string.
        return features

    prediction = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]

    # Column of predict_proba that belongs to the predicted class; None when
    # it cannot be determined (e.g. the model exposes no classes_).
    class_index = None
    confidence_text = "Unknown"
    try:
        class_index = list(model.classes_).index(prediction)
        confidence_text = f"{round(probabilities[class_index] * 100, 2)}%"
    except (AttributeError, ValueError, IndexError, TypeError):
        class_index = None
        confidence_text = "Unknown"

    amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
    parts = [f"Prediction: {amp_result}\nConfidence: {confidence_text}\n"]

    # ---- LIME explanation (reported before the MIC section) ----
    # LIME explains label index 1 unless told otherwise; request the predicted
    # class explicitly so a positive weight always means "supports the
    # prediction shown above". Fall back to LIME's default when the class
    # index is unknown.
    lime_label = 1 if class_index is None else class_index
    try:
        explanation = explainer.explain_instance(
            data_row=features[0],  # the single input sequence
            predict_fn=model.predict_proba,
            labels=(lime_label,),
            num_features=10,
            num_samples=2000,  # perturbations drawn around this input
        )
        parts.append("\nTop Features Influencing Prediction (LIME):\n")
        for feat, weight in explanation.as_list(label=lime_label):
            parts.append(f"- {feat}: {round(weight, 4)}\n")
    except Exception as e:
        # Best effort: an explanation failure must not hide the prediction.
        parts.append(f"\nLIME explanation failed: {str(e)}\n")

    # ---- MIC (only for AMPs) ----
    if prediction == 0:
        mic_values = predictmic(sequence)
        parts.append("\nPredicted MIC Values (μM):\n")
        if isinstance(mic_values, dict):
            for org, mic in mic_values.items():
                parts.append(f"- {org}: {mic}\n")
        else:
            # The worker's last output line parsed as JSON but not as an
            # object; show it verbatim rather than failing on .items().
            parts.append(f"- {mic_values}\n")
    else:
        parts.append("\nMIC prediction skipped for Non-AMP sequences.\n")

    return "".join(parts)
195
 
196
 
197
+ # Gradio UI
198
  iface = gr.Interface(
199
  fn=full_prediction,
200
  inputs=gr.Textbox(label="Enter Protein Sequence"),