Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -135,17 +135,16 @@ assert len(selected_features) == 343, f"Expected 343 features, got {len(selected
|
|
| 135 |
|
| 136 |
# ---------------------------------------------------------------------------
|
| 137 |
# LIME explainer - built ONCE at startup with uniform [0,1] background data.
|
| 138 |
-
#
|
| 139 |
-
#
|
| 140 |
-
#
|
|
|
|
| 141 |
# ---------------------------------------------------------------------------
|
| 142 |
_lime_background = np.random.rand(100, len(selected_features))
|
| 143 |
_explainer = LimeTabularExplainer(
|
| 144 |
training_data=_lime_background,
|
| 145 |
feature_names=selected_features,
|
| 146 |
-
#
|
| 147 |
-
# (same as the working old RF script: prediction==0 -> AMP)
|
| 148 |
-
class_names=["AMP", "Non-AMP"],
|
| 149 |
mode="classification"
|
| 150 |
)
|
| 151 |
|
|
@@ -153,10 +152,9 @@ _explainer = LimeTabularExplainer(
|
|
| 153 |
def keras_predict_proba(X):
|
| 154 |
"""Return [P(AMP), P(Non-AMP)] for LIME.
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
Columns must match class_names order: index 0 = AMP, index 1 = Non-AMP.
|
| 160 |
"""
|
| 161 |
amp_model, _ = get_amp_model()
|
| 162 |
preds = amp_model.predict(X, verbose=0)
|
|
@@ -175,42 +173,32 @@ def extract_features(sequence):
|
|
| 175 |
try:
|
| 176 |
_, amp_scaler = get_amp_model()
|
| 177 |
|
| 178 |
-
# Replicate the EXACT feature pool the scaler was fit on (1325 features)
|
| 179 |
-
#
|
| 180 |
|
| 181 |
-
|
| 182 |
-
ctd_features = CTD.CalculateCTD(sequence)
|
| 183 |
-
|
| 184 |
-
# 2. Dipeptide + AAC filtered to first 420 keys (same as old working script)
|
| 185 |
dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
|
| 186 |
filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
# 3. Autocorrelation features
|
| 189 |
-
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
|
| 190 |
-
|
| 191 |
-
# 4. Pseudo-AAC features
|
| 192 |
-
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
|
| 193 |
-
|
| 194 |
-
# Merge in training order
|
| 195 |
all_features_dict = {}
|
| 196 |
all_features_dict.update(ctd_features)
|
| 197 |
all_features_dict.update(filtered_dipeptide)
|
| 198 |
all_features_dict.update(auto_features)
|
| 199 |
all_features_dict.update(pseudo_features)
|
| 200 |
|
| 201 |
-
# Build full-pool DataFrame (~1325 columns)
|
| 202 |
feature_df_all = pd.DataFrame([all_features_dict])
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
#
|
| 205 |
-
scaled_array = amp_scaler.transform(feature_df_all.values)
|
| 206 |
-
scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
|
| 207 |
-
|
| 208 |
-
# Verify all 343 selected features survived
|
| 209 |
missing = [f for f in selected_features if f not in scaled_df.columns]
|
| 210 |
if missing:
|
| 211 |
return f"Error: Missing features after scaling: {missing[:5]}..."
|
| 212 |
|
| 213 |
-
# Select
|
| 214 |
selected_df = scaled_df[selected_features].fillna(0)
|
| 215 |
return selected_df.values.astype(np.float32)
|
| 216 |
|
|
@@ -233,7 +221,7 @@ def predictmic(sequence):
|
|
| 233 |
[sys.executable, "mic_worker.py", sequence],
|
| 234 |
capture_output=True,
|
| 235 |
text=True,
|
| 236 |
-
timeout=900
|
| 237 |
)
|
| 238 |
except subprocess.TimeoutExpired:
|
| 239 |
return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
|
|
@@ -266,27 +254,22 @@ def full_prediction(sequence):
|
|
| 266 |
raw_pred = amp_model.predict(features, verbose=0)
|
| 267 |
print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
|
| 268 |
|
| 269 |
-
# --
|
| 270 |
-
# FIX: sigmoid output = P(Non-AMP) because training labels were
|
| 271 |
-
# AMP=0, Non-AMP=1 (same convention as the working old RF script).
|
| 272 |
-
# ------------------------------------------------------------------
|
| 273 |
prob_non_amp = float(raw_pred.flatten()[0])
|
| 274 |
prob_amp = 1.0 - prob_non_amp
|
| 275 |
|
| 276 |
if prob_amp >= 0.5:
|
| 277 |
-
prediction = 0
|
| 278 |
confidence = round(prob_amp * 100, 2)
|
| 279 |
else:
|
| 280 |
-
prediction = 1
|
| 281 |
confidence = round(prob_non_amp * 100, 2)
|
| 282 |
|
| 283 |
amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
|
| 284 |
result = f"Prediction: {amp_result}\n"
|
| 285 |
result += f"Confidence: {confidence}%\n"
|
| 286 |
-
# Debug line - remove once you've verified on known sequences
|
| 287 |
-
result += f"[Debug] sigmoid={round(prob_non_amp,4)} | P(AMP)={round(prob_amp,4)} | P(Non-AMP)={round(prob_non_amp,4)}\n"
|
| 288 |
|
| 289 |
-
if prediction == 0:
|
| 290 |
print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
|
| 291 |
mic_values = predictmic(sequence)
|
| 292 |
print("[CHECKPOINT] MIC done:", mic_values, flush=True)
|
|
@@ -296,16 +279,25 @@ def full_prediction(sequence):
|
|
| 296 |
else:
|
| 297 |
result += "\nMIC prediction skipped for Non-AMP sequences.\n"
|
| 298 |
|
| 299 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
try:
|
| 301 |
explanation = _explainer.explain_instance(
|
| 302 |
data_row=features[0],
|
| 303 |
predict_fn=keras_predict_proba,
|
| 304 |
-
num_features=10
|
|
|
|
| 305 |
)
|
| 306 |
-
result += "\nTop Features Influencing
|
| 307 |
-
for feat, weight in explanation.as_list():
|
| 308 |
-
|
|
|
|
| 309 |
except Exception as e:
|
| 310 |
result += f"\nLIME explanation failed: {str(e)}\n"
|
| 311 |
|
|
|
|
| 135 |
|
| 136 |
# ---------------------------------------------------------------------------
|
| 137 |
# LIME explainer - built ONCE at startup with uniform [0,1] background data.
|
| 138 |
+
# Valid because all features are MinMax-scaled to [0,1].
|
| 139 |
+
# class_names: index 0 = AMP, index 1 = Non-AMP (matches training: AMP=0, Non-AMP=1)
|
| 140 |
+
# We always explain label=0 (AMP class) so weights are consistent across all
|
| 141 |
+
# sequences - positive weight = pushes TOWARD AMP, negative = pushes AWAY.
|
| 142 |
# ---------------------------------------------------------------------------
|
| 143 |
_lime_background = np.random.rand(100, len(selected_features))
|
| 144 |
_explainer = LimeTabularExplainer(
|
| 145 |
training_data=_lime_background,
|
| 146 |
feature_names=selected_features,
|
| 147 |
+
class_names=["AMP", "Non-AMP"], # index 0=AMP, index 1=Non-AMP
|
|
|
|
|
|
|
| 148 |
mode="classification"
|
| 149 |
)
|
| 150 |
|
|
|
|
| 152 |
def keras_predict_proba(X):
|
| 153 |
"""Return [P(AMP), P(Non-AMP)] for LIME.
|
| 154 |
|
| 155 |
+
Training labels: AMP=0, Non-AMP=1.
|
| 156 |
+
Sigmoid output = P(Non-AMP=1), so P(AMP) = 1 - sigmoid.
|
| 157 |
+
Column order must match class_names: col0=P(AMP), col1=P(Non-AMP).
|
|
|
|
| 158 |
"""
|
| 159 |
amp_model, _ = get_amp_model()
|
| 160 |
preds = amp_model.predict(X, verbose=0)
|
|
|
|
| 173 |
try:
|
| 174 |
_, amp_scaler = get_amp_model()
|
| 175 |
|
| 176 |
+
# Replicate the EXACT feature pool the scaler was fit on (1325 features).
|
| 177 |
+
# Merge order must match training: CTD -> dipeptide(420) -> autocorr -> pseudoAAC
|
| 178 |
|
| 179 |
+
ctd_features = CTD.CalculateCTD(sequence)
|
|
|
|
|
|
|
|
|
|
| 180 |
dipeptide_features = AAComposition.CalculateAADipeptideComposition(sequence)
|
| 181 |
filtered_dipeptide = {k: dipeptide_features[k] for k in list(dipeptide_features.keys())[:420]}
|
| 182 |
+
auto_features = Autocorrelation.CalculateAutoTotal(sequence)
|
| 183 |
+
pseudo_features = PseudoAAC.GetAPseudoAAC(sequence, lamda=9)
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
all_features_dict = {}
|
| 186 |
all_features_dict.update(ctd_features)
|
| 187 |
all_features_dict.update(filtered_dipeptide)
|
| 188 |
all_features_dict.update(auto_features)
|
| 189 |
all_features_dict.update(pseudo_features)
|
| 190 |
|
| 191 |
+
# Build full-pool DataFrame (~1325 columns) and scale
|
| 192 |
feature_df_all = pd.DataFrame([all_features_dict])
|
| 193 |
+
scaled_array = amp_scaler.transform(feature_df_all.values)
|
| 194 |
+
scaled_df = pd.DataFrame(scaled_array, columns=feature_df_all.columns)
|
| 195 |
|
| 196 |
+
# Verify all 343 selected features are present
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
missing = [f for f in selected_features if f not in scaled_df.columns]
|
| 198 |
if missing:
|
| 199 |
return f"Error: Missing features after scaling: {missing[:5]}..."
|
| 200 |
|
| 201 |
+
# Select 343 features in model training order
|
| 202 |
selected_df = scaled_df[selected_features].fillna(0)
|
| 203 |
return selected_df.values.astype(np.float32)
|
| 204 |
|
|
|
|
| 221 |
[sys.executable, "mic_worker.py", sequence],
|
| 222 |
capture_output=True,
|
| 223 |
text=True,
|
| 224 |
+
timeout=900
|
| 225 |
)
|
| 226 |
except subprocess.TimeoutExpired:
|
| 227 |
return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
|
|
|
|
| 254 |
raw_pred = amp_model.predict(features, verbose=0)
|
| 255 |
print("[CHECKPOINT] raw sigmoid output:", raw_pred, flush=True)
|
| 256 |
|
| 257 |
+
# sigmoid output = P(Non-AMP) - training labels were AMP=0, Non-AMP=1
|
|
|
|
|
|
|
|
|
|
| 258 |
prob_non_amp = float(raw_pred.flatten()[0])
|
| 259 |
prob_amp = 1.0 - prob_non_amp
|
| 260 |
|
| 261 |
if prob_amp >= 0.5:
|
| 262 |
+
prediction = 0 # AMP (class 0)
|
| 263 |
confidence = round(prob_amp * 100, 2)
|
| 264 |
else:
|
| 265 |
+
prediction = 1 # Non-AMP (class 1)
|
| 266 |
confidence = round(prob_non_amp * 100, 2)
|
| 267 |
|
| 268 |
amp_result = "Antimicrobial Peptide (AMP)" if prediction == 0 else "Non-AMP"
|
| 269 |
result = f"Prediction: {amp_result}\n"
|
| 270 |
result += f"Confidence: {confidence}%\n"
|
|
|
|
|
|
|
| 271 |
|
| 272 |
+
if prediction == 0: # AMP β run MIC
|
| 273 |
print("[CHECKPOINT] AMP detected, starting MIC (ProtBert)...", flush=True)
|
| 274 |
mic_values = predictmic(sequence)
|
| 275 |
print("[CHECKPOINT] MIC done:", mic_values, flush=True)
|
|
|
|
| 279 |
else:
|
| 280 |
result += "\nMIC prediction skipped for Non-AMP sequences.\n"
|
| 281 |
|
| 282 |
+
# ------------------------------------------------------------------
|
| 283 |
+
# LIME - always explains class 0 (AMP) so weights are consistent:
|
| 284 |
+
# weight > 0 -> feature pushes TOWARD AMP classification
|
| 285 |
+
# weight < 0 -> feature pushes AWAY from AMP classification
|
| 286 |
+
# This is meaningful for both AMPs and Non-AMPs:
|
| 287 |
+
# AMP sequence -> top positive weights explain why it's an AMP
|
| 288 |
+
# Non-AMP sequence -> top negative weights explain why it's NOT an AMP
|
| 289 |
+
# ------------------------------------------------------------------
|
| 290 |
try:
|
| 291 |
explanation = _explainer.explain_instance(
|
| 292 |
data_row=features[0],
|
| 293 |
predict_fn=keras_predict_proba,
|
| 294 |
+
num_features=10,
|
| 295 |
+
labels=(0,) # always explain AMP class (index 0)
|
| 296 |
)
|
| 297 |
+
result += "\nTop Features Influencing AMP Classification:\n"
|
| 298 |
+
for feat, weight in explanation.as_list(label=0):
|
| 299 |
+
direction = "β AMP" if weight > 0 else "β AMP"
|
| 300 |
+
result += f"- {feat}: {round(weight, 4)} ({direction})\n"
|
| 301 |
except Exception as e:
|
| 302 |
result += f"\nLIME explanation failed: {str(e)}\n"
|
| 303 |
|