AMP-Classifier2

Sleeping

App Files Files Community

nonzeroexit committed on May 21

Commit

12675f2

verified ·

1 Parent(s): 769f73d

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -57

app.py CHANGED Viewed

@@ -15,18 +15,18 @@ import joblib
 import numpy as np
 import pandas as pd
 from propy import AAComposition, CTD
-from math import expm1
 # ---------------------------------------------------------------------------
 # LAZY LOADING — keeps the free 16GB Space from OOM-ing at startup.
-# Heavy libs (TF, torch, ProtBert) load only when first needed.
 # ---------------------------------------------------------------------------
 _amp_model = None
 _amp_scaler = None
-_protbert_tokenizer = None
-_protbert_model = None
-_torch = None
-_device = None
 def get_amp_model():
@@ -38,25 +38,6 @@ def get_amp_model():
     return _amp_model, _amp_scaler
-def get_protbert():
-    global _protbert_tokenizer, _protbert_model, _torch, _device
-    if _protbert_model is None:
-        import torch
-        from transformers import BertTokenizer, BertModel
-        try:
-            torch.set_num_threads(1)  # reduce native threading conflicts with TF
-        except Exception:
-            pass
-        _torch = torch
-        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        _protbert_tokenizer = BertTokenizer.from_pretrained(
-            "Rostlab/prot_bert", do_lower_case=False
-        )
-        _protbert_model = BertModel.from_pretrained("Rostlab/prot_bert")
-        _protbert_model = _protbert_model.to(_device).eval()
-    return _protbert_tokenizer, _protbert_model, _torch, _device
 # ---------------------------------------------------------------------------
 # The EXACT 343 features the scaler was fit on, IN THE EXACT TRAINING ORDER.
 # The scaler was fit on a numpy array (no stored names), so order is critical:
@@ -201,42 +182,43 @@ def extract_features(sequence):
 def predictmic(sequence):
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
-    tokenizer, protbert_model, torch, device = get_protbert()
-    seq_spaced = ' '.join(list(sequence))
-    tokens = tokenizer(seq_spaced, return_tensors="pt", padding='max_length',
-                       truncation=True, max_length=512)
-    tokens = {k: v.to(device) for k, v in tokens.items()}
-    with torch.no_grad():
-        outputs = protbert_model(**tokens)
-        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().reshape(1, -1)
-    bacteria_config = {
-        "E.coli": {"model": "coli_xgboost_model.pkl", "scaler": "coli_scaler.pkl", "pca": None},
-        "S.aureus": {"model": "aur_xgboost_model.pkl", "scaler": "aur_scaler.pkl", "pca": None},
-        "P.aeruginosa": {"model": "arg_xgboost_model.pkl", "scaler": "arg_scaler.pkl", "pca": None},
-        "K.Pneumonia": {"model": "pne_mlp_model.pkl", "scaler": "pne_scaler.pkl", "pca": "pne_pca.pkl"}
-    }
-    mic_results = {}
-    for bacterium, cfg in bacteria_config.items():
-        try:
-            mic_scaler = joblib.load(cfg["scaler"])
-            scaled = mic_scaler.transform(embedding)
-            transformed = joblib.load(cfg["pca"]).transform(scaled) if cfg["pca"] else scaled
-            mic_model = joblib.load(cfg["model"])
-            mic_log = mic_model.predict(transformed)[0]
-            mic = round(expm1(mic_log), 3)
-            mic_results[bacterium] = mic
-        except Exception as e:
-            mic_results[bacterium] = f"Error: {str(e)}"
-    return mic_results
 def full_prediction(sequence):

 import numpy as np
 import pandas as pd
 from propy import AAComposition, CTD
+import sys
+import json
+import subprocess
 # ---------------------------------------------------------------------------
 # LAZY LOADING — keeps the free 16GB Space from OOM-ing at startup.
+# Only the TensorFlow AMP model is loaded in THIS process. ProtBert/PyTorch
+# run in a SEPARATE process (mic_worker.py) to avoid a native-library clash
+# between TensorFlow and PyTorch that caused SIGSEGV (exit 139).
 # ---------------------------------------------------------------------------
 _amp_model = None
 _amp_scaler = None
 def get_amp_model():
     return _amp_model, _amp_scaler
 # ---------------------------------------------------------------------------
 # The EXACT 343 features the scaler was fit on, IN THE EXACT TRAINING ORDER.
 # The scaler was fit on a numpy array (no stored names), so order is critical:
 def predictmic(sequence):
+    """Run MIC prediction in a SEPARATE process (mic_worker.py).
+    This isolates PyTorch/ProtBert from TensorFlow, preventing the native-library
+    crash (exit 139) and keeping peak memory low. The worker prints a JSON dict on
+    its last stdout line; we parse and return it.
+    """
     sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
     if len(sequence) < 10:
         return {"Error": "Sequence too short or invalid."}
+    try:
+        # First run downloads ProtBert (~1.6GB), so allow a generous timeout.
+        proc = subprocess.run(
+            [sys.executable, "mic_worker.py", sequence],
+            capture_output=True,
+            text=True,
+            timeout=900  # 15 minutes; mostly for the one-time model download
+        )
+    except subprocess.TimeoutExpired:
+        return {"Error": "MIC prediction timed out (model download may still be in progress; try again shortly)."}
+    except Exception as e:
+        return {"Error": f"Failed to start MIC worker: {str(e)}"}
+    if proc.returncode != 0:
+        # Worker crashed; surface stderr tail for debugging
+        tail = (proc.stderr or "").strip().splitlines()[-3:]
+        return {"Error": f"MIC worker exited with code {proc.returncode}. {' '.join(tail)}"}
+    # Parse the last non-empty stdout line as JSON
+    out_lines = [ln for ln in (proc.stdout or "").splitlines() if ln.strip()]
+    if not out_lines:
+        return {"Error": "MIC worker produced no output."}
+    try:
+        return json.loads(out_lines[-1])
+    except Exception:
+        return {"Error": f"Could not parse MIC worker output: {out_lines[-1][:200]}"}
 def full_prediction(sequence):