nonzeroexit committed on
Commit
03f381c
·
verified ·
1 Parent(s): caf966d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -10
app.py CHANGED
@@ -33,7 +33,7 @@ def get_amp_model():
33
  global _amp_model, _amp_scaler
34
  if _amp_model is None:
35
  from tensorflow.keras.models import load_model
36
- _amp_model = load_model("Comb1_aac_ctd_RFE_selected_features_model.keras")
37
  _amp_scaler = joblib.load("norm (4).joblib")
38
  return _amp_model, _amp_scaler
39
 
@@ -149,7 +149,7 @@ def keras_predict_proba(X):
149
 
150
 
151
  def extract_features(sequence):
152
- """Compute CTD + AAC, select the 343 training columns IN ORDER, then scale."""
153
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
154
  if len(sequence) < 10:
155
  return "Error: Sequence too short."
@@ -161,26 +161,34 @@ def extract_features(sequence):
161
  ctd_features = CTD.CalculateCTD(sequence)
162
  aac = AAComposition.CalculateAADipeptideComposition(sequence)
163
 
164
- # Merge everything into one lookup dict
165
  pool = {}
166
  pool.update(ctd_features)
167
  pool.update(aac)
168
 
169
- # Verify all needed features are present
170
  missing = [f for f in selected_features if f not in pool]
171
  if missing:
172
  return f"Error: Missing features from propy: {missing[:5]}..."
173
 
174
- # Build the 343-wide row IN THE EXACT TRAINING ORDER, THEN scale.
175
- ordered_values = [pool[f] for f in selected_features]
176
- feature_row = np.array(ordered_values, dtype=np.float64).reshape(1, -1)
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- scaled = amp_scaler.transform(feature_row) # scaler expects exactly 343 cols
179
- return scaled.astype(np.float32)
180
  except Exception as e:
181
  return f"Error in feature extraction: {str(e)}"
182
 
183
-
184
  def predictmic(sequence):
185
  """Run MIC prediction in a SEPARATE process (mic_worker.py).
186
 
 
33
  global _amp_model, _amp_scaler
34
  if _amp_model is None:
35
  from tensorflow.keras.models import load_model
36
+ _amp_model = load_model("Comb1_aac_ctd_RFE_selected_features_model(1).keras")
37
  _amp_scaler = joblib.load("norm (4).joblib")
38
  return _amp_model, _amp_scaler
39
 
 
149
 
150
 
151
  def extract_features(sequence):
152
+ """Compute CTD + AAC, scale the FULL feature pool, then select the 343 training columns IN ORDER."""
153
  sequence = ''.join([aa for aa in sequence.upper() if aa in "ACDEFGHIKLMNPQRSTVWY"])
154
  if len(sequence) < 10:
155
  return "Error: Sequence too short."
 
161
  ctd_features = CTD.CalculateCTD(sequence)
162
  aac = AAComposition.CalculateAADipeptideComposition(sequence)
163
 
 
164
  pool = {}
165
  pool.update(ctd_features)
166
  pool.update(aac)
167
 
168
+ # Verify all needed features are present BEFORE doing anything
169
  missing = [f for f in selected_features if f not in pool]
170
  if missing:
171
  return f"Error: Missing features from propy: {missing[:5]}..."
172
 
173
+ # 1. Build a row from ALL pool keys, in a consistent order
174
+ all_feature_names = sorted(pool.keys())
175
+ all_values = np.array([pool[f] for f in all_feature_names], dtype=np.float64).reshape(1, -1)
176
+
177
+ # 2. Scale the FULL feature row
178
+ # NOTE: amp_scaler must have been fit on this same full pool.
179
+ # If your scaler was fit on only 343 cols, re-fit it on the full pool first.
180
+ scaled_all = amp_scaler.transform(all_values)
181
+
182
+ # 3. Select the 343 features IN TRAINING ORDER from the scaled array
183
+ name_to_idx = {name: i for i, name in enumerate(all_feature_names)}
184
+ selected_indices = [name_to_idx[f] for f in selected_features]
185
+ scaled_selected = scaled_all[:, selected_indices]
186
+
187
+ return scaled_selected.astype(np.float32)
188
 
 
 
189
  except Exception as e:
190
  return f"Error in feature extraction: {str(e)}"
191
 
 
192
  def predictmic(sequence):
193
  """Run MIC prediction in a SEPARATE process (mic_worker.py).
194