aedupuga's picture
Update app.py
cd23171 verified
import gradio as gr
import numpy as np
import pandas as pd
import scipy.sparse
import joblib
from huggingface_hub import hf_hub_download # Import hf_hub_download
# Custom CSS handed to gr.Blocks(css=...): intended to hide the default image label.
# NOTE(review): `svelte-10771d6` looks like a build-generated class name — presumably
# tied to a specific Gradio version; confirm it still matches after upgrades.
custom_css = """
.gradio-container label.svelte-10771d6 {
display: none !important;
}
"""
# Fetch the serialized model from the Hugging Face Hub and load it with joblib.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# point repo_id at a repository you trust.
repo_id = "aedupuga/multioutput-regression-models"
filename = "ridge_regression.joblib"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
# NOTE(review): the artifact is ridge_regression.joblib although the variable is
# named elastic_net_model; the name is what predict_features refers to.
elastic_net_model = joblib.load(model_path)
# One-hot encoder used to build the model's sequence features.
# NOTE: this is an example implementation; it must mirror the encoding used in the training code, so verify it against that code.
# Channel order is A, T, C, G. Any other symbol (e.g. 'N') encodes as all zeros.
_BASE_ENCODING = {
    'A': (1, 0, 0, 0),
    'T': (0, 1, 0, 0),
    'C': (0, 0, 1, 0),
    'G': (0, 0, 0, 1),
}
_UNKNOWN_BASE = (0, 0, 0, 0)


def one_hot_encode_sequence(sequence, target_size):
    """
    One-hot encode a DNA/RNA sequence into a flat vector of fixed length.

    Each base contributes four values in A, T, C, G channel order; bases are
    matched case-insensitively and unrecognised symbols contribute four zeros.
    The flattened vector is truncated or zero-padded to exactly `target_size`
    entries (truncation may cut through the middle of a base when
    `target_size` is not a multiple of four).

    Args:
        sequence: The input DNA/RNA sequence string.
        target_size: The desired length of the one-hot encoded vector.
            Values below zero are treated as zero.
    Returns:
        A list of ints of length `target_size`.
    """
    target_size = max(target_size, 0)
    # Only the first ceil(target_size / 4) bases can survive truncation, so
    # there is no point encoding the rest of a long sequence.
    bases_needed = -(-target_size // 4)
    flat_encoded_sequence = [
        bit
        for base in sequence[:bases_needed]
        # Upper-case per character so the position of each base never shifts.
        for bit in _BASE_ENCODING.get(base.upper(), _UNKNOWN_BASE)
    ]
    # Trim any overshoot from the last base, then zero-pad up to the target.
    del flat_encoded_sequence[target_size:]
    flat_encoded_sequence.extend([0] * (target_size - len(flat_encoded_sequence)))
    return flat_encoded_sequence
# Total number of input features the loaded model expects.
_MODEL_FEATURE_COUNT = 109658
# Leading numerical features (sequence length, GC fraction) that precede the
# one-hot block in the feature vector.
_NUMERICAL_FEATURE_COUNT = 2
# Number of predicted structural values shown in the interface.
_PREDICTED_OUTPUT_COUNT = 6


def _format_prediction(value):
    """Round a numeric prediction to one decimal place; anything else becomes "N/A"."""
    if isinstance(value, (int, float, np.number)):
        return round(value, 1)
    return "N/A"


def predict_features(sequence):
    """
    Compute sequence statistics and predict structural features for the UI.

    Whitespace (spaces, tabs, line breaks from a multi-line paste) is removed
    and the sequence is upper-cased before anything is computed, so it does
    not distort the length, the GC content, or the one-hot positions.

    Args:
        sequence: The input DNA/RNA sequence string (None is treated as empty).
    Returns:
        An 8-tuple in the order of the Gradio outputs:
        (length, GC content string, mfe_energy, num_pairs, stem_len_mean,
        num_stems, num_hairpins, num_internal_loops).
        The six predicted entries are rounded numbers, "N/A" when the input is
        empty or the model returns fewer values, or "Prediction Error" when
        the model call fails.
    """
    print(f"Received sequence in predict_features: {sequence}")

    # Drop all whitespace, then normalise case so lower-case input is accepted.
    sequence = "".join((sequence or "").split()).upper()
    if not sequence:
        # Keep the number of returned values equal to the Gradio outputs.
        return (0, "0 %") + ("N/A",) * _PREDICTED_OUTPUT_COUNT

    # 1. Length and GC content (sequence is non-empty here, so no zero division).
    sequence_length = len(sequence)
    gc_content = (sequence.count('G') + sequence.count('C')) / sequence_length
    gc_display = f"{gc_content * 100:.1f} %"

    # 2. One-hot encode and pad to the width left after the numerical features.
    encoded_sequence = one_hot_encode_sequence(
        sequence, _MODEL_FEATURE_COUNT - _NUMERICAL_FEATURE_COUNT
    )

    # 3. Single-row feature matrix: [length, gc_content, one-hot...].
    # Built directly as a float array instead of going through a very wide
    # DataFrame; the resulting values and dtype are the same.
    numerical_features = np.array(
        [[sequence_length, gc_content, *encoded_sequence]], dtype=float
    )

    # 4. Predict with the module-level model (expected to be multi-output).
    try:
        raw_prediction = elastic_net_model.predict(numerical_features)[0]
        # atleast_1d also copes with a single-output model returning a scalar.
        predicted_values = list(np.atleast_1d(raw_prediction))[:_PREDICTED_OUTPUT_COUNT]
        # Pad with None so missing outputs are shown as "N/A".
        predicted_values += [None] * (_PREDICTED_OUTPUT_COUNT - len(predicted_values))
        formatted = tuple(_format_prediction(value) for value in predicted_values)
    except Exception as e:
        # UI boundary: report the failure in the fields rather than crashing the app.
        print(f"Error during model prediction: {e}")
        print(f"Input shape to model: {numerical_features.shape}")
        return (sequence_length, gc_display) + ("Prediction Error",) * _PREDICTED_OUTPUT_COUNT

    # Order: mfe_energy, num_pairs, stem_len_mean, num_stems, num_hairpins,
    # num_internal_loops, matching the outputs list of the change event.
    return (sequence_length, gc_display) + formatted
# Build the Gradio interface: one input textbox, two computed statistics
# (length, GC content) and six predicted structural features, each with a short
# explanation and, for five of them, an example SVG.
# NOTE(review): several gr.Image calls below pass a non-integer `scale`
# (0.5, 0.1); Gradio documents `scale` as an integer — confirm intended sizing.
iface = gr.Blocks(theme="soft", css=custom_css) # applies the custom CSS defined earlier in this file
with iface:
    gr.Markdown("## Sequence Feature Predictor")
    gr.Markdown("Enter a DNA/RNA sequence to predict its features.")
    with gr.Row():
        sequence_input = gr.Textbox(label="Enter DNA/RNA Sequence", lines=5, placeholder="Paste DNA sequence here...")
    gr.Markdown("### Sequence Features:")
    with gr.Column():
        with gr.Row(): # Output box next to its explanation
            length_output = gr.Textbox(label="Sequence Length (base pairs)", scale=2) # scale=2 relative to the neighbouring Markdown
            gr.Markdown("*(e.g., A T G C G A T C G A -> 10 bases)*")
        with gr.Row(): # Output box next to its explanation
            gc_content_output = gr.Textbox(label="GC Content (%)", scale=2) # scale=2 relative to the neighbouring Markdown
            gr.Markdown("*(e.g., A T **G C G** A T **C G** A -> 50% GC content)*")
        gr.Markdown("#### Predicted Structural Features:")
        with gr.Row(): # Output box next to its explanation
            mfe_energy_output = gr.Textbox(label="Minimum Free Energy (kcal/mol)", lines=3, scale=2) # scale=2 relative to the neighbouring Markdown
            gr.Markdown("*(More negative MFE, more stable the structure)*")
        with gr.Row(): # First row of structural features (3 columns)
            with gr.Column(scale=1, min_width=75): # Number of Base Pairs
                num_pairs_output = gr.Textbox(label="Number of Base Pairs")
                gr.Markdown("_Example: Image below shows 6 Base Pairs_") # Caption sits above the image
                gr.Image(value="structure (6).svg", scale=0.5, width=50, height=100, show_label=False, image_mode="1") # only this image sets image_mode
            with gr.Column(scale=1, min_width=75): # Mean Stem Length
                stem_len_mean_output = gr.Textbox(label="Mean Stem Length")
                gr.Markdown("_Example: Image below shows Mean Stem Length 3.5 base pairs (bp)_") # Caption sits above the image
                gr.Image(value="structure (3).svg", scale=0.5, width=50, height=100, show_label=False)
            with gr.Column(scale=1, min_width=75): # Number of Stems
                num_stems_output = gr.Textbox(label="Number of Stems")
                gr.Markdown("_Example: Image below shows 3 Stems_") # Caption sits above the image
                gr.Image(value="structure (3.1).svg", scale=0.5, width=50, height=100,show_label=False)
        with gr.Row(): # Second row of structural features (2 columns)
            with gr.Column(scale=1, min_width=75): # Number of Hairpins
                num_hairpins_output = gr.Textbox(label="Number of Hairpins")
                gr.Markdown("_Example: Image below shows 3 Hairpins_") # Caption sits above the image
                gr.Image(value="structure (4).svg", scale=0.1, width=50, height=100,show_label=False)
            with gr.Column(scale=1, min_width=75): # Number of Internal Loops
                num_internal_loops_output = gr.Textbox(label="Number of Internal Loops")
                gr.Markdown("_Example: Image below shows 1 Internal Loop_") # Caption sits above the image
                gr.Image(value="structure (6.1).svg", scale=0.1, width=50, height=100, show_label=False)
    # Re-run predict_features whenever the textbox content changes; the outputs
    # list must stay in the same order as the tuple predict_features returns.
    sequence_input.change(
        fn=predict_features,
        inputs=sequence_input,
        outputs=[
            length_output,
            gc_content_output,
            mfe_energy_output,
            num_pairs_output,
            stem_len_mean_output,
            num_stems_output,
            num_hairpins_output,
            num_internal_loops_output,
        ]
    )
# Launch the app (the launch call is still present and required here).
iface.launch(debug=True)