"""Gradio app that predicts structural features of a DNA/RNA sequence.

The app downloads a pre-trained multi-output regression model from the
Hugging Face Hub, turns the user's sequence into the feature row the model
expects (length, GC content, padded one-hot encoding) and shows the
predicted structural features next to small example images.
"""

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import scipy.sparse
from huggingface_hub import hf_hub_download

# Custom CSS that hides the default image label.
custom_css = """
.gradio-container label.svelte-10771d6 {
    display: none !important;
}
"""

# Download and load the model using joblib.
repo_id = "aedupuga/multioutput-regression-models"
filename = "ridge_regression.joblib"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
# NOTE(review): the variable is called "elastic_net_model" but the file that
# is loaded is "ridge_regression.joblib"; the name is kept for compatibility.
elastic_net_model = joblib.load(model_path)

# Width of the feature row passed to the model: the numerical features
# (length, GC content) followed by the flattened one-hot encoding.
TOTAL_MODEL_FEATURES = 109658
NUM_NUMERICAL_FEATURES = 2
ONE_HOT_FEATURES = TOTAL_MODEL_FEATURES - NUM_NUMERICAL_FEATURES

# Number of structural features shown in the UI (MFE, pairs, mean stem
# length, stems, hairpins, internal loops).
NUM_PREDICTED_FEATURES = 6

_BITS_PER_BASE = 4
_UNKNOWN_BASE = [0, 0, 0, 0]
# NOTE(review): 'U' (RNA) is not in the table and therefore encodes as all
# zeros, like any other unknown character — confirm against the training code
# whether RNA input should be mapped to 'T' first.
_BASE_ENCODING = {
    "A": [1, 0, 0, 0],
    "T": [0, 1, 0, 0],
    "C": [0, 0, 1, 0],
    "G": [0, 0, 0, 1],
    "N": [0, 0, 0, 0],
}


def one_hot_encode_sequence(sequence, target_size):
    """One-hot encode a sequence and pad/truncate it to ``target_size``.

    Each base becomes four values (A, T, C, G order); 'N' and any character
    not in the table become four zeros. The lookup is case-sensitive, so
    callers are expected to pass an upper-case sequence.

    Args:
        sequence: The input DNA/RNA sequence string.
        target_size: The desired length of the flattened encoding.

    Returns:
        A list of exactly ``target_size`` integers (empty when
        ``target_size`` is not positive).
    """
    if target_size <= 0:
        return []

    # Only the first ceil(target_size / 4) bases can contribute to the
    # output, so very long inputs are not encoded in full just to be cut.
    max_bases = -(-target_size // _BITS_PER_BASE)
    flat = [
        bit
        for base in sequence[:max_bases]
        for bit in _BASE_ENCODING.get(base, _UNKNOWN_BASE)
    ]

    if len(flat) < target_size:
        flat.extend([0] * (target_size - len(flat)))
    else:
        del flat[target_size:]
    return flat


def _normalize_sequence(sequence):
    """Return ``sequence`` upper-cased with all whitespace removed.

    Pasted sequences often contain spaces or line breaks; counting them as
    bases would inflate the length, skew the GC content and shift the one-hot
    positions. ``None`` is treated as an empty sequence.
    """
    return "".join((sequence or "").split()).upper()


def _build_feature_row(sequence, sequence_length, gc_content):
    """Build the dense ``(1, TOTAL_MODEL_FEATURES)`` float row for the model.

    Column 0 is the length, column 1 the GC fraction, and the remaining
    columns hold the padded one-hot encoding.
    """
    row = np.zeros((1, TOTAL_MODEL_FEATURES), dtype=np.float64)
    row[0, 0] = sequence_length
    row[0, 1] = gc_content
    row[0, NUM_NUMERICAL_FEATURES:] = one_hot_encode_sequence(
        sequence, ONE_HOT_FEATURES
    )
    return row


def _format_prediction(value):
    """Round a numeric prediction to one decimal, or return "N/A"."""
    if isinstance(value, (int, float, np.number)):
        return round(value, 1)
    return "N/A"


def predict_features(sequence):
    """Compute and predict the features displayed for ``sequence``.

    Args:
        sequence: The input DNA/RNA sequence string (any case; whitespace is
            ignored).

    Returns:
        An 8-tuple in the order of the Gradio outputs: sequence length,
        GC content (formatted percentage), minimum free energy, number of
        base pairs, mean stem length, number of stems, number of hairpins and
        number of internal loops. The six predicted entries are "N/A" for an
        empty input or a missing model output, and "Prediction Error" when
        the model call fails.
    """
    print(f"Received sequence in predict_features: {sequence}")

    sequence = _normalize_sequence(sequence)
    if not sequence:
        # The number of returned values must match the Gradio outputs.
        return (0, "0 %") + ("N/A",) * NUM_PREDICTED_FEATURES

    # 1. Length and GC content are computed directly from the sequence.
    sequence_length = len(sequence)
    gc_content = (sequence.count("G") + sequence.count("C")) / sequence_length
    gc_display = f"{gc_content * 100:.1f} %"

    # 2. Numerical features followed by the padded one-hot encoding.
    # NOTE(review): the column order (length, GC content, one-hot) is assumed
    # to match the training data — confirm against the training code.
    model_input = _build_feature_row(sequence, sequence_length, gc_content)

    # 3. Predict with the loaded multi-output regression model.
    try:
        # atleast_1d keeps a single-output (scalar) prediction usable.
        predictions = np.atleast_1d(elastic_net_model.predict(model_input)[0])
    except Exception as e:
        print(f"Error during model prediction: {e}")
        print(f"Input shape to model: {model_input.shape}")
        return (sequence_length, gc_display) + (
            "Prediction Error",
        ) * NUM_PREDICTED_FEATURES

    # Missing model outputs are padded with None and shown as "N/A".
    values = list(predictions[:NUM_PREDICTED_FEATURES])
    values.extend([None] * (NUM_PREDICTED_FEATURES - len(values)))

    # The order must match the outputs list of sequence_input.change below.
    return (
        sequence_length,
        gc_display,
        *(_format_prediction(value) for value in values),
    )


def _feature_column(label, caption, image_file, **image_kwargs):
    """Add a column with a result textbox, a caption and an example image.

    Must be called inside an active ``gr.Row``. Returns the textbox so it can
    be wired up as an event output.
    """
    with gr.Column(scale=1, min_width=75):
        textbox = gr.Textbox(label=label)
        gr.Markdown(caption)
        gr.Image(
            value=image_file,
            width=50,
            height=100,
            show_label=False,
            **image_kwargs,
        )
    return textbox


# Create the Gradio interface.
iface = gr.Blocks(theme="soft", css=custom_css)

with iface:
    gr.Markdown("## Sequence Feature Predictor")
    gr.Markdown("Enter a DNA/RNA sequence to predict its features.")

    with gr.Row():
        sequence_input = gr.Textbox(
            label="Enter DNA/RNA Sequence",
            lines=5,
            placeholder="Paste DNA sequence here...",
        )

    gr.Markdown("### Sequence Features:")

    with gr.Column():
        with gr.Row():  # Grouping label and explanation
            length_output = gr.Textbox(
                label="Sequence Length (base pairs)", scale=2
            )
            gr.Markdown("*(e.g., A T G C G A T C G A -> 10 bases)*")
        with gr.Row():  # Grouping label and explanation
            gc_content_output = gr.Textbox(label="GC Content (%)", scale=2)
            gr.Markdown(
                "*(e.g., A T **G C G** A T **C G** A -> 50% GC content)*"
            )

    gr.Markdown("#### Predicted Structural Features:")

    with gr.Row():  # Grouping label and explanation
        mfe_energy_output = gr.Textbox(
            label="Minimum Free Energy (kcal/mol)", lines=3, scale=2
        )
        gr.Markdown("*(More negative MFE, more stable the structure)*")

    # NOTE(review): Gradio documents `scale` as an integer; the fractional
    # values below are kept as they were — confirm they render as intended.
    # NOTE(review): only the first image sets image_mode="1" — confirm whether
    # that is intentional.
    with gr.Row():  # First row of structural features (3 items)
        num_pairs_output = _feature_column(
            "Number of Base Pairs",
            "_Example: Image below shows 6 Base Pairs_",
            "structure (6).svg",
            scale=0.5,
            image_mode="1",
        )
        stem_len_mean_output = _feature_column(
            "Mean Stem Length",
            "_Example: Image below shows Mean Stem Length 3.5 base pairs (bp)_",
            "structure (3).svg",
            scale=0.5,
        )
        num_stems_output = _feature_column(
            "Number of Stems",
            "_Example: Image below shows 3 Stems_",
            "structure (3.1).svg",
            scale=0.5,
        )

    with gr.Row():  # Second row of structural features (2 items)
        num_hairpins_output = _feature_column(
            "Number of Hairpins",
            "_Example: Image below shows 3 Hairpins_",
            "structure (4).svg",
            scale=0.1,
        )
        num_internal_loops_output = _feature_column(
            "Number of Internal Loops",
            "_Example: Image below shows 1 Internal Loop_",
            "structure (6.1).svg",
            scale=0.1,
        )

    # Re-run the prediction whenever the text changes; the outputs order must
    # match the tuple returned by predict_features.
    sequence_input.change(
        fn=predict_features,
        inputs=sequence_input,
        outputs=[
            length_output,
            gc_content_output,
            mfe_energy_output,
            num_pairs_output,
            stem_len_mean_output,
            num_stems_output,
            num_hairpins_output,
            num_internal_loops_output,
        ],
    )

# NOTE(review): an earlier comment claimed the launch call had been removed
# because the Space environment handles it, but the call was still present;
# it is kept here so the script's behavior is unchanged.
iface.launch(debug=True)