| import gradio as gr |
| import numpy as np |
| import pandas as pd |
| import scipy.sparse |
| import joblib |
| from huggingface_hub import hf_hub_download |
|
|
| |
# Extra CSS passed to gr.Blocks(css=...) when the interface is built below.
# The single rule hides <label> elements inside the Gradio container that carry
# the `svelte-10771d6` class.
# NOTE(review): `svelte-10771d6` looks like a build-generated class name, so the
# selector may stop matching after a Gradio upgrade -- confirm against the
# installed version.
custom_css = """
.gradio-container label.svelte-10771d6 {
display: none !important;
}
"""
|
|
| |
# Download the serialized model file from the Hugging Face Hub repository and
# deserialize it once, at import time. predict_features() reads the resulting
# module-level `elastic_net_model`.
# NOTE(review): joblib.load unpickles the downloaded file, and unpickling can
# execute arbitrary code -- only point `repo_id` at a repository you trust.
# NOTE(review): the artifact is called "ridge_regression.joblib" while the
# variable is named `elastic_net_model`; confirm which estimator this really is.
repo_id = "aedupuga/multioutput-regression-models"
filename = "ridge_regression.joblib"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
elastic_net_model = joblib.load(model_path)
|
|
| |
| |
def one_hot_encode_sequence(sequence, target_size):
    """One-hot encode a nucleotide sequence into a flat list of fixed length.

    Each base contributes four consecutive values in the order A, T, C, G.
    Any other character (including 'N') contributes four zeros. Lookup is
    case-insensitive, so "acgt" encodes the same as "ACGT".

    Args:
        sequence: The input sequence string.
        target_size: The exact length of the returned list. Shorter encodings
            are right-padded with zeros; longer ones are truncated.

    Returns:
        A flat list of ints (0/1) with exactly ``target_size`` elements.

    Raises:
        ValueError: If ``target_size`` is negative.
    """
    if target_size < 0:
        raise ValueError("target_size must be non-negative")

    # NOTE(review): 'U' is not mapped, so RNA uracil encodes as all zeros --
    # confirm this matches the encoding the model was trained with.
    one_hot = {
        "A": (1, 0, 0, 0),
        "T": (0, 1, 0, 0),
        "C": (0, 0, 1, 0),
        "G": (0, 0, 0, 1),
    }
    unknown = (0, 0, 0, 0)

    # Every base yields four values, so only the first ceil(target_size / 4)
    # bases can survive truncation; skip encoding the rest.
    usable_bases = -(-target_size // 4)

    # Upper-case per character (not the whole string) so that characters whose
    # upper-case form is longer than one code point cannot shift later bases.
    flat = [
        bit
        for base in sequence[:usable_bases]
        for bit in one_hot.get(base.upper(), unknown)
    ]

    shortfall = target_size - len(flat)
    if shortfall > 0:
        flat.extend([0] * shortfall)
    else:
        del flat[target_size:]

    return flat
|
|
|
|
# Number of model outputs shown in the UI (MFE, pairs, mean stem length,
# stems, hairpins, internal loops).
_NUM_PREDICTED_FEATURES = 6

# Length of the one-hot part of the feature vector: 109658 columns in total,
# minus the two numeric columns (length, GC content) placed in front of it.
_ONE_HOT_SIZE = 109658 - 2


def _format_prediction(value):
    """Round a numeric prediction to one decimal place, else return "N/A"."""
    if isinstance(value, (int, float, np.number)):
        return round(value, 1)
    return "N/A"


def predict_features(sequence):
    """Compute basic statistics and model-predicted features for a sequence.

    Whitespace is removed and the sequence is upper-cased before anything is
    computed, so pasted multi-line or space-separated input counts bases only.
    The feature vector fed to the module-level ``elastic_net_model`` is
    ``[length, gc_content, *one_hot]`` as a single float row.

    Args:
        sequence: The input DNA/RNA sequence string (``None`` is treated as
            empty).

    Returns:
        A tuple of eight values, in the order expected by the UI outputs:
        ``(length, "<gc> %", mfe_energy, num_pairs, stem_len_mean, num_stems,
        num_hairpins, num_internal_loops)``. The six predicted entries are
        numbers rounded to one decimal, ``"N/A"`` when the sequence is empty
        or the model returned fewer outputs, or ``"Prediction Error"`` when
        the model call raised.
    """
    print(f"Received sequence in predict_features: {sequence}")

    # Strip every whitespace character: the textbox is multi-line and the UI's
    # own example shows space-separated bases.
    sequence = "".join((sequence or "").split()).upper()

    if not sequence:
        return (0, "0 %") + ("N/A",) * _NUM_PREDICTED_FEATURES

    sequence_length = len(sequence)
    gc_content = (sequence.count("G") + sequence.count("C")) / sequence_length
    gc_display = f"{gc_content * 100:.1f} %"

    encoded_sequence = one_hot_encode_sequence(sequence, _ONE_HOT_SIZE)

    # One float row: the two numeric features followed by the one-hot values.
    numerical_features = np.array(
        [[sequence_length, gc_content, *encoded_sequence]], dtype=float
    )

    # Top-level UI boundary: any model failure is reported in the output
    # fields instead of propagating into Gradio.
    try:
        # ravel() also copes with a model that returns a single scalar output.
        raw_values = np.ravel(elastic_net_model.predict(numerical_features)[0])
    except Exception as e:
        print(f"Error during model prediction: {e}")
        print(f"Input shape to model: {numerical_features.shape}")
        return (sequence_length, gc_display) + (
            "Prediction Error",
        ) * _NUM_PREDICTED_FEATURES

    # Use the first six outputs; pad with None (rendered as "N/A") if the model
    # produced fewer.
    values = list(raw_values[:_NUM_PREDICTED_FEATURES])
    values += [None] * (_NUM_PREDICTED_FEATURES - len(values))

    return (
        sequence_length,
        gc_display,
        *(_format_prediction(value) for value in values),
    )
|
|
| |
def _example_column(label, caption, image_path, **image_kwargs):
    """Add one column with an output textbox, a caption and an example image.

    Components attach to whatever Gradio layout context is open at call time,
    so this is meant to be called inside a ``with gr.Row():`` block.

    Args:
        label: Label of the output textbox.
        caption: Markdown text shown under the textbox.
        image_path: Path of the example image file.
        **image_kwargs: Extra keyword arguments forwarded to ``gr.Image``.

    Returns:
        The created ``gr.Textbox`` so it can be wired as an event output.
    """
    with gr.Column(scale=1, min_width=75):
        output_box = gr.Textbox(label=label)
        gr.Markdown(caption)
        # No `scale` here: the previous values (0.5 / 0.1) were non-integer,
        # which Gradio warns about, and the image sits in a Column anyway.
        gr.Image(
            value=image_path,
            width=50,
            height=100,
            show_label=False,
            **image_kwargs,
        )
    return output_box


# Build the page: one input textbox on top, then read-only output fields for
# the computed statistics and the model-predicted structural features.
iface = gr.Blocks(theme="soft", css=custom_css)
with iface:
    gr.Markdown("## Sequence Feature Predictor")
    gr.Markdown("Enter a DNA/RNA sequence to predict its features.")

    with gr.Row():
        sequence_input = gr.Textbox(
            label="Enter DNA/RNA Sequence",
            lines=5,
            placeholder="Paste DNA sequence here...",
        )

    gr.Markdown("### Sequence Features:")

    # NOTE(review): all feature rows are placed inside this Column; confirm
    # that this is the intended nesting.
    with gr.Column():
        with gr.Row():
            length_output = gr.Textbox(label="Sequence Length (base pairs)", scale=2)
            gr.Markdown("*(e.g., A T G C G A T C G A -> 10 bases)*")
        with gr.Row():
            gc_content_output = gr.Textbox(label="GC Content (%)", scale=2)
            gr.Markdown("*(e.g., A T **G C G** A T **C G** A -> 50% GC content)*")

        gr.Markdown("#### Predicted Structural Features:")

        with gr.Row():
            mfe_energy_output = gr.Textbox(
                label="Minimum Free Energy (kcal/mol)", lines=3, scale=2
            )
            gr.Markdown("*(More negative MFE, more stable the structure)*")

        with gr.Row():
            num_pairs_output = _example_column(
                "Number of Base Pairs",
                "_Example: Image below shows 6 Base Pairs_",
                "structure (6).svg",
                image_mode="1",
            )
            stem_len_mean_output = _example_column(
                "Mean Stem Length",
                "_Example: Image below shows Mean Stem Length 3.5 base pairs (bp)_",
                "structure (3).svg",
            )
            num_stems_output = _example_column(
                "Number of Stems",
                "_Example: Image below shows 3 Stems_",
                "structure (3.1).svg",
            )

        with gr.Row():
            num_hairpins_output = _example_column(
                "Number of Hairpins",
                "_Example: Image below shows 3 Hairpins_",
                "structure (4).svg",
            )
            num_internal_loops_output = _example_column(
                "Number of Internal Loops",
                "_Example: Image below shows 1 Internal Loop_",
                "structure (6.1).svg",
            )

    # Re-run the prediction whenever the input text changes; the output order
    # must match the tuple returned by predict_features.
    sequence_input.change(
        fn=predict_features,
        inputs=sequence_input,
        outputs=[
            length_output,
            gc_content_output,
            mfe_energy_output,
            num_pairs_output,
            stem_len_mean_output,
            num_stems_output,
            num_hairpins_output,
            num_internal_loops_output,
        ],
    )


iface.launch(debug=True)