Spaces:

aedupuga
/

scaffold_strucutre_predictor

Sleeping

App Files Files Community

scaffold_strucutre_predictor / app.py

aedupuga

Update app.py

cd23171 verified 8 months ago

raw

history blame contribute delete

11.6 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	import scipy.sparse
	import joblib
	from huggingface_hub import hf_hub_download # Import hf_hub_download

	# Custom CSS passed to gr.Blocks below; intended to hide the default image label.
	# NOTE(review): the selector targets a generated-looking class name
	# (`svelte-10771d6`), which presumably changes between Gradio releases -
	# confirm it still matches after any Gradio upgrade.
	custom_css = """
	.gradio-container label.svelte-10771d6 {
	display: none !important;
	}
	"""

	# Download the serialized model from the Hugging Face Hub and load it with joblib.
	# This runs at import time, so starting the app requires access to the Hub
	# (or a previously cached copy of the file).
	repo_id = "aedupuga/multioutput-regression-models"
	filename = "ridge_regression.joblib"
	# hf_hub_download returns the local path of the downloaded (cached) file.
	model_path = hf_hub_download(repo_id=repo_id, filename=filename)
	# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
	# only load artifacts from a repository you trust.
	# NOTE(review): the variable is called `elastic_net_model`, but the file loaded
	# is `ridge_regression.joblib` - confirm which estimator is actually intended.
	elastic_net_model = joblib.load(model_path)

	# One-hot encoder that turns an input sequence into the model's sequence features.
	# NOTE: it must reproduce the encoding used when the model was trained; verify it against the training code.
	def one_hot_encode_sequence(sequence, target_size):
	    """One-hot encode a nucleotide sequence into a flat, fixed-length list.

	    Every character contributes four consecutive slots in A, T, C, G order.
	    Any other character (for example ``N``) contributes four zeros. Matching
	    is case-insensitive, so ``"a"`` and ``"A"`` encode identically.

	    NOTE(review): ``U`` is not in the table, so RNA uracil encodes as all
	    zeros - confirm this matches the encoding used at training time.

	    Args:
	        sequence: The input DNA/RNA sequence string.
	        target_size: Exact length of the returned vector. Shorter encodings
	            are right-padded with zeros; longer ones are truncated.

	    Returns:
	        A list of ``target_size`` integers (0 or 1).

	    Raises:
	        ValueError: If ``target_size`` is negative.
	    """
	    if target_size < 0:
	        # A negative size would otherwise slice from the end of the vector
	        # and silently return a meaningless result.
	        raise ValueError("target_size must be non-negative")

	    base_codes = {
	        'A': (1, 0, 0, 0),
	        'T': (0, 1, 0, 0),
	        'C': (0, 0, 1, 0),
	        'G': (0, 0, 0, 1),
	    }
	    unknown_base = (0, 0, 0, 0)

	    # Upper-casing one character at a time keeps exactly one 4-slot group per
	    # input character, even for characters whose upper-case form is longer.
	    flat_encoded_sequence = [
	        bit
	        for base in sequence
	        for bit in base_codes.get(base.upper(), unknown_base)
	    ]

	    # Pad with zeros up to the target size (a no-op when already long enough),
	    # then cut off anything beyond it.
	    flat_encoded_sequence.extend([0] * (target_size - len(flat_encoded_sequence)))
	    return flat_encoded_sequence[:target_size]


	# Input width expected by the loaded model: the numeric features come first,
	# followed by the flattened one-hot encoding of the sequence.
	_MODEL_FEATURE_COUNT = 109658
	_NUMERIC_FEATURE_COUNT = 2  # sequence length and GC fraction
	_PREDICTED_OUTPUT_COUNT = 6  # structural features shown in the UI


	def _format_prediction(value):
	    """Round one predicted value to one decimal place, or return "N/A"."""
	    if isinstance(value, (int, float, np.number)):
	        return round(float(value), 1)
	    return "N/A"


	def predict_features(sequence):
	    """Compute sequence statistics and predict structural features.

	    Whitespace (spaces, tabs, line breaks from a multi-line paste) is removed
	    and the sequence is upper-cased before anything is computed, so it does
	    not distort the length, the GC content or the encoding.

	    Args:
	        sequence: The input DNA/RNA sequence string. ``None``, an empty
	            string and a whitespace-only string are all treated as empty.

	    Returns:
	        An 8-tuple in the order of the Gradio output components:
	        ``(length, gc_content_text, mfe_energy, num_pairs, stem_len_mean,
	        num_stems, num_hairpins, num_internal_loops)``. The six predicted
	        entries are numbers rounded to one decimal place, ``"N/A"`` when the
	        input is empty or the model returned fewer values, and
	        ``"Prediction Error"`` when the model call failed.
	    """
	    print(f"Received sequence in predict_features: {sequence}")

	    cleaned = "".join((sequence or "").split()).upper()
	    if not cleaned:
	        # Keep the number of returned values equal to the number of outputs.
	        return (0, "0 %") + ("N/A",) * _PREDICTED_OUTPUT_COUNT

	    # 1. Length and GC content.
	    sequence_length = len(cleaned)
	    gc_content = (cleaned.count('G') + cleaned.count('C')) / sequence_length
	    gc_display = f"{gc_content * 100:.1f} %"

	    # 2. Single input row: [length, gc_content, one-hot encoded sequence...].
	    # Filled directly as a float array; routing ~110k columns through pandas
	    # only to call `.values` yields the same array at a much higher cost.
	    numerical_features = np.zeros((1, _MODEL_FEATURE_COUNT), dtype=np.float64)
	    numerical_features[0, 0] = sequence_length
	    numerical_features[0, 1] = gc_content
	    numerical_features[0, _NUMERIC_FEATURE_COUNT:] = one_hot_encode_sequence(
	        cleaned, _MODEL_FEATURE_COUNT - _NUMERIC_FEATURE_COUNT
	    )

	    # 3. Multi-output regression; only the model call itself is guarded.
	    try:
	        raw_prediction = elastic_net_model.predict(numerical_features)[0]
	    except Exception as e:
	        # UI boundary: report the failure in the output fields instead of
	        # crashing the event handler.
	        print(f"Error during model prediction: {e}")
	        print(f"Input shape to model: {numerical_features.shape}")
	        return (sequence_length, gc_display) + (
	            "Prediction Error",
	        ) * _PREDICTED_OUTPUT_COUNT

	    # Flatten so a single-output (scalar) row is handled as well, then pad
	    # with None so that missing outputs are displayed as "N/A".
	    predicted_values = list(np.ravel(raw_prediction))[:_PREDICTED_OUTPUT_COUNT]
	    predicted_values.extend(
	        [None] * (_PREDICTED_OUTPUT_COUNT - len(predicted_values))
	    )

	    # Order: mfe_energy, num_pairs, stem_len_mean, num_stems, num_hairpins,
	    # num_internal_loops.
	    return (
	        sequence_length,
	        gc_display,
	        *(_format_prediction(value) for value in predicted_values),
	    )

	# Build the Gradio interface: one sequence input and eight result fields that
	# are refreshed by predict_features whenever the input text changes.
	# NOTE(review): the block nesting below was reconstructed from the component
	# comments - confirm it against the intended layout.
	# NOTE(review): several gr.Image calls pass a non-integer `scale`; Gradio
	# documents `scale` as an integer - confirm these values are intentional.
	iface = gr.Blocks(theme="soft", css=custom_css)
	with iface:
	    gr.Markdown("## Sequence Feature Predictor")
	    gr.Markdown("Enter a DNA/RNA sequence to predict its features.")

	    with gr.Row():
	        sequence_input = gr.Textbox(label="Enter DNA/RNA Sequence", lines=5, placeholder="Paste DNA sequence here...")

	    gr.Markdown("### Sequence Features:")

	    with gr.Column():
	        # Each row pairs a result field with a short explanation next to it.
	        with gr.Row():
	            length_output = gr.Textbox(label="Sequence Length (base pairs)", scale=2)
	            gr.Markdown("(e.g., A T G C G A T C G A -> 10 bases)")
	        with gr.Row():
	            gc_content_output = gr.Textbox(label="GC Content (%)", scale=2)
	            gr.Markdown("(e.g., A T G C G* A T C G A -> 50% GC content)*")

	        gr.Markdown("#### Predicted Structural Features:")

	        with gr.Row():
	            mfe_energy_output = gr.Textbox(label="Minimum Free Energy (kcal/mol)", lines=3, scale=2)
	            gr.Markdown("(More negative MFE, more stable the structure)")

	        # First row of structural features (3 items): each column holds the
	        # result field, a caption and an example image.
	        with gr.Row():
	            with gr.Column(scale=1, min_width=75):
	                num_pairs_output = gr.Textbox(label="Number of Base Pairs")
	                gr.Markdown("_Example: Image below shows 6 Base Pairs_")
	                gr.Image(value="structure (6).svg", scale=0.5, width=50, height=100, show_label=False, image_mode="1")
	            with gr.Column(scale=1, min_width=75):
	                stem_len_mean_output = gr.Textbox(label="Mean Stem Length")
	                gr.Markdown("_Example: Image below shows Mean Stem Length 3.5 base pairs (bp)_")
	                gr.Image(value="structure (3).svg", scale=0.5, width=50, height=100, show_label=False)
	            with gr.Column(scale=1, min_width=75):
	                num_stems_output = gr.Textbox(label="Number of Stems")
	                gr.Markdown("_Example: Image below shows 3 Stems_")
	                gr.Image(value="structure (3.1).svg", scale=0.5, width=50, height=100,show_label=False)

	        # Second row of structural features (2 items).
	        with gr.Row():
	            with gr.Column(scale=1, min_width=75):
	                num_hairpins_output = gr.Textbox(label="Number of Hairpins")
	                gr.Markdown("_Example: Image below shows 3 Hairpins_")
	                gr.Image(value="structure (4).svg", scale=0.1, width=50, height=100,show_label=False)
	            with gr.Column(scale=1, min_width=75):
	                num_internal_loops_output = gr.Textbox(label="Number of Internal Loops")
	                gr.Markdown("_Example: Image below shows 1 Internal Loop_")
	                gr.Image(value="structure (6.1).svg", scale=0.1, width=50, height=100, show_label=False)

	    # Re-run the prediction on every change of the input text. The order of
	    # `outputs` must match the order of the tuple returned by predict_features.
	    sequence_input.change(
	        fn=predict_features,
	        inputs=sequence_input,
	        outputs=[
	            length_output,
	            gc_content_output,
	            mfe_energy_output,
	            num_pairs_output,
	            stem_len_mean_output,
	            num_stems_output,
	            num_hairpins_output,
	            num_internal_loops_output,
	        ]
	    )

	# Start the app. The launch call is active; an earlier comment claiming it had
	# been removed was stale.
	# NOTE(review): confirm that debug=True is wanted for the deployed Space.
	iface.launch(debug=True)