import gradio as gr
import numpy as np
import pandas as pd
import scipy.sparse
import joblib
from huggingface_hub import hf_hub_download # Import hf_hub_download

# Custom CSS handed to gr.Blocks(css=...) below; intended to hide the default image label.
# NOTE(review): the `svelte-10771d6` suffix looks like a build-generated class
# name, so this selector may stop matching after a Gradio upgrade — confirm.
custom_css = """
.gradio-container label.svelte-10771d6 {
    display: none !important;
}
"""

# Download the serialized model file from the Hugging Face Hub and load it with
# joblib. This runs at module import time, before the UI is built.
# NOTE: the global is named `elastic_net_model`, but the file that is actually
# loaded is "ridge_regression.joblib"; the name is kept because predict_features
# refers to it.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only point repo_id/filename at a repository you trust.
repo_id = "aedupuga/multioutput-regression-models"
filename = "ridge_regression.joblib"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
elastic_net_model = joblib.load(model_path)

# Sequence encoder used to build the one-hot part of the model input.
# NOTE: this must match the encoding used in the training code; adjust it if the training pipeline differs.
def one_hot_encode_sequence(sequence, target_size):
    """
    Turn a nucleotide string into a flat one-hot vector of exactly target_size entries.

    Each base contributes four entries in A, T, C, G order. 'N' and any
    character not in the table contribute four zeros. Matching is
    case-sensitive, so callers are expected to upper-case the input first.

    Args:
        sequence: The input DNA/RNA sequence string.
        target_size: The required length of the returned vector.

    Returns:
        A list of 0/1 integers: the flattened encoding, zero-padded on the
        right when too short and cut off at target_size when too long.
    """
    unknown_base = [0, 0, 0, 0]
    base_vectors = {
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'C': [0, 0, 1, 0],
        'G': [0, 0, 0, 1],
        'N': [0, 0, 0, 0],
    }

    # Accumulate directly into one flat list instead of flattening afterwards.
    flat_vector = []
    for base in sequence:
        flat_vector.extend(base_vectors.get(base, unknown_base))

    # Positive shortfall -> pad with zeros; negative shortfall -> truncate.
    shortfall = target_size - len(flat_vector)
    if shortfall > 0:
        flat_vector += [0] * shortfall
    elif shortfall < 0:
        flat_vector = flat_vector[:target_size]

    return flat_vector


# Input width the loaded model expects (figure stated by the original author),
# and how many leading columns are scalar descriptors (length, GC fraction).
# The remaining columns hold the one-hot encoded sequence.
_MODEL_FEATURE_COUNT = 109658
_NUMERIC_FEATURE_COUNT = 2

# Number of structural targets shown in the UI: MFE, base pairs, mean stem
# length, stems, hairpins, internal loops.
_NUM_PREDICTED_OUTPUTS = 6


def _format_prediction(value):
    """Round a numeric prediction to one decimal place, or return "N/A" for anything else."""
    if isinstance(value, (int, float, np.number)):
        return round(float(value), 1)
    return "N/A"


def predict_features(sequence):
    """
    Compute basic sequence statistics and predict structural features.

    The input is upper-cased and stripped of all whitespace before use, so
    sequences pasted with spaces or line breaks are measured and encoded on
    their bases only. The cleaned sequence is turned into one feature row
    (length, GC fraction, then the one-hot encoding from
    one_hot_encode_sequence) and passed to the module-level
    `elastic_net_model`.

    Args:
        sequence: The DNA/RNA sequence text from the UI. May be None, empty,
            or contain whitespace.

    Returns:
        An 8-tuple in the order the Gradio outputs are wired:
        (length, GC content as "<pct> %", MFE, number of base pairs, mean
        stem length, number of stems, number of hairpins, number of internal
        loops). For empty input the length is 0, GC is "0 %" and the six
        predictions are "N/A". If the model call fails, the six predictions
        are "Prediction Error".
    """
    print(f"Received sequence in predict_features: {sequence}")

    # Normalise case and drop every whitespace character. Without this, spaces
    # and newlines would be counted as bases and encoded as all-zero vectors.
    sequence = "".join((sequence or "").upper().split())

    if not sequence:
        return (0, "0 %") + ("N/A",) * _NUM_PREDICTED_OUTPUTS

    # 1. Scalar descriptors. The sequence is non-empty here, so no zero division.
    sequence_length = len(sequence)
    gc_content = (sequence.count('G') + sequence.count('C')) / sequence_length
    gc_text = f"{gc_content * 100:.1f} %"

    # 2. One-hot encode, padded/truncated to the width left after the scalars.
    one_hot_size = _MODEL_FEATURE_COUNT - _NUMERIC_FEATURE_COUNT
    encoded_sequence = one_hot_encode_sequence(sequence, one_hot_size)

    # 3. Single float64 feature row: [length, gc_content, one-hot...]. This is
    # the same array the previous DataFrame concat + .values produced, without
    # building a ~110k-column frame on every change event.
    model_input = np.asarray(
        [[sequence_length, gc_content, *encoded_sequence]], dtype=np.float64
    )

    # Broad catch is deliberate: this is the UI boundary, and a failed
    # prediction should show placeholders rather than break the page.
    try:
        # atleast_1d keeps a single-output model (scalar row) usable.
        raw_predictions = np.atleast_1d(elastic_net_model.predict(model_input)[0])
        values = list(raw_predictions[:_NUM_PREDICTED_OUTPUTS])
        # Pad with None so missing outputs render as "N/A".
        values += [None] * (_NUM_PREDICTED_OUTPUTS - len(values))
        formatted = tuple(_format_prediction(value) for value in values)
    except Exception as e:
        print(f"Error during model prediction: {e}")
        print(f"Input shape to model: {model_input.shape}")
        return (sequence_length, gc_text) + ("Prediction Error",) * _NUM_PREDICTED_OUTPUTS

    # Order: length, GC, mfe_energy, num_pairs, stem_len_mean, num_stems,
    # num_hairpins, num_internal_loops.
    return (sequence_length, gc_text) + formatted

# Build the Gradio UI: one sequence textbox as input, eight read-only style
# textboxes as outputs, plus static example images for the structural features.
iface = gr.Blocks(theme="soft", css=custom_css) # custom_css is the module-level CSS string defined above
with iface:
    gr.Markdown("## Sequence Feature Predictor")
    gr.Markdown("Enter a DNA/RNA sequence to predict its features.")

    with gr.Row():
        sequence_input = gr.Textbox(label="Enter DNA/RNA Sequence", lines=5, placeholder="Paste DNA sequence here...")

    gr.Markdown("### Sequence Features:")

    with gr.Column():
        with gr.Row(): # Output box next to a short explanation
            length_output = gr.Textbox(label="Sequence Length (base pairs)", scale=2)
            gr.Markdown("*(e.g., A T G C G A T C G A -> 10 bases)*")
        with gr.Row(): # Output box next to a short explanation
            gc_content_output = gr.Textbox(label="GC Content (%)", scale=2)
            gr.Markdown("*(e.g., A T **G C G** A T **C G** A -> 50% GC content)*")


        gr.Markdown("#### Predicted Structural Features:")

        with gr.Row(): # Output box next to a short explanation
            mfe_energy_output = gr.Textbox(label="Minimum Free Energy (kcal/mol)", lines=3, scale=2)
            gr.Markdown("*(More negative MFE, more stable the structure)*")

        # Each column below holds: the output textbox, a caption, and a static example image.
        # NOTE(review): the image paths are relative; presumably the SVG files ship next to this script — confirm.
        # NOTE(review): the images use fractional `scale` values (0.5 / 0.1); Gradio documents `scale` as an integer and may warn — confirm.
        with gr.Row(): # First row of structural features (3 columns)

            with gr.Column(scale=1, min_width=75): # Number of base pairs
                num_pairs_output = gr.Textbox(label="Number of Base Pairs")
                gr.Markdown("_Example: Image below shows 6 Base Pairs_")
                # NOTE(review): this is the only image that sets image_mode — confirm it is intentional.
                gr.Image(value="structure (6).svg", scale=0.5, width=50, height=100, show_label=False, image_mode="1")
            with gr.Column(scale=1, min_width=75): # Mean stem length
                stem_len_mean_output = gr.Textbox(label="Mean Stem Length")
                gr.Markdown("_Example: Image below shows Mean Stem Length 3.5 base pairs (bp)_")
                gr.Image(value="structure (3).svg", scale=0.5, width=50, height=100, show_label=False)
            with gr.Column(scale=1, min_width=75): # Number of stems
                num_stems_output = gr.Textbox(label="Number of Stems")
                gr.Markdown("_Example: Image below shows 3 Stems_")
                gr.Image(value="structure (3.1).svg", scale=0.5, width=50, height=100,show_label=False)


        with gr.Row(): # Second row of structural features (2 columns)
            with gr.Column(scale=1, min_width=75): # Number of hairpins
                num_hairpins_output = gr.Textbox(label="Number of Hairpins")
                gr.Markdown("_Example: Image below shows 3 Hairpins_")
                gr.Image(value="structure (4).svg", scale=0.1, width=50, height=100,show_label=False)
            with gr.Column(scale=1, min_width=75): # Number of internal loops
                num_internal_loops_output = gr.Textbox(label="Number of Internal Loops")
                gr.Markdown("_Example: Image below shows 1 Internal Loop_")
                gr.Image(value="structure (6.1).svg", scale=0.1, width=50, height=100, show_label=False)


    # Re-run predict_features on every change of the input textbox. The outputs
    # list must stay in the same order as the tuple predict_features returns.
    sequence_input.change(
        fn=predict_features,
        inputs=sequence_input,
        outputs=[
            length_output,
            gc_content_output,
            mfe_energy_output,
            num_pairs_output,
            stem_len_mean_output,
            num_stems_output,
            num_hairpins_output,
            num_internal_loops_output,
        ]
    )

# Start the app with debug mode enabled. This call is live and runs whenever the module is executed or imported.
iface.launch(debug=True)