import gradio as gr
from transformers import pipeline

# 1. System Prompt/Personality for Hermes Agent
# Written as adjacent string literals (one sentence or clause per line) so each
# part of the persona is easy to review; Python joins them into a single string.
HERMES_SYSTEM_PROMPT = (
    "You are Hermes Agent, a helpful, fast, and practical multi-purpose assistant. "
    "You are professional, calm, and user-friendly. "
    "You can answer questions, reason through tasks step-by-step, plan, summarize, "
    "provide coding help, offer research-style explanations, and break down complex tasks. "
    "Do not pretend to have abilities you do not possess. "
    "Always strive for clarity and conciseness."
)

# 2. Model Integration
# Using a small, CPU-friendly model for demonstration on Hugging Face Spaces free tier.
# For better performance and more complex tasks, a larger model with GPU would be recommended.
# Example: 'distilgpt2' is a good starting point for CPU inference.
# For more capable models, consider 'HuggingFaceH4/zephyr-7b-beta' or 'mistralai/Mistral-7B-Instruct-v0.2' 
# which would require a GPU-enabled Space.

# Start from "no model" so the rest of the app can test `generator is None`
# and degrade gracefully if loading fails.
generator = None
try:
    # Build the text-generation pipeline around the small distilgpt2 checkpoint.
    # trust_remote_code is deliberately left unset; only enable it for models
    # that explicitly require it.
    generator = pipeline(task='text-generation', model='distilgpt2')
except Exception as load_error:
    # Best effort: report the failure on the console and keep the app running.
    print(f"Error loading model: {load_error}")

# 3. Chatbot Logic
# Speaker labels used to lay the conversation out as a plain-text transcript.
_USER_MARKER = "User:"
_AGENT_MARKER = "Hermes Agent:"

# Only the most recent turns are replayed to the model. The prompt would
# otherwise grow without bound; once it exceeds the model's context window,
# truncation=True trims the input, and (assuming the tokenizer's default
# right-side truncation) that removes the newest message first. This is a
# simple heuristic, not an exact token budget.
_MAX_HISTORY_TURNS = 3

# Shown when the model produces nothing usable, instead of an empty chat bubble.
_EMPTY_REPLY_FALLBACK = (
    "I'm sorry, I couldn't generate a response. Please try rephrasing your message."
)


def _build_prompt(message, history):
    """Render the system prompt, recent history and new message as one transcript."""
    recent_turns = (history or [])[-_MAX_HISTORY_TURNS:]
    parts = [HERMES_SYSTEM_PROMPT + "\n\n"]
    for human, agent in recent_turns:
        # A turn whose reply is still pending carries None; do not print "None".
        agent_text = "" if agent is None else agent
        parts.append(f"{_USER_MARKER} {human}\n{_AGENT_MARKER} {agent_text}\n")
    parts.append(f"{_USER_MARKER} {message}\n{_AGENT_MARKER}")
    return "".join(parts)


def _extract_reply(generated_text, prompt):
    """Reduce raw model output to the agent's single reply for the current turn."""
    reply = generated_text
    # Defensive: drop the echoed prompt if the pipeline returned the full text.
    if reply.startswith(prompt):
        reply = reply[len(prompt):]

    # A plain language model tends to keep writing the transcript and invent
    # further turns. Keep only the text before the first invented speaker label.
    for marker in (_USER_MARKER, _AGENT_MARKER):
        reply = reply.partition(marker)[0]
    reply = reply.strip()

    # Generation stops at a token limit, often mid-sentence; trim back to the
    # last sentence-ending punctuation mark when there is one.
    last_punctuation = max(reply.rfind(mark) for mark in ".?!")
    if last_punctuation != -1:
        reply = reply[:last_punctuation + 1]
    return reply


def predict(message, history):
    """Generate the Hermes Agent reply to ``message``.

    Args:
        message: The user's new message.
        history: Previous turns as an iterable of ``(user_text, agent_text)``
            pairs. May be empty or ``None``. Only the last
            ``_MAX_HISTORY_TURNS`` pairs are included in the prompt.

    Returns:
        The reply text as a string. If the model failed to load or inference
        raises, a human-readable error message is returned instead of raising.
        If the model yields no usable text, a short fallback message is
        returned.
    """
    if generator is None:
        return "Error: Model could not be loaded. Please check the backend logs."

    prompt = _build_prompt(message, history)

    try:
        # max_new_tokens bounds the reply length; num_return_sequences=1 asks
        # for a single candidate; truncation=True lets the tokenizer trim an
        # over-long prompt; return_full_text=False asks for the continuation
        # only, so the reply does not have to be located inside the prompt.
        outputs = generator(
            prompt,
            max_new_tokens=150,
            num_return_sequences=1,
            truncation=True,
            return_full_text=False,
        )
        generated_text = outputs[0]['generated_text']
    except Exception as e:
        # UI boundary: surface the problem in the chat rather than crashing.
        return f"An error occurred during model inference: {e}"

    return _extract_reply(generated_text, prompt) or _EMPTY_REPLY_FALLBACK

# 4. Gradio Web UI
with gr.Blocks() as demo:
    gr.Markdown("# Hermes Agent")
    gr.Markdown("""
    Hermes Agent is a helpful, fast, and practical multi-purpose AI assistant. 
    It can answer questions, reason through tasks, plan, summarize, and provide coding help.
    """)

    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    clear = gr.Button("Clear")

    def _respond(message, history):
        """Adapt predict() to the two outputs wired to the submit event.

        predict() returns only the reply text, while the event updates both
        the textbox and the chatbot. This returns an empty string (to clear
        the textbox) and the history extended with the new
        (user message, reply) pair.
        """
        turns = list(history or [])
        # Ignore blank submissions instead of sending an empty prompt to the model.
        if not message or not message.strip():
            return "", turns
        reply = predict(message, turns)
        turns.append((message, reply))
        return "", turns

    msg.submit(_respond, [msg, chatbot], [msg, chatbot])
    # One return value per output component: empty textbox, empty conversation.
    clear.click(lambda: ("", []), None, [msg, chatbot], queue=False)

# Launch the Gradio app when this file is executed directly.
# Passing share=True to launch() would create a public link, which is useful
# for testing but should be left off for deployment on Spaces.
# For Hugging Face Spaces, the app runs automatically when app.py is present.
if __name__ == "__main__":
    # debug=True is meant for local development; set it to False for production.
    demo.launch(debug=True)