"""Hermes Agent: a small Gradio chat UI backed by a Hugging Face text-generation pipeline."""

import gradio as gr
from transformers import pipeline

# 1. System Prompt/Personality for Hermes Agent
HERMES_SYSTEM_PROMPT = (
    "You are Hermes Agent, a helpful, fast, and practical multi-purpose assistant. "
    "You are professional, calm, and user-friendly. "
    "You can answer questions, reason through tasks step-by-step, plan, summarize, "
    "provide coding help, offer research-style explanations, and break down complex tasks. "
    "Do not pretend to have abilities you do not possess. "
    "Always strive for clarity and conciseness."
)

# Speaker tags used both to build the prompt and to find where a reply ends.
USER_TAG = "User:"
AGENT_TAG = "Hermes Agent:"

# Only the most recent turns are replayed into the prompt so that it stays well
# inside a small model's context window. The value is a heuristic, not a
# token-accurate budget.
MAX_HISTORY_TURNS = 4

# 2. Model Integration
# Using a small, CPU-friendly model for demonstration on Hugging Face Spaces free tier.
# For better performance and more complex tasks, a larger model with GPU would be recommended.
# Example: 'distilgpt2' is a good starting point for CPU inference.
# For more capable models, consider 'HuggingFaceH4/zephyr-7b-beta' or
# 'mistralai/Mistral-7B-Instruct-v0.2', which would require a GPU-enabled Space.
try:
    # trust_remote_code=True might be necessary for some models, but avoid it
    # unless explicitly needed.
    generator = pipeline('text-generation', model='distilgpt2')
except Exception as e:
    # Keep the app importable so the UI can still start and report the problem.
    print(f"Error loading model: {e}")
    generator = None


# 3. Chatbot Logic
def _build_prompt(message, history):
    """Return the plain-text prompt: system prompt, recent turns, then the new message.

    ``history`` is expected to be a sequence of ``(user_text, agent_text)``
    pairs; ``None`` is treated as an empty history.
    NOTE(review): a Chatbot configured for role/content message dicts would
    need a different unpacking here - confirm against the Gradio version used.
    """
    recent_turns = list(history or [])[-MAX_HISTORY_TURNS:]
    parts = [HERMES_SYSTEM_PROMPT + "\n\n"]
    for human, agent in recent_turns:
        # A turn that has no reply yet would otherwise be rendered as "None".
        parts.append(f"User: {human}\nHermes Agent: {agent or ''}\n")
    parts.append(f"User: {message}\nHermes Agent:")
    return "".join(parts)


def _extract_reply(generated_text, prompt):
    """Return only the agent's reply to the latest message from the model output."""
    if generated_text.startswith(prompt):
        # Usual case: the output echoes the prompt, so the reply is whatever follows it.
        completion = generated_text[len(prompt):]
    else:
        # Fallback when the prompt is not echoed verbatim: take the text after
        # the last agent tag, or the whole output if no tag is present.
        tag_at = generated_text.rfind(AGENT_TAG)
        if tag_at != -1:
            completion = generated_text[tag_at + len(AGENT_TAG):]
        else:
            completion = generated_text

    # Small models tend to keep writing the dialogue; stop at the first
    # speaker tag so an invented follow-up turn is never shown as the reply.
    for stop_tag in (USER_TAG, AGENT_TAG):
        completion = completion.partition(stop_tag)[0]
    completion = completion.strip()

    # Drop a trailing incomplete sentence by cutting at the last terminal punctuation mark.
    last_punctuation = max(completion.rfind(mark) for mark in ".?!")
    if last_punctuation != -1:
        completion = completion[:last_punctuation + 1]
    return completion


def predict(message, history):
    """Generate the agent's reply to ``message`` given the prior ``history``.

    Args:
        message: The user's new message.
        history: Prior turns as ``(user_text, agent_text)`` pairs, or ``None``.

    Returns:
        The reply text. If the model failed to load or generation raises, a
        human-readable error string is returned instead, so the UI always has
        something to display.
    """
    if generator is None:
        return "Error: Model could not be loaded. Please check the backend logs."

    conversation = _build_prompt(message, history)
    try:
        # max_new_tokens bounds the reply length; a single sequence is requested;
        # truncation=True asks the pipeline to truncate over-long inputs.
        response = generator(
            conversation,
            max_new_tokens=150,
            num_return_sequences=1,
            truncation=True,
        )
        return _extract_reply(response[0]['generated_text'], conversation)
    except Exception as e:
        return f"An error occurred during model inference: {e}"


def _respond(message, history):
    """Event handler for the textbox: return ``(new_textbox_value, new_history)``.

    The submit event is wired to two outputs (textbox and chatbot), so exactly
    two values are returned: an empty string to clear the textbox, and the
    history extended with the new exchange.
    """
    turns = list(history or [])
    if not message or not message.strip():
        # Nothing to send; leave the conversation unchanged.
        return "", turns
    reply = predict(message, turns)
    turns.append((message, reply))
    return "", turns


# 4. Gradio Web UI
with gr.Blocks() as demo:
    gr.Markdown("# Hermes Agent")
    gr.Markdown(
        "Hermes Agent is a helpful, fast, and practical multi-purpose AI assistant. "
        "It can answer questions, reason through tasks, plan, summarize, and provide coding help."
    )

    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    clear = gr.Button("Clear")

    # Both handlers return one value per output component, in order: [msg, chatbot].
    msg.submit(_respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: (None, None), None, [msg, chatbot], queue=False)

# Launch the Gradio app
# share=True would create a public link, useful for testing, but should stay off on Spaces.
# For Hugging Face Spaces, the app runs automatically when app.py is present.
if __name__ == "__main__":
    demo.launch(debug=True)  # debug=True for local development, set to False for production