| import os |
| import gradio as gr |
| from llama_cpp import Llama |
|
|
| |
# Path of the GGUF model file passed to the Llama constructor.
MODEL_PATH = "./models/mistral.gguf"

# Constructor options for the single, module-wide Llama instance, kept in
# one mapping so the tuning knobs are visible at a glance.
_LLAMA_OPTIONS = {
    "n_ctx": 2048,
    "n_threads": 9,
    "n_batch": 128,
    "use_mlock": True,
    "use_mmap": True,
    "verbose": False,
}

# Built once at import time; generate_response() reuses this instance for
# every request.
llm = Llama(model_path=MODEL_PATH, **_LLAMA_OPTIONS)
|
|
| |
def generate_response(prompt, history=None):
    """Stream the model's reply to a single user message.

    This function is registered as the ``fn`` callback of
    ``gr.ChatInterface``, which invokes it as ``fn(message, history)``.
    ``history`` is therefore accepted so the call does not fail with a
    ``TypeError``; it is optional so direct one-argument calls keep working.

    Args:
        prompt: The user's message. Leading and trailing whitespace is
            stripped before it is wrapped in ``[INST] ... [/INST]`` tags.
        history: Earlier turns of the conversation as supplied by the chat
            UI. Currently ignored: each message is answered on its own,
            without earlier context.

    Yields:
        The reply text accumulated so far, growing with every streamed
        chunk, so the caller can re-render the partial answer as it arrives.
    """
    stream = llm(
        prompt=f"[INST] {prompt.strip()} [/INST]",
        max_tokens=512,
        stop=["</s>"],
        stream=True,
    )
    partial = ""
    for chunk in stream:
        # Each streamed chunk carries only the newly generated text; yield
        # the running total rather than the individual fragment.
        partial += chunk["choices"][0]["text"]
        yield partial
|
|
| |
# Build the chat UI around the streaming handler, then launch it.
# NOTE(review): the description text is ungrammatical; it is left unchanged
# here because it is a user-facing runtime string.
demo = gr.ChatInterface(
    fn=generate_response,
    title="Leo9 AI Tutor",
    description="An ai chatbots who answer any question.",
)
demo.launch()
|
|