from unsloth import FastLanguageModel
import torch


class EndpointHandler:
    """Callable request handler that generates text with an Unsloth-loaded model.

    The model and tokenizer are loaded once at construction time; each call
    then tokenizes the request's ``"inputs"``, runs ``generate`` and returns
    the decoded text.
    """

    # Maximum sequence length passed to ``FastLanguageModel.from_pretrained``.
    MAX_SEQ_LENGTH = 2048
    # Generation budget used when the request does not override it.
    DEFAULT_MAX_NEW_TOKENS = 128

    def __init__(self, path=""):
        """Load the model and tokenizer found at *path*.

        Args:
            path: Model name or directory handed to
                ``FastLanguageModel.from_pretrained`` as ``model_name``.
        """
        # 1. Load the model and tokenizer (4-bit quantized weights).
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=self.MAX_SEQ_LENGTH,
            load_in_4bit=True,
        )
        # Put the model into Unsloth's inference configuration before serving.
        FastLanguageModel.for_inference(self.model)

    def __call__(self, data):
        """Generate a completion for one request payload.

        Args:
            data: Request dictionary. ``data["inputs"]`` is the prompt
                (defaults to ``""`` when absent). ``data["parameters"]``, if
                present, is a dictionary of keyword arguments forwarded to
                ``model.generate``; ``max_new_tokens`` defaults to 128 when
                it is not supplied there.

        Returns:
            A one-element list ``[{"generated_text": <decoded text>}]`` built
            from the first generated sequence.
        """
        # 2. Get the input text and optional generation settings from the request.
        inputs = data.get("inputs", "")
        parameters = data.get("parameters") or {}
        generation_kwargs = {
            "max_new_tokens": self.DEFAULT_MAX_NEW_TOKENS,
            **parameters,
        }

        # 3. Tokenize and generate. The tensors are moved to the device the
        #    model actually lives on rather than assuming a fixed device name.
        encoded = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**encoded, **generation_kwargs)

        # 4. Decode and return the result.
        # NOTE(review): the decoded sequence is whatever ``generate`` returns
        # for index 0 — for decoder-only models that presumably includes the
        # prompt tokens as well; confirm this is the desired response shape.
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]