Pavishanth68 committed on
Commit
f13e3b0
·
verified ·
1 Parent(s): a70cd9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -36
app.py CHANGED
@@ -1,34 +1,14 @@
1
- import subprocess
2
- import sys
3
  import os
4
-
5
- # --- THE STABILIZER BLOCK ---
6
- print("🛠️ Stabilizing environment...")
7
- subprocess.check_call([
8
- sys.executable, "-m", "pip", "install",
9
- "tokenizers==0.20.1",
10
- "transformers==4.45.2",
11
- "huggingface-hub==0.24.7",
12
- "peft==0.13.2"
13
- ])
14
-
15
  import torch
16
  import gradio as gr
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
  from peft import PeftModel
19
- from huggingface_hub import login
20
 
21
- # Login with HuggingFace token
22
  HF_TOKEN = os.environ.get("HF_TOKEN")
23
- if HF_TOKEN:
24
- print("🔐 Logging in to HuggingFace...")
25
- login(token=HF_TOKEN)
26
- else:
27
- print("⚠️ No HF_TOKEN found - may fail on gated models")
28
 
29
  BASE_MODEL = "polyglots/SinLlama_v01"
30
  LORA_ADAPTER = "E-motionAssistant/SinLlama_v01-Therapy-Sinhala"
31
-
32
  SYSTEM_PROMPT = "You are an empathetic Sinhala therapist providing mental health support."
33
 
34
  model = None
@@ -37,27 +17,35 @@ tokenizer = None
37
  def load_model():
38
  global model, tokenizer
39
  if model is None:
 
 
 
 
 
 
 
 
 
 
40
  print(f"📥 Loading base model: {BASE_MODEL}...")
41
  base_model = AutoModelForCausalLM.from_pretrained(
42
  BASE_MODEL,
43
- torch_dtype=torch.float32,
44
- device_map="cpu",
45
- trust_remote_code=True,
46
- low_cpu_mem_usage=True,
47
- ignore_mismatched_sizes=True,
48
- token=HF_TOKEN
49
  )
50
 
51
  print(f"📥 Loading LoRA adapter: {LORA_ADAPTER}...")
52
  model = PeftModel.from_pretrained(base_model, LORA_ADAPTER, token=HF_TOKEN)
53
 
54
- print(f"📥 Loading tokenizer from adapter...")
55
- tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER, trust_remote_code=True, token=HF_TOKEN)
56
 
57
  if tokenizer.pad_token is None:
58
  tokenizer.pad_token = tokenizer.eos_token
59
 
60
- print("✅ Success: Sinhala Therapy System is online on CPU!")
61
 
62
  load_model()
63
 
@@ -66,7 +54,6 @@ def chat(message, history):
66
  return ""
67
 
68
  try:
69
- # Build prompt
70
  prompt = f"{SYSTEM_PROMPT}\n\n"
71
 
72
  for user_msg, bot_msg in history[-3:]:
@@ -74,7 +61,6 @@ def chat(message, history):
74
 
75
  prompt += f"User: {message}\nTherapist:"
76
 
77
- # Tokenize
78
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
79
 
80
  with torch.no_grad():
@@ -88,15 +74,14 @@ def chat(message, history):
88
  eos_token_id=tokenizer.eos_token_id
89
  )
90
 
91
- # Decode only the new tokens
92
  input_len = inputs.input_ids.shape[1]
93
  response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
94
 
95
  return response.strip()
96
 
97
  except Exception as e:
98
- print(f"❌ Generation Error: {e}")
99
- return f"සමාවන්න, දෝෂයක් ඇතිවිය: {str(e)}. කරුණාකර නැවත උත්සාහ කරන්න."
100
 
101
  demo = gr.ChatInterface(
102
  fn=chat,
 
 
 
1
  import os
 
 
 
 
 
 
 
 
 
 
 
2
  import torch
3
  import gradio as gr
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
5
  from peft import PeftModel
 
6
 
7
+ # Get HuggingFace token from environment
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 
 
 
9
 
10
  BASE_MODEL = "polyglots/SinLlama_v01"
11
  LORA_ADAPTER = "E-motionAssistant/SinLlama_v01-Therapy-Sinhala"
 
12
  SYSTEM_PROMPT = "You are an empathetic Sinhala therapist providing mental health support."
13
 
14
  model = None
 
17
  def load_model():
18
  global model, tokenizer
19
  if model is None:
20
+ print("🔐 Loading with 4-bit quantization...")
21
+
22
+ # 4-bit quantization config
23
+ bnb_config = BitsAndBytesConfig(
24
+ load_in_4bit=True,
25
+ bnb_4bit_use_double_quant=True,
26
+ bnb_4bit_quant_type="nf4",
27
+ bnb_4bit_compute_dtype=torch.float16
28
+ )
29
+
30
  print(f"📥 Loading base model: {BASE_MODEL}...")
31
  base_model = AutoModelForCausalLM.from_pretrained(
32
  BASE_MODEL,
33
+ quantization_config=bnb_config,
34
+ device_map="auto",
35
+ token=HF_TOKEN,
36
+ trust_remote_code=True
 
 
37
  )
38
 
39
  print(f"📥 Loading LoRA adapter: {LORA_ADAPTER}...")
40
  model = PeftModel.from_pretrained(base_model, LORA_ADAPTER, token=HF_TOKEN)
41
 
42
+ print(f"📥 Loading tokenizer...")
43
+ tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER, token=HF_TOKEN, trust_remote_code=True)
44
 
45
  if tokenizer.pad_token is None:
46
  tokenizer.pad_token = tokenizer.eos_token
47
 
48
+ print("✅ Model loaded in 4-bit!")
49
 
50
  load_model()
51
 
 
54
  return ""
55
 
56
  try:
 
57
  prompt = f"{SYSTEM_PROMPT}\n\n"
58
 
59
  for user_msg, bot_msg in history[-3:]:
 
61
 
62
  prompt += f"User: {message}\nTherapist:"
63
 
 
64
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
65
 
66
  with torch.no_grad():
 
74
  eos_token_id=tokenizer.eos_token_id
75
  )
76
 
 
77
  input_len = inputs.input_ids.shape[1]
78
  response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
79
 
80
  return response.strip()
81
 
82
  except Exception as e:
83
+ print(f"❌ Error: {e}")
84
+ return f"සමාවන්න, දෝෂයක් ඇතිවිය. කරුණාකර නැවත උත්සාහ කරන්න."
85
 
86
  demo = gr.ChatInterface(
87
  fn=chat,