Santiagoismo committed on
Commit
36e2e2a
·
1 Parent(s): 14a3a14

bundle gguf locally

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +12 -14
  3. app.py +84 -111
  4. models/Qwen3.5-0.8B-Q8_0.gguf +3 -0
  5. requirements.txt +3 -4
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.gguf filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,24 +1,22 @@
1
  FROM python:3.11-slim
2
 
3
- ENV DEBIAN_FRONTEND=noninteractive \
4
- PIP_NO_CACHE_DIR=1 \
5
- PYTHONUNBUFFERED=1
6
-
7
  RUN apt-get update && apt-get install -y --no-install-recommends \
8
- build-essential cmake git curl ca-certificates \
9
- && rm -rf /var/lib/apt/lists/*
10
 
11
  WORKDIR /app
12
 
13
- # Build llama.cpp
14
- RUN git clone https://github.com/ggml-org/llama.cpp.git \
15
- && cmake -S llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF \
16
- && cmake --build llama.cpp/build --config Release -j2
17
 
18
  COPY requirements.txt .
19
- RUN pip install -r requirements.txt
 
 
20
 
21
- COPY app.py .
 
22
 
23
- EXPOSE 7860
24
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
FROM python:3.11-slim

# Toolchain required to compile llama.cpp from source.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git build-essential cmake curl ca-certificates && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Build a statically linked llama-server and keep only the binary:
# - shallow clone, the history is not needed for a build;
# - LLAMA_BUILD_SERVER is the option llama.cpp's CMake actually defines
#   (a bare BUILD_SERVER is ignored as an unused variable);
# - the source/build tree is removed in the same layer so it does not
#   end up in the final image.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp /tmp/llama.cpp && \
    cmake -S /tmp/llama.cpp -B /tmp/llama.cpp/build -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF && \
    cmake --build /tmp/llama.cpp/build -j && \
    cp /tmp/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server && \
    rm -rf /tmp/llama.cpp

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copies app.py and the bundled models/ directory.
COPY . .

ENV PORT=7860
# Must name the GGUF file that is actually bundled under models/
# (Qwen3.5-0.8B-Q8_0.gguf); app.py raises FileNotFoundError at startup
# when this path does not exist.
ENV MODEL_PATH=/app/models/Qwen3.5-0.8B-Q8_0.gguf

CMD ["python","app.py"]
 
app.py CHANGED
@@ -1,121 +1,94 @@
1
- import asyncio
2
- import os
3
- import subprocess
4
- import time
5
- from contextlib import asynccontextmanager
6
-
7
- import httpx
8
- from fastapi import FastAPI, Request
9
- from huggingface_hub import hf_hub_download
10
-
11
- TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "AAGgNISzjysyQzOWzTaE4Y4MKlQ8_Cz5YCI")
12
- MODEL_REPO = os.getenv("MODEL_REPO", "bartowski/Qwen3.5-0.8B-GGUF")
13
- MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3.5-0.8B-Q4_K_M.gguf")
14
- SYSTEM = os.getenv("BOT_SYSTEM", "You are CelachoBot. Reply helpfully and briefly.")
15
- PUBLIC_URL = os.getenv("PUBLIC_URL", "").rstrip("/")
16
- CTX = os.getenv("N_CTX", "2048")
17
- PREDICT = os.getenv("N_PREDICT", "128")
18
- TEMP = os.getenv("TEMPERATURE", "0.7")
19
- THREADS = os.getenv("N_THREADS", str(os.cpu_count() or 2))
20
-
21
- LLAMA = "/app/llama.cpp/build/bin/llama-server"
22
- MODEL_DIR = "/data"
23
- SERVER = "http://127.0.0.1:8080"
24
- proc = None
25
-
26
-
27
- def tg(method: str) -> str:
28
- return f"https://api.telegram.org/bot{TOKEN}/{method}"
29
-
30
-
31
- async def post_json(url: str, data: dict):
32
- async with httpx.AsyncClient(timeout=120) as c:
33
- return await c.post(url, json=data)
34
-
35
-
36
- def start_llama() -> None:
37
- global proc
38
- os.makedirs(MODEL_DIR, exist_ok=True)
39
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
40
- cmd = [
41
- LLAMA,
42
- "-m", model_path,
43
- "--host", "127.0.0.1",
44
- "--port", "8080",
45
- "-c", CTX,
46
- "-n", PREDICT,
47
- "-t", THREADS,
48
- "--no-webui",
49
- ]
50
- proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
51
-
52
-
53
- def wait_llama(timeout: int = 120) -> None:
54
- end = time.time() + timeout
55
- while time.time() < end:
56
  try:
57
- r = httpx.get(f"{SERVER}/health", timeout=2)
58
- if r.status_code == 200:
59
- return
 
 
60
  except Exception:
61
  pass
62
- time.sleep(1)
63
- raise RuntimeError("llama-server did not become ready")
64
 
 
 
 
65
 
66
- async def set_webhook() -> None:
67
- if TOKEN and PUBLIC_URL:
68
- await post_json(tg("setWebhook"), {"url": f"{PUBLIC_URL}/webhook"})
69
-
70
-
71
- @asynccontextmanager
72
- async def lifespan(app: FastAPI):
73
- start_llama()
74
- await asyncio.to_thread(wait_llama)
75
- try:
76
- await set_webhook()
77
- except Exception:
78
- pass
79
- yield
80
- global proc
81
- if proc and proc.poll() is None:
82
- proc.terminate()
83
-
84
-
85
- app = FastAPI(lifespan=lifespan)
86
-
87
-
88
- @app.get("/")
89
- async def root():
90
- return {"ok": True}
91
-
92
 
93
  @app.post("/webhook")
94
- async def webhook(req: Request):
95
- u = await req.json()
96
- m = u.get("message") or u.get("edited_message") or {}
97
- chat = (m.get("chat") or {}).get("id")
98
- text = (m.get("text") or "").strip()
99
- if not chat:
100
- return {"ok": True}
101
- if not text:
102
- await post_json(tg("sendMessage"), {"chat_id": chat, "text": "Send text only."})
103
- return {"ok": True}
104
-
105
- prompt = f"{SYSTEM}\n\nUser: {text}\nAssistant:"
106
- payload = {
107
- "prompt": prompt,
108
- "n_predict": int(PREDICT),
109
- "temperature": float(TEMP),
110
- "stop": ["\nUser:", "\n\nUser:"],
111
- }
112
  try:
113
- async with httpx.AsyncClient(timeout=120) as c:
114
- r = await c.post(f"{SERVER}/completion", json=payload)
115
- r.raise_for_status()
116
- out = r.json().get("content", "").strip() or "..."
 
 
 
 
 
 
 
 
 
 
117
  except Exception:
118
- out = "Model unavailable. Try again in a moment."
 
119
 
120
- await post_json(tg("sendMessage"), {"chat_id": chat, "text": out[:4000]})
121
- return {"ok": True}
 
1
+ import os,time,subprocess,requests
2
+ from fastapi import FastAPI,Request
3
+ from fastapi.responses import JSONResponse
4
+ import uvicorn
5
+
# --- Runtime configuration; every value can be overridden via environment ---

# Telegram bot token. Secrets must come from the environment only: there is
# deliberately no in-source default. An empty value makes boot() skip webhook
# registration. Any token that was ever committed to this repository should be
# revoked and re-issued.
T = os.getenv("TELEGRAM_BOT_TOKEN", "")
# System prompt sent as the first chat message on every completion request.
S = os.getenv("BOT_SYSTEM", "You are El Celacho. Reply helpfully and briefly.")
# Base URL that Telegram should call; "/webhook" is appended in boot().
# NOTE(review): the default is the Space's page on huggingface.co, which is
# probably not the host that serves this app directly - confirm.
U = os.getenv("PUBLIC_URL", "https://huggingface.co/spaces/Santiagoismo/CelachoBot")
# Port the FastAPI app listens on.
P = int(os.getenv("PORT", "7860"))
# Path of the GGUF model file handed to llama-server.
MP = os.getenv("MODEL_PATH", "/app/models/Qwen3.5-0.8B-Q8_0.gguf")
# Loopback port of the local llama-server process.
LPORT = 8081

app = FastAPI()
# Set to True by boot() once llama-server is reachable; read by /health and /webhook.
ready = False
15
+
def tg(m, **kw):
    """Call Telegram Bot API method *m*, sending the keyword arguments as JSON.

    Returns the ``requests`` response object; its status is not inspected here.
    """
    endpoint = f"https://api.telegram.org/bot{T}/{m}"
    return requests.post(endpoint, json=kw, timeout=60)
18
+
def ask(msg):
    """Send *msg* to the local llama-server chat endpoint and return its reply.

    The request carries the system prompt ``S`` followed by the user message.
    Raises the ``requests`` HTTP error on a non-2xx answer (via
    ``raise_for_status``); returns the first choice's content, stripped.
    """
    conversation = [
        {"role": "system", "content": S},
        {"role": "user", "content": msg},
    ]
    body = {
        "messages": conversation,
        "temperature": 0.7,
        "max_tokens": 200,
    }
    resp = requests.post(
        f"http://127.0.0.1:{LPORT}/v1/chat/completions",
        json=body,
        timeout=180,
    )
    resp.raise_for_status()
    first_choice = resp.json()["choices"][0]
    return first_choice["message"]["content"].strip()
34
+
def boot():
    """Start llama-server, wait until it can serve, then register the webhook.

    Launches ``llama-server`` on ``127.0.0.1:LPORT`` with the model at ``MP``
    and polls its ``/health`` endpoint up to 120 times, 2 seconds apart. The
    module-level ``ready`` flag is set only when ``/health`` answers HTTP 200.
    Afterwards, when both ``T`` and ``U`` are set, Telegram's ``setWebhook`` is
    called on a best-effort basis.

    Raises:
        FileNotFoundError: if no file exists at ``MP``.
    """
    global ready
    if not os.path.exists(MP):
        raise FileNotFoundError(MP)

    server = subprocess.Popen([
        "llama-server",
        "-m", MP,
        "--host", "127.0.0.1",
        "--port", str(LPORT),
        "-ngl", "0",
        "-c", "2048",
    ])

    health_url = f"http://127.0.0.1:{LPORT}/health"
    for _ in range(120):
        if server.poll() is not None:
            # The server process has exited; further polling cannot succeed.
            break
        try:
            # llama-server answers 503 on /health while the model is still
            # loading, so only a 200 means completions can be served.
            if requests.get(health_url, timeout=2).status_code == 200:
                ready = True
                break
        except requests.RequestException:
            # Not listening yet (connection refused / timeout): keep waiting.
            pass
        time.sleep(2)

    if T and U:
        try:
            requests.get(
                f"https://api.telegram.org/bot{T}/setWebhook",
                params={"url": U.rstrip("/") + "/webhook"},
                timeout=30,
            )
        except requests.RequestException:
            # Best effort: a failed registration must not abort startup.
            pass
 
 
63
 
@app.on_event("startup")
def startup():
    """Application startup hook: delegate all initialisation to ``boot``."""
    boot()
67
 
@app.get("/health")
def health():
    """Report liveness; ``model_ready`` mirrors the module-level ``ready`` flag."""
    status = {"ok": True}
    status["model_ready"] = ready
    return status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
@app.post("/webhook")
async def webhook(req:Request):
    """Handle one Telegram update and always acknowledge it with ``{"ok": True}``.

    Updates without a message/chat id are ignored. Non-text messages get a
    "Send text only." reply, messages arriving before the model is ready get a
    "waking up" reply, and everything else is answered with the model's
    completion, truncated to 4000 characters.

    ``ask`` and ``tg`` use the blocking ``requests`` library (``ask`` may wait
    up to 180 s), so they are run in a worker thread; calling them directly
    from this coroutine would stall the event loop and every other request,
    including ``/health``.

    Failures are logged rather than raised, so the endpoint still answers 200
    and the same failing update is not redelivered.
    """
    # Imported locally so this handler does not depend on file-level imports.
    import asyncio
    import logging

    try:
        u = await req.json()
        m = u.get("message", {})
        c = m.get("chat", {}).get("id")
        t = m.get("text")
        if not c:
            return {"ok": True}
        if not t:
            reply = "Send text only."
        elif not ready:
            reply = "Model waking up. Retry in a moment."
        else:
            a = await asyncio.to_thread(ask, t)
            reply = (a or "...")[:4000]
        await asyncio.to_thread(tg, "sendMessage", chat_id=c, text=reply)
    except Exception:
        # Top-level boundary: record the failure instead of hiding it.
        logging.getLogger(__name__).exception("failed to handle Telegram update")
    return JSONResponse({"ok": True})
92
 
# Script entry point: serve the app with uvicorn on all interfaces, port P.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=P,
    )
models/Qwen3.5-0.8B-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ad885ffd4bb022fc4f0d33a3308fa108ef8613159d3b3a67e23abca056b7a6c
3
+ size 811843840
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
- fastapi==0.115.12
2
- uvicorn[standard]==0.34.0
3
- httpx==0.28.1
4
- huggingface_hub==0.30.2
 
# Pinned so image builds are reproducible; app.py relies on the deprecated
# @app.on_event hook, which an arbitrary future FastAPI release may drop.
fastapi==0.115.12
uvicorn[standard]==0.34.0
requests==2.32.3