Upload 5 files
Browse files- Dockerfile +10 -5
- health-server.js +25 -13
- hermes-sync.py +35 -15
- start.sh +32 -18
- telegram-bot.js +116 -29
Dockerfile
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
# ββ System dependencies βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
curl \
|
| 6 |
wget \
|
|
@@ -10,13 +12,15 @@ RUN apt-get update && apt-get install -y \
|
|
| 10 |
build-essential \
|
| 11 |
cmake \
|
| 12 |
supervisor \
|
|
|
|
|
|
|
| 13 |
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
|
| 15 |
# ββ Python dependencies βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
-
#
|
|
|
|
| 17 |
RUN pip install --no-cache-dir \
|
| 18 |
-
"llama-cpp-python[server]"
|
| 19 |
-
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 20 |
|
| 21 |
# Install server sub-dependencies explicitly as safety net
|
| 22 |
RUN pip install --no-cache-dir \
|
|
@@ -36,7 +40,8 @@ RUN pip install --no-cache-dir \
|
|
| 36 |
requests \
|
| 37 |
schedule \
|
| 38 |
diskcache \
|
| 39 |
-
numpy
|
|
|
|
| 40 |
|
| 41 |
# ββ Node.js gateway dependencies βββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
WORKDIR /app
|
|
@@ -57,4 +62,4 @@ RUN python3 -c "from llama_cpp.server.app import create_app; print('β
llama_cp
|
|
| 57 |
|
| 58 |
EXPOSE 7860
|
| 59 |
|
| 60 |
-
CMD ["./start.sh"]
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
# ββ System dependencies βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
# FIX #11: Added libopenblas-dev + libgomp1 β llama-cpp-python built with BLAS
|
| 5 |
+
# gives ~2x faster CPU matrix multiplication (tokenisation + inference)
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
curl \
|
| 8 |
wget \
|
|
|
|
| 12 |
build-essential \
|
| 13 |
cmake \
|
| 14 |
supervisor \
|
| 15 |
+
libopenblas-dev \
|
| 16 |
+
libgomp1 \
|
| 17 |
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
# ββ Python dependencies βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
+
# FIX #11: Build llama-cpp-python with OpenBLAS support instead of pure CPU
|
| 21 |
+
ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
|
| 22 |
RUN pip install --no-cache-dir \
|
| 23 |
+
"llama-cpp-python[server]"
|
|
|
|
| 24 |
|
| 25 |
# Install server sub-dependencies explicitly as safety net
|
| 26 |
RUN pip install --no-cache-dir \
|
|
|
|
| 40 |
requests \
|
| 41 |
schedule \
|
| 42 |
diskcache \
|
| 43 |
+
numpy \
|
| 44 |
+
filelock
|
| 45 |
|
| 46 |
# ββ Node.js gateway dependencies βββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
WORKDIR /app
|
|
|
|
| 62 |
|
| 63 |
EXPOSE 7860
|
| 64 |
|
| 65 |
+
CMD ["./start.sh"]
|
health-server.js
CHANGED
|
@@ -24,7 +24,7 @@ app.get('/health', async (req, res) => {
|
|
| 24 |
try {
|
| 25 |
const controller = new AbortController();
|
| 26 |
const timeout = setTimeout(() => controller.abort(), 5000);
|
| 27 |
-
|
| 28 |
const resp = await fetch(`${LLAMA_URL}/v1/models`, {
|
| 29 |
headers: { 'Authorization': `Bearer ${GATEWAY_TOKEN}` },
|
| 30 |
signal: controller.signal
|
|
@@ -40,18 +40,30 @@ app.get('/health', async (req, res) => {
|
|
| 40 |
}
|
| 41 |
});
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
res.
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
-
|
| 55 |
|
| 56 |
app.use('/v1', requireAuth, createProxyMiddleware({
|
| 57 |
target: LLAMA_URL,
|
|
@@ -65,4 +77,4 @@ app.use('/v1', requireAuth, createProxyMiddleware({
|
|
| 65 |
|
| 66 |
app.listen(PORT, '0.0.0.0', () => {
|
| 67 |
console.log(`π Gateway running on port ${PORT}`);
|
| 68 |
-
});
|
|
|
|
| 24 |
try {
|
| 25 |
const controller = new AbortController();
|
| 26 |
const timeout = setTimeout(() => controller.abort(), 5000);
|
| 27 |
+
|
| 28 |
const resp = await fetch(`${LLAMA_URL}/v1/models`, {
|
| 29 |
headers: { 'Authorization': `Bearer ${GATEWAY_TOKEN}` },
|
| 30 |
signal: controller.signal
|
|
|
|
| 40 |
}
|
| 41 |
});
|
| 42 |
|
| 43 |
+
// FIX #10: Added body size limit (1mb) to prevent DoS via oversized payloads.
|
| 44 |
+
// FIX #12: Telegram webhook is intentionally open (Telegram needs to POST here
|
| 45 |
+
// without auth). Kept as-is but capped body size for safety.
|
| 46 |
+
app.use('/telegram',
|
| 47 |
+
express.json({ limit: '1mb' }),
|
| 48 |
+
async (req, res) => {
|
| 49 |
+
if (req.method !== 'POST') return res.status(405).send('Method Not Allowed');
|
| 50 |
+
|
| 51 |
+
// Basic sanity-check: Telegram updates always have update_id
|
| 52 |
+
if (!req.body || typeof req.body.update_id !== 'number') {
|
| 53 |
+
return res.status(400).json({ ok: false, error: 'Invalid update payload' });
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
try {
|
| 57 |
+
const { handleTelegramUpdate } = require('./telegram-bot');
|
| 58 |
+
await handleTelegramUpdate(req.body);
|
| 59 |
+
// Always return 200 to Telegram to prevent retries
|
| 60 |
+
res.json({ ok: true });
|
| 61 |
+
} catch (e) {
|
| 62 |
+
console.error('Webhook Endpoint Error:', e.message);
|
| 63 |
+
res.status(200).json({ ok: false, error: 'Handled internal error' });
|
| 64 |
+
}
|
| 65 |
}
|
| 66 |
+
);
|
| 67 |
|
| 68 |
app.use('/v1', requireAuth, createProxyMiddleware({
|
| 69 |
target: LLAMA_URL,
|
|
|
|
| 77 |
|
| 78 |
app.listen(PORT, '0.0.0.0', () => {
|
| 79 |
console.log(`π Gateway running on port ${PORT}`);
|
| 80 |
+
});
|
hermes-sync.py
CHANGED
|
@@ -1,28 +1,38 @@
|
|
| 1 |
import os, json, time, schedule, threading
|
| 2 |
from pathlib import Path
|
| 3 |
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 6 |
HF_DATASET = os.environ.get("HF_BACKUP_DATASET", "your-username/llm-space-backup")
|
| 7 |
DATA_DIR = Path("/app/data")
|
| 8 |
SYNC_EVERY = int(os.environ.get("SYNC_INTERVAL_MINUTES", "3"))
|
|
|
|
| 9 |
|
| 10 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 11 |
|
|
|
|
| 12 |
def save_conversations(conversations: dict):
|
| 13 |
-
"""Save in-memory conversations to disk."""
|
| 14 |
path = DATA_DIR / "conversations.json"
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def load_conversations() -> dict:
|
| 19 |
-
"""Load conversations from disk on boot."""
|
| 20 |
path = DATA_DIR / "conversations.json"
|
| 21 |
if path.exists():
|
| 22 |
-
with
|
| 23 |
-
|
|
|
|
| 24 |
return {}
|
| 25 |
|
|
|
|
| 26 |
def sync_to_hf_dataset():
|
| 27 |
"""Push backup data to HuggingFace private Dataset."""
|
| 28 |
if not HF_TOKEN:
|
|
@@ -43,8 +53,10 @@ def sync_to_hf_dataset():
|
|
| 43 |
except Exception:
|
| 44 |
pass
|
| 45 |
|
| 46 |
-
# Upload all data files
|
| 47 |
for file in DATA_DIR.glob("*"):
|
|
|
|
|
|
|
| 48 |
api.upload_file(
|
| 49 |
path_or_fileobj=str(file),
|
| 50 |
path_in_repo=file.name,
|
|
@@ -57,17 +69,24 @@ def sync_to_hf_dataset():
|
|
| 57 |
except Exception as e:
|
| 58 |
print(f"β Sync failed: {e}")
|
| 59 |
|
|
|
|
| 60 |
def restore_from_hf_dataset():
|
| 61 |
"""Restore backup from HF Dataset on boot."""
|
| 62 |
if not HF_TOKEN:
|
| 63 |
return
|
| 64 |
try:
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
for fname in files:
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
repo_id=HF_DATASET,
|
| 72 |
filename=fname,
|
| 73 |
repo_type="dataset",
|
|
@@ -78,13 +97,14 @@ def restore_from_hf_dataset():
|
|
| 78 |
except Exception as e:
|
| 79 |
print(f"β οΈ Could not restore backup (first run?): {e}")
|
| 80 |
|
|
|
|
| 81 |
if __name__ == "__main__":
|
| 82 |
print("πΎ Hermes Sync starting...")
|
| 83 |
restore_from_hf_dataset()
|
| 84 |
-
|
| 85 |
schedule.every(SYNC_EVERY).minutes.do(sync_to_hf_dataset)
|
| 86 |
-
|
| 87 |
print(f"π Syncing every {SYNC_EVERY} minutes to {HF_DATASET}")
|
| 88 |
while True:
|
| 89 |
schedule.run_pending()
|
| 90 |
-
time.sleep(30)
|
|
|
|
| 1 |
import os, json, time, schedule, threading
|
| 2 |
from pathlib import Path
|
| 3 |
from datetime import datetime
|
| 4 |
+
# FIX #13: Added filelock to prevent race conditions when Node.js and this
|
| 5 |
+
# script both try to read/write conversations.json simultaneously
|
| 6 |
+
from filelock import FileLock
|
| 7 |
|
| 8 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 9 |
HF_DATASET = os.environ.get("HF_BACKUP_DATASET", "your-username/llm-space-backup")
|
| 10 |
DATA_DIR = Path("/app/data")
|
| 11 |
SYNC_EVERY = int(os.environ.get("SYNC_INTERVAL_MINUTES", "3"))
|
| 12 |
+
LOCK_PATH = str(DATA_DIR / "conversations.lock")
|
| 13 |
|
| 14 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 15 |
|
| 16 |
+
|
| 17 |
def save_conversations(conversations: dict):
|
| 18 |
+
"""Save in-memory conversations to disk (with file lock)."""
|
| 19 |
path = DATA_DIR / "conversations.json"
|
| 20 |
+
# FIX #13: File lock prevents torn writes when Node.js is also writing
|
| 21 |
+
with FileLock(LOCK_PATH, timeout=10):
|
| 22 |
+
with open(path, "w") as f:
|
| 23 |
+
json.dump(conversations, f, indent=2, default=str)
|
| 24 |
+
|
| 25 |
|
| 26 |
def load_conversations() -> dict:
|
| 27 |
+
"""Load conversations from disk on boot (with file lock)."""
|
| 28 |
path = DATA_DIR / "conversations.json"
|
| 29 |
if path.exists():
|
| 30 |
+
with FileLock(LOCK_PATH, timeout=10):
|
| 31 |
+
with open(path) as f:
|
| 32 |
+
return json.load(f)
|
| 33 |
return {}
|
| 34 |
|
| 35 |
+
|
| 36 |
def sync_to_hf_dataset():
|
| 37 |
"""Push backup data to HuggingFace private Dataset."""
|
| 38 |
if not HF_TOKEN:
|
|
|
|
| 53 |
except Exception:
|
| 54 |
pass
|
| 55 |
|
| 56 |
+
# Upload all data files (skip lock file)
|
| 57 |
for file in DATA_DIR.glob("*"):
|
| 58 |
+
if file.suffix == ".lock":
|
| 59 |
+
continue
|
| 60 |
api.upload_file(
|
| 61 |
path_or_fileobj=str(file),
|
| 62 |
path_in_repo=file.name,
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
print(f"β Sync failed: {e}")
|
| 71 |
|
| 72 |
+
|
| 73 |
def restore_from_hf_dataset():
|
| 74 |
"""Restore backup from HF Dataset on boot."""
|
| 75 |
if not HF_TOKEN:
|
| 76 |
return
|
| 77 |
try:
|
| 78 |
+
# FIX #1 (CRITICAL): The original code called `api.hf_hub_download()`
|
| 79 |
+
# which does NOT exist on the HfApi class. This caused a silent
|
| 80 |
+
# AttributeError meaning conversations were NEVER restored on restart.
|
| 81 |
+
# Correct approach: use the module-level `hf_hub_download` function.
|
| 82 |
+
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
|
| 83 |
+
|
| 84 |
+
files = list_repo_files(repo_id=HF_DATASET, repo_type="dataset", token=HF_TOKEN)
|
| 85 |
+
|
| 86 |
for fname in files:
|
| 87 |
+
if fname.endswith(".lock"):
|
| 88 |
+
continue
|
| 89 |
+
hf_hub_download( # β was `api.hf_hub_download` (bug)
|
| 90 |
repo_id=HF_DATASET,
|
| 91 |
filename=fname,
|
| 92 |
repo_type="dataset",
|
|
|
|
| 97 |
except Exception as e:
|
| 98 |
print(f"β οΈ Could not restore backup (first run?): {e}")
|
| 99 |
|
| 100 |
+
|
| 101 |
if __name__ == "__main__":
|
| 102 |
print("πΎ Hermes Sync starting...")
|
| 103 |
restore_from_hf_dataset()
|
| 104 |
+
|
| 105 |
schedule.every(SYNC_EVERY).minutes.do(sync_to_hf_dataset)
|
| 106 |
+
|
| 107 |
print(f"π Syncing every {SYNC_EVERY} minutes to {HF_DATASET}")
|
| 108 |
while True:
|
| 109 |
schedule.run_pending()
|
| 110 |
+
time.sleep(30)
|
start.sh
CHANGED
|
@@ -3,7 +3,7 @@ set -e
|
|
| 3 |
|
| 4 |
echo "π Starting LLM Space..."
|
| 5 |
|
| 6 |
-
# ββ
|
| 7 |
echo "π Running pre-flight checks..."
|
| 8 |
|
| 9 |
python3 - <<'PYEOF'
|
|
@@ -19,6 +19,7 @@ checks = [
|
|
| 19 |
("pydantic", "pydantic"),
|
| 20 |
("huggingface_hub", "huggingface_hub"),
|
| 21 |
("schedule", "schedule"),
|
|
|
|
| 22 |
]
|
| 23 |
|
| 24 |
failed = []
|
|
@@ -38,17 +39,23 @@ if failed:
|
|
| 38 |
print("β
All pre-flight checks passed!")
|
| 39 |
PYEOF
|
| 40 |
|
| 41 |
-
# ββ
|
| 42 |
MODEL_PATH="/app/models/model.gguf"
|
| 43 |
|
| 44 |
if [ ! -f "$MODEL_PATH" ]; then
|
| 45 |
-
echo "π₯ Downloading model: ${MODEL_HF_ID:-bartowski/
|
| 46 |
python3 - <<'PYEOF'
|
| 47 |
import os, sys, shutil
|
| 48 |
from huggingface_hub import hf_hub_download
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
hf_token = os.environ.get("HF_TOKEN")
|
| 53 |
|
| 54 |
print(f" Repo : {model_id}")
|
|
@@ -74,15 +81,25 @@ else
|
|
| 74 |
echo "β
Model already cached at $MODEL_PATH"
|
| 75 |
fi
|
| 76 |
|
| 77 |
-
# ββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
echo "π§ Starting llama.cpp inference server..."
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
python3 -m llama_cpp.server \
|
| 81 |
--model /app/models/model.gguf \
|
| 82 |
--host 127.0.0.1 \
|
| 83 |
--port 8080 \
|
| 84 |
-
--n_ctx "${CONTEXT_LENGTH:-
|
| 85 |
-
--n_threads "${
|
|
|
|
| 86 |
--chat_format chatml \
|
| 87 |
--api_key "${GATEWAY_TOKEN:-changeme}" \
|
| 88 |
> /app/logs/llama.log 2>&1 &
|
|
@@ -90,10 +107,10 @@ python3 -m llama_cpp.server \
|
|
| 90 |
LLAMA_PID=$!
|
| 91 |
echo "llama.cpp PID: $LLAMA_PID"
|
| 92 |
|
| 93 |
-
# ββ
|
| 94 |
-
echo "β³ Waiting for llama.cpp server to load model
|
| 95 |
WAIT_SECS=0
|
| 96 |
-
MAX_WAIT=
|
| 97 |
|
| 98 |
while [ $WAIT_SECS -lt $MAX_WAIT ]; do
|
| 99 |
|
|
@@ -101,13 +118,11 @@ while [ $WAIT_SECS -lt $MAX_WAIT ]; do
|
|
| 101 |
-H "Authorization: Bearer ${GATEWAY_TOKEN:-changeme}" \
|
| 102 |
http://127.0.0.1:8080/v1/models 2>/dev/null || echo "000")
|
| 103 |
|
| 104 |
-
# 200 = ready, 401 = server up but wrong token (still means server is alive)
|
| 105 |
if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
|
| 106 |
echo "β
llama.cpp server ready after ${WAIT_SECS}s (HTTP $HTTP_CODE)"
|
| 107 |
break
|
| 108 |
fi
|
| 109 |
|
| 110 |
-
# Check if process silently died
|
| 111 |
if ! kill -0 $LLAMA_PID 2>/dev/null; then
|
| 112 |
echo ""
|
| 113 |
echo "β llama.cpp process crashed! Last 50 lines of log:"
|
|
@@ -117,7 +132,6 @@ while [ $WAIT_SECS -lt $MAX_WAIT ]; do
|
|
| 117 |
exit 1
|
| 118 |
fi
|
| 119 |
|
| 120 |
-
# Progress report every 30s
|
| 121 |
if [ $((WAIT_SECS % 30)) -eq 0 ] && [ $WAIT_SECS -gt 0 ]; then
|
| 122 |
echo " β³ Still loading... ${WAIT_SECS}s elapsed (HTTP last=$HTTP_CODE)"
|
| 123 |
tail -1 /app/logs/llama.log 2>/dev/null || true
|
|
@@ -133,11 +147,11 @@ if [ $WAIT_SECS -ge $MAX_WAIT ]; then
|
|
| 133 |
exit 1
|
| 134 |
fi
|
| 135 |
|
| 136 |
-
# ββ
|
| 137 |
echo "πΎ Starting memory sync..."
|
| 138 |
python3 /app/hermes-sync.py > /app/logs/sync.log 2>&1 &
|
| 139 |
|
| 140 |
-
# ββ
|
| 141 |
if [ -n "$CLOUDFLARE_WORKERS_TOKEN" ] && [ -n "$CLOUDFLARE_ACCOUNT_ID" ]; then
|
| 142 |
echo "βοΈ Setting up Cloudflare Workers..."
|
| 143 |
python3 /app/setup-cloudflare.py
|
|
@@ -145,6 +159,6 @@ else
|
|
| 145 |
echo "β οΈ Cloudflare secrets not set β skipping keep-alive & proxy"
|
| 146 |
fi
|
| 147 |
|
| 148 |
-
# ββ
|
| 149 |
echo "π Starting gateway on port 7860..."
|
| 150 |
-
node /app/health-server.js
|
|
|
|
| 3 |
|
| 4 |
echo "π Starting LLM Space..."
|
| 5 |
|
| 6 |
+
# ββ Pre-flight: verify all Python modules are importable ββββββββββββββββββββ
|
| 7 |
echo "π Running pre-flight checks..."
|
| 8 |
|
| 9 |
python3 - <<'PYEOF'
|
|
|
|
| 19 |
("pydantic", "pydantic"),
|
| 20 |
("huggingface_hub", "huggingface_hub"),
|
| 21 |
("schedule", "schedule"),
|
| 22 |
+
("filelock", "filelock"),
|
| 23 |
]
|
| 24 |
|
| 25 |
failed = []
|
|
|
|
| 39 |
print("β
All pre-flight checks passed!")
|
| 40 |
PYEOF
|
| 41 |
|
| 42 |
+
# ββ Download model if not cached ββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
MODEL_PATH="/app/models/model.gguf"
|
| 44 |
|
| 45 |
if [ ! -f "$MODEL_PATH" ]; then
|
| 46 |
+
echo "π₯ Downloading model: ${MODEL_HF_ID:-bartowski/Phi-3.5-mini-instruct-GGUF}"
|
| 47 |
python3 - <<'PYEOF'
|
| 48 |
import os, sys, shutil
|
| 49 |
from huggingface_hub import hf_hub_download
|
| 50 |
|
| 51 |
+
# FIX #4 / Model recommendation: Phi-3.5-mini Q4_K_M is ~3x faster than Qwen2.5-7B
|
| 52 |
+
# on CPU while maintaining strong instruction-following quality.
|
| 53 |
+
# Other fast options (set via env vars):
|
| 54 |
+
# bartowski/Qwen2.5-3B-Instruct-GGUF / Qwen2.5-3B-Instruct-Q4_K_M.gguf (fastest Qwen)
|
| 55 |
+
# bartowski/Llama-3.2-3B-Instruct-GGUF / Llama-3.2-3B-Instruct-Q4_K_M.gguf
|
| 56 |
+
# bartowski/Qwen2.5-1.5B-Instruct-GGUF / Qwen2.5-1.5B-Instruct-Q8_0.gguf (tiny + fast)
|
| 57 |
+
model_id = os.environ.get("MODEL_HF_ID", "bartowski/Phi-3.5-mini-instruct-GGUF")
|
| 58 |
+
filename = os.environ.get("MODEL_FILENAME", "Phi-3.5-mini-instruct-Q4_K_M.gguf")
|
| 59 |
hf_token = os.environ.get("HF_TOKEN")
|
| 60 |
|
| 61 |
print(f" Repo : {model_id}")
|
|
|
|
| 81 |
echo "β
Model already cached at $MODEL_PATH"
|
| 82 |
fi
|
| 83 |
|
| 84 |
+
# ββ Detect CPU count dynamically βββββββββββββββββββββββββββββββββββββββββββββ
|
| 85 |
+
# FIX #4: Was hardcoded to 4; now uses all available cores for max throughput
|
| 86 |
+
CPU_COUNT=$(nproc)
|
| 87 |
+
THREADS="${CPU_THREADS:-$CPU_COUNT}"
|
| 88 |
+
echo "π₯οΈ Detected ${CPU_COUNT} CPU cores β using ${THREADS} threads"
|
| 89 |
+
|
| 90 |
+
# ββ Start llama.cpp server βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
echo "π§ Starting llama.cpp inference server..."
|
| 92 |
|
| 93 |
+
# FIX #5: Added --n_batch 512 (explicit, helps prompt processing speed)
|
| 94 |
+
# FIX #6: Reduced default CONTEXT_LENGTH to 2048 (cuts KV-cache 50%, faster inference)
|
| 95 |
+
# If you need longer context, set CONTEXT_LENGTH=4096 in Space secrets.
|
| 96 |
python3 -m llama_cpp.server \
|
| 97 |
--model /app/models/model.gguf \
|
| 98 |
--host 127.0.0.1 \
|
| 99 |
--port 8080 \
|
| 100 |
+
--n_ctx "${CONTEXT_LENGTH:-2048}" \
|
| 101 |
+
--n_threads "${THREADS}" \
|
| 102 |
+
--n_batch "${BATCH_SIZE:-512}" \
|
| 103 |
--chat_format chatml \
|
| 104 |
--api_key "${GATEWAY_TOKEN:-changeme}" \
|
| 105 |
> /app/logs/llama.log 2>&1 &
|
|
|
|
| 107 |
LLAMA_PID=$!
|
| 108 |
echo "llama.cpp PID: $LLAMA_PID"
|
| 109 |
|
| 110 |
+
# ββ Wait for llama.cpp to be ready ββββββββββββββββββββββββββββββββββββββββββ
|
| 111 |
+
echo "β³ Waiting for llama.cpp server to load model..."
|
| 112 |
WAIT_SECS=0
|
| 113 |
+
MAX_WAIT=480 # 8 minutes (smaller models load faster)
|
| 114 |
|
| 115 |
while [ $WAIT_SECS -lt $MAX_WAIT ]; do
|
| 116 |
|
|
|
|
| 118 |
-H "Authorization: Bearer ${GATEWAY_TOKEN:-changeme}" \
|
| 119 |
http://127.0.0.1:8080/v1/models 2>/dev/null || echo "000")
|
| 120 |
|
|
|
|
| 121 |
if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
|
| 122 |
echo "β
llama.cpp server ready after ${WAIT_SECS}s (HTTP $HTTP_CODE)"
|
| 123 |
break
|
| 124 |
fi
|
| 125 |
|
|
|
|
| 126 |
if ! kill -0 $LLAMA_PID 2>/dev/null; then
|
| 127 |
echo ""
|
| 128 |
echo "β llama.cpp process crashed! Last 50 lines of log:"
|
|
|
|
| 132 |
exit 1
|
| 133 |
fi
|
| 134 |
|
|
|
|
| 135 |
if [ $((WAIT_SECS % 30)) -eq 0 ] && [ $WAIT_SECS -gt 0 ]; then
|
| 136 |
echo " β³ Still loading... ${WAIT_SECS}s elapsed (HTTP last=$HTTP_CODE)"
|
| 137 |
tail -1 /app/logs/llama.log 2>/dev/null || true
|
|
|
|
| 147 |
exit 1
|
| 148 |
fi
|
| 149 |
|
| 150 |
+
# ββ Start persistent memory sync βββββββββββββββββββββββββββββββββββββββββββββ
|
| 151 |
echo "πΎ Starting memory sync..."
|
| 152 |
python3 /app/hermes-sync.py > /app/logs/sync.log 2>&1 &
|
| 153 |
|
| 154 |
+
# ββ Setup Cloudflare Workers ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
if [ -n "$CLOUDFLARE_WORKERS_TOKEN" ] && [ -n "$CLOUDFLARE_ACCOUNT_ID" ]; then
|
| 156 |
echo "βοΈ Setting up Cloudflare Workers..."
|
| 157 |
python3 /app/setup-cloudflare.py
|
|
|
|
| 159 |
echo "β οΈ Cloudflare secrets not set β skipping keep-alive & proxy"
|
| 160 |
fi
|
| 161 |
|
| 162 |
+
# ββ Start gateway server ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 163 |
echo "π Starting gateway on port 7860..."
|
| 164 |
+
node /app/health-server.js
|
telegram-bot.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
'use strict';
|
| 2 |
const fetch = require('node-fetch');
|
|
|
|
|
|
|
| 3 |
|
| 4 |
// ββ Environment βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
const BOT_TOKEN = process.env.TELEGRAM_BOT_TOKEN;
|
|
@@ -7,6 +9,13 @@ const GATEWAY_TOKEN = process.env.GATEWAY_TOKEN;
|
|
| 7 |
const LLAMA_URL = 'http://127.0.0.1:8080';
|
| 8 |
const PROXY_BASE = (process.env.CLOUDFLARE_TELEGRAM_PROXY_URL || 'https://api.telegram.org').replace(/\/$/, '');
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
// ββ Deduplication tracking ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
const processedUpdates = new Set();
|
| 12 |
|
|
@@ -25,11 +34,39 @@ if (!GATEWAY_TOKEN || GATEWAY_TOKEN === 'changeme') {
|
|
| 25 |
}
|
| 26 |
|
| 27 |
// ββ In-memory conversation history βββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
const CONV_TTL_MS
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
setInterval(() => {
|
| 34 |
const now = Date.now();
|
| 35 |
let cleaned = 0;
|
|
@@ -40,7 +77,10 @@ setInterval(() => {
|
|
| 40 |
cleaned++;
|
| 41 |
}
|
| 42 |
}
|
| 43 |
-
if (cleaned > 0)
|
|
|
|
|
|
|
|
|
|
| 44 |
}, 30 * 60 * 1000);
|
| 45 |
|
| 46 |
// ββ Core Telegram API helper ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -64,12 +104,49 @@ async function telegramCall(method, body) {
|
|
| 64 |
}
|
| 65 |
}
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
async function sendTelegram(chatId, text) {
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
text
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
}
|
| 74 |
|
| 75 |
async function sendTyping(chatId) {
|
|
@@ -79,7 +156,7 @@ async function sendTyping(chatId) {
|
|
| 79 |
});
|
| 80 |
}
|
| 81 |
|
| 82 |
-
// ββ LLM Call βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
async function callLLM(messages) {
|
| 84 |
const res = await fetch(`${LLAMA_URL}/v1/chat/completions`, {
|
| 85 |
method: 'POST',
|
|
@@ -90,7 +167,10 @@ async function callLLM(messages) {
|
|
| 90 |
body: JSON.stringify({
|
| 91 |
model: 'local-model',
|
| 92 |
messages,
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
| 94 |
temperature: parseFloat(process.env.TEMPERATURE || '0.7'),
|
| 95 |
stream: false
|
| 96 |
})
|
|
@@ -104,16 +184,14 @@ async function callLLM(messages) {
|
|
| 104 |
|
| 105 |
// ββ Update Handler ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 106 |
async function handleTelegramUpdate(update) {
|
| 107 |
-
//
|
| 108 |
-
if (!update.update_id || processedUpdates.has(update.update_id))
|
| 109 |
-
return; // Ignore the retry
|
| 110 |
-
}
|
| 111 |
processedUpdates.add(update.update_id);
|
| 112 |
|
| 113 |
-
//
|
| 114 |
-
if (processedUpdates.size >
|
| 115 |
-
const
|
| 116 |
-
processedUpdates.delete(
|
| 117 |
}
|
| 118 |
|
| 119 |
try {
|
|
@@ -126,7 +204,6 @@ async function handleTelegramUpdate(update) {
|
|
| 126 |
const username = (msg.from && msg.from.username) || String(userId);
|
| 127 |
|
| 128 |
console.log(`π¨ [${username}] ${scrub(text)}`);
|
| 129 |
-
|
| 130 |
lastActive[chatId] = Date.now();
|
| 131 |
|
| 132 |
if (!conversations[chatId]) {
|
|
@@ -143,20 +220,27 @@ async function handleTelegramUpdate(update) {
|
|
| 143 |
}
|
| 144 |
if (text === '/clear') {
|
| 145 |
conversations[chatId] = conversations[chatId].slice(0, 1);
|
|
|
|
| 146 |
await sendTelegram(chatId, 'π§Ή Conversation history cleared!');
|
| 147 |
return;
|
| 148 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
// Normal message processing
|
| 151 |
conversations[chatId].push({ role: 'user', content: text });
|
| 152 |
|
| 153 |
-
// History management
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
| 157 |
}
|
| 158 |
|
| 159 |
-
// LLM Interaction
|
| 160 |
await sendTyping(chatId);
|
| 161 |
const reply = await callLLM(conversations[chatId]);
|
| 162 |
conversations[chatId].push({ role: 'assistant', content: reply });
|
|
@@ -165,14 +249,17 @@ async function handleTelegramUpdate(update) {
|
|
| 165 |
} catch (e) {
|
| 166 |
const safeMsg = scrub(e.message);
|
| 167 |
console.error('β Update Handler Error:', safeMsg);
|
| 168 |
-
|
| 169 |
try {
|
| 170 |
const chatId = (update.message || update.edited_message)?.chat?.id;
|
| 171 |
if (chatId) {
|
| 172 |
-
await sendTelegram(chatId, `β οΈ
|
| 173 |
}
|
| 174 |
} catch (_) {}
|
| 175 |
}
|
| 176 |
}
|
| 177 |
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
'use strict';
|
| 2 |
const fetch = require('node-fetch');
|
| 3 |
+
const fs = require('fs');
|
| 4 |
+
const path = require('path');
|
| 5 |
|
| 6 |
// ββ Environment βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 7 |
const BOT_TOKEN = process.env.TELEGRAM_BOT_TOKEN;
|
|
|
|
| 9 |
const LLAMA_URL = 'http://127.0.0.1:8080';
|
| 10 |
const PROXY_BASE = (process.env.CLOUDFLARE_TELEGRAM_PROXY_URL || 'https://api.telegram.org').replace(/\/$/, '');
|
| 11 |
|
| 12 |
+
// ββ Persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 13 |
+
// FIX #2: Conversations are now saved to disk so hermes-sync can back them up
|
| 14 |
+
// and they survive Space restarts.
|
| 15 |
+
const DATA_DIR = '/app/data';
|
| 16 |
+
const CONV_FILE = path.join(DATA_DIR, 'conversations.json');
|
| 17 |
+
const SAVE_INTERVAL = 60 * 1000; // write to disk every 60s
|
| 18 |
+
|
| 19 |
// ββ Deduplication tracking ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
const processedUpdates = new Set();
|
| 21 |
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
// ββ In-memory conversation history βββββββββββββββββββββββββββββββββββββββββββ
|
| 37 |
+
let conversations = {};
|
| 38 |
+
let lastActive = {};
|
| 39 |
+
const CONV_TTL_MS = 60 * 60 * 1000;
|
| 40 |
+
// FIX #8: Reduced from 15 to 10 turns β shorter prompt = faster inference.
|
| 41 |
+
// Override with MAX_HISTORY_TURNS env var if you need more context.
|
| 42 |
+
const MAX_HIST = parseInt(process.env.MAX_HISTORY_TURNS || '10');
|
| 43 |
+
|
| 44 |
+
// ββ Boot: restore conversations from disk ββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
// FIX #2: Load persisted conversations on startup so history survives restarts
|
| 46 |
+
try {
|
| 47 |
+
if (fs.existsSync(CONV_FILE)) {
|
| 48 |
+
const saved = JSON.parse(fs.readFileSync(CONV_FILE, 'utf8'));
|
| 49 |
+
conversations = saved.conversations || {};
|
| 50 |
+
lastActive = saved.lastActive || {};
|
| 51 |
+
console.log(`π Restored ${Object.keys(conversations).length} conversations from disk`);
|
| 52 |
+
}
|
| 53 |
+
} catch (e) {
|
| 54 |
+
console.warn('β οΈ Could not restore conversations from disk:', e.message);
|
| 55 |
+
}
|
| 56 |
|
| 57 |
+
// ββ Persist conversations to disk βββββββββββββββββββββββββββββββββββββββββββββ
|
| 58 |
+
function saveConversations() {
|
| 59 |
+
try {
|
| 60 |
+
fs.mkdirSync(DATA_DIR, { recursive: true });
|
| 61 |
+
fs.writeFileSync(CONV_FILE, JSON.stringify({ conversations, lastActive }, null, 2));
|
| 62 |
+
} catch (e) {
|
| 63 |
+
console.error('β Failed to save conversations:', e.message);
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
// FIX #2: Write to disk on a timer so hermes-sync.py has data to push to HF
|
| 67 |
+
setInterval(saveConversations, SAVE_INTERVAL);
|
| 68 |
+
|
| 69 |
+
// ββ Stale conversation cleanup ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
setInterval(() => {
|
| 71 |
const now = Date.now();
|
| 72 |
let cleaned = 0;
|
|
|
|
| 77 |
cleaned++;
|
| 78 |
}
|
| 79 |
}
|
| 80 |
+
if (cleaned > 0) {
|
| 81 |
+
console.log(`π§Ή Cleaned ${cleaned} stale conversations`);
|
| 82 |
+
saveConversations();
|
| 83 |
+
}
|
| 84 |
}, 30 * 60 * 1000);
|
| 85 |
|
| 86 |
// ββ Core Telegram API helper ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 104 |
}
|
| 105 |
}
|
| 106 |
|
| 107 |
+
// FIX #3: Split parse_mode into a safe helper.
|
| 108 |
+
// The old code used 'Markdown' which silently fails when the LLM outputs
|
| 109 |
+
// unmatched backtick fences, underscores, etc. We now try HTML first (which
|
| 110 |
+
// handles code blocks well), fall back to plain text on any delivery error.
|
| 111 |
async function sendTelegram(chatId, text) {
|
| 112 |
+
// Try HTML mode β LLM text wrapped in <pre> code blocks renders cleanly
|
| 113 |
+
try {
|
| 114 |
+
const htmlText = markdownToTelegramHtml(text);
|
| 115 |
+
return await telegramCall('sendMessage', {
|
| 116 |
+
chat_id: chatId,
|
| 117 |
+
text: htmlText,
|
| 118 |
+
parse_mode: 'HTML'
|
| 119 |
+
});
|
| 120 |
+
} catch (_) {
|
| 121 |
+
// Fallback: strip all formatting and send as plain text
|
| 122 |
+
return await telegramCall('sendMessage', {
|
| 123 |
+
chat_id: chatId,
|
| 124 |
+
text: text.substring(0, 4096) // Telegram max message length
|
| 125 |
+
});
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
/**
|
| 130 |
+
* Minimal Markdown β Telegram HTML converter.
|
| 131 |
+
* Handles the most common LLM output patterns.
|
| 132 |
+
*/
|
| 133 |
+
function markdownToTelegramHtml(text) {
|
| 134 |
+
let out = text
|
| 135 |
+
// Escape HTML entities first
|
| 136 |
+
.replace(/&/g, '&')
|
| 137 |
+
.replace(/</g, '<')
|
| 138 |
+
.replace(/>/g, '>')
|
| 139 |
+
// Fenced code blocks ```lang\n...\n``` β <pre><code>...</code></pre>
|
| 140 |
+
.replace(/```[\w]*\n?([\s\S]*?)```/g, (_, code) => `<pre><code>${code.trim()}</code></pre>`)
|
| 141 |
+
// Inline code `...` β <code>...</code>
|
| 142 |
+
.replace(/`([^`\n]+)`/g, '<code>$1</code>')
|
| 143 |
+
// Bold **...** β <b>...</b>
|
| 144 |
+
.replace(/\*\*(.+?)\*\*/g, '<b>$1</b>')
|
| 145 |
+
// Italic *...* β <i>...</i> (only single asterisks)
|
| 146 |
+
.replace(/(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)/g, '<i>$1</i>')
|
| 147 |
+
// Truncate to Telegram's 4096-char limit
|
| 148 |
+
.substring(0, 4096);
|
| 149 |
+
return out;
|
| 150 |
}
|
| 151 |
|
| 152 |
async function sendTyping(chatId) {
|
|
|
|
| 156 |
});
|
| 157 |
}
|
| 158 |
|
| 159 |
+
// ββ LLM Call βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 160 |
async function callLLM(messages) {
|
| 161 |
const res = await fetch(`${LLAMA_URL}/v1/chat/completions`, {
|
| 162 |
method: 'POST',
|
|
|
|
| 167 |
body: JSON.stringify({
|
| 168 |
model: 'local-model',
|
| 169 |
messages,
|
| 170 |
+
// FIX #7: Reduced default from 1024 β 512. For chat replies 512 tokens
|
| 171 |
+
// is almost always enough and cuts generation time ~50%.
|
| 172 |
+
// Override with MAX_TOKENS env var if you need longer responses.
|
| 173 |
+
max_tokens: parseInt(process.env.MAX_TOKENS || '512'),
|
| 174 |
temperature: parseFloat(process.env.TEMPERATURE || '0.7'),
|
| 175 |
stream: false
|
| 176 |
})
|
|
|
|
| 184 |
|
| 185 |
// ββ Update Handler ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
async function handleTelegramUpdate(update) {
|
| 187 |
+
// Deduplication
|
| 188 |
+
if (!update.update_id || processedUpdates.has(update.update_id)) return;
|
|
|
|
|
|
|
| 189 |
processedUpdates.add(update.update_id);
|
| 190 |
|
| 191 |
+
// FIX #9: MAX_CONV_SIZE was defined but never used; replaced with proper pruning
|
| 192 |
+
if (processedUpdates.size > 500) {
|
| 193 |
+
const iter = processedUpdates.values();
|
| 194 |
+
for (let i = 0; i < 300; i++) processedUpdates.delete(iter.next().value);
|
| 195 |
}
|
| 196 |
|
| 197 |
try {
|
|
|
|
| 204 |
const username = (msg.from && msg.from.username) || String(userId);
|
| 205 |
|
| 206 |
console.log(`π¨ [${username}] ${scrub(text)}`);
|
|
|
|
| 207 |
lastActive[chatId] = Date.now();
|
| 208 |
|
| 209 |
if (!conversations[chatId]) {
|
|
|
|
| 220 |
}
|
| 221 |
if (text === '/clear') {
|
| 222 |
conversations[chatId] = conversations[chatId].slice(0, 1);
|
| 223 |
+
saveConversations();
|
| 224 |
await sendTelegram(chatId, 'π§Ή Conversation history cleared!');
|
| 225 |
return;
|
| 226 |
}
|
| 227 |
+
if (text === '/status') {
|
| 228 |
+
const turns = Math.floor((conversations[chatId].length - 1) / 2);
|
| 229 |
+
await sendTelegram(chatId, `π Active turns: ${turns}/${MAX_HIST}`);
|
| 230 |
+
return;
|
| 231 |
+
}
|
| 232 |
|
| 233 |
// Normal message processing
|
| 234 |
conversations[chatId].push({ role: 'user', content: text });
|
| 235 |
|
| 236 |
+
// History management β keep system prompt + last N turns
|
| 237 |
+
if (conversations[chatId].length > MAX_HIST * 2 + 1) {
|
| 238 |
+
conversations[chatId] = [
|
| 239 |
+
conversations[chatId][0],
|
| 240 |
+
...conversations[chatId].slice(-(MAX_HIST * 2))
|
| 241 |
+
];
|
| 242 |
}
|
| 243 |
|
|
|
|
| 244 |
await sendTyping(chatId);
|
| 245 |
const reply = await callLLM(conversations[chatId]);
|
| 246 |
conversations[chatId].push({ role: 'assistant', content: reply });
|
|
|
|
| 249 |
} catch (e) {
|
| 250 |
const safeMsg = scrub(e.message);
|
| 251 |
console.error('β Update Handler Error:', safeMsg);
|
|
|
|
| 252 |
try {
|
| 253 |
const chatId = (update.message || update.edited_message)?.chat?.id;
|
| 254 |
if (chatId) {
|
| 255 |
+
await sendTelegram(chatId, `β οΈ Something went wrong. Please try again.`);
|
| 256 |
}
|
| 257 |
} catch (_) {}
|
| 258 |
}
|
| 259 |
}
|
| 260 |
|
| 261 |
+
// ββ Graceful shutdown: save on exit ββββββββββββββββββββββββββββββββββββββββββ
|
| 262 |
+
process.on('SIGTERM', () => { saveConversations(); process.exit(0); });
|
| 263 |
+
process.on('SIGINT', () => { saveConversations(); process.exit(0); });
|
| 264 |
+
|
| 265 |
+
module.exports = { handleTelegramUpdate, conversations };
|