#!/bin/bash
#
# Container entrypoint for the LLM Space.
#
# Steps, in order:
#   1. Verify that the required Python modules can be imported.
#   2. Download the GGUF model from the Hugging Face Hub unless it is cached.
#   3. Launch the llama.cpp server on 127.0.0.1:8080 and wait until it answers.
#   4. Launch the memory-sync script in the background.
#   5. Run the Cloudflare setup script when its secrets are present.
#   6. Run the Node gateway in the foreground.
#
# Optional environment variables (defaults in parentheses):
#   MODEL_HF_ID      (bartowski/Phi-3.5-mini-instruct-GGUF)
#   MODEL_FILENAME   (Phi-3.5-mini-instruct-Q4_K_M.gguf)
#   HF_TOKEN         (none)
#   CPU_THREADS      (output of nproc)
#   CONTEXT_LENGTH   (2048)
#   BATCH_SIZE       (512)
#   CHAT_FORMAT      (chatml)
#   GATEWAY_TOKEN    (changeme)
#   CLOUDFLARE_WORKERS_TOKEN, CLOUDFLARE_ACCOUNT_ID (both needed for step 5)

set -euo pipefail

readonly MODEL_DIR="/app/models"
readonly MODEL_PATH="${MODEL_DIR}/model.gguf"
readonly LOG_DIR="/app/logs"
readonly LLAMA_LOG="${LOG_DIR}/llama.log"
readonly LLAMA_URL="http://127.0.0.1:8080/v1/models"

# The redirections and the download below write into these directories.
mkdir -p "$MODEL_DIR" "$LOG_DIR"

echo "🚀 Starting LLM Space..."

# ── Pre-flight: verify all Python modules are importable ────────────────────
echo "🔍 Running pre-flight checks..."
python3 - <<'PYEOF'
import sys

# (module to import, pip package that provides it)
checks = [
    ("llama_cpp", "llama-cpp-python[server]"),
    ("llama_cpp.server.app", "llama-cpp-python[server]"),
    ("sse_starlette.sse", "sse-starlette"),
    ("fastapi", "fastapi"),
    ("uvicorn", "uvicorn[standard]"),
    ("starlette", "starlette"),
    ("pydantic", "pydantic"),
    ("huggingface_hub", "huggingface_hub"),
    ("schedule", "schedule"),
    ("filelock", "filelock"),
]

failed = []
for module, package in checks:
    try:
        __import__(module)
        print(f"  ✅ {module}")
    except ImportError:
        print(f"  ❌ {module} — install: pip install {package}")
        # Several modules map to the same package; report each package once.
        if package not in failed:
            failed.append(package)

if failed:
    print(f"\n💥 Missing packages: {', '.join(failed)}", file=sys.stderr)
    print("Fix: add these to your Dockerfile RUN pip install block", file=sys.stderr)
    sys.exit(1)

print("✅ All pre-flight checks passed!")
PYEOF

# ── Download model if not cached ────────────────────────────────────────────
if [[ ! -f "$MODEL_PATH" ]]; then
  echo "📥 Downloading model: ${MODEL_HF_ID:-bartowski/Phi-3.5-mini-instruct-GGUF}"
  # The heredoc is quoted (no shell expansion), so the target path is handed
  # to Python through the environment.
  TARGET_MODEL_PATH="$MODEL_PATH" python3 - <<'PYEOF'
import os
import shutil
import sys

from huggingface_hub import hf_hub_download

# FIX #4 / Model recommendation: Phi-3.5-mini Q4_K_M is ~3x faster than
# Qwen2.5-7B on CPU while maintaining strong instruction-following quality.
# Other fast options (set via env vars MODEL_HF_ID / MODEL_FILENAME):
#   bartowski/Qwen2.5-3B-Instruct-GGUF   / Qwen2.5-3B-Instruct-Q4_K_M.gguf   (fastest Qwen)
#   bartowski/Llama-3.2-3B-Instruct-GGUF / Llama-3.2-3B-Instruct-Q4_K_M.gguf
#   bartowski/Qwen2.5-1.5B-Instruct-GGUF / Qwen2.5-1.5B-Instruct-Q8_0.gguf   (tiny + fast)
target = os.environ["TARGET_MODEL_PATH"]
model_id = os.environ.get("MODEL_HF_ID", "bartowski/Phi-3.5-mini-instruct-GGUF")
filename = os.environ.get("MODEL_FILENAME", "Phi-3.5-mini-instruct-Q4_K_M.gguf")
# Treat an empty HF_TOKEN the same as an unset one.
hf_token = os.environ.get("HF_TOKEN") or None

print(f"  Repo     : {model_id}")
print(f"  Filename : {filename}")

try:
    path = hf_hub_download(
        repo_id=model_id,
        filename=filename,
        local_dir=os.path.dirname(target),
        token=hf_token,
        local_files_only=False,
    )
    if os.path.abspath(path) != target:
        # Copy to a side file and rename it into place: an interrupted copy
        # must not leave a truncated file at the path the cache check tests.
        partial = target + ".part"
        shutil.copy(path, partial)
        os.replace(partial, target)
    print(f"✅ Model ready at {target}")
except Exception as e:
    print(f"❌ Download failed: {e}", file=sys.stderr)
    sys.exit(1)
PYEOF
else
  echo "✅ Model already cached at $MODEL_PATH"
fi

# ── Detect CPU count dynamically ────────────────────────────────────────────
# FIX #4: Was hardcoded to 4; now defaults to the core count reported by nproc.
CPU_COUNT=$(nproc)
THREADS="${CPU_THREADS:-$CPU_COUNT}"
echo "🖥️ Detected ${CPU_COUNT} CPU cores → using ${THREADS} threads"

# ── Start llama.cpp server ──────────────────────────────────────────────────
echo "🧠 Starting llama.cpp inference server..."

API_TOKEN="${GATEWAY_TOKEN:-changeme}"
if [[ -z "${GATEWAY_TOKEN:-}" ]]; then
  echo "⚠️ GATEWAY_TOKEN not set — using the insecure default API key" >&2
fi

# FIX #5: Explicit --n_batch (default 512) for prompt processing speed.
# FIX #6: Default CONTEXT_LENGTH reduced to 2048; set CONTEXT_LENGTH=4096 in
#         the Space secrets if a longer context is needed.
# NOTE(review): the chat format defaults to chatml although the default model
# is Phi-3.5 — confirm this matches the model's template, or set CHAT_FORMAT.
python3 -m llama_cpp.server \
  --model "$MODEL_PATH" \
  --host 127.0.0.1 \
  --port 8080 \
  --n_ctx "${CONTEXT_LENGTH:-2048}" \
  --n_threads "${THREADS}" \
  --n_batch "${BATCH_SIZE:-512}" \
  --chat_format "${CHAT_FORMAT:-chatml}" \
  --api_key "${API_TOKEN}" \
  > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!
echo "llama.cpp PID: $LLAMA_PID"

# ── Wait for llama.cpp to be ready ──────────────────────────────────────────
echo "⏳ Waiting for llama.cpp server to load model..."
WAIT_SECS=0
MAX_WAIT=480 # 8 minutes (smaller models load faster)

while (( WAIT_SECS < MAX_WAIT )); do
  # curl's -w already prints 000 when no response was received, so a failed
  # probe only needs its exit status masked (set -e); appending another "000"
  # would yield "000000". --max-time keeps a stalled connection from blocking
  # the loop; WAIT_SECS counts sleep time only, so MAX_WAIT is approximate.
  HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 \
    -H "Authorization: Bearer ${API_TOKEN}" \
    "$LLAMA_URL" 2>/dev/null) || true
  HTTP_CODE="${HTTP_CODE:-000}"

  # 401 is accepted as ready too: the server is answering HTTP requests.
  if [[ "$HTTP_CODE" == "200" || "$HTTP_CODE" == "401" ]]; then
    echo "✅ llama.cpp server ready after ${WAIT_SECS}s (HTTP $HTTP_CODE)"
    break
  fi

  # kill -0 sends no signal; it only tests whether the process still exists.
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    echo ""
    echo "❌ llama.cpp process crashed! Last 50 lines of log:"
    echo "──────────────────────────────────────────────────────"
    tail -50 "$LLAMA_LOG" || true
    echo "──────────────────────────────────────────────────────"
    exit 1
  fi

  # Progress line every 30 seconds (never at 0s).
  if (( WAIT_SECS > 0 && WAIT_SECS % 30 == 0 )); then
    echo "   ⏳ Still loading... ${WAIT_SECS}s elapsed (HTTP last=$HTTP_CODE)"
    tail -1 "$LLAMA_LOG" 2>/dev/null || true
  fi

  sleep 5
  WAIT_SECS=$((WAIT_SECS + 5))
done

# The loop only runs to completion when the server never became ready
# (the success path breaks out before the counter reaches MAX_WAIT).
if (( WAIT_SECS >= MAX_WAIT )); then
  echo "❌ Timed out after ${MAX_WAIT}s. Last log:"
  tail -50 "$LLAMA_LOG" || true
  exit 1
fi

# ── Start persistent memory sync ────────────────────────────────────────────
echo "💾 Starting memory sync..."
python3 /app/hermes-sync.py > "${LOG_DIR}/sync.log" 2>&1 &

# ── Setup Cloudflare Workers ────────────────────────────────────────────────
if [[ -n "${CLOUDFLARE_WORKERS_TOKEN:-}" && -n "${CLOUDFLARE_ACCOUNT_ID:-}" ]]; then
  echo "☁️ Setting up Cloudflare Workers..."
  python3 /app/setup-cloudflare.py
else
  echo "⚠️ Cloudflare secrets not set — skipping keep-alive & proxy"
fi

# ── Start gateway server ────────────────────────────────────────────────────
echo "🌐 Starting gateway on port 7860..."
node /app/health-server.js