#!/bin/bash
set -e

echo "🚀 Starting LLM Space..."

# ── Pre-flight: verify all Python modules are importable ────────────────────
# Imports every module listed below and aborts startup (exit 1, via set -e)
# when any of them cannot be imported, so a broken image fails here with a
# readable report instead of later inside one of the services.
echo "🔍 Running pre-flight checks..."

python3 - <<'PYEOF'
import sys

# (module to import, pip requirement that provides it)
checks = [
    ("llama_cpp",                  "llama-cpp-python[server]"),
    ("llama_cpp.server.app",       "llama-cpp-python[server]"),
    ("sse_starlette.sse",          "sse-starlette"),
    ("fastapi",                    "fastapi"),
    ("uvicorn",                    "uvicorn[standard]"),
    ("starlette",                  "starlette"),
    ("pydantic",                   "pydantic"),
    ("huggingface_hub",            "huggingface_hub"),
    ("schedule",                   "schedule"),
    ("filelock",                   "filelock"),
]

# Requirements to report, in first-seen order and without repeats (several
# modules can map to the same requirement).
failed = []
for module, package in checks:
    try:
        __import__(module)
        print(f"  ✅ {module}")
    except Exception as e:
        # Deliberately broader than ImportError: an installed package can
        # still fail at import time for another reason, and every module
        # should be checked and reported rather than stopping at the first
        # traceback. The underlying error is shown so the install hint can
        # be verified against the real cause.
        print(f"  ❌ {module} — install: pip install {package} ({type(e).__name__}: {e})")
        if package not in failed:
            failed.append(package)

if failed:
    print(f"\n💥 Missing packages: {', '.join(failed)}", file=sys.stderr)
    print("Fix: add these to your Dockerfile RUN pip install block", file=sys.stderr)
    sys.exit(1)

print("✅ All pre-flight checks passed!")
PYEOF

# ── Download model if not cached ────────────────────────────────────────────
# The model is fetched once and stored at MODEL_PATH; later boots reuse it.
# Environment (all optional):
#   MODEL_HF_ID    - Hugging Face repo id to download from
#   MODEL_FILENAME - file inside that repo
#   HF_TOKEN       - access token passed to the download call
MODEL_PATH="/app/models/model.gguf"

if [ ! -f "$MODEL_PATH" ]; then
  echo "📥 Downloading model: ${MODEL_HF_ID:-bartowski/Phi-3.5-mini-instruct-GGUF}"
  # The heredoc is quoted (no shell expansion), so the target path is handed
  # to Python through its environment rather than being repeated as a literal.
  MODEL_PATH="$MODEL_PATH" python3 - <<'PYEOF'
import os, sys, shutil
from huggingface_hub import hf_hub_download

# Model recommendation (FIX #4): Phi-3.5-mini Q4_K_M is ~3x faster than
# Qwen2.5-7B on CPU while maintaining strong instruction-following quality.
# Other fast options (set via env vars MODEL_HF_ID / MODEL_FILENAME):
#   bartowski/Qwen2.5-3B-Instruct-GGUF   / Qwen2.5-3B-Instruct-Q4_K_M.gguf   (fastest Qwen)
#   bartowski/Llama-3.2-3B-Instruct-GGUF / Llama-3.2-3B-Instruct-Q4_K_M.gguf
#   bartowski/Qwen2.5-1.5B-Instruct-GGUF / Qwen2.5-1.5B-Instruct-Q8_0.gguf   (tiny + fast)
model_id = os.environ.get("MODEL_HF_ID",    "bartowski/Phi-3.5-mini-instruct-GGUF")
filename = os.environ.get("MODEL_FILENAME", "Phi-3.5-mini-instruct-Q4_K_M.gguf")
hf_token = os.environ.get("HF_TOKEN")

target  = os.environ["MODEL_PATH"]
partial = target + ".part"

print(f"  Repo     : {model_id}")
print(f"  Filename : {filename}")

try:
    path = hf_hub_download(
        repo_id=model_id,
        filename=filename,
        local_dir=os.path.dirname(target),
        token=hf_token,
        local_files_only=False,
    )
    if path != target:
        # Copy under a temporary name, then rename into place. The rename is
        # atomic within one directory, so an interrupted copy can never leave
        # a truncated file at the target path that the next boot would
        # mistake for a complete cached model.
        shutil.copy(path, partial)
        os.replace(partial, target)
    print(f"✅ Model ready at {target}")

except Exception as e:
    print(f"❌ Download failed: {e}", file=sys.stderr)
    # Best-effort cleanup of a half-written temporary copy.
    try:
        os.remove(partial)
    except OSError:
        pass
    sys.exit(1)
PYEOF
else
  echo "✅ Model already cached at $MODEL_PATH"
fi

# ── Detect CPU count dynamically ─────────────────────────────────────────────
# FIX #4: Was hardcoded to 4; now derived from the CPUs actually available.
# CPU_THREADS (optional env) overrides the detected value.

#######################################
# Convert a CFS quota/period pair into a whole number of cores (rounded up).
# Arguments: $1 - quota in microseconds (or "max" / -1 / empty for no limit)
#            $2 - period in microseconds
# Outputs:   the core count on stdout; nothing when there is no usable quota
# Returns:   0 always
#######################################
quota_to_cores() {
  local quota="$1" period="$2"
  # Anything that is not a plain non-negative integer means "no limit".
  case "$quota" in '' | *[!0-9]*) return 0 ;; esac
  case "$period" in '' | *[!0-9]*) return 0 ;; esac
  if [ "$quota" -gt 0 ] && [ "$period" -gt 0 ]; then
    echo $(( (quota + period - 1) / period ))
  fi
}

CPU_COUNT=$(nproc)

# nproc may report every core the process can be scheduled on even when the
# container has a smaller CFS quota; running one thread per reported core
# would then oversubscribe the quota. Cap the count by the cgroup limit when
# one is set.
CGROUP_QUOTA=""
CGROUP_PERIOD=""
if [ -r /sys/fs/cgroup/cpu.max ]; then
  # cgroup v2: a single line "<quota|max> <period>"
  read -r CGROUP_QUOTA CGROUP_PERIOD < /sys/fs/cgroup/cpu.max || true
elif [ -r /sys/fs/cgroup/cpu/cpu.cfs_quota_us ] \
  && [ -r /sys/fs/cgroup/cpu/cpu.cfs_period_us ]; then
  # cgroup v1: separate files; a quota of -1 means unlimited
  read -r CGROUP_QUOTA < /sys/fs/cgroup/cpu/cpu.cfs_quota_us || true
  read -r CGROUP_PERIOD < /sys/fs/cgroup/cpu/cpu.cfs_period_us || true
fi

QUOTA_CORES=$(quota_to_cores "$CGROUP_QUOTA" "$CGROUP_PERIOD")
if [ -n "$QUOTA_CORES" ] && [ "$QUOTA_CORES" -lt "$CPU_COUNT" ]; then
  CPU_COUNT=$QUOTA_CORES
fi

THREADS="${CPU_THREADS:-$CPU_COUNT}"
echo "🖥️  Detected ${CPU_COUNT} CPU cores → using ${THREADS} threads"

# ── Start llama.cpp server ───────────────────────────────────────────────────
# Launches the llama-cpp-python server in the background on 127.0.0.1:8080,
# with stdout and stderr captured in /app/logs/llama.log, and records its PID
# in LLAMA_PID.
# Environment (all optional):
#   CONTEXT_LENGTH - context window size            (default 2048)
#   BATCH_SIZE     - prompt-processing batch size   (default 512)
#   CHAT_FORMAT    - chat template name             (default chatml)
#   GATEWAY_TOKEN  - API key the server requires    (default changeme)
echo "🧠 Starting llama.cpp inference server..."

# The redirect below fails if the log directory is missing, and the server
# would then never start; create it up front.
mkdir -p /app/logs

# FIX #5: Added --n_batch 512 (explicit, helps prompt processing speed)
# FIX #6: Reduced default CONTEXT_LENGTH to 2048 (cuts KV-cache 50%, faster inference)
#         If you need longer context, set CONTEXT_LENGTH=4096 in Space secrets.
# CHAT_FORMAT: the default stays "chatml". Set it when MODEL_HF_ID points at a
# model trained on a different template.
# NOTE(review): the default Phi-3.5 model and the Llama 3.x option are
# presumably not ChatML models — confirm the right value against the
# llama-cpp-python chat-format list.
python3 -m llama_cpp.server \
  --model /app/models/model.gguf \
  --host 127.0.0.1 \
  --port 8080 \
  --n_ctx "${CONTEXT_LENGTH:-2048}" \
  --n_threads "${THREADS}" \
  --n_batch "${BATCH_SIZE:-512}" \
  --chat_format "${CHAT_FORMAT:-chatml}" \
  --api_key "${GATEWAY_TOKEN:-changeme}" \
  > /app/logs/llama.log 2>&1 &

LLAMA_PID=$!
echo "llama.cpp PID: $LLAMA_PID"

# ── Wait for llama.cpp to be ready ──────────────────────────────────────────
# Polls the local /v1/models endpoint every 5 seconds until the server
# answers, the server process dies, or MAX_WAIT seconds of polling have
# elapsed. Exits 1 (after printing the tail of the server log) in the last
# two cases.
echo "⏳ Waiting for llama.cpp server to load model..."
WAIT_SECS=0
MAX_WAIT=480  # 8 minutes (smaller models load faster)

while [ "$WAIT_SECS" -lt "$MAX_WAIT" ]; do

  # curl prints "000" through -w by itself when the request fails, so the
  # failure branch must not print anything more (appending another "000"
  # would produce "000000"). --max-time bounds a probe that connects but
  # never gets an answer, so one hung request cannot stall the loop.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
    -H "Authorization: Bearer ${GATEWAY_TOKEN:-changeme}" \
    http://127.0.0.1:8080/v1/models 2>/dev/null || true)
  # Empty only when curl produced no output at all (e.g. it is not installed).
  HTTP_CODE="${HTTP_CODE:-000}"

  # 401 also counts as ready: the server is up and answering requests.
  if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
    echo "✅ llama.cpp server ready after ${WAIT_SECS}s (HTTP $HTTP_CODE)"
    break
  fi

  # kill -0 sends no signal; it only tests whether the process still exists.
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    echo ""
    echo "❌ llama.cpp process crashed! Last 50 lines of log:"
    echo "──────────────────────────────────────────────────────"
    tail -n 50 /app/logs/llama.log
    echo "──────────────────────────────────────────────────────"
    exit 1
  fi

  # Progress line every 30 seconds, plus the latest line of the server log.
  if [ $((WAIT_SECS % 30)) -eq 0 ] && [ "$WAIT_SECS" -gt 0 ]; then
    echo "  ⏳ Still loading... ${WAIT_SECS}s elapsed (HTTP last=$HTTP_CODE)"
    tail -n 1 /app/logs/llama.log 2>/dev/null || true
  fi

  sleep 5
  WAIT_SECS=$((WAIT_SECS + 5))
done

# The loop exits via break with WAIT_SECS still below MAX_WAIT on success;
# reaching MAX_WAIT means the server never answered.
if [ "$WAIT_SECS" -ge "$MAX_WAIT" ]; then
  echo "❌ Timed out after ${MAX_WAIT}s. Last log:"
  tail -n 50 /app/logs/llama.log
  exit 1
fi

# ── Start persistent memory sync ─────────────────────────────────────────────
# The sync script runs as a background job with stdout and stderr captured in
# /app/logs/sync.log. Its PID is not recorded and its exit status is never
# checked by this script.
printf '%s\n' "💾 Starting memory sync..."
python3 /app/hermes-sync.py >/app/logs/sync.log 2>&1 &

# ── Setup Cloudflare Workers ──────────────────────────────────────────────────
# Optional step: it runs only when BOTH CLOUDFLARE_WORKERS_TOKEN and
# CLOUDFLARE_ACCOUNT_ID are non-empty. The setup script runs in the
# foreground, so under `set -e` a failure there aborts startup.
if [[ -z "$CLOUDFLARE_WORKERS_TOKEN" || -z "$CLOUDFLARE_ACCOUNT_ID" ]]; then
  echo "⚠️  Cloudflare secrets not set — skipping keep-alive & proxy"
else
  echo "☁️  Setting up Cloudflare Workers..."
  python3 /app/setup-cloudflare.py
fi

# ── Start gateway server ──────────────────────────────────────────────────────
# Last step of the script: the gateway runs in the foreground for the rest of
# the container's life.
echo "🌐 Starting gateway on port 7860..."
# exec replaces this shell with node instead of running node as a child.
# Signals sent to this script's PID (e.g. SIGTERM on container stop) then
# reach node directly rather than a bash process that does not forward them.
# The background jobs started earlier keep running as children of that PID.
exec node /app/health-server.js