"""Startup launcher: make a GGUF model available, then exec ``llama-server``.

Every setting is read from an environment variable at import time and kept as
a string, because the values are forwarded verbatim as command-line arguments.
"""

from __future__ import annotations

import json
import os
import sys
import urllib.parse
import urllib.request
from pathlib import Path

from huggingface_hub import hf_hub_download

# --- Model source and on-disk locations -------------------------------------
MODEL_REPO = os.getenv("MODEL_REPO", "yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma4-v2-Q4_K_M.gguf")
MODEL_DIR = Path(os.getenv("MODEL_DIR", "/data/models/gemma4-coder"))
CHAT_TEMPLATE_FILE = Path(os.getenv("CHAT_TEMPLATE_FILE", "/data/models/gemma4-coder/chat_template.jinja"))

# --- Server binary and listen address ---------------------------------------
LLAMA_SERVER_BIN = os.getenv("LLAMA_SERVER_BIN", "/opt/llama.cpp/llama-server")
LLAMA_HOST = os.getenv("LLAMA_HOST", "0.0.0.0")
LLAMA_PORT = os.getenv("LLAMA_PORT", "7860")

# --- Runtime tuning ----------------------------------------------------------
# A placeholder value (see _PLACEHOLDER_VALUES) means "do not pass the flag".
THREADS = os.getenv("THREADS", "4")
CTX_SIZE = os.getenv("CTX_SIZE", "2048")
BATCH_SIZE = os.getenv("BATCH_SIZE", "default")
UBATCH_SIZE = os.getenv("UBATCH_SIZE", "default")
GPU_LAYERS = os.getenv("GPU_LAYERS", "0")
FLASH_ATTN = os.getenv("FLASH_ATTN", "default")
CACHE_TYPE_K = os.getenv("CACHE_TYPE_K", "default")
CACHE_TYPE_V = os.getenv("CACHE_TYPE_V", "default")

# --- Sampling defaults -------------------------------------------------------
TEMPERATURE = os.getenv("TEMPERATURE", "0.2")
TOP_P = os.getenv("TOP_P", "0.95")
TOP_K = os.getenv("TOP_K", "64")
REPEAT_PENALTY = os.getenv("REPEAT_PENALTY", "1.08")

# Values (compared case-insensitively, after stripping) that mean "leave the
# optional flag off the command line".
_PLACEHOLDER_VALUES = frozenset({"", "default", "auto", "none", "off"})


def log(message: str) -> None:
    """Print *message* to stdout with a ``[startup]`` prefix, flushing immediately."""
    print(f"[startup] {message}", flush=True)


def download_model() -> str:
    """Return the local path of the model file, downloading it if necessary.

    A non-empty regular file at ``MODEL_DIR / MODEL_FILE`` is reused as-is;
    otherwise the file is fetched from ``MODEL_REPO`` with ``hf_hub_download``.

    Returns:
        Path of the model file as a string.
    """
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    local_file = MODEL_DIR / MODEL_FILE

    # Same "non-empty file" rule as the chat-template cache: an empty or
    # non-file entry is not a usable model, so fall through to the download.
    if local_file.is_file() and local_file.stat().st_size > 0:
        log(f"Using cached model: {local_file}")
        return str(local_file)

    log(f"Downloading {MODEL_REPO}/{MODEL_FILE}")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        local_dir=str(MODEL_DIR),
    )
    log(f"Model ready: {model_path}")
    return model_path


def download_chat_template() -> str | None:
    """Return the path of a cached chat template, fetching it on first use.

    The template is read from the ``gguf.chat_template`` field of the model
    metadata served at ``https://huggingface.co/api/models/<repo>`` and stored
    in ``CHAT_TEMPLATE_FILE``. The whole step is best-effort: every failure is
    logged and reported as ``None`` so the caller simply omits the template
    flag.

    Returns:
        Path of the template file as a string, or ``None`` when no template
        could be obtained or cached.
    """
    if CHAT_TEMPLATE_FILE.is_file() and CHAT_TEMPLATE_FILE.stat().st_size > 0:
        log(f"Using cached chat template: {CHAT_TEMPLATE_FILE}")
        return str(CHAT_TEMPLATE_FILE)

    encoded_repo = urllib.parse.quote(MODEL_REPO, safe="/")
    api_url = f"https://huggingface.co/api/models/{encoded_repo}"
    log("Fetching chat template from model metadata")

    try:
        with urllib.request.urlopen(api_url, timeout=30) as response:
            metadata = json.loads(response.read().decode("utf-8"))
    except Exception as exc:  # deliberate: the template is optional
        log(f"Could not fetch chat template metadata: {exc}")
        return None

    # Do not assume the payload shape; anything unexpected counts as
    # "no template" instead of raising.
    gguf_info = metadata.get("gguf") if isinstance(metadata, dict) else None
    template = gguf_info.get("chat_template") if isinstance(gguf_info, dict) else None
    if not isinstance(template, str) or not template:
        log("No chat template found in model metadata; llama-server will use GGUF metadata")
        return None

    # Write to a sibling file and rename it into place, so the cached path
    # never holds a partially written template that a later run would accept.
    staging_file = CHAT_TEMPLATE_FILE.with_name(CHAT_TEMPLATE_FILE.name + ".tmp")
    try:
        CHAT_TEMPLATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        staging_file.write_text(template, encoding="utf-8")
        os.replace(staging_file, CHAT_TEMPLATE_FILE)
    except OSError as exc:
        # An unwritable cache location must not abort startup.
        log(f"Could not cache chat template: {exc}")
        return None

    log(f"Chat template ready: {CHAT_TEMPLATE_FILE}")
    return str(CHAT_TEMPLATE_FILE)


def build_command(model_path: str, template_path: str | None) -> list[str]:
    """Assemble the ``llama-server`` argument vector.

    Args:
        model_path: Path of the GGUF model passed to ``-m``.
        template_path: Chat template file for ``--chat-template-file``, or
            ``None`` to omit that flag.

    Returns:
        The command as a list of strings, with the binary first.
    """

    def has_custom_value(value: str) -> bool:
        return value.strip().lower() not in _PLACEHOLDER_VALUES

    cmd = [
        LLAMA_SERVER_BIN,
        "-m", model_path,
        "--host", LLAMA_HOST,
        "--port", LLAMA_PORT,
        "--threads", THREADS,
        "--ctx-size", CTX_SIZE,
        "--n-gpu-layers", GPU_LAYERS,
        "--parallel", "1",
        "--cont-batching",
        "--temp", TEMPERATURE,
        "--top-p", TOP_P,
        "--top-k", TOP_K,
        "--repeat-penalty", REPEAT_PENALTY,
    ]

    optional_pairs = (
        ("--batch-size", BATCH_SIZE),
        ("--ubatch-size", UBATCH_SIZE),
        ("--cache-type-k", CACHE_TYPE_K),
        ("--cache-type-v", CACHE_TYPE_V),
    )
    for flag, value in optional_pairs:
        if has_custom_value(value):
            cmd.extend([flag, value])

    # "off" is a placeholder for the flags above, but for flash attention it
    # is a real request: dropping it would leave the server on its own
    # default instead of disabling the feature.
    # NOTE(review): assumes the llama-server build accepts `-fa <value>`,
    # which the value-passing form below already relies on.
    if FLASH_ATTN.strip().lower() == "off":
        cmd.extend(["-fa", "off"])
    elif has_custom_value(FLASH_ATTN):
        cmd.extend(["-fa", FLASH_ATTN])

    if template_path:
        cmd.extend(["--chat-template-file", template_path])

    return cmd


def main() -> None:
    """Prepare the environment and model, then replace this process with the server.

    ``os.execvpe`` does not return on success; it raises ``OSError`` when the
    binary cannot be executed.
    """
    # Put the binary's own directory first on the library search path.
    binary_dir = str(Path(LLAMA_SERVER_BIN).parent)
    existing_library_path = os.environ.get("LD_LIBRARY_PATH")
    os.environ["LD_LIBRARY_PATH"] = (
        f"{binary_dir}:{existing_library_path}" if existing_library_path else binary_dir
    )

    # Only fill in thread-count variables the caller has not already set.
    for variable in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ.setdefault(variable, THREADS)

    model_path = download_model()
    template_path = download_chat_template()
    cmd = build_command(model_path, template_path)

    log("Starting native llama.cpp web UI")
    log(" ".join(cmd))
    os.execvpe(cmd[0], cmd, os.environ)


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Emit a one-line summary, then re-raise so the traceback and
        # non-zero exit status are preserved.
        print(f"[fatal] {exc}", file=sys.stderr, flush=True)
        raise