Spaces:
Running
Running
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import urllib.parse | |
| import urllib.request | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
# Runtime settings. Each one can be overridden through the environment
# variable of the same name; values are kept as strings (filesystem locations
# become Path objects).

# Which model to fetch and where to keep it on disk.
MODEL_REPO = os.environ.get(
    "MODEL_REPO",
    "yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
)
MODEL_FILE = os.environ.get("MODEL_FILE", "gemma4-v2-Q4_K_M.gguf")
MODEL_DIR = Path(os.environ.get("MODEL_DIR", "/data/models/gemma4-coder"))
CHAT_TEMPLATE_FILE = Path(
    os.environ.get(
        "CHAT_TEMPLATE_FILE",
        "/data/models/gemma4-coder/chat_template.jinja",
    )
)

# llama-server binary and the address it listens on.
LLAMA_SERVER_BIN = os.environ.get("LLAMA_SERVER_BIN", "/opt/llama.cpp/llama-server")
LLAMA_HOST = os.environ.get("LLAMA_HOST", "0.0.0.0")
LLAMA_PORT = os.environ.get("LLAMA_PORT", "7860")

# Resource settings that are always passed to llama-server.
THREADS = os.environ.get("THREADS", "4")
CTX_SIZE = os.environ.get("CTX_SIZE", "2048")
GPU_LAYERS = os.environ.get("GPU_LAYERS", "0")

# Optional tuning flags. "default" is one of the placeholder values that
# build_command() treats as "do not pass this flag".
BATCH_SIZE = os.environ.get("BATCH_SIZE", "default")
UBATCH_SIZE = os.environ.get("UBATCH_SIZE", "default")
FLASH_ATTN = os.environ.get("FLASH_ATTN", "default")
CACHE_TYPE_K = os.environ.get("CACHE_TYPE_K", "default")
CACHE_TYPE_V = os.environ.get("CACHE_TYPE_V", "default")

# Sampling parameters passed to llama-server.
TEMPERATURE = os.environ.get("TEMPERATURE", "0.2")
TOP_P = os.environ.get("TOP_P", "0.95")
TOP_K = os.environ.get("TOP_K", "64")
REPEAT_PENALTY = os.environ.get("REPEAT_PENALTY", "1.08")
def log(message: str) -> None:
    """Print *message* to stdout with a ``[startup]`` prefix, flushing at once."""
    # print() joins its arguments with a single space, which yields the same
    # "[startup] <message>" line as formatting the two parts together.
    print("[startup]", message, flush=True)
def download_model() -> str:
    """Return the local path of the model file, downloading it when absent.

    A file already present at ``MODEL_DIR / MODEL_FILE`` is reused as-is.
    Otherwise ``MODEL_FILE`` is fetched from ``MODEL_REPO`` with
    ``hf_hub_download`` into ``MODEL_DIR`` and the path it reports is returned.
    """
    MODEL_DIR.mkdir(parents=True, exist_ok=True)

    cached = MODEL_DIR / MODEL_FILE
    if not cached.exists():
        log(f"Downloading {MODEL_REPO}/{MODEL_FILE}")
        fetched = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            local_dir=str(MODEL_DIR),
        )
        log(f"Model ready: {fetched}")
        return fetched

    log(f"Using cached model: {cached}")
    return str(cached)
def download_chat_template() -> str | None:
    """Return the path of a chat-template file for llama-server, or ``None``.

    The template is optional, so every failure mode here is best-effort: the
    problem is logged and ``None`` is returned instead of aborting startup.

    Steps:
      1. Reuse ``CHAT_TEMPLATE_FILE`` when it already exists and is non-empty.
      2. Otherwise request the model metadata for ``MODEL_REPO`` from the
         Hugging Face API and read ``gguf.chat_template`` from the response.
      3. Save the template to ``CHAT_TEMPLATE_FILE`` and return its path.

    Returns:
        The template file path as a string, or ``None`` when no template
        could be obtained or saved.
    """
    if CHAT_TEMPLATE_FILE.exists() and CHAT_TEMPLATE_FILE.stat().st_size > 0:
        log(f"Using cached chat template: {CHAT_TEMPLATE_FILE}")
        return str(CHAT_TEMPLATE_FILE)

    encoded_repo = urllib.parse.quote(MODEL_REPO, safe="/")
    api_url = f"https://huggingface.co/api/models/{encoded_repo}"
    log("Fetching chat template from model metadata")
    try:
        with urllib.request.urlopen(api_url, timeout=30) as response:
            metadata = json.loads(response.read().decode("utf-8"))
    except Exception as exc:
        # Deliberately broad: network, HTTP, decoding and JSON errors are all
        # treated the same way because the template is optional.
        log(f"Could not fetch chat template metadata: {exc}")
        return None

    # Guard against an unexpected response shape (non-object JSON, a "gguf"
    # entry that is not an object, or a non-string template) instead of
    # raising AttributeError/TypeError outside the try block above.
    gguf_info = metadata.get("gguf") if isinstance(metadata, dict) else None
    template = gguf_info.get("chat_template") if isinstance(gguf_info, dict) else None
    if not template or not isinstance(template, str):
        log("No chat template found in model metadata; llama-server will use GGUF metadata")
        return None

    # Write to a sibling file first and rename it into place, so an
    # interrupted write cannot leave a truncated, non-empty template that the
    # cache check at the top would accept on the next start. A stale ".tmp"
    # file left by a failed attempt is simply overwritten next time.
    staging = CHAT_TEMPLATE_FILE.with_name(CHAT_TEMPLATE_FILE.name + ".tmp")
    try:
        CHAT_TEMPLATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        staging.write_text(template, encoding="utf-8")
        staging.replace(CHAT_TEMPLATE_FILE)
    except OSError as exc:
        # An unwritable or full disk must not prevent the server from starting.
        log(f"Could not save chat template: {exc}")
        return None

    log(f"Chat template ready: {CHAT_TEMPLATE_FILE}")
    return str(CHAT_TEMPLATE_FILE)
def build_command(model_path: str, template_path: str | None) -> list[str]:
    """Assemble the llama-server argument list.

    Args:
        model_path: Path of the model file, passed with ``-m``.
        template_path: Chat-template file to pass with
            ``--chat-template-file``; omitted when falsy.

    Returns:
        The argv list, starting with ``LLAMA_SERVER_BIN``.

    Optional settings are appended only when their value is not one of the
    placeholders ``""``, ``"default"``, ``"auto"``, ``"none"`` or ``"off"``
    (compared after stripping whitespace and lower-casing). The value itself
    is passed through unmodified.

    NOTE(review): ``"off"`` counts as a placeholder, so ``FLASH_ATTN=off``
    omits ``-fa`` rather than passing ``-fa off``; confirm that this matches
    the default of the bundled llama-server.
    """
    placeholders = {"", "default", "auto", "none", "off"}

    def is_placeholder(raw: str) -> bool:
        return raw.strip().lower() in placeholders

    # Arguments that are always present.
    command = [
        LLAMA_SERVER_BIN,
        "-m", model_path,
        "--host", LLAMA_HOST,
        "--port", LLAMA_PORT,
        "--threads", THREADS,
        "--ctx-size", CTX_SIZE,
        "--n-gpu-layers", GPU_LAYERS,
        "--parallel", "1",
        "--cont-batching",
        "--temp", TEMPERATURE,
        "--top-p", TOP_P,
        "--top-k", TOP_K,
        "--repeat-penalty", REPEAT_PENALTY,
    ]

    # Flag/value pairs that are skipped while left at a placeholder value.
    optional_pairs = (
        ("--batch-size", BATCH_SIZE),
        ("--ubatch-size", UBATCH_SIZE),
        ("--cache-type-k", CACHE_TYPE_K),
        ("--cache-type-v", CACHE_TYPE_V),
        ("-fa", FLASH_ATTN),
    )
    for flag, raw in optional_pairs:
        if not is_placeholder(raw):
            command += [flag, raw]

    if template_path:
        command += ["--chat-template-file", template_path]

    return command
def main() -> None:
    """Prepare the environment, fetch the model assets and exec llama-server.

    On success this function does not return: ``os.execvpe`` replaces the
    current Python process with the llama-server binary.
    """
    # Put the binary's own directory first on LD_LIBRARY_PATH, keeping any
    # value that was already set.
    binary_dir = str(Path(LLAMA_SERVER_BIN).parent)
    inherited = os.environ.get("LD_LIBRARY_PATH")
    if inherited:
        os.environ["LD_LIBRARY_PATH"] = f"{binary_dir}:{inherited}"
    else:
        os.environ["LD_LIBRARY_PATH"] = binary_dir

    # Default the math-library thread counts to THREADS unless already set.
    for variable in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ.setdefault(variable, THREADS)

    model_path = download_model()
    template_path = download_chat_template()
    command = build_command(model_path, template_path)

    log("Starting native llama.cpp web UI")
    log(" ".join(command))
    os.execvpe(command[0], command, os.environ)
if __name__ == "__main__":
    # Script entry point: report any startup failure on stderr, then re-raise
    # so the original traceback is still shown.
    try:
        main()
    except Exception as failure:
        sys.stderr.write(f"[fatal] {failure}\n")
        sys.stderr.flush()
        raise