cl4ude committed on
Commit
6e23cd8
·
verified ·
1 Parent(s): fea3064

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +47 -0
  2. README.md +68 -5
  3. app.py +158 -0
Dockerfile ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hugging Face Docker Space image: a prebuilt llama.cpp `llama-server` plus a
# small Python launcher (app.py) that downloads the GGUF model at startup and
# then execs the server.
FROM python:3.11-slim

# Run RUN steps under bash with pipefail so that a failed `curl` aborts the
# `curl | tar` pipeline below instead of being judged by tar's exit code only.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Runtime configuration consumed by app.py. The value `default` means the
# launcher does not pass the corresponding flag to llama-server.
ENV DEBIAN_FRONTEND=noninteractive \
    MODEL_REPO=yuxinlu1/gemma-4-12B-coder-fable5-composer2.5-v1-GGUF \
    MODEL_FILE=gemma4-coding-Q4_K_M.gguf \
    MODEL_DIR=/data/models/gemma4-coder \
    LLAMA_VERSION=b9592 \
    LLAMA_DIR=/opt/llama.cpp \
    LLAMA_SERVER_BIN=/opt/llama.cpp/llama-server \
    LD_LIBRARY_PATH=/opt/llama.cpp \
    LLAMA_HOST=0.0.0.0 \
    LLAMA_PORT=7860 \
    THREADS=4 \
    CTX_SIZE=2048 \
    BATCH_SIZE=default \
    UBATCH_SIZE=default \
    FLASH_ATTN=default \
    CACHE_TYPE_K=default \
    CACHE_TYPE_V=default \
    GPU_LAYERS=0 \
    TEMPERATURE=0.2 \
    TOP_P=0.95 \
    TOP_K=64 \
    REPEAT_PENALTY=1.08 \
    HF_XET_HIGH_PERFORMANCE=1 \
    PYTHONUNBUFFERED=1

# Shared libraries needed by the prebuilt llama.cpp binaries, plus curl and CA
# certificates for the release download below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgomp1 \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/*

# Fetch the pinned llama.cpp release and unpack it flat into LLAMA_DIR.
RUN mkdir -p "${LLAMA_DIR}" \
    && curl -fL "https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_VERSION}/llama-${LLAMA_VERSION}-bin-ubuntu-x64.tar.gz" \
        | tar -xz --strip-components=1 -C "${LLAMA_DIR}" \
    && chmod +x "${LLAMA_SERVER_BIN}"

RUN pip install --no-cache-dir \
        huggingface_hub

# app.py creates MODEL_DIR and writes the model and chat template there at
# runtime. Pre-create it and hand it to UID 1000 so the launcher can write
# when the container does not run as root.
# NOTE(review): the Spaces Docker docs state containers run as user ID 1000;
# confirm. A persistent-storage mount at /data would simply shadow this dir.
RUN mkdir -p "${MODEL_DIR}" \
    && chown -R 1000:1000 /data

WORKDIR /app
COPY app.py /app/app.py

EXPOSE 7860
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,73 @@
1
  ---
2
- title: Fable5
3
- emoji: 📚
4
- colorFrom: gray
5
- colorTo: red
6
  sdk: docker
 
 
7
  pinned: false
 
 
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Gemma4 Coder GGUF Chat
3
+ emoji: "💬"
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_file: app.py
8
+ app_port: 7860
9
  pinned: false
10
+ models:
11
+ - yuxinlu1/gemma-4-12B-coder-fable5-composer2.5-v1-GGUF
12
+ tags:
13
+ - llama.cpp
14
+ - gguf
15
+ - gemma4
16
+ - coding
17
+ - cpu
18
  ---
19
 
20
+ # Gemma4 12B Coder GGUF Chat
21
+
22
+ Hugging Face Spaces Docker chatbot for:
23
+
24
+ - Model: `yuxinlu1/gemma-4-12B-coder-fable5-composer2.5-v1-GGUF`
25
+ - Default quant: `gemma4-coding-Q4_K_M.gguf`
26
+ - Backend: prebuilt `llama.cpp` `llama-server`
27
+ - UI: native `llama.cpp` web UI
28
+ - Target: testing Gemma4 Coder on HF Spaces CPU
29
+
30
+ ## Why Q4 by default?
31
+
32
+ `gemma4-coding-Q2_K.gguf` is smaller and faster, but on CPU it can produce incoherent, gibberish responses. This Space uses `gemma4-coding-Q4_K_M.gguf` by default for better coherence. It is slower than Q2, but it is the safer option if the goal is a usable chatbot.
33
+
34
+ ## Default settings
35
+
36
+ ```text
37
+ MODEL_REPO=yuxinlu1/gemma-4-12B-coder-fable5-composer2.5-v1-GGUF
38
+ MODEL_FILE=gemma4-coding-Q4_K_M.gguf
39
+ LLAMA_VERSION=b9592
40
+ THREADS=4
41
+ CTX_SIZE=2048
42
+ BATCH_SIZE=default
43
+ UBATCH_SIZE=default
44
+ FLASH_ATTN=default
45
+ CACHE_TYPE_K=default
46
+ CACHE_TYPE_V=default
47
+ TEMPERATURE=0.2
48
+ TOP_P=0.95
49
+ TOP_K=64
50
+ REPEAT_PENALTY=1.08
51
+ ```
52
+
53
+ The launcher downloads the GGUF into `/data`, fetches the model chat template from Hugging Face metadata, then hands the process over to `llama-server` on port `7860`.
54
+
55
+ `default` means the launcher does not pass that flag, so native `llama.cpp` picks its own optimized default. This is closer to the fast reference Space and avoids CPU overhead from experimental KV-cache quantization or tiny batch settings.
56
+
57
+ ## If you want to compare Q2
58
+
59
+ To try Q2, set this environment variable:
60
+
61
+ ```text
62
+ MODEL_FILE=gemma4-coding-Q2_K.gguf
63
+ ```
64
+
65
+ Q2 starts and responds faster, but the output may be incoherent.
66
+
67
+ ## Upload
68
+
69
+ Upload these files to the root of a Docker Space:
70
+
71
+ - `Dockerfile`
72
+ - `app.py`
73
+ - `README.md`
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import urllib.parse
7
+ import urllib.request
8
+ from pathlib import Path
9
+
10
+ from huggingface_hub import hf_hub_download
11
+
12
+
13
# All settings are read from the environment once, at import time. Values stay
# strings because they are forwarded verbatim as llama-server arguments.

# --- Model location ---------------------------------------------------------
MODEL_REPO = os.getenv("MODEL_REPO", "yuxinlu1/gemma-4-12B-coder-fable5-composer2.5-v1-GGUF")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma4-coding-Q4_K_M.gguf")
MODEL_DIR = Path(os.getenv("MODEL_DIR", "/data/models/gemma4-coder"))
# The template cache lives next to the model unless explicitly overridden, so
# overriding MODEL_DIR alone relocates both files together.
CHAT_TEMPLATE_FILE = Path(os.getenv("CHAT_TEMPLATE_FILE", str(MODEL_DIR / "chat_template.jinja")))

# --- Server binary and listen address ---------------------------------------
LLAMA_SERVER_BIN = os.getenv("LLAMA_SERVER_BIN", "/opt/llama.cpp/llama-server")
LLAMA_HOST = os.getenv("LLAMA_HOST", "0.0.0.0")
LLAMA_PORT = os.getenv("LLAMA_PORT", "7860")

# --- Runtime tuning ----------------------------------------------------------
# "default" means: do not pass the flag and let llama-server pick its own value.
THREADS = os.getenv("THREADS", "4")
CTX_SIZE = os.getenv("CTX_SIZE", "2048")
BATCH_SIZE = os.getenv("BATCH_SIZE", "default")
UBATCH_SIZE = os.getenv("UBATCH_SIZE", "default")
GPU_LAYERS = os.getenv("GPU_LAYERS", "0")
FLASH_ATTN = os.getenv("FLASH_ATTN", "default")
CACHE_TYPE_K = os.getenv("CACHE_TYPE_K", "default")
CACHE_TYPE_V = os.getenv("CACHE_TYPE_V", "default")

# --- Sampling defaults --------------------------------------------------------
TEMPERATURE = os.getenv("TEMPERATURE", "0.2")
TOP_P = os.getenv("TOP_P", "0.95")
TOP_K = os.getenv("TOP_K", "64")
REPEAT_PENALTY = os.getenv("REPEAT_PENALTY", "1.08")
35
+
36
+
37
def log(message: str) -> None:
    """Print *message* to stdout with a ``[startup]`` prefix.

    The output is flushed immediately so each line appears as soon as it is
    written.
    """
    print(f"[startup] {message}", flush=True)
39
+
40
+
41
def download_model() -> str:
    """Return the local path of the GGUF model, downloading it if needed.

    The model is looked up at ``MODEL_DIR / MODEL_FILE``. When a non-empty
    regular file is already there it is reused; otherwise the file is fetched
    from ``MODEL_REPO`` with ``hf_hub_download`` into ``MODEL_DIR``.

    Returns:
        The path of the model file as a string.
    """
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    local_file = MODEL_DIR / MODEL_FILE

    # Same rule as the chat-template cache: only a non-empty regular file
    # counts as cached, so a stray directory or zero-byte file is not handed
    # to llama-server.
    if local_file.is_file() and local_file.stat().st_size > 0:
        log(f"Using cached model: {local_file}")
        return str(local_file)

    log(f"Downloading {MODEL_REPO}/{MODEL_FILE}")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        local_dir=str(MODEL_DIR),
    )
    log(f"Model ready: {model_path}")
    return str(model_path)
56
+
57
+
58
def download_chat_template() -> str | None:
    """Return the path of a chat-template file, or ``None`` if unavailable.

    A non-empty ``CHAT_TEMPLATE_FILE`` is reused as-is. Otherwise the model
    metadata is requested from the Hugging Face API and the value found under
    ``gguf -> chat_template`` is written to ``CHAT_TEMPLATE_FILE``.

    The template is optional: every failure (network, malformed metadata,
    unwritable cache location) is logged and reported as ``None`` rather than
    raised, so the caller can start the server without an explicit template.

    Returns:
        The template file path as a string, or ``None``.
    """
    if CHAT_TEMPLATE_FILE.exists() and CHAT_TEMPLATE_FILE.stat().st_size > 0:
        log(f"Using cached chat template: {CHAT_TEMPLATE_FILE}")
        return str(CHAT_TEMPLATE_FILE)

    encoded_repo = urllib.parse.quote(MODEL_REPO, safe="/")
    api_url = f"https://huggingface.co/api/models/{encoded_repo}"
    log("Fetching chat template from model metadata")

    try:
        with urllib.request.urlopen(api_url, timeout=30) as response:
            metadata = json.loads(response.read().decode("utf-8"))
    except Exception as exc:  # deliberate best effort: never block startup
        log(f"Could not fetch chat template metadata: {exc}")
        return None

    # Validate the shape step by step; an unexpected JSON payload must be
    # treated as "no template", not crash with AttributeError/TypeError.
    gguf_info = metadata.get("gguf") if isinstance(metadata, dict) else None
    template = gguf_info.get("chat_template") if isinstance(gguf_info, dict) else None
    if not isinstance(template, str) or not template:
        log("No chat template found in model metadata; llama-server will use GGUF metadata")
        return None

    # Write to a sibling temp file and rename, so an interrupted write cannot
    # leave a truncated file that a later start would accept as a valid cache.
    tmp_file = CHAT_TEMPLATE_FILE.with_name(CHAT_TEMPLATE_FILE.name + ".tmp")
    try:
        CHAT_TEMPLATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        tmp_file.write_text(template, encoding="utf-8")
        tmp_file.replace(CHAT_TEMPLATE_FILE)
    except OSError as exc:
        log(f"Could not cache chat template: {exc}")
        return None

    log(f"Chat template ready: {CHAT_TEMPLATE_FILE}")
    return str(CHAT_TEMPLATE_FILE)
83
+
84
+
85
def build_command(model_path: str, template_path: str | None) -> list[str]:
    """Assemble the ``llama-server`` argument vector.

    Args:
        model_path: Path of the GGUF model, passed with ``-m``.
        template_path: Optional chat-template file; when given it is passed
            with ``--chat-template-file``.

    Returns:
        The command as a list of strings, starting with ``LLAMA_SERVER_BIN``.

    Optional tuning flags (batch sizes, KV-cache types, flash attention) are
    appended only when the matching setting holds a custom value; placeholder
    values leave the choice to llama-server.
    """

    def has_custom_value(value: str) -> bool:
        # Placeholders that mean "do not pass this flag".
        return value.strip().lower() not in {"", "default", "auto", "none", "off"}

    cmd = [
        LLAMA_SERVER_BIN,
        "-m", model_path,
        "--host", LLAMA_HOST,
        "--port", LLAMA_PORT,
        "--threads", THREADS,
        "--ctx-size", CTX_SIZE,
        "--n-gpu-layers", GPU_LAYERS,
        "--parallel", "1",
        "--cont-batching",
        "--temp", TEMPERATURE,
        "--top-p", TOP_P,
        "--top-k", TOP_K,
        "--repeat-penalty", REPEAT_PENALTY,
    ]

    def add_optional_pair(flag: str, value: str) -> None:
        if has_custom_value(value):
            # Strip stray whitespace from the environment value before it
            # becomes a command-line argument.
            cmd.extend([flag, value.strip()])

    add_optional_pair("--batch-size", BATCH_SIZE)
    add_optional_pair("--ubatch-size", UBATCH_SIZE)
    add_optional_pair("--cache-type-k", CACHE_TYPE_K)
    add_optional_pair("--cache-type-v", CACHE_TYPE_V)

    # For flash attention "off" is a real setting, not a placeholder: omitting
    # the flag would leave the server on its own default instead of disabling
    # it. NOTE(review): assumes this llama.cpp build accepts
    # `-fa on|off|auto` — confirm against the pinned release.
    flash_attn = FLASH_ATTN.strip().lower()
    if flash_attn not in {"", "default", "auto", "none"}:
        cmd.extend(["-fa", flash_attn])

    if template_path:
        cmd.extend(["--chat-template-file", template_path])

    return cmd
131
+
132
+
133
def main() -> None:
    """Prepare the environment, fetch model assets and exec ``llama-server``.

    Steps:
      1. Put the directory of ``LLAMA_SERVER_BIN`` first on
         ``LD_LIBRARY_PATH``.
      2. Default the OpenMP/BLAS thread-count variables to ``THREADS`` unless
         they are already set.
      3. Download (or reuse) the model and the optional chat template.
      4. Replace the current process with ``llama-server``; on success this
         function does not return.
    """
    import shlex  # local import: only needed for logging the command line

    binary_dir = str(Path(LLAMA_SERVER_BIN).parent)
    # Prepend the binary directory, dropping empty entries and duplicates so a
    # pre-set LD_LIBRARY_PATH equal to binary_dir does not get listed twice.
    search_path = [binary_dir]
    for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":"):
        if entry and entry not in search_path:
            search_path.append(entry)
    os.environ["LD_LIBRARY_PATH"] = ":".join(search_path)

    # setdefault keeps any value the operator configured explicitly.
    for variable in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ.setdefault(variable, THREADS)

    model_path = download_model()
    template_path = download_chat_template()
    cmd = build_command(model_path, template_path)

    log("Starting native llama.cpp web UI")
    # shlex.join quotes arguments containing spaces, so the logged line can be
    # copied into a shell unchanged.
    log(shlex.join(cmd))

    # exec replaces this process image; flush buffered output first.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execvpe(cmd[0], cmd, os.environ)
151
+
152
+
153
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Emit a one-line summary on stderr, then re-raise so the full
        # traceback is still printed.
        print(f"[fatal] {exc}", file=sys.stderr, flush=True)
        raise