# Image that installs a prebuilt llama.cpp release and huggingface_hub on top
# of python:3.11-slim, then starts app.py.
FROM python:3.11-slim

# Build-time only: keeps apt-get non-interactive during the RUN steps below
# without persisting the setting into the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime configuration baked into the image.
# NOTE(review): these are presumably read by app.py to locate the model and to
# build the llama-server command line -- confirm against app.py.
#   MODEL_*            : model repository, file name and local directory.
#   LLAMA_*            : llama.cpp release tag, install dir, server binary and
#                        listen address; LLAMA_PORT matches EXPOSE below.
#   LD_LIBRARY_PATH    : points at the llama.cpp install dir so shared
#                        libraries unpacked next to the binary can be resolved.
#   THREADS..GPU_LAYERS: server tuning knobs ("default" values are passed
#                        through verbatim; interpretation is up to app.py).
#   TEMPERATURE..REPEAT_PENALTY: sampling parameters.
ENV MODEL_REPO=yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF \
    MODEL_FILE=gemma4-v2-Q4_K_M.gguf \
    MODEL_DIR=/data/models/gemma4-coder \
    LLAMA_VERSION=b9592 \
    LLAMA_DIR=/opt/llama.cpp \
    LLAMA_SERVER_BIN=/opt/llama.cpp/llama-server \
    LD_LIBRARY_PATH=/opt/llama.cpp \
    LLAMA_HOST=0.0.0.0 \
    LLAMA_PORT=7860 \
    THREADS=4 \
    CTX_SIZE=2048 \
    BATCH_SIZE=default \
    UBATCH_SIZE=default \
    FLASH_ATTN=default \
    CACHE_TYPE_K=default \
    CACHE_TYPE_V=default \
    GPU_LAYERS=0 \
    TEMPERATURE=0.2 \
    TOP_P=0.95 \
    TOP_K=64 \
    REPEAT_PENALTY=1.08 \
    HF_XET_HIGH_PERFORMANCE=1 \
    PYTHONUNBUFFERED=1

# Run subsequent RUN steps under bash with pipefail so that a failing `curl`
# in the download pipeline below aborts the build instead of being masked by
# the exit status of `tar`.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# OS packages: curl + CA certificates for the release download, and the
# OpenMP / C++ runtime libraries.
# NOTE(review): libgomp1 / libstdc++6 are presumably required by the prebuilt
# llama.cpp binaries -- confirm.
# The apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgomp1 \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/*

# Download the pinned llama.cpp release tarball and unpack it straight into
# LLAMA_DIR, dropping the archive's top-level directory, then make sure the
# server binary is executable.
RUN mkdir -p "${LLAMA_DIR}" \
    && curl -fL "https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_VERSION}/llama-${LLAMA_VERSION}-bin-ubuntu-x64.tar.gz" \
        | tar -xz --strip-components=1 -C "${LLAMA_DIR}" \
    && chmod +x "${LLAMA_SERVER_BIN}"

# Python dependency of app.py.
# NOTE(review): huggingface_hub is unpinned; pin to a known-good version for
# reproducible builds once one has been validated.
RUN pip install --no-cache-dir \
        huggingface_hub

WORKDIR /app
COPY app.py /app/app.py

# Documentation only; matches LLAMA_PORT above.
EXPOSE 7860

# NOTE(review): the container runs as root because no USER is set. Before
# adding one, confirm which user must be able to write to MODEL_DIR.
CMD ["python", "app.py"]