#!/bin/bash
#
# Bootstrap script: make sure a llama.cpp checkout exists, build the
# llama-quantize / llama-gguf-split / llama-imatrix tools from it, copy the
# resulting llama-* binaries into the checkout root, then launch app.py.
#
# Environment:
#   RUN_LOCALLY  When set to a non-empty value the build is configured with
#                GGML_CUDA=OFF; when unset/empty CUDA is enabled.

# Fail fast: without this a failed clone/configure/build would be ignored and
# app.py would start without the tools it was supposed to get.
set -euo pipefail

# Clone llama.cpp only when no checkout is present.
# (Original note here read "only run in dev env" - presumably other
# environments ship with the checkout already in place; confirm.)
if [[ ! -d "llama.cpp" ]]; then
  git clone https://github.com/ggerganov/llama.cpp
fi

# CUDA is off by default and switched on only when NOT running locally.
# The ":-" default keeps the test safe under `set -u` when RUN_LOCALLY is unset.
export GGML_CUDA=OFF
if [[ -z "${RUN_LOCALLY:-}" ]]; then
  export GGML_CUDA=ON
fi

# ccache settings: cache location and a 2G size cap.
# NOTE(review): the original comment said this reduces memory usage during the
# build; ccache is a compilation cache, so this mainly affects rebuild time.
# Whether ccache is actually used depends on llama.cpp's CMake setup - confirm.
export CCACHE_DIR="/home/user/.ccache"
mkdir -p "${CCACHE_DIR}"
export CCACHE_MAXSIZE="2G"

# Build inside a subshell so the caller's working directory is untouched and
# a failed `cd` can never cause `rm -rf build` to run in the wrong place.
(
  cd llama.cpp

  # Static libs, no libcurl dependency, CUDA toggled by GGML_CUDA above.
  cmake -B build \
    -DBUILD_SHARED_LIBS=OFF \
    -DGGML_CUDA="${GGML_CUDA}" \
    -DLLAMA_CURL=OFF \
    -DCMAKE_CXX_FLAGS="-O2 -Wno-deprecated-gpu-targets"

  # Only the three tools are built; parallelism is limited to two jobs.
  cmake --build build --config Release -j2 \
    --target llama-quantize llama-gguf-split llama-imatrix

  # Keep the binaries next to the sources, then drop the build tree.
  cp ./build/bin/llama-* .
  rm -rf build
)

python app.py