# CUDA 12.8 + cuDNN development image on Ubuntu 24.04.
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04

# Build-time only: keeps apt/debconf from prompting during the RUN steps below
# without leaving DEBIAN_FRONTEND set in the environment of the running
# container (an ARG is exported to RUN instructions but not to the image).
ARG DEBIAN_FRONTEND=noninteractive

# Hugging Face access token, supplied with `--build-arg HF_TOKEN=...` and then
# re-exported so it is also present in the container's runtime environment.
# NOTE(review): build args are recorded in the image history and ENV values
# are stored in the image config, so anyone who can pull this image can read
# the token. Consider a BuildKit secret mount for build steps and injecting
# the token at `docker run` time instead - confirm what consumes it first.
ARG HF_TOKEN
ENV HF_TOKEN=${HF_TOKEN}

# OS packages: build toolchain, development headers, Python, Go, media and
# archive utilities.
#
# `apt-get update`, `upgrade` and `install` run in ONE layer so a cached
# `update` layer can never be paired with a newer package list, and the apt
# index is removed in the same layer so it does not add to the image size.
# Packages are listed once each, alphabetically, to keep diffs readable.
#
# NOTE(review): nvidia-driver-570 is kept from the original list. GPU
# containers normally receive the driver from the host through the NVIDIA
# container runtime - confirm this package is really needed in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends --fix-missing \
        build-essential \
        cmake \
        curl \
        ffmpeg \
        git \
        git-lfs \
        golang-go \
        grep \
        libbz2-dev \
        libffi-dev \
        liblzma-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsqlite3-dev \
        libssl-dev \
        libxml2-dev \
        libxmlsec1-dev \
        nvidia-driver-570 \
        original-awk \
        python3 \
        python3-pip \
        sed \
        tk-dev \
        unzip \
        wget \
        xz-utils \
        zlib1g-dev \
        zstd && \
    rm -rf /var/lib/apt/lists/*

# Application directory. WORKDIR creates /app when it does not exist, so no
# separate `mkdir` layer is needed. The build context is copied in with its
# files owned by UID 1000.
WORKDIR /app
COPY --chown=1000 . /app

# Install the prebuilt Ollama release under /usr.
# The archive is downloaded to a file first instead of being piped into tar:
# the default /bin/sh has no pipefail, so with `curl | tar` a failed download
# would be hidden behind tar's exit status. The temporary archive is deleted
# in the same layer so it never reaches the image.
RUN curl -fsSL -o /tmp/ollama-linux-amd64.tar.zst https://ollama.com/download/ollama-linux-amd64.tar.zst && \
    tar -x --zstd -f /tmp/ollama-linux-amd64.tar.zst -C /usr && \
    rm -f /tmp/ollama-linux-amd64.tar.zst && \
    chmod +x /usr/bin/ollama

# RUN cd /app && \
#     git clone --recursive https://github.com/ollama/ollama.git && \
#     cd ollama && \
#     go generate ./... && \
#     go build . && \
#     ln -s $PWD/ollama /usr/bin/ollama  && \
#     chmod +x ollama && \
#     cd ..

# RUN cd /app && \
#     git clone --recursive https://github.com/ggerganov/llama.cpp && \
#     cd llama.cpp && \
#     cmake -B build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=OFF -DLLAMA_CURL=OFF && \
#     cmake --build build --config Release -j --target llama-quantize --parallel 12 && \
#     cp ./build/bin/llama-* /usr/bin/ && \
#     cp convert_hf_to_gguf.py /usr/bin/convert_hf_to_gguf && \
#     rm -rf build && \
#     cd ..


# Python dependencies, installed into the system interpreter
# (--break-system-packages overrides the distro's externally-managed marker;
# --ignore-installed reinstalls over any apt-provided copies).
# Every step passes --no-cache-dir so pip's download/wheel cache is not stored
# in the image layers. The steps stay separate so each one resolves exactly as
# before and can be cached independently.

# Packaging tooling first.
RUN pip install --no-cache-dir -U pip setuptools wheel --break-system-packages --ignore-installed

# Application and conversion dependencies.
RUN pip install --no-cache-dir "huggingface-hub" "hf-transfer" "gradio[oauth]>=6.5.1" "APScheduler" "protobuf>=4.21.0,<5.0.0" "sentencepiece>=0.1.98,<0.3.0" "numpy~=1.26.4" "gguf>=0.1.0" "fastapi" --break-system-packages --ignore-installed

RUN pip install --no-cache-dir "torch>=2.8.0" --break-system-packages --ignore-installed

# NOTE(review): transformers is installed from the tip of the default branch,
# so rebuilds are not reproducible - consider pinning a tag or commit.
RUN pip install --no-cache-dir git+https://github.com/huggingface/transformers.git --break-system-packages --ignore-installed


# Fetch the llama.cpp tool bundle from the Hugging Face repo
# lainlives/llama.cpp into a scratch directory, mark everything executable,
# keep a copy of the convert* script at /app/convert_hf_to_gguf.py, and move
# the files onto PATH in /usr/bin.
# The scratch directory is removed in the same layer: the `*` glob does not
# match dotfiles, so anything hidden that the download leaves behind (and the
# directory itself) would otherwise stay in the image.
RUN mkdir /tmp/llama && \
    hf download lainlives/llama.cpp --local-dir /tmp/llama && \
    chmod +x /tmp/llama/* && \
    cp /tmp/llama/convert* /app/convert_hf_to_gguf.py && \
    mv /tmp/llama/* /usr/bin/ && \
    rm -rf /tmp/llama

# Runtime environment for the app, Gradio, tqdm, Ollama and the CUDA libraries.
#
# Docker expands every ${VAR} in a single ENV instruction against the
# environment as it was BEFORE the instruction, and when a key is assigned
# twice only the last assignment survives. Therefore:
#   * PATH and LD_LIBRARY_PATH are each assigned exactly once, carrying all
#     of the directories the two original assignments meant to prepend
#     (previously the /usr/local/bin and /usr/local/lib{,64} entries were
#     silently discarded by the later duplicate assignment).
#   * PYTHONPATH is written literally as /app. The original `${HOME}/app`
#     expanded with HOME undefined at that point (no earlier ENV/ARG in this
#     file defines it), which yields /app - assuming the base image does not
#     define HOME either; confirm if the base image changes.
ENV HOME=/app \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    PATH=/usr/local/nvidia/bin:/usr/local/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH} \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_ANALYTICS_ENABLED=False \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces \
    OLLAMA_HOST=0.0.0.0:11434 \
    OLLAMA_MODELS=/app/ollama_models


# Port the container is expected to serve on (EXPOSE is documentation only;
# it does not publish the port).
EXPOSE 7860

# Exec form: python3 runs as PID 1 and receives stop signals directly instead
# of being wrapped in `/bin/sh -c`. The command needs no shell expansion.
ENTRYPOINT ["python3", "/app/app.py"]