FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Build-time only: stops apt/debconf from prompting during the build.
# Declared as ARG rather than ENV so the setting is visible to the RUN steps
# of this stage but is not baked into the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# System dependencies, installed as root in a single layer
# (gcc is needed by Triton for kernel compilation).
#
# `git lfs install --system` writes the LFS filter configuration to the
# system-wide git config. This step runs as root, while later git commands in
# this file run as a non-root user; without --system only root's personal
# ~/.gitconfig would be configured and the non-root user would get no LFS
# support.
#
# The python/pip symlinks make the bare `python` and `pip` names resolve to the
# Python 3.10 toolchain; the apt lists are removed in the same layer that
# created them so they do not add to the image size.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
        gcc \
        git \
        git-lfs \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libsndfile1 \
        libxext6 \
        libxrender1 \
        python3-pip \
        python3.10 \
        python3.10-dev \
        python3.10-venv \
    && git lfs install --system \
    && ln -sf /usr/bin/python3.10 /usr/bin/python \
    && ln -sf /usr/bin/pip3 /usr/bin/pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir --upgrade pip

# Create the non-root user (HF Spaces requirement) and hand /app over to it
# while still root. Relying on WORKDIR alone to create /app is fragile:
# depending on the builder the directory may end up owned by root, and the
# non-root RUN steps further down (git clone, mkdir, model downloads) all
# write under /app.
# `chown user:` (trailing colon) also sets the group to the user's login group.
RUN useradd -m -u 1000 user \
    && mkdir -p /app \
    && chown user: /app
USER user

# pip runs as a non-root user from here on, so console scripts land in
# ~/.local/bin; put that directory first on PATH.
ENV HOME=/home/user \
    PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# ---- HEAVY INSTALLS (cached unless dependencies change) ----

# vLLM pinned to 0.7.3. It is expected to pull in a compatible PyTorch build
# (2.5.1+cu121) as a dependency, so torch is not installed separately.
RUN pip install \
    --no-cache-dir \
    "vllm==0.7.3"

# Fetch the FLOAT sources into /app/float_repo.
# Shallow clone: nothing in this Dockerfile uses the repository history, so
# only the tip commit is downloaded to keep the layer small.
# NOTE(review): the clone follows the default branch and is not pinned to a
# commit, so a cache-less rebuild may pick up newer upstream code — consider
# pinning once a known-good revision is chosen.
RUN git clone --depth 1 https://github.com/deepbrainai-research/float.git /app/float_repo

# Python packages for FLOAT, the app and Orpheus, all in one layer:
# first the explicitly listed requirements, then everything in
# requirements.txt (installed with --upgrade).
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir \
        "numpy<2" \
        pyyaml \
        opencv-python \
        pandas \
        tqdm \
        matplotlib \
        flow-vis \
        librosa \
        "transformers==4.46.3" \
        "tokenizers>=0.20" \
        "albumentations==1.4.15" \
        "albucore==0.0.16" \
        "torchdiffeq==0.2.5" \
        "timm==1.0.9" \
        "face_alignment==1.4.1" \
        "av==12.0.0" \
        snac \
        accelerate \
        scipy \
        soundfile \
        vosk \
    && pip install --no-cache-dir --upgrade -r requirements.txt

# Create the working directories under /tmp at build time.
RUN mkdir -p \
    /tmp/videos \
    /tmp/tts_output \
    /tmp/lipsync_output

# Best-effort pre-download of the public SNAC decoder so it is already present
# in an image layer. A failed download only prints a warning; it does not
# fail the build.
RUN python -c "import huggingface_hub as hub; \
    hub.snapshot_download(repo_id='hubertsiuzdak/snac_24khz')" \
    || echo 'WARNING: SNAC pre-download failed'

# Download the two Hugging Face models FLOAT depends on (wav2vec2 and the
# speech-emotion model) into /app/checkpoints so they live in an image layer.
# Unlike the SNAC step, a failure here fails the build.
# NOTE(review): a later COPY of ./app/checkpoints/ also targets
# /app/checkpoints/ — confirm both sources are still needed.
RUN mkdir -p /app/checkpoints \
    && python -c "from huggingface_hub import snapshot_download as fetch; \
    print('Downloading wav2vec2-base-960h...'); \
    fetch(repo_id='facebook/wav2vec2-base-960h', local_dir='/app/checkpoints/wav2vec2-base-960h'); \
    print('Downloading emotion model...'); \
    fetch(repo_id='r-f/wav2vec-english-speech-emotion-recognition', local_dir='/app/checkpoints/wav2vec-english-speech-emotion-recognition'); \
    print('Models ready')"

# Fetch the small English Vosk model (~50MB, CPU-only STT): download the zip
# to /tmp, unpack it under /app, rename the unpacked directory to
# /app/vosk-model, and delete the archive within the same layer.
RUN python -c "import os; \
    from urllib.request import urlretrieve; \
    from zipfile import ZipFile; \
    archive='/tmp/vosk.zip'; \
    print('Downloading Vosk model...'); \
    urlretrieve('https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip', archive); \
    ZipFile(archive).extractall('/app/'); \
    os.rename('/app/vosk-model-small-en-us-0.15', '/app/vosk-model'); \
    os.unlink(archive); \
    print('Vosk model ready')"

# ---- APP CODE (only the layers below rebuild on code changes) ----
# cache bust v4

# Application modules, copied into /app and owned by the non-root user.
COPY --chown=user \
    ./app.py \
    ./llm.py \
    ./groq_tts.py \
    ./orpheus_tts.py \
    ./float_lipsync.py \
    ./idle_generator.py \
    ./vosk_stt.py \
    /app/

# Avatar profiles (each has ref.png, persona.txt, idlevideos/).
COPY --chown=user ./avatars/ /app/avatars/

# Checkpoints shipped in the build context (wav2vec2, emotion model, float.pth),
# baked into the image to avoid runtime downloads.
COPY --chown=user ./app/checkpoints/ /app/checkpoints/

# Voice reference + transcript for voice cloning (used by orpheus_tts);
# every context entry whose name starts with "voice" is copied.
COPY --chown=user ./voice* /app/

# Document the service port and start uvicorn (exec form) serving the `app`
# object from the app module on all interfaces, port 7860.
EXPOSE 7860
CMD ["uvicorn", "app:app", \
     "--host", "0.0.0.0", \
     "--port", "7860"]