# Image for a Hugging Face Space that serves a FastAPI/uvicorn app (app:app) on
# port 7860. Heavy dependencies and model downloads come first so they stay
# cached; application code is copied last so code edits only rebuild the tail.
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Build-time only: keeps apt non-interactive during the build without leaking
# the setting into the runtime environment of the container.
ARG DEBIAN_FRONTEND=noninteractive

# System dependencies (gcc needed by Triton for kernel compilation).
# Runs as root. `python` and `pip` are symlinked to the 3.10 toolchain, and the
# apt lists are removed in the same layer so they never reach the image.
# NOTE(review): `git lfs install` runs here as root, i.e. before `USER user`;
# without `--system` it only touches the invoking user's git config. Confirm
# whether the later clone (run as `user`) needs LFS smudging.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
        gcc \
        git \
        git-lfs \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libsndfile1 \
        libxext6 \
        libxrender1 \
        python3-pip \
        python3.10 \
        python3.10-dev \
        python3.10-venv \
    && git lfs install \
    && ln -sf /usr/bin/python3.10 /usr/bin/python \
    && ln -sf /usr/bin/pip3 /usr/bin/pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir --upgrade pip

# Create user (HF Spaces requirement).
# /app is created and handed to that user while we are still root, so every
# later unprivileged step (git clone, model downloads) can write into it
# regardless of how the builder assigns ownership to WORKDIR-created paths.
# `chown user:` uses the user's login group.
RUN useradd -m -u 1000 user \
    && mkdir -p /app \
    && chown user: /app
USER user
ENV HOME=/home/user \
    PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# ---- HEAVY INSTALLS (cached unless dependencies change) ----

# vLLM 0.7.3 (installs compatible PyTorch 2.5.1+cu121 automatically)
RUN pip install --no-cache-dir vllm==0.7.3

# Clone FLOAT repo. Shallow clone: only the working tree is needed in the
# image, so the full history is not baked into the layer.
# NOTE(review): the clone tracks the default branch head; pin a commit or tag
# if reproducible builds are required.
RUN git clone --depth 1 https://github.com/deepbrainai-research/float.git /app/float_repo

# FLOAT dependencies + app requirements + Orpheus dependencies (one layer).
# NOTE(review): `--upgrade -r requirements.txt` may move packages installed by
# the steps above if requirements.txt allows newer versions — verify the pins.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir \
        "numpy<2" \
        pyyaml opencv-python pandas tqdm matplotlib flow-vis librosa \
        "transformers==4.46.3" "tokenizers>=0.20" "albumentations==1.4.15" "albucore==0.0.16" \
        "torchdiffeq==0.2.5" "timm==1.0.9" "face_alignment==1.4.1" "av==12.0.0" \
        snac accelerate scipy soundfile vosk \
    && pip install --no-cache-dir --upgrade -r requirements.txt

# Create scratch directories under /tmp (names suggest video, TTS and lip-sync
# outputs — confirm against the app code).
RUN mkdir -p /tmp/videos /tmp/tts_output /tmp/lipsync_output

# Pre-download SNAC decoder (public, cached in Docker layer).
# Deliberately best-effort: a failed download only prints a warning and the
# build continues.
RUN python -c "from huggingface_hub import snapshot_download; \
    snapshot_download('hubertsiuzdak/snac_24khz')" \
    || echo 'WARNING: SNAC pre-download failed'

# Pre-download FLOAT dependencies (wav2vec2 + emotion model) — cached in Docker layer.
# Unlike the SNAC step, a failure here fails the build.
RUN mkdir -p /app/checkpoints && \
    python -c "from huggingface_hub import snapshot_download; \
    print('Downloading wav2vec2-base-960h...'); \
    snapshot_download('facebook/wav2vec2-base-960h', local_dir='/app/checkpoints/wav2vec2-base-960h'); \
    print('Downloading emotion model...'); \
    snapshot_download('r-f/wav2vec-english-speech-emotion-recognition', local_dir='/app/checkpoints/wav2vec-english-speech-emotion-recognition'); \
    print('Models ready')"

# Download Vosk small English model (~50MB, CPU-only STT).
# The archive is unpacked into /app, renamed to /app/vosk-model, and the zip is
# deleted in the same layer.
# NOTE(review): the download is not checksum-verified; consider pinning a
# SHA-256 once the expected digest is known.
RUN python -c "import urllib.request, zipfile, os; \
    url='https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip'; \
    print('Downloading Vosk model...'); \
    urllib.request.urlretrieve(url, '/tmp/vosk.zip'); \
    zipfile.ZipFile('/tmp/vosk.zip').extractall('/app/'); \
    os.rename('/app/vosk-model-small-en-us-0.15', '/app/vosk-model'); \
    os.unlink('/tmp/vosk.zip'); \
    print('Vosk model ready')"

# ---- APP CODE (only this rebuilds on code changes) ----
# cache bust v4
COPY --chown=user ./app.py ./llm.py ./groq_tts.py ./orpheus_tts.py ./float_lipsync.py ./idle_generator.py ./vosk_stt.py /app/

# Copy avatar profiles (each has ref.png, persona.txt, idlevideos/)
COPY --chown=user ./avatars/ /app/avatars/

# Copy checkpoints (wav2vec2, emotion model, float.pth) — baked in to avoid runtime downloads.
# This is layered on top of the /app/checkpoints directory populated by the
# pre-download step above; files from the build context win on name clashes.
COPY --chown=user ./app/checkpoints/ /app/checkpoints/

# Copy voice reference + transcript for voice cloning (used by orpheus_tts).
# The glob must match at least one file in the build context or the build fails.
COPY --chown=user ./voice* /app/

# Documentation only: the port uvicorn binds below.
EXPOSE 7860

# Exec form so uvicorn is PID 1 and receives stop signals directly.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]