# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
#   COMPOSE_PROFILES=35b_36       -> qwen36-35b_q4_gguf
#   COMPOSE_PROFILES=27b_36       -> qwen36-27b_q4_gguf
#   COMPOSE_PROFILES=gemma4-31b   -> gemma4-31b_q4_gguf
#   COMPOSE_PROFILES=gemma4-26b   -> gemma4-26b_q4_gguf
# The app always talks to http://llama-inference:8080 (network alias shared by every model service).
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-*.gguf files
# (same filename across repos) and -hf downloads are not shared across profiles.
# Example CLI commands (all are recommended for 24gb VRAM systems minimum, add --build to the below commands if you want to rebuild the app images):

# docker compose -f docker-compose_llama_agentic.yml --profile 35b_36 up -d
# docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d
# docker compose -f docker-compose_llama_agentic.yml --profile gemma4-31b up -d
# docker compose -f docker-compose_llama_agentic.yml --profile gemma4-26b up -d

# For agentic usage with pi (pi-agent service name matches the active profile):
# docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d pi-agent
# docker compose -f docker-compose_llama_agentic.yml --profile 35b_36 up -d pi-agent-35b
# docker compose -f docker-compose_llama_agentic.yml --profile gemma4-31b up -d pi-agent-gemma-31b
# docker compose -f docker-compose_llama_agentic.yml --profile gemma4-26b up -d pi-agent-gemma-26b

# Cloud-only Pi agent (no local llama.cpp GPU model; redaction app still runs locally):
# Set GEMINI_API_KEY (and optionally GOOGLE_API_KEY) in .env or config/pi_agent.env before starting pi-gemini.
# docker compose -f docker-compose_llama_agentic.yml --profile pi-gemini up -d

# Set AWS_REGION plus AWS credentials (or AWS_PROFILE via mounted ~/.aws) before starting pi-bedrock.
# SSO (recommended): mount host ~/.aws (read-write — SSO token refresh writes to sso/cache),
# set AWS_PROFILE (or PI_AWS_PROFILE) to your SSO profile name, run `aws sso login` on the host.
# Pi requires AWS_PROFILE in the container env — mounting ~/.aws alone is not enough for Pi's auth check.
# docker compose -f docker-compose_llama_agentic.yml --profile pi-bedrock up -d
#
# Optional Docker-only settings for redaction-app services: copy settings into
# config/docker_app_config.env (see config/docker_app_config.env.example). Loaded
# at container start; values in each service's environment: block override these.

# Optional env file merged into the redaction-app services. Compose does not
# fail when the file is missing because `required` is false.
x-redaction-app-env: &redaction-app-env
  env_file:
    - required: false
      path: "config/docker_app_config.env"

# Base definition merged into every pi-agent-* service: image build, optional
# env file, published port, bind mounts and the start-script entrypoint.
x-pi-agent-common: &pi-agent-common
  image: "pi-agent-doc-redaction"
  build:
    context: "."
    dockerfile: "agent-redact/pi-agent/Dockerfile"
    target: "dev"
  env_file:
    # Skipped without error when the file does not exist.
    - required: false
      path: "config/pi_agent.env"
  working_dir: "/workspace/doc_redaction"
  entrypoint:
    - "/bin/bash"
    - "/workspace/doc_redaction/agent-redact/pi/start.sh"
  # Keep stdin open and allocate a TTY for the container.
  stdin_open: true
  tty: true
  ports:
    - '7862:7862'
  volumes:
    # Repository checkout (the compose file's directory), mounted read-write.
    - ".:/workspace/doc_redaction:rw"
    # Same host workspace folder that the redaction-app services mount.
    - "./workspace:/home/user/app/workspace:rw"
    # Named volume declared under the top-level `volumes:` key.
    - "pi-agent-sessions:/home/user/.pi/agent/sessions"
  networks:
    - redaction-net-llama

# Environment shared by every pi-agent-* service. Each `${VAR:-default}` value
# is taken from the host environment / .env file, with the default after `:-`
# used when the variable is unset or empty.
x-pi-agent-env: &pi-agent-env
  # --- Fixed container settings ---
  APP_TYPE: "pi"
  APP_CONFIG_PATH: "/workspace/doc_redaction/config/pi_agent.env"
  HOME: "/home/user"
  PYTHONPATH: "/workspace/doc_redaction:/workspace/doc_redaction/agent-redact/pi"
  PI_WORKSPACE_DIR: "/home/user/app/workspace"
  PI_SKIP_VERSION_CHECK: "1"
  PI_OFFLINE: "1"

  # --- Service endpoints and ports ---
  DOC_REDACTION_GRADIO_URL: "${DOC_REDACTION_GRADIO_URL:-http://redaction-app-llama:7860}"
  PI_LLAMA_BASE_URL: "${PI_LLAMA_BASE_URL:-http://llama-inference:8080/v1}"
  # PI_VLM_BASE_URL: http://llama-inference:8080
  GRADIO_SERVER_NAME: "${GRADIO_SERVER_NAME:-0.0.0.0}"
  GRADIO_SERVER_PORT: "${GRADIO_SERVER_PORT:-7862}"
  PI_GRADIO_PORT: "${PI_GRADIO_PORT:-7862}"

  # --- Optional agent defaults (disabled here; set per service where needed) ---
  # PI_DEFAULT_PROVIDER: ${PI_DEFAULT_PROVIDER:-llama-cpp}
  # PI_DEFAULT_MODEL: ${PI_DEFAULT_MODEL:-}
  # PI_DEFAULT_OCR_METHOD: ${PI_DEFAULT_OCR_METHOD:-hybrid-paddle-inference-server}
  # PI_DEFAULT_PII_METHOD: ${PI_DEFAULT_PII_METHOD:-Local}
  PI_MAX_PAGES: "${PI_MAX_PAGES:-${MAX_DOC_PAGES:-3000}}"

  # --- Gemini / Google API keys (empty when not provided) ---
  GEMINI_API_KEY: "${GEMINI_API_KEY:-}"
  GOOGLE_API_KEY: "${GOOGLE_API_KEY:-}"

  # --- AWS region, credentials and feature switches ---
  AWS_REGION: "${AWS_REGION:-eu-west-2}"
  AWS_DEFAULT_REGION: "${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}"
  AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID:-}"
  AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY:-}"
  AWS_SESSION_TOKEN: "${AWS_SESSION_TOKEN:-}"
  AWS_PROFILE: "${AWS_PROFILE:-}"
  RUN_AWS_FUNCTIONS: "${RUN_AWS_FUNCTIONS:-False}"
  PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: "${PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:-True}"

  # --- Web server, routing and authentication ---
  RUN_FASTAPI: "${RUN_FASTAPI:-False}"
  ROOT_PATH: "${ROOT_PATH:-}"
  FASTAPI_ROOT_PATH: "${FASTAPI_ROOT_PATH:-/}"
  ALLOWED_HOSTS: "${ALLOWED_HOSTS:-}"
  ALLOWED_ORIGINS: "${ALLOWED_ORIGINS:-}"
  CUSTOM_HEADER: "${CUSTOM_HEADER:-}"
  CUSTOM_HEADER_VALUE: "${CUSTOM_HEADER_VALUE:-}"
  COGNITO_AUTH: "${COGNITO_AUTH:-False}"
  AWS_USER_POOL_ID: "${AWS_USER_POOL_ID:-}"
  AWS_CLIENT_ID: "${AWS_CLIENT_ID:-}"
  AWS_CLIENT_SECRET: "${AWS_CLIENT_SECRET:-}"

  # --- Outputs and logging ---
  SESSION_OUTPUT_FOLDER: "${SESSION_OUTPUT_FOLDER:-True}"
  SAVE_LOGS_TO_CSV: "${SAVE_LOGS_TO_CSV:-True}"
  SAVE_LOGS_TO_DYNAMODB: "${SAVE_LOGS_TO_DYNAMODB:-False}"
  SAVE_OUTPUTS_TO_S3: "${SAVE_OUTPUTS_TO_S3:-False}"
  S3_OUTPUTS_FOLDER: "${S3_OUTPUTS_FOLDER:-}"
  S3_OUTPUTS_BUCKET: "${S3_OUTPUTS_BUCKET:-}"

# Shared image build and runtime definition for the cloud-backed redaction-app
# services (merged by redaction-app-llama-gemini and redaction-app-llama-bedrock).
x-redaction-app-build: &redaction-app-build
  <<: *redaction-app-env
  image: "redaction-app-main"
  build:
    context: "."
    dockerfile: "Dockerfile"
    target: "gradio"
    # Build arguments in mapping form; values are quoted so they stay strings.
    args:
      TORCH_GPU_ENABLED: "False"
      INSTALL_VLM: "False"
      PADDLE_GPU_ENABLED: "True"
      INSTALL_PADDLEOCR: "True"
  shm_size: "8gb"
  # Reserve every NVIDIA GPU on the host for this container.
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: all
            capabilities:
              - gpu
  ports:
    - '7861:7860'
  volumes:
    - "./workspace:/home/user/app/workspace:rw"
  networks:
    - redaction-net-llama

services:

  # The Qwen 3.6 27B setup below requires 40GB of VRAM to run. For 24GB, change the --hf-file parameter to Qwen3.6-27B-UD-Q4_K_XL.gguf or Qwen3.6-27B-IQ4_NL.gguf.
  qwen36-27b_q4_gguf:
    profiles:
      - "27b_36"
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    # Always check the registry for a newer image before starting.
    pull_policy: always
    command:
      # --- Model weights and multimodal projector (downloaded from Hugging Face) ---
      - -hf
      - unsloth/Qwen3.6-27B-MTP-GGUF
      - --hf-file
      - Qwen3.6-27B-UD-Q6_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF/resolve/main/mmproj-BF16.gguf
      # --- GPU placement ---
      - --n-gpu-layers
      - '-1'
      # Alternative multi-GPU options, currently disabled:
      # - -mg
      # - "0"
      # - -dev
      # - "cuda0,cuda1"
      # - -sm
      # - "row"
      - --tensor-split
      - '24,14'
      # --- Context and batching ---
      - --ctx-size
      - '114688'
      - -ub
      - '512'
      - --fit
      - 'off'
      # --- Sampling defaults ---
      - --temp
      - '0.7'
      - --top-k
      - '20'
      - --top-p
      - '0.95'
      - --min-p
      - '0.0'
      - --frequency-penalty
      - '1'
      - --presence-penalty
      - '0.0'
      - --chat-template-kwargs
      - '{"preserve_thinking": true}'
      # --- Server ---
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --no-warmup
      - --seed
      - '42'
      - --image_min_tokens
      - '300'
      - --parallel
      - '1'
      # --- KV cache quantisation ---
      - --cache-type-k
      - 'q8_0'
      - --cache-type-v
      - 'q8_0'
      # --- Speculative decoding ---
      - --spec-type
      - 'draft-mtp'
      - --spec-draft-n-max
      - '2'
    ports:
      - '8000:8080'
    volumes:
      - "./models:/models"
      # Per-model cache volumes so downloads are not shared with other profiles.
      - "hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp"
      - "hf-hub-cache-qwen36-27b:/root/.cache/huggingface"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Healthy once the OpenAI-compatible model listing responds; the long
    # start_period gives the container time before failed probes count.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # The Qwen 3.6 35B setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower-VRAM systems, n-cpu-moe ~ 40 could work on a 12GB VRAM system, and n-cpu-moe ~ 20 on a 16GB VRAM system.
  qwen36-35b_q4_gguf:
    profiles:
      - "35b_36"
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    # Always check the registry for a newer image before starting.
    pull_policy: always
    command:
      # --- Model weights and multimodal projector (downloaded from Hugging Face) ---
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-BF16.gguf
      # --- GPU placement and context ---
      - --n-gpu-layers
      - '-1'
      - --ctx-size
      - '114688'
      - --fit
      - 'off'
      # --- Sampling defaults ---
      - --temp
      - '1.0'
      - --top-k
      - '20'
      - --top-p
      - '0.95'
      - --min-p
      - '0.0'
      - --frequency-penalty
      - '1'
      - --presence-penalty
      - '1.5'
      - --repeat-penalty
      - '1'
      - --chat-template-kwargs
      - '{"preserve_thinking": true}'
      # --- Server ---
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --no-warmup
      - --seed
      - '42'
      # Increase this value to fit within your available VRAM.
      - --n-cpu-moe
      - '0'
      - --image_min_tokens
      - '300'
      - --parallel
      - '1'
      # --- KV cache quantisation ---
      - --cache-type-k
      - 'q8_0'
      - --cache-type-v
      - 'q8_0'
    ports:
      - '8005:8080'
    volumes:
      - "./models:/models"
      # Per-model cache volumes so downloads are not shared with other profiles.
      - "hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp"
      - "hf-hub-cache-qwen36-35b:/root/.cache/huggingface"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Healthy once the OpenAI-compatible model listing responds; the long
    # start_period gives the container time before failed probes count.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # The Gemma 4 31B setup below requires 40GB of VRAM to run with the following settings.
  gemma4-31b_q4_gguf:
    profiles:
      - "gemma4-31b"
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    # Always check the registry for a newer image before starting.
    pull_policy: always
    command:
      # --- Model weights and multimodal projector (downloaded from Hugging Face) ---
      - -hf
      - unsloth/gemma-4-31B-it-qat-GGUF
      - --hf-file
      - gemma-4-31B-it-qat-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-qat-GGUF/resolve/main/mmproj-BF16.gguf
      # - --no-mmproj
      # --- GPU placement, context and batching ---
      - --n-gpu-layers
      - '-1'
      # - --tensor-split
      # - "24,16"
      - --ctx-size
      - '114688'
      - -ub
      - '1024'
      - --fit
      - 'off'
      # --- Sampling defaults ---
      - --temp
      - '1.0'
      - --top-k
      - '64'
      - --top-p
      - '1.0'
      # --- Server ---
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --no-warmup
      - --seed
      - '42'
      - --parallel
      - '1'
      # --- KV cache quantisation ---
      - --cache-type-k
      - 'q8_0'
      - --cache-type-v
      - 'q8_0'
      # --- Thinking / reasoning switched off ---
      - --chat-template-kwargs
      - '{"enable_thinking": false}'
      - --reasoning
      - 'off'
      # --- Image token limits and multi-GPU split mode ---
      - --image_min_tokens
      - '300'
      - --image_max_tokens
      - '1800'
      - --split-mode
      - 'layer'
      # Speculative decoding, currently disabled:
      # - --spec-type
      # - "draft-mtp"
      # - --spec-draft-n-max
      # - "3"
    ports:
      - '8002:8080'
    volumes:
      - "./models:/models"
      # Per-model cache volumes so downloads are not shared with other profiles.
      - "hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp"
      - "hf-hub-cache-gemma4-31b:/root/.cache/huggingface"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Healthy once the OpenAI-compatible model listing responds; the long
    # start_period gives the container time before failed probes count.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # The Gemma 4 26B setup below requires 24GB+ of VRAM to run.
  gemma4-26b_q4_gguf:
    profiles:
      - "gemma4-26b"
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    # Always check the registry for a newer image before starting.
    pull_policy: always
    command:
      # --- Model weights and multimodal projector (downloaded from Hugging Face) ---
      - -hf
      - unsloth/gemma-4-26B-A4B-it-GGUF
      - --hf-file
      - gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
      # --- GPU placement and context ---
      - --n-gpu-layers
      - '-1'
      - --ctx-size
      - '114688'
      # - --no-mmproj
      # Alternative multi-GPU options, currently disabled:
      # - -mg
      # - "0"
      # - -dev
      # - "cuda0,cuda1"
      # Run on the first CUDA device only.
      - -dev
      - 'cuda0'
      # - -sm
      # - "row"
      # - --tensor-split
      # - "24,16"
      - -ub
      - '1024'
      - --fit
      - 'off'
      # --- Sampling defaults ---
      - --temp
      - '0.1'
      - --top-k
      - '64'
      - --top-p
      - '0.95'
      # --- Server ---
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --no-warmup
      - --seed
      - '42'
      - --parallel
      - '1'
      # Thinking controls, currently disabled:
      # - --chat-template-kwargs
      # - "{\"enable_thinking\": false}"
      # - reasoning off
      # --- KV cache quantisation ---
      - --cache-type-k
      - 'q8_0'
      - --cache-type-v
      - 'q8_0'
      # - --image_min_tokens
      # - "300"
    ports:
      # NOTE(review): gemma4-31b_q4_gguf publishes the same host port (8002).
      # Harmless while only one model profile runs at a time; confirm intended.
      - '8002:8080'
    volumes:
      - "./models:/models"
      # Per-model cache volumes so downloads are not shared with other profiles.
      - "hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp"
      - "hf-hub-cache-gemma4-26b:/root/.cache/huggingface"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Healthy once the OpenAI-compatible model listing responds; the long
    # start_period gives the container time before failed probes count.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 8
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # Redaction app used with any of the local llama.cpp model profiles. It waits
  # for whichever model service is active to become healthy; `required: false`
  # lets Compose ignore the model services that the active profile does not start.
  redaction-app-llama:
    <<: *redaction-app-env
    profiles: ["35b_36", "27b_36", "gemma4-31b", "gemma4-26b"]
    image: redaction-app-main
    build:
      context: .              # Look in the current folder
      dockerfile: Dockerfile  # Use this file
      target: gradio          # Use the 'gradio' stage from your Dockerfile
      args:                   # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen36-27b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-31b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-26b_q4_gguf:
        condition: service_healthy
        required: false
    # Values here override anything loaded from the optional env_file.
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      # All model services share the llama-inference network alias.
      - INFERENCE_SERVER_API_URL=http://llama-inference:8080
      # Empty values: in list form Compose passes everything after `=` verbatim,
      # so writing KEY="" would set the value to two literal quote characters.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      # Keep this to 1 to avoid VRAM overflow or errors
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
      - RUN_MCP_SERVER=True

    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7861:7860"
    volumes:
      - ./workspace:/home/user/app/workspace:rw
    networks:
      - redaction-net-llama

  # Cloud-backed redaction app (no llama.cpp). Network alias keeps pi-agent URL unchanged.
  redaction-app-llama-gemini:
    <<: *redaction-app-build
    profiles:
      - "pi-gemini"
    # Reachable as `redaction-app-llama`, the hostname the pi-agent defaults use.
    networks:
      redaction-net-llama:
        aliases:
          - redaction-app-llama
    # NOTE(review): unlike the other redaction-app services, APP_MODE is not set
    # here — confirm whether that is intentional.
    environment:
      # --- Runtime ---
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - MAX_WORKERS=8
      - TESSERACT_MAX_WORKERS=4
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - MAX_NEW_TOKENS=8192
      # --- UI options: local models shown, inference-server options hidden ---
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=False
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
      - SHOW_HYBRID_MODELS=False
      - SHOW_VLM_MODEL_OPTIONS=False
      - SHOW_CUSTOM_VLM_ENTITIES=False
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      # --- Gemini options and credentials ---
      - SHOW_GEMINI_VLM_MODELS=True
      - SHOW_GEMINI_LLM_MODELS=True
      - SHOW_GEMINI_LLM_PII_OPTIONS=True
      - CLOUD_VLM_MODEL_CHOICE=${CLOUD_VLM_MODEL_CHOICE:-gemini-flash-latest}
      # NOTE(review): `bedrock_vlm` in the Gemini service looks like a carry-over
      # from the Bedrock service — confirm the backend value the app expects.
      - CUSTOM_VLM_BACKEND=bedrock_vlm
      - GEMINI_API_KEY=${GEMINI_API_KEY:-}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
      # --- Default models ---
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      # --- OCR / VLM processing and outputs ---
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=False
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - ADD_VLM_BOUNDING_BOX_RULES=True

  # AWS-backed redaction app (no llama.cpp). The network alias keeps the
  # pi-agent URL (http://redaction-app-llama:7860) unchanged.
  redaction-app-llama-bedrock:
    <<: *redaction-app-build
    profiles: ["pi-bedrock"]
    # Values here override anything loaded from the optional env_file.
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=False
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
      - SHOW_HYBRID_MODELS=False
      - SHOW_VLM_MODEL_OPTIONS=False
      - SHOW_AWS_PII_DETECTION_OPTIONS=True
      - SHOW_AWS_BEDROCK_LLM_MODELS=True
      - SHOW_BEDROCK_VLM_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=AWS Textract
      - DEFAULT_LOCAL_OCR_MODEL=tesseract
      - DEFAULT_PII_DETECTION_MODEL=AWS Comprehend
      - CLOUD_VLM_MODEL_CHOICE=${CLOUD_VLM_MODEL_CHOICE:-amazon.nova-pro-v1:0}
      - CUSTOM_VLM_BACKEND=bedrock_vlm
      - HYBRID_TEXTRACT_BEDROCK_VLM=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - MAX_WORKERS=8
      - TESSERACT_MAX_WORKERS=4
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=False
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - ADD_VLM_BOUNDING_BOX_RULES=True
      - RUN_AWS_FUNCTIONS=True
      - PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS=${PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:-True}
      - AWS_REGION=${AWS_REGION:-eu-west-2}
      - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}
      - AWS_PROFILE=${AWS_PROFILE:-}
      - USAGE_LOG_FILE_NAME=usage_log.csv
      - SAVE_LOGS_TO_CSV=True
      - SAVE_LOGS_TO_DYNAMODB=False
      - SAVE_OUTPUTS_TO_S3=False
      - S3_OUTPUTS_FOLDER=${S3_OUTPUTS_FOLDER:-}
      - S3_OUTPUTS_BUCKET=${S3_OUTPUTS_BUCKET:-}
      - DOCUMENT_REDACTION_BUCKET=${DOCUMENT_REDACTION_BUCKET:-}
      - INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION=True

    volumes:
      - ./workspace:/home/user/app/workspace:rw
      # Host AWS config/credentials, read-write so SSO token refresh can write
      # to sso/cache. USERPROFILE is the Windows home directory; fall back to
      # HOME so the mount does not resolve to "/.aws" on Linux/macOS hosts.
      - "${USERPROFILE:-${HOME}}/.aws:/home/user/.aws:rw"
    networks:
      redaction-net-llama:
        aliases:
          - redaction-app-llama

  # Pi agent for the 27b_36 profile (Qwen 3.6 27B served by qwen36-27b_q4_gguf).
  pi-agent:
    <<: *pi-agent-common
    profiles:
      - "27b_36"
    # Wait for a healthy model server; the redaction app only needs to have started.
    depends_on:
      redaction-app-llama:
        condition: service_started
      qwen36-27b_q4_gguf:
        condition: service_healthy
    environment:
      <<: *pi-agent-env
      # Defaults name the same repository the model service loads via -hf.
      PI_LLAMA_MODEL_ID: "${PI_LLAMA_MODEL_ID:-unsloth/Qwen3.6-27B-MTP-GGUF}"
      PI_VLM_MODEL: "${PI_VLM_MODEL:-unsloth/Qwen3.6-27B-MTP-GGUF}"
      # The context window default equals the server's --ctx-size.
      PI_LLAMA_CONTEXT_WINDOW: "${PI_LLAMA_CONTEXT_WINDOW:-114688}"
      PI_LLAMA_MAX_TOKENS: "${PI_LLAMA_MAX_TOKENS:-32768}"

  # Pi agent for the 35b_36 profile (Qwen 3.6 35B served by qwen36-35b_q4_gguf).
  pi-agent-35b:
    <<: *pi-agent-common
    profiles: ["35b_36"]
    # Wait for a healthy model server; the redaction app only needs to have started.
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/Qwen3.6-35B-A3B-GGUF}
      # Default matches the --ctx-size (114688) that qwen36-35b_q4_gguf is
      # started with; a larger value would let the agent build prompts the
      # server cannot accept. Raise both together if you enlarge the context.
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-114688}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-65536}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/Qwen3.6-35B-A3B-GGUF}

  # Pi agent for the gemma4-31b profile (Gemma 4 31B served by gemma4-31b_q4_gguf).
  pi-agent-gemma-31b:
    <<: *pi-agent-common
    profiles: ["gemma4-31b"]
    # Wait for a healthy model server; the redaction app only needs to have started.
    depends_on:
      gemma4-31b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      # Defaults name the repository that gemma4-31b_q4_gguf actually loads via
      # -hf (the "qat" build), as the other pi-agent services do for their models.
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/gemma-4-31B-it-qat-GGUF}
      # Kept below the server's --ctx-size (114688).
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-65536}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-32768}
      PI_COMPACTION_RESERVE_TOKENS: ${PI_COMPACTION_RESERVE_TOKENS:-16384}
      PI_COMPACTION_KEEP_RECENT_TOKENS: ${PI_COMPACTION_KEEP_RECENT_TOKENS:-12288}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/gemma-4-31B-it-qat-GGUF}

  # Pi agent for the gemma4-26b profile (Gemma 4 26B served by gemma4-26b_q4_gguf).
  pi-agent-gemma-26b:
    <<: *pi-agent-common
    profiles:
      - "gemma4-26b"
    # Wait for a healthy model server; the redaction app only needs to have started.
    depends_on:
      redaction-app-llama:
        condition: service_started
      gemma4-26b_q4_gguf:
        condition: service_healthy
    environment:
      <<: *pi-agent-env
      # Defaults name the same repository the model service loads via -hf.
      PI_LLAMA_MODEL_ID: "${PI_LLAMA_MODEL_ID:-unsloth/gemma-4-26B-A4B-it-GGUF}"
      PI_VLM_MODEL: "${PI_VLM_MODEL:-unsloth/gemma-4-26B-A4B-it-GGUF}"
      # Kept below the server's --ctx-size (114688).
      PI_LLAMA_CONTEXT_WINDOW: "${PI_LLAMA_CONTEXT_WINDOW:-65536}"
      PI_LLAMA_MAX_TOKENS: "${PI_LLAMA_MAX_TOKENS:-32768}"

  # Pi agent for the pi-gemini profile: no local model server, only the
  # Gemini-configured redaction app.
  pi-agent-gemini:
    <<: *pi-agent-common
    profiles:
      - "pi-gemini"
    depends_on:
      redaction-app-llama-gemini:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_DEFAULT_PROVIDER: "google-gemini"
      PI_DEFAULT_MODEL: "${PI_DEFAULT_MODEL:-gemini-flash-latest}"
      PI_DEFAULT_OCR_METHOD: "hybrid-paddle-vlm"
      PI_DEFAULT_PII_METHOD: "Local"
      # `redaction-app-llama` is a network alias of redaction-app-llama-gemini.
      PI_VLM_BASE_URL: "http://redaction-app-llama:7860"
      PI_VLM_MODEL: "${PI_VLM_MODEL:-gemini-flash-latest}"

  # Pi agent for the pi-bedrock profile: no local model server, only the
  # AWS-configured redaction app.
  pi-agent-bedrock:
    <<: *pi-agent-common
    profiles: ["pi-bedrock"]
    # This list replaces (not extends) the volumes merged from pi-agent-common,
    # so the three shared mounts are repeated before adding the AWS directory.
    volumes:
      - .:/workspace/doc_redaction:rw
      - ./workspace:/home/user/app/workspace:rw
      - pi-agent-sessions:/home/user/.pi/agent/sessions
      # Host AWS config/credentials, read-write so SSO token refresh can write
      # to sso/cache. USERPROFILE is the Windows home directory; fall back to
      # HOME so the mount does not resolve to "/.aws" on Linux/macOS hosts.
      - "${USERPROFILE:-${HOME}}/.aws:/home/user/.aws:rw"
    depends_on:
      redaction-app-llama-bedrock:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_DEFAULT_PROVIDER: amazon-bedrock
      # PI_DEFAULT_MODEL: ${PI_DEFAULT_MODEL:-anthropic.claude-sonnet-4-6}
      # PI_DEFAULT_OCR_METHOD: AWS Textract service - all PDF types
      # PI_DEFAULT_PII_METHOD: AWS Comprehend
      # `redaction-app-llama` is a network alias of redaction-app-llama-bedrock.
      PI_VLM_BASE_URL: http://redaction-app-llama:7860
      # PI_VLM_MODEL: ${PI_VLM_MODEL:-anthropic.claude-sonnet-4-6}
      # Overrides the interpolated default merged from pi-agent-env.
      RUN_AWS_FUNCTIONS: "True"
      # AWS_REGION: ${AWS_REGION:-eu-west-2}
      # AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}
      # AWS_PROFILE: ${AWS_PROFILE:-${PI_AWS_PROFILE:-}}
      # PI_AWS_PROFILE: ${PI_AWS_PROFILE:-}

# Single bridge network joined by every service in this file.
networks:
  redaction-net-llama:
    driver: "bridge"

# Named volumes, all with default settings.
volumes:
  # llama.cpp download cache, one per model service.
  hf-llama-cache-qwen36-27b: {}
  hf-llama-cache-qwen36-35b: {}
  hf-llama-cache-gemma4-26b: {}
  hf-llama-cache-gemma4-31b: {}
  # Hugging Face hub cache, one per model service.
  hf-hub-cache-qwen36-27b: {}
  hf-hub-cache-qwen36-35b: {}
  hf-hub-cache-gemma4-26b: {}
  hf-hub-cache-gemma4-31b: {}
  # Mounted by the pi-agent services at /home/user/.pi/agent/sessions.
  pi-agent-sessions: {}