# Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
#   COMPOSE_PROFILES=35b_36     -> qwen36-35b_q4_gguf
#   COMPOSE_PROFILES=27b_36     -> qwen36-27b_q4_gguf
#   COMPOSE_PROFILES=gemma4-31b -> gemma4-31b_q4_gguf
#   COMPOSE_PROFILES=gemma4-26b -> gemma4-26b_q4_gguf
# The app always talks to http://llama-inference:8080 (shared network alias on every model service),
# so only one model profile should be active at a time.
# Each model service uses its own llama.cpp and Hugging Face hub cache volumes so mmproj-*.gguf
# (same filename per repo) and -hf downloads are not shared across profiles.
#
# Example CLI commands (all are recommended for 24GB VRAM systems minimum; add --build to the
# commands below if you want to rebuild the app images):
#   docker compose -f docker-compose_llama_agentic.yml --profile 35b_36 up -d
#   docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d
#   docker compose -f docker-compose_llama_agentic.yml --profile gemma4-31b up -d
#   docker compose -f docker-compose_llama_agentic.yml --profile gemma4-26b up -d
#
# For agentic usage with pi (pi-agent service name matches the active profile):
#   docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d pi-agent
#   docker compose -f docker-compose_llama_agentic.yml --profile 35b_36 up -d pi-agent-35b
#   docker compose -f docker-compose_llama_agentic.yml --profile gemma4-31b up -d pi-agent-gemma-31b
#   docker compose -f docker-compose_llama_agentic.yml --profile gemma4-26b up -d pi-agent-gemma-26b
#
# Cloud-only Pi agent (no local llama.cpp GPU model; redaction app still runs locally):
#   Set GEMINI_API_KEY (and optionally GOOGLE_API_KEY) in .env or config/pi_agent.env before starting pi-gemini.
#   docker compose -f docker-compose_llama_agentic.yml --profile pi-gemini up -d
#   Set AWS_REGION plus AWS credentials (or AWS_PROFILE via mounted ~/.aws) before starting pi-bedrock.
#   SSO (recommended): mount host ~/.aws (read-write — SSO token refresh writes to sso/cache),
#   set AWS_PROFILE (or PI_AWS_PROFILE) to your SSO profile name, run `aws sso login` on the host.
#   Pi requires AWS_PROFILE in the container env — mounting ~/.aws alone is not enough for Pi's auth check.
#   docker compose -f docker-compose_llama_agentic.yml --profile pi-bedrock up -d
#
# Optional Docker-only settings for redaction-app services: copy settings into
# config/docker_app_config.env (see config/docker_app_config.env.example). Loaded
# at container start; values in each service's environment: block override these.

# Optional env file shared by every redaction-app service.
x-redaction-app-env: &redaction-app-env
  env_file:
    - path: config/docker_app_config.env
      required: false

# Build, mounts and runtime settings shared by every pi-agent service.
x-pi-agent-common: &pi-agent-common
  build:
    context: .
    dockerfile: agent-redact/pi-agent/Dockerfile
    target: dev
  image: pi-agent-doc-redaction
  env_file:
    - path: config/pi_agent.env
      required: false
  ports:
    - "7862:7862"
  volumes:
    - .:/workspace/doc_redaction:rw
    - ./workspace:/home/user/app/workspace:rw
    - pi-agent-sessions:/home/user/.pi/agent/sessions
  working_dir: /workspace/doc_redaction
  stdin_open: true
  tty: true
  entrypoint: ["/bin/bash", "/workspace/doc_redaction/agent-redact/pi/start.sh"]
  networks:
    - redaction-net-llama

# Environment shared by every pi-agent service; each service merges this mapping
# and adds its model-specific keys on top.
x-pi-agent-env: &pi-agent-env
  APP_TYPE: pi
  APP_CONFIG_PATH: /workspace/doc_redaction/config/pi_agent.env
  HOME: /home/user
  PI_SKIP_VERSION_CHECK: "1"
  PI_OFFLINE: "1"
  DOC_REDACTION_GRADIO_URL: ${DOC_REDACTION_GRADIO_URL:-http://redaction-app-llama:7860}
  GRADIO_SERVER_NAME: ${GRADIO_SERVER_NAME:-0.0.0.0}
  PI_GRADIO_PORT: ${PI_GRADIO_PORT:-7862}
  GRADIO_SERVER_PORT: ${GRADIO_SERVER_PORT:-7862}
  PYTHONPATH: /workspace/doc_redaction:/workspace/doc_redaction/agent-redact/pi
  PI_WORKSPACE_DIR: /home/user/app/workspace
  # PI_DEFAULT_PROVIDER: ${PI_DEFAULT_PROVIDER:-llama-cpp}
  # PI_DEFAULT_MODEL: ${PI_DEFAULT_MODEL:-}
  # PI_DEFAULT_OCR_METHOD: ${PI_DEFAULT_OCR_METHOD:-hybrid-paddle-inference-server}
  # PI_DEFAULT_PII_METHOD: ${PI_DEFAULT_PII_METHOD:-Local}
  PI_LLAMA_BASE_URL: ${PI_LLAMA_BASE_URL:-http://llama-inference:8080/v1}
  GEMINI_API_KEY: ${GEMINI_API_KEY:-}
  GOOGLE_API_KEY: ${GOOGLE_API_KEY:-}
  AWS_REGION: ${AWS_REGION:-eu-west-2}
  AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}
  AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
  AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
  AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN:-}
  AWS_PROFILE: ${AWS_PROFILE:-}
  RUN_AWS_FUNCTIONS: ${RUN_AWS_FUNCTIONS:-False}
  PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: ${PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:-True}
  # PI_VLM_BASE_URL: http://llama-inference:8080
  RUN_FASTAPI: ${RUN_FASTAPI:-False}
  ROOT_PATH: ${ROOT_PATH:-}
  FASTAPI_ROOT_PATH: ${FASTAPI_ROOT_PATH:-/}
  ALLOWED_HOSTS: ${ALLOWED_HOSTS:-}
  ALLOWED_ORIGINS: ${ALLOWED_ORIGINS:-}
  COGNITO_AUTH: ${COGNITO_AUTH:-False}
  AWS_USER_POOL_ID: ${AWS_USER_POOL_ID:-}
  AWS_CLIENT_ID: ${AWS_CLIENT_ID:-}
  AWS_CLIENT_SECRET: ${AWS_CLIENT_SECRET:-}
  SESSION_OUTPUT_FOLDER: ${SESSION_OUTPUT_FOLDER:-True}
  SAVE_LOGS_TO_CSV: ${SAVE_LOGS_TO_CSV:-True}
  SAVE_LOGS_TO_DYNAMODB: ${SAVE_LOGS_TO_DYNAMODB:-False}
  SAVE_OUTPUTS_TO_S3: ${SAVE_OUTPUTS_TO_S3:-False}
  S3_OUTPUTS_FOLDER: ${S3_OUTPUTS_FOLDER:-}
  S3_OUTPUTS_BUCKET: ${S3_OUTPUTS_BUCKET:-}
  CUSTOM_HEADER: ${CUSTOM_HEADER:-}
  CUSTOM_HEADER_VALUE: ${CUSTOM_HEADER_VALUE:-}
  PI_MAX_PAGES: ${PI_MAX_PAGES:-${MAX_DOC_PAGES:-3000}}

# Image build, GPU reservation, port and workspace mount shared by every
# redaction-app service. Services that merge this add their own profiles and
# environment (and may replace volumes/networks; YAML merge is shallow).
x-redaction-app-build: &redaction-app-build
  <<: *redaction-app-env
  image: redaction-app-main
  build:
    context: .              # Look in the current folder
    dockerfile: Dockerfile  # Use this file
    target: gradio          # Use the 'gradio' stage from the Dockerfile
    args:                   # Build-time variables
      - TORCH_GPU_ENABLED=False
      - INSTALL_VLM=False
      - PADDLE_GPU_ENABLED=True
      - INSTALL_PADDLEOCR=True
  shm_size: '8gb'
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: all
            capabilities: [gpu]
  ports:
    - "7861:7860"
  volumes:
    - ./workspace:/home/user/app/workspace:rw
  networks:
    - redaction-net-llama

services:
  # Qwen 3.6 27B model setup below requires 40GB of VRAM to run.
  # For 24GB, change the --hf-file parameter to Qwen3.6-27B-UD-Q4_K_XL.gguf or Qwen3.6-27B-IQ4_NL.gguf.
  qwen36-27b_q4_gguf:
    profiles: ["27b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-27B-MTP-GGUF
      - --hf-file
      - Qwen3.6-27B-UD-Q6_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF/resolve/main/mmproj-BF16.gguf
      - --n-gpu-layers
      - "-1"
      # - -mg
      # - "0"
      # - -dev
      # - "cuda0,cuda1"
      # - -sm
      # - "row"
      - --tensor-split
      - "24,14"
      - --ctx-size
      - "114688"
      - -ub
      - "512"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.95"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "0.0"
      - --chat-template-kwargs
      - '{"preserve_thinking": true}'
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --image_min_tokens
      - "300"
      - --parallel
      - "1"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --spec-type
      - "draft-mtp"
      - --spec-draft-n-max
      - "2"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # Long start period: the first start has to download the model files.
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # Qwen 3.6 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0.
  # For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system,
  # and n-cpu-moe ~ 20 for a 16GB VRAM system.
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-BF16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "114688"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "20"
      - --top-p
      - "0.95"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1.5"
      - --repeat-penalty
      - "1"
      - --chat-template-kwargs
      - '{"preserve_thinking": true}'
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"  # Increase this value to fit within your available VRAM
      - --image_min_tokens
      - "300"
      - --parallel
      - "1"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # Gemma 4 31B model setup below requires 40GB of VRAM to run with the following settings.
  gemma4-31b_q4_gguf:
    profiles: ["gemma4-31b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-31B-it-qat-GGUF
      - --hf-file
      - gemma-4-31B-it-qat-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-qat-GGUF/resolve/main/mmproj-BF16.gguf
      # - --no-mmproj
      - --n-gpu-layers
      - "-1"
      # - --tensor-split
      # - "24,16"
      - --ctx-size
      - "114688"
      - -ub
      - "1024"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "1.0"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --parallel
      - "1"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --chat-template-kwargs
      - '{"enable_thinking": false}'
      - --reasoning
      - "off"
      - --image_min_tokens
      - "300"
      - --image_max_tokens
      - "1800"
      - --split-mode
      - "layer"
      # - --spec-type
      # - "draft-mtp"
      # - --spec-draft-n-max
      # - "3"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-31b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # Gemma 4 26B model setup below requires 24GB+ of VRAM to run.
  gemma4-26b_q4_gguf:
    profiles: ["gemma4-26b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-26B-A4B-it-GGUF
      - --hf-file
      - gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "114688"
      # - --no-mmproj
      # - -mg
      # - "0"
      # - -dev
      # - "cuda0,cuda1"
      - -dev
      - "cuda0"
      # - -sm
      # - "row"
      # - --tensor-split
      # - "24,16"
      - -ub
      - "1024"
      - --fit
      - "off"
      - --temp
      - "0.1"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --parallel
      - "1"
      # - --chat-template-kwargs
      # - '{"enable_thinking": false}'
      # - --reasoning
      # - "off"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      # - --image_min_tokens
      # - "300"
    ports:
      # NOTE(review): same host port as gemma4-31b_q4_gguf, so the two Gemma
      # profiles cannot be up at the same time — confirm this is intended.
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-26b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  # Redaction app backed by whichever local llama.cpp model profile is active.
  redaction-app-llama:
    <<: *redaction-app-build
    profiles: ["35b_36", "27b_36", "gemma4-31b", "gemma4-26b"]
    # required: false lets the app start with only the model service of the
    # active profile; the other model services are simply not present.
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen36-27b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-31b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-26b_q4_gguf:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - INFERENCE_SERVER_API_URL=http://llama-inference:8080
      # Empty values: in list form, KEY="" would pass the two quote characters
      # literally to the container, so the value is left blank after '='.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1  # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
      - RUN_MCP_SERVER=True

  # Cloud-backed redaction app (no llama.cpp). Network alias keeps pi-agent URL unchanged.
  redaction-app-llama-gemini:
    <<: *redaction-app-build
    profiles: ["pi-gemini"]
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      # NOTE(review): unlike the llama and bedrock apps, APP_MODE=fastapi is not
      # set here — confirm whether that is intentional.
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=False
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
      - SHOW_HYBRID_MODELS=False
      - SHOW_VLM_MODEL_OPTIONS=False
      - SHOW_GEMINI_VLM_MODELS=True
      - SHOW_GEMINI_LLM_MODELS=True
      - SHOW_GEMINI_LLM_PII_OPTIONS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CLOUD_VLM_MODEL_CHOICE=${CLOUD_VLM_MODEL_CHOICE:-gemini-flash-latest}
      # NOTE(review): backend is bedrock_vlm although this is the Gemini
      # profile — looks like a copy from the bedrock service; verify against
      # the values the app accepts before changing.
      - CUSTOM_VLM_BACKEND=bedrock_vlm
      - SHOW_CUSTOM_VLM_ENTITIES=False
      - MAX_WORKERS=8
      - TESSERACT_MAX_WORKERS=4
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=False
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - ADD_VLM_BOUNDING_BOX_RULES=True
      - GEMINI_API_KEY=${GEMINI_API_KEY:-}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
    networks:
      redaction-net-llama:
        aliases:
          - redaction-app-llama

  # AWS-backed redaction app (Textract / Comprehend / Bedrock, no llama.cpp).
  # Network alias keeps pi-agent URL unchanged.
  redaction-app-llama-bedrock:
    <<: *redaction-app-build
    profiles: ["pi-bedrock"]
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=False
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
      - SHOW_HYBRID_MODELS=False
      - SHOW_VLM_MODEL_OPTIONS=False
      - SHOW_AWS_PII_DETECTION_OPTIONS=True
      - SHOW_AWS_BEDROCK_LLM_MODELS=True
      - SHOW_BEDROCK_VLM_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=AWS Textract
      - DEFAULT_LOCAL_OCR_MODEL=tesseract
      - DEFAULT_PII_DETECTION_MODEL=AWS Comprehend
      - CLOUD_VLM_MODEL_CHOICE=${CLOUD_VLM_MODEL_CHOICE:-amazon.nova-pro-v1:0}
      - CUSTOM_VLM_BACKEND=bedrock_vlm
      - HYBRID_TEXTRACT_BEDROCK_VLM=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - MAX_WORKERS=8
      - TESSERACT_MAX_WORKERS=4
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=90
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=False
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - ADD_VLM_BOUNDING_BOX_RULES=True
      - RUN_AWS_FUNCTIONS=True
      - PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS=${PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:-True}
      - AWS_REGION=${AWS_REGION:-eu-west-2}
      - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}
      - AWS_PROFILE=${AWS_PROFILE:-}
      - USAGE_LOG_FILE_NAME=usage_log.csv
      - SAVE_LOGS_TO_CSV=True
      - SAVE_LOGS_TO_DYNAMODB=False
      - SAVE_OUTPUTS_TO_S3=False
      - S3_OUTPUTS_FOLDER=${S3_OUTPUTS_FOLDER:-}
      - S3_OUTPUTS_BUCKET=${S3_OUTPUTS_BUCKET:-}
      - DOCUMENT_REDACTION_BUCKET=${DOCUMENT_REDACTION_BUCKET:-}
      - INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION=True
    # Replaces the shared volumes list (merge is shallow), so the workspace
    # mount is restated next to the AWS config mount.
    volumes:
      - ./workspace:/home/user/app/workspace:rw
      # USERPROFILE is the Windows home directory; fall back to HOME so the
      # mount also resolves on Linux/macOS hosts.
      - ${USERPROFILE:-${HOME}}/.aws:/home/user/.aws:rw
    networks:
      redaction-net-llama:
        aliases:
          - redaction-app-llama

  # Pi agent for the Qwen 3.6 27B profile.
  pi-agent:
    <<: *pi-agent-common
    profiles: ["27b_36"]
    depends_on:
      qwen36-27b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/Qwen3.6-27B-MTP-GGUF}
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-114688}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-32768}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/Qwen3.6-27B-MTP-GGUF}

  # Pi agent for the Qwen 3.6 35B profile.
  pi-agent-35b:
    <<: *pi-agent-common
    profiles: ["35b_36"]
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/Qwen3.6-35B-A3B-GGUF}
      # Default matches --ctx-size of qwen36-35b_q4_gguf (114688); raise both
      # together if a larger context is wanted.
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-114688}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-65536}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/Qwen3.6-35B-A3B-GGUF}

  # Pi agent for the Gemma 4 31B profile.
  pi-agent-gemma-31b:
    <<: *pi-agent-common
    profiles: ["gemma4-31b"]
    depends_on:
      gemma4-31b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      # Model ids default to the repo that gemma4-31b_q4_gguf loads via -hf.
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/gemma-4-31B-it-qat-GGUF}
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-65536}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-32768}
      PI_COMPACTION_RESERVE_TOKENS: ${PI_COMPACTION_RESERVE_TOKENS:-16384}
      PI_COMPACTION_KEEP_RECENT_TOKENS: ${PI_COMPACTION_KEEP_RECENT_TOKENS:-12288}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/gemma-4-31B-it-qat-GGUF}

  # Pi agent for the Gemma 4 26B profile.
  pi-agent-gemma-26b:
    <<: *pi-agent-common
    profiles: ["gemma4-26b"]
    depends_on:
      gemma4-26b_q4_gguf:
        condition: service_healthy
      redaction-app-llama:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_LLAMA_MODEL_ID: ${PI_LLAMA_MODEL_ID:-unsloth/gemma-4-26B-A4B-it-GGUF}
      PI_LLAMA_CONTEXT_WINDOW: ${PI_LLAMA_CONTEXT_WINDOW:-65536}
      PI_LLAMA_MAX_TOKENS: ${PI_LLAMA_MAX_TOKENS:-32768}
      PI_VLM_MODEL: ${PI_VLM_MODEL:-unsloth/gemma-4-26B-A4B-it-GGUF}

  # Pi agent using Google Gemini (no local model service).
  pi-agent-gemini:
    <<: *pi-agent-common
    profiles: ["pi-gemini"]
    depends_on:
      redaction-app-llama-gemini:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_DEFAULT_PROVIDER: google-gemini
      PI_DEFAULT_MODEL: ${PI_DEFAULT_MODEL:-gemini-flash-latest}
      PI_DEFAULT_OCR_METHOD: hybrid-paddle-vlm
      PI_DEFAULT_PII_METHOD: Local
      PI_VLM_BASE_URL: http://redaction-app-llama:7860
      PI_VLM_MODEL: ${PI_VLM_MODEL:-gemini-flash-latest}

  # Pi agent using Amazon Bedrock (no local model service).
  pi-agent-bedrock:
    <<: *pi-agent-common
    profiles: ["pi-bedrock"]
    # Replaces the shared volumes list (merge is shallow), so the three common
    # mounts are restated next to the AWS config mount.
    volumes:
      - .:/workspace/doc_redaction:rw
      - ./workspace:/home/user/app/workspace:rw
      - pi-agent-sessions:/home/user/.pi/agent/sessions
      # USERPROFILE is the Windows home directory; fall back to HOME so the
      # mount also resolves on Linux/macOS hosts.
      - ${USERPROFILE:-${HOME}}/.aws:/home/user/.aws:rw
    depends_on:
      redaction-app-llama-bedrock:
        condition: service_started
    environment:
      <<: *pi-agent-env
      PI_DEFAULT_PROVIDER: amazon-bedrock
      # PI_DEFAULT_MODEL: ${PI_DEFAULT_MODEL:-anthropic.claude-sonnet-4-6}
      # PI_DEFAULT_OCR_METHOD: AWS Textract service - all PDF types
      # PI_DEFAULT_PII_METHOD: AWS Comprehend
      PI_VLM_BASE_URL: http://redaction-app-llama:7860
      # PI_VLM_MODEL: ${PI_VLM_MODEL:-anthropic.claude-sonnet-4-6}
      RUN_AWS_FUNCTIONS: "True"
      # AWS_REGION: ${AWS_REGION:-eu-west-2}
      # AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-${AWS_REGION:-eu-west-2}}
      # AWS_PROFILE: ${AWS_PROFILE:-${PI_AWS_PROFILE:-}}
      # PI_AWS_PROFILE: ${PI_AWS_PROFILE:-}

networks:
  redaction-net-llama:
    driver: bridge

volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen36-27b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen36-27b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b:
  pi-agent-sessions: