Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase commited on 6 days ago

Commit

a2e06b3

0 Parent(s):

Sync: Merge pull request #199 from seanpedrick-case/startup_optimise

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.coveragerc +56 -0
.dockerignore +57 -0
.gitattributes +9 -0
.github/scripts/setup_test_data.py +320 -0
.github/workflow_README.md +183 -0
.github/workflows/archive_workflows/multi-os-test.yml +115 -0
.github/workflows/ci.yml +269 -0
.github/workflows/simple-test.yml +74 -0
.github/workflows/sync-pi-agent-space.yml +64 -0
.github/workflows/sync_to_hf.yml +54 -0
.github/workflows/sync_to_hf_zero_gpu.yml +59 -0
.gitignore +74 -0
AGENTS.md +113 -0
Dockerfile +235 -0
LICENSE +661 -0
MANIFEST.in +4 -0
README.md +367 -0
README_PYPI.md +351 -0
agent-redact/README.md +29 -0
agent-redact/pi-agent/.dockerignore +10 -0
agent-redact/pi-agent/.gitattributes +2 -0
agent-redact/pi-agent/Dockerfile +176 -0
agent-redact/pi-agent/README.md +46 -0
agent-redact/pi-agent/entrypoint-ecs.sh +12 -0
agent-redact/pi-agent/entrypoint.sh +36 -0
agent-redact/pi-agent/sync-manifest.txt +12 -0
agent-redact/pi-agent/sync_to_space.sh +42 -0
agent-redact/pi/agent/README.md +194 -0
agent-redact/pi/agent/models.json +31 -0
agent-redact/pi/agent/settings.json +32 -0
agent-redact/pi/bootstrap_pi_config.py +192 -0
agent-redact/pi/gradio_app.py +0 -0
agent-redact/pi/output_files.py +423 -0
agent-redact/pi/pi_agent_config.py +857 -0
agent-redact/pi/pi_examples.py +180 -0
agent-redact/pi/pi_rpc_client.py +989 -0
agent-redact/pi/pi_session_usage.py +185 -0
agent-redact/pi/pi_workspace_skills.py +392 -0
agent-redact/pi/redaction_prompt.py +756 -0
agent-redact/pi/remote_redaction.py +410 -0
agent-redact/pi/run_doc_redact.py +87 -0
agent-redact/pi/session_logs.py +124 -0
agent-redact/pi/session_workspace.py +212 -0
agent-redact/pi/start.sh +26 -0
agent-redact/requirements_pi_agent.txt +34 -0
agent_routes.py +1167 -0
app.py +0 -0
cdk/__init__.py +0 -0
cdk/app.py +123 -0
cdk/cdk.json.example +7 -0

.coveragerc ADDED Viewed

	@@ -0,0 +1,56 @@

+[run]
+source = .
+omit =
+    */tests/*
+    */test/*
+    */__pycache__/*
+    */venv/*
+    */env/*
+    */build/*
+    */dist/*
+    */cdk/*
+    */docs/*
+    */example_data/*
+    */examples/*
+    */feedback/*
+    */logs/*
+    */old_code/*
+    */output/*
+    */tmp/*
+    */usage/*
+    */tld/*
+    */tesseract/*
+    */poppler/*
+    config*.py
+    setup.py
+    lambda_entrypoint.py
+    entrypoint.sh
+    cli_redact.py
+    load_dynamo_logs.py
+    load_s3_logs.py
+    *.spec
+    Dockerfile
+    *.qmd
+    *.md
+    *.txt
+    *.yml
+    *.yaml
+    *.json
+    *.csv
+    *.env
+    *.bat
+    *.ps1
+    *.sh
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod

.dockerignore ADDED Viewed

	@@ -0,0 +1,57 @@

+*.url
+*.ipynb
+*.pyc
+*.qmd
+*.json.bak.*
+_quarto.yml
+quarto_site/*
+src/*
+redaction_deps/*
+.venv/*
+examples/*
+processing/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+docs/*
+.pi/*
+build_deps/*
+user_guide/*
+_extensions/*
+workspace/*
+doc_redaction.egg-info/*
+.venv_pypi_test/*
+cdk/config/*
+tld/*
+cdk/config/*
+cdk/cdk.out/*
+cdk/archive/*
+cdk.json
+cdk.context.json
+.quarto/*
+logs/
+output/
+input/
+feedback/
+# Exclude local secrets; allow committed *.example templates (Pi agent + main app images).
+config/*
+!config/pi_agent.env.example
+!config/app_config.env.example
+!config/docker_app_config.env.example
+usage/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
+docker_compose/*
+skills/example_prompts/*

.gitattributes ADDED Viewed

	@@ -0,0 +1,9 @@

+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.sh text eol=lf
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.xls filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.doc filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.ico filter=lfs diff=lfs merge=lfs -text

.github/scripts/setup_test_data.py ADDED Viewed

	@@ -0,0 +1,320 @@

+#!/usr/bin/env python3
+"""
+Setup script for GitHub Actions test data.
+Creates dummy test files when example data is not available.
+"""
+import os
+import sys
+import pandas as pd
+def create_directories():
+    """Create necessary directories."""
+    dirs = ["doc_redaction/example_data", "doc_redaction/example_data/example_outputs"]
+    for dir_path in dirs:
+        os.makedirs(dir_path, exist_ok=True)
+        print(f"Created directory: {dir_path}")
+def create_dummy_pdf():
+    """Create dummy PDFs for testing."""
+    # Install reportlab if not available
+    try:
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        import subprocess
+        subprocess.check_call(["pip", "install", "reportlab"])
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+    try:
+        # Create the main test PDF
+        pdf_path = "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
+        print(f"Creating PDF: {pdf_path}")
+        print(f"Directory exists: {os.path.exists('doc_redaction/example_data')}")
+        c = canvas.Canvas(pdf_path, pagesize=letter)
+        c.drawString(100, 750, "This is a test document for redaction testing.")
+        c.drawString(100, 700, "Email: test@example.com")
+        c.drawString(100, 650, "Phone: 123-456-7890")
+        c.drawString(100, 600, "Name: John Doe")
+        c.drawString(100, 550, "Address: 123 Test Street, Test City, TC 12345")
+        c.showPage()
+        # Add second page
+        c.drawString(100, 750, "Second page content")
+        c.drawString(100, 700, "More test data: jane.doe@example.com")
+        c.drawString(100, 650, "Another phone: 987-654-3210")
+        c.save()
+        print(f"Created dummy PDF: {pdf_path}")
+        # Create Partnership Agreement Toolkit PDF
+        partnership_pdf_path = (
+            "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf"
+        )
+        print(f"Creating PDF: {partnership_pdf_path}")
+        c = canvas.Canvas(partnership_pdf_path, pagesize=letter)
+        c.drawString(100, 750, "Partnership Agreement Toolkit")
+        c.drawString(100, 700, "This is a test partnership agreement document.")
+        c.drawString(100, 650, "Contact: partnership@example.com")
+        c.drawString(100, 600, "Phone: (555) 123-4567")
+        c.drawString(100, 550, "Address: 123 Partnership Street, City, State 12345")
+        c.showPage()
+        # Add second page
+        c.drawString(100, 750, "Page 2 - Partnership Details")
+        c.drawString(100, 700, "More partnership information here.")
+        c.drawString(100, 650, "Contact: info@partnership.org")
+        c.showPage()
+        # Add third page
+        c.drawString(100, 750, "Page 3 - Terms and Conditions")
+        c.drawString(100, 700, "Terms and conditions content.")
+        c.drawString(100, 650, "Legal contact: legal@partnership.org")
+        c.save()
+        print(f"Created dummy PDF: {partnership_pdf_path}")
+        # Create Graduate Job Cover Letter PDF
+        cover_letter_pdf_path = (
+            "doc_redaction/example_data/graduate-job-example-cover-letter.pdf"
+        )
+        print(f"Creating PDF: {cover_letter_pdf_path}")
+        c = canvas.Canvas(cover_letter_pdf_path, pagesize=letter)
+        c.drawString(100, 750, "Cover Letter Example")
+        c.drawString(100, 700, "Dear Hiring Manager,")
+        c.drawString(100, 650, "I am writing to apply for the position.")
+        c.drawString(100, 600, "Contact: applicant@example.com")
+        c.drawString(100, 550, "Phone: (555) 987-6543")
+        c.drawString(100, 500, "Address: 456 Job Street, Employment City, EC 54321")
+        c.drawString(100, 450, "Sincerely,")
+        c.drawString(100, 400, "John Applicant")
+        c.save()
+        print(f"Created dummy PDF: {cover_letter_pdf_path}")
+    except ImportError:
+        print("ReportLab not available, skipping PDF creation")
+        # Create simple text files instead
+        with open(
+            "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy PDF file for testing")
+        with open(
+            "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy Partnership Agreement PDF file for testing")
+        with open(
+            "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
+            "w",
+        ) as f:
+            f.write("This is a dummy cover letter PDF file for testing")
+        print("Created dummy text files instead of PDFs")
+def create_dummy_csv():
+    """Create dummy CSV files for testing."""
+    # Main CSV
+    csv_data = {
+        "Case Note": [
+            "Client visited for consultation regarding housing issues",
+            "Follow-up appointment scheduled for next week",
+            "Documentation submitted for review",
+        ],
+        "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
+        "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
+    }
+    df = pd.DataFrame(csv_data)
+    df.to_csv("doc_redaction/example_data/combined_case_notes.csv", index=False)
+    print("Created dummy CSV: doc_redaction/example_data/combined_case_notes.csv")
+    # Lambeth CSV
+    lambeth_data = {
+        "text": [
+            "Lambeth 2030 vision document content",
+            "Our Future Our Lambeth strategic plan",
+            "Community engagement and development",
+        ],
+        "page": [1, 2, 3],
+    }
+    df_lambeth = pd.DataFrame(lambeth_data)
+    df_lambeth.to_csv(
+        "doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
+        index=False,
+    )
+    print(
+        "Created dummy CSV: doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
+    )
+def create_dummy_word_doc():
+    """Create dummy Word document."""
+    try:
+        from docx import Document
+        doc = Document()
+        doc.add_heading("Test Document for Redaction", 0)
+        doc.add_paragraph("This is a test document for redaction testing.")
+        doc.add_paragraph("Contact Information:")
+        doc.add_paragraph("Email: test@example.com")
+        doc.add_paragraph("Phone: 123-456-7890")
+        doc.add_paragraph("Name: John Doe")
+        doc.add_paragraph("Address: 123 Test Street, Test City, TC 12345")
+        doc.save(
+            "doc_redaction/example_data/Bold minimalist professional cover letter.docx"
+        )
+        print("Created dummy Word document")
+    except ImportError:
+        print("python-docx not available, skipping Word document creation")
+def create_allow_deny_lists():
+    """Create dummy allow/deny lists."""
+    # Allow lists
+    allow_data = {"word": ["test", "example", "document"]}
+    pd.DataFrame(allow_data).to_csv(
+        "doc_redaction/example_data/test_allow_list_graduate.csv", index=False
+    )
+    pd.DataFrame(allow_data).to_csv(
+        "doc_redaction/example_data/test_allow_list_partnership.csv", index=False
+    )
+    print("Created allow lists")
+    # Deny lists
+    deny_data = {"word": ["sensitive", "confidential", "private"]}
+    pd.DataFrame(deny_data).to_csv(
+        "doc_redaction/example_data/partnership_toolkit_redact_custom_deny_list.csv",
+        index=False,
+    )
+    pd.DataFrame(deny_data).to_csv(
+        "doc_redaction/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
+        index=False,
+    )
+    print("Created deny lists")
+    # Whole page redaction list
+    page_data = {"page": [1, 2]}
+    pd.DataFrame(page_data).to_csv(
+        "doc_redaction/example_data/partnership_toolkit_redact_some_pages.csv",
+        index=False,
+    )
+    print("Created whole page redaction list")
+def create_ocr_output():
+    """Create dummy OCR output CSV."""
+    ocr_data = {
+        "page": [1, 2, 3],
+        "text": [
+            "This is page 1 content with some text",
+            "This is page 2 content with different text",
+            "This is page 3 content with more text",
+        ],
+        "left": [0.1, 0.3, 0.5],
+        "top": [0.95, 0.92, 0.88],
+        "width": [0.05, 0.02, 0.02],
+        "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3],
+    }
+    df = pd.DataFrame(ocr_data)
+    df.to_csv(
+        "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
+        index=False,
+    )
+    print("Created dummy OCR output CSV")
+def create_dummy_image():
+    """Create dummy image for testing."""
+    try:
+        from PIL import Image, ImageDraw, ImageFont
+        img = Image.new("RGB", (800, 600), color="white")
+        draw = ImageDraw.Draw(img)
+        # Try to use a system font
+        try:
+            font = ImageFont.truetype(
+                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
+            )
+        except Exception as e:
+            print(f"Error loading DejaVuSans font: {e}")
+            try:
+                font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
+            except Exception as e:
+                print(f"Error loading Arial font: {e}")
+                font = ImageFont.load_default()
+        # Add text to image
+        draw.text((50, 50), "Test Document for Redaction", fill="black", font=font)
+        draw.text((50, 100), "Email: test@example.com", fill="black", font=font)
+        draw.text((50, 150), "Phone: 123-456-7890", fill="black", font=font)
+        draw.text((50, 200), "Name: John Doe", fill="black", font=font)
+        draw.text((50, 250), "Address: 123 Test Street", fill="black", font=font)
+        img.save("doc_redaction/example_data/example_complaint_letter.jpg")
+        print("Created dummy image")
+    except ImportError:
+        print("PIL not available, skipping image creation")
+def main():
+    """Main setup function."""
+    print("Setting up test data for GitHub Actions...")
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Python version: {sys.version}")
+    create_directories()
+    create_dummy_pdf()
+    create_dummy_csv()
+    create_dummy_word_doc()
+    create_allow_deny_lists()
+    create_ocr_output()
+    create_dummy_image()
+    print("\nTest data setup complete!")
+    print("Created files:")
+    for root, dirs, files in os.walk("doc_redaction/example_data"):
+        for file in files:
+            file_path = os.path.join(root, file)
+            print(f"  {file_path}")
+            # Verify the file exists and has content
+            if os.path.exists(file_path):
+                file_size = os.path.getsize(file_path)
+                print(f"    Size: {file_size} bytes")
+            else:
+                print("    WARNING: File does not exist!")
+    # Verify critical files exist
+    critical_files = [
+        "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+        "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
+        "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
+    ]
+    print("\nVerifying critical test files:")
+    for file_path in critical_files:
+        if os.path.exists(file_path):
+            file_size = os.path.getsize(file_path)
+            print(f"✅ {file_path} exists ({file_size} bytes)")
+        else:
+            print(f"❌ {file_path} MISSING!")
+if __name__ == "__main__":
+    main()

.github/workflow_README.md ADDED Viewed

	@@ -0,0 +1,183 @@

+# GitHub Actions CI/CD Setup
+This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
+## Workflows Overview
+### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
+- **Purpose**: Basic test execution
+- **Triggers**: Push to `dev`, pull requests targeting `dev`, manual dispatch (`workflow_dispatch`)
+- **OS**: Ubuntu Latest
+- **Python**: 3.12
+- **Features**:
+  - Installs system dependencies
+  - Sets up test data
+  - Runs CLI tests
+  - Runs pytest
+### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
+- **Purpose**: Full CI/CD pipeline
+- **Features**:
+  - Linting (Ruff, Black)
+  - Unit tests (Python 3.11, 3.12, 3.13)
+  - Integration tests
+  - Security scanning (Bandit; the Safety scan is currently disabled)
+  - Coverage reporting
+  - Package building (on main branch)
+### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`, archived)
+- **Purpose**: Cross-platform testing
+- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
+- **Python**: 3.11, 3.12, 3.13
+- **Features**: Tests compatibility across different operating systems
+### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
+- **Purpose**: Original test workflow
+- **Features**:
+  - Multiple Python versions
+  - System dependency installation
+  - Test data creation
+  - Coverage reporting
+## Setup Scripts
+### Test Data Setup (`.github/scripts/setup_test_data.py`)
+Creates dummy test files when example data is not available:
+- PDF documents
+- CSV files
+- Word documents
+- Images
+- Allow/deny lists
+- OCR output files
+## Usage
+### Running Tests Locally
+```bash
+# Install dependencies
+pip install -r requirements.txt
+pip install pytest pytest-cov
+# Setup test data
+python .github/scripts/setup_test_data.py
+# Run tests
+cd test
+python cli_epilog_suite.py
+```
+### GitHub Actions Triggers
+1. **Push to main/dev**: Runs all tests
+2. **Pull Request**: Runs tests and linting
+3. **Daily Schedule**: Runs tests at 2 AM UTC (currently disabled; the `schedule` trigger in `ci.yml` is commented out)
+4. **Manual Trigger**: Can be triggered manually from GitHub
+## Configuration
+### Environment Variables
+- `PYTHON_VERSION`: Default Python version (3.11)
+- `PYTHONPATH`: Set automatically for test discovery
+### Caching
+- Pip dependencies are cached for faster builds
+- Cache key based on the hash of `requirements_lightweight.txt`
+### Artifacts
+- Test results (JUnit XML)
+- Coverage reports (HTML, XML)
+- Security reports
+- Build artifacts (on main branch)
+## Test Data
+The workflows automatically create test data when example files are missing:
+### Required Files Created (paths relative to `doc_redaction/`):
+- `example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
+- `example_data/combined_case_notes.csv`
+- `example_data/Bold minimalist professional cover letter.docx`
+- `example_data/example_complaint_letter.jpg`
+- `example_data/test_allow_list_*.csv`
+- `example_data/partnership_toolkit_redact_*.csv`
+- `example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
+### Dependencies Installed:
+- **System**: tesseract-ocr, poppler-utils, OpenGL libraries
+- **Python**: All `requirements_lightweight.txt` packages + pytest, pytest-cov, reportlab, pillow
+## Workflow Status
+### Success Criteria:
+- ✅ All tests pass
+- ✅ No linting errors
+- ✅ Security checks pass
+- ✅ Coverage meets threshold (if configured)
+### Failure Handling:
+- Tests are designed to skip gracefully if files are missing
+- AWS tests are expected to fail without credentials
+- System dependency failures are handled with fallbacks
+## Customization
+### Adding New Tests:
+1. Add test methods to `test/cli_epilog_suite.py` or pytest files under `test/test_*.py`
+2. Update test data in `setup_test_data.py` if needed
+3. Tests will automatically run in all workflows
+### Modifying Workflows:
+1. Edit the appropriate `.yml` file
+2. Test locally first
+3. Push to trigger the workflow
+### Environment-Specific Settings:
+- **Ubuntu**: Full system dependencies
+- **Windows**: Python packages only
+- **macOS**: Homebrew dependencies
+## Troubleshooting
+### Common Issues:
+1. **Missing Dependencies**:
+   - Check system dependency installation
+   - Verify Python package versions
+2. **Test Failures**:
+   - Check test data creation
+   - Verify file paths
+   - Review test output logs
+3. **AWS Test Failures**:
+   - Expected without credentials
+   - Tests are designed to handle this gracefully
+4. **System Dependency Issues**:
+   - Different OS have different requirements
+   - Check the specific OS section in workflows
+### Debug Mode:
+Add `--verbose` or `-v` flags to pytest commands for more detailed output.
+## Security
+- Dependency scanning with Safety is currently disabled in `ci.yml` (the scan now requires a login)
+- Code is scanned with Bandit
+- No secrets are exposed in logs
+- Test data is temporary and cleaned up
+## Performance
+- Tests run in parallel where possible
+- Dependencies are cached
+- Only necessary system packages are installed
+- Test data is created efficiently
+## Monitoring
+- Workflow status is visible in GitHub Actions tab
+- Coverage reports (HTML, XML) are uploaded as workflow artifacts; the Codecov upload step is currently commented out
+- Test results are available as artifacts
+- Security reports are generated and stored

.github/workflows/archive_workflows/multi-os-test.yml ADDED Viewed

	@@ -0,0 +1,115 @@

+name: Multi-OS Test
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+permissions:
+  contents: read
+  actions: read
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    env:
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
+        python-version: ["3.11", "3.12", "3.13"]
+        exclude:
+          # Exclude some combinations to reduce CI time.
+          # Each exclude entry must name ONE scalar value per key: a list such as
+          # ["3.12", "3.13"] never equals a matrix value, so it excludes nothing.
+          #- os: windows-latest
+          #  python-version: "3.12"
+          #- os: windows-latest
+          #  python-version: "3.13"
+          - os: macos-latest
+            python-version: "3.12"
+          - os: macos-latest
+            python-version: "3.13"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install system dependencies (Ubuntu)
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Install system dependencies (macOS)
+      if: matrix.os == 'macos-latest'
+      run: |
+        brew install tesseract poppler
+    - name: Install system dependencies (Windows)
+      if: matrix.os == 'windows-latest'
+      run: |
+        # Create tools directory
+        if (!(Test-Path "C:\tools")) {
+            mkdir C:\tools
+        }
+        # Download and install Tesseract
+        $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
+        $tesseractInstaller = "C:\tools\tesseract-installer.exe"
+        Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
+        # Install Tesseract silently
+        Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
+        # Download and extract Poppler
+        $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
+        $popplerZip = "C:\tools\poppler.zip"
+        Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
+        # Extract Poppler
+        Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
+        # Add to PATH
+        echo "C:\tools\tesseract" >> $env:GITHUB_PATH
+        echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
+        # Set environment variables for your application
+        echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
+        echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
+        echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
+        # Verify installation using full paths (since PATH won't be updated in current session)
+        & "C:\tools\tesseract\tesseract.exe" --version
+        & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+    - name: Run CLI tests
+      run: |
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest
+      run: |
+        pytest test/ -v --tb=short

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,269 @@

+name: CI/CD Pipeline
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+  #schedule:
+  # Run tests daily at 2 AM UTC
+  #  - cron: '0 2 * * *'
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+  issues: write
+env:
+  PYTHON_VERSION: "3.11"
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff black
+    - name: Run Ruff linter
+      run: ruff check .
+    - name: Run Black formatter check
+      run: black --check .
+  test-unit:
+    runs-on: ubuntu-latest
+    env:
+      # Avoid optional VLM/torch import path in tools.run_vlm (not installed in lightweight CI deps)
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    strategy:
+      matrix:
+        # Quote versions so YAML keeps them as strings (an unquoted 3.10 would be read as the float 3.1)
+        python-version: ["3.11", "3.12", "3.13"]
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Cache pip dependencies
+      uses: actions/cache@v5
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('requirements_lightweight.txt') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    - name: Clean up problematic config files
+      run: |
+        rm -f config*.py || true
+    - name: Run CLI tests
+      run: |
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest (JUnit and coverage)
+      run: |
+        pytest test/ -v --tb=short \
+          --junitxml=test-results.xml \
+          --cov=. --cov-config=.coveragerc \
+          --cov-report=xml --cov-report=html --cov-report=term
+    #- name: Upload coverage to Codecov - not necessary
+    #  uses: codecov/codecov-action@v3
+    #  if: matrix.python-version == '3.11'
+    #  with:
+    #    file: ./coverage.xml
+    #    flags: unittests
+    #    name: codecov-umbrella
+    #    fail_ci_if_error: false
+    - name: Upload test results
+      uses: actions/upload-artifact@v6
+      if: always()
+      with:
+        name: test-results-python-${{ matrix.python-version }}
+        path: |
+          test-results.xml
+          htmlcov/
+          coverage.xml
+  test-integration:
+    runs-on: ubuntu-latest
+    needs: [lint, test-unit]
+    env:
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    - name: Run integration tests
+      run: |
+        cd test
+        python demo_single_test.py
+    - name: Test CLI help
+      run: |
+        python cli_redact.py --help
+    - name: Test CLI version
+      run: |
+        python -c "import sys; print(f'Python {sys.version}')"
+  security:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install safety bandit
+    #- name: Run safety scan - removed as now requires login
+    #  run: |
+    #    safety scan -r requirements.txt
+    - name: Run bandit security check
+      run: |
+        bandit -r . -f json -o bandit-report.json || true
+    - name: Upload security report
+      uses: actions/upload-artifact@v6
+      if: always()
+      with:
+        name: security-report
+        path: bandit-report.json
+  # Builds the Python distribution and uploads it as the `dist` artifact.
+  # Runs only for pushes to `main`, and only after the `lint` and `test-unit`
+  # jobs (defined elsewhere in this workflow) have completed.
+  build:
+    runs-on: ubuntu-latest
+    needs: [lint, test-unit]
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        # PYTHON_VERSION is not set in this job; presumably defined at workflow level — confirm.
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install build dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build twine
+    # `python -m build` writes its output into dist/.
+    - name: Build package
+      run: |
+        python -m build
+    # Validates the built distributions' metadata (e.g. that the long
+    # description will render on PyPI) before they are published anywhere.
+    - name: Check package
+      run: |
+        twine check dist/*
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v6
+      with:
+        name: dist
+        path: dist/

.github/workflows/simple-test.yml ADDED Viewed

	@@ -0,0 +1,74 @@

+# Single-job test workflow for the `dev` branch: installs system and Python
+# dependencies, generates test data, then runs the CLI suite and pytest.
+name: Simple Test Run
+# Triggered by pushes and pull requests targeting `dev`, or manually.
+on:
+  push:
+    branches: [ dev ]
+  pull_request:
+    branches: [ dev ]
+  workflow_dispatch:
+# Read-only token scopes; nothing in this workflow writes to the repository.
+permissions:
+  contents: read
+  actions: read
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    env:
+      # NOTE(review): presumably read by the app configuration to hide VLM model options — confirm.
+      SHOW_VLM_MODEL_OPTIONS: "False"
+    steps:
+    - uses: actions/checkout@v6
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+    # OCR (Tesseract), PDF tooling (Poppler) and shared libraries installed via apt.
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y \
+          tesseract-ocr \
+          tesseract-ocr-eng \
+          poppler-utils \
+          libgl1-mesa-dri \
+          libglib2.0-0 \
+          libsm6 \
+          libxext6 \
+          libxrender-dev \
+          libgomp1
+    # Installs the lightweight requirements set plus the test-only packages.
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements_lightweight.txt
+        pip install pytest pytest-cov reportlab pillow
+    - name: Download spaCy model
+      run: |
+        python -m spacy download en_core_web_lg
+    # The trailing `|| echo` keeps this step green even if the directory is
+    # missing; the next step is the one that fails in that case.
+    - name: Setup test data
+      run: |
+        python .github/scripts/setup_test_data.py
+        echo "Setup script completed. Checking results:"
+        ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
+    # The first `ls` has no fallback, so a missing example_data directory
+    # fails the job here rather than later inside the tests.
+    - name: Verify test data files
+      run: |
+        echo "Checking if critical test files exist:"
+        ls -la doc_redaction/example_data/
+        echo "Checking for specific PDF files:"
+        ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
+        echo "Checking file sizes:"
+        find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
+    # Executed from inside test/, so the suite runs with that directory as its working directory.
+    - name: Run CLI tests
+      run: |
+        cd test
+        python cli_epilog_suite.py
+    - name: Run tests with pytest
+      run: |
+        pytest test/ -v --tb=short

.github/workflows/sync-pi-agent-space.yml ADDED Viewed

	@@ -0,0 +1,64 @@

+name: Sync Pi agent to Hugging Face Space
+on:
+  push:
+    branches: [dev]
+    paths:
+      - "agent-redact/**"
+      - "skills/**"
+      - "tools/**"
+      - "intros/**"
+      - "doc_redaction/example_data/**"
+      - "AGENTS.md"
+      - "config/**"
+      - ".github/workflows/sync-pi-agent-space.yml"
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-pi-agent-space:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+          lfs: true
+      - name: Install Git LFS
+        run: git lfs install
+      # Fails fast when a required example PDF is absent, empty, or still a
+      # Git LFS pointer, before anything is flattened or pushed.
+      - name: Materialize example PDFs (Git LFS)
+        run: |
+          git lfs pull --include="doc_redaction/example_data/*.pdf"
+          for f in \
+            doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
+            doc_redaction/example_data/graduate-job-example-cover-letter.pdf; do
+            # On a missing file `head` only prints an error and `grep -q` returns
+            # non-zero, so without this guard the pointer check would pass silently.
+            if [ ! -s "$f" ]; then
+              echo "Example PDF is missing or empty: $f" >&2
+              exit 1
+            fi
+            if head -1 "$f" | grep -q "^version https://git-lfs.github.com/spec/v1"; then
+              echo "Example PDF is still an LFS pointer (not materialized): $f" >&2
+              exit 1
+            fi
+          done
+      - name: Flatten Pi agent Space tree
+        run: |
+          chmod +x agent-redact/pi-agent/sync_to_space.sh
+          agent-redact/pi-agent/sync_to_space.sh /tmp/pi-agent-space
+      - name: Push to Hugging Face Space
+        run: |
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing Pi agent Space: seanpedrickcase/agentic_document_redaction"
+          cd /tmp/pi-agent-space
+          git init -b main
+          git config user.name "$HF_USERNAME"
+          git config user.email "$HF_EMAIL"
+          git add .
+          git commit -m "Sync Pi agent Space: $COMMIT_MSG"
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/agentic_document_redaction"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}

.github/workflows/sync_to_hf.yml ADDED Viewed

	@@ -0,0 +1,54 @@

+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [dev]
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1      # Only get the latest state
+          lfs: true           # Download actual LFS files so they can be pushed
+      - name: Install Git LFS
+        run: git lfs install
+      # Pushes the current tree to the Hugging Face Space as one fresh commit,
+      # discarding all previous history on the remote `main` branch.
+      - name: Recreate repo history (single-commit force push)
+        run: |
+          # 1. Capture the message BEFORE we delete the .git folder
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing commit message: $COMMIT_MSG"
+          # 2. DELETE the .git folder.
+          # This turns the repo into a standard folder of files.
+          rm -rf .git
+          # 3. Re-initialize a brand new git repo
+          git init -b main
+          git config --global user.name "$HF_USERNAME"
+          git config --global user.email "$HF_EMAIL"
+          # 4. Re-install LFS (needs to be done after git init)
+          git lfs install
+          # 5. Add the remote
+          # Quoted so the URL stays a single argument even if a secret contains
+          # whitespace or glob characters.
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_REPO_ID}"
+          # 6. Add all files
+          # Since this is a fresh init, Git sees EVERY file as "New"
+          git add .
+          # 7. Commit and Force Push
+          git commit -m "Sync: $COMMIT_MSG"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}
+          HF_REPO_ID: ${{ secrets.HF_REPO_ID }}

.github/workflows/sync_to_hf_zero_gpu.yml ADDED Viewed

	@@ -0,0 +1,59 @@

+name: Sync to Hugging Face hub Zero GPU
+on:
+  push:
+    branches: [dev]
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync-to-hub-zero-gpu:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 1      # Only get the latest state
+          lfs: true           # Download actual LFS files so they can be pushed
+      - name: Install Git LFS
+        run: git lfs install
+      # HF Spaces read Space config from README.md front matter. The repo README
+      # targets GitHub (e.g. docker); patch only this CI checkout before HF push.
+      - name: Apply HF Zero GPU Space README front matter
+        run: python3 tools/apply_hf_zero_gpu_readme_frontmatter.py
+      # Pushes the current tree (with the patched README) to the Zero GPU Space
+      # as one fresh commit, discarding all previous history on remote `main`.
+      - name: Recreate repo history (single-commit force push)
+        run: |
+          # 1. Capture the message BEFORE we delete the .git folder
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Syncing commit message: $COMMIT_MSG"
+          # 2. DELETE the .git folder.
+          # This turns the repo into a standard folder of files.
+          rm -rf .git
+          # 3. Re-initialize a brand new git repo
+          git init -b main
+          git config --global user.name "$HF_USERNAME"
+          git config --global user.email "$HF_EMAIL"
+          # 4. Re-install LFS (needs to be done after git init)
+          git lfs install
+          # 5. Add the remote
+          # Quoted so the URL stays a single argument even if a secret contains
+          # whitespace or glob characters.
+          git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_REPO_ID_ZERO_GPU}"
+          # 6. Add all files
+          # Since this is a fresh init, Git sees EVERY file as "New"
+          git add .
+          # 7. Commit and Force Push
+          git commit -m "Sync: $COMMIT_MSG"
+          git push --force hf main
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_EMAIL: ${{ secrets.HF_EMAIL }}
+          HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}

.gitignore ADDED Viewed

	@@ -0,0 +1,74 @@

+*.url
+*.ipynb
+*.pyc
+*.qmd
+*.json.bak.*
+_quarto.yml
+quarto_site/*
+src/*
+redaction_deps/*
+.venv/*
+examples/*
+processing/*
+input/*
+output/*
+tools/__pycache__/*
+old_code/*
+tesseract/*
+poppler/*
+build/*
+dist/*
+build_deps/*
+logs/*
+usage/*
+feedback/*
+config/*
+!config/pi_agent.env.example
+!config/docker_app_config.env.example
+!config/app_config.env.example
+workspace/*
+user_guide/*
+_extensions/*
+doc_redaction.egg-info/*
+.venv_pypi_test/*
+cdk/config/*
+!cdk/config/app_config.env.example
+!cdk/config/lambda/
+cdk/config/lambda/*
+!cdk/config/lambda/lambda_function.py
+!cdk/config/headless_s3_seed/
+cdk/config/headless_s3_seed/*
+!cdk/config/headless_s3_seed/input/
+cdk/config/headless_s3_seed/input/*
+!cdk/config/headless_s3_seed/input/config/
+cdk/config/headless_s3_seed/input/config/*
+!cdk/config/headless_s3_seed/input/config/example_headless_env_file.env
+cdk/cdk.out/*
+cdk/archive/*
+tld/*
+tmp/*
+docs/*
+.pi/*
+cdk.out/*
+cdk.json
+cdk.context.json
+precheck.context.json
+.quarto/*
+/.quarto/
+/_site/
+test/config/*
+test/feedback/*
+test/input/*
+test/logs/*
+test/output/*
+test/tmp/*
+test/usage/*
+.ruff_cache/*
+model_cache/*
+sanitized_file/*
+src/doc_redaction.egg-info/*
+docker_compose/*
+**/*.quarto_ipynb
+skills/example_prompts/*
+.pi/sessions/
+agent-redact/pi/agent/sessions/

AGENTS.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# AGENTS.md
+Context for AI coding agents working on **doc_redaction** (PII redaction for PDFs, images, Word, and tabular files). Human-oriented docs: [README.md](README.md). User guide: [doc_redaction user guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+## Project overview
+- **Stack**: Python 3.10+, Gradio UI ([app.py](app.py)), optional FastAPI when `RUN_FASTAPI` is enabled, AWS/LLM integrations via [tools/config.py](tools/config.py) and env files under `config/`.
+- **License**: AGPL-3.0-only (see [pyproject.toml](pyproject.toml)). Respect license terms when adding dependencies.
+- **Accuracy**: Outputs are not guaranteed complete; downstream use should assume **human review** of redacted material.
+## Cursor skills: redaction workflow (optional)
+For agents operating the deployed app (Gradio Client, review CSV, `/review_apply`), these repo-local playbooks are a suggested ladder:
+0. **[`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md)** — copy-paste user task prompt (Pass 1 default, Pass 2 gated); **user redaction requirements go at the end of the prompt**.
+1. **[`skills/doc-redaction-app/SKILL.md`](skills/doc-redaction-app/SKILL.md)** — first-pass redaction (`/doc_redact` / `/redact_document`) and downloading artifacts.
+2. **[`skills/doc-redact-page-review/SKILL.md`](skills/doc-redact-page-review/SKILL.md)** — after outputs exist: **parallel per-page** child agents, merge into one full-document `*_review_file.csv`, **single** `/review_apply` from the parent.
+3. **[`skills/doc-redaction-modifications/SKILL.md`](skills/doc-redaction-modifications/SKILL.md)** — CSV mechanics, `preview_redaction_boxes`, `/review_apply` patterns, verification, VLM and PyMuPDF fallbacks (single-thread edits and the **technical** reference for page-review children).
+## Setup
+1. **System**: Install **Tesseract** and **Poppler** (required for OCR/PDF). See [README.md](README.md) (Windows/Linux sections).
+2. **Python**: Create a venv, then install the project (e.g. `pip install -e ".[dev]"` or follow README).
+3. **Configuration**: Copy or edit environment/config as described in README / `config/` (e.g. `app_config.env`). Do not commit secrets.
+## Run locally
+- Gradio/FastAPI entrypoint is [app.py](app.py). With FastAPI enabled, typical pattern is `uvicorn app:app --host 0.0.0.0 --port 7860` (exact host/port from your config).
+- OpenAPI docs: `/docs` when the FastAPI app is mounted.
+## Tests
+- Run from repo root: `pytest` (optional: `pytest test/`).
+- Fix failures related to your changes before opening a PR.
+## Line order (local OCR and simple text extraction)
+Multi-column layouts use shared logic in [`tools/ocr_reading_order.py`](tools/ocr_reading_order.py). Controlled by **`LOCAL_OCR_READING_ORDER`** (`column` default, `legacy` for previous top-left behaviour).
+### Local OCR (Paddle/Tesseract)
+Word boxes are merged into line-level CSV rows in [`combine_ocr_results`](tools/custom_image_analyser_engine.py).
+- **`column`**: detect text columns, assign line numbers down each column left-to-right; full-width lines (headers) first. Stops cross-column merging that produced wide erroneous lines on multi-column PDFs. **Auto-fallback**: the page is treated as single-column unless a *consecutive cluster* of gutter rows (y-gap between adjacent rows ≤ `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, default `0.06` of page height) has ≥ `OCR_COLUMN_MIN_GUTTER_ROWS` (default `3`) rows **and** the cluster's topmost row is above the footer zone (`OCR_COLUMN_FOOTER_ZONE_FRACTION`, default `0.75`). This prevents isolated header bands (logo | title, 1 gutter row), signature-only blocks at the page bottom (cluster starts at y ≥ 0.75), or the combination of both, from forcing column mode on the single-column body text between them.
+- **`PADDLE_PRESERVE_LINE_BOXES=True`** or **`CONVERT_LINE_TO_WORD_LEVEL=False`** with Paddle: keep Paddle line boxes (skip word split + regrouping); line numbers still use column reading order.
+### Simple text extraction (PyMuPDF)
+[`redact_text_pdf`](tools/file_redaction.py) → [`process_page_to_structured_ocr_pymupdf`](tools/file_redaction.py) calls [`reorder_structured_text_lines`](tools/ocr_reading_order.py) after collecting lines, using **`page.mediabox`** width/height for full-span header detection.
+`reorder_structured_text_lines` now mirrors `build_line_groups` (local OCR route):
+1. **Column-aware sort** (`sort_reading_order` / `assign_layout_boxes` / `detect_column_split_xpoints`) — or legacy top-left for single-column pages.
+2. **Y-band grouping** (`group_into_lines`) — merges any same-row PyMuPDF lines that were emitted as separate objects (e.g. mixed-font spans) and splits horizontally-disparate boxes via `_finalize_line`.  *Column mode only.*
+3. **Secondary sub-column pass** (`_reorder_lines_column_major`) — ensures correct column-major order when sub-columns sit within a single macro-column.  *Column mode only.*
+4. When a group contains more than one box, constituent boxes are **merged** into a single `OCRResult` (union bbox, joined text, concatenated chars/words).
+In single-column / legacy mode only step 1 is applied; PyMuPDF lines are pre-formed so no merging is needed.
+### Tunables (both routes)
+`OCR_FULL_SPAN_WIDTH_RATIO`, `OCR_COLUMN_GAP_MIN_FRACTION`, `OCR_COLUMN_GUTTER_MIN_FRACTION`, `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` (default `0.015` — fine-grained gutter scan in `assign_layout_boxes`; lower = detects narrower sub-column boundaries), `OCR_COLUMN_MIN_GUTTER_ROWS`, `OCR_COLUMN_MAX_BOX_HEIGHT_RATIO`, `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, `OCR_COLUMN_FOOTER_ZONE_FRACTION`, `OCR_LINE_SPLIT_GAP_FRACTION` (default 0.025 — horizontal gap fraction that forces a line split; must be below the narrowest column gutter, ~0.030 for two-page spreads; also used as the gap threshold for the secondary sub-column sort in `build_line_groups`), `OCR_LINE_Y_THRESHOLD_FRACTION` (default 0.013 — row-alignment tolerance as a fraction of page height; reduced from 0.015 to correctly separate tightly-set 10 pt body text whose row spacing is ~0.014), `OCR_LINE_Y_THRESHOLD_MIN_PX`.
+**Sub-column ordering** (`build_line_groups`): after the primary word-level column sort, a second pass (`_reorder_lines_column_major`) clusters the produced line groups by their leftmost x-position using `OCR_LINE_SPLIT_GAP_FRACTION` as the gap threshold. This ensures that adjacent narrow sub-columns whose word-level centre gap is below `column_gap_threshold` (e.g. two columns on a spread where each page is already one macro-column) are still output in left-to-right column-major order rather than interleaved by y-position.
+**Fine-grained gutter-based column assignment** (`assign_layout_boxes`): before falling back to centre-gap clustering, `detect_column_split_xpoints` scans the page for structural gutters at the finer `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` threshold (default 0.015). Each qualifying gutter cluster produces a `(split_x, y_min)` pair — the split point is only applied to boxes whose `top ≥ y_min`, preventing a narrow sub-column gutter (visible only in the lower two-column section) from mis-splitting a full-width introductory paragraph that sits above it. This correctly separates narrow adjacent columns (e.g. 1.9 % gutter on a two-page spread) without fragmenting full-width headings or paragraphs.
+Changing line order affects PII page text, duplicate-page detection, and review CSV line indices on multi-column documents; re-review after upgrading.
+## Agentic / programmatic access (two surfaces)
+### 1. FastAPI Agent API (recommended for LLM agents: small JSON bodies)
+When `RUN_FASTAPI` is true, routes are mounted under **`/agent`** ([agent_routes.py](agent_routes.py)).
+- **Catalog**: `GET /agent/operations` — maps each Gradio `api_name` to an HTTP path and notes whether the route is implemented via CLI or returns HTTP 501 for Gradio-only flows.
+- **Implemented POST routes** (CLI- or [tools/simplified_api.py](tools/simplified_api.py)-backed where noted):
+  `redact_document`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_pdfs`, `combine_review_csvs`, `export_review_redaction_overlay`, `export_review_page_ocr_visualisation`, `apply_review_redactions`, **`verify_redaction_coverage`** (Pass 1 QA: `must_redact` / `must_not_redact` regex lists, optional `redacted_pdf_path`, optional `auto_prune_suspicious` + `pruned_output_path`; returns `pass_strict`, `pass_with_cleanup`, `pages_flagged_for_vlm`, `pages_needing_csv_cleanup`), **`word_level_ocr_text_search`** (headless word OCR search with optional review-box overlap flags).
+**Optional post-redaction Pass 1 QA (main app / CLI):** When `POST_REDACT_PASS1_QA=True` in [`tools/config.py`](tools/config.py) (or `config/app_config.env`), initial redaction emits `*_coverage_report.json` beside the review CSV and optionally `*_review_file_pruned.csv` (sibling, when `POST_REDACT_PASS1_AUTO_PRUNE=True`). Uses deny/allow lists and/or `POST_REDACT_PASS1_MUST_REDACT_PATH` / `POST_REDACT_PASS1_MUST_NOT_REDACT_PATH`. CLI overrides: `--post-redact-pass1-qa`, `--post-redact-pass1-auto-prune`. This is pre-review-apply sanity QA only — agent Pass 1 (policy edits + `/review_apply`) remains separate.
+  Note: on Gradio ([app.py](app.py)), the Review-tab visual exports use `api_name` **`page_redaction_review_image`** and **`page_ocr_review_image`**; the **`/agent`** routes above keep the explicit `export_review_*` names for the same operations.
+- **Gradio-only stubs** (501 + JSON hint): `load_and_prepare_documents_or_data`.
+- **Auth**: If `AGENT_API_KEY` is set in the environment, send header `X-Agent-API-Key` with that value.
+- **Paths**: Inputs must resolve to files under the repo root, `INPUT_FOLDER`, or `OUTPUT_FOLDER` (see router validation).
+Implementation uses **`cli_redact.main(direct_mode_args=...)`** where a CLI task exists (same behaviour as [cli_redact.py](cli_redact.py)); `apply_review_redactions` calls [tools/simplified_api.py](tools/simplified_api.py) instead.
+### 2. Gradio Client API (e.g. Hugging Face Spaces)
+For remote Spaces or any Gradio deployment exposing the HTTP API:
+- **Schema**: `GET https://<host>/gradio_api/info`
+- **Call**: `POST https://<host>/gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order matches the named endpoint’s component list).
+- **Poll**: `GET https://<host>/gradio_api/call/{api_name}/{event_id}`
+- **Hugging Face**: `Authorization: Bearer $HF_TOKEN`
+Named `api_name` values in this app include: `redact_document`, `load_and_prepare_documents_or_data`, `apply_review_redactions`, **`doc_redact`** (simple `gr.api`: one PDF/image + optional OCR/PII knobs; returns `(output_paths, message)`; `api_name='/doc_redact'`; parameters include `document_file`, `redact_entities`, `output_dir`, `ocr_method`, `pii_method`, `allow_list`, `deny_list`, `page_min`, `page_max`, **`handwrite_signature_checkbox`** — AWS Textract extraction options such as `Extract handwriting` / `Extract signatures`), **`review_apply`** (simple `gr.api`: PDF + `*_review_file.csv`; returns `(output_paths, message)`; `api_name='/review_apply'`), **`preview_boxes`** (simple `gr.api`: PDF + `*_review_file.csv`; renders proposed boxes onto the original PDF and returns `(zip_path, message)` — use to verify coordinates *before* calling `review_apply`, no redaction applied; `api_name='/preview_boxes'`), **`pdf_summarise`** (simple `gr.api`: PDF + optional summarisation/OCR knobs; returns `(output_paths, status_message, summary_text)`; `api_name='/pdf_summarise'`), **`tabular_redact`** (simple `gr.api`: one tabular file (CSV/XLSX/Parquet/DOCX) + optional knobs; returns `(output_paths, message)`; `api_name='/tabular_redact'`), **`page_redaction_review_image`** (short review overlay export; `api_name='/page_redaction_review_image'`), **`page_ocr_review_image`** (short OCR visualisation export; `api_name='/page_ocr_review_image'`), `word_level_ocr_text_search`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_csvs`, `combine_review_pdfs`. The matching **`POST /agent`** names for those two visual exports are `export_review_redaction_overlay` and `export_review_page_ocr_visualisation` (§1). 
+Many endpoints require **many positional arguments** (full Gradio state); prefer the short `gr.api` routes above or **`POST /agent/apply_review_redactions`** where applicable instead of building the full `data` array from `/gradio_api/info`.
+## CLI parity
+For scripting and tests, `python cli_redact.py` with flags is authoritative; programmatic merges use `get_cli_default_args_dict()` in [cli_redact.py](cli_redact.py).
+## Security and data handling
+- Do not commit API keys, tokens, or customer data.
+- Treat paths as untrusted outside validated roots (see [tools/secure_path_utils.py](tools/secure_path_utils.py)).
+- Optional `instruction` / LLM fields must not be passed into shell or unconstrained config keys.
+## Conventions for PRs
+- Keep changes focused; avoid drive-by refactors.
+- Match existing naming and patterns in [app.py](app.py) and [tools/](tools/).
+- Update tests when behaviour changes; run `pytest` before merge.

Dockerfile ADDED Viewed

	@@ -0,0 +1,235 @@

+# Stage 1: Build dependencies and download models
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
+# Install system dependencies
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+        g++ \
+        make \
+        cmake \
+        unzip \
+        libcurl4-openssl-dev \
+        git \
+    && pip install --upgrade pip \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY requirements_lightweight.txt .
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
+ARG INSTALL_GRADIO_MCP=False
+ENV INSTALL_GRADIO_MCP=${INSTALL_GRADIO_MCP}
+RUN if [ "$INSTALL_GRADIO_MCP" = "True" ]; then \
+    pip install --verbose --no-cache-dir --force-reinstall --target=/install "gradio[mcp]>=6.16.0"; \
+fi
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
+ARG INSTALL_PADDLEOCR=False
+ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
+ARG PADDLE_GPU_ENABLED=False
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+    pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
+    pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.7.0"; \
+elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
+    pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
+    pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.7.0" && \
+    pip install --verbose --no-cache-dir --target=/install "torch<=2.9.1" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install "torchvision<=0.24.1" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install "transformers<=5.12.0"; \
+fi
+ARG INSTALL_VLM=False
+ENV INSTALL_VLM=${INSTALL_VLM}
+ARG TORCH_GPU_ENABLED=False
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+# Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
+RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
+    pip install --verbose --no-cache-dir --target=/install \
+    "torch==2.9.1+cpu" \
+    "torchvision==0.24.1+cpu" \
+    "transformers<=5.12.0" \
+    "accelerate<=1.13.0" \
+    "bitsandbytes<=0.49.2" \
+    "sentencepiece<=0.2.1" \
+    --extra-index-url https://download.pytorch.org/whl/cpu; \
+elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install "torch<=2.9.1" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install "torchvision<=0.24.1" --index-url https://download.pytorch.org/whl/cu129 && \
+    pip install --verbose --no-cache-dir --target=/install \
+        "transformers<=5.12.0" \
+        "accelerate<=1.13.0" \
+        "bitsandbytes<=0.49.2" \
+        "sentencepiece<=0.2.1" && \
+    pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
+    pip install --verbose --no-cache-dir --target=/install  https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
+    pip install --verbose --no-cache-dir --target=/install  https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
+fi
+# ===================================================================
+# Stage 2: A common base for both Lambda and Gradio
+# ===================================================================
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
+# MUST re-declare ARGs in every stage where they are used in RUN commands
+ARG TORCH_GPU_ENABLED=False
+ARG PADDLE_GPU_ENABLED=False
+ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
+ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    poppler-utils \
+    libgl1 \
+    libglib2.0-0 && \
+    if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
+        apt-get install -y --no-install-recommends libgomp1; \
+    fi && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+ENV APP_HOME=/home/user
+# Set env variables for Gradio & other apps
+ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
+    MPLCONFIGDIR=/tmp/matplotlib_cache/ \
+    GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
+    GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
+    FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
+    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
+    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
+    CONFIG_FOLDER=$APP_HOME/app/config/ \
+    XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+    TESSERACT_DATA_FOLDER=/usr/share/tessdata \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860 \
+    PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_ANALYTICS_ENABLED=False
+# Copy Python packages from the builder stage
+COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
+COPY --from=builder /install/bin /usr/local/bin/
+# Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
+# passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
+RUN pip install --no-cache-dir "protobuf<=7.34.0"
+# English pipeline is not a normal PyPI dependency; bundle it in the image so runtime works offline.
+# Placed before COPY app code so application changes do not invalidate this layer.
+RUN python -m spacy download en_core_web_lg
+# Copy your application code and entrypoint
+COPY . ${APP_HOME}/app
+COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
+# Fix line endings and set execute permissions
+RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
+    && chmod +x ${APP_HOME}/app/entrypoint.sh
+WORKDIR ${APP_HOME}/app
+# ===================================================================
+# FINAL Stage 3: The Lambda Image (runs as root for simplicity)
+# ===================================================================
+FROM base AS lambda
+# Set runtime ENV for Lambda mode
+ENV APP_MODE=lambda
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["lambda_entrypoint.lambda_handler"]
+# ===================================================================
+# FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
+# ===================================================================
+# Builds on the `base` stage. Every RUN below executes before the `USER user`
+# instruction near the end of this stage, i.e. before the switch to the
+# unprivileged account; the container itself then runs as `user`.
+# NOTE(review): ${APP_HOME}, ${XDG_CACHE_HOME} and $GRADIO_SERVER_PORT are not
+# set in this stage; they are presumably inherited from `base` — confirm.
+FROM base AS gradio
+# Set runtime ENV for Gradio mode
+# NOTE(review): presumably read by entrypoint.sh to pick the run mode — confirm.
+ENV APP_MODE=gradio
+# Create the non-root account `user` with UID 1000 and a home directory (-m)
+RUN useradd -m -u 1000 user
+# Create the base application directory and set its ownership
+RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
+# Create the writable sub-folders of the app directory (output, input, logs,
+# usage, feedback, config), hand them to 'user', and set mode 755
+# (owner read/write/execute; group and others read/execute only).
+RUN mkdir -p \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config \
+    && chown user:user \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config \
+    && chmod 755 \
+    ${APP_HOME}/app/output \
+    ${APP_HOME}/app/input \
+    ${APP_HOME}/app/logs \
+    ${APP_HOME}/app/usage \
+    ${APP_HOME}/app/feedback \
+    ${APP_HOME}/app/config
+# Prepare temp/cache locations and the model/data directories:
+#   - /tmp, /var/tmp and the Gradio, tld and matplotlib temp dirs are owned by
+#     'user' with mode 1777 (world-writable with the sticky bit set);
+#   - ${XDG_CACHE_HOME} is owned by 'user' with mode 700 (owner-only access);
+#   - the PaddleX, spaCy data and tessdata directories are owned by 'user'
+#     with mode 755.
+RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
+    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
+    && chmod 700 ${XDG_CACHE_HOME} \
+    && mkdir -p ${APP_HOME}/.paddlex \
+    && chown user:user ${APP_HOME}/.paddlex \
+    && chmod 755 ${APP_HOME}/.paddlex \
+    && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+    && chown user:user ${APP_HOME}/.local/share/spacy/data \
+    && chmod 755 ${APP_HOME}/.local/share/spacy/data \
+    && mkdir -p /usr/share/tessdata \
+    && chown user:user /usr/share/tessdata \
+    && chmod 755 /usr/share/tessdata
+# Apply 'user' ownership recursively to everything under the home directory.
+# NOTE(review): this path (and the VOLUME/ENTRYPOINT paths below) hard-codes
+# /home/user while the steps above use ${APP_HOME}; assumes
+# APP_HOME=/home/user — confirm.
+RUN chown -R user:user /home/user
+# Set mode 755 on the Python executable so non-owner accounts can read and
+# execute it
+RUN chmod 755 /usr/local/bin/python
+# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
+# Keep these after the RUN steps above: the Dockerfile reference notes that,
+# with the legacy builder, build steps that change a volume's contents after
+# it has been declared are discarded.
+VOLUME ["/tmp/matplotlib_cache"]
+VOLUME ["/tmp/gradio_tmp"]
+VOLUME ["/tmp/tld"]
+VOLUME ["/home/user/app/output"]
+VOLUME ["/home/user/app/input"]
+VOLUME ["/home/user/app/logs"]
+VOLUME ["/home/user/app/usage"]
+VOLUME ["/home/user/app/feedback"]
+VOLUME ["/home/user/app/config"]
+VOLUME ["/home/user/.paddlex"]
+VOLUME ["/home/user/.local/share/spacy/data"]
+VOLUME ["/usr/share/tessdata"]
+VOLUME ["/tmp"]
+VOLUME ["/var/tmp"]
+# Drop privileges: the container process runs as the non-root 'user'
+USER user
+# Document the Gradio port (EXPOSE does not publish the port by itself)
+EXPOSE $GRADIO_SERVER_PORT
+# CMD supplies the default arguments handed to the entrypoint script.
+# NOTE(review): assumes entrypoint.sh finishes by running its arguments
+# (e.g. exec "$@") so that `python app.py` is actually started — confirm.
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]
+CMD ["python", "app.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,661 @@

+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.  Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals.  This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU Affero General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Remote Network Interaction; Use with the GNU General Public License.
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software.  This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,4 @@

+recursive-include doc_redaction/assets *.png
+recursive-include doc_redaction/example_data *
+recursive-include intros *.txt

README.md ADDED Viewed

	@@ -0,0 +1,367 @@

+---
+title: Document redaction
+emoji: 📝
+colorFrom: blue
+colorTo: yellow
+sdk: docker
+app_file: app.py
+pinned: true
+license: agpl-3.0
+short_description: OCR / redact PDF documents and tabular data
+---
+# Document redaction (doc_redaction)
+<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
+---
+## 🚀 Quick Start - Installation and first run
+Follow these instructions to get the document redaction application running on your local machine.
+### 1. Installation
+#### Option 1 - Recommended: Install from source repo
+Clone the repository and install in editable mode:
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+pip install -e .
+```
+##### Install extras (Paddle or Transformers/Torch VLM)
+To install with PaddleOCR (with a transformers backend as of v2.4.0):
+```bash
+pip install -e ".[paddle]"
+```
+If you want to run VLMs / LLMs with the transformers package:
+```bash
+pip install -e ".[vlm]"
+```
+Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the GPU-enabled version of torch, it is advised to install the following version:
+```bash
+pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
+```
+#### Option 2 - Install from PyPI
+Create a virtual environment (recommended) and install **doc_redaction**.
+```bash
+python -m venv venv
+# Windows:
+.\venv\Scripts\activate
+# macOS/Linux:
+source venv/bin/activate
+```
+The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
+```bash
+pip install doc_redaction
+```
+Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
+```bash
+pip install "doc_redaction[paddle]"
+```
+For running VLMs / LLMs with the transformers package:
+```bash
+pip install "doc_redaction[vlm]"
+```
+For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
+**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
+```bash
+python -m app
+```
+**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
+- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
+- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
+- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
+In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
+#### Option 3 - Docker installation
+The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
+##### With Llama.cpp / vLLM inference server
+The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
+For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
+You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+##### Without Llama.cpp / vLLM inference server
+If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
+The main [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) produces two final images via build targets: **`gradio`** (default web UI, non-root user, named volumes for writable paths) and **`lambda`** (AWS Lambda handler). Build examples:
+```bash
+docker build -f Dockerfile --target gradio -t doc-redaction-gradio .
+docker build -f Dockerfile --target lambda -t doc-redaction-lambda .
+```
+##### Pi agent (agentic redaction)
+The [Pi](https://github.com/earendil-works/pi) orchestration UI uses a separate multi-stage image at [agent-redact/pi-agent/Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/pi-agent/Dockerfile). It shares the same Python 3.12 slim base as the main app; a small Node stage installs the `pi` CLI, which is copied into the runtime image.
+| Build target | Typical use |
+|--------------|-------------|
+| **`dev`** | Local development with [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml) — the repo is bind-mounted; only Pi CLI + Python deps are in the image. |
+| **`runtime`** | [Hugging Face Space](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) and AWS ECS — agent code is baked in; runs as non-root `user` with **named volumes** for workspace, uploads, and session dirs (read-only root filesystem friendly). |
+Build from the repository root:
+```bash
+docker build -f agent-redact/pi-agent/Dockerfile --target dev -t pi-agent-dev .
+docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-runtime .
+```
+For llama.cpp + Pi together, see the compose examples at the top of [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml). Further detail: [agent-redact/README.md](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/README.md).
+#### Option 4 - Installation on AWS with CDK
+The repo contains a [CDK folder](https://github.com/seanpedrick-case/doc_redaction/tree/main/cdk) that contains all the files you need to set up and deploy to an AWS environment with CDK. The installation wizard is [cdk_install.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cdk/cdk_install.py), which provides a number of options to deploy the Document Redaction App to AWS for demonstration or production. More details on CDK deployment can be found in the [Installation Guide](https://seanpedrick-case.github.io/doc_redaction/src/installation_guide.html).
+### 2. Install prerequisites: Tesseract and Poppler
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). If not using a Docker-based deployment, you will need to install them on your system before proceeding. To run the Document Redaction app successfully, these tools need to be installed and either 1. added to PATH, or 2. be in a folder that is directly referenced in the config/app_config.env file with the variables TESSERACT_FOLDER and POPPLER_FOLDER (defined [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py) if you want to see the code). The instructions below will guide you through different ways to install these dependencies.
+---
+#### Automated dependency setup (recommended)
+If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
+You need the installer script available first, which means either:
+- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
+- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
+From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
+```bash
+python -m doc_redaction.install_deps
+```
+This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
+To just check whether your machine can already see the tools:
+```bash
+python -m doc_redaction.install_deps --verify-only
+```
+#### **On Windows**
+If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
+1.  **Install Tesseract OCR:**
+    *   Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+    *   Run the installer.
+    *   **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+2.  **Install Poppler:**
+    *   Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
+    *   Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+    *   You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
+        *   Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+        *   Click the "Environment Variables..." button.
+        *   In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+        *   Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+        *   Click OK on all windows to save the changes.
+    To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
+---
+#### **On Linux (Debian/Ubuntu)**
+Open your terminal and run the following command to install Tesseract and Poppler:
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+#### **On Linux (Fedora/CentOS/RHEL)**
+Open your terminal and use the `dnf` or `yum` package manager:
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+---
+### 3. Run the Application
+With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+```bash
+python app.py
+```
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+Open this URL in your web browser to use the document redaction tool.
+#### Command line interface
+For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321).
+If you installed from **PyPI**, use the installed console script:
+```bash
+cli_redact --help
+```
+From a **repository checkout**, you can also run:
+```bash
+python cli_redact.py --help
+```
+#### Python package commands
+For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+---
+### 4. ⚙️ Configuration (Optional)
+You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in `tools/config.py`, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
+To get started:
+1.  Copy `config/app_config.env.example` to `config/app_config.env`.
+2.  Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+If you do not create this file, the application will run with default settings.
+#### Configuration Breakdown
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+---
+#### **Local & General Settings (No AWS Required)**
+These settings are useful for all users, regardless of whether you are using AWS.
+*   `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+    *   Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+    *   Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+    *   **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
+*   `TESSERACT_DATA_FOLDER`
+    *   If Tesseract runs but you see an error like `Error opening data file ./eng.traineddata` or `Tesseract couldn't load any languages`, this is usually because it can't find the `tessdata/` language files.
+    *   Set this to the folder that contains `eng.traineddata` (typically a `tessdata` directory).
+    *   **Examples (Windows):** `TESSERACT_DATA_FOLDER=C:/Program Files/Tesseract-OCR/tessdata`
+*   `SHOW_LANGUAGE_SELECTION=True`
+    *   Set to `True` to display a language selection dropdown in the UI for OCR processing.
+*   `DEFAULT_LOCAL_OCR_MODEL=tesseract`
+    *   Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. "tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction.
+*   `SESSION_OUTPUT_FOLDER=False`
+    *   If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+*   `DISPLAY_FILE_NAMES_IN_LOGS=False`
+    *   For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+---
+#### **AWS-Specific Settings**
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+*   `RUN_AWS_FUNCTIONS=True`
+    *   **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
+*   **UI Options:**
+    *   `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+    *   `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+*   **Core AWS Configuration:**
+    *   `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+    *   `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+*   **AWS Logging:**
+    *   `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+    *   `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+*   **Advanced AWS Textract Features:**
+    *   `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+    *   `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+    *   `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+*   **Cost Tracking (for internal accounting):**
+    *   `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+    *   `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+    *   `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+    *   `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
+## For agents (API quickstart)
+If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
+- **Discover schema**: `GET /gradio_api/info`
+- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
+- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
+- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
+- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
+### Choose the correct route (prefer short `gr.api` endpoints)
+Fetch `/gradio_api/info` and then prefer the simplest route that exists:
+- **Apply edited review CSV to a PDF**: `/review_apply`
+- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
+- **Summarise a PDF**: `/pdf_summarise`
+- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
+If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
+### Common gotchas
+- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
+- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
+- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
+### Optional: MCP server
+If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
+**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings).  AWS Comprehend gives better results at a small cost.
+Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.

README_PYPI.md ADDED Viewed

	@@ -0,0 +1,351 @@

+# Document redaction (doc_redaction)
+<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
+---
+## 🚀 Quick Start - Installation and first run
+Follow these instructions to get the document redaction application running on your local machine.
+### 1. Installation
+#### Option 1 - Recommended: Install from source repo
+Clone the repository and install in editable mode:
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+pip install -e .
+```
+##### Install extras (Paddle or Transformers/Torch VLM)
+To install with PaddleOCR (with a transformers backend as of v2.4.0):
+```bash
+pip install -e ".[paddle]"
+```
+If you want to run VLMs / LLMs with the transformers package:
+```bash
+pip install -e ".[vlm]"
+```
+Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the GPU-enabled version of torch, it is advised to install the following version:
+```bash
+pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
+```
+#### Option 2 - Install from PyPI
+Create a virtual environment (recommended) and install **doc_redaction**.
+```bash
+python -m venv venv
+# Windows:
+.\venv\Scripts\activate
+# macOS/Linux:
+source venv/bin/activate
+```
+The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
+```bash
+pip install doc_redaction
+```
+Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
+```bash
+pip install "doc_redaction[paddle]"
+```
+For running VLMs / LLMs with the transformers package:
+```bash
+pip install "doc_redaction[vlm]"
+```
+For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
+**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
+```bash
+python -m app
+```
+**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
+- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
+- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
+- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
+In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
+#### Option 3 - Docker installation
+The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
+##### With Llama.cpp / vLLM inference server
+The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
+For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
+You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
+##### Without Llama.cpp / vLLM inference server
+If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR, and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
+The main [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) produces two final images via build targets: **`gradio`** (default web UI, non-root user, named volumes for writable paths) and **`lambda`** (AWS Lambda handler). Build examples:
+```bash
+docker build -f Dockerfile --target gradio -t doc-redaction-gradio .
+docker build -f Dockerfile --target lambda -t doc-redaction-lambda .
+```
+##### Pi agent (agentic redaction)
+The [Pi](https://github.com/earendil-works/pi) orchestration UI uses a separate multi-stage image at [agent-redact/pi-agent/Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/pi-agent/Dockerfile). It shares the same Python 3.12 slim base as the main app; a small Node stage installs the `pi` CLI, which is copied into the runtime image.
+| Build target | Typical use |
+|--------------|-------------|
+| **`dev`** | Local development with [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml) — the repo is bind-mounted; only Pi CLI + Python deps are in the image. |
+| **`runtime`** | [Hugging Face Space](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) and AWS ECS — agent code is baked in; runs as non-root `user` with **named volumes** for workspace, uploads, and session dirs (read-only root filesystem friendly). |
+Build from the repository root:
+```bash
+docker build -f agent-redact/pi-agent/Dockerfile --target dev -t pi-agent-dev .
+docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-runtime .
+```
+For llama.cpp + Pi together, see the compose examples at the top of [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml). Further detail: [agent-redact/README.md](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/README.md).
+#### Option 4 - Installation on AWS with CDK
+The repo contains a [CDK folder](https://github.com/seanpedrick-case/doc_redaction/tree/main/cdk), that contains all the files you need to setup and deploy to an AWS environment with CDK. The installation wizard is [cdk_install.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cdk/cdk_install.py), which provides a number of options to deploy the Document Redaction App to AWS for demonstration or production. More details on CDK deployment can be found in the [Installation Guide](https://seanpedrick-case.github.io/doc_redaction/src/installation_guide.html).
+### 2. Install prerequisites: Tesseract and Poppler
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
+---
+#### Automated dependency setup (recommended)
+If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
+You need the installer script available first, which means either:
+- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
+- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
+From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
+```bash
+python -m doc_redaction.install_deps
+```
+This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
+To just check whether your machine can already see the tools:
+```bash
+python -m doc_redaction.install_deps --verify-only
+```
+#### **On Windows**
+If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
+1.  **Install Tesseract OCR:**
+    *   Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+    *   Run the installer.
+    *   **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+2.  **Install Poppler:**
+    *   Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
+    *   Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+    *   You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
+        *   Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+        *   Click the "Environment Variables..." button.
+        *   In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+        *   Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+        *   Click OK on all windows to save the changes.
+    To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
+---
+#### **On Linux (Debian/Ubuntu)**
+Open your terminal and run the following command to install Tesseract and Poppler:
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+#### **On Linux (Fedora/CentOS/RHEL)**
+Open your terminal and use the `dnf` or `yum` package manager:
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+---
+### 3. Run the Application
+With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
+```bash
+python app.py
+```
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+Open this URL in your web browser to use the document redaction tool.
+#### Command line interface
+For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321)
+If you installed from **PyPI**, use the installed console script:
+```bash
+cli_redact --help
+```
+From a **repository checkout**, you can also run:
+```bash
+python cli_redact.py --help
+```
+#### Python package commands
+For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+---
+### 4. ⚙️ Configuration (Optional)
+You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanation on the documentation website for [the github repo](https://seanpedrick-case.github.io/doc_redaction/)
+To get started:
+1.  Copy `config/app_config.env.example` to `config/app_config.env`.
+2.  Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+If you do not create this file, the application will run with default settings.
+#### Configuration Breakdown
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+---
+#### **Local & General Settings (No AWS Required)**
+These settings are useful for all users, regardless of whether you are using AWS.
+*   `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+    *   Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+    *   Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+    *   **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
+*   `SHOW_LANGUAGE_SELECTION=True`
+    *   Set to `True` to display a language selection dropdown in the UI for OCR processing.
+*   `DEFAULT_LOCAL_OCR_MODEL=tesseract`
+    *   Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. "Tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction.
+*   `SESSION_OUTPUT_FOLDER=False`
+    *   If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+*   `DISPLAY_FILE_NAMES_IN_LOGS=False`
+    *   For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+---
+#### **AWS-Specific Settings**
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+*   `RUN_AWS_FUNCTIONS=True`
+    *   **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
+*   **UI Options:**
+    *   `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+    *   `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+*   **Core AWS Configuration:**
+    *   `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+    *   `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+*   **AWS Logging:**
+    *   `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+    *   `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+*   **Advanced AWS Textract Features:**
+    *   `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+    *   `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+    *   `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+*   **Cost Tracking (for internal accounting):**
+    *   `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+    *   `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+    *   `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+    *   `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
+## For agents (API quickstart)
+If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
+- **Discover schema**: `GET /gradio_api/info`
+- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
+- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
+- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
+- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
+### Choose the correct route (prefer short `gr.api` endpoints)
+Fetch `/gradio_api/info` and then prefer the simplest route that exists:
+- **Apply edited review CSV to a PDF**: `/review_apply`
+- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
+- **Summarise a PDF**: `/pdf_summarise`
+- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
+If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
+### Common gotchas
+- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
+- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
+- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
+### Optional: MCP server
+If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
+**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
+To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings).  AWS Comprehend gives better results at a small cost.
+Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
+NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.

agent-redact/README.md ADDED Viewed

	@@ -0,0 +1,29 @@

+# Agent redaction (Pi)
+Pi-based agentic document redaction: local Docker orchestration and Hugging Face Space packaging.
+| Path | Purpose |
+|------|---------|
+| [`pi/`](pi/) | Gradio UI, Pi RPC client, remote redaction helpers, runtime config |
+| [`pi-agent/`](pi-agent/) | Pi Docker image (`dev` + `runtime` targets), sync script, and manifest |
+| [`requirements_pi_agent.txt`](requirements_pi_agent.txt) | Python deps for the Pi agent image |
+Per-user output isolation uses Gradio `session_hash` subfolders under `PI_WORKSPACE_DIR` (see `agent-redact/pi/session_workspace.py`). Enabled by default locally and on HF Spaces. Set `PI_SESSION_WORKSPACE=false` only if you want one shared workspace tree for all sessions.
+## Local Docker
+Use the `pi-agent` service in [`docker-compose_llama_agentic.yml`](../docker-compose_llama_agentic.yml) (profile `27b_36`). See [`pi/agent/README.md`](pi/agent/README.md).
+## Hugging Face Space
+Build from repo root:
+```bash
+# Production (HF Space / ECS)
+docker build -f agent-redact/pi-agent/Dockerfile --target runtime .
+# Local compose (bind-mounted repo)
+docker build -f agent-redact/pi-agent/Dockerfile --target dev .
+```
+Sync to Space on pushes to `dev` via [`.github/workflows/sync-pi-agent-space.yml`](../.github/workflows/sync-pi-agent-space.yml).

agent-redact/pi-agent/.dockerignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.git
+.github
+**/__pycache__
+**/*.pyc
+**/.pytest_cache
+**/node_modules
+workspace
+output
+input
+config/pi_agent.env

agent-redact/pi-agent/.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Example PDFs must be plain files in the Space repo (not Git LFS pointers).
2	+ *.pdf -filter -diff -merge

agent-redact/pi-agent/Dockerfile ADDED Viewed

	@@ -0,0 +1,176 @@

+# syntax=docker/dockerfile:1
+# Pi agent image (dev + production). Build from monorepo root:
+#   docker build -f agent-redact/pi-agent/Dockerfile --target dev .
+#   docker build -f agent-redact/pi-agent/Dockerfile --target runtime .
+# Root .dockerignore must allow config/*.example into the context (secrets stay gitignored).
+#
+# Targets:
+#   dev     — docker-compose: Pi CLI + Python deps; app tree bind-mounted at runtime.
+#   runtime — HF Space / AWS ECS: baked agent-redact tree, non-root user, named volumes.
+# ===================================================================
+# Stage 1: Pi CLI (Node) — isolated so the runtime base stays Python 3.12
+# ===================================================================
+# Throwaway Node stage: only /opt/pi and the node binary are copied out of it later.
+FROM public.ecr.aws/docker/library/node:24.16.0-slim AS pi-cli
+# Global npm installs land under /opt/pi, so the CLI lives in one self-contained prefix.
+ENV NPM_CONFIG_PREFIX=/opt/pi
+ENV PATH="/opt/pi/bin:${PATH}"
+# --ignore-scripts: do not run npm lifecycle scripts of the installed packages at build time.
+RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
+# ===================================================================
+# Stage 2: Shared Python base (aligned with main app Dockerfile)
+# ===================================================================
+# Common parent of the `dev` and `runtime` targets: OS packages, Pi CLI + node, Python deps.
+FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS pi-base
+ENV NODE_ENV=production
+ENV DEBIAN_FRONTEND=noninteractive
+ENV NPM_CONFIG_LOGLEVEL=warn
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV APP_HOME=/home/user
+ENV PI_WORKDIR=/workspace/doc_redaction
+# Both the repo root and agent-redact/pi are importable as top-level module locations.
+ENV PYTHONPATH=${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi
+ENV GRADIO_SERVER_NAME=0.0.0.0
+# Cache locations are kept under /tmp rather than under the home directory.
+ENV MPLCONFIGDIR=/tmp/matplotlib_cache/
+ENV XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+# Makes the `pi` executable copied from the pi-cli stage resolvable by name.
+ENV PATH="/opt/pi/bin:${PATH}"
+# Package lists are removed in the same layer so they do not add to the image size.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bash \
+    git \
+    curl \
+    ca-certificates \
+    procps \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+# Bring in the Pi CLI prefix and the node binary only; npm itself is not copied.
+COPY --from=pi-cli /opt/pi /opt/pi
+COPY --from=pi-cli /usr/local/bin/node /usr/local/bin/node
+# Requirements file is copied on its own, installed, then deleted from the layer.
+COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
+RUN pip install --no-cache-dir -r /tmp/requirements_pi_agent.txt \
+    && rm /tmp/requirements_pi_agent.txt
+# ===================================================================
+# Stage 3: Dev — thin image for docker-compose (repo bind-mounted)
+# ===================================================================
+# Dev target: no application code is copied in; the repo is expected on a bind mount.
+FROM pi-base AS dev
+ENV HOME=${APP_HOME}
+ENV PI_WORKSPACE_DIR=${APP_HOME}/app/workspace
+ENV PI_UPLOAD_ROOT=/tmp/gradio
+ENV PI_SESSION_DIR=${APP_HOME}/.pi/agent/sessions
+# Create uid 1000 `user` and pre-create every directory the app writes to, owned by that user.
+# The /tmp dirs get the sticky bit (1777); the XDG cache is private to the user (700).
+RUN useradd -m -u 1000 user \
+    && mkdir -p \
+        ${APP_HOME}/app/workspace \
+        ${APP_HOME}/.pi/agent/sessions \
+        ${PI_WORKDIR} \
+        /tmp/gradio \
+        /tmp/matplotlib_cache \
+        ${XDG_CACHE_HOME} \
+    && chown -R user:user ${APP_HOME} ${PI_WORKDIR} \
+    && chown user:user /tmp/gradio /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+    && chmod 1777 /tmp/gradio /tmp/matplotlib_cache \
+    && chmod 700 ${XDG_CACHE_HOME}
+WORKDIR ${PI_WORKDIR}
+USER user
+# Build-time smoke test: fails the build if the copied Pi CLI cannot run as the non-root user.
+RUN pi --version
+# Compose overrides entrypoint with agent-redact/pi/start.sh on the bind mount.
+# ===================================================================
+# Stage 4: Runtime — baked app for Hugging Face Space and AWS ECS
+# ===================================================================
+# Runtime target: application tree is baked into the image and owned by root (read-only for
+# the non-root `user`); only the explicitly created/declared directories are writable.
+FROM pi-base AS runtime
+ENV PI_DEPLOYMENT_PROFILE=hf-space
+ENV PI_DEFAULT_PROVIDER=google-gemini
+ENV PI_DEFAULT_MODEL=gemini-flash-lite-latest
+ENV DOC_REDACTION_GRADIO_URL=https://seanpedrickcase-document-redaction.hf.space
+ENV HOME=${APP_HOME}
+# NOTE: PI_WORKDIR (and GRADIO_SERVER_NAME below) repeat the values already set in pi-base.
+ENV PI_WORKDIR=/workspace/doc_redaction
+# Fargate uses volume mounts under ${APP_HOME}/app/workspace (CDK chown entrypoint).
+# ECS Express has no mounts — CDK sets PI_WORKSPACE_DIR=/tmp/pi-workspace at deploy.
+ENV PI_WORKSPACE_DIR=${APP_HOME}/app/workspace
+ENV PI_UPLOAD_ROOT=/tmp/gradio
+ENV PI_SESSION_DIR=/tmp/pi-sessions
+ENV PI_CODING_AGENT_DIR=/tmp/pi-agent
+ENV ACCESS_LOGS_FOLDER=/tmp/pi-logs/
+ENV USAGE_LOGS_FOLDER=/tmp/pi-usage/
+ENV FEEDBACK_LOGS_FOLDER=/tmp/pi-feedback/
+ENV PI_OFFLINE=1
+ENV PI_SKIP_VERSION_CHECK=1
+ENV PI_GRADIO_SHOW_EXAMPLES=true
+ENV PI_UI_HOST=0.0.0.0
+ENV PI_UI_PORT=7860
+ENV PI_GRADIO_PORT=7860
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+ENV GRADIO_ANALYTICS_ENABLED=False
+ENV RUN_FASTAPI=False
+WORKDIR ${PI_WORKDIR}
+COPY agent-redact/pi agent-redact/pi
+COPY skills skills
+COPY tools tools
+# Committed template only (see sync-manifest.txt); runtime secrets come from S3/env on ECS.
+COPY config/pi_agent.env.example config/pi_agent.env.example
+COPY intros intros
+COPY AGENTS.md AGENTS.md
+COPY doc_redaction/example_data doc_redaction/example_data
+# Fail the build if either bundled example PDF is missing or is still a Git LFS pointer
+# (a pointer file starts with the LFS spec header instead of PDF content). Both files are
+# checked so a partially materialised checkout cannot slip through.
+RUN for pdf in \
+        doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
+        doc_redaction/example_data/graduate-job-example-cover-letter.pdf; do \
+        test -f "$pdf" || { echo "Missing example PDF: $pdf" >&2; exit 1; }; \
+        if head -1 "$pdf" | grep -q "^version https://git-lfs.github.com/spec/v1"; then \
+            echo "Example PDF is a Git LFS pointer, not a PDF: $pdf" >&2; exit 1; \
+        fi; \
+    done
+# Create uid 1000 `user`, pre-create the writable directories and hand only those to the user;
+# the application tree under PI_WORKDIR is re-owned by root with 755 dirs / 644 files.
+RUN useradd -m -u 1000 user \
+    && mkdir -p \
+        ${APP_HOME}/app/workspace \
+        ${APP_HOME}/.pi/agent \
+        /tmp/gradio \
+        /tmp/pi-sessions \
+        /tmp/matplotlib_cache \
+        ${XDG_CACHE_HOME} \
+    && chown user:user \
+        ${APP_HOME}/app/workspace \
+        ${APP_HOME}/.pi \
+        /tmp/gradio \
+        /tmp/pi-sessions \
+        /tmp/matplotlib_cache \
+        ${XDG_CACHE_HOME} \
+    && chmod 755 ${APP_HOME}/app/workspace ${APP_HOME}/.pi \
+    && chmod 1777 /tmp/gradio /tmp/pi-sessions /tmp/matplotlib_cache \
+    && chmod 700 ${XDG_CACHE_HOME} \
+    && chown -R root:root ${PI_WORKDIR} \
+    && find ${PI_WORKDIR} -type d -exec chmod 755 {} \; \
+    && find ${PI_WORKDIR} -type f -exec chmod 644 {} \; \
+    && mkdir -p ${APP_HOME}/app \
+    && chown user:user ${APP_HOME}/app
+COPY agent-redact/pi-agent/entrypoint-ecs.sh /usr/local/bin/entrypoint-ecs.sh
+COPY agent-redact/pi-agent/entrypoint.sh ${APP_HOME}/app/entrypoint.sh
+# Strip Windows CR line endings (a CRLF shebang would make the scripts unexecutable), then
+# mark both entrypoints executable.
+RUN sed -i 's/\r$//' /usr/local/bin/entrypoint-ecs.sh ${APP_HOME}/app/entrypoint.sh \
+    && chmod +x /usr/local/bin/entrypoint-ecs.sh ${APP_HOME}/app/entrypoint.sh
+# Writable paths only via runtime mounts (read-only root FS friendly).
+VOLUME ["${APP_HOME}/app/workspace"]
+VOLUME ["/tmp/gradio"]
+VOLUME ["/tmp/pi-sessions"]
+VOLUME ["/tmp/matplotlib_cache"]
+VOLUME ["${XDG_CACHE_HOME}"]
+VOLUME ["/tmp"]
+VOLUME ["/var/tmp"]
+USER user
+# Build-time smoke test: fails the build if the Pi CLI cannot run as the non-root user.
+RUN pi --version
+EXPOSE 7860
+# Default entrypoint runs as `user`; entrypoint-ecs.sh is shipped as an alternative for ECS.
+ENTRYPOINT ["/home/user/app/entrypoint.sh"]

agent-redact/pi-agent/README.md ADDED Viewed

	@@ -0,0 +1,46 @@

+---
+title: Agentic Document Redaction
+emoji: 🤖
+colorFrom: blue
+colorTo: yellow
+sdk: docker
+app_file: agent-redact/pi/gradio_app.py
+pinned: false
+license: agpl-3.0
+short_description: Agentic interface to redact PDF documents
+---
+# Pi agent — agentic document redaction
+Orchestrate document redaction with **[Pi](https://github.com/earendil-works/pi)** and **Google Gemini**. Heavy redaction runs on a separate **private [doc_redaction](https://huggingface.co/spaces/seanpedrickcase/document_redaction)** Hugging Face Space (simple text extraction + Local PII).
+## Before you start
+1. **Gemini API key** — paste in **Agent backend** → **Apply backend** (session-only; not stored on disk).
+2. **HF token** — Space admin should set `HF_TOKEN` under **Settings → Secrets** so this Space can call the private redaction backend. Users may optionally override per session in the UI.
+## Limitations
+- **No face or signature VLM** — text-layer PII only via Local spaCy/Presidio on the remote Space.
+- **No Pass 2 VLM** on this deployment.
+- **Ephemeral storage** — download deliverables from **Workspace output files** before the Space restarts.
+- **Human review** — outputs are not guaranteed complete; review redacted PDFs before release.
+## Defaults
+| Setting | Value |
+|---------|--------|
+| Pi LLM | Gemini (`gemini-flash-lite-latest` default) |
+| Redaction backend | `https://seanpedrickcase-document-redaction.hf.space` |
+| Text extraction | `Local model - selectable text` |
+| PII detection | `Local` |
+## Examples
+Two sample PDFs load in **Redaction task** → **Try an example** (same demos as the main doc_redaction app). Examples are **on by default**; set Space variable `PI_GRADIO_SHOW_EXAMPLES=false` to hide them. (`SHOW_PI_EXAMPLES` is also accepted.)
+If examples do not appear, the UI shows a short status message (usually missing PDFs in the image — rebuild after a successful sync with LFS materialization).
+## Development
+This Space is synced from the [doc_redaction monorepo](https://github.com/seanpedrick-case/doc_redaction) on pushes to **`dev`** (see `.github/workflows/sync-pi-agent-space.yml`). Space: [seanpedrickcase/agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction).

agent-redact/pi-agent/entrypoint-ecs.sh ADDED Viewed

	@@ -0,0 +1,12 @@

+#!/bin/bash
+# ECS Fargate: ephemeral volume mounts are root-owned; chown then drop to user (image USER).
+#
+# Usage: entrypoint-ecs.sh <command> [args...]
+# The command is run as `user` from /workspace/doc_redaction after the writable
+# directories below have been created and re-owned.
+# NOTE(review): chown and su require this script to start as root, while the image itself
+# ends with `USER user` — presumably the ECS task definition overrides the user; confirm.
+set -euo pipefail
+# Directories the app writes to at runtime; created if absent and handed to `user`.
+for dir in /tmp/pi-agent /tmp/pi-logs /tmp/pi-usage /tmp/pi-feedback \
+    /home/user/app/workspace /tmp/gradio /tmp/pi-sessions; do
+  mkdir -p "$dir"
+  chown -R user:user "$dir"
+done
+cd /workspace/doc_redaction
+# "$*" joins all arguments into one string that the user's bash re-parses, so the original
+# argument boundaries/quoting are not preserved (arguments containing spaces will be split).
+# NOTE(review): with no arguments this runs an empty command and exits immediately.
+exec su -s /bin/bash user -c "$*"

agent-redact/pi-agent/entrypoint.sh ADDED Viewed

	@@ -0,0 +1,36 @@

+#!/bin/sh
+# Container entrypoint for the Pi agent image: prepares writable directories, generates the
+# Pi configuration, then replaces itself with the UI server process.
+set -e
+echo "Starting Pi agent (profile=${PI_DEPLOYMENT_PROFILE:-unknown})"
+# Best-effort creation of every directory the app writes to. Failures are deliberately not
+# fatal (`|| true`); an unwritable directory only produces a warning on stderr.
+for dir in \
+    "${PI_CODING_AGENT_DIR:-/tmp/pi-agent}" \
+    "${PI_WORKSPACE_DIR:-/home/user/app/workspace}" \
+    "${PI_UPLOAD_ROOT:-/tmp/gradio}" \
+    "${PI_SESSION_DIR:-/tmp/pi-sessions}" \
+    "${ACCESS_LOGS_FOLDER:-/tmp/pi-logs}" \
+    "${USAGE_LOGS_FOLDER:-/tmp/pi-usage}" \
+    "${FEEDBACK_LOGS_FOLDER:-/tmp/pi-feedback}" \
+    "${MPLCONFIGDIR:-/tmp/matplotlib_cache}" \
+    "${XDG_CACHE_HOME:-/tmp/xdg_cache/user_1000}"; do
+    mkdir -p "$dir" 2>/dev/null || true
+    if [ ! -w "$dir" ]; then
+        echo "WARNING: Directory $dir is not writable by current user (uid=$(id -u)). File I/O may fail." >&2
+    fi
+done
+# The relative script paths below are resolved from the application root.
+cd "${PI_WORKDIR:-/workspace/doc_redaction}"
+echo "Entrypoint environment: PI_WORKSPACE_DIR=${PI_WORKSPACE_DIR:-} PI_UI_HOST=${PI_UI_HOST:-} PI_UI_PORT=${PI_UI_PORT:-} PI_GRADIO_PORT=${PI_GRADIO_PORT:-} GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME:-} GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-} RUN_FASTAPI=${RUN_FASTAPI:-}"
+# Runs the Pi config script before the UI starts; because of `set -e`, a failure here
+# aborts the container start.
+python3 agent-redact/pi/pi_agent_config.py
+# RUN_FASTAPI=True serves `gradio_app:app` through uvicorn (port: PI_GRADIO_PORT, then
+# GRADIO_SERVER_PORT, then 7860); any other value launches the Gradio script directly.
+# `exec` replaces this shell so the server receives container signals itself.
+if [ "${RUN_FASTAPI:-False}" = "True" ]; then
+  exec uvicorn gradio_app:app \
+    --app-dir agent-redact/pi \
+    --host "${GRADIO_SERVER_NAME:-0.0.0.0}" \
+    --port "${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7860}}" \
+    --proxy-headers \
+    --forwarded-allow-ips "*"
+else
+  exec python3 agent-redact/pi/gradio_app.py
+fi

agent-redact/pi-agent/sync-manifest.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+# Paths copied from the monorepo root into the flattened Pi agent HF Space repo.
+agent-redact/requirements_pi_agent.txt
+agent-redact/pi
+agent-redact/pi-agent/entrypoint.sh
+agent-redact/pi-agent/entrypoint-ecs.sh
+skills
+tools
+config/pi_agent.env.example
+intros/pi_intro.txt
+AGENTS.md
+doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
+doc_redaction/example_data/graduate-job-example-cover-letter.pdf

agent-redact/pi-agent/sync_to_space.sh ADDED Viewed

	@@ -0,0 +1,42 @@

+#!/usr/bin/env bash
+# Flatten monorepo paths into a temp directory for the Pi agent HF Space repo.
+# Usage (from repo root):
+#   agent-redact/pi-agent/sync_to_space.sh /path/to/output-dir
+#
+# WARNING: the output directory is deleted and recreated on every run. The script refuses
+# to run when the output directory is the filesystem root, the monorepo root, or a parent
+# of the monorepo, because wiping it would delete the repository itself.
+# Exits non-zero when a manifest entry is missing or a copied PDF is a Git LFS pointer.
+set -euo pipefail
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+OUT="${1:?Output directory required}"
+MANIFEST="$(dirname "$0")/sync-manifest.txt"
+# Succeeds when $1 is a regular file whose first line is the Git LFS pointer header.
+_is_lfs_pointer() {
+  [[ -f "$1" ]] && head -1 "$1" 2>/dev/null | grep -q "^version https://git-lfs.github.com/spec/v1"
+}
+# Safety guard: resolve both paths physically (symlinks followed) before anything is removed.
+# Creating OUT first lets `cd` resolve it; in every refused case the directory already exists.
+mkdir -p "$OUT"
+OUT_ABS="$(cd "$OUT" && pwd -P)"
+ROOT_ABS="$(cd "$ROOT" && pwd -P)"
+if [[ "$OUT_ABS" == "/" || "$ROOT_ABS/" == "$OUT_ABS/"* ]]; then
+  echo "Refusing to wipe output directory '$OUT' ($OUT_ABS): it is the filesystem root or contains the repository ($ROOT_ABS)." >&2
+  exit 1
+fi
+# Start from an empty tree so files removed from the manifest do not linger in the Space.
+rm -rf "$OUT"
+mkdir -p "$OUT"
+# Space-level files live next to this script and are placed at the top of the output tree.
+cp "$(dirname "$0")/Dockerfile" "$OUT/Dockerfile"
+cp "$(dirname "$0")/README.md" "$OUT/README.md"
+cp "$(dirname "$0")/.dockerignore" "$OUT/.dockerignore"
+cp "$(dirname "$0")/.gitattributes" "$OUT/.gitattributes"
+# One manifest entry per line; `|| [[ -n "$line" ]]` also processes a final line that has
+# no trailing newline.
+while IFS= read -r line || [[ -n "$line" ]]; do
+  # Drop everything from the first '#' (comments), then trim surrounding whitespace.
+  line="${line%%#*}"
+  line="$(echo "$line" | xargs)"
+  [[ -z "$line" ]] && continue
+  src="$ROOT/$line"
+  if [[ ! -e "$src" ]]; then
+    echo "Missing: $src" >&2
+    exit 1
+  fi
+  # Preserve the monorepo-relative path inside the output tree.
+  dest="$OUT/$line"
+  mkdir -p "$(dirname "$dest")"
+  cp -a "$src" "$dest"
+  # Only manifest entries that name a .pdf file directly are checked for LFS pointers.
+  if [[ "$line" == *.pdf ]] && _is_lfs_pointer "$dest"; then
+    echo "Copied file is a Git LFS pointer, not a PDF: $line" >&2
+    echo "Run 'git lfs pull' in the monorepo before syncing." >&2
+    exit 1
+  fi
+done < "$MANIFEST"
+echo "Flattened Pi agent Space tree: $OUT"

agent-redact/pi/agent/README.md ADDED Viewed

	@@ -0,0 +1,194 @@

+# Pi agent config (Docker)
+Runtime Pi config is **generated at container start** by [`agent-redact/pi/pi_agent_config.py`](../pi_agent_config.py) into `~/.pi/agent/models.json` and `~/.pi/agent/settings.json`.
+Files in this folder (`settings.json`, `models.json`) are **templates/references** only — they are no longer bind-mounted into the container.
+## LLM backends (Pi orchestration)
+The Pi agent (chat + redaction orchestration) can use:
+| Provider key | Label | Pi API | Auth |
+|--------------|-------|--------|------|
+| `llama-cpp` | Local (llama-cpp) | `openai-completions` | None (local llama-inference) |
+| `google-gemini` | Gemini | `google-generative-ai` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+| `amazon-bedrock` | AWS Bedrock | `bedrock-converse-stream` | AWS SDK credentials (`AWS_ACCESS_KEY_ID`, etc.) |
+This is separate from doc_redaction **Pass 2 VLM** (`{VLM_BASE_URL}` in redaction prompts), which still targets local llama-inference by default.
+### Environment variables
+Copy [`config/pi_agent.env.example`](../../../config/pi_agent.env.example) to `config/pi_agent.env` (gitignored) or set on the host before `docker compose up`:
+| Variable | Purpose |
+|----------|---------|
+| `PI_DEFAULT_PROVIDER` | `llama-cpp` \| `google-gemini` \| `amazon-bedrock` |
+| `PI_DEFAULT_MODEL` | Model id within provider |
+| `PI_LLAMA_BASE_URL` | Local OpenAI-compatible URL (default `http://llama-inference:8080/v1`) |
+| `PI_LLAMA_MODEL_ID` | Local model id |
+| `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Gemini API key |
+| `AWS_REGION` / `AWS_DEFAULT_REGION` | Bedrock region |
+| `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN` | Bedrock credentials (when not using SSO) |
+| `AWS_PROFILE` | Named profile for SSO / shared credentials file (**required for Pi Bedrock with SSO**) |
+| `PI_AWS_PROFILE` | Alternative to `AWS_PROFILE`; also used to auto-select profile when only `~/.aws` is mounted |
+| `RUN_AWS_FUNCTIONS` | When `True`, use the AWS default credential chain (SSO, profile, role) |
+| `PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS` | When `True` with `RUN_AWS_FUNCTIONS`, prefer SSO/chain over static env keys (default `True`, same as main app) |
+| `PI_MAX_PAGES` | Maximum PDF pages allowed per redaction upload (falls back to `MAX_PAGES` / `MAX_DOC_PAGES`, default `3000`) |
+| `PI_MAX_RETRIES` | Gemini quota / rate-limit retries for Pi auto-retry and Gradio backoff (default `5`; alias `PI_QUOTA_RETRY_ATTEMPTS`) |
+| `PI_QUOTA_RETRY_DELAY_S` | Seconds between Gradio quota retries (default `60`) |
+| `PI_COMPACTION_ENABLED` | Pi session auto-compaction in `settings.json` (`true` / `false`; unset uses template default, enabled) |
+| `PI_COMPACTION_RESERVE_TOKENS` | Optional compaction `reserveTokens` (default `32768` from template) |
+| `PI_COMPACTION_KEEP_RECENT_TOKENS` | Optional compaction `keepRecentTokens` (default `20000` from template) |
+### Usage logging (CSV / DynamoDB / S3)
+Each completed Pi agent run (chat message or redaction task) writes **one row** to the **same usage log schema** as the main redaction app (`USAGE_LOG_FILE_NAME`, `USAGE_LOGS_FOLDER`, `S3_USAGE_LOGS_FOLDER`, `USAGE_LOG_DYNAMODB_TABLE_NAME`). Key fields:
+| Log column | Pi agent value |
+|------------|----------------|
+| `task` | `agent` |
+| `llm_model_name` | Pi provider/model (e.g. `amazon-bedrock/anthropic.claude-sonnet-4-6`) |
+| `text_extraction_method` / `pii_detection_method` | From redaction task settings when applicable |
+| `actual_time_taken_number` | Wall-clock seconds for the Pi RPC turn |
+| `total_page_count` | Pages in scope for PDF redaction tasks |
+| `llm_total_input_tokens` / `llm_total_output_tokens` | Pi orchestration LLM usage for that turn (from Pi `get_session_stats` delta, or assistant `usage` in session JSONL). Includes cache read/write in the input column. **VLM/tokens from doc_redaction Pass 1 are not included** (those stay on the main app usage log when you run redaction there directly). |
+Toggle with `SAVE_LOGS_TO_CSV`, `SAVE_LOGS_TO_DYNAMODB`, and `RUN_AWS_FUNCTIONS` (required for S3 log upload). Access logs on session load use the main app access log paths separately.
+At startup, if only `GOOGLE_API_KEY` is set, it is mirrored to `GEMINI_API_KEY` for Pi.
+### Gradio UI
+Open **http://localhost:7862** → **Agent backend** accordion:
+- Select provider and model
+- Optionally enter Gemini / AWS credentials (**session-only** — not written to disk)
+- Click **Apply backend** — regenerates config, restarts the Pi RPC subprocess, and starts a new session
+Credential fields are cleared after apply.
+## Local model id
+After the llama.cpp service is healthy, confirm the model id:
+```bash
+curl http://localhost:8000/v1/models
+```
+If the returned `id` differs from `unsloth/Qwen3.6-27B-MTP-GGUF`, set `PI_LLAMA_MODEL_ID` in `config/pi_agent.env` or compose environment and restart `pi-agent`.
+### llama.cpp / llama-swap and back-to-back redaction tasks
+If the **first** redaction task succeeds but a **second** task in the same browser session kills the llama server (`Killed`, `saving prompt with length 69804`, `proxy error: EOF`, `502`):
+1. **Oversized Pi session** — the orchestration agent kept the full first run (tool logs, bash output) in context (~70k tokens). The Gradio UI **restarts the Pi RPC process** and **clears the chat panel** on **page reload** and before each **Start redaction task** (same behaviour). Workspace files are unchanged. Use **New session** before a follow-up **chat** turn if you still hit context limits.
+2. **llama.cpp OOM** — a second task that reuses the first run’s context can try to allocate multi‑GiB KV state (`total state size = 3322 MiB` in logs) and be killed by the OS. A clean Pi process keeps the orchestration prompt small.
+3. **llama-swap GPU monitor** — on newer NVIDIA drivers, older llama-swap builds fail `nvidia-smi -loop` and can log `failed reading from gpuCh`. Upgrade to [llama-swap v213+](https://github.com/mostlygeek/llama-swap) (or disable performance monitoring in your swap config).
+4. **Concurrent load** — Pi orchestration and doc_redaction VLM may share one llama endpoint; `--parallel 1` allows only one generation. Wait until the first task shows **Agent finished** before starting another.
+For Gemma 4 31B, `pi-agent-gemma-31b` sets lower compaction defaults (`PI_COMPACTION_RESERVE_TOKENS=16384`) to match `PI_LLAMA_CONTEXT_WINDOW=65536`.
+## In-container URLs for task prompts
+When filling [`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](../../../skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md) inside the Pi container, use:
+| Placeholder | In-container value |
+|-------------|-------------------|
+| `{GRADIO_URL}` | `http://redaction-app-llama:7860` |
+| `{VLM_BASE_URL}` | `http://llama-inference:8080` |
+| `{INPUT_PATH}` | `/home/user/app/workspace/{session_hash}/{FILE_NAME}` (when `PI_SESSION_WORKSPACE=true`) |
+| `{OUTPUT_BASE}` | `/home/user/app/workspace/{session_hash}/redact/{FILE_NAME}/` |
+Host-side examples (`host.docker.internal`, `localhost:7861`) do not apply inside the compose network.
+## Usage
+Start the stack (27B profile):
+```powershell
+docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d --build
+```
+Interactive Pi TUI:
+```powershell
+docker compose -f docker-compose_llama_agentic.yml exec -it pi-agent pi
+```
+Gradio chat UI (browser):
+Open **http://localhost:7862**. Use the **Redaction task** panel to upload a document, enter bullet-point requirements, and click **Start redaction task**. Pi receives the filled prompt from [`skills/Example prompt partnership.txt`](../../../skills/Example%20prompt%20partnership.txt) (file copied to `/home/user/app/workspace/`). The full prompt appears in the chat; Pi’s reply streams in the chat panel.
+The UI also shows:
+- **Agent backend** — switch between local, Gemini, and Bedrock
+- **Chat** — streamed assistant text
+- **Activity** — agent/turn lifecycle, compaction, auto-retry, tool start/end
+- **Tool output** — live bash/read output from `tool_execution_update` / `tool_execution_end`
+- **Thinking** — optional stream (`PI_GRADIO_SHOW_THINKING=true`)
+- **Abort** — sends Pi RPC `abort` and cancels the in-flight Gradio handler
+- **Workspace output files** — browse and download redaction artifacts
+Optional env vars on `pi-agent`: `PI_GRADIO_SHOW_THINKING`, `PI_GRADIO_SHOW_TOOL_OUTPUT`, `PI_GRADIO_TOOL_OUTPUT_MAX`, `PI_GRADIO_ACTIVITY_MAX_LINES`.
+When a Pi run completes, the chat shows an **Agent finished** (or **Agent stopped**) line, a Gradio info toast appears, and the browser tab title flashes for ~15 seconds. Desktop notifications are shown when the browser has granted notification permission (requested on first click/keypress in the Pi UI).
+Run the UI locally (outside Docker):
+```powershell
+cd agent-redact/pi
+pip install -r ../requirements_pi_agent.txt
+# Pi orchestration subprocess (required for Apply backend / chat):
+npm install -g @earendil-works/pi-coding-agent
+python pi_agent_config.py
+python gradio_app.py
+```
+**Apply backend** starts `pi --mode rpc`. If you see `FileNotFoundError` / “Pi CLI not found”, install Node.js, run the `npm install` line above, and ensure `pi` (or `pi.cmd` on Windows) is on `PATH`. Optional: `PI_EXECUTABLE=C:\Users\you\AppData\Roaming\npm\pi.cmd` in `config/pi_agent.env`.
+RPC mode (automation, no Gradio):
+```powershell
+docker compose -f docker-compose_llama_agentic.yml exec -T pi-agent pi --mode rpc
+```
+Skills are synced from the repo `skills/` tree into **`{PI_WORKSPACE_DIR}/.pi/skills/`** on startup (read-only). Pi runs with `cwd` in the user’s session subfolder and `--no-skills` so it does not load skills from the git checkout. Use `/skill:doc-redaction-app` etc. Set `PI_SKILLS_RESYNC=true` to refresh copies from the repo.
+Sessions persist in the **`pi-agent-sessions`** Docker volume at **`~/.pi/agent/sessions/`** (Pi’s default session location inside the container). Override with `PI_SESSION_DIR` if needed.
+On **HF Space** (`PI_DEPLOYMENT_PROFILE=hf-space`), sessions go to **`/tmp/pi-sessions`** instead (ephemeral; lost on restart).
+## Python dependencies
+The Pi image installs [`requirements_pi_agent.txt`](../../requirements_pi_agent.txt) — Gradio UI + `gradio-client`, HTTP clients, CSV/PDF review helpers (`pandas`, `pymupdf`), and common utilities. It **does not** include spaCy, Presidio, or OCR; heavy redaction runs in `redaction-app-llama`.
+Rebuild after changing that file:
+```powershell
+docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 build pi-agent
+```
+## HF Space profile (remote redaction backend)
+Set `PI_DEPLOYMENT_PROFILE=hf-space` to run the Pi Gradio UI as a **Hugging Face Docker Space** that orchestrates with **Gemini only** and calls a **remote** doc_redaction Space over HTTPS.
+| Area | HF Space value |
+|------|----------------|
+| Pi LLM | Gemini only (`PI_DEFAULT_PROVIDER=google-gemini`) |
+| Redaction app | `DOC_REDACTION_GRADIO_URL` (default `https://seanpedrickcase-document-redaction.hf.space`) |
+| Auth to redaction | `HF_TOKEN` / `DOC_REDACTION_HF_TOKEN` (Space secret + optional UI override) |
+| Text extraction / PII | Locked to `Local model - selectable text` + `Local` |
+| VLM faces / signatures | Disabled |
+| Port | `7860` |
+| Pi session logs | `/tmp/pi-sessions` (`PI_SESSION_DIR`; ephemeral) |
+Package and Dockerfile: [`agent-redact/pi-agent/`](../../pi-agent/). Pushes to [agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) on **`dev`** branch via [`.github/workflows/sync-pi-agent-space.yml`](../../../.github/workflows/sync-pi-agent-space.yml) (GitHub secrets: `HF_TOKEN`, `HF_USERNAME`, `HF_EMAIL`).
+Local build test from monorepo root:
+```powershell
+docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-hf-space .
+docker run --rm -p 7860:7860 -e GEMINI_API_KEY=... -e HF_TOKEN=... pi-agent-hf-space
+```
+Pi uses `gradio_client` + `agent-redact/pi/remote_redaction.py` to upload/download from the remote Space; prompts include `{REMOTE_BACKEND_GUIDANCE}` (see [`redaction_prompt.py`](../redaction_prompt.py)).

agent-redact/pi/agent/models.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "providers": {
+    "llama-cpp": {
+      "baseUrl": "http://llama-inference:8080/v1",
+      "api": "openai-completions",
+      "apiKey": "llama-cpp",
+      "compat": {
+        "supportsDeveloperRole": false,
+        "supportsReasoningEffort": false,
+        "supportsUsageInStreaming": false,
+        "maxTokensField": "max_tokens"
+      },
+      "models": [
+        {
+          "id": "unsloth/Qwen3.6-27B-MTP-GGUF",
+          "name": "Qwen 3.6 27B (local)",
+          "reasoning": false,
+          "input": ["text", "image"],
+          "contextWindow": 114688,
+          "maxTokens": 32768,
+          "cost": {
+            "input": 0,
+            "output": 0,
+            "cacheRead": 0,
+            "cacheWrite": 0
+          }
+        }
+      ]
+    }
+  }
+}

agent-redact/pi/agent/settings.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "defaultProvider": "llama-cpp",
+  "defaultModel": "unsloth/Qwen3.6-27B-MTP-GGUF",
+  "defaultThinkingLevel": "off",
+  "hideThinkingBlock": true,
+  "compaction": {
+    "enabled": true,
+    "reserveTokens": 32768,
+    "keepRecentTokens": 20000
+  },
+  "branchSummary": {
+    "skipPrompt": true,
+    "reserveTokens": 32768
+  },
+  "retry": {
+    "enabled": true,
+    "maxRetries": 5,
+    "baseDelayMs": 2000,
+    "provider": {
+      "timeoutMs": 3600000,
+      "maxRetries": 5,
+      "maxRetryDelayMs": 60000
+    }
+  },
+  "enableSkillCommands": true,
+  "sessionDir": "sessions",
+  "steeringMode": "one-at-a-time",
+  "followUpMode": "one-at-a-time",
+  "terminal": {
+    "showTerminalProgress": false
+  }
+}

agent-redact/pi/bootstrap_pi_config.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""Pi agent process bootstrap (env file + workspace) before ``tools.config`` import."""
+from __future__ import annotations
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+# In-container default locations; the resolvers below pick them only when
+# container detection succeeds.
+_DOCKER_WORKSPACE = Path("/home/user/app/workspace")
+_DOCKER_UPLOAD_ROOT = Path("/tmp/gradio")
+_DOCKER_PI_WORKDIR = Path("/workspace/doc_redaction")
+# CSV log dirs must not live under read-only PI_WORKDIR (ECS/HF runtime images).
+_DOCKER_ACCESS_LOGS = Path("/tmp/pi-logs")
+_DOCKER_USAGE_LOGS = Path("/tmp/pi-usage")
+_DOCKER_FEEDBACK_LOGS = Path("/tmp/pi-feedback")
+# Repo-relative file whose presence is used to recognise a valid PI_WORKDIR.
+_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
+def _pi_running_in_container() -> bool:
+    """
+    True when the Pi process is inside Docker / HF Space, not local Windows dev.
+    Avoids treating ``C:\\home\\user\\app\\workspace`` (created by mistake on Windows)
+    as the compose mount.
+    """
+    if Path("/.dockerenv").is_file():
+        return True
+    return _DOCKER_PI_WORKDIR.is_dir() and _partnership_template_exists(
+        _DOCKER_PI_WORKDIR
+    )
+def ensure_pi_workspace_dir(repo_root: Path | None = None) -> str:
+    """
+    Resolve ``PI_WORKSPACE_DIR``, create it, and sync ``os.environ``.
+    - Explicit ``PI_WORKSPACE_DIR`` wins.
+    - Else use the Docker mount only when running in a container.
+    - Else ``{repo_root}/workspace`` (local Windows/macOS/Linux dev).
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
+    if raw:
+        path = Path(raw)
+    elif _pi_running_in_container() and _DOCKER_WORKSPACE.is_dir():
+        path = _DOCKER_WORKSPACE
+    else:
+        path = root / "workspace"
+    path.mkdir(parents=True, exist_ok=True)
+    resolved = str(path.resolve())
+    os.environ["PI_WORKSPACE_DIR"] = resolved
+    return resolved
+def _pi_runtime_needs_tmp_log_dirs() -> bool:
+    """True when CSV logs must not live under read-only ``PI_WORKDIR`` (ECS/HF images)."""
+    profile = os.environ.get("PI_DEPLOYMENT_PROFILE", "").strip().lower()
+    if profile in ("aws-ecs", "hf-space"):
+        return True
+    return _pi_running_in_container()
+def ensure_pi_writable_log_dirs() -> None:
+    """
+    Point access/usage/feedback CSV logs at ``/tmp`` when running in Docker/ECS.
+    ``tools.config`` resolves relative ``logs/`` under ``PI_WORKDIR``, which is
+    read-only in the Pi runtime image; ``/tmp`` is allowed by
+    ``ensure_folder_within_app_directory`` for absolute paths.
+    For ``aws-ecs`` / ``hf-space``, always override (S3/task env files often set
+    ``logs/`` from the main app template).
+    """
+    if not _pi_running_in_container():
+        return
+    for path in (_DOCKER_ACCESS_LOGS, _DOCKER_USAGE_LOGS, _DOCKER_FEEDBACK_LOGS):
+        path.mkdir(parents=True, exist_ok=True)
+    access = _DOCKER_ACCESS_LOGS.as_posix() + "/"
+    usage = _DOCKER_USAGE_LOGS.as_posix() + "/"
+    feedback = _DOCKER_FEEDBACK_LOGS.as_posix() + "/"
+    if _pi_runtime_needs_tmp_log_dirs():
+        os.environ["ACCESS_LOGS_FOLDER"] = access
+        os.environ["USAGE_LOGS_FOLDER"] = usage
+        os.environ["FEEDBACK_LOGS_FOLDER"] = feedback
+    else:
+        os.environ.setdefault("ACCESS_LOGS_FOLDER", access)
+        os.environ.setdefault("USAGE_LOGS_FOLDER", usage)
+        os.environ.setdefault("FEEDBACK_LOGS_FOLDER", feedback)
+def ensure_pi_upload_root(repo_root: Path | None = None) -> str:
+    """
+    Resolve where Gradio stores ``gr.File`` uploads and sync ``os.environ``.
+    Must run before ``import gradio`` so ``GRADIO_TEMP_DIR`` matches validation
+    in ``redaction_prompt._resolve_and_validate_upload_path``.
+    - Explicit ``PI_UPLOAD_ROOT`` wins.
+    - Else ``GRADIO_TEMP_DIR`` if already set.
+    - Else Docker ``/tmp/gradio`` when that directory exists.
+    - Else ``{repo}/workspace/.gradio_uploads`` (local dev; stays inside the app tree
+      so ``tools.config.ensure_folder_within_app_directory`` accepts ``GRADIO_TEMP_DIR``).
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
+    if raw:
+        path = Path(raw)
+    else:
+        gradio_temp = (os.environ.get("GRADIO_TEMP_DIR") or "").strip()
+        if gradio_temp:
+            path = Path(gradio_temp)
+        elif _pi_running_in_container() and _DOCKER_UPLOAD_ROOT.is_dir():
+            path = _DOCKER_UPLOAD_ROOT
+        else:
+            path = root / "workspace" / ".gradio_uploads"
+    path.mkdir(parents=True, exist_ok=True)
+    resolved = str(path.resolve())
+    os.environ["PI_UPLOAD_ROOT"] = resolved
+    if not (os.environ.get("GRADIO_TEMP_DIR") or "").strip():
+        os.environ["GRADIO_TEMP_DIR"] = resolved
+    return resolved
+def _partnership_template_exists(repo: Path) -> bool:
+    return (repo / _PARTNERSHIP_TEMPLATE).is_file()
+def ensure_pi_workdir(repo_root: Path | None = None) -> str:
+    """
+    Resolve ``PI_WORKDIR`` (monorepo root for skills/ and Pi RPC cwd).
+    - Explicit ``PI_WORKDIR`` wins when the partnership prompt template exists there.
+    - Else use the checkout root (``agent-redact/pi`` → parents[2]).
+    - Docker images set ``PI_WORKDIR=/workspace/doc_redaction`` via env or ``start.sh``.
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    raw = (os.environ.get("PI_WORKDIR") or "").strip()
+    if raw:
+        candidate = Path(raw)
+        if _partnership_template_exists(candidate):
+            resolved = str(candidate.resolve())
+            os.environ["PI_WORKDIR"] = resolved
+            return resolved
+    if _pi_running_in_container() and _partnership_template_exists(_DOCKER_PI_WORKDIR):
+        resolved = str(_DOCKER_PI_WORKDIR.resolve())
+        os.environ["PI_WORKDIR"] = resolved
+        return resolved
+    resolved = str(root)
+    os.environ["PI_WORKDIR"] = resolved
+    return resolved
+def pi_repo_root_path(repo_root: Path | None = None) -> Path:
+    """Return ``PI_WORKDIR`` as a :class:`~pathlib.Path` (calls :func:`ensure_pi_workdir`)."""
+    return Path(ensure_pi_workdir(repo_root))
+def load_pi_agent_env_file(config_path: str | Path | None = None) -> bool:
+    """
+    Load ``config/pi_agent.env`` into ``os.environ`` (does not override existing vars).
+    Must run before ``import pi_agent_config`` so module-level defaults see the file.
+    """
+    path = Path(config_path or os.environ.get("APP_CONFIG_PATH", "")).expanduser()
+    if not path.is_file():
+        return False
+    load_dotenv(path, override=False)
+    return True
+def ensure_pi_config_env(repo_root: Path | None = None) -> str:
+    """
+    Set process env so ``tools.config`` loads the Pi agent env file.
+    Must run before any ``from pi_agent_config import ...`` or ``tools.config`` import
+    that depends on Pi env vars. Safe to call multiple times; does not override
+    existing environment variables.
+    """
+    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
+    os.environ.setdefault("APP_TYPE", "pi")
+    if not os.environ.get("APP_CONFIG_PATH", "").strip():
+        os.environ["APP_CONFIG_PATH"] = str(root / "config" / "pi_agent.env")
+    load_pi_agent_env_file()
+    ensure_pi_workdir(root)
+    ensure_pi_workspace_dir(root)
+    ensure_pi_upload_root(root)
+    ensure_pi_writable_log_dirs()
+    from pi_workspace_skills import ensure_workspace_skills
+    ensure_workspace_skills()
+    return os.environ["APP_CONFIG_PATH"]

agent-redact/pi/gradio_app.py ADDED Viewed

The diff for this file is too large to render. See raw diff

agent-redact/pi/output_files.py ADDED Viewed

	@@ -0,0 +1,423 @@

+"""Browse and download files from the Pi agent shared workspace."""
+from __future__ import annotations
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any
+import gradio as gr
+from bootstrap_pi_config import pi_repo_root_path
+from pi_examples import gradio_example_allowed_paths
+from session_logs import gradio_session_log_allowed_paths
+from session_workspace import (
+    sanitize_session_id,
+    session_workspace_dir,
+    workspace_base_dir,
+)
+# Directory used as the FileExplorer root by the refresh stub; override with PI_FILEEXPLORER_STUB_DIR.
+REFRESH_STUB_DIR = Path(os.environ.get("PI_FILEEXPLORER_STUB_DIR", "/tmp"))
+# Folder names under ``.../review/`` where Pass 1 deliverables are saved (see partnership prompt).
+_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES = ("output_review_final", "output_final")
+# Default name of the download staging folder (overridable via PI_FINAL_DOWNLOAD_FOLDER).
+_DEFAULT_FINAL_DOWNLOAD_FOLDER = "output_final_download"
+# Default minimum length of the alphanumeric Gradio cache prefix stripped from filenames.
+_DEFAULT_GRADIO_PREFIX_MIN_LEN = 16
+def final_output_folder_names() -> frozenset[str]:
+    raw = os.environ.get("PI_FINAL_OUTPUT_FOLDER_NAMES", "").strip()
+    if raw:
+        names = {part.strip() for part in raw.split(",") if part.strip()}
+        if names:
+            return frozenset(names)
+    return frozenset(_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES)
+def _is_under_final_output_dir(relative_path: Path) -> bool:
+    parts = relative_path.parts
+    names = final_output_folder_names()
+    for index, part in enumerate(parts):
+        if part == "review" and index + 1 < len(parts):
+            if parts[index + 1] in names:
+                return True
+    return False
+def final_download_folder_name() -> str:
+    raw = os.environ.get("PI_FINAL_DOWNLOAD_FOLDER", _DEFAULT_FINAL_DOWNLOAD_FOLDER)
+    stripped = raw.strip() if raw else ""
+    return stripped or _DEFAULT_FINAL_DOWNLOAD_FOLDER
+def final_download_dir(session_hash: str | None = None) -> Path:
+    """
+    Per-session staging folder for ``gr.File`` downloads.
+    Always ``{PI_WORKSPACE_DIR}/{session_id}/output_final_download/`` when a session
+    id is known, even if the broader workspace is shared (``PI_SESSION_WORKSPACE=false``).
+    """
+    base = workspace_base_dir().resolve()
+    folder = final_download_folder_name()
+    if not session_hash or not str(session_hash).strip():
+        return base / folder
+    safe_id = sanitize_session_id(str(session_hash))
+    return base / safe_id / folder
+def _remove_path(path: Path) -> None:
+    """Best-effort delete (handles read-only / OneDrive locks on Windows)."""
+    try:
+        if path.is_dir() and not path.is_symlink():
+            shutil.rmtree(path, ignore_errors=True)
+        else:
+            path.unlink(missing_ok=True)
+    except OSError:
+        if not path.exists():
+            return
+        try:
+            os.chmod(path, 0o666)
+            if path.is_dir() and not path.is_symlink():
+                shutil.rmtree(path, ignore_errors=True)
+            else:
+                path.unlink(missing_ok=True)
+        except OSError:
+            pass
+def _reset_download_dir(download_dir: Path) -> None:
+    """Clear staged downloads without removing the directory inode (safer on Windows)."""
+    download_dir.mkdir(parents=True, exist_ok=True)
+    for child in download_dir.iterdir():
+        _remove_path(child)
+def _gradio_prefix_min_len() -> int:
+    raw = os.environ.get(
+        "PI_GRADIO_FILENAME_PREFIX_MIN_LEN",
+        str(_DEFAULT_GRADIO_PREFIX_MIN_LEN),
+    )
+    try:
+        return max(1, int(raw))
+    except ValueError:
+        return _DEFAULT_GRADIO_PREFIX_MIN_LEN
+def strip_gradio_cache_prefix(filename: str) -> str:
+    """
+    Remove a leading Gradio cache id prefix (``{alphanumeric}_{name}``).
+    Gradio client downloads often prefix filenames with a long hash so repeated
+    exports do not collide; users expect the original basename instead.
+    """
+    pattern = re.compile(rf"^[A-Za-z0-9]{{{_gradio_prefix_min_len()},}}_(.+)$")
+    match = pattern.match(filename)
+    if match:
+        return match.group(1)
+    return filename
+def _file_created_timestamp(path: Path) -> float:
+    stat = path.stat()
+    birth = getattr(stat, "st_birthtime", None)
+    if birth is not None and birth > 0:
+        return float(birth)
+    return float(stat.st_mtime)
+def _collect_raw_final_output_files(
+    session_hash: str | None = None,
+) -> list[Path] | None:
+    """
+    Collect deliverable files from ``review/output_review_final/`` (and aliases)
+    anywhere under the session workspace.
+    """
+    root = workspace_root_from(session_hash)
+    if not root.is_dir():
+        return None
+    download_folder = final_download_folder_name()
+    candidates: list[Path] = []
+    try:
+        for path in root.rglob("*"):
+            if not path.is_file() or not _is_file_path(path.name):
+                continue
+            try:
+                relative = path.relative_to(root)
+            except ValueError:
+                continue
+            if download_folder in relative.parts:
+                continue
+            if not _is_under_final_output_dir(relative):
+                continue
+            try:
+                path.resolve(strict=False).relative_to(root)
+            except ValueError:
+                continue
+            candidates.append(path)
+    except OSError:
+        return None
+    if not candidates:
+        return None
+    return candidates
+def build_final_download_files(
+    session_hash: str | None = None,
+) -> list[str] | None:
+    """
+    Stage cleaned deliverables under ``{session_id}/output_final_download/``.
+    Copies files from agent final-output folders, strips Gradio cache prefixes,
+    deduplicates by basename (newest file wins), and returns paths for ``gr.File``.
+    """
+    raw_files = _collect_raw_final_output_files(session_hash)
+    if not raw_files:
+        return None
+    download_dir = final_download_dir(session_hash)
+    _reset_download_dir(download_dir)
+    ordered = sorted(raw_files, key=_file_created_timestamp)
+    latest_by_name: dict[str, Path] = {}
+    for path in ordered:
+        latest_by_name[strip_gradio_cache_prefix(path.name)] = path
+    staged: list[str] = []
+    for name in sorted(latest_by_name):
+        source = latest_by_name[name]
+        destination = download_dir / name
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(source, destination)
+        staged.append(str(destination.resolve()))
+    return staged or None
+def collect_final_output_files(
+    session_hash: str | None = None,
+) -> list[str] | None:
+    """Return deduplicated, prefix-stripped deliverables for download and S3 export."""
+    return build_final_download_files(session_hash)
+# Filename suffix identifying redacted deliverable PDFs (matched on the lower-cased name).
+_REDACTED_PDF_SUFFIX = "_redacted.pdf"
+# Substring marking review-only PDF copies, which are not preview candidates.
+_REVIEW_PDF_MARKER = "_redactions_for_review"
+# Workspace-relative folder and filename of the staged preview copy.
+_PREVIEW_DIRNAME = ".pi/preview"
+_PREVIEW_FILENAME = "latest_redacted.pdf"
+# Files smaller than this many bytes are rejected as invalid PDFs.
+_MIN_PDF_BYTES = 64
+def _is_redacted_pdf_candidate(path: Path) -> bool:
+    """True for deliverable ``*_redacted.pdf`` names (not review-only copies)."""
+    name = path.name.lower()
+    if not name.endswith(_REDACTED_PDF_SUFFIX):
+        return False
+    if _REVIEW_PDF_MARKER in name:
+        return False
+    return True
+def _is_valid_pdf_file(path: Path, *, min_bytes: int = _MIN_PDF_BYTES) -> bool:
+    """Reject empty, partial, or non-PDF files (e.g. HTML error bodies from failed downloads)."""
+    try:
+        if not path.is_file():
+            return False
+        if path.stat().st_size < min_bytes:
+            return False
+        with path.open("rb") as handle:
+            return handle.read(5).startswith(b"%PDF-")
+    except OSError:
+        return False
+def _find_newest_valid_redacted_pdf(session_hash: str | None) -> Path | None:
+    """Newest readable ``*_redacted.pdf`` under the session workspace."""
+    root = workspace_root_from(session_hash)
+    if not root.is_dir():
+        return None
+    newest: tuple[float, Path] | None = None
+    try:
+        for path in root.rglob("*"):
+            if not path.is_file() or not _is_redacted_pdf_candidate(path):
+                continue
+            if not _is_valid_pdf_file(path):
+                continue
+            try:
+                path.resolve(strict=False).relative_to(root.resolve())
+            except ValueError:
+                continue
+            timestamp = _file_created_timestamp(path)
+            if newest is None or timestamp > newest[0]:
+                newest = (timestamp, path)
+    except OSError:
+        return None
+    return newest[1] if newest else None
+def _staged_preview_pdf_path(session_hash: str | None) -> Path:
+    root = workspace_root_from(session_hash)
+    return root / ".pi" / "preview" / _PREVIEW_FILENAME
+def _stage_preview_pdf(source: Path, session_hash: str | None) -> Path:
+    """
+    Copy *source* into a stable preview path under the session workspace.
+    The Gradio PDF component reads a single file path; staging avoids serving
+    files that are still being written in ``output_redact/`` and gives a
+    consistent path under ``allowed_paths``.
+    """
+    dest = _staged_preview_pdf_path(session_hash)
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    tmp = dest.with_name(dest.name + ".tmp")
+    shutil.copy2(source, tmp)
+    tmp.replace(dest)
+    return dest.resolve()
+def latest_redacted_pdf_path(session_hash: str | None = None) -> str | None:
+    """
+    Return the newest valid ``*_redacted.pdf`` for the Gradio PDF preview.
+    Copies the chosen file to ``{session}/.pi/preview/latest_redacted.pdf`` so
+    the component always receives a complete PDF under the workspace root.
+    """
+    source = _find_newest_valid_redacted_pdf(session_hash)
+    staged = _staged_preview_pdf_path(session_hash)
+    if source is None:
+        if _is_valid_pdf_file(staged):
+            return str(staged.resolve())
+        return None
+    try:
+        if staged.is_file():
+            src_mtime = _file_created_timestamp(source)
+            staged_mtime = _file_created_timestamp(staged)
+            if (
+                src_mtime <= staged_mtime
+                and staged.stat().st_size == source.stat().st_size
+                and _is_valid_pdf_file(staged)
+            ):
+                return str(staged.resolve())
+    except OSError:
+        pass
+    return str(_stage_preview_pdf(source, session_hash))
+def workspace_root_from(session_hash: str | None = None) -> Path:
+    """Resolve the session workspace from a sanitized Gradio session hash only."""
+    if not session_hash or not str(session_hash).strip():
+        return workspace_base_dir().resolve()
+    return session_workspace_dir(str(session_hash).strip())
+def _is_file_path(path: str) -> bool:
+    if not path or not path.strip():
+        return False
+    name = Path(path.rstrip("/\\")).name
+    if not name or "." not in name:
+        return False
+    ext = name.rsplit(".", 1)[-1]
+    return bool(ext and len(ext) <= 10 and ext.isalnum())
+def _is_safe_workspace_relative_path(path: str) -> bool:
+    """Reject absolute paths and traversal segments before joining under workspace."""
+    if not path or not path.strip():
+        return False
+    candidate = Path(path.strip())
+    if candidate.is_absolute() or candidate.anchor:
+        return False
+    return all(part not in ("", ".", "..") for part in candidate.parts)
+def _resolve_under_workspace(
+    path: str,
+    *,
+    workspace_root: Path | None = None,
+) -> Path | None:
+    if not path or not path.strip():
+        return None
+    root = (workspace_root or workspace_base_dir()).resolve()
+    stripped = path.strip()
+    try:
+        user_path = Path(stripped)
+        if user_path.is_absolute():
+            # Gradio FileExplorer may return absolute paths already under root_dir.
+            resolved = user_path.resolve(strict=False)
+        elif _is_safe_workspace_relative_path(stripped):
+            resolved = root.joinpath(*user_path.parts).resolve(strict=False)
+        else:
+            return None
+        resolved.relative_to(root)
+    except (ValueError, OSError):
+        return None
+    return resolved if resolved.is_file() else None
def load_workspace_output_files(session_hash: str = ""):
    """Return a ``gr.FileExplorer`` rooted at the session workspace (created if missing)."""
    explorer_root = workspace_root_from(session_hash or None)
    explorer_root.mkdir(parents=True, exist_ok=True)
    return gr.FileExplorer(root_dir=str(explorer_root))
def refresh_workspace_output_files_stub():
    """Return a ``gr.FileExplorer`` rooted at the resolved ``REFRESH_STUB_DIR``."""
    stub_root = REFRESH_STUB_DIR.resolve()
    return gr.FileExplorer(root_dir=str(stub_root))
def gradio_allowed_paths() -> list[str]:
    """
    Build the de-duplicated list of paths Gradio may serve via ``gr.File``.

    The workspace base, the Pi repo root, the refresh stub directory and
    ``/tmp`` are resolved to absolute paths first (entries that fail to
    resolve are skipped). The example and session-log path lists are then
    appended as returned by their helpers. Order of first appearance is kept.
    """
    allowed: list[str] = []

    resolvable = (
        workspace_base_dir(),
        str(pi_repo_root_path()),
        REFRESH_STUB_DIR,
        "/tmp",
    )
    for entry in resolvable:
        try:
            absolute = str(Path(entry).resolve())
        except OSError:
            continue
        if absolute not in allowed:
            allowed.append(absolute)

    # These helpers already return ready-to-use path strings.
    for list_extra_paths in (
        gradio_example_allowed_paths,
        gradio_session_log_allowed_paths,
    ):
        for extra in list_extra_paths():
            if extra not in allowed:
                allowed.append(extra)

    return allowed
def refresh_workspace_panel(
    session_hash: str = "",
) -> tuple[Any, list[str] | None]:
    """Return ``(file explorer, final output files)`` for the given session."""
    explorer = load_workspace_output_files(session_hash)
    deliverables = collect_final_output_files(session_hash)
    return explorer, deliverables
def workspace_files_download_fn(
    selected: list[str] | None,
    session_hash: str = "",
) -> list[str] | None:
    """
    Filter a file-explorer selection down to downloadable workspace files.

    Entries that do not look like files, or that do not resolve to an
    existing file under the session workspace, are dropped. Returns ``None``
    when nothing is selected or nothing survives the filter.
    """
    if not selected:
        return None

    base = workspace_root_from(session_hash or None)
    resolved_files: list[str] = []
    for item in selected:
        if _is_file_path(item):
            hit = _resolve_under_workspace(item, workspace_root=base)
            if hit is not None:
                resolved_files.append(str(hit))

    return resolved_files if resolved_files else None

agent-redact/pi/pi_agent_config.py ADDED Viewed

	@@ -0,0 +1,857 @@

+"""Generate Pi agent models.json and settings.json at runtime."""
+from __future__ import annotations
+import json
+import os
+from pathlib import Path
+from typing import Any
# Values compared against the ``PI_DEPLOYMENT_PROFILE`` environment variable.
DEPLOYMENT_LOCAL = "local-docker"  # used as the default when the variable is unset
DEPLOYMENT_HF_SPACE = "hf-space"
DEPLOYMENT_AWS_ECS = "aws-ecs"
def resolve_agent_dir() -> Path:
    """
    Pick the directory that holds Pi's ``models.json`` / ``settings.json``.

    Precedence: a non-blank ``PI_CODING_AGENT_DIR``; then ``/tmp/pi-agent``
    for the HF Space and AWS ECS profiles; otherwise ``~/.pi/agent``.
    """
    override = (os.environ.get("PI_CODING_AGENT_DIR") or "").strip()
    if override:
        return Path(override)

    active_profile = (
        os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
    )
    # HF Space and ECS often use a read-only root FS; only mounted paths (or /tmp) are writable.
    if active_profile == DEPLOYMENT_HF_SPACE or active_profile == DEPLOYMENT_AWS_ECS:
        return Path("/tmp/pi-agent")
    return Path.home().joinpath(".pi", "agent")
# Import-time snapshots. AGENT_DIR is a back-compat alias; prefer
# resolve_agent_dir() when the environment may change after import.
AGENT_DIR = resolve_agent_dir()
# Settings template shipped in the "agent" directory next to this module.
TEMPLATE_DIR = Path(__file__).resolve().with_name("agent")
SETTINGS_TEMPLATE = TEMPLATE_DIR.joinpath("settings.json")
DEPLOYMENT_PROFILE = (
    os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
)
def pi_max_retries() -> int:
    """
    Max retries for Pi auto-retry and Gradio quota backoff.

    Reads ``PI_QUOTA_RETRY_ATTEMPTS`` first, then ``PI_MAX_RETRIES``, and
    falls back to ``5``. A variable that is unset, blank or not an integer is
    skipped, so a stray whitespace-only or malformed value neither hides the
    next variable nor aborts config generation with ``ValueError``.
    """
    for key in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
        raw = (os.environ.get(key) or "").strip()
        if not raw:
            continue
        try:
            return int(raw)
        except ValueError:
            # Malformed value: try the next variable / built-in default.
            continue
    return 5
def _apply_retry_settings(
    settings: dict[str, Any],
    *,
    provider: str,
) -> None:
    """
    Fill ``settings["retry"]`` in place.

    Gemini/Bedrock providers and the HF Space / AWS ECS profiles use long
    delays taken from the environment (``PI_QUOTA_RETRY_DELAY_S`` gives the
    default base in seconds, the default cap is 1.5x that); everything else
    uses a 2 s base delay and a 60 s cap. For Bedrock (or ECS) the
    ``PI_BEDROCK_RETRY_*`` variables are read first, with the
    ``PI_GEMINI_RETRY_*`` variables as the fallback.
    """
    retries = pi_max_retries()
    long_delays = (
        provider in (PROVIDER_GEMINI, PROVIDER_BEDROCK)
        or is_hf_space_profile()
        or is_aws_ecs_profile()
    )

    if not long_delays:
        base_delay_ms, max_delay_ms = 2000, 60000
    else:
        fallback_base_ms = int(os.environ.get("PI_QUOTA_RETRY_DELAY_S", "60")) * 1000
        fallback_max_ms = int(fallback_base_ms * 1.5)
        bedrock_like = provider == PROVIDER_BEDROCK or (
            is_aws_ecs_profile() and not is_hf_space_profile()
        )
        prefix = "PI_BEDROCK" if bedrock_like else "PI_GEMINI"
        base_delay_ms = int(
            os.environ.get(f"{prefix}_RETRY_BASE_DELAY_MS")
            or os.environ.get("PI_GEMINI_RETRY_BASE_DELAY_MS", str(fallback_base_ms))
        )
        max_delay_ms = int(
            os.environ.get(f"{prefix}_RETRY_MAX_DELAY_MS")
            or os.environ.get("PI_GEMINI_RETRY_MAX_DELAY_MS", str(fallback_max_ms))
        )

    provider_block = {
        "timeoutMs": 3600000,
        "maxRetries": retries,
        "maxRetryDelayMs": max_delay_ms,
    }
    settings["retry"] = {
        "enabled": True,
        "maxRetries": retries,
        "baseDelayMs": base_delay_ms,
        "provider": provider_block,
    }
# Provider ids as written to models.json / settings.json.
PROVIDER_LLAMA = "llama-cpp"
PROVIDER_GEMINI = "google-gemini"
PROVIDER_BEDROCK = "amazon-bedrock"

# Human-readable label for each provider id.
PROVIDER_LABELS: dict[str, str] = {
    PROVIDER_LLAMA: "Local (llama-cpp)",
    PROVIDER_GEMINI: "Gemini",
    PROVIDER_BEDROCK: "AWS Bedrock",
}
def is_hf_space_profile() -> bool:
    """True when ``PI_DEPLOYMENT_PROFILE`` (read on each call) selects the HF Space profile."""
    current = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return current.strip().lower() == DEPLOYMENT_HF_SPACE
def is_aws_ecs_profile() -> bool:
    """True when ``PI_DEPLOYMENT_PROFILE`` (read on each call) selects the AWS ECS profile."""
    current = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return current.strip().lower() == DEPLOYMENT_AWS_ECS
def uses_split_redaction_backend() -> bool:
    """
    Whether Pi and doc_redaction run in separate containers (no shared output disk).

    ``PI_REDACTION_SPLIT_BACKEND`` decides when it holds a recognised boolean
    word (``1/true/yes/on`` or ``0/false/no/off``, case-insensitive).
    Otherwise the answer is True for the HF Space and AWS ECS profiles and
    False for anything else (local-docker typically shares a host volume).
    """
    truthy = {"1", "true", "yes", "on"}
    falsy = {"0", "false", "no", "off"}
    override = (os.environ.get("PI_REDACTION_SPLIT_BACKEND") or "").strip().lower()
    if override in truthy or override in falsy:
        return override in truthy
    return is_hf_space_profile() or is_aws_ecs_profile()
def resolve_llama_base_url() -> str:
    """
    OpenAI-compatible base URL for Pi's ``llama-cpp`` provider (includes ``/v1``).

    Reads ``PI_LLAMA_BASE_URL``; also accepts legacy aliases
    ``PI_LLAMA_MODE_BASE_URL`` and ``PI_LLAMA_MODE__BASE_URL``, checked in
    that order. The first non-blank value wins; trailing slashes are removed
    and ``/v1`` is appended when missing. With none set the default is
    ``http://llama-inference:8080/v1``.
    """
    for key in (
        "PI_LLAMA_BASE_URL",
        "PI_LLAMA_MODE_BASE_URL",
        # Documented legacy alias that was previously never consulted.
        "PI_LLAMA_MODE__BASE_URL",
    ):
        raw = (os.environ.get(key) or "").strip().rstrip("/")
        if raw:
            return raw if raw.endswith("/v1") else f"{raw}/v1"
    return "http://llama-inference:8080/v1"
# llama-cpp settings captured once at import time.
LLAMA_BASE_URL = resolve_llama_base_url()
LLAMA_MODEL_ID = os.environ.get("PI_LLAMA_MODEL_ID", "unsloth/Qwen3.6-27B-MTP-GGUF")
LLAMA_CONTEXT = int(os.environ.get("PI_LLAMA_CONTEXT_WINDOW", "114688"))
LLAMA_MAX_TOKENS = int(os.environ.get("PI_LLAMA_MAX_TOKENS", "32768"))

# Static catalogs; each row is (model id, display name, context window, reasoning).
GEMINI_MODELS: tuple[tuple[str, str, int, bool], ...] = (
    ("gemini-flash-lite-latest", "Gemini Flash Lite", 1048576, False),
    ("gemini-flash-latest", "Gemini Flash", 1048576, True),
    ("gemini-pro-latest", "Gemini Pro", 1048576, True),
)
BEDROCK_MODELS: tuple[tuple[str, str, int, bool], ...] = (
    (
        "anthropic.claude-sonnet-4-6",
        "Anthropic Claude Sonnet 4.6 (Bedrock)",
        1000000,
        True,
    ),
    ("amazon.nova-pro-v1:0", "Amazon Nova Pro (Bedrock)", 300000, False),
    (
        "nvidia.nemotron-super-3-120b",
        "NVIDIA Nemotron Super 3 120B (Bedrock)",
        262000,
        False,
    ),
    ("mistral.devstral-2-123b", "Mistral Devstral 2 123B (Bedrock)", 256000, False),
)

# Model ids selectable per provider (llama-cpp exposes only the startup model id).
PROVIDER_MODELS: dict[str, list[str]] = {
    PROVIDER_LLAMA: [LLAMA_MODEL_ID],
    PROVIDER_GEMINI: [row[0] for row in GEMINI_MODELS],
    PROVIDER_BEDROCK: [row[0] for row in BEDROCK_MODELS],
}

# Built-in default model id for each provider.
DEFAULT_MODEL_BY_PROVIDER: dict[str, str] = {
    PROVIDER_LLAMA: LLAMA_MODEL_ID,
    PROVIDER_GEMINI: GEMINI_MODELS[0][0],  # Gemini Flash Lite
    PROVIDER_BEDROCK: "anthropic.claude-sonnet-4-6",
}
def get_default_provider() -> str:
    """
    Work out the default Pi provider from the current environment.

    The HF Space profile always gets Gemini. Otherwise a ``PI_DEFAULT_PROVIDER``
    value that exactly matches a known provider id is used; failing that,
    AWS ECS defaults to Bedrock and everything else to llama-cpp.
    """
    if is_hf_space_profile():
        return PROVIDER_GEMINI
    requested = (os.environ.get("PI_DEFAULT_PROVIDER") or "").strip()
    if requested in PROVIDER_MODELS:
        return requested
    return PROVIDER_BEDROCK if is_aws_ecs_profile() else PROVIDER_LLAMA


# Snapshot taken at import time; call get_default_provider() for the live value.
DEFAULT_PROVIDER = get_default_provider()
def _catalog_contains_model(model_id: str, provider: str) -> bool:
    """True when *model_id* appears in ``PROVIDER_MODELS`` for *provider* (unknown provider: False)."""
    known_ids = PROVIDER_MODELS.get(provider, ())
    return model_id in known_ids
# Import-time default model: PI_DEFAULT_MODEL is honoured when the default
# provider is llama-cpp (any id) or when the id is in that provider's catalog;
# otherwise the provider's built-in default applies.
_env_default_model = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
DEFAULT_MODEL = (
    _env_default_model
    if _env_default_model
    and (
        DEFAULT_PROVIDER == PROVIDER_LLAMA
        or _catalog_contains_model(_env_default_model, DEFAULT_PROVIDER)
    )
    else DEFAULT_MODEL_BY_PROVIDER.get(DEFAULT_PROVIDER, LLAMA_MODEL_ID)
)
def llama_model_id() -> str:
    """Current llama-cpp model id: live ``PI_LLAMA_MODEL_ID`` if non-blank, else the startup id."""
    configured = (os.environ.get("PI_LLAMA_MODEL_ID") or LLAMA_MODEL_ID).strip()
    return configured if configured else LLAMA_MODEL_ID
def resolved_default_model(provider: str, *, override: str | None = None) -> str:
    """
    Choose the default model id for *provider*.

    Resolution order:
    1. a non-blank *override* (stripped);
    2. ``PI_DEFAULT_MODEL`` — for llama-cpp only when llama-cpp is also the
       current default provider, for other providers only when the id is in
       that provider's catalog;
    3. the built-in default (``llama_model_id()`` for llama-cpp).
    """
    if override and override.strip():
        return override.strip()

    target = normalize_provider(provider)
    env_choice = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
    current_default = normalize_provider(get_default_provider())
    wants_llama = target == PROVIDER_LLAMA

    if env_choice:
        if wants_llama and current_default == PROVIDER_LLAMA:
            return env_choice
        if not wants_llama and _catalog_contains_model(env_choice, target):
            return env_choice

    if wants_llama:
        return llama_model_id()
    return DEFAULT_MODEL_BY_PROVIDER.get(target, LLAMA_MODEL_ID)
def normalize_backend_model(provider: str, model_id: str | None) -> str:
    """
    Turn a UI/backend model selection into a concrete model id.

    A missing or blank selection yields the provider default. llama-cpp
    accepts any non-empty id (llama-swap / custom OpenAI model names); other
    providers accept only ids listed by ``models_for_provider`` and fall back
    to the provider default otherwise.
    """
    key = normalize_provider(provider)
    chosen = (model_id or default_model_for_provider(key)).strip()
    if chosen and (key == PROVIDER_LLAMA or chosen in models_for_provider(key)):
        return chosen
    return default_model_for_provider(key)
+def _zero_cost() -> dict[str, int]:
+    return {"input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0}
def _model_entry(
    model_id: str,
    name: str,
    *,
    context_window: int,
    max_tokens: int,
    reasoning: bool,
    image_input: bool = True,
) -> dict[str, Any]:
    """
    Build one model record for a provider's ``models`` list.

    ``input`` is ``["text", "image"]`` unless *image_input* is false, in
    which case it is ``["text"]``. Cost fields are all zero.
    """
    entry: dict[str, Any] = {"id": model_id, "name": name, "reasoning": reasoning}
    entry["input"] = ["text", "image"] if image_input else ["text"]
    entry["contextWindow"] = context_window
    entry["maxTokens"] = max_tokens
    entry["cost"] = _zero_cost()
    return entry
def _llama_provider() -> dict[str, Any]:
    """
    Build the ``llama-cpp`` provider block for ``models.json``.

    It targets ``LLAMA_BASE_URL`` through the ``openai-completions`` API and
    lists a single model: the currently active llama model id.
    """
    active_id = llama_model_id()
    compat_flags = {
        "supportsDeveloperRole": False,
        "supportsReasoningEffort": False,
        "supportsUsageInStreaming": False,
        "maxTokensField": "max_tokens",
    }
    local_model = _model_entry(
        active_id,
        f"Local ({active_id})",
        context_window=LLAMA_CONTEXT,
        max_tokens=LLAMA_MAX_TOKENS,
        reasoning=False,
    )
    return {
        "baseUrl": LLAMA_BASE_URL,
        "api": "openai-completions",
        "apiKey": "llama-cpp",
        "compat": compat_flags,
        "models": [local_model],
    }
def _gemini_provider() -> dict[str, Any]:
    """Build the Gemini provider block for ``models.json`` from ``GEMINI_MODELS`` (max 8192 output tokens each)."""
    catalog = []
    for model_id, name, ctx, reasoning in GEMINI_MODELS:
        catalog.append(
            _model_entry(
                model_id, name, context_window=ctx, max_tokens=8192, reasoning=reasoning
            )
        )
    return {
        "baseUrl": "https://generativelanguage.googleapis.com/v1beta",
        "api": "google-generative-ai",
        "apiKey": "GEMINI_API_KEY",
        "models": catalog,
    }
+def _bedrock_region() -> str:
+    return (
+        os.environ.get("AWS_REGION")
+        or os.environ.get("AWS_DEFAULT_REGION")
+        or "eu-west-2"
+    )
+_AWS_CREDENTIAL_ENV_KEYS: tuple[str, ...] = (
+    "AWS_ACCESS_KEY_ID",
+    "AWS_SECRET_ACCESS_KEY",
+    "AWS_SESSION_TOKEN",
+    "AWS_ACCESS_KEY",
+    "AWS_SECRET_KEY",
+)
+_AWS_PROFILE_ENV_KEYS: tuple[str, ...] = ("AWS_PROFILE", "PI_AWS_PROFILE")
+def _env_flag(name: str, *, default: bool = False) -> bool:
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in {"1", "true", "yes", "on"}
+def _strip_empty_env_vars(names: tuple[str, ...]) -> None:
+    for name in names:
+        if not (os.environ.get(name) or "").strip():
+            os.environ.pop(name, None)
+def _mirror_legacy_aws_key_env_vars() -> None:
+    if not (os.environ.get("AWS_ACCESS_KEY_ID") or "").strip():
+        legacy = (os.environ.get("AWS_ACCESS_KEY") or "").strip()
+        if legacy:
+            os.environ["AWS_ACCESS_KEY_ID"] = legacy
+    if not (os.environ.get("AWS_SECRET_ACCESS_KEY") or "").strip():
+        legacy = (os.environ.get("AWS_SECRET_KEY") or "").strip()
+        if legacy:
+            os.environ["AWS_SECRET_ACCESS_KEY"] = legacy
+def _has_explicit_aws_access_keys() -> bool:
+    access = (
+        os.environ.get("AWS_ACCESS_KEY_ID") or os.environ.get("AWS_ACCESS_KEY") or ""
+    ).strip()
+    secret = (
+        os.environ.get("AWS_SECRET_ACCESS_KEY")
+        or os.environ.get("AWS_SECRET_KEY")
+        or ""
+    ).strip()
+    return bool(access and secret)
+def _aws_config_path() -> Path | None:
+    explicit = (os.environ.get("AWS_CONFIG_FILE") or "").strip()
+    if explicit:
+        path = Path(explicit).expanduser()
+        return path if path.is_file() else None
+    home = Path(os.environ.get("HOME", "/home/user"))
+    path = home / ".aws" / "config"
+    return path if path.is_file() else None
+def _discover_aws_profile_from_config() -> str | None:
+    """Return an AWS profile name for Pi/Bedrock when only ~/.aws is mounted."""
+    explicit = (os.environ.get("PI_AWS_PROFILE") or "").strip()
+    if not explicit:
+        explicit = (os.environ.get("AWS_PROFILE") or "").strip()
+    if explicit:
+        return explicit
+    path = _aws_config_path()
+    if not path:
+        return None
+    current_profile: str | None = None
+    sso_profiles: list[str] = []
+    all_profiles: list[str] = []
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or line.startswith(";"):
+            continue
+        if line == "[default]":
+            current_profile = "default"
+            all_profiles.append("default")
+            continue
+        if line.startswith("[profile ") and line.endswith("]"):
+            current_profile = line[len("[profile ") : -1].strip()
+            if current_profile:
+                all_profiles.append(current_profile)
+            continue
+        if current_profile and line.startswith("sso_session"):
+            sso_profiles.append(current_profile)
+    if sso_profiles:
+        return sso_profiles[0]
+    if "default" in all_profiles:
+        return "default"
+    return all_profiles[0] if all_profiles else None
def _region_from_aws_config(profile: str | None = None) -> str | None:
    """
    Look up the region configured for a profile in the AWS config file.

    *profile* defaults to the profile picked by
    ``_discover_aws_profile_from_config``. Returns the first non-empty value
    of a line starting with ``region`` inside that profile's section, or
    ``None`` when the file, the profile or the setting is missing.
    """
    config_file = _aws_config_path()
    if not config_file:
        return None
    wanted = (profile or _discover_aws_profile_from_config() or "").strip()
    if not wanted:
        return None

    section: str | None = None
    for raw_line in config_file.read_text(encoding="utf-8").splitlines():
        text = raw_line.strip()
        if not text or text.startswith(("#", ";")):
            continue
        if text == "[default]":
            section = "default"
        elif text.startswith("[profile ") and text.endswith("]"):
            section = text[len("[profile ") : -1].strip()
        elif section == wanted and text.startswith("region"):
            found = text.partition("=")[2].strip()
            if found:
                return found
    return None
def _ensure_aws_region_env() -> None:
    """
    Guarantee ``AWS_REGION`` and ``AWS_DEFAULT_REGION`` hold the same non-empty region.

    Source order: existing env values, then the ``AWS_PROFILE`` (or
    discovered) profile's region from the AWS config file, then
    ``_bedrock_region()`` (which ends at ``eu-west-2``).
    """
    _strip_empty_env_vars(("AWS_REGION", "AWS_DEFAULT_REGION"))
    chosen = (
        os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or ""
    ).strip()
    if not chosen:
        active_profile = (os.environ.get("AWS_PROFILE") or "").strip()
        chosen = (_region_from_aws_config(active_profile) or "").strip()
    if not chosen:
        chosen = _bedrock_region()
    for key in ("AWS_REGION", "AWS_DEFAULT_REGION"):
        os.environ[key] = chosen
+def _pi_bedrock_auth_visible() -> bool:
+    """True when Pi's amazon-bedrock provider would detect configured auth."""
+    if (os.environ.get("AWS_PROFILE") or "").strip():
+        return True
+    if _has_explicit_aws_access_keys():
+        return True
+    if (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip():
+        return True
+    return False
def _ensure_pi_bedrock_auth_env() -> None:
    """
    Expose a config-file profile through ``AWS_PROFILE`` when no auth is visible.

    Pi checks env vars (not ``~/.aws`` alone) before Bedrock is usable, so when
    credentials live only in a mounted ``~/.aws`` tree the discovered profile
    name is exported. Nothing is changed if auth is already visible or no
    profile can be discovered.
    """
    if not _pi_bedrock_auth_visible():
        discovered = _discover_aws_profile_from_config()
        if discovered:
            os.environ["AWS_PROFILE"] = discovered
def configure_aws_credentials(
    *,
    session_access_key_id: str | None = None,
    session_secret_access_key: str | None = None,
    session_session_token: str | None = None,
) -> None:
    """
    Align Pi Bedrock AWS env with doc_redaction SSO/key priority.

    Mirrors ``tools/file_redaction.py``: when ``RUN_AWS_FUNCTIONS`` is enabled,
    prefer the default credential chain (SSO profile, instance role, etc.) over
    static env keys when ``PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS`` is true.
    When that flag is false, static env keys are left in place.
    Explicit UI session keys from **Apply backend** always win.

    Steps, all applied to ``os.environ``:

    1. Blank credential/profile variables are removed and legacy key names are
       mirrored onto the standard ones.
    2. If both session key arguments are non-blank they are exported (the
       session token is exported or cleared), the region is ensured, and the
       function returns.
    3. Otherwise, with ``RUN_AWS_FUNCTIONS`` on, static keys are dropped only
       when SSO is prioritised, and a config-file profile is exported if no
       auth is visible.
    4. ``PI_AWS_PROFILE`` is copied to ``AWS_PROFILE`` when the latter is
       unset, and the region is ensured.
    """
    _strip_empty_env_vars(_AWS_CREDENTIAL_ENV_KEYS)
    _strip_empty_env_vars(_AWS_PROFILE_ENV_KEYS)
    _mirror_legacy_aws_key_env_vars()

    session_explicit = bool(
        session_access_key_id
        and session_access_key_id.strip()
        and session_secret_access_key
        and session_secret_access_key.strip()
    )
    if session_explicit:
        os.environ["AWS_ACCESS_KEY_ID"] = session_access_key_id.strip()
        os.environ["AWS_SECRET_ACCESS_KEY"] = session_secret_access_key.strip()
        if session_session_token and session_session_token.strip():
            os.environ["AWS_SESSION_TOKEN"] = session_session_token.strip()
        else:
            # A stale token would not match the freshly supplied key pair.
            os.environ.pop("AWS_SESSION_TOKEN", None)
        _ensure_aws_region_env()
        return

    run_aws = _env_flag("RUN_AWS_FUNCTIONS")
    prioritise_sso = _env_flag("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", default=True)
    if run_aws:
        if prioritise_sso:
            # Only discard static keys when the SSO/default chain should win;
            # previously they were wiped even with the flag set to false.
            for key in _AWS_CREDENTIAL_ENV_KEYS:
                os.environ.pop(key, None)
        _ensure_pi_bedrock_auth_env()

    # Propagate PI_AWS_PROFILE when only that alias is set (e.g. pi_agent.env).
    pi_profile = (os.environ.get("PI_AWS_PROFILE") or "").strip()
    if pi_profile and not (os.environ.get("AWS_PROFILE") or "").strip():
        os.environ["AWS_PROFILE"] = pi_profile
    _ensure_aws_region_env()
def _aws_credential_status() -> str:
    """
    Describe, for display, which AWS credential source is currently visible.

    Checked in order: access keys, ``AWS_PROFILE``, Bedrock bearer token, a
    config file without a profile, ``RUN_AWS_FUNCTIONS`` without a profile;
    otherwise ``"missing"``.
    """
    if _has_explicit_aws_access_keys():
        return "access keys"

    active_profile = (os.environ.get("AWS_PROFILE") or "").strip()
    if active_profile:
        return f"profile {active_profile}"

    bearer = (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip()
    if bearer:
        return "Bedrock bearer token"

    if _aws_config_path():
        return "SSO config mounted (profile not set)"
    return (
        "SSO/default chain (missing profile)"
        if _env_flag("RUN_AWS_FUNCTIONS")
        else "missing"
    )
def _bedrock_provider() -> dict[str, Any]:
    """Build the Bedrock provider block for ``models.json``, using the regional runtime endpoint and ``BEDROCK_MODELS``."""
    region = _bedrock_region()
    catalog = []
    for model_id, name, ctx, reasoning in BEDROCK_MODELS:
        catalog.append(
            _model_entry(
                model_id,
                name,
                context_window=ctx,
                max_tokens=8192,
                reasoning=reasoning,
            )
        )
    return {
        "baseUrl": f"https://bedrock-runtime.{region}.amazonaws.com",
        "api": "bedrock-converse-stream",
        "models": catalog,
    }
def build_models_config() -> dict[str, Any]:
    """Build the ``models.json`` payload: Gemini only on HF Space, otherwise llama-cpp, Gemini and Bedrock."""
    if is_hf_space_profile():
        providers = {PROVIDER_GEMINI: _gemini_provider()}
    else:
        providers = {
            PROVIDER_LLAMA: _llama_provider(),
            PROVIDER_GEMINI: _gemini_provider(),
            PROVIDER_BEDROCK: _bedrock_provider(),
        }
    return {"providers": providers}
def _load_settings_template() -> dict[str, Any]:
    """Load the bundled ``settings.json`` template, or return built-in defaults when the file is absent."""
    if not SETTINGS_TEMPLATE.is_file():
        builtin_compaction = {
            "enabled": True,
            "reserveTokens": 32768,
            "keepRecentTokens": 20000,
        }
        return {
            "defaultThinkingLevel": "off",
            "hideThinkingBlock": True,
            "compaction": builtin_compaction,
            "enableSkillCommands": True,
            "sessionDir": "sessions",
        }
    template_text = SETTINGS_TEMPLATE.read_text(encoding="utf-8")
    return json.loads(template_text)
def _apply_compaction_settings(settings: dict[str, Any]) -> None:
    """
    Merge session auto-compaction overrides from the environment into *settings*.

    ``PI_COMPACTION_ENABLED`` — when set, replaces ``compaction.enabled``.
    ``PI_COMPACTION_RESERVE_TOKENS`` / ``PI_COMPACTION_KEEP_RECENT_TOKENS`` —
    when non-blank, replace the corresponding token budgets. Without those
    overrides, a llama context window under 100,000 tokens scales both
    budgets down from the template values.

    ``settings["compaction"]`` is replaced with a new dict.
    """
    fallback = {
        "enabled": True,
        "reserveTokens": 32768,
        "keepRecentTokens": 20000,
    }
    merged = dict(settings.get("compaction") or fallback)

    if os.environ.get("PI_COMPACTION_ENABLED") is not None:
        merged["enabled"] = _env_flag("PI_COMPACTION_ENABLED")

    reserve_override = (os.environ.get("PI_COMPACTION_RESERVE_TOKENS") or "").strip()
    if reserve_override:
        merged["reserveTokens"] = int(reserve_override)
    elif LLAMA_CONTEXT < 100_000:
        # Smaller local models (e.g. Gemma 4 31B at 65536): default reserve was 32768.
        merged["reserveTokens"] = min(16_384, max(8_192, LLAMA_CONTEXT // 4))

    keep_override = (os.environ.get("PI_COMPACTION_KEEP_RECENT_TOKENS") or "").strip()
    if keep_override:
        merged["keepRecentTokens"] = int(keep_override)
    elif LLAMA_CONTEXT < 100_000:
        merged["keepRecentTokens"] = min(12_288, max(4_096, LLAMA_CONTEXT // 5))

    settings["compaction"] = merged
def resolve_session_dir() -> str:
    """
    Return the configured Pi session directory as a string.

    A non-blank ``PI_SESSION_DIR`` wins; otherwise ``/tmp/pi-sessions`` on HF
    Space and the relative name ``sessions`` elsewhere.
    """
    configured = os.environ.get("PI_SESSION_DIR", "").strip()
    if configured:
        return configured
    return "/tmp/pi-sessions" if is_hf_space_profile() else "sessions"
def ensure_session_dir(session_dir: str | None = None) -> Path:
    """
    Create the Pi session directory (parents included) and return its absolute path.

    *session_dir* defaults to ``resolve_session_dir()``; a relative value is
    placed under ``resolve_agent_dir()``.
    """
    raw = (session_dir or resolve_session_dir()).strip()
    location = Path(raw)
    if not location.is_absolute():
        location = resolve_agent_dir() / location
    location = location.resolve()
    location.mkdir(parents=True, exist_ok=True)
    return location
def build_settings_config(
    *,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> dict[str, Any]:
    """
    Build the ``settings.json`` payload for Pi.

    *default_provider* may be a provider id or its UI label, in any case; it
    is normalized with ``normalize_provider`` so that ``defaultProvider``
    matches the provider ``resolved_default_model`` uses to pick
    ``defaultModel``. Unknown values fall back to Gemini on HF Space and
    llama-cpp elsewhere. When omitted, ``get_default_provider()`` is used.

    Besides the provider/model, the result carries the merged compaction
    block, the absolute session directory (created as a side effect), retry
    settings for cloud providers/profiles, and the workspace skills directory
    (also ensured as a side effect).
    """
    # Normalizing (rather than a bare membership test) keeps labels such as
    # "Gemini" from silently degrading to the fallback provider.
    provider = normalize_provider(default_provider or get_default_provider())
    model = resolved_default_model(provider, override=default_model)

    settings = _load_settings_template()
    settings["defaultProvider"] = provider
    settings["defaultModel"] = model
    _apply_compaction_settings(settings)

    session_path = ensure_session_dir(resolve_session_dir())
    settings["sessionDir"] = session_path.as_posix()

    if (
        is_hf_space_profile()
        or is_aws_ecs_profile()
        or provider in (PROVIDER_GEMINI, PROVIDER_BEDROCK)
    ):
        _apply_retry_settings(settings, provider=provider)

    # Imported lazily, as in the original layout of this module.
    from pi_workspace_skills import ensure_workspace_skills, workspace_skills_dir

    ensure_workspace_skills()
    settings["skills"] = [workspace_skills_dir().as_posix()]
    return settings
def write_runtime_config(
    *,
    agent_dir: Path | None = None,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> tuple[Path, Path]:
    """
    Write ``models.json`` and ``settings.json``; return their paths.

    *agent_dir* defaults to ``resolve_agent_dir()`` and is created if needed.
    An explicit *default_provider* / *default_model* is also exported to
    ``PI_DEFAULT_PROVIDER`` / ``PI_DEFAULT_MODEL`` (and ``PI_LLAMA_MODEL_ID``
    for llama-cpp) so later env-based lookups agree with the written files.

    Returns ``(models_path, settings_path)``.
    """
    provider = normalize_provider(default_provider or get_default_provider())
    if default_provider:
        os.environ["PI_DEFAULT_PROVIDER"] = provider
    if default_model and default_model.strip():
        model = default_model.strip()
        os.environ["PI_DEFAULT_MODEL"] = model
        if provider == PROVIDER_LLAMA:
            os.environ["PI_LLAMA_MODEL_ID"] = model

    target = Path(agent_dir or resolve_agent_dir())
    target.mkdir(parents=True, exist_ok=True)
    models_path = target / "models.json"
    settings_path = target / "settings.json"

    models_path.write_text(
        json.dumps(build_models_config(), indent=2) + "\n",
        encoding="utf-8",
    )
    # Pass the normalized provider so settings.json records the same provider
    # that was just exported to PI_DEFAULT_PROVIDER (a raw UI label would
    # otherwise be treated as unknown and replaced by the fallback).
    settings_payload = build_settings_config(
        default_provider=provider,
        default_model=default_model,
    )
    settings_path.write_text(
        json.dumps(settings_payload, indent=2) + "\n",
        encoding="utf-8",
    )
    return models_path, settings_path
def models_for_provider(provider: str) -> list[str]:
    """
    List the model ids selectable for *provider*.

    HF Space always gets the Gemini catalog; llama-cpp gets only the active
    llama model id; unknown providers get the llama-cpp catalog entry.
    """
    if is_hf_space_profile():
        return list(PROVIDER_MODELS[PROVIDER_GEMINI])
    if provider == PROVIDER_LLAMA:
        return [llama_model_id()]
    catalog = PROVIDER_MODELS.get(provider, PROVIDER_MODELS[PROVIDER_LLAMA])
    return list(catalog)
def default_model_for_provider(provider: str) -> str:
    """Default model id for *provider*; shorthand for ``resolved_default_model`` without an override."""
    default_id = resolved_default_model(provider)
    return default_id
def normalize_provider(provider: str) -> str:
    """
    Map a provider id or UI label (case-insensitive, whitespace-trimmed) to a provider id.

    Unrecognised or empty input falls back to Gemini on HF Space and to
    llama-cpp elsewhere.
    """
    wanted = (provider or "").strip().lower()
    if wanted in PROVIDER_MODELS:
        return wanted

    id_by_label = {label.lower(): key for key, label in PROVIDER_LABELS.items()}
    matched = id_by_label.get(wanted)
    if matched is not None:
        return matched

    return PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
def apply_session_credentials(
    *,
    gemini_api_key: str | None = None,
    hf_token: str | None = None,
    aws_region: str | None = None,
    aws_access_key_id: str | None = None,
    aws_secret_access_key: str | None = None,
    aws_session_token: str | None = None,
) -> None:
    """
    Export session-supplied credentials into ``os.environ``.

    Blank or missing values leave the corresponding variables untouched. The
    HF token is written to both ``HF_TOKEN`` and ``DOC_REDACTION_HF_TOKEN``,
    the region to both ``AWS_REGION`` and ``AWS_DEFAULT_REGION``. AWS keys
    are always delegated to ``configure_aws_credentials``.
    """

    def _clean(value: str | None) -> str:
        return value.strip() if value and value.strip() else ""

    gemini = _clean(gemini_api_key)
    if gemini:
        os.environ["GEMINI_API_KEY"] = gemini

    token = _clean(hf_token)
    if token:
        os.environ["HF_TOKEN"] = token
        os.environ["DOC_REDACTION_HF_TOKEN"] = token

    region = _clean(aws_region)
    if region:
        os.environ["AWS_REGION"] = region
        os.environ["AWS_DEFAULT_REGION"] = region

    configure_aws_credentials(
        session_access_key_id=aws_access_key_id,
        session_secret_access_key=aws_secret_access_key,
        session_session_token=aws_session_token,
    )
def mirror_hf_token_from_env() -> None:
    """Copy a non-blank ``DOC_REDACTION_HF_TOKEN`` into ``HF_TOKEN`` when ``HF_TOKEN`` is unset or empty."""
    if not os.environ.get("HF_TOKEN"):
        fallback_token = os.environ.get("DOC_REDACTION_HF_TOKEN", "").strip()
        if fallback_token:
            os.environ["HF_TOKEN"] = fallback_token
+def _hf_token_status() -> str:
+    if os.environ.get("HF_TOKEN"):
+        source = (
+            "UI session" if os.environ.get("_HF_TOKEN_FROM_UI") else "env/Space secret"
+        )
+        return f"set ({source})"
+    return "missing"
def credential_status_markdown(*, provider: str | None = None) -> str:
    """
    One-line Markdown summary of credentials for the active Pi provider.

    HF Space always reports Gemini plus the HF token. Otherwise the line
    depends on the (normalized) provider: llama-cpp shows only the local
    endpoint, Gemini shows whether an API key is present, and Bedrock shows
    the AWS credential source and region.
    """

    def _gemini_state() -> str:
        has_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        return "set" if has_key else "missing"

    active = normalize_provider(provider or get_default_provider())

    if is_hf_space_profile():
        gemini = _gemini_state()
        return (
            f"**Credentials:** Gemini `{gemini}` · "
            f"HF token (redaction backend) `{_hf_token_status()}`"
        )

    if active == PROVIDER_LLAMA:
        return (
            f"**Credentials:** local llama-cpp at `{LLAMA_BASE_URL}` "
            f"(no API key; AWS/Gemini not used for Pi orchestration)"
        )

    if active == PROVIDER_GEMINI:
        gemini = _gemini_state()
        return f"**Credentials:** Gemini `{gemini}`"

    region = _bedrock_region()
    return f"**Credentials:** AWS `{_aws_credential_status()}` · region `{region}`"
def provider_choices() -> list[str]:
    """Provider ids offered for selection: Gemini only on HF Space, otherwise every labelled provider."""
    return [PROVIDER_GEMINI] if is_hf_space_profile() else list(PROVIDER_LABELS.keys())
+def gemini_api_key_configured() -> bool:
+    return bool(os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"))
+def provider_label(provider: str) -> str:
+    return PROVIDER_LABELS.get(provider, provider)
+if __name__ == "__main__":
+    configure_aws_credentials()
+    models_path, settings_path = write_runtime_config()
+    print(f"Wrote {models_path}")
+    print(f"Wrote {settings_path}")

agent-redact/pi/pi_examples.py ADDED Viewed

	@@ -0,0 +1,180 @@

+"""Pi agent Gradio examples aligned with the main app SHOW_EXAMPLES redaction demos."""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from pi_agent_config import is_hf_space_profile
+from redaction_prompt import HF_DEFAULT_OCR
+def _show_examples_from_env() -> bool:
+    """True unless PI_GRADIO_SHOW_EXAMPLES or SHOW_PI_EXAMPLES is explicitly false."""
+    for key in ("PI_GRADIO_SHOW_EXAMPLES", "SHOW_PI_EXAMPLES"):
+        raw = os.environ.get(key)
+        if raw is None:
+            continue
+        lowered = raw.strip().lower()
+        if lowered in {"0", "false", "no"}:
+            return False
+        if lowered in {"1", "true", "yes"}:
+            return True
+    return True
+# Evaluated once when this module is imported; later environment changes are
+# not picked up without re-importing/restarting.
+SHOW_PI_EXAMPLES = _show_examples_from_env()
+@dataclass(frozen=True)
+class PiRedactionExample:
+    # Immutable description of one bundled redaction demo. ``example_rows`` turns
+    # each available instance into one row of input values plus a label.
+    label: str  # label returned alongside the rows
+    file_name: str  # file name resolved inside the example data directory
+    instructions: str  # redaction instructions text placed in the row
+    ocr_method: str  # OCR method value placed in the row
+    pii_method: str = "Local"  # PII method value placed in the row
+    encourage_vlm_faces: bool = False  # NOTE(review): presumably a UI checkbox default; confirm in the Gradio app
+    encourage_vlm_signatures: bool = False  # NOTE(review): presumably a UI checkbox default; confirm in the Gradio app
+    page_range: str = "all"  # page range value placed in the row
+def resolve_example_data_dir() -> Path | None:
+    """Locate bundled example PDFs (repo checkout, PyPI package, or Docker layout)."""
+    from bootstrap_pi_config import pi_repo_root_path
+    workdir = pi_repo_root_path()
+    repo_root = Path(__file__).resolve().parents[2]
+    candidates = [
+        workdir / "doc_redaction" / "example_data",
+        workdir / "example_data",
+        repo_root / "doc_redaction" / "example_data",
+        repo_root / "example_data",
+    ]
+    for candidate in candidates:
+        if candidate.is_dir():
+            return candidate.resolve()
+    return None
+def example_file_path(file_name: str) -> Path | None:
+    root = resolve_example_data_dir()
+    if root is None:
+        return None
+    path = (root / file_name).resolve()
+    try:
+        path.relative_to(root)
+    except ValueError:
+        return None
+    if not path.is_file():
+        return None
+    if _is_lfs_pointer(path):
+        return None
+    return path
+def _is_lfs_pointer(path: Path) -> bool:
+    try:
+        first_line = path.read_text(encoding="utf-8", errors="ignore").splitlines()[0]
+    except (OSError, IndexError):
+        return False
+    return first_line.startswith("version https://git-lfs.github.com/spec/v1")
+def _catalog() -> tuple[PiRedactionExample, ...]:
+    selectable_text_ocr = (
+        HF_DEFAULT_OCR if is_hf_space_profile() else "Local model - selectable text"
+    )
+    # local_ocr = (
+    #     HF_DEFAULT_OCR
+    #     if is_hf_space_profile()
+    #     else "Local OCR model - PDFs without selectable text"
+    # )
+    return (
+        PiRedactionExample(
+            label="Emails to a professor",
+            file_name="example_of_emails_sent_to_a_professor_before_applying.pdf",
+            ocr_method=selectable_text_ocr,
+            pii_method="Local",
+            instructions=(
+                "- Any redaction box related to Dr Kornbluth should be removed\n"
+                "- References to Dr Hyde, or Dr Hyde's lab should be redacted. Also any references to Lauren, or Lauren Lilley\n"
+                "- All mentions of Universities and their names should be redacted\n"
+            ),
+        ),
+        PiRedactionExample(
+            label="Graduate cover letter",
+            file_name="graduate-job-example-cover-letter.pdf",
+            ocr_method=selectable_text_ocr,
+            pii_method="Local",
+            instructions=(
+                "- Redact any names and titles, apart from Mr Wilson\n"
+                "- Redact any organisation names\n"
+                "- Redact any place names\n"
+            ),
+        ),
+    )
+def available_pi_examples() -> list[PiRedactionExample]:
+    if not SHOW_PI_EXAMPLES:
+        return []
+    available: list[PiRedactionExample] = []
+    for example in _catalog():
+        if example_file_path(example.file_name) is not None:
+            available.append(example)
+    return available
+def example_rows() -> tuple[list[list], list[str]]:
+    """Return (gr.Examples rows, labels) for available demos."""
+    rows: list[list] = []
+    labels: list[str] = []
+    for example in available_pi_examples():
+        path = example_file_path(example.file_name)
+        if path is None:
+            continue
+        rows.append(
+            [
+                str(path),
+                example.instructions,
+                example.page_range,
+                example.ocr_method,
+                example.pii_method,
+                example.encourage_vlm_faces,
+                example.encourage_vlm_signatures,
+            ]
+        )
+        labels.append(example.label)
+    return rows, labels
+def gradio_example_allowed_paths() -> list[str]:
+    root = resolve_example_data_dir()
+    if root is None:
+        return []
+    return [str(root)]
+def examples_status_markdown() -> str:
+    """Human-readable status for the UI when examples are missing or disabled."""
+    if not SHOW_PI_EXAMPLES:
+        return (
+            "_Examples are disabled. Set Space variable "
+            "`PI_GRADIO_SHOW_EXAMPLES=true` (or `SHOW_PI_EXAMPLES=true`) and restart._"
+        )
+    root = resolve_example_data_dir()
+    if root is None:
+        return (
+            "_Example PDFs not found — expected under "
+            "`doc_redaction/example_data/` in the Space image._"
+        )
+    available = available_pi_examples()
+    if not available:
+        return (
+            f"_Example PDFs not found under `{root}`. "
+            "Rebuild the Space after syncing example files from the monorepo._"
+        )
+    names = ", ".join(f"`{ex.file_name}`" for ex in available)
+    return f"_Examples loaded from `{root}`: {names}_"

agent-redact/pi/pi_rpc_client.py ADDED Viewed

	@@ -0,0 +1,989 @@

+"""Python client for Pi RPC mode (JSONL over stdin/stdout)."""
+from __future__ import annotations
+import json
+import os
+import queue
+import shutil
+import subprocess
+import sys
+import threading
+import uuid
+from collections import deque
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from typing import Any
+class PiRpcError(RuntimeError):
+    pass
+# Sentinel pushed to every pending response slot and the events queue when the
+# Pi RPC subprocess exits, so blocked waiters unblock with a clear error instead
+# of hanging forever. Compared by identity (``is``), never by value.
+_PI_PROCESS_EXIT = object()
+# Pi RPC is JSONL over pipes; always UTF-8 (Windows default locale is cp1252).
+_PI_SUBPROCESS_ENCODING = "utf-8"
+# Undecodable bytes are replaced rather than raising while reading the pipes.
+_PI_SUBPROCESS_ENCODING_ERRORS = "replace"
+# Install guidance appended to the "executable not found" errors raised by
+# ``resolve_pi_executable``. NOTE(review): the two spaces before each "\n" look
+# like Markdown hard line breaks for the UI — confirm where the text is rendered.
+_PI_INSTALL_HINT = (
+    "Install the Pi coding agent CLI, then restart the Gradio app:  \n"
+    "`npm install -g @earendil-works/pi-coding-agent`  \n"
+    "On Windows, ensure Node.js/npm are on PATH (or set `PI_EXECUTABLE` to the "
+    "full path to `pi.cmd`, e.g. `%APPDATA%\\npm\\pi.cmd`).  \n"
+    "Docker users: run the Pi UI via `docker compose` (`pi-agent` service) instead "
+    "of `python gradio_app.py` on the host."
+)
+def resolve_pi_executable() -> str:
+    """Return a path to the ``pi`` RPC executable (raises ``PiRpcError`` if missing)."""
+    override = os.environ.get("PI_EXECUTABLE", "").strip()
+    if override:
+        if os.path.isfile(override) or shutil.which(override):
+            return override
+        raise PiRpcError(
+            f"PI_EXECUTABLE is set but not found: `{override}`  \n\n{_PI_INSTALL_HINT}"
+        )
+    for name in ("pi", "pi.cmd"):
+        found = shutil.which(name)
+        if found:
+            return found
+    raise PiRpcError(f"Pi CLI (`pi`) not found on PATH.  \n\n{_PI_INSTALL_HINT}")
+@dataclass
+class PiStreamEvent:
+    """Structured event from Pi RPC for UI layers."""
+    # Event category; values produced in this module include "status",
+    # "turn_end" and "error".
+    kind: str
+    # Human-readable text for the event (status line or error message).
+    text: str = ""
+    # Tool-related fields; ``None`` when the event is not about a tool call.
+    tool_name: str | None = None
+    tool_call_id: str | None = None
+    tool_args: dict[str, Any] | None = None
+    tool_output: str | None = None
+    # Set to True on events that report a failure.
+    is_error: bool = False
+    # Free-form extra data; each instance gets its own dict.
+    meta: dict[str, Any] = field(default_factory=dict)
+def extract_tool_text(payload: dict[str, Any] | None) -> str:
+    if not payload:
+        return ""
+    content = payload.get("content")
+    if content is None and isinstance(payload.get("partialResult"), dict):
+        content = payload["partialResult"].get("content")
+    if content is None and isinstance(payload.get("result"), dict):
+        content = payload["result"].get("content")
+    if not isinstance(content, list):
+        return ""
+    parts: list[str] = []
+    for block in content:
+        if isinstance(block, dict) and block.get("type") == "text":
+            parts.append(str(block.get("text") or ""))
+    return "\n".join(parts).strip()
+def extract_assistant_display(message: dict[str, Any] | None) -> tuple[str, str]:
+    """Extract visible text and thinking from a partial assistant message."""
+    if not message or message.get("role") != "assistant":
+        return "", ""
+    content = message.get("content")
+    if isinstance(content, str):
+        return content, ""
+    if not isinstance(content, list):
+        return "", ""
+    texts: list[str] = []
+    thinkings: list[str] = []
+    for block in content:
+        if isinstance(block, str):
+            if block.strip():
+                texts.append(block)
+            continue
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type in (None, "text", "output_text"):
+            text = block.get("text") or block.get("content") or ""
+            if text:
+                texts.append(str(text))
+        elif block_type in ("thinking", "reasoning", "thought"):
+            thought = (
+                block.get("thinking")
+                or block.get("text")
+                or block.get("reasoning")
+                or block.get("content")
+                or ""
+            )
+            if thought:
+                thinkings.append(str(thought))
+    return "".join(texts), "".join(thinkings)
+def assistant_chat_text(visible: str, thinking: str) -> str:
+    """Text to show in the main chat — visible answer, or thinking when Gemini sends only that."""
+    if visible.strip():
+        return visible
+    return thinking
+def _tool_lines_from_content(content: list[Any]) -> list[str]:
+    tool_lines: list[str] = []
+    for block in content:
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type not in {"toolCall", "tool_use", "functionCall"}:
+            continue
+        name = str(block.get("name") or block.get("toolName") or "tool")
+        args = block.get("arguments") or block.get("input") or block.get("args")
+        if isinstance(args, str):
+            try:
+                args = json.loads(args)
+            except json.JSONDecodeError:
+                args = {"raw": args}
+        if not isinstance(args, dict):
+            args = {}
+        tool_lines.append(format_tool_chat_line(name, args))
+    return tool_lines
+def format_tool_chat_line(tool_name: str | None, args: dict[str, Any] | None) -> str:
+    """Render one tool invocation for the chat UI (prose for comment-only bash)."""
+    name = str(tool_name or "tool")
+    lowered = name.lower()
+    if lowered == "bash" and args and args.get("command"):
+        cmd = str(args["command"])
+        if is_bash_commentary_only(cmd):
+            return extract_bash_commentary_text(cmd)
+        commentary, executable = split_bash_commentary_and_command(cmd)
+        if commentary and executable:
+            short = executable[:200] + ("…" if len(executable) > 200 else "")
+            return f"{commentary}\n\n**bash:** `{short}`"
+        if commentary:
+            return commentary
+    detail = format_tool_args(tool_name, args)
+    if detail and detail != name:
+        return f"**{name}:** {detail}"
+    return f"**{name}**"
+def format_assistant_message_for_chat(message: dict[str, Any]) -> str:
+    """Render one assistant message for the chat UI (visible text or tool calls; no thinking)."""
+    visible, _thinking = extract_assistant_display(message)
+    if visible.strip():
+        return visible
+    content = message.get("content")
+    if not isinstance(content, list):
+        return ""
+    return "\n".join(_tool_lines_from_content(content))
+def chat_text_from_assistant_message(message: dict[str, Any] | None) -> str:
+    """Non-thinking chat text from a Pi/Gemini assistant message snapshot."""
+    if not message or message.get("role") != "assistant":
+        return ""
+    return format_assistant_message_for_chat(message)
+_RATE_LIMIT_MARKERS = (
+    "429",
+    "quota",
+    "rate limit",
+    "rate-limit",
+    "resource_exhausted",
+    "too many requests",
+    "throttlingexception",
+    "throttling",
+    "toomanyrequestsexception",
+    "servicequotaexceeded",
+)
+def is_rate_limit_error(text: str | None) -> bool:
+    """True when *text* looks like a provider quota or rate-limit failure."""
+    if not text:
+        return False
+    lowered = text.lower()
+    return any(marker in lowered for marker in _RATE_LIMIT_MARKERS)
+def _strip_rpc_payload_for_debug(obj: Any) -> Any:
+    """
+    Strip large message content from RPC objects for compact debug logging.
+    Keeps metadata (id, type, command, success) but removes or truncates
+    actual message/data payloads.
+    """
+    if not isinstance(obj, dict):
+        return obj
+    kept_keys = {"type", "id", "command", "success", "error", "stopReason"}
+    result = {k: v for k, v in obj.items() if k in kept_keys}
+    # Keep data/result/messages structure without content
+    for key in ("data", "result", "messages", "response"):
+        if key in obj:
+            val = obj[key]
+            if isinstance(val, dict):
+                result[key] = {k: "..." for k in val.keys()}
+            elif isinstance(val, list):
+                result[key] = f"[... {len(val)} items]"
+            else:
+                result[key] = "..."
+    return result
+def last_assistant_turn_error(messages: list[dict[str, Any]]) -> str | None:
+    """Return the latest assistant error in the current user turn, if any."""
+    last_user = -1
+    for index, message in enumerate(messages):
+        if message.get("role") == "user":
+            last_user = index
+    turn_messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    for message in reversed(turn_messages):
+        if message.get("role") != "assistant":
+            continue
+        error = message.get("errorMessage")
+        if error:
+            return str(error)
+        if message.get("stopReason") == "error":
+            visible, _ = extract_assistant_display(message)
+            if visible.strip():
+                return visible
+            return "assistant turn failed"
+    return None
+def assistant_text_since_last_user(messages: list[dict[str, Any]]) -> str:
+    """Combine assistant messages from the latest user turn."""
+    last_user = -1
+    for index, message in enumerate(messages):
+        if message.get("role") == "user":
+            last_user = index
+    turn_messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    parts: list[str] = []
+    for message in turn_messages:
+        if message.get("role") != "assistant":
+            continue
+        part = format_assistant_message_for_chat(message)
+        if part.strip():
+            parts.append(part)
+    return "\n\n".join(parts)
+def partial_message_from_update(event: dict[str, Any]) -> dict[str, Any] | None:
+    delta = event.get("assistantMessageEvent") or {}
+    partial = delta.get("partial")
+    if isinstance(partial, dict):
+        return partial
+    message = event.get("message")
+    if isinstance(message, dict):
+        return message
+    return None
+def is_bash_commentary_only(command: str) -> bool:
+    """True when a bash tool call contains only shell comments (no executable lines)."""
+    lines = [ln.strip() for ln in command.splitlines() if ln.strip()]
+    if not lines:
+        return False
+    return all(ln.startswith("#") for ln in lines)
+def extract_bash_commentary_text(command: str) -> str:
+    """Join non-empty ``#`` comment bodies from a bash command into readable prose."""
+    parts: list[str] = []
+    for raw in command.splitlines():
+        stripped = raw.strip()
+        if not stripped.startswith("#"):
+            continue
+        text = stripped.lstrip("#").strip()
+        if text:
+            parts.append(text)
+    return "\n".join(parts)
+def split_bash_commentary_and_command(command: str) -> tuple[str, str]:
+    """Split ``#`` planning lines from executable shell lines."""
+    comments: list[str] = []
+    commands: list[str] = []
+    for raw in command.splitlines():
+        stripped = raw.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("#"):
+            text = stripped.lstrip("#").strip()
+            if text:
+                comments.append(text)
+        else:
+            commands.append(stripped)
+    return "\n".join(comments), " ↵ ".join(commands)
+def format_tool_args(tool_name: str | None, args: dict[str, Any] | None) -> str:
+    if not args:
+        return ""
+    name = (tool_name or "").lower()
+    if name == "bash" and args.get("command"):
+        cmd = str(args["command"])
+        if is_bash_commentary_only(cmd):
+            return extract_bash_commentary_text(cmd)
+        _commentary, executable = split_bash_commentary_and_command(cmd)
+        if not executable:
+            return extract_bash_commentary_text(cmd)
+        shown = executable[:240] + ("…" if len(executable) > 240 else "")
+        return f"`{shown}`"
+    if name in {"read", "write", "edit"} and args.get("path"):
+        return f"`{args['path']}`"
+    compact = json.dumps(args, ensure_ascii=False)
+    if len(compact) > 280:
+        compact = compact[:277] + "…"
+    return compact
+class PiRpcClient:
+    """Drive a long-lived ``pi --mode rpc`` subprocess."""
+    # Extension UI dialog methods block Pi until the client replies; auto-cancel
+    # them so a missing UI layer can never wedge the RPC process.
+    _EXTENSION_UI_DIALOG_METHODS = frozenset({"select", "confirm", "input", "editor"})
+    def __init__(
+        self,
+        *,
+        cwd: str | None = None,
+        env: dict[str, str] | None = None,
+        pi_args: list[str] | None = None,
+    ) -> None:
+        self._cwd = cwd
+        self._env = env
+        self._pi_args = pi_args or []
+        self._proc: subprocess.Popen[str] | None = None
+        self._write_lock = threading.Lock()
+        self._abort_requested = False
+        self._prompt_stream_depth = 0
+        self._pending_follow_ups = 0
+        self._pending_ui_history: list[dict[str, Any]] = []
+        # Single stdout reader thread demultiplexes the JSONL stream: command
+        # responses go to per-id slots, agent events go to ``_events``. This lets
+        # any thread (e.g. post-task logging) call the client safely while a
+        # prompt stream is active.
+        self._reader_thread: threading.Thread | None = None
+        self._stderr_thread: threading.Thread | None = None
+        self._events: queue.Queue[Any] = queue.Queue()
+        self._pending_lock = threading.Lock()
+        self._pending_responses: dict[str, queue.Queue[Any]] = {}
+        self._stderr_buffer: deque[str] = deque(maxlen=200)
+        self._closing = False
+    @property
+    def running(self) -> bool:
+        return self._proc is not None and self._proc.poll() is None
+    @property
+    def prompt_stream_active(self) -> bool:
+        """True while :meth:`prompt_events` is consuming the RPC event stream."""
+        return self._prompt_stream_depth > 0
+    def start(self) -> None:
+        # Launch ``pi --mode rpc`` (plus any extra ``pi_args``) and start the
+        # stdout/stderr reader threads. No-op when a process is already running.
+        # ``resolve_pi_executable`` raises ``PiRpcError`` if the CLI is missing.
+        if self.running:
+            return
+        command = [resolve_pi_executable(), "--mode", "rpc", *self._pi_args]
+        self._closing = False
+        self._abort_requested = False
+        # Text-mode pipes with an explicit encoding; ``bufsize=1`` requests line
+        # buffering, matching the one-JSON-object-per-line protocol.
+        proc = subprocess.Popen(
+            command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            encoding=_PI_SUBPROCESS_ENCODING,
+            errors=_PI_SUBPROCESS_ENCODING_ERRORS,
+            bufsize=1,
+            cwd=self._cwd,
+            env=self._env,
+        )
+        self._proc = proc
+        # Fresh demux state for this process.
+        self._events = queue.Queue()
+        with self._pending_lock:
+            self._pending_responses = {}
+        self._stderr_buffer = deque(maxlen=200)
+        # The reader threads receive ``proc`` explicitly, so they keep reading the
+        # process they were started for even if ``self._proc`` is replaced later.
+        self._reader_thread = threading.Thread(
+            target=self._reader_loop,
+            args=(proc,),
+            name="pi-rpc-stdout",
+            daemon=True,
+        )
+        self._reader_thread.start()
+        if proc.stderr is not None:
+            self._stderr_thread = threading.Thread(
+                target=self._stderr_loop,
+                args=(proc,),
+                name="pi-rpc-stderr",
+                daemon=True,
+            )
+            self._stderr_thread.start()
+    def close(self) -> None:
+        if not self._proc:
+            return
+        self._closing = True
+        proc = self._proc
+        if proc.poll() is None:
+            try:
+                self.abort()
+            except Exception:
+                pass
+            proc.terminate()
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+        # Process exit makes ``readline`` return EOF; the reader thread then
+        # notifies waiters. Nudge waiters here too in case the threads are slow.
+        self._notify_process_exit()
+        for thread in (self._reader_thread, self._stderr_thread):
+            if (
+                thread is not None
+                and thread.is_alive()
+                and thread is not threading.current_thread()
+            ):
+                thread.join(timeout=2)
+        self._reader_thread = None
+        self._stderr_thread = None
+        self._proc = None
+    def _ensure_running(self) -> subprocess.Popen[str]:
+        if not self.running:
+            self.start()
+        assert self._proc is not None
+        return self._proc
+    def _recent_stderr(self) -> str:
+        return "\n".join(self._stderr_buffer)
+    def _process_exit_error(self) -> PiRpcError:
+        code = self._proc.poll() if self._proc else None
+        err = self._recent_stderr()
+        return PiRpcError(
+            f"Pi RPC process exited (code={code})."
+            + (f" stderr: {err[:500]}" if err else "")
+        )
+    def _notify_process_exit(self) -> None:
+        """Unblock every pending response slot and the events queue on exit."""
+        with self._pending_lock:
+            pending = list(self._pending_responses.values())
+            self._pending_responses.clear()
+        for slot in pending:
+            try:
+                slot.put_nowait(_PI_PROCESS_EXIT)
+            except queue.Full:
+                pass
+        try:
+            self._events.put_nowait(_PI_PROCESS_EXIT)
+        except queue.Full:
+            pass
+    def _stderr_loop(self, proc: subprocess.Popen[str]) -> None:
+        """Continuously drain stderr into a bounded buffer (prevents pipe deadlock)."""
+        stream = proc.stderr
+        if stream is None:
+            return
+        try:
+            for line in stream:
+                self._stderr_buffer.append(line.rstrip("\r\n"))
+        except (ValueError, OSError):
+            pass
+    def _reader_loop(self, proc: subprocess.Popen[str]) -> None:
+        """Read every stdout line and route responses vs. agent events."""
+        stream = proc.stdout
+        if stream is None:
+            self._notify_process_exit()
+            return
+        try:
+            while True:
+                line = stream.readline()
+                if not line:
+                    break
+                line = line.rstrip("\r\n")
+                if not line:
+                    continue
+                try:
+                    message = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                self._dispatch_message(message)
+        except (ValueError, OSError):
+            pass
+        finally:
+            self._notify_process_exit()
+    def _dispatch_message(self, message: Any) -> None:
+        if not isinstance(message, dict):
+            return
+        if os.environ.get("PI_RPC_DEBUG", "").strip() == "1":
+            try:
+                stripped = _strip_rpc_payload_for_debug(message)
+                sys.stderr.write(
+                    "Pi RPC recv: " + json.dumps(stripped, ensure_ascii=False) + "\n"
+                )
+                sys.stderr.flush()
+            except Exception:
+                pass
+        msg_type = message.get("type")
+        if msg_type == "response":
+            req_id = message.get("id")
+            slot: queue.Queue[Any] | None = None
+            if req_id is not None:
+                with self._pending_lock:
+                    slot = self._pending_responses.pop(str(req_id), None)
+            if slot is not None:
+                try:
+                    slot.put_nowait(message)
+                except queue.Full:
+                    pass
+            return
+        if msg_type == "extension_ui_request":
+            self._auto_reply_extension_ui(message)
+            return
+        # Agent event — consumed by the active ``prompt_events`` stream.
+        self._events.put(message)
+    def _auto_reply_extension_ui(self, message: dict[str, Any]) -> None:
+        method = message.get("method")
+        req_id = message.get("id")
+        if req_id is None or method not in self._EXTENSION_UI_DIALOG_METHODS:
+            return
+        try:
+            self._write_command(
+                {"type": "extension_ui_response", "id": req_id, "cancelled": True}
+            )
+        except (OSError, PiRpcError):
+            pass
+    def _write_command(self, command: dict[str, Any]) -> None:
+        proc = self._ensure_running()
+        assert proc.stdin is not None
+        if os.environ.get("PI_RPC_DEBUG", "").strip() == "1":
+            try:
+                stripped = _strip_rpc_payload_for_debug(command)
+                sys.stderr.write(
+                    "Pi RPC send: " + json.dumps(stripped, ensure_ascii=False) + "\n"
+                )
+                sys.stderr.flush()
+            except Exception:
+                pass
+        with self._write_lock:
+            proc.stdin.write(json.dumps(command) + "\n")
+            proc.stdin.flush()
+    def _send_command(
+        self,
+        command: dict[str, Any],
+        *,
+        wait_response: bool = True,
+    ) -> dict[str, Any] | None:
+        req_id = str(command.setdefault("id", str(uuid.uuid4())))
+        if not wait_response:
+            self._write_command(command)
+            return None
+        slot: queue.Queue[Any] = queue.Queue(maxsize=1)
+        with self._pending_lock:
+            self._pending_responses[req_id] = slot
+        try:
+            self._write_command(command)
+        except Exception:
+            with self._pending_lock:
+                self._pending_responses.pop(req_id, None)
+            raise
+        result = slot.get()
+        if result is _PI_PROCESS_EXIT:
+            raise self._process_exit_error()
+        if not result.get("success", False):
+            error = result.get("error") or result.get("message") or "command failed"
+            raise PiRpcError(str(error))
+        return result
+    def abort(self) -> None:
+        """Request abort without reading stdout (the active stream consumer drains events)."""
+        if not self.running:
+            return
+        self._abort_requested = True
+        try:
+            self._send_command({"type": "abort"}, wait_response=False)
+        except OSError:
+            pass
+    def stage_ui_chat_notice(self, label: str, message: str) -> None:
+        """Stage user/assistant chat rows for the active prompt stream to merge on yield."""
+        text = message.strip()
+        if not text:
+            return
+        self._pending_ui_history.append(
+            {"role": "user", "content": f"_**{label}:**_ {text}"}
+        )
+        self._pending_ui_history.append({"role": "assistant", "content": ""})
+    def drain_pending_ui_history(self) -> list[dict[str, Any]]:
+        """Return and clear UI chat rows staged by :meth:`stage_ui_chat_notice`."""
+        pending = self._pending_ui_history[:]
+        self._pending_ui_history.clear()
+        return pending
+    def steer(self, message: str) -> None:
+        """Queue a steering message (delivered after the current tool step completes)."""
+        if not message.strip():
+            return
+        self._send_command(
+            {"type": "steer", "message": message},
+            wait_response=False,
+        )
+    def follow_up(self, message: str) -> None:
+        """Queue a follow-up message for when the agent stops."""
+        if not message.strip():
+            return
+        self._pending_follow_ups += 1
+        self._send_command(
+            {"type": "follow_up", "message": message},
+            wait_response=False,
+        )
+    @property
+    def abort_requested(self) -> bool:
+        return self._abort_requested
+    def clear_abort(self) -> None:
+        self._abort_requested = False
+    def new_session(self) -> None:
+        self._send_command({"type": "new_session"})
+    def get_state(self) -> dict[str, Any]:
+        response = self._send_command({"type": "get_state"})
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def get_messages(self) -> list[dict[str, Any]]:
+        response = self._send_command({"type": "get_messages"})
+        data = response.get("data") if response else {}
+        messages = data.get("messages") if isinstance(data, dict) else []
+        return messages if isinstance(messages, list) else []
+    def get_session_stats(self) -> dict[str, Any]:
+        """Token usage and cost totals for the active session (Pi RPC ``get_session_stats``)."""
+        response = self._send_command({"type": "get_session_stats"})
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def set_model(self, provider: str, model_id: str) -> dict[str, Any]:
+        response = self._send_command(
+            {
+                "type": "set_model",
+                "provider": provider,
+                "modelId": model_id,
+            }
+        )
+        data = response.get("data") if response else {}
+        return data if isinstance(data, dict) else {}
+    def get_available_models(self) -> list[dict[str, Any]]:
+        response = self._send_command({"type": "get_available_models"})
+        data = response.get("data") if response else {}
+        models = data.get("models") if isinstance(data, dict) else []
+        return models if isinstance(models, list) else []
+    def restart(self) -> None:
+        """Call ``close()`` and then ``start()`` on this client."""
+        self.close()
+        self.start()
+    def prompt_events(self, message: str) -> Iterator[PiStreamEvent]:
+        """Send a user message and yield structured events until ``agent_end``.
+        ``_prompt_stream_depth`` is incremented once iteration begins and is
+        decremented again (never below zero) when the generator finishes, raises,
+        or is closed.
+        """
+        self._prompt_stream_depth += 1
+        try:
+            yield from self._prompt_events_impl(message)
+        finally:
+            # Clamp at zero so an unbalanced decrement cannot drive the depth negative.
+            self._prompt_stream_depth = max(0, self._prompt_stream_depth - 1)
+    def _drain_events(self) -> None:
+        """Discard stale events left over from a prior stream (single active prompt)."""
+        while True:
+            try:
+                item = self._events.get_nowait()
+            except queue.Empty:
+                return
+            if item is _PI_PROCESS_EXIT:
+                # Preserve the exit signal for the consumer to observe.
+                try:
+                    self._events.put_nowait(_PI_PROCESS_EXIT)
+                except queue.Full:
+                    pass
+                return
+    def _prompt_events_impl(self, message: str) -> Iterator[PiStreamEvent]:
+        """Reset per-prompt state, send the ``prompt`` command, then stream agent events.
+        If sending raises ``PiRpcError`` a single ``error`` event carrying the
+        exception text is yielded and the stream ends.
+        """
+        # Clear the abort flag and drop stale queue entries before the new prompt goes out.
+        self.clear_abort()
+        self._drain_events()
+        try:
+            self._send_command({"type": "prompt", "message": message})
+        except PiRpcError as exc:
+            yield PiStreamEvent(kind="error", text=str(exc), is_error=True)
+            return
+        yield from self._iter_agent_events()
+    def _iter_agent_events(self) -> Iterator[PiStreamEvent]:
+        """Translate raw events from ``self._events`` into ``PiStreamEvent`` objects.
+        Blocks on the queue and dispatches on each event's ``type``. The generator
+        ends after yielding a ``done`` event for an ``agent_end`` that arrives while
+        no follow-ups are pending. Event types not handled below are dropped.
+        Raises:
+            The exception built by ``_process_exit_error()`` when the process-exit
+            sentinel is read from the queue.
+        """
+        while True:
+            # Blocking read: waits until the next event (or the exit sentinel) arrives.
+            event = self._events.get()
+            if event is _PI_PROCESS_EXIT:
+                raise self._process_exit_error()
+            event_type = event.get("type")
+            if event_type == "agent_start":
+                yield PiStreamEvent(kind="status", text="Agent started…")
+            elif event_type == "turn_start":
+                yield PiStreamEvent(kind="status", text="Turn started.")
+            elif event_type == "turn_end":
+                yield PiStreamEvent(kind="turn_end", text="Turn finished.")
+            elif event_type == "message_update":
+                yield from self._parse_message_update(event)
+            elif event_type == "tool_execution_start":
+                tool_name = event.get("toolName")
+                # Non-dict ``args`` are replaced with an empty dict.
+                tool_args = (
+                    event.get("args") if isinstance(event.get("args"), dict) else {}
+                )
+                yield PiStreamEvent(
+                    kind="tool_start",
+                    tool_name=str(tool_name) if tool_name else "tool",
+                    tool_call_id=event.get("toolCallId"),
+                    tool_args=tool_args,
+                    text=format_tool_args(
+                        str(tool_name) if tool_name else None,
+                        tool_args,
+                    ),
+                )
+            elif event_type == "tool_execution_update":
+                # For updates the text is extracted from the event itself.
+                output = extract_tool_text(event)
+                yield PiStreamEvent(
+                    kind="tool_update",
+                    tool_name=event.get("toolName"),
+                    tool_call_id=event.get("toolCallId"),
+                    tool_output=output,
+                )
+            elif event_type == "tool_execution_end":
+                # For completion the text is extracted from the ``result`` dict (empty if not a dict).
+                result = (
+                    event.get("result") if isinstance(event.get("result"), dict) else {}
+                )
+                output = extract_tool_text(result)
+                yield PiStreamEvent(
+                    kind="tool_end",
+                    tool_name=event.get("toolName"),
+                    tool_call_id=event.get("toolCallId"),
+                    tool_output=output,
+                    is_error=bool(event.get("isError")),
+                )
+            elif event_type == "queue_update":
+                steering = event.get("steering") or []
+                follow_up = event.get("followUp") or []
+                # Only surfaced when at least one of the two queues is non-empty.
+                if steering or follow_up:
+                    yield PiStreamEvent(
+                        kind="queue_update",
+                        meta={"steering": steering, "follow_up": follow_up},
+                    )
+            elif event_type == "compaction_start":
+                reason = event.get("reason") or "unknown"
+                yield PiStreamEvent(
+                    kind="status",
+                    text=f"Compaction started ({reason})…",
+                    meta={"reason": reason},
+                )
+            elif event_type == "compaction_end":
+                if event.get("aborted"):
+                    text = "Compaction aborted."
+                elif event.get("errorMessage"):
+                    # Failures are reported as an error event; skip the status yield below.
+                    text = f"Compaction failed: {event['errorMessage']}"
+                    yield PiStreamEvent(kind="error", text=text, is_error=True)
+                    continue
+                elif event.get("willRetry"):
+                    text = "Compaction complete — retrying prompt…"
+                else:
+                    tokens = (event.get("result") or {}).get("tokensBefore")
+                    text = (
+                        f"Compaction complete ({tokens:,} tokens before)."
+                        if isinstance(tokens, int)
+                        else "Compaction complete."
+                    )
+                yield PiStreamEvent(kind="status", text=text, meta=event)
+            elif event_type == "auto_retry_start":
+                attempt = event.get("attempt")
+                max_attempts = event.get("maxAttempts")
+                delay_ms = event.get("delayMs")
+                msg = event.get("errorMessage") or "transient error"
+                # The error text is cut to its first 120 characters in the status line.
+                yield PiStreamEvent(
+                    kind="status",
+                    text=(
+                        f"Auto-retry {attempt}/{max_attempts} in {delay_ms}ms "
+                        f"({str(msg)[:120]})"
+                    ),
+                    meta=event,
+                )
+            elif event_type == "auto_retry_end":
+                if event.get("success"):
+                    yield PiStreamEvent(
+                        kind="status",
+                        text=f"Auto-retry succeeded on attempt {event.get('attempt')}.",
+                    )
+                else:
+                    yield PiStreamEvent(
+                        kind="error",
+                        text=f"Auto-retry failed: {event.get('finalError', 'unknown error')}",
+                        is_error=True,
+                    )
+            elif event_type == "extension_error":
+                yield PiStreamEvent(
+                    kind="error",
+                    text=str(event.get("error") or "extension error"),
+                    is_error=True,
+                )
+            elif event_type == "agent_end":
+                # Pi delivers queued ``follow_up`` messages after ``agent_end`` and
+                # continues streaming; do not stop the stdout consumer until they run.
+                if self._pending_follow_ups > 0:
+                    self._pending_follow_ups -= 1
+                    yield PiStreamEvent(
+                        kind="status",
+                        text="Follow-up queued — continuing…",
+                    )
+                    continue
+                # Capture the abort flag before clearing it so the final message reflects it.
+                aborted = self._abort_requested
+                self.clear_abort()
+                yield PiStreamEvent(
+                    kind="done",
+                    text="Agent aborted." if aborted else "Agent finished.",
+                )
+                return
+    def _parse_message_update(self, event: dict[str, Any]) -> Iterator[PiStreamEvent]:
+        delta = event.get("assistantMessageEvent") or {}
+        delta_type = delta.get("type")
+        partial = partial_message_from_update(event)
+        if partial is not None:
+            visible, thinking = extract_assistant_display(partial)
+            if visible.strip():
+                yield PiStreamEvent(kind="text_snapshot", text=visible)
+            elif chat_text := chat_text_from_assistant_message(partial):
+                yield PiStreamEvent(kind="text_snapshot", text=chat_text)
+            if thinking.strip():
+                yield PiStreamEvent(kind="thinking_snapshot", text=thinking)
+        if delta_type == "text_delta":
+            chunk = delta.get("delta") or ""
+            if chunk:
+                yield PiStreamEvent(kind="text_delta", text=chunk)
+        elif delta_type == "thinking_delta":
+            chunk = delta.get("delta") or ""
+            if chunk:
+                yield PiStreamEvent(kind="thinking_delta", text=chunk)
+        elif delta_type == "toolcall_start":
+            tool_call = delta.get("toolCall") or {}
+            tool_name = tool_call.get("name") or delta.get("toolName") or "tool"
+            tool_args = tool_call.get("arguments")
+            if isinstance(tool_args, str):
+                try:
+                    tool_args = json.loads(tool_args)
+                except json.JSONDecodeError:
+                    tool_args = {"raw": tool_args}
+            if not isinstance(tool_args, dict):
+                tool_args = {}
+            chat_line = format_tool_chat_line(str(tool_name), tool_args)
+            yield PiStreamEvent(kind="text_snapshot", text=chat_line)
+        elif delta_type == "error":
+            yield PiStreamEvent(
+                kind="error",
+                text=str(
+                    delta.get("message") or delta.get("error") or "generation error"
+                ),
+                is_error=True,
+            )
+    def prompt_stream(
+        self, message: str, *, show_tool_status: bool = True
+    ) -> Iterator[str]:
+        """Backward-compatible text stream (assistant visible text + optional tool status)."""
+        for event in self.prompt_events(message):
+            if event.kind == "text_delta":
+                yield event.text
+            elif show_tool_status and event.kind == "tool_start":
+                yield f"\n\n_[Running {event.tool_name}…]_\n"
+            elif event.kind == "error":
+                yield f"\n\n**Error:** {event.text}\n"
+def start_pi_prompt_event_worker(
+    client: PiRpcClient,
+    event_queue: queue.Queue[Any],
+    prompt: str,
+) -> None:
+    """Run ``client.prompt_events`` on a background thread, feeding *event_queue*."""
+    def _worker() -> None:
+        try:
+            for event in client.prompt_events(prompt):
+                event_queue.put(event)
+        except Exception as exc:
+            event_queue.put(PiStreamEvent(kind="error", text=str(exc), is_error=True))
+        finally:
+            event_queue.put(None)
+    threading.Thread(target=_worker, daemon=True).start()
+def default_client(session_hash: str | None = None) -> PiRpcClient:
+    from pi_agent_config import configure_aws_credentials
+    from pi_workspace_skills import ensure_workspace_skills, pi_rpc_args, pi_rpc_cwd
+    configure_aws_credentials()
+    ensure_workspace_skills()
+    env = os.environ.copy()
+    env.setdefault("HOME", os.path.expanduser("~"))
+    env.setdefault("PYTHONUTF8", "1")
+    env.setdefault("PYTHONIOENCODING", "utf-8")
+    from session_workspace import workspace_base_dir
+    env.setdefault("PI_WORKSPACE_DIR", str(workspace_base_dir()))
+    if not env.get("GEMINI_API_KEY") and env.get("GOOGLE_API_KEY"):
+        env["GEMINI_API_KEY"] = env["GOOGLE_API_KEY"]
+    if not env.get("HF_TOKEN") and env.get("DOC_REDACTION_HF_TOKEN"):
+        env["HF_TOKEN"] = env["DOC_REDACTION_HF_TOKEN"]
+    return PiRpcClient(
+        cwd=pi_rpc_cwd(session_hash),
+        env=env,
+        pi_args=pi_rpc_args(),
+    )

agent-redact/pi/pi_session_usage.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""Summarize Pi agent LLM token usage for usage-log CSV rows."""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from pi_rpc_client import PiRpcClient, PiRpcError
+@dataclass(frozen=True)
+class TokenUsageTotals:
+    """Pi session usage (see Pi session-format ``Usage``)."""
+    input: int = 0
+    output: int = 0
+    cache_read: int = 0
+    cache_write: int = 0
+    @property
+    def llm_input_tokens(self) -> int:
+        """Input-side tokens for the main-app usage log (input + cache)."""
+        return self.input + self.cache_read + self.cache_write
+    @property
+    def llm_output_tokens(self) -> int:
+        return self.output
+def _int_field(raw: Any) -> int:
+    try:
+        return max(0, int(raw or 0))
+    except (TypeError, ValueError):
+        return 0
+def totals_from_usage_dict(usage: dict[str, Any] | None) -> TokenUsageTotals:
+    if not usage:
+        return TokenUsageTotals()
+    return TokenUsageTotals(
+        input=_int_field(usage.get("input")),
+        output=_int_field(usage.get("output")),
+        cache_read=_int_field(usage.get("cacheRead")),
+        cache_write=_int_field(usage.get("cacheWrite")),
+    )
+def totals_from_stats_payload(data: dict[str, Any] | None) -> TokenUsageTotals:
+    if not data:
+        return TokenUsageTotals()
+    tokens = data.get("tokens")
+    if isinstance(tokens, dict):
+        return totals_from_usage_dict(tokens)
+    return TokenUsageTotals()
+def subtract_usage(
+    after: TokenUsageTotals, before: TokenUsageTotals
+) -> TokenUsageTotals:
+    return TokenUsageTotals(
+        input=max(0, after.input - before.input),
+        output=max(0, after.output - before.output),
+        cache_read=max(0, after.cache_read - before.cache_read),
+        cache_write=max(0, after.cache_write - before.cache_write),
+    )
+def add_usage(left: TokenUsageTotals, right: TokenUsageTotals) -> TokenUsageTotals:
+    return TokenUsageTotals(
+        input=left.input + right.input,
+        output=left.output + right.output,
+        cache_read=left.cache_read + right.cache_read,
+        cache_write=left.cache_write + right.cache_write,
+    )
+def sum_usage_from_messages(
+    messages: list[dict[str, Any]],
+    *,
+    since_last_user: bool = False,
+) -> TokenUsageTotals:
+    """Sum ``usage`` on assistant messages (optional: only after the last user turn)."""
+    last_user = -1
+    if since_last_user:
+        for index, message in enumerate(messages):
+            if message.get("role") == "user":
+                last_user = index
+        messages = messages[last_user + 1 :] if last_user >= 0 else messages
+    total = TokenUsageTotals()
+    for message in messages:
+        if message.get("role") != "assistant":
+            continue
+        usage = message.get("usage")
+        if isinstance(usage, dict):
+            total = add_usage(total, totals_from_usage_dict(usage))
+    return total
+def sum_usage_from_jsonl(path: Path) -> TokenUsageTotals:
+    """Parse a Pi session JSONL file and sum assistant ``usage`` blocks."""
+    total = TokenUsageTotals()
+    try:
+        text = path.read_text(encoding="utf-8")
+    except OSError:
+        return total
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        try:
+            entry = json.loads(stripped)
+        except json.JSONDecodeError:
+            continue
+        if entry.get("type") != "message":
+            continue
+        message = entry.get("message")
+        if not isinstance(message, dict) or message.get("role") != "assistant":
+            continue
+        usage = message.get("usage")
+        if isinstance(usage, dict):
+            total = add_usage(total, totals_from_usage_dict(usage))
+    return total
+def resolve_session_token_usage(client: PiRpcClient | None) -> TokenUsageTotals:
+    """
+    Best-effort session usage from Pi RPC ``get_session_stats``, live messages, or JSONL.
+    """
+    if client is None or not client.running:
+        return TokenUsageTotals()
+    try:
+        stats = client.get_session_stats()
+        totals = totals_from_stats_payload(stats)
+        if totals.input or totals.output or totals.cache_read or totals.cache_write:
+            return totals
+    except PiRpcError:
+        pass
+    try:
+        messages = client.get_messages()
+        totals = sum_usage_from_messages(messages)
+        if totals.input or totals.output or totals.cache_read or totals.cache_write:
+            return totals
+    except PiRpcError:
+        pass
+    from session_logs import pi_session_file_from_client
+    session_file = pi_session_file_from_client(client)
+    if session_file is not None:
+        return sum_usage_from_jsonl(session_file)
+    return TokenUsageTotals()
+def usage_for_completed_turn(
+    client: PiRpcClient | None,
+    baseline: TokenUsageTotals | None,
+) -> TokenUsageTotals:
+    """
+    Tokens consumed by the prompt that just finished.
+    Prefers delta from *baseline* (captured before ``prompt_events``). Falls back to
+    summing assistant ``usage`` since the last user message, then whole-session totals.
+    """
+    if client is None or not client.running:
+        return TokenUsageTotals()
+    current = resolve_session_token_usage(client)
+    if baseline is not None:
+        delta = subtract_usage(current, baseline)
+        if delta.input or delta.output or delta.cache_read or delta.cache_write:
+            return delta
+    try:
+        turn = sum_usage_from_messages(client.get_messages(), since_last_user=True)
+        if turn.input or turn.output or turn.cache_read or turn.cache_write:
+            return turn
+    except PiRpcError:
+        pass
+    return current

agent-redact/pi/pi_workspace_skills.py ADDED Viewed

	@@ -0,0 +1,392 @@

+"""Sync doc_redaction skills into the Pi workspace and constrain Pi RPC to that tree."""
+from __future__ import annotations
+import os
+import shutil
+import stat
+from pathlib import Path
+from bootstrap_pi_config import pi_repo_root_path
+def workspace_base_dir() -> Path:
+    """Return the workspace base directory by delegating to ``session_workspace.workspace_base_dir``."""
+    # Imported inside the function rather than at module import time.
+    from session_workspace import workspace_base_dir as _base
+    return _base()
+def workspace_pi_dir() -> Path:
+    return workspace_base_dir() / ".pi"
+def workspace_skills_dir() -> Path:
+    return workspace_pi_dir() / "skills"
+def workspace_helpers_dir() -> Path:
+    return workspace_pi_dir() / "helpers"
+def remote_redaction_helper_path() -> Path:
+    """Absolute path to synced ``remote_redaction.py`` (always under workspace base, not session subfolders)."""
+    return workspace_helpers_dir() / "remote_redaction.py"
+def remote_redaction_helper_module() -> str:
+    return remote_redaction_helper_path().as_posix()
+def repo_skills_dir() -> Path:
+    return pi_repo_root_path() / "skills"
+def _env_flag(name: str) -> bool:
+    return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"}
+# Directory names that exclude a path from the skills sync when they appear anywhere in it.
+_SKILLS_SKIP_DIR_NAMES = frozenset({"archive_attempts"})
+# Filename endings (matched against the lower-cased name) that are never synced.
+_SKILLS_SKIP_SUFFIXES = (".b64.txt",)
+# Largest file size, in bytes, that is synced. Read once at import time from
+# PI_SKILLS_MAX_FILE_BYTES; defaults to 512 KiB. A non-integer value raises ValueError here.
+_SKILLS_MAX_FILE_BYTES = int(
+    os.environ.get("PI_SKILLS_MAX_FILE_BYTES", str(512 * 1024))
+)
+def _should_skip_skill_relpath(rel: Path, *, size_bytes: int | None = None) -> bool:
+    """Skip archive blobs and other non-skill artifacts during workspace sync."""
+    if any(part in _SKILLS_SKIP_DIR_NAMES for part in rel.parts):
+        return True
+    name_lower = rel.name.lower()
+    if name_lower.endswith(_SKILLS_SKIP_SUFFIXES):
+        return True
+    if size_bytes is not None and size_bytes > _SKILLS_MAX_FILE_BYTES:
+        return True
+    return False
+def _should_resync(dest: Path, src: Path) -> bool:
+    if _env_flag("PI_SKILLS_RESYNC"):
+        return True
+    if not dest.is_dir():
+        return True
+    if not any(dest.iterdir()):
+        return True
+    try:
+        return src.stat().st_mtime > dest.stat().st_mtime
+    except OSError:
+        return True
+def _copy_tree_item(src: Path, dest: Path) -> None:
+    """Copy *src* to *dest* through ``_copy_tree_item_filtered``, using *src* itself as the filter root."""
+    _copy_tree_item_filtered(src, dest, src_root=src)
+def _copy_tree_item_filtered(src: Path, dest: Path, *, src_root: Path) -> None:
+    rel = src.relative_to(src_root)
+    if _should_skip_skill_relpath(rel):
+        return
+    if src.is_file():
+        try:
+            size = src.stat().st_size
+        except OSError:
+            size = None
+        if size is not None and size > _SKILLS_MAX_FILE_BYTES:
+            return
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        if dest.exists():
+            _make_writable(dest)
+        shutil.copy2(src, dest)
+        return
+    if dest.exists():
+        for child in sorted(src.iterdir()):
+            _copy_tree_item_filtered(child, dest / child.name, src_root=src_root)
+    else:
+        dest.mkdir(parents=True, exist_ok=True)
+        for child in sorted(src.iterdir()):
+            _copy_tree_item_filtered(child, dest / child.name, src_root=src_root)
+def _chmod_tree(path: Path, *, writable: bool) -> None:
+    """Set or clear write bits on a file tree (needed for Windows resync)."""
+    try:
+        if path.is_dir():
+            for root, dirs, files in os.walk(path):
+                root_path = Path(root)
+                for name in files:
+                    file_path = root_path / name
+                    mode = file_path.stat().st_mode
+                    file_path.chmod(
+                        (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+                        if writable
+                        else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+                    )
+                for name in dirs:
+                    dir_path = root_path / name
+                    mode = dir_path.stat().st_mode
+                    dir_path.chmod(
+                        (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+                        if writable
+                        else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+                    )
+            mode = path.stat().st_mode
+            path.chmod(
+                (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+                if writable
+                else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+            )
+        else:
+            mode = path.stat().st_mode
+            path.chmod(
+                (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+                if writable
+                else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
+            )
+    except OSError:
+        pass
+def _make_writable(path: Path) -> None:
+    """Set the write bits on *path* and its contents via ``_chmod_tree``."""
+    _chmod_tree(path, writable=True)
+def _make_readonly(path: Path) -> None:
+    if _env_flag("PI_SKILLS_WRITABLE"):
+        return
+    _chmod_tree(path, writable=False)
+def write_workspace_pi_settings() -> Path:
+    """
+    Project Pi settings under ``{workspace}/.pi/settings.json``.
+    Paths in that file resolve relative to ``{workspace}/.pi/`` per Pi docs.
+    """
+    pi_dir = workspace_pi_dir()
+    pi_dir.mkdir(parents=True, exist_ok=True)
+    settings_path = pi_dir / "settings.json"
+    payload = {
+        "skills": ["skills"],
+        "extensions": [],
+        "packages": [],
+        "enableSkillCommands": True,
+    }
+    import json
+    settings_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
+    return settings_path
+def sync_repo_skills_to_workspace(*, force: bool = False) -> Path:
+    """
+    Copy ``{repo}/skills/`` → ``{workspace}/.pi/skills/`` (read-only for the agent).
+    Re-sync when the repo tree is newer or ``PI_SKILLS_RESYNC=true``.
+    """
+    src = repo_skills_dir()
+    dest = workspace_skills_dir()
+    workspace_pi_dir().mkdir(parents=True, exist_ok=True)
+    if not src.is_dir():
+        dest.mkdir(parents=True, exist_ok=True)
+        write_workspace_pi_settings()
+        return dest
+    if force or _should_resync(dest, src):
+        if dest.exists():
+            _make_writable(dest)
+            shutil.rmtree(dest)
+        dest.mkdir(parents=True, exist_ok=True)
+        for item in sorted(src.iterdir()):
+            rel = item.relative_to(src)
+            try:
+                size = item.stat().st_size if item.is_file() else None
+            except OSError:
+                size = None
+            if _should_skip_skill_relpath(rel, size_bytes=size):
+                continue
+            _copy_tree_item_filtered(item, dest / item.name, src_root=src)
+    write_workspace_pi_settings()
+    os.environ["PI_WORKSPACE_SKILLS_DIR"] = str(dest.resolve())
+    return dest.resolve()
+def sync_workspace_helpers() -> Path:
+    """
+    Copy Pi redaction helper scripts into ``{workspace}/.pi/helpers/``.
+    Keeps ``remote_redaction.py`` inside the workspace boundary on AWS ECS so the
+    agent does not search ``/workspace/doc_redaction/agent-redact/``.
+    """
+    helpers = workspace_helpers_dir()
+    helpers.mkdir(parents=True, exist_ok=True)
+    pi_dir = Path(__file__).resolve().parent
+    for name in ("remote_redaction.py", "run_doc_redact.py"):
+        src = pi_dir / name
+        dest = helpers / name
+        if not src.is_file():
+            continue
+        if not dest.is_file() or src.stat().st_mtime > dest.stat().st_mtime:
+            shutil.copy2(src, dest)
+    return helpers.resolve()
+def write_hf_space_deployment_skill(*, force: bool = False) -> Path | None:
+    """
+    Write a deployment-specific skill that overrides Docker URLs in generic skills.
+    Only active when ``PI_DEPLOYMENT_PROFILE=hf-space``.
+    The skill is written to ``hf-space-deployment/SKILL.md`` under the workspace
+    skills directory. The file is rewritten only when *force* is true, when it
+    does not exist, or when its current text differs from the generated content.
+    Returns the path of ``SKILL.md``, or ``None`` when the supporting modules
+    cannot be imported or ``is_hf_space_profile()`` is false.
+    """
+    try:
+        from pi_agent_config import is_hf_space_profile
+        from redaction_prompt import doc_redaction_gradio_url
+    except ImportError:
+        return None
+    if not is_hf_space_profile():
+        return None
+    skills_root = workspace_skills_dir()
+    skills_root.mkdir(parents=True, exist_ok=True)
+    if skills_root.is_dir():
+        # Restore the write bits on the skills tree before adding to it.
+        _make_writable(skills_root)
+    dest_dir = skills_root / "hf-space-deployment"
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    dest = dest_dir / "SKILL.md"
+    url = doc_redaction_gradio_url()
+    helpers = workspace_helpers_dir().as_posix()
+    # Markdown body of the skill; the URL and helper paths are interpolated below.
+    content = (
+        "# HF Space deployment (read first)\n\n"
+        "This Pi agent runs on **Hugging Face Spaces** with **Gemini** and calls a "
+        "**remote** doc_redaction Space. Generic skills mention Docker URLs for "
+        "local-docker or AWS ECS — **ignore those here**.\n\n"
+        "## Authoritative settings\n\n"
+        "| Setting | Value |\n"
+        "|---------|--------|\n"
+        f"| **doc_redaction URL** | `{url}` **only** |\n"
+        "| **Auth** | `HF_TOKEN` (Space secret; already in Pi subprocess env) |\n"
+        f"| **Helper module** | `{helpers}/remote_redaction.py` |\n\n"
+        "## One-shot CLI (preferred over writing ``run_redact.py``)\n\n"
+        f"```bash\n"
+        f"python3 {helpers}/run_doc_redact.py \\\n"
+        f'  --pdf "<session-folder>/document.pdf" \\\n'
+        f'  --dest "<session-folder>/redact/document.pdf/output_redact/" \\\n'
+        f'  --ocr-method "Local model - selectable text" \\\n'
+        f'  --pii-method "Local"\n'
+        f"```\n\n"
+        "## Minimal Python (only if the CLI is insufficient)\n\n"
+        "```python\n"
+        "import importlib.util\n"
+        "import sys\n"
+        f'helper = "{helpers}/remote_redaction.py"\n'
+        'spec = importlib.util.spec_from_file_location("remote_redaction", helper)\n'
+        "mod = importlib.util.module_from_spec(spec)\n"
+        'sys.modules["remote_redaction"] = mod\n'
+        "spec.loader.exec_module(mod)\n"
+        "from gradio_client import handle_file\n\n"
+        f"client = mod.make_redaction_client()  # URL: {url}\n"
+        'pdf = "<your-session-folder>/document.pdf"\n'
+        "result = client.predict(\n"
+        '    api_name="/doc_redact",\n'
+        "    document_file=handle_file(pdf),\n"
+        ")\n"
+        'paths = mod.resolve_redaction_output_paths(result, document_stem="document")\n'
+        'mod.fetch_redaction_files(paths, "<your-session-folder>/redact/document/output_redact/")\n'
+        "```\n\n"
+        "## Rules\n\n"
+        f"- **Helper path is shared:** `{helpers}/remote_redaction.py` lives under the "
+        f"workspace root `{workspace_base_dir().as_posix()}/`, **not** under your session "
+        f"subfolder's `.pi/` tree.\n"
+        f"- Call `/doc_redact` via `{helpers}/run_doc_redact.py` or `make_redaction_client()`.\n"
+        "- **Do not** create `run_redact.py`, `run_redact_fixed.py`, or duplicate helpers in your session folder.\n"
+        "- **Do not** call `Client(...)` or `view_api()` in a loop from bash — each call hits HF rate limits. "
+        "Use the CLI once, or one `make_redaction_client()` (cached + retries).\n"
+        "- **Do not** pass `base_url=` manually — `make_redaction_client()` reads "
+        f"`DOC_REDACTION_GRADIO_URL` (`{url}`).\n"
+        "- **Do not** use `host.docker.internal`, `localhost`, `redaction:7861`, or probe "
+        "alternate URLs.\n"
+        "- **Do not** rewrite or duplicate `remote_redaction.py` — use the synced helper.\n"
+        "- On `TooManyRequestsError`, wait at least 60s and retry **once** via the CLI — "
+        "do not spawn repeated `python3 -c` Client probes.\n"
+        "- Write status updates as **normal assistant text**, not bash `#` comments.\n"
+        "- After `/doc_redact`, download outputs with `fetch_redaction_files` into your "
+        "session `output_redact/` folder.\n\n"
+        "Then read `/skill:doc-redaction-app` and `/skill:doc-redaction-modifications` "
+        "for workflow steps, substituting the URL above wherever examples show Docker hosts.\n"
+    )
+    # Skip the write when the file on disk already holds exactly this content.
+    if force or not dest.is_file() or dest.read_text(encoding="utf-8") != content:
+        dest.write_text(content, encoding="utf-8")
+    return dest
+def ensure_workspace_skills(*, force: bool = False) -> Path:
+    """Idempotent sync used at app startup and before Pi RPC starts."""
+    dest = sync_repo_skills_to_workspace(force=force)
+    sync_workspace_helpers()
+    write_hf_space_deployment_skill(force=force)
+    if dest.is_dir():
+        _make_readonly(dest)
+    return dest
+def partnership_template_in_workspace() -> Path | None:
+    path = workspace_skills_dir() / "Example prompt partnership.txt"
+    return path if path.is_file() else None
+def pi_rpc_cwd(session_hash: str | None = None) -> str:
+    """Subprocess cwd for ``pi --mode rpc`` (session subfolder when enabled)."""
+    from session_workspace import session_workspace_dir, session_workspace_enabled
+    base = workspace_base_dir()
+    if session_hash and session_hash.strip() and session_workspace_enabled():
+        return str(session_workspace_dir(session_hash))
+    return str(base)
+def pi_rpc_args() -> list[str]:
+    """Load only workspace skills; do not discover repo ``skills/`` via ancestors."""
+    skills_dir = ensure_workspace_skills()
+    return ["--no-skills", "--skill", str(skills_dir)]
+def workspace_boundary_prefix(session_hash: str | None = None) -> str:
+    """Extra prompt text: workspace root, skills path, and path rules.
+    The active directory named in the text is the session folder when
+    *session_hash* is non-blank and session workspaces are enabled, otherwise the
+    workspace base. When the HF Space profile is active, a note about the
+    redaction backend URL and shared helper path is appended.
+    """
+    # Trailing slashes are stripped here because the templates below add their own.
+    base = workspace_base_dir().as_posix().rstrip("/")
+    skills = workspace_skills_dir().as_posix()
+    from session_workspace import session_workspace_dir, session_workspace_enabled
+    if session_hash and session_hash.strip() and session_workspace_enabled():
+        root = session_workspace_dir(session_hash).as_posix().rstrip("/")
+        scope = f"your session folder `{root}/`"
+    else:
+        root = base
+        scope = f"the workspace `{base}/`"
+    hf_note = ""
+    # The HF note is optional: if either module cannot be imported it is left empty.
+    try:
+        from pi_agent_config import is_hf_space_profile
+        from redaction_prompt import doc_redaction_gradio_url
+        if is_hf_space_profile():
+            helpers = remote_redaction_helper_module()
+            hf_note = (
+                f"**HF Space redaction backend:** use `{doc_redaction_gradio_url()}` only "
+                f"(see `/skill:hf-space-deployment`). Import helpers from `{helpers}` "
+                f"(workspace base — not `{root}/.pi/helpers/`). Do not use Docker host "
+                "URLs from other skills. Write user-facing progress as normal chat text, "
+                "not bash comments.\n\n"
+            )
+    except ImportError:
+        pass
+    return (
+        f"**Workspace boundary (mandatory):** work only under `{base}/`. "
+        f"Your active directory is {scope}. "
+        f"Do not read, write, or run shell commands targeting paths outside `{base}/` "
+        f"(including the git checkout and `agent-redact/`). "
+        f"**Skills (read-only):** doc_redaction skills are synced to `{skills}/`. "
+        f"Use `/skill:doc-redaction-app`, `/skill:doc-redact-page-review`, etc. "
+        f"Do not edit files under `{skills}/`.\n\n"
+        f"{hf_note}"
+    )

agent-redact/pi/redaction_prompt.py ADDED Viewed

	@@ -0,0 +1,756 @@

+"""Build Pi redaction task prompts from the partnership example template."""
+from __future__ import annotations
+import os
+import re
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+from pi_agent_config import is_aws_ecs_profile, is_hf_space_profile
+from session_workspace import workspace_base_dir
def upload_root() -> Path:
    """Return the resolved Gradio upload directory, creating it when missing.

    ``PI_UPLOAD_ROOT`` is used when it holds a non-blank value; otherwise the
    location comes from ``bootstrap_pi_config.ensure_pi_upload_root``.
    """
    configured = os.environ.get("PI_UPLOAD_ROOT", "").strip()
    if configured:
        location = Path(configured)
    else:
        from bootstrap_pi_config import ensure_pi_upload_root

        location = Path(ensure_pi_upload_root(pi_repo_root()))
    location.mkdir(parents=True, exist_ok=True)
    return location.resolve()
# Upper bound, in UTF-8 bytes, applied to workspace file names by
# ``_truncate_upload_filename``.
_SAFE_UPLOAD_FILENAME_MAX_BYTES = 255
# Path separators, nulls, and characters unsafe on common filesystems — not general punctuation.
_UNSAFE_UPLOAD_FILENAME_CHARS_RE = re.compile(r'[\x00-\x1f<>:"|?*\\/]')


def _clip_utf8(text: str, max_bytes: int) -> str:
    """Return the longest prefix of ``text`` whose UTF-8 encoding fits in ``max_bytes``."""
    if max_bytes <= 0:
        return ""
    # Slicing bytes may cut the last character in half; "ignore" drops that fragment.
    return text.encode("utf-8")[:max_bytes].decode("utf-8", errors="ignore")


def _truncate_upload_filename(
    name: str, *, max_bytes: int = _SAFE_UPLOAD_FILENAME_MAX_BYTES
) -> str:
    """Shorten ``name`` so its UTF-8 encoding is at most ``max_bytes`` bytes.

    Names that already fit are returned unchanged. Otherwise the stem is cut on
    a character boundary and the extension is kept. An extension that would not
    fit on its own is cut as well, leaving at least one byte for the stem, so
    the result never exceeds the limit (for ``max_bytes >= 1``).

    Args:
        name: File basename to shorten.
        max_bytes: Maximum encoded length of the result.

    Returns:
        The original or shortened basename.
    """
    if len(name.encode("utf-8")) <= max_bytes:
        return name
    stem, suffix = os.path.splitext(name)
    # Reserve one byte for the stem so an oversized extension cannot crowd it out.
    suffix = _clip_utf8(suffix, max_bytes - 1)
    stem_budget = max(1, max_bytes - len(suffix.encode("utf-8")))
    # The placeholder is clipped too, so a tiny budget still honours the limit.
    stem = _clip_utf8(stem, stem_budget) or "file"[:stem_budget]
    return stem + suffix
def _split_upload_basename(name: str) -> tuple[str, str]:
    """Return ``(stem, extension)`` for an upload basename.

    A name made only of a dot followed by non-dot, non-separator characters
    (for example ``.pdf``) is reported as an empty stem plus that extension;
    every other name is split by ``pathlib``.
    """
    extension_only = re.fullmatch(r"\.[^./\\]+", name) is not None
    if extension_only:
        return "", name
    parsed = Path(name)
    return parsed.stem, parsed.suffix
def _workspace_filename_from_upload(name: str) -> tuple[str, str, bool]:
    """
    Derive a workspace-safe basename, changing the name only when required for security.

    Unsafe characters in the stem and extension are replaced with ``_``, leading
    and trailing dots/spaces are stripped from the stem (an empty stem becomes
    ``file``), and the result is length-limited.

    Returns ``(original_basename, workspace_basename, renamed)``.

    Raises:
        ValueError: if the basename is empty, ``.``/``..``, or contains a NUL
            or a path separator.
    """
    original = Path(name).name.strip()
    invalid = (
        not original
        or original in {".", ".."}
        or "\x00" in original
        or "/" in original
        or "\\" in original
    )
    if invalid:
        raise ValueError("Uploaded file has an invalid name.")
    raw_stem, raw_suffix = _split_upload_basename(original)
    scrub = _UNSAFE_UPLOAD_FILENAME_CHARS_RE.sub
    clean_stem = scrub("_", raw_stem)
    clean_suffix = scrub("_", raw_suffix)
    clean_stem = clean_stem.strip(". ") or "file"
    workspace_name = _truncate_upload_filename(clean_stem + clean_suffix)
    return original, workspace_name, workspace_name != original
# Relative location of the partnership prompt template; ``partnership_template_path``
# joins it onto ``pi_repo_root()`` when no workspace copy is found.
_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
def _workspace_root() -> Path:
    """Return the workspace base directory reported by ``session_workspace``."""
    base_dir = workspace_base_dir()
    return base_dir
def pi_repo_root() -> Path:
    """Return the monorepo checkout root (skills/, config/).

    The value is whatever ``bootstrap_pi_config.pi_repo_root_path`` reports; it
    is set via :func:`bootstrap_pi_config.ensure_pi_workdir`.
    """
    from bootstrap_pi_config import pi_repo_root_path as locate_repo_root

    return locate_repo_root()
def partnership_template_path() -> Path:
    """Return the partnership template path, preferring the workspace-synced copy.

    Falls back to ``_PARTNERSHIP_TEMPLATE`` under the repo root when the
    workspace has no synced template.
    """
    from pi_workspace_skills import partnership_template_in_workspace

    workspace_copy = partnership_template_in_workspace()
    if workspace_copy is None:
        return pi_repo_root() / _PARTNERSHIP_TEMPLATE
    return workspace_copy
# OCR / PII method names used on the Hugging Face Space profile
# (see ``_env_default`` and ``RedactionTaskSettings.hf_space_defaults``).
HF_DEFAULT_OCR = "Local model - selectable text"
HF_DEFAULT_PII = "Local"
# Backend URL returned by ``doc_redaction_gradio_url`` on the HF Space profile when
# the environment variable is unset and ``tools.config`` cannot be imported.
HF_DEFAULT_GRADIO_URL = "https://seanpedrickcase-document-redaction.hf.space"
# Used only when PI_DEFAULT_OCR_METHOD / PI_DEFAULT_PII_METHOD are unset (local-docker profile).
_FALLBACK_LOCAL_OCR = "hybrid-paddle-inference-server"
_FALLBACK_LOCAL_PII = "Local"
def _env_default(key: str, *, hf_default: str, local_fallback: str) -> str:
    """Resolve a Pi redaction default from the environment with profile fallbacks.

    A non-blank value of environment variable ``key`` (e.g. from
    ``config/pi_agent.env``) wins. Otherwise ``hf_default`` is returned on the
    HF Space profile and ``local_fallback`` everywhere else.
    """
    from_env = os.environ.get(key, "").strip()
    if not from_env:
        return hf_default if is_hf_space_profile() else local_fallback
    return from_env
# Module-level defaults, resolved once at import time by ``_env_default``:
# the environment variable when set, otherwise the active profile's fallback.
DEFAULT_OCR_METHOD = _env_default(
    "PI_DEFAULT_OCR_METHOD",
    hf_default=HF_DEFAULT_OCR,
    local_fallback=_FALLBACK_LOCAL_OCR,
)
DEFAULT_PII_METHOD = _env_default(
    "PI_DEFAULT_PII_METHOD",
    hf_default=HF_DEFAULT_PII,
    local_fallback=_FALLBACK_LOCAL_PII,
)
# OCR method names accepted by ``RedactionTaskSettings.from_ui``; any other value
# is replaced with ``DEFAULT_OCR_METHOD``.
OCR_METHOD_CHOICES: tuple[str, ...] = (
    "hybrid-paddle-inference-server",
    "hybrid-paddle-vlm",
    "Local model - selectable text",
    "Local OCR",
    "AWS Textract service - all PDF types",
    "tesseract",
    "paddle",
    "hybrid-paddle",
    "vlm",
    "inference-server",
)
# PII method names accepted by ``RedactionTaskSettings.from_ui``; any other value
# is replaced with ``DEFAULT_PII_METHOD``.
PII_METHOD_CHOICES: tuple[str, ...] = (
    "Local",
    "AWS Comprehend",
    "LLM (AWS Bedrock)",
    "Local inference server",
    "Local transformers LLM",
    "Only extract text (no redaction)",
)
# Page limit used when none of the environment variables below is set.
_DEFAULT_MAX_PAGES = 3000


def max_pages_limit() -> int:
    """
    Maximum PDF pages allowed for a Pi redaction task.

    Resolution order: ``PI_MAX_PAGES`` → ``MAX_PAGES`` → ``MAX_DOC_PAGES`` → 3000.
    The first variable holding a non-blank value decides the result.

    Raises:
        ValueError: if that value is not an integer or is below 1. The message
            names the offending variable.
    """
    for key in ("PI_MAX_PAGES", "MAX_PAGES", "MAX_DOC_PAGES"):
        raw = (os.environ.get(key) or "").strip()
        if not raw:
            continue
        try:
            value = int(raw)
        except ValueError as exc:
            # Name the variable so a misconfiguration is easy to trace.
            raise ValueError(
                f"{key} must be a positive integer (got {raw!r})."
            ) from exc
        if value < 1:
            raise ValueError(f"{key} must be a positive integer.")
        return value
    return _DEFAULT_MAX_PAGES
def pages_to_process_count(page_range: str, total_pages: int) -> int:
    """Return how many pages ``page_range`` selects from a ``total_pages`` PDF.

    ``page_range`` may be blank or ``all`` (every page), a single 1-based page
    number, or an inclusive ``start-end`` span. Matching is case-insensitive and
    ignores surrounding whitespace.

    Raises:
        ValueError: if the document has no pages, the range cannot be parsed,
            or it points outside the document.
    """
    if total_pages < 1:
        raise ValueError("PDF has no pages.")
    spec = (page_range or "all").strip().lower()
    if spec in ("", "all"):
        return total_pages

    bad_range = f"Invalid page range: {page_range!r}"

    if "-" not in spec:
        # Single page number.
        try:
            single = int(spec)
        except ValueError as exc:
            raise ValueError(bad_range) from exc
        if not 1 <= single <= total_pages:
            raise ValueError(
                f"Page {single} is out of range (document has {total_pages} pages)."
            )
        return 1

    # Inclusive "start-end" span, split on the first dash only.
    first_text, _, last_text = spec.partition("-")
    try:
        first = int(first_text.strip())
        last = int(last_text.strip())
    except ValueError as exc:
        raise ValueError(bad_range) from exc
    if first < 1 or last < first:
        raise ValueError(bad_range)
    if last > total_pages:
        raise ValueError(
            f"Page range {page_range!r} exceeds document length "
            f"({total_pages} pages)."
        )
    return last - first + 1
def pdf_page_count(file_path: str | Path) -> int:
    """Open ``file_path`` with PyMuPDF and return its page count as an ``int``."""
    import pymupdf

    with pymupdf.open(Path(file_path)) as document:
        return int(document.page_count)
def validate_pdf_page_limit(
    file_path: str | Path,
    *,
    page_range: str = "all",
    max_pages: int | None = None,
) -> None:
    """Reject PDFs whose selected page count exceeds the allowed maximum.

    Files whose suffix is not ``.pdf`` (case-insensitive) are accepted without
    being opened. The limit is ``max_pages`` when given, otherwise
    ``max_pages_limit()``.

    Raises:
        ValueError: if the page count cannot be read, ``page_range`` is invalid,
            or the number of selected pages exceeds the limit.
    """
    pdf_path = Path(file_path)
    if pdf_path.suffix.lower() != ".pdf":
        return
    allowed = max_pages_limit() if max_pages is None else max_pages
    try:
        total = pdf_page_count(pdf_path)
    except Exception as exc:
        # Any read failure is reported to the caller as a validation error.
        raise ValueError(f"Could not read PDF page count for {pdf_path.name}.") from exc
    selected = pages_to_process_count(page_range, total)
    if selected <= allowed:
        return
    scope = page_range.strip() or "all"
    raise ValueError(
        f"Number of pages to process ({selected}) exceeds the maximum allowed "
        f"({allowed}). Submit a smaller document or narrow the page range "
        f"({scope!r})."
    )
@dataclass(frozen=True)
class RedactionTaskSettings:
    """Immutable per-task choices used when building a redaction prompt.

    Field defaults are computed once, when the class is defined: the OCR/PII
    methods come from the module defaults, and the two VLM flags default to
    ``True`` everywhere except on the HF Space profile.
    """

    ocr_method: str = DEFAULT_OCR_METHOD
    pii_method: str = DEFAULT_PII_METHOD
    encourage_vlm_faces: bool = not is_hf_space_profile()
    encourage_vlm_signatures: bool = not is_hf_space_profile()

    @classmethod
    def hf_space_defaults(cls) -> RedactionTaskSettings:
        """Return the fixed settings for the HF Space profile (both VLM flags off)."""
        return cls(HF_DEFAULT_OCR, HF_DEFAULT_PII, False, False)

    @classmethod
    def from_ui(
        cls,
        ocr_method: str,
        pii_method: str,
        encourage_vlm_faces: bool,
        encourage_vlm_signatures: bool,
    ) -> RedactionTaskSettings:
        """Build settings from UI values.

        Blank method names fall back to the module defaults; names that are not
        in ``OCR_METHOD_CHOICES`` / ``PII_METHOD_CHOICES`` after stripping are
        also replaced by the defaults. The flags are coerced to ``bool``.
        """
        chosen_ocr = (ocr_method or DEFAULT_OCR_METHOD).strip()
        chosen_pii = (pii_method or DEFAULT_PII_METHOD).strip()
        return cls(
            ocr_method=(
                chosen_ocr if chosen_ocr in OCR_METHOD_CHOICES else DEFAULT_OCR_METHOD
            ),
            pii_method=(
                chosen_pii if chosen_pii in PII_METHOD_CHOICES else DEFAULT_PII_METHOD
            ),
            encourage_vlm_faces=bool(encourage_vlm_faces),
            encourage_vlm_signatures=bool(encourage_vlm_signatures),
        )
def doc_redaction_gradio_url() -> str:
    """
    Base URL of the doc_redaction Gradio app used for ``/doc_redact`` and review APIs.

    Set ``DOC_REDACTION_GRADIO_URL`` in ``config/pi_agent.env`` (or the process environment).
    The environment is read on each call so runtime overrides apply before ``tools.config``
    is imported (e.g. HF Space Docker ``ENV``, tests, and late ``load_dotenv``).

    Lookup order: the environment variable, then ``tools.config``, then a
    profile-dependent built-in default when ``tools.config`` cannot be imported.
    Surrounding whitespace and trailing slashes are removed from the first two.
    """
    override = os.environ.get("DOC_REDACTION_GRADIO_URL", "").strip().rstrip("/")
    if override:
        return override
    try:
        from tools.config import DOC_REDACTION_GRADIO_URL
    except ImportError:
        if is_hf_space_profile():
            return HF_DEFAULT_GRADIO_URL
        return "http://127.0.0.1:7860"
    return str(DOC_REDACTION_GRADIO_URL).strip().rstrip("/")
def _default_gradio_url() -> str:
    """Back-compat alias of ``doc_redaction_gradio_url`` used for template substitution."""
    url = doc_redaction_gradio_url()
    return url
def _default_vlm_base_url() -> str:
    """Return ``PI_VLM_BASE_URL`` from the environment, or the built-in default when unset."""
    fallback = "http://llama-inference:8080"
    return os.getenv("PI_VLM_BASE_URL", fallback)
def _default_vlm_model() -> str:
    """Return ``PI_VLM_MODEL`` from the environment, or the built-in default when unset."""
    fallback = "unsloth/Qwen3.6-27B-MTP-GGUF"
    return os.getenv("PI_VLM_MODEL", fallback)
def load_template(path: Path | None = None) -> str:
    """Read the prompt template as UTF-8 text.

    Args:
        path: Template file to read; ``partnership_template_path()`` is used
            when omitted.

    Raises:
        FileNotFoundError: if the chosen path is not an existing file.
    """
    template_file = path if path else partnership_template_path()
    if template_file.is_file():
        return template_file.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Prompt template not found: {template_file}")
def format_user_requirements(instructions: str) -> str:
    """Normalise free-text instructions into one ``-`` bullet per non-blank line.

    Each line is stripped; blank lines are dropped; lines that do not already
    start with ``-`` get a ``- `` prefix. The bullets are joined with newlines.
    """
    stripped_lines = (raw.strip() for raw in instructions.strip().splitlines())
    bullets = [
        line if line.startswith("-") else f"- {line}"
        for line in stripped_lines
        if line
    ]
    return "\n".join(bullets)
def replace_user_requirements_section(template: str, instructions: str) -> str:
    """Install the user's requirements as the final section of ``template``.

    Everything from the first ``## User redaction requirements`` heading onwards
    is replaced by a fresh heading plus the formatted instructions. When the
    heading is absent, the section is appended after the right-stripped template.
    """
    marker = "## User redaction requirements"
    section = (
        f"{marker} (authoritative for this task)\n\n"
        f"{format_user_requirements(instructions)}\n"
    )
    position = template.find(marker)
    if position == -1:
        return f"{template.rstrip()}\n\n{section}"
    return template[:position] + section
def _is_textract_ocr_method(ocr_method: str) -> bool:
    """Return ``True`` when ``ocr_method`` mentions Textract (case-insensitive).

    A substring test is sufficient: the exact names ``textract`` and
    ``aws textract`` both contain the substring, so no separate set lookup is
    needed.
    """
    return "textract" in ocr_method.casefold()
def build_vlm_faces_guidance(encourage: bool) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_FACES`` entity.

    On the HF Space profile the sentence says face detection is unavailable,
    regardless of ``encourage``. Otherwise ``encourage`` picks between the
    "pass it if the user asks" wording and the more restrictive one.
    """
    if is_hf_space_profile():
        return (
            "Pass 2 VLM and CUSTOM_VLM_FACES are not available on this deployment. "
            "Do not pass CUSTOM_VLM_FACES or request face detection."
        )
    if not encourage:
        return (
            "Do not pass CUSTOM_VLM_FACES in the initial redaction entity list unless "
            "the user explicitly asks to redact faces"
        )
    return (
        "If the user asks to redact faces, then pass the entity CUSTOM_VLM_FACES "
        "in the initial redaction entity selection"
    )
def build_vlm_signature_guidance(encourage: bool, ocr_method: str) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_SIGNATURE`` entity.

    On the HF Space profile the sentence says signature detection is
    unavailable. Otherwise, when ``encourage`` is true the agent is told to pass
    the entity on request, with an extra clause about
    ``handwrite_signature_textbox`` when ``ocr_method`` is a Textract method;
    when false, the restrictive wording is returned.
    """
    if is_hf_space_profile():
        return (
            "Pass 2 VLM and CUSTOM_VLM_SIGNATURE are not available on this deployment. "
            "Do not pass CUSTOM_VLM_SIGNATURE or request signature detection."
        )
    if not encourage:
        return (
            "Do not pass CUSTOM_VLM_SIGNATURE in the initial redaction entity list unless "
            "the user explicitly asks to redact signatures"
        )
    guidance = (
        "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
        "entity in the initial redaction entity selection"
    )
    if _is_textract_ocr_method(ocr_method):
        # The Textract wording is the plain wording plus one trailing clause.
        guidance += (
            ", unless the text extraction "
            "option is AWS Textract, in which case the handwrite_signature_textbox parameter "
            "for the doc_redact endpoint should include 'Extract signatures'"
        )
    return guidance
def build_local_redaction_client_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str = "",
) -> str:
    """Return prompt bullets for a doc_redaction backend on the same host.

    Covers local dev and shared Docker volumes, where the Pi agent and
    doc_redaction can see the same files.

    Args:
        gradio_url: Backend URL quoted in the first bullet.
        output_base: Task output folder; artifacts go to its ``output_redact/``
            subfolder.
        workspace_root: Used only to spell the helper path when
            ``pi_workspace_skills`` cannot be imported.

    Returns:
        Newline-terminated markdown bullets as a single string.
    """
    output_redact = f"{output_base.rstrip('/')}/output_redact/"

    try:
        from pi_workspace_skills import remote_redaction_helper_module

        helpers = remote_redaction_helper_module()
    except ImportError:
        if workspace_root.strip():
            helpers = f"{workspace_root.rstrip('/')}/.pi/helpers/remote_redaction.py"
        else:
            helpers = "`.pi/helpers/remote_redaction.py` (under `PI_WORKSPACE_DIR`)"

    try:
        from tools.config import OUTPUT_FOLDER, SESSION_OUTPUT_FOLDER

        if SESSION_OUTPUT_FOLDER:
            folder_note = " (per-user subfolders when `SESSION_OUTPUT_FOLDER=True`). "
        else:
            folder_note = ". "
        doc_output_hint = (
            f"- **doc_redaction writes to** `{OUTPUT_FOLDER}`{folder_note}"
            "Do **not** pass a Pi workspace path as `output_dir` — the server only "
            "accepts directories under that folder.\n"
        )
    except ImportError:
        # Without tools.config the concrete folder is unknown, so use generic wording.
        doc_output_hint = (
            "- Do **not** pass a Pi workspace path as `/doc_redact` `output_dir` — "
            "the server restricts `output_dir` to its own `OUTPUT_FOLDER`.\n"
        )

    bullets = [
        f"- **Local doc_redaction backend:** `{gradio_url}` (same machine as this workspace).\n",
        doc_output_hint,
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n",
        (
            f"- Call **`/doc_redact`** (omit `output_dir` or leave it empty), then copy artifacts "
            f"into `{output_redact}` with `remote_redaction.resolve_redaction_output_paths` "
            f"and `fetch_redaction_files`.\n"
        ),
        (
            "- When the API returns **Windows paths** (`C:\\\\...`) or paths under "
            "`workspace/.gradio_uploads/`, **copy from disk** with `shutil.copy2` — do not "
            "assume `gradio_api/file=` works (403 until allowed_paths includes that folder).\n"
        ),
        "- Path walkers must accept Windows drive paths, not only strings starting with `/`.\n",
        (
            f"- Use `{helpers}`: `extract_server_paths(result)` "
            "then `fetch_redaction_files(paths, dest_dir)` (local copy, then HTTP fallback).\n"
        ),
    ]
    return "".join(bullets)
def build_hf_space_backend_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """Return prompt bullets for the HF Space profile (remote doc_redaction over HTTPS).

    Args:
        gradio_url: Backend URL the agent is told to use exclusively; also used
            to build the ``gradio_api/file=`` download URL.
        output_base: Destination folder for downloaded outputs (quoted with a
            single trailing slash).
        workspace_root: Active workspace folder; named as the place to upload
            files from and as a location *not* to search for helper modules.

    Returns:
        Markdown bullets as one string. Unlike the other guidance builders, the
        final bullet has no trailing newline.
    """
    from pi_workspace_skills import remote_redaction_helper_module
    helpers = remote_redaction_helper_module()
    # Directory part of the helper path; assumes "/" separators — TODO confirm on Windows.
    helpers_dir = helpers.rsplit("/", 1)[0]
    run_cli = f"{helpers_dir}/run_doc_redact.py"
    base = _workspace_root().as_posix().rstrip("/")
    output_dest = output_base.rstrip("/") + "/"
    return (
        f"- **Remote redaction backend (authoritative URL):** `{gradio_url}` **only**. "
        "This Pi Space orchestrates a separate private doc_redaction Hugging Face Space "
        "over HTTPS.\n"
        "- **Read `/skill:hf-space-deployment` first** — it overrides Docker/local URLs "
        "(`host.docker.internal`, `localhost`, `redaction:7861`, internal service names) "
        "that appear in generic skills for local-docker or AWS ECS.\n"
        f"- **Helper module (workspace base, not session folder):** `{helpers}` and "
        f"`{run_cli}` under `{base}/.pi/helpers/`. "
        f"Do **not** look for `{workspace_root.rstrip('/')}/.pi/helpers/`.\n"
        f"- **First redaction call:** run `{run_cli}` once (see `/skill:hf-space-deployment`) "
        "— **do not** write `run_redact.py` in your session folder.\n"
        "- **Do not** probe alternate hosts, rewrite the helper, or hand-roll a new "
        "Gradio client script. Import `make_redaction_client`, `fetch_redaction_files`, "
        "and `resolve_redaction_output_paths` from that file (`HF_TOKEN` is already in "
        "the Pi subprocess environment).\n"
        "- Use **`gradio_client` only** — upload local files with `handle_file()` from "
        f"`{workspace_root.rstrip('/')}/`. **Do not** call `/agent/*` routes or use "
        "server-side paths from the redaction container.\n"
        f"- Download all `/doc_redact` and `/review_apply` outputs via "
        f"`{gradio_url.rstrip('/')}/gradio_api/file=…` with "
        f"`Authorization: Bearer $HF_TOKEN` into `{output_dest}` (create subdirs as needed).\n"
        "- On Hugging Face rate limits (`TooManyRequestsError`), wait and retry the **same** "
        "URL via the helper — do not switch to another host.\n"
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n"
        "- Run **`verify_redaction_coverage`** locally on downloaded CSV/PDF paths in this "
        "workspace (pandas/PyMuPDF), not via Agent API.\n"
        "- **Pass 2 VLM is not available** — do not call a VLM endpoint or use "
        "`CUSTOM_VLM_FACES` / `CUSTOM_VLM_SIGNATURE` entities.\n"
        "- **User-facing updates:** write progress and reasoning as normal assistant text. "
        "Do not put commentary in bash `#` comments — the UI shows those as tool lines.\n"
        f"- Helper module: `{helpers}`."
    )
def build_split_container_redaction_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """AWS ECS (and similar): Pi agent and doc_redaction are separate containers.

    Args:
        gradio_url: URL of the doc_redaction service; used for the
            ``Client(...)`` example and the ``/agent/verify_redaction_coverage``
            endpoint.
        output_base: Task output folder; deliverables are directed to its
            ``output_redact/`` and ``review/output_review_final/`` subfolders.
        workspace_root: Active workspace folder; named as the upload source and
            as a location *not* to search for helper modules.

    Returns:
        Markdown bullets as one string; the final bullet has no trailing newline.
    """
    from pi_workspace_skills import remote_redaction_helper_module
    output_redact = f"{output_base.rstrip('/')}/output_redact/"
    helpers = remote_redaction_helper_module()
    base = _workspace_root().as_posix().rstrip("/")
    return (
        f"- **Split-container redaction backend:** doc_redaction runs at `{gradio_url}` "
        "(separate service from this Pi agent). Use **`gradio_client` only**.\n"
        f"- **Helper module (workspace base):** `{helpers}` under `{base}/.pi/helpers/` "
        f"(not `{workspace_root.rstrip('/')}/.pi/helpers/`).\n"
        f"- **Deliverables belong in your session workspace:** `{output_redact}` "
        f"(and `{output_base.rstrip('/')}/review/output_review_final/` after apply). "
        "That is the **only** output tree you should populate for this task.\n"
        "- **Do not** search this container for redaction outputs: no `find /workspace`, "
        "no `ls /home/user/app/output`, no `import tools.config OUTPUT_FOLDER` on the Pi "
        "agent — those paths are on the **redaction service**, not here (or are a read-only "
        "git checkout without live run artifacts).\n"
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n"
        f'- **Initial redaction:** `Client("{gradio_url}")` → `/doc_redact` with '
        f"`document_file=handle_file(\"<file under {workspace_root.rstrip('/')}/>\")`. "
        "Omit `output_dir` (server picks its own `OUTPUT_FOLDER`).\n"
        f"- **Collect paths:** `extract_server_paths(result)` from the predict tuple. "
        "When the path list is `[]`, parse the status `message` for embedded paths, or retry "
        "once — **do not** spend turns grepping the filesystem.\n"
        f'- **Download:** `fetch_redaction_files(paths, "{output_redact}")` from '
        f"`{helpers}` (HTTP `GET /gradio_api/file=` — no shared disk copy).\n"
        "- **Coverage verify (split-container):** `/agent/*` paths must already exist on "
        "the **redaction server** under its `OUTPUT_FOLDER` (e.g. `/home/user/app/output/...`) "
        "— not on this Pi container. **Pre-apply** (CSV edited here): download artifacts via "
        "`fetch_redaction_files`, then run `python tools/verify_redaction_coverage.py` on "
        "those local copies (the edited review CSV is not on the redaction server). "
        "**Post-apply** (after `/review_apply`): call "
        f"`POST {gradio_url.rstrip('/')}/agent/verify_redaction_coverage` with "
        "**server paths** from `extract_server_paths(review_apply result)` for "
        "`review_csv_path`, `ocr_words_csv_path` (from `/doc_redact`), and "
        "`redacted_pdf_path`. **Do not** pass Pi workspace paths, `/tmp/gradio_tmp/...` "
        "upload paths, or import `verify_redaction_coverage()` expecting redaction-server "
        "paths to resolve from this container.\n"
        f"- Helper module (inside workspace boundary): `{helpers}`."
    )
def build_remote_backend_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """Return the backend guidance bullets that match the active deployment profile.

    The HF Space profile is checked first, then AWS ECS; any other profile gets
    the same-host (local) guidance. All three arguments are forwarded unchanged.
    """
    if is_hf_space_profile():
        builder = build_hf_space_backend_guidance
    elif is_aws_ecs_profile():
        builder = build_split_container_redaction_guidance
    else:
        builder = build_local_redaction_client_guidance
    return builder(
        gradio_url=gradio_url,
        output_base=output_base,
        workspace_root=workspace_root,
    )
def _resolve_and_validate_upload_path(upload_path: str | Path) -> Path:
    """Resolve ``upload_path`` and confirm it is a regular file under the upload root.

    Args:
        upload_path: Path reported for the uploaded file (``~`` is expanded).

    Returns:
        The fully resolved path of the upload.

    Raises:
        ValueError: if the argument has the wrong type, is blank, resolves
            outside ``upload_root()``, or is itself a symbolic link.
        FileNotFoundError: if the path does not exist or is not a regular file.
    """
    if not isinstance(upload_path, (str, Path)):
        raise ValueError("Uploaded file path has an invalid type.")
    if not str(upload_path).strip():
        raise ValueError("Uploaded file path is empty.")
    root = upload_root()
    raw_path = Path(upload_path).expanduser()
    try:
        source = raw_path.resolve(strict=True)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Uploaded file not found: {raw_path}") from exc
    try:
        source.relative_to(root)
    except ValueError as exc:
        raise ValueError(
            f"Uploaded file path resolves outside allowed upload root: {source}"
        ) from exc
    if not source.is_file():
        raise FileNotFoundError(f"Uploaded file not found: {source}")
    # ``resolve()`` has already followed any link, so ``source`` is never a
    # symlink; the check has to look at the path as it was supplied.
    if raw_path.is_symlink():
        raise ValueError(f"Symlink uploads are not allowed: {raw_path}")
    return source
def _resolve_and_validate_workspace_dir(workspace_dir: Path | None) -> Path:
    """Resolve ``workspace_dir`` and confirm it lies inside the workspace root.

    ``None`` selects the workspace root itself.

    Raises:
        ValueError: if the argument is neither ``None`` nor a ``Path``, or if
            it resolves outside the workspace root.
    """
    if not (workspace_dir is None or isinstance(workspace_dir, Path)):
        raise ValueError("Workspace path has an invalid type.")
    allowed_root = _workspace_root().resolve()
    requested = _workspace_root() if workspace_dir is None else workspace_dir
    resolved = requested.resolve()
    try:
        resolved.relative_to(allowed_root)
    except ValueError as exc:
        raise ValueError(
            f"Workspace path resolves outside allowed workspace root: {resolved}"
        ) from exc
    return resolved
def copy_upload_to_workspace(
    upload_path: str | Path,
    *,
    workspace_dir: Path | None = None,
) -> tuple[Path, str | None]:
    """
    Copy an upload into the session workspace.

    The upload and the target folder are both validated, the target folder is
    created when missing, and the file is stored under a workspace-safe name.

    Returns ``(destination_path, original_basename)`` where ``original_basename`` is
    set only when the file was renamed for path safety.
    """
    source = _resolve_and_validate_upload_path(upload_path)
    if not source.is_file():
        raise FileNotFoundError(f"Uploaded file not found: {source}")
    target_dir = _resolve_and_validate_workspace_dir(workspace_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    original_name, safe_name, renamed = _workspace_filename_from_upload(source.name)
    dest = (target_dir / safe_name).resolve()
    try:
        dest.relative_to(target_dir)
    except ValueError as exc:
        raise ValueError(f"Destination path is outside workspace: {dest}") from exc
    if dest != source:
        # copyfile only: copy2/copystat raises EPERM when overwriting on Docker Desktop bind mounts.
        shutil.copyfile(source, dest)
    renamed_from = original_name if renamed else None
    return dest, renamed_from
def _strip_long_document_section(template: str) -> str:
    """Remove the 100+ page operator block (keeps user requirements).

    The block runs from ``## Specific rules for long documents`` up to the next
    ``## User redaction requirements`` heading, or to the end of the template
    when that heading does not follow. Templates without the block are returned
    unchanged.
    """
    begin = template.find("## Specific rules for long documents")
    if begin < 0:
        return template
    kept_head = template[:begin].rstrip() + "\n\n"
    resume = template.find("## User redaction requirements", begin)
    kept_tail = "" if resume == -1 else template[resume:]
    return kept_head + kept_tail
def _include_long_document_rules(page_range: str, total_pages: int) -> bool:
    """Decide whether the long-document section of the template should be kept.

    Unknown or non-positive page counts give ``False``; documents of 100 pages
    or more give ``True``; otherwise the answer depends on whether
    ``page_range`` selects at least 100 pages.
    """
    if total_pages <= 0:
        return False
    is_long_document = total_pages >= 100
    return (
        is_long_document
        or pages_to_process_count(page_range or "all", total_pages) >= 100
    )
def build_redaction_prompt(
    file_name: str,
    user_instructions: str,
    *,
    page_range: str = "all",
    template: str | None = None,
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
    total_pages: int = 0,
) -> str:
    """Fill the prompt template for one redaction task and return the final prompt.

    Args:
        file_name: Document name; only its basename is used.
        user_instructions: The user's requirements; they become the final
            section of the prompt.
        page_range: Page selection substituted for ``{PAGE_RANGE}`` (blank
            becomes ``all``).
        template: Template text; loaded with ``load_template()`` when ``None``.
        settings: Task settings; a default ``RedactionTaskSettings()`` is used
            when omitted.
        workspace_dir: Folder holding the document; defaults to the workspace root.
        total_pages: Page count used to decide whether the long-document section
            is kept. With the default ``0`` that section is removed.

    Raises:
        ValueError: if ``file_name`` or ``user_instructions`` is blank.
    """
    if not file_name.strip():
        raise ValueError("A document file name is required.")
    if not user_instructions.strip():
        raise ValueError("Redaction requirements are required (use bullet points).")
    task_settings = settings or RedactionTaskSettings()
    workspace_root = (workspace_dir or _workspace_root()).resolve()
    # Drop any directory components so the name cannot point outside the workspace folder.
    file_name = Path(file_name).name
    input_path = f"{workspace_root.as_posix().rstrip('/')}/{file_name}"
    output_base = f"{workspace_root.as_posix().rstrip('/')}/redact/{file_name}/"
    text = template if template is not None else load_template()
    remote_guidance = build_remote_backend_guidance(
        gradio_url=_default_gradio_url(),
        output_base=output_base,
        workspace_root=workspace_root.as_posix(),
    )
    # Placeholder -> value map; substitutions below run in this insertion order.
    replacements = {
        "{FILE_NAME}": file_name,
        "{INPUT_PATH}": input_path,
        "{OUTPUT_BASE}": output_base,
        "{GRADIO_URL}": _default_gradio_url(),
        "{PAGE_RANGE}": page_range.strip() or "all",
        "{VLM_BASE_URL}": _default_vlm_base_url(),
        "{VLM_MODEL}": _default_vlm_model(),
        "{DEFAULT_OCR_METHOD}": task_settings.ocr_method,
        "{DEFAULT_PII_METHOD}": task_settings.pii_method,
        "{VLM_FACES_GUIDANCE}": build_vlm_faces_guidance(
            task_settings.encourage_vlm_faces
        ),
        "{VLM_SIGNATURE_GUIDANCE}": build_vlm_signature_guidance(
            task_settings.encourage_vlm_signatures,
            task_settings.ocr_method,
        ),
    }
    if remote_guidance:
        replacements["{REMOTE_BACKEND_GUIDANCE}"] = remote_guidance
    else:
        # No guidance: remove the whole template bullet rather than leave the placeholder.
        text = text.replace("- {REMOTE_BACKEND_GUIDANCE}\n", "")
    for key, value in replacements.items():
        text = text.replace(key, value)
    if is_hf_space_profile():
        # Insert an extra table row ahead of the "1 — Initial redaction" row, once.
        hf_row = (
            "| **0 — HF deployment (read first)** | `hf-space-deployment` | "
            "`.pi/skills/hf-space-deployment/SKILL.md` | "
            "Use `run_doc_redact.py`; do not hand-roll Gradio clients |\n"
        )
        marker = "| **1 — Initial redaction** |"
        if marker in text and hf_row not in text:
            text = text.replace(marker, hf_row + marker, 1)
    if not _include_long_document_rules(page_range, total_pages):
        text = _strip_long_document_section(text)
    # Last step: everything from the requirements heading onwards is replaced by the user's text.
    return replace_user_requirements_section(text, user_instructions)
def prepare_redaction_task(
    upload_path: str | Path | None,
    user_instructions: str,
    *,
    page_range: str = "all",
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
) -> tuple[str, str, str | None]:
    """
    Copy upload into workspace and return ``(file_name, full_prompt, renamed_from)``.

    ``renamed_from`` is the original upload basename when it was adjusted for path
    safety; otherwise ``None``.

    The upload path is checked against the upload root before the file is
    opened for page counting, so a path outside that root is never read.

    Raises:
        ValueError: if no upload was given, a path fails validation, the page
            limit is exceeded, or the instructions are blank.
        FileNotFoundError: if the upload does not exist.
    """
    if upload_path is None:
        raise ValueError("Please upload a document.")
    root = _resolve_and_validate_workspace_dir(workspace_dir)
    # Validate first: the page-limit check below opens the file.
    source = _resolve_and_validate_upload_path(upload_path)
    validate_pdf_page_limit(source, page_range=page_range)
    dest, renamed_from = copy_upload_to_workspace(source, workspace_dir=root)
    total_pages = 0
    if str(dest).lower().endswith(".pdf"):
        try:
            total_pages = pdf_page_count(dest)
        except (ValueError, OSError):
            # Best effort: an unknown count only disables the long-document rules.
            total_pages = 0
    prompt = build_redaction_prompt(
        dest.name,
        user_instructions,
        page_range=page_range,
        settings=settings,
        workspace_dir=root,
        total_pages=total_pages,
    )
    return dest.name, prompt, renamed_from

agent-redact/pi/remote_redaction.py ADDED Viewed

	@@ -0,0 +1,410 @@

+"""Gradio client helpers for remote doc_redaction HF Space backends."""
+from __future__ import annotations
+import os
+import shutil
+import time
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+import httpx
+from gradio_client import Client
# Default timeouts passed to ``httpx.Timeout`` by ``httpx_timeout``. The connect
# value is also reused there for the write and pool timeouts.
DEFAULT_CONNECT_TIMEOUT = 120.0
DEFAULT_READ_TIMEOUT = 1800.0
# Entity labels; presumably the default ``redact_entities`` selection — the code
# that uses this tuple is outside this view (TODO confirm).
_DEFAULT_REDACT_ENTITIES = (
    "PERSON",
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    "STREETNAME",
    "UKPOSTCODE",
    "TITLES",
    "CUSTOM",
)
# Gradio ``Client`` objects keyed by a pair of strings; what the two key parts
# represent is not visible here (TODO confirm against the code that fills it).
_CLIENT_CACHE: dict[tuple[str, str], Client] = {}
def split_redaction_backend() -> bool:
    """True when Pi and doc_redaction do not share a filesystem (ECS, HF Space, …).

    The answer comes from ``pi_agent_config.uses_split_redaction_backend``;
    ``False`` is returned when an ``ImportError`` is raised along the way.
    """
    try:
        from pi_agent_config import uses_split_redaction_backend as probe

        is_split = probe()
    except ImportError:
        is_split = False
    return is_split
+def redaction_base_url() -> str:
+    raw = (os.environ.get("DOC_REDACTION_GRADIO_URL") or "").strip().rstrip("/")
+    if raw:
+        return raw
+    try:
+        from redaction_prompt import doc_redaction_gradio_url
+        return doc_redaction_gradio_url()
+    except ImportError:
+        return "http://127.0.0.1:7860"
+def redaction_hf_token() -> str | None:
+    token = os.environ.get("HF_TOKEN") or os.environ.get("DOC_REDACTION_HF_TOKEN")
+    return token.strip() if token and token.strip() else None
+def redaction_gradio_auth() -> tuple[str, str] | None:
+    """
+    Optional Gradio HTTP basic auth for doc_redaction when ``COGNITO_AUTH=True``.
+    Set ``DOC_REDACTION_GRADIO_AUTH_USER`` and ``DOC_REDACTION_GRADIO_AUTH_PASSWORD``
+    (e.g. a dedicated Cognito service account). Not the Pi UI user's session.
+    """
+    user = (os.environ.get("DOC_REDACTION_GRADIO_AUTH_USER") or "").strip()
+    password = os.environ.get("DOC_REDACTION_GRADIO_AUTH_PASSWORD") or ""
+    if user and password:
+        return (user, password)
+    return None
+def httpx_timeout(
+    *,
+    connect: float = DEFAULT_CONNECT_TIMEOUT,
+    read: float = DEFAULT_READ_TIMEOUT,
+) -> httpx.Timeout:
+    return httpx.Timeout(connect=connect, read=read, write=connect, pool=connect)
+def _quota_retry_attempts() -> int:
+    for key in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
+        raw = (os.environ.get(key) or "").strip()
+        if raw.isdigit():
+            return max(1, int(raw))
+    return 5
+def _quota_retry_delay_s() -> int:
+    raw = (os.environ.get("PI_QUOTA_RETRY_DELAY_S") or "60").strip()
+    try:
+        return max(1, int(raw))
+    except ValueError:
+        return 60
+def is_gradio_rate_limit_error(exc: BaseException) -> bool:
+    if type(exc).__name__ == "TooManyRequestsError":
+        return True
+    lowered = str(exc).lower()
+    return any(
+        marker in lowered
+        for marker in ("429", "too many requests", "rate limit", "rate-limit")
+    )
+def clear_redaction_client_cache() -> None:
+    """
+    Drop cached gradio_client instances (tests or after credential rotation).
+    The module-level ``_CLIENT_CACHE`` dict is emptied in place.
+    """
+    _CLIENT_CACHE.clear()
+def make_redaction_client(
+    base_url: str | None = None,
+    hf_token: str | None = None,
+    *,
+    force_new: bool = False,
+    verbose: bool = False,
+) -> Client:
+    """
+    Return a gradio_client for the remote doc_redaction Space.
+    Uses ``token=`` (gradio_client 2.x). Retries ``TooManyRequestsError`` with
+    ``PI_QUOTA_RETRY_DELAY_S`` backoff and caches one client per URL+token pair
+    so agents do not re-fetch ``/gradio_api/info`` on every bash one-liner.
+    """
+    url = (base_url or redaction_base_url()).rstrip("/")
+    token = hf_token if hf_token is not None else redaction_hf_token()
+    auth = redaction_gradio_auth()
+    cache_key = (url, token or "", auth or ())
+    if not force_new and cache_key in _CLIENT_CACHE:
+        return _CLIENT_CACHE[cache_key]
+    client_kwargs: dict[str, Any] = {
+        "httpx_kwargs": {"timeout": httpx_timeout()},
+        "verbose": verbose,
+    }
+    max_attempts = _quota_retry_attempts()
+    delay_s = _quota_retry_delay_s()
+    last_error: BaseException | None = None
+    for attempt in range(1, max_attempts + 1):
+        try:
+            if auth:
+                client = Client(url, auth=auth, **client_kwargs)
+            elif token:
+                client = Client(url, token=token, **client_kwargs)
+            else:
+                client = Client(url, **client_kwargs)
+            _CLIENT_CACHE[cache_key] = client
+            return client
+        except Exception as exc:
+            if not is_gradio_rate_limit_error(exc):
+                raise
+            last_error = exc
+            if attempt >= max_attempts:
+                break
+            time.sleep(delay_s)
+    assert last_error is not None
+    raise last_error
+def call_doc_redact(
+    pdf_path: str | Path,
+    dest_dir: str | Path,
+    *,
+    ocr_method: str | None = None,
+    pii_method: str | None = None,
+    deny_list: list[str] | None = None,
+    allow_list: list[str] | None = None,
+    redact_entities: list[str] | None = None,
+    page_min: int | None = None,
+    page_max: int | None = None,
+) -> tuple[Any, list[Path]]:
+    """
+    Run ``/doc_redact`` and download outputs into *dest_dir*.
+    Prefer this or ``run_doc_redact.py`` over hand-written Gradio scripts.
+    """
+    from gradio_client import handle_file
+    pdf = Path(pdf_path).expanduser().resolve()
+    if not pdf.is_file():
+        raise FileNotFoundError(f"PDF not found: {pdf}")
+    predict_kwargs: dict[str, Any] = {
+        "api_name": "/doc_redact",
+        "document_file": handle_file(str(pdf)),
+        "redact_entities": list(redact_entities or _DEFAULT_REDACT_ENTITIES),
+    }
+    if ocr_method:
+        predict_kwargs["ocr_method"] = ocr_method
+    if pii_method:
+        predict_kwargs["pii_method"] = pii_method
+    if deny_list:
+        predict_kwargs["deny_list"] = deny_list
+    if allow_list:
+        predict_kwargs["allow_list"] = allow_list
+    if page_min is not None:
+        predict_kwargs["page_min"] = page_min
+    if page_max is not None:
+        predict_kwargs["page_max"] = page_max
+    client = make_redaction_client()
+    result = client.predict(**predict_kwargs)
+    paths = resolve_redaction_output_paths(result, document_stem=pdf.stem)
+    saved = fetch_redaction_files(paths, dest_dir)
+    return result, saved
+def is_gradio_file_path(value: str) -> bool:
+    """True for absolute Unix or Windows paths returned by Gradio predict."""
+    s = (value or "").strip()
+    if not s:
+        return False
+    if s.startswith("/") and len(s) > 1:
+        return True
+    return len(s) >= 3 and s[1] == ":" and s[0].isalpha() and s[2] in ("\\", "/")
+def _collect_paths(value: Any, out: list[str]) -> None:
+    if isinstance(value, str):
+        if is_gradio_file_path(value):
+            out.append(value.strip())
+    elif isinstance(value, dict):
+        path = value.get("path")
+        if isinstance(path, str) and is_gradio_file_path(path):
+            out.append(path.strip())
+        for item in value.values():
+            _collect_paths(item, out)
+    elif isinstance(value, (list, tuple)):
+        for item in value:
+            _collect_paths(item, out)
+def extract_server_paths(result: Any) -> list[str]:
+    """Walk a gradio_client predict result and collect server file paths."""
+    paths: list[str] = []
+    _collect_paths(result, paths)
+    seen: set[str] = set()
+    ordered: list[str] = []
+    for path in paths:
+        if path not in seen:
+            seen.add(path)
+            ordered.append(path)
+    return ordered
+def doc_redaction_output_root() -> Path | None:
+    """Resolved doc_redaction ``OUTPUT_FOLDER`` when the main app config is importable."""
+    try:
+        from tools.config import OUTPUT_FOLDER
+        return Path(OUTPUT_FOLDER).resolve()
+    except ImportError:
+        raw = (os.environ.get("DOC_REDACTION_OUTPUT_FOLDER") or "").strip()
+        if not raw:
+            return None
+        try:
+            return Path(raw).resolve()
+        except OSError:
+            return None
+def discover_redaction_outputs(
+    document_stem: str,
+    *,
+    since: float | None = None,
+) -> list[str]:
+    """
+    Fallback when ``/doc_redact`` returns ``[]``: glob the doc_redaction output tree.
+    Matches filenames containing *document_stem* (e.g. ``example_of_emails``).
+    When *since* is set, only files with ``mtime >= since`` are returned.
+    """
+    stem = (document_stem or "").strip()
+    if not stem:
+        return []
+    if split_redaction_backend():
+        return []
+    root = doc_redaction_output_root()
+    if root is None or not root.is_dir():
+        return []
+    threshold = since if since is not None else None
+    found: list[str] = []
+    try:
+        for path in root.rglob(f"*{stem}*"):
+            if not path.is_file():
+                continue
+            if threshold is not None:
+                try:
+                    if path.stat().st_mtime < threshold:
+                        continue
+                except OSError:
+                    continue
+            found.append(str(path.resolve()))
+    except OSError:
+        return []
+    return sorted(found)
+def resolve_redaction_output_paths(
+    result: Any,
+    *,
+    document_stem: str = "",
+    run_started_at: float | None = None,
+) -> list[str]:
+    """
+    Collect output paths from a ``/doc_redact`` result, with on-disk fallback.
+    Prefer paths embedded in the Gradio response; when empty, search
+    ``OUTPUT_FOLDER`` (including per-user session subfolders).
+    """
+    paths = extract_server_paths(result)
+    if paths:
+        return paths
+    if document_stem:
+        discovered = discover_redaction_outputs(
+            document_stem,
+            since=run_started_at,
+        )
+        if discovered:
+            return discovered
+    return []
+def _download_via_gradio_http(
+    paths: list[str],
+    dest: Path,
+    *,
+    base_url: str,
+    hf_token: str | None,
+) -> list[Path]:
+    headers: dict[str, str] = {}
+    if hf_token:
+        headers["Authorization"] = f"Bearer {hf_token.strip()}"
+    downloaded: list[Path] = []
+    with httpx.Client(timeout=httpx_timeout(), headers=headers) as http:
+        for path in paths:
+            file_url = f"{base_url}/gradio_api/file={quote(path, safe='')}"
+            local_path = dest / Path(path).name
+            response = http.get(file_url)
+            response.raise_for_status()
+            local_path.write_bytes(response.content)
+            downloaded.append(local_path)
+    return downloaded
+def fetch_redaction_files(
+    paths: list[str],
+    dest_dir: str | Path,
+    *,
+    base_url: str | None = None,
+    hf_token: str | None = None,
+) -> list[Path]:
+    """
+    Save redaction outputs into *dest_dir*.
+    When Pi and doc_redaction share a host filesystem (typical local dev), copies
+    directly from disk. Otherwise falls back to ``GET /gradio_api/file=``.
+    """
+    url = (base_url or redaction_base_url()).rstrip("/")
+    token = hf_token if hf_token is not None else redaction_hf_token()
+    dest = Path(dest_dir)
+    dest.mkdir(parents=True, exist_ok=True)
+    saved: list[Path] = []
+    http_paths: list[str] = []
+    use_http_only = split_redaction_backend()
+    for path in paths:
+        if not is_gradio_file_path(path):
+            continue
+        if not use_http_only:
+            local = Path(path)
+            try:
+                if local.is_file():
+                    out = dest / local.name
+                    if local.resolve() != out.resolve():
+                        shutil.copy2(local, out)
+                    else:
+                        out = local.resolve()
+                    saved.append(out)
+                    continue
+            except OSError:
+                pass
+        http_paths.append(path)
+    if http_paths:
+        saved.extend(
+            _download_via_gradio_http(http_paths, dest, base_url=url, hf_token=token)
+        )
+    return saved
+def download_gradio_files(
+    paths: list[str],
+    dest_dir: str | Path,
+    *,
+    base_url: str | None = None,
+    hf_token: str | None = None,
+) -> list[Path]:
+    """Backward-compatible alias for :func:`fetch_redaction_files`."""
+    return fetch_redaction_files(
+        paths,
+        dest_dir,
+        base_url=base_url,
+        hf_token=hf_token,
+    )

agent-redact/pi/run_doc_redact.py ADDED Viewed

	@@ -0,0 +1,87 @@

+#!/usr/bin/env python3
+"""One-shot ``/doc_redact`` CLI for Pi agents (HF Space / split-container backends)."""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+# Allow ``python3 …/run_doc_redact.py`` without installing the package.
+_HELPERS = Path(__file__).resolve().parent
+if str(_HELPERS) not in sys.path:
+    sys.path.insert(0, str(_HELPERS))
+from remote_redaction import call_doc_redact  # noqa: E402
+def _parse_list(raw: str | None) -> list[str] | None:
+    if raw is None or not str(raw).strip():
+        return None
+    text = str(raw).strip()
+    if text.startswith("["):
+        parsed = json.loads(text)
+        if isinstance(parsed, list):
+            return [str(item) for item in parsed]
+    return [part.strip() for part in text.split(",") if part.strip()]
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Run /doc_redact via remote_redaction.make_redaction_client()."
+    )
+    parser.add_argument(
+        "--pdf", required=True, help="Local PDF path (session workspace)."
+    )
+    parser.add_argument(
+        "--dest",
+        required=True,
+        help="Directory for downloaded artifacts (e.g. …/output_redact/).",
+    )
+    parser.add_argument("--ocr-method", default=None)
+    parser.add_argument("--pii-method", default=None)
+    parser.add_argument(
+        "--deny-list",
+        default=None,
+        help="Comma-separated or JSON list for CUSTOM deny terms.",
+    )
+    parser.add_argument(
+        "--allow-list",
+        default=None,
+        help="Comma-separated or JSON list for allow terms.",
+    )
+    parser.add_argument(
+        "--redact-entities",
+        default=None,
+        help="Comma-separated or JSON list (default: PERSON, EMAIL, …, CUSTOM).",
+    )
+    parser.add_argument("--page-min", type=int, default=None)
+    parser.add_argument("--page-max", type=int, default=None)
+    args = parser.parse_args(argv)
+    pdf = Path(args.pdf).expanduser().resolve()
+    if not pdf.is_file():
+        print(f"PDF not found: {pdf}", file=sys.stderr)
+        return 2
+    result, saved = call_doc_redact(
+        pdf,
+        args.dest,
+        ocr_method=args.ocr_method,
+        pii_method=args.pii_method,
+        deny_list=_parse_list(args.deny_list),
+        allow_list=_parse_list(args.allow_list),
+        redact_entities=_parse_list(args.redact_entities),
+        page_min=args.page_min,
+        page_max=args.page_max,
+    )
+    message = result[1] if isinstance(result, (list, tuple)) and len(result) > 1 else ""
+    print(message or "doc_redact completed.")
+    for path in saved:
+        print(path)
+    return 0
+# Script entry: hand ``main()``'s return value to the interpreter as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

agent-redact/pi/session_logs.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""Resolve Pi agent session JSONL logs for Gradio download and usage-log persistence."""
+from __future__ import annotations
+import shutil
+from pathlib import Path
+from pi_agent_config import ensure_session_dir
+from pi_rpc_client import PiRpcClient, PiRpcError
+from tools.aws_functions import upload_log_file_to_s3
+from tools.config import (
+    RUN_AWS_FUNCTIONS,
+    S3_USAGE_LOGS_FOLDER,
+    SAVE_LOGS_TO_CSV,
+    USAGE_LOGS_FOLDER,
+)
+def _session_dir_root() -> Path:
+    """Return the Pi session directory as reported by ``ensure_session_dir()``."""
+    return ensure_session_dir()
+def pi_session_file_from_client(client: PiRpcClient | None) -> Path | None:
+    """Return the active Pi session JSONL path from RPC state, if readable."""
+    if client is None or not client.running:
+        return None
+    try:
+        state = client.get_state()
+    except PiRpcError:
+        return None
+    raw = state.get("sessionFile")
+    if not raw or str(raw).strip() in ("", "—"):
+        return None
+    path = Path(str(raw)).expanduser()
+    if not path.is_file():
+        return None
+    resolved = path.resolve(strict=False)
+    try:
+        resolved.relative_to(_session_dir_root())
+    except ValueError:
+        return None
+    return resolved
+def _usage_log_archive_name(source: Path, session_hash: str = "") -> str:
+    if session_hash and str(session_hash).strip():
+        return f"{str(session_hash).strip()}_{source.name}"
+    return source.name
+def copy_session_log_to_usage_folder(
+    source: Path,
+    *,
+    session_hash: str = "",
+) -> Path | None:
+    """Copy a Pi session JSONL into ``USAGE_LOGS_FOLDER`` (beside ``usage_log.csv``)."""
+    if not SAVE_LOGS_TO_CSV:
+        return None
+    usage_dir = Path(USAGE_LOGS_FOLDER)
+    usage_dir.mkdir(parents=True, exist_ok=True)
+    dest = usage_dir / _usage_log_archive_name(source, session_hash)
+    try:
+        shutil.copy2(source, dest)
+    except OSError:
+        return None
+    return dest.resolve()
+def collect_session_log_download(client: PiRpcClient | None) -> str | None:
+    """Path suitable for ``gr.File`` download, or ``None`` if no log yet."""
+    path = pi_session_file_from_client(client)
+    if path is None:
+        return None
+    return str(path)
+def persist_session_log(
+    client: PiRpcClient | None,
+    *,
+    session_hash: str = "",
+    source: Path | None = None,
+) -> Path | None:
+    """
+    Archive the active Pi session JSONL when local usage logging is enabled.
+    Copies into ``USAGE_LOGS_FOLDER`` when ``SAVE_LOGS_TO_CSV`` is true, then
+    uploads that copy to ``S3_USAGE_LOGS_FOLDER`` when ``RUN_AWS_FUNCTIONS`` is true.
+    When *source* is provided (resolved synchronously by the caller), it is used
+    directly so this can run on a background thread without issuing an RPC read.
+    """
+    if not SAVE_LOGS_TO_CSV:
+        return None
+    if source is None:
+        source = pi_session_file_from_client(client)
+    if source is None:
+        return None
+    archived = copy_session_log_to_usage_folder(source, session_hash=session_hash)
+    if archived is None:
+        return None
+    if RUN_AWS_FUNCTIONS:
+        upload_log_file_to_s3(str(archived), S3_USAGE_LOGS_FOLDER)
+    return archived
+def export_session_log_to_s3(client: PiRpcClient | None) -> None:
+    """
+    Back-compat: persist session log (local archive + optional S3).
+    Calls ``persist_session_log(client)`` with default options and discards its result.
+    """
+    persist_session_log(client)
+def gradio_session_log_allowed_paths() -> list[str]:
+    """Directories Gradio must allow to serve Pi session JSONL files."""
+    paths: list[str] = []
+    try:
+        paths.append(str(_session_dir_root()))
+    except OSError:
+        pass
+    if SAVE_LOGS_TO_CSV:
+        try:
+            paths.append(str(Path(USAGE_LOGS_FOLDER).resolve()))
+        except OSError:
+            pass
+    return paths

agent-redact/pi/session_workspace.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""Per-session workspace paths for the Pi Gradio UI (mirrors main app session folders)."""
+from __future__ import annotations
+import os
+import re
+import sys
+from pathlib import Path
+import gradio as gr
+# Two directory levels above this file's folder.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+# Put that directory on ``sys.path`` once so imports relative to it resolve.
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+# Matches runs of characters outside letters, digits and ``_ @ . + -``.
+_SESSION_ID_RE = re.compile(r"[^a-zA-Z0-9_@.+-]+")
+def workspace_base_dir() -> Path:
+    """Shared Pi workspace root (see ``bootstrap_pi_config.ensure_pi_workspace_dir``)."""
+    raw = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
+    if raw:
+        path = Path(raw)
+    else:
+        from bootstrap_pi_config import ensure_pi_workspace_dir
+        return Path(ensure_pi_workspace_dir(_REPO_ROOT))
+    path.mkdir(parents=True, exist_ok=True)
+    return path.resolve()
+def _session_output_folder_enabled() -> bool:
+    """Read at call time so ``pi_agent.env`` / dotenv apply before first use."""
+    raw = (os.environ.get("SESSION_OUTPUT_FOLDER") or "").strip().lower()
+    return raw in {"1", "true", "yes", "on"}
+def session_workspace_enabled() -> bool:
+    """
+    When true, each Gradio session uses ``{PI_WORKSPACE_DIR}/{session_hash}/``.
+    Controlled by ``PI_SESSION_WORKSPACE`` in ``config/pi_agent.env`` (default on when unset).
+    Set ``PI_SESSION_WORKSPACE=false`` for a single shared workspace root.
+    """
+    raw = os.environ.get("PI_SESSION_WORKSPACE", "").strip().lower()
+    if raw in {"0", "false", "no", "off"}:
+        return False
+    if raw in {"1", "true", "yes", "on"}:
+        return True
+    if _session_output_folder_enabled():
+        return True
+    return True
+def workspace_base_dir_resolved() -> Path:
+    """
+    Current workspace root (never cached at import).
+    Thin alias that calls ``workspace_base_dir()`` on every use.
+    """
+    return workspace_base_dir()
+def sanitize_session_id(raw: str) -> str:
+    cleaned = _SESSION_ID_RE.sub("_", (raw or "").strip())[:128].strip("_")
+    return cleaned or "default"
+def resolve_session_hash(request: gr.Request | None) -> str:
+    """
+    Resolve Gradio session id for per-user workspace folders.
+    Prefers ``request.session_hash`` (local Pi UI). Falls back to the main app's
+    Cognito/OIDC resolver when a deployment header is configured.
+    """
+    if request is None:
+        return "default"
+    gradio_hash = getattr(request, "session_hash", None)
+    if gradio_hash is not None and str(gradio_hash).strip():
+        return sanitize_session_id(str(gradio_hash))
+    from tools.gradio_platform import resolve_session_identity
+    try:
+        identity = resolve_session_identity(request)
+    except ValueError:
+        return "default"
+    return sanitize_session_id(str(identity))
+def effective_session_hash(
+    session_hash: str,
+    request: gr.Request | None = None,
+) -> str:
+    """
+    Use ``session_hash_state`` when set; otherwise resolve from the active request.
+    Gradio ``demo.load`` may run before ``request.session_hash`` exists, so handlers
+    should pass ``request`` and call this on each event.
+    """
+    stored = (session_hash or "").strip()
+    if stored and stored != "default":
+        return sanitize_session_id(stored)
+    if request is not None:
+        resolved = resolve_session_hash(request)
+        if resolved and resolved != "default":
+            return resolved
+    if stored:
+        return sanitize_session_id(stored)
+    return "default"
+def session_workspace_status_markdown(session_hash: str) -> str:
+    """Markdown for the workspace panel."""
+    workspace = ensure_session_workspace(session_hash)
+    path = workspace.as_posix()
+    if session_workspace_enabled():
+        return (
+            f"**Session id:** `{session_hash}`  \n" f"**Your workspace:** `{path}/`  \n"
+        )
+    return f"**Workspace:** `{path}/`"
+def prepare_session_workspace(
+    session_hash: str,
+    request: gr.Request | None = None,
+) -> tuple[str, Path, str]:
+    """
+    Resolve session id, create ``{PI_WORKSPACE_DIR}/{hash}/``, return status text.
+    Call at the start of redaction (and on page load) so the folder always exists.
+    """
+    effective = effective_session_hash(session_hash, request)
+    workspace = ensure_session_workspace(effective)
+    return effective, workspace, session_workspace_status_markdown(effective)
+def session_s3_outputs_prefix(session_hash: str) -> str:
+    """Session-scoped S3 output prefix (shared env vars with main app)."""
+    from tools.gradio_platform import build_s3_outputs_prefix
+    return build_s3_outputs_prefix(
+        session_hash,
+        session_scoped=session_workspace_enabled(),
+    )
+def session_workspace_dir(session_hash: str) -> Path:
+    base = workspace_base_dir().resolve()
+    if not session_workspace_enabled():
+        return base
+    safe_id = sanitize_session_id(session_hash)
+    candidate = (base / safe_id).resolve()
+    try:
+        candidate.relative_to(base)
+    except ValueError:
+        return (base / "default").resolve()
+    return candidate
+def ensure_session_workspace(session_hash: str) -> Path:
+    workspace = session_workspace_dir(session_hash)
+    workspace.mkdir(parents=True, exist_ok=True)
+    return workspace
+def init_session_workspace(
+    request: gr.Request,
+) -> tuple[str, gr.FileExplorer, str, str]:
+    """
+    App-load handler: create the session subfolder and scope the file explorer.
+    Returns ``(session_hash, file_explorer_update, status_markdown, s3_output_prefix)``.
+    """
+    session_hash, workspace, status = prepare_session_workspace("", request)
+    s3_prefix = session_s3_outputs_prefix(session_hash)
+    return (
+        session_hash,
+        gr.FileExplorer(root_dir=workspace.as_posix()),
+        status,
+        s3_prefix,
+    )
+def workspace_context_prefix(session_hash: str) -> str:
+    """Prefix Pi prompts so the agent uses the session workspace."""
+    if not session_workspace_enabled() or not session_hash.strip():
+        return ""
+    root = session_workspace_dir(session_hash).as_posix().rstrip("/")
+    lines = [
+        f"**Session workspace (mandatory):** all uploads, downloads, and redaction "
+        f"artifacts for this user must live under `{root}/`. "
+        f"Use `{root}/redact/<document>/output_redact/` for Pass 1 downloads and "
+        f"`{root}/redact/<document>/review/output_review_final/` after `/review_apply`. "
+        f"Do not write to `{root}/output_final_download/` (UI-managed download copies only). "
+        f"Do not read or write other session folders under `{workspace_base_dir().as_posix()}/`.",
+    ]
+    try:
+        from pi_agent_config import uses_split_redaction_backend
+        from redaction_prompt import doc_redaction_gradio_url
+        if uses_split_redaction_backend():
+            from pi_workspace_skills import remote_redaction_helper_module
+            helpers = remote_redaction_helper_module()
+            lines.append(
+                f"**Redaction outputs (split backend):** doc_redaction at "
+                f"`{doc_redaction_gradio_url()}` writes to its own container — download "
+                f"artifacts into `{root}/redact/<document>/output_redact/` via "
+                f"`{helpers}` (`fetch_redaction_files`; helper is under workspace base "
+                f"`{workspace_base_dir().as_posix()}/.pi/helpers/`, not under `{root}/.pi/`). "
+                f"Do not `find` or `ls` `/workspace/doc_redaction/output` from this agent."
+            )
+    except ImportError:
+        pass
+    return "\n".join(lines) + "\n\n"

agent-redact/pi/start.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+#!/usr/bin/env bash
+# Start Gradio Pi chat UI in the background; keep container alive for `docker compose exec pi-agent pi`.
+# With RUN_FASTAPI=True the UI is served by uvicorn in the foreground instead.
+# Abort on errors, unset variables and failed pipeline stages.
+set -euo pipefail
+# Defaults below apply only when the variable is not already set.
+export HOME="${HOME:-/home/user}"
+export PI_WORKDIR="${PI_WORKDIR:-/workspace/doc_redaction}"
+# Make both the repo root and the Pi helper folder importable.
+export PYTHONPATH="${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi:${PYTHONPATH:-}"
+cd "$PI_WORKDIR"
+export APP_TYPE="${APP_TYPE:-pi}"
+export APP_CONFIG_PATH="${APP_CONFIG_PATH:-$PI_WORKDIR/config/pi_agent.env}"
+mkdir -p "${PI_WORKSPACE_DIR:-/home/user/app/workspace}"
+# Run the Pi agent config script before the UI starts.
+python3 agent-redact/pi/pi_agent_config.py
+if [ "${RUN_FASTAPI:-False}" = "True" ]; then
+  # exec replaces this shell, so nothing after this branch runs in FastAPI mode.
+  exec uvicorn gradio_app:app \
+    --app-dir agent-redact/pi \
+    --host "${GRADIO_SERVER_NAME:-0.0.0.0}" \
+    --port "${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7862}}"
+else
+  python3 agent-redact/pi/gradio_app.py &
+fi
+# Block until the background Gradio process exits.
+wait -n

agent-redact/requirements_pi_agent.txt ADDED Viewed

	@@ -0,0 +1,34 @@

+# Python stack for the pi-agent Docker image (orchestration + Pi Gradio UI).
+#
+# Excludes spaCy, Presidio, and OCR stacks — heavy redaction runs in redaction-app-llama.
+# Includes full Gradio for agent-redact/pi/gradio_app.py (chat frontend over Pi RPC mode).
+#
+# Version caps align with requirements_lightweight.txt where packages overlap.
+# --- Gradio UI + API client ---
+gradio==6.10.0
+gradio-pdf-redaction<=0.0.25
+httpx<=0.28.1
+requests<=2.34.2
+# --- Config ---
+python-dotenv<=1.2.2
+# --- CSV / tabular review (skills, page-review merge) ---
+numpy<=2.4.4
+pandas<=2.3.3
+openpyxl<=3.1.5
+# --- PDF helpers (verify_redaction_coverage, preview scripts) ---
+pymupdf<=1.27.1
+# --- General utilities ---
+tabulate<=0.10.0
+rapidfuzz<=3.14.5
+defusedxml<=0.7.1
+# --- Shared platform features (logging, Cognito, S3 via tools/) ---
+boto3<=1.42.61
+bleach<=6.3.0
+fastapi>=0.115.0
+uvicorn>=0.34.0

agent_routes.py ADDED Viewed

	@@ -0,0 +1,1167 @@

+"""
+FastAPI routes for programmatic / agent callers.
+HTTP paths align with Gradio ``api_name`` values in app.py. See GET /agent/operations
+for the full map. Uses cli_redact.main(direct_mode_args=...) where a CLI task exists.
+"""
+from __future__ import annotations
+import io
+import os
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from fastapi import APIRouter, Depends, Header, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field, field_validator
+from tools.config import (
+    AWS_LLM_PII_OPTION,
+    AWS_PII_OPTION,
+    INFERENCE_SERVER_PII_OPTION,
+    INPUT_FOLDER,
+    LOCAL_OCR_MODEL_OPTIONS,
+    LOCAL_PII_OPTION,
+    LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+    OUTPUT_FOLDER,
+)
+from tools.secure_path_utils import validate_path_safety
+router = APIRouter(tags=["Agent"])
+REPO_ROOT = Path(__file__).resolve().parent
+_MAX_INSTRUCTION_LEN = 16_000
+# NOTE: Paths from request bodies are untrusted. Avoid Path.resolve() on untrusted
+# input (CodeQL py/path-injection); instead normalize via os.path and enforce
+# containment under trusted roots.
+# Mirrors app.py api_name values (Gradio).
+GRADIO_API_NAMES: tuple[str, ...] = (
+    "redact_document",
+    "load_and_prepare_documents_or_data",
+    "apply_review_redactions",
+    "review_apply",
+    "pdf_summarise",
+    "tabular_redact",
+    "word_level_ocr_text_search",
+    "redact_data",
+    "find_duplicate_pages",
+    "find_duplicate_tabular",
+    "summarise_document",
+    "combine_review_csvs",
+    "combine_review_pdfs",
+    "export_review_redaction_overlay",
+    "export_review_page_ocr_visualisation",
+    "verify_redaction_coverage",
+)
+def _allowed_path_roots() -> list[Path]:
+    # Return roots without resolving. These are trusted config values, but avoiding
+    # Path.resolve() keeps CodeQL happy and matches our "no resolve on untrusted"
+    # approach elsewhere.
+    roots = [REPO_ROOT]
+    for folder in (INPUT_FOLDER, OUTPUT_FOLDER):
+        if folder:
+            roots.append(Path(str(folder)))
+    return roots
+def _sanitize_untrusted_path_input(path_str: str) -> str:
+    """Basic raw-input validation before any path normalization."""
+    if not isinstance(path_str, str):
+        raise HTTPException(status_code=400, detail="Path must be a string.")
+    cleaned = path_str.strip()
+    if not cleaned:
+        raise HTTPException(status_code=400, detail="Path must not be empty.")
+    if "\x00" in cleaned:
+        raise HTTPException(status_code=400, detail="Path contains invalid null byte.")
+    return cleaned
+def _normalize_untrusted_path_to_abs(path_str: str) -> str:
+    """
+    Expand ~, then normalize to an absolute path.
+    Relative paths are interpreted relative to REPO_ROOT (matching prior behaviour).
+    """
+    safe_input = _sanitize_untrusted_path_input(path_str)
+    expanded = os.path.expanduser(safe_input)
+    if os.path.isabs(expanded):
+        return os.path.normpath(os.path.abspath(expanded))
+    return os.path.normpath(os.path.abspath(os.path.join(str(REPO_ROOT), expanded)))
+def _must_be_under_allowed_roots(candidate_abs: str, original: str) -> None:
+    """Enforce candidate is contained under repo, INPUT_FOLDER, or OUTPUT_FOLDER."""
+    candidate_real = os.path.realpath(str(candidate_abs))
+    allowed_roots = [
+        os.path.realpath(os.path.abspath(str(p))) for p in _allowed_path_roots()
+    ]
+    for root in allowed_roots:
+        try:
+            common = os.path.commonpath([candidate_real, root])
+        except ValueError:
+            # Different drive on Windows or invalid path mix
+            continue
+        if common == root:
+            return
+    raise HTTPException(
+        status_code=403,
+        detail="Path must be under the app repo, INPUT_FOLDER, or OUTPUT_FOLDER",
+    )
+def _path_must_be_allowed_file(path_str: str) -> str:
+    """Resolve path, ensure it is under an allowed root and exists as a file."""
+    candidate_abs = _normalize_untrusted_path_to_abs(path_str)
+    candidate_real = os.path.realpath(candidate_abs)
+    # Validate both "safe path" patterns and containment under trusted roots.
+    _must_be_under_allowed_roots(candidate_real, path_str)
+    ok = any(
+        validate_path_safety(candidate_real, base_path=str(root))
+        for root in _allowed_path_roots()
+    )
+    if not ok:
+        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")
+    try:
+        candidate_path = Path(candidate_real)
+        if not candidate_path.is_file():
+            raise HTTPException(
+                status_code=400, detail=f"Not a file or missing: {candidate_real}"
+            )
+    except OSError:
+        raise HTTPException(
+            status_code=400, detail=f"Not a file or missing: {candidate_real}"
+        )
+    return candidate_real
+def _path_must_be_allowed_directory(path_str: str, *, must_exist: bool = True) -> str:
+    """
+    Normalize and validate a directory path under allowed roots.
+    By default the directory must already exist; callers can opt out (e.g. output_dir
+    that will be created later by the CLI).
+    """
+    candidate_abs = _normalize_untrusted_path_to_abs(path_str)
+    candidate_real = os.path.realpath(candidate_abs)
+    _must_be_under_allowed_roots(candidate_real, path_str)
+    ok = any(
+        validate_path_safety(candidate_real, base_path=str(root))
+        for root in _allowed_path_roots()
+    )
+    if not ok:
+        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")
+    if must_exist:
+        try:
+            if not Path(candidate_real).is_dir():
+                raise HTTPException(
+                    status_code=400, detail=f"Not a directory: {candidate_real}"
+                )
+        except OSError:
+            raise HTTPException(
+                status_code=400, detail=f"Not a directory: {candidate_real}"
+            )
+    return candidate_real
+def _optional_agent_api_key(x_agent_api_key: Optional[str] = Header(None)) -> None:
+    expected = os.environ.get("AGENT_API_KEY", "").strip()
+    if not expected:
+        return
+    if not x_agent_api_key or x_agent_api_key.strip() != expected:
+        raise HTTPException(
+            status_code=401,
+            detail="Set header X-Agent-API-Key to match AGENT_API_KEY environment variable",
+        )
+class AgentRedactDocumentRequest(BaseModel):
+    """Parity with Gradio api_name ``redact_document``."""
+    # Required; at least one path. Each entry is validated against the allowed
+    # roots by _merge_redact_direct_mode before reaching the CLI.
+    input_files: list[str] = Field(
+        ...,
+        min_length=1,
+        description="Paths to input files (PDF, images, or tabular/Word for anonymisation)",
+    )
+    # Forwarded to the CLI as ``custom_llm_instructions``; length-capped by the
+    # validator at the bottom of this class.
+    instruction: Optional[str] = Field(
+        None,
+        description="Optional instructions for LLM-based PII detection (custom_llm_instructions)",
+    )
+    # Optional directories; when omitted the CLI defaults are left untouched.
+    output_dir: Optional[str] = None
+    input_dir: Optional[str] = None
+    ocr_method: Optional[str] = Field(
+        None,
+        description=(
+            "High-level OCR/text mode. Accepted values: 'Local OCR', "
+            "'AWS Textract', 'Local text'. To choose a specific local OCR engine "
+            "(e.g. paddle/tesseract/vlm), set "
+            "overrides.chosen_local_ocr_model."
+        ),
+    )
+    pii_detector: Optional[str] = Field(
+        None,
+        description=(
+            "PII detection method. Recommended configured labels: "
+            f"'{LOCAL_PII_OPTION}', '{AWS_PII_OPTION}', '{AWS_LLM_PII_OPTION}', "
+            f"'{INFERENCE_SERVER_PII_OPTION}', '{LOCAL_TRANSFORMERS_LLM_PII_OPTION}', "
+            "'None'."
+        ),
+    )
+    # Free-form CLI overrides; keys are checked against the CLI default-args dict
+    # by the code that consumes this model, not by Pydantic.
+    overrides: Optional[dict[str, Any]] = Field(
+        None,
+        description=(
+            "Optional CLI flag overrides; keys must match argparse destination names. "
+            "For local OCR model selection, set 'chosen_local_ocr_model' "
+            f"(allowed models depend on deployment; configured options: {LOCAL_OCR_MODEL_OPTIONS})."
+        ),
+    )
+    # Example payload surfaced in the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "input_files": [
+                        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
+                    ],
+                    "instruction": "Do not redact the university name.",
+                    "ocr_method": "Local OCR",
+                    "pii_detector": LOCAL_PII_OPTION,
+                    "overrides": {"chosen_local_ocr_model": "paddle"},
+                }
+            ]
+        }
+    }
+    # Rejects (rather than truncates) instructions longer than _MAX_INSTRUCTION_LEN.
+    @field_validator("instruction")
+    @classmethod
+    def _cap_instruction(cls, v: Optional[str]) -> Optional[str]:
+        if v is None:
+            return v
+        if len(v) > _MAX_INSTRUCTION_LEN:
+            raise ValueError(f"instruction exceeds {_MAX_INSTRUCTION_LEN} characters")
+        return v
+# Adds no fields of its own; exists so /redact_data has a distinct schema name.
+class AgentRedactDataRequest(AgentRedactDocumentRequest):
+    """Parity with Gradio api_name ``redact_data``; same CLI task as redact_document."""
+# Common response envelope returned by the task-style /agent endpoints.
+class AgentTaskResponse(BaseModel):
+    status: str  # endpoints in this module set "completed" on success
+    gradio_api_name: str  # name of the route/Gradio api this call corresponds to
+    task: str  # CLI task name or endpoint-specific task label
+    output_dir: str
+    input_dir: str  # empty string when the endpoint has no input directory
+    message: str
+    log_excerpt: Optional[str] = None  # captured console output, when the endpoint records it
+    output_paths: Optional[list[str]] = None  # files written, when the endpoint reports them
+# Request body for POST /verify_redaction_coverage.
+class AgentVerifyRedactionRequest(BaseModel):
+    # Both CSV paths are required and must point at existing files under the allowed roots.
+    review_csv_path: str = Field(..., description="Path to *_review_file.csv")
+    ocr_words_csv_path: str = Field(
+        ..., description="Path to *_ocr_results_with_words_*.csv from the same run"
+    )
+    must_redact: Optional[List[str]] = Field(
+        None,
+        description="Regex patterns for terms that must be covered by review boxes.",
+    )
+    must_not_redact: Optional[List[str]] = Field(
+        None,
+        description="Regex patterns for terms that must not appear in review rows.",
+    )
+    redacted_pdf_path: Optional[str] = Field(
+        None, description="Optional applied *_redacted.pdf for text-layer leak checks."
+    )
+    total_pages: Optional[int] = Field(None, ge=1)  # must be >= 1 when given
+    min_word_length: int = Field(3, ge=1, le=32)  # default 3, bounded to 1..32
+    sample_pixels: bool = Field(
+        False,
+        description="Sample pixel darkness at box centres on redacted PDF (requires redacted_pdf_path).",
+    )
+    auto_prune_suspicious: bool = Field(
+        False,
+        description="Remove prunable suspicious short/OCR-fragment rows and write pruned CSV.",
+    )
+    pruned_output_path: Optional[str] = Field(
+        None,
+        description="Output path for pruned CSV when auto_prune_suspicious is true.",
+    )
+# Response body for POST /verify_redaction_coverage.
+class AgentVerifyRedactionResponse(BaseModel):
+    status: str
+    gradio_api_name: str = "verify_redaction_coverage"
+    # The endpoint fills coverage_pass and coverage_pass_strict from the same report value.
+    coverage_pass: bool
+    coverage_pass_strict: bool
+    coverage_pass_with_cleanup: bool
+    pruned_csv_path: Optional[str] = None
+    prune_log: Optional[Dict[str, Any]] = None
+    report: Dict[str, Any]  # full report dict as returned by the verification helper
+# Request body for POST /word_level_ocr_text_search.
+class AgentWordLevelOcrSearchRequest(BaseModel):
+    ocr_words_csv_path: str = Field(
+        ..., description="Path to *_ocr_results_with_words_*.csv"
+    )
+    search_text: str = Field(..., min_length=3, max_length=500)  # 3..500 characters
+    similarity_threshold: float = Field(1.0, ge=0.0, le=1.0)  # default 1.0, bounded to 0..1
+    use_regex: bool = False
+    review_csv_path: Optional[str] = Field(
+        None,
+        description="Optional *_review_file.csv to flag whether each hit is covered by a box.",
+    )
+# Response body for POST /word_level_ocr_text_search.
+class AgentWordLevelOcrSearchResponse(BaseModel):
+    status: str
+    gradio_api_name: str = "word_level_ocr_text_search"
+    result: Dict[str, Any]  # search result dict produced by the search helper
+def _merge_redact_direct_mode(body: AgentRedactDocumentRequest) -> dict[str, Any]:
+    from cli_redact import get_cli_default_args_dict
+    merged: dict[str, Any] = get_cli_default_args_dict()
+    merged["task"] = "redact"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.instruction is not None:
+        merged["custom_llm_instructions"] = body.instruction
+    if body.output_dir is not None:
+        # Output folders may not exist yet (CLI will create). Still constrain to allowed roots.
+        merged["output_dir"] = _path_must_be_allowed_directory(
+            body.output_dir, must_exist=False
+        )
+    if body.input_dir is not None:
+        # Input dir should exist if provided.
+        merged["input_dir"] = _path_must_be_allowed_directory(
+            body.input_dir, must_exist=True
+        )
+    if body.ocr_method is not None:
+        merged["ocr_method"] = body.ocr_method
+    if body.pii_detector is not None:
+        merged["pii_detector"] = body.pii_detector
+    if body.overrides:
+        allowed = set(merged.keys())
+        for key, value in body.overrides.items():
+            if key not in allowed:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Unknown override key '{key}'. Must be a known CLI argument name.",
+                )
+            merged[key] = value
+    return merged
+def _run_cli_main(direct: dict[str, Any], gradio_api_name: str) -> AgentTaskResponse:
+    from cli_redact import main as cli_main
+    buf = io.StringIO()
+    old_stdout = sys.stdout
+    try:
+        sys.stdout = buf
+        cli_main(direct_mode_args=direct)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        sys.stdout = old_stdout
+    log_excerpt = buf.getvalue()
+    if len(log_excerpt) > 8000:
+        log_excerpt = log_excerpt[-8000:]
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name=gradio_api_name,
+        task=str(direct.get("task", "")),
+        output_dir=str(direct.get("output_dir", "")),
+        input_dir=str(direct.get("input_dir", "")),
+        message="cli_redact.main finished; see log_excerpt for console output",
+        log_excerpt=log_excerpt or None,
+    )
+@router.post(
+    "/redact_document",
+    response_model=AgentTaskResponse,
+    summary="redact_document (Gradio api_name)",
+    description=(
+        "Matches Gradio ``api_name='redact_document'``. "
+        "``python cli_redact.py --task redact --input_file ...``. "
+        "Optional ``instruction`` maps to ``custom_llm_instructions``. "
+        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
+        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
+        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
+        "PII methods should use configured labels shown on the request schema."
+    ),
+)
+def post_redact_document(
+    body: AgentRedactDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_document")
+@router.post(
+    "/redact_data",
+    response_model=AgentTaskResponse,
+    summary="redact_data (Gradio api_name)",
+    description=(
+        "Matches Gradio ``api_name='redact_data'``. Same CLI ``redact`` task as "
+        "/redact_document; use CSV/XLSX/DOCX paths for tabular/Word flows. "
+        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
+        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
+        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
+        "PII methods should use configured labels shown on the request schema."
+    ),
+)
+def post_redact_data(
+    body: AgentRedactDataRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_data")
+@router.post(
+    "/tasks/redact",
+    response_model=AgentTaskResponse,
+    summary="Legacy: same as /redact_document",
+    description="Deprecated alias; prefer POST /agent/redact_document.",
+    deprecated=True,
+    include_in_schema=True,
+)
+def post_tasks_redact_legacy(
+    body: AgentRedactDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    direct = _merge_redact_direct_mode(body)
+    return _run_cli_main(direct, "redact_document")
+# Request body for POST /find_duplicate_pages. Fields left as None keep the CLI default.
+class AgentFindDuplicatePagesRequest(BaseModel):
+    input_files: list[str] = Field(..., min_length=1)  # at least one path required
+    similarity_threshold: Optional[float] = None
+    min_word_count: Optional[int] = None
+    min_consecutive_pages: Optional[int] = None
+    greedy_match: Optional[bool] = None  # sent to the CLI as the string "True"/"False"
+    combine_pages: Optional[bool] = None  # sent to the CLI as the string "True"/"False"
+    overrides: Optional[dict[str, Any]] = None  # extra CLI args; keys checked by the endpoint
+@router.post(
+    "/find_duplicate_pages",
+    response_model=AgentTaskResponse,
+    summary="find_duplicate_pages (Gradio api_name)",
+    description="``cli_redact --task deduplicate --duplicate_type pages``.",
+)
+def post_find_duplicate_pages(
+    body: AgentFindDuplicatePagesRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "deduplicate"
+    merged["duplicate_type"] = "pages"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.similarity_threshold is not None:
+        merged["similarity_threshold"] = body.similarity_threshold
+    if body.min_word_count is not None:
+        merged["min_word_count"] = body.min_word_count
+    if body.min_consecutive_pages is not None:
+        merged["min_consecutive_pages"] = body.min_consecutive_pages
+    if body.greedy_match is not None:
+        merged["greedy_match"] = "True" if body.greedy_match else "False"
+    if body.combine_pages is not None:
+        merged["combine_pages"] = "True" if body.combine_pages else "False"
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "find_duplicate_pages")
+# Request body for POST /find_duplicate_tabular. Fields left as None keep the CLI default.
+class AgentFindDuplicateTabularRequest(BaseModel):
+    input_files: list[str] = Field(..., min_length=1)  # at least one path required
+    text_columns: Optional[list[str]] = None
+    similarity_threshold: Optional[float] = None
+    min_word_count: Optional[int] = None
+    overrides: Optional[dict[str, Any]] = None  # extra CLI args; keys checked by the endpoint
+@router.post(
+    "/find_duplicate_tabular",
+    response_model=AgentTaskResponse,
+    summary="find_duplicate_tabular (Gradio api_name)",
+)
+def post_find_duplicate_tabular(
+    body: AgentFindDuplicateTabularRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "deduplicate"
+    merged["duplicate_type"] = "tabular"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.text_columns is not None:
+        merged["text_columns"] = body.text_columns
+    if body.similarity_threshold is not None:
+        merged["similarity_threshold"] = body.similarity_threshold
+    if body.min_word_count is not None:
+        merged["min_word_count"] = body.min_word_count
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "find_duplicate_tabular")
+# Request body for POST /summarise_document. Fields left as None keep the CLI default.
+class AgentSummariseDocumentRequest(BaseModel):
+    input_files: list[str] = Field(..., min_length=1)  # at least one path required
+    summarisation_inference_method: Optional[str] = None
+    summarisation_format: Optional[str] = None
+    summarisation_context: Optional[str] = None
+    summarisation_additional_instructions: Optional[str] = None
+    overrides: Optional[dict[str, Any]] = None  # extra CLI args; keys checked by the endpoint
+@router.post(
+    "/summarise_document",
+    response_model=AgentTaskResponse,
+    summary="summarise_document (Gradio api_name)",
+)
+def post_summarise_document(
+    body: AgentSummariseDocumentRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "summarise"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.summarisation_inference_method is not None:
+        merged["summarisation_inference_method"] = body.summarisation_inference_method
+    if body.summarisation_format is not None:
+        merged["summarisation_format"] = body.summarisation_format
+    if body.summarisation_context is not None:
+        merged["summarisation_context"] = body.summarisation_context
+    if body.summarisation_additional_instructions is not None:
+        merged["summarisation_additional_instructions"] = (
+            body.summarisation_additional_instructions
+        )
+    if body.overrides:
+        allowed = set(merged.keys())
+        for k, v in body.overrides.items():
+            if k not in allowed:
+                raise HTTPException(400, f"Unknown override key: {k}")
+            merged[k] = v
+    return _run_cli_main(merged, "summarise_document")
+# Request body for POST /combine_review_pdfs.
+class AgentCombineReviewPdfsRequest(BaseModel):
+    input_files: list[str] = Field(..., min_length=2)  # combining needs at least two files
+    output_dir: Optional[str] = None  # when omitted the CLI default output_dir is used
+@router.post(
+    "/combine_review_pdfs",
+    response_model=AgentTaskResponse,
+    summary="combine_review_pdfs (Gradio api_name)",
+)
+def post_combine_review_pdfs(
+    body: AgentCombineReviewPdfsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from cli_redact import get_cli_default_args_dict
+    merged = get_cli_default_args_dict()
+    merged["task"] = "combine_review_pdfs"
+    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
+    if body.output_dir is not None:
+        merged["output_dir"] = _path_must_be_allowed_directory(body.output_dir)
+    return _run_cli_main(merged, "combine_review_pdfs")
+class _NamedPath:
+    """merge_csv_files expects objects with a .name attribute (Gradio file-like)."""
+    # __slots__ restricts instances to the single ``name`` attribute.
+    __slots__ = ("name",)
+    def __init__(self, path: str) -> None:
+        # Store the path string under the attribute name the consumer reads.
+        self.name = path
+# Request body for POST /combine_review_csvs.
+class AgentCombineReviewCsvsRequest(BaseModel):
+    input_files: list[str] = Field(..., min_length=1)  # at least one CSV path required
+    output_dir: Optional[str] = Field(
+        None, description="Defaults to config OUTPUT_FOLDER"
+    )
+class AgentApplyReviewRedactionsRequest(BaseModel):
+    """Headless parity with Gradio ``api_name='apply_review_redactions'`` (prepare + apply)."""
+    # Both file paths are required and are validated as existing files by the endpoint.
+    pdf_path: str = Field(
+        ...,
+        description="Path to the source PDF under allowed roots.",
+    )
+    review_csv_path: str = Field(
+        ...,
+        description=(
+            "Path to the review plan CSV; basename must contain '_review_file' "
+            "(e.g. mydoc_review_file.csv)."
+        ),
+    )
+    # Optional directories; the endpoint validates containment but not existence.
+    output_dir: Optional[str] = Field(
+        None,
+        description="Output directory (created if missing); defaults to OUTPUT_FOLDER.",
+    )
+    input_dir: Optional[str] = Field(
+        None,
+        description="Input/working directory for page images; defaults to INPUT_FOLDER.",
+    )
+    # Passed through unchanged to the apply helper.
+    text_extract_method: Optional[str] = Field(
+        None,
+        description="OCR/text mode passed to prepare (defaults to CLI ocr_method).",
+    )
+    efficient_ocr: Optional[bool] = Field(
+        None,
+        description="If set, overrides EFFICIENT_OCR for the prepare step.",
+    )
+@router.post(
+    "/combine_review_csvs",
+    response_model=AgentTaskResponse,
+    summary="combine_review_csvs (Gradio api_name)",
+    description="Uses tools.helper_functions.merge_csv_files (not cli_redact).",
+)
+def post_combine_review_csvs(
+    body: AgentCombineReviewCsvsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from tools.helper_functions import merge_csv_files
+    paths = [_NamedPath(_path_must_be_allowed_file(p)) for p in body.input_files]
+    out_dir = body.output_dir or OUTPUT_FOLDER
+    out_dir_resolved = _path_must_be_allowed_directory(str(out_dir), must_exist=True)
+    sep = "/" if not out_dir_resolved.endswith(("/", "\\")) else ""
+    out_files = merge_csv_files(paths, output_folder=out_dir_resolved + sep)
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="combine_review_csvs",
+        task="combine_review_csvs",
+        output_dir=out_dir_resolved,
+        input_dir="",
+        message="merge_csv_files completed",
+        output_paths=out_files,
+    )
+class AgentExportReviewRedactionOverlayRequest(BaseModel):
+    """Agent JSON body for the same overlay render as Gradio ``api_name='page_redaction_review_image'``."""
+    # Required; validated by the endpoint as an existing file under the allowed roots.
+    page_image_path: str = Field(
+        ...,
+        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
+    )
+    # Required; at least one box. Box contents are not validated by this model.
+    boxes: List[Dict[str, Any]] = Field(
+        ...,
+        min_length=1,
+        description="Annotator-style boxes: label, color, xmin, ymin, xmax, ymax (normalized 0–1).",
+    )
+    page_number: int = Field(
+        1, ge=1, description="1-based page index for the output filename."
+    )
+    doc_base_name: str = Field(
+        "review",
+        description="Basename for output file (e.g. document name without extension).",
+    )
+    # When provided, the endpoint converts these rows into a pandas DataFrame.
+    review_df_records: Optional[List[Dict[str, Any]]] = Field(
+        None,
+        description="Optional rows (include at least 'label') for stable label→line-pattern mapping.",
+    )
+    # Bounded to 0..24 when given.
+    label_abbrev_chars: Optional[int] = Field(
+        None,
+        ge=0,
+        le=24,
+        description="Draw this many leading characters of each label on the image; omit to use REVIEW_OVERLAY_LABEL_ABBREV_CHARS from config (0 = off).",
+    )
+class AgentExportReviewPageOcrVisualisationRequest(BaseModel):
+    """Agent JSON body for the same OCR visualisation as Gradio ``api_name='page_ocr_review_image'``."""
+    # Required; validated by the endpoint as an existing file under the allowed roots.
+    page_image_path: str = Field(
+        ...,
+        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
+    )
+    # Required; forwarded as-is to the visualisation helper (structure not validated here).
+    ocr_results: Dict[str, Any] = Field(
+        ...,
+        description="Word-level OCR results dict (line_key -> {words:[{text, bounding_box, conf, ...}]}).",
+    )
+    page_number: int = Field(
+        1, ge=1, description="1-based page index (used for naming)."
+    )
+    # Used by the endpoint to build the output image file name.
+    doc_base_name: str = Field(
+        "review",
+        description="Basename for output file (e.g. document name without extension).",
+    )
+@router.post(
+    "/export_review_redaction_overlay",
+    response_model=AgentTaskResponse,
+    summary="export_review_redaction_overlay (Agent API; Gradio api_name: page_redaction_review_image)",
+    description=(
+        "Renders hollow redaction outlines and a top-right legend on the page image; "
+        "writes ``redaction_overlay/{doc_base_name}_page{n}_redaction_overlay.jpg`` under OUTPUT_FOLDER "
+        "(scaled per REVIEW_OVERLAY_MAX_PIXELS, JPEG capped by REVIEW_OVERLAY_MAX_FILE_BYTES). "
+        "Uses ``tools.redaction_review.visualise_review_redaction_boxes``."
+    ),
+)
+def post_export_review_redaction_overlay(
+    body: AgentExportReviewRedactionOverlayRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    import pandas as pd
+    from tools.redaction_review import visualise_review_redaction_boxes
+    img_path = _path_must_be_allowed_file(body.page_image_path)
+    annotator: dict[str, Any] = {"image": img_path, "boxes": body.boxes}
+    review_df = (
+        pd.DataFrame(body.review_df_records)
+        if body.review_df_records
+        else pd.DataFrame()
+    )
+    out_folder_abs = os.path.realpath(
+        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
+    )
+    if not validate_path_safety(out_folder_abs):
+        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
+    _must_be_under_allowed_roots(out_folder_abs, str(out_folder_abs))
+    try:
+        Path(out_folder_abs).mkdir(parents=True, exist_ok=True)
+    except OSError:
+        raise HTTPException(status_code=500, detail="Could not create OUTPUT_FOLDER")
+    out_folder = out_folder_abs
+    path = visualise_review_redaction_boxes(
+        annotator,
+        review_df=review_df,
+        output_folder=out_folder,
+        page_number=body.page_number,
+        doc_base_name=body.doc_base_name,
+        label_abbrev_chars=body.label_abbrev_chars,
+    )
+    if not path:
+        raise HTTPException(
+            status_code=500,
+            detail=(
+                "Could not produce overlay PNG (invalid image/boxes or write failed). "
+                "Ensure boxes are valid and the image loads."
+            ),
+        )
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="export_review_redaction_overlay",
+        task="export_review_redaction_overlay",
+        output_dir=out_folder,
+        input_dir="",
+        message="Redaction overlay PNG written",
+        output_paths=[path],
+    )
+@router.post(
+    "/export_review_page_ocr_visualisation",
+    response_model=AgentTaskResponse,
+    summary="export_review_page_ocr_visualisation (Agent API; Gradio api_name: page_ocr_review_image)",
+    description=(
+        "Renders a per-page OCR visualisation using tools.file_redaction.visualise_ocr_words_bounding_boxes; "
+        "writes under OUTPUT_FOLDER/review_ocr_visualisations/."
+    ),
+)
+def post_export_review_page_ocr_visualisation(
+    body: AgentExportReviewPageOcrVisualisationRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from PIL import Image
+    from tools.file_redaction import visualise_ocr_words_bounding_boxes
+    img_path = _path_must_be_allowed_file(body.page_image_path)
+    out_folder_abs = os.path.realpath(
+        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
+    )
+    if not validate_path_safety(out_folder_abs):
+        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
+    _must_be_under_allowed_roots(out_folder_abs, str(out_folder_abs))
+    try:
+        Path(out_folder_abs).mkdir(parents=True, exist_ok=True)
+    except OSError:
+        raise HTTPException(status_code=500, detail="Could not create OUTPUT_FOLDER")
+    out_folder = out_folder_abs
+    safe_base = str(body.doc_base_name or "review")
+    image_name = f"{safe_base}_page{int(body.page_number)}.png"
+    log_paths: list[str] = []
+    try:
+        log_paths = visualise_ocr_words_bounding_boxes(
+            Image.open(img_path).convert("RGB"),
+            body.ocr_results,
+            image_name=image_name,
+            output_folder=out_folder,
+            visualisation_folder="review_ocr_visualisations",
+            add_legend=True,
+            log_files_output_paths=log_paths,
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    if not log_paths:
+        raise HTTPException(
+            status_code=500,
+            detail="Could not produce OCR visualisation (invalid image/ocr_results or write failed).",
+        )
+    out_path = log_paths[-1]
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="export_review_page_ocr_visualisation",
+        task="export_review_page_ocr_visualisation",
+        output_dir=out_folder,
+        input_dir="",
+        message="OCR visualisation written",
+        output_paths=[out_path],
+    )
+def _gradio_only(api_name: str, detail: str) -> JSONResponse:
+    return JSONResponse(
+        status_code=501,
+        content={
+            "gradio_api_name": api_name,
+            "detail": detail,
+            "hint": (
+                "This flow is Gradio-session stateful. Call the named route on the "
+                "Gradio HTTP API, not /agent."
+            ),
+            "gradio_http": {
+                "discover_schema": "GET /gradio_api/info",
+                "start_call": f"POST /gradio_api/call/{api_name}",
+                "request_body_shape": '{"data": [<args in schema order>]}',
+                "poll": f"GET /gradio_api/call/{api_name}/{{event_id}}",
+            },
+            "gradio_client_notes": [
+                "Pass api_name explicitly; do not rely on inferring the endpoint from "
+                "Python function names (large Blocks apps will look ambiguous).",
+                "If predict() still cannot resolve the route, open GET /gradio_api/info "
+                "and use the numeric fn_index with gradio_client, or call the HTTP "
+                "endpoints directly.",
+                "The length of data must match the parameter list for this deployment; "
+                "copy order and types from /gradio_api/info.",
+            ],
+        },
+    )
+@router.post("/load_and_prepare_documents_or_data")
+def post_load_and_prepare_documents_or_data() -> JSONResponse:
+    return _gradio_only(
+        "load_and_prepare_documents_or_data",
+        "Preparation uses Gradio session state and prepare_image_or_pdf_with_efficient_ocr; no single CLI task.",
+    )
+@router.post(
+    "/apply_review_redactions",
+    response_model=AgentTaskResponse,
+    summary="apply_review_redactions (Gradio api_name)",
+    description=(
+        "Runs prepare_image_or_pdf_with_efficient_ocr([pdf, review_csv]) then "
+        "apply_redactions_to_review_df_and_files — same core pipeline as the Review tab, "
+        "without Gradio session state. Requires paths under allowed roots."
+    ),
+)
+def post_apply_review_redactions(
+    body: AgentApplyReviewRedactionsRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentTaskResponse:
+    from tools.simplified_api import run_apply_review_redactions
+    pdf = _path_must_be_allowed_file(body.pdf_path)
+    csv = _path_must_be_allowed_file(body.review_csv_path)
+    out_dir: str | None = None
+    if body.output_dir is not None:
+        out_dir = _path_must_be_allowed_directory(body.output_dir, must_exist=False)
+    in_dir: str | None = None
+    if body.input_dir is not None:
+        in_dir = _path_must_be_allowed_directory(body.input_dir, must_exist=False)
+    try:
+        result = run_apply_review_redactions(
+            pdf_path=pdf,
+            review_csv_path=csv,
+            output_dir=out_dir,
+            input_dir=in_dir,
+            text_extract_method=body.text_extract_method,
+            efficient_ocr=body.efficient_ocr,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"apply_review_redactions failed: {e}",
+        ) from e
+    return AgentTaskResponse(
+        status="completed",
+        gradio_api_name="apply_review_redactions",
+        task="apply_review_redactions",
+        output_dir=result["output_dir"],
+        input_dir=result["input_dir"],
+        message=result["message"],
+        output_paths=result.get("output_paths"),
+    )
+@router.post(
+    "/verify_redaction_coverage",
+    response_model=AgentVerifyRedactionResponse,
+    summary="verify_redaction_coverage (Pass 1 programmatic QA)",
+)
+def post_verify_redaction_coverage(
+    body: AgentVerifyRedactionRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentVerifyRedactionResponse:
+    from tools.simplified_api import run_verify_redaction_coverage
+    review = _path_must_be_allowed_file(body.review_csv_path)
+    ocr_words = _path_must_be_allowed_file(body.ocr_words_csv_path)
+    redacted = None
+    if body.redacted_pdf_path:
+        redacted = _path_must_be_allowed_file(body.redacted_pdf_path)
+    try:
+        report, pruned_csv_path, prune_log = run_verify_redaction_coverage(
+            review_csv_path=review,
+            ocr_words_csv_path=ocr_words,
+            must_redact=body.must_redact,
+            must_not_redact=body.must_not_redact,
+            redacted_pdf_path=redacted,
+            total_pages=body.total_pages,
+            min_word_length=body.min_word_length,
+            sample_pixels=body.sample_pixels,
+            auto_prune_suspicious=body.auto_prune_suspicious,
+            pruned_output_path=body.pruned_output_path,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"verify_redaction_coverage failed: {e}"
+        ) from e
+    return AgentVerifyRedactionResponse(
+        status="completed",
+        coverage_pass=bool(report.get("pass_strict", report.get("pass"))),
+        coverage_pass_strict=bool(report.get("pass_strict", report.get("pass"))),
+        coverage_pass_with_cleanup=bool(report.get("pass_with_cleanup")),
+        pruned_csv_path=pruned_csv_path,
+        prune_log=prune_log,
+        report=report,
+    )
+@router.post(
+    "/word_level_ocr_text_search",
+    response_model=AgentWordLevelOcrSearchResponse,
+    summary="word_level_ocr_text_search (headless OCR CSV search)",
+)
+def post_word_level_ocr_text_search(
+    body: AgentWordLevelOcrSearchRequest,
+    _: None = Depends(_optional_agent_api_key),
+) -> AgentWordLevelOcrSearchResponse:
+    from tools.simplified_api import run_word_level_ocr_text_search_api
+    ocr_words = _path_must_be_allowed_file(body.ocr_words_csv_path)
+    review = None
+    if body.review_csv_path:
+        review = _path_must_be_allowed_file(body.review_csv_path)
+    try:
+        result = run_word_level_ocr_text_search_api(
+            ocr_words_csv_path=ocr_words,
+            search_text=body.search_text,
+            similarity_threshold=body.similarity_threshold,
+            use_regex=body.use_regex,
+            review_csv_path=review,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"word_level_ocr_text_search failed: {e}"
+        ) from e
+    return AgentWordLevelOcrSearchResponse(status="completed", result=result)
+@router.get("/operations")
+def list_operations() -> dict[str, Any]:
+    return {
+        "gradio_api_names": list(GRADIO_API_NAMES),
+        "gradio_session_state_endpoints": {
+            "description": (
+                "These api_name values are exposed on the Gradio HTTP API but return "
+                "501 on /agent because they depend on in-memory Gradio state."
+            ),
+            "discover_schema": "GET /gradio_api/info",
+            "call_pattern": 'POST /gradio_api/call/<api_name> with JSON body {"data": [...]}',
+            "names": [
+                "load_and_prepare_documents_or_data",
+            ],
+        },
+        "routes": [
+            {
+                "gradio_api_name": "redact_document",
+                "method": "POST",
+                "path": "/agent/redact_document",
+                "implementation": "cli_redact task redact",
+                "notes": {
+                    "ocr_method": [
+                        "Local OCR",
+                        "AWS Textract",
+                        "Local text",
+                    ],
+                    "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
+                    "pii_detector_recommended": [
+                        LOCAL_PII_OPTION,
+                        AWS_PII_OPTION,
+                        AWS_LLM_PII_OPTION,
+                        INFERENCE_SERVER_PII_OPTION,
+                        LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+                        "None",
+                    ],
+                },
+            },
+            {
+                "gradio_api_name": "redact_data",
+                "method": "POST",
+                "path": "/agent/redact_data",
+                "implementation": "cli_redact task redact",
+                "notes": {
+                    "ocr_method": [
+                        "Local OCR",
+                        "AWS Textract",
+                        "Local text",
+                    ],
+                    "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
+                    "pii_detector_recommended": [
+                        LOCAL_PII_OPTION,
+                        AWS_PII_OPTION,
+                        AWS_LLM_PII_OPTION,
+                        INFERENCE_SERVER_PII_OPTION,
+                        LOCAL_TRANSFORMERS_LLM_PII_OPTION,
+                        "None",
+                    ],
+                },
+            },
+            {
+                "gradio_api_name": "find_duplicate_pages",
+                "method": "POST",
+                "path": "/agent/find_duplicate_pages",
+                "implementation": "cli_redact deduplicate pages",
+            },
+            {
+                "gradio_api_name": "find_duplicate_tabular",
+                "method": "POST",
+                "path": "/agent/find_duplicate_tabular",
+                "implementation": "cli_redact deduplicate tabular",
+            },
+            {
+                "gradio_api_name": "summarise_document",
+                "method": "POST",
+                "path": "/agent/summarise_document",
+                "implementation": "cli_redact task summarise",
+            },
+            {
+                "gradio_api_name": "combine_review_pdfs",
+                "method": "POST",
+                "path": "/agent/combine_review_pdfs",
+                "implementation": "cli_redact combine_review_pdfs",
+            },
+            {
+                "gradio_api_name": "export_review_redaction_overlay",
+                "method": "POST",
+                "path": "/agent/export_review_redaction_overlay",
+                "implementation": "visualise_review_redaction_boxes",
+            },
+            {
+                "gradio_api_name": "export_review_page_ocr_visualisation",
+                "method": "POST",
+                "path": "/agent/export_review_page_ocr_visualisation",
+                "implementation": "visualise_ocr_words_bounding_boxes",
+            },
+            {
+                "gradio_api_name": "combine_review_csvs",
+                "method": "POST",
+                "path": "/agent/combine_review_csvs",
+                "implementation": "helper merge_csv_files",
+            },
+            {
+                "gradio_api_name": "load_and_prepare_documents_or_data",
+                "method": "POST",
+                "path": "/agent/load_and_prepare_documents_or_data",
+                "implementation": "not_implemented_http",
+            },
+            {
+                "gradio_api_name": "apply_review_redactions",
+                "method": "POST",
+                "path": "/agent/apply_review_redactions",
+                "implementation": "tools.simplified_api.run_apply_review_redactions",
+            },
+            {
+                "gradio_api_name": "verify_redaction_coverage",
+                "method": "POST",
+                "path": "/agent/verify_redaction_coverage",
+                "implementation": "tools.verify_redaction_coverage.verify_redaction_coverage",
+                "notes": {
+                    "purpose": "Pass 1 programmatic QA — pass_strict (policy), pass_with_cleanup (+ suspicious rows), optional prune and text/pixel checks.",
+                    "must_redact": "list of regex strings",
+                    "must_not_redact": "list of regex strings",
+                    "auto_prune_suspicious": "remove short OCR-fragment rows before reporting",
+                    "pages_flagged_for_vlm": "policy/visual failures only",
+                    "pages_needing_csv_cleanup": "suspicious rows — prune, not VLM",
+                    "leak_likely_causes": "per-page hints when text_layer_leaks (coord_not_normalized, missing_page_boxes, etc.) — not a broken /review_apply",
+                },
+            },
+            {
+                "gradio_api_name": "word_level_ocr_text_search",
+                "method": "POST",
+                "path": "/agent/word_level_ocr_text_search",
+                "implementation": "tools.verify_redaction_coverage.run_word_level_ocr_text_search",
+            },
+        ],
+    }
+@router.get("/health")
+def agent_health() -> dict[str, str]:
+    return {"status": "ok", "service": "agent"}

app.py ADDED Viewed

The diff for this file is too large to render. See raw diff

cdk/__init__.py ADDED Viewed

File without changes

cdk/app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import os
+from aws_cdk import App, Environment
+from cdk_appregistry import register_doc_redaction_application
+from cdk_config import (
+    ALB_NAME,
+    APPREGISTRY_APPLICATION_NAME,
+    APPREGISTRY_ATTRIBUTE_GROUP_NAME,
+    APPREGISTRY_DESCRIPTION,
+    APPREGISTRY_REPOSITORY_URL,
+    APPREGISTRY_STACK_NAME,
+    AWS_ACCOUNT_ID,
+    AWS_REGION,
+    CDK_CONTEXT_FILE,
+    CDK_PREFIX,
+    ENABLE_APPREGISTRY,
+    RUN_USEAST_STACK,
+    USE_CLOUDFRONT,
+)
+from cdk_functions import (
+    create_basic_config_env,
+    is_resource_delete_protection_enabled,
+    load_context_from_file,
+    log_aws_credential_context,
+    purge_cdk_lookup_context,
+)
+from cdk_stack import CdkStack, CdkStackCloudfront  # , CdkStackMain
+from check_resources import CONTEXT_FILE, check_and_set_context
+# Initialize the CDK app
+app = App()
+log_aws_credential_context(
+    expected_account_id=AWS_ACCOUNT_ID,
+    expected_region=AWS_REGION,
+)
+# Drop stale CDK lookup cache entries (require bootstrap lookup role in target account).
+purge_cdk_lookup_context(CDK_CONTEXT_FILE)
+# --- Pre-check context (boto3) — written to precheck.context.json, NOT cdk.context.json ---
+print(f"Pre-check context file: {CONTEXT_FILE}")
+print(f"CDK lookup cache file: {CDK_CONTEXT_FILE}")
+if os.path.basename(CONTEXT_FILE.replace("\\", "/")) == os.path.basename(
+    CDK_CONTEXT_FILE.replace("\\", "/")
+):
+    raise RuntimeError(
+        f"CONTEXT_FILE and CDK_CONTEXT_FILE must differ (got '{CONTEXT_FILE}' for both). "
+        "Set CONTEXT_FILE=precheck.context.json in config/cdk_config.env."
+    )
+print("Running pre-check script to generate application context...")
+try:
+    check_and_set_context()
+    if not os.path.exists(CONTEXT_FILE):
+        raise RuntimeError(
+            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
+        )
+    print(f"Context generated successfully at {CONTEXT_FILE}.")
+except Exception as e:
+    raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
+# Pre-check must not repopulate CDK lookup keys; purge again if paths were ever shared.
+purge_cdk_lookup_context(CDK_CONTEXT_FILE)
+if os.path.exists(CONTEXT_FILE):
+    load_context_from_file(app, CONTEXT_FILE)
+else:
+    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")
+create_basic_config_env("config")
+aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
+_stack_delete_protection = is_resource_delete_protection_enabled()
+regional_stack = CdkStack(
+    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
+)
+regional_stack.termination_protection = _stack_delete_protection
+if ENABLE_APPREGISTRY == "True":
+    # Use pre-check context only — not regional_stack.params (avoids AppRegistry
+    # -> RedactionStack dependency cycle during synth).
+    _alb_dns_context = app.node.try_get_context(f"dns:{ALB_NAME}")
+    _alb_dns_name = (
+        _alb_dns_context.strip()
+        if isinstance(_alb_dns_context, str) and _alb_dns_context.strip()
+        else None
+    )
+    appregistry_stack = register_doc_redaction_application(
+        app,
+        aws_account_id=AWS_ACCOUNT_ID,
+        aws_region=AWS_REGION,
+        application_name=APPREGISTRY_APPLICATION_NAME,
+        application_description=APPREGISTRY_DESCRIPTION,
+        appregistry_stack_name=APPREGISTRY_STACK_NAME,
+        attribute_group_name=APPREGISTRY_ATTRIBUTE_GROUP_NAME,
+        repository_url=APPREGISTRY_REPOSITORY_URL,
+        cdk_prefix=CDK_PREFIX,
+        use_cloudfront=USE_CLOUDFRONT,
+        alb_dns_name=_alb_dns_name,
+    )
+    appregistry_stack.termination_protection = _stack_delete_protection
+if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
+    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
+    cloudfront_stack = CdkStackCloudfront(
+        app,
+        "RedactionStackCloudfront",
+        env=aws_env_us_east_1,
+        alb_arn=regional_stack.params["alb_arn_output"],
+        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
+        alb_dns_name=regional_stack.params["alb_dns_name"],
+        cross_region_references=True,
+    )
+    cloudfront_stack.termination_protection = _stack_delete_protection
+# CDK CLI invokes this script and expects a cloud assembly in cdk.out.
+# Without app.synth(), Python defines constructs but never writes manifest.json
+# (ENOENT on deploy). See: https://github.com/aws/aws-cdk/issues/11023
+app.synth()

cdk/cdk.json.example ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "app": "python app.py",
+  "output": "cdk.out",
+  "context": {
+    "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": false
+  }
+}