seanpedrickcase commited on
Commit
a2e06b3
·
0 Parent(s):

Sync: Merge pull request #199 from seanpedrick-case/startup_optimise

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .coveragerc +56 -0
  2. .dockerignore +57 -0
  3. .gitattributes +9 -0
  4. .github/scripts/setup_test_data.py +320 -0
  5. .github/workflow_README.md +183 -0
  6. .github/workflows/archive_workflows/multi-os-test.yml +115 -0
  7. .github/workflows/ci.yml +269 -0
  8. .github/workflows/simple-test.yml +74 -0
  9. .github/workflows/sync-pi-agent-space.yml +64 -0
  10. .github/workflows/sync_to_hf.yml +54 -0
  11. .github/workflows/sync_to_hf_zero_gpu.yml +59 -0
  12. .gitignore +74 -0
  13. AGENTS.md +113 -0
  14. Dockerfile +235 -0
  15. LICENSE +661 -0
  16. MANIFEST.in +4 -0
  17. README.md +367 -0
  18. README_PYPI.md +351 -0
  19. agent-redact/README.md +29 -0
  20. agent-redact/pi-agent/.dockerignore +10 -0
  21. agent-redact/pi-agent/.gitattributes +2 -0
  22. agent-redact/pi-agent/Dockerfile +176 -0
  23. agent-redact/pi-agent/README.md +46 -0
  24. agent-redact/pi-agent/entrypoint-ecs.sh +12 -0
  25. agent-redact/pi-agent/entrypoint.sh +36 -0
  26. agent-redact/pi-agent/sync-manifest.txt +12 -0
  27. agent-redact/pi-agent/sync_to_space.sh +42 -0
  28. agent-redact/pi/agent/README.md +194 -0
  29. agent-redact/pi/agent/models.json +31 -0
  30. agent-redact/pi/agent/settings.json +32 -0
  31. agent-redact/pi/bootstrap_pi_config.py +192 -0
  32. agent-redact/pi/gradio_app.py +0 -0
  33. agent-redact/pi/output_files.py +423 -0
  34. agent-redact/pi/pi_agent_config.py +857 -0
  35. agent-redact/pi/pi_examples.py +180 -0
  36. agent-redact/pi/pi_rpc_client.py +989 -0
  37. agent-redact/pi/pi_session_usage.py +185 -0
  38. agent-redact/pi/pi_workspace_skills.py +392 -0
  39. agent-redact/pi/redaction_prompt.py +756 -0
  40. agent-redact/pi/remote_redaction.py +410 -0
  41. agent-redact/pi/run_doc_redact.py +87 -0
  42. agent-redact/pi/session_logs.py +124 -0
  43. agent-redact/pi/session_workspace.py +212 -0
  44. agent-redact/pi/start.sh +26 -0
  45. agent-redact/requirements_pi_agent.txt +34 -0
  46. agent_routes.py +1167 -0
  47. app.py +0 -0
  48. cdk/__init__.py +0 -0
  49. cdk/app.py +123 -0
  50. cdk/cdk.json.example +7 -0
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.dockerignore ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ *.qmd
5
+ *.json.bak.*
6
+ _quarto.yml
7
+ quarto_site/*
8
+ src/*
9
+ redaction_deps/*
10
+ .venv/*
11
+ examples/*
12
+ processing/*
13
+ tools/__pycache__/*
14
+ old_code/*
15
+ tesseract/*
16
+ poppler/*
17
+ build/*
18
+ dist/*
19
+ docs/*
20
+ .pi/*
21
+ build_deps/*
22
+ user_guide/*
23
+ _extensions/*
24
+ workspace/*
25
+ doc_redaction.egg-info/*
26
+ .venv_pypi_test/*
27
+ cdk/config/*
28
+ tld/*
29
+ cdk/config/*
30
+ cdk/cdk.out/*
31
+ cdk/archive/*
32
+ cdk.json
33
+ cdk.context.json
34
+ .quarto/*
35
+ logs/
36
+ output/
37
+ input/
38
+ feedback/
39
+ # Exclude local secrets; allow committed *.example templates (Pi agent + main app images).
40
+ config/*
41
+ !config/pi_agent.env.example
42
+ !config/app_config.env.example
43
+ !config/docker_app_config.env.example
44
+ usage/
45
+ test/config/*
46
+ test/feedback/*
47
+ test/input/*
48
+ test/logs/*
49
+ test/output/*
50
+ test/tmp/*
51
+ test/usage/*
52
+ .ruff_cache/*
53
+ model_cache/*
54
+ sanitized_file/*
55
+ src/doc_redaction.egg-info/*
56
+ docker_compose/*
57
+ skills/example_prompts/*
.gitattributes ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.sh text eol=lf
3
+ *.jpg filter=lfs diff=lfs merge=lfs -text
4
+ *.xls filter=lfs diff=lfs merge=lfs -text
5
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
6
+ *.docx filter=lfs diff=lfs merge=lfs -text
7
+ *.doc filter=lfs diff=lfs merge=lfs -text
8
+ *.png filter=lfs diff=lfs merge=lfs -text
9
+ *.ico filter=lfs diff=lfs merge=lfs -text
.github/scripts/setup_test_data.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for GitHub Actions test data.
4
+ Creates dummy test files when example data is not available.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ import pandas as pd
11
+
12
+
13
def create_directories():
    """Ensure the example-data directory tree exists.

    Each directory is created when absent (existing ones are left untouched)
    and its path is echoed to stdout.
    """
    required_dirs = (
        "doc_redaction/example_data",
        "doc_redaction/example_data/example_outputs",
    )
    for target in required_dirs:
        os.makedirs(target, exist_ok=True)
        print(f"Created directory: {target}")
20
+
21
+
22
def _load_reportlab():
    """Return ``(letter, canvas)`` from ReportLab, or ``None`` if unavailable.

    When the first import fails, one attempt is made to install ``reportlab``
    with the running interpreter's pip before importing again. Any failure of
    that attempt is reported on stdout and signalled by returning ``None``.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        import importlib
        import subprocess

        try:
            # Use this interpreter's pip so the package lands in the
            # environment that is actually running the script.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "reportlab"]
            )
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except (subprocess.CalledProcessError, OSError, ImportError) as err:
            print(f"Could not install reportlab: {err}")
            return None
    return letter, canvas


def _write_dummy_pdf(canvas_module, page_size, pdf_path, pages):
    """Render *pages* into a PDF at *pdf_path*.

    Each page is a sequence of ``(y, text)`` pairs; every string is drawn at
    x=100. A page break is emitted between pages, and ``save()`` finishes the
    last one.
    """
    pdf = canvas_module.Canvas(pdf_path, pagesize=page_size)
    last_index = len(pages) - 1
    for index, page in enumerate(pages):
        for y, text in page:
            pdf.drawString(100, y, text)
        if index != last_index:
            pdf.showPage()
    pdf.save()


def create_dummy_pdf():
    """Create dummy PDFs for testing.

    Three documents are written under ``doc_redaction/example_data``. If
    ReportLab cannot be imported (even after an install attempt), plain-text
    placeholder files are written to the same ``.pdf`` paths instead, so the
    expected file names still exist.
    """
    data_dir = "doc_redaction/example_data"

    # (path, pages, placeholder text used when ReportLab is unavailable)
    documents = [
        (
            f"{data_dir}/example_of_emails_sent_to_a_professor_before_applying.pdf",
            [
                [
                    (750, "This is a test document for redaction testing."),
                    (700, "Email: test@example.com"),
                    (650, "Phone: 123-456-7890"),
                    (600, "Name: John Doe"),
                    (550, "Address: 123 Test Street, Test City, TC 12345"),
                ],
                [
                    (750, "Second page content"),
                    (700, "More test data: jane.doe@example.com"),
                    (650, "Another phone: 987-654-3210"),
                ],
            ],
            "This is a dummy PDF file for testing",
        ),
        (
            f"{data_dir}/Partnership-Agreement-Toolkit_0_0.pdf",
            [
                [
                    (750, "Partnership Agreement Toolkit"),
                    (700, "This is a test partnership agreement document."),
                    (650, "Contact: partnership@example.com"),
                    (600, "Phone: (555) 123-4567"),
                    (550, "Address: 123 Partnership Street, City, State 12345"),
                ],
                [
                    (750, "Page 2 - Partnership Details"),
                    (700, "More partnership information here."),
                    (650, "Contact: info@partnership.org"),
                ],
                [
                    (750, "Page 3 - Terms and Conditions"),
                    (700, "Terms and conditions content."),
                    (650, "Legal contact: legal@partnership.org"),
                ],
            ],
            "This is a dummy Partnership Agreement PDF file for testing",
        ),
        (
            f"{data_dir}/graduate-job-example-cover-letter.pdf",
            [
                [
                    (750, "Cover Letter Example"),
                    (700, "Dear Hiring Manager,"),
                    (650, "I am writing to apply for the position."),
                    (600, "Contact: applicant@example.com"),
                    (550, "Phone: (555) 987-6543"),
                    (500, "Address: 456 Job Street, Employment City, EC 54321"),
                    (450, "Sincerely,"),
                    (400, "John Applicant"),
                ],
            ],
            "This is a dummy cover letter PDF file for testing",
        ),
    ]

    reportlab = _load_reportlab()
    if reportlab is None:
        # Best-effort fallback: keep the expected file names present even
        # though the contents are plain text rather than real PDFs.
        print("ReportLab not available, skipping PDF creation")
        for pdf_path, _pages, placeholder in documents:
            with open(pdf_path, "w", encoding="utf-8") as handle:
                handle.write(placeholder)
        print("Created dummy text files instead of PDFs")
        return

    letter, canvas = reportlab
    print(f"Directory exists: {os.path.exists(data_dir)}")
    for pdf_path, pages, _placeholder in documents:
        print(f"Creating PDF: {pdf_path}")
        _write_dummy_pdf(canvas, letter, pdf_path, pages)
        print(f"Created dummy PDF: {pdf_path}")
125
+
126
+
127
def create_dummy_csv():
    """Write the two dummy CSV fixtures: case notes and the Lambeth text."""
    # Case-notes CSV, built row by row.
    case_notes_path = "doc_redaction/example_data/combined_case_notes.csv"
    case_rows = [
        (
            "Client visited for consultation regarding housing issues",
            "John Smith",
            "2024-01-15",
        ),
        (
            "Follow-up appointment scheduled for next week",
            "Jane Doe",
            "2024-01-16",
        ),
        (
            "Documentation submitted for review",
            "Bob Johnson",
            "2024-01-17",
        ),
    ]
    case_frame = pd.DataFrame(case_rows, columns=["Case Note", "Client", "Date"])
    case_frame.to_csv(case_notes_path, index=False)
    print(f"Created dummy CSV: {case_notes_path}")

    # Lambeth CSV: one text snippet per page.
    lambeth_path = (
        "doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
    )
    lambeth_rows = [
        ("Lambeth 2030 vision document content", 1),
        ("Our Future Our Lambeth strategic plan", 2),
        ("Community engagement and development", 3),
    ]
    lambeth_frame = pd.DataFrame(lambeth_rows, columns=["text", "page"])
    lambeth_frame.to_csv(lambeth_path, index=False)
    print(f"Created dummy CSV: {lambeth_path}")
160
+
161
+
162
def create_dummy_word_doc():
    """Write a small .docx fixture; skipped with a message if python-docx is missing."""
    try:
        from docx import Document

        body_lines = (
            "This is a test document for redaction testing.",
            "Contact Information:",
            "Email: test@example.com",
            "Phone: 123-456-7890",
            "Name: John Doe",
            "Address: 123 Test Street, Test City, TC 12345",
        )

        document = Document()
        document.add_heading("Test Document for Redaction", 0)
        for line in body_lines:
            document.add_paragraph(line)

        output_path = (
            "doc_redaction/example_data/Bold minimalist professional cover letter.docx"
        )
        document.save(output_path)
        print("Created dummy Word document")

    except ImportError:
        print("python-docx not available, skipping Word document creation")
183
+
184
+
185
def create_allow_deny_lists():
    """Write the dummy allow lists, deny lists and whole-page redaction list."""
    data_dir = "doc_redaction/example_data"

    # Allow lists: the same words are written to both files.
    allow_frame_source = {"word": ["test", "example", "document"]}
    for allow_name in (
        "test_allow_list_graduate.csv",
        "test_allow_list_partnership.csv",
    ):
        pd.DataFrame(allow_frame_source).to_csv(
            f"{data_dir}/{allow_name}", index=False
        )
    print("Created allow lists")

    # Deny lists: likewise one word set, two target files.
    deny_frame_source = {"word": ["sensitive", "confidential", "private"]}
    for deny_name in (
        "partnership_toolkit_redact_custom_deny_list.csv",
        "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        pd.DataFrame(deny_frame_source).to_csv(
            f"{data_dir}/{deny_name}", index=False
        )
    print("Created deny lists")

    # Pages to redact in full.
    whole_pages = pd.DataFrame({"page": [1, 2]})
    whole_pages.to_csv(
        f"{data_dir}/partnership_toolkit_redact_some_pages.csv", index=False
    )
    print("Created whole page redaction list")
216
+
217
+
218
def create_ocr_output():
    """Write a three-row dummy OCR output CSV (one row per page)."""
    output_path = (
        "doc_redaction/example_data/example_outputs/"
        "doubled_output_joined.pdf_ocr_output.csv"
    )
    page_texts = [
        "This is page 1 content with some text",
        "This is page 2 content with different text",
        "This is page 3 content with more text",
    ]
    # Column order here is the column order of the written CSV.
    columns = {
        "page": [1, 2, 3],
        "text": page_texts,
        "left": [0.1, 0.3, 0.5],
        "top": [0.95, 0.92, 0.88],
        "width": [0.05, 0.02, 0.02],
        "height": [0.01, 0.02, 0.02],
        "line": [1, 2, 3],
    }
    pd.DataFrame(columns).to_csv(output_path, index=False)
    print("Created dummy OCR output CSV")
239
+
240
+
241
def create_dummy_image():
    """Write a white 800x600 JPEG with a few text lines; skipped if PIL is missing."""
    try:
        from PIL import Image, ImageDraw, ImageFont

        picture = Image.new("RGB", (800, 600), color="white")
        pen = ImageDraw.Draw(picture)

        # Try known system fonts in order; fall back to PIL's built-in font
        # when none of them can be loaded.
        font_candidates = (
            ("DejaVuSans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
            ("Arial", "/System/Library/Fonts/Arial.ttf"),
        )
        for label, font_path in font_candidates:
            try:
                font = ImageFont.truetype(font_path, 20)
                break
            except Exception as err:
                print(f"Error loading {label} font: {err}")
        else:
            font = ImageFont.load_default()

        # Text lines, keyed by their vertical position.
        text_lines = (
            (50, "Test Document for Redaction"),
            (100, "Email: test@example.com"),
            (150, "Phone: 123-456-7890"),
            (200, "Name: John Doe"),
            (250, "Address: 123 Test Street"),
        )
        for y, caption in text_lines:
            pen.text((50, y), caption, fill="black", font=font)

        picture.save("doc_redaction/example_data/example_complaint_letter.jpg")
        print("Created dummy image")

    except ImportError:
        print("PIL not available, skipping image creation")
274
+
275
+
276
def main():
    """Run every fixture-creation step, then report what was written.

    After the steps run, every file under ``doc_redaction/example_data`` is
    listed with its size, and the three PDFs the tests depend on are checked
    for existence. Missing files are reported on stdout only.
    """
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    setup_steps = (
        create_directories,
        create_dummy_pdf,
        create_dummy_csv,
        create_dummy_word_doc,
        create_allow_deny_lists,
        create_ocr_output,
        create_dummy_image,
    )
    for step in setup_steps:
        step()

    print("\nTest data setup complete!")
    print("Created files:")
    for folder, _subdirs, filenames in os.walk("doc_redaction/example_data"):
        for filename in filenames:
            listed_path = os.path.join(folder, filename)
            print(f" {listed_path}")
            # Verify the file exists and has content
            if not os.path.exists(listed_path):
                print(" WARNING: File does not exist!")
                continue
            print(f" Size: {os.path.getsize(listed_path)} bytes")

    # PDFs the test suite cannot run without.
    critical_files = [
        "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
        "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
        "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ]

    print("\nVerifying critical test files:")
    for required in critical_files:
        if os.path.exists(required):
            print(f"✅ {required} exists ({os.path.getsize(required)} bytes)")
        else:
            print(f"❌ {required} MISSING!")
317
+
318
+
319
+ if __name__ == "__main__":
320
+ main()
.github/workflow_README.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitHub Actions CI/CD Setup
2
+
3
+ This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
4
+
5
+ ## Workflows Overview
6
+
7
+ ### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
8
+ - **Purpose**: Basic test execution
9
+ - **Triggers**: Push to `dev`, pull requests targeting `dev`, manual dispatch
10
+ - **OS**: Ubuntu Latest
11
+ - **Python**: 3.12
12
+ - **Features**:
13
+ - Installs system dependencies
14
+ - Sets up test data
15
+ - Runs CLI tests
16
+ - Runs pytest
17
+
18
+ ### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
19
+ - **Purpose**: Full CI/CD pipeline
20
+ - **Features**:
21
+ - Linting (Ruff, Black)
22
+ - Unit tests (Python 3.11, 3.12, 3.13)
23
+ - Integration tests
24
+ - Security scanning (Bandit; the Safety scan is currently disabled because it requires a login)
25
+ - Coverage reporting
26
+ - Package building (on main branch)
27
+
28
+ ### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`, archived)
29
+ - **Purpose**: Cross-platform testing
30
+ - **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
31
+ - **Python**: 3.11, 3.12, 3.13
32
+ - **Features**: Tests compatibility across different operating systems
33
+
34
+ ### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
35
+ - **Purpose**: Original test workflow
36
+ - **Features**:
37
+ - Multiple Python versions
38
+ - System dependency installation
39
+ - Test data creation
40
+ - Coverage reporting
41
+
42
+ ## Setup Scripts
43
+
44
+ ### Test Data Setup (`.github/scripts/setup_test_data.py`)
45
+ Creates dummy test files when example data is not available:
46
+ - PDF documents
47
+ - CSV files
48
+ - Word documents
49
+ - Images
50
+ - Allow/deny lists
51
+ - OCR output files
52
+
53
+ ## Usage
54
+
55
+ ### Running Tests Locally
56
+
57
+ ```bash
58
+ # Install dependencies
59
+ pip install -r requirements.txt
60
+ pip install pytest pytest-cov
61
+
62
+ # Setup test data
63
+ python .github/scripts/setup_test_data.py
64
+
65
+ # Run tests
66
+ cd test
67
+ python cli_epilog_suite.py
68
+ ```
69
+
70
+ ### GitHub Actions Triggers
71
+
72
+ 1. **Push to main/dev**: Runs all tests
73
+ 2. **Pull Request**: Runs tests and linting
74
+ 3. **Daily Schedule**: Currently disabled (the 2 AM UTC cron trigger is commented out in `ci.yml`)
75
+ 4. **Manual Trigger**: Can be triggered manually from GitHub
76
+
77
+ ## Configuration
78
+
79
+ ### Environment Variables
80
+ - `PYTHON_VERSION`: Default Python version (3.11)
81
+ - `PYTHONPATH`: Set automatically for test discovery
82
+
83
+ ### Caching
84
+ - Pip dependencies are cached for faster builds
85
+ - Cache key based on the `requirements_lightweight.txt` hash
86
+
87
+ ### Artifacts
88
+ - Test results (JUnit XML)
89
+ - Coverage reports (HTML, XML)
90
+ - Security reports
91
+ - Build artifacts (on main branch)
92
+
93
+ ## Test Data
94
+
95
+ The workflows automatically create test data when example files are missing:
96
+
97
+ ### Required Files Created:
98
+ - `doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
99
+ - `doc_redaction/example_data/combined_case_notes.csv`
100
+ - `doc_redaction/example_data/Bold minimalist professional cover letter.docx`
101
+ - `doc_redaction/example_data/example_complaint_letter.jpg`
102
+ - `doc_redaction/example_data/test_allow_list_*.csv`
103
+ - `doc_redaction/example_data/partnership_toolkit_redact_*.csv`
104
+ - `doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
105
+
106
+ ### Dependencies Installed:
107
+ - **System**: tesseract-ocr, poppler-utils, OpenGL libraries
108
+ - **Python**: All `requirements_lightweight.txt` packages + pytest, reportlab, pillow
109
+
110
+ ## Workflow Status
111
+
112
+ ### Success Criteria:
113
+ - ✅ All tests pass
114
+ - ✅ No linting errors
115
+ - ✅ Security checks pass
116
+ - ✅ Coverage meets threshold (if configured)
117
+
118
+ ### Failure Handling:
119
+ - Tests are designed to skip gracefully if files are missing
120
+ - AWS tests are expected to fail without credentials
121
+ - System dependency failures are handled with fallbacks
122
+
123
+ ## Customization
124
+
125
+ ### Adding New Tests:
126
+ 1. Add test methods to `test/cli_epilog_suite.py` or pytest files under `test/test_*.py`
127
+ 2. Update test data in `setup_test_data.py` if needed
128
+ 3. Tests will automatically run in all workflows
129
+
130
+ ### Modifying Workflows:
131
+ 1. Edit the appropriate `.yml` file
132
+ 2. Test locally first
133
+ 3. Push to trigger the workflow
134
+
135
+ ### Environment-Specific Settings:
136
+ - **Ubuntu**: Full system dependencies
137
+ - **Windows**: Python packages only
138
+ - **macOS**: Homebrew dependencies
139
+
140
+ ## Troubleshooting
141
+
142
+ ### Common Issues:
143
+
144
+ 1. **Missing Dependencies**:
145
+ - Check system dependency installation
146
+ - Verify Python package versions
147
+
148
+ 2. **Test Failures**:
149
+ - Check test data creation
150
+ - Verify file paths
151
+ - Review test output logs
152
+
153
+ 3. **AWS Test Failures**:
154
+ - Expected without credentials
155
+ - Tests are designed to handle this gracefully
156
+
157
+ 4. **System Dependency Issues**:
158
+ - Different OS have different requirements
159
+ - Check the specific OS section in workflows
160
+
161
+ ### Debug Mode:
162
+ Add `--verbose` or `-v` flags to pytest commands for more detailed output.
163
+
164
+ ## Security
165
+
166
+ - Dependency scanning with Safety is currently disabled (the scan now requires a login)
167
+ - Code is scanned with Bandit
168
+ - No secrets are exposed in logs
169
+ - Test data is temporary and cleaned up
170
+
171
+ ## Performance
172
+
173
+ - Tests run in parallel where possible
174
+ - Dependencies are cached
175
+ - Only necessary system packages are installed
176
+ - Test data is created efficiently
177
+
178
+ ## Monitoring
179
+
180
+ - Workflow status is visible in GitHub Actions tab
181
+ - Coverage reports (HTML, XML) are uploaded as workflow artifacts; the Codecov upload step is currently disabled
182
+ - Test results are available as artifacts
183
+ - Security reports are generated and stored
.github/workflows/archive_workflows/multi-os-test.yml ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Multi-OS Test
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ${{ matrix.os }}
16
+ env:
17
+ SHOW_VLM_MODEL_OPTIONS: "False"
18
strategy:
  matrix:
    os: [ubuntu-latest, macos-latest]  # windows-latest, not included as tesseract cannot be installed silently
    python-version: ["3.11", "3.12", "3.13"]
    exclude:
      # Exclude some combinations to reduce CI time.
      # Each exclude entry is matched against a single matrix combination, so
      # every (os, python-version) pair to drop is listed on its own; a list
      # value such as ["3.12", "3.13"] does not match the scalar matrix values.
      #- os: windows-latest
      #  python-version: "3.12"
      #- os: windows-latest
      #  python-version: "3.13"
      - os: macos-latest
        python-version: "3.12"
      - os: macos-latest
        python-version: "3.13"
28
+
29
+ steps:
30
+ - uses: actions/checkout@v6
31
+
32
+ - name: Set up Python ${{ matrix.python-version }}
33
+ uses: actions/setup-python@v6
34
+ with:
35
+ python-version: ${{ matrix.python-version }}
36
+
37
+ - name: Install system dependencies (Ubuntu)
38
+ if: matrix.os == 'ubuntu-latest'
39
+ run: |
40
+ sudo apt-get update
41
+ sudo apt-get install -y \
42
+ tesseract-ocr \
43
+ tesseract-ocr-eng \
44
+ poppler-utils \
45
+ libgl1-mesa-dri \
46
+ libglib2.0-0 \
47
+ libsm6 \
48
+ libxext6 \
49
+ libxrender-dev \
50
+ libgomp1
51
+
52
+ - name: Install system dependencies (macOS)
53
+ if: matrix.os == 'macos-latest'
54
+ run: |
55
+ brew install tesseract poppler
56
+
57
+ - name: Install system dependencies (Windows)
58
+ if: matrix.os == 'windows-latest'
59
+ run: |
60
+ # Create tools directory
61
+ if (!(Test-Path "C:\tools")) {
62
+ mkdir C:\tools
63
+ }
64
+
65
+ # Download and install Tesseract
66
+ $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
67
+ $tesseractInstaller = "C:\tools\tesseract-installer.exe"
68
+ Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
69
+
70
+ # Install Tesseract silently
71
+ Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
72
+
73
+ # Download and extract Poppler
74
+ $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
75
+ $popplerZip = "C:\tools\poppler.zip"
76
+ Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
77
+
78
+ # Extract Poppler
79
+ Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
80
+
81
+ # Add to PATH
82
+ echo "C:\tools\tesseract" >> $env:GITHUB_PATH
83
+ echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
84
+
85
+ # Set environment variables for your application
86
+ echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
87
+ echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
88
+ echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
89
+
90
+ # Verify installation using full paths (since PATH won't be updated in current session)
91
+ & "C:\tools\tesseract\tesseract.exe" --version
92
+ & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
93
+
94
+ - name: Install Python dependencies
95
+ run: |
96
+ python -m pip install --upgrade pip
97
+ pip install -r requirements.txt
98
+ pip install pytest pytest-cov reportlab pillow
99
+
100
+ - name: Download spaCy model
101
+ run: |
102
+ python -m spacy download en_core_web_lg
103
+
104
+ - name: Setup test data
105
+ run: |
106
+ python .github/scripts/setup_test_data.py
107
+
108
+ - name: Run CLI tests
109
+ run: |
110
+ cd test
111
+ python cli_epilog_suite.py
112
+
113
+ - name: Run tests with pytest
114
+ run: |
115
+ pytest test/ -v --tb=short
.github/workflows/ci.yml ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ workflow_dispatch:
9
+ #schedule:
10
+ # Run tests daily at 2 AM UTC
11
+ # - cron: '0 2 * * *'
12
+
13
+ permissions:
14
+ contents: read
15
+ actions: read
16
+ pull-requests: write
17
+ issues: write
18
+
19
+ env:
20
+ PYTHON_VERSION: "3.11"
21
+
22
+ jobs:
23
+ lint:
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - uses: actions/checkout@v6
27
+
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v6
30
+ with:
31
+ python-version: ${{ env.PYTHON_VERSION }}
32
+
33
+ - name: Install dependencies
34
+ run: |
35
+ python -m pip install --upgrade pip
36
+ pip install ruff black
37
+
38
+ - name: Run Ruff linter
39
+ run: ruff check .
40
+
41
+ - name: Run Black formatter check
42
+ run: black --check .
43
+
44
+ test-unit:
45
+ runs-on: ubuntu-latest
46
+ env:
47
+ # Avoid optional VLM/torch import path in tools.run_vlm (not installed in lightweight CI deps)
48
+ SHOW_VLM_MODEL_OPTIONS: "False"
49
+ strategy:
50
+ matrix:
51
+ python-version: [3.11, 3.12, 3.13]
52
+
53
+ steps:
54
+ - uses: actions/checkout@v6
55
+
56
+ - name: Set up Python ${{ matrix.python-version }}
57
+ uses: actions/setup-python@v6
58
+ with:
59
+ python-version: ${{ matrix.python-version }}
60
+
61
+ - name: Cache pip dependencies
62
+ uses: actions/cache@v5
63
+ with:
64
+ path: ~/.cache/pip
65
+ key: ${{ runner.os }}-pip-${{ hashFiles('requirements_lightweight.txt') }}
66
+ restore-keys: |
67
+ ${{ runner.os }}-pip-
68
+
69
+ - name: Install system dependencies
70
+ run: |
71
+ sudo apt-get update
72
+ sudo apt-get install -y \
73
+ tesseract-ocr \
74
+ tesseract-ocr-eng \
75
+ poppler-utils \
76
+ libgl1-mesa-dri \
77
+ libglib2.0-0 \
78
+ libsm6 \
79
+ libxext6 \
80
+ libxrender-dev \
81
+ libgomp1
82
+
83
+ - name: Install Python dependencies
84
+ run: |
85
+ python -m pip install --upgrade pip
86
+ pip install -r requirements_lightweight.txt
87
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
88
+
89
+ - name: Download spaCy model
90
+ run: |
91
+ python -m spacy download en_core_web_lg
92
+
93
+ - name: Setup test data
94
+ run: |
95
+ python .github/scripts/setup_test_data.py
96
+ echo "Setup script completed. Checking results:"
97
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
98
+
99
+ - name: Verify test data files
100
+ run: |
101
+ echo "Checking if critical test files exist:"
102
+ ls -la doc_redaction/example_data/
103
+ echo "Checking for specific PDF files:"
104
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
105
+ echo "Checking file sizes:"
106
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
107
+
108
+ - name: Clean up problematic config files
109
+ run: |
110
+ rm -f config*.py || true
111
+
112
+ - name: Run CLI tests
113
+ run: |
114
+ cd test
115
+ python cli_epilog_suite.py
116
+
117
+ - name: Run tests with pytest (JUnit and coverage)
118
+ run: |
119
+ pytest test/ -v --tb=short \
120
+ --junitxml=test-results.xml \
121
+ --cov=. --cov-config=.coveragerc \
122
+ --cov-report=xml --cov-report=html --cov-report=term
123
+
124
+ #- name: Upload coverage to Codecov - not necessary
125
+ # uses: codecov/codecov-action@v3
126
+ # if: matrix.python-version == '3.11'
127
+ # with:
128
+ # file: ./coverage.xml
129
+ # flags: unittests
130
+ # name: codecov-umbrella
131
+ # fail_ci_if_error: false
132
+
133
+ - name: Upload test results
134
+ uses: actions/upload-artifact@v6
135
+ if: always()
136
+ with:
137
+ name: test-results-python-${{ matrix.python-version }}
138
+ path: |
139
+ test-results.xml
140
+ htmlcov/
141
+ coverage.xml
142
+
143
+ test-integration:
144
+ runs-on: ubuntu-latest
145
+ needs: [lint, test-unit]
146
+ env:
147
+ SHOW_VLM_MODEL_OPTIONS: "False"
148
+
149
+ steps:
150
+ - uses: actions/checkout@v6
151
+
152
+ - name: Set up Python
153
+ uses: actions/setup-python@v6
154
+ with:
155
+ python-version: ${{ env.PYTHON_VERSION }}
156
+
157
+ - name: Install dependencies
158
+ run: |
159
+ python -m pip install --upgrade pip
160
+ pip install -r requirements_lightweight.txt
161
+ pip install pytest pytest-cov reportlab pillow
162
+
163
+ - name: Install system dependencies
164
+ run: |
165
+ sudo apt-get update
166
+ sudo apt-get install -y \
167
+ tesseract-ocr \
168
+ tesseract-ocr-eng \
169
+ poppler-utils \
170
+ libgl1-mesa-dri \
171
+ libglib2.0-0 \
172
+ libsm6 \
173
+ libxext6 \
174
+ libxrender-dev \
175
+ libgomp1
176
+
177
+ - name: Download spaCy model
178
+ run: |
179
+ python -m spacy download en_core_web_lg
180
+
181
+ - name: Setup test data
182
+ run: |
183
+ python .github/scripts/setup_test_data.py
184
+ echo "Setup script completed. Checking results:"
185
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
186
+
187
+ - name: Verify test data files
188
+ run: |
189
+ echo "Checking if critical test files exist:"
190
+ ls -la doc_redaction/example_data/
191
+ echo "Checking for specific PDF files:"
192
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
193
+ echo "Checking file sizes:"
194
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
195
+
196
+ - name: Run integration tests
197
+ run: |
198
+ cd test
199
+ python demo_single_test.py
200
+
201
+ - name: Test CLI help
202
+ run: |
203
+ python cli_redact.py --help
204
+
205
+ - name: Test CLI version
206
+ run: |
207
+ python -c "import sys; print(f'Python {sys.version}')"
208
+
209
+ security:
210
+ runs-on: ubuntu-latest
211
+ steps:
212
+ - uses: actions/checkout@v6
213
+
214
+ - name: Set up Python
215
+ uses: actions/setup-python@v6
216
+ with:
217
+ python-version: ${{ env.PYTHON_VERSION }}
218
+
219
+ - name: Install dependencies
220
+ run: |
221
+ python -m pip install --upgrade pip
222
+ pip install safety bandit
223
+
224
+ #- name: Run safety scan - removed as now requires login
225
+ # run: |
226
+ # safety scan -r requirements.txt
227
+
228
+ - name: Run bandit security check
229
+ run: |
230
+ bandit -r . -f json -o bandit-report.json || true
231
+
232
+ - name: Upload security report
233
+ uses: actions/upload-artifact@v6
234
+ if: always()
235
+ with:
236
+ name: security-report
237
+ path: bandit-report.json
238
+
239
+ build:
240
+ runs-on: ubuntu-latest
241
+ needs: [lint, test-unit]
242
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
243
+
244
+ steps:
245
+ - uses: actions/checkout@v6
246
+
247
+ - name: Set up Python
248
+ uses: actions/setup-python@v6
249
+ with:
250
+ python-version: ${{ env.PYTHON_VERSION }}
251
+
252
+ - name: Install build dependencies
253
+ run: |
254
+ python -m pip install --upgrade pip
255
+ pip install build twine
256
+
257
+ - name: Build package
258
+ run: |
259
+ python -m build
260
+
261
+ - name: Check package
262
+ run: |
263
+ twine check dist/*
264
+
265
+ - name: Upload build artifacts
266
+ uses: actions/upload-artifact@v6
267
+ with:
268
+ name: dist
269
+ path: dist/
.github/workflows/simple-test.yml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Simple Test Run
2
+
3
+ on:
4
+ push:
5
+ branches: [ dev ]
6
+ pull_request:
7
+ branches: [ dev ]
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+ actions: read
13
+
14
+ jobs:
15
+ test:
16
+ runs-on: ubuntu-latest
17
+ env:
18
+ SHOW_VLM_MODEL_OPTIONS: "False"
19
+
20
+ steps:
21
+ - uses: actions/checkout@v6
22
+
23
+ - name: Set up Python 3.12
24
+ uses: actions/setup-python@v6
25
+ with:
26
+ python-version: "3.12"
27
+
28
+ - name: Install system dependencies
29
+ run: |
30
+ sudo apt-get update
31
+ sudo apt-get install -y \
32
+ tesseract-ocr \
33
+ tesseract-ocr-eng \
34
+ poppler-utils \
35
+ libgl1-mesa-dri \
36
+ libglib2.0-0 \
37
+ libsm6 \
38
+ libxext6 \
39
+ libxrender-dev \
40
+ libgomp1
41
+
42
+ - name: Install Python dependencies
43
+ run: |
44
+ python -m pip install --upgrade pip
45
+ pip install -r requirements_lightweight.txt
46
+ pip install pytest pytest-cov reportlab pillow
47
+
48
+ - name: Download spaCy model
49
+ run: |
50
+ python -m spacy download en_core_web_lg
51
+
52
+ - name: Setup test data
53
+ run: |
54
+ python .github/scripts/setup_test_data.py
55
+ echo "Setup script completed. Checking results:"
56
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
57
+
58
+ - name: Verify test data files
59
+ run: |
60
+ echo "Checking if critical test files exist:"
61
+ ls -la doc_redaction/example_data/
62
+ echo "Checking for specific PDF files:"
63
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
64
+ echo "Checking file sizes:"
65
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
66
+
67
+ - name: Run CLI tests
68
+ run: |
69
+ cd test
70
+ python cli_epilog_suite.py
71
+
72
+ - name: Run tests with pytest
73
+ run: |
74
+ pytest test/ -v --tb=short
.github/workflows/sync-pi-agent-space.yml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync Pi agent to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches: [dev]
6
+ paths:
7
+ - "agent-redact/**"
8
+ - "skills/**"
9
+ - "tools/**"
10
+ - "intros/**"
11
+ - "doc_redaction/example_data/**"
12
+ - "AGENTS.md"
13
+ - "config/**"
14
+ - ".github/workflows/sync-pi-agent-space.yml"
15
+ workflow_dispatch:
16
+
17
+ permissions:
18
+ contents: read
19
+
20
+ jobs:
21
+ sync-pi-agent-space:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v6
25
+ with:
26
+ fetch-depth: 1
27
+ lfs: true
28
+
29
+ - name: Install Git LFS
30
+ run: git lfs install
31
+
32
+ - name: Materialize example PDFs (Git LFS)
33
+ run: |
34
+ git lfs pull --include="doc_redaction/example_data/*.pdf"
35
+ for f in \
36
+ doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
37
+ doc_redaction/example_data/graduate-job-example-cover-letter.pdf; do
38
+ if head -1 "$f" | grep -q "^version https://git-lfs.github.com/spec/v1"; then
39
+ echo "Example PDF is still an LFS pointer (not materialized): $f" >&2
40
+ exit 1
41
+ fi
42
+ done
43
+
44
+ - name: Flatten Pi agent Space tree
45
+ run: |
46
+ chmod +x agent-redact/pi-agent/sync_to_space.sh
47
+ agent-redact/pi-agent/sync_to_space.sh /tmp/pi-agent-space
48
+
49
+ - name: Push to Hugging Face Space
50
+ run: |
51
+ COMMIT_MSG=$(git log -1 --pretty=%B)
52
+ echo "Syncing Pi agent Space: seanpedrickcase/agentic_document_redaction"
53
+ cd /tmp/pi-agent-space
54
+ git init -b main
55
+ git config user.name "$HF_USERNAME"
56
+ git config user.email "$HF_EMAIL"
57
+ git add .
58
+ git commit -m "Sync Pi agent Space: $COMMIT_MSG"
59
+ git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/agentic_document_redaction"
60
+ git push --force hf main
61
+ env:
62
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
63
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
64
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+ workflow_dispatch:
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ sync-to-hub:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+ with:
16
+ fetch-depth: 1 # Only get the latest state
17
+ lfs: true # Download actual LFS files so they can be pushed
18
+
19
+ - name: Install Git LFS
20
+ run: git lfs install
21
+
22
+ - name: Recreate repo history (single-commit force push)
23
+ run: |
24
+ # 1. Capture the message BEFORE we delete the .git folder
25
+ COMMIT_MSG=$(git log -1 --pretty=%B)
26
+ echo "Syncing commit message: $COMMIT_MSG"
27
+
28
+ # 2. DELETE the .git folder.
29
+ # This turns the repo into a standard folder of files.
30
+ rm -rf .git
31
+
32
+ # 3. Re-initialize a brand new git repo
33
+ git init -b main
34
+ git config --global user.name "$HF_USERNAME"
35
+ git config --global user.email "$HF_EMAIL"
36
+
37
+ # 4. Re-install LFS (needs to be done after git init)
38
+ git lfs install
39
+
40
+ # 5. Add the remote
41
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
42
+
43
+ # 6. Add all files
44
+ # Since this is a fresh init, Git sees EVERY file as "New"
45
+ git add .
46
+
47
+ # 7. Commit and Force Push
48
+ git commit -m "Sync: $COMMIT_MSG"
49
+ git push --force hf main
50
+ env:
51
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
52
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
53
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
54
+ HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
.github/workflows/sync_to_hf_zero_gpu.yml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub Zero GPU
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+ workflow_dispatch:
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ sync-to-hub-zero-gpu:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+ with:
16
+ fetch-depth: 1 # Only get the latest state
17
+ lfs: true # Download actual LFS files so they can be pushed
18
+
19
+ - name: Install Git LFS
20
+ run: git lfs install
21
+
22
+ # HF Spaces read Space config from README.md front matter. The repo README
23
+ # targets GitHub (e.g. docker); patch only this CI checkout before HF push.
24
+ - name: Apply HF Zero GPU Space README front matter
25
+ run: python3 tools/apply_hf_zero_gpu_readme_frontmatter.py
26
+
27
+ - name: Recreate repo history (single-commit force push)
28
+ run: |
29
+ # 1. Capture the message BEFORE we delete the .git folder
30
+ COMMIT_MSG=$(git log -1 --pretty=%B)
31
+ echo "Syncing commit message: $COMMIT_MSG"
32
+
33
+ # 2. DELETE the .git folder.
34
+ # This turns the repo into a standard folder of files.
35
+ rm -rf .git
36
+
37
+ # 3. Re-initialize a brand new git repo
38
+ git init -b main
39
+ git config --global user.name "$HF_USERNAME"
40
+ git config --global user.email "$HF_EMAIL"
41
+
42
+ # 4. Re-install LFS (needs to be done after git init)
43
+ git lfs install
44
+
45
+ # 5. Add the remote
46
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
47
+
48
+ # 6. Add all files
49
+ # Since this is a fresh init, Git sees EVERY file as "New"
50
+ git add .
51
+
52
+ # 7. Commit and Force Push
53
+ git commit -m "Sync: $COMMIT_MSG"
54
+ git push --force hf main
55
+ env:
56
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
57
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
58
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
59
+ HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
.gitignore ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ *.qmd
5
+ *.json.bak.*
6
+ _quarto.yml
7
+ quarto_site/*
8
+ src/*
9
+ redaction_deps/*
10
+ .venv/*
11
+ examples/*
12
+ processing/*
13
+ input/*
14
+ output/*
15
+ tools/__pycache__/*
16
+ old_code/*
17
+ tesseract/*
18
+ poppler/*
19
+ build/*
20
+ dist/*
21
+ build_deps/*
22
+ logs/*
23
+ usage/*
24
+ feedback/*
25
+ config/*
26
+ !config/pi_agent.env.example
27
+ !config/docker_app_config.env.example
28
+ !config/app_config.env.example
29
+ workspace/*
30
+ user_guide/*
31
+ _extensions/*
32
+ doc_redaction.egg-info/*
33
+ .venv_pypi_test/*
34
+ cdk/config/*
35
+ !cdk/config/app_config.env.example
36
+ !cdk/config/lambda/
37
+ cdk/config/lambda/*
38
+ !cdk/config/lambda/lambda_function.py
39
+ !cdk/config/headless_s3_seed/
40
+ cdk/config/headless_s3_seed/*
41
+ !cdk/config/headless_s3_seed/input/
42
+ cdk/config/headless_s3_seed/input/*
43
+ !cdk/config/headless_s3_seed/input/config/
44
+ cdk/config/headless_s3_seed/input/config/*
45
+ !cdk/config/headless_s3_seed/input/config/example_headless_env_file.env
46
+ cdk/cdk.out/*
47
+ cdk/archive/*
48
+ tld/*
49
+ tmp/*
50
+ docs/*
51
+ .pi/*
52
+ cdk.out/*
53
+ cdk.json
54
+ cdk.context.json
55
+ precheck.context.json
56
+ .quarto/*
57
+ /.quarto/
58
+ /_site/
59
+ test/config/*
60
+ test/feedback/*
61
+ test/input/*
62
+ test/logs/*
63
+ test/output/*
64
+ test/tmp/*
65
+ test/usage/*
66
+ .ruff_cache/*
67
+ model_cache/*
68
+ sanitized_file/*
69
+ src/doc_redaction.egg-info/*
70
+ docker_compose/*
71
+ **/*.quarto_ipynb
72
+ skills/example_prompts/*
73
+ .pi/sessions/
74
+ agent-redact/pi/agent/sessions/
AGENTS.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS.md
2
+
3
+ Context for AI coding agents working on **doc_redaction** (PII redaction for PDFs, images, Word, and tabular files). Human-oriented docs: [README.md](README.md). User guide: [doc_redaction user guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
4
+
5
+ ## Project overview
6
+
7
+ - **Stack**: Python 3.10+, Gradio UI ([app.py](app.py)), optional FastAPI when `RUN_FASTAPI` is enabled, AWS/LLM integrations via [tools/config.py](tools/config.py) and env files under `config/`.
8
+ - **License**: AGPL-3.0-only (see [pyproject.toml](pyproject.toml)). Respect license terms when adding dependencies.
9
+ - **Accuracy**: Outputs are not guaranteed complete; downstream use should assume **human review** of redacted material.
10
+
11
+ ## Cursor skills: redaction workflow (optional)
12
+
13
+ For agents operating the deployed app (Gradio Client, review CSV, `/review_apply`), these repo-local playbooks are a suggested ladder:
14
+
15
+ 0. **[`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md)** — copy-paste user task prompt (Pass 1 default, Pass 2 gated); **user redaction requirements go at the end of the prompt**.
16
+ 1. **[`skills/doc-redaction-app/SKILL.md`](skills/doc-redaction-app/SKILL.md)** — first-pass redaction (`/doc_redact` / `/redact_document`) and downloading artifacts.
17
+ 2. **[`skills/doc-redact-page-review/SKILL.md`](skills/doc-redact-page-review/SKILL.md)** — after outputs exist: **parallel per-page** child agents, merge into one full-document `*_review_file.csv`, **single** `/review_apply` from the parent.
18
+ 3. **[`skills/doc-redaction-modifications/SKILL.md`](skills/doc-redaction-modifications/SKILL.md)** — CSV mechanics, `preview_redaction_boxes`, `/review_apply` patterns, verification, VLM and PyMuPDF fallbacks (single-thread edits and the **technical** reference for page-review children).
19
+
20
+ ## Setup
21
+
22
+ 1. **System**: Install **Tesseract** and **Poppler** (required for OCR/PDF). See [README.md](README.md) (Windows/Linux sections).
23
+ 2. **Python**: Create a venv, then install the project (e.g. `pip install -e ".[dev]"` or follow README).
24
+ 3. **Configuration**: Copy or edit environment/config as described in README / `config/` (e.g. `app_config.env`). Do not commit secrets.
25
+
26
+ ## Run locally
27
+
28
+ - The Gradio/FastAPI entrypoint is [app.py](app.py). With FastAPI enabled, the typical pattern is `uvicorn app:app --host 0.0.0.0 --port 7860` (take the exact host/port from your config).
29
+ - OpenAPI docs: `/docs` when the FastAPI app is mounted.
30
+
31
+ ## Tests
32
+
33
+ - Run from repo root: `pytest` (optional: `pytest test/`).
34
+ - Fix failures related to your changes before opening a PR.
35
+
36
+ ## Line order (local OCR and simple text extraction)
37
+
38
+ Multi-column layouts use shared logic in [`tools/ocr_reading_order.py`](tools/ocr_reading_order.py). Controlled by **`LOCAL_OCR_READING_ORDER`** (`column` default, `legacy` for previous top-left behaviour).
39
+
40
+ ### Local OCR (Paddle/Tesseract)
41
+
42
+ Word boxes are merged into line-level CSV rows in [`combine_ocr_results`](tools/custom_image_analyser_engine.py).
43
+
44
+ - **`column`**: detect text columns, assign line numbers down each column left-to-right; full-width lines (headers) first. Stops cross-column merging that produced wide erroneous lines on multi-column PDFs. **Auto-fallback**: the page is treated as single-column unless a *consecutive cluster* of gutter rows (y-gap between adjacent rows ≤ `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, default `0.06` of page height) has ≥ `OCR_COLUMN_MIN_GUTTER_ROWS` (default `3`) rows **and** the cluster's topmost row is above the footer zone (`OCR_COLUMN_FOOTER_ZONE_FRACTION`, default `0.75`). This prevents isolated header bands (logo | title, 1 gutter row), signature-only blocks at the page bottom (cluster starts at y ≥ 0.75), or the combination of both, from forcing column mode on the single-column body text between them.
45
+ - **`PADDLE_PRESERVE_LINE_BOXES=True`** or **`CONVERT_LINE_TO_WORD_LEVEL=False`** with Paddle: keep Paddle line boxes (skip word split + regrouping); line numbers still use column reading order.
46
+
47
+ ### Simple text extraction (PyMuPDF)
48
+
49
+ [`redact_text_pdf`](tools/file_redaction.py) → [`process_page_to_structured_ocr_pymupdf`](tools/file_redaction.py) calls [`reorder_structured_text_lines`](tools/ocr_reading_order.py) after collecting lines, using **`page.mediabox`** width/height for full-span header detection.
50
+
51
+ `reorder_structured_text_lines` now mirrors `build_line_groups` (local OCR route):
52
+
53
+ 1. **Column-aware sort** (`sort_reading_order` / `assign_layout_boxes` / `detect_column_split_xpoints`) — or legacy top-left for single-column pages.
54
+ 2. **Y-band grouping** (`group_into_lines`) — merges any same-row PyMuPDF lines that were emitted as separate objects (e.g. mixed-font spans) and splits horizontally-disparate boxes via `_finalize_line`. *Column mode only.*
55
+ 3. **Secondary sub-column pass** (`_reorder_lines_column_major`) — ensures correct column-major order when sub-columns sit within a single macro-column. *Column mode only.*
56
+ 4. When a group contains more than one box, constituent boxes are **merged** into a single `OCRResult` (union bbox, joined text, concatenated chars/words).
57
+
58
+ In single-column / legacy mode only step 1 is applied; PyMuPDF lines are pre-formed so no merging is needed.
59
+
60
+ ### Tunables (both routes)
61
+
62
+ `OCR_FULL_SPAN_WIDTH_RATIO`, `OCR_COLUMN_GAP_MIN_FRACTION`, `OCR_COLUMN_GUTTER_MIN_FRACTION`, `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` (default `0.015` — fine-grained gutter scan in `assign_layout_boxes`; lower = detects narrower sub-column boundaries), `OCR_COLUMN_MIN_GUTTER_ROWS`, `OCR_COLUMN_MAX_BOX_HEIGHT_RATIO`, `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, `OCR_COLUMN_FOOTER_ZONE_FRACTION`, `OCR_LINE_SPLIT_GAP_FRACTION` (default 0.025 — horizontal gap fraction that forces a line split; must be below the narrowest column gutter, ~0.030 for two-page spreads; also used as the gap threshold for the secondary sub-column sort in `build_line_groups`), `OCR_LINE_Y_THRESHOLD_FRACTION` (default 0.013 — row-alignment tolerance as a fraction of page height; reduced from 0.015 to correctly separate tightly-set 10 pt body text whose row spacing is ~0.014), `OCR_LINE_Y_THRESHOLD_MIN_PX`.
63
+
64
+ **Sub-column ordering** (`build_line_groups`): after the primary word-level column sort, a second pass (`_reorder_lines_column_major`) clusters the produced line groups by their leftmost x-position using `OCR_LINE_SPLIT_GAP_FRACTION` as the gap threshold. This ensures that adjacent narrow sub-columns whose word-level centre gap is below `column_gap_threshold` (e.g. two columns on a spread where each page is already one macro-column) are still output in left-to-right column-major order rather than interleaved by y-position.
65
+
66
+ **Fine-grained gutter-based column assignment** (`assign_layout_boxes`): before falling back to centre-gap clustering, `detect_column_split_xpoints` scans the page for structural gutters at the finer `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` threshold (default 0.015). Each qualifying gutter cluster produces a `(split_x, y_min)` pair — the split point is only applied to boxes whose `top ≥ y_min`, preventing a narrow sub-column gutter (visible only in the lower two-column section) from mis-splitting a full-width introductory paragraph that sits above it. This correctly separates narrow adjacent columns (e.g. 1.9 % gutter on a two-page spread) without fragmenting full-width headings or paragraphs.
67
+
68
+ Changing line order affects PII page text, duplicate-page detection, and review CSV line indices on multi-column documents; re-review after upgrading.
69
+
70
+ ## Agentic / programmatic access (two surfaces)
71
+
72
+ ### 1. FastAPI Agent API (recommended for LLM agents: small JSON bodies)
73
+
74
+ When `RUN_FASTAPI` is true, routes are mounted under **`/agent`** ([agent_routes.py](agent_routes.py)).
75
+
76
+ - **Catalog**: `GET /agent/operations` — maps each Gradio `api_name` to an HTTP path and notes whether the route is implemented via CLI or returns HTTP 501 for Gradio-only flows.
77
+ - **Implemented POST routes** (CLI- or [tools/simplified_api.py](tools/simplified_api.py)-backed where noted):
78
+ `redact_document`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_pdfs`, `combine_review_csvs`, `export_review_redaction_overlay`, `export_review_page_ocr_visualisation`, `apply_review_redactions`, **`verify_redaction_coverage`** (Pass 1 QA: `must_redact` / `must_not_redact` regex lists, optional `redacted_pdf_path`, optional `auto_prune_suspicious` + `pruned_output_path`; returns `pass_strict`, `pass_with_cleanup`, `pages_flagged_for_vlm`, `pages_needing_csv_cleanup`), **`word_level_ocr_text_search`** (headless word OCR search with optional review-box overlap flags).
79
+
80
+ **Optional post-redaction Pass 1 QA (main app / CLI):** When `POST_REDACT_PASS1_QA=True` in [`tools/config.py`](tools/config.py) (or `config/app_config.env`), initial redaction emits `*_coverage_report.json` beside the review CSV and optionally `*_review_file_pruned.csv` (sibling, when `POST_REDACT_PASS1_AUTO_PRUNE=True`). Uses deny/allow lists and/or `POST_REDACT_PASS1_MUST_REDACT_PATH` / `POST_REDACT_PASS1_MUST_NOT_REDACT_PATH`. CLI overrides: `--post-redact-pass1-qa`, `--post-redact-pass1-auto-prune`. This is pre-review-apply sanity QA only — agent Pass 1 (policy edits + `/review_apply`) remains separate.
81
+ Note: on Gradio ([app.py](app.py)), the Review-tab visual exports use `api_name` **`page_redaction_review_image`** and **`page_ocr_review_image`**; the **`/agent`** routes above keep the explicit `export_review_*` names for the same operations.
82
+ - **Gradio-only stubs** (501 + JSON hint): `load_and_prepare_documents_or_data`.
83
+ - **Auth**: If `AGENT_API_KEY` is set in the environment, send header `X-Agent-API-Key` with that value.
84
+ - **Paths**: Inputs must resolve to files under the repo root, `INPUT_FOLDER`, or `OUTPUT_FOLDER` (see router validation).
85
+
86
+ Implementation uses **`cli_redact.main(direct_mode_args=...)`** where a CLI task exists (same behaviour as [cli_redact.py](cli_redact.py)); `apply_review_redactions` calls [tools/simplified_api.py](tools/simplified_api.py) instead.
87
+
88
+ ### 2. Gradio Client API (e.g. Hugging Face Spaces)
89
+
90
+ For remote Spaces or any Gradio deployment exposing the HTTP API:
91
+
92
+ - **Schema**: `GET https://<host>/gradio_api/info`
93
+ - **Call**: `POST https://<host>/gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order matches the named endpoint’s component list).
94
+ - **Poll**: `GET https://<host>/gradio_api/call/{api_name}/{event_id}`
95
+ - **Hugging Face**: `Authorization: Bearer $HF_TOKEN`
96
+
97
+ Named `api_name` values in this app include: `redact_document`, `load_and_prepare_documents_or_data`, `apply_review_redactions`, **`doc_redact`** (simple `gr.api`: one PDF/image + optional OCR/PII knobs; returns `(output_paths, message)`; `api_name='/doc_redact'`; parameters include `document_file`, `redact_entities`, `output_dir`, `ocr_method`, `pii_method`, `allow_list`, `deny_list`, `page_min`, `page_max`, **`handwrite_signature_checkbox`** — AWS Textract extraction options such as `Extract handwriting` / `Extract signatures`), **`review_apply`** (simple `gr.api`: PDF + `*_review_file.csv`; returns `(output_paths, message)`; `api_name='/review_apply'`), **`preview_boxes`** (simple `gr.api`: PDF + `*_review_file.csv`; renders proposed boxes onto the original PDF and returns `(zip_path, message)` — use to verify coordinates *before* calling `review_apply`, no redaction applied; `api_name='/preview_boxes'`), **`pdf_summarise`** (simple `gr.api`: PDF + optional summarisation/OCR knobs; returns `(output_paths, status_message, summary_text)`; `api_name='/pdf_summarise'`), **`tabular_redact`** (simple `gr.api`: one tabular file (CSV/XLSX/Parquet/DOCX) + optional knobs; returns `(output_paths, message)`; `api_name='/tabular_redact'`), **`page_redaction_review_image`** (short review overlay export; `api_name='/page_redaction_review_image'`), **`page_ocr_review_image`** (short OCR visualisation export; `api_name='/page_ocr_review_image'`), `word_level_ocr_text_search`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_csvs`, `combine_review_pdfs`. The matching **`POST /agent`** names for those two visual exports are `export_review_redaction_overlay` and `export_review_page_ocr_visualisation` (§1). 
Many endpoints require **many positional arguments** (full Gradio state); prefer the short `gr.api` routes above or **`POST /agent/apply_review_redactions`** where applicable instead of building the full `data` array from `/gradio_api/info`.
98
+
99
+ ## CLI parity
100
+
101
+ For scripting and tests, `python cli_redact.py` with flags is authoritative; programmatic merges use `get_cli_default_args_dict()` in [cli_redact.py](cli_redact.py).
102
+
103
+ ## Security and data handling
104
+
105
+ - Do not commit API keys, tokens, or customer data.
106
+ - Treat paths as untrusted outside validated roots (see [tools/secure_path_utils.py](tools/secure_path_utils.py)).
107
+ - Optional `instruction` / LLM fields must not be passed into shell or unconstrained config keys.
108
+
109
+ ## Conventions for PRs
110
+
111
+ - Keep changes focused; avoid drive-by refactors.
112
+ - Match existing naming and patterns in [app.py](app.py) and [tools/](tools/).
113
+ - Update tests when behaviour changes; run `pytest` before merge.
Dockerfile ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage 1: Build dependencies and download models
2
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
3
+
4
+ # Install system dependencies
5
+ RUN apt-get update \
6
+ && apt-get upgrade -y \
7
+ && apt-get install -y --no-install-recommends \
8
+ g++ \
9
+ make \
10
+ cmake \
11
+ unzip \
12
+ libcurl4-openssl-dev \
13
+ git \
14
+ && pip install --upgrade pip \
15
+ && apt-get clean \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ WORKDIR /src
19
+
20
+ COPY requirements_lightweight.txt .
21
+
22
+ RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
23
+
24
+ ARG INSTALL_GRADIO_MCP=False
25
+ ENV INSTALL_GRADIO_MCP=${INSTALL_GRADIO_MCP}
26
+
27
+ RUN if [ "$INSTALL_GRADIO_MCP" = "True" ]; then \
28
+ pip install --verbose --no-cache-dir --force-reinstall --target=/install "gradio[mcp]>=6.16.0"; \
29
+ fi
30
+
31
+ # Optionally install PaddleOCR if the INSTALL_PADDLEOCR build argument is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so install the CPU-only version of PaddleOCR if you want to use GPU-enabled PyTorch.
32
+
33
+ ARG INSTALL_PADDLEOCR=False
34
+ ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
35
+
36
+ ARG PADDLE_GPU_ENABLED=False
37
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
38
+
39
+ RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
40
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
41
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
42
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.7.0"; \
43
+ elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
44
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
45
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
46
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.7.0" && \
47
+ pip install --verbose --no-cache-dir --target=/install "torch<=2.9.1" --index-url https://download.pytorch.org/whl/cu129 && \
48
+ pip install --verbose --no-cache-dir --target=/install "torchvision<=0.24.1" --index-url https://download.pytorch.org/whl/cu129 && \
49
+ pip install --verbose --no-cache-dir --target=/install "transformers<=5.12.0"; \
50
+ fi
51
+
52
+ ARG INSTALL_VLM=False
53
+ ENV INSTALL_VLM=${INSTALL_VLM}
54
+
55
+ ARG TORCH_GPU_ENABLED=False
56
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
57
+
58
+ # Optionally install VLM/LLM packages if the INSTALL_VLM build argument is set to True.
59
+ RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
60
+ pip install --verbose --no-cache-dir --target=/install \
61
+ "torch==2.9.1+cpu" \
62
+ "torchvision==0.24.1+cpu" \
63
+ "transformers<=5.12.0" \
64
+ "accelerate<=1.13.0" \
65
+ "bitsandbytes<=0.49.2" \
66
+ "sentencepiece<=0.2.1" \
67
+ --extra-index-url https://download.pytorch.org/whl/cpu; \
68
+ elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
69
+ pip install --verbose --no-cache-dir --target=/install "torch<=2.9.1" --index-url https://download.pytorch.org/whl/cu129 && \
70
+ pip install --verbose --no-cache-dir --target=/install "torchvision<=0.24.1" --index-url https://download.pytorch.org/whl/cu129 && \
71
+ pip install --verbose --no-cache-dir --target=/install \
72
+ "transformers<=5.12.0" \
73
+ "accelerate<=1.13.0" \
74
+ "bitsandbytes<=0.49.2" \
75
+ "sentencepiece<=0.2.1" && \
76
+ pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
77
+ pip install --verbose --no-cache-dir --target=/install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
78
+ pip install --verbose --no-cache-dir --target=/install https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
79
+ fi
80
+
81
+ # ===================================================================
82
+ # Stage 2: A common base for both Lambda and Gradio
83
+ # ===================================================================
84
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
85
+
86
+ # MUST re-declare ARGs in every stage where they are used in RUN commands
87
+ ARG TORCH_GPU_ENABLED=False
88
+ ARG PADDLE_GPU_ENABLED=False
89
+
90
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
91
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
92
+
93
+ RUN apt-get update && apt-get install -y --no-install-recommends \
94
+ tesseract-ocr \
95
+ poppler-utils \
96
+ libgl1 \
97
+ libglib2.0-0 && \
98
+ if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
99
+ apt-get install -y --no-install-recommends libgomp1; \
100
+ fi && \
101
+ apt-get clean && rm -rf /var/lib/apt/lists/*
102
+
103
+ ENV APP_HOME=/home/user
104
+
105
+ # Set env variables for Gradio & other apps
106
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
107
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
108
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
109
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
110
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
111
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
112
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
113
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
114
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
115
+ TESSERACT_DATA_FOLDER=/usr/share/tessdata \
116
+ GRADIO_SERVER_NAME=0.0.0.0 \
117
+ GRADIO_SERVER_PORT=7860 \
118
+ PATH=$APP_HOME/.local/bin:$PATH \
119
+ PYTHONPATH=$APP_HOME/app \
120
+ PYTHONUNBUFFERED=1 \
121
+ PYTHONDONTWRITEBYTECODE=1 \
122
+ GRADIO_ALLOW_FLAGGING=never \
123
+ GRADIO_NUM_PORTS=1 \
124
+ GRADIO_ANALYTICS_ENABLED=False
125
+
126
+ # Copy Python packages from the builder stage
127
+ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
128
+ COPY --from=builder /install/bin /usr/local/bin/
129
+
130
+ # Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
131
+ # passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
132
+ RUN pip install --no-cache-dir "protobuf<=7.34.0"
133
+
134
+ # The spaCy English pipeline (en_core_web_lg) is not a normal PyPI dependency; bundle it in the image so runtime works offline.
135
+ # Placed before COPY app code so application changes do not invalidate this layer.
136
+ RUN python -m spacy download en_core_web_lg
137
+
138
+ # Copy your application code and entrypoint
139
+ COPY . ${APP_HOME}/app
140
+ COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
141
+ # Fix line endings and set execute permissions
142
+ RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
143
+ && chmod +x ${APP_HOME}/app/entrypoint.sh
144
+
145
+ WORKDIR ${APP_HOME}/app
146
+
147
+ # ===================================================================
148
+ # FINAL Stage 3: The Lambda Image (runs as root for simplicity)
149
+ # ===================================================================
150
+ FROM base AS lambda
151
+ # Set runtime ENV for Lambda mode
152
+ ENV APP_MODE=lambda
153
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
154
+ CMD ["lambda_entrypoint.lambda_handler"]
155
+
156
+ # ===================================================================
157
+ # FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
158
+ # ===================================================================
159
+ FROM base AS gradio
160
+ # Set runtime ENV for Gradio mode
161
+ ENV APP_MODE=gradio
162
+
163
+ # Create non-root user
164
+ RUN useradd -m -u 1000 user
165
+
166
+ # Create the base application directory and set its ownership
167
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
168
+
169
+ # Create required sub-folders within the app directory and set their permissions
170
+ # This ensures these specific directories are owned by 'user'
171
+ RUN mkdir -p \
172
+ ${APP_HOME}/app/output \
173
+ ${APP_HOME}/app/input \
174
+ ${APP_HOME}/app/logs \
175
+ ${APP_HOME}/app/usage \
176
+ ${APP_HOME}/app/feedback \
177
+ ${APP_HOME}/app/config \
178
+ && chown user:user \
179
+ ${APP_HOME}/app/output \
180
+ ${APP_HOME}/app/input \
181
+ ${APP_HOME}/app/logs \
182
+ ${APP_HOME}/app/usage \
183
+ ${APP_HOME}/app/feedback \
184
+ ${APP_HOME}/app/config \
185
+ && chmod 755 \
186
+ ${APP_HOME}/app/output \
187
+ ${APP_HOME}/app/input \
188
+ ${APP_HOME}/app/logs \
189
+ ${APP_HOME}/app/usage \
190
+ ${APP_HOME}/app/feedback \
191
+ ${APP_HOME}/app/config
192
+
193
+ # Now create and set permissions on /tmp and /var/tmp (and their subdirectories), plus the PaddleX, spaCy and tessdata directories
194
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
195
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
196
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
197
+ && chmod 700 ${XDG_CACHE_HOME} \
198
+ && mkdir -p ${APP_HOME}/.paddlex \
199
+ && chown user:user ${APP_HOME}/.paddlex \
200
+ && chmod 755 ${APP_HOME}/.paddlex \
201
+ && mkdir -p ${APP_HOME}/.local/share/spacy/data \
202
+ && chown user:user ${APP_HOME}/.local/share/spacy/data \
203
+ && chmod 755 ${APP_HOME}/.local/share/spacy/data \
204
+ && mkdir -p /usr/share/tessdata \
205
+ && chown user:user /usr/share/tessdata \
206
+ && chmod 755 /usr/share/tessdata
207
+
208
+ # Apply user ownership to all files in the home directory
209
+ RUN chown -R user:user /home/user
210
+
211
+ # Set permissions for Python executable
212
+ RUN chmod 755 /usr/local/bin/python
213
+
214
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
215
+ VOLUME ["/tmp/matplotlib_cache"]
216
+ VOLUME ["/tmp/gradio_tmp"]
217
+ VOLUME ["/tmp/tld"]
218
+ VOLUME ["/home/user/app/output"]
219
+ VOLUME ["/home/user/app/input"]
220
+ VOLUME ["/home/user/app/logs"]
221
+ VOLUME ["/home/user/app/usage"]
222
+ VOLUME ["/home/user/app/feedback"]
223
+ VOLUME ["/home/user/app/config"]
224
+ VOLUME ["/home/user/.paddlex"]
225
+ VOLUME ["/home/user/.local/share/spacy/data"]
226
+ VOLUME ["/usr/share/tessdata"]
227
+ VOLUME ["/tmp"]
228
+ VOLUME ["/var/tmp"]
229
+
230
+ USER user
231
+
232
+ EXPOSE $GRADIO_SERVER_PORT
233
+
234
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
235
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published by
637
+ the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
MANIFEST.in ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ recursive-include doc_redaction/assets *.png
2
+ recursive-include doc_redaction/example_data *
3
+ recursive-include intros *.txt
4
+
README.md ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document redaction
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: true
9
+ license: agpl-3.0
10
+ short_description: OCR / redact PDF documents and tabular data
11
+ ---
12
+ # Document redaction (doc_redaction)
13
+
14
+ <a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
15
+
16
+ Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
17
+
18
+ ---
19
+
20
+ ## 🚀 Quick Start - Installation and first run
21
+
22
+ Follow these instructions to get the document redaction application running on your local machine.
23
+
24
+ ### 1. Installation
25
+
26
+ #### Option 1 - Recommended: Install from source repo
27
+
28
+ Clone the repository and install in editable mode:
29
+
30
+ ```bash
31
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
32
+ cd doc_redaction
33
+ pip install -e .
34
+ ```
35
+
36
+ ##### Install extras (Paddle or Transformers/Torch VLM)
37
+
38
+ To install with PaddleOCR (with a transformers backend as of v2.4.0):
39
+
40
+ ```bash
41
+ pip install -e ".[paddle]"
42
+ ```
43
+
44
+
45
+ If you want to run VLMs / LLMs with the transformers package:
46
+
47
+ ```bash
48
+ pip install -e ".[vlm]"
49
+ ```
50
+
51
+
52
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the GPU-enabled version of torch, it is advised to install the following version:
53
+ ```bash
54
+ pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
55
+ ```
56
+
57
+ #### Option 2 - Install from PyPI
58
+
59
+ Create a virtual environment (recommended) and install **doc_redaction**.
60
+
61
+ ```bash
62
+ python -m venv venv
63
+ # Windows:
64
+ .\venv\Scripts\activate
65
+ # macOS/Linux:
66
+ source venv/bin/activate
67
+ ```
68
+
69
+ The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
70
+
71
+ ```bash
72
+ pip install doc_redaction
73
+ ```
74
+
75
+ Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
76
+
77
+ ```bash
78
+ pip install "doc_redaction[paddle]"
79
+ ```
80
+
81
+ For running VLMs / LLMs with the transformers package:
82
+
83
+ ```bash
84
+ pip install "doc_redaction[vlm]"
85
+ ```
86
+
87
+ For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
88
+
89
+ **Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
90
+
91
+ ```bash
92
+ python -m app
93
+ ```
94
+
95
+ **Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
96
+
97
+ - It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
98
+ - It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
99
+ - The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
100
+
101
+ In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
102
+
103
+ #### Option 3 - Docker installation
104
+
105
+ The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
106
+
107
+ ##### With Llama.cpp / vLLM inference server
108
+
109
+ The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
110
+
111
+ For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
112
+
113
+ You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
114
+
115
+ ##### Without Llama.cpp / vLLM inference server
116
+
117
+ If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
118
+
119
+ The main [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) produces two final images via build targets: **`gradio`** (default web UI, non-root user, named volumes for writable paths) and **`lambda`** (AWS Lambda handler). Build examples:
120
+
121
+ ```bash
122
+ docker build -f Dockerfile --target gradio -t doc-redaction-gradio .
123
+ docker build -f Dockerfile --target lambda -t doc-redaction-lambda .
124
+ ```
125
+
126
+ ##### Pi agent (agentic redaction)
127
+
128
+ The [Pi](https://github.com/earendil-works/pi) orchestration UI uses a separate multi-stage image at [agent-redact/pi-agent/Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/pi-agent/Dockerfile). It shares the same Python 3.12 slim base as the main app; a small Node stage installs the `pi` CLI, which is copied into the runtime image.
129
+
130
+ | Build target | Typical use |
131
+ |--------------|-------------|
132
+ | **`dev`** | Local development with [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml) — the repo is bind-mounted; only Pi CLI + Python deps are in the image. |
133
+ | **`runtime`** | [Hugging Face Space](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) and AWS ECS — agent code is baked in; runs as non-root `user` with **named volumes** for workspace, uploads, and session dirs (read-only root filesystem friendly). |
134
+
135
+ Build from the repository root:
136
+
137
+ ```bash
138
+ docker build -f agent-redact/pi-agent/Dockerfile --target dev -t pi-agent-dev .
139
+ docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-runtime .
140
+ ```
141
+
142
+ For llama.cpp + Pi together, see the compose examples at the top of [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml). Further detail: [agent-redact/README.md](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/README.md).
143
+
144
+ #### Option 4 - Installation on AWS with CDK
145
+
146
+ The repo contains a [CDK folder](https://github.com/seanpedrick-case/doc_redaction/tree/main/cdk) that contains all the files you need to set up and deploy to an AWS environment with CDK. The installation wizard is [cdk_install.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cdk/cdk_install.py), which provides a number of options to deploy the Document Redaction App to AWS for demonstration or production. More details on CDK deployment can be found in the [Installation Guide](https://seanpedrick-case.github.io/doc_redaction/src/installation_guide.html).
147
+
148
+ ### 2. Install prerequisites: Tesseract and Poppler
149
+
150
+ This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). If not using a Docker-based deployment, you will need to install them on your system before proceeding. To run the Document Redaction app successfully, these tools need to be installed and either 1. added to PATH, or 2. be in a folder that is directly referenced in the config/app_config.env file with the variables TESSERACT_FOLDER and POPPLER_FOLDER (defined [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py) if you want to see the code). The instructions below will guide you through different ways to install these dependencies.
151
+
152
+ ---
153
+
154
+ #### Automated dependency setup (recommended)
155
+
156
+ If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
157
+
158
+ You need the installer script available first, which means either:
159
+
160
+ - **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
161
+ - **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
162
+
163
+ From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
164
+
165
+ ```bash
166
+ python -m doc_redaction.install_deps
167
+ ```
168
+
169
+ This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
170
+
171
+ To just check whether your machine can already see the tools:
172
+
173
+ ```bash
174
+ python -m doc_redaction.install_deps --verify-only
175
+ ```
176
+
177
+ #### **On Windows**
178
+
179
+ If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
180
+
181
+ 1. **Install Tesseract OCR:**
182
+ * Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
183
+ * Run the installer.
184
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
185
+
186
+
187
+ 2. **Install Poppler:**
188
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
189
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
190
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
191
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
192
+ * Click the "Environment Variables..." button.
193
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
194
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
195
+ * Click OK on all windows to save the changes.
196
+
197
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
198
+
+ ---
199
+
200
+ #### **On Linux (Debian/Ubuntu)**
201
+
202
+ Open your terminal and run the following command to install Tesseract and Poppler:
203
+
204
+ ```bash
205
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
206
+ ```
207
+
208
+ #### **On Linux (Fedora/CentOS/RHEL)**
209
+
210
+ Open your terminal and use the `dnf` or `yum` package manager:
211
+
212
+ ```bash
213
+ sudo dnf install -y tesseract poppler-utils
214
+ ```
215
+ ---
216
+
217
+ ### 3. Run the Application
218
+
219
+ With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
220
+
221
+ ```bash
222
+ python app.py
223
+ ```
224
+
225
+ After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
226
+
227
+ Open this URL in your web browser to use the document redaction tool.
228
+
229
+ #### Command line interface
230
+
231
+ For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321)
232
+
233
+ If you installed from **PyPI**, use the installed console script:
234
+
235
+ ```bash
236
+ cli_redact --help
237
+ ```
238
+
239
+ From a **repository checkout**, you can also run:
240
+
241
+ ```bash
242
+ python cli_redact.py --help
243
+ ```
244
+
245
+ #### Python package commands
246
+
247
+ For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
248
+
249
+ ---
250
+
251
+
252
+ ### 4. ⚙️ Configuration (Optional)
253
+
254
+ You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in `tools/config.py`, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
255
+
256
+ To get started:
257
+ 1. Copy `config/app_config.env.example` to `config/app_config.env`.
258
+ 2. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
259
+
260
+ If you do not create this file, the application will run with default settings.
261
+
262
+ #### Configuration Breakdown
263
+
264
+ Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
265
+
266
+ ---
267
+
268
+ #### **Local & General Settings (No AWS Required)**
269
+
270
+ These settings are useful for all users, regardless of whether you are using AWS.
271
+
272
+ * `TESSERACT_FOLDER` / `POPPLER_FOLDER`
273
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
274
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
275
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
276
+
277
+ * `TESSERACT_DATA_FOLDER`
278
+ * If Tesseract runs but you see an error like `Error opening data file ./eng.traineddata` or `Tesseract couldn't load any languages`, this is usually because it can't find the `tessdata/` language files.
279
+ * Set this to the folder that contains `eng.traineddata` (typically a `tessdata` directory).
280
+ * **Examples (Windows):** `TESSERACT_DATA_FOLDER=C:/Program Files/Tesseract-OCR/tessdata`
281
+
282
+ * `SHOW_LANGUAGE_SELECTION=True`
283
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
284
+
285
+ * `DEFAULT_LOCAL_OCR_MODEL=tesseract`
286
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two: a first pass through the redactions is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence. `paddle` only returns whole-line text extraction, and so will only work for OCR, not redaction.
287
+
288
+ * `SESSION_OUTPUT_FOLDER=False`
289
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
290
+
291
+ * `DISPLAY_FILE_NAMES_IN_LOGS=False`
292
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
293
+
294
+ ---
295
+
296
+ #### **AWS-Specific Settings**
297
+
298
+ These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
299
+
300
+ * `RUN_AWS_FUNCTIONS=True`
301
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
302
+
303
+ * **UI Options:**
304
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
305
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
306
+
307
+ * **Core AWS Configuration:**
308
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
309
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
310
+
311
+ * **AWS Logging:**
312
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
313
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
314
+
315
+ * **Advanced AWS Textract Features:**
316
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
317
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
318
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
319
+
320
+ * **Cost Tracking (for internal accounting):**
321
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
322
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
323
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
324
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
325
+
326
+ Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
327
+
328
+ ## For agents (API quickstart)
329
+
330
+ If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
331
+
332
+ - **Discover schema**: `GET /gradio_api/info`
333
+ - **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
334
+ - **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
335
+ - **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
336
+ - **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
337
+
338
+ ### Choose the correct route (prefer short `gr.api` endpoints)
339
+
340
+ Fetch `/gradio_api/info` and then prefer the simplest route that exists:
341
+
342
+ - **Apply edited review CSV to a PDF**: `/review_apply`
343
+ - **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
344
+ - **Summarise a PDF**: `/pdf_summarise`
345
+ - **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
346
+
347
+ If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
348
+
349
+ ### Common gotchas
350
+
351
+ - **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
352
+ - **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
353
+ - **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
354
+
355
+ ### Optional: MCP server
356
+
357
+ If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
358
+
359
+ **Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
360
+
361
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
362
+
363
+ For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
364
+
365
+ Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include in or exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
366
+
367
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
README_PYPI.md ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Document redaction (doc_redaction)
2
+
3
+ <a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
4
+
5
+ Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
6
+
7
+ ---
8
+
9
+ ## 🚀 Quick Start - Installation and first run
10
+
11
+ Follow these instructions to get the document redaction application running on your local machine.
12
+
13
+ ### 1. Installation
14
+
15
+ #### Option 1 - Recommended: Install from source repo
16
+
17
+ Clone the repository and install in editable mode:
18
+
19
+ ```bash
20
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
21
+ cd doc_redaction
22
+ pip install -e .
23
+ ```
24
+
25
+ ##### Install extras (Paddle or Transformers/Torch VLM)
26
+
27
+ To install with PaddleOCR (with a transformers backend as of v2.4.0):
28
+
29
+ ```bash
30
+ pip install -e ".[paddle]"
31
+ ```
32
+
33
+
34
+ If you want to run VLMs / LLMs with the transformers package:
35
+
36
+ ```bash
37
+ pip install -e ".[vlm]"
38
+ ```
39
+
40
+
41
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the GPU-enabled version of torch, it is advised to install the following version:
42
+ ```bash
43
+ pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
44
+ ```
45
+
46
+ #### Option 2 - Install from PyPI
47
+
48
+ Create a virtual environment (recommended) and install **doc_redaction**.
49
+
50
+ ```bash
51
+ python -m venv venv
52
+ # Windows:
53
+ .\venv\Scripts\activate
54
+ # macOS/Linux:
55
+ source venv/bin/activate
56
+ ```
57
+
58
+ The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
59
+
60
+ ```bash
61
+ pip install doc_redaction
62
+ ```
63
+
64
+ Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
65
+
66
+ ```bash
67
+ pip install "doc_redaction[paddle]"
68
+ ```
69
+
70
+ For running VLMs / LLMs with the transformers package:
71
+
72
+ ```bash
73
+ pip install "doc_redaction[vlm]"
74
+ ```
75
+
76
+ For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
77
+
78
+ **Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
79
+
80
+ ```bash
81
+ python -m app
82
+ ```
83
+
84
+ **Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
85
+
86
+ - It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
87
+ - It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
88
+ - The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
89
+
90
+ In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
91
+
92
+ #### Option 3 - Docker installation
93
+
94
+ The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
95
+
96
+ ##### With Llama.cpp / vLLM inference server
97
+
98
+ The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
99
+
100
+ For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
101
+
102
+ You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
103
+
104
+ ##### Without Llama.cpp / vLLM inference server
105
+
106
+ If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR, and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
107
+
108
+ The main [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) produces two final images via build targets: **`gradio`** (default web UI, non-root user, named volumes for writable paths) and **`lambda`** (AWS Lambda handler). Build examples:
109
+
110
+ ```bash
111
+ docker build -f Dockerfile --target gradio -t doc-redaction-gradio .
112
+ docker build -f Dockerfile --target lambda -t doc-redaction-lambda .
113
+ ```
114
+
115
+ ##### Pi agent (agentic redaction)
116
+
117
+ The [Pi](https://github.com/earendil-works/pi) orchestration UI uses a separate multi-stage image at [agent-redact/pi-agent/Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/pi-agent/Dockerfile). It shares the same Python 3.12 slim base as the main app; a small Node stage installs the `pi` CLI, which is copied into the runtime image.
118
+
119
+ | Build target | Typical use |
120
+ |--------------|-------------|
121
+ | **`dev`** | Local development with [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml) — the repo is bind-mounted; only Pi CLI + Python deps are in the image. |
122
+ | **`runtime`** | [Hugging Face Space](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) and AWS ECS — agent code is baked in; runs as non-root `user` with **named volumes** for workspace, uploads, and session dirs (read-only root filesystem friendly). |
123
+
124
+ Build from the repository root:
125
+
126
+ ```bash
127
+ docker build -f agent-redact/pi-agent/Dockerfile --target dev -t pi-agent-dev .
128
+ docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-runtime .
129
+ ```
130
+
131
+ For llama.cpp + Pi together, see the compose examples at the top of [docker-compose_llama_agentic.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama_agentic.yml). Further detail: [agent-redact/README.md](https://github.com/seanpedrick-case/doc_redaction/blob/main/agent-redact/README.md).
132
+
133
+ #### Option 4 - Installation on AWS with CDK
134
+
135
+ The repo contains a [CDK folder](https://github.com/seanpedrick-case/doc_redaction/tree/main/cdk) that contains all the files you need to set up and deploy to an AWS environment with CDK. The installation wizard is [cdk_install.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cdk/cdk_install.py), which provides a number of options to deploy the Document Redaction App to AWS for demonstration or production. More details on CDK deployment can be found in the [Installation Guide](https://seanpedrick-case.github.io/doc_redaction/src/installation_guide.html).
136
+
137
+ ### 2. Install prerequisites: Tesseract and Poppler
138
+
139
+ This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
140
+
141
+ ---
142
+
143
+ #### Automated dependency setup (recommended)
144
+
145
+ If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
146
+
147
+ You need the installer script available first, which means either:
148
+
149
+ - **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
150
+ - **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
151
+
152
+ From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
153
+
154
+ ```bash
155
+ python -m doc_redaction.install_deps
156
+ ```
157
+
158
+ This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
159
+
160
+ To just check whether your machine can already see the tools:
161
+
162
+ ```bash
163
+ python -m doc_redaction.install_deps --verify-only
164
+ ```
165
+
166
+ #### **On Windows**
167
+
168
+ If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
169
+
170
+ 1. **Install Tesseract OCR:**
171
+ * Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
172
+ * Run the installer.
173
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
174
+
175
+
176
+ 2. **Install Poppler:**
177
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
178
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
179
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
180
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
181
+ * Click the "Environment Variables..." button.
182
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
183
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
184
+ * Click OK on all windows to save the changes.
185
+
186
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
187
+
+ ---
188
+
189
+ #### **On Linux (Debian/Ubuntu)**
190
+
191
+ Open your terminal and run the following command to install Tesseract and Poppler:
192
+
193
+ ```bash
194
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
195
+ ```
196
+
197
+ #### **On Linux (Fedora/CentOS/RHEL)**
198
+
199
+ Open your terminal and use the `dnf` or `yum` package manager:
200
+
201
+ ```bash
202
+ sudo dnf install -y tesseract poppler-utils
203
+ ```
204
+ ---
205
+
206
+ ### 3. Run the Application
207
+
208
+ With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
209
+
210
+ ```bash
211
+ python app.py
212
+ ```
213
+
214
+ After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
215
+
216
+ Open this URL in your web browser to use the document redaction tool.
217
+
218
+ #### Command line interface
219
+
220
+ For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321).
221
+
222
+ If you installed from **PyPI**, use the installed console script:
223
+
224
+ ```bash
225
+ cli_redact --help
226
+ ```
227
+
228
+ From a **repository checkout**, you can also run:
229
+
230
+ ```bash
231
+ python cli_redact.py --help
232
+ ```
233
+
234
+ #### Python package commands
235
+
236
+ For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
237
+
238
+ ---
239
+
240
+
241
+ ### 4. ⚙️ Configuration (Optional)
242
+
243
+ You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
244
+
245
+ To get started:
246
+ 1. Copy `config/app_config.env.example` to `config/app_config.env`.
247
+ 2. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
248
+
249
+ If you do not create this file, the application will run with default settings.
250
+
251
+ #### Configuration Breakdown
252
+
253
+ Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
254
+
255
+ ---
256
+
257
+ #### **Local & General Settings (No AWS Required)**
258
+
259
+ These settings are useful for all users, regardless of whether you are using AWS.
260
+
261
+ * `TESSERACT_FOLDER` / `POPPLER_FOLDER`
262
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
263
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
264
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
265
+
266
+ * `SHOW_LANGUAGE_SELECTION=True`
267
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
268
+
269
+ * `DEFAULT_LOCAL_OCR_MODEL=tesseract`
270
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two - a first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. `paddle` will only return whole line text extraction, and so will only work for OCR, not redaction.
271
+
272
+ * `SESSION_OUTPUT_FOLDER=False`
273
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
274
+
275
+ * `DISPLAY_FILE_NAMES_IN_LOGS=False`
276
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
277
+
278
+ ---
279
+
280
+ #### **AWS-Specific Settings**
281
+
282
+ These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
283
+
284
+ * `RUN_AWS_FUNCTIONS=True`
285
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
286
+
287
+ * **UI Options:**
288
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
289
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
290
+
291
+ * **Core AWS Configuration:**
292
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
293
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
294
+
295
+ * **AWS Logging:**
296
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
297
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
298
+
299
+ * **Advanced AWS Textract Features:**
300
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
301
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
302
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
303
+
304
+ * **Cost Tracking (for internal accounting):**
305
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
306
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
307
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
308
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
309
+
310
+ Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
311
+
312
+ ## For agents (API quickstart)
313
+
314
+ If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
315
+
316
+ - **Discover schema**: `GET /gradio_api/info`
317
+ - **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
318
+ - **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
319
+ - **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
320
+ - **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
321
+
322
+ ### Choose the correct route (prefer short `gr.api` endpoints)
323
+
324
+ Fetch `/gradio_api/info` and then prefer the simplest route that exists:
325
+
326
+ - **Apply edited review CSV to a PDF**: `/review_apply`
327
+ - **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
328
+ - **Summarise a PDF**: `/pdf_summarise`
329
+ - **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
330
+
331
+ If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
332
+
333
+ ### Common gotchas
334
+
335
+ - **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
336
+ - **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
337
+ - **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
338
+
339
+ ### Optional: MCP server
340
+
341
+ If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
342
+
343
+ **Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
344
+
345
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
346
+
347
+ For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
348
+
349
+ Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include in or exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
350
+
351
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
agent-redact/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Agent redaction (Pi)
2
+
3
+ Pi-based agentic document redaction: local Docker orchestration and Hugging Face Space packaging.
4
+
5
+ | Path | Purpose |
6
+ |------|---------|
7
+ | [`pi/`](pi/) | Gradio UI, Pi RPC client, remote redaction helpers, runtime config |
8
+ | [`pi-agent/`](pi-agent/) | Pi Docker image (`dev` + `runtime` targets), sync script, and manifest |
9
+ | [`requirements_pi_agent.txt`](requirements_pi_agent.txt) | Python deps for the Pi agent image |
10
+
11
+ Per-user output isolation uses Gradio `session_hash` subfolders under `PI_WORKSPACE_DIR` (see `agent-redact/pi/session_workspace.py`). Enabled by default locally and on HF Spaces. Set `PI_SESSION_WORKSPACE=false` only if you want one shared workspace tree for all sessions.
12
+
13
+ ## Local Docker
14
+
15
+ Use the `pi-agent` service in [`docker-compose_llama_agentic.yml`](../docker-compose_llama_agentic.yml) (profile `27b_36`). See [`pi/agent/README.md`](pi/agent/README.md).
16
+
17
+ ## Hugging Face Space
18
+
19
+ Build from repo root:
20
+
21
+ ```bash
22
+ # Production (HF Space / ECS)
23
+ docker build -f agent-redact/pi-agent/Dockerfile --target runtime .
24
+
25
+ # Local compose (bind-mounted repo)
26
+ docker build -f agent-redact/pi-agent/Dockerfile --target dev .
27
+ ```
28
+
29
+ Sync to Space on pushes to `dev` via [`.github/workflows/sync-pi-agent-space.yml`](../.github/workflows/sync-pi-agent-space.yml).
agent-redact/pi-agent/.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .github
3
+ **/__pycache__
4
+ **/*.pyc
5
+ **/.pytest_cache
6
+ **/node_modules
7
+ workspace
8
+ output
9
+ input
10
+ config/pi_agent.env
agent-redact/pi-agent/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Example PDFs must be plain files in the Space repo (not Git LFS pointers).
2
+ *.pdf -filter -diff -merge
agent-redact/pi-agent/Dockerfile ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ # Pi agent image (dev + production). Build from monorepo root:
3
+ # docker build -f agent-redact/pi-agent/Dockerfile --target dev .
4
+ # docker build -f agent-redact/pi-agent/Dockerfile --target runtime .
5
+ # Root .dockerignore must allow config/*.example into the context (secrets stay gitignored).
6
+ #
7
+ # Targets:
8
+ # dev — docker-compose: Pi CLI + Python deps; app tree bind-mounted at runtime.
9
+ # runtime — HF Space / AWS ECS: baked agent-redact tree, non-root user, named volumes.
10
+
11
+ # ===================================================================
12
+ # Stage 1: Pi CLI (Node) — isolated so the runtime base stays Python 3.12
13
+ # ===================================================================
14
+ FROM public.ecr.aws/docker/library/node:24.16.0-slim AS pi-cli
15
+
16
+ ENV NPM_CONFIG_PREFIX=/opt/pi
17
+ ENV PATH="/opt/pi/bin:${PATH}"
18
+
19
+ RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
20
+
21
+ # ===================================================================
22
+ # Stage 2: Shared Python base (aligned with main app Dockerfile)
23
+ # ===================================================================
24
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS pi-base
25
+
26
+ ENV NODE_ENV=production
27
+ ENV DEBIAN_FRONTEND=noninteractive
28
+ ENV NPM_CONFIG_LOGLEVEL=warn
29
+ ENV PYTHONUNBUFFERED=1
30
+ ENV PYTHONDONTWRITEBYTECODE=1
31
+ ENV APP_HOME=/home/user
32
+ ENV PI_WORKDIR=/workspace/doc_redaction
33
+ ENV PYTHONPATH=${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi
34
+ ENV GRADIO_SERVER_NAME=0.0.0.0
35
+ ENV MPLCONFIGDIR=/tmp/matplotlib_cache/
36
+ ENV XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
37
+ ENV PATH="/opt/pi/bin:${PATH}"
38
+
39
+ RUN apt-get update && apt-get install -y --no-install-recommends \
40
+ bash \
41
+ git \
42
+ curl \
43
+ ca-certificates \
44
+ procps \
45
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
46
+
47
+ COPY --from=pi-cli /opt/pi /opt/pi
48
+ COPY --from=pi-cli /usr/local/bin/node /usr/local/bin/node
49
+
50
+ COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
51
+ RUN pip install --no-cache-dir -r /tmp/requirements_pi_agent.txt \
52
+ && rm /tmp/requirements_pi_agent.txt
53
+
54
+ # ===================================================================
55
+ # Stage 3: Dev — thin image for docker-compose (repo bind-mounted)
56
+ # ===================================================================
57
+ FROM pi-base AS dev
58
+
59
+ ENV HOME=${APP_HOME}
60
+ ENV PI_WORKSPACE_DIR=${APP_HOME}/app/workspace
61
+ ENV PI_UPLOAD_ROOT=/tmp/gradio
62
+ ENV PI_SESSION_DIR=${APP_HOME}/.pi/agent/sessions
63
+
64
+ RUN useradd -m -u 1000 user \
65
+ && mkdir -p \
66
+ ${APP_HOME}/app/workspace \
67
+ ${APP_HOME}/.pi/agent/sessions \
68
+ ${PI_WORKDIR} \
69
+ /tmp/gradio \
70
+ /tmp/matplotlib_cache \
71
+ ${XDG_CACHE_HOME} \
72
+ && chown -R user:user ${APP_HOME} ${PI_WORKDIR} \
73
+ && chown user:user /tmp/gradio /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
74
+ && chmod 1777 /tmp/gradio /tmp/matplotlib_cache \
75
+ && chmod 700 ${XDG_CACHE_HOME}
76
+
77
+ WORKDIR ${PI_WORKDIR}
78
+
79
+ USER user
80
+
81
+ RUN pi --version
82
+
83
+ # Compose overrides entrypoint with agent-redact/pi/start.sh on the bind mount.
84
+
85
+ # ===================================================================
86
+ # Stage 4: Runtime — baked app for Hugging Face Space and AWS ECS
87
+ # ===================================================================
88
+ FROM pi-base AS runtime
89
+
90
+ ENV PI_DEPLOYMENT_PROFILE=hf-space
91
+ ENV PI_DEFAULT_PROVIDER=google-gemini
92
+ ENV PI_DEFAULT_MODEL=gemini-flash-lite-latest
93
+ ENV DOC_REDACTION_GRADIO_URL=https://seanpedrickcase-document-redaction.hf.space
94
+ ENV HOME=${APP_HOME}
95
+ ENV PI_WORKDIR=/workspace/doc_redaction
96
+ # Fargate uses volume mounts under ${APP_HOME}/app/workspace (CDK chown entrypoint).
97
+ # ECS Express has no mounts — CDK sets PI_WORKSPACE_DIR=/tmp/pi-workspace at deploy.
98
+ ENV PI_WORKSPACE_DIR=${APP_HOME}/app/workspace
99
+ ENV PI_UPLOAD_ROOT=/tmp/gradio
100
+ ENV PI_SESSION_DIR=/tmp/pi-sessions
101
+ ENV PI_CODING_AGENT_DIR=/tmp/pi-agent
102
+ ENV ACCESS_LOGS_FOLDER=/tmp/pi-logs/
103
+ ENV USAGE_LOGS_FOLDER=/tmp/pi-usage/
104
+ ENV FEEDBACK_LOGS_FOLDER=/tmp/pi-feedback/
105
+ ENV PI_OFFLINE=1
106
+ ENV PI_SKIP_VERSION_CHECK=1
107
+ ENV PI_GRADIO_SHOW_EXAMPLES=true
108
+ ENV PI_UI_HOST=0.0.0.0
109
+ ENV PI_UI_PORT=7860
110
+ ENV PI_GRADIO_PORT=7860
111
+ ENV GRADIO_SERVER_NAME=0.0.0.0
112
+ ENV GRADIO_SERVER_PORT=7860
113
+ ENV GRADIO_ANALYTICS_ENABLED=False
114
+ ENV RUN_FASTAPI=False
115
+
116
+ WORKDIR ${PI_WORKDIR}
117
+
118
+ COPY agent-redact/pi agent-redact/pi
119
+ COPY skills skills
120
+ COPY tools tools
121
+ # Committed template only (see sync-manifest.txt); runtime secrets come from S3/env on ECS.
122
+ COPY config/pi_agent.env.example config/pi_agent.env.example
123
+ COPY intros intros
124
+ COPY AGENTS.md AGENTS.md
125
+ COPY doc_redaction/example_data doc_redaction/example_data
126
+
127
+ RUN test -f doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
128
+ && test -f doc_redaction/example_data/graduate-job-example-cover-letter.pdf \
129
+ && ! head -1 doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
130
+ | grep -q "^version https://git-lfs.github.com/spec/v1"
131
+
132
+ RUN useradd -m -u 1000 user \
133
+ && mkdir -p \
134
+ ${APP_HOME}/app/workspace \
135
+ ${APP_HOME}/.pi/agent \
136
+ /tmp/gradio \
137
+ /tmp/pi-sessions \
138
+ /tmp/matplotlib_cache \
139
+ ${XDG_CACHE_HOME} \
140
+ && chown user:user \
141
+ ${APP_HOME}/app/workspace \
142
+ ${APP_HOME}/.pi \
143
+ /tmp/gradio \
144
+ /tmp/pi-sessions \
145
+ /tmp/matplotlib_cache \
146
+ ${XDG_CACHE_HOME} \
147
+ && chmod 755 ${APP_HOME}/app/workspace ${APP_HOME}/.pi \
148
+ && chmod 1777 /tmp/gradio /tmp/pi-sessions /tmp/matplotlib_cache \
149
+ && chmod 700 ${XDG_CACHE_HOME} \
150
+ && chown -R root:root ${PI_WORKDIR} \
151
+ && find ${PI_WORKDIR} -type d -exec chmod 755 {} \; \
152
+ && find ${PI_WORKDIR} -type f -exec chmod 644 {} \; \
153
+ && mkdir -p ${APP_HOME}/app \
154
+ && chown user:user ${APP_HOME}/app
155
+
156
+ COPY agent-redact/pi-agent/entrypoint-ecs.sh /usr/local/bin/entrypoint-ecs.sh
157
+ COPY agent-redact/pi-agent/entrypoint.sh ${APP_HOME}/app/entrypoint.sh
158
+ RUN sed -i 's/\r$//' /usr/local/bin/entrypoint-ecs.sh ${APP_HOME}/app/entrypoint.sh \
159
+ && chmod +x /usr/local/bin/entrypoint-ecs.sh ${APP_HOME}/app/entrypoint.sh
160
+
161
+ # Writable paths only via runtime mounts (read-only root FS friendly).
162
+ VOLUME ["${APP_HOME}/app/workspace"]
163
+ VOLUME ["/tmp/gradio"]
164
+ VOLUME ["/tmp/pi-sessions"]
165
+ VOLUME ["/tmp/matplotlib_cache"]
166
+ VOLUME ["${XDG_CACHE_HOME}"]
167
+ VOLUME ["/tmp"]
168
+ VOLUME ["/var/tmp"]
169
+
170
+ USER user
171
+
172
+ RUN pi --version
173
+
174
+ EXPOSE 7860
175
+
176
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
agent-redact/pi-agent/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Agentic Document Redaction
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ app_file: agent-redact/pi/gradio_app.py
8
+ pinned: false
9
+ license: agpl-3.0
10
+ short_description: Agentic interface to redact PDF documents
11
+ ---
12
+
13
+ # Pi agent — agentic document redaction
14
+
15
+ Orchestrate document redaction with **[Pi](https://github.com/earendil-works/pi)** and **Google Gemini**. Heavy redaction runs on a separate **private [doc_redaction](https://huggingface.co/spaces/seanpedrickcase/document_redaction)** Hugging Face Space (simple text extraction + Local PII).
16
+
17
+ ## Before you start
18
+
19
+ 1. **Gemini API key** — paste in **Agent backend** → **Apply backend** (session-only; not stored on disk).
20
+ 2. **HF token** — Space admin should set `HF_TOKEN` under **Settings → Secrets** so this Space can call the private redaction backend. Users may optionally override per session in the UI.
21
+
22
+ ## Limitations
23
+
24
+ - **No face or signature VLM** — text-layer PII only via Local spaCy/Presidio on the remote Space.
25
+ - **No Pass 2 VLM** on this deployment.
26
+ - **Ephemeral storage** — download deliverables from **Workspace output files** before the Space restarts.
27
+ - **Human review** — outputs are not guaranteed complete; review redacted PDFs before release.
28
+
29
+ ## Defaults
30
+
31
+ | Setting | Value |
32
+ |---------|--------|
33
+ | Pi LLM | Gemini (`gemini-flash-latest` default) |
34
+ | Redaction backend | `https://seanpedrickcase-document-redaction.hf.space` |
35
+ | Text extraction | `Local model - selectable text` |
36
+ | PII detection | `Local` |
37
+
38
+ ## Examples
39
+
40
+ Two sample PDFs load in **Redaction task** → **Try an example** (same demos as the main doc_redaction app). Examples are **on by default**; set Space variable `PI_GRADIO_SHOW_EXAMPLES=false` to hide them. (`SHOW_PI_EXAMPLES` is also accepted.)
41
+
42
+ If examples do not appear, the UI shows a short status message (usually missing PDFs in the image — rebuild after a successful sync with LFS materialization).
43
+
44
+ ## Development
45
+
46
+ This Space is synced from the [doc_redaction monorepo](https://github.com/seanpedrick-case/doc_redaction) on pushes to **`dev`** (see `.github/workflows/sync-pi-agent-space.yml`). Space: [seanpedrickcase/agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction).
agent-redact/pi-agent/entrypoint-ecs.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ECS Fargate: ephemeral volume mounts are root-owned; chown then drop to user (image USER).
#
# Usage: entrypoint-ecs.sh <command> [args...]
# Must be started as root: it creates and chowns the writable runtime
# directories, then runs the given command as `user` via `su`.
set -euo pipefail

# Without a command, `su -c ""` would exit 0 and the container would stop
# silently without ever starting the app; fail loudly instead.
if [ "$#" -eq 0 ]; then
    echo "usage: $(basename "$0") <command> [args...]" >&2
    exit 64
fi

# Writable paths the app expects; created here because the mounted volumes
# start out owned by root.
for dir in /tmp/pi-agent /tmp/pi-logs /tmp/pi-usage /tmp/pi-feedback \
    /home/user/app/workspace /tmp/gradio /tmp/pi-sessions; do
    mkdir -p "$dir"
    chown -R user:user "$dir"
done

cd /workspace/doc_redaction
# NOTE: "$*" joins all arguments into ONE shell string that the inner bash
# re-parses, so arguments containing spaces or shell metacharacters are not
# preserved verbatim. Kept as-is because callers may pass a single command string.
exec su -s /bin/bash user -c "$*"
agent-redact/pi-agent/entrypoint.sh ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh
# Pi agent container entrypoint: prepare runtime directories, run the Pi config
# script, then replace this shell with the Gradio app (directly or via uvicorn).
set -e

echo "Starting Pi agent (profile=${PI_DEPLOYMENT_PROFILE:-unknown})"

# Create one runtime directory (best effort) and warn on stderr when the
# current user cannot write to it. Never aborts startup.
prepare_dir() {
    mkdir -p "$1" 2>/dev/null || true
    if [ -w "$1" ]; then
        return 0
    fi
    echo "WARNING: Directory $1 is not writable by current user (uid=$(id -u)). File I/O may fail." >&2
}

prepare_dir "${PI_CODING_AGENT_DIR:-/tmp/pi-agent}"
prepare_dir "${PI_WORKSPACE_DIR:-/home/user/app/workspace}"
prepare_dir "${PI_UPLOAD_ROOT:-/tmp/gradio}"
prepare_dir "${PI_SESSION_DIR:-/tmp/pi-sessions}"
prepare_dir "${ACCESS_LOGS_FOLDER:-/tmp/pi-logs}"
prepare_dir "${USAGE_LOGS_FOLDER:-/tmp/pi-usage}"
prepare_dir "${FEEDBACK_LOGS_FOLDER:-/tmp/pi-feedback}"
prepare_dir "${MPLCONFIGDIR:-/tmp/matplotlib_cache}"
prepare_dir "${XDG_CACHE_HOME:-/tmp/xdg_cache/user_1000}"

# Relative paths below (agent-redact/pi/...) are resolved from the checkout root.
cd "${PI_WORKDIR:-/workspace/doc_redaction}"

echo "Entrypoint environment: PI_WORKSPACE_DIR=${PI_WORKSPACE_DIR:-} PI_UI_HOST=${PI_UI_HOST:-} PI_UI_PORT=${PI_UI_PORT:-} PI_GRADIO_PORT=${PI_GRADIO_PORT:-} GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME:-} GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-} RUN_FASTAPI=${RUN_FASTAPI:-}"

# Run the Pi config script before the UI starts; `set -e` stops startup if it fails.
python3 agent-redact/pi/pi_agent_config.py

# Default path: run the Gradio script directly.
if [ "${RUN_FASTAPI:-False}" != "True" ]; then
    exec python3 agent-redact/pi/gradio_app.py
fi

# RUN_FASTAPI=True: serve the ASGI app through uvicorn instead.
exec uvicorn gradio_app:app \
    --app-dir agent-redact/pi \
    --host "${GRADIO_SERVER_NAME:-0.0.0.0}" \
    --port "${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7860}}" \
    --proxy-headers \
    --forwarded-allow-ips "*"
agent-redact/pi-agent/sync-manifest.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paths copied from the monorepo root into the flattened Pi agent HF Space repo.
2
+ agent-redact/requirements_pi_agent.txt
3
+ agent-redact/pi
4
+ agent-redact/pi-agent/entrypoint.sh
5
+ agent-redact/pi-agent/entrypoint-ecs.sh
6
+ skills
7
+ tools
8
+ config/pi_agent.env.example
9
+ intros/pi_intro.txt
10
+ AGENTS.md
11
+ doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
12
+ doc_redaction/example_data/graduate-job-example-cover-letter.pdf
agent-redact/pi-agent/sync_to_space.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Flatten monorepo paths into a temp directory for the Pi agent HF Space repo.
# Usage (from repo root):
#   agent-redact/pi-agent/sync_to_space.sh /path/to/output-dir
#
# The output directory is wiped and recreated on every run.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
OUT="${1:?Output directory required}"
MANIFEST="$(dirname "$0")/sync-manifest.txt"

# True when $1 is a regular file whose first line is the Git LFS pointer header.
_is_lfs_pointer() {
    [[ -f "$1" ]] && head -1 "$1" 2>/dev/null | grep -q "^version https://git-lfs.github.com/spec/v1"
}

# Safety guard: `rm -rf "$OUT"` below must never target `/`, the monorepo root,
# or any directory that contains it (that would delete the sources being synced).
# Only an existing directory can contain the repo, so nothing to check otherwise.
if [[ -d "$OUT" ]]; then
    root_real="$(cd "$ROOT" && pwd -P)"
    out_real="$(cd "$OUT" && pwd -P)"
    if [[ "$out_real" == "/" || "$root_real/" == "$out_real"/* ]]; then
        echo "Refusing to wipe output directory '$out_real': it is / or contains the monorepo root ($root_real)." >&2
        exit 1
    fi
fi

rm -rf "$OUT"
mkdir -p "$OUT"

# Space-level files live next to this script and go to the top of the output tree.
cp "$(dirname "$0")/Dockerfile" "$OUT/Dockerfile"
cp "$(dirname "$0")/README.md" "$OUT/README.md"
cp "$(dirname "$0")/.dockerignore" "$OUT/.dockerignore"
cp "$(dirname "$0")/.gitattributes" "$OUT/.gitattributes"

# `|| [[ -n "$line" ]]` keeps a final manifest line that has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%%#*}"              # drop comments
    line="$(echo "$line" | xargs)"  # trim surrounding whitespace
    [[ -z "$line" ]] && continue
    src="$ROOT/$line"
    if [[ ! -e "$src" ]]; then
        echo "Missing: $src" >&2
        exit 1
    fi
    dest="$OUT/$line"
    mkdir -p "$(dirname "$dest")"
    cp -a "$src" "$dest"
    # A PDF that is still an LFS pointer would ship a text stub instead of the document.
    if [[ "$line" == *.pdf ]] && _is_lfs_pointer "$dest"; then
        echo "Copied file is a Git LFS pointer, not a PDF: $line" >&2
        echo "Run 'git lfs pull' in the monorepo before syncing." >&2
        exit 1
    fi
done < "$MANIFEST"

echo "Flattened Pi agent Space tree: $OUT"
agent-redact/pi/agent/README.md ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pi agent config (Docker)
2
+
3
+ Runtime Pi config is **generated at container start** by [`agent-redact/pi/pi_agent_config.py`](../pi_agent_config.py) into `~/.pi/agent/models.json` and `~/.pi/agent/settings.json`.
4
+
5
+ Files in this folder (`settings.json`, `models.json`) are **templates/references** only — they are no longer bind-mounted into the container.
6
+
7
+ ## LLM backends (Pi orchestration)
8
+
9
+ The Pi agent (chat + redaction orchestration) can use:
10
+
11
+ | Provider key | Label | Pi API | Auth |
12
+ |--------------|-------|--------|------|
13
+ | `llama-cpp` | Local (llama-cpp) | `openai-completions` | None (local llama-inference) |
14
+ | `google-gemini` | Gemini | `google-generative-ai` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
15
+ | `amazon-bedrock` | AWS Bedrock | `bedrock-converse-stream` | AWS SDK credentials (`AWS_ACCESS_KEY_ID`, etc.) |
16
+
17
+ This is separate from doc_redaction **Pass 2 VLM** (`{VLM_BASE_URL}` in redaction prompts), which still targets local llama-inference by default.
18
+
19
+ ### Environment variables
20
+
21
+ Copy [`config/pi_agent.env.example`](../../../config/pi_agent.env.example) to `config/pi_agent.env` (gitignored) or set on the host before `docker compose up`:
22
+
23
+ | Variable | Purpose |
24
+ |----------|---------|
25
+ | `PI_DEFAULT_PROVIDER` | `llama-cpp` \| `google-gemini` \| `amazon-bedrock` |
26
+ | `PI_DEFAULT_MODEL` | Model id within provider |
27
+ | `PI_LLAMA_BASE_URL` | Local OpenAI-compatible URL (default `http://llama-inference:8080/v1`) |
28
+ | `PI_LLAMA_MODEL_ID` | Local model id |
29
+ | `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Gemini API key |
30
+ | `AWS_REGION` / `AWS_DEFAULT_REGION` | Bedrock region |
31
+ | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN` | Bedrock credentials (when not using SSO) |
32
+ | `AWS_PROFILE` | Named profile for SSO / shared credentials file (**required for Pi Bedrock with SSO**) |
33
+ | `PI_AWS_PROFILE` | Alternative to `AWS_PROFILE`; also used to auto-select profile when only `~/.aws` is mounted |
34
+ | `RUN_AWS_FUNCTIONS` | When `True`, use the AWS default credential chain (SSO, profile, role) |
35
+ | `PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS` | When `True` with `RUN_AWS_FUNCTIONS`, prefer SSO/chain over static env keys (default `True`, same as main app) |
36
+ | `PI_MAX_PAGES` | Maximum PDF pages allowed per redaction upload (falls back to `MAX_PAGES` / `MAX_DOC_PAGES`, default `3000`) |
37
+ | `PI_MAX_RETRIES` | Gemini quota / rate-limit retries for Pi auto-retry and Gradio backoff (default `5`; alias `PI_QUOTA_RETRY_ATTEMPTS`) |
38
+ | `PI_QUOTA_RETRY_DELAY_S` | Seconds between Gradio quota retries (default `60`) |
39
+ | `PI_COMPACTION_ENABLED` | Pi session auto-compaction in `settings.json` (`true` / `false`; unset uses template default, enabled) |
40
+ | `PI_COMPACTION_RESERVE_TOKENS` | Optional compaction `reserveTokens` (default `32768` from template) |
41
+ | `PI_COMPACTION_KEEP_RECENT_TOKENS` | Optional compaction `keepRecentTokens` (default `20000` from template) |
42
+
43
+ ### Usage logging (CSV / DynamoDB / S3)
44
+
45
+ Each completed Pi agent run (chat message or redaction task) writes **one row** to the **same usage log schema** as the main redaction app (`USAGE_LOG_FILE_NAME`, `USAGE_LOGS_FOLDER`, `S3_USAGE_LOGS_FOLDER`, `USAGE_LOG_DYNAMODB_TABLE_NAME`). Key fields:
46
+
47
+ | Log column | Pi agent value |
48
+ |------------|----------------|
49
+ | `task` | `agent` |
50
+ | `llm_model_name` | Pi provider/model (e.g. `amazon-bedrock/anthropic.claude-sonnet-4-6`) |
51
+ | `text_extraction_method` / `pii_detection_method` | From redaction task settings when applicable |
52
+ | `actual_time_taken_number` | Wall-clock seconds for the Pi RPC turn |
53
+ | `total_page_count` | Pages in scope for PDF redaction tasks |
54
+ | `llm_total_input_tokens` / `llm_total_output_tokens` | Pi orchestration LLM usage for that turn (from Pi `get_session_stats` delta, or assistant `usage` in session JSONL). Includes cache read/write in the input column. **VLM/tokens from doc_redaction Pass 1 are not included** (those stay on the main app usage log when you run redaction there directly). |
55
+
56
+ Toggle with `SAVE_LOGS_TO_CSV`, `SAVE_LOGS_TO_DYNAMODB`, and `RUN_AWS_FUNCTIONS` (required for S3 log upload). Access logs on session load use the main app access log paths separately.
57
+
58
+ At startup, if only `GOOGLE_API_KEY` is set, it is mirrored to `GEMINI_API_KEY` for Pi.
59
+
60
+ ### Gradio UI
61
+
62
+ Open **http://localhost:7862** → **Agent backend** accordion:
63
+
64
+ - Select provider and model
65
+ - Optionally enter Gemini / AWS credentials (**session-only** — not written to disk)
66
+ - Click **Apply backend** — regenerates config, restarts the Pi RPC subprocess, and starts a new session
67
+
68
+ Credential fields are cleared after apply.
69
+
70
+ ## Local model id
71
+
72
+ After the llama.cpp service is healthy, confirm the model id:
73
+
74
+ ```bash
75
+ curl http://localhost:8000/v1/models
76
+ ```
77
+
78
+ If the returned `id` differs from `unsloth/Qwen3.6-27B-MTP-GGUF`, set `PI_LLAMA_MODEL_ID` in `config/pi_agent.env` or compose environment and restart `pi-agent`.
79
+
80
+ ### llama.cpp / llama-swap and back-to-back redaction tasks
81
+
82
+ If the **first** redaction task succeeds but a **second** task in the same browser session kills the llama server (`Killed`, `saving prompt with length 69804`, `proxy error: EOF`, `502`):
83
+
84
+ 1. **Oversized Pi session** — the orchestration agent kept the full first run (tool logs, bash output) in context (~70k tokens). The Gradio UI **restarts the Pi RPC process** and **clears the chat panel** on **page reload** and before each **Start redaction task** (same behaviour). Workspace files are unchanged. Use **New session** before a follow-up **chat** turn if you still hit context limits.
85
+ 2. **llama.cpp OOM** — a second task that reuses the first run’s context can try to allocate multi‑GiB KV state (`total state size = 3322 MiB` in logs) and be killed by the OS. A clean Pi process keeps the orchestration prompt small.
86
+ 3. **llama-swap GPU monitor** — on newer NVIDIA drivers, older llama-swap builds fail `nvidia-smi -loop` and can log `failed reading from gpuCh`. Upgrade to [llama-swap v213+](https://github.com/mostlygeek/llama-swap) (or disable performance monitoring in your swap config).
87
+ 4. **Concurrent load** — Pi orchestration and doc_redaction VLM may share one llama endpoint; `--parallel 1` allows only one generation. Wait until the first task shows **Agent finished** before starting another.
88
+
89
+ For Gemma 4 31B, `pi-agent-gemma-31b` sets lower compaction defaults (`PI_COMPACTION_RESERVE_TOKENS=16384`) to match `PI_LLAMA_CONTEXT_WINDOW=65536`.
90
+
91
+ ## In-container URLs for task prompts
92
+
93
+ When filling [`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](../../../skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md) inside the Pi container, use:
94
+
95
+ | Placeholder | In-container value |
96
+ |-------------|-------------------|
97
+ | `{GRADIO_URL}` | `http://redaction-app-llama:7860` |
98
+ | `{VLM_BASE_URL}` | `http://llama-inference:8080` |
99
+ | `{INPUT_PATH}` | `/home/user/app/workspace/{session_hash}/{FILE_NAME}` (when `PI_SESSION_WORKSPACE=true`) |
100
+ | `{OUTPUT_BASE}` | `/home/user/app/workspace/{session_hash}/redact/{FILE_NAME}/` |
101
+
102
+ Host-side examples (`host.docker.internal`, `localhost:7861`) do not apply inside the compose network.
103
+
104
+ ## Usage
105
+
106
+ Start the stack (27B profile):
107
+
108
+ ```powershell
109
+ docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d --build
110
+ ```
111
+
112
+ Interactive Pi TUI:
113
+
114
+ ```powershell
115
+ docker compose -f docker-compose_llama_agentic.yml exec -it pi-agent pi
116
+ ```
117
+
118
+ Gradio chat UI (browser):
119
+
120
+ Open **http://localhost:7862**. Use the **Redaction task** panel to upload a document, enter bullet-point requirements, and click **Start redaction task**. Pi receives the filled prompt from [`skills/Example prompt partnership.txt`](../../../skills/Example%20prompt%20partnership.txt) (file copied to `/home/user/app/workspace/`). The full prompt appears in the chat; Pi’s reply streams in the chat panel.
121
+
122
+ The UI also shows:
123
+
124
+ - **Agent backend** — switch between local, Gemini, and Bedrock
125
+ - **Chat** — streamed assistant text
126
+ - **Activity** — agent/turn lifecycle, compaction, auto-retry, tool start/end
127
+ - **Tool output** — live bash/read output from `tool_execution_update` / `tool_execution_end`
128
+ - **Thinking** — optional stream (`PI_GRADIO_SHOW_THINKING=true`)
129
+ - **Abort** — sends Pi RPC `abort` and cancels the in-flight Gradio handler
130
+ - **Workspace output files** — browse and download redaction artifacts
131
+
132
+ Optional env vars on `pi-agent`: `PI_GRADIO_SHOW_THINKING`, `PI_GRADIO_SHOW_TOOL_OUTPUT`, `PI_GRADIO_TOOL_OUTPUT_MAX`, `PI_GRADIO_ACTIVITY_MAX_LINES`.
133
+
134
+ When a Pi run completes, the chat shows an **Agent finished** (or **Agent stopped**) line, a Gradio info toast appears, and the browser tab title flashes for ~15 seconds. Desktop notifications are shown when the browser has granted notification permission (requested on first click/keypress in the Pi UI).
135
+
136
+ Run the UI locally (outside Docker):
137
+
138
+ ```powershell
139
+ cd agent-redact/pi
140
+ pip install -r ../requirements_pi_agent.txt
141
+ # Pi orchestration subprocess (required for Apply backend / chat):
142
+ npm install -g @earendil-works/pi-coding-agent
143
+ python pi_agent_config.py
144
+ python gradio_app.py
145
+ ```
146
+
147
+ **Apply backend** starts `pi --mode rpc`. If you see `FileNotFoundError` / “Pi CLI not found”, install Node.js, run the `npm install` line above, and ensure `pi` (or `pi.cmd` on Windows) is on `PATH`. Optional: `PI_EXECUTABLE=C:\Users\you\AppData\Roaming\npm\pi.cmd` in `config/pi_agent.env`.
148
+
149
+ RPC mode (automation, no Gradio):
150
+
151
+ ```powershell
152
+ docker compose -f docker-compose_llama_agentic.yml exec -T pi-agent pi --mode rpc
153
+ ```
154
+
155
+ Skills are synced from the repo `skills/` tree into **`{PI_WORKSPACE_DIR}/.pi/skills/`** on startup (read-only). Pi runs with `cwd` in the user’s session subfolder and `--no-skills` so it does not load skills from the git checkout. Use `/skill:doc-redaction-app` etc. Set `PI_SKILLS_RESYNC=true` to refresh copies from the repo.
156
+
157
+ Sessions persist in the **`pi-agent-sessions`** Docker volume at **`~/.pi/agent/sessions/`** (Pi’s default session location inside the container). Override with `PI_SESSION_DIR` if needed.
158
+
159
+ On **HF Space** (`PI_DEPLOYMENT_PROFILE=hf-space`), sessions go to **`/tmp/pi-sessions`** instead (ephemeral; lost on restart).
160
+
161
+ ## Python dependencies
162
+
163
+ The Pi image installs [`requirements_pi_agent.txt`](../../requirements_pi_agent.txt) — Gradio UI + `gradio-client`, HTTP clients, CSV/PDF review helpers (`pandas`, `pymupdf`), and common utilities. It **does not** include spaCy, Presidio, or OCR; heavy redaction runs in `redaction-app-llama`.
164
+
165
+ Rebuild after changing that file:
166
+
167
+ ```powershell
168
+ docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 build pi-agent
169
+ ```
170
+
171
+ ## HF Space profile (remote redaction backend)
172
+
173
+ Set `PI_DEPLOYMENT_PROFILE=hf-space` to run the Pi Gradio UI as a **Hugging Face Docker Space** that orchestrates with **Gemini only** and calls a **remote** doc_redaction Space over HTTPS.
174
+
175
+ | Area | HF Space value |
176
+ |------|----------------|
177
+ | Pi LLM | Gemini only (`PI_DEFAULT_PROVIDER=google-gemini`) |
178
+ | Redaction app | `DOC_REDACTION_GRADIO_URL` (default `https://seanpedrickcase-document-redaction.hf.space`) |
179
+ | Auth to redaction | `HF_TOKEN` / `DOC_REDACTION_HF_TOKEN` (Space secret + optional UI override) |
180
+ | Text extraction / PII | Locked to `Local model - selectable text` + `Local` |
181
+ | VLM faces / signatures | Disabled |
182
+ | Port | `7860` |
183
+ | Pi session logs | `/tmp/pi-sessions` (`PI_SESSION_DIR`; ephemeral) |
184
+
185
+ Package and Dockerfile: [`agent-redact/pi-agent/`](../../pi-agent/). Pushes to [agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) on **`dev`** branch via [`.github/workflows/sync-pi-agent-space.yml`](../../../.github/workflows/sync-pi-agent-space.yml) (GitHub secrets: `HF_TOKEN`, `HF_USERNAME`, `HF_EMAIL`).
186
+
187
+ Local build test from monorepo root:
188
+
189
+ ```powershell
190
+ docker build -f agent-redact/pi-agent/Dockerfile --target runtime -t pi-agent-hf-space .
191
+ docker run --rm -p 7860:7860 -e GEMINI_API_KEY=... -e HF_TOKEN=... pi-agent-hf-space
192
+ ```
193
+
194
+ Pi uses `gradio_client` + `agent-redact/pi/remote_redaction.py` to upload/download from the remote Space; prompts include `{REMOTE_BACKEND_GUIDANCE}` (see [`redaction_prompt.py`](../redaction_prompt.py)).
agent-redact/pi/agent/models.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "providers": {
3
+ "llama-cpp": {
4
+ "baseUrl": "http://llama-inference:8080/v1",
5
+ "api": "openai-completions",
6
+ "apiKey": "llama-cpp",
7
+ "compat": {
8
+ "supportsDeveloperRole": false,
9
+ "supportsReasoningEffort": false,
10
+ "supportsUsageInStreaming": false,
11
+ "maxTokensField": "max_tokens"
12
+ },
13
+ "models": [
14
+ {
15
+ "id": "unsloth/Qwen3.6-27B-MTP-GGUF",
16
+ "name": "Qwen 3.6 27B (local)",
17
+ "reasoning": false,
18
+ "input": ["text", "image"],
19
+ "contextWindow": 114688,
20
+ "maxTokens": 32768,
21
+ "cost": {
22
+ "input": 0,
23
+ "output": 0,
24
+ "cacheRead": 0,
25
+ "cacheWrite": 0
26
+ }
27
+ }
28
+ ]
29
+ }
30
+ }
31
+ }
agent-redact/pi/agent/settings.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "defaultProvider": "llama-cpp",
3
+ "defaultModel": "unsloth/Qwen3.6-27B-MTP-GGUF",
4
+ "defaultThinkingLevel": "off",
5
+ "hideThinkingBlock": true,
6
+ "compaction": {
7
+ "enabled": true,
8
+ "reserveTokens": 32768,
9
+ "keepRecentTokens": 20000
10
+ },
11
+ "branchSummary": {
12
+ "skipPrompt": true,
13
+ "reserveTokens": 32768
14
+ },
15
+ "retry": {
16
+ "enabled": true,
17
+ "maxRetries": 5,
18
+ "baseDelayMs": 2000,
19
+ "provider": {
20
+ "timeoutMs": 3600000,
21
+ "maxRetries": 5,
22
+ "maxRetryDelayMs": 60000
23
+ }
24
+ },
25
+ "enableSkillCommands": true,
26
+ "sessionDir": "sessions",
27
+ "steeringMode": "one-at-a-time",
28
+ "followUpMode": "one-at-a-time",
29
+ "terminal": {
30
+ "showTerminalProgress": false
31
+ }
32
+ }
agent-redact/pi/bootstrap_pi_config.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pi agent process bootstrap (env file + workspace) before ``tools.config`` import."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from dotenv import load_dotenv
9
+
10
# Default in-container locations. The ensure_* helpers below fall back to these
# only when ``_pi_running_in_container()`` is true.
_DOCKER_WORKSPACE = Path("/home/user/app/workspace")
_DOCKER_UPLOAD_ROOT = Path("/tmp/gradio")
_DOCKER_PI_WORKDIR = Path("/workspace/doc_redaction")
# CSV log dirs must not live under read-only PI_WORKDIR (ECS/HF runtime images).
_DOCKER_ACCESS_LOGS = Path("/tmp/pi-logs")
_DOCKER_USAGE_LOGS = Path("/tmp/pi-usage")
_DOCKER_FEEDBACK_LOGS = Path("/tmp/pi-feedback")
# Prompt template path relative to the repo root; its presence is how a
# directory is recognised as a valid PI_WORKDIR.
_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
18
+
19
+
20
def _pi_running_in_container() -> bool:
    """
    Report whether the Pi process runs inside Docker / an HF Space.

    Either the ``/.dockerenv`` marker file exists, or the in-image checkout
    directory exists and contains the partnership prompt template.

    Avoids treating ``C:\\home\\user\\app\\workspace`` (created by mistake on
    Windows) as the compose mount during local development.
    """
    docker_marker = Path("/.dockerenv")
    if not docker_marker.is_file():
        workdir = _DOCKER_PI_WORKDIR
        return workdir.is_dir() and _partnership_template_exists(workdir)
    return True
32
+
33
+
34
+ def ensure_pi_workspace_dir(repo_root: Path | None = None) -> str:
35
+ """
36
+ Resolve ``PI_WORKSPACE_DIR``, create it, and sync ``os.environ``.
37
+
38
+ - Explicit ``PI_WORKSPACE_DIR`` wins.
39
+ - Else use the Docker mount only when running in a container.
40
+ - Else ``{repo_root}/workspace`` (local Windows/macOS/Linux dev).
41
+ """
42
+ root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
43
+ raw = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
44
+ if raw:
45
+ path = Path(raw)
46
+ elif _pi_running_in_container() and _DOCKER_WORKSPACE.is_dir():
47
+ path = _DOCKER_WORKSPACE
48
+ else:
49
+ path = root / "workspace"
50
+ path.mkdir(parents=True, exist_ok=True)
51
+ resolved = str(path.resolve())
52
+ os.environ["PI_WORKSPACE_DIR"] = resolved
53
+ return resolved
54
+
55
+
56
+ def _pi_runtime_needs_tmp_log_dirs() -> bool:
57
+ """True when CSV logs must not live under read-only ``PI_WORKDIR`` (ECS/HF images)."""
58
+ profile = os.environ.get("PI_DEPLOYMENT_PROFILE", "").strip().lower()
59
+ if profile in ("aws-ecs", "hf-space"):
60
+ return True
61
+ return _pi_running_in_container()
62
+
63
+
64
def ensure_pi_writable_log_dirs() -> None:
    """
    Point access/usage/feedback CSV logs at ``/tmp`` for container deployments.

    ``tools.config`` resolves relative ``logs/`` under ``PI_WORKDIR``, which is
    read-only in the Pi runtime image; ``/tmp`` is allowed by
    ``ensure_folder_within_app_directory`` for absolute paths.

    The redirect applies when ``_pi_runtime_needs_tmp_log_dirs()`` is true:
    the ``aws-ecs`` / ``hf-space`` deployment profiles, or any detected
    container. In that case the three ``*_LOGS_FOLDER`` variables are always
    overridden (S3/task env files often set ``logs/`` from the main app
    template). Outside those runtimes nothing is created or changed.

    The previous implementation returned early whenever no container was
    detected, which made its ``setdefault`` branch unreachable and skipped the
    profile-based override; in-container behaviour is unchanged.
    """
    if not _pi_runtime_needs_tmp_log_dirs():
        return

    targets = (
        ("ACCESS_LOGS_FOLDER", _DOCKER_ACCESS_LOGS),
        ("USAGE_LOGS_FOLDER", _DOCKER_USAGE_LOGS),
        ("FEEDBACK_LOGS_FOLDER", _DOCKER_FEEDBACK_LOGS),
    )
    # Create every directory first so a failure leaves the environment untouched.
    for _, folder in targets:
        folder.mkdir(parents=True, exist_ok=True)
    for env_name, folder in targets:
        # Trailing slash matches the folder-style values the original code exported.
        os.environ[env_name] = folder.as_posix() + "/"
90
+
91
+
92
+ def ensure_pi_upload_root(repo_root: Path | None = None) -> str:
93
+ """
94
+ Resolve where Gradio stores ``gr.File`` uploads and sync ``os.environ``.
95
+
96
+ Must run before ``import gradio`` so ``GRADIO_TEMP_DIR`` matches validation
97
+ in ``redaction_prompt._resolve_and_validate_upload_path``.
98
+
99
+ - Explicit ``PI_UPLOAD_ROOT`` wins.
100
+ - Else ``GRADIO_TEMP_DIR`` if already set.
101
+ - Else Docker ``/tmp/gradio`` when that directory exists.
102
+ - Else ``{repo}/workspace/.gradio_uploads`` (local dev; stays inside the app tree
103
+ so ``tools.config.ensure_folder_within_app_directory`` accepts ``GRADIO_TEMP_DIR``).
104
+ """
105
+ root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
106
+ raw = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
107
+ if raw:
108
+ path = Path(raw)
109
+ else:
110
+ gradio_temp = (os.environ.get("GRADIO_TEMP_DIR") or "").strip()
111
+ if gradio_temp:
112
+ path = Path(gradio_temp)
113
+ elif _pi_running_in_container() and _DOCKER_UPLOAD_ROOT.is_dir():
114
+ path = _DOCKER_UPLOAD_ROOT
115
+ else:
116
+ path = root / "workspace" / ".gradio_uploads"
117
+ path.mkdir(parents=True, exist_ok=True)
118
+ resolved = str(path.resolve())
119
+ os.environ["PI_UPLOAD_ROOT"] = resolved
120
+ if not (os.environ.get("GRADIO_TEMP_DIR") or "").strip():
121
+ os.environ["GRADIO_TEMP_DIR"] = resolved
122
+ return resolved
123
+
124
+
125
def _partnership_template_exists(repo: Path) -> bool:
    """Return True when the partnership prompt template is a file under *repo*."""
    template = repo / _PARTNERSHIP_TEMPLATE
    return template.is_file()
127
+
128
+
129
def ensure_pi_workdir(repo_root: Path | None = None) -> str:
    """
    Resolve ``PI_WORKDIR`` (monorepo root for skills/ and Pi RPC cwd).

    Resolution order:

    1. ``PI_WORKDIR`` from the environment, when the partnership prompt
       template exists there.
    2. The in-image checkout, when running in a container and the template
       exists there.
    3. The checkout root (``agent-redact/pi`` → parents[2]) or *repo_root*.

    Returns the chosen path as a string and writes it back to
    ``os.environ["PI_WORKDIR"]``.
    """
    checkout_root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
    configured = (os.environ.get("PI_WORKDIR") or "").strip()

    if configured and _partnership_template_exists(Path(configured)):
        workdir = str(Path(configured).resolve())
    elif _pi_running_in_container() and _partnership_template_exists(
        _DOCKER_PI_WORKDIR
    ):
        workdir = str(_DOCKER_PI_WORKDIR.resolve())
    else:
        workdir = str(checkout_root)

    os.environ["PI_WORKDIR"] = workdir
    return workdir
152
+
153
+
154
def pi_repo_root_path(repo_root: Path | None = None) -> Path:
    """Resolve ``PI_WORKDIR`` via :func:`ensure_pi_workdir` and return it as a :class:`~pathlib.Path`."""
    workdir = ensure_pi_workdir(repo_root)
    return Path(workdir)
157
+
158
+
159
+ def load_pi_agent_env_file(config_path: str | Path | None = None) -> bool:
160
+ """
161
+ Load ``config/pi_agent.env`` into ``os.environ`` (does not override existing vars).
162
+
163
+ Must run before ``import pi_agent_config`` so module-level defaults see the file.
164
+ """
165
+ path = Path(config_path or os.environ.get("APP_CONFIG_PATH", "")).expanduser()
166
+ if not path.is_file():
167
+ return False
168
+ load_dotenv(path, override=False)
169
+ return True
170
+
171
+
172
def ensure_pi_config_env(repo_root: Path | None = None) -> str:
    """
    Prepare process env so ``tools.config`` loads the Pi agent env file.

    Must run before any ``from pi_agent_config import ...`` or ``tools.config``
    import that depends on Pi env vars. Safe to call multiple times.

    Steps, in order: default ``APP_TYPE`` to ``pi``; default
    ``APP_CONFIG_PATH`` to ``{root}/config/pi_agent.env`` when blank; load that
    env file (existing variables win); resolve workdir, workspace and upload
    root; redirect log folders; sync workspace skills.

    Returns the effective ``APP_CONFIG_PATH``.
    """
    root = (repo_root or Path(__file__).resolve().parents[2]).resolve()

    os.environ.setdefault("APP_TYPE", "pi")
    current_config = os.environ.get("APP_CONFIG_PATH", "")
    if not current_config.strip():
        os.environ["APP_CONFIG_PATH"] = str(root / "config" / "pi_agent.env")
    load_pi_agent_env_file()

    # Order matters: the env file must be loaded before paths are resolved.
    for resolve_dir in (
        ensure_pi_workdir,
        ensure_pi_workspace_dir,
        ensure_pi_upload_root,
    ):
        resolve_dir(root)
    ensure_pi_writable_log_dirs()

    # Imported here, after the environment above is in place.
    from pi_workspace_skills import ensure_workspace_skills

    ensure_workspace_skills()
    return os.environ["APP_CONFIG_PATH"]
agent-redact/pi/gradio_app.py ADDED
The diff for this file is too large to render. See raw diff
 
agent-redact/pi/output_files.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browse and download files from the Pi agent shared workspace."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import gradio as gr
12
+ from bootstrap_pi_config import pi_repo_root_path
13
+ from pi_examples import gradio_example_allowed_paths
14
+ from session_logs import gradio_session_log_allowed_paths
15
+ from session_workspace import (
16
+ sanitize_session_id,
17
+ session_workspace_dir,
18
+ workspace_base_dir,
19
+ )
20
+
21
# Directory read from ``PI_FILEEXPLORER_STUB_DIR`` (default ``/tmp``).
# NOTE(review): presumably a placeholder root for refreshing the file explorer — confirm against usages.
REFRESH_STUB_DIR = Path(os.environ.get("PI_FILEEXPLORER_STUB_DIR", "/tmp"))

# Folder names under ``.../review/`` where Pass 1 deliverables are saved (see partnership prompt).
_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES = ("output_review_final", "output_final")
# Default staging folder name; overridable via ``PI_FINAL_DOWNLOAD_FOLDER``.
_DEFAULT_FINAL_DOWNLOAD_FOLDER = "output_final_download"
# NOTE(review): not used in the visible part of this module; presumably a minimum length for Gradio temp-name prefixes — confirm.
_DEFAULT_GRADIO_PREFIX_MIN_LEN = 16
27
+
28
+
29
def final_output_folder_names() -> frozenset[str]:
    """Return the folder names treated as final-deliverable directories.

    Reads the comma-separated ``PI_FINAL_OUTPUT_FOLDER_NAMES`` environment
    variable, ignoring blank entries. When the variable is unset or yields no
    names, the built-in defaults are returned instead.
    """
    configured = os.environ.get("PI_FINAL_OUTPUT_FOLDER_NAMES", "").strip()
    pieces = (piece.strip() for piece in configured.split(","))
    chosen = frozenset(piece for piece in pieces if piece)
    if chosen:
        return chosen
    return frozenset(_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES)
36
+
37
+
38
def _is_under_final_output_dir(relative_path: Path) -> bool:
    """Report whether *relative_path* passes through ``review/<final folder>``.

    A match requires a path component equal to ``review`` that is immediately
    followed by a component listed in :func:`final_output_folder_names`.
    """
    segments = relative_path.parts
    accepted = final_output_folder_names()
    return any(
        parent == "review" and child in accepted
        for parent, child in zip(segments, segments[1:])
    )
46
+
47
+
48
def final_download_folder_name() -> str:
    """Return the staging folder name used for downloads.

    Taken from ``PI_FINAL_DOWNLOAD_FOLDER``; an unset, empty, or
    whitespace-only value falls back to the built-in default.
    """
    configured = os.environ.get(
        "PI_FINAL_DOWNLOAD_FOLDER", _DEFAULT_FINAL_DOWNLOAD_FOLDER
    )
    if configured:
        cleaned = configured.strip()
        if cleaned:
            return cleaned
    return _DEFAULT_FINAL_DOWNLOAD_FOLDER
52
+
53
+
54
def final_download_dir(session_hash: str | None = None) -> Path:
    """Return the staging directory that holds ``gr.File`` downloads.

    With a non-blank *session_hash* the result is
    ``<workspace base>/<sanitized session id>/<download folder>``; otherwise
    the download folder sits directly under the resolved workspace base.
    """
    workspace = workspace_base_dir().resolve()
    staging_name = final_download_folder_name()
    has_session = bool(session_hash) and bool(str(session_hash).strip())
    if has_session:
        return workspace / sanitize_session_id(str(session_hash)) / staging_name
    return workspace / staging_name
67
+
68
+
69
+ def _remove_path(path: Path) -> None:
70
+ """Best-effort delete (handles read-only / OneDrive locks on Windows)."""
71
+ try:
72
+ if path.is_dir() and not path.is_symlink():
73
+ shutil.rmtree(path, ignore_errors=True)
74
+ else:
75
+ path.unlink(missing_ok=True)
76
+ except OSError:
77
+ if not path.exists():
78
+ return
79
+ try:
80
+ os.chmod(path, 0o666)
81
+ if path.is_dir() and not path.is_symlink():
82
+ shutil.rmtree(path, ignore_errors=True)
83
+ else:
84
+ path.unlink(missing_ok=True)
85
+ except OSError:
86
+ pass
87
+
88
+
89
def _reset_download_dir(download_dir: Path) -> None:
    """Empty *download_dir* in place, creating it first when missing.

    The directory itself is kept; only its direct children are handed to
    :func:`_remove_path`.
    """
    download_dir.mkdir(parents=True, exist_ok=True)
    stale_entries = download_dir.iterdir()
    for entry in stale_entries:
        _remove_path(entry)
94
+
95
+
96
def _gradio_prefix_min_len() -> int:
    """Return the minimum length of a Gradio cache-id filename prefix.

    Read from ``PI_GRADIO_FILENAME_PREFIX_MIN_LEN``. Values below 1 are raised
    to 1; a non-integer value falls back to the built-in default.
    """
    fallback = _DEFAULT_GRADIO_PREFIX_MIN_LEN
    configured = os.environ.get("PI_GRADIO_FILENAME_PREFIX_MIN_LEN", str(fallback))
    try:
        parsed = int(configured)
    except ValueError:
        return fallback
    return parsed if parsed > 1 else 1
105
+
106
+
107
def strip_gradio_cache_prefix(filename: str) -> str:
    """Drop a leading ``<alphanumeric id>_`` cache prefix from *filename*.

    The id must be at least :func:`_gradio_prefix_min_len` characters long and
    be followed by an underscore and a non-empty remainder. Names that do not
    match are returned unchanged.
    """
    min_len = _gradio_prefix_min_len()
    found = re.match(rf"^[A-Za-z0-9]{{{min_len},}}_(.+)$", filename)
    return found.group(1) if found else filename
119
+
120
+
121
+ def _file_created_timestamp(path: Path) -> float:
122
+ stat = path.stat()
123
+ birth = getattr(stat, "st_birthtime", None)
124
+ if birth is not None and birth > 0:
125
+ return float(birth)
126
+ return float(stat.st_mtime)
127
+
128
+
129
def _collect_raw_final_output_files(
    session_hash: str | None = None,
) -> list[Path] | None:
    """Collect deliverable files from final-output folders in the workspace.

    Walks the session workspace recursively and keeps files that look like
    files by name, sit under ``review/<final output folder>``, are not inside
    the download staging folder, and resolve to a location inside the
    workspace root.

    Args:
        session_hash: Gradio session hash selecting the workspace, or ``None``.

    Returns:
        The matching paths, or ``None`` when the workspace is missing, nothing
        matches, or the walk fails with ``OSError``.
    """
    root = workspace_root_from(session_hash)
    if not root.is_dir():
        return None

    download_folder = final_download_folder_name()
    candidates: list[Path] = []
    try:
        # Compare against the resolved root (as the redacted-PDF search does)
        # so a relative or symlinked workspace path does not reject every file.
        resolved_root = root.resolve()
        for path in root.rglob("*"):
            if not path.is_file() or not _is_file_path(path.name):
                continue
            try:
                relative = path.relative_to(root)
            except ValueError:
                continue
            if download_folder in relative.parts:
                continue
            if not _is_under_final_output_dir(relative):
                continue
            try:
                # Skip entries whose real location escapes the workspace.
                path.resolve(strict=False).relative_to(resolved_root)
            except ValueError:
                continue
            candidates.append(path)
    except OSError:
        return None

    return candidates or None
165
+
166
+
167
def build_final_download_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """Stage cleaned deliverables in the session download folder.

    The staging folder is emptied, then each collected file is copied in under
    its name with any Gradio cache prefix removed. When several files share a
    cleaned name, the one with the latest creation timestamp is kept.

    Returns:
        Resolved paths of the staged copies sorted by name, or ``None`` when
        there is nothing to stage.
    """
    collected = _collect_raw_final_output_files(session_hash)
    if not collected:
        return None

    staging_dir = final_download_dir(session_hash)
    _reset_download_dir(staging_dir)

    # Oldest first, so a newer file replaces an older one with the same name.
    winners: dict[str, Path] = {
        strip_gradio_cache_prefix(candidate.name): candidate
        for candidate in sorted(collected, key=_file_created_timestamp)
    }

    exported: list[str] = []
    for clean_name in sorted(winners):
        target = staging_dir / clean_name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(winners[clean_name], target)
        exported.append(str(target.resolve()))
    return exported if exported else None
196
+
197
+
198
def collect_final_output_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """Return staged deliverable paths for download and export.

    Thin wrapper around :func:`build_final_download_files`.
    """
    staged_paths = build_final_download_files(session_hash)
    return staged_paths
203
+
204
+
205
+ _REDACTED_PDF_SUFFIX = "_redacted.pdf"
206
+ _REVIEW_PDF_MARKER = "_redactions_for_review"
207
+ _PREVIEW_DIRNAME = ".pi/preview"
208
+ _PREVIEW_FILENAME = "latest_redacted.pdf"
209
+ _MIN_PDF_BYTES = 64
210
+
211
+
212
def _is_redacted_pdf_candidate(path: Path) -> bool:
    """Return True for deliverable redacted-PDF names.

    The lower-cased file name must end with the redacted-PDF suffix and must
    not contain the review-copy marker.
    """
    lowered = path.name.lower()
    return lowered.endswith(_REDACTED_PDF_SUFFIX) and _REVIEW_PDF_MARKER not in lowered
220
+
221
+
222
def _is_valid_pdf_file(path: Path, *, min_bytes: int = _MIN_PDF_BYTES) -> bool:
    """Return True when *path* is a plausibly complete PDF.

    The path must be a regular file of at least *min_bytes* bytes whose first
    five bytes are ``%PDF-``. Any ``OSError`` while checking yields False.
    """
    try:
        if not path.is_file() or path.stat().st_size < min_bytes:
            return False
        with path.open("rb") as stream:
            header = stream.read(5)
    except OSError:
        return False
    return header.startswith(b"%PDF-")
233
+
234
+
235
def _find_newest_valid_redacted_pdf(session_hash: str | None) -> Path | None:
    """Return the most recently created valid redacted PDF in the workspace.

    Candidates must pass the name check and the PDF validity check, and must
    resolve to a location inside the workspace root. Returns ``None`` when the
    workspace is missing, nothing qualifies, or the walk raises ``OSError``.
    """
    root = workspace_root_from(session_hash)
    if not root.is_dir():
        return None

    best_path: Path | None = None
    best_stamp = 0.0
    try:
        for candidate in root.rglob("*"):
            if not (candidate.is_file() and _is_redacted_pdf_candidate(candidate)):
                continue
            if not _is_valid_pdf_file(candidate):
                continue
            try:
                candidate.resolve(strict=False).relative_to(root.resolve())
            except ValueError:
                continue
            stamp = _file_created_timestamp(candidate)
            if best_path is None or stamp > best_stamp:
                best_path, best_stamp = candidate, stamp
    except OSError:
        return None

    return best_path
259
+
260
+
261
def _staged_preview_pdf_path(session_hash: str | None) -> Path:
    """Return the fixed preview-PDF location under the session workspace."""
    preview_dir = workspace_root_from(session_hash).joinpath(".pi", "preview")
    return preview_dir / _PREVIEW_FILENAME
264
+
265
+
266
def _stage_preview_pdf(source: Path, session_hash: str | None) -> Path:
    """Copy *source* to the stable preview path and return its resolved path.

    The copy is written to a sibling ``.tmp`` file first and then moved over
    the destination, so the preview path is replaced in a single rename.
    """
    target = _staged_preview_pdf_path(session_hash)
    target.parent.mkdir(parents=True, exist_ok=True)
    scratch = target.parent / f"{target.name}.tmp"
    shutil.copy2(source, scratch)
    scratch.replace(target)
    return target.resolve()
280
+
281
+
282
def latest_redacted_pdf_path(session_hash: str | None = None) -> str | None:
    """Return the preview path for the newest valid redacted PDF.

    When no source PDF is found, an existing valid staged preview is returned,
    or ``None`` if there is none. Otherwise the staged preview is reused when
    it is at least as recent as the source, has the same size, and is itself a
    valid PDF; in every other case the source is staged afresh.
    """
    newest = _find_newest_valid_redacted_pdf(session_hash)
    preview = _staged_preview_pdf_path(session_hash)

    if newest is None:
        return str(preview.resolve()) if _is_valid_pdf_file(preview) else None

    try:
        preview_is_current = (
            preview.is_file()
            and _file_created_timestamp(newest) <= _file_created_timestamp(preview)
            and preview.stat().st_size == newest.stat().st_size
            and _is_valid_pdf_file(preview)
        )
        if preview_is_current:
            return str(preview.resolve())
    except OSError:
        # Fall through and restage when the freshness check itself fails.
        pass

    return str(_stage_preview_pdf(newest, session_hash))
310
+
311
+
312
def workspace_root_from(session_hash: str | None = None) -> Path:
    """Return the workspace directory for *session_hash*.

    A missing or blank hash yields the resolved workspace base; otherwise the
    stripped hash is passed to ``session_workspace_dir``.
    """
    session_key = str(session_hash).strip() if session_hash else ""
    if session_key:
        return session_workspace_dir(session_key)
    return workspace_base_dir().resolve()
317
+
318
+
319
+ def _is_file_path(path: str) -> bool:
320
+ if not path or not path.strip():
321
+ return False
322
+ name = Path(path.rstrip("/\\")).name
323
+ if not name or "." not in name:
324
+ return False
325
+ ext = name.rsplit(".", 1)[-1]
326
+ return bool(ext and len(ext) <= 10 and ext.isalnum())
327
+
328
+
329
+ def _is_safe_workspace_relative_path(path: str) -> bool:
330
+ """Reject absolute paths and traversal segments before joining under workspace."""
331
+ if not path or not path.strip():
332
+ return False
333
+ candidate = Path(path.strip())
334
+ if candidate.is_absolute() or candidate.anchor:
335
+ return False
336
+ return all(part not in ("", ".", "..") for part in candidate.parts)
337
+
338
+
339
+ def _resolve_under_workspace(
340
+ path: str,
341
+ *,
342
+ workspace_root: Path | None = None,
343
+ ) -> Path | None:
344
+ if not path or not path.strip():
345
+ return None
346
+
347
+ root = (workspace_root or workspace_base_dir()).resolve()
348
+ stripped = path.strip()
349
+ try:
350
+ user_path = Path(stripped)
351
+ if user_path.is_absolute():
352
+ # Gradio FileExplorer may return absolute paths already under root_dir.
353
+ resolved = user_path.resolve(strict=False)
354
+ elif _is_safe_workspace_relative_path(stripped):
355
+ resolved = root.joinpath(*user_path.parts).resolve(strict=False)
356
+ else:
357
+ return None
358
+ resolved.relative_to(root)
359
+ except (ValueError, OSError):
360
+ return None
361
+ return resolved if resolved.is_file() else None
362
+
363
+
364
def load_workspace_output_files(session_hash: str = ""):
    """Return a ``gr.FileExplorer`` rooted at the session workspace.

    The workspace directory is created first when it does not exist. An empty
    *session_hash* selects the shared workspace base.
    """
    explorer_root = workspace_root_from(session_hash or None)
    explorer_root.mkdir(parents=True, exist_ok=True)
    root_dir = str(explorer_root)
    return gr.FileExplorer(root_dir=root_dir)
368
+
369
+
370
def refresh_workspace_output_files_stub():
    """Return a ``gr.FileExplorer`` rooted at the resolved stub directory."""
    stub_root = str(REFRESH_STUB_DIR.resolve())
    return gr.FileExplorer(root_dir=stub_root)
372
+
373
+
374
def gradio_allowed_paths() -> list[str]:
    """Return the de-duplicated list of paths Gradio is allowed to serve.

    Includes the resolved workspace base, Pi repo root, stub directory and
    ``/tmp``, followed by the example and session-log paths as supplied by
    their helpers. First-seen order is preserved.
    """
    allowed: list[str] = []

    def _remember(entry: str) -> None:
        if entry not in allowed:
            allowed.append(entry)

    local_roots = (
        workspace_base_dir(),
        str(pi_repo_root_path()),
        REFRESH_STUB_DIR,
        "/tmp",
    )
    for root in local_roots:
        try:
            normalized = str(Path(root).resolve())
        except OSError:
            # Skip roots that cannot be resolved.
            continue
        _remember(normalized)
    for extra in gradio_example_allowed_paths():
        _remember(extra)
    for extra in gradio_session_log_allowed_paths():
        _remember(extra)
    return allowed
396
+
397
+
398
def refresh_workspace_panel(
    session_hash: str = "",
) -> tuple[Any, list[str] | None]:
    """Return a fresh file explorer together with the staged deliverables."""
    explorer = load_workspace_output_files(session_hash)
    deliverables = collect_final_output_files(session_hash)
    return explorer, deliverables
406
+
407
+
408
def workspace_files_download_fn(
    selected: list[str] | None,
    session_hash: str = "",
) -> list[str] | None:
    """Filter *selected* down to existing files inside the session workspace.

    Entries that do not look like files by name, or that do not resolve to a
    file under the workspace, are dropped. Returns ``None`` when nothing is
    selected or nothing survives the filter.
    """
    if not selected:
        return None
    session_root = workspace_root_from(session_hash or None)
    resolved_items = (
        _resolve_under_workspace(item, workspace_root=session_root)
        for item in selected
        if _is_file_path(item)
    )
    allowed = [str(found) for found in resolved_items if found is not None]
    return allowed or None
agent-redact/pi/pi_agent_config.py ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate Pi agent models.json and settings.json at runtime."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ DEPLOYMENT_LOCAL = "local-docker"
11
+ DEPLOYMENT_HF_SPACE = "hf-space"
12
+ DEPLOYMENT_AWS_ECS = "aws-ecs"
13
+
14
+
15
def resolve_agent_dir() -> Path:
    """Return the directory holding Pi ``models.json`` / ``settings.json``.

    ``PI_CODING_AGENT_DIR`` wins when set. Otherwise the HF Space and AWS ECS
    deployment profiles use ``/tmp/pi-agent`` and every other profile uses
    ``~/.pi/agent``.
    """
    override = (os.environ.get("PI_CODING_AGENT_DIR") or "").strip()
    if override:
        return Path(override)
    deployment = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
    # These profiles often run on a read-only root FS, so use /tmp instead.
    read_only_root = deployment in (DEPLOYMENT_HF_SPACE, DEPLOYMENT_AWS_ECS)
    if read_only_root:
        return Path("/tmp/pi-agent")
    return Path.home() / ".pi" / "agent"
25
+
26
+
27
+ # Back-compat alias; prefer resolve_agent_dir() when env may change after import.
28
+ AGENT_DIR = resolve_agent_dir()
29
+ TEMPLATE_DIR = Path(__file__).resolve().parent / "agent"
30
+ SETTINGS_TEMPLATE = TEMPLATE_DIR / "settings.json"
31
+
32
+ DEPLOYMENT_PROFILE = (
33
+ os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
34
+ )
35
+
36
+
37
def pi_max_retries() -> int:
    """Return the max retry count for Pi auto-retry and Gradio quota backoff.

    ``PI_QUOTA_RETRY_ATTEMPTS`` is consulted first, then ``PI_MAX_RETRIES``.
    Blank or non-integer values are skipped rather than raising, and negative
    values are clamped to 0.

    Returns:
        The first usable configured value, or 5 when neither variable holds one.
    """
    for key in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
        raw = (os.environ.get(key) or "").strip()
        if not raw:
            continue
        try:
            return max(0, int(raw))
        except ValueError:
            # Ignore a malformed value and try the next variable.
            continue
    return 5
45
+
46
+
47
def _apply_retry_settings(
    settings: dict[str, Any],
    *,
    provider: str,
) -> None:
    """Write the ``retry`` block into *settings* in place.

    Gemini, Bedrock, and the HF Space / AWS ECS profiles use delays derived
    from ``PI_QUOTA_RETRY_DELAY_S`` (overridable through the
    ``PI_BEDROCK_*`` / ``PI_GEMINI_*`` delay variables); everything else uses
    a 2000 ms base delay and a 60000 ms maximum.
    """
    retries = pi_max_retries()
    on_bedrock = provider == PROVIDER_BEDROCK
    long_delays = (
        provider == PROVIDER_GEMINI
        or on_bedrock
        or is_hf_space_profile()
        or is_aws_ecs_profile()
    )

    if long_delays:
        fallback_base_ms = int(os.environ.get("PI_QUOTA_RETRY_DELAY_S", "60")) * 1000
        fallback_max_ms = int(fallback_base_ms * 1.5)
        bedrock_env = on_bedrock or (
            is_aws_ecs_profile() and not is_hf_space_profile()
        )
        env_prefix = "PI_BEDROCK" if bedrock_env else "PI_GEMINI"
        # The Gemini variables double as the fallback for the Bedrock ones.
        base_ms = int(
            os.environ.get(f"{env_prefix}_RETRY_BASE_DELAY_MS")
            or os.environ.get("PI_GEMINI_RETRY_BASE_DELAY_MS", str(fallback_base_ms))
        )
        max_ms = int(
            os.environ.get(f"{env_prefix}_RETRY_MAX_DELAY_MS")
            or os.environ.get("PI_GEMINI_RETRY_MAX_DELAY_MS", str(fallback_max_ms))
        )
    else:
        base_ms, max_ms = 2000, 60000

    provider_block = {
        "timeoutMs": 3600000,
        "maxRetries": retries,
        "maxRetryDelayMs": max_ms,
    }
    settings["retry"] = {
        "enabled": True,
        "maxRetries": retries,
        "baseDelayMs": base_ms,
        "provider": provider_block,
    }
89
+
90
+
91
+ PROVIDER_LLAMA = "llama-cpp"
92
+ PROVIDER_GEMINI = "google-gemini"
93
+ PROVIDER_BEDROCK = "amazon-bedrock"
94
+
95
+ PROVIDER_LABELS: dict[str, str] = {
96
+ PROVIDER_LLAMA: "Local (llama-cpp)",
97
+ PROVIDER_GEMINI: "Gemini",
98
+ PROVIDER_BEDROCK: "AWS Bedrock",
99
+ }
100
+
101
+
102
def is_hf_space_profile() -> bool:
    """Return True when ``PI_DEPLOYMENT_PROFILE`` selects the HF Space profile."""
    configured = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return configured.strip().lower() == DEPLOYMENT_HF_SPACE
105
+
106
+
107
def is_aws_ecs_profile() -> bool:
    """Return True when ``PI_DEPLOYMENT_PROFILE`` selects the AWS ECS profile."""
    configured = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return configured.strip().lower() == DEPLOYMENT_AWS_ECS
110
+
111
+
112
def uses_split_redaction_backend() -> bool:
    """Report whether Pi and doc_redaction are treated as separate backends.

    ``PI_REDACTION_SPLIT_BACKEND`` forces the answer when it holds a
    recognised true/false word. Otherwise the result is True for the HF Space
    and AWS ECS deployment profiles and False for any other profile.
    """
    override = (os.environ.get("PI_REDACTION_SPLIT_BACKEND") or "").strip().lower()
    if override in ("1", "true", "yes", "on"):
        return True
    if override in ("0", "false", "no", "off"):
        return False
    if is_hf_space_profile():
        return True
    return is_aws_ecs_profile()
125
+
126
+
127
def resolve_llama_base_url() -> str:
    """Return the OpenAI-compatible base URL for the ``llama-cpp`` provider.

    Reads ``PI_LLAMA_BASE_URL`` first, then the legacy aliases
    ``PI_LLAMA_MODE_BASE_URL`` and ``PI_LLAMA_MODE__BASE_URL``. The first
    non-blank value wins; trailing slashes are removed and ``/v1`` is appended
    when missing.

    Returns:
        The normalized URL, or ``http://llama-inference:8080/v1`` when no
        variable is set.
    """
    for key in (
        "PI_LLAMA_BASE_URL",
        "PI_LLAMA_MODE_BASE_URL",
        # Documented legacy alias that was previously never consulted.
        "PI_LLAMA_MODE__BASE_URL",
    ):
        raw = (os.environ.get(key) or "").strip().rstrip("/")
        if raw:
            return raw if raw.endswith("/v1") else f"{raw}/v1"
    return "http://llama-inference:8080/v1"
142
+
143
+
144
+ LLAMA_BASE_URL = resolve_llama_base_url()
145
+ LLAMA_MODEL_ID = os.environ.get("PI_LLAMA_MODEL_ID", "unsloth/Qwen3.6-27B-MTP-GGUF")
146
+ LLAMA_CONTEXT = int(os.environ.get("PI_LLAMA_CONTEXT_WINDOW", "114688"))
147
+ LLAMA_MAX_TOKENS = int(os.environ.get("PI_LLAMA_MAX_TOKENS", "32768"))
148
+
149
+ GEMINI_MODELS: tuple[tuple[str, str, int, bool], ...] = (
150
+ ("gemini-flash-lite-latest", "Gemini Flash Lite", 1048576, False),
151
+ ("gemini-flash-latest", "Gemini Flash", 1048576, True),
152
+ ("gemini-pro-latest", "Gemini Pro", 1048576, True),
153
+ )
154
+
155
+ BEDROCK_MODELS: tuple[tuple[str, str, int, bool], ...] = (
156
+ (
157
+ "anthropic.claude-sonnet-4-6",
158
+ "Anthropic Claude Sonnet 4.6 (Bedrock)",
159
+ 1000000,
160
+ True,
161
+ ),
162
+ ("amazon.nova-pro-v1:0", "Amazon Nova Pro (Bedrock)", 300000, False),
163
+ (
164
+ "nvidia.nemotron-super-3-120b",
165
+ "NVIDIA Nemotron Super 3 120B (Bedrock)",
166
+ 262000,
167
+ False,
168
+ ),
169
+ ("mistral.devstral-2-123b", "Mistral Devstral 2 123B (Bedrock)", 256000, False),
170
+ )
171
+
172
+ PROVIDER_MODELS: dict[str, list[str]] = {
173
+ PROVIDER_LLAMA: [LLAMA_MODEL_ID],
174
+ PROVIDER_GEMINI: [model_id for model_id, _, _, _ in GEMINI_MODELS],
175
+ PROVIDER_BEDROCK: [model_id for model_id, _, _, _ in BEDROCK_MODELS],
176
+ }
177
+
178
+ DEFAULT_MODEL_BY_PROVIDER: dict[str, str] = {
179
+ PROVIDER_LLAMA: LLAMA_MODEL_ID,
180
+ PROVIDER_GEMINI: GEMINI_MODELS[0][0], # Gemini Flash Lite
181
+ PROVIDER_BEDROCK: "anthropic.claude-sonnet-4-6",
182
+ }
183
+
184
+
185
def get_default_provider() -> str:
    """Return the current default Pi provider, evaluated on every call.

    The HF Space profile always yields Gemini. Otherwise a
    ``PI_DEFAULT_PROVIDER`` value that names a known provider is used; failing
    that, the AWS ECS profile yields Bedrock and everything else llama-cpp.
    """
    if is_hf_space_profile():
        return PROVIDER_GEMINI
    requested = (os.environ.get("PI_DEFAULT_PROVIDER") or "").strip()
    if requested in PROVIDER_MODELS:
        return requested
    return PROVIDER_BEDROCK if is_aws_ecs_profile() else PROVIDER_LLAMA
195
+
196
+
197
+ DEFAULT_PROVIDER = get_default_provider()
198
+
199
+
200
def _catalog_contains_model(model_id: str, provider: str) -> bool:
    """Return True when *model_id* is listed in the catalog for *provider*.

    An unknown provider has an empty catalog, so the result is False.
    """
    listed = PROVIDER_MODELS.get(provider, ())
    return model_id in listed
203
+
204
+
205
+ _env_default_model = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
206
+ if _env_default_model and (
207
+ DEFAULT_PROVIDER == PROVIDER_LLAMA
208
+ or _catalog_contains_model(_env_default_model, DEFAULT_PROVIDER)
209
+ ):
210
+ DEFAULT_MODEL = _env_default_model
211
+ else:
212
+ DEFAULT_MODEL = DEFAULT_MODEL_BY_PROVIDER.get(DEFAULT_PROVIDER, LLAMA_MODEL_ID)
213
+
214
+
215
def llama_model_id() -> str:
    """Return the active llama-cpp model id.

    Uses the current ``PI_LLAMA_MODEL_ID`` value when it is non-blank, and the
    id captured at import time otherwise.
    """
    configured = os.environ.get("PI_LLAMA_MODEL_ID") or LLAMA_MODEL_ID
    cleaned = configured.strip()
    return cleaned if cleaned else LLAMA_MODEL_ID
220
+
221
+
222
def resolved_default_model(provider: str, *, override: str | None = None) -> str:
    """Pick the default model id for *provider*.

    Precedence: a non-blank *override*; then ``PI_DEFAULT_MODEL`` when it
    applies to the provider (for llama-cpp only while llama-cpp is the active
    default provider, for other providers only when listed in their catalog);
    then the built-in per-provider default, which for llama-cpp is
    :func:`llama_model_id`.
    """
    if override and override.strip():
        return override.strip()

    target = normalize_provider(provider)
    requested = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
    active = normalize_provider(get_default_provider())
    wants_llama = target == PROVIDER_LLAMA

    if requested:
        if wants_llama:
            env_applies = active == PROVIDER_LLAMA
        else:
            env_applies = _catalog_contains_model(requested, target)
        if env_applies:
            return requested

    if wants_llama:
        return llama_model_id()
    return DEFAULT_MODEL_BY_PROVIDER.get(target, LLAMA_MODEL_ID)
243
+
244
+
245
def normalize_backend_model(provider: str, model_id: str | None) -> str:
    """Resolve a UI/backend model selection to a concrete model id.

    A missing or blank selection yields the provider default. llama-cpp
    accepts any non-blank id; every other provider must match its model list,
    otherwise the provider default is returned.
    """
    target = normalize_provider(provider)
    chosen = (model_id or default_model_for_provider(target)).strip()
    if not chosen:
        return default_model_for_provider(target)
    if target == PROVIDER_LLAMA or chosen in models_for_provider(target):
        return chosen
    return default_model_for_provider(target)
261
+
262
+
263
+ def _zero_cost() -> dict[str, int]:
264
+ return {"input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0}
265
+
266
+
267
def _model_entry(
    model_id: str,
    name: str,
    *,
    context_window: int,
    max_tokens: int,
    reasoning: bool,
    image_input: bool = True,
) -> dict[str, Any]:
    """Build one model record for a provider's ``models`` list.

    The ``input`` list is ``["text", "image"]`` unless *image_input* is False,
    in which case it is ``["text"]``. Cost is always the zero-cost mapping.
    """
    entry: dict[str, Any] = {
        "id": model_id,
        "name": name,
        "reasoning": reasoning,
    }
    entry["input"] = ["text", "image"] if image_input else ["text"]
    entry["contextWindow"] = context_window
    entry["maxTokens"] = max_tokens
    entry["cost"] = _zero_cost()
    return entry
286
+
287
+
288
def _llama_provider() -> dict[str, Any]:
    """Build the ``llama-cpp`` provider record with its single model entry.

    The model id is read at call time via :func:`llama_model_id`; the base
    URL, context window and token limit are the module-level values.
    """
    active_model = llama_model_id()
    compat_flags = {
        "supportsDeveloperRole": False,
        "supportsReasoningEffort": False,
        "supportsUsageInStreaming": False,
        "maxTokensField": "max_tokens",
    }
    local_model = _model_entry(
        active_model,
        f"Local ({active_model})",
        context_window=LLAMA_CONTEXT,
        max_tokens=LLAMA_MAX_TOKENS,
        reasoning=False,
    )
    return {
        "baseUrl": LLAMA_BASE_URL,
        "api": "openai-completions",
        "apiKey": "llama-cpp",
        "compat": compat_flags,
        "models": [local_model],
    }
310
+
311
+
312
def _gemini_provider() -> dict[str, Any]:
    """Build the Gemini provider record with one entry per catalog model.

    Every model is given a fixed ``max_tokens`` of 8192.
    """
    catalog: list[dict[str, Any]] = []
    for model_id, label, window, can_reason in GEMINI_MODELS:
        catalog.append(
            _model_entry(
                model_id,
                label,
                context_window=window,
                max_tokens=8192,
                reasoning=can_reason,
            )
        )
    return {
        "baseUrl": "https://generativelanguage.googleapis.com/v1beta",
        "api": "google-generative-ai",
        "apiKey": "GEMINI_API_KEY",
        "models": catalog,
    }
324
+
325
+
326
+ def _bedrock_region() -> str:
327
+ return (
328
+ os.environ.get("AWS_REGION")
329
+ or os.environ.get("AWS_DEFAULT_REGION")
330
+ or "eu-west-2"
331
+ )
332
+
333
+
334
+ _AWS_CREDENTIAL_ENV_KEYS: tuple[str, ...] = (
335
+ "AWS_ACCESS_KEY_ID",
336
+ "AWS_SECRET_ACCESS_KEY",
337
+ "AWS_SESSION_TOKEN",
338
+ "AWS_ACCESS_KEY",
339
+ "AWS_SECRET_KEY",
340
+ )
341
+ _AWS_PROFILE_ENV_KEYS: tuple[str, ...] = ("AWS_PROFILE", "PI_AWS_PROFILE")
342
+
343
+
344
+ def _env_flag(name: str, *, default: bool = False) -> bool:
345
+ raw = os.environ.get(name)
346
+ if raw is None:
347
+ return default
348
+ return raw.strip().lower() in {"1", "true", "yes", "on"}
349
+
350
+
351
+ def _strip_empty_env_vars(names: tuple[str, ...]) -> None:
352
+ for name in names:
353
+ if not (os.environ.get(name) or "").strip():
354
+ os.environ.pop(name, None)
355
+
356
+
357
+ def _mirror_legacy_aws_key_env_vars() -> None:
358
+ if not (os.environ.get("AWS_ACCESS_KEY_ID") or "").strip():
359
+ legacy = (os.environ.get("AWS_ACCESS_KEY") or "").strip()
360
+ if legacy:
361
+ os.environ["AWS_ACCESS_KEY_ID"] = legacy
362
+ if not (os.environ.get("AWS_SECRET_ACCESS_KEY") or "").strip():
363
+ legacy = (os.environ.get("AWS_SECRET_KEY") or "").strip()
364
+ if legacy:
365
+ os.environ["AWS_SECRET_ACCESS_KEY"] = legacy
366
+
367
+
368
+ def _has_explicit_aws_access_keys() -> bool:
369
+ access = (
370
+ os.environ.get("AWS_ACCESS_KEY_ID") or os.environ.get("AWS_ACCESS_KEY") or ""
371
+ ).strip()
372
+ secret = (
373
+ os.environ.get("AWS_SECRET_ACCESS_KEY")
374
+ or os.environ.get("AWS_SECRET_KEY")
375
+ or ""
376
+ ).strip()
377
+ return bool(access and secret)
378
+
379
+
380
+ def _aws_config_path() -> Path | None:
381
+ explicit = (os.environ.get("AWS_CONFIG_FILE") or "").strip()
382
+ if explicit:
383
+ path = Path(explicit).expanduser()
384
+ return path if path.is_file() else None
385
+ home = Path(os.environ.get("HOME", "/home/user"))
386
+ path = home / ".aws" / "config"
387
+ return path if path.is_file() else None
388
+
389
+
390
+ def _discover_aws_profile_from_config() -> str | None:
391
+ """Return an AWS profile name for Pi/Bedrock when only ~/.aws is mounted."""
392
+ explicit = (os.environ.get("PI_AWS_PROFILE") or "").strip()
393
+ if not explicit:
394
+ explicit = (os.environ.get("AWS_PROFILE") or "").strip()
395
+ if explicit:
396
+ return explicit
397
+
398
+ path = _aws_config_path()
399
+ if not path:
400
+ return None
401
+
402
+ current_profile: str | None = None
403
+ sso_profiles: list[str] = []
404
+ all_profiles: list[str] = []
405
+
406
+ for raw_line in path.read_text(encoding="utf-8").splitlines():
407
+ line = raw_line.strip()
408
+ if not line or line.startswith("#") or line.startswith(";"):
409
+ continue
410
+ if line == "[default]":
411
+ current_profile = "default"
412
+ all_profiles.append("default")
413
+ continue
414
+ if line.startswith("[profile ") and line.endswith("]"):
415
+ current_profile = line[len("[profile ") : -1].strip()
416
+ if current_profile:
417
+ all_profiles.append(current_profile)
418
+ continue
419
+ if current_profile and line.startswith("sso_session"):
420
+ sso_profiles.append(current_profile)
421
+
422
+ if sso_profiles:
423
+ return sso_profiles[0]
424
+ if "default" in all_profiles:
425
+ return "default"
426
+ return all_profiles[0] if all_profiles else None
427
+
428
+
429
def _region_from_aws_config(profile: str | None = None) -> str | None:
    """Read the ``region`` setting of a profile from the AWS config file.

    Args:
        profile: Profile to look up; when empty, the profile chosen by
            :func:`_discover_aws_profile_from_config` is used.

    Returns:
        The first non-empty region value found in that profile's section, or
        ``None`` when the file, the profile, or the setting is missing.
    """
    config_file = _aws_config_path()
    if not config_file:
        return None

    wanted = (profile or _discover_aws_profile_from_config() or "").strip()
    if not wanted:
        return None

    section: str | None = None
    for raw in config_file.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        if not entry or entry.startswith(("#", ";")):
            continue
        if entry == "[default]":
            section = "default"
        elif entry.startswith("[profile ") and entry.endswith("]"):
            section = entry[len("[profile ") : -1].strip()
        elif section == wanted and entry.startswith("region"):
            candidate = entry.partition("=")[2].strip()
            if candidate:
                return candidate
    return None
458
+
459
+
460
def _ensure_aws_region_env() -> None:
    """Set ``AWS_REGION`` and ``AWS_DEFAULT_REGION`` to one non-empty region.

    Blank values are cleared first. The region comes from the environment,
    then from the ``AWS_PROFILE`` section of the AWS config file, then from
    :func:`_bedrock_region`.
    """
    region_keys = ("AWS_REGION", "AWS_DEFAULT_REGION")
    _strip_empty_env_vars(region_keys)
    resolved = (
        os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or ""
    ).strip()
    if not resolved:
        active_profile = (os.environ.get("AWS_PROFILE") or "").strip()
        resolved = (_region_from_aws_config(active_profile) or "").strip()
        if not resolved:
            resolved = _bedrock_region()
    for key in region_keys:
        os.environ[key] = resolved
473
+
474
+
475
+ def _pi_bedrock_auth_visible() -> bool:
476
+ """True when Pi's amazon-bedrock provider would detect configured auth."""
477
+ if (os.environ.get("AWS_PROFILE") or "").strip():
478
+ return True
479
+ if _has_explicit_aws_access_keys():
480
+ return True
481
+ if (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip():
482
+ return True
483
+ return False
484
+
485
+
486
def _ensure_pi_bedrock_auth_env() -> None:
    """Expose a discovered AWS profile through ``AWS_PROFILE`` when needed.

    Does nothing if Bedrock auth is already visible in the environment.
    Otherwise the profile found by :func:`_discover_aws_profile_from_config`,
    if any, is written to ``AWS_PROFILE``.
    """
    if not _pi_bedrock_auth_visible():
        discovered = _discover_aws_profile_from_config()
        if discovered:
            os.environ["AWS_PROFILE"] = discovered
498
+
499
+
500
def configure_aws_credentials(
    *,
    session_access_key_id: str | None = None,
    session_secret_access_key: str | None = None,
    session_session_token: str | None = None,
) -> None:
    """
    Align Pi Bedrock AWS env with doc_redaction SSO/key priority.

    Mirrors ``tools/file_redaction.py``: when ``RUN_AWS_FUNCTIONS`` is enabled,
    prefer the default credential chain (SSO profile, instance role, etc.) over
    static env keys when ``PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS`` is true.
    Explicit UI session keys from **Apply backend** always win.

    Args:
        session_access_key_id: Access key entered for this session, if any.
        session_secret_access_key: Matching secret key; both must be non-blank
            for the session keys to be applied.
        session_session_token: Optional session token that accompanies the keys.
    """
    _strip_empty_env_vars(_AWS_CREDENTIAL_ENV_KEYS)
    _strip_empty_env_vars(_AWS_PROFILE_ENV_KEYS)
    _mirror_legacy_aws_key_env_vars()

    access_key = (session_access_key_id or "").strip()
    secret_key = (session_secret_access_key or "").strip()
    if access_key and secret_key:
        os.environ["AWS_ACCESS_KEY_ID"] = access_key
        os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key
        token = (session_session_token or "").strip()
        if token:
            os.environ["AWS_SESSION_TOKEN"] = token
        else:
            # A stale token from an earlier session would invalidate the new keys.
            os.environ.pop("AWS_SESSION_TOKEN", None)
        _ensure_aws_region_env()
        return

    run_aws = _env_flag("RUN_AWS_FUNCTIONS")
    prioritise_sso = _env_flag("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", default=True)

    if run_aws:
        if prioritise_sso:
            # Drop static keys so the default credential chain is used instead.
            for key in _AWS_CREDENTIAL_ENV_KEYS:
                os.environ.pop(key, None)
        # With SSO priority disabled, static env keys are kept; the call below
        # is then a no-op whenever those keys (or a profile) are already visible.
        _ensure_pi_bedrock_auth_env()

    # Propagate PI_AWS_PROFILE when only that alias is set (e.g. pi_agent.env).
    pi_profile = (os.environ.get("PI_AWS_PROFILE") or "").strip()
    if pi_profile and not (os.environ.get("AWS_PROFILE") or "").strip():
        os.environ["AWS_PROFILE"] = pi_profile

    _ensure_aws_region_env()
552
+
553
+
554
def _aws_credential_status() -> str:
    """Return a short label for the first AWS credential source that is available.

    Sources are checked in order: explicit access keys, ``AWS_PROFILE``, a Bedrock
    bearer token, a present AWS config path, then the ``RUN_AWS_FUNCTIONS`` flag.
    """
    profile_name = ""
    if _has_explicit_aws_access_keys():
        status = "access keys"
    elif profile_name := (os.environ.get("AWS_PROFILE") or "").strip():
        status = f"profile {profile_name}"
    elif (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip():
        status = "Bedrock bearer token"
    elif _aws_config_path():
        status = "SSO config mounted (profile not set)"
    elif _env_flag("RUN_AWS_FUNCTIONS"):
        status = "SSO/default chain (missing profile)"
    else:
        status = "missing"
    return status
567
+
568
+
569
def _bedrock_provider() -> dict[str, Any]:
    """Build the provider entry for Bedrock: regional base URL plus one model entry per ``BEDROCK_MODELS`` row."""
    region = _bedrock_region()
    base_url = f"https://bedrock-runtime.{region}.amazonaws.com"

    model_entries = []
    for model_id, name, ctx, reasoning in BEDROCK_MODELS:
        entry = _model_entry(
            model_id,
            name,
            context_window=ctx,
            max_tokens=8192,
            reasoning=reasoning,
        )
        model_entries.append(entry)

    return {
        "baseUrl": base_url,
        "api": "bedrock-converse-stream",
        "models": model_entries,
    }
585
+
586
+
587
def build_models_config() -> dict[str, Any]:
    """Return the ``models.json`` payload: Gemini only on the HF Space profile, otherwise llama, Gemini and Bedrock."""
    providers: dict[str, Any] = {}
    if is_hf_space_profile():
        providers[PROVIDER_GEMINI] = _gemini_provider()
    else:
        providers[PROVIDER_LLAMA] = _llama_provider()
        providers[PROVIDER_GEMINI] = _gemini_provider()
        providers[PROVIDER_BEDROCK] = _bedrock_provider()
    return {"providers": providers}
597
+
598
+
599
def _load_settings_template() -> dict[str, Any]:
    """Load the settings template file, or return built-in defaults when the file is absent."""
    if not SETTINGS_TEMPLATE.is_file():
        builtin_compaction = {
            "enabled": True,
            "reserveTokens": 32768,
            "keepRecentTokens": 20000,
        }
        return {
            "defaultThinkingLevel": "off",
            "hideThinkingBlock": True,
            "compaction": builtin_compaction,
            "enableSkillCommands": True,
            "sessionDir": "sessions",
        }
    template_text = SETTINGS_TEMPLATE.read_text(encoding="utf-8")
    return json.loads(template_text)
613
+
614
+
615
def _apply_compaction_settings(settings: dict[str, Any]) -> None:
    """
    Merge Pi session auto-compaction from env into ``settings.json``.

    ``PI_COMPACTION_ENABLED`` — when set, overrides the template ``compaction.enabled``
    flag (``true`` / ``false``). When unset, the template default applies (enabled).

    Optional tuning: ``PI_COMPACTION_RESERVE_TOKENS``, ``PI_COMPACTION_KEEP_RECENT_TOKENS``.
    Without those overrides, a ``LLAMA_CONTEXT`` below 100,000 scales both values
    down from the context size. ``settings`` is updated in place.
    """
    fallback = {
        "enabled": True,
        "reserveTokens": 32768,
        "keepRecentTokens": 20000,
    }
    merged = dict(settings.get("compaction") or fallback)

    if "PI_COMPACTION_ENABLED" in os.environ:
        merged["enabled"] = _env_flag("PI_COMPACTION_ENABLED")

    small_context = LLAMA_CONTEXT < 100_000

    reserve_override = (os.environ.get("PI_COMPACTION_RESERVE_TOKENS") or "").strip()
    if reserve_override:
        merged["reserveTokens"] = int(reserve_override)
    elif small_context:
        # Smaller local models (e.g. Gemma 4 31B at 65536): default reserve was 32768.
        merged["reserveTokens"] = min(16_384, max(8_192, LLAMA_CONTEXT // 4))

    keep_override = (os.environ.get("PI_COMPACTION_KEEP_RECENT_TOKENS") or "").strip()
    if keep_override:
        merged["keepRecentTokens"] = int(keep_override)
    elif small_context:
        merged["keepRecentTokens"] = min(12_288, max(4_096, LLAMA_CONTEXT // 5))

    settings["compaction"] = merged
646
+
647
+
648
def resolve_session_dir() -> str:
    """Pi session JSONL directory (absolute path or relative to ``AGENT_DIR``).

    A non-blank ``PI_SESSION_DIR`` wins; otherwise the HF Space profile uses
    ``/tmp/pi-sessions`` and every other profile uses ``sessions``.
    """
    configured = os.environ.get("PI_SESSION_DIR", "").strip()
    if configured:
        return configured
    return "/tmp/pi-sessions" if is_hf_space_profile() else "sessions"
656
+
657
+
658
def ensure_session_dir(session_dir: str | None = None) -> Path:
    """Create the Pi session directory and return its resolved absolute path.

    ``session_dir`` defaults to ``resolve_session_dir()``; a relative value is
    anchored at ``resolve_agent_dir()``.
    """
    requested = Path((session_dir or resolve_session_dir()).strip())
    anchored = requested if requested.is_absolute() else resolve_agent_dir() / requested
    target = anchored.resolve()
    target.mkdir(parents=True, exist_ok=True)
    return target
668
+
669
+
670
def build_settings_config(
    *,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> dict[str, Any]:
    """Assemble the ``settings.json`` payload for the chosen provider and model.

    Starts from the settings template, then fills in the default provider/model,
    compaction values, the absolute session directory, retry settings where they
    apply, and the workspace skills directory. Creates the session directory and
    workspace skills as a side effect.
    """
    chosen_provider = default_provider or get_default_provider()
    if chosen_provider not in PROVIDER_MODELS:
        # Unknown provider: fall back to the profile's default.
        chosen_provider = PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
    chosen_model = resolved_default_model(chosen_provider, override=default_model)

    settings = _load_settings_template()
    settings["defaultProvider"] = chosen_provider
    settings["defaultModel"] = chosen_model
    _apply_compaction_settings(settings)

    settings["sessionDir"] = ensure_session_dir(resolve_session_dir()).as_posix()

    wants_retry = (
        is_hf_space_profile()
        or is_aws_ecs_profile()
        or chosen_provider in (PROVIDER_GEMINI, PROVIDER_BEDROCK)
    )
    if wants_retry:
        _apply_retry_settings(settings, provider=chosen_provider)

    from pi_workspace_skills import ensure_workspace_skills, workspace_skills_dir

    ensure_workspace_skills()
    settings["skills"] = [workspace_skills_dir().as_posix()]
    return settings
697
+
698
+
699
def write_runtime_config(
    *,
    agent_dir: Path | None = None,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> tuple[Path, Path]:
    """Write models.json and settings.json; return their paths.

    Args:
        agent_dir: Directory to write into; defaults to ``resolve_agent_dir()``.
            It is created when missing.
        default_provider: Provider key or display label. When given, the
            normalised key is also stored in ``PI_DEFAULT_PROVIDER``.
        default_model: Model id. When non-blank it is stored in
            ``PI_DEFAULT_MODEL`` (and ``PI_LLAMA_MODEL_ID`` for the llama provider).

    Returns:
        ``(models_path, settings_path)``.
    """
    provider = normalize_provider(default_provider or get_default_provider())
    if default_provider:
        os.environ["PI_DEFAULT_PROVIDER"] = provider
    if default_model and default_model.strip():
        model = default_model.strip()
        os.environ["PI_DEFAULT_MODEL"] = model
        if provider == PROVIDER_LLAMA:
            os.environ["PI_LLAMA_MODEL_ID"] = model

    target = Path(agent_dir or resolve_agent_dir())
    target.mkdir(parents=True, exist_ok=True)

    models_path = target / "models.json"
    settings_path = target / "settings.json"

    models_path.write_text(
        json.dumps(build_models_config(), indent=2) + "\n",
        encoding="utf-8",
    )
    # Pass the normalised key, not the raw argument: build_settings_config only
    # recognises provider keys, so a display label would otherwise fall back to
    # the profile default and disagree with PI_DEFAULT_PROVIDER set above.
    settings_payload = build_settings_config(
        default_provider=provider,
        default_model=default_model,
    )
    settings_path.write_text(
        json.dumps(settings_payload, indent=2) + "\n",
        encoding="utf-8",
    )
    return models_path, settings_path
737
+
738
+
739
def models_for_provider(provider: str) -> list[str]:
    """List the model ids offered for ``provider``.

    The HF Space profile always offers the Gemini models; the llama provider
    offers the single configured llama model; unknown providers fall back to
    the llama entry of ``PROVIDER_MODELS``.
    """
    if is_hf_space_profile():
        return list(PROVIDER_MODELS[PROVIDER_GEMINI])
    if provider == PROVIDER_LLAMA:
        return [llama_model_id()]
    fallback_models = PROVIDER_MODELS[PROVIDER_LLAMA]
    return list(PROVIDER_MODELS.get(provider, fallback_models))
745
+
746
+
747
def default_model_for_provider(provider: str) -> str:
    """Return the default model for ``provider`` (delegates to ``resolved_default_model``)."""
    default_model = resolved_default_model(provider)
    return default_model
749
+
750
+
751
def normalize_provider(provider: str) -> str:
    """Map a provider key or display label (case-insensitive) to its provider key.

    Unrecognised or empty input yields the profile default: Gemini on the HF
    Space profile, llama otherwise.
    """
    key_by_label: dict[str, str] = {}
    for key, label in PROVIDER_LABELS.items():
        key_by_label[label.lower()] = key

    wanted = (provider or "").strip().lower()
    if wanted in PROVIDER_MODELS:
        return wanted
    try:
        return key_by_label[wanted]
    except KeyError:
        return PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
759
+
760
+
761
def apply_session_credentials(
    *,
    gemini_api_key: str | None = None,
    hf_token: str | None = None,
    aws_region: str | None = None,
    aws_access_key_id: str | None = None,
    aws_secret_access_key: str | None = None,
    aws_session_token: str | None = None,
) -> None:
    """Apply session-only credential overrides to os.environ.

    Blank or missing values are ignored. The AWS key arguments are handed to
    ``configure_aws_credentials``, which is always called.
    """

    def _cleaned(value: str | None) -> str:
        return (value or "").strip()

    gemini = _cleaned(gemini_api_key)
    if gemini:
        os.environ["GEMINI_API_KEY"] = gemini

    token = _cleaned(hf_token)
    if token:
        for env_name in ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"):
            os.environ[env_name] = token

    region = _cleaned(aws_region)
    if region:
        for env_name in ("AWS_REGION", "AWS_DEFAULT_REGION"):
            os.environ[env_name] = region

    configure_aws_credentials(
        session_access_key_id=aws_access_key_id,
        session_secret_access_key=aws_secret_access_key,
        session_session_token=aws_session_token,
    )
785
+
786
+
787
def mirror_hf_token_from_env() -> None:
    """Copy ``DOC_REDACTION_HF_TOKEN`` into ``HF_TOKEN`` when ``HF_TOKEN`` is unset or empty.

    An existing non-empty ``HF_TOKEN`` (for example a Space secret) is left untouched.
    """
    if not os.environ.get("HF_TOKEN"):
        fallback_token = os.environ.get("DOC_REDACTION_HF_TOKEN", "").strip()
        if fallback_token:
            os.environ["HF_TOKEN"] = fallback_token
794
+
795
+
796
+ def _hf_token_status() -> str:
797
+ if os.environ.get("HF_TOKEN"):
798
+ source = (
799
+ "UI session" if os.environ.get("_HF_TOKEN_FROM_UI") else "env/Space secret"
800
+ )
801
+ return f"set ({source})"
802
+ return "missing"
803
+
804
+
805
def credential_status_markdown(*, provider: str | None = None) -> str:
    """
    Credential summary for the active Pi provider.

    ``llama-cpp`` uses the local OpenAI-compatible endpoint only (no Gemini/AWS keys).
    Gemini and Bedrock lines appear only when that provider is selected. On the
    HF Space profile the summary always covers Gemini and the HF token.
    """

    def _gemini_state() -> str:
        has_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        return "set" if has_key else "missing"

    active = normalize_provider(provider or get_default_provider())

    if is_hf_space_profile():
        gemini = _gemini_state()
        return (
            f"**Credentials:** Gemini `{gemini}` · "
            f"HF token (redaction backend) `{_hf_token_status()}`"
        )

    if active == PROVIDER_LLAMA:
        return (
            f"**Credentials:** local llama-cpp at `{LLAMA_BASE_URL}` "
            f"(no API key; AWS/Gemini not used for Pi orchestration)"
        )

    if active == PROVIDER_GEMINI:
        gemini = _gemini_state()
        return f"**Credentials:** Gemini `{gemini}`"

    # Anything else is reported as Bedrock.
    region = _bedrock_region()
    return f"**Credentials:** AWS `{_aws_credential_status()}` · region `{region}`"
837
+
838
+
839
def provider_choices() -> list[str]:
    """Return the selectable provider keys: Gemini only on the HF Space profile, otherwise every key in ``PROVIDER_LABELS``."""
    hf_only = is_hf_space_profile()
    return [PROVIDER_GEMINI] if hf_only else list(PROVIDER_LABELS.keys())
843
+
844
+
845
def gemini_api_key_configured() -> bool:
    """Return True when ``GEMINI_API_KEY`` or ``GOOGLE_API_KEY`` holds a non-empty value."""
    for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        if os.environ.get(env_name):
            return True
    return False
847
+
848
+
849
def provider_label(provider: str) -> str:
    """Return the display label for ``provider``, or ``provider`` itself when it has no label."""
    try:
        return PROVIDER_LABELS[provider]
    except KeyError:
        return provider
851
+
852
+
853
if __name__ == "__main__":
    # Script entry: settle AWS credentials first, then write both config files
    # and report where each one landed.
    configure_aws_credentials()
    for written_path in write_runtime_config():
        print(f"Wrote {written_path}")
agent-redact/pi/pi_examples.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pi agent Gradio examples aligned with the main app SHOW_EXAMPLES redaction demos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ from pi_agent_config import is_hf_space_profile
10
+ from redaction_prompt import HF_DEFAULT_OCR
11
+
12
+
13
+ def _show_examples_from_env() -> bool:
14
+ """True unless PI_GRADIO_SHOW_EXAMPLES or SHOW_PI_EXAMPLES is explicitly false."""
15
+ for key in ("PI_GRADIO_SHOW_EXAMPLES", "SHOW_PI_EXAMPLES"):
16
+ raw = os.environ.get(key)
17
+ if raw is None:
18
+ continue
19
+ lowered = raw.strip().lower()
20
+ if lowered in {"0", "false", "no"}:
21
+ return False
22
+ if lowered in {"1", "true", "yes"}:
23
+ return True
24
+ return True
25
+
26
+
27
+ SHOW_PI_EXAMPLES = _show_examples_from_env()
28
+
29
+
30
@dataclass(frozen=True)
class PiRedactionExample:
    """One immutable redaction demo entry for the Pi Gradio examples."""

    # Required fields.
    label: str
    file_name: str
    instructions: str
    ocr_method: str
    # Optional fields with their defaults.
    pii_method: str = "Local"
    encourage_vlm_faces: bool = False
    encourage_vlm_signatures: bool = False
    page_range: str = "all"
40
+
41
+
42
def resolve_example_data_dir() -> Path | None:
    """Locate bundled example PDFs (repo checkout, PyPI package, or Docker layout).

    Checks ``doc_redaction/example_data`` then ``example_data`` under the Pi repo
    root, then under the directory two levels above this file. Returns the first
    existing directory, resolved, or ``None`` when none exists.
    """
    from bootstrap_pi_config import pi_repo_root_path

    search_bases = (pi_repo_root_path(), Path(__file__).resolve().parents[2])
    for base in search_bases:
        for candidate in (base / "doc_redaction" / "example_data", base / "example_data"):
            if candidate.is_dir():
                return candidate.resolve()
    return None
59
+
60
+
61
def example_file_path(file_name: str) -> Path | None:
    """Resolve ``file_name`` inside the example data directory.

    Returns ``None`` when the directory is missing, the resolved path escapes
    it, the file does not exist, or the file is only a Git LFS pointer.
    """
    data_root = resolve_example_data_dir()
    if data_root is None:
        return None

    candidate = (data_root / file_name).resolve()
    try:
        # Raises ValueError when the resolved path lies outside the data root.
        candidate.relative_to(data_root)
    except ValueError:
        return None

    if candidate.is_file() and not _is_lfs_pointer(candidate):
        return candidate
    return None
75
+
76
+
77
+ def _is_lfs_pointer(path: Path) -> bool:
78
+ try:
79
+ first_line = path.read_text(encoding="utf-8", errors="ignore").splitlines()[0]
80
+ except (OSError, IndexError):
81
+ return False
82
+ return first_line.startswith("version https://git-lfs.github.com/spec/v1")
83
+
84
+
85
def _catalog() -> tuple[PiRedactionExample, ...]:
    """Return every known demo; the OCR method depends on the HF Space profile."""
    if is_hf_space_profile():
        selectable_text_ocr = HF_DEFAULT_OCR
    else:
        selectable_text_ocr = "Local model - selectable text"
    # local_ocr = (
    #     HF_DEFAULT_OCR
    #     if is_hf_space_profile()
    #     else "Local OCR model - PDFs without selectable text"
    # )

    professor_emails = PiRedactionExample(
        label="Emails to a professor",
        file_name="example_of_emails_sent_to_a_professor_before_applying.pdf",
        ocr_method=selectable_text_ocr,
        pii_method="Local",
        instructions=(
            "- Any redaction box related to Dr Kornbluth should be removed\n"
            "- References to Dr Hyde, or Dr Hyde's lab should be redacted. Also any references to Lauren, or Lauren Lilley\n"
            "- All mentions of Universities and their names should be redacted\n"
        ),
    )
    cover_letter = PiRedactionExample(
        label="Graduate cover letter",
        file_name="graduate-job-example-cover-letter.pdf",
        ocr_method=selectable_text_ocr,
        pii_method="Local",
        instructions=(
            "- Redact any names and titles, apart from Mr Wilson\n"
            "- Redact any organisation names\n"
            "- Redact any place names\n"
        ),
    )
    return (professor_emails, cover_letter)
118
+
119
+
120
def available_pi_examples() -> list[PiRedactionExample]:
    """Return the catalog entries whose example file resolves; empty when examples are disabled."""
    if not SHOW_PI_EXAMPLES:
        return []
    return [
        entry
        for entry in _catalog()
        if example_file_path(entry.file_name) is not None
    ]
128
+
129
+
130
def example_rows() -> tuple[list[list], list[str]]:
    """Return (gr.Examples rows, labels) for available demos.

    Each row holds, in order: file path, instructions, page range, OCR method,
    PII method, and the two VLM encouragement flags.
    """
    rows: list[list] = []
    labels: list[str] = []
    for demo in available_pi_examples():
        resolved = example_file_path(demo.file_name)
        if resolved is not None:
            row = [
                str(resolved),
                demo.instructions,
                demo.page_range,
                demo.ocr_method,
                demo.pii_method,
                demo.encourage_vlm_faces,
                demo.encourage_vlm_signatures,
            ]
            rows.append(row)
            labels.append(demo.label)
    return rows, labels
151
+
152
+
153
def gradio_example_allowed_paths() -> list[str]:
    """Return the example data directory as a one-item list, or an empty list when it is missing."""
    data_root = resolve_example_data_dir()
    return [] if data_root is None else [str(data_root)]
158
+
159
+
160
def examples_status_markdown() -> str:
    """Human-readable status for the UI when examples are missing or disabled.

    Also reports the loaded file names when examples are available.
    """
    if not SHOW_PI_EXAMPLES:
        message = (
            "_Examples are disabled. Set Space variable "
            "`PI_GRADIO_SHOW_EXAMPLES=true` (or `SHOW_PI_EXAMPLES=true`) and restart._"
        )
        return message

    root = resolve_example_data_dir()
    if root is None:
        message = (
            "_Example PDFs not found — expected under "
            "`doc_redaction/example_data/` in the Space image._"
        )
        return message

    loaded = available_pi_examples()
    if loaded:
        names = ", ".join(f"`{ex.file_name}`" for ex in loaded)
        return f"_Examples loaded from `{root}`: {names}_"

    return (
        f"_Example PDFs not found under `{root}`. "
        "Rebuild the Space after syncing example files from the monorepo._"
    )
agent-redact/pi/pi_rpc_client.py ADDED
@@ -0,0 +1,989 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Python client for Pi RPC mode (JSONL over stdin/stdout)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import queue
8
+ import shutil
9
+ import subprocess
10
+ import sys
11
+ import threading
12
+ import uuid
13
+ from collections import deque
14
+ from collections.abc import Iterator
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+
19
class PiRpcError(RuntimeError):
    """Error raised by the Pi RPC client."""
21
+
22
+
23
+ # Sentinel pushed to every pending response slot and the events queue when the
24
+ # Pi RPC subprocess exits, so blocked waiters unblock with a clear error instead
25
+ # of hanging forever.
26
+ _PI_PROCESS_EXIT = object()
27
+
28
+
29
+ # Pi RPC is JSONL over pipes; always UTF-8 (Windows default locale is cp1252).
30
+ _PI_SUBPROCESS_ENCODING = "utf-8"
31
+ _PI_SUBPROCESS_ENCODING_ERRORS = "replace"
32
+
33
+ _PI_INSTALL_HINT = (
34
+ "Install the Pi coding agent CLI, then restart the Gradio app: \n"
35
+ "`npm install -g @earendil-works/pi-coding-agent` \n"
36
+ "On Windows, ensure Node.js/npm are on PATH (or set `PI_EXECUTABLE` to the "
37
+ "full path to `pi.cmd`, e.g. `%APPDATA%\\npm\\pi.cmd`). \n"
38
+ "Docker users: run the Pi UI via `docker compose` (`pi-agent` service) instead "
39
+ "of `python gradio_app.py` on the host."
40
+ )
41
+
42
+
43
+ def resolve_pi_executable() -> str:
44
+ """Return a path to the ``pi`` RPC executable (raises ``PiRpcError`` if missing)."""
45
+ override = os.environ.get("PI_EXECUTABLE", "").strip()
46
+ if override:
47
+ if os.path.isfile(override) or shutil.which(override):
48
+ return override
49
+ raise PiRpcError(
50
+ f"PI_EXECUTABLE is set but not found: `{override}` \n\n{_PI_INSTALL_HINT}"
51
+ )
52
+ for name in ("pi", "pi.cmd"):
53
+ found = shutil.which(name)
54
+ if found:
55
+ return found
56
+ raise PiRpcError(f"Pi CLI (`pi`) not found on PATH. \n\n{_PI_INSTALL_HINT}")
57
+
58
+
59
@dataclass
class PiStreamEvent:
    """Structured event from Pi RPC for UI layers."""

    # Event category and its text payload.
    kind: str
    text: str = ""
    # Tool-call details; left as None when the event carries none.
    tool_name: str | None = None
    tool_call_id: str | None = None
    tool_args: dict[str, Any] | None = None
    tool_output: str | None = None
    is_error: bool = False
    # Free-form extras; each instance gets its own dict.
    meta: dict[str, Any] = field(default_factory=dict)
71
+
72
+
73
def extract_tool_text(payload: dict[str, Any] | None) -> str:
    """Join the text blocks of a tool payload into one newline-separated, stripped string.

    The content list is taken from ``payload["content"]``, falling back to the
    ``content`` of a dict-valued ``partialResult`` and then ``result``. Returns
    an empty string when no content list is found.
    """
    if not payload:
        return ""

    content = payload.get("content")
    for wrapper_key in ("partialResult", "result"):
        if content is not None:
            break
        wrapper = payload.get(wrapper_key)
        if isinstance(wrapper, dict):
            content = wrapper.get("content")

    if not isinstance(content, list):
        return ""

    pieces = [
        str(item.get("text") or "")
        for item in content
        if isinstance(item, dict) and item.get("type") == "text"
    ]
    return "\n".join(pieces).strip()
88
+
89
+
90
def extract_assistant_display(message: dict[str, Any] | None) -> tuple[str, str]:
    """Extract visible text and thinking from a partial assistant message.

    Returns ``(visible, thinking)``, each the concatenation of its blocks with
    no separator. Messages whose role is not ``assistant`` yield ``("", "")``.
    """
    if not message or message.get("role") != "assistant":
        return "", ""

    content = message.get("content")
    if isinstance(content, str):
        return content, ""
    if not isinstance(content, list):
        return "", ""

    def _first_truthy(block: dict[str, Any], keys: tuple[str, ...]) -> Any:
        for key in keys:
            value = block.get(key)
            if value:
                return value
        return ""

    visible_parts: list[str] = []
    thinking_parts: list[str] = []
    for block in content:
        if isinstance(block, str):
            # Bare strings count as visible text unless they are only whitespace.
            if block.strip():
                visible_parts.append(block)
        elif isinstance(block, dict):
            kind = block.get("type")
            if kind in (None, "text", "output_text"):
                shown = _first_truthy(block, ("text", "content"))
                if shown:
                    visible_parts.append(str(shown))
            elif kind in ("thinking", "reasoning", "thought"):
                thought = _first_truthy(
                    block, ("thinking", "text", "reasoning", "content")
                )
                if thought:
                    thinking_parts.append(str(thought))
    return "".join(visible_parts), "".join(thinking_parts)
125
+
126
+
127
def assistant_chat_text(visible: str, thinking: str) -> str:
    """Text to show in the main chat — visible answer, or thinking when Gemini sends only that."""
    return visible if visible.strip() else thinking
132
+
133
+
134
+ def _tool_lines_from_content(content: list[Any]) -> list[str]:
135
+ tool_lines: list[str] = []
136
+ for block in content:
137
+ if not isinstance(block, dict):
138
+ continue
139
+ block_type = block.get("type")
140
+ if block_type not in {"toolCall", "tool_use", "functionCall"}:
141
+ continue
142
+ name = str(block.get("name") or block.get("toolName") or "tool")
143
+ args = block.get("arguments") or block.get("input") or block.get("args")
144
+ if isinstance(args, str):
145
+ try:
146
+ args = json.loads(args)
147
+ except json.JSONDecodeError:
148
+ args = {"raw": args}
149
+ if not isinstance(args, dict):
150
+ args = {}
151
+ tool_lines.append(format_tool_chat_line(name, args))
152
+ return tool_lines
153
+
154
+
155
def format_tool_chat_line(tool_name: str | None, args: dict[str, Any] | None) -> str:
    """Render one tool invocation for the chat UI (prose for comment-only bash).

    Bash commands made only of comments become plain prose; bash commands that
    mix comments and executable lines show the prose followed by a shortened
    command. Everything else is rendered as a bold tool name, with its formatted
    arguments when there are any.
    """
    display_name = str(tool_name or "tool")

    def _bash_line(command: str) -> str | None:
        if is_bash_commentary_only(command):
            return extract_bash_commentary_text(command)
        prose, runnable = split_bash_commentary_and_command(command)
        if prose and runnable:
            clipped = runnable[:200] + ("…" if len(runnable) > 200 else "")
            return f"{prose}\n\n**bash:** `{clipped}`"
        if prose:
            return prose
        return None

    if display_name.lower() == "bash" and args and args.get("command"):
        bash_line = _bash_line(str(args["command"]))
        if bash_line is not None:
            return bash_line

    detail = format_tool_args(tool_name, args)
    if detail and detail != display_name:
        return f"**{display_name}:** {detail}"
    return f"**{display_name}**"
173
+
174
+
175
def format_assistant_message_for_chat(message: dict[str, Any]) -> str:
    """Render one assistant message for the chat UI (visible text or tool calls; no thinking).

    Visible text wins when present; otherwise the message's tool calls are
    listed one per line. Returns an empty string when there is neither.
    """
    shown_text, _ignored_thinking = extract_assistant_display(message)
    if shown_text.strip():
        return shown_text

    blocks = message.get("content")
    if isinstance(blocks, list):
        return "\n".join(_tool_lines_from_content(blocks))
    return ""
186
+
187
+
188
def chat_text_from_assistant_message(message: dict[str, Any] | None) -> str:
    """Non-thinking chat text from a Pi/Gemini assistant message snapshot.

    Returns an empty string for a missing message or a non-assistant role.
    """
    is_assistant = bool(message) and message.get("role") == "assistant"
    return format_assistant_message_for_chat(message) if is_assistant else ""
193
+
194
+
195
+ _RATE_LIMIT_MARKERS = (
196
+ "429",
197
+ "quota",
198
+ "rate limit",
199
+ "rate-limit",
200
+ "resource_exhausted",
201
+ "too many requests",
202
+ "throttlingexception",
203
+ "throttling",
204
+ "toomanyrequestsexception",
205
+ "servicequotaexceeded",
206
+ )
207
+
208
+
209
+ def is_rate_limit_error(text: str | None) -> bool:
210
+ """True when *text* looks like a provider quota or rate-limit failure."""
211
+ if not text:
212
+ return False
213
+ lowered = text.lower()
214
+ return any(marker in lowered for marker in _RATE_LIMIT_MARKERS)
215
+
216
+
217
+ def _strip_rpc_payload_for_debug(obj: Any) -> Any:
218
+ """
219
+ Strip large message content from RPC objects for compact debug logging.
220
+
221
+ Keeps metadata (id, type, command, success) but removes or truncates
222
+ actual message/data payloads.
223
+ """
224
+ if not isinstance(obj, dict):
225
+ return obj
226
+
227
+ kept_keys = {"type", "id", "command", "success", "error", "stopReason"}
228
+ result = {k: v for k, v in obj.items() if k in kept_keys}
229
+
230
+ # Keep data/result/messages structure without content
231
+ for key in ("data", "result", "messages", "response"):
232
+ if key in obj:
233
+ val = obj[key]
234
+ if isinstance(val, dict):
235
+ result[key] = {k: "..." for k in val.keys()}
236
+ elif isinstance(val, list):
237
+ result[key] = f"[... {len(val)} items]"
238
+ else:
239
+ result[key] = "..."
240
+
241
+ return result
242
+
243
+
244
def last_assistant_turn_error(messages: list[dict[str, Any]]) -> str | None:
    """Return the latest assistant error in the current user turn, if any.

    Only messages after the last ``user`` message are considered (all of them
    when there is no user message). A message counts as an error when it has an
    ``errorMessage`` or its ``stopReason`` is ``error``.
    """
    user_positions = [
        position
        for position, entry in enumerate(messages)
        if entry.get("role") == "user"
    ]
    turn_start = user_positions[-1] + 1 if user_positions else 0

    for entry in reversed(messages[turn_start:]):
        if entry.get("role") != "assistant":
            continue
        explicit_error = entry.get("errorMessage")
        if explicit_error:
            return str(explicit_error)
        if entry.get("stopReason") == "error":
            shown, _ = extract_assistant_display(entry)
            return shown if shown.strip() else "assistant turn failed"
    return None
264
+
265
+
266
def assistant_text_since_last_user(messages: list[dict[str, Any]]) -> str:
    """Combine assistant messages from the latest user turn.

    Each assistant message after the last ``user`` message is rendered for
    chat; the non-blank renderings are joined with a blank line.
    """
    user_positions = [
        position
        for position, entry in enumerate(messages)
        if entry.get("role") == "user"
    ]
    turn_start = user_positions[-1] + 1 if user_positions else 0

    rendered = (
        format_assistant_message_for_chat(entry)
        for entry in messages[turn_start:]
        if entry.get("role") == "assistant"
    )
    return "\n\n".join(chunk for chunk in rendered if chunk.strip())
282
+
283
+
284
def partial_message_from_update(event: dict[str, Any]) -> dict[str, Any] | None:
    """Return the message snapshot carried by an update event.

    Prefers ``assistantMessageEvent.partial`` and falls back to ``message``;
    whichever is used must be a dict, otherwise ``None`` is returned.
    """
    message_event = event.get("assistantMessageEvent") or {}
    for candidate in (message_event.get("partial"), event.get("message")):
        if isinstance(candidate, dict):
            return candidate
    return None
293
+
294
+
295
def is_bash_commentary_only(command: str) -> bool:
    """True when a bash tool call contains only shell comments (no executable lines).

    Blank lines are ignored; a command with no non-blank lines returns False.
    """
    non_blank = [
        stripped
        for stripped in (line.strip() for line in command.splitlines())
        if stripped
    ]
    return bool(non_blank) and all(line.startswith("#") for line in non_blank)
301
+
302
+
303
def extract_bash_commentary_text(command: str) -> str:
    """Join non-empty ``#`` comment bodies from a bash command into readable prose.

    Leading ``#`` characters and surrounding whitespace are removed from each
    comment line; non-comment lines and empty comments are dropped.
    """
    stripped_lines = (line.strip() for line in command.splitlines())
    comment_bodies = (
        line.lstrip("#").strip() for line in stripped_lines if line.startswith("#")
    )
    return "\n".join(body for body in comment_bodies if body)
314
+
315
+
316
def split_bash_commentary_and_command(command: str) -> tuple[str, str]:
    """Split ``#`` planning lines from executable shell lines.

    Returns ``(commentary, executable)``: comment bodies joined with newlines,
    and the remaining non-blank lines joined with `` ↵ ``.
    """
    comment_bodies: list[str] = []
    shell_lines: list[str] = []
    for line in (raw.strip() for raw in command.splitlines()):
        if not line:
            continue
        if not line.startswith("#"):
            shell_lines.append(line)
            continue
        body = line.lstrip("#").strip()
        if body:
            comment_bodies.append(body)
    return "\n".join(comment_bodies), " ↵ ".join(shell_lines)
331
+
332
+
333
def format_tool_args(tool_name: str | None, args: dict[str, Any] | None) -> str:
    """Summarise a tool call's arguments as a short string for display.

    Bash commands show their comment prose or the executable part (cut to 240
    characters) in backticks; ``read`` / ``write`` / ``edit`` show their path;
    everything else is compact JSON, truncated when longer than 280 characters.
    Returns an empty string when there are no arguments.
    """
    if not args:
        return ""

    tool = (tool_name or "").lower()

    if tool == "bash" and args.get("command"):
        command = str(args["command"])
        if is_bash_commentary_only(command):
            return extract_bash_commentary_text(command)
        _prose, runnable = split_bash_commentary_and_command(command)
        if not runnable:
            return extract_bash_commentary_text(command)
        ellipsis = "…" if len(runnable) > 240 else ""
        return f"`{runnable[:240]}{ellipsis}`"

    if tool in {"read", "write", "edit"} and args.get("path"):
        return f"`{args['path']}`"

    as_json = json.dumps(args, ensure_ascii=False)
    return as_json if len(as_json) <= 280 else as_json[:277] + "…"
352
+
353
+
354
class PiRpcClient:
    """Drive a long-lived ``pi --mode rpc`` subprocess.

    Commands are written to the child's stdin as JSON lines. A single reader
    thread demultiplexes stdout: ``response`` messages go to the per-request
    slot registered by :meth:`_send_command`, ``extension_ui_request`` dialogs
    are auto-cancelled, and everything else is queued as an agent event for
    :meth:`prompt_events`.
    """

    # Extension UI dialog methods block Pi until the client replies; auto-cancel
    # them so a missing UI layer can never wedge the RPC process.
    _EXTENSION_UI_DIALOG_METHODS = frozenset({"select", "confirm", "input", "editor"})

    def __init__(
        self,
        *,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
        pi_args: list[str] | None = None,
    ) -> None:
        """Record launch settings; the subprocess is started lazily.

        Args:
            cwd: Working directory passed to the subprocess.
            env: Environment passed to the subprocess.
            pi_args: Extra arguments appended after ``--mode rpc``.
        """
        self._cwd = cwd
        self._env = env
        self._pi_args = pi_args or []
        self._proc: subprocess.Popen[str] | None = None
        self._write_lock = threading.Lock()
        self._abort_requested = False
        self._prompt_stream_depth = 0
        self._pending_follow_ups = 0
        self._pending_ui_history: list[dict[str, Any]] = []
        # Single stdout reader thread demultiplexes the JSONL stream: command
        # responses go to per-id slots, agent events go to ``_events``. This lets
        # any thread (e.g. post-task logging) call the client safely while a
        # prompt stream is active.
        self._reader_thread: threading.Thread | None = None
        self._stderr_thread: threading.Thread | None = None
        self._events: queue.Queue[Any] = queue.Queue()
        self._pending_lock = threading.Lock()
        self._pending_responses: dict[str, queue.Queue[Any]] = {}
        self._stderr_buffer: deque[str] = deque(maxlen=200)
        self._closing = False

    @property
    def running(self) -> bool:
        """True when a subprocess exists and has not exited."""
        return self._proc is not None and self._proc.poll() is None

    @property
    def prompt_stream_active(self) -> bool:
        """True while :meth:`prompt_events` is consuming the RPC event stream."""
        return self._prompt_stream_depth > 0

    def start(self) -> None:
        """Spawn the subprocess and its reader threads (no-op when already running).

        Resets the event queue, the pending-response table and the stderr
        buffer so nothing from a previous process leaks into the new one.
        """
        if self.running:
            return
        command = [resolve_pi_executable(), "--mode", "rpc", *self._pi_args]
        self._closing = False
        self._abort_requested = False
        proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding=_PI_SUBPROCESS_ENCODING,
            errors=_PI_SUBPROCESS_ENCODING_ERRORS,
            bufsize=1,
            cwd=self._cwd,
            env=self._env,
        )
        self._proc = proc
        # Fresh demux state for this process.
        self._events = queue.Queue()
        with self._pending_lock:
            self._pending_responses = {}
        self._stderr_buffer = deque(maxlen=200)
        self._reader_thread = threading.Thread(
            target=self._reader_loop,
            args=(proc,),
            name="pi-rpc-stdout",
            daemon=True,
        )
        self._reader_thread.start()
        if proc.stderr is not None:
            self._stderr_thread = threading.Thread(
                target=self._stderr_loop,
                args=(proc,),
                name="pi-rpc-stderr",
                daemon=True,
            )
            self._stderr_thread.start()

    def close(self) -> None:
        """Abort any active run, stop the subprocess and join the reader threads."""
        if not self._proc:
            return
        self._closing = True
        proc = self._proc
        if proc.poll() is None:
            try:
                self.abort()
            except Exception:
                # Best effort: the process is terminated right below regardless.
                pass
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
        # Process exit makes ``readline`` return EOF; the reader thread then
        # notifies waiters. Nudge waiters here too in case the threads are slow.
        self._notify_process_exit()
        for thread in (self._reader_thread, self._stderr_thread):
            if (
                thread is not None
                and thread.is_alive()
                and thread is not threading.current_thread()
            ):
                thread.join(timeout=2)
        self._reader_thread = None
        self._stderr_thread = None
        self._proc = None

    def _ensure_running(self) -> subprocess.Popen[str]:
        """Return the live subprocess, starting one first when necessary."""
        if not self.running:
            self.start()
        assert self._proc is not None
        return self._proc

    def _recent_stderr(self) -> str:
        """Return the buffered stderr lines joined with newlines."""
        return "\n".join(self._stderr_buffer)

    def _process_exit_error(self) -> PiRpcError:
        """Build the error raised when the subprocess exits unexpectedly."""
        code = self._proc.poll() if self._proc else None
        err = self._recent_stderr()
        return PiRpcError(
            f"Pi RPC process exited (code={code})."
            + (f" stderr: {err[:500]}" if err else "")
        )

    def _notify_process_exit(self) -> None:
        """Unblock every pending response slot and the events queue on exit."""
        with self._pending_lock:
            pending = list(self._pending_responses.values())
            self._pending_responses.clear()
        for slot in pending:
            try:
                slot.put_nowait(_PI_PROCESS_EXIT)
            except queue.Full:
                pass
        try:
            self._events.put_nowait(_PI_PROCESS_EXIT)
        except queue.Full:
            pass

    def _stderr_loop(self, proc: subprocess.Popen[str]) -> None:
        """Continuously drain stderr into a bounded buffer (prevents pipe deadlock)."""
        stream = proc.stderr
        if stream is None:
            return
        try:
            for line in stream:
                self._stderr_buffer.append(line.rstrip("\r\n"))
        except (ValueError, OSError):
            pass

    def _reader_loop(self, proc: subprocess.Popen[str]) -> None:
        """Read every stdout line and route responses vs. agent events."""
        stream = proc.stdout
        if stream is None:
            self._notify_process_exit()
            return
        try:
            while True:
                line = stream.readline()
                if not line:
                    break
                line = line.rstrip("\r\n")
                if not line:
                    continue
                try:
                    message = json.loads(line)
                except json.JSONDecodeError:
                    # Lines that are not JSON are ignored.
                    continue
                self._dispatch_message(message)
        except (ValueError, OSError):
            pass
        finally:
            self._notify_process_exit()

    def _dispatch_message(self, message: Any) -> None:
        """Route one decoded stdout message to its consumer.

        Non-dict payloads are dropped. Responses whose id has no registered
        slot are discarded.
        """
        if not isinstance(message, dict):
            return
        if os.environ.get("PI_RPC_DEBUG", "").strip() == "1":
            try:
                stripped = _strip_rpc_payload_for_debug(message)
                sys.stderr.write(
                    "Pi RPC recv: " + json.dumps(stripped, ensure_ascii=False) + "\n"
                )
                sys.stderr.flush()
            except Exception:
                # Debug tracing must never break message routing.
                pass
        msg_type = message.get("type")
        if msg_type == "response":
            req_id = message.get("id")
            slot: queue.Queue[Any] | None = None
            if req_id is not None:
                with self._pending_lock:
                    slot = self._pending_responses.pop(str(req_id), None)
            if slot is not None:
                try:
                    slot.put_nowait(message)
                except queue.Full:
                    pass
            return
        if msg_type == "extension_ui_request":
            self._auto_reply_extension_ui(message)
            return
        # Agent event — consumed by the active ``prompt_events`` stream.
        self._events.put(message)

    def _auto_reply_extension_ui(self, message: dict[str, Any]) -> None:
        """Cancel a blocking extension UI dialog; other UI requests are ignored."""
        method = message.get("method")
        req_id = message.get("id")
        if req_id is None or method not in self._EXTENSION_UI_DIALOG_METHODS:
            return
        try:
            self._write_command(
                {"type": "extension_ui_response", "id": req_id, "cancelled": True}
            )
        except (OSError, PiRpcError):
            pass

    def _write_command(self, command: dict[str, Any]) -> None:
        """Serialize *command* as one JSON line on the subprocess stdin."""
        proc = self._ensure_running()
        assert proc.stdin is not None
        if os.environ.get("PI_RPC_DEBUG", "").strip() == "1":
            try:
                stripped = _strip_rpc_payload_for_debug(command)
                sys.stderr.write(
                    "Pi RPC send: " + json.dumps(stripped, ensure_ascii=False) + "\n"
                )
                sys.stderr.flush()
            except Exception:
                # Debug tracing must never block the write.
                pass
        # One writer at a time so JSON lines never interleave on stdin.
        with self._write_lock:
            proc.stdin.write(json.dumps(command) + "\n")
            proc.stdin.flush()

    def _send_command(
        self,
        command: dict[str, Any],
        *,
        wait_response: bool = True,
    ) -> dict[str, Any] | None:
        """Send *command* and, by default, block until its response arrives.

        An ``id`` is added to *command* when missing.

        Args:
            command: RPC payload (mutated: gains an ``id`` key if absent).
            wait_response: When false, write the command and return ``None``.

        Returns:
            The response message, or ``None`` when ``wait_response`` is false.

        Raises:
            PiRpcError: The process exited before replying, or the response
                reported ``success`` as false.
        """
        req_id = str(command.setdefault("id", str(uuid.uuid4())))
        if not wait_response:
            self._write_command(command)
            return None
        # Start the process BEFORE registering the slot: ``start()`` replaces
        # ``_pending_responses``, so a lazy start triggered later inside
        # ``_write_command`` would orphan the slot and block ``get()`` forever.
        self._ensure_running()
        slot: queue.Queue[Any] = queue.Queue(maxsize=1)
        with self._pending_lock:
            self._pending_responses[req_id] = slot
        try:
            self._write_command(command)
        except Exception:
            with self._pending_lock:
                self._pending_responses.pop(req_id, None)
            raise
        result = slot.get()
        if result is _PI_PROCESS_EXIT:
            raise self._process_exit_error()
        if not result.get("success", False):
            error = result.get("error") or result.get("message") or "command failed"
            raise PiRpcError(str(error))
        return result

    def abort(self) -> None:
        """Request abort without reading stdout (the active stream consumer drains events)."""
        if not self.running:
            return
        self._abort_requested = True
        try:
            self._send_command({"type": "abort"}, wait_response=False)
        except OSError:
            pass

    def stage_ui_chat_notice(self, label: str, message: str) -> None:
        """Stage user/assistant chat rows for the active prompt stream to merge on yield."""
        text = message.strip()
        if not text:
            return
        self._pending_ui_history.append(
            {"role": "user", "content": f"_**{label}:**_ {text}"}
        )
        self._pending_ui_history.append({"role": "assistant", "content": ""})

    def drain_pending_ui_history(self) -> list[dict[str, Any]]:
        """Return and clear UI chat rows staged by :meth:`stage_ui_chat_notice`."""
        pending = self._pending_ui_history[:]
        self._pending_ui_history.clear()
        return pending

    def steer(self, message: str) -> None:
        """Queue a steering message (delivered after the current tool step completes)."""
        if not message.strip():
            return
        self._send_command(
            {"type": "steer", "message": message},
            wait_response=False,
        )

    def follow_up(self, message: str) -> None:
        """Queue a follow-up message for when the agent stops."""
        if not message.strip():
            return
        # Counted so the event loop keeps streaming past the next ``agent_end``.
        self._pending_follow_ups += 1
        self._send_command(
            {"type": "follow_up", "message": message},
            wait_response=False,
        )

    @property
    def abort_requested(self) -> bool:
        """True once :meth:`abort` was called and not yet cleared."""
        return self._abort_requested

    def clear_abort(self) -> None:
        """Reset the abort-requested flag."""
        self._abort_requested = False

    def new_session(self) -> None:
        """Send ``new_session`` and wait for its response."""
        self._send_command({"type": "new_session"})

    def get_state(self) -> dict[str, Any]:
        """Return the ``data`` dict of a ``get_state`` response (``{}`` if absent)."""
        response = self._send_command({"type": "get_state"})
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def get_messages(self) -> list[dict[str, Any]]:
        """Return ``data.messages`` of a ``get_messages`` response (``[]`` if absent)."""
        response = self._send_command({"type": "get_messages"})
        data = response.get("data") if response else {}
        messages = data.get("messages") if isinstance(data, dict) else []
        return messages if isinstance(messages, list) else []

    def get_session_stats(self) -> dict[str, Any]:
        """Token usage and cost totals for the active session (Pi RPC ``get_session_stats``)."""
        response = self._send_command({"type": "get_session_stats"})
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def set_model(self, provider: str, model_id: str) -> dict[str, Any]:
        """Send ``set_model`` and return the response ``data`` dict (``{}`` if absent)."""
        response = self._send_command(
            {
                "type": "set_model",
                "provider": provider,
                "modelId": model_id,
            }
        )
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def get_available_models(self) -> list[dict[str, Any]]:
        """Return ``data.models`` of a ``get_available_models`` response (``[]`` if absent)."""
        response = self._send_command({"type": "get_available_models"})
        data = response.get("data") if response else {}
        models = data.get("models") if isinstance(data, dict) else []
        return models if isinstance(models, list) else []

    def restart(self) -> None:
        """Close the current subprocess (if any) and start a new one."""
        self.close()
        self.start()

    def prompt_events(self, message: str) -> Iterator[PiStreamEvent]:
        """Send a user message and yield structured events until ``agent_end``."""
        self._prompt_stream_depth += 1
        try:
            yield from self._prompt_events_impl(message)
        finally:
            self._prompt_stream_depth = max(0, self._prompt_stream_depth - 1)

    def _drain_events(self) -> None:
        """Discard stale events left over from a prior stream (single active prompt)."""
        while True:
            try:
                item = self._events.get_nowait()
            except queue.Empty:
                return
            if item is _PI_PROCESS_EXIT:
                # Preserve the exit signal for the consumer to observe.
                try:
                    self._events.put_nowait(_PI_PROCESS_EXIT)
                except queue.Full:
                    pass
                return

    def _prompt_events_impl(self, message: str) -> Iterator[PiStreamEvent]:
        """Send the prompt, then stream agent events; a rejected prompt yields one error event."""
        self.clear_abort()
        self._drain_events()
        try:
            self._send_command({"type": "prompt", "message": message})
        except PiRpcError as exc:
            yield PiStreamEvent(kind="error", text=str(exc), is_error=True)
            return

        yield from self._iter_agent_events()

    def _iter_agent_events(self) -> Iterator[PiStreamEvent]:
        """Translate queued agent events into ``PiStreamEvent`` objects.

        Ends after the ``agent_end`` that has no queued follow-up left.
        Event types not listed below are ignored.

        Raises:
            PiRpcError: The subprocess exited while the stream was active.
        """
        while True:
            event = self._events.get()
            if event is _PI_PROCESS_EXIT:
                raise self._process_exit_error()
            event_type = event.get("type")

            if event_type == "agent_start":
                yield PiStreamEvent(kind="status", text="Agent started…")

            elif event_type == "turn_start":
                yield PiStreamEvent(kind="status", text="Turn started.")

            elif event_type == "turn_end":
                yield PiStreamEvent(kind="turn_end", text="Turn finished.")

            elif event_type == "message_update":
                yield from self._parse_message_update(event)

            elif event_type == "tool_execution_start":
                tool_name = event.get("toolName")
                tool_args = (
                    event.get("args") if isinstance(event.get("args"), dict) else {}
                )
                yield PiStreamEvent(
                    kind="tool_start",
                    tool_name=str(tool_name) if tool_name else "tool",
                    tool_call_id=event.get("toolCallId"),
                    tool_args=tool_args,
                    text=format_tool_args(
                        str(tool_name) if tool_name else None,
                        tool_args,
                    ),
                )

            elif event_type == "tool_execution_update":
                output = extract_tool_text(event)
                yield PiStreamEvent(
                    kind="tool_update",
                    tool_name=event.get("toolName"),
                    tool_call_id=event.get("toolCallId"),
                    tool_output=output,
                )

            elif event_type == "tool_execution_end":
                result = (
                    event.get("result") if isinstance(event.get("result"), dict) else {}
                )
                output = extract_tool_text(result)
                yield PiStreamEvent(
                    kind="tool_end",
                    tool_name=event.get("toolName"),
                    tool_call_id=event.get("toolCallId"),
                    tool_output=output,
                    is_error=bool(event.get("isError")),
                )

            elif event_type == "queue_update":
                steering = event.get("steering") or []
                follow_up = event.get("followUp") or []
                # Only surface the update when something is actually queued.
                if steering or follow_up:
                    yield PiStreamEvent(
                        kind="queue_update",
                        meta={"steering": steering, "follow_up": follow_up},
                    )

            elif event_type == "compaction_start":
                reason = event.get("reason") or "unknown"
                yield PiStreamEvent(
                    kind="status",
                    text=f"Compaction started ({reason})…",
                    meta={"reason": reason},
                )

            elif event_type == "compaction_end":
                if event.get("aborted"):
                    text = "Compaction aborted."
                elif event.get("errorMessage"):
                    text = f"Compaction failed: {event['errorMessage']}"
                    yield PiStreamEvent(kind="error", text=text, is_error=True)
                    continue
                elif event.get("willRetry"):
                    text = "Compaction complete — retrying prompt…"
                else:
                    tokens = (event.get("result") or {}).get("tokensBefore")
                    text = (
                        f"Compaction complete ({tokens:,} tokens before)."
                        if isinstance(tokens, int)
                        else "Compaction complete."
                    )
                yield PiStreamEvent(kind="status", text=text, meta=event)

            elif event_type == "auto_retry_start":
                attempt = event.get("attempt")
                max_attempts = event.get("maxAttempts")
                delay_ms = event.get("delayMs")
                msg = event.get("errorMessage") or "transient error"
                yield PiStreamEvent(
                    kind="status",
                    text=(
                        f"Auto-retry {attempt}/{max_attempts} in {delay_ms}ms "
                        f"({str(msg)[:120]})"
                    ),
                    meta=event,
                )

            elif event_type == "auto_retry_end":
                if event.get("success"):
                    yield PiStreamEvent(
                        kind="status",
                        text=f"Auto-retry succeeded on attempt {event.get('attempt')}.",
                    )
                else:
                    yield PiStreamEvent(
                        kind="error",
                        text=f"Auto-retry failed: {event.get('finalError', 'unknown error')}",
                        is_error=True,
                    )

            elif event_type == "extension_error":
                yield PiStreamEvent(
                    kind="error",
                    text=str(event.get("error") or "extension error"),
                    is_error=True,
                )

            elif event_type == "agent_end":
                # Pi delivers queued ``follow_up`` messages after ``agent_end`` and
                # continues streaming; do not stop the stdout consumer until they run.
                if self._pending_follow_ups > 0:
                    self._pending_follow_ups -= 1
                    yield PiStreamEvent(
                        kind="status",
                        text="Follow-up queued — continuing…",
                    )
                    continue
                aborted = self._abort_requested
                self.clear_abort()
                yield PiStreamEvent(
                    kind="done",
                    text="Agent aborted." if aborted else "Agent finished.",
                )
                return

    def _parse_message_update(self, event: dict[str, Any]) -> Iterator[PiStreamEvent]:
        """Expand one ``message_update`` event into snapshot and delta events.

        Snapshots (full visible text / thinking so far) are emitted first when
        a partial message is present, followed by the event matching the
        delta type.
        """
        delta = event.get("assistantMessageEvent") or {}
        delta_type = delta.get("type")
        partial = partial_message_from_update(event)
        if partial is not None:
            visible, thinking = extract_assistant_display(partial)
            if visible.strip():
                yield PiStreamEvent(kind="text_snapshot", text=visible)
            elif chat_text := chat_text_from_assistant_message(partial):
                yield PiStreamEvent(kind="text_snapshot", text=chat_text)
            if thinking.strip():
                yield PiStreamEvent(kind="thinking_snapshot", text=thinking)

        if delta_type == "text_delta":
            chunk = delta.get("delta") or ""
            if chunk:
                yield PiStreamEvent(kind="text_delta", text=chunk)

        elif delta_type == "thinking_delta":
            chunk = delta.get("delta") or ""
            if chunk:
                yield PiStreamEvent(kind="thinking_delta", text=chunk)

        elif delta_type == "toolcall_start":
            tool_call = delta.get("toolCall") or {}
            tool_name = tool_call.get("name") or delta.get("toolName") or "tool"
            tool_args = tool_call.get("arguments")
            # Arguments may arrive as a JSON string; keep unparsable text under "raw".
            if isinstance(tool_args, str):
                try:
                    tool_args = json.loads(tool_args)
                except json.JSONDecodeError:
                    tool_args = {"raw": tool_args}
            if not isinstance(tool_args, dict):
                tool_args = {}
            chat_line = format_tool_chat_line(str(tool_name), tool_args)
            yield PiStreamEvent(kind="text_snapshot", text=chat_line)

        elif delta_type == "error":
            yield PiStreamEvent(
                kind="error",
                text=str(
                    delta.get("message") or delta.get("error") or "generation error"
                ),
                is_error=True,
            )

    def prompt_stream(
        self, message: str, *, show_tool_status: bool = True
    ) -> Iterator[str]:
        """Backward-compatible text stream (assistant visible text + optional tool status)."""
        for event in self.prompt_events(message):
            if event.kind == "text_delta":
                yield event.text
            elif show_tool_status and event.kind == "tool_start":
                yield f"\n\n_[Running {event.tool_name}…]_\n"
            elif event.kind == "error":
                yield f"\n\n**Error:** {event.text}\n"
947
+
948
+
949
def start_pi_prompt_event_worker(
    client: PiRpcClient,
    event_queue: queue.Queue[Any],
    prompt: str,
) -> None:
    """Stream ``client.prompt_events(prompt)`` into *event_queue* from a daemon thread.

    Any exception raised by the stream is converted into a single error event.
    A trailing ``None`` is always enqueued to mark the end of the stream.
    """

    def _pump() -> None:
        try:
            for item in client.prompt_events(prompt):
                event_queue.put(item)
        except Exception as exc:
            failure = PiStreamEvent(kind="error", text=str(exc), is_error=True)
            event_queue.put(failure)
        finally:
            event_queue.put(None)

    worker = threading.Thread(target=_pump, daemon=True)
    worker.start()
966
+
967
+
968
def default_client(session_hash: str | None = None) -> PiRpcClient:
    """Build a (not yet started) ``PiRpcClient`` configured from the environment.

    Configures AWS credentials, ensures workspace skills are present, and
    derives the child environment from ``os.environ`` with defaults for
    ``HOME``, ``PYTHONUTF8``, ``PYTHONIOENCODING`` and ``PI_WORKSPACE_DIR``.
    ``GEMINI_API_KEY`` / ``HF_TOKEN`` are filled from ``GOOGLE_API_KEY`` /
    ``DOC_REDACTION_HF_TOKEN`` when unset or empty.
    """
    from pi_agent_config import configure_aws_credentials
    from pi_workspace_skills import ensure_workspace_skills, pi_rpc_args, pi_rpc_cwd

    configure_aws_credentials()
    ensure_workspace_skills()

    child_env = os.environ.copy()
    for key, fallback_value in (
        ("HOME", os.path.expanduser("~")),
        ("PYTHONUTF8", "1"),
        ("PYTHONIOENCODING", "utf-8"),
    ):
        child_env.setdefault(key, fallback_value)

    from session_workspace import workspace_base_dir

    child_env.setdefault("PI_WORKSPACE_DIR", str(workspace_base_dir()))

    # Mirror alternate credential variables onto the names the child reads.
    for target, source in (
        ("GEMINI_API_KEY", "GOOGLE_API_KEY"),
        ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"),
    ):
        if not child_env.get(target) and child_env.get(source):
            child_env[target] = child_env[source]

    return PiRpcClient(
        cwd=pi_rpc_cwd(session_hash),
        env=child_env,
        pi_args=pi_rpc_args(),
    )
agent-redact/pi/pi_session_usage.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Summarize Pi agent LLM token usage for usage-log CSV rows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from pi_rpc_client import PiRpcClient, PiRpcError
11
+
12
+
13
@dataclass(frozen=True)
class TokenUsageTotals:
    """Immutable token counters for a Pi session (see Pi session-format ``Usage``)."""

    input: int = 0
    output: int = 0
    cache_read: int = 0
    cache_write: int = 0

    @property
    def llm_input_tokens(self) -> int:
        """Input-side total for the main-app usage log: input plus both cache counters."""
        return sum((self.input, self.cache_read, self.cache_write))

    @property
    def llm_output_tokens(self) -> int:
        """Output-side total for the usage log (the ``output`` counter)."""
        return self.output
30
+
31
+
32
def _int_field(raw: Any) -> int:
    """Coerce *raw* to a non-negative ``int``; anything unusable becomes ``0``.

    Falsy values count as ``0`` and negative numbers are clamped to ``0``.
    ``OverflowError`` is handled as well because ``int(float("inf"))`` raises
    it, and ``json.loads`` accepts ``Infinity`` by default.
    """
    try:
        return max(0, int(raw or 0))
    except (TypeError, ValueError, OverflowError):
        return 0
37
+
38
+
39
def totals_from_usage_dict(usage: dict[str, Any] | None) -> TokenUsageTotals:
    """Convert a usage dict (``input``/``output``/``cacheRead``/``cacheWrite``) into totals.

    An empty or ``None`` dict yields all-zero totals; each value is passed
    through ``_int_field``.
    """
    if not usage:
        return TokenUsageTotals()
    key_for_field = {
        "input": "input",
        "output": "output",
        "cache_read": "cacheRead",
        "cache_write": "cacheWrite",
    }
    counters = {
        field: _int_field(usage.get(key)) for field, key in key_for_field.items()
    }
    return TokenUsageTotals(**counters)
48
+
49
+
50
def totals_from_stats_payload(data: dict[str, Any] | None) -> TokenUsageTotals:
    """Read token totals from the ``tokens`` dict of a session-stats payload.

    Returns all-zero totals when the payload is empty or ``tokens`` is not a dict.
    """
    tokens = data.get("tokens") if data else None
    if not isinstance(tokens, dict):
        return TokenUsageTotals()
    return totals_from_usage_dict(tokens)
57
+
58
+
59
def subtract_usage(
    after: TokenUsageTotals, before: TokenUsageTotals
) -> TokenUsageTotals:
    """Return ``after - before`` per counter, clamping each difference at zero."""

    def _gap(later: int, earlier: int) -> int:
        return max(0, later - earlier)

    return TokenUsageTotals(
        input=_gap(after.input, before.input),
        output=_gap(after.output, before.output),
        cache_read=_gap(after.cache_read, before.cache_read),
        cache_write=_gap(after.cache_write, before.cache_write),
    )
68
+
69
+
70
def add_usage(left: TokenUsageTotals, right: TokenUsageTotals) -> TokenUsageTotals:
    """Return the counter-wise sum of two usage totals."""
    combined = {
        name: getattr(left, name) + getattr(right, name)
        for name in ("input", "output", "cache_read", "cache_write")
    }
    return TokenUsageTotals(**combined)
77
+
78
+
79
def sum_usage_from_messages(
    messages: list[dict[str, Any]],
    *,
    since_last_user: bool = False,
) -> TokenUsageTotals:
    """Add up the ``usage`` dicts found on assistant messages.

    With ``since_last_user`` set, only messages after the last ``user``
    message are considered; when there is no user message the whole list is
    used.
    """
    window = messages
    if since_last_user:
        user_positions = [
            pos for pos, entry in enumerate(messages) if entry.get("role") == "user"
        ]
        if user_positions:
            window = messages[user_positions[-1] + 1 :]

    running = TokenUsageTotals()
    for entry in window:
        usage = entry.get("usage") if entry.get("role") == "assistant" else None
        if isinstance(usage, dict):
            running = add_usage(running, totals_from_usage_dict(usage))
    return running
100
+
101
+
102
def sum_usage_from_jsonl(path: Path) -> TokenUsageTotals:
    """Parse a Pi session JSONL file and sum assistant ``usage`` blocks.

    Best-effort: an unreadable or non-UTF-8 file yields all-zero totals, and
    any line that is blank, not valid JSON, or not a JSON object is skipped.
    Only entries with ``type == "message"`` whose ``message`` is an assistant
    dict contribute.
    """
    total = TokenUsageTotals()
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError.
        return total
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        # A valid JSON line can still be a list/number/string; only objects have ``get``.
        if not isinstance(entry, dict) or entry.get("type") != "message":
            continue
        message = entry.get("message")
        if not isinstance(message, dict) or message.get("role") != "assistant":
            continue
        usage = message.get("usage")
        if isinstance(usage, dict):
            total = add_usage(total, totals_from_usage_dict(usage))
    return total
126
+
127
+
128
def resolve_session_token_usage(client: PiRpcClient | None) -> TokenUsageTotals:
    """Best-effort whole-session usage for *client*.

    Sources are tried in order and the first with any non-zero counter wins:
    RPC ``get_session_stats``, then the live message list, then the session
    JSONL file. A missing or stopped client yields all-zero totals.
    """
    if client is None or not client.running:
        return TokenUsageTotals()

    def _has_tokens(totals: TokenUsageTotals) -> bool:
        return bool(
            totals.input or totals.output or totals.cache_read or totals.cache_write
        )

    rpc_sources = (
        lambda: totals_from_stats_payload(client.get_session_stats()),
        lambda: sum_usage_from_messages(client.get_messages()),
    )
    for fetch in rpc_sources:
        try:
            candidate = fetch()
        except PiRpcError:
            continue
        if _has_tokens(candidate):
            return candidate

    from session_logs import pi_session_file_from_client

    log_path = pi_session_file_from_client(client)
    if log_path is None:
        return TokenUsageTotals()
    return sum_usage_from_jsonl(log_path)
157
+
158
+
159
def usage_for_completed_turn(
    client: PiRpcClient | None,
    baseline: TokenUsageTotals | None,
) -> TokenUsageTotals:
    """Tokens consumed by the prompt that just finished.

    Order of preference: the non-zero delta against *baseline* (captured
    before ``prompt_events``), then the non-zero sum of assistant ``usage``
    since the last user message, then the whole-session totals. A missing or
    stopped client yields all-zero totals.
    """
    if client is None or not client.running:
        return TokenUsageTotals()

    def _has_tokens(totals: TokenUsageTotals) -> bool:
        return bool(
            totals.input or totals.output or totals.cache_read or totals.cache_write
        )

    session_total = resolve_session_token_usage(client)
    if baseline is not None:
        spent = subtract_usage(session_total, baseline)
        if _has_tokens(spent):
            return spent

    try:
        since_user = sum_usage_from_messages(
            client.get_messages(), since_last_user=True
        )
    except PiRpcError:
        return session_total
    return since_user if _has_tokens(since_user) else session_total
agent-redact/pi/pi_workspace_skills.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sync doc_redaction skills into the Pi workspace and constrain Pi RPC to that tree."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import stat
8
+ from pathlib import Path
9
+
10
+ from bootstrap_pi_config import pi_repo_root_path
11
+
12
+
13
def workspace_base_dir() -> Path:
    """Return the workspace base directory reported by ``session_workspace``.

    The import is deferred to call time.
    """
    from session_workspace import workspace_base_dir as _resolve_base

    base = _resolve_base()
    return base
17
+
18
+
19
def workspace_pi_dir() -> Path:
    """Return the ``.pi`` directory under the workspace base."""
    return workspace_base_dir().joinpath(".pi")
21
+
22
+
23
def workspace_skills_dir() -> Path:
    """Return the ``skills`` directory inside the workspace ``.pi`` directory."""
    return workspace_pi_dir().joinpath("skills")
25
+
26
+
27
def workspace_helpers_dir() -> Path:
    """Return the ``helpers`` directory inside the workspace ``.pi`` directory."""
    return workspace_pi_dir().joinpath("helpers")
29
+
30
+
31
def remote_redaction_helper_path() -> Path:
    """Path to the synced ``remote_redaction.py`` helper.

    Always located under the workspace base ``.pi/helpers`` directory, not
    under session subfolders.
    """
    helpers = workspace_helpers_dir()
    return helpers / "remote_redaction.py"
34
+
35
+
36
def remote_redaction_helper_module() -> str:
    """Return the helper script path as a forward-slash (POSIX-style) string."""
    helper = remote_redaction_helper_path()
    return helper.as_posix()
38
+
39
+
40
def repo_skills_dir() -> Path:
    """Return the ``skills`` directory under the Pi repository root."""
    return pi_repo_root_path().joinpath("skills")
42
+
43
+
44
def _env_flag(name: str) -> bool:
    """True when env var *name* is set to ``1``, ``true``, ``yes`` or ``on`` (case-insensitive)."""
    raw = os.environ.get(name, "")
    return raw.strip().lower() in ("1", "true", "yes", "on")
46
+
47
+
48
# Any path containing one of these directory names is excluded from the sync.
_SKILLS_SKIP_DIR_NAMES = frozenset(("archive_attempts",))
# File-name suffixes (matched case-insensitively) excluded from the sync.
_SKILLS_SKIP_SUFFIXES = (".b64.txt",)
# Per-file size cap in bytes; overridable via PI_SKILLS_MAX_FILE_BYTES (default 512 KiB).
_SKILLS_MAX_FILE_BYTES = int(os.getenv("PI_SKILLS_MAX_FILE_BYTES", str(512 * 1024)))
53
+
54
+
55
def _should_skip_skill_relpath(rel: Path, *, size_bytes: int | None = None) -> bool:
    """Decide whether a path (relative to the skills root) is left out of the sync.

    A path is skipped when any component is a skipped directory name, when its
    lower-cased file name ends with a skipped suffix, or when *size_bytes* is
    given and exceeds the configured per-file limit.
    """
    in_skipped_dir = not _SKILLS_SKIP_DIR_NAMES.isdisjoint(rel.parts)
    has_skipped_suffix = rel.name.lower().endswith(_SKILLS_SKIP_SUFFIXES)
    too_large = size_bytes is not None and size_bytes > _SKILLS_MAX_FILE_BYTES
    return in_skipped_dir or has_skipped_suffix or too_large
65
+
66
+
67
def _should_resync(dest: Path, src: Path) -> bool:
    """Decide whether *src* must be copied over *dest* again.

    Returns ``True`` when ``PI_SKILLS_RESYNC`` is set, when *dest* is missing
    or empty, when *src* has a newer mtime than *dest*, or when any of those
    filesystem checks fails with ``OSError`` (including an unreadable *dest*).
    """
    if _env_flag("PI_SKILLS_RESYNC"):
        return True
    try:
        if not dest.is_dir():
            return True
        # Listing can fail (e.g. permissions); treat that like a failed stat.
        if next(dest.iterdir(), None) is None:
            return True
        # NOTE(review): compares only the two top-level mtimes; presumably edits
        # to nested files do not update them — confirm this is sufficient.
        return src.stat().st_mtime > dest.stat().st_mtime
    except OSError:
        return True
78
+
79
+
80
def _copy_tree_item(src: Path, dest: Path) -> None:
    """Copy *src* to *dest* with skill filtering, using *src* itself as the filter root."""
    _copy_tree_item_filtered(src=src, dest=dest, src_root=src)
82
+
83
+
84
def _copy_tree_item_filtered(src: Path, dest: Path, *, src_root: Path) -> None:
    """Recursively copy *src* to *dest*, omitting filtered paths.

    Args:
        src: File or directory to copy.
        dest: Destination path for *src*.
        src_root: Root that relative paths are computed against for filtering.

    Files are copied with ``shutil.copy2``; an existing destination file is
    made writable first so it can be overwritten. Sources that are neither a
    regular file nor a directory (for example a broken symlink) are skipped.
    """
    rel = src.relative_to(src_root)
    if src.is_file():
        try:
            size = src.stat().st_size
        except OSError:
            size = None  # Size unknown: apply only the name-based filters.
        if _should_skip_skill_relpath(rel, size_bytes=size):
            return
        dest.parent.mkdir(parents=True, exist_ok=True)
        if dest.exists():
            _make_writable(dest)
        shutil.copy2(src, dest)
        return
    if _should_skip_skill_relpath(rel):
        return
    if not src.is_dir():
        # Not a file and not a directory: ``iterdir`` would raise, so skip it.
        return
    dest.mkdir(parents=True, exist_ok=True)
    for child in sorted(src.iterdir()):
        _copy_tree_item_filtered(child, dest / child.name, src_root=src_root)
107
+
108
+
109
+ def _chmod_tree(path: Path, *, writable: bool) -> None:
110
+ """Set or clear write bits on a file tree (needed for Windows resync)."""
111
+ try:
112
+ if path.is_dir():
113
+ for root, dirs, files in os.walk(path):
114
+ root_path = Path(root)
115
+ for name in files:
116
+ file_path = root_path / name
117
+ mode = file_path.stat().st_mode
118
+ file_path.chmod(
119
+ (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
120
+ if writable
121
+ else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
122
+ )
123
+ for name in dirs:
124
+ dir_path = root_path / name
125
+ mode = dir_path.stat().st_mode
126
+ dir_path.chmod(
127
+ (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
128
+ if writable
129
+ else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
130
+ )
131
+ mode = path.stat().st_mode
132
+ path.chmod(
133
+ (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
134
+ if writable
135
+ else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
136
+ )
137
+ else:
138
+ mode = path.stat().st_mode
139
+ path.chmod(
140
+ (mode | stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
141
+ if writable
142
+ else (mode & ~stat.S_IWUSR & ~stat.S_IWGRP & ~stat.S_IWOTH)
143
+ )
144
+ except OSError:
145
+ pass
146
+
147
+
148
def _make_writable(path: Path) -> None:
    """Add write permission bits to ``path`` and, for a directory, its contents."""
    return _chmod_tree(path, writable=True)
150
+
151
+
152
def _make_readonly(path: Path) -> None:
    """Remove write permission bits from ``path`` unless ``PI_SKILLS_WRITABLE`` is truthy."""
    keep_writable = _env_flag("PI_SKILLS_WRITABLE")
    if not keep_writable:
        _chmod_tree(path, writable=False)
156
+
157
+
158
def write_workspace_pi_settings() -> Path:
    """
    Write the project Pi settings file ``{workspace}/.pi/settings.json``.

    Paths in that file resolve relative to ``{workspace}/.pi/`` per Pi docs.

    Returns:
        Path of the settings file that was written.
    """
    import json

    target_dir = workspace_pi_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    document = json.dumps(
        {
            "skills": ["skills"],
            "extensions": [],
            "packages": [],
            "enableSkillCommands": True,
        },
        indent=2,
    )
    out_file = target_dir / "settings.json"
    out_file.write_text(document + "\n", encoding="utf-8")
    return out_file
177
+
178
+
179
def sync_repo_skills_to_workspace(*, force: bool = False) -> Path:
    """
    Mirror ``{repo}/skills/`` into ``{workspace}/.pi/skills/``.

    The copy is rebuilt from scratch when ``force`` is set or
    ``_should_resync`` reports that it is needed (e.g. ``PI_SKILLS_RESYNC=true``).
    The Pi settings file is (re)written in every case. When the repo skills
    directory exists, ``PI_WORKSPACE_SKILLS_DIR`` is also exported.

    Returns:
        The workspace skills directory (resolved when the repo skills
        directory exists).
    """
    repo_skills = repo_skills_dir()
    workspace_skills = workspace_skills_dir()
    workspace_pi_dir().mkdir(parents=True, exist_ok=True)

    # No skills shipped with the repo: leave an empty skills directory behind.
    if not repo_skills.is_dir():
        workspace_skills.mkdir(parents=True, exist_ok=True)
        write_workspace_pi_settings()
        return workspace_skills

    rebuild = force or _should_resync(workspace_skills, repo_skills)
    if rebuild:
        if workspace_skills.exists():
            # Earlier syncs may have left the tree read-only.
            _make_writable(workspace_skills)
            shutil.rmtree(workspace_skills)
        workspace_skills.mkdir(parents=True, exist_ok=True)
        for entry in sorted(repo_skills.iterdir()):
            try:
                entry_size = entry.stat().st_size if entry.is_file() else None
            except OSError:
                entry_size = None
            relative = entry.relative_to(repo_skills)
            if not _should_skip_skill_relpath(relative, size_bytes=entry_size):
                _copy_tree_item_filtered(
                    entry, workspace_skills / entry.name, src_root=repo_skills
                )

    write_workspace_pi_settings()
    os.environ["PI_WORKSPACE_SKILLS_DIR"] = str(workspace_skills.resolve())
    return workspace_skills.resolve()
212
+
213
+
214
def sync_workspace_helpers() -> Path:
    """
    Copy Pi redaction helper scripts into ``{workspace}/.pi/helpers/``.

    Keeps ``remote_redaction.py`` inside the workspace boundary on AWS ECS so the
    agent does not search ``/workspace/doc_redaction/agent-redact/``.

    A script is copied when the destination is missing or older than the
    copy that sits next to this module; scripts absent from the source
    directory are skipped.

    Returns:
        The resolved helpers directory.
    """
    target_dir = workspace_helpers_dir()
    target_dir.mkdir(parents=True, exist_ok=True)
    source_dir = Path(__file__).resolve().parent

    for script in ("remote_redaction.py", "run_doc_redact.py"):
        origin = source_dir / script
        if not origin.is_file():
            continue
        copy_to = target_dir / script
        stale = (
            not copy_to.is_file()
            or origin.stat().st_mtime > copy_to.stat().st_mtime
        )
        if stale:
            shutil.copy2(origin, copy_to)

    return target_dir.resolve()
232
+
233
+
234
def write_hf_space_deployment_skill(*, force: bool = False) -> Path | None:
    """
    Write a deployment-specific skill that overrides Docker URLs in generic skills.

    Only active when ``PI_DEPLOYMENT_PROFILE=hf-space``.

    Args:
        force: Rewrite ``SKILL.md`` even when its content is already current.

    Returns:
        Path of ``hf-space-deployment/SKILL.md`` inside the workspace skills
        directory, or ``None`` when the profile helpers cannot be imported or
        the HF Space profile is not active.
    """
    # Imported lazily; a missing module simply disables this feature.
    try:
        from pi_agent_config import is_hf_space_profile
        from redaction_prompt import doc_redaction_gradio_url
    except ImportError:
        return None
    if not is_hf_space_profile():
        return None

    skills_root = workspace_skills_dir()
    skills_root.mkdir(parents=True, exist_ok=True)
    # The skills tree may have been made read-only by an earlier sync.
    if skills_root.is_dir():
        _make_writable(skills_root)

    dest_dir = skills_root / "hf-space-deployment"
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / "SKILL.md"
    url = doc_redaction_gradio_url()
    helpers = workspace_helpers_dir().as_posix()
    # Markdown body of the skill; ``url`` and ``helpers`` are interpolated.
    content = (
        "# HF Space deployment (read first)\n\n"
        "This Pi agent runs on **Hugging Face Spaces** with **Gemini** and calls a "
        "**remote** doc_redaction Space. Generic skills mention Docker URLs for "
        "local-docker or AWS ECS — **ignore those here**.\n\n"
        "## Authoritative settings\n\n"
        "| Setting | Value |\n"
        "|---------|--------|\n"
        f"| **doc_redaction URL** | `{url}` **only** |\n"
        "| **Auth** | `HF_TOKEN` (Space secret; already in Pi subprocess env) |\n"
        f"| **Helper module** | `{helpers}/remote_redaction.py` |\n\n"
        "## One-shot CLI (preferred over writing ``run_redact.py``)\n\n"
        f"```bash\n"
        f"python3 {helpers}/run_doc_redact.py \\\n"
        f' --pdf "<session-folder>/document.pdf" \\\n'
        f' --dest "<session-folder>/redact/document.pdf/output_redact/" \\\n'
        f' --ocr-method "Local model - selectable text" \\\n'
        f' --pii-method "Local"\n'
        f"```\n\n"
        "## Minimal Python (only if the CLI is insufficient)\n\n"
        "```python\n"
        "import importlib.util\n"
        "import sys\n"
        f'helper = "{helpers}/remote_redaction.py"\n'
        'spec = importlib.util.spec_from_file_location("remote_redaction", helper)\n'
        "mod = importlib.util.module_from_spec(spec)\n"
        'sys.modules["remote_redaction"] = mod\n'
        "spec.loader.exec_module(mod)\n"
        "from gradio_client import handle_file\n\n"
        f"client = mod.make_redaction_client() # URL: {url}\n"
        'pdf = "<your-session-folder>/document.pdf"\n'
        "result = client.predict(\n"
        ' api_name="/doc_redact",\n'
        " document_file=handle_file(pdf),\n"
        ")\n"
        'paths = mod.resolve_redaction_output_paths(result, document_stem="document")\n'
        'mod.fetch_redaction_files(paths, "<your-session-folder>/redact/document/output_redact/")\n'
        "```\n\n"
        "## Rules\n\n"
        f"- **Helper path is shared:** `{helpers}/remote_redaction.py` lives under the "
        f"workspace root `{workspace_base_dir().as_posix()}/`, **not** under your session "
        f"subfolder's `.pi/` tree.\n"
        f"- Call `/doc_redact` via `{helpers}/run_doc_redact.py` or `make_redaction_client()`.\n"
        "- **Do not** create `run_redact.py`, `run_redact_fixed.py`, or duplicate helpers in your session folder.\n"
        "- **Do not** call `Client(...)` or `view_api()` in a loop from bash — each call hits HF rate limits. "
        "Use the CLI once, or one `make_redaction_client()` (cached + retries).\n"
        "- **Do not** pass `base_url=` manually — `make_redaction_client()` reads "
        f"`DOC_REDACTION_GRADIO_URL` (`{url}`).\n"
        "- **Do not** use `host.docker.internal`, `localhost`, `redaction:7861`, or probe "
        "alternate URLs.\n"
        "- **Do not** rewrite or duplicate `remote_redaction.py` — use the synced helper.\n"
        "- On `TooManyRequestsError`, wait at least 60s and retry **once** via the CLI — "
        "do not spawn repeated `python3 -c` Client probes.\n"
        "- Write status updates as **normal assistant text**, not bash `#` comments.\n"
        "- After `/doc_redact`, download outputs with `fetch_redaction_files` into your "
        "session `output_redact/` folder.\n\n"
        "Then read `/skill:doc-redaction-app` and `/skill:doc-redaction-modifications` "
        "for workflow steps, substituting the URL above wherever examples show Docker hosts.\n"
    )
    # Skip the write when the file already holds exactly this content.
    if force or not dest.is_file() or dest.read_text(encoding="utf-8") != content:
        dest.write_text(content, encoding="utf-8")
    return dest
320
+
321
+
322
def ensure_workspace_skills(*, force: bool = False) -> Path:
    """Bring workspace skills and helpers up to date, then lock the skills tree.

    Runs the repo-skills sync, the helper-script sync and the HF Space skill
    writer in that order, and finally marks the skills directory read-only
    (see ``_make_readonly``). Used at app startup and before Pi RPC starts.

    Args:
        force: Passed through to the skills sync and the HF Space skill writer.

    Returns:
        The workspace skills directory.
    """
    skills_dir = sync_repo_skills_to_workspace(force=force)
    sync_workspace_helpers()
    write_hf_space_deployment_skill(force=force)
    if not skills_dir.is_dir():
        return skills_dir
    _make_readonly(skills_dir)
    return skills_dir
330
+
331
+
332
def partnership_template_in_workspace() -> Path | None:
    """Return the synced partnership prompt template, or ``None`` if it is not a file."""
    candidate = workspace_skills_dir() / "Example prompt partnership.txt"
    if candidate.is_file():
        return candidate
    return None
335
+
336
+
337
def pi_rpc_cwd(session_hash: str | None = None) -> str:
    """Pick the working directory for the ``pi --mode rpc`` subprocess.

    Returns the session subfolder when a non-blank ``session_hash`` is given
    and session workspaces are enabled; otherwise the workspace base.
    """
    from session_workspace import session_workspace_dir, session_workspace_enabled

    shared_root = workspace_base_dir()
    has_session = bool(session_hash and session_hash.strip())
    if has_session and session_workspace_enabled():
        return str(session_workspace_dir(session_hash))
    return str(shared_root)
345
+
346
+
347
def pi_rpc_args() -> list[str]:
    """Build the Pi CLI arguments that restrict skill loading to the workspace copy.

    Ensures the workspace skills are synced first, then passes ``--no-skills``
    together with an explicit ``--skill`` pointing at that directory so the
    repo ``skills/`` folder is not discovered via ancestors.
    """
    workspace_skills = ensure_workspace_skills()
    arguments = ["--no-skills", "--skill"]
    arguments.append(str(workspace_skills))
    return arguments
351
+
352
+
353
def workspace_boundary_prefix(session_hash: str | None = None) -> str:
    """Extra prompt text: workspace root, skills path, and path rules.

    Args:
        session_hash: Optional session identifier. When it is non-blank and
            session workspaces are enabled, the prompt names the session
            folder as the active directory; otherwise the workspace base.

    Returns:
        Prompt prefix text, ending with an HF Space note when that profile
        is active (empty note otherwise).
    """
    base = workspace_base_dir().as_posix().rstrip("/")
    skills = workspace_skills_dir().as_posix()
    from session_workspace import session_workspace_dir, session_workspace_enabled

    if session_hash and session_hash.strip() and session_workspace_enabled():
        root = session_workspace_dir(session_hash).as_posix().rstrip("/")
        scope = f"your session folder `{root}/`"
    else:
        root = base
        scope = f"the workspace `{base}/`"

    # Optional HF Space paragraph; stays empty when the profile helpers are
    # unavailable or the profile is not active.
    hf_note = ""
    try:
        from pi_agent_config import is_hf_space_profile
        from redaction_prompt import doc_redaction_gradio_url

        if is_hf_space_profile():
            helpers = remote_redaction_helper_module()
            hf_note = (
                f"**HF Space redaction backend:** use `{doc_redaction_gradio_url()}` only "
                f"(see `/skill:hf-space-deployment`). Import helpers from `{helpers}` "
                f"(workspace base — not `{root}/.pi/helpers/`). Do not use Docker host "
                "URLs from other skills. Write user-facing progress as normal chat text, "
                "not bash comments.\n\n"
            )
    except ImportError:
        pass

    return (
        f"**Workspace boundary (mandatory):** work only under `{base}/`. "
        f"Your active directory is {scope}. "
        f"Do not read, write, or run shell commands targeting paths outside `{base}/` "
        f"(including the git checkout and `agent-redact/`). "
        f"**Skills (read-only):** doc_redaction skills are synced to `{skills}/`. "
        f"Use `/skill:doc-redaction-app`, `/skill:doc-redact-page-review`, etc. "
        f"Do not edit files under `{skills}/`.\n\n"
        f"{hf_note}"
    )
agent-redact/pi/redaction_prompt.py ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build Pi redaction task prompts from the partnership example template."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+
11
+ from pi_agent_config import is_aws_ecs_profile, is_hf_space_profile
12
+ from session_workspace import workspace_base_dir
13
+
14
+
15
def upload_root() -> Path:
    """Return the resolved Gradio upload directory, creating it if necessary.

    ``PI_UPLOAD_ROOT`` is used when it is set to a non-blank value; otherwise
    the location comes from ``bootstrap_pi_config.ensure_pi_upload_root``.
    """
    configured = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
    if configured:
        location = Path(configured)
    else:
        from bootstrap_pi_config import ensure_pi_upload_root

        location = Path(ensure_pi_upload_root(pi_repo_root()))
    location.mkdir(parents=True, exist_ok=True)
    return location.resolve()
25
+
26
+
27
+ _SAFE_UPLOAD_FILENAME_MAX_BYTES = 255
28
+ # Path separators, nulls, and characters unsafe on common filesystems — not general punctuation.
29
+ _UNSAFE_UPLOAD_FILENAME_CHARS_RE = re.compile(r'[\x00-\x1f<>:"|?*\\/]')
30
+
31
+
32
+ def _truncate_upload_filename(
33
+ name: str, *, max_bytes: int = _SAFE_UPLOAD_FILENAME_MAX_BYTES
34
+ ) -> str:
35
+ encoded = name.encode("utf-8")
36
+ if len(encoded) <= max_bytes:
37
+ return name
38
+ stem, suffix = os.path.splitext(name)
39
+ suffix_bytes = suffix.encode("utf-8")
40
+ max_stem_bytes = max(1, max_bytes - len(suffix_bytes))
41
+ while stem and len(stem.encode("utf-8")) > max_stem_bytes:
42
+ stem = stem[:-1]
43
+ if not stem:
44
+ stem = "file"
45
+ return stem + suffix
46
+
47
+
48
+ def _split_upload_basename(name: str) -> tuple[str, str]:
49
+ """Split an upload basename into stem and extension (handles ``.pdf`` on Windows)."""
50
+ if re.fullmatch(r"\.[^./\\]+", name):
51
+ return "", name
52
+ path = Path(name)
53
+ return path.stem, path.suffix
54
+
55
+
56
def _workspace_filename_from_upload(name: str) -> tuple[str, str, bool]:
    """
    Derive a workspace-safe basename, changing the name only when required for security.

    Returns ``(original_basename, workspace_basename, renamed)``.

    Raises:
        ValueError: If the basename is empty, ``.``/``..``, or contains a
            NUL byte or a path separator.
    """
    original = Path(name).name.strip()
    if original in {"", ".", ".."}:
        raise ValueError("Uploaded file has an invalid name.")
    if any(forbidden in original for forbidden in ("\x00", "/", "\\")):
        raise ValueError("Uploaded file has an invalid name.")

    raw_stem, raw_suffix = _split_upload_basename(original)
    scrub = _UNSAFE_UPLOAD_FILENAME_CHARS_RE.sub
    # Unsafe characters become "_"; a stem left empty falls back to "file".
    clean_stem = scrub("_", raw_stem).strip(". ") or "file"
    clean_suffix = scrub("_", raw_suffix)

    workspace_name = _truncate_upload_filename(clean_stem + clean_suffix)
    return original, workspace_name, workspace_name != original
76
+
77
+
78
# Relative location of the partnership example prompt (joined onto the repo
# root by ``partnership_template_path``).
_PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
79
+
80
+
81
def _workspace_root() -> Path:
    """Return the workspace base directory from ``session_workspace``."""
    base_dir = workspace_base_dir()
    return base_dir
83
+
84
+
85
def pi_repo_root() -> Path:
    """Return the monorepo checkout root (skills/, config/).

    Delegates to ``bootstrap_pi_config.pi_repo_root_path``; the import is
    performed inside the function.
    """
    import bootstrap_pi_config

    return bootstrap_pi_config.pi_repo_root_path()
90
+
91
+
92
def partnership_template_path() -> Path:
    """Locate the partnership prompt template.

    The copy synced into the workspace skills directory is preferred; when it
    is absent, the path inside the repo checkout is returned instead.
    """
    from pi_workspace_skills import partnership_template_in_workspace

    workspace_copy = partnership_template_in_workspace()
    if workspace_copy is None:
        return pi_repo_root() / _PARTNERSHIP_TEMPLATE
    return workspace_copy
99
+
100
+
101
# Values used for the HF Space profile (passed as ``hf_default`` below and
# used by ``RedactionTaskSettings.hf_space_defaults``).
HF_DEFAULT_OCR = "Local model - selectable text"
HF_DEFAULT_PII = "Local"
# Fallback doc_redaction URL for the HF Space profile.
HF_DEFAULT_GRADIO_URL = "https://seanpedrickcase-document-redaction.hf.space"

# Used only when PI_DEFAULT_OCR_METHOD / PI_DEFAULT_PII_METHOD are unset (local-docker profile).
_FALLBACK_LOCAL_OCR = "hybrid-paddle-inference-server"
_FALLBACK_LOCAL_PII = "Local"
108
+
109
+
110
+ def _env_default(key: str, *, hf_default: str, local_fallback: str) -> str:
111
+ """Resolve Pi redaction defaults from env (e.g. config/pi_agent.env) with profile fallbacks."""
112
+ explicit = (os.environ.get(key) or "").strip()
113
+ if explicit:
114
+ return explicit
115
+ if is_hf_space_profile():
116
+ return hf_default
117
+ return local_fallback
118
+
119
+
120
# Effective defaults, resolved once when this module is imported.
DEFAULT_OCR_METHOD = _env_default(
    "PI_DEFAULT_OCR_METHOD",
    hf_default=HF_DEFAULT_OCR,
    local_fallback=_FALLBACK_LOCAL_OCR,
)
DEFAULT_PII_METHOD = _env_default(
    "PI_DEFAULT_PII_METHOD",
    hf_default=HF_DEFAULT_PII,
    local_fallback=_FALLBACK_LOCAL_PII,
)

# OCR method names accepted by ``RedactionTaskSettings.from_ui``.
OCR_METHOD_CHOICES: tuple[str, ...] = (
    "hybrid-paddle-inference-server",
    "hybrid-paddle-vlm",
    "Local model - selectable text",
    "Local OCR",
    "AWS Textract service - all PDF types",
    "tesseract",
    "paddle",
    "hybrid-paddle",
    "vlm",
    "inference-server",
)

# PII detection method names accepted by ``RedactionTaskSettings.from_ui``.
PII_METHOD_CHOICES: tuple[str, ...] = (
    "Local",
    "AWS Comprehend",
    "LLM (AWS Bedrock)",
    "Local inference server",
    "Local transformers LLM",
    "Only extract text (no redaction)",
)
152
+
153
# Page limit used when none of the environment variables below is set.
_DEFAULT_MAX_PAGES = 3000


def max_pages_limit() -> int:
    """
    Maximum PDF pages allowed for a Pi redaction task.

    Resolution order: ``PI_MAX_PAGES`` → ``MAX_PAGES`` → ``MAX_DOC_PAGES`` → 3000.
    The first variable holding a non-blank value decides the result.

    Raises:
        ValueError: If that value is not an integer or is smaller than 1; the
            message names the offending variable.
    """
    for key in ("PI_MAX_PAGES", "MAX_PAGES", "MAX_DOC_PAGES"):
        raw = (os.environ.get(key) or "").strip()
        if not raw:
            continue
        try:
            value = int(raw)
        except ValueError as exc:
            # Name the variable instead of surfacing the bare int() error.
            raise ValueError(f"{key} must be a positive integer.") from exc
        if value < 1:
            raise ValueError(f"{key} must be a positive integer.")
        return value
    return _DEFAULT_MAX_PAGES
170
+
171
+
172
def pages_to_process_count(page_range: str, total_pages: int) -> int:
    """Count the pages that ``page_range`` selects from a ``total_pages`` PDF.

    Accepted forms (case-insensitive, surrounding whitespace ignored):
    empty or ``all`` for every page, ``start-end`` for an inclusive range,
    or a single page number.

    Raises:
        ValueError: If the PDF has no pages, the range is malformed, or it
            refers to pages outside the document.
    """
    if total_pages < 1:
        raise ValueError("PDF has no pages.")

    spec = (page_range or "all").strip().lower()
    if spec in ("", "all"):
        return total_pages

    first_text, dash, last_text = spec.partition("-")
    if dash:
        try:
            first = int(first_text.strip())
            last = int(last_text.strip())
        except ValueError as exc:
            raise ValueError(f"Invalid page range: {page_range!r}") from exc
        if first < 1 or last < first:
            raise ValueError(f"Invalid page range: {page_range!r}")
        if last > total_pages:
            raise ValueError(
                f"Page range {page_range!r} exceeds document length "
                f"({total_pages} pages)."
            )
        return last - first + 1

    try:
        single = int(spec)
    except ValueError as exc:
        raise ValueError(f"Invalid page range: {page_range!r}") from exc
    if not 1 <= single <= total_pages:
        raise ValueError(
            f"Page {single} is out of range (document has {total_pages} pages)."
        )
    return 1
206
+
207
+
208
def pdf_page_count(file_path: str | Path) -> int:
    """Open the document at ``file_path`` with PyMuPDF and return its page count."""
    import pymupdf

    with pymupdf.open(Path(file_path)) as document:
        page_total = int(document.page_count)
    return page_total
214
+
215
+
216
def validate_pdf_page_limit(
    file_path: str | Path,
    *,
    page_range: str = "all",
    max_pages: int | None = None,
) -> None:
    """Reject PDFs whose selected page count exceeds the page limit.

    Files whose suffix is not ``.pdf`` (case-insensitive) are accepted
    without inspection.

    Args:
        file_path: Document to check.
        page_range: Page selection understood by ``pages_to_process_count``;
            an empty or ``None`` value means all pages.
        max_pages: Explicit limit; defaults to ``max_pages_limit()``.

    Raises:
        ValueError: If the page count cannot be read, the range is invalid,
            or the number of selected pages exceeds the limit.
    """
    path = Path(file_path)
    if path.suffix.lower() != ".pdf":
        return

    limit = max_pages if max_pages is not None else max_pages_limit()
    try:
        total = pdf_page_count(path)
    except Exception as exc:
        # Any reader failure is reported uniformly as an unreadable PDF.
        raise ValueError(f"Could not read PDF page count for {path.name}.") from exc

    count = pages_to_process_count(page_range, total)
    if count > limit:
        # Tolerate ``None`` here as well, as pages_to_process_count does.
        scope = (page_range or "").strip() or "all"
        raise ValueError(
            f"Number of pages to process ({count}) exceeds the maximum allowed "
            f"({limit}). Submit a smaller document or narrow the page range "
            f"({scope!r})."
        )
241
+
242
+
243
@dataclass(frozen=True)
class RedactionTaskSettings:
    """Immutable per-task redaction options chosen in the UI.

    The field defaults are computed when the class is defined: the VLM
    encouragement flags default to off for the HF Space profile and on
    otherwise.
    """

    ocr_method: str = DEFAULT_OCR_METHOD
    pii_method: str = DEFAULT_PII_METHOD
    encourage_vlm_faces: bool = not is_hf_space_profile()
    encourage_vlm_signatures: bool = not is_hf_space_profile()

    @classmethod
    def hf_space_defaults(cls) -> RedactionTaskSettings:
        """Return the fixed settings for the HF Space profile (no VLM encouragement)."""
        return cls(
            ocr_method=HF_DEFAULT_OCR,
            pii_method=HF_DEFAULT_PII,
            encourage_vlm_faces=False,
            encourage_vlm_signatures=False,
        )

    @classmethod
    def from_ui(
        cls,
        ocr_method: str,
        pii_method: str,
        encourage_vlm_faces: bool,
        encourage_vlm_signatures: bool,
    ) -> RedactionTaskSettings:
        """Build settings from raw UI values.

        Blank method names, and names missing from ``OCR_METHOD_CHOICES`` /
        ``PII_METHOD_CHOICES``, are replaced by the module defaults. The two
        flags are coerced to ``bool``.
        """
        chosen_ocr = (ocr_method or DEFAULT_OCR_METHOD).strip()
        chosen_pii = (pii_method or DEFAULT_PII_METHOD).strip()
        return cls(
            ocr_method=(
                chosen_ocr if chosen_ocr in OCR_METHOD_CHOICES else DEFAULT_OCR_METHOD
            ),
            pii_method=(
                chosen_pii if chosen_pii in PII_METHOD_CHOICES else DEFAULT_PII_METHOD
            ),
            encourage_vlm_faces=bool(encourage_vlm_faces),
            encourage_vlm_signatures=bool(encourage_vlm_signatures),
        )
279
+
280
+
281
def doc_redaction_gradio_url() -> str:
    """
    Base URL of the doc_redaction Gradio app used for ``/doc_redact`` and review APIs.

    Set ``DOC_REDACTION_GRADIO_URL`` in ``config/pi_agent.env`` (or the process environment).
    Reads the environment on each call so runtime overrides apply before ``tools.config``
    is imported (e.g. HF Space Docker ``ENV``, tests, and late ``load_dotenv``).

    Lookup order: the environment variable, then ``tools.config``, then a
    profile-specific built-in default when ``tools.config`` cannot be
    imported. Whitespace and trailing slashes are removed from configured
    values.
    """
    env_url = (os.environ.get("DOC_REDACTION_GRADIO_URL") or "").strip().rstrip("/")
    if env_url:
        return env_url

    try:
        from tools.config import DOC_REDACTION_GRADIO_URL
    except ImportError:
        if is_hf_space_profile():
            return HF_DEFAULT_GRADIO_URL
        return "http://127.0.0.1:7860"
    return str(DOC_REDACTION_GRADIO_URL).strip().rstrip("/")
300
+
301
+
302
def _default_gradio_url() -> str:
    """Back-compat alias of ``doc_redaction_gradio_url`` for prompt template substitution."""
    resolved_url = doc_redaction_gradio_url()
    return resolved_url
305
+
306
+
307
+ def _default_vlm_base_url() -> str:
308
+ return os.environ.get("PI_VLM_BASE_URL", "http://llama-inference:8080")
309
+
310
+
311
+ def _default_vlm_model() -> str:
312
+ return os.environ.get("PI_VLM_MODEL", "unsloth/Qwen3.6-27B-MTP-GGUF")
313
+
314
+
315
def load_template(path: Path | None = None) -> str:
    """Read the prompt template as UTF-8 text.

    Args:
        path: Template file; defaults to ``partnership_template_path()``.

    Raises:
        FileNotFoundError: If the chosen template is not an existing file.
    """
    source = path or partnership_template_path()
    if source.is_file():
        return source.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Prompt template not found: {source}")
320
+
321
+
322
def format_user_requirements(instructions: str) -> str:
    """Turn free-form instructions into a newline-joined bullet list.

    Each non-blank line is stripped and, unless it already starts with
    ``-``, prefixed with ``- ``. Blank lines are dropped.
    """
    stripped_lines = (raw.strip() for raw in instructions.strip().splitlines())
    bullets = [
        entry if entry.startswith("-") else f"- {entry}"
        for entry in stripped_lines
        if entry
    ]
    return "\n".join(bullets)
332
+
333
+
334
def replace_user_requirements_section(template: str, instructions: str) -> str:
    """Insert the formatted user requirements as the template's final section.

    When the ``## User redaction requirements`` heading exists, everything
    from its first occurrence onward is replaced. Otherwise the section is
    appended after the right-stripped template.
    """
    marker = "## User redaction requirements"
    bullet_text = format_user_requirements(instructions)
    section = f"{marker} (authoritative for this task)\n\n{bullet_text}\n"

    position = template.find(marker)
    if position < 0:
        return f"{template.rstrip()}\n\n{section}"
    return template[:position] + section
342
+
343
+
344
+ def _is_textract_ocr_method(ocr_method: str) -> bool:
345
+ lowered = ocr_method.casefold()
346
+ return "textract" in lowered or lowered in {"textract", "aws textract"}
347
+
348
+
349
def build_vlm_faces_guidance(encourage: bool) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_FACES`` entity.

    The HF Space profile always gets the "not available" wording. Otherwise
    ``encourage`` selects between the permissive and the restrictive
    instruction.
    """
    if is_hf_space_profile():
        guidance = (
            "Pass 2 VLM and CUSTOM_VLM_FACES are not available on this deployment. "
            "Do not pass CUSTOM_VLM_FACES or request face detection."
        )
    elif encourage:
        guidance = (
            "If the user asks to redact faces, then pass the entity CUSTOM_VLM_FACES "
            "in the initial redaction entity selection"
        )
    else:
        guidance = (
            "Do not pass CUSTOM_VLM_FACES in the initial redaction entity list unless "
            "the user explicitly asks to redact faces"
        )
    return guidance
364
+
365
+
366
def build_vlm_signature_guidance(encourage: bool, ocr_method: str) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_SIGNATURE`` entity.

    The HF Space profile always gets the "not available" wording. When
    ``encourage`` is set, a Textract OCR method adds the
    ``handwrite_signature_textbox`` hint; without ``encourage`` the
    restrictive instruction is returned.
    """
    if is_hf_space_profile():
        return (
            "Pass 2 VLM and CUSTOM_VLM_SIGNATURE are not available on this deployment. "
            "Do not pass CUSTOM_VLM_SIGNATURE or request signature detection."
        )
    if not encourage:
        return (
            "Do not pass CUSTOM_VLM_SIGNATURE in the initial redaction entity list unless "
            "the user explicitly asks to redact signatures"
        )
    if not _is_textract_ocr_method(ocr_method):
        return (
            "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
            "entity in the initial redaction entity selection"
        )
    return (
        "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
        "entity in the initial redaction entity selection, unless the text extraction "
        "option is AWS Textract, in which case the handwrite_signature_textbox parameter "
        "for the doc_redact endpoint should include 'Extract signatures'"
    )
388
+
389
+
390
def build_local_redaction_client_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str = "",
) -> str:
    """Pi agent and doc_redaction on the same host (local dev / shared Docker volumes).

    Args:
        gradio_url: Base URL of the doc_redaction app shown in the guidance.
        output_base: Task output folder; ``output_redact/`` is appended to it.
        workspace_root: Used only to build a fallback helper path when
            ``pi_workspace_skills`` cannot be imported.

    Returns:
        Markdown bullet list with backend and file-handling instructions.
    """
    output_redact = f"{output_base.rstrip('/')}/output_redact/"
    # Prefer the synced helper location; fall back to a descriptive path.
    try:
        from pi_workspace_skills import remote_redaction_helper_module

        helpers = remote_redaction_helper_module()
    except ImportError:
        helpers = (
            f"{workspace_root.rstrip('/')}/.pi/helpers/remote_redaction.py"
            if workspace_root.strip()
            else "`.pi/helpers/remote_redaction.py` (under `PI_WORKSPACE_DIR`)"
        )
    doc_output_hint = ""
    # Name the server's output folder when tools.config is importable;
    # otherwise emit a generic warning about ``output_dir``.
    try:
        from tools.config import OUTPUT_FOLDER, SESSION_OUTPUT_FOLDER

        doc_output_hint = (
            f"- **doc_redaction writes to** `{OUTPUT_FOLDER}`"
            + (
                " (per-user subfolders when `SESSION_OUTPUT_FOLDER=True`). "
                if SESSION_OUTPUT_FOLDER
                else ". "
            )
            + "Do **not** pass a Pi workspace path as `output_dir` — the server only "
            "accepts directories under that folder.\n"
        )
    except ImportError:
        doc_output_hint = (
            "- Do **not** pass a Pi workspace path as `/doc_redact` `output_dir` — "
            "the server restricts `output_dir` to its own `OUTPUT_FOLDER`.\n"
        )
    return (
        f"- **Local doc_redaction backend:** `{gradio_url}` (same machine as this workspace).\n"
        f"{doc_output_hint}"
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n"
        f"- Call **`/doc_redact`** (omit `output_dir` or leave it empty), then copy artifacts "
        f"into `{output_redact}` with `remote_redaction.resolve_redaction_output_paths` "
        f"and `fetch_redaction_files`.\n"
        "- When the API returns **Windows paths** (`C:\\\\...`) or paths under "
        "`workspace/.gradio_uploads/`, **copy from disk** with `shutil.copy2` — do not "
        "assume `gradio_api/file=` works (403 until allowed_paths includes that folder).\n"
        "- Path walkers must accept Windows drive paths, not only strings starting with `/`.\n"
        f"- Use `{helpers}`: `extract_server_paths(result)` "
        "then `fetch_redaction_files(paths, dest_dir)` (local copy, then HTTP fallback).\n"
    )
441
+
442
+
443
def build_hf_space_backend_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """Build the backend guidance bullets for the Hugging Face Space deployment.

    Args:
        gradio_url: URL of the remote doc_redaction Space named in the text.
        output_base: Folder that downloaded outputs should be written into.
        workspace_root: Session folder mentioned in the upload/helper rules.

    Returns:
        Markdown bullet list (no trailing newline after the last bullet).
    """
    from pi_workspace_skills import remote_redaction_helper_module

    helpers = remote_redaction_helper_module()
    # The CLI wrapper sits in the same directory as the helper module.
    helpers_dir = helpers.rsplit("/", 1)[0]
    run_cli = f"{helpers_dir}/run_doc_redact.py"
    base = _workspace_root().as_posix().rstrip("/")
    output_dest = output_base.rstrip("/") + "/"
    return (
        f"- **Remote redaction backend (authoritative URL):** `{gradio_url}` **only**. "
        "This Pi Space orchestrates a separate private doc_redaction Hugging Face Space "
        "over HTTPS.\n"
        "- **Read `/skill:hf-space-deployment` first** — it overrides Docker/local URLs "
        "(`host.docker.internal`, `localhost`, `redaction:7861`, internal service names) "
        "that appear in generic skills for local-docker or AWS ECS.\n"
        f"- **Helper module (workspace base, not session folder):** `{helpers}` and "
        f"`{run_cli}` under `{base}/.pi/helpers/`. "
        f"Do **not** look for `{workspace_root.rstrip('/')}/.pi/helpers/`.\n"
        f"- **First redaction call:** run `{run_cli}` once (see `/skill:hf-space-deployment`) "
        "— **do not** write `run_redact.py` in your session folder.\n"
        "- **Do not** probe alternate hosts, rewrite the helper, or hand-roll a new "
        "Gradio client script. Import `make_redaction_client`, `fetch_redaction_files`, "
        "and `resolve_redaction_output_paths` from that file (`HF_TOKEN` is already in "
        "the Pi subprocess environment).\n"
        "- Use **`gradio_client` only** — upload local files with `handle_file()` from "
        f"`{workspace_root.rstrip('/')}/`. **Do not** call `/agent/*` routes or use "
        "server-side paths from the redaction container.\n"
        f"- Download all `/doc_redact` and `/review_apply` outputs via "
        f"`{gradio_url.rstrip('/')}/gradio_api/file=…` with "
        f"`Authorization: Bearer $HF_TOKEN` into `{output_dest}` (create subdirs as needed).\n"
        "- On Hugging Face rate limits (`TooManyRequestsError`), wait and retry the **same** "
        "URL via the helper — do not switch to another host.\n"
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n"
        "- Run **`verify_redaction_coverage`** locally on downloaded CSV/PDF paths in this "
        "workspace (pandas/PyMuPDF), not via Agent API.\n"
        "- **Pass 2 VLM is not available** — do not call a VLM endpoint or use "
        "`CUSTOM_VLM_FACES` / `CUSTOM_VLM_SIGNATURE` entities.\n"
        "- **User-facing updates:** write progress and reasoning as normal assistant text. "
        "Do not put commentary in bash `#` comments — the UI shows those as tool lines.\n"
        f"- Helper module: `{helpers}`."
    )
489
+
490
+
491
def build_split_container_redaction_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """AWS ECS (and similar): Pi agent and doc_redaction are separate containers.

    Builds the bullet-list guidance text that tells the agent how to reach the
    redaction service over ``gradio_client``, where to put downloaded outputs,
    and how to run coverage verification before and after ``/review_apply``.

    Args:
        gradio_url: URL of the doc_redaction Gradio service, interpolated into
            the ``Client(...)`` example and the ``/agent/verify_redaction_coverage``
            endpoint (trailing ``/`` stripped for the latter).
        output_base: Output directory for this document; a trailing ``/`` is
            stripped before ``output_redact/`` and
            ``review/output_review_final/`` are appended.
        workspace_root: Workspace directory shown in the upload example and
            named as the place where the helper module is *not* located.

    Returns:
        The guidance as one string; bullets are newline-separated and the final
        bullet has no trailing newline.
    """
    # Imported at call time rather than at module import time.
    from pi_workspace_skills import remote_redaction_helper_module

    output_redact = f"{output_base.rstrip('/')}/output_redact/"
    helpers = remote_redaction_helper_module()
    # Helper location is reported relative to _workspace_root(), not the
    # ``workspace_root`` argument; the text below contrasts the two.
    base = _workspace_root().as_posix().rstrip("/")
    return (
        f"- **Split-container redaction backend:** doc_redaction runs at `{gradio_url}` "
        "(separate service from this Pi agent). Use **`gradio_client` only**.\n"
        f"- **Helper module (workspace base):** `{helpers}` under `{base}/.pi/helpers/` "
        f"(not `{workspace_root.rstrip('/')}/.pi/helpers/`).\n"
        f"- **Deliverables belong in your session workspace:** `{output_redact}` "
        f"(and `{output_base.rstrip('/')}/review/output_review_final/` after apply). "
        "That is the **only** output tree you should populate for this task.\n"
        "- **Do not** search this container for redaction outputs: no `find /workspace`, "
        "no `ls /home/user/app/output`, no `import tools.config OUTPUT_FOLDER` on the Pi "
        "agent — those paths are on the **redaction service**, not here (or are a read-only "
        "git checkout without live run artifacts).\n"
        "- Do not pass `CUSTOM_FUZZY` in `redact_entities` on `/doc_redact` unless the user explicitly requests fuzzy matching; it can be very CPU/RAM intensive and may return an empty path list even when the job completes. Use `CUSTOM` with an explicit `deny_list` on `/doc_redact`, or use `/redact_document` with `max_fuzzy_spelling_mistakes_num > 0` for fuzzy matching.\n"
        f'- **Initial redaction:** `Client("{gradio_url}")` → `/doc_redact` with '
        f"`document_file=handle_file(\"<file under {workspace_root.rstrip('/')}/>\")`. "
        "Omit `output_dir` (server picks its own `OUTPUT_FOLDER`).\n"
        f"- **Collect paths:** `extract_server_paths(result)` from the predict tuple. "
        "When the path list is `[]`, parse the status `message` for embedded paths, or retry "
        "once — **do not** spend turns grepping the filesystem.\n"
        f'- **Download:** `fetch_redaction_files(paths, "{output_redact}")` from '
        f"`{helpers}` (HTTP `GET /gradio_api/file=` — no shared disk copy).\n"
        "- **Coverage verify (split-container):** `/agent/*` paths must already exist on "
        "the **redaction server** under its `OUTPUT_FOLDER` (e.g. `/home/user/app/output/...`) "
        "— not on this Pi container. **Pre-apply** (CSV edited here): download artifacts via "
        "`fetch_redaction_files`, then run `python tools/verify_redaction_coverage.py` on "
        "those local copies (the edited review CSV is not on the redaction server). "
        "**Post-apply** (after `/review_apply`): call "
        f"`POST {gradio_url.rstrip('/')}/agent/verify_redaction_coverage` with "
        "**server paths** from `extract_server_paths(review_apply result)` for "
        "`review_csv_path`, `ocr_words_csv_path` (from `/doc_redact`), and "
        "`redacted_pdf_path`. **Do not** pass Pi workspace paths, `/tmp/gradio_tmp/...` "
        "upload paths, or import `verify_redaction_coverage()` expecting redaction-server "
        "paths to resolve from this container.\n"
        f"- Helper module (inside workspace boundary): `{helpers}`."
    )
538
+
539
+
540
def build_remote_backend_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """Return the backend guidance text matching the active deployment profile.

    The Hugging Face Space profile is checked first, then the AWS ECS profile;
    any other deployment gets the local redaction-client guidance. All three
    builders receive the same keyword arguments unchanged.
    """
    if is_hf_space_profile():
        builder = build_hf_space_backend_guidance
    elif is_aws_ecs_profile():
        builder = build_split_container_redaction_guidance
    else:
        builder = build_local_redaction_client_guidance
    return builder(
        gradio_url=gradio_url,
        output_base=output_base,
        workspace_root=workspace_root,
    )
563
+
564
+
565
+ def _resolve_and_validate_upload_path(upload_path: str | Path) -> Path:
566
+ if not isinstance(upload_path, (str, Path)):
567
+ raise ValueError("Uploaded file path has an invalid type.")
568
+ if not str(upload_path).strip():
569
+ raise ValueError("Uploaded file path is empty.")
570
+
571
+ root = upload_root()
572
+ raw_path = Path(upload_path).expanduser()
573
+ try:
574
+ source = raw_path.resolve(strict=True)
575
+ except FileNotFoundError as exc:
576
+ raise FileNotFoundError(f"Uploaded file not found: {raw_path}") from exc
577
+
578
+ try:
579
+ source.relative_to(root)
580
+ except ValueError as exc:
581
+ raise ValueError(
582
+ f"Uploaded file path resolves outside allowed upload root: {source}"
583
+ ) from exc
584
+ if not source.is_file():
585
+ raise FileNotFoundError(f"Uploaded file not found: {source}")
586
+ if source.is_symlink():
587
+ raise ValueError(f"Symlink uploads are not allowed: {source}")
588
+ return source
589
+
590
+
591
+ def _resolve_and_validate_workspace_dir(workspace_dir: Path | None) -> Path:
592
+ if workspace_dir is not None and not isinstance(workspace_dir, Path):
593
+ raise ValueError("Workspace path has an invalid type.")
594
+ base_root = _workspace_root().resolve()
595
+ candidate = (
596
+ workspace_dir if workspace_dir is not None else _workspace_root()
597
+ ).resolve()
598
+ try:
599
+ candidate.relative_to(base_root)
600
+ except ValueError as exc:
601
+ raise ValueError(
602
+ f"Workspace path resolves outside allowed workspace root: {candidate}"
603
+ ) from exc
604
+ return candidate
605
+
606
+
607
def copy_upload_to_workspace(
    upload_path: str | Path,
    *,
    workspace_dir: Path | None = None,
) -> tuple[Path, str | None]:
    """
    Place a validated upload inside the session workspace.

    The upload and the workspace directory are both validated first; the file
    is then copied under a workspace-safe name. Returns
    ``(destination_path, original_basename)``; ``original_basename`` is ``None``
    unless the file had to be renamed for path safety.
    """
    src = _resolve_and_validate_upload_path(upload_path)
    if not src.is_file():
        raise FileNotFoundError(f"Uploaded file not found: {src}")
    target_dir = _resolve_and_validate_workspace_dir(workspace_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    original_name, safe_name, was_renamed = _workspace_filename_from_upload(src.name)
    dest = (target_dir / safe_name).resolve()
    try:
        dest.relative_to(target_dir)
    except ValueError as exc:
        raise ValueError(f"Destination path is outside workspace: {dest}") from exc

    if src != dest:
        # Contents only: copy2/copystat raises EPERM when overwriting on
        # Docker Desktop bind mounts.
        shutil.copyfile(src, dest)
    renamed_from = original_name if was_renamed else None
    return dest, renamed_from
633
+
634
+
635
+ def _strip_long_document_section(template: str) -> str:
636
+ """Remove the 100+ page operator block (keeps user requirements)."""
637
+ marker = "## Specific rules for long documents"
638
+ start = template.find(marker)
639
+ if start == -1:
640
+ return template
641
+ end = template.find("## User redaction requirements", start)
642
+ if end == -1:
643
+ return template[:start].rstrip() + "\n\n"
644
+ return template[:start].rstrip() + "\n\n" + template[end:]
645
+
646
+
647
+ def _include_long_document_rules(page_range: str, total_pages: int) -> bool:
648
+ if total_pages <= 0:
649
+ return False
650
+ if total_pages >= 100:
651
+ return True
652
+ return pages_to_process_count(page_range or "all", total_pages) >= 100
653
+
654
+
655
def build_redaction_prompt(
    file_name: str,
    user_instructions: str,
    *,
    page_range: str = "all",
    template: str | None = None,
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
    total_pages: int = 0,
) -> str:
    """Fill the prompt template for one redaction task.

    Placeholders such as ``{FILE_NAME}`` and ``{OUTPUT_BASE}`` are replaced by
    values derived from the workspace and task settings, backend guidance is
    inserted (or its bullet removed when there is none), an extra skills-table
    row is added for the Hugging Face Space profile, the long-document section
    is stripped when it does not apply, and the user's requirements are
    spliced in last.

    Raises:
        ValueError: If *file_name* or *user_instructions* is blank.
    """
    if not file_name.strip():
        raise ValueError("A document file name is required.")
    if not user_instructions.strip():
        raise ValueError("Redaction requirements are required (use bullet points).")

    task_settings = settings or RedactionTaskSettings()
    workspace_root = (workspace_dir or _workspace_root()).resolve()
    # Only the basename is used so the input always lives directly in the workspace.
    file_name = Path(file_name).name
    root_prefix = workspace_root.as_posix().rstrip("/")
    input_path = f"{root_prefix}/{file_name}"
    output_base = f"{root_prefix}/redact/{file_name}/"

    prompt = load_template() if template is None else template
    remote_guidance = build_remote_backend_guidance(
        gradio_url=_default_gradio_url(),
        output_base=output_base,
        workspace_root=workspace_root.as_posix(),
    )
    substitutions = {
        "{FILE_NAME}": file_name,
        "{INPUT_PATH}": input_path,
        "{OUTPUT_BASE}": output_base,
        "{GRADIO_URL}": _default_gradio_url(),
        "{PAGE_RANGE}": page_range.strip() or "all",
        "{VLM_BASE_URL}": _default_vlm_base_url(),
        "{VLM_MODEL}": _default_vlm_model(),
        "{DEFAULT_OCR_METHOD}": task_settings.ocr_method,
        "{DEFAULT_PII_METHOD}": task_settings.pii_method,
        "{VLM_FACES_GUIDANCE}": build_vlm_faces_guidance(
            task_settings.encourage_vlm_faces
        ),
        "{VLM_SIGNATURE_GUIDANCE}": build_vlm_signature_guidance(
            task_settings.encourage_vlm_signatures,
            task_settings.ocr_method,
        ),
    }
    if remote_guidance:
        substitutions["{REMOTE_BACKEND_GUIDANCE}"] = remote_guidance
    else:
        # No guidance for this backend: remove the whole bullet line.
        prompt = prompt.replace("- {REMOTE_BACKEND_GUIDANCE}\n", "")
    for placeholder, value in substitutions.items():
        prompt = prompt.replace(placeholder, value)

    if is_hf_space_profile():
        hf_row = (
            "| **0 — HF deployment (read first)** | `hf-space-deployment` | "
            "`.pi/skills/hf-space-deployment/SKILL.md` | "
            "Use `run_doc_redact.py`; do not hand-roll Gradio clients |\n"
        )
        first_row = "| **1 — Initial redaction** |"
        # Insert once, ahead of the first table row, unless already present.
        if first_row in prompt and hf_row not in prompt:
            prompt = prompt.replace(first_row, hf_row + first_row, 1)

    if not _include_long_document_rules(page_range, total_pages):
        prompt = _strip_long_document_section(prompt)

    return replace_user_requirements_section(prompt, user_instructions)
721
+
722
+
723
def prepare_redaction_task(
    upload_path: str | Path | None,
    user_instructions: str,
    *,
    page_range: str = "all",
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
) -> tuple[str, str, str | None]:
    """
    Stage an upload in the workspace and build its redaction prompt.

    Returns ``(file_name, full_prompt, renamed_from)``. ``renamed_from`` holds
    the original upload basename when it was changed for path safety and is
    ``None`` otherwise.

    Raises ``ValueError`` when *upload_path* is ``None``.
    """
    if upload_path is None:
        raise ValueError("Please upload a document.")

    root = _resolve_and_validate_workspace_dir(workspace_dir)
    validate_pdf_page_limit(upload_path, page_range=page_range)
    dest, renamed_from = copy_upload_to_workspace(upload_path, workspace_dir=root)

    # A page count that cannot be read is treated as unknown (0).
    page_total = 0
    if str(dest).lower().endswith(".pdf"):
        try:
            page_total = pdf_page_count(dest)
        except (ValueError, OSError):
            page_total = 0

    full_prompt = build_redaction_prompt(
        dest.name,
        user_instructions,
        page_range=page_range,
        settings=settings,
        workspace_dir=root,
        total_pages=page_total,
    )
    return dest.name, full_prompt, renamed_from
agent-redact/pi/remote_redaction.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio client helpers for remote doc_redaction HF Space backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Any
10
+ from urllib.parse import quote
11
+
12
+ import httpx
13
+ from gradio_client import Client
14
+
15
+ DEFAULT_CONNECT_TIMEOUT = 120.0
16
+ DEFAULT_READ_TIMEOUT = 1800.0
17
+ _DEFAULT_REDACT_ENTITIES = (
18
+ "PERSON",
19
+ "EMAIL_ADDRESS",
20
+ "PHONE_NUMBER",
21
+ "STREETNAME",
22
+ "UKPOSTCODE",
23
+ "TITLES",
24
+ "CUSTOM",
25
+ )
26
+
27
+ _CLIENT_CACHE: dict[tuple[str, str], Client] = {}
28
+
29
+
30
def split_redaction_backend() -> bool:
    """Report whether Pi and doc_redaction lack a shared filesystem (ECS, HF Space, …).

    Delegates to ``pi_agent_config.uses_split_redaction_backend``; an
    ``ImportError`` is treated as "not split" and yields ``False``.
    """
    try:
        from pi_agent_config import uses_split_redaction_backend as _uses_split

        is_split = _uses_split()
    except ImportError:
        is_split = False
    return is_split
38
+
39
+
40
def redaction_base_url() -> str:
    """Return the base URL of the doc_redaction Gradio service.

    ``DOC_REDACTION_GRADIO_URL`` wins when set (whitespace and trailing ``/``
    removed). Otherwise the URL comes from
    ``redaction_prompt.doc_redaction_gradio_url``, or from the literal
    ``http://127.0.0.1:7860`` when that import fails.
    """
    override = os.environ.get("DOC_REDACTION_GRADIO_URL", "").strip().rstrip("/")
    if not override:
        try:
            from redaction_prompt import doc_redaction_gradio_url as _prompt_url

            return _prompt_url()
        except ImportError:
            return "http://127.0.0.1:7860"
    return override
50
+
51
+
52
def redaction_hf_token() -> str | None:
    """Return the Hugging Face token for the redaction backend, if configured.

    ``HF_TOKEN`` is preferred over ``DOC_REDACTION_HF_TOKEN``. Each variable is
    stripped before it is considered, so a blank or whitespace-only ``HF_TOKEN``
    no longer hides a valid ``DOC_REDACTION_HF_TOKEN``.

    Returns:
        The stripped token, or ``None`` when neither variable holds a value.
    """
    for key in ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"):
        value = (os.environ.get(key) or "").strip()
        if value:
            return value
    return None
55
+
56
+
57
def redaction_gradio_auth() -> tuple[str, str] | None:
    """
    Optional Gradio HTTP basic-auth credentials for doc_redaction (``COGNITO_AUTH=True``).

    Reads ``DOC_REDACTION_GRADIO_AUTH_USER`` (stripped) and
    ``DOC_REDACTION_GRADIO_AUTH_PASSWORD`` (used as-is), e.g. a dedicated
    Cognito service account rather than the Pi UI user's session. Returns
    ``(user, password)`` only when both are non-empty, otherwise ``None``.
    """
    env = os.environ
    username = env.get("DOC_REDACTION_GRADIO_AUTH_USER", "").strip()
    secret = env.get("DOC_REDACTION_GRADIO_AUTH_PASSWORD", "")
    if not (username and secret):
        return None
    return (username, secret)
69
+
70
+
71
def httpx_timeout(
    *,
    connect: float = DEFAULT_CONNECT_TIMEOUT,
    read: float = DEFAULT_READ_TIMEOUT,
) -> httpx.Timeout:
    """Build an ``httpx.Timeout``; write and pool limits reuse *connect*."""
    limits = {
        "connect": connect,
        "read": read,
        "write": connect,
        "pool": connect,
    }
    return httpx.Timeout(**limits)
77
+
78
+
79
+ def _quota_retry_attempts() -> int:
80
+ for key in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
81
+ raw = (os.environ.get(key) or "").strip()
82
+ if raw.isdigit():
83
+ return max(1, int(raw))
84
+ return 5
85
+
86
+
87
+ def _quota_retry_delay_s() -> int:
88
+ raw = (os.environ.get("PI_QUOTA_RETRY_DELAY_S") or "60").strip()
89
+ try:
90
+ return max(1, int(raw))
91
+ except ValueError:
92
+ return 60
93
+
94
+
95
def is_gradio_rate_limit_error(exc: BaseException) -> bool:
    """Return True when *exc* looks like an HTTP 429 / rate-limit failure.

    An exception whose class is named ``TooManyRequestsError`` always counts.
    Otherwise the message is searched (case-insensitively) for "too many
    requests", "rate limit", "rate-limit", or the status code 429.

    The status code must stand alone: it is not matched when it is part of a
    longer digit run (a port such as 14290, a file or job id), which would
    otherwise trigger pointless retry sleeps for unrelated errors.
    """
    import re

    if type(exc).__name__ == "TooManyRequestsError":
        return True
    lowered = str(exc).lower()
    if any(
        marker in lowered
        for marker in ("too many requests", "rate limit", "rate-limit")
    ):
        return True
    return re.search(r"(?<!\d)429(?!\d)", lowered) is not None
103
+
104
+
105
def clear_redaction_client_cache() -> None:
    """Empty the module-level gradio_client cache.

    Intended for tests or after credentials have been rotated, so the next
    ``make_redaction_client`` call builds a fresh client.
    """
    _CLIENT_CACHE.clear()
108
+
109
+
110
def make_redaction_client(
    base_url: str | None = None,
    hf_token: str | None = None,
    *,
    force_new: bool = False,
    verbose: bool = False,
) -> Client:
    """
    Return a gradio_client ``Client`` for the doc_redaction backend.

    Basic-auth credentials from ``redaction_gradio_auth()`` take precedence
    over the token (passed as ``token=``, gradio_client 2.x); with neither, an
    anonymous client is created. Rate-limit errors are retried after
    ``_quota_retry_delay_s()`` seconds, up to ``_quota_retry_attempts()``
    attempts; any other error propagates immediately. One client is cached per
    URL/token/auth combination unless *force_new* is set, so repeated calls do
    not rebuild the client each time.
    """
    url = (base_url or redaction_base_url()).rstrip("/")
    token = redaction_hf_token() if hf_token is None else hf_token
    auth = redaction_gradio_auth()
    cache_key = (url, token or "", auth or ())
    if not force_new:
        cached = _CLIENT_CACHE.get(cache_key)
        if cached is not None:
            return cached

    client_kwargs: dict[str, Any] = {
        "httpx_kwargs": {"timeout": httpx_timeout()},
        "verbose": verbose,
    }
    if auth:
        client_kwargs["auth"] = auth
    elif token:
        client_kwargs["token"] = token

    max_attempts = _quota_retry_attempts()
    delay_s = _quota_retry_delay_s()
    attempt = 0
    while True:
        attempt += 1
        try:
            client = Client(url, **client_kwargs)
        except Exception as exc:
            # Give up on non-rate-limit errors, or once attempts are exhausted.
            if not is_gradio_rate_limit_error(exc) or attempt >= max_attempts:
                raise
            time.sleep(delay_s)
        else:
            _CLIENT_CACHE[cache_key] = client
            return client
159
+
160
+
161
def call_doc_redact(
    pdf_path: str | Path,
    dest_dir: str | Path,
    *,
    ocr_method: str | None = None,
    pii_method: str | None = None,
    deny_list: list[str] | None = None,
    allow_list: list[str] | None = None,
    redact_entities: list[str] | None = None,
    page_min: int | None = None,
    page_max: int | None = None,
) -> tuple[Any, list[Path]]:
    """
    Run ``/doc_redact`` and download outputs into *dest_dir*.

    Prefer this or ``run_doc_redact.py`` over hand-written Gradio scripts.

    Args:
        pdf_path: Local document to upload.
        dest_dir: Directory that receives the fetched output files.
        ocr_method, pii_method, deny_list, allow_list, page_min, page_max:
            Forwarded to ``/doc_redact`` only when provided.
        redact_entities: Entities to redact; ``_DEFAULT_REDACT_ENTITIES`` is
            used when omitted or empty.

    Returns:
        ``(result, saved)``: the raw predict result and the saved local paths.

    Raises:
        FileNotFoundError: If *pdf_path* is not an existing file.
    """
    from gradio_client import handle_file

    pdf = Path(pdf_path).expanduser().resolve()
    if not pdf.is_file():
        raise FileNotFoundError(f"PDF not found: {pdf}")

    predict_kwargs: dict[str, Any] = {
        "api_name": "/doc_redact",
        "document_file": handle_file(str(pdf)),
        "redact_entities": list(redact_entities or _DEFAULT_REDACT_ENTITIES),
    }
    if ocr_method:
        predict_kwargs["ocr_method"] = ocr_method
    if pii_method:
        predict_kwargs["pii_method"] = pii_method
    if deny_list:
        predict_kwargs["deny_list"] = deny_list
    if allow_list:
        predict_kwargs["allow_list"] = allow_list
    if page_min is not None:
        predict_kwargs["page_min"] = page_min
    if page_max is not None:
        predict_kwargs["page_max"] = page_max

    client = make_redaction_client()
    # Remember when this run began so the on-disk fallback (used when the
    # response carries no paths) ignores artifacts left by earlier runs of the
    # same document. The small slack allows for coarse filesystem timestamps.
    run_started_at = time.time() - 2.0
    result = client.predict(**predict_kwargs)
    paths = resolve_redaction_output_paths(
        result,
        document_stem=pdf.stem,
        run_started_at=run_started_at,
    )
    saved = fetch_redaction_files(paths, dest_dir)
    return result, saved
207
+
208
+
209
def is_gradio_file_path(value: str) -> bool:
    """Tell whether *value* is an absolute Unix or Windows path.

    After stripping whitespace, a Unix path must start with ``/`` and be longer
    than one character; a Windows path must start with a drive letter, a colon
    and a ``\\`` or ``/`` separator. Empty or ``None`` input is not a path.
    """
    candidate = (value or "").strip()
    if candidate.startswith("/"):
        return len(candidate) > 1
    if len(candidate) < 3:
        return False
    drive, colon, separator = candidate[0], candidate[1], candidate[2]
    return colon == ":" and drive.isalpha() and separator in ("\\", "/")
217
+
218
+
219
+ def _collect_paths(value: Any, out: list[str]) -> None:
220
+ if isinstance(value, str):
221
+ if is_gradio_file_path(value):
222
+ out.append(value.strip())
223
+ elif isinstance(value, dict):
224
+ path = value.get("path")
225
+ if isinstance(path, str) and is_gradio_file_path(path):
226
+ out.append(path.strip())
227
+ for item in value.values():
228
+ _collect_paths(item, out)
229
+ elif isinstance(value, (list, tuple)):
230
+ for item in value:
231
+ _collect_paths(item, out)
232
+
233
+
234
def extract_server_paths(result: Any) -> list[str]:
    """Collect the server file paths in a predict result, first occurrence first.

    The result is walked with ``_collect_paths`` and repeated paths are
    dropped while the original order is kept.
    """
    collected: list[str] = []
    _collect_paths(result, collected)
    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(collected))
245
+
246
+
247
def doc_redaction_output_root() -> Path | None:
    """Locate the doc_redaction output folder, if it can be determined.

    Uses ``tools.config.OUTPUT_FOLDER`` when the main app config is importable.
    On ``ImportError`` the ``DOC_REDACTION_OUTPUT_FOLDER`` environment variable
    is used instead; ``None`` is returned when it is blank or cannot be
    resolved.
    """
    try:
        from tools.config import OUTPUT_FOLDER as configured

        return Path(configured).resolve()
    except ImportError:
        fallback = os.environ.get("DOC_REDACTION_OUTPUT_FOLDER", "").strip()
    if not fallback:
        return None
    try:
        return Path(fallback).resolve()
    except OSError:
        return None
261
+
262
+
263
def discover_redaction_outputs(
    document_stem: str,
    *,
    since: float | None = None,
) -> list[str]:
    """
    Fallback when ``/doc_redact`` returns ``[]``: glob the doc_redaction output tree.

    Matches filenames containing *document_stem* (e.g. ``example_of_emails``).
    The stem is matched literally, so characters such as ``[``, ``?`` or ``*``
    in a document name are not treated as glob wildcards.
    When *since* is set, only files with ``mtime >= since`` are returned.

    Returns:
        Sorted resolved paths. The list is empty when the stem is blank, the
        backend is split (no shared filesystem), the output root is unknown or
        missing, or the directory walk fails with ``OSError``.
    """
    import glob

    stem = (document_stem or "").strip()
    if not stem:
        return []
    if split_redaction_backend():
        return []

    root = doc_redaction_output_root()
    if root is None or not root.is_dir():
        return []

    pattern = f"*{glob.escape(stem)}*"
    found: list[str] = []
    try:
        for path in root.rglob(pattern):
            if not path.is_file():
                continue
            if since is not None:
                try:
                    if path.stat().st_mtime < since:
                        continue
                except OSError:
                    # File vanished or is unreadable; skip it.
                    continue
            found.append(str(path.resolve()))
    except OSError:
        return []
    return sorted(found)
300
+
301
+
302
def resolve_redaction_output_paths(
    result: Any,
    *,
    document_stem: str = "",
    run_started_at: float | None = None,
) -> list[str]:
    """
    Work out which output files a ``/doc_redact`` call produced.

    Paths embedded in the Gradio response are preferred. When there are none
    and *document_stem* is given, ``OUTPUT_FOLDER`` is searched on disk
    (including per-user session subfolders), limited to files modified at or
    after *run_started_at* when that is set. Returns ``[]`` if nothing is found.
    """
    embedded = extract_server_paths(result)
    if embedded or not document_stem:
        return embedded
    return discover_redaction_outputs(document_stem, since=run_started_at) or []
325
+
326
+
327
def _download_via_gradio_http(
    paths: list[str],
    dest: Path,
    *,
    base_url: str,
    hf_token: str | None,
) -> list[Path]:
    """Download server files through ``GET {base_url}/gradio_api/file=<path>``.

    Args:
        paths: Server-side file paths (Unix or Windows style).
        dest: Existing local directory that receives the files.
        base_url: Gradio base URL without a trailing slash.
        hf_token: Optional token sent as a ``Bearer`` authorization header.

    Returns:
        The local paths written, in the same order as *paths*.

    Raises:
        httpx.HTTPStatusError: If any request returns an error status.
    """
    from pathlib import PureWindowsPath

    headers: dict[str, str] = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token.strip()}"

    downloaded: list[Path] = []
    with httpx.Client(timeout=httpx_timeout(), headers=headers) as http:
        for path in paths:
            file_url = f"{base_url}/gradio_api/file={quote(path, safe='')}"
            # A Windows server path (``C:\out\a.pdf``) contains no ``/``, so on
            # a POSIX client Path(path).name would be the whole string. Split
            # non-Unix paths with Windows rules to get the real basename.
            if path.startswith("/"):
                file_name = Path(path).name
            else:
                file_name = PureWindowsPath(path).name
            local_path = dest / file_name
            response = http.get(file_url)
            response.raise_for_status()
            local_path.write_bytes(response.content)
            downloaded.append(local_path)
    return downloaded
348
+
349
+
350
def fetch_redaction_files(
    paths: list[str],
    dest_dir: str | Path,
    *,
    base_url: str | None = None,
    hf_token: str | None = None,
) -> list[Path]:
    """
    Bring redaction outputs into *dest_dir* (created if missing).

    Entries that are not absolute file paths are ignored. When Pi and
    doc_redaction share a filesystem (typical local dev) each file is copied
    straight from disk; files that cannot be copied that way, and every file
    on a split backend, are fetched with ``GET /gradio_api/file=``. Locally
    copied files come first in the returned list, followed by downloads.
    """
    url = (base_url or redaction_base_url()).rstrip("/")
    token = redaction_hf_token() if hf_token is None else hf_token

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    def _copy_from_disk(server_path: str) -> Path | None:
        """Copy one file from the shared disk; None means "use HTTP instead"."""
        candidate = Path(server_path)
        try:
            if not candidate.is_file():
                return None
            target = dest / candidate.name
            if candidate.resolve() == target.resolve():
                # Already in place: nothing to copy.
                return candidate.resolve()
            shutil.copy2(candidate, target)
            return target
        except OSError:
            return None

    saved: list[Path] = []
    pending_http: list[str] = []
    http_only = split_redaction_backend()
    for path in paths:
        if not is_gradio_file_path(path):
            continue
        copied = None if http_only else _copy_from_disk(path)
        if copied is None:
            pending_http.append(path)
        else:
            saved.append(copied)

    if pending_http:
        saved.extend(
            _download_via_gradio_http(pending_http, dest, base_url=url, hf_token=token)
        )
    return saved
395
+
396
+
397
def download_gradio_files(
    paths: list[str],
    dest_dir: str | Path,
    *,
    base_url: str | None = None,
    hf_token: str | None = None,
) -> list[Path]:
    """Older name kept for compatibility; forwards to :func:`fetch_redaction_files`."""
    saved = fetch_redaction_files(
        paths,
        dest_dir,
        base_url=base_url,
        hf_token=hf_token,
    )
    return saved
agent-redact/pi/run_doc_redact.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """One-shot ``/doc_redact`` CLI for Pi agents (HF Space / split-container backends)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Allow ``python3 …/run_doc_redact.py`` without installing the package.
12
+ _HELPERS = Path(__file__).resolve().parent
13
+ if str(_HELPERS) not in sys.path:
14
+ sys.path.insert(0, str(_HELPERS))
15
+
16
+ from remote_redaction import call_doc_redact # noqa: E402
17
+
18
+
19
+ def _parse_list(raw: str | None) -> list[str] | None:
20
+ if raw is None or not str(raw).strip():
21
+ return None
22
+ text = str(raw).strip()
23
+ if text.startswith("["):
24
+ parsed = json.loads(text)
25
+ if isinstance(parsed, list):
26
+ return [str(item) for item in parsed]
27
+ return [part.strip() for part in text.split(",") if part.strip()]
28
+
29
+
30
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: run ``/doc_redact`` for one PDF.

    Parses the options, checks that the PDF exists, calls ``call_doc_redact``,
    then prints the status message (or a default) followed by one saved path
    per line. Returns 0 on success and 2 when the PDF is missing.
    """
    parser = argparse.ArgumentParser(
        description="Run /doc_redact via remote_redaction.make_redaction_client()."
    )
    parser.add_argument(
        "--pdf", required=True, help="Local PDF path (session workspace)."
    )
    parser.add_argument(
        "--dest",
        required=True,
        help="Directory for downloaded artifacts (e.g. …/output_redact/).",
    )
    for plain_flag in ("--ocr-method", "--pii-method"):
        parser.add_argument(plain_flag, default=None)
    list_options = (
        ("--deny-list", "Comma-separated or JSON list for CUSTOM deny terms."),
        ("--allow-list", "Comma-separated or JSON list for allow terms."),
        (
            "--redact-entities",
            "Comma-separated or JSON list (default: PERSON, EMAIL, …, CUSTOM).",
        ),
    )
    for flag, help_text in list_options:
        parser.add_argument(flag, default=None, help=help_text)
    for page_flag in ("--page-min", "--page-max"):
        parser.add_argument(page_flag, type=int, default=None)
    args = parser.parse_args(argv)

    pdf = Path(args.pdf).expanduser().resolve()
    if not pdf.is_file():
        print(f"PDF not found: {pdf}", file=sys.stderr)
        return 2

    result, saved = call_doc_redact(
        pdf,
        args.dest,
        ocr_method=args.ocr_method,
        pii_method=args.pii_method,
        deny_list=_parse_list(args.deny_list),
        allow_list=_parse_list(args.allow_list),
        redact_entities=_parse_list(args.redact_entities),
        page_min=args.page_min,
        page_max=args.page_max,
    )
    # The status message is the second element of the predict result, if any.
    if isinstance(result, (list, tuple)) and len(result) > 1:
        message = result[1]
    else:
        message = ""
    print(message or "doc_redact completed.")
    for path in saved:
        print(path)
    return 0
84
+
85
+
86
+ if __name__ == "__main__":
87
+ raise SystemExit(main())
agent-redact/pi/session_logs.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resolve Pi agent session JSONL logs for Gradio download and usage-log persistence."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ from pi_agent_config import ensure_session_dir
9
+ from pi_rpc_client import PiRpcClient, PiRpcError
10
+
11
+ from tools.aws_functions import upload_log_file_to_s3
12
+ from tools.config import (
13
+ RUN_AWS_FUNCTIONS,
14
+ S3_USAGE_LOGS_FOLDER,
15
+ SAVE_LOGS_TO_CSV,
16
+ USAGE_LOGS_FOLDER,
17
+ )
18
+
19
+
20
def _session_dir_root() -> Path:
    """Return the Pi session directory as given by ``ensure_session_dir()``."""
    session_dir = ensure_session_dir()
    return session_dir
22
+
23
+
24
def pi_session_file_from_client(client: PiRpcClient | None) -> Path | None:
    """Look up the active Pi session JSONL path from the client's RPC state.

    Returns ``None`` when there is no running client, the state cannot be
    read, no session file is reported (blank or ``—``), the file does not
    exist, or it lies outside the session directory root.
    """
    if client is None or not client.running:
        return None
    try:
        state = client.get_state()
    except PiRpcError:
        return None

    reported = state.get("sessionFile")
    if not reported or str(reported).strip() in ("", "—"):
        return None

    session_file = Path(str(reported)).expanduser()
    if not session_file.is_file():
        return None

    resolved = session_file.resolve(strict=False)
    try:
        resolved.relative_to(_session_dir_root())
    except ValueError:
        # Outside the session directory: do not expose it.
        return None
    return resolved
44
+
45
+
46
+ def _usage_log_archive_name(source: Path, session_hash: str = "") -> str:
47
+ if session_hash and str(session_hash).strip():
48
+ return f"{str(session_hash).strip()}_{source.name}"
49
+ return source.name
50
+
51
+
52
def copy_session_log_to_usage_folder(
    source: Path,
    *,
    session_hash: str = "",
) -> Path | None:
    """Archive a Pi session JSONL in ``USAGE_LOGS_FOLDER`` (beside ``usage_log.csv``).

    Does nothing and returns ``None`` when ``SAVE_LOGS_TO_CSV`` is false. The
    folder is created if needed. Returns the resolved archive path, or ``None``
    when the copy fails with ``OSError``.
    """
    if not SAVE_LOGS_TO_CSV:
        return None
    target_dir = Path(USAGE_LOGS_FOLDER)
    target_dir.mkdir(parents=True, exist_ok=True)
    archive_path = target_dir / _usage_log_archive_name(source, session_hash)
    try:
        shutil.copy2(source, archive_path)
    except OSError:
        return None
    else:
        return archive_path.resolve()
68
+
69
+
70
def collect_session_log_download(client: PiRpcClient | None) -> str | None:
    """Session log path as a string for ``gr.File`` download; ``None`` when no log is available."""
    log_path = pi_session_file_from_client(client)
    return None if log_path is None else str(log_path)
76
+
77
+
78
def persist_session_log(
    client: PiRpcClient | None,
    *,
    session_hash: str = "",
    source: Path | None = None,
) -> Path | None:
    """
    Archive the active Pi session JSONL when local usage logging is on.

    With ``SAVE_LOGS_TO_CSV`` true the log is copied into ``USAGE_LOGS_FOLDER``;
    with ``RUN_AWS_FUNCTIONS`` also true that copy is uploaded to
    ``S3_USAGE_LOGS_FOLDER``. Returns the archived path, or ``None`` when
    logging is off, no log is found, or the copy fails.

    A caller may pass *source* (resolved synchronously beforehand) so this can
    run on a background thread without issuing an RPC read.
    """
    if not SAVE_LOGS_TO_CSV:
        return None
    log_path = source if source is not None else pi_session_file_from_client(client)
    if log_path is None:
        return None
    archived = copy_session_log_to_usage_folder(log_path, session_hash=session_hash)
    if archived is not None and RUN_AWS_FUNCTIONS:
        upload_log_file_to_s3(str(archived), S3_USAGE_LOGS_FOLDER)
    return archived
105
+
106
+
107
def export_session_log_to_s3(client: PiRpcClient | None) -> None:
    """Back-compat wrapper around ``persist_session_log`` (local archive plus optional S3); its result is discarded."""
    persist_session_log(client)
110
+
111
+
112
def gradio_session_log_allowed_paths() -> list[str]:
    """List the directories Gradio must be allowed to serve session JSONL files from.

    Always tries the session directory root; adds the resolved
    ``USAGE_LOGS_FOLDER`` when ``SAVE_LOGS_TO_CSV`` is true. A directory whose
    lookup raises ``OSError`` is left out.
    """
    allowed: list[str] = []

    def _try_add(make_path) -> None:
        try:
            allowed.append(str(make_path()))
        except OSError:
            pass

    _try_add(_session_dir_root)
    if SAVE_LOGS_TO_CSV:
        _try_add(lambda: Path(USAGE_LOGS_FOLDER).resolve())
    return allowed
agent-redact/pi/session_workspace.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-session workspace paths for the Pi Gradio UI (mirrors main app session folders)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ import gradio as gr
11
+
12
+ _REPO_ROOT = Path(__file__).resolve().parents[2]
13
+ if str(_REPO_ROOT) not in sys.path:
14
+ sys.path.insert(0, str(_REPO_ROOT))
15
+
16
+ _SESSION_ID_RE = re.compile(r"[^a-zA-Z0-9_@.+-]+")
17
+
18
+
19
def workspace_base_dir() -> Path:
    """Return the shared Pi workspace root.

    A non-blank ``PI_WORKSPACE_DIR`` wins: the directory is created if needed
    and returned resolved. Otherwise the location comes from
    ``bootstrap_pi_config.ensure_pi_workspace_dir`` and is returned as given.
    """
    configured = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
    if not configured:
        # Imported lazily, only when the environment gives no location.
        from bootstrap_pi_config import ensure_pi_workspace_dir

        return Path(ensure_pi_workspace_dir(_REPO_ROOT))

    root = Path(configured)
    root.mkdir(parents=True, exist_ok=True)
    return root.resolve()
30
+
31
+
32
def _session_output_folder_enabled() -> bool:
    """Report whether ``SESSION_OUTPUT_FOLDER`` holds a truthy flag value.

    The environment is read on every call so that ``pi_agent.env`` / dotenv
    values loaded after import are still honoured.
    """
    flag = os.environ.get("SESSION_OUTPUT_FOLDER") or ""
    return flag.strip().lower() in ("1", "true", "yes", "on")
36
+
37
+
38
def session_workspace_enabled() -> bool:
    """
    When true, each Gradio session uses ``{PI_WORKSPACE_DIR}/{session_hash}/``.

    Controlled by ``PI_SESSION_WORKSPACE`` in ``config/pi_agent.env`` (default on when unset).
    Set ``PI_SESSION_WORKSPACE=false`` for a single shared workspace root.

    Only the explicit "off" spellings (``0``, ``false``, ``no``, ``off``,
    case-insensitive, surrounding whitespace ignored) disable the feature;
    an unset, empty, or unrecognised value leaves it enabled.
    """
    raw = os.environ.get("PI_SESSION_WORKSPACE", "").strip().lower()
    # Every non-"off" path previously ended in ``return True`` (including a
    # redundant SESSION_OUTPUT_FOLDER check), so one membership test suffices.
    return raw not in {"0", "false", "no", "off"}
53
+
54
+
55
def workspace_base_dir_resolved() -> Path:
    """Current workspace root (never cached at import).

    Thin alias: every call delegates to ``workspace_base_dir()``.
    """
    return workspace_base_dir()
58
+
59
+
60
def sanitize_session_id(raw: str) -> str:
    """Reduce *raw* to a string that is safe to use as a single folder name.

    Runs of characters matched by ``_SESSION_ID_RE`` are replaced with ``_``,
    the result is capped at 128 characters, and leading/trailing underscores
    are stripped. ``"default"`` is returned when nothing usable remains.
    """
    cleaned = _SESSION_ID_RE.sub("_", (raw or "").strip())[:128].strip("_")
    # "." and ".." pass the character filter but are directory references,
    # not names: joined to the workspace root they address the root itself or
    # its parent. Dot-only ids are therefore treated as unusable.
    if not cleaned.strip("."):
        return "default"
    return cleaned
63
+
64
+
65
def resolve_session_hash(request: gr.Request | None) -> str:
    """
    Work out the session id used for per-user workspace folders.

    ``request.session_hash`` is preferred when it is present and non-blank.
    Otherwise the id comes from ``tools.gradio_platform.resolve_session_identity``.
    ``"default"`` is returned when there is no request or that resolver raises
    ``ValueError``. Every non-default result passes through ``sanitize_session_id``.
    """
    if request is None:
        return "default"

    native = getattr(request, "session_hash", None)
    native_text = "" if native is None else str(native)
    if native_text.strip():
        return sanitize_session_id(native_text)

    # Imported lazily, only when Gradio supplied no usable hash.
    from tools.gradio_platform import resolve_session_identity

    try:
        identity = resolve_session_identity(request)
    except ValueError:
        return "default"
    else:
        return sanitize_session_id(str(identity))
84
+
85
+
86
def effective_session_hash(
    session_hash: str,
    request: gr.Request | None = None,
) -> str:
    """
    Pick the session id to use for the current event.

    A stored id (``session_hash_state``) other than ``"default"`` wins. Failing
    that, the id is resolved from *request*; a resolved ``"default"`` is ignored.
    ``"default"`` is the final fallback.

    Gradio ``demo.load`` may run before ``request.session_hash`` exists, so
    handlers should pass ``request`` and call this on each event.
    """
    stored = (session_hash or "").strip()
    has_real_stored_id = bool(stored) and stored != "default"
    if has_real_stored_id:
        return sanitize_session_id(stored)

    if request is not None:
        from_request = resolve_session_hash(request)
        if from_request and from_request != "default":
            return from_request

    # Only "" or "default" can reach this point.
    return sanitize_session_id(stored) if stored else "default"
106
+
107
+
108
def session_workspace_status_markdown(session_hash: str) -> str:
    """Build the Markdown shown in the workspace panel.

    The workspace folder is created as a side effect (via
    ``ensure_session_workspace``). With per-session workspaces enabled, the text
    shows both the session id and the folder; otherwise only the folder.
    """
    path = ensure_session_workspace(session_hash).as_posix()
    if not session_workspace_enabled():
        return f"**Workspace:** `{path}/`"
    # NOTE(review): a Markdown hard line break needs two trailing spaces before
    # the newline; confirm the whitespace in these literals against the repo.
    return f"**Session id:** `{session_hash}` \n" f"**Your workspace:** `{path}/` \n"
117
+
118
+
119
def prepare_session_workspace(
    session_hash: str,
    request: gr.Request | None = None,
) -> tuple[str, Path, str]:
    """
    Resolve the session id, create its workspace folder, and build status text.

    Returns ``(effective_session_id, workspace_path, status_markdown)``.

    Call at the start of redaction (and on page load) so the folder always exists.
    """
    resolved_id = effective_session_hash(session_hash, request)
    folder = ensure_session_workspace(resolved_id)
    status_md = session_workspace_status_markdown(resolved_id)
    return resolved_id, folder, status_md
131
+
132
+
133
def session_s3_outputs_prefix(session_hash: str) -> str:
    """Return the S3 output prefix for *session_hash*.

    Delegates to ``tools.gradio_platform.build_s3_outputs_prefix``; the prefix
    is session-scoped exactly when per-session workspaces are enabled.
    """
    from tools.gradio_platform import build_s3_outputs_prefix

    scoped = session_workspace_enabled()
    return build_s3_outputs_prefix(session_hash, session_scoped=scoped)
141
+
142
+
143
def session_workspace_dir(session_hash: str) -> Path:
    """Return the workspace folder for *session_hash* without creating it.

    With per-session workspaces disabled this is the shared workspace root.
    Otherwise it is ``<root>/<sanitized id>``. If that path does not resolve
    to a location strictly inside the root, ``<root>/default`` is returned.
    """
    base = workspace_base_dir().resolve()
    if not session_workspace_enabled():
        return base
    candidate = (base / sanitize_session_id(session_hash)).resolve()
    # Require a strict descendant of the root. An id such as ".." escapes the
    # root, and "." collapses onto the root itself, which would hand one
    # session the folder that contains every other session.
    if base not in candidate.parents:
        return (base / "default").resolve()
    return candidate
154
+
155
+
156
def ensure_session_workspace(session_hash: str) -> Path:
    """Create the workspace folder for *session_hash* if needed and return it.

    Missing parent directories are created; an existing folder is not an error.
    """
    target = session_workspace_dir(session_hash)
    target.mkdir(parents=True, exist_ok=True)
    return target
160
+
161
+
162
def init_session_workspace(
    request: gr.Request,
) -> tuple[str, gr.FileExplorer, str, str]:
    """
    App-load handler: set up the session folder and point the file explorer at it.

    Returns ``(session_hash, file_explorer_update, status_markdown, s3_output_prefix)``.
    The session id is always resolved from *request* (no stored id is passed).
    """
    resolved_id, folder, status_md = prepare_session_workspace("", request)
    prefix = session_s3_outputs_prefix(resolved_id)
    explorer = gr.FileExplorer(root_dir=folder.as_posix())
    return resolved_id, explorer, status_md, prefix
179
+
180
+
181
def workspace_context_prefix(session_hash: str) -> str:
    """Prefix Pi prompts so the agent uses the session workspace.

    Returns an empty string when per-session workspaces are disabled or
    *session_hash* is blank. Otherwise returns the instruction paragraph(s)
    joined with newlines and terminated by a blank line.
    """
    if not session_workspace_enabled() or not session_hash.strip():
        return ""
    # Trailing slash removed so the f-strings below can append "/" uniformly.
    root = session_workspace_dir(session_hash).as_posix().rstrip("/")
    lines = [
        f"**Session workspace (mandatory):** all uploads, downloads, and redaction "
        f"artifacts for this user must live under `{root}/`. "
        f"Use `{root}/redact/<document>/output_redact/` for Pass 1 downloads and "
        f"`{root}/redact/<document>/review/output_review_final/` after `/review_apply`. "
        f"Do not write to `{root}/output_final_download/` (UI-managed download copies only). "
        f"Do not read or write other session folders under `{workspace_base_dir().as_posix()}/`.",
    ]
    try:
        # Optional modules: when any import fails the split-backend paragraph
        # is silently omitted (see ``except ImportError`` below).
        from pi_agent_config import uses_split_redaction_backend
        from redaction_prompt import doc_redaction_gradio_url

        if uses_split_redaction_backend():
            from pi_workspace_skills import remote_redaction_helper_module

            helpers = remote_redaction_helper_module()
            lines.append(
                f"**Redaction outputs (split backend):** doc_redaction at "
                f"`{doc_redaction_gradio_url()}` writes to its own container — download "
                f"artifacts into `{root}/redact/<document>/output_redact/` via "
                f"`{helpers}` (`fetch_redaction_files`; helper is under workspace base "
                f"`{workspace_base_dir().as_posix()}/.pi/helpers/`, not under `{root}/.pi/`). "
                f"Do not `find` or `ls` `/workspace/doc_redaction/output` from this agent."
            )
    except ImportError:
        pass
    return "\n".join(lines) + "\n\n"
agent-redact/pi/start.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Start Gradio Pi chat UI in the background; keep container alive for `docker compose exec pi-agent pi`.
# Exit on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail

# Each default below applies only when the variable is unset or empty.
export HOME="${HOME:-/home/user}"
export PI_WORKDIR="${PI_WORKDIR:-/workspace/doc_redaction}"
# Put both the repo root and agent-redact/pi on the Python import path.
export PYTHONPATH="${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi:${PYTHONPATH:-}"

cd "$PI_WORKDIR"

export APP_TYPE="${APP_TYPE:-pi}"
export APP_CONFIG_PATH="${APP_CONFIG_PATH:-$PI_WORKDIR/config/pi_agent.env}"

# Make sure the workspace directory exists before running the config script.
mkdir -p "${PI_WORKSPACE_DIR:-/home/user/app/workspace}"
python3 agent-redact/pi/pi_agent_config.py

if [ "${RUN_FASTAPI:-False}" = "True" ]; then
  # `exec` replaces this shell with uvicorn, so in this mode the server runs in
  # the foreground and the `wait -n` at the end of the script is never reached.
  exec uvicorn gradio_app:app \
    --app-dir agent-redact/pi \
    --host "${GRADIO_SERVER_NAME:-0.0.0.0}" \
    --port "${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7862}}"
else
  python3 agent-redact/pi/gradio_app.py &
fi

# Block until the background Gradio process exits.
wait -n
agent-redact/requirements_pi_agent.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python stack for the pi-agent Docker image (orchestration + Pi Gradio UI).
2
+ #
3
+ # Excludes spaCy, Presidio, and OCR stacks — heavy redaction runs in redaction-app-llama.
4
+ # Includes full Gradio for agent-redact/pi/gradio_app.py (chat frontend over Pi RPC mode).
5
+ #
6
+ # Version caps align with requirements_lightweight.txt where packages overlap.
7
+
8
+ # --- Gradio UI + API client ---
9
+ gradio==6.10.0
10
+ gradio-pdf-redaction<=0.0.25
11
+ httpx<=0.28.1
12
+ requests<=2.34.2
13
+
14
+ # --- Config ---
15
+ python-dotenv<=1.2.2
16
+
17
+ # --- CSV / tabular review (skills, page-review merge) ---
18
+ numpy<=2.4.4
19
+ pandas<=2.3.3
20
+ openpyxl<=3.1.5
21
+
22
+ # --- PDF helpers (verify_redaction_coverage, preview scripts) ---
23
+ pymupdf<=1.27.1
24
+
25
+ # --- General utilities ---
26
+ tabulate<=0.10.0
27
+ rapidfuzz<=3.14.5
28
+ defusedxml<=0.7.1
29
+
30
+ # --- Shared platform features (logging, Cognito, S3 via tools/) ---
31
+ boto3<=1.42.61
32
+ bleach<=6.3.0
33
+ fastapi>=0.115.0
34
+ uvicorn>=0.34.0
agent_routes.py ADDED
@@ -0,0 +1,1167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI routes for programmatic / agent callers.
3
+
4
+ HTTP paths align with Gradio ``api_name`` values in app.py. See GET /agent/operations
5
+ for the full map. Uses cli_redact.main(direct_mode_args=...) where a CLI task exists.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from fastapi import APIRouter, Depends, Header, HTTPException
17
+ from fastapi.responses import JSONResponse
18
+ from pydantic import BaseModel, Field, field_validator
19
+
20
+ from tools.config import (
21
+ AWS_LLM_PII_OPTION,
22
+ AWS_PII_OPTION,
23
+ INFERENCE_SERVER_PII_OPTION,
24
+ INPUT_FOLDER,
25
+ LOCAL_OCR_MODEL_OPTIONS,
26
+ LOCAL_PII_OPTION,
27
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
28
+ OUTPUT_FOLDER,
29
+ )
30
+ from tools.secure_path_utils import validate_path_safety
31
+
32
+ router = APIRouter(tags=["Agent"])
33
+
34
+ REPO_ROOT = Path(__file__).resolve().parent
35
+ _MAX_INSTRUCTION_LEN = 16_000
36
+
37
+ # NOTE: Paths from request bodies are untrusted. Avoid Path.resolve() on untrusted
38
+ # input (CodeQL py/path-injection); instead normalize via os.path and enforce
39
+ # containment under trusted roots.
40
+
41
+ # Mirrors app.py api_name values (Gradio).
42
+ GRADIO_API_NAMES: tuple[str, ...] = (
43
+ "redact_document",
44
+ "load_and_prepare_documents_or_data",
45
+ "apply_review_redactions",
46
+ "review_apply",
47
+ "pdf_summarise",
48
+ "tabular_redact",
49
+ "word_level_ocr_text_search",
50
+ "redact_data",
51
+ "find_duplicate_pages",
52
+ "find_duplicate_tabular",
53
+ "summarise_document",
54
+ "combine_review_csvs",
55
+ "combine_review_pdfs",
56
+ "export_review_redaction_overlay",
57
+ "export_review_page_ocr_visualisation",
58
+ "verify_redaction_coverage",
59
+ )
60
+
61
+
62
def _allowed_path_roots() -> list[Path]:
    """Return the trusted roots that request paths must live under.

    ``REPO_ROOT`` always comes first, followed by ``INPUT_FOLDER`` and
    ``OUTPUT_FOLDER`` when they are set.
    """
    # Roots are returned without resolving. These are trusted config values, but
    # avoiding Path.resolve() keeps CodeQL happy and matches the "no resolve on
    # untrusted" approach used elsewhere.
    configured = [
        Path(str(folder)) for folder in (INPUT_FOLDER, OUTPUT_FOLDER) if folder
    ]
    return [REPO_ROOT, *configured]
71
+
72
+
73
def _sanitize_untrusted_path_input(path_str: str) -> str:
    """Validate a raw path string before any normalization is attempted.

    Returns the input with surrounding whitespace removed.

    Raises:
        HTTPException: 400 when the value is not a string, is blank, or
            contains a NUL byte.
    """
    if not isinstance(path_str, str):
        raise HTTPException(status_code=400, detail="Path must be a string.")

    cleaned = path_str.strip()
    if cleaned == "":
        raise HTTPException(status_code=400, detail="Path must not be empty.")
    if cleaned.find("\x00") != -1:
        raise HTTPException(status_code=400, detail="Path contains invalid null byte.")
    return cleaned
83
+
84
+
85
def _normalize_untrusted_path_to_abs(path_str: str) -> str:
    """
    Turn an untrusted path string into a normalized absolute path.

    The input is validated, ``~`` is expanded, and a relative path is anchored
    at REPO_ROOT (matching prior behaviour). Symlinks are not resolved here.
    """
    expanded = os.path.expanduser(_sanitize_untrusted_path_input(path_str))
    if os.path.isabs(expanded):
        anchored = expanded
    else:
        anchored = os.path.join(str(REPO_ROOT), expanded)
    return os.path.normpath(os.path.abspath(anchored))
96
+
97
+
98
def _must_be_under_allowed_roots(candidate_abs: str, original: str) -> None:
    """Require the candidate to sit under the repo, INPUT_FOLDER, or OUTPUT_FOLDER.

    Both the candidate and each root are compared by their ``realpath``.
    *original* is accepted for the caller's convenience but is not used.

    Raises:
        HTTPException: 403 when no allowed root contains the candidate.
    """
    candidate_real = os.path.realpath(str(candidate_abs))
    for trusted in _allowed_path_roots():
        root = os.path.realpath(os.path.abspath(str(trusted)))
        try:
            contained = os.path.commonpath([candidate_real, root]) == root
        except ValueError:
            # Different drive on Windows or invalid path mix
            continue
        if contained:
            return
    raise HTTPException(
        status_code=403,
        detail="Path must be under the app repo, INPUT_FOLDER, or OUTPUT_FOLDER",
    )
116
+
117
+
118
def _path_must_be_allowed_file(path_str: str) -> str:
    """Validate an untrusted file path and return its real (symlink-free) form.

    The path must be under an allowed root, pass ``validate_path_safety`` for at
    least one root, and exist as a regular file.

    Raises:
        HTTPException: 403 when outside every allowed root; 400 when the path
            is rejected as unsafe, is missing, or is not a file.
    """
    candidate_real = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    # Validate both containment under trusted roots and "safe path" patterns.
    _must_be_under_allowed_roots(candidate_real, path_str)
    passes_safety = False
    for root in _allowed_path_roots():
        if validate_path_safety(candidate_real, base_path=str(root)):
            passes_safety = True
            break
    if not passes_safety:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    try:
        is_regular_file = Path(candidate_real).is_file()
    except OSError:
        # A filesystem error while probing is reported like a missing file.
        is_regular_file = False
    if not is_regular_file:
        raise HTTPException(
            status_code=400, detail=f"Not a file or missing: {candidate_real}"
        )
    return candidate_real
142
+
143
+
144
def _path_must_be_allowed_directory(path_str: str, *, must_exist: bool = True) -> str:
    """
    Validate an untrusted directory path and return its real (symlink-free) form.

    The path must be under an allowed root and pass ``validate_path_safety`` for
    at least one root. By default it must also already exist as a directory;
    pass ``must_exist=False`` for a directory that will be created later (e.g.
    an output_dir the CLI creates).

    Raises:
        HTTPException: 403 when outside every allowed root; 400 when the path
            is rejected as unsafe or (with ``must_exist``) is not a directory.
    """
    candidate_real = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    _must_be_under_allowed_roots(candidate_real, path_str)
    passes_safety = False
    for root in _allowed_path_roots():
        if validate_path_safety(candidate_real, base_path=str(root)):
            passes_safety = True
            break
    if not passes_safety:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    if not must_exist:
        return candidate_real

    try:
        exists_as_dir = Path(candidate_real).is_dir()
    except OSError:
        # A filesystem error while probing is reported like a missing directory.
        exists_as_dir = False
    if not exists_as_dir:
        raise HTTPException(
            status_code=400, detail=f"Not a directory: {candidate_real}"
        )
    return candidate_real
172
+
173
+
174
def _optional_agent_api_key(x_agent_api_key: Optional[str] = Header(None)) -> None:
    """FastAPI dependency enforcing the optional ``X-Agent-API-Key`` header.

    When the ``AGENT_API_KEY`` environment variable is unset or blank, every
    request is allowed. Otherwise the header value (whitespace-trimmed) must
    equal the configured key.

    Raises:
        HTTPException: 401 when a key is configured and the header is missing
            or does not match.
    """
    import hmac

    expected = os.environ.get("AGENT_API_KEY", "").strip()
    if not expected:
        return
    supplied = (x_agent_api_key or "").strip()
    # Compare in constant time so response timing does not reveal how much of
    # the key matched. Bytes are compared because ``compare_digest`` rejects
    # non-ASCII ``str`` arguments.
    if not supplied or not hmac.compare_digest(
        supplied.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(
            status_code=401,
            detail="Set header X-Agent-API-Key to match AGENT_API_KEY environment variable",
        )
183
+
184
+
185
class AgentRedactDocumentRequest(BaseModel):
    """Parity with Gradio api_name ``redact_document``."""

    # Required; at least one path. Each path is validated by
    # ``_merge_redact_direct_mode`` before it reaches the CLI.
    input_files: list[str] = Field(
        ...,
        min_length=1,
        description="Paths to input files (PDF, images, or tabular/Word for anonymisation)",
    )
    # Length-capped by the ``_cap_instruction`` validator below.
    instruction: Optional[str] = Field(
        None,
        description="Optional instructions for LLM-based PII detection (custom_llm_instructions)",
    )
    # Optional folder overrides; when omitted the CLI defaults are kept.
    output_dir: Optional[str] = None
    input_dir: Optional[str] = None
    ocr_method: Optional[str] = Field(
        None,
        description=(
            "High-level OCR/text mode. Accepted values: 'Local OCR', "
            "'AWS Textract', 'Local text'. To choose a specific local OCR engine "
            "(e.g. paddle/tesseract/vlm), set "
            "overrides.chosen_local_ocr_model."
        ),
    )
    pii_detector: Optional[str] = Field(
        None,
        description=(
            "PII detection method. Recommended configured labels: "
            f"'{LOCAL_PII_OPTION}', '{AWS_PII_OPTION}', '{AWS_LLM_PII_OPTION}', "
            f"'{INFERENCE_SERVER_PII_OPTION}', '{LOCAL_TRANSFORMERS_LLM_PII_OPTION}', "
            "'None'."
        ),
    )
    # Free-form map of CLI argument name -> value; unknown keys are rejected by
    # ``_merge_redact_direct_mode``.
    overrides: Optional[dict[str, Any]] = Field(
        None,
        description=(
            "Optional CLI flag overrides; keys must match argparse destination names. "
            "For local OCR model selection, set 'chosen_local_ocr_model' "
            f"(allowed models depend on deployment; configured options: {LOCAL_OCR_MODEL_OPTIONS})."
        ),
    )

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "input_files": [
                        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
                    ],
                    "instruction": "Do not redact the university name.",
                    "ocr_method": "Local OCR",
                    "pii_detector": LOCAL_PII_OPTION,
                    "overrides": {"chosen_local_ocr_model": "paddle"},
                }
            ]
        }
    }

    @field_validator("instruction")
    @classmethod
    def _cap_instruction(cls, v: Optional[str]) -> Optional[str]:
        # Reject instructions longer than _MAX_INSTRUCTION_LEN; None passes through.
        if v is None:
            return v
        if len(v) > _MAX_INSTRUCTION_LEN:
            raise ValueError(f"instruction exceeds {_MAX_INSTRUCTION_LEN} characters")
        return v
250
+
251
+
252
# Adds no fields of its own: inherits every field and validator from
# AgentRedactDocumentRequest.
class AgentRedactDataRequest(AgentRedactDocumentRequest):
    """Parity with Gradio api_name ``redact_data``; same CLI task as redact_document."""
254
+
255
+
256
# Response body shared by the CLI-backed endpoints; built by ``_run_cli_main``.
class AgentTaskResponse(BaseModel):
    status: str
    # Gradio ``api_name`` the endpoint mirrors.
    gradio_api_name: str
    # CLI task name taken from the merged direct-mode arguments.
    task: str
    output_dir: str
    input_dir: str
    message: str
    # Captured CLI stdout (``_run_cli_main`` keeps at most the last 8000 chars).
    log_excerpt: Optional[str] = None
    output_paths: Optional[list[str]] = None
265
+
266
+
267
# Request body for the redaction-coverage check; the handler that consumes it
# is outside this view.
class AgentVerifyRedactionRequest(BaseModel):
    # Required inputs.
    review_csv_path: str = Field(..., description="Path to *_review_file.csv")
    ocr_words_csv_path: str = Field(
        ..., description="Path to *_ocr_results_with_words_*.csv from the same run"
    )
    # Optional regex lists.
    must_redact: Optional[List[str]] = Field(
        None,
        description="Regex patterns for terms that must be covered by review boxes.",
    )
    must_not_redact: Optional[List[str]] = Field(
        None,
        description="Regex patterns for terms that must not appear in review rows.",
    )
    redacted_pdf_path: Optional[str] = Field(
        None, description="Optional applied *_redacted.pdf for text-layer leak checks."
    )
    # Must be >= 1 when given.
    total_pages: Optional[int] = Field(None, ge=1)
    # Default 3, constrained to 1..32.
    min_word_length: int = Field(3, ge=1, le=32)
    sample_pixels: bool = Field(
        False,
        description="Sample pixel darkness at box centres on redacted PDF (requires redacted_pdf_path).",
    )
    auto_prune_suspicious: bool = Field(
        False,
        description="Remove prunable suspicious short/OCR-fragment rows and write pruned CSV.",
    )
    pruned_output_path: Optional[str] = Field(
        None,
        description="Output path for pruned CSV when auto_prune_suspicious is true.",
    )
297
+
298
+
299
# Response body for the redaction-coverage check.
class AgentVerifyRedactionResponse(BaseModel):
    status: str
    # Fixed to the Gradio api_name this endpoint mirrors.
    gradio_api_name: str = "verify_redaction_coverage"
    coverage_pass: bool
    coverage_pass_strict: bool
    coverage_pass_with_cleanup: bool
    # Optional; default to None when not supplied.
    pruned_csv_path: Optional[str] = None
    prune_log: Optional[Dict[str, Any]] = None
    # Free-form report mapping (schema not fixed by this model).
    report: Dict[str, Any]
308
+
309
+
310
# Request body for the word-level OCR text search.
class AgentWordLevelOcrSearchRequest(BaseModel):
    ocr_words_csv_path: str = Field(
        ..., description="Path to *_ocr_results_with_words_*.csv"
    )
    # Between 3 and 500 characters.
    search_text: str = Field(..., min_length=3, max_length=500)
    # Constrained to 0.0..1.0; defaults to 1.0.
    similarity_threshold: float = Field(1.0, ge=0.0, le=1.0)
    use_regex: bool = False
    review_csv_path: Optional[str] = Field(
        None,
        description="Optional *_review_file.csv to flag whether each hit is covered by a box.",
    )
321
+
322
+
323
# Response body for the word-level OCR text search.
class AgentWordLevelOcrSearchResponse(BaseModel):
    status: str
    # Fixed to the Gradio api_name this endpoint mirrors.
    gradio_api_name: str = "word_level_ocr_text_search"
    # Free-form result mapping (schema not fixed by this model).
    result: Dict[str, Any]
327
+
328
+
329
def _merge_redact_direct_mode(body: AgentRedactDocumentRequest) -> dict[str, Any]:
    """Build the ``direct_mode_args`` dict for a ``redact`` CLI run.

    Starts from the CLI defaults, sets ``task`` and the validated input files,
    applies the optional request fields, then applies ``body.overrides``.

    Raises:
        HTTPException: 400 for an unknown override key, and whatever the path
            validators raise for a path outside the allowed roots.
    """
    from cli_redact import get_cli_default_args_dict

    merged: dict[str, Any] = get_cli_default_args_dict()
    merged["task"] = "redact"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    if body.instruction is not None:
        merged["custom_llm_instructions"] = body.instruction
    if body.output_dir is not None:
        # Output folders may not exist yet (CLI will create). Still constrain to allowed roots.
        merged["output_dir"] = _path_must_be_allowed_directory(
            body.output_dir, must_exist=False
        )
    if body.input_dir is not None:
        # Input dir should exist if provided.
        merged["input_dir"] = _path_must_be_allowed_directory(
            body.input_dir, must_exist=True
        )
    if body.ocr_method is not None:
        merged["ocr_method"] = body.ocr_method
    if body.pii_detector is not None:
        merged["pii_detector"] = body.pii_detector

    if body.overrides:
        allowed = set(merged.keys())
        for key, value in body.overrides.items():
            if key not in allowed:
                raise HTTPException(
                    status_code=400,
                    detail=f"Unknown override key '{key}'. Must be a known CLI argument name.",
                )
            # Path-bearing overrides get the same containment checks as the
            # dedicated request fields; otherwise an override could replace a
            # validated path with an arbitrary one.
            if key == "input_file":
                candidates = value if isinstance(value, (list, tuple)) else [value]
                value = [_path_must_be_allowed_file(p) for p in candidates]
            elif key == "output_dir" and value is not None:
                value = _path_must_be_allowed_directory(value, must_exist=False)
            elif key == "input_dir" and value is not None:
                value = _path_must_be_allowed_directory(value, must_exist=True)
            merged[key] = value

    return merged
364
+
365
+
366
def _run_cli_main(direct: dict[str, Any], gradio_api_name: str) -> AgentTaskResponse:
    """Run ``cli_redact.main`` in direct mode and wrap the outcome in a response.

    Standard output produced during the run is captured; at most the last 8000
    characters are returned as ``log_excerpt`` (``None`` when nothing was printed).

    Raises:
        HTTPException: 500 carrying ``str(exc)`` when the CLI raises ``Exception``.
    """
    import contextlib

    from cli_redact import main as cli_main

    buf = io.StringIO()
    try:
        # redirect_stdout restores the previous sys.stdout even when the CLI
        # raises. NOTE(review): sys.stdout is process-global, so output from
        # concurrent requests may end up in the same buffer.
        with contextlib.redirect_stdout(buf):
            cli_main(direct_mode_args=direct)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Keep only the tail; a shorter log is returned whole by the slice.
    log_excerpt = buf.getvalue()[-8000:]

    return AgentTaskResponse(
        status="completed",
        gradio_api_name=gradio_api_name,
        task=str(direct.get("task", "")),
        output_dir=str(direct.get("output_dir", "")),
        input_dir=str(direct.get("input_dir", "")),
        message="cli_redact.main finished; see log_excerpt for console output",
        log_excerpt=log_excerpt or None,
    )
392
+
393
+
394
@router.post(
    "/redact_document",
    response_model=AgentTaskResponse,
    summary="redact_document (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_document'``. "
        "``python cli_redact.py --task redact --input_file ...``. "
        "Optional ``instruction`` maps to ``custom_llm_instructions``. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_document(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Merge the request into CLI arguments, then run the ``redact`` task.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
414
+
415
+
416
@router.post(
    "/redact_data",
    response_model=AgentTaskResponse,
    summary="redact_data (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_data'``. Same CLI ``redact`` task as "
        "/redact_document; use CSV/XLSX/DOCX paths for tabular/Word flows. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_data(
    body: AgentRedactDataRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Same merge + CLI run as /redact_document; only the reported name differs.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_data")
435
+
436
+
437
@router.post(
    "/tasks/redact",
    response_model=AgentTaskResponse,
    summary="Legacy: same as /redact_document",
    description="Deprecated alias; prefer POST /agent/redact_document.",
    deprecated=True,
    include_in_schema=True,
)
def post_tasks_redact_legacy(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Legacy route: behaves exactly like /redact_document and reports that name.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
451
+
452
+
453
# Request body for POST /find_duplicate_pages. Optional fields left as None
# keep the CLI defaults.
class AgentFindDuplicatePagesRequest(BaseModel):
    # Required; at least one path.
    input_files: list[str] = Field(..., min_length=1)
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    min_consecutive_pages: Optional[int] = None
    # The handler passes these two booleans to the CLI as the strings
    # "True" / "False".
    greedy_match: Optional[bool] = None
    combine_pages: Optional[bool] = None
    # CLI argument name -> value; unknown keys are rejected by the handler.
    overrides: Optional[dict[str, Any]] = None
461
+
462
+
463
@router.post(
    "/find_duplicate_pages",
    response_model=AgentTaskResponse,
    summary="find_duplicate_pages (Gradio api_name)",
    description="``cli_redact --task deduplicate --duplicate_type pages``.",
)
def post_find_duplicate_pages(
    body: AgentFindDuplicatePagesRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Builds direct-mode CLI arguments for a page-level deduplicate run from
    # the CLI defaults, the request fields, and finally ``body.overrides``.
    from cli_redact import get_cli_default_args_dict

    merged = get_cli_default_args_dict()
    merged["task"] = "deduplicate"
    merged["duplicate_type"] = "pages"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
    if body.similarity_threshold is not None:
        merged["similarity_threshold"] = body.similarity_threshold
    if body.min_word_count is not None:
        merged["min_word_count"] = body.min_word_count
    if body.min_consecutive_pages is not None:
        merged["min_consecutive_pages"] = body.min_consecutive_pages
    # The CLI takes these flags as the strings "True" / "False".
    if body.greedy_match is not None:
        merged["greedy_match"] = "True" if body.greedy_match else "False"
    if body.combine_pages is not None:
        merged["combine_pages"] = "True" if body.combine_pages else "False"
    if body.overrides:
        allowed = set(merged.keys())
        for k, v in body.overrides.items():
            if k not in allowed:
                raise HTTPException(400, f"Unknown override key: {k}")
            # Path-bearing overrides must pass the same containment checks as
            # ``input_files``; otherwise they would bypass path validation.
            if k == "input_file":
                candidates = v if isinstance(v, (list, tuple)) else [v]
                v = [_path_must_be_allowed_file(p) for p in candidates]
            elif k == "output_dir" and v is not None:
                v = _path_must_be_allowed_directory(v, must_exist=False)
            elif k == "input_dir" and v is not None:
                v = _path_must_be_allowed_directory(v, must_exist=True)
            merged[k] = v
    return _run_cli_main(merged, "find_duplicate_pages")
496
+
497
+
498
# Request body for POST /find_duplicate_tabular. Optional fields left as None
# keep the CLI defaults.
class AgentFindDuplicateTabularRequest(BaseModel):
    # Required; at least one path.
    input_files: list[str] = Field(..., min_length=1)
    text_columns: Optional[list[str]] = None
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    # CLI argument name -> value; unknown keys are rejected by the handler.
    overrides: Optional[dict[str, Any]] = None
504
+
505
+
506
@router.post(
    "/find_duplicate_tabular",
    response_model=AgentTaskResponse,
    summary="find_duplicate_tabular (Gradio api_name)",
)
def post_find_duplicate_tabular(
    body: AgentFindDuplicateTabularRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Builds direct-mode CLI arguments for a tabular deduplicate run from the
    # CLI defaults, the request fields, and finally ``body.overrides``.
    # (A comment rather than a docstring: no ``description`` is passed to the
    # route decorator, and FastAPI would publish a docstring in its place.)
    from cli_redact import get_cli_default_args_dict

    merged = get_cli_default_args_dict()
    merged["task"] = "deduplicate"
    merged["duplicate_type"] = "tabular"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]
    if body.text_columns is not None:
        merged["text_columns"] = body.text_columns
    if body.similarity_threshold is not None:
        merged["similarity_threshold"] = body.similarity_threshold
    if body.min_word_count is not None:
        merged["min_word_count"] = body.min_word_count
    if body.overrides:
        allowed = set(merged.keys())
        for k, v in body.overrides.items():
            if k not in allowed:
                raise HTTPException(400, f"Unknown override key: {k}")
            # Path-bearing overrides must pass the same containment checks as
            # ``input_files``; otherwise they would bypass path validation.
            if k == "input_file":
                candidates = v if isinstance(v, (list, tuple)) else [v]
                v = [_path_must_be_allowed_file(p) for p in candidates]
            elif k == "output_dir" and v is not None:
                v = _path_must_be_allowed_directory(v, must_exist=False)
            elif k == "input_dir" and v is not None:
                v = _path_must_be_allowed_directory(v, must_exist=True)
            merged[k] = v
    return _run_cli_main(merged, "find_duplicate_tabular")
534
+
535
+
536
class AgentSummariseDocumentRequest(BaseModel):
    # JSON body for POST /summarise_document.
    # ``input_files`` needs at least one entry; the handler forwards each
    # ``summarisation_*`` field to the CLI arguments only when it is not None.
    input_files: list[str] = Field(default=..., min_length=1)
    summarisation_inference_method: Optional[str] = Field(default=None)
    summarisation_format: Optional[str] = Field(default=None)
    summarisation_context: Optional[str] = Field(default=None)
    summarisation_additional_instructions: Optional[str] = Field(default=None)
    # Free-form CLI argument overrides; the handler rejects unknown keys.
    overrides: Optional[dict[str, Any]] = Field(default=None)
543
+
544
+
545
@router.post(
    "/summarise_document",
    response_model=AgentTaskResponse,
    summary="summarise_document (Gradio api_name)",
)
def post_summarise_document(
    body: AgentSummariseDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "summarise" task: CLI defaults first, then request fields,
    # then raw overrides restricted to keys the CLI already knows.
    # (Kept as comments, not a docstring, so the OpenAPI description is unchanged.)
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "summarise"
    # Every input path must pass the allowed-file check before it is used.
    cli_args["input_file"] = list(map(_path_must_be_allowed_file, body.input_files))

    # Optional request fields replace the CLI default only when supplied.
    optional_fields = (
        "summarisation_inference_method",
        "summarisation_format",
        "summarisation_context",
        "summarisation_additional_instructions",
    )
    for field_name in optional_fields:
        supplied = getattr(body, field_name)
        if supplied is not None:
            cli_args[field_name] = supplied

    if body.overrides:
        # NOTE(review): an override can replace keys set above (including the
        # validated ``input_file``) — confirm _run_cli_main re-validates paths.
        known_keys = set(cli_args)
        for key, value in body.overrides.items():
            if key not in known_keys:
                raise HTTPException(400, f"Unknown override key: {key}")
            cli_args[key] = value

    return _run_cli_main(cli_args, "summarise_document")
576
+
577
+
578
class AgentCombineReviewPdfsRequest(BaseModel):
    # JSON body for POST /combine_review_pdfs.
    # At least two input files are required.
    input_files: list[str] = Field(default=..., min_length=2)
    # When omitted, the handler leaves the CLI default output_dir untouched.
    output_dir: Optional[str] = Field(default=None)
581
+
582
+
583
@router.post(
    "/combine_review_pdfs",
    response_model=AgentTaskResponse,
    summary="combine_review_pdfs (Gradio api_name)",
)
def post_combine_review_pdfs(
    body: AgentCombineReviewPdfsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "combine_review_pdfs" task on validated input paths.
    # (Kept as comments, not a docstring, so the OpenAPI description is unchanged.)
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "combine_review_pdfs"
    cli_args["input_file"] = list(map(_path_must_be_allowed_file, body.input_files))

    requested_dir = body.output_dir
    if requested_dir is not None:
        # Only an explicitly requested output directory is validated and applied.
        cli_args["output_dir"] = _path_must_be_allowed_directory(requested_dir)

    return _run_cli_main(cli_args, "combine_review_pdfs")
600
+
601
+
602
+ class _NamedPath:
603
+ """merge_csv_files expects objects with a .name attribute (Gradio file-like)."""
604
+
605
+ __slots__ = ("name",)
606
+
607
+ def __init__(self, path: str) -> None:
608
+ self.name = path
609
+
610
+
611
class AgentCombineReviewCsvsRequest(BaseModel):
    # JSON body for POST /combine_review_csvs; at least one CSV is required.
    input_files: list[str] = Field(default=..., min_length=1)
    output_dir: Optional[str] = Field(
        default=None,
        description="Defaults to config OUTPUT_FOLDER",
    )
616
+
617
+
618
class AgentApplyReviewRedactionsRequest(BaseModel):
    """Headless parity with Gradio ``api_name='apply_review_redactions'`` (prepare + apply)."""

    # Required inputs: the source PDF and the review-plan CSV.
    pdf_path: str = Field(
        default=...,
        description="Path to the source PDF under allowed roots.",
    )
    review_csv_path: str = Field(
        default=...,
        description=(
            "Path to the review plan CSV; basename must contain '_review_file' "
            "(e.g. mydoc_review_file.csv)."
        ),
    )

    # Optional directories; None is passed through to the handler unchanged.
    output_dir: Optional[str] = Field(
        default=None,
        description="Output directory (created if missing); defaults to OUTPUT_FOLDER.",
    )
    input_dir: Optional[str] = Field(
        default=None,
        description="Input/working directory for page images; defaults to INPUT_FOLDER.",
    )

    # Optional tuning of the prepare step.
    text_extract_method: Optional[str] = Field(
        default=None,
        description="OCR/text mode passed to prepare (defaults to CLI ocr_method).",
    )
    efficient_ocr: Optional[bool] = Field(
        default=None,
        description="If set, overrides EFFICIENT_OCR for the prepare step.",
    )
648
+
649
+
650
@router.post(
    "/combine_review_csvs",
    response_model=AgentTaskResponse,
    summary="combine_review_csvs (Gradio api_name)",
    description="Uses tools.helper_functions.merge_csv_files (not cli_redact).",
)
def post_combine_review_csvs(
    body: AgentCombineReviewCsvsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Merges review CSVs by calling merge_csv_files directly.
    from tools.helper_functions import merge_csv_files

    # merge_csv_files reads each path from a ``.name`` attribute, hence _NamedPath.
    csv_inputs = []
    for raw_path in body.input_files:
        csv_inputs.append(_NamedPath(_path_must_be_allowed_file(raw_path)))

    # Fall back to the configured OUTPUT_FOLDER; the directory must already exist.
    target_dir = body.output_dir or OUTPUT_FOLDER
    resolved_dir = _path_must_be_allowed_directory(str(target_dir), must_exist=True)

    # The folder is handed over with a trailing separator.
    if resolved_dir.endswith(("/", "\\")):
        folder_arg = resolved_dir
    else:
        folder_arg = resolved_dir + "/"

    merged_paths = merge_csv_files(csv_inputs, output_folder=folder_arg)
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="combine_review_csvs",
        task="combine_review_csvs",
        output_dir=resolved_dir,
        input_dir="",
        message="merge_csv_files completed",
        output_paths=merged_paths,
    )
676
+
677
+
678
class AgentExportReviewRedactionOverlayRequest(BaseModel):
    """Agent JSON body for the same overlay render as Gradio ``api_name='page_redaction_review_image'``."""

    # Required: the page raster and at least one box to draw on it.
    page_image_path: str = Field(
        default=...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    boxes: List[Dict[str, Any]] = Field(
        default=...,
        min_length=1,
        description="Annotator-style boxes: label, color, xmin, ymin, xmax, ymax (normalized 0–1).",
    )

    # Naming inputs for the output file.
    page_number: int = Field(
        default=1,
        ge=1,
        description="1-based page index for the output filename.",
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )

    # Optional rendering hints.
    review_df_records: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Optional rows (include at least 'label') for stable label→line-pattern mapping.",
    )
    label_abbrev_chars: Optional[int] = Field(
        default=None,
        ge=0,
        le=24,
        description="Draw this many leading characters of each label on the image; omit to use REVIEW_OVERLAY_LABEL_ABBREV_CHARS from config (0 = off).",
    )
707
+
708
+
709
class AgentExportReviewPageOcrVisualisationRequest(BaseModel):
    """Agent JSON body for the same OCR visualisation as Gradio ``api_name='page_ocr_review_image'``."""

    # Required: the page raster and the OCR results to draw on it.
    page_image_path: str = Field(
        default=...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    ocr_results: Dict[str, Any] = Field(
        default=...,
        description="Word-level OCR results dict (line_key -> {words:[{text, bounding_box, conf, ...}]}).",
    )

    # Naming inputs for the output file.
    page_number: int = Field(
        default=1,
        ge=1,
        description="1-based page index (used for naming).",
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )
727
+
728
+
729
def _agent_prepare_output_folder() -> str:
    """Resolve, validate and create the configured OUTPUT_FOLDER.

    Returns:
        The real absolute path of OUTPUT_FOLDER.

    Raises:
        HTTPException: 400 when the resolved path fails ``validate_path_safety``;
            500 when the directory cannot be created. ``_must_be_under_allowed_roots``
            may raise on its own when the folder is outside the allowed roots.
    """
    out_folder_abs = os.path.realpath(
        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
    )
    if not validate_path_safety(out_folder_abs):
        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
    _must_be_under_allowed_roots(out_folder_abs, str(out_folder_abs))
    try:
        Path(out_folder_abs).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Chain the OS error so the real cause shows up in server logs.
        raise HTTPException(
            status_code=500, detail="Could not create OUTPUT_FOLDER"
        ) from e
    return out_folder_abs


def _agent_doc_base_name(raw: Optional[str]) -> str:
    """Reduce a caller-supplied document name to a bare filename component.

    Directory components (with either separator) are dropped so the value can
    never steer the output file outside the intended folder; an empty result
    falls back to ``"review"``, the request models' default.
    """
    flattened = str(raw or "").replace("\\", "/").rstrip("/")
    return os.path.basename(flattened) or "review"


@router.post(
    "/export_review_redaction_overlay",
    response_model=AgentTaskResponse,
    summary="export_review_redaction_overlay (Agent API; Gradio api_name: page_redaction_review_image)",
    description=(
        "Renders hollow redaction outlines and a top-right legend on the page image; "
        "writes ``redaction_overlay/{doc_base_name}_page{n}_redaction_overlay.jpg`` under OUTPUT_FOLDER "
        "(scaled per REVIEW_OVERLAY_MAX_PIXELS, JPEG capped by REVIEW_OVERLAY_MAX_FILE_BYTES). "
        "Uses ``tools.redaction_review.visualise_review_redaction_boxes``."
    ),
)
def post_export_review_redaction_overlay(
    body: AgentExportReviewRedactionOverlayRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Renders the redaction-box overlay for one page and returns the written path.
    import pandas as pd

    from tools.redaction_review import visualise_review_redaction_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)
    annotator: dict[str, Any] = {"image": img_path, "boxes": body.boxes}
    # An empty frame is passed when no review rows were supplied.
    review_df = (
        pd.DataFrame(body.review_df_records)
        if body.review_df_records
        else pd.DataFrame()
    )
    out_folder = _agent_prepare_output_folder()

    path = visualise_review_redaction_boxes(
        annotator,
        review_df=review_df,
        output_folder=out_folder,
        page_number=body.page_number,
        # Strip directory components from the caller-supplied name.
        doc_base_name=_agent_doc_base_name(body.doc_base_name),
        label_abbrev_chars=body.label_abbrev_chars,
    )
    if not path:
        # The route description documents a .jpg output, so the message says
        # "image" rather than the previous, inaccurate "PNG".
        raise HTTPException(
            status_code=500,
            detail=(
                "Could not produce overlay image (invalid image/boxes or write failed). "
                "Ensure boxes are valid and the image loads."
            ),
        )
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_redaction_overlay",
        task="export_review_redaction_overlay",
        output_dir=out_folder,
        input_dir="",
        message="Redaction overlay image written",
        output_paths=[path],
    )


@router.post(
    "/export_review_page_ocr_visualisation",
    response_model=AgentTaskResponse,
    summary="export_review_page_ocr_visualisation (Agent API; Gradio api_name: page_ocr_review_image)",
    description=(
        "Renders a per-page OCR visualisation using tools.file_redaction.visualise_ocr_words_bounding_boxes; "
        "writes under OUTPUT_FOLDER/review_ocr_visualisations/."
    ),
)
def post_export_review_page_ocr_visualisation(
    body: AgentExportReviewPageOcrVisualisationRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Renders the word-level OCR visualisation for one page.
    from PIL import Image

    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)
    out_folder = _agent_prepare_output_folder()

    # Strip directory components so the name cannot escape the output folder.
    safe_base = _agent_doc_base_name(body.doc_base_name)
    image_name = f"{safe_base}_page{int(body.page_number)}.png"
    log_paths: list[str] = []
    try:
        # The context manager closes the source file even if rendering fails.
        with Image.open(img_path) as page_image:
            log_paths = visualise_ocr_words_bounding_boxes(
                page_image.convert("RGB"),
                body.ocr_results,
                image_name=image_name,
                output_folder=out_folder,
                visualisation_folder="review_ocr_visualisations",
                add_legend=True,
                log_files_output_paths=log_paths,
            )
    except Exception as e:
        # Route boundary: surface any rendering failure as a 500.
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not log_paths:
        raise HTTPException(
            status_code=500,
            detail="Could not produce OCR visualisation (invalid image/ocr_results or write failed).",
        )
    # The last entry of the returned list is reported as the output file.
    out_path = log_paths[-1]
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_page_ocr_visualisation",
        task="export_review_page_ocr_visualisation",
        output_dir=out_folder,
        input_dir="",
        message="OCR visualisation written",
        output_paths=[out_path],
    )
856
+
857
+
858
def _gradio_only(api_name: str, detail: str) -> JSONResponse:
    """Build the 501 response for routes that exist only on the Gradio HTTP API.

    Args:
        api_name: Gradio ``api_name`` of the route; echoed back and embedded
            in the example call/poll URLs.
        detail: Explanation of why the route is not served under /agent.

    Returns:
        A ``JSONResponse`` with status 501 carrying the name, the detail, a
        hint, the Gradio HTTP call pattern and gradio_client usage notes.
    """
    hint = (
        "This flow is Gradio-session stateful. Call the named route on the "
        "Gradio HTTP API, not /agent."
    )
    # How to reach the same route through Gradio's own HTTP endpoints.
    gradio_http = {
        "discover_schema": "GET /gradio_api/info",
        "start_call": f"POST /gradio_api/call/{api_name}",
        "request_body_shape": '{"data": [<args in schema order>]}',
        "poll": f"GET /gradio_api/call/{api_name}/{{event_id}}",
    }
    client_notes = [
        "Pass api_name explicitly; do not rely on inferring the endpoint from "
        "Python function names (large Blocks apps will look ambiguous).",
        "If predict() still cannot resolve the route, open GET /gradio_api/info "
        "and use the numeric fn_index with gradio_client, or call the HTTP "
        "endpoints directly.",
        "The length of data must match the parameter list for this deployment; "
        "copy order and types from /gradio_api/info.",
    ]
    payload = {
        "gradio_api_name": api_name,
        "detail": detail,
        "hint": hint,
        "gradio_http": gradio_http,
        "gradio_client_notes": client_notes,
    }
    return JSONResponse(status_code=501, content=payload)
885
+
886
+
887
@router.post("/load_and_prepare_documents_or_data")
def post_load_and_prepare_documents_or_data() -> JSONResponse:
    # Always answers 501: this flow is only reachable via the Gradio HTTP API.
    api_name = "load_and_prepare_documents_or_data"
    reason = "Preparation uses Gradio session state and prepare_image_or_pdf_with_efficient_ocr; no single CLI task."
    return _gradio_only(api_name, reason)
893
+
894
+
895
@router.post(
    "/apply_review_redactions",
    response_model=AgentTaskResponse,
    summary="apply_review_redactions (Gradio api_name)",
    description=(
        "Runs prepare_image_or_pdf_with_efficient_ocr([pdf, review_csv]) then "
        "apply_redactions_to_review_df_and_files — same core pipeline as the Review tab, "
        "without Gradio session state. Requires paths under allowed roots."
    ),
)
def post_apply_review_redactions(
    body: AgentApplyReviewRedactionsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Validates the request paths, then delegates to run_apply_review_redactions.
    from tools.simplified_api import run_apply_review_redactions

    source_pdf = _path_must_be_allowed_file(body.pdf_path)
    review_csv = _path_must_be_allowed_file(body.review_csv_path)

    # Directories are optional and need not exist yet (must_exist=False);
    # None is forwarded unchanged when the caller did not supply one.
    resolved_out: str | None = (
        _path_must_be_allowed_directory(body.output_dir, must_exist=False)
        if body.output_dir is not None
        else None
    )
    resolved_in: str | None = (
        _path_must_be_allowed_directory(body.input_dir, must_exist=False)
        if body.input_dir is not None
        else None
    )

    try:
        outcome = run_apply_review_redactions(
            pdf_path=source_pdf,
            review_csv_path=review_csv,
            output_dir=resolved_out,
            input_dir=resolved_in,
            text_extract_method=body.text_extract_method,
            efficient_ocr=body.efficient_ocr,
        )
    except ValueError as e:
        # ValueError is reported as a client error.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Anything else is reported as a server-side failure.
        raise HTTPException(
            status_code=500,
            detail=f"apply_review_redactions failed: {e}",
        ) from e

    return AgentTaskResponse(
        status="completed",
        gradio_api_name="apply_review_redactions",
        task="apply_review_redactions",
        output_dir=outcome["output_dir"],
        input_dir=outcome["input_dir"],
        message=outcome["message"],
        output_paths=outcome.get("output_paths"),
    )
946
+
947
+
948
@router.post(
    "/verify_redaction_coverage",
    response_model=AgentVerifyRedactionResponse,
    summary="verify_redaction_coverage (Pass 1 programmatic QA)",
)
def post_verify_redaction_coverage(
    body: AgentVerifyRedactionRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentVerifyRedactionResponse:
    # Validates the input paths, runs the coverage check and maps its report
    # onto the response model.
    # (Kept as comments, not a docstring, so the OpenAPI description is unchanged.)
    from tools.simplified_api import run_verify_redaction_coverage

    review_csv = _path_must_be_allowed_file(body.review_csv_path)
    ocr_words_csv = _path_must_be_allowed_file(body.ocr_words_csv_path)
    # The redacted PDF is optional; it is validated only when provided.
    redacted_pdf = (
        _path_must_be_allowed_file(body.redacted_pdf_path)
        if body.redacted_pdf_path
        else None
    )

    try:
        # NOTE(review): pruned_output_path is forwarded without the
        # allowed-roots check applied to the other paths — confirm that
        # run_verify_redaction_coverage validates it.
        report, pruned_csv_path, prune_log = run_verify_redaction_coverage(
            review_csv_path=review_csv,
            ocr_words_csv_path=ocr_words_csv,
            must_redact=body.must_redact,
            must_not_redact=body.must_not_redact,
            redacted_pdf_path=redacted_pdf,
            total_pages=body.total_pages,
            min_word_length=body.min_word_length,
            sample_pixels=body.sample_pixels,
            auto_prune_suspicious=body.auto_prune_suspicious,
            pruned_output_path=body.pruned_output_path,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"verify_redaction_coverage failed: {e}"
        ) from e

    # "pass_strict" wins when present; "pass" is the fallback key. The same
    # value feeds both coverage_pass and coverage_pass_strict.
    strict_pass = bool(report.get("pass_strict", report.get("pass")))
    cleanup_pass = bool(report.get("pass_with_cleanup"))
    return AgentVerifyRedactionResponse(
        status="completed",
        coverage_pass=strict_pass,
        coverage_pass_strict=strict_pass,
        coverage_pass_with_cleanup=cleanup_pass,
        pruned_csv_path=pruned_csv_path,
        prune_log=prune_log,
        report=report,
    )
992
+
993
+
994
@router.post(
    "/word_level_ocr_text_search",
    response_model=AgentWordLevelOcrSearchResponse,
    summary="word_level_ocr_text_search (headless OCR CSV search)",
)
def post_word_level_ocr_text_search(
    body: AgentWordLevelOcrSearchRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentWordLevelOcrSearchResponse:
    # Validates the CSV paths and delegates the search to the simplified API.
    # (Kept as comments, not a docstring, so the OpenAPI description is unchanged.)
    from tools.simplified_api import run_word_level_ocr_text_search_api

    ocr_words_csv = _path_must_be_allowed_file(body.ocr_words_csv_path)
    # The review CSV is optional; it is validated only when provided.
    review_csv = (
        _path_must_be_allowed_file(body.review_csv_path)
        if body.review_csv_path
        else None
    )

    try:
        search_result = run_word_level_ocr_text_search_api(
            ocr_words_csv_path=ocr_words_csv,
            search_text=body.search_text,
            similarity_threshold=body.similarity_threshold,
            use_regex=body.use_regex,
            review_csv_path=review_csv,
        )
    except ValueError as e:
        # ValueError is reported as a client error.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"word_level_ocr_text_search failed: {e}"
        ) from e

    return AgentWordLevelOcrSearchResponse(status="completed", result=search_result)
1024
+
1025
+
1026
@router.get("/operations")
def list_operations() -> dict[str, Any]:
    # Static catalogue of the /agent routes: the known Gradio api names, the
    # endpoints that answer 501 here, and one entry per route.
    # (Kept as comments, not a docstring, so the OpenAPI description is unchanged.)

    def _redact_notes() -> dict[str, Any]:
        # Built fresh on every call so the two redact entries share no
        # mutable containers.
        return {
            "ocr_method": [
                "Local OCR",
                "AWS Textract",
                "Local text",
            ],
            "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
            "pii_detector_recommended": [
                LOCAL_PII_OPTION,
                AWS_PII_OPTION,
                AWS_LLM_PII_OPTION,
                INFERENCE_SERVER_PII_OPTION,
                LOCAL_TRANSFORMERS_LLM_PII_OPTION,
                "None",
            ],
        }

    def _route(
        api_name: str,
        implementation: str,
        notes: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        # Every route is a POST served at /agent/<api_name>.
        entry: dict[str, Any] = {
            "gradio_api_name": api_name,
            "method": "POST",
            "path": f"/agent/{api_name}",
            "implementation": implementation,
        }
        if notes is not None:
            entry["notes"] = notes
        return entry

    verify_notes = {
        "purpose": "Pass 1 programmatic QA — pass_strict (policy), pass_with_cleanup (+ suspicious rows), optional prune and text/pixel checks.",
        "must_redact": "list of regex strings",
        "must_not_redact": "list of regex strings",
        "auto_prune_suspicious": "remove short OCR-fragment rows before reporting",
        "pages_flagged_for_vlm": "policy/visual failures only",
        "pages_needing_csv_cleanup": "suspicious rows — prune, not VLM",
        "leak_likely_causes": "per-page hints when text_layer_leaks (coord_not_normalized, missing_page_boxes, etc.) — not a broken /review_apply",
    }

    session_state_endpoints = {
        "description": (
            "These api_name values are exposed on the Gradio HTTP API but return "
            "501 on /agent because they depend on in-memory Gradio state."
        ),
        "discover_schema": "GET /gradio_api/info",
        "call_pattern": 'POST /gradio_api/call/<api_name> with JSON body {"data": [...]}',
        "names": [
            "load_and_prepare_documents_or_data",
        ],
    }

    routes = [
        _route("redact_document", "cli_redact task redact", _redact_notes()),
        _route("redact_data", "cli_redact task redact", _redact_notes()),
        _route("find_duplicate_pages", "cli_redact deduplicate pages"),
        _route("find_duplicate_tabular", "cli_redact deduplicate tabular"),
        _route("summarise_document", "cli_redact task summarise"),
        _route("combine_review_pdfs", "cli_redact combine_review_pdfs"),
        _route(
            "export_review_redaction_overlay",
            "visualise_review_redaction_boxes",
        ),
        _route(
            "export_review_page_ocr_visualisation",
            "visualise_ocr_words_bounding_boxes",
        ),
        _route("combine_review_csvs", "helper merge_csv_files"),
        _route("load_and_prepare_documents_or_data", "not_implemented_http"),
        _route(
            "apply_review_redactions",
            "tools.simplified_api.run_apply_review_redactions",
        ),
        _route(
            "verify_redaction_coverage",
            "tools.verify_redaction_coverage.verify_redaction_coverage",
            verify_notes,
        ),
        _route(
            "word_level_ocr_text_search",
            "tools.verify_redaction_coverage.run_word_level_ocr_text_search",
        ),
    ]

    return {
        "gradio_api_names": list(GRADIO_API_NAMES),
        "gradio_session_state_endpoints": session_state_endpoints,
        "routes": routes,
    }
1163
+
1164
+
1165
@router.get("/health")
def agent_health() -> dict[str, str]:
    # Liveness probe: always reports the agent service as "ok".
    payload: dict[str, str] = {}
    payload["status"] = "ok"
    payload["service"] = "agent"
    return payload
app.py ADDED
The diff for this file is too large to render. See raw diff
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from aws_cdk import App, Environment
4
+ from cdk_appregistry import register_doc_redaction_application
5
+ from cdk_config import (
6
+ ALB_NAME,
7
+ APPREGISTRY_APPLICATION_NAME,
8
+ APPREGISTRY_ATTRIBUTE_GROUP_NAME,
9
+ APPREGISTRY_DESCRIPTION,
10
+ APPREGISTRY_REPOSITORY_URL,
11
+ APPREGISTRY_STACK_NAME,
12
+ AWS_ACCOUNT_ID,
13
+ AWS_REGION,
14
+ CDK_CONTEXT_FILE,
15
+ CDK_PREFIX,
16
+ ENABLE_APPREGISTRY,
17
+ RUN_USEAST_STACK,
18
+ USE_CLOUDFRONT,
19
+ )
20
+ from cdk_functions import (
21
+ create_basic_config_env,
22
+ is_resource_delete_protection_enabled,
23
+ load_context_from_file,
24
+ log_aws_credential_context,
25
+ purge_cdk_lookup_context,
26
+ )
27
+ from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
28
+ from check_resources import CONTEXT_FILE, check_and_set_context
29
+
30
# Entry point run by the CDK CLI: generate the pre-check context, define the
# stacks, then synthesise the cloud assembly.

# Initialize the CDK app
app = App()

log_aws_credential_context(
    expected_account_id=AWS_ACCOUNT_ID,
    expected_region=AWS_REGION,
)

# Drop stale CDK lookup cache entries (require bootstrap lookup role in target account).
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

# --- Pre-check context (boto3) — written to precheck.context.json, NOT cdk.context.json ---
print(f"Pre-check context file: {CONTEXT_FILE}")
print(f"CDK lookup cache file: {CDK_CONTEXT_FILE}")
# Compare basenames with separators normalised so Windows-style paths match too.
if os.path.basename(CONTEXT_FILE.replace("\\", "/")) == os.path.basename(
    CDK_CONTEXT_FILE.replace("\\", "/")
):
    raise RuntimeError(
        f"CONTEXT_FILE and CDK_CONTEXT_FILE must differ (got '{CONTEXT_FILE}' for both). "
        "Set CONTEXT_FILE=precheck.context.json in config/cdk_config.env."
    )

print("Running pre-check script to generate application context...")
try:
    check_and_set_context()
    if not os.path.exists(CONTEXT_FILE):
        raise RuntimeError(
            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
        )
    print(f"Context generated successfully at {CONTEXT_FILE}.")
except Exception as e:
    # Chain explicitly so the original traceback is reported as the direct cause.
    raise RuntimeError(
        f"Failed to generate context via check_and_set_context(): {e}"
    ) from e

# Pre-check must not repopulate CDK lookup keys; purge again if paths were ever shared.
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

if os.path.exists(CONTEXT_FILE):
    load_context_from_file(app, CONTEXT_FILE)
else:
    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")

create_basic_config_env("config")

aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)

# One flag drives termination protection on every stack created below.
_stack_delete_protection = is_resource_delete_protection_enabled()

regional_stack = CdkStack(
    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
)
regional_stack.termination_protection = _stack_delete_protection

# Config flags are compared as the literal string "True".
if ENABLE_APPREGISTRY == "True":
    # Use pre-check context only — not regional_stack.params (avoids AppRegistry
    # -> RedactionStack dependency cycle during synth).
    _alb_dns_context = app.node.try_get_context(f"dns:{ALB_NAME}")
    # Only a non-blank string counts as a DNS name; anything else becomes None.
    _alb_dns_name = (
        _alb_dns_context.strip()
        if isinstance(_alb_dns_context, str) and _alb_dns_context.strip()
        else None
    )
    appregistry_stack = register_doc_redaction_application(
        app,
        aws_account_id=AWS_ACCOUNT_ID,
        aws_region=AWS_REGION,
        application_name=APPREGISTRY_APPLICATION_NAME,
        application_description=APPREGISTRY_DESCRIPTION,
        appregistry_stack_name=APPREGISTRY_STACK_NAME,
        attribute_group_name=APPREGISTRY_ATTRIBUTE_GROUP_NAME,
        repository_url=APPREGISTRY_REPOSITORY_URL,
        cdk_prefix=CDK_PREFIX,
        use_cloudfront=USE_CLOUDFRONT,
        alb_dns_name=_alb_dns_name,
    )
    appregistry_stack.termination_protection = _stack_delete_protection

if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")

    cloudfront_stack = CdkStackCloudfront(
        app,
        "RedactionStackCloudfront",
        env=aws_env_us_east_1,
        alb_arn=regional_stack.params["alb_arn_output"],
        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
        alb_dns_name=regional_stack.params["alb_dns_name"],
        cross_region_references=True,
    )
    cloudfront_stack.termination_protection = _stack_delete_protection

# CDK CLI invokes this script and expects a cloud assembly in cdk.out.
# Without app.synth(), Python defines constructs but never writes manifest.json
# (ENOENT on deploy). See: https://github.com/aws/aws-cdk/issues/11023
app.synth()
cdk/cdk.json.example ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "app": "python app.py",
3
+ "output": "cdk.out",
4
+ "context": {
5
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": false
6
+ }
7
+ }