Spaces:
Running on Zero
Running on Zero
File size: 3,060 Bytes
"""Privacy-preserving redaction helpers for uploaded traces."""
from __future__ import annotations
import re
from collections import Counter
from dataclasses import dataclass
@dataclass(slots=True)
class RedactionResult:
text: str
notes: list[str]
count: int
_REDACTION_PATTERNS: list[tuple[str, re.Pattern[str], str]] = [
(
"authorization bearer token",
re.compile(r"(?i)\b(authorization\s*:\s*bearer\s+)[A-Za-z0-9._~+/=-]{12,}"),
r"\1[REDACTED_BEARER_TOKEN]",
),
(
"GitHub token",
re.compile(r"\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{20,}\b"),
"[REDACTED_GITHUB_TOKEN]",
),
(
"GitHub fine-grained token",
re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
"[REDACTED_GITHUB_TOKEN]",
),
(
"OpenAI API key",
re.compile(r"\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\b"),
"[REDACTED_OPENAI_KEY]",
),
(
"Hugging Face token",
re.compile(r"\bhf_[A-Za-z0-9]{20,}\b"),
"[REDACTED_HF_TOKEN]",
),
(
"GitLab token",
re.compile(r"\bglpat-[A-Za-z0-9_-]{20,}\b"),
"[REDACTED_GITLAB_TOKEN]",
),
(
"AWS access key",
re.compile(r"\bAKIA[0-9A-Z]{16}\b"),
"[REDACTED_AWS_ACCESS_KEY]",
),
(
"Slack token",
re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{20,}\b"),
"[REDACTED_SLACK_TOKEN]",
),
(
"private key block",
re.compile(
r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----",
re.MULTILINE,
),
"[REDACTED_PRIVATE_KEY]",
),
(
"email address",
re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
"[REDACTED_EMAIL]",
),
(
"macOS user path",
re.compile(r"/Users/[^/\s]+/[^\s`'\"<>)]*"),
"/Users/[REDACTED_USER]/[REDACTED_PATH]",
),
(
"Linux home path",
re.compile(r"/home/[^/\s]+/[^\s`'\"<>)]*"),
"/home/[REDACTED_USER]/[REDACTED_PATH]",
),
(
"Windows user path",
re.compile(r"[A-Za-z]:\\Users\\[^\\\s]+\\[^\s`'\"<>)]*"),
r"C:\\Users\\[REDACTED_USER]\\[REDACTED_PATH]",
),
(
"URL query string",
re.compile(r"\b(https?://[^\s`'\"<>?]+)\?[^\s`'\"<>)]*"),
r"\1?[REDACTED_QUERY]",
),
(
"long base64-like secret",
re.compile(r"\b[A-Za-z0-9+/]{48,}={0,2}\b"),
"[REDACTED_LONG_TOKEN]",
),
]
def redact_text(text: str) -> RedactionResult:
    """Apply every redaction pattern to *text* and report what was replaced.

    The patterns in ``_REDACTION_PATTERNS`` are applied one after another,
    each on the output of the previous one.

    Returns:
        A ``RedactionResult`` holding the redacted text, one
        ``"<label>: <count>"`` note per label that matched (sorted by label),
        and the total number of substitutions.
    """
    hits_by_label: Counter[str] = Counter()
    total = 0
    current = text
    for label, pattern, replacement in _REDACTION_PATTERNS:
        current, hits = pattern.subn(replacement, current)
        if not hits:
            # Skip labels that never matched so they produce no note.
            continue
        hits_by_label[label] += hits
        total += hits
    summary = [f"{name}: {hits_by_label[name]}" for name in sorted(hits_by_label)]
    return RedactionResult(text=current, notes=summary, count=total)
|