File size: 3,060 Bytes
849ee7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""Privacy-preserving redaction helpers for uploaded traces."""

from __future__ import annotations

import re
from collections import Counter
from dataclasses import dataclass


@dataclass(slots=True)
class RedactionResult:
    """Outcome of a redaction pass, as produced by ``redact_text``."""

    # The input text after every redaction pattern has been applied.
    text: str
    # One "<label>: <count>" entry per pattern label that matched, sorted by label.
    notes: list[str]
    # Total number of substitutions made across all patterns.
    count: int


# Ordered redaction table consumed by ``redact_text``.
#
# Each entry is ``(label, compiled pattern, replacement template)``:
#   * ``label`` is the human-readable name used in the per-pattern notes.
#   * ``replacement`` is an ``re.sub`` template, so ``\1`` refers to a capture
#     group and ``\\`` produces a single literal backslash.
#
# Patterns are applied top to bottom, each one to the output of the previous
# one. Vendor-specific token formats are listed before the generic
# "long base64-like secret" catch-all so they receive their own placeholder.
_REDACTION_PATTERNS: list[tuple[str, re.Pattern[str], str]] = [
    (
        # Keeps the "Authorization: Bearer " prefix (group 1), drops the token.
        "authorization bearer token",
        re.compile(r"(?i)\b(authorization\s*:\s*bearer\s+)[A-Za-z0-9._~+/=-]{12,}"),
        r"\1[REDACTED_BEARER_TOKEN]",
    ),
    (
        "GitHub token",
        re.compile(r"\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{20,}\b"),
        "[REDACTED_GITHUB_TOKEN]",
    ),
    (
        "GitHub fine-grained token",
        re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
        "[REDACTED_GITHUB_TOKEN]",
    ),
    (
        "OpenAI API key",
        re.compile(r"\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\b"),
        "[REDACTED_OPENAI_KEY]",
    ),
    (
        "Hugging Face token",
        re.compile(r"\bhf_[A-Za-z0-9]{20,}\b"),
        "[REDACTED_HF_TOKEN]",
    ),
    (
        "GitLab token",
        re.compile(r"\bglpat-[A-Za-z0-9_-]{20,}\b"),
        "[REDACTED_GITLAB_TOKEN]",
    ),
    (
        # AKIA = long-term access key IDs, ASIA = temporary (STS) access key
        # IDs; both are a 4-letter prefix followed by 16 uppercase/digit chars.
        "AWS access key",
        re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"),
        "[REDACTED_AWS_ACCESS_KEY]",
    ),
    (
        "Slack token",
        re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{20,}\b"),
        "[REDACTED_SLACK_TOKEN]",
    ),
    (
        # Non-greedy body so adjacent key blocks are redacted one at a time.
        "private key block",
        re.compile(
            r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----",
            re.MULTILINE,
        ),
        "[REDACTED_PRIVATE_KEY]",
    ),
    (
        "email address",
        re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
        "[REDACTED_EMAIL]",
    ),
    (
        # Home-directory paths: both the user name and the remainder of the
        # path are replaced. A trailing "/" after the user name is required.
        "macOS user path",
        re.compile(r"/Users/[^/\s]+/[^\s`'\"<>)]*"),
        "/Users/[REDACTED_USER]/[REDACTED_PATH]",
    ),
    (
        "Linux home path",
        re.compile(r"/home/[^/\s]+/[^\s`'\"<>)]*"),
        "/home/[REDACTED_USER]/[REDACTED_PATH]",
    ),
    (
        # Any drive letter matches; the replacement always reports "C:".
        "Windows user path",
        re.compile(r"[A-Za-z]:\\Users\\[^\\\s]+\\[^\s`'\"<>)]*"),
        r"C:\\Users\\[REDACTED_USER]\\[REDACTED_PATH]",
    ),
    (
        # Keeps scheme/host/path (group 1) and drops everything after "?".
        "URL query string",
        re.compile(r"\b(https?://[^\s`'\"<>?]+)\?[^\s`'\"<>)]*"),
        r"\1?[REDACTED_QUERY]",
    ),
    (
        # Catch-all for long opaque blobs. The tail is "(?:={1,2}|\b)" rather
        # than "={0,2}\b": a "\b" placed after "=" only holds when a word
        # character follows, so padded tokens followed by whitespace or
        # punctuation used to leave their "=" padding behind in the output.
        "long base64-like secret",
        re.compile(r"\b[A-Za-z0-9+/]{48,}(?:={1,2}|\b)"),
        "[REDACTED_LONG_TOKEN]",
    ),
]


def redact_text(text: str) -> RedactionResult:
    """Scrub likely secrets from *text*, leaving everything else untouched.

    Every entry of ``_REDACTION_PATTERNS`` is applied in table order, each one
    operating on the output of the previous substitution.

    Returns:
        A ``RedactionResult`` holding the scrubbed text, one
        ``"<label>: <count>"`` note per label that matched (sorted by label),
        and the total number of substitutions performed.
    """

    hits_by_label: Counter[str] = Counter()
    scrubbed = text
    total = 0

    for label, pattern, replacement in _REDACTION_PATTERNS:
        scrubbed, hits = pattern.subn(replacement, scrubbed)
        if not hits:
            # Labels that never matched must not show up in the notes.
            continue
        hits_by_label[label] += hits
        total += hits

    notes = []
    for label in sorted(hits_by_label):
        notes.append(f"{label}: {hits_by_label[label]}")

    return RedactionResult(text=scrubbed, notes=notes, count=total)