"""HIPAA + Corporate PII scrubber for Orsync Scenarist v7.0.

Removes personally identifiable and sensitive corporate/financial information
from text before it is sent to external LLM APIs (Ollama Cloud, etc.).

Coverage:
    HIPAA Safe Harbor PHI — SSN, phone, email, MRN/NPI, dates, IP, address, ZIP
    Corporate/Financial   — IBAN, PAN/credit card, VAT, company reg,
                            invoice refs, passport references,
                            Egyptian National ID, large monetary amounts
"""

from __future__ import annotations

import re

# ── Compiled patterns (order matters: more specific first) ────────────────────
#
# The patterns are applied one after another, each on the output of the
# previous one.  The phone pattern is deliberately loose (it has no leading
# boundary and accepts 8–15 digits with optional separators), so every
# strictly-shaped identifier that it could partially consume — e-mail
# addresses, card numbers, IBANs, 14-digit national IDs, IPv4 addresses —
# MUST be listed before it.  Otherwise the phone pattern eats part of the
# value and the remainder (card tail, IBAN prefix, e-mail domain) leaks.
_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # ── Strictly-shaped identifiers (must precede the phone pattern) ────────
    # Email addresses.  Runs first: once digits in the local part have been
    # replaced by another placeholder the address no longer matches.
    (
        re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"),
        "[EMAIL_REDACTED]",
    ),
    # US Social Security Numbers (XXX-XX-XXXX)
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[SSN_REDACTED]"),
    # Payment card numbers — 4×4 digit groups separated by space or hyphen
    (
        re.compile(r"\b\d{4}[\s\-]\d{4}[\s\-]\d{4}[\s\-]\d{4}\b"),
        "[CARD_REDACTED]",
    ),
    # IBAN — ISO 13616: 2-letter country code + 2 check digits
    # + up to 30 alphanumeric BBAN characters (unspaced form only)
    (re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{4,30}\b"), "[IBAN_REDACTED]"),
    # Egyptian National ID — 14-digit number starting with 2 or 3 (century digit)
    (re.compile(r"\b[23]\d{13}\b"), "[NATIONAL_ID_REDACTED]"),
    # IP addresses (v4)
    (re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "[IP_REDACTED]"),

    # ── HIPAA PHI ───────────────────────────────────────────────────────────
    # Phone numbers (various US/intl formats)
    (
        re.compile(
            r"(?:\+?\d{1,3}[\s.-]?)?\(?\d{2,4}\)?[\s.-]?\d{3,4}[\s.-]?\d{3,4}\b"
        ),
        "[PHONE_REDACTED]",
    ),
    # Medical Record Numbers / NPI (keyword followed by a 6–12 digit sequence)
    (
        re.compile(r"\b(?:MRN|NPI|ID)[:#\s]*\d{6,12}\b", re.IGNORECASE),
        "[MRN_REDACTED]",
    ),
    # Dates of birth / full dates (MM/DD/YYYY, YYYY-MM-DD, etc.)
    (
        re.compile(
            r"\b(?:"
            r"\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}"  # MM/DD/YYYY or DD-MM-YY
            r"|"
            r"\d{4}[/\-]\d{1,2}[/\-]\d{1,2}"  # YYYY-MM-DD
            r")\b"
        ),
        "[DATE_REDACTED]",
    ),
    # Street addresses (rough heuristic: number + street word)
    (
        re.compile(
            r"\b\d{1,5}\s+(?:[A-Z][a-z]+\s?){1,4}"
            r"(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln|Way|Court|Ct|Circle|Cir)\b",
            re.IGNORECASE,
        ),
        "[ADDRESS_REDACTED]",
    ),
    # ZIP codes (US 5-digit or 5+4)
    (re.compile(r"\b\d{5}(?:-\d{4})?\b"), "[ZIP_REDACTED]"),

    # ── Corporate / Financial PII ───────────────────────────────────────────
    # EU / UK / Middle-East VAT numbers (various country prefixes)
    (
        re.compile(
            r"\b(?:VAT|TVA|BTW|MwSt|GST)[#\s:]*[A-Z]{0,2}\s*\d{6,15}\b",
            re.IGNORECASE,
        ),
        "[VAT_REDACTED]",
    ),
    # Company Registration Numbers (CRN, Reg No, Company No)
    (
        re.compile(
            r"\b(?:CRN|Reg\.?\s*No\.?|Company\s*No\.?|CR\s*No\.?)[#\s:]*[A-Z0-9\-]{4,15}\b",
            re.IGNORECASE,
        ),
        "[CRN_REDACTED]",
    ),
    # Passport reference context (Passport No: ABC123456)
    (
        re.compile(
            r"\b(?:Passport|PP)\.?\s*(?:No\.?|Number|#)[\s:]*[A-Z0-9]{6,9}\b",
            re.IGNORECASE,
        ),
        "[PASSPORT_REDACTED]",
    ),
    # Internal document reference IDs (INV-1234, PO-98765, CONTRACT-42)
    (
        re.compile(
            r"\b(?:INV|PO|SO|CONTRACT|CO|REF|WO|DO)[/\-#]?\s*\d{3,12}\b",
            re.IGNORECASE,
        ),
        "[DOC_REF_REDACTED]",
    ),
    # Explicit large monetary amounts with ISO currency codes
    # (≥ 4 digit/comma characters to avoid false positives)
    (
        re.compile(
            r"\b(?:USD|EUR|GBP|EGP|SAR|AED|QAR)\s*[\d,]{4,}(?:\.\d{1,2})?\b"
        ),
        "[AMOUNT_REDACTED]",
    ),
]


def scrub_pii(text: str) -> str:
    """Remove PII from *text* and return the sanitised version.

    Every pattern in ``_PATTERNS`` is applied in list order; each substitution
    operates on the output of the previous one, so a value is labelled by the
    first pattern that matches it.

    Args:
        text: Raw text that may contain PII.

    Returns:
        A copy of *text* with every match replaced by its ``[..._REDACTED]``
        placeholder.  Text without matches is returned unchanged.
    """
    result = text
    for pattern, replacement in _PATTERNS:
        result = pattern.sub(replacement, result)
    return result


def contains_pii(text: str) -> bool:
    """Return True if any PII pattern is detected in *text*.

    Each pattern is searched against the original *text* independently (no
    substitution is performed), and the search stops at the first hit.
    """
    return any(pattern.search(text) for pattern, _ in _PATTERNS)