from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field

# PII field names that must never be written to disk.
# Entries are lower-case; record keys are normalised before being compared.
_PII_FIELDS: frozenset[str] = frozenset(
    {
        "name",
        "display_name",
        "author_name",
        "full_name",
        "first_name",
        "last_name",
        "email",
        "phone",
        "address",
        "affiliation",
        "institution_name",
        "investigator",
        "pi_name",
        "orcid",
        "npi",
        "dea_number",
    }
)

# Latin-script values that look like real names (e.g. "John A. Smith"):
# a capitalised word, an optional single-letter initial, then one to four
# further capitalised words.  The character classes only cover ASCII and
# Latin-1 letters, so names in scripts without letter case (e.g.
# "فاطمة العتيبي") are handled separately by ``_looks_like_name``.
_NAME_PATTERN = re.compile(
    r"^[A-ZÀ-Ö][a-zA-ZÀ-öø-ÿ\-']+(?:\s[A-ZÀ-Ö]\.?)?"
    r"(?:\s[A-ZÀ-Ö][a-zA-ZÀ-öø-ÿ\-']+){1,4}$"
)


def _normalize_key(key: str) -> str:
    """Return *key* in the canonical form used for block-list lookups."""
    return key.strip().lower()


def _is_caseless_word(token: str) -> bool:
    """Return True if *token* consists only of caseless letters and marks.

    A "caseless" letter is alphabetic but neither upper- nor lower-case,
    which is how letters of scripts such as Arabic or Hebrew are classified.
    Combining marks (Unicode categories ``M*``) are tolerated so that
    vocalised or vowel-signed spellings still count as a word, but at least
    one real letter is required.
    """
    has_letter = False
    for ch in token:
        if ch.isalpha():
            if ch.isupper() or ch.islower():
                return False
            has_letter = True
        elif not unicodedata.category(ch).startswith("M"):
            return False
    return has_letter


def _looks_like_name(text: str) -> bool:
    """Return True if *text* resembles a personal name.

    Two checks are applied to the whitespace-stripped text:

    * the Latin-script ``_NAME_PATTERN``;
    * a fallback for caseless scripts: two to five whitespace-separated
      words made only of caseless letters.  Capitalisation cannot be used
      as a signal there, so any such short phrase is treated as a name.

    Single-word values are never flagged by either check.
    """
    candidate = text.strip()
    if not candidate:
        return False
    if _NAME_PATTERN.match(candidate):
        return True
    tokens = candidate.split()
    return 2 <= len(tokens) <= 5 and all(_is_caseless_word(t) for t in tokens)


@dataclass
class CodeNameFactory:
    """Assigns sequential, opaque code-names to HCP records.

    Real names are *never* stored — only the code name is persisted.

    Attributes:
        prefix: Text placed before the underscore in every code name.
        counter: Number of code names issued so far; incremented before
            each name is built, so the first name ends in ``0001``.
    """

    prefix: str
    counter: int = 0

    def next_code_name(self) -> str:
        """Advance the counter and return ``"<prefix>_<counter>"``.

        The counter is zero-padded to at least four digits.
        """
        self.counter += 1
        return f"{self.prefix}_{self.counter:04d}"


@dataclass
class PIIScrubber:
    """Removes Personally Identifiable Information from a raw record dict.

    Strategy:

    * Drop any string key whose normalised form (stripped, lower-cased) is
      in the ``_PII_FIELDS`` block-list or in ``extra_blocked_keys``.
      ``extra_blocked_keys`` entries are normalised the same way, so
      ``"SSN"`` blocks ``"ssn"``, ``"Ssn"`` and ``" ssn "`` alike.
    * Drop any key whose *value* is a string that looks like a personal
      name (catches unpredictably named fields carrying real names).
    * Assign an opaque ``code_name`` from the supplied ``CodeNameFactory``.

    The scrubber is intentionally *conservative*: when in doubt, drop the
    field. Data utility is never more important than zero-PII compliance.

    NOTE(review): only top-level keys and string values are inspected;
    nested dicts/lists are copied through unchanged.

    Attributes:
        factory: Source of the ``code_name`` written to every result.
        extra_blocked_keys: Additional field names to drop, matched
            case-insensitively.
    """

    factory: CodeNameFactory
    extra_blocked_keys: frozenset[str] = field(default_factory=frozenset)

    def scrub(self, record: dict) -> dict:
        """Return a scrubbed copy of *record* with a fresh ``code_name``.

        Args:
            record: Raw record; it is not modified.

        Returns:
            A new dict holding the surviving fields plus ``"code_name"``.
            Each call consumes one code name from ``self.factory``.
        """
        # Normalise caller-supplied keys so they are compared on the same
        # footing as the (normalised) record keys below.
        blocked = _PII_FIELDS | {
            _normalize_key(extra)
            for extra in self.extra_blocked_keys
            if isinstance(extra, str)
        }

        clean: dict = {}
        for key, value in record.items():
            # 1. Drop if the field name is on the block-list.  Non-string
            #    keys cannot name a blocked field, so only their value is
            #    inspected.
            if isinstance(key, str) and _normalize_key(key) in blocked:
                continue
            # 2. Drop string values that look like real names.
            if isinstance(value, str) and _looks_like_name(value):
                continue
            clean[key] = value

        # Always assign the code name last so it cannot be overwritten by
        # a "code_name" entry in the incoming record.
        clean["code_name"] = self.factory.next_code_name()
        return clean