from __future__ import annotations

import re
from dataclasses import dataclass, field


# Block-list of record keys treated as PII.  Any field whose name appears
# here must never be written to disk.  Entries are lower-case.
_PII_FIELDS: frozenset[str] = frozenset(
    (
        # Personal names
        "name",
        "display_name",
        "author_name",
        "full_name",
        "first_name",
        "last_name",
        "investigator",
        "pi_name",
        # Contact details
        "email",
        "phone",
        "address",
        # Organisational ties
        "affiliation",
        "institution_name",
        # Personal / professional identifiers
        "orcid",
        "npi",
        "dea_number",
    )
)

# Heuristic for values that look like a real person's name written in Latin
# script, e.g. "John A. Smith" or "Ursula Østby":
#   * a capitalised first token,
#   * an optional single-letter middle initial (with or without a dot),
#   * one to four further capitalised tokens.
# Tokens may contain Latin-1 accented letters, hyphens and apostrophes and
# may be separated by any run of whitespace.
#
# NOTE: only Latin letters (ASCII + Latin-1 Supplement) are recognised.  Names
# written in other scripts (Arabic, Cyrillic, CJK, ...) do NOT match this
# pattern and are only caught when their field name is on the block-list.
_NAME_PATTERN = re.compile(
    # First token.  "Ø-Þ" covers the Latin-1 capitals after the "×" sign,
    # which the plain "À-Ö" range leaves out.
    r"^[A-ZÀ-ÖØ-Þ][a-zA-ZÀ-öø-ÿ\-']+"
    # Optional middle initial.
    r"(?:\s+[A-ZÀ-ÖØ-Þ]\.?)?"
    # One to four further capitalised tokens.
    r"(?:\s+[A-ZÀ-ÖØ-Þ][a-zA-ZÀ-öø-ÿ\-']+){1,4}$"
)


@dataclass
class CodeNameFactory:
    """Hands out opaque, sequentially numbered code-names for HCP records.

    Every call to ``next_code_name`` advances ``counter`` by one and returns
    ``<prefix>_<counter>``, with the counter zero-padded to at least four
    digits.  Only the code name is meant to be persisted — real names are
    *never* stored.
    """

    # Text placed in front of the underscore in every generated code name.
    prefix: str
    # Number of code names issued so far; the next one uses ``counter + 1``.
    counter: int = 0

    def next_code_name(self) -> str:
        """Advance the counter and return the code name for its new value."""
        sequence = self.counter + 1
        self.counter = sequence
        return f"{self.prefix}_{sequence:04d}"


@dataclass
class PIIScrubber:
    """Removes Personally Identifiable Information from a raw record dict.

    Strategy:
    * Drop any key whose name (trimmed, case-insensitive) is in the
      ``_PII_FIELDS`` block-list or in ``extra_blocked_keys``.
    * Drop any string value matching ``_NAME_PATTERN`` (catches
      unpredictably named fields carrying real names).
    * Apply both rules recursively to nested ``dict`` and ``list`` values so
      PII cannot hide below the top level.  Other container types are passed
      through unchanged.
    * Assign an opaque ``code_name`` from the supplied ``CodeNameFactory``.

    The scrubber is intentionally *conservative*: when in doubt, drop the
    field.  Data utility is never more important than zero-PII compliance.

    Attributes:
        factory: Source of the ``code_name`` stamped on each scrubbed record.
        extra_blocked_keys: Additional field names to drop.  They are matched
            the same way as the built-in block-list (trimmed,
            case-insensitive).
    """

    factory: CodeNameFactory
    extra_blocked_keys: frozenset[str] = field(default_factory=frozenset)

    def scrub(self, record: dict) -> dict:
        """Return a scrubbed copy of *record* carrying a fresh ``code_name``.

        Args:
            record: Raw record.  It is not modified; nested dicts and lists
                are copied while being scrubbed.

        Returns:
            A new dict without blocked keys or name-like string values, plus
            a ``code_name`` entry.  One code name is consumed from
            ``factory`` per call.
        """
        # Record keys are normalised before the lookup, so caller-supplied
        # extras need the same normalisation; otherwise an entry such as
        # "SSN" could never match and the field would leak.
        blocked = _PII_FIELDS | {
            self._normalize_key(extra) for extra in self.extra_blocked_keys
        }
        clean = self._scrub_mapping(record, blocked)

        # Always assign the code name last so it cannot be overwritten
        clean["code_name"] = self.factory.next_code_name()
        return clean

    @staticmethod
    def _normalize_key(key: str) -> str:
        """Return *key* in the form used for block-list comparisons."""
        return key.strip().lower()

    @staticmethod
    def _looks_like_name(value: object) -> bool:
        """Return True if *value* is a string matching ``_NAME_PATTERN``."""
        return (
            isinstance(value, str)
            and _NAME_PATTERN.match(value.strip()) is not None
        )

    def _scrub_mapping(self, mapping: dict, blocked: frozenset[str]) -> dict:
        """Return a copy of *mapping* without blocked keys or name-like values."""
        clean: dict = {}
        for key, value in mapping.items():
            # 1. Drop if the field name is on the block-list.  Non-string
            #    keys cannot be on it, so they skip the string normalisation
            #    instead of raising AttributeError.
            if isinstance(key, str) and self._normalize_key(key) in blocked:
                continue
            # 2. Drop string values that look like real names
            if self._looks_like_name(value):
                continue
            clean[key] = self._scrub_nested(value, blocked)
        return clean

    def _scrub_nested(self, value: object, blocked: frozenset[str]) -> object:
        """Scrub *value* if it is a dict or list; return anything else as is."""
        if isinstance(value, dict):
            return self._scrub_mapping(value, blocked)
        if isinstance(value, list):
            return [
                self._scrub_nested(item, blocked)
                for item in value
                if not self._looks_like_name(item)
            ]
        return value