"""Stream C — stylometric / structural features.

These capture *how* text is structured (sentence-length rhythm, lexical diversity, function-word use,
repetition) rather than *which exact tokens* appear. They are comparatively robust to paraphrasing,
which rewrites surface tokens but tends to preserve an author's structural fingerprint — and, unlike
raw perplexity, they don't collapse non-native human text onto AI text. Pure-Python, no heavy deps.
"""
from __future__ import annotations

import math
import re
from collections import Counter

import numpy as np

# Closed-class (function) word list, grouped by rough grammatical role. These words are
# paraphrase-stable, and AI vs human text differ in how they are distributed.
_FUNCTION_WORDS = {
    # articles
    "the", "a", "an",
    # conjunctions and discourse connectives
    "and", "or", "but", "if", "while", "nor", "so", "than", "as", "because", "however",
    "therefore", "thus", "moreover",
    # prepositions and particles
    "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during",
    "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under",
    # adverbs
    "again", "further", "then", "once", "here", "there", "not", "only", "too", "very", "just",
    # quantifiers and determiners
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "own",
    "same",
    # auxiliaries and modals
    "can", "will", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do",
    "does", "did",
    # demonstratives and pronouns
    "this", "that", "these", "those", "i", "you", "he", "she", "it", "we", "they", "them", "his",
    "her", "its", "their",
}

# A word token is a maximal run of ASCII letters and apostrophes.
_WORD_RE = re.compile(r"[A-Za-z']+")
# Sentence boundaries are runs of '.', '!' or '?'.
_SENT_RE = re.compile(r"[.!?]+")

# Column order of the vector produced by stylometric_features().
FEATURE_NAMES = [
    "log_n_words",          # log1p of the word count
    "mean_sent_len",        # mean sentence length, in words
    "std_sent_len",         # spread of sentence length
    "type_token_ratio",     # distinct words / total words
    "hapax_ratio",          # words used exactly once / distinct words
    "mean_word_len",        # mean word length, in characters
    "std_word_len",         # spread of word length
    "punct_density",        # punctuation characters / total characters
    "comma_per_word",       # commas / total words
    "function_word_ratio",  # function words / total words
    "bigram_repetition",    # 1 - distinct bigrams / total bigrams
    "uppercase_ratio",      # uppercase characters / alphabetic characters
    "digit_ratio",          # digit characters / total characters
    "stopword_gap_std",     # spread of the token gaps between successive function words
]
N_FEATURES = len(FEATURE_NAMES)


def stylometric_features(text: str) -> np.ndarray:
    """Return a fixed-length (N_FEATURES,) float vector of stylometric features.

    Values are emitted in the order given by FEATURE_NAMES. Empty, whitespace-only, or
    word-free input yields an all-zero vector.

    Args:
        text: Raw input text.

    Returns:
        1-D float array of length N_FEATURES.
    """
    if not text or not text.strip():
        return np.zeros(N_FEATURES, dtype=float)

    words = _WORD_RE.findall(text.lower())
    n_words = len(words)
    if n_words == 0:
        return np.zeros(N_FEATURES, dtype=float)

    # Sentence lengths, in words. Splitting on terminal punctuation leaves word-free residue
    # (a closing quote or bracket after the last period, a bare number); counting those as
    # zero-length sentences would deflate the mean and inflate the spread, so they are dropped.
    # If no fragment holds a word, treat the whole text as a single sentence.
    frag_lens = (len(_WORD_RE.findall(frag)) for frag in _SENT_RE.split(text))
    sent_lens = [n for n in frag_lens if n] or [n_words]

    counts = Counter(words)
    n_types = len(counts)  # >= 1 because n_words > 0
    hapax = sum(1 for c in counts.values() if c == 1)

    word_lens = [len(w) for w in words]
    n_chars = len(text)  # > 0: blank input returned early
    n_punct = sum(text.count(c) for c in ",.;:!?\"'()-")
    n_commas = text.count(",")

    # Share of adjacent word pairs that repeat an earlier pair.
    bigrams = list(zip(words, words[1:]))
    bigram_rep = 1.0 - len(set(bigrams)) / len(bigrams) if bigrams else 0.0

    letters = sum(1 for ch in text if ch.isalpha())
    uppers = sum(1 for ch in text if ch.isupper())
    digits = sum(1 for ch in text if ch.isdigit())

    # Rhythm of function words: gaps (in tokens) between successive function words.
    func_pos = [i for i, w in enumerate(words) if w in _FUNCTION_WORDS]
    n_func = len(func_pos)
    gaps = np.diff(func_pos) if n_func > 1 else np.array([0.0])

    feats = [
        math.log1p(n_words),
        float(np.mean(sent_lens)),
        float(np.std(sent_lens)),
        n_types / n_words,
        hapax / n_types,
        float(np.mean(word_lens)),
        float(np.std(word_lens)),
        n_punct / n_chars,
        n_commas / n_words,
        n_func / n_words,
        bigram_rep,
        uppers / max(letters, 1),  # text may contain apostrophes but no letters
        digits / n_chars,
        float(np.std(gaps)),
    ]
    return np.array(feats, dtype=float)


def stylometric_matrix(texts: list[str]) -> np.ndarray:
    """Stack per-text feature vectors into a (len(texts), N_FEATURES) matrix.

    Args:
        texts: Texts to featurize. Any iterable of strings is accepted, not only a list;
            None is treated as no texts.

    Returns:
        2-D float array with one row per text, each row produced by stylometric_features();
        shape (0, N_FEATURES) when there are no texts.
    """
    # Build the rows first and test *their* emptiness. Truth-testing the argument itself
    # raises for multi-element NumPy arrays and pandas Series, and is always true for a
    # generator, so an empty one would reach np.vstack with nothing to stack.
    rows = [stylometric_features(t) for t in texts] if texts is not None else []
    if not rows:
        return np.zeros((0, N_FEATURES), dtype=float)
    return np.vstack(rows)


if __name__ == "__main__":  # smoke test
    # Two contrasting samples, keyed by the label printed next to their feature values.
    samples = {
        "AI-ish": (
            "The implementation leverages a robust framework. The system processes data efficiently. "
            "The results demonstrate significant improvements across all evaluated metrics."
        ),
        "human-ish": (
            "I dunno, it kinda worked? We tried a bunch of stuff and, honestly, some of it broke — "
            "but the demo went fine in the end, somehow."
        ),
    }
    for label, sample in samples.items():
        vec = stylometric_features(sample)
        print(label, dict(zip(FEATURE_NAMES, np.round(vec, 3))))

    # Shape contracts: empty input still yields a full-length vector; the matrix has one row per text.
    assert stylometric_features("").shape == (N_FEATURES,)
    assert stylometric_matrix(list(samples.values())).shape == (2, N_FEATURES)
    print("stylometric self-test passed")