| """Perplexity baseline detector. Low perplexity under a proxy LM => more likely AI. |
| |
| This is the simplest zero-shot baseline and the one whose paraphrase-fragility / non-native-bias |
| coupling we demonstrated in Phase 0. Score = -perplexity (higher = more AI).""" |
| from __future__ import annotations |
| import math |
| import numpy as np |
| from .base import Detector |
|
|
|
|
class PerplexityDetector(Detector):
    """Zero-shot detector that scores text by negated proxy-LM perplexity.

    Lower perplexity under the proxy language model is treated as more
    AI-like, so ``score`` returns ``-perplexity`` (higher = more AI).
    """

    # Identifier for this detector; presumably consumed by the Detector base
    # class or a registry elsewhere -- confirm against callers.
    name = "perplexity"

    def __init__(self, model_name: str = "gpt2", device: str | None = None, max_tokens: int = 512):
        """Load the proxy tokenizer and causal language model.

        Args:
            model_name: Name or path handed to ``from_pretrained`` for both
                the tokenizer and the model.
            device: Device the model is moved to. When falsy, "cuda" is used
                if ``torch.cuda.is_available()``, otherwise "cpu".
            max_tokens: Maximum number of tokens kept per text; longer inputs
                are truncated by the tokenizer.
        """
        # Imported here rather than at module level, so the heavy dependencies
        # are only loaded when a detector is actually constructed.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # Keep a handle on torch so other methods can use it without importing.
        self.torch = torch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.max_tokens = max_tokens
        self.tok = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device).eval()

    def perplexity(self, text: str) -> float:
        """Return the proxy-LM perplexity of ``text``.

        Args:
            text: The text to evaluate. Only the first ``max_tokens`` tokens
                are used.

        Returns:
            ``exp(loss)`` where ``loss`` is the model's language-modelling
            loss on the (truncated) token sequence, or NaN when the text is
            empty or whitespace-only, tokenizes to fewer than two tokens, or
            the resulting perplexity is not a finite float.
        """
        if not text or not text.strip():
            return float("nan")

        ids = self.tok(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_tokens,
        ).input_ids.to(self.device)

        # With fewer than two tokens there is no next-token target to score.
        if ids.size(1) < 2:
            return float("nan")

        with self.torch.no_grad():
            loss = self.model(ids, labels=ids).loss

        # math.exp raises OverflowError (rather than returning inf) for large
        # finite inputs; treat that like any other non-finite perplexity so a
        # single degenerate text cannot abort a whole scoring run.
        try:
            ppl = math.exp(loss.item())
        except OverflowError:
            return float("nan")
        return ppl if math.isfinite(ppl) else float("nan")

    def score(self, texts: list[str]) -> np.ndarray:
        """Score each text as ``-perplexity`` (higher = more AI).

        Args:
            texts: Texts to score, evaluated one at a time.

        Returns:
            A float array with one entry per input text. Texts whose
            perplexity is NaN yield NaN scores.
        """
        return np.array([-self.perplexity(t) for t in texts], dtype=float)
|
|