# MohsinEli's picture
# Add PIRD app + trained checkpoint
# bd743a9
# Raw
# History Blame
# 1.6 kB
"""Perplexity baseline detector. Low perplexity under a proxy LM => more likely AI.
This is the simplest zero-shot baseline and the one whose paraphrase-fragility / non-native-bias
coupling we demonstrated in Phase 0. Score = -perplexity (higher = more AI)."""
from __future__ import annotations
import math
import numpy as np
from .base import Detector
class PerplexityDetector(Detector):
    """Zero-shot detector that scores text by its perplexity under a causal LM.

    Lower perplexity under the proxy model is treated as more likely
    AI-generated, so :meth:`score` returns the negated perplexity
    (higher score = more AI).
    """

    name = "perplexity"

    def __init__(self, model_name: str = "gpt2", device: str | None = None, max_tokens: int = 512):
        """Load the proxy tokenizer and language model.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the causal LM.
            device: Device the model and inputs are moved to. When falsy,
                ``"cuda"`` is used if ``torch.cuda.is_available()``, else ``"cpu"``.
            max_tokens: Truncation length handed to the tokenizer as ``max_length``.
        """
        # Heavy dependencies are imported on construction rather than at
        # module import time.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # Keep a handle on torch so other methods can use it without re-importing.
        self.torch = torch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.max_tokens = max_tokens
        self.tok = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device).eval()

    def perplexity(self, text: str) -> float:
        """Return the perplexity of ``text`` under the proxy model.

        The text is truncated to ``self.max_tokens`` tokens before scoring.

        Returns:
            ``exp(loss)`` of the model on the tokenized text, or NaN when the
            text is empty or whitespace-only, tokenizes to fewer than two
            tokens, or the resulting perplexity is not a finite float.
        """
        if not text or not text.strip():
            return float("nan")

        ids = self.tok(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_tokens,
        ).input_ids.to(self.device)

        # With labels equal to the inputs the loss is over next-token
        # predictions, so a single token leaves nothing to predict.
        if ids.size(1) < 2:
            return float("nan")

        with self.torch.no_grad():
            loss = self.model(ids, labels=ids).loss

        try:
            ppl = math.exp(loss.item())
        except OverflowError:
            # math.exp raises instead of returning inf for very large finite
            # losses; treat that the same as any other non-finite perplexity.
            return float("nan")
        return ppl if math.isfinite(ppl) else float("nan")

    def score(self, texts: list[str]) -> np.ndarray:
        """Return one detector score per input text as a float array.

        Each score is the negated perplexity, so higher means more likely AI.
        Texts whose perplexity is NaN yield NaN scores.
        """
        # higher score = more AI => negative perplexity
        return np.array([-self.perplexity(t) for t in texts], dtype=float)