"""Text-classification helpers built around a ModernBERT sequence classifier.

Contains a loader for a stock ``text-classification`` pipeline and a custom
pipeline that turns English Wikipedia page titles into "<title> - <description>"
strings (via the Wikipedia REST summary endpoint) before classifying them.
"""

import csv
import logging
from urllib.parse import quote

import evaluate
import numpy as np
import requests
import torch
import torch.nn as nn
import tqdm
from datasets import Dataset, Features, IterableDataset, Value
from huggingface_hub import HfApi, HfFolder, ModelCard, ModelCardData
from more_itertools import chunked
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    ModernBertForSequenceClassification,
    Pipeline,
    Trainer,
    TrainingArguments,
    pipeline,
)

logger = logging.getLogger(__name__)

# Wikipedia REST endpoint returning a JSON summary for one page; the
# placeholder receives the percent-encoded page title.
_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}?redirect=true"
_USER_AGENT = "derenrich/enwiki-kid-friendly-classifier/0.1"
# Seconds to wait for Wikipedia before giving up on a lookup; without a
# timeout ``requests`` waits indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 10


def load_model(
    checkpoint="fa-nsfw-v0/checkpoint-18625",
    tokenizer_name="answerdotai/ModernBERT-base",
):
    """Build a stock ``text-classification`` pipeline from a checkpoint.

    Args:
        checkpoint: Path or hub id passed to
            ``ModernBertForSequenceClassification.from_pretrained``.
        tokenizer_name: Path or hub id passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The object returned by ``transformers.pipeline("text-classification", ...)``.
        Note this is the built-in pipeline, not
        ``WikipediaKidFriendlyClassificationPipeline``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    model = ModernBertForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)


def cut_desc(desc, max_len=256):
    """Truncate *desc* to *max_len* characters, marking the cut with "...".

    Args:
        desc: Text to shorten.
        max_len: Maximum number of characters kept before the ellipsis.

    Returns:
        ``desc`` unchanged when it has at most ``max_len`` characters, otherwise
        its first ``max_len`` characters followed by ``"..."``.
    """
    if len(desc) > max_len:
        return desc[:max_len] + "..."
    return desc


def _fetch_page_summary(title, timeout=_REQUEST_TIMEOUT):
    """Look up one Wikipedia page and return its ``(title, description)`` pair.

    The title is expected raw (not already percent-encoded). Any failure -
    transport error, non-200 status, undecodable body - falls back to
    ``(title, None)`` so the caller can still classify on the title alone.
    """
    # safe="" also encodes "/" so titles such as "AC/DC" stay a single path
    # segment, and "?"/"#" cannot truncate the path.
    url = _SUMMARY_URL.format(quote(title, safe=""))
    logger.debug("Fetching %s", url)
    try:
        response = requests.get(
            url,
            headers={"User-Agent": _USER_AGENT},
            timeout=timeout,
        )
        if response.status_code != 200:
            logger.warning(
                "Wikipedia summary lookup for %r returned %s: %s",
                title,
                response.status_code,
                response.text,
            )
            return title, None
        payload = response.json()
    except (requests.RequestException, ValueError) as err:
        logger.warning("Wikipedia summary lookup for %r failed: %s", title, err)
        return title, None
    return (
        payload.get("title", "Unknown Title"),
        payload.get("description", "No Description"),
    )


class WikipediaKidFriendlyClassificationPipeline(Pipeline):
    """Classify Wikipedia pages from their title and short description.

    Input is a page title (or a list of titles); each title is resolved through
    the Wikipedia REST summary endpoint and the resulting
    "<title> - <description>" text is fed to the model.
    """

    def _sanitize_parameters(self, **kwargs):
        """Return empty parameter dicts: this pipeline takes no call-time options."""
        return {}, {}, {}

    def preprocess(self, inputs, **kwargs):
        """Resolve page titles to text and tokenize them.

        Args:
            inputs: A single page title or an iterable of page titles. Titles
                must be raw text, not already percent-encoded.

        Returns:
            The tokenizer output for one "<title> - <description>" string per
            title, padded and truncated, as ``self.framework`` tensors.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        pages = [_fetch_page_summary(title) for title in inputs]
        # A missing description (failed lookup) contributes an empty string.
        texts = [
            cut_desc(title) + " - " + cut_desc(desc or "")
            for title, desc in pages
        ]
        logger.debug("Classifier input texts: %s", texts)
        return self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            return_tensors=self.framework,
        )

    def _forward(self, model_inputs):
        """Run the model on tokenized inputs."""
        # NOTE(review): nothing in this file produces an "error" key; the guard
        # is kept in case another preprocessing path does - confirm or remove.
        if "error" in model_inputs:
            return model_inputs
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        """Convert logits to a ``{label: probability}`` dict.

        Returns:
            A dict mapping each label from ``self.model.config.id2label`` to its
            softmax probability, computed from the first row of the logits.
        """
        if "error" in model_outputs:
            return model_outputs
        # Only row 0 is reported; if several titles were tokenized together the
        # remaining rows are dropped.
        # NOTE(review): confirm callers hand preprocess one title per call.
        probabilities = model_outputs.logits.softmax(dim=-1)[0]
        id2label = self.model.config.id2label
        return {
            id2label[index]: prob.item()
            for index, prob in enumerate(probabilities)
        }