import logging
import os
import re
import threading
from pathlib import Path
from typing import Dict, List, Optional

import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("hoax-backend")

# Runtime configuration; every value can be overridden through the environment.
MODEL_ID = os.getenv("MODEL_ID", "fjrmhri/Deteksi_Hoax_TA")
NER_MODEL_ID = os.getenv("NER_MODEL_ID", "cahya/bert-base-indonesian-NER")
HF_TOKEN = os.getenv("HF_TOKEN")
FRONTEND_ORIGIN = os.getenv("FRONTEND_ORIGIN")
ORANGE_THRESHOLD = float(os.getenv("ORANGE_THRESHOLD", "0.65"))
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "256"))
MAX_INPUT_CHARS = int(os.getenv("MAX_INPUT_CHARS", "50000"))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "16"))
LOCAL_MODEL_PATH = Path(
    os.getenv(
        "LOCAL_MODEL_PATH",
        str((Path(__file__).resolve().parents[1] / "indobert_hoax_ner_model_final")),
    )
)

# A paragraph break is two or more consecutive line breaks. Spaces/tabs left on
# an otherwise blank line are tolerated so "visually empty" lines still split.
PARAGRAPH_SPLIT_RE = re.compile(r"(?:\r?\n[ \t]*){2,}")

# One match per sentence. A run of . ! ? (plus optional closing quotes or
# brackets) ends a sentence only when followed by whitespace or end of text, so
# "Rp 1.500.000", "10.30 WIB", "3.5" and "example.com" stay inside their
# sentence. The lookbehind lets a punctuation run be tried only from its first
# character, which keeps matching linear on long runs such as ".....".
SENTENCE_RE = re.compile(
    r".+?(?:(?<![.!?])[.!?]+[\"”’)\]]*(?=\s|$)|$)",
    re.DOTALL,
)

# The mapping is pinned explicitly to follow the notebook:
# class 0 -> Fakta, class 1 -> Hoaks
LABEL_MAP = {0: "Fakta", 1: "Hoaks"}

# Populated by _load_classifier() / _get_ner_pipeline().
CLASSIFIER_TOKENIZER = None
CLASSIFIER_MODEL = None
NER_PIPELINE = None
MODEL_SOURCE = "hub"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Serialises the lazy construction of NER_PIPELINE across worker threads.
_NER_PIPELINE_LOCK = threading.Lock()


# Request body of POST /analyze.
# (Documented with comments rather than a class docstring so the JSON schema
# published for this model stays exactly as before.)
class AnalyzeRequest(BaseModel):
    text: str = Field(..., description="Teks input multi paragraf.")
    include_ner: bool = Field(True, description="Jalankan NER jika true.")
    confidence_orange_threshold: Optional[float] = Field(
        None, description="Threshold untuk warna oranye."
    )


app = FastAPI(title="Hoax Sentence Analyzer API", version="1.0.0")

# Restrict CORS to the configured frontend when one is set; otherwise allow any
# origin (credentials stay disabled in both cases).
if FRONTEND_ORIGIN:
    allowed_origins = [FRONTEND_ORIGIN]
else:
    allowed_origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _float(value: float) -> float:
    """Return *value* as a built-in float rounded to six decimal places."""
    return round(float(value), 6)


def _hf_auth_kwargs() -> Dict:
    """Return ``{"token": HF_TOKEN}`` when a token is configured, else ``{}``."""
    kwargs: Dict = {}
    if HF_TOKEN:
        kwargs["token"] = HF_TOKEN
    return kwargs


def _load_classifier() -> None:
    """Load the classifier tokenizer and model into the module globals.

    The Hub model ``MODEL_ID`` is tried first. If that fails for any reason the
    copy under ``LOCAL_MODEL_PATH`` is loaded instead. The model is then moved
    to ``DEVICE`` and switched to eval mode.

    Raises:
        RuntimeError: the Hub load failed and ``LOCAL_MODEL_PATH`` does not
            exist. Errors raised while loading the local copy propagate as-is.
    """
    global CLASSIFIER_MODEL, CLASSIFIER_TOKENIZER, MODEL_SOURCE

    auth_kwargs = _hf_auth_kwargs()
    try:
        LOGGER.info("Loading classifier from Hub: %s", MODEL_ID)
        CLASSIFIER_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, **auth_kwargs)
        CLASSIFIER_MODEL = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            use_safetensors=True,
            **auth_kwargs,
        )
        MODEL_SOURCE = "hub"
    except Exception as hub_exc:
        # Deliberately broad: any Hub failure should trigger the local fallback.
        LOGGER.warning("Hub load failed: %s", hub_exc)
        if not LOCAL_MODEL_PATH.exists():
            raise RuntimeError(
                f"Model Hub gagal dan fallback lokal tidak ditemukan: {LOCAL_MODEL_PATH}"
            ) from hub_exc
        LOGGER.info("Fallback loading local model from: %s", LOCAL_MODEL_PATH)
        CLASSIFIER_TOKENIZER = AutoTokenizer.from_pretrained(
            str(LOCAL_MODEL_PATH),
            local_files_only=True,
        )
        CLASSIFIER_MODEL = AutoModelForSequenceClassification.from_pretrained(
            str(LOCAL_MODEL_PATH),
            local_files_only=True,
            use_safetensors=True,
        )
        MODEL_SOURCE = "local"

    CLASSIFIER_MODEL.to(DEVICE)
    CLASSIFIER_MODEL.eval()
    LOGGER.info("Classifier ready on device=%s source=%s", DEVICE, MODEL_SOURCE)


def _get_ner_pipeline():
    """Return the shared NER pipeline, building it on first use.

    Construction is guarded by a lock and re-checked inside it, so threads that
    ask for the pipeline at the same time build it once instead of each loading
    their own copy of the model. The pipeline is created with ``device=-1``.
    """
    global NER_PIPELINE

    if NER_PIPELINE is None:
        with _NER_PIPELINE_LOCK:
            # Another thread may have finished building while we waited.
            if NER_PIPELINE is None:
                NER_PIPELINE = pipeline(
                    "ner",
                    model=NER_MODEL_ID,
                    aggregation_strategy="simple",
                    device=-1,
                    **_hf_auth_kwargs(),
                )
    return NER_PIPELINE


def _split_paragraphs(text: str) -> List[str]:
    """Split *text* on blank lines into stripped, non-empty paragraphs.

    Returns an empty list when *text* is empty or whitespace only.
    """
    stripped_parts = (part.strip() for part in PARAGRAPH_SPLIT_RE.split(text.strip()))
    return [part for part in stripped_parts if part]
def _split_sentences(paragraph: str) -> List[str]:
    """Split one paragraph into stripped, non-empty sentences.

    Whitespace runs are first collapsed to single spaces. If ``SENTENCE_RE``
    yields no usable match, the whole normalized paragraph is returned as a
    single sentence. Returns an empty list for a blank paragraph.
    """
    normalized = re.sub(r"\s+", " ", paragraph).strip()
    if not normalized:
        return []

    candidates = (m.group(0).strip() for m in SENTENCE_RE.finditer(normalized))
    sentences = [s for s in candidates if s]
    return sentences or [normalized]


def _pick_color(label: str, confidence: float, orange_threshold: float) -> str:
    """Map a prediction to a display color.

    ``"orange"`` when *confidence* is below *orange_threshold*; otherwise
    ``"red"`` for label ``"Hoaks"`` and ``"green"`` for anything else.
    """
    if confidence < orange_threshold:
        return "orange"
    return "red" if label == "Hoaks" else "green"


def _classify_sentences(sentences: List[str], orange_threshold: float) -> List[Dict]:
    """Classify *sentences* in batches of ``BATCH_SIZE``.

    Returns one dict per input sentence, in input order, with keys ``text``,
    ``label``, ``prob_hoax``, ``prob_fakta``, ``confidence`` and ``color``.
    Probabilities are rounded for output; the color is chosen from the
    unrounded confidence.
    """
    if not sentences:
        return []

    results: List[Dict] = []
    with torch.inference_mode():
        for start_idx in range(0, len(sentences), BATCH_SIZE):
            batch = sentences[start_idx : start_idx + BATCH_SIZE]
            encoded = CLASSIFIER_TOKENIZER(
                batch,
                truncation=True,
                max_length=MAX_LENGTH,
                padding=True,
                return_tensors="pt",
            )
            encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
            logits = CLASSIFIER_MODEL(**encoded).logits
            probs = torch.softmax(logits, dim=-1).detach().cpu().tolist()

            for sentence, row in zip(batch, probs):
                # Index 0 / 1 follow LABEL_MAP (0 = Fakta, 1 = Hoaks).
                prob_fakta = float(row[0])
                prob_hoax = float(row[1])
                # Ties are resolved in favour of "Hoaks".
                pred_id = 1 if prob_hoax >= prob_fakta else 0
                label = LABEL_MAP[pred_id]
                confidence = max(prob_hoax, prob_fakta)
                results.append(
                    {
                        "text": sentence,
                        "label": label,
                        "prob_hoax": _float(prob_hoax),
                        "prob_fakta": _float(prob_fakta),
                        "confidence": _float(confidence),
                        "color": _pick_color(label, confidence, orange_threshold),
                    }
                )
    return results


def _extract_entities(text: str) -> List[Dict]:
    """Run NER over *text* and return de-duplicated entities.

    Entities are de-duplicated on (lower-cased text, entity group); the first
    occurrence wins. Entries with an empty text or group are dropped.

    Raises:
        RuntimeError: building or running the NER pipeline failed.
    """
    # NOTE(review): the whole request text (up to MAX_INPUT_CHARS) goes through
    # the pipeline in a single call. Confirm how the pipeline treats input
    # longer than the NER model's maximum sequence length (see its `stride`
    # option) - entities late in a long document may be missed.
    try:
        ner = _get_ner_pipeline()
        raw_entities = ner(text)
    except Exception as exc:
        # Broad on purpose: any pipeline failure is reported the same way.
        raise RuntimeError(f"Gagal menjalankan NER: {exc}") from exc

    entities: List[Dict] = []
    seen = set()
    for ent in raw_entities:
        ent_text = str(ent.get("word", "")).strip()
        entity_group = str(ent.get("entity_group", "")).strip()
        score = float(ent.get("score", 0.0))
        key = (ent_text.lower(), entity_group)
        if not ent_text or not entity_group or key in seen:
            continue
        seen.add(key)
        entities.append(
            {
                "text": ent_text,
                "entity_group": entity_group,
                "score": _float(score),
            }
        )
    return entities


@app.on_event("startup")
def startup_event() -> None:
    """Load the classifier once when the application starts."""
    _load_classifier()


# Liveness probe: always answers {"status": "ok"}.
# (Route handlers are documented with comments rather than docstrings so the
# generated OpenAPI descriptions stay exactly as before.)
@app.get("/health")
def health() -> Dict[str, str]:
    return {"status": "ok"}


def _analyze_paragraph(p_idx: int, paragraph_text: str, threshold: float) -> Dict:
    """Classify every sentence of one paragraph and build its response entry."""
    classified = _classify_sentences(_split_sentences(paragraph_text), threshold)

    sentence_items = [
        {
            "sentence_index": s_idx,
            "text": item["text"],
            "label": item["label"],
            "prob_hoax": item["prob_hoax"],
            "prob_fakta": item["prob_fakta"],
            "confidence": item["confidence"],
            "color": item["color"],
        }
        for s_idx, item in enumerate(classified)
    ]

    hoax_count = sum(1 for item in classified if item["label"] == "Hoaks")
    conf_values = [item["confidence"] for item in classified]
    hoax_probs = [item["prob_hoax"] for item in classified]

    return {
        "paragraph_index": p_idx,
        "sentences": sentence_items,
        "paragraph_summary": {
            "hoax_sentences": hoax_count,
            "fakta_sentences": len(classified) - hoax_count,
            "avg_confidence": (
                _float(sum(conf_values) / len(conf_values)) if conf_values else 0.0
            ),
            "max_hoax_prob": _float(max(hoax_probs)) if hoax_probs else 0.0,
        },
    }


def _count_low_confidence(paragraph_response: Dict) -> int:
    """Count sentences flagged orange, i.e. below the confidence threshold."""
    # Counting by color (decided on the unrounded confidence) keeps this total
    # in step with what the client renders; comparing the rounded confidence
    # again could disagree right at the threshold.
    return sum(1 for s in paragraph_response["sentences"] if s["color"] == "orange")


# POST /analyze: split the text into paragraphs and sentences, classify each
# sentence, optionally run NER over the whole text, and return per-paragraph
# results plus overall totals.
#
# Responds with:
#   503 - classifier not loaded, or NER failed
#   400 - empty text, or threshold outside [0, 1]
#   413 - text longer than MAX_INPUT_CHARS
@app.post("/analyze")
def analyze(payload: AnalyzeRequest) -> Dict:
    if CLASSIFIER_MODEL is None or CLASSIFIER_TOKENIZER is None:
        raise HTTPException(status_code=503, detail="Model classifier belum siap.")

    text = payload.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="Field 'text' tidak boleh kosong.")
    if len(text) > MAX_INPUT_CHARS:
        raise HTTPException(
            status_code=413,
            detail=f"Input terlalu panjang ({len(text)} chars). Maksimum {MAX_INPUT_CHARS} chars.",
        )

    # A per-request threshold overrides the configured default.
    threshold = (
        ORANGE_THRESHOLD
        if payload.confidence_orange_threshold is None
        else payload.confidence_orange_threshold
    )
    if not (0.0 <= threshold <= 1.0):
        raise HTTPException(
            status_code=400,
            detail="confidence_orange_threshold harus berada pada rentang 0 sampai 1.",
        )

    paragraph_responses = [
        _analyze_paragraph(p_idx, paragraph_text, threshold)
        for p_idx, paragraph_text in enumerate(_split_paragraphs(text))
    ]
    summaries = [p["paragraph_summary"] for p in paragraph_responses]

    entities = []
    if payload.include_ner:
        try:
            entities = _extract_entities(text)
        except RuntimeError as exc:
            raise HTTPException(status_code=503, detail=str(exc)) from exc

    return {
        "model": {
            "source": MODEL_SOURCE,
            "model_id": MODEL_ID,
            "max_length": MAX_LENGTH,
        },
        "summary": {
            "num_paragraphs": len(paragraph_responses),
            "num_sentences": sum(len(p["sentences"]) for p in paragraph_responses),
            "hoax_sentences": sum(s["hoax_sentences"] for s in summaries),
            "fakta_sentences": sum(s["fakta_sentences"] for s in summaries),
            "low_conf_sentences": sum(
                _count_low_confidence(p) for p in paragraph_responses
            ),
        },
        "paragraphs": paragraph_responses,
        "ner": {
            "enabled": payload.include_ner,
            "model_id": NER_MODEL_ID,
            "entities": entities,
        },
    }