fjrmhri committed on
Commit
7c137fc
·
verified ·
1 Parent(s): 7a31aea
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. app.py +354 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the hoax sentence analyzer API (FastAPI served by uvicorn).
FROM python:3.11-slim

# Skip .pyc files, flush stdout/stderr immediately, and disable the pip cache.
# HOME and HF_HOME point at a directory owned by the unprivileged user below so
# that models downloaded at startup have a writable cache location.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

# Dedicated non-root account (uid 1000) instead of running the server as root.
RUN useradd --create-home --uid 1000 user

WORKDIR /app

# Install dependencies before copying the app so this layer is reused when
# only app.py changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --upgrade pip && pip install -r /app/requirements.txt

COPY --chown=user:user app.py /app/app.py

USER user

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional
6
+
7
+ import torch
8
+ from fastapi import FastAPI, HTTPException
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel, Field
11
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
12
+
13
+
14
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("hoax-backend")

# --- Settings read once from the environment at import time -----------------
MODEL_ID = os.environ.get("MODEL_ID", "fjrmhri/Deteksi_Hoax_TA")
NER_MODEL_ID = os.environ.get("NER_MODEL_ID", "cahya/bert-base-indonesian-NER")
HF_TOKEN = os.environ.get("HF_TOKEN")
FRONTEND_ORIGIN = os.environ.get("FRONTEND_ORIGIN")
ORANGE_THRESHOLD = float(os.environ.get("ORANGE_THRESHOLD", "0.65"))
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "256"))
MAX_INPUT_CHARS = int(os.environ.get("MAX_INPUT_CHARS", "50000"))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "16"))

# Local fallback model directory; by default a sibling of this file's parent
# directory, overridable through LOCAL_MODEL_PATH.
_DEFAULT_LOCAL_MODEL_DIR = (
    Path(__file__).resolve().parents[1] / "indobert_hoax_ner_model_final"
)
LOCAL_MODEL_PATH = Path(
    os.environ.get("LOCAL_MODEL_PATH", str(_DEFAULT_LOCAL_MODEL_DIR))
)

# Two or more consecutive line breaks separate paragraphs.
PARAGRAPH_SPLIT_RE = re.compile(r"(?:\r?\n){2,}")
# A run of non-terminator characters followed by terminators (and optional
# closing quotes/brackets), or a trailing run without any terminator.
SENTENCE_RE = re.compile(r"[^.!?]+(?:[.!?]+(?:[\"”’)\]]+)?)|[^.!?]+$")

# The mapping is pinned explicitly to follow the notebook:
# class 0 -> Fakta, class 1 -> Hoaks
LABEL_MAP = {0: "Fakta", 1: "Hoaks"}

# Populated later: the classifier by _load_classifier(), the NER pipeline
# lazily on first use.
CLASSIFIER_TOKENIZER = None
CLASSIFIER_MODEL = None
NER_PIPELINE = None
MODEL_SOURCE = "hub"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
+
46
+
47
class AnalyzeRequest(BaseModel):
    """Request body accepted by the ``/analyze`` endpoint."""

    # Raw text to analyze; may contain several paragraphs. Required.
    text: str = Field(default=..., description="Teks input multi paragraf.")
    # When true (the default), named-entity recognition is run on the text.
    include_ner: bool = Field(default=True, description="Jalankan NER jika true.")
    # Optional per-request override of the "orange" confidence threshold.
    confidence_orange_threshold: Optional[float] = Field(
        default=None,
        description="Threshold untuk warna oranye.",
    )
53
+
54
+
55
app = FastAPI(title="Hoax Sentence Analyzer API", version="1.0.0")

# Restrict CORS to the configured frontend origin when one is set; otherwise
# accept requests from any origin.
allowed_origins = [FRONTEND_ORIGIN] if FRONTEND_ORIGIN else ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
69
+
70
+
71
+ def _float(value: float) -> float:
72
+ return round(float(value), 6)
73
+
74
+
75
def _hf_auth_kwargs() -> Dict:
    """Return keyword arguments carrying the Hugging Face token, if one is set.

    Returns:
        ``{"token": HF_TOKEN}`` when ``HF_TOKEN`` is non-empty, else ``{}``.
    """
    return {"token": HF_TOKEN} if HF_TOKEN else {}
80
+
81
+
82
def _load_classifier() -> None:
    """Load the classifier tokenizer and model into the module globals.

    The Hub model ``MODEL_ID`` is tried first (with the HF token when set).
    If that fails for any reason, the failure is logged and the model is
    loaded from ``LOCAL_MODEL_PATH`` using local files only. The model is then
    moved to ``DEVICE`` and switched to eval mode.

    Raises:
        RuntimeError: If the Hub load failed and ``LOCAL_MODEL_PATH`` does not
            exist; the original Hub exception is chained as the cause.
    """
    global CLASSIFIER_MODEL, CLASSIFIER_TOKENIZER, MODEL_SOURCE

    hub_kwargs = _hf_auth_kwargs()
    try:
        LOGGER.info("Loading classifier from Hub: %s", MODEL_ID)
        CLASSIFIER_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, **hub_kwargs)
        CLASSIFIER_MODEL = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID, use_safetensors=True, **hub_kwargs
        )
        MODEL_SOURCE = "hub"
    except Exception as hub_exc:
        # Startup boundary: any Hub failure triggers the local fallback.
        LOGGER.warning("Hub load failed: %s", hub_exc)
        if not LOCAL_MODEL_PATH.exists():
            raise RuntimeError(
                f"Model Hub gagal dan fallback lokal tidak ditemukan: {LOCAL_MODEL_PATH}"
            ) from hub_exc

        LOGGER.info("Fallback loading local model from: %s", LOCAL_MODEL_PATH)
        local_dir = str(LOCAL_MODEL_PATH)
        CLASSIFIER_TOKENIZER = AutoTokenizer.from_pretrained(
            local_dir, local_files_only=True
        )
        CLASSIFIER_MODEL = AutoModelForSequenceClassification.from_pretrained(
            local_dir, local_files_only=True, use_safetensors=True
        )
        MODEL_SOURCE = "local"

    CLASSIFIER_MODEL.to(DEVICE)
    CLASSIFIER_MODEL.eval()
    LOGGER.info("Classifier ready on device=%s source=%s", DEVICE, MODEL_SOURCE)
117
+
118
+
119
def _get_ner_pipeline():
    """Return the shared NER pipeline, creating it on first use.

    The pipeline is built for ``NER_MODEL_ID`` with the "simple" aggregation
    strategy on the CPU (``device=-1``) and cached in ``NER_PIPELINE``.
    """
    global NER_PIPELINE
    if NER_PIPELINE is not None:
        return NER_PIPELINE

    auth = _hf_auth_kwargs()
    NER_PIPELINE = pipeline(
        "ner",
        model=NER_MODEL_ID,
        aggregation_strategy="simple",
        device=-1,
        **auth,
    )
    return NER_PIPELINE
131
+
132
+
133
def _split_paragraphs(text: str) -> List[str]:
    """Split *text* into non-empty, stripped paragraphs.

    Paragraphs are separated by ``PARAGRAPH_SPLIT_RE``. If splitting yields
    nothing but the stripped text is non-empty, that text is returned as the
    only paragraph; blank input yields an empty list.
    """
    stripped = text.strip()

    pieces: List[str] = []
    for chunk in PARAGRAPH_SPLIT_RE.split(stripped):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)

    if not pieces and stripped:
        pieces.append(stripped)
    return pieces
139
+
140
+
141
def _split_sentences(paragraph: str) -> List[str]:
    """Split one paragraph into sentences.

    Whitespace runs are collapsed to single spaces first. ``SENTENCE_RE`` then
    yields candidate fragments, each ending at a run of ``.``, ``!`` or ``?``.
    A terminator only counts as a sentence boundary when it is followed by
    whitespace (or the end of the text): a fragment that starts directly
    after the previous one, with no whitespace in between, is merged back
    into it. This keeps decimals ("3.5"), dotted thousands separators
    ("Rp 1.000.000"), domain names and dotted abbreviations ("U.S.A.")
    inside a single sentence instead of cutting them into pieces.

    Args:
        paragraph: Raw paragraph text.

    Returns:
        The stripped, non-empty sentences in order. If no fragment is found,
        the normalized paragraph is returned as the only sentence. Blank
        input yields an empty list.
    """
    normalized = re.sub(r"\s+", " ", paragraph).strip()
    if not normalized:
        return []

    # Each span is [start, end) into `normalized`; extending the previous
    # span (rather than joining fragment strings) also keeps any characters
    # the pattern skipped between two merged fragments.
    spans: List[List[int]] = []
    for match in SENTENCE_RE.finditer(normalized):
        continues_previous = bool(spans) and not normalized[match.start()].isspace()
        if continues_previous:
            spans[-1][1] = match.end()
        else:
            spans.append([match.start(), match.end()])

    sentences = [normalized[start:end].strip() for start, end in spans]
    sentences = [sentence for sentence in sentences if sentence]
    return sentences or [normalized]
151
+
152
+
153
+ def _pick_color(label: str, confidence: float, orange_threshold: float) -> str:
154
+ if confidence < orange_threshold:
155
+ return "orange"
156
+ if label == "Hoaks":
157
+ return "red"
158
+ return "green"
159
+
160
+
161
def _classify_sentences(sentences: List[str], orange_threshold: float) -> List[Dict]:
    """Classify each sentence as "Fakta" or "Hoaks".

    Sentences are tokenized and scored in batches of ``BATCH_SIZE`` with
    truncation at ``MAX_LENGTH`` tokens. Index 0 of the softmax output is
    read as the "Fakta" probability and index 1 as the "Hoaks" probability;
    ties go to "Hoaks".

    Args:
        sentences: Sentences to classify.
        orange_threshold: Confidence below which a sentence is colored orange.

    Returns:
        One dict per sentence, in input order, with the keys ``text``,
        ``label``, ``prob_hoax``, ``prob_fakta``, ``confidence`` and
        ``color``. Probabilities are rounded to six decimals; the color is
        chosen from the unrounded confidence.
    """
    if not sentences:
        return []

    classified: List[Dict] = []
    with torch.inference_mode():
        for offset in range(0, len(sentences), BATCH_SIZE):
            chunk = sentences[offset : offset + BATCH_SIZE]
            inputs = CLASSIFIER_TOKENIZER(
                chunk,
                truncation=True,
                max_length=MAX_LENGTH,
                padding=True,
                return_tensors="pt",
            )
            inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}

            outputs = CLASSIFIER_MODEL(**inputs)
            batch_probs = torch.softmax(outputs.logits, dim=-1).detach().cpu().tolist()

            for sentence, row in zip(chunk, batch_probs):
                prob_fakta = float(row[0])
                prob_hoax = float(row[1])
                label = LABEL_MAP[1 if prob_hoax >= prob_fakta else 0]
                confidence = max(prob_hoax, prob_fakta)

                classified.append(
                    {
                        "text": sentence,
                        "label": label,
                        "prob_hoax": _float(prob_hoax),
                        "prob_fakta": _float(prob_fakta),
                        "confidence": _float(confidence),
                        "color": _pick_color(label, confidence, orange_threshold),
                    }
                )
    return classified
200
+
201
+
202
def _extract_entities(text: str) -> List[Dict]:
    """Run NER over *text* and return the unique entities found.

    The text is fed to the NER pipeline one sentence at a time (using the
    same paragraph/sentence splitting as the classifier) rather than as one
    document. A single pipeline call is bounded by the NER model's maximum
    sequence length, so passing a long document in one call risks entities
    beyond that limit being dropped or the call failing outright.

    Args:
        text: Input text, possibly spanning several paragraphs.

    Returns:
        Dicts with ``text``, ``entity_group`` and ``score`` (rounded to six
        decimals), in order of first appearance. Entities are deduplicated
        case-insensitively on the pair (text, entity group) across the whole
        input; the first occurrence is kept.

    Raises:
        RuntimeError: If loading or running the NER pipeline fails; the
            original exception is chained as the cause.
    """
    chunks: List[str] = []
    for paragraph in _split_paragraphs(text):
        chunks.extend(_split_sentences(paragraph))

    try:
        ner = _get_ner_pipeline()
        raw_entities: List[Dict] = []
        for chunk in chunks:
            raw_entities.extend(ner(chunk))
    except Exception as exc:
        # Boundary around third-party model code: surface one error type.
        raise RuntimeError(f"Gagal menjalankan NER: {exc}") from exc

    entities: List[Dict] = []
    seen = set()
    for ent in raw_entities:
        ent_text = str(ent.get("word", "")).strip()
        entity_group = str(ent.get("entity_group", "")).strip()
        if not ent_text or not entity_group:
            continue

        key = (ent_text.lower(), entity_group)
        if key in seen:
            continue
        seen.add(key)

        entities.append(
            {
                "text": ent_text,
                "entity_group": entity_group,
                "score": _float(float(ent.get("score", 0.0))),
            }
        )
    return entities
228
+
229
+
230
@app.on_event("startup")
def startup_event() -> None:
    """Load the classifier once when the application starts."""
    # NOTE(review): newer FastAPI releases favour lifespan handlers over
    # on_event - confirm before upgrading.
    _load_classifier()
233
+
234
+
235
@app.get("/health")
def health() -> Dict[str, str]:
    """Liveness probe: always reports ``{"status": "ok"}``."""
    status: Dict[str, str] = {"status": "ok"}
    return status
238
+
239
+
240
@app.post("/analyze")
def analyze(payload: AnalyzeRequest) -> Dict:
    """Classify every sentence of the submitted text and optionally run NER.

    The text is split into paragraphs and sentences, each sentence is
    classified, and per-paragraph and overall summaries are built.

    Args:
        payload: Request body with the text, the NER switch and an optional
            override of the orange confidence threshold.

    Returns:
        A dict with the sections ``model``, ``summary``, ``paragraphs`` and
        ``ner``.

    Raises:
        HTTPException: 503 if the classifier is not loaded or NER fails,
            400 if the text is blank or the threshold is outside 0..1,
            413 if the text exceeds ``MAX_INPUT_CHARS`` characters.
    """
    if CLASSIFIER_MODEL is None or CLASSIFIER_TOKENIZER is None:
        raise HTTPException(status_code=503, detail="Model classifier belum siap.")

    text = payload.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="Field 'text' tidak boleh kosong.")
    if len(text) > MAX_INPUT_CHARS:
        raise HTTPException(
            status_code=413,
            detail=f"Input terlalu panjang ({len(text)} chars). Maksimum {MAX_INPUT_CHARS} chars.",
        )

    threshold = (
        ORANGE_THRESHOLD
        if payload.confidence_orange_threshold is None
        else payload.confidence_orange_threshold
    )
    if not (0.0 <= threshold <= 1.0):
        raise HTTPException(
            status_code=400,
            detail="confidence_orange_threshold harus berada pada rentang 0 sampai 1.",
        )

    paragraph_responses = []
    total_sentences = 0
    total_hoax = 0
    total_fakta = 0
    total_low_conf = 0

    for p_idx, paragraph_text in enumerate(_split_paragraphs(text)):
        classified = _classify_sentences(_split_sentences(paragraph_text), threshold)

        sentence_items = [
            {
                "sentence_index": s_idx,
                "text": item["text"],
                "label": item["label"],
                "prob_hoax": item["prob_hoax"],
                "prob_fakta": item["prob_fakta"],
                "confidence": item["confidence"],
                "color": item["color"],
            }
            for s_idx, item in enumerate(classified)
        ]

        paragraph_hoax = sum(1 for item in classified if item["label"] == "Hoaks")
        paragraph_fakta = len(classified) - paragraph_hoax
        # Count low-confidence sentences through their color: the color was
        # chosen from the unrounded confidence, whereas item["confidence"] is
        # rounded and could land exactly on the threshold, making the summary
        # disagree with the number of orange sentences.
        paragraph_low = sum(1 for item in classified if item["color"] == "orange")

        conf_values: List[float] = [item["confidence"] for item in classified]
        hoax_probs: List[float] = [item["prob_hoax"] for item in classified]

        paragraph_summary = {
            "hoax_sentences": paragraph_hoax,
            "fakta_sentences": paragraph_fakta,
            "avg_confidence": _float(sum(conf_values) / len(conf_values)) if conf_values else 0.0,
            "max_hoax_prob": _float(max(hoax_probs)) if hoax_probs else 0.0,
        }

        paragraph_responses.append(
            {
                "paragraph_index": p_idx,
                "sentences": sentence_items,
                "paragraph_summary": paragraph_summary,
            }
        )

        total_sentences += len(sentence_items)
        total_hoax += paragraph_hoax
        total_fakta += paragraph_fakta
        total_low_conf += paragraph_low

    entities = []
    if payload.include_ner:
        try:
            entities = _extract_entities(text)
        except RuntimeError as exc:
            raise HTTPException(status_code=503, detail=str(exc)) from exc

    return {
        "model": {
            "source": MODEL_SOURCE,
            "model_id": MODEL_ID,
            "max_length": MAX_LENGTH,
        },
        "summary": {
            "num_paragraphs": len(paragraph_responses),
            "num_sentences": total_sentences,
            "hoax_sentences": total_hoax,
            "fakta_sentences": total_fakta,
            "low_conf_sentences": total_low_conf,
        },
        "paragraphs": paragraph_responses,
        "ner": {
            "enabled": payload.include_ner,
            "model_id": NER_MODEL_ID,
            "entities": entities,
        },
    }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0,<1.0.0
2
+ uvicorn[standard]>=0.30.0,<1.0.0
3
+ torch>=2.2.0
4
+ transformers>=4.45.0
5
+ huggingface_hub>=0.26.0
6
+ safetensors>=0.4.0