import html
import json
import os
import re
import time

import gradio as gr
import numpy as np
import torch
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForTokenClassification
# ---- CONFIGURATION ----
BERT_REPO = "cihatyldz/carsi-bert-turkish-ecommerce-ner"
LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"

# One row per entity type: (tag, highlight colour, Turkish display label).
# Both lookup tables below are derived from this single table so that they
# always share the same keys in the same order.
_ENTITY_SPECS = (
    ("PRODUCT", "#3B82F6", "Urun"),
    ("BRAND", "#8B5CF6", "Marka"),
    ("PRICE", "#10B981", "Fiyat"),
    ("COLOR", "#F59E0B", "Renk"),
    ("SIZE", "#EF4444", "Beden"),
    ("MATERIAL", "#06B6D4", "Materyal"),
    ("GENDER", "#EC4899", "Cinsiyet"),
)

# Entity tag -> hex colour used when highlighting that entity type.
ENTITY_COLORS = {tag: colour for tag, colour, _label in _ENTITY_SPECS}

# Entity tag -> Turkish label shown to the user.
ENTITY_LABELS_TR = {tag: label for tag, _colour, label in _ENTITY_SPECS}
# ---- BERT NER MODEL ----
# The tokenizer and the token-classification model are loaded once, at import
# time, from the repository named by BERT_REPO.
print("Loading BERT NER model...")
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_REPO)
bert_model = AutoModelForTokenClassification.from_pretrained(BERT_REPO)
bert_model.eval()  # the model is only ever used for inference in this file
# Label maps come straight from the model config (id -> tag and tag -> id).
id2label, label2id = bert_model.config.id2label, bert_model.config.label2id
print("BERT NER ready - {} labels: {}".format(len(id2label), list(id2label.values())))
# ---- LLM CLIENT ----
# HF_TOKEN is optional: when the variable is unset, None is passed as the token.
hf_token = os.environ.get("HF_TOKEN")
llm_client = InferenceClient(token=hf_token, model=LLM_MODEL)
print("LLM client ready - " + LLM_MODEL)
# ---- BERT PREDICTION ----
def predict_bert_ner(text):
    """Tag *text* with the BERT token classifier and merge tokens into entities.

    The text is tokenized (truncated to 256 tokens), run through
    ``bert_model`` without gradient tracking, and the per-token BIO labels
    are merged into character spans of the original string.

    Args:
        text: The raw input string.

    Returns:
        A ``(entities, latency_ms)`` tuple. ``entities`` is a list of dicts
        with the keys ``"text"``, ``"type"``, ``"start"`` and ``"end"``
        (character offsets into *text*, end exclusive); ``latency_ms`` is the
        elapsed time of the whole call in milliseconds.
    """
    # perf_counter is monotonic, so the latency cannot be skewed by wall-clock
    # adjustments the way time.time() can.
    started = time.perf_counter()
    encoded = bert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        return_offsets_mapping=True,
    )
    # The offsets are only needed here; they are removed from the encoding
    # before it is unpacked into the model call.
    offset_mapping = encoded.pop("offset_mapping")[0].tolist()
    with torch.no_grad():
        logits = bert_model(**encoded).logits
    predictions = torch.argmax(logits, dim=-1)[0].tolist()

    entities = []
    current = None
    for pred, (start_char, end_char) in zip(predictions, offset_mapping):
        # A (0, 0) offset covers no characters of the input -- presumably a
        # special token such as [CLS]/[SEP]; it carries no entity text.
        if start_char == 0 and end_char == 0:
            continue
        label = id2label[pred]

        continues_current = (
            current is not None
            and label.startswith("I-")
            and label[2:] == current["type"]
        )
        if continues_current:
            # Grow the open entity up to the end of this token.
            current["text"] = text[current["start"]:end_char]
            current["end"] = end_char
            continue

        # Any other label (B-*, a mismatching I-*, or a non-entity label)
        # closes whatever entity is currently open.
        if current is not None:
            entities.append(current)
            current = None

        if label.startswith("B-"):
            current = {
                "text": text[start_char:end_char],
                "type": label[2:],
                "start": start_char,
                "end": end_char,
            }

    if current is not None:
        entities.append(current)

    elapsed_ms = (time.perf_counter() - started) * 1000
    return entities, elapsed_ms
# ---- LLM PREDICTION ----
def _parse_llm_entity_list(response_text):
    """Return the JSON array contained in an LLM reply, or [] if it is not a list."""
    # The model may wrap the array in extra prose, so take the outermost
    # "[...]" span when there is one and fall back to the whole reply.
    match = re.search(r'\[.*\]', response_text, re.DOTALL)
    payload = json.loads(match.group() if match else response_text)
    return payload if isinstance(payload, list) else []


def _locate_llm_entities(text, raw_entities):
    """Map entity strings reported by the LLM back to character spans of *text*."""
    entities = []
    taken = []  # (start, end) spans already assigned to an earlier entity
    for raw in raw_entities:
        # Skip malformed items instead of letting one bad item discard
        # every entity in the reply.
        if not isinstance(raw, dict):
            continue
        ent_text = raw.get("text", "")
        ent_type = raw.get("type", "")
        if not isinstance(ent_text, str) or not isinstance(ent_type, str):
            continue
        ent_type = ent_type.upper()
        if not ent_text or ent_type not in ENTITY_COLORS:
            continue

        # Case-insensitive search on the ORIGINAL string: lower-casing the
        # text first can change its length (e.g. Turkish dotted capital I),
        # which would shift every offset after that character.
        first_span = None
        free_span = None
        for match in re.finditer(re.escape(ent_text), text, re.IGNORECASE):
            span = match.span()
            if first_span is None:
                first_span = span
            if all(span[1] <= s or span[0] >= e for s, e in taken):
                free_span = span
                break

        # Prefer an occurrence no earlier entity claimed, so repeated surface
        # forms get distinct spans; otherwise reuse the first occurrence.
        span = free_span or first_span
        if span is None:
            entities.append({"text": ent_text, "type": ent_type, "start": -1, "end": -1})
            continue
        start_char, end_char = span
        taken.append(span)
        entities.append({
            "text": text[start_char:end_char],
            "type": ent_type,
            "start": start_char,
            "end": end_char,
        })
    return entities


def predict_llm_ner(text):
    """Ask the hosted LLM for the entities in *text* (zero-shot).

    A Turkish system prompt lists the entity types and asks for a JSON array
    of ``{"text", "type"}`` objects. Each reported entity whose type is a key
    of ``ENTITY_COLORS`` is located in *text* case-insensitively.

    Args:
        text: The raw input string.

    Returns:
        A ``(entities, latency_ms)`` tuple. ``entities`` is a list of dicts
        with the keys ``"text"``, ``"type"``, ``"start"`` and ``"end"``;
        entities that cannot be found in *text* get ``start == end == -1``.
        ``latency_ms`` is the time until the LLM reply was received, in
        milliseconds. On any failure (request error, unparsable reply) the
        result is ``([], latency_ms)`` and the error is printed.
    """
    started = time.perf_counter()
    entity_types = ", ".join(ENTITY_LABELS_TR.keys())
    system_prompt = (
        "Sen bir e-ticaret NER (Named Entity Recognition) asistanisin.\n"
        "Verilen metindeki asagidaki varliklari (entity) tespit et:\n"
        f"Entity tipleri: {entity_types}\n\n"
        "PRODUCT: Urun adi (ornegin: kosu ayakkabisi, t-shirt, cep telefonu)\n"
        "BRAND: Marka (ornegin: Nike, Samsung, Zara)\n"
        "PRICE: Fiyat (ornegin: 2.500 TL, 1.299,99 TL)\n"
        "COLOR: Renk (ornegin: kirmizi, siyah, mavi)\n"
        "SIZE: Beden veya numara (ornegin: M, XL, 42)\n"
        "MATERIAL: Materyal (ornegin: deri, pamuklu)\n"
        "GENDER: Cinsiyet (ornegin: kadin, erkek)\n\n"
        "SADECE asagidaki JSON formatinda yanit ver:\n"
        '[{"text": "...", "type": "ENTITY_TYPE"}, ...]'
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Metin: {text}"},
    ]
    try:
        response = llm_client.chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.1,
        )
        response_text = response.choices[0].message.content.strip()
        # Latency covers the request only, not the local post-processing.
        elapsed_ms = (time.perf_counter() - started) * 1000
        raw_entities = _parse_llm_entity_list(response_text)
        return _locate_llm_entities(text, raw_entities), elapsed_ms
    except Exception as exc:
        # Deliberate best-effort: the caller still gets a well-formed result,
        # but the failure is reported instead of vanishing silently.
        print(f"LLM NER failed: {exc!r}")
        return [], (time.perf_counter() - started) * 1000
# ---- HTML FORMATTING ----
def entities_to_html(text, entities):
    """Render *text* as an HTML fragment with the located entities highlighted.

    Entities with a negative ``"start"`` (not located in the text) are
    ignored. The remaining ones are processed in order of their start
    offset; an entity that begins before the end of the previously rendered
    one is skipped so that highlights never overlap.

    All text taken from *text* or from the entity dicts is HTML-escaped,
    because it originates from user input or model output.

    Args:
        text: The original input string.
        entities: Iterable of dicts with ``"text"``, ``"type"``, ``"start"``
            and ``"end"`` keys. May be empty or ``None``.

    Returns:
        A single ``<div>`` element as a string.
    """
    located = sorted(
        (ent for ent in (entities or []) if ent["start"] >= 0),
        key=lambda ent: ent["start"],
    )
    parts = []
    cursor = 0  # offset in `text` up to which output has been produced
    for ent in located:
        if ent["start"] < cursor:
            continue  # overlaps an entity that was already rendered
        parts.append(html.escape(text[cursor:ent["start"]]))
        color = ENTITY_COLORS.get(ent["type"], "#888")
        label_tr = ENTITY_LABELS_TR.get(ent["type"], ent["type"])
        parts.append(
            f'<span style="border:1px solid {color};border-radius:6px;'
            f'padding:2px 6px;margin:0 2px;">'
            f'{html.escape(ent["text"])}'
            f'<span style="color:{color};font-size:11px;font-weight:600;margin-left:4px;">'
            f'{html.escape(label_tr)}</span></span>'
        )
        cursor = ent["end"]
    # Whatever follows the last rendered entity (the whole text if none).
    parts.append(html.escape(text[cursor:]))
    return f'<div style="line-height:2.2;font-size:15px;">{"".join(parts)}</div>'
def entities_to_table(entities, latency):
    """Format the entities and the latency as a Markdown table.

    Args:
        entities: List of dicts with at least ``"text"`` and ``"type"`` keys.
            May be empty or ``None``.
        latency: Elapsed time in milliseconds; shown rounded to a whole
            number.

    Returns:
        A Markdown string: a three-column table (entity text, Turkish type
        label, coloured bullet plus raw type) followed by a summary line, or
        a "no entity found" message with the latency when *entities* is
        empty.
    """
    timing = f"\u26a1 **Sure: {latency:.0f} ms**"
    if not entities:
        return f"*Entity bulunamadi.*\n\n{timing}"

    lines = ["| Entity | Tip | Renk |", "|---|---|---|"]
    for ent in entities:
        color = ENTITY_COLORS.get(ent["type"], "#888")
        label_tr = ENTITY_LABELS_TR.get(ent["type"], ent["type"])
        # A raw pipe or line break inside a cell would split the table row.
        cell = (
            ent["text"]
            .replace("|", "\\|")
            .replace("\r", " ")
            .replace("\n", " ")
        )
        # The bullet is painted with the entity colour so that the "Renk"
        # column actually shows the colour used for highlighting.
        lines.append(
            f"| **{cell}** | {label_tr} | "
            f"<span style=\"color:{color}\">\u25cf</span> {ent['type']} |"
        )
    lines.append(f"\n{timing} | **{len(entities)} entity bulundu**")
    return "\n".join(lines)
# ---- MAIN FUNCTION ----
def analyze_text(text):
    """Run both NER back ends on *text* and format their results for the UI.

    Args:
        text: The string typed by the user. Empty input, ``None`` and input
            shorter than five characters after stripping are rejected with a
            placeholder message.

    Returns:
        A 4-tuple ``(bert_html, bert_table, llm_html, llm_table)``: for each
        back end, the highlighted HTML fragment and the Markdown detail
        table. If the LLM step raises, its two slots carry an error message
        instead; the BERT results are still returned.
    """
    if not text or len(text.strip()) < 5:
        empty = "*Lutfen bir e-ticaret metni girin.*"
        return empty, empty, empty, empty

    bert_entities, bert_ms = predict_bert_ner(text)
    bert_html = entities_to_html(text, bert_entities)
    bert_table = entities_to_table(bert_entities, bert_ms)

    try:
        llm_entities, llm_ms = predict_llm_ner(text)
        llm_html = entities_to_html(text, llm_entities)
        llm_table = entities_to_table(llm_entities, llm_ms)
    except Exception as e:
        # UI boundary: a failing LLM must not take the BERT column down with it.
        # The message is escaped because it is rendered as raw HTML.
        llm_html = f'<div style="color:#EF4444;">LLM hatasi: {html.escape(str(e))}</div>'
        llm_table = f"*LLM hatasi: {str(e)}*"

    return bert_html, bert_table, llm_html, llm_table
# ---- EXAMPLES ----
# Sample inputs offered in the UI. Each row is a one-element list because
# there is a single input component.
EXAMPLES = [
    [sentence]
    for sentence in (
        "Nike Air Max 90 kirmizi 42 numara 3.499 TL",
        "Samsung Galaxy S24 Ultra siyah 512GB 64.999 TL",
        "Zara kadin deri ceket siyah M beden 2.999 TL",
        "Adidas Superstar beyaz 39 numara orijinal 3.199 TL ile satin aldim",
        "LC Waikiki pamuklu erkek gomlek mavi XL beden 599 TL",
        "Apple iPhone 16 Pro Max 1TB uzay siyahi 89.999 TL",
        "Mavi Jean pantolon erkek 32 beden koyu mavi 1.299 TL",
        "The North Face mont kadin S beden haki yesil 8.499 TL",
        "Dyson V15 kablosuz supurge 24.999 TL cok iyi",
        "Converse Chuck Taylor All Star siyah 38 numara 1.899 TL",
    )
]
# ---- CSS THEME - Bazaar Amber ----
# Stylesheet handed to gr.Blocks(css=...). Besides the global container
# rules it defines the classes .main-title (page banner), .model-header-bert
# and .model-header-llm (column headers), .legend-box and .legend-item
# (entity colour legend), and it hides the page footer.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
.gradio-container {
font-family: 'Inter', sans-serif !important;
max-width: 1200px !important;
margin: 0 auto !important;
}
.main-title {
text-align: center; padding: 1.5rem;
background: linear-gradient(135deg, #92400E 0%, #D97706 30%, #F59E0B 60%, #EF4444 100%);
border-radius: 16px; margin-bottom: 1rem; color: white;
}
.main-title h1 { margin: 0; font-size: 2rem; font-weight: 700; }
.main-title p { margin: 0.3rem 0 0 0; opacity: 0.9; font-size: 0.95rem; }
.model-header-bert {
background: linear-gradient(135deg, #1E40AF, #3B82F6);
color: white; padding: 0.6rem; border-radius: 8px;
text-align: center; font-weight: 600; margin-bottom: 0.5rem;
}
.model-header-llm {
background: linear-gradient(135deg, #7C3AED, #A855F7);
color: white; padding: 0.6rem; border-radius: 8px;
text-align: center; font-weight: 600; margin-bottom: 0.5rem;
}
.legend-box {
display: flex; flex-wrap: wrap; gap: 8px; justify-content: center;
padding: 8px; margin-bottom: 8px;
}
.legend-item {
font-size: 12px; padding: 3px 10px; border-radius: 12px; font-weight: 500;
}
footer { display: none !important; }
"""
# ---- LEGEND HTML ----
# One pill per entity type, drawn in that type's highlight colour and using
# the .legend-item / .legend-box classes from CUSTOM_CSS.
legend_items = "".join(
    f'<span class="legend-item" style="color:{color};border:1px solid {color};">'
    f'{ENTITY_LABELS_TR[etype]} ({etype})</span>'
    for etype, color in ENTITY_COLORS.items()
)
LEGEND_HTML = f'<div class="legend-box">{legend_items}</div>'
# ---- GRADIO APP ----
# Page layout: banner, colour legend, input row, one result column per model,
# clickable examples and a disclaimer. The HTML snippets use the classes
# defined in CUSTOM_CSS.
with gr.Blocks(css=CUSTOM_CSS, title="Carsi - E-ticaret NER") as demo:
    gr.HTML(
        '<div class="main-title">'
        '<h1>\U0001f3ea Carsi</h1>'
        '<p>Turkce E-Ticaret NER — Named Entity Recognition</p>'
        '<p>Fine-tuned Turkish BERT \u26a1 vs Qwen2.5-7B Zero-shot \U0001f9e0</p>'
        '</div>'
    )
    gr.HTML(LEGEND_HTML)

    with gr.Row():
        text_input = gr.Textbox(
            label="\U0001f6d2 E-Ticaret Metni",
            placeholder="Orn: Nike Air Max 90 kirmizi 42 numara 3.499 TL",
            lines=2,
            scale=4,
        )
        analyze_btn = gr.Button("\U0001f50d Analiz Et", variant="primary", scale=1)

    with gr.Row(equal_height=True):
        with gr.Column():
            gr.HTML('<div class="model-header-bert">\u26a1 Fine-tuned Turkish BERT</div>')
            bert_html = gr.HTML(label="BERT Entities")
            bert_table = gr.Markdown(label="BERT Detay")
        with gr.Column():
            gr.HTML('<div class="model-header-llm">\U0001f9e0 Qwen2.5-7B Zero-shot</div>')
            llm_html = gr.HTML(label="LLM Entities")
            llm_table = gr.Markdown(label="LLM Detay")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="\U0001f4cb Ornek Metinler (tiklayip deneyin)",
    )
    gr.HTML(
        '<div style="text-align:center;font-size:12px;opacity:0.7;padding:8px;">'
        '\u26a0\ufe0f Bu uygulama egitim ve arastirma amaclidir. '
        'Urun bilgileri gercek piyasa verileriyle dogrulanmalidir.'
        '</div>'
    )

    # The button click and pressing Enter in the textbox run the same
    # analysis and fill the same four components, in analyze_text's return order.
    result_outputs = [bert_html, bert_table, llm_html, llm_table]
    analyze_btn.click(fn=analyze_text, inputs=text_input, outputs=result_outputs)
    text_input.submit(fn=analyze_text, inputs=text_input, outputs=result_outputs)

demo.queue()
demo.launch()