"""Carsi: side-by-side Turkish e-commerce NER demo.

Runs the same input text through a fine-tuned BERT token classifier and a
zero-shot chat LLM, then renders both entity sets as highlighted HTML plus a
Markdown summary table in a Gradio UI.
"""

import html
import json
import logging
import os
import re
import time

import gradio as gr
import numpy as np  # noqa: F401  (kept: may be relied on outside this view)
import torch
from huggingface_hub import InferenceClient
from transformers import AutoModelForTokenClassification, AutoTokenizer

logger = logging.getLogger(__name__)

# ---- CONFIGURATION ----
BERT_REPO = "cihatyldz/carsi-bert-turkish-ecommerce-ner"
LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"

ENTITY_COLORS = {
    "PRODUCT": "#3B82F6",
    "BRAND": "#8B5CF6",
    "PRICE": "#10B981",
    "COLOR": "#F59E0B",
    "SIZE": "#EF4444",
    "MATERIAL": "#06B6D4",
    "GENDER": "#EC4899",
}

ENTITY_LABELS_TR = {
    "PRODUCT": "Urun",
    "BRAND": "Marka",
    "PRICE": "Fiyat",
    "COLOR": "Renk",
    "SIZE": "Beden",
    "MATERIAL": "Materyal",
    "GENDER": "Cinsiyet",
}

# NOTE(review): inline styles used for the result boxes are presentation
# choices only; adjust freely.
_TEXT_BOX_STYLE = "font-size:16px;line-height:2.2;padding:12px;"

# Greedy match from the first "[" to the last "]" so that any prose the LLM
# wraps around the JSON array is ignored.
_JSON_ARRAY_RE = re.compile(r"\[.*\]", re.DOTALL)

# ---- BERT NER MODEL ----
print("Loading BERT NER model...")
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_REPO)
bert_model = AutoModelForTokenClassification.from_pretrained(BERT_REPO)
bert_model.eval()
id2label = bert_model.config.id2label
label2id = bert_model.config.label2id
print(f"BERT NER ready - {len(id2label)} labels: {list(id2label.values())}")

# ---- LLM CLIENT ----
hf_token = os.environ.get("HF_TOKEN")
llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
print(f"LLM client ready - {LLM_MODEL}")


# ---- BERT PREDICTION ----
def _merge_bio_spans(text, labels, offsets):
    """Fold per-token BIO labels into character-level entity dicts.

    Args:
        text: The original input string the offsets refer to.
        labels: One label string per token (e.g. ``"B-BRAND"``, ``"I-BRAND"``).
        offsets: One ``(start_char, end_char)`` pair per token.

    Returns:
        A list of dicts with keys ``text``, ``type``, ``start`` and ``end``.
    """
    entities = []
    current = None
    for label, (start_char, end_char) in zip(labels, offsets):
        # (0, 0) offsets carry no text - presumably the tokenizer's special
        # tokens - so they neither extend nor close an entity.
        if start_char == 0 and end_char == 0:
            continue

        if label.startswith("B-"):
            if current:
                entities.append(current)
            current = {
                "text": text[start_char:end_char],
                "type": label[2:],
                "start": start_char,
                "end": end_char,
            }
        elif label.startswith("I-") and current:
            if label[2:] == current["type"]:
                # Re-slice from the entity start so whitespace between the
                # tokens is preserved in the entity text.
                current["text"] = text[current["start"]:end_char]
                current["end"] = end_char
            else:
                # Continuation of a different type: close the open entity.
                entities.append(current)
                current = None
        else:
            if current:
                entities.append(current)
            current = None

    if current:
        entities.append(current)
    return entities


def predict_bert_ner(text):
    """Extract entities from ``text`` with the fine-tuned BERT model.

    The input is truncated to 256 tokens. Character offsets returned by the
    tokenizer are used to map token predictions back onto ``text``.

    Args:
        text: Raw input text.

    Returns:
        A ``(entities, elapsed_ms)`` tuple where ``entities`` is a list of
        dicts with keys ``text``, ``type``, ``start``, ``end`` and
        ``elapsed_ms`` is the wall-clock duration in milliseconds.
    """
    started = time.perf_counter()
    inputs = bert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        return_offsets_mapping=True,
    )
    # Offsets are only needed for span reconstruction, so they are removed
    # from the batch before the forward pass.
    offset_mapping = inputs.pop("offset_mapping")[0].tolist()

    with torch.no_grad():
        outputs = bert_model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    labels = [id2label[pred] for pred in predictions]
    entities = _merge_bio_spans(text, labels, offset_mapping)
    return entities, (time.perf_counter() - started) * 1000


# ---- LLM PREDICTION ----
def _locate_span(text, needle, claimed):
    """Find the ``(start, end)`` span of ``needle`` inside ``text``.

    Exact-case occurrences are preferred over case-insensitive ones, and
    occurrences overlapping a span in ``claimed`` are skipped so that repeated
    surface forms (e.g. a brand and a colour spelled alike) land on different
    positions. If every occurrence is already claimed, the first
    case-insensitive occurrence is returned.

    Matching is done with ``re`` on the original string rather than on a
    lowercased copy: ``str.lower()`` can change the string length (``"İ"``
    lowercases to two code points), which would shift the indices.

    Returns:
        The span as a tuple, or ``None`` when ``needle`` does not occur.
    """
    pattern = re.escape(needle)
    for flags in (0, re.IGNORECASE):
        for match in re.finditer(pattern, text, flags):
            if all(match.end() <= s or match.start() >= e for s, e in claimed):
                return match.span()
    fallback = re.search(pattern, text, re.IGNORECASE)
    return fallback.span() if fallback else None


def _parse_llm_entities(response_text, text):
    """Turn the LLM's JSON answer into entity dicts aligned to ``text``.

    Items that are not dicts, have an empty ``text`` or a ``type`` outside
    ``ENTITY_COLORS`` are skipped. Entities whose text cannot be found in
    ``text`` are kept with ``start``/``end`` set to ``-1``.

    Raises:
        json.JSONDecodeError: If no valid JSON can be parsed from the answer.
    """
    array_match = _JSON_ARRAY_RE.search(response_text)
    raw_entities = json.loads(array_match.group() if array_match else response_text)
    if not isinstance(raw_entities, list):
        return []

    entities = []
    claimed = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue
        # The model may emit numbers (e.g. a shoe size) or null; coerce
        # instead of failing the whole response.
        ent_text = str(item.get("text") or "")
        ent_type = str(item.get("type") or "").upper()
        if not ent_text or ent_type not in ENTITY_COLORS:
            continue

        span = _locate_span(text, ent_text, claimed)
        if span is None:
            entities.append({"text": ent_text, "type": ent_type, "start": -1, "end": -1})
            continue

        start_char, end_char = span
        claimed.append(span)
        entities.append({
            "text": text[start_char:end_char],
            "type": ent_type,
            "start": start_char,
            "end": end_char,
        })
    return entities


def predict_llm_ner(text):
    """Extract entities from ``text`` with the zero-shot chat LLM.

    Any failure (request error, malformed JSON, unexpected response shape) is
    logged and reported as an empty entity list; this function does not raise.

    Args:
        text: Raw input text.

    Returns:
        A ``(entities, elapsed_ms)`` tuple. On success ``elapsed_ms`` covers
        the request only (parsing excluded); on failure it covers the time
        until the error was caught.
    """
    started = time.perf_counter()
    entity_types = ", ".join(ENTITY_LABELS_TR.keys())
    system_prompt = (
        "Sen bir e-ticaret NER (Named Entity Recognition) asistanisin.\n"
        "Verilen metindeki asagidaki varliklari (entity) tespit et:\n"
        f"Entity tipleri: {entity_types}\n\n"
        "PRODUCT: Urun adi (ornegin: kosu ayakkabisi, t-shirt, cep telefonu)\n"
        "BRAND: Marka (ornegin: Nike, Samsung, Zara)\n"
        "PRICE: Fiyat (ornegin: 2.500 TL, 1.299,99 TL)\n"
        "COLOR: Renk (ornegin: kirmizi, siyah, mavi)\n"
        "SIZE: Beden veya numara (ornegin: M, XL, 42)\n"
        "MATERIAL: Materyal (ornegin: deri, pamuklu)\n"
        "GENDER: Cinsiyet (ornegin: kadin, erkek)\n\n"
        "SADECE asagidaki JSON formatinda yanit ver:\n"
        '[{"text": "...", "type": "ENTITY_TYPE"}, ...]'
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Metin: {text}"},
    ]

    try:
        response = llm_client.chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.1,
        )
        response_text = response.choices[0].message.content.strip()
        elapsed_ms = (time.perf_counter() - started) * 1000
        entities = _parse_llm_entities(response_text, text)
    except Exception:
        # Best-effort boundary: the UI should keep working when the remote
        # model is unavailable or answers with garbage, but leave a trace.
        logger.exception("LLM NER failed")
        return [], (time.perf_counter() - started) * 1000
    return entities, elapsed_ms


# ---- HTML FORMATTING ----
def entities_to_html(text, entities):
    """Render ``text`` as HTML with each located entity highlighted.

    Entities with a negative ``start`` (not located in ``text``) are ignored,
    as are entities overlapping one that was already rendered. All text is
    HTML-escaped because it originates from the user or from the LLM.

    Args:
        text: The original input text.
        entities: Entity dicts with keys ``text``, ``type``, ``start``, ``end``.

    Returns:
        An HTML string wrapping the annotated text in a single ``<div>``.
    """
    if not entities:
        return f'<div style="{_TEXT_BOX_STYLE}">{html.escape(text)}</div>'

    located = sorted(
        (ent for ent in entities if ent["start"] >= 0),
        key=lambda ent: ent["start"],
    )
    parts = []
    cursor = 0
    for ent in located:
        if ent["start"] < cursor:
            continue  # overlaps the previously rendered entity
        parts.append(html.escape(text[cursor:ent["start"]]))
        color = ENTITY_COLORS.get(ent["type"], "#888")
        label_tr = ENTITY_LABELS_TR.get(ent["type"], ent["type"])
        parts.append(
            f'<span style="background:{color}22;border:1px solid {color};'
            f'border-radius:6px;padding:2px 6px;margin:0 2px;">'
            f'{html.escape(ent["text"])}'
            f'<sup style="color:{color};font-size:10px;font-weight:700;margin-left:4px;">'
            f'{html.escape(label_tr)}</sup></span>'
        )
        cursor = ent["end"]
    parts.append(html.escape(text[cursor:]))
    return f'<div style="{_TEXT_BOX_STYLE}">{"".join(parts)}</div>'


def entities_to_table(entities, latency):
    """Build a Markdown table of ``entities`` followed by a latency line.

    Args:
        entities: Entity dicts with at least ``text`` and ``type`` keys.
        latency: Duration in milliseconds, shown rounded to an integer.

    Returns:
        A Markdown string; a "no entities" notice when ``entities`` is empty.
    """
    if not entities:
        return f"*Entity bulunamadi.*\n\n\u26a1 **Sure: {latency:.0f} ms**"

    lines = ["| Entity | Tip | Renk |", "|---|---|---|"]
    for ent in entities:
        color = ENTITY_COLORS.get(ent["type"], "#888")
        label_tr = ENTITY_LABELS_TR.get(ent["type"], ent["type"])
        # A raw "|" would end the table cell early; "<"/"&" could be taken
        # as markup.
        cell = html.escape(ent["text"], quote=False).replace("|", "\\|")
        lines.append(
            f"| **{cell}** | {label_tr} | "
            f"<span style='color:{color}'>\u25cf</span> {ent['type']} |"
        )
    lines.append(f"\n\u26a1 **Sure: {latency:.0f} ms** | **{len(entities)} entity bulundu**")
    return "\n".join(lines)


# ---- MAIN FUNCTION ----
def analyze_text(text):
    """Run both NER models on ``text`` and format the results for the UI.

    Args:
        text: Text entered by the user. Inputs shorter than 5 non-blank
            characters are rejected with a prompt message.

    Returns:
        A 4-tuple ``(bert_html, bert_table, llm_html, llm_table)`` matching
        the two HTML and two Markdown output components.
    """
    if not text or len(text.strip()) < 5:
        # The HTML components do not interpret Markdown, so each output gets
        # the message in its own markup.
        empty_md = "*Lutfen bir e-ticaret metni girin.*"
        empty_html = "<em>Lutfen bir e-ticaret metni girin.</em>"
        return empty_html, empty_md, empty_html, empty_md

    bert_entities, bert_ms = predict_bert_ner(text)
    bert_html = entities_to_html(text, bert_entities)
    bert_table = entities_to_table(bert_entities, bert_ms)

    try:
        llm_entities, llm_ms = predict_llm_ner(text)
        llm_html = entities_to_html(text, llm_entities)
        llm_table = entities_to_table(llm_entities, llm_ms)
    except Exception as e:
        logger.exception("LLM result rendering failed")
        llm_html = (
            f"<div style='color:#EF4444;padding:12px;'>"
            f"LLM hatasi: {html.escape(str(e))}</div>"
        )
        llm_table = f"*LLM hatasi: {str(e)}*"

    return bert_html, bert_table, llm_html, llm_table


# ---- EXAMPLES ----
EXAMPLES = [
    ["Nike Air Max 90 kirmizi 42 numara 3.499 TL"],
    ["Samsung Galaxy S24 Ultra siyah 512GB 64.999 TL"],
    ["Zara kadin deri ceket siyah M beden 2.999 TL"],
    ["Adidas Superstar beyaz 39 numara orijinal 3.199 TL ile satin aldim"],
    ["LC Waikiki pamuklu erkek gomlek mavi XL beden 599 TL"],
    ["Apple iPhone 16 Pro Max 1TB uzay siyahi 89.999 TL"],
    ["Mavi Jean pantolon erkek 32 beden koyu mavi 1.299 TL"],
    ["The North Face mont kadin S beden haki yesil 8.499 TL"],
    ["Dyson V15 kablosuz supurge 24.999 TL cok iyi"],
    ["Converse Chuck Taylor All Star siyah 38 numara 1.899 TL"],
]

# ---- CSS THEME - Bazaar Amber ----
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
.gradio-container {
    font-family: 'Inter', sans-serif !important;
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.main-title {
    text-align: center;
    padding: 1.5rem;
    background: linear-gradient(135deg, #92400E 0%, #D97706 30%, #F59E0B 60%, #EF4444 100%);
    border-radius: 16px;
    margin-bottom: 1rem;
    color: white;
}
.main-title h1 { margin: 0; font-size: 2rem; font-weight: 700; }
.main-title p { margin: 0.3rem 0 0 0; opacity: 0.9; font-size: 0.95rem; }
.model-header-bert {
    background: linear-gradient(135deg, #1E40AF, #3B82F6);
    color: white;
    padding: 0.6rem;
    border-radius: 8px;
    text-align: center;
    font-weight: 600;
    margin-bottom: 0.5rem;
}
.model-header-llm {
    background: linear-gradient(135deg, #7C3AED, #A855F7);
    color: white;
    padding: 0.6rem;
    border-radius: 8px;
    text-align: center;
    font-weight: 600;
    margin-bottom: 0.5rem;
}
.legend-box {
    display: flex;
    flex-wrap: wrap;
    gap: 8px;
    justify-content: center;
    padding: 8px;
    margin-bottom: 8px;
}
.legend-item {
    font-size: 12px;
    padding: 3px 10px;
    border-radius: 12px;
    font-weight: 500;
}
footer { display: none !important; }
"""

# ---- LEGEND HTML ----
legend_items = "".join(
    f'<span class="legend-item" '
    f'style="background:{color}22;color:{color};border:1px solid {color};">'
    f'{ENTITY_LABELS_TR[etype]} ({etype})</span>'
    for etype, color in ENTITY_COLORS.items()
)
LEGEND_HTML = f'<div class="legend-box">{legend_items}</div>'

# ---- GRADIO APP ----
with gr.Blocks(css=CUSTOM_CSS, title="Carsi - E-ticaret NER") as demo:
    gr.HTML(
        '<div class="main-title">'
        '<h1>\U0001f3ea Carsi</h1>'
        '<p>Turkce E-Ticaret NER — Named Entity Recognition</p>'
        '<p style="font-size:0.85rem;">'
        'Fine-tuned Turkish BERT \u26a1 vs Qwen2.5-7B Zero-shot \U0001f9e0</p>'
        '</div>'
    )
    gr.HTML(LEGEND_HTML)

    with gr.Row():
        text_input = gr.Textbox(
            label="\U0001f6d2 E-Ticaret Metni",
            placeholder="Orn: Nike Air Max 90 kirmizi 42 numara 3.499 TL",
            lines=2,
            scale=4,
        )
        analyze_btn = gr.Button("\U0001f50d Analiz Et", variant="primary", scale=1)

    with gr.Row(equal_height=True):
        with gr.Column():
            gr.HTML(
                '<div class="model-header-bert">\u26a1 BERT NER<br>'
                '<small>Fine-tuned Turkish BERT - CPU - Hizli</small></div>'
            )
            bert_html = gr.HTML(label="BERT Entities")
            bert_table = gr.Markdown(label="BERT Detay")
        with gr.Column():
            gr.HTML(
                '<div class="model-header-llm">\U0001f9e0 LLM NER<br>'
                '<small>Qwen2.5-7B - HF Inference API - Zero-shot</small></div>'
            )
            llm_html = gr.HTML(label="LLM Entities")
            llm_table = gr.Markdown(label="LLM Detay")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="\U0001f4cb Ornek Metinler (tiklayip deneyin)",
    )

    gr.HTML(
        '<div style="text-align:center;font-size:12px;opacity:0.7;margin-top:1rem;">'
        '\u26a0\ufe0f Bu uygulama egitim ve arastirma amaclidir. '
        'Urun bilgileri gercek piyasa verileriyle dogrulanmalidir.'
        '</div>'
    )

    # The button and the Enter key trigger the same analysis.
    result_outputs = [bert_html, bert_table, llm_html, llm_table]
    analyze_btn.click(fn=analyze_text, inputs=text_input, outputs=result_outputs)
    text_input.submit(fn=analyze_text, inputs=text_input, outputs=result_outputs)

demo.queue()

if __name__ == "__main__":
    demo.launch()