"""
سكريبت جلب بيانات طبية موثوقة وإدراجها في Supabase pgvector
المصادر: MedlinePlus (NIH) + Wikipedia العربية
الأداة: Groq لتنظيف وهيكلة المحتوى بالعربية
"""
import os, json, time, re
import requests as http
from dotenv import load_dotenv
# Load the project's .env file first so the os.getenv() calls below can see its values.
load_dotenv(dotenv_path=r'D:\Project\.env')

# Set before the groq / langchain_huggingface imports below — presumably so the
# Hugging Face stack picks these up at import time; keep this ordering.
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
os.environ['HF_HOME'] = r'D:\Project\model_cache'

from groq import Groq
from langchain_huggingface import HuggingFaceEmbeddings

# Credentials/endpoints come from the environment; os.getenv returns None when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
# Name of the embedding model handed to HuggingFaceEmbeddings in main().
EMBED_MODEL  = "intfloat/multilingual-e5-large"

# Shared Groq client used by groq_call().
groq   = Groq(api_key=GROQ_API_KEY)
# Default headers for every Supabase REST request; individual calls override "Prefer".
SB_HDR = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json",
    "Prefer": "return=minimal",
}
# Base URL of the Supabase REST endpoint; table names are appended to it.
REST = f"{SUPABASE_URL}/rest/v1"

# ─── قائمة التحاليل ────────────────────────────────────────────────────────
# Catalogue of lab tests to ingest. Each entry carries:
#   "ar"     - Arabic display name (also used as the Arabic Wikipedia page title)
#   "en"     - English name / search term (also the key used to skip existing tests)
#   "unit"   - measurement unit
#   "normal" - reference range as display text
LAB_TESTS = [
    # CBC (complete blood count)
    {"ar": "الهيموجلوبين",          "en": "hemoglobin",             "unit": "g/dL",    "normal": "ذكر: 13.5–17.5 | أنثى: 12–15.5"},
    {"ar": "خلايا الدم الحمراء",    "en": "red blood cells RBC",     "unit": "مليون/μL","normal": "ذكر: 4.5–5.9 | أنثى: 4.1–5.1"},
    {"ar": "خلايا الدم البيضاء",    "en": "white blood cells WBC",   "unit": "ألف/μL",  "normal": "4.5–11"},
    {"ar": "الصفائح الدموية",       "en": "platelets PLT",           "unit": "ألف/μL",  "normal": "150–400"},
    {"ar": "الهيماتوكريت",          "en": "hematocrit HCT",          "unit": "%",       "normal": "ذكر: 41–53 | أنثى: 36–46"},
    {"ar": "متوسط حجم الكرية",      "en": "MCV mean corpuscular volume","unit":"fL",    "normal": "80–100"},
    {"ar": "المتعادلات Neutrophils","en": "neutrophils",             "unit": "%",       "normal": "50–70"},
    {"ar": "الليمفاويات",           "en": "lymphocytes",             "unit": "%",       "normal": "20–40"},
    {"ar": "الحمضات Eosinophils",   "en": "eosinophils",             "unit": "%",       "normal": "1–4"},
    {"ar": "الوحيدات Monocytes",    "en": "monocytes",               "unit": "%",       "normal": "2–8"},
    # Blood sugar
    {"ar": "سكر الدم الصائم",       "en": "fasting blood glucose",   "unit": "mg/dL",   "normal": "70–99"},
    {"ar": "الهيموجلوبين السكري HbA1c","en":"hemoglobin A1c HbA1c", "unit": "%",       "normal": "< 5.7"},
    # Lipids
    {"ar": "الكوليسترول الكلي",     "en": "total cholesterol",       "unit": "mg/dL",   "normal": "< 200"},
    {"ar": "الدهون الثلاثية",       "en": "triglycerides",           "unit": "mg/dL",   "normal": "< 150"},
    {"ar": "الكوليسترول الجيد HDL", "en": "HDL cholesterol",         "unit": "mg/dL",   "normal": "ذكر: >40 | أنثى: >50"},
    {"ar": "الكوليسترول السيئ LDL", "en": "LDL cholesterol",         "unit": "mg/dL",   "normal": "< 100"},
    # Kidney
    {"ar": "الكرياتينين",           "en": "creatinine",              "unit": "mg/dL",   "normal": "ذكر: 0.7–1.3 | أنثى: 0.5–1.1"},
    {"ar": "اليوريا BUN",           "en": "blood urea nitrogen BUN", "unit": "mg/dL",   "normal": "7–20"},
    {"ar": "حمض اليوريك",           "en": "uric acid",               "unit": "mg/dL",   "normal": "ذكر: 3.4–7 | أنثى: 2.4–6"},
    # Liver
    {"ar": "إنزيم ALT",             "en": "alanine aminotransferase ALT","unit":"U/L",  "normal": "7–56"},
    {"ar": "إنزيم AST",             "en": "aspartate aminotransferase AST","unit":"U/L","normal": "10–40"},
    {"ar": "إنزيم ALP",             "en": "alkaline phosphatase ALP","unit": "U/L",     "normal": "44–147"},
    {"ar": "البيليروبين الكلي",     "en": "total bilirubin",         "unit": "mg/dL",   "normal": "0.1–1.2"},
    {"ar": "الألبومين",             "en": "albumin",                 "unit": "g/dL",     "normal": "3.4–5.4"},
    # Thyroid
    {"ar": "هرمون TSH",             "en": "thyroid stimulating hormone TSH","unit":"mIU/L","normal":"0.4–4.0"},
    {"ar": "هرمون T4 الحر",         "en": "free thyroxine T4",       "unit": "ng/dL",   "normal": "0.8–1.8"},
    {"ar": "هرمون T3 الحر",         "en": "free triiodothyronine T3","unit": "pg/mL",   "normal": "2.3–4.2"},
    # Iron
    {"ar": "الفيريتين",             "en": "ferritin",                "unit": "ng/mL",   "normal": "ذكر: 24–336 | أنثى: 11–307"},
    {"ar": "الحديد في الدم",        "en": "serum iron",              "unit": "μg/dL",   "normal": "60–170"},
    # Vitamins
    {"ar": "فيتامين د",             "en": "vitamin D 25-hydroxyvitamin D","unit":"ng/mL","normal":"30–100"},
    {"ar": "فيتامين B12",           "en": "vitamin B12 cobalamin",   "unit": "pg/mL",   "normal": "200–900"},
    {"ar": "حمض الفوليك",           "en": "folate folic acid",       "unit": "ng/mL",   "normal": "2.7–17"},
    # Inflammation
    {"ar": "بروتين CRP",            "en": "C-reactive protein CRP",  "unit": "mg/L",    "normal": "< 1.0"},
    {"ar": "سرعة ترسب الدم ESR",   "en": "erythrocyte sedimentation rate ESR","unit":"mm/hr","normal":"ذكر:<15 | أنثى:<20"},
    # Hormones
    {"ar": "هرمون التستوستيرون",    "en": "testosterone",            "unit": "ng/dL",   "normal": "ذكر: 300–1000 | أنثى: 15–70"},
    # Arabic label previously read "TSH thyroid hormone" although this entry is prolactin.
    {"ar": "هرمون البرولاكتين",     "en": "prolactin",               "unit": "ng/mL",   "normal": "ذكر: 2–18 | أنثى: 2–29"},
    # Coagulation
    {"ar": "وقت البروثرومبين PT",   "en": "prothrombin time PT INR", "unit": "ثانية",   "normal": "11–13.5"},
    # Electrolytes
    {"ar": "الصوديوم",              "en": "sodium Na",               "unit": "mEq/L",   "normal": "136–145"},
    {"ar": "البوتاسيوم",            "en": "potassium K",             "unit": "mEq/L",   "normal": "3.5–5.1"},
    {"ar": "الكالسيوم",             "en": "calcium Ca",              "unit": "mg/dL",   "normal": "8.5–10.5"},
]


def groq_call(prompt: str, max_tokens: int = 1200) -> str:
    """Send *prompt* as a single user message to Groq and return the reply text.

    The reply is stripped of surrounding whitespace. If anything raises while
    calling the API or reading the reply, the error is printed, the function
    pauses for 10 seconds, and an empty string is returned.
    """
    request = {
        "model": "llama-3.1-8b-instant",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }
    try:
        completion = groq.chat.completions.create(**request)
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        print(f"    [Groq error] {e}")
        # Back off before the caller issues its next request.
        time.sleep(10)
        return ""


def get_wikipedia_ar(term_ar: str) -> str:
    """Fetch the plain-text intro extract of the Arabic Wikipedia page *term_ar*.

    Returns the first extract longer than 100 characters, truncated to 2000
    characters, or an empty string when no such extract exists or the lookup
    fails. This is a best-effort helper: it never raises, but failures are
    printed instead of being silently discarded.
    """
    try:
        r = http.get(
            "https://ar.wikipedia.org/w/api.php",
            params={"action":"query","titles":term_ar,"prop":"extracts",
                    "format":"json","exintro":1,"explaintext":1,"exsectionformat":"plain"},
            timeout=10,
        )
        # Surface HTTP-level failures as a clear message rather than a JSON decode error.
        r.raise_for_status()
        pages = r.json().get("query", {}).get("pages", {})
        for p in pages.values():
            extract = p.get("extract", "")
            if extract and len(extract) > 100:
                return extract[:2000]
    except Exception as e:
        # Deliberately broad: a failed lookup must not abort the caller.
        print(f"    [Wikipedia error] {e}")
    return ""


def get_medlineplus(term_en: str) -> str:
    """Query the MedlinePlus health-topics search service for *term_en*.

    The XML response is flattened to plain text by replacing tags with spaces
    and collapsing whitespace. Returns at most 2000 characters, or an empty
    string when the flattened text is 100 characters or shorter or the request
    fails. This is a best-effort helper: it never raises, but failures are
    printed instead of being silently discarded.
    """
    try:
        r = http.get(
            "https://wsearch.nlm.nih.gov/ws/query",
            params={"db":"healthTopics","term":term_en,"retmax":"2","rettype":"brief"},
            timeout=10,
        )
        # Without this check an HTTP error page would be tag-stripped and
        # returned as if it were medical content.
        r.raise_for_status()
        # Extract the text from the XML by dropping every tag.
        text = re.sub(r'<[^>]+>', ' ', r.text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:2000] if len(text) > 100 else ""
    except Exception as e:
        # Deliberately broad: a failed lookup must not abort the caller.
        print(f"    [MedlinePlus error] {e}")
        return ""


def _extract_json(raw: str) -> dict | None:
    """محاولات متعددة لاستخراج JSON من رد Groq"""
    if not raw:
        return None
    # محاولة 1: استخراج من أول { لآخر }
    try:
        start = raw.index('{')
        end   = raw.rindex('}') + 1
        return json.loads(raw[start:end])
    except Exception:
        pass
    # محاولة 2: إزالة الأسطر المعطوبة وإعادة المحاولة
    try:
        lines = raw.splitlines()
        cleaned = []
        for ln in lines:
            try:
                cleaned.append(ln)
                json.loads('\n'.join(cleaned) + '}' * 4)
            except json.JSONDecodeError as err:
                if err.msg not in ("Unterminated string", "Expecting ',' delimiter"):
                    continue
        joined = '\n'.join(cleaned)
        start = joined.index('{')
        end   = joined.rindex('}') + 1
        return json.loads(joined[start:end])
    except Exception:
        pass
    return None


def build_chunks(test: dict, wiki_ar: str, medline_en: str) -> list:
    """Ask Groq for structured Arabic content about *test* and split it into chunks.

    Args:
        test: A lab-test entry with the keys ``ar``, ``en``, ``unit`` and ``normal``.
        wiki_ar: Arabic Wikipedia extract for the test; may be empty.
        medline_en: MedlinePlus search text for the test; may be empty.

    Returns:
        A list of ``{"content": ..., "metadata": ...}`` dicts, one per section
        (definition, normal values, abnormal causes, symptoms/tips) whose text
        is a string longer than 50 characters after stripping. The list is
        empty when Groq returns nothing or its reply cannot be parsed as JSON.

    The fetched source texts are appended to the prompt as reference material
    when they are non-empty; with both empty the prompt is just the base
    instruction.
    """
    prompt = (
        f"You are a medical expert. Write concise Arabic medical info about lab test: {test['en']} ({test['ar']}). "
        f"Unit: {test['unit']}. Normal range: {test['normal']}. "
        "Reply ONLY with valid JSON, no extra text, using this exact format: "
        '{"definition":"one paragraph in Arabic","normal_values":"one paragraph in Arabic",'
        '"abnormal_causes":"one paragraph in Arabic","symptoms_tips":"one paragraph in Arabic"}'
    )

    # Feed the retrieved sources to the model; they were previously fetched but never used.
    references = [
        f"{label}:\n{text.strip()}"
        for label, text in (
            ("Arabic Wikipedia extract", wiki_ar),
            ("MedlinePlus search result", medline_en),
        )
        if text and text.strip()
    ]
    if references:
        prompt += (
            "\n\nGround your answer in the reference material below where it is relevant; "
            "ignore any part of it that is unrelated to this lab test.\n\n"
            + "\n\n".join(references)
        )

    raw = groq_call(prompt, max_tokens=1200)
    if not raw:
        print("    [!] Groq response empty")
        return []

    data = _extract_json(raw)
    if not data:
        print(f"    [!] JSON parse error | raw[:120]={raw[:120]}")
        return []

    base_meta = {
        "test_name":    test['ar'],
        "test_name_en": test['en'],
        "unit":         test['unit'],
        "normal_range": test['normal'],
        # Kept constant: get_existing_test_names() filters on this exact value.
        "source":       "MedlinePlus+Wikipedia",
        "language":     "ar",
    }

    # (topic_type stored in metadata, key in the model's JSON reply)
    sections = [
        ("lab_definition",  "definition"),
        ("normal_values",   "normal_values"),
        ("abnormal_causes", "abnormal_causes"),
        ("symptoms_tips",   "symptoms_tips"),
    ]
    chunks = []
    for topic_type, key in sections:
        content = data.get(key, "")
        # Skip missing, non-string or too-short sections.
        if content and isinstance(content, str) and len(content.strip()) > 50:
            text = f"تحليل: {test['ar']} | {test['en']}\n{content}"
            chunks.append({"content": text, "metadata": {**base_meta, "topic_type": topic_type}})

    return chunks


def insert_chunks(embeddings_model, chunks: list):
    """Embed *chunks* and insert them into the Supabase ``documents`` table.

    Args:
        embeddings_model: Object exposing ``embed_query(text)``; it is called
            once per chunk on the chunk's ``content``.
        chunks: Dicts with ``content`` and ``metadata`` keys.

    Returns:
        ``True`` when the batch was accepted by the REST endpoint, ``False``
        when there was nothing to insert or the endpoint answered with an
        error status (the error is printed). Earlier callers that ignore the
        return value are unaffected.

    Exceptions raised by the embedding model or by the HTTP request itself
    are not caught here.
    """
    if not chunks:
        return False
    rows = [
        {
            "content":   ch["content"],
            "metadata":  ch["metadata"],
            "embedding": embeddings_model.embed_query(ch["content"]),
        }
        for ch in chunks
    ]
    # All rows go out in a single POST.
    r = http.post(f"{REST}/documents", headers=SB_HDR, json=rows, timeout=60)
    if not r.ok:
        print(f"  [DB] خطأ: {r.status_code} — {r.text[:150]}")
        return False
    return True


def get_existing_test_names() -> set:
    """Return the English names of lab tests already stored in the database.

    Queries the ``documents`` table for rows whose ``metadata.source`` equals
    ``MedlinePlus+Wikipedia`` and collects their non-empty
    ``metadata.test_name_en`` values.

    On any failure an empty set is returned, which makes the caller treat
    every test as new. The failure is therefore printed rather than silently
    discarded, so that an unintended re-ingestion is at least visible.
    """
    try:
        r = http.get(
            f"{REST}/documents",
            headers={**SB_HDR, "Prefer": "return=representation"},
            params={"select": "metadata->test_name_en", "metadata->>source": "eq.MedlinePlus+Wikipedia"},
            timeout=15,
        )
        if r.ok:
            return {row.get("test_name_en") for row in r.json() if row.get("test_name_en")}
        print(f"  [DB error] existing-test lookup failed: {r.status_code} — {r.text[:150]}")
    except Exception as e:
        # Deliberately broad: the lookup is best-effort and must not abort the run.
        print(f"  [DB error] existing-test lookup failed: {e}")
    return set()


def _count_documents() -> int:
    """Return the row count of the ``documents`` table as reported in Content-Range.

    Falls back to 0 when the header is missing or its total is not a number.
    Network errors from the request are not caught.
    """
    r = http.get(
        f"{REST}/documents",
        headers={**SB_HDR, "Prefer": "count=exact", "Range": "0-0"},
        timeout=10,
    )
    total = r.headers.get("Content-Range", "0/0").split("/")[-1]
    try:
        return int(total)
    except ValueError:
        # The count is only informational; an unexpected header must not abort the run.
        return 0


def main():
    """Ingest every lab test in LAB_TESTS that is not already in the database.

    For each new test: fetch the Wikipedia and MedlinePlus texts, build the
    chunks with Groq, insert them, then pause 4 seconds. Prints progress per
    test plus the database size before and after the run.
    """
    print("تحميل نموذج الـ Embeddings...")
    emb = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    # Current number of stored records.
    current = _count_documents()
    print(f"قاعدة البيانات حالياً: {current} chunk\n")

    existing = get_existing_test_names()
    print(f"تحاليل موجودة مسبقاً: {len(existing)}\n")

    total_added = 0
    total_tests = len(LAB_TESTS)
    for i, test in enumerate(LAB_TESTS, start=1):
        if test['en'] in existing:
            print(f"[{i}/{total_tests}] تخطي (موجود): {test['ar']}")
            continue
        print(f"[{i}/{total_tests}] {test['ar']} ({test['en']})")

        wiki_ar   = get_wikipedia_ar(test['ar'])
        medline   = get_medlineplus(test['en'])
        chunks    = build_chunks(test, wiki_ar, medline)

        if not chunks:
            print(f"  - فشل توليد المحتوى")
        elif insert_chunks(emb, chunks) is False:
            # Only an explicit False counts as a failed insert; any other
            # return value (including None) is treated as success.
            print("  - فشل الإدراج في قاعدة البيانات")
        else:
            total_added += len(chunks)
            print(f"  + أضفت {len(chunks)} chunks")

        time.sleep(4)  # respect the Groq rate limit

    print(f"\nاكتمل الاستيعاب: أضفنا {total_added} chunk جديد")
    final = _count_documents()
    print(f"إجمالي قاعدة البيانات الآن: {final} chunk")


# Run the ingestion only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()