import subprocess, sys, re, os, json

# ── Install llama-cpp-python at runtime ───────────────────────────────────────
# Ask the NVIDIA driver which CUDA version it reports. On a host without the
# NVIDIA tooling, launching `nvidia-smi` raises OSError (FileNotFoundError);
# treat that the same as "no CUDA version reported" instead of crashing at
# startup before the fallback below can apply.
try:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    smi_output = result.stdout
except OSError as exc:
    print(f"nvidia-smi unavailable: {exc}")
    smi_output = ""
print(smi_output)
match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", smi_output)
if match:
    major, minor = match.group(1), match.group(2)
    cuda_ver = f"cu{major}{minor}"
else:
    cuda_ver = "cpu"
print(f"CUDA detected: {cuda_ver}")

# Detected CUDA tag -> wheel index tag used for the install below.
wheel_map = {
    "cu130": "cu122",
    "cu128": "cu122",
    "cu126": "cu122",
    "cu124": "cu124",
    "cu122": "cu122",
    "cu121": "cu121",
    "cu118": "cu118",
}
# Any tag not listed above falls back to the cu122 index.
# NOTE(review): that includes the "cpu" case -- confirm this is intended.
wheel_tag = wheel_map.get(cuda_ver, "cu122")
# check=True aborts startup if pip fails, so the later llama_cpp import
# cannot run against a missing package.
subprocess.run([
    sys.executable, "-m", "pip", "install", "llama-cpp-python",
    "--extra-index-url", f"https://abetlen.github.io/llama-cpp-python/whl/{wheel_tag}",
    "--no-cache-dir", "-q",
], check=True)
print("llama-cpp-python installed.")

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download, login

# ── Auth ──────────────────────────────────────────────────────────────────────
# Log in to the Hugging Face Hub only when a token is present in the environment.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Settings below can each be overridden through an environment variable of
# the same name; the second argument is the value used when it is unset.
MODEL_REPO = os.getenv(
    "MODEL_REPO",
    "build-small-hackathon/legacystribe-Qwen3.5-9B.Q4_K_M",
)
MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3.5-9B.Q4_K_M.gguf")
N_CTX = int(os.getenv("N_CTX", "4096"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))

# ── Fetch the model file at import time, before the UI is built ──────────────
print("Downloading model...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)
print(f"Model downloaded to {model_path}")

# ── Load model lazily (inside GPU function) ───────────────────────────────────
# Module-level cache for the loaded model; filled in by get_llm() on first use.
llm = None


def get_llm():
    """Return the shared Llama instance, constructing it on the first call.

    The model is built from the module-level ``model_path``, ``N_CTX`` and
    ``N_GPU_LAYERS`` values and stored in the global ``llm`` so later calls
    reuse it.
    """
    global llm
    if llm is not None:
        return llm

    # Imported here rather than at module top: the package is installed by
    # the runtime pip step earlier in this file.
    from llama_cpp import Llama

    print("Loading model into llama.cpp...")
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_gpu_layers=N_GPU_LAYERS,
        verbose=False,
    )
    print("Model ready.")
    return llm

# ── System prompts ────────────────────────────────────────────────────────────
# System prompt text for each agent name accepted by the inference endpoint.
SYSTEM_PROMPTS = {
    "questioner": (
        "/no_think You are a gentle memory guide helping an elderly person "
        "tell their life story. Ask exactly one warm, open follow-up question. "
        "Never ask more than one question. Be patient, kind, and culturally "
        "sensitive to Nepali and South Asian contexts."
    ),
    "extractor": (
        "/no_think You are an extractor agent. Given a memory fragment, "
        "extract structured information as JSON with keys relevant to the "
        "content (who, when, where, what, emotion). Output only valid JSON, "
        "nothing else."
    ),
    "arcdetector": (
        "/no_think You are an arc detector agent. Given a memory fragment, "
        "identify the narrative stage. Output one word only: setup, tension, "
        "turn, or meaning."
    ),
    # Unlike the three prompts above, this one carries no "/no_think" prefix.
    "publisher": (
        "You are a publisher agent. Given memory notes, synthesize them into "
        "a single warm, narrative paragraph suitable for a family memory book. "
        "Write in first person. Use natural, unhurried language. Output only "
        "the paragraph, nothing else."
    ),
}

# Per-agent generation settings used when the caller does not supply its own.
AGENT_DEFAULTS = {
    "questioner": dict(max_tokens=2048, temp=0.7),
    "extractor": dict(max_tokens=2048, temp=0.1),
    "arcdetector": dict(max_tokens=1024, temp=0.1),
    "publisher": dict(max_tokens=2048, temp=0.4),
}

# ── Inference ─────────────────────────────────────────────────────────────────
@spaces.GPU
def infer(agent: str, user_text: str, max_tokens: float = -1, temp: float = -1.0) -> str:
    """Run one chat completion for the named agent and return the model text.

    Args:
        agent: Key into SYSTEM_PROMPTS selecting the system prompt.
        user_text: User message; it is sent prefixed with "/no_think" and a
            newline.
        max_tokens: Completion token cap. ``None`` or a negative value selects
            the agent's entry in AGENT_DEFAULTS.
        temp: Sampling temperature. ``None`` or a negative value selects the
            agent's entry in AGENT_DEFAULTS.

    Returns:
        The stripped message content of the first choice, or a JSON string of
        the form ``{"error": ...}`` when the agent is unknown or the
        completion raises.
    """
    if agent not in SYSTEM_PROMPTS:
        return json.dumps({"error": f"unknown agent: {agent}"})
    defaults = AGENT_DEFAULTS.get(agent, {"max_tokens": 256, "temp": 0.4})
    # A cleared Gradio Number field is delivered as None; `None < 0` would
    # raise TypeError here, outside the try block, so check for it first.
    use_default_tokens = max_tokens is None or max_tokens < 0
    use_default_temp = temp is None or temp < 0
    _max_tokens = defaults["max_tokens"] if use_default_tokens else int(max_tokens)
    _temp = defaults["temp"] if use_default_temp else float(temp)
    try:
        response = get_llm().create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPTS[agent]},
                {"role": "user",   "content": f"/no_think\n{user_text}"},
            ],
            max_tokens=_max_tokens,
            temperature=_temp,
        )
        raw = response["choices"][0]["message"]["content"].strip()
        print(f"[RAW:{agent}] {repr(raw)}")
        # NOTE(review): clean_response() in this file is not applied here;
        # confirm whether the caller is expected to strip reasoning output.
        return raw
    except Exception as e:
        # Endpoint boundary: report the failure to the caller as JSON, and
        # also log it so the cause is visible in the server output.
        print(f"[ERROR:{agent}] {e!r}")
        return json.dumps({"error": str(e)})

def health() -> str:
    """Liveness probe: always returns the literal string "ok"."""
    status = "ok"
    return status

def clean_response(text: str) -> str:
    """Strip a model's reasoning preamble and return only the final answer.

    Two preamble shapes are recognised:

    * Text containing ``</think>``: everything up to and including the first
      closing tag is dropped.
    * Text that starts (after optional leading whitespace) with
      ``Thinking Process:`` and has no closing tag: everything up to the first
      blank line that is not followed by a numbered list item (``1.`` / ``1)``)
      is dropped. If no such blank line exists, the result is empty.

    Any other text is returned unchanged apart from whitespace trimming.

    Args:
        text: Raw model output.

    Returns:
        The answer portion with surrounding whitespace removed.
    """
    closing_tag = '</think>'
    if closing_tag in text:
        _, _, answer = text.partition(closing_tag)
        return answer.strip()

    # Ignore leading whitespace before testing the prefix: the result is
    # stripped anyway, and a leading newline must not let the whole reasoning
    # block through as if it were the answer.
    body = text.lstrip()
    if not body.startswith('Thinking Process:'):
        return body.strip()

    # No closing tag -- the answer begins after the first blank line that
    # does not introduce another numbered reasoning step.
    boundary = re.search(r'\n\n(?!\d+[\.\)])', body)
    if boundary is None:
        return ''
    return body[boundary.end():].strip()

# ── Gradio ────────────────────────────────────────────────────────────────────
# Build the UI: inputs for infer() in the left column, outputs and a
# health-check button in the right column.
with gr.Blocks(title="LegacyScribe Backend") as demo:
    gr.Markdown("## LegacyScribe Inference Backend\nCalled by the org Space frontend via `gradio_client`.")
    with gr.Row():
        with gr.Column():
            # Inputs, in the same order as infer()'s parameters.
            agent_in     = gr.Textbox(label="agent")
            text_in      = gr.Textbox(label="user_text", lines=4)
            # -1 defaults: infer() treats negative values as "use the
            # per-agent default".
            tokens_in    = gr.Number(label="max_tokens", value=-1)
            temp_in      = gr.Number(label="temp", value=-1.0)
            infer_btn    = gr.Button("Run", variant="primary")
        with gr.Column():
            out          = gr.Textbox(label="Response", lines=6)
            health_btn   = gr.Button("Health check")
            health_out   = gr.Textbox(label="Status")

    # Wire the buttons to their handlers; api_name sets the endpoint names
    # ("predict" and "health") under which these events are registered.
    infer_btn.click(fn=infer, inputs=[agent_in, text_in, tokens_in, temp_in], outputs=out, api_name="predict")
    health_btn.click(fn=health, inputs=[], outputs=health_out, api_name="health")

# Start the app when the module is executed (not guarded by __name__).
demo.launch(ssr_mode=False)