import json
import os
import re
import subprocess
import sys

# ── Install llama-cpp-python at runtime ───────────────────────────────────────
# The wheel index is chosen from the CUDA version that nvidia-smi reports, so
# the probe has to run before pip is invoked.
try:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
except OSError as exc:
    # nvidia-smi is missing or not executable. Without this guard the script
    # crashed here and the "cpu" branch below could never be reached.
    print(f"nvidia-smi unavailable: {exc}")
    smi_output = ""
else:
    smi_output = result.stdout
    print(smi_output)

match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", smi_output)
if match:
    major, minor = match.group(1), match.group(2)
    cuda_ver = f"cu{major}{minor}"
else:
    cuda_ver = "cpu"
print(f"CUDA detected: {cuda_ver}")

# Detected CUDA tag -> wheel index tag to install from. Tags newer than cu124
# are pointed at the cu122 index.
# NOTE(review): presumably because no dedicated index exists for them — confirm.
wheel_map = {
    "cu130": "cu122",
    "cu128": "cu122",
    "cu126": "cu122",
    "cu124": "cu124",
    "cu122": "cu122",
    "cu121": "cu121",
    "cu118": "cu118",
}
# Any tag not listed above (including "cpu") falls back to the cu122 index.
wheel_tag = wheel_map.get(cuda_ver, "cu122")

# check=True aborts startup if the install fails.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "llama-cpp-python",
        "--extra-index-url", f"https://abetlen.github.io/llama-cpp-python/whl/{wheel_tag}",
        "--no-cache-dir",
        "-q",
    ],
    check=True,
)
print("llama-cpp-python installed.")

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download, login

# ── Auth ──────────────────────────────────────────────────────────────────────
# Log in to the Hub only when a token is supplied through the environment.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Model location and llama.cpp settings, each overridable via the environment.
MODEL_REPO = os.environ.get("MODEL_REPO", "build-small-hackathon/legacystribe-Qwen3.5-9B.Q4_K_M")
MODEL_FILE = os.environ.get("MODEL_FILE", "Qwen3.5-9B.Q4_K_M.gguf")
N_CTX = int(os.environ.get("N_CTX", "4096"))
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "-1"))

# ── Download model once at startup ───────────────────────────────────────────
print("Downloading model...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"Model downloaded to {model_path}")

# ── Load model lazily (inside GPU function) ───────────────────────────────────
llm = None


def get_llm():
    """Return the shared Llama instance, constructing it on the first call.

    The instance is cached in the module-level ``llm`` variable, so later
    calls return the same object without reloading the model file.
    """
    global llm
    if llm is None:
        # Imported here rather than at module top: the package is only
        # installed by the pip step above.
        from llama_cpp import Llama

        print("Loading model into llama.cpp...")
        llm = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_gpu_layers=N_GPU_LAYERS,
            verbose=False,
        )
        print("Model ready.")
    return llm


# ── System prompts ────────────────────────────────────────────────────────────
# One system prompt per agent name accepted by infer().
SYSTEM_PROMPTS = {
    "questioner": (
        "/no_think "
        "You are a gentle memory guide helping an elderly person tell their life story. "
        "Ask exactly one warm, open follow-up question. Never ask more than one question. "
        "Be patient, kind, and culturally sensitive to Nepali and South Asian contexts."
    ),
    "extractor": (
        "/no_think "
        "You are an extractor agent. Given a memory fragment, extract structured information "
        "as JSON with keys relevant to the content (who, when, where, what, emotion). "
        "Output only valid JSON, nothing else."
    ),
    "arcdetector": (
        "/no_think "
        "You are an arc detector agent. Given a memory fragment, identify the narrative stage. "
        "Output one word only: setup, tension, turn, or meaning."
    ),
    "publisher": (
        "You are a publisher agent. Given memory notes, synthesize them into a single warm, "
        "narrative paragraph suitable for a family memory book. Write in first person. "
        "Use natural, unhurried language. Output only the paragraph, nothing else."
    ),
}

# Per-agent generation settings used when the caller does not override them.
AGENT_DEFAULTS = {
    "questioner": {"max_tokens": 2048, "temp": 0.7},
    "extractor": {"max_tokens": 2048, "temp": 0.1},
    "arcdetector": {"max_tokens": 1024, "temp": 0.1},
    "publisher": {"max_tokens": 2048, "temp": 0.4},
}


# ── Inference ─────────────────────────────────────────────────────────────────
@spaces.GPU
def infer(agent: str, user_text: str, max_tokens: float = -1, temp: float = -1.0) -> str:
    """Run one agent prompt through the model and return the raw reply.

    Args:
        agent: Key of SYSTEM_PROMPTS selecting the system prompt.
        user_text: Text sent as the user message, prefixed with "/no_think".
        max_tokens: Completion token limit. A negative value or None selects
            the agent's entry in AGENT_DEFAULTS.
        temp: Sampling temperature. A negative value or None selects the
            agent's entry in AGENT_DEFAULTS.

    Returns:
        The stripped message content of the first choice, or a JSON object
        string with an "error" key when the agent is unknown or the
        completion call raises.
    """
    if agent not in SYSTEM_PROMPTS:
        return json.dumps({"error": f"unknown agent: {agent}"})

    defaults = AGENT_DEFAULTS.get(agent, {"max_tokens": 256, "temp": 0.4})
    # None is treated like a negative value; previously the `< 0` comparison
    # raised TypeError on None before the try block could report it.
    if max_tokens is None or max_tokens < 0:
        _max_tokens = defaults["max_tokens"]
    else:
        _max_tokens = int(max_tokens)
    if temp is None or temp < 0:
        _temp = defaults["temp"]
    else:
        _temp = float(temp)

    try:
        response = get_llm().create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPTS[agent]},
                {"role": "user", "content": f"/no_think\n{user_text}"},
            ],
            max_tokens=_max_tokens,
            temperature=_temp,
        )
        raw = response["choices"][0]["message"]["content"].strip()
        print(f"[RAW:{agent}] {repr(raw)}")
        return raw
    except Exception as e:
        # API boundary: failures are reported to the caller as a JSON string
        # instead of propagating.
        return json.dumps({"error": str(e)})


def health() -> str:
    """Return the constant string "ok"."""
    return "ok"


def clean_response(text: str) -> str:
    """Remove a leading "thinking" section from a model reply.

    If the text contains a closing ``</think>`` tag, everything up to and
    including the first such tag is dropped. Otherwise, if the text starts
    with "Thinking Process:", everything up to the first blank-line break
    that is not followed by a numbered item ("1." or "1)") is dropped; when
    no such break exists the result is empty. Any other text is returned
    unchanged apart from whitespace stripping.

    NOTE(review): nothing in the visible part of the file calls this helper;
    confirm whether callers of infer() are expected to apply it.
    """
    # The separator was an empty string here, which made str.split raise
    # ValueError on every call; the closing tag is the intended delimiter.
    if "</think>" in text:
        text = text.split("</think>", 1)[1]
    elif text.startswith("Thinking Process:"):
        # No closing tag — find the first non-numbered paragraph after the list
        parts = re.split(r'\n\n(?!\d+[\.\)])', text, maxsplit=1)
        text = parts[1] if len(parts) > 1 else ''
    return text.strip()


# ── Gradio ────────────────────────────────────────────────────────────────────
with gr.Blocks(title="LegacyScribe Backend") as demo:
    gr.Markdown("## LegacyScribe Inference Backend\nCalled by the org Space frontend via `gradio_client`.")
    with gr.Row():
        with gr.Column():
            agent_in = gr.Textbox(label="agent")
            text_in = gr.Textbox(label="user_text", lines=4)
            # -1 tells infer() to use the agent's defaults.
            tokens_in = gr.Number(label="max_tokens", value=-1)
            temp_in = gr.Number(label="temp", value=-1.0)
            infer_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="Response", lines=6)
            health_btn = gr.Button("Health check")
            health_out = gr.Textbox(label="Status")

    # api_name fixes the endpoint names exposed to API clients.
    infer_btn.click(fn=infer, inputs=[agent_in, text_in, tokens_in, temp_in], outputs=out, api_name="predict")
    health_btn.click(fn=health, inputs=[], outputs=health_out, api_name="health")

demo.launch(ssr_mode=False)