| import subprocess, sys, re, os, json |
|
|
| |
# Probe the NVIDIA driver to decide which prebuilt llama-cpp-python wheel to
# request. A missing or non-executable `nvidia-smi` binary is treated the same
# as "no CUDA version reported" instead of aborting the whole script.
try:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    smi_output = result.stdout
except OSError:
    # FileNotFoundError / PermissionError: the tool cannot be run on this host.
    result = None
    smi_output = ""
print(smi_output)

# A banner line such as "CUDA Version: 12.4" becomes the tag "cu124";
# when no such line is found the tag is "cpu".
match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", smi_output)
if match:
    major, minor = match.group(1), match.group(2)
    cuda_ver = f"cu{major}{minor}"
else:
    cuda_ver = "cpu"
print(f"CUDA detected: {cuda_ver}")
|
|
# Detected CUDA tag -> wheel index tag to request. Tags above cu124 reuse the
# cu122 build; the remaining tags map to themselves.
wheel_map = {tag: "cu122" for tag in ("cu130", "cu128", "cu126")}
wheel_map.update({tag: tag for tag in ("cu124", "cu122", "cu121", "cu118")})

# Any tag missing from the table also resolves to cu122.
# NOTE(review): this includes the "cpu" tag — confirm that is intended.
wheel_tag = wheel_map.get(cuda_ver, "cu122")

_wheel_index = f"https://abetlen.github.io/llama-cpp-python/whl/{wheel_tag}"
_pip_cmd = [
    sys.executable, "-m", "pip", "install", "llama-cpp-python",
    "--extra-index-url", _wheel_index,
    "--no-cache-dir", "-q",
]
# check=True: a failed install raises CalledProcessError and stops the script.
subprocess.run(_pip_cmd, check=True)
print("llama-cpp-python installed.")
|
|
| import gradio as gr |
| import spaces |
| from huggingface_hub import hf_hub_download, login |
|
|
| |
# Log in to the Hugging Face Hub only when HF_TOKEN is set to a non-empty
# value; otherwise continue without authenticating.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
|
|
# Runtime configuration. Each value can be overridden through the environment
# variable of the same name; the second argument is the built-in default.
MODEL_REPO = os.getenv("MODEL_REPO", "build-small-hackathon/legacystribe-Qwen3.5-9B.Q4_K_M")
MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3.5-9B.Q4_K_M.gguf")
# Both numeric settings are parsed with int(); a non-integer value raises
# ValueError at import time.
N_CTX = int(os.getenv("N_CTX", "4096"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))
|
|
| |
# Fetch the model file from the Hub at import time. The value returned by
# hf_hub_download is kept in model_path and later handed to Llama as the
# path of the model to load.
print("Downloading model...")
model_path = hf_hub_download(
    filename=MODEL_FILE,
    repo_id=MODEL_REPO,
)
print(f"Model downloaded to {model_path}")
|
|
| |
# Shared Llama instance; remains None until get_llm() is first called.
llm = None


def get_llm():
    """Return the shared llama.cpp model, constructing it on the first call.

    The ``llama_cpp`` import is deferred until a model is actually needed.
    Later calls return the instance stored in the module-level ``llm``.
    """
    global llm
    if llm is not None:
        return llm

    from llama_cpp import Llama

    print("Loading model into llama.cpp...")
    load_options = {
        "model_path": model_path,
        "n_ctx": N_CTX,
        "n_gpu_layers": N_GPU_LAYERS,
        "verbose": False,
    }
    llm = Llama(**load_options)
    print("Model ready.")
    return llm
|
|
| |
# Prefix carried by every system prompt below except "publisher".
# NOTE(review): presumably the model's switch for suppressing reasoning
# output — confirm against the model card.
_NO_THINK = "/no_think "

# System prompt for each agent name; the keys are the valid `agent` values.
SYSTEM_PROMPTS = {
    "questioner": _NO_THINK + (
        "You are a gentle memory guide helping an elderly person "
        "tell their life story. "
        "Ask exactly one warm, open follow-up question. "
        "Never ask more than one question. "
        "Be patient, kind, and culturally sensitive to Nepali "
        "and South Asian contexts."
    ),
    "extractor": _NO_THINK + (
        "You are an extractor agent. "
        "Given a memory fragment, extract structured information "
        "as JSON with keys relevant to the content "
        "(who, when, where, what, emotion). "
        "Output only valid JSON, nothing else."
    ),
    "arcdetector": _NO_THINK + (
        "You are an arc detector agent. "
        "Given a memory fragment, identify the narrative stage. "
        "Output one word only: setup, tension, turn, or meaning."
    ),
    "publisher": (
        "You are a publisher agent. "
        "Given memory notes, synthesize them into a single warm, "
        "narrative paragraph suitable for a family memory book. "
        "Write in first person. "
        "Use natural, unhurried language. "
        "Output only the paragraph, nothing else."
    ),
}
|
|
# Per-agent generation settings used when the caller does not override them:
# "max_tokens" is the completion length cap, "temp" the sampling temperature.
AGENT_DEFAULTS = {
    "questioner": dict(max_tokens=2048, temp=0.7),
    "extractor": dict(max_tokens=2048, temp=0.1),
    "arcdetector": dict(max_tokens=1024, temp=0.1),
    "publisher": dict(max_tokens=2048, temp=0.4),
}
|
|
| |
@spaces.GPU
def infer(agent: str, user_text: str, max_tokens: float = -1, temp: float = -1.0) -> str:
    """Run one chat completion for the named agent and return its reply.

    Args:
        agent: Key into SYSTEM_PROMPTS selecting the system prompt.
        user_text: User message; it is sent with a "/no_think" line prepended.
        max_tokens: Completion length cap. ``None`` or a negative value
            selects the agent's entry in AGENT_DEFAULTS.
        temp: Sampling temperature. ``None`` or a negative value selects the
            agent's entry in AGENT_DEFAULTS.

    Returns:
        The whitespace-stripped message content of the first choice. For an
        unknown agent, or when anything fails while preparing or running the
        completion, a JSON string of the form ``{"error": "..."}``.
    """
    if agent not in SYSTEM_PROMPTS:
        return json.dumps({"error": f"unknown agent: {agent}"})
    defaults = AGENT_DEFAULTS.get(agent, {"max_tokens": 256, "temp": 0.4})
    try:
        # An emptied numeric input can arrive as None, and `None < 0` raises
        # TypeError. Treat None like the negative "use default" sentinel, and
        # keep the parsing inside the try so bad values yield the JSON error.
        if max_tokens is None or max_tokens < 0:
            _max_tokens = defaults["max_tokens"]
        else:
            _max_tokens = int(max_tokens)
        if temp is None or temp < 0:
            _temp = defaults["temp"]
        else:
            _temp = float(temp)

        response = get_llm().create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPTS[agent]},
                {"role": "user", "content": f"/no_think\n{user_text}"},
            ],
            max_tokens=_max_tokens,
            temperature=_temp,
        )
        raw = response["choices"][0]["message"]["content"].strip()
        # NOTE(review): the reply is returned as-is; clean_response() in this
        # file is not applied here — confirm callers strip reasoning text.
        print(f"[RAW:{agent}] {repr(raw)}")
        return raw
    except Exception as e:
        # API boundary: report the failure to the caller as JSON, not a crash.
        return json.dumps({"error": str(e)})
|
|
def health() -> str:
    """Liveness probe: always returns the literal string ``"ok"``."""
    status = "ok"
    return status
|
|
def clean_response(text: str) -> str:
    """Drop a leading "thinking" preamble from *text* and strip whitespace.

    Two preamble shapes are recognised, checked in this order:

    * A ``</think>`` tag anywhere in the text: everything up to and including
      its first occurrence is discarded.
    * Text that begins with ``Thinking Process:``: everything up to the first
      blank line not followed by a numbered item (``1.`` or ``1)``) is
      discarded. If no such blank line exists the result is empty.

    Text matching neither shape is returned with only surrounding whitespace
    removed.
    """
    if '</think>' in text:
        return text.split('</think>', 1)[1].strip()

    if not re.match(r'^Thinking Process:', text):
        return text.strip()

    # maxsplit=1: only the first qualifying blank line ends the preamble, so
    # later blank lines inside the answer are preserved.
    pieces = re.split(r'\n\n(?!\d+[\.\)])', text, maxsplit=1)
    answer = pieces[1] if len(pieces) > 1 else ''
    return answer.strip()
|
|
| |
# Minimal UI around the two endpoints. Components are created in the same
# order as before; the nesting below is the layout implied by that order.
with gr.Blocks(title="LegacyScribe Backend") as demo:
    gr.Markdown("## LegacyScribe Inference Backend\nCalled by the org Space frontend via `gradio_client`.")
    with gr.Row():
        # Left column: inputs for infer(). -1 / -1.0 mean "use agent default".
        with gr.Column():
            agent_in = gr.Textbox(label="agent")
            text_in = gr.Textbox(label="user_text", lines=4)
            tokens_in = gr.Number(label="max_tokens", value=-1)
            temp_in = gr.Number(label="temp", value=-1.0)
            infer_btn = gr.Button("Run", variant="primary")
        # Right column: model reply plus the health probe.
        with gr.Column():
            out = gr.Textbox(label="Response", lines=6)
            health_btn = gr.Button("Health check")
            health_out = gr.Textbox(label="Status")

    # api_name sets the endpoint names these handlers are exposed under.
    infer_btn.click(
        fn=infer,
        inputs=[agent_in, text_in, tokens_in, temp_in],
        outputs=out,
        api_name="predict",
    )
    health_btn.click(
        fn=health,
        inputs=[],
        outputs=health_out,
        api_name="health",
    )


demo.launch(ssr_mode=False)