SPIDEY
Try sending full raw string to frontend
6842ace
Raw
History Blame Contribute Delete
6.8 kB
import subprocess, sys, re, os, json
# ── Install llama-cpp-python at runtime ───────────────────────────────────────
# Ask the NVIDIA driver which CUDA version it reports so that a matching
# prebuilt wheel index can be chosen below.
try:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    smi_output = result.stdout
except OSError:
    # nvidia-smi is missing or not executable on this host. Without this guard
    # the script crashed here and the "cpu" fallback below was unreachable.
    smi_output = ""
print(smi_output)

match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", smi_output)
if match:
    major, minor = match.group(1), match.group(2)
    cuda_ver = f"cu{major}{minor}"
else:
    cuda_ver = "cpu"
print(f"CUDA detected: {cuda_ver}")

# Detected CUDA tag -> wheel index directory to install from. Tags newer than
# cu124 are mapped onto the cu122 index.
wheel_map = {
    "cu130": "cu122",
    "cu128": "cu122",
    "cu126": "cu122",
    "cu124": "cu124",
    "cu122": "cu122",
    "cu121": "cu121",
    "cu118": "cu118",
}
# Any tag not listed above (including "cpu") also falls back to cu122.
wheel_tag = wheel_map.get(cuda_ver, "cu122")

# check=True aborts startup if pip fails, since nothing below can work
# without llama-cpp-python.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "llama-cpp-python",
        "--extra-index-url", f"https://abetlen.github.io/llama-cpp-python/whl/{wheel_tag}",
        "--no-cache-dir", "-q",
    ],
    check=True,
)
print("llama-cpp-python installed.")
import gradio as gr
import spaces
from huggingface_hub import hf_hub_download, login
# ── Auth ──────────────────────────────────────────────────────────────────────
# Log in to the Hugging Face Hub only when HF_TOKEN is set (and non-empty).
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Runtime configuration; every value can be overridden via the environment.
MODEL_REPO = os.getenv("MODEL_REPO", "build-small-hackathon/legacystribe-Qwen3.5-9B.Q4_K_M")
MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3.5-9B.Q4_K_M.gguf")
N_CTX = int(os.getenv("N_CTX", "4096"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))

# ── Download model once at startup ───────────────────────────────────────────
# Runs at import time; the returned local path is later handed to get_llm().
print("Downloading model...")
model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)
print(f"Model downloaded to {model_path}")
# ── Load model lazily (inside GPU function) ───────────────────────────────────
# Module-level cache for the Llama instance; filled on the first get_llm() call.
llm = None


def get_llm():
    """Return the shared ``Llama`` instance, constructing it on first use.

    ``llama_cpp`` is imported inside the function, so the import only happens
    when the model is first needed. Later calls return the cached object.
    """
    global llm
    if llm is not None:
        return llm

    from llama_cpp import Llama

    print("Loading model into llama.cpp...")
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_gpu_layers=N_GPU_LAYERS,
        verbose=False,
    )
    print("Model ready.")
    return llm
# ── System prompts ────────────────────────────────────────────────────────────
# Agent name -> system message. Adjacent string literals are concatenated, so
# each prompt is written one sentence per line; trailing spaces are significant.
SYSTEM_PROMPTS = {
    "questioner": (
        "/no_think "
        "You are a gentle memory guide helping an elderly person tell their life story. "
        "Ask exactly one warm, open follow-up question. "
        "Never ask more than one question. "
        "Be patient, kind, and culturally sensitive to Nepali and South Asian contexts."
    ),
    "extractor": (
        "/no_think "
        "You are an extractor agent. "
        "Given a memory fragment, extract structured information as JSON "
        "with keys relevant to the content (who, when, where, what, emotion). "
        "Output only valid JSON, nothing else."
    ),
    "arcdetector": (
        "/no_think "
        "You are an arc detector agent. "
        "Given a memory fragment, identify the narrative stage. "
        "Output one word only: setup, tension, turn, or meaning."
    ),
    # Unlike the other agents, this prompt carries no "/no_think " prefix.
    "publisher": (
        "You are a publisher agent. "
        "Given memory notes, synthesize them into a single warm, narrative paragraph "
        "suitable for a family memory book. "
        "Write in first person. "
        "Use natural, unhurried language. "
        "Output only the paragraph, nothing else."
    ),
}
# Per-agent generation defaults, used by infer() when the caller passes a
# negative max_tokens / temp.
AGENT_DEFAULTS = {
    "questioner": dict(max_tokens=2048, temp=0.7),
    "extractor": dict(max_tokens=2048, temp=0.1),
    "arcdetector": dict(max_tokens=1024, temp=0.1),
    "publisher": dict(max_tokens=2048, temp=0.4),
}
# ── Inference ─────────────────────────────────────────────────────────────────
@spaces.GPU
def infer(agent: str, user_text: str, max_tokens: float = -1, temp: float = -1.0) -> str:
    """Run one chat completion for the given agent and return the raw reply.

    Args:
        agent: Key into ``SYSTEM_PROMPTS`` selecting the system message.
        user_text: User message; it is sent prefixed with ``"/no_think\\n"``.
        max_tokens: Completion token limit. A negative value or ``None``
            selects the agent's entry in ``AGENT_DEFAULTS``.
        temp: Sampling temperature. A negative value or ``None`` selects the
            agent's entry in ``AGENT_DEFAULTS``.

    Returns:
        The stripped message content of the first choice. On failure, a JSON
        string of the form ``{"error": "..."}`` is returned; this function
        does not raise for an unknown agent or a failed completion.
    """
    if agent not in SYSTEM_PROMPTS:
        return json.dumps({"error": f"unknown agent: {agent}"})

    defaults = AGENT_DEFAULTS.get(agent, {"max_tokens": 256, "temp": 0.4})

    # A cleared gr.Number field is delivered as None; comparing None < 0 would
    # raise TypeError before the try block, so treat None like the negative
    # "use default" sentinel.
    if max_tokens is None or max_tokens < 0:
        _max_tokens = defaults["max_tokens"]
    else:
        _max_tokens = int(max_tokens)
    if temp is None or temp < 0:
        _temp = defaults["temp"]
    else:
        _temp = float(temp)

    try:
        response = get_llm().create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPTS[agent]},
                {"role": "user", "content": f"/no_think\n{user_text}"},
            ],
            max_tokens=_max_tokens,
            temperature=_temp,
        )
        raw = response["choices"][0]["message"]["content"].strip()
        print(f"[RAW:{agent}] {repr(raw)}")
        return raw
    except Exception as e:
        # API boundary: report the failure to the caller as JSON instead of
        # raising, and leave a trace in the server log.
        print(f"[ERROR:{agent}] {e!r}")
        return json.dumps({"error": str(e)})
def health() -> str:
    """Return the constant status string ``"ok"``."""
    status = "ok"
    return status
def clean_response(text: str) -> str:
    """Strip a leading "thinking" section from model output.

    If ``</think>`` occurs, everything up to and including its first
    occurrence is dropped. Otherwise, if the text starts with
    ``Thinking Process:``, everything up to the first blank-line break that is
    not followed by a numbered item (``1.`` / ``1)``) is dropped; when no such
    break exists the result is empty. Any other text passes through. The
    result is always whitespace-stripped.
    """
    _before, closing_tag, after = text.partition('</think>')
    if closing_tag:
        text = after
    elif text.startswith('Thinking Process:'):
        # No closing tag — find the first non-numbered paragraph after the list
        pieces = re.split(r'\n\n(?!\d+[\.\)])', text, maxsplit=1)
        text = pieces[1] if len(pieces) == 2 else ''
    return text.strip()
# ── Gradio ────────────────────────────────────────────────────────────────────
# NOTE(review): the source's indentation was lost; the nesting below (health
# widgets at Blocks level, outside the Row) is reconstructed — confirm.
with gr.Blocks(title="LegacyScribe Backend") as demo:
    gr.Markdown(
        "## LegacyScribe Inference Backend\n"
        "Called by the org Space frontend via `gradio_client`."
    )

    with gr.Row():
        # Left column: inputs for infer() plus the trigger button.
        with gr.Column():
            agent_in = gr.Textbox(label="agent")
            text_in = gr.Textbox(label="user_text", lines=4)
            tokens_in = gr.Number(label="max_tokens", value=-1)
            temp_in = gr.Number(label="temp", value=-1.0)
            infer_btn = gr.Button("Run", variant="primary")
        # Right column: the model response.
        with gr.Column():
            out = gr.Textbox(label="Response", lines=6)

    health_btn = gr.Button("Health check")
    health_out = gr.Textbox(label="Status")

    # api_name sets the endpoint names these handlers are registered under.
    infer_btn.click(
        fn=infer,
        inputs=[agent_in, text_in, tokens_in, temp_in],
        outputs=out,
        api_name="predict",
    )
    health_btn.click(
        fn=health,
        inputs=[],
        outputs=health_out,
        api_name="health",
    )

demo.launch(ssr_mode=False)