Spaces:

safalnarsingh
/

legacyscribe-backend

Paused

SPIDEY

Try sending full raw string to frontend

6842ace 4 days ago

6.8 kB

	import subprocess, sys, re, os, json

	# ── Install llama-cpp-python at runtime ───────────────────────────────────────
	# Ask the NVIDIA driver tool which CUDA version it reports so a matching
	# prebuilt llama-cpp-python wheel index can be selected.
	try:
	    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
	    smi_output = result.stdout
	except OSError as exc:
	    # nvidia-smi is missing or not executable (e.g. a CPU-only host).
	    # Treat that as "no CUDA detected" instead of crashing at startup.
	    print(f"nvidia-smi unavailable: {exc}")
	    result = None
	    smi_output = ""
	print(smi_output)

	match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", smi_output)
	if match:
	    major, minor = match.group(1), match.group(2)
	    cuda_ver = f"cu{major}{minor}"
	else:
	    cuda_ver = "cpu"
	print(f"CUDA detected: {cuda_ver}")

	# Detected CUDA tag -> wheel index tag to install from. Several newer
	# versions are pointed at the cu122 index (presumably because no dedicated
	# index exists for them - confirm against the wheel index before editing).
	wheel_map = {
	    "cu130": "cu122",
	    "cu128": "cu122",
	    "cu126": "cu122",
	    "cu124": "cu124",
	    "cu122": "cu122",
	    "cu121": "cu121",
	    "cu118": "cu118",
	}
	# Anything not listed above (including the "cpu" result) falls back to cu122.
	wheel_tag = wheel_map.get(cuda_ver, "cu122")

	# check=True aborts startup if the install fails, since nothing below can
	# work without llama-cpp-python.
	subprocess.run(
	    [
	        sys.executable, "-m", "pip", "install", "llama-cpp-python",
	        "--extra-index-url", f"https://abetlen.github.io/llama-cpp-python/whl/{wheel_tag}",
	        "--no-cache-dir", "-q",
	    ],
	    check=True,
	)
	print("llama-cpp-python installed.")

	import gradio as gr
	import spaces
	from huggingface_hub import hf_hub_download, login

	# ── Auth ──────────────────────────────────────────────────────────────────────
	# Authenticate against the Hugging Face Hub only when a token is configured.
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	    login(token=hf_token)

	# Runtime configuration; every value can be overridden via the environment.
	MODEL_REPO = os.getenv("MODEL_REPO", "build-small-hackathon/legacystribe-Qwen3.5-9B.Q4_K_M")
	MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3.5-9B.Q4_K_M.gguf")
	N_CTX = int(os.getenv("N_CTX", "4096"))
	N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))

	# ── Download model once at startup ───────────────────────────────────────────
	print("Downloading model...")
	model_path = hf_hub_download(
	    repo_id=MODEL_REPO,
	    filename=MODEL_FILE,
	)
	print(f"Model downloaded to {model_path}")

	# ── Load model lazily (inside GPU function) ───────────────────────────────────
	# Module-wide llama.cpp handle; stays None until get_llm() is first called.
	llm = None


	def get_llm():
	    """Return the shared Llama instance, building it on the first call."""
	    global llm
	    if llm is not None:
	        return llm

	    # Deferred import: llama_cpp is only needed once the model is loaded.
	    from llama_cpp import Llama

	    print("Loading model into llama.cpp...")
	    load_options = {
	        "model_path": model_path,
	        "n_ctx": N_CTX,
	        "n_gpu_layers": N_GPU_LAYERS,
	        "verbose": False,
	    }
	    llm = Llama(**load_options)
	    print("Model ready.")
	    return llm

	# ── System prompts ────────────────────────────────────────────────────────────
	# Prefix shared by the prompts that start with "/no_think " (presumably the
	# model's switch for suppressing its thinking output - confirm against the
	# model card). The publisher prompt deliberately has no prefix here.
	_NO_THINK = "/no_think "

	# Agent name -> system prompt sent as the first chat message.
	SYSTEM_PROMPTS = {
	    "questioner": _NO_THINK + (
	        "You are a gentle memory guide helping an elderly person tell their life story. "
	        "Ask exactly one warm, open follow-up question. "
	        "Never ask more than one question. "
	        "Be patient, kind, and culturally sensitive to Nepali and South Asian contexts."
	    ),
	    "extractor": _NO_THINK + (
	        "You are an extractor agent. "
	        "Given a memory fragment, extract structured information as JSON "
	        "with keys relevant to the content (who, when, where, what, emotion). "
	        "Output only valid JSON, nothing else."
	    ),
	    "arcdetector": _NO_THINK + (
	        "You are an arc detector agent. "
	        "Given a memory fragment, identify the narrative stage. "
	        "Output one word only: setup, tension, turn, or meaning."
	    ),
	    "publisher": (
	        "You are a publisher agent. "
	        "Given memory notes, synthesize them into a single warm, narrative paragraph "
	        "suitable for a family memory book. "
	        "Write in first person. "
	        "Use natural, unhurried language. "
	        "Output only the paragraph, nothing else."
	    ),
	}

	# Agent name -> generation settings used when the caller passes no override.
	AGENT_DEFAULTS = {
	    "questioner": dict(max_tokens=2048, temp=0.7),
	    "extractor": dict(max_tokens=2048, temp=0.1),
	    "arcdetector": dict(max_tokens=1024, temp=0.1),
	    "publisher": dict(max_tokens=2048, temp=0.4),
	}

	# ── Inference ─────────────────────────────────────────────────────────────────
	@spaces.GPU
	def infer(agent: str, user_text: str, max_tokens: float = -1, temp: float = -1.0) -> str:
	    """Run one chat completion for the given agent and return the raw model text.

	    Args:
	        agent: Key into SYSTEM_PROMPTS selecting the system prompt.
	        user_text: User message; it is sent prefixed with "/no_think" and a newline.
	        max_tokens: Generation cap. A negative value or None selects the
	            agent's default from AGENT_DEFAULTS.
	        temp: Sampling temperature. A negative value or None selects the
	            agent's default from AGENT_DEFAULTS.

	    Returns:
	        The stripped completion text, or a JSON string of the form
	        {"error": "..."} for an unknown agent or any failure during inference.
	    """
	    if agent not in SYSTEM_PROMPTS:
	        return json.dumps({"error": f"unknown agent: {agent}"})
	    defaults = AGENT_DEFAULTS.get(agent, {"max_tokens": 256, "temp": 0.4})
	    try:
	        # Resolved inside the try so that a None or non-numeric override is
	        # reported through the same JSON error shape instead of escaping as
	        # an unhandled exception. None is what an empty numeric field sends.
	        if max_tokens is None or max_tokens < 0:
	            _max_tokens = defaults["max_tokens"]
	        else:
	            _max_tokens = int(max_tokens)
	        if temp is None or temp < 0:
	            _temp = defaults["temp"]
	        else:
	            _temp = float(temp)

	        response = get_llm().create_chat_completion(
	            messages=[
	                {"role": "system", "content": SYSTEM_PROMPTS[agent]},
	                {"role": "user", "content": f"/no_think\n{user_text}"},
	            ],
	            max_tokens=_max_tokens,
	            temperature=_temp,
	        )
	        raw = response["choices"][0]["message"]["content"].strip()
	        print(f"[RAW:{agent}] {repr(raw)}")
	        return raw
	    except Exception as e:
	        # Boundary handler: callers receive the error as JSON, and the log
	        # line keeps the failure visible in the Space logs.
	        print(f"[ERROR:{agent}] {e!r}")
	        return json.dumps({"error": str(e)})

	def health() -> str:
	    """Liveness probe: always reports the fixed status string "ok"."""
	    status = "ok"
	    return status

	def clean_response(text: str) -> str:
	    """Strip a leading "thinking" section from model output.

	    If a closing think tag is present, everything up to and including its
	    first occurrence is dropped. Otherwise, if the text starts with
	    "Thinking Process:", everything up to the first blank-line break that is
	    not followed by a numbered list item is dropped (nothing remains when no
	    such break exists). The result is returned with surrounding whitespace
	    removed.
	    """
	    _, closing_tag, after_tag = text.partition('</think>')
	    if closing_tag:
	        return after_tag.strip()

	    if re.match(r'^Thinking Process:', text) is None:
	        return text.strip()

	    # No closing tag: skip past the numbered list to the first paragraph
	    # break that does not introduce another "N." / "N)" item.
	    paragraph_break = re.search(r'\n\n(?!\d+[\.\)])', text)
	    if paragraph_break is None:
	        return ''
	    return text[paragraph_break.end():].strip()

	# ── Gradio ────────────────────────────────────────────────────────────────────
	# Minimal UI around infer() and health(); each click handler is also
	# registered under an explicit api_name.
	with gr.Blocks(title="LegacyScribe Backend") as demo:
	    gr.Markdown(
	        "## LegacyScribe Inference Backend\nCalled by the org Space frontend via `gradio_client`."
	    )
	    with gr.Row():
	        # Left column: inputs, in the same order as infer()'s parameters.
	        with gr.Column():
	            agent_in = gr.Textbox(label="agent")
	            text_in = gr.Textbox(label="user_text", lines=4)
	            # -1 means "use the per-agent default" inside infer().
	            tokens_in = gr.Number(label="max_tokens", value=-1)
	            temp_in = gr.Number(label="temp", value=-1.0)
	            infer_btn = gr.Button("Run", variant="primary")
	        # Right column: inference output and the health probe.
	        with gr.Column():
	            out = gr.Textbox(label="Response", lines=6)
	            health_btn = gr.Button("Health check")
	            health_out = gr.Textbox(label="Status")

	    infer_btn.click(
	        fn=infer,
	        inputs=[agent_in, text_in, tokens_in, temp_in],
	        outputs=out,
	        api_name="predict",
	    )
	    health_btn.click(
	        fn=health,
	        inputs=[],
	        outputs=health_out,
	        api_name="health",
	    )

	demo.launch(ssr_mode=False)