# trace-field-notes / model_runtime.py
# (Page residue from the file viewer, kept as comments; not part of the module.)
# JacobLinCool's picture
# feat: serve designer React frontend via gradio.Server on ZeroGPU
# bd351d2 verified
# Raw
# History Blame
# 6.7 kB
"""Local small-model assistance for Trace Field Notes on Hugging Face ZeroGPU.
The analysis models run on the Space GPU through ``transformers``. Heavy imports
(``torch``, ``transformers``) are loaded lazily inside the generator so that the
deterministic analyzer, the test suite, and local development keep working
without GPU dependencies installed. If a model cannot be loaded or its output is
not valid JSON, :func:`analyzer.analyze_trace_file` falls back to the
deterministic codebook and records the reason in the model notes.
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Callable
from schemas import AnalysisResult
# Model identifiers handed to ``from_pretrained`` when an engine is selected.
PRIMARY_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
QUICK_MODEL_ID = "Qwen/Qwen3.5-9B"

# Analysis engines keyed by engine name. Each entry holds a human-readable
# label and the model to use; ``model_id`` is ``None`` for the rule-based
# engine, which needs no model.
MODEL_CHOICES = {
    "qwen": {
        "label": "Qwen3.5 9B — quick analysis",
        "model_id": QUICK_MODEL_ID,
    },
    "nemotron": {
        "label": "NVIDIA Nemotron 3 Nano 30B-A3B — deeper analysis",
        "model_id": PRIMARY_MODEL_ID,
    },
    "deterministic": {
        "label": "Rule-based — instant, no model",
        "model_id": None,
    },
}

# Shape of a text generator callable:
# (messages, *, model_id, max_new_tokens) -> raw model text.
GenerateFn = Callable[..., str]

# Loaded (tokenizer, model) pairs keyed by model ID, filled by ``_load_model``.
_MODEL_CACHE: dict[str, Any] = {}
@dataclass(slots=True)
class ModelAssistResult:
model_id: str
memo: dict[str, Any]
note: str
def model_id_for_engine(engine: str) -> str | None:
    """Look up the model ID configured for an analysis engine.

    Args:
        engine: Key into ``MODEL_CHOICES``.

    Returns:
        The model ID as a string, or ``None`` when the engine is unknown or
        its entry has no model.
    """
    entry = MODEL_CHOICES.get(engine)
    configured = entry["model_id"] if entry else None
    return str(configured) if configured else None
def run_model_assist(
    *,
    engine: str,
    result: AnalysisResult,
    narrative_text: str,
    generate: GenerateFn | None = None,
) -> ModelAssistResult:
    """Ask the engine's model for a JSON memo grounded in the analysis.

    Args:
        engine: Key into ``MODEL_CHOICES`` selecting the model.
        result: Deterministic analysis that is embedded in the prompt.
        narrative_text: Narrative text; an excerpt is embedded in the prompt.
        generate: Optional text generator to use instead of the local
            ``transformers`` generator.

    Returns:
        The validated memo together with the model ID and a status note.

    Raises:
        ValueError: If the engine has no model configured, or the model reply
            cannot be parsed into the expected memo.
    """
    model_id = model_id_for_engine(engine)
    if not model_id:
        raise ValueError(f"No model is configured for analysis engine {engine!r}.")

    user_prompt = build_model_prompt(result, narrative_text)
    system_message = {
        "role": "system",
        "content": (
            "You analyze visible coding-agent narrative messages. "
            "Do not infer hidden reasoning. Return JSON only."
        ),
    }
    user_message = {"role": "user", "content": user_prompt}

    # A caller-supplied generator takes precedence over the local one.
    produce_text = generate if generate else _local_generator
    raw_reply = produce_text(
        [system_message, user_message],
        model_id=model_id,
        max_new_tokens=900,
    )
    return ModelAssistResult(
        model_id=model_id,
        memo=parse_model_json(raw_reply),
        note=f"Model assist completed on the Space GPU with {model_id}.",
    )
def _local_generator(
    messages: list[dict[str, str]],
    *,
    model_id: str,
    max_new_tokens: int,
) -> str:
    """Generate a completion for *messages* with a locally loaded model.

    ``torch`` is imported lazily so that it only has to be installed where
    this function actually runs.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        model_id: Identifier of the model to load (or fetch from the cache).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (prompt tokens removed), with special
        tokens skipped.
    """
    import torch

    tokenizer, model = _load_model(model_id)

    # Ask for a dict explicitly: the default of ``return_dict`` is not the
    # same in every transformers release (verify against the installed
    # version), and a dict also carries the attention mask for ``generate``.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # ``generate`` returns prompt + completion; keep only the new tokens.
    completion = generated[0][prompt_length:]
    return tokenizer.decode(completion, skip_special_tokens=True)
def _load_model(model_id: str) -> Any:
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use.

    Pairs are stored in ``_MODEL_CACHE`` so that only the first call for a
    given model pays the load cost. The model is placed on ``"cuda"`` in
    bfloat16, so this must be called where CUDA is available (on ZeroGPU that
    is expected to be inside the ``@spaces.GPU`` context).

    Args:
        model_id: Identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    pair = _MODEL_CACHE.get(model_id)
    if pair is None:
        # Heavy dependencies are imported only when a load is really needed.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # NOTE: trust_remote_code=True allows code shipped in the model
        # repository to run during loading.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="cuda",
            trust_remote_code=True,
        )
        model.eval()
        pair = (tokenizer, model)
        _MODEL_CACHE[model_id] = pair
    return pair
def build_model_prompt(result: AnalysisResult, narrative_text: str) -> str:
    """Compose the user prompt sent to the analysis model.

    Args:
        result: Deterministic analysis; its ``to_dict()`` output is embedded
            as indented JSON.
        narrative_text: Narrative text; only the first 12000 characters are
            embedded.

    Returns:
        The prompt: instructions, the analysis JSON, then the excerpt.
    """
    analysis_json = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    excerpt = narrative_text[:12000]
    return f"""Use the deterministic codebook analysis and redacted visible narrative below.
Return JSON with exactly these keys:
- executive_memo: 4-6 sentences for a developer
- detour_memo: 2-4 sentences about productive detours vs wandering
- outcome_audit_memo: 2-4 sentences about completion claims and caveats
- caveats: array of short strings
Rules:
- Analyze only visible narrative messages.
- Do not claim to know hidden reasoning.
- Cite episode IDs where useful.
- Do not include raw secrets, tool outputs, or long quotes.
Deterministic analysis:
{analysis_json}
Redacted narrative excerpt:
{excerpt}
"""
def parse_model_json(content: str) -> dict[str, Any]:
    """Parse the model reply and validate it as a memo object.

    Args:
        content: Raw model text. The JSON may be wrapped in prose, a code
            fence, or follow a ``</think>``-terminated preamble.

    Returns:
        The parsed object. ``caveats`` is cut to its first six entries, each
        converted to ``str``; all other keys are returned as parsed.

    Raises:
        ValueError: If no JSON object can be recovered, or a required key is
            missing or has the wrong type.
    """
    parsed = _loads_lenient(content)
    required = {
        "executive_memo": str,
        "detour_memo": str,
        "outcome_audit_memo": str,
        "caveats": list,
    }
    for key, expected_type in required.items():
        # A missing key yields None, which matches neither str nor list.
        if not isinstance(parsed.get(key), expected_type):
            raise ValueError(f"Model response missing {key!r} as {expected_type.__name__}.")
    parsed["caveats"] = [str(item) for item in parsed["caveats"][:6]]
    return parsed


def _loads_lenient(content: str) -> dict[str, Any]:
    """Parse JSON from a model that may wrap it in prose or code fences.

    Strategies are tried in order, and the first that succeeds wins:

    1. Parse the whole text (after removing a fence that encloses all of it).
    2. Parse the span from the first ``{`` to the last ``}``.
    3. Scan the text after the last ``</think>`` marker (or the whole text if
       there is none) for the first ``{`` at which a JSON object decodes.
       This recovers replies whose surrounding prose contains stray braces.

    Raises:
        ValueError: If the content is empty, no JSON can be recovered, or the
            parsed value is not a JSON object.
    """
    if not isinstance(content, str) or not content.strip():
        raise ValueError("Model response content was empty.")
    text = content.strip()
    fence = re.match(r"^```[a-zA-Z0-9]*\s*(.*?)\s*```$", text, re.DOTALL)
    if fence:
        text = fence.group(1).strip()
    try:
        parsed: Any = json.loads(text)
    except json.JSONDecodeError as whole_error:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end == -1 or end <= start:
            raise ValueError("Model response was not valid JSON.") from whole_error
        try:
            parsed = json.loads(text[start : end + 1])
        except json.JSONDecodeError as span_error:
            # Stray braces outside the answer break the first-to-last span.
            # Skip any reasoning preamble, then look for a decodable object.
            answer = text.rpartition("</think>")[2]
            parsed = _scan_for_json_object(answer)
            if parsed is None:
                raise ValueError("Model response was not valid JSON.") from span_error
    if not isinstance(parsed, dict):
        raise ValueError("Model response was not a JSON object.")
    return parsed


def _scan_for_json_object(text: str) -> dict[str, Any] | None:
    """Return the first JSON object that decodes at some ``{`` in *text*, else ``None``."""
    decoder = json.JSONDecoder()
    position = text.find("{")
    while position != -1:
        try:
            # raw_decode tolerates trailing text after the decoded value.
            candidate, _ = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            position = text.find("{", position + 1)
        else:
            return candidate
    return None