Spaces:
Running on Zero
Running on Zero
File size: 6,701 Bytes
bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 c8055f7 bd351d2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | """Local small-model assistance for Trace Field Notes on Hugging Face ZeroGPU.
The analysis models run on the Space GPU through ``transformers``. Heavy imports
(``torch``, ``transformers``) are loaded lazily inside the generator so that the
deterministic analyzer, the test suite, and local development keep working
without GPU dependencies installed. If a model cannot be loaded or its output is
not valid JSON, :func:`analyzer.analyze_trace_file` falls back to the
deterministic codebook and records the reason in the model notes.
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Callable
from schemas import AnalysisResult
# Model identifiers handed to ``from_pretrained`` when an engine is selected.
PRIMARY_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
QUICK_MODEL_ID = "Qwen/Qwen3.5-9B"

# Analysis engines keyed by engine name. Each entry has a human-readable
# ``label`` and the ``model_id`` to load. The "deterministic" entry has no
# model (``model_id`` is None), so ``model_id_for_engine`` returns None for it.
MODEL_CHOICES = {
    "qwen": {
        "label": "Qwen3.5 9B — quick analysis",
        "model_id": QUICK_MODEL_ID,
    },
    "nemotron": {
        "label": "NVIDIA Nemotron 3 Nano 30B-A3B — deeper analysis",
        "model_id": PRIMARY_MODEL_ID,
    },
    "deterministic": {
        "label": "Rule-based — instant, no model",
        "model_id": None,
    },
}

# Signature of a text-generation backend:
# (messages, *, model_id, max_new_tokens) -> raw model text.
# ``run_model_assist`` accepts one of these so callers can substitute their own
# generator for the local GPU one.
GenerateFn = Callable[..., str]

# Loaded ``(tokenizer, model)`` pairs keyed by model ID; filled by
# ``_load_model`` so that repeated calls reuse the same objects.
_MODEL_CACHE: dict[str, Any] = {}
@dataclass(slots=True)
class ModelAssistResult:
    """Outcome of one completed model-assist run."""

    # ID of the model that generated the memo.
    model_id: str
    # Parsed JSON object returned by the model (the memo fields and caveats).
    memo: dict[str, Any]
    # Human-readable status line describing the run.
    note: str
def model_id_for_engine(engine: str) -> str | None:
    """Return the model ID configured for *engine*, or ``None`` if there is none.

    ``None`` is returned both for engine names that are not in
    ``MODEL_CHOICES`` and for engines whose ``model_id`` is empty (such as the
    rule-based engine).
    """
    entry = MODEL_CHOICES.get(engine)
    if entry:
        configured = entry["model_id"]
        if configured:
            return str(configured)
    return None
def run_model_assist(
    *,
    engine: str,
    result: AnalysisResult,
    narrative_text: str,
    generate: GenerateFn | None = None,
) -> ModelAssistResult:
    """Ask the model behind *engine* for a grounded memo and return it parsed.

    Args:
        engine: Key into ``MODEL_CHOICES`` selecting the model.
        result: Deterministic analysis that is serialized into the prompt.
        narrative_text: Visible narrative text; an excerpt is put in the prompt.
        generate: Optional generation backend. When omitted, the locally
            loaded model is used.

    Returns:
        A ``ModelAssistResult`` holding the model ID, the validated memo, and
        a status note.

    Raises:
        ValueError: If *engine* has no model configured, or if the model output
            cannot be parsed into the expected memo object.
    """
    model_id = model_id_for_engine(engine)
    if not model_id:
        raise ValueError(f"No model is configured for analysis engine {engine!r}.")

    system_turn = {
        "role": "system",
        "content": (
            "You analyze visible coding-agent narrative messages. "
            "Do not infer hidden reasoning. Return JSON only."
        ),
    }
    user_turn = {"role": "user", "content": build_model_prompt(result, narrative_text)}

    # A caller-supplied backend takes precedence over the local GPU generator.
    backend = generate or _local_generator
    raw_text = backend([system_turn, user_turn], model_id=model_id, max_new_tokens=900)

    return ModelAssistResult(
        model_id=model_id,
        memo=parse_model_json(raw_text),
        note=f"Model assist completed on the Space GPU with {model_id}.",
    )
def _local_generator(
    messages: list[dict[str, str]],
    *,
    model_id: str,
    max_new_tokens: int,
) -> str:
    """Generate a completion for *messages* with a locally loaded model.

    ``torch`` is imported lazily so that it only needs to be installed where
    generation actually runs, never for the deterministic path, tests, or
    local development.

    Args:
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts.
        model_id: ID of the model to load (or fetch from the cache).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens
        skipped).
    """
    import torch

    tokenizer, model = _load_model(model_id)

    # Ask for a dict-style encoding explicitly. Whether ``apply_chat_template``
    # returns a bare tensor or a BatchEncoding depends on the ``return_dict``
    # default, which is not the same across transformers releases; being
    # explicit keeps the code below valid either way and also provides the
    # attention mask to ``generate``.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
        )

    # ``generate`` returns prompt + continuation; keep only the continuation.
    completion = generated[0][prompt_length:]
    return tokenizer.decode(completion, skip_special_tokens=True)
def _load_model(model_id: str) -> Any:
    """Return the ``(tokenizer, model)`` pair for *model_id*, loading it once.

    The pair is stored in ``_MODEL_CACHE`` so only the first call per model
    pays the load cost. The model is loaded in bfloat16 onto CUDA; per the
    original notes, ZeroGPU exposes CUDA inside the ``@spaces.GPU`` context,
    which is where this is expected to run.
    """
    pair = _MODEL_CACHE.get(model_id)
    if pair is None:
        # Heavy dependencies are imported only on a cache miss.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # NOTE: trust_remote_code=True lets the model repository supply its own
        # Python code for the tokenizer/model classes.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="cuda",
            trust_remote_code=True,
        )
        model.eval()

        pair = (tokenizer, model)
        _MODEL_CACHE[model_id] = pair
    return pair
def build_model_prompt(result: AnalysisResult, narrative_text: str) -> str:
    """Build the user prompt sent to the model.

    The prompt contains the memo instructions, the deterministic analysis
    serialized as indented JSON, and at most the first 12000 characters of
    *narrative_text*.
    """
    analysis_json = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    excerpt = narrative_text[:12000]
    prompt_lines = (
        "Use the deterministic codebook analysis and redacted visible narrative below.",
        "Return JSON with exactly these keys:",
        "- executive_memo: 4-6 sentences for a developer",
        "- detour_memo: 2-4 sentences about productive detours vs wandering",
        "- outcome_audit_memo: 2-4 sentences about completion claims and caveats",
        "- caveats: array of short strings",
        "Rules:",
        "- Analyze only visible narrative messages.",
        "- Do not claim to know hidden reasoning.",
        "- Cite episode IDs where useful.",
        "- Do not include raw secrets, tool outputs, or long quotes.",
        "Deterministic analysis:",
        analysis_json,
        "Redacted narrative excerpt:",
        excerpt,
        "",  # the prompt ends with a trailing newline
    )
    return "\n".join(prompt_lines)
def parse_model_json(content: str) -> dict[str, Any]:
    """Parse and validate the memo object returned by the model.

    The three memo fields must be strings and ``caveats`` must be a list. The
    caveats are capped at six entries and each is coerced to ``str``.

    Raises:
        ValueError: If the content is not a JSON object or a required key is
            absent or has the wrong type.
    """
    memo = _loads_lenient(content)

    expectations = (
        ("executive_memo", str),
        ("detour_memo", str),
        ("outcome_audit_memo", str),
        ("caveats", list),
    )
    for field_name, kind in expectations:
        # A missing key yields None, which matches neither str nor list.
        if not isinstance(memo.get(field_name), kind):
            raise ValueError(f"Model response missing {field_name!r} as {kind.__name__}.")

    memo["caveats"] = list(map(str, memo["caveats"][:6]))
    return memo
def _loads_lenient(content: str) -> dict[str, Any]:
"""Parse JSON from a model that may wrap it in prose or code fences."""
if not isinstance(content, str) or not content.strip():
raise ValueError("Model response content was empty.")
text = content.strip()
fence = re.match(r"^```[a-zA-Z0-9]*\s*(.*?)\s*```$", text, re.DOTALL)
if fence:
text = fence.group(1).strip()
try:
parsed: Any = json.loads(text)
except json.JSONDecodeError:
start, end = text.find("{"), text.rfind("}")
if start == -1 or end == -1 or end <= start:
raise ValueError("Model response was not valid JSON.")
try:
parsed = json.loads(text[start : end + 1])
except json.JSONDecodeError as exc:
raise ValueError("Model response was not valid JSON.") from exc
if not isinstance(parsed, dict):
raise ValueError("Model response was not a JSON object.")
return parsed
|