Spaces:
Running on Zero
Running on Zero
| """Local small-model assistance for Trace Field Notes on Hugging Face ZeroGPU. | |
| The analysis models run on the Space GPU through ``transformers``. Heavy imports | |
| (``torch``, ``transformers``) are loaded lazily inside the generator so that the | |
| deterministic analyzer, the test suite, and local development keep working | |
| without GPU dependencies installed. If a model cannot be loaded or its output is | |
| not valid JSON, :func:`analyzer.analyze_trace_file` falls back to the | |
| deterministic codebook and records the reason in the model notes. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from dataclasses import dataclass | |
| from typing import Any, Callable | |
| from schemas import AnalysisResult | |
| PRIMARY_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" | |
| QUICK_MODEL_ID = "Qwen/Qwen3.5-9B" | |
| MODEL_CHOICES = { | |
| "qwen": { | |
| "label": "Qwen3.5 9B — quick analysis", | |
| "model_id": QUICK_MODEL_ID, | |
| }, | |
| "nemotron": { | |
| "label": "NVIDIA Nemotron 3 Nano 30B-A3B — deeper analysis", | |
| "model_id": PRIMARY_MODEL_ID, | |
| }, | |
| "deterministic": { | |
| "label": "Rule-based — instant, no model", | |
| "model_id": None, | |
| }, | |
| } | |
| # (messages, *, model_id, max_new_tokens) -> raw model text. | |
| GenerateFn = Callable[..., str] | |
| _MODEL_CACHE: dict[str, Any] = {} | |
| class ModelAssistResult: | |
| model_id: str | |
| memo: dict[str, Any] | |
| note: str | |
| def model_id_for_engine(engine: str) -> str | None: | |
| choice = MODEL_CHOICES.get(engine) | |
| if not choice: | |
| return None | |
| model_id = choice["model_id"] | |
| return str(model_id) if model_id else None | |
| def run_model_assist( | |
| *, | |
| engine: str, | |
| result: AnalysisResult, | |
| narrative_text: str, | |
| generate: GenerateFn | None = None, | |
| ) -> ModelAssistResult: | |
| """Run the selected model on the GPU and return a concise grounded memo.""" | |
| model_id = model_id_for_engine(engine) | |
| if not model_id: | |
| raise ValueError(f"No model is configured for analysis engine {engine!r}.") | |
| prompt = build_model_prompt(result, narrative_text) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "You analyze visible coding-agent narrative messages. " | |
| "Do not infer hidden reasoning. Return JSON only." | |
| ), | |
| }, | |
| {"role": "user", "content": prompt}, | |
| ] | |
| generator = generate or _local_generator | |
| content = generator(messages, model_id=model_id, max_new_tokens=900) | |
| memo = parse_model_json(content) | |
| return ModelAssistResult( | |
| model_id=model_id, | |
| memo=memo, | |
| note=f"Model assist completed on the Space GPU with {model_id}.", | |
| ) | |
| def _local_generator( | |
| messages: list[dict[str, str]], | |
| *, | |
| model_id: str, | |
| max_new_tokens: int, | |
| ) -> str: | |
| """Generate text with a locally loaded model on the ZeroGPU device. | |
| Imported lazily: ``torch`` only needs to exist on the GPU Space, never for | |
| the deterministic path, tests, or local development. | |
| """ | |
| import torch | |
| tokenizer, model = _load_model(model_id) | |
| inputs = tokenizer.apply_chat_template( | |
| messages, | |
| add_generation_prompt=True, | |
| return_tensors="pt", | |
| ).to(model.device) | |
| with torch.no_grad(): | |
| generated = model.generate( | |
| inputs, | |
| max_new_tokens=max_new_tokens, | |
| do_sample=False, | |
| ) | |
| completion = generated[0][inputs.shape[-1]:] | |
| return tokenizer.decode(completion, skip_special_tokens=True) | |
| def _load_model(model_id: str) -> Any: | |
| """Lazily load and cache a (tokenizer, model) pair on the GPU. | |
| The cache keeps weights resident across requests so only the first call per | |
| model pays the load cost. ZeroGPU exposes CUDA inside the ``@spaces.GPU`` | |
| context, which is where this runs. | |
| """ | |
| cached = _MODEL_CACHE.get(model_id) | |
| if cached is not None: | |
| return cached | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_id, | |
| torch_dtype=torch.bfloat16, | |
| device_map="cuda", | |
| trust_remote_code=True, | |
| ) | |
| model.eval() | |
| _MODEL_CACHE[model_id] = (tokenizer, model) | |
| return tokenizer, model | |
| def build_model_prompt(result: AnalysisResult, narrative_text: str) -> str: | |
| deterministic_json = json.dumps(result.to_dict(), ensure_ascii=False, indent=2) | |
| narrative_excerpt = narrative_text[:12000] | |
| return f"""Use the deterministic codebook analysis and redacted visible narrative below. | |
| Return JSON with exactly these keys: | |
| - executive_memo: 4-6 sentences for a developer | |
| - detour_memo: 2-4 sentences about productive detours vs wandering | |
| - outcome_audit_memo: 2-4 sentences about completion claims and caveats | |
| - caveats: array of short strings | |
| Rules: | |
| - Analyze only visible narrative messages. | |
| - Do not claim to know hidden reasoning. | |
| - Cite episode IDs where useful. | |
| - Do not include raw secrets, tool outputs, or long quotes. | |
| Deterministic analysis: | |
| {deterministic_json} | |
| Redacted narrative excerpt: | |
| {narrative_excerpt} | |
| """ | |
| def parse_model_json(content: str) -> dict[str, Any]: | |
| parsed = _loads_lenient(content) | |
| required = { | |
| "executive_memo": str, | |
| "detour_memo": str, | |
| "outcome_audit_memo": str, | |
| "caveats": list, | |
| } | |
| for key, expected_type in required.items(): | |
| if key not in parsed or not isinstance(parsed[key], expected_type): | |
| raise ValueError(f"Model response missing {key!r} as {expected_type.__name__}.") | |
| parsed["caveats"] = [str(item) for item in parsed["caveats"][:6]] | |
| return parsed | |
| def _loads_lenient(content: str) -> dict[str, Any]: | |
| """Parse JSON from a model that may wrap it in prose or code fences.""" | |
| if not isinstance(content, str) or not content.strip(): | |
| raise ValueError("Model response content was empty.") | |
| text = content.strip() | |
| fence = re.match(r"^```[a-zA-Z0-9]*\s*(.*?)\s*```$", text, re.DOTALL) | |
| if fence: | |
| text = fence.group(1).strip() | |
| try: | |
| parsed: Any = json.loads(text) | |
| except json.JSONDecodeError: | |
| start, end = text.find("{"), text.rfind("}") | |
| if start == -1 or end == -1 or end <= start: | |
| raise ValueError("Model response was not valid JSON.") | |
| try: | |
| parsed = json.loads(text[start : end + 1]) | |
| except json.JSONDecodeError as exc: | |
| raise ValueError("Model response was not valid JSON.") from exc | |
| if not isinstance(parsed, dict): | |
| raise ValueError("Model response was not a JSON object.") | |
| return parsed | |