"""Per-Dock GBNF generator + safe mass-JSON parser.""" from __future__ import annotations import json from typing import Iterable def gbnf_for_dock(dock: Iterable[str]) -> str: """Return a GBNF grammar that constrains the model's deposition JSON to the schema {focal_elements: [{suspects: [...], mass: <0..1>}, ...], one_line_summary: "..."}. `suspect` is restricted to the literal names in `dock`. """ suspects = list(dock) if not suspects: raise ValueError("dock must be non-empty") suspect_rule = " | ".join(f'"\\"{s}\\""' for s in suspects) return f""" root ::= "{{" ws "\\"focal_elements\\":" ws "[" element ("," ws element)* "]" ws "," ws "\\"one_line_summary\\":" ws string ws "}}" element ::= "{{" ws "\\"suspects\\":" ws subset ws "," ws "\\"mass\\":" ws number ws "}}" subset ::= "[" ws suspect (ws "," ws suspect)* ws "]" suspect ::= {suspect_rule} number ::= "0" ("." [0-9]+)? | "1" ("." "0"+)? string ::= "\\"" char* "\\"" char ::= [^"\\\\] | "\\\\" . ws ::= [ \\t\\n]* """.strip() def parse_and_validate_mass(raw: str, dock: list[str]) -> tuple[dict[frozenset[str], float], str]: """Parse a deposition JSON string. Returns (mass, summary). - Drops focal elements with non-positive mass or empty suspect sets. - Rejects unknown suspect names with ValueError. - Renormalises so masses sum to 1.0. """ data = json.loads(raw) dock_set = set(dock) summary = str(data.get("one_line_summary", ""))[:240] mass: dict[frozenset[str], float] = {} for el in data.get("focal_elements", []): subset = el.get("suspects", []) if not subset: continue for s in subset: if s not in dock_set: raise ValueError(f"unknown suspect: {s!r}") key = frozenset(subset) m = float(el.get("mass", 0.0)) if m <= 0: continue mass[key] = mass.get(key, 0.0) + m total = sum(mass.values()) if total <= 0: raise ValueError("deposition has no positive mass") return {k: v / total for k, v in mass.items()}, summary