AlexWortega
/

moe100m-physics-tinybpe

+"""Shared serialization for the tiny-vocab physics MoE.
+Reuses physics_core.fmt_header / fmt_frame, but reduces every frame's
+free-text description to a tiny controlled keyword set so the learned vocab
+stays simulation-only.
+Controlled description set (after the `Frame N:` token):
+  - "at rest"     <- "All objects are at rest."
+  - "in motion"   <- "All objects are in motion."
+  - "settling"    <- "K of N objects are moving."  (partial motion)
+Anything else -> dropped (description omitted; frame still emitted).
+"""
+from __future__ import annotations
+import re
+import physics_core as pc
# Case-insensitive patterns for the three frame descriptions that are kept.
# Each is applied with ``search`` so the phrase may appear anywhere in the text.
_AT_REST = re.compile(r"all objects are at rest", re.I)
_IN_MOTION = re.compile(r"all objects are in motion", re.I)
_PARTIAL = re.compile(r"\d+\s+of\s+\d+\s+objects are moving", re.I)


def reduce_desc(raw: str) -> str:
    """Map a frame's free-text description to a controlled keyword.

    Args:
        raw: The original description text. A falsy value (empty string or
            ``None``, e.g. a null ``description`` field) is treated as
            "no description".

    Returns:
        ``"at rest"``, ``"in motion"`` or ``"settling"`` when the matching
        phrase is found, otherwise ``""``. Patterns are tried in that order,
        so the first hit wins.
    """
    # Guard first: Pattern.search() raises TypeError on None, and an empty
    # string can never match any of the phrases anyway.
    if not raw:
        return ""
    if _AT_REST.search(raw):
        return "at rest"
    if _IN_MOTION.search(raw):
        return "in motion"
    if _PARTIAL.search(raw):
        return "settling"
    # Unrecognised prose is dropped rather than passed through.
    return ""
def fmt_frame_reduced(fr: dict) -> str:
    """Format one frame via ``pc.fmt_frame`` with a keyword-only description.

    The input frame is not modified: a shallow copy is built whose
    ``"description"`` entry holds the result of ``reduce_desc`` applied to
    the original description (``""`` is used when the key is absent).
    """
    keyword = reduce_desc(fr.get("description", ""))
    reduced_frame = {**fr, "description": keyword}
    return pc.fmt_frame(reduced_frame)
def fmt_header_reduced(header: dict) -> str:
    """Format the scene header via ``pc.fmt_header`` with an empty description.

    A shallow copy of ``header`` is made with ``"description"`` set to ``""``;
    every other entry is handed to ``pc.fmt_header`` unchanged. The intent is
    that the structural lines (Gravity / Timestep / Type / Difficulty /
    Static / Constraints) are still emitted, including the categorical
    ``Type:`` token, while the ``Scene:`` line carries no English prose so
    the vocabulary stays simulation-only.
    """
    # The caller's dict is left untouched; only the copy loses its prose.
    return pc.fmt_header({**header, "description": ""})
def serialize_scene(header: dict, frames: list) -> str:
    """Return the full scene text: reduced header followed by reduced frames.

    The header is formatted first, then each frame in order; the pieces are
    concatenated with no separator and no BOS/EOS markers are added.
    """
    head_text = fmt_header_reduced(header)
    frame_texts = [fmt_frame_reduced(frame) for frame in frames]
    return head_text + "".join(frame_texts)