"""Shot grammar for the 4-koma stance comics.

Named shots -> deterministic camera, computed purely from the skeleton frame
(bbox + hip heading). The LLM never sees any of this; it only ever picks
captions. Cameras are plain pinhole (eye/target/fov/roll) so the same
parameters port 1:1 to the three.js viewer later.
"""
import numpy as np

# One entry per joint of the 22-joint skeleton; presumably the index of each
# joint's kinematic parent, with -1 for the root -- TODO confirm against the
# skeleton definition used by the stance builder.
PARENTS = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, 16, 17, 18, 19]
# Joint indices referenced by name (PEL is presumably the pelvis/root).
PEL, LHIP, RHIP, HEAD = 0, 1, 2, 15
# Joint indices on the character's left / right side.
LEFTJ = {1, 4, 7, 10, 13, 16, 18, 20}
RIGHTJ = {2, 5, 8, 11, 14, 17, 19, 21}

# Smallest framing radius make_camera will use. A zero-extent skeleton would
# otherwise put the eye exactly on the target and leave no view direction.
_MIN_RADIUS = 1e-3

# az = horizontal angle of the camera relative to the character's facing direction
# (0 = head-on). el = elevation in degrees (negative = camera low, looking up).
# margin = how much air around the joint bbox. roll = dutch tilt.
SHOTS = {
    # aim_up biases the look-at above the bbox center so the figure sits LOW in
    # the frame — the upper strip belongs to the speech bubble.
    "establishing": dict(az=38.0, el=10.0, margin=1.06, roll=0.0, fov=40.0, aim_up=0.18),
    "action": dict(az=55.0, el=4.0, margin=1.08, roll=-4.0, fov=46.0, aim_up=0.18),
    "gag": dict(az=22.0, el=16.0, margin=0.98, roll=13.0, fov=50.0, aim_up=0.10),
    "hero": dict(az=32.0, el=-16.0, margin=1.08, roll=0.0, fov=38.0, aim_up=0.16),
}


def heading(Pf) -> float:
    """Return the facing angle about Y, in radians, from the hip line.

    0 means facing +Z (same convention as _build_stances). Only the X and Z
    components of the right-hip-minus-left-hip vector are used.

    Args:
        Pf: One skeleton frame, array-like indexable by joint then by axis
            (x, y, z); must contain the LHIP and RHIP joints.
    """
    Pf = np.asarray(Pf)  # accept nested lists as well as ndarrays
    d = Pf[RHIP] - Pf[LHIP]
    return float(np.arctan2(d[2], -d[0]))


def make_camera(shot_name: str, Pf) -> dict:
    """Build the camera dict for one skeleton frame [22,3].

    Frames the joint bbox with the shot's margin, oriented relative to the
    character's heading.

    Args:
        shot_name: Key into SHOTS.
        Pf: Skeleton frame, array-like of shape [joints, 3].

    Returns:
        dict with ``eye`` and ``target`` (3-vectors) and ``fov`` and ``roll``
        (both in radians).

    Raises:
        KeyError: If ``shot_name`` is not a key of SHOTS.
    """
    s = SHOTS[shot_name]
    Pf = np.asarray(Pf)  # accept nested lists; keeps dtype of existing arrays

    lo, hi = Pf.min(axis=0), Pf.max(axis=0)
    target = (lo + hi) / 2.0
    # Distance from the bbox center to the farthest joint. Floored so a
    # zero-extent skeleton still yields eye != target (otherwise project()
    # would normalise a zero vector and return NaN).
    radius = max(float(np.linalg.norm(Pf - target, axis=1).max()), _MIN_RADIUS)

    fov = np.radians(s["fov"])
    dist = s["margin"] * radius / np.tan(fov / 2.0)

    # Orbit the target: azimuth is measured from the character's heading,
    # elevation from the horizontal plane.
    theta = heading(Pf) + np.radians(s["az"])
    el = np.radians(s["el"])
    direction = np.array(
        [np.sin(theta) * np.cos(el), np.sin(el), np.cos(theta) * np.cos(el)]
    )
    eye = target + direction * dist
    # Don't let the camera dip below the floor (hero shots on grounded poses).
    eye[1] = max(eye[1], 0.12)

    # Look slightly above the bbox center so the figure sits low in the frame.
    aim = target.copy()
    aim[1] += s.get("aim_up", 0.0) * radius
    return dict(eye=eye, target=aim, fov=fov, roll=np.radians(s["roll"]))


def project(points, cam: dict, W: float, H: float):
    """Pinhole-project [N,3] world points -> ([N,2] pixel coords, [N] view depth).

    ``cam["fov"]`` is treated as the vertical field of view: the focal length
    in pixels is derived from H. Pixel y grows downward. Depth is clamped to
    at least 1e-4, so points at or behind the eye do not divide by zero.

    Args:
        points: Array-like reshapeable to [N, 3].
        cam: Camera dict with ``eye``, ``target``, ``fov`` (radians) and
            ``roll`` (radians); ``eye`` must differ from ``target``.
        W: Image width in pixels.
        H: Image height in pixels.
    """
    pts = np.asarray(points, np.float64).reshape(-1, 3)
    eye, target = cam["eye"], cam["target"]

    fwd = target - eye
    fwd = fwd / np.linalg.norm(fwd)

    right = np.cross(fwd, np.array([0.0, 1.0, 0.0]))
    right_len = np.linalg.norm(right)
    if right_len < 1e-9:
        # Looking straight up or down: world-up is parallel to the view axis
        # and gives no horizontal reference. Use +X as screen-right instead of
        # letting the basis collapse to zero.
        right = np.array([1.0, 0.0, 0.0])
    else:
        right = right / (right_len + 1e-9)
    up = np.cross(right, fwd)

    r = cam["roll"]
    if abs(r) > 1e-6:  # dutch tilt: rotate the basis about the view axis
        c, s = np.cos(r), np.sin(r)
        right, up = c * right + s * up, -s * right + c * up

    rel = pts - eye
    x = rel @ right
    y = rel @ up
    z = np.maximum(rel @ fwd, 1e-4)

    f = (H / 2.0) / np.tan(cam["fov"] / 2.0)
    px = W / 2.0 + f * x / z
    py = H / 2.0 - f * y / z
    return np.stack([px, py], axis=1), z


def ground_grid(center, span: float = 2.6, step: float = 0.65) -> list:
    """Return 3D segments of a floor grid (y=0) around the character, for scene depth.

    Args:
        center: Indexable with x at [0] and z at [2]; the grid is centered there.
        span: Half-width of the grid along X and Z.
        step: Spacing between neighbouring grid lines.

    Returns:
        List of ``((x0, y0, z0), (x1, y1, z1))`` tuples of plain floats. For
        each offset the line parallel to X comes first, then the one parallel
        to Z.
    """
    cx, cz = float(center[0]), float(center[2])
    span = float(span)
    # .tolist() yields plain Python floats so the segments carry no numpy scalars.
    offsets = np.arange(-span, span + 1e-6, step).tolist()

    segs = []
    for t in offsets:
        segs.append(((cx - span, 0.0, cz + t), (cx + span, 0.0, cz + t)))
        segs.append(((cx + t, 0.0, cz - span), (cx + t, 0.0, cz + span)))
    return segs