Spaces:
Running
Running
| """Talkie Gentleman Reachy Mini demo video. | |
| Split-screen: left = MuJoCo 3D robot, right = Victorian chat overlay. | |
| 15 seconds, 720p, 24fps. | |
| Usage: GST_PLUGIN_SCANNER="" python3.13 demo_video.py | |
| """ | |
| import os, sys, math, subprocess | |
| from pathlib import Path | |
| os.environ["GST_PLUGIN_SCANNER"] = "" | |
| os.environ["GST_REGISTRY_UPDATE"] = "no" | |
| import mujoco | |
| import numpy as np | |
| from PIL import Image, ImageDraw, ImageFont | |
# --- Config ---
# Scene description lives in the sibling reachy_mini checkout, one directory
# above the folder that contains this script.
SCENE_XML = Path(__file__).parent.parent.joinpath(
    "reachy_mini", "src", "reachy_mini", "descriptions",
    "reachy_mini", "mjcf", "scenes", "minimal.xml",
)
# The encoded video is written next to this script.
OUTPUT_MP4 = Path(__file__).parent.joinpath("demo.mp4")

# Full frame size; the robot view takes the left half, the chat the rest.
TOTAL_W = 1280
TOTAL_H = 720
ROBOT_W = TOTAL_W // 2  # 640
CHAT_W = TOTAL_W - ROBOT_W

FPS = 24
TOTAL_DURATION = 15.0  # seconds

# Actuator indices (positions in the MuJoCo ctrl vector)
YAW, S1, S2, S3, S4, S5, S6, R_ANT, L_ANT = range(9)
# --- Chat content ---
# Each entry is (start_time, role, text), listed in ascending start_time.
# role is "header", "user" or "bot"; "\n" marks a manual line break.
CHAT_EVENTS = [
    (0.5, "header", "~ Talkie Gentleman ~"),
    (
        1.0,
        "user",
        "Good evening, what are your\n"
        "thoughts on modern inventions?",
    ),
    (
        4.5,
        "bot",
        "Ah, a most splendid inquiry!\n"
        "The telegraph astounds me —\n"
        "to send words across vast\n"
        "distances in mere moments.\n"
        "Truly, we live in an age\n"
        "of marvels, dear friend.",
    ),
    (9.0, "user", "What music do you enjoy?"),
    (
        11.5,
        "bot",
        "Beethoven, without question.\n"
        "His symphonies stir the very\n"
        "soul. The Moonlight Sonata\n"
        "is a masterwork of the\n"
        "highest order.",
    ),
]
# --- Robot motion timeline ---
# Each entry is (start, end, gesture_name); a gesture is active while
# start <= t < end, and the windows run back to back from 0 s to 15 s.
GESTURES = [
    (0.0, 1.0, "idle"),                  # idle breathing at start
    (1.0, 2.5, "attentive_listen"),      # first question: slight attentive tilt
    (2.5, 4.5, "thinking_tilt"),         # thinking head tilt before answering
    (4.5, 8.5, "speaking_nods"),         # speaking: gentle nods while responding
    (8.5, 9.0, "idle"),                  # brief return to neutral
    (9.0, 10.0, "attentive_listen"),     # listen to second question
    (10.0, 11.5, "enthusiastic_think"),  # enthusiastic nod about Beethoven
    (11.5, 14.5, "speaking_nods"),       # speaking with conviction
    (14.5, 15.0, "idle"),                # elegant settle
]
def get_robot_pose(t: float) -> dict:
    """Return target ctrl values for time ``t``.

    Finds the first entry of ``GESTURES`` whose ``[start, end)`` window
    contains ``t`` and evaluates that gesture's motion curves. When no
    window contains ``t`` the "idle" gesture is used.

    Args:
        t: Time in seconds since the start of the video.

    Returns:
        A dict keyed by actuator index (``YAW``, ``S1``..``S6``, ``R_ANT``,
        ``L_ANT``) holding the control value for each; actuators that the
        active gesture does not drive are left at 0.
    """
    gesture = "idle"
    gesture_t = 0.0
    for start, end, name in GESTURES:
        if start <= t < end:
            gesture = name
            # Progress through the gesture, normalized to 0-1. The max()
            # guards against division by zero for a zero-length window.
            gesture_t = (t - start) / max(0.01, end - start)
            break

    ctrl = dict.fromkeys((YAW, S1, S2, S3, S4, S5, S6, R_ANT, L_ANT), 0)

    if gesture == "idle":
        # Gentle breathing - subtle vertical oscillation. Driven by absolute
        # time t, not gesture_t, so the phase carries across idle windows.
        ctrl[S3] = math.sin(t * 1.8) * 0.02
        ctrl[R_ANT] = math.sin(t * 0.7) * 0.05
        ctrl[L_ANT] = math.sin(t * 0.7 + 0.5) * 0.05
    elif gesture == "attentive_listen":
        # Gentle head tilt to the right, antenna perk.
        ease = math.sin(gesture_t * math.pi)  # smooth in-out: 0 -> 1 -> 0
        ctrl[S5] = math.radians(12) * ease  # roll tilt
        ctrl[S4] = math.radians(-5) * ease  # slight pitch down (attentive)
        ctrl[R_ANT] = 0.3 * ease
        ctrl[L_ANT] = 0.15 * ease
    elif gesture == "thinking_tilt":
        # Head tilts left, one antenna raises - pondering.
        ease = min(1.0, gesture_t * 2.5)  # quick settle, then hold at 1.0
        ctrl[S5] = math.radians(-15) * ease  # tilt left
        ctrl[S4] = math.radians(8) * ease  # slight look up
        ctrl[YAW] = math.radians(5) * ease  # slight turn
        ctrl[R_ANT] = -0.2 * ease
        ctrl[L_ANT] = 0.5 * ease  # one antenna raised = thinking
        # Subtle micro-movement on top of the held pose.
        ctrl[S4] += math.sin(t * 3) * 0.01
    elif gesture == "speaking_nods":
        # Gentle periodic nods with slight body sway.
        nod_cycle = math.sin(gesture_t * math.pi * 5)  # ~2.5 nods over the gesture
        sway = math.sin(gesture_t * math.pi * 2) * 0.3
        ctrl[S4] = math.radians(6) * nod_cycle  # pitch nod
        ctrl[S5] = math.radians(3) * sway  # gentle roll sway
        ctrl[YAW] = math.radians(2) * math.sin(gesture_t * math.pi * 1.5)
        # Antennas follow speech rhythm.
        ctrl[R_ANT] = 0.2 * nod_cycle
        ctrl[L_ANT] = 0.2 * nod_cycle
        # Subtle vertical.
        ctrl[S3] = 0.01 * nod_cycle
    elif gesture == "enthusiastic_think":
        # More energetic thinking - tilt + antenna waggle.
        ease = min(1.0, gesture_t * 3)
        ctrl[S5] = math.radians(10) * ease
        ctrl[S4] = math.radians(10) * ease
        # sin/cos put the two antennas a quarter cycle apart.
        ctrl[R_ANT] = 0.4 * math.sin(gesture_t * math.pi * 4)
        ctrl[L_ANT] = 0.4 * math.cos(gesture_t * math.pi * 4)

    return ctrl
# Serif face used for the Victorian look (the path looks macOS-specific;
# other systems fall back to Pillow's default font).
_CHAT_FONT_PATH = "/System/Library/Fonts/Supplemental/Times New Roman.ttf"
_chat_fonts = None  # lazily filled (title, message, label) font triple


def _load_chat_fonts():
    """Return the (title, message, label) fonts, loading them only once."""
    global _chat_fonts
    if _chat_fonts is None:
        try:
            _chat_fonts = tuple(
                ImageFont.truetype(_CHAT_FONT_PATH, size) for size in (26, 18, 14)
            )
        except (OSError, ImportError):
            # OSError: the font file is missing or unreadable.
            # ImportError: presumably a Pillow build without FreeType support.
            fallback = ImageFont.load_default()
            _chat_fonts = (fallback, fallback, fallback)
    return _chat_fonts


def render_chat_panel(t: float) -> Image.Image:
    """Render the Victorian chat panel for time t.

    Draws the panel frame and then every entry of ``CHAT_EVENTS`` whose
    start time has been reached. Message text is revealed with a typewriter
    effect, and a pencil indicator is shown under a bot message that is
    still being typed.

    Args:
        t: Time in seconds since the start of the video.

    Returns:
        A new ``CHAT_W`` x ``TOTAL_H`` RGB image.
    """
    font_title, font_msg, font_label = _load_chat_fonts()

    # Colors
    BG_DARK = (28, 22, 18)
    GOLD = (198, 166, 100)
    CREAM = (230, 218, 195)
    USER_BG = (48, 40, 32)
    BOT_BG = (42, 35, 28)
    BORDER = (100, 82, 58)
    DIM = (140, 120, 90)

    # Bubble geometry, in pixels
    margin = 15
    pad = 10

    img = Image.new("RGB", (CHAT_W, TOTAL_H), BG_DARK)
    draw = ImageDraw.Draw(img)

    # Ornamental border
    draw.rectangle([0, 0, CHAT_W - 1, TOTAL_H - 1], outline=BORDER, width=2)
    draw.rectangle([4, 4, CHAT_W - 5, TOTAL_H - 5], outline=(60, 50, 38), width=1)
    # Decorative top line
    draw.line([(20, 55), (CHAT_W - 20, 55)], fill=BORDER, width=1)
    # Small ornaments
    draw.text((CHAT_W // 2 - 10, 48), "◆", fill=GOLD, font=font_label)

    y = 70  # top of the next message; advanced as bubbles are drawn
    for evt_t, role, text in CHAT_EVENTS:
        # Stopping at the first future event relies on CHAT_EVENTS being
        # ordered by start time.
        if t < evt_t:
            break

        if role == "header":
            # Title, horizontally centered in the top band
            bbox = draw.textbbox((0, 0), text, font=font_title)
            title_w = bbox[2] - bbox[0]
            draw.text(((CHAT_W - title_w) // 2, 18), text, fill=GOLD, font=font_title)
            continue

        # Typewriter effect for messages appearing
        chars_visible = int((t - evt_t) * 35)  # 35 chars/sec typing speed
        visible_text = text[:chars_visible]
        if not visible_text:
            continue

        # Message bubble styling
        if role == "user":
            label, label_color, bg = "You", DIM, USER_BG
        else:
            label, label_color, bg = "Gentleman", GOLD, BOT_BG

        # Label
        draw.text((margin + 5, y), label, fill=label_color, font=font_label)
        y += 18

        # Bubble height follows the text revealed so far, so it grows
        # as more lines are typed.
        bbox = draw.textbbox((0, 0), visible_text, font=font_msg)
        bubble_h = (bbox[3] - bbox[1]) + pad * 2 + 4
        draw.rounded_rectangle(
            [margin, y, CHAT_W - margin, y + bubble_h],
            radius=6, fill=bg, outline=BORDER,
        )
        # Text
        draw.text((margin + pad, y + pad), visible_text, fill=CREAM, font=font_msg)
        y += bubble_h + 12

        # Typing indicator for bot messages still typing: cycles through
        # 0-3 dots, three steps per second.
        if role == "bot" and chars_visible < len(text):
            dots = "..."[:int((t * 3) % 4)]
            draw.text((margin + pad, y - 5), f"✎ {dots}", fill=DIM, font=font_label)

    # Bottom ornament
    draw.line([(20, TOTAL_H - 25), (CHAT_W - 20, TOTAL_H - 25)], fill=BORDER, width=1)
    draw.text((CHAT_W // 2 - 30, TOTAL_H - 20), "⚙ Anno 1842", fill=DIM, font=font_label)
    return img
def main():
    """Render the split-screen demo and encode it to ``OUTPUT_MP4``.

    For every video frame the robot controls are set from
    ``get_robot_pose``, the MuJoCo simulation is stepped, the robot view and
    the chat panel are rendered and joined side by side, and the result is
    piped to ``ffmpeg`` as raw RGB.

    Frames are streamed to ffmpeg as they are produced rather than collected
    first: at the configured size the full set of frames is roughly 1 GB.
    ffmpeg's stderr goes to a temporary file rather than a pipe, so a chatty
    encoder can never fill an undrained pipe and stall the frame writes.

    Exits with status 1 (after printing the end of ffmpeg's log) when
    ffmpeg fails.
    """
    import tempfile  # function-scope import: only the encoder log needs it

    print(f"Loading MuJoCo scene: {SCENE_XML}")
    model = mujoco.MjModel.from_xml_path(str(SCENE_XML))
    # Size the offscreen buffer to the render target before creating the
    # Renderer.
    model.vis.global_.offwidth = ROBOT_W
    model.vis.global_.offheight = TOTAL_H
    data = mujoco.MjData(model)

    cam = mujoco.MjvCamera()
    cam.type = mujoco.mjtCamera.mjCAMERA_FREE
    cam.distance = 0.48
    cam.azimuth = 175
    cam.elevation = -8
    cam.lookat[:] = [0, 0, 0.14]

    # Physics steps per video frame. round() keeps simulated time as close
    # as possible to video time; truncating would make the simulation lag.
    steps_per_frame = max(1, round(1.0 / (model.opt.timestep * FPS)))
    n_frames = int(TOTAL_DURATION * FPS)

    ffmpeg_cmd = [
        'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo',
        '-s', f'{TOTAL_W}x{TOTAL_H}', '-pix_fmt', 'rgb24', '-r', str(FPS),
        '-i', '-', '-c:v', 'libx264', '-pix_fmt', 'yuv420p',
        '-preset', 'fast', '-crf', '18', str(OUTPUT_MP4),
    ]

    renderer = mujoco.Renderer(model, TOTAL_H, ROBOT_W)
    try:
        with tempfile.TemporaryFile() as ffmpeg_log:
            proc = subprocess.Popen(
                ffmpeg_cmd, stdin=subprocess.PIPE, stderr=ffmpeg_log
            )
            frames_written = 0
            try:
                print(f"Rendering {n_frames} frames ({TOTAL_DURATION:.0f}s @ {FPS}fps)...")
                for i in range(n_frames):
                    t = i / FPS

                    # Set robot pose
                    for k, v in get_robot_pose(t).items():
                        data.ctrl[k] = v

                    # Step physics
                    for _ in range(steps_per_frame):
                        mujoco.mj_step(model, data)

                    # Render robot view, (H, W, 3)
                    renderer.update_scene(data, cam)
                    robot_rgb = renderer.render()

                    # Render chat panel
                    chat_rgb = np.array(render_chat_panel(t))

                    # Composite split-screen; concatenate builds a fresh
                    # array, so the renderer's output needs no extra copy.
                    composite = np.concatenate([robot_rgb, chat_rgb], axis=1)

                    try:
                        proc.stdin.write(composite.tobytes())
                    except BrokenPipeError:
                        # ffmpeg exited early; stop rendering and report its
                        # log below instead of crashing with a traceback.
                        break
                    frames_written += 1

                    if (i + 1) % (FPS * 3) == 0:
                        print(f"  {i+1}/{n_frames} frames...")

                print(f"\nEncoding {frames_written} frames to {OUTPUT_MP4}...")
            finally:
                # Closing stdin signals end-of-input so ffmpeg can finalize
                # the file; then wait for it to exit.
                try:
                    proc.stdin.close()
                except BrokenPipeError:
                    pass
                proc.wait()

            ffmpeg_log.seek(0)
            ffmpeg_stderr = ffmpeg_log.read().decode(errors="replace")
    finally:
        renderer.close()

    if proc.returncode == 0:
        size = os.path.getsize(str(OUTPUT_MP4))
        print(f"✅ {OUTPUT_MP4} ({size/1024:.0f}KB, {TOTAL_DURATION:.0f}s)")
    else:
        # Show the tail of the log: the head is ffmpeg's version banner, the
        # actual error is printed last.
        print(f"❌ ffmpeg error: {ffmpeg_stderr[-500:]}")
        sys.exit(1)


if __name__ == "__main__":
    main()