""" Idle Video Generator — generates natural-looking idle animation clips using FLOAT. Uses scripted pose control with dampening curves for realistic idle behavior. Extracted from standalone script for integration into avatar creation pipeline. """ import os import sys import math import random import time import logging import torch import cv2 import numpy as np logger = logging.getLogger(__name__) FLOAT_REPO_PATH = "/app/float_repo" if FLOAT_REPO_PATH not in sys.path: sys.path.insert(0, FLOAT_REPO_PATH) # Generation settings FPS = 25.0 DURATION_SEC = 5.0 SAMPLE_RATE = 16000 NUM_CLIPS = 6 # Clip mode definitions — each clip has different motion characteristics CLIP_MODES = [ # Clips 1-2: Wide Variable (35%-65%) {"damp_mid": 0.50, "damp_amp": 0.15, "label": "Wide Variable"}, {"damp_mid": 0.50, "damp_amp": 0.15, "label": "Wide Variable"}, # Clips 3-5: High Fixed (55%-75%) {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"}, {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"}, {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"}, # Clip 6: Very High Activity (80%-90%) {"damp_mid": 0.85, "damp_amp": 0.05, "label": "Very High Activity"}, ] class IdleVideoGenerator: """Generates idle animation clips using FLOAT model with scripted pose control.""" def __init__(self, float_model, device, input_size=512, lipsync_instance=None): """ Args: float_model: Loaded FLOAT model instance (already on device, eval mode) device: torch device input_size: image size (default 512) lipsync_instance: FloatLipsync instance for reusing its face crop logic """ self.model = float_model self.device = device self.input_size = input_size self.lipsync = lipsync_instance # Store original sample function for monkey-patching self._original_sample = self.model.sample def _load_reference_image(self, image_path): """Load and preprocess a reference image using float_lipsync's proven crop logic.""" if self.lipsync: # Use the same crop/transform as float_lipsync for consistency self.lipsync._preload_reference_image(image_path) return self.lipsync.preprocessed_ref_image.clone() else: # Fallback: basic resize without face crop import albumentations as A import albumentations.pytorch.transforms as A_pytorch img = cv2.imread(image_path) if img is None: raise FileNotFoundError(f"Could not read image: {image_path}") img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) transform = A.Compose([ A.Resize(height=self.input_size, width=self.input_size, interpolation=cv2.INTER_AREA), A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), A_pytorch.ToTensorV2(), ]) return transform(image=img)['image'].unsqueeze(0).to(self.device) def generate(self, ref_image_path, output_dir, avatar_name, num_clips=NUM_CLIPS, progress_callback=None): """ Generate idle video clips for an avatar. Args: ref_image_path: Path to reference face image output_dir: Directory to save clips (e.g. /app/avatars/Darwin/idlevideos/) avatar_name: Name for file naming (e.g. "Darwin") num_clips: Number of clips to generate (default 10) progress_callback: Optional function(clip_index, total, status_text) for progress updates Returns: List of generated video file paths """ os.makedirs(output_dir, exist_ok=True) total_frames = int(DURATION_SEC * FPS) fade_frames = int(1.0 * FPS) # Load reference image if progress_callback: progress_callback(0, num_clips, "Loading reference image...") s = self._load_reference_image(ref_image_path) # Silent audio (idle = no speech) a = torch.zeros(1, int(DURATION_SEC * SAMPLE_RATE)).to(self.device) data = {'s': s, 'a': a} # Set up dampening curve for monkey-patching dampening_curve = torch.zeros(1, total_frames, 1).to(self.device) original_sample = self._original_sample def dynamic_dampened_sample(*args, **kwargs): r_d = original_sample(*args, **kwargs) return r_d * dampening_curve # Monkey-patch the sample function self.model.sample = dynamic_dampened_sample generated_paths = [] try: for clip_idx in range(num_clips): clip_num = clip_idx + 1 mode = CLIP_MODES[clip_idx % len(CLIP_MODES)] status = f"Generating clip {clip_num}/{num_clips} ({mode['label']})" logger.info(f"[IDLE_GEN] {status}") if progress_callback: progress_callback(clip_idx, num_clips, status) t0 = time.time() # Build pose control tensor pose_control = torch.zeros(1, total_frames, 20).to(self.device) # Randomize phases for variety pitch_phase = random.uniform(0, 2 * math.pi) yaw_phase = random.uniform(0, 2 * math.pi) mouth_phase = random.uniform(0, 2 * math.pi) blend_phase = random.uniform(0, 2 * math.pi) damp_phase = random.uniform(0, 2 * math.pi) for t in range(total_frames): # Dampening curve dampening_curve[0, t, 0] = mode["damp_mid"] + mode["damp_amp"] * math.sin(t * 0.06 + damp_phase) # Subtle head sway pose_control[:, t, 1] = 0.08 * math.sin(t * 0.05 + pitch_phase) pose_control[:, t, 2] = 0.04 * math.cos(t * 0.03 + yaw_phase) # Mouth clamp (peak 5.0, min 2.0) overall_tension = 3.5 + 1.5 * math.sin(t * 0.08 + mouth_phase) blend = (math.sin(t * 0.04 + blend_phase) + 1.0) / 2.0 pose_control[:, t, 5] = overall_tension * blend pose_control[:, t, 9] = overall_tension * (1.0 - blend) # 1-second fade out at end if t >= total_frames - fade_frames: fade_multiplier = (total_frames - t - 1) / float(fade_frames) pose_control[:, t, :] *= fade_multiplier # Run inference — pass pose_control via data['p'] data['p'] = pose_control with torch.no_grad(): out = self.model.inference( data, a_cfg_scale=1.0, r_cfg_scale=1.0, e_cfg_scale=1.0, emo='S2E', nfe=7, seed=random.randint(1, 10000), ) # Save video d_hat = out['d_hat'].cpu().clamp(-1, 1) d_hat = ((d_hat + 1) / 2 * 255).to(torch.uint8) frames = d_hat.permute(0, 2, 3, 1).numpy() out_name = f"{avatar_name}_idle_{clip_num}.mp4" out_path = os.path.join(output_dir, out_name) self._save_video(frames, out_path) generated_paths.append(out_path) elapsed = time.time() - t0 logger.info(f"[IDLE_GEN] ✓ {out_name} ({elapsed:.1f}s)") if progress_callback: progress_callback(clip_idx + 1, num_clips, f"Clip {clip_num}/{num_clips} done ({elapsed:.0f}s)") finally: # Restore original sample function self.model.sample = original_sample logger.info(f"[IDLE_GEN] ✓ All {len(generated_paths)} clips generated") return generated_paths def _save_video(self, frames_np, output_path): """Save numpy frames array to mp4 using ffmpeg.""" import subprocess height, width = frames_np.shape[1], frames_np.shape[2] cmd = [ 'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-s', f'{width}x{height}', '-pix_fmt', 'rgb24', '-r', str(FPS), '-i', 'pipe:0', '-c:v', 'libx264', '-preset', 'fast', '-crf', '23', '-pix_fmt', 'yuv420p', output_path ] process = subprocess.Popen( cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) stdout, stderr = process.communicate(input=frames_np.tobytes()) if process.returncode != 0: logger.error(f"[IDLE_GEN] ffmpeg error: {stderr.decode()[:300]}") raise RuntimeError("ffmpeg encoding failed") # Module-level singleton _generator = None def get_idle_generator(): """Get or create the idle video generator (requires FLOAT to be initialized first).""" global _generator if _generator is None: from float_lipsync import get_lipsync lipsync = get_lipsync() if not lipsync.ready: raise RuntimeError("FLOAT not initialized — cannot generate idle videos") _generator = IdleVideoGenerator( float_model=lipsync.model, device=lipsync.device, input_size=lipsync.opt.input_size, lipsync_instance=lipsync, ) return _generator