AvatarChatbot / idle_generator.py
AvatarResurrectionChatbot's picture
Update idle_generator.py
556a15f verified
Raw
History Blame Contribute Delete
9.57 kB
"""
Idle Video Generator — generates natural-looking idle animation clips using FLOAT.
Uses scripted pose control with dampening curves for realistic idle behavior.
Extracted from standalone script for integration into avatar creation pipeline.
"""
import os
import sys
import math
import random
import time
import logging
import torch
import cv2
import numpy as np
# Module-scoped logger; records are emitted under this module's import name.
logger = logging.getLogger(__name__)

# Location of the FLOAT repository checkout. It is placed at the front of the
# import search path (once) so modules that live inside it can be imported.
FLOAT_REPO_PATH = "/app/float_repo"
if sys.path.count(FLOAT_REPO_PATH) == 0:
    sys.path.insert(0, FLOAT_REPO_PATH)
# Generation settings
FPS = 25.0            # frame rate used for frame counts and for the encoder
DURATION_SEC = 5.0    # length of every idle clip, in seconds
SAMPLE_RATE = 16000   # sample rate of the silent audio track fed to the model
NUM_CLIPS = 6         # default number of clips produced per avatar

# Clip mode definitions — each clip has different motion characteristics.
# Every entry describes a sinusoidal dampening envelope: it oscillates around
# "damp_mid" with amplitude "damp_amp". The table is expanded from
# (midpoint, amplitude, label, number of consecutive clips) rows.
CLIP_MODES = [
    {"damp_mid": midpoint, "damp_amp": amplitude, "label": label}
    for midpoint, amplitude, label, repeats in (
        (0.50, 0.15, "Wide Variable", 2),       # Clips 1-2: 35%-65%
        (0.65, 0.10, "High Fixed", 3),          # Clips 3-5: 55%-75%
        (0.85, 0.05, "Very High Activity", 1),  # Clip 6: 80%-90%
    )
    for _ in range(repeats)
]
class IdleVideoGenerator:
    """Generates idle animation clips using FLOAT model with scripted pose control.

    Each clip is driven by silent audio plus a per-frame pose-control script.
    While clips are being generated, the model's ``sample`` method is
    temporarily wrapped so its output is scaled by a per-frame dampening
    curve; the wrapper is always removed again, even if generation fails.
    """

    def __init__(self, float_model, device, input_size=512, lipsync_instance=None):
        """
        Args:
            float_model: Loaded FLOAT model instance (expected to already be on
                the device and in eval mode)
            device: torch device
            input_size: image size (default 512)
            lipsync_instance: FloatLipsync instance for reusing its face crop logic
        """
        self.model = float_model
        self.device = device
        self.input_size = input_size
        self.lipsync = lipsync_instance
        # Captured once at construction time: generate() wraps this callable
        # and puts this same callable back when it is done.
        self._original_sample = self.model.sample

    def _load_reference_image(self, image_path):
        """Load and preprocess the reference image into a model input tensor.

        With a lipsync instance available, its preload routine is run on
        ``image_path`` and a clone of its ``preprocessed_ref_image`` is
        returned (NOTE(review): this also updates the lipsync instance's own
        preloaded image — confirm that is acceptable for callers).

        Without one, the image is read with OpenCV, converted BGR -> RGB,
        resized to ``input_size`` x ``input_size``, normalized with mean/std
        0.5 per channel, and returned as a batch of one on ``self.device``.
        No face cropping happens on this fallback path.

        Raises:
            FileNotFoundError: fallback path only, when OpenCV cannot read
                the image.
        """
        if self.lipsync:
            # Use the same crop/transform as float_lipsync for consistency
            self.lipsync._preload_reference_image(image_path)
            return self.lipsync.preprocessed_ref_image.clone()

        # Fallback: basic resize without face crop. Imported lazily so the
        # dependency is only required when this path is actually taken.
        import albumentations as A
        import albumentations.pytorch.transforms as A_pytorch

        img = cv2.imread(image_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        transform = A.Compose([
            A.Resize(height=self.input_size, width=self.input_size, interpolation=cv2.INTER_AREA),
            A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
            A_pytorch.ToTensorV2(),
        ])
        return transform(image=img)['image'].unsqueeze(0).to(self.device)

    @staticmethod
    def _build_motion_script(mode, total_frames, fade_frames):
        """Build the per-frame pose-control and dampening tensors for one clip.

        Both tensors are filled on the host so the caller can move each to
        the device with a single transfer.

        Args:
            mode: one CLIP_MODES entry; "damp_mid" and "damp_amp" are read.
            total_frames: number of frames in the clip.
            fade_frames: number of trailing frames over which the pose
                control is ramped down to zero.

        Returns:
            (pose_control, dampening) with shapes (1, total_frames, 20) and
            (1, total_frames, 1).
        """
        pose_control = torch.zeros(1, total_frames, 20)
        dampening = torch.zeros(1, total_frames, 1)

        # Randomize phases for variety (draw order kept stable so runs with a
        # seeded `random` module reproduce the same clips)
        pitch_phase = random.uniform(0, 2 * math.pi)
        yaw_phase = random.uniform(0, 2 * math.pi)
        mouth_phase = random.uniform(0, 2 * math.pi)
        blend_phase = random.uniform(0, 2 * math.pi)
        damp_phase = random.uniform(0, 2 * math.pi)

        fade_start = total_frames - fade_frames
        for t in range(total_frames):
            # Dampening curve: slow sinusoid around the mode's midpoint
            dampening[0, t, 0] = mode["damp_mid"] + mode["damp_amp"] * math.sin(t * 0.06 + damp_phase)

            # Subtle head sway (channels 1 and 2; channel meanings follow the
            # original author's notes — TODO confirm against the FLOAT model)
            pose_control[0, t, 1] = 0.08 * math.sin(t * 0.05 + pitch_phase)
            pose_control[0, t, 2] = 0.04 * math.cos(t * 0.03 + yaw_phase)

            # Mouth clamp (peak 5.0, min 2.0): the tension is split between
            # channels 5 and 9 by a blend factor that drifts between 0 and 1
            overall_tension = 3.5 + 1.5 * math.sin(t * 0.08 + mouth_phase)
            blend = (math.sin(t * 0.04 + blend_phase) + 1.0) / 2.0
            pose_control[0, t, 5] = overall_tension * blend
            pose_control[0, t, 9] = overall_tension * (1.0 - blend)

            # Linear fade out over the last `fade_frames` frames; the final
            # frame's multiplier is exactly 0
            if t >= fade_start:
                fade_multiplier = (total_frames - t - 1) / float(fade_frames)
                pose_control[0, t, :] *= fade_multiplier

        return pose_control, dampening

    def generate(self, ref_image_path, output_dir, avatar_name,
                 num_clips=NUM_CLIPS, progress_callback=None):
        """
        Generate idle video clips for an avatar.

        Args:
            ref_image_path: Path to reference face image
            output_dir: Directory to save clips (e.g. /app/avatars/Darwin/idlevideos/);
                created if missing
            avatar_name: Name for file naming (e.g. "Darwin")
            num_clips: Number of clips to generate (default NUM_CLIPS, i.e. 6).
                Clip modes are taken from CLIP_MODES in order and wrap around
                when num_clips exceeds the table length.
            progress_callback: Optional function(clip_index, total, status_text) for progress updates

        Returns:
            List of generated video file paths

        Raises:
            RuntimeError: if ffmpeg fails to encode a clip.
            FileNotFoundError: if the reference image cannot be read on the
                fallback (no lipsync instance) loading path.
        """
        os.makedirs(output_dir, exist_ok=True)
        total_frames = int(DURATION_SEC * FPS)
        fade_frames = int(1.0 * FPS)  # 1-second fade out at the end

        # Load reference image
        if progress_callback:
            progress_callback(0, num_clips, "Loading reference image...")
        s = self._load_reference_image(ref_image_path)

        # Silent audio (idle = no speech)
        a = torch.zeros(1, int(DURATION_SEC * SAMPLE_RATE)).to(self.device)
        data = {'s': s, 'a': a}

        # One device-side buffer shared with the wrapper below; it is
        # overwritten in place for every clip so the closure always sees the
        # current clip's curve.
        dampening_curve = torch.zeros(1, total_frames, 1).to(self.device)
        original_sample = self._original_sample

        def dynamic_dampened_sample(*args, **kwargs):
            r_d = original_sample(*args, **kwargs)
            return r_d * dampening_curve

        # Monkey-patch the sample function
        self.model.sample = dynamic_dampened_sample
        generated_paths = []
        try:
            for clip_idx in range(num_clips):
                clip_num = clip_idx + 1
                mode = CLIP_MODES[clip_idx % len(CLIP_MODES)]
                status = f"Generating clip {clip_num}/{num_clips} ({mode['label']})"
                logger.info(f"[IDLE_GEN] {status}")
                if progress_callback:
                    progress_callback(clip_idx, num_clips, status)
                t0 = time.time()

                # Script this clip's motion, then publish it to the device
                pose_control, clip_dampening = self._build_motion_script(
                    mode, total_frames, fade_frames
                )
                dampening_curve.copy_(clip_dampening)

                # Run inference — pass pose_control via data['p']
                data['p'] = pose_control.to(self.device)
                with torch.no_grad():
                    out = self.model.inference(
                        data,
                        a_cfg_scale=1.0,
                        r_cfg_scale=1.0,
                        e_cfg_scale=1.0,
                        emo='S2E',
                        nfe=7,
                        seed=random.randint(1, 10000),
                    )

                # Save video: map [-1, 1] to uint8 [0, 255] and move the
                # channel axis last (presumably (frames, C, H, W) ->
                # (frames, H, W, C) — TODO confirm d_hat layout)
                d_hat = out['d_hat'].cpu().clamp(-1, 1)
                d_hat = ((d_hat + 1) / 2 * 255).to(torch.uint8)
                frames = d_hat.permute(0, 2, 3, 1).numpy()
                out_name = f"{avatar_name}_idle_{clip_num}.mp4"
                out_path = os.path.join(output_dir, out_name)
                self._save_video(frames, out_path)
                generated_paths.append(out_path)

                elapsed = time.time() - t0
                logger.info(f"[IDLE_GEN] ✓ {out_name} ({elapsed:.1f}s)")
                if progress_callback:
                    progress_callback(clip_idx + 1, num_clips, f"Clip {clip_num}/{num_clips} done ({elapsed:.0f}s)")
        finally:
            # Restore original sample function
            self.model.sample = original_sample

        logger.info(f"[IDLE_GEN] ✓ All {len(generated_paths)} clips generated")
        return generated_paths

    def _save_video(self, frames_np, output_path):
        """Save numpy frames array to mp4 using ffmpeg.

        Frames are streamed to ffmpeg's stdin as raw rgb24 and encoded with
        libx264 (yuv420p) at FPS frames per second.

        Args:
            frames_np: array whose axes 1 and 2 are height and width; the raw
                bytes are interpreted as rgb24 (assumes uint8 with 3
                channels last — TODO confirm at call sites).
            output_path: destination file; overwritten if it exists (-y).

        Raises:
            RuntimeError: if ffmpeg exits with a non-zero status.
        """
        import subprocess

        height, width = frames_np.shape[1], frames_np.shape[2]
        cmd = [
            'ffmpeg', '-y',
            '-f', 'rawvideo', '-vcodec', 'rawvideo',
            '-s', f'{width}x{height}', '-pix_fmt', 'rgb24',
            '-r', str(FPS),
            '-i', 'pipe:0',
            '-c:v', 'libx264', '-preset', 'fast', '-crf', '23',
            '-pix_fmt', 'yuv420p',
            output_path
        ]
        # stdout carries nothing of interest here, so it is discarded rather
        # than buffered; stderr is kept for diagnostics.
        result = subprocess.run(
            cmd,
            input=frames_np.tobytes(),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
        if result.returncode != 0:
            # ffmpeg prints its version/configuration banner first and the
            # actual failure last, so report the END of stderr. Undecodable
            # bytes are replaced so logging can never mask the real error.
            detail = result.stderr.decode(errors="replace")[-1000:]
            logger.error(f"[IDLE_GEN] ffmpeg error: {detail}")
            raise RuntimeError("ffmpeg encoding failed")
# Module-level singleton: holds the one shared generator once it has been built
_generator = None


def get_idle_generator():
    """Return the shared idle video generator, building it on first use.

    The generator is built from the FLOAT lipsync singleton (its model,
    device, configured input size, and the instance itself), so FLOAT must
    already be initialized. Later calls return the cached instance.

    Raises:
        RuntimeError: if the lipsync instance reports it is not ready.
    """
    global _generator

    # Fast path: already constructed
    if _generator is not None:
        return _generator

    # Imported here rather than at module load, so float_lipsync is only
    # needed once a generator is actually requested.
    from float_lipsync import get_lipsync

    float_lipsync = get_lipsync()
    if not float_lipsync.ready:
        raise RuntimeError("FLOAT not initialized — cannot generate idle videos")

    _generator = IdleVideoGenerator(
        float_model=float_lipsync.model,
        device=float_lipsync.device,
        input_size=float_lipsync.opt.input_size,
        lipsync_instance=float_lipsync,
    )
    return _generator