Spaces:

AvatarResurrectionChatbot
/

AvatarChatbot

Sleeping

App Files Files Community

AvatarChatbot / idle_generator.py

AvatarResurrectionChatbot

Update idle_generator.py

556a15f verified 3 months ago

Raw

History Blame Contribute Delete

9.57 kB

	"""
	Idle Video Generator — generates natural-looking idle animation clips using FLOAT.
	Uses scripted pose control with dampening curves for realistic idle behavior.
	Extracted from standalone script for integration into avatar creation pipeline.
	"""
	import os
	import sys
	import math
	import random
	import time
	import logging
	import torch
	import cv2
	import numpy as np

	logger = logging.getLogger(__name__)

	# Put the FLOAT repo directory at the front of the import search path.
	# The membership check keeps it from being inserted more than once.
	FLOAT_REPO_PATH = "/app/float_repo"
	if FLOAT_REPO_PATH not in sys.path:
	    sys.path.insert(0, FLOAT_REPO_PATH)

	# Generation settings
	FPS = 25.0           # frame rate: sets frames per clip and the ffmpeg input rate
	DURATION_SEC = 5.0   # length of each idle clip in seconds
	SAMPLE_RATE = 16000  # samples per second of the silent audio fed to the model
	NUM_CLIPS = 6        # default number of clips produced by generate()

	# Clip mode definitions — each clip has different motion characteristics.
	# The per-frame dampening multiplier is a sine wave that swings between
	# damp_mid - damp_amp and damp_mid + damp_amp; "label" is only used in
	# status/log messages.
	CLIP_MODES = [
	    # Clips 1-2: Wide Variable (35%-65%)
	    {"damp_mid": 0.50, "damp_amp": 0.15, "label": "Wide Variable"},
	    {"damp_mid": 0.50, "damp_amp": 0.15, "label": "Wide Variable"},
	    # Clips 3-5: High Fixed (55%-75%)
	    {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"},
	    {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"},
	    {"damp_mid": 0.65, "damp_amp": 0.10, "label": "High Fixed"},
	    # Clip 6: Very High Activity (80%-90%)
	    {"damp_mid": 0.85, "damp_amp": 0.05, "label": "Very High Activity"},
	]


	class IdleVideoGenerator:
	    """Generates idle animation clips using FLOAT model with scripted pose control.

	    While ``generate`` runs, ``model.sample`` is temporarily replaced by a
	    wrapper that multiplies its result by a per-frame dampening curve; the
	    original callable is put back when generation finishes or raises.
	    """

	    def __init__(self, float_model, device, input_size=512, lipsync_instance=None):
	        """
	        Args:
	            float_model: Loaded FLOAT model instance (already on device, eval mode)
	            device: torch device
	            input_size: image size (default 512)
	            lipsync_instance: FloatLipsync instance for reusing its face crop logic
	        """
	        self.model = float_model
	        self.device = device
	        self.input_size = input_size
	        self.lipsync = lipsync_instance

	        # Keep the unpatched sample function so generate() can wrap it and
	        # restore it afterwards.
	        self._original_sample = self.model.sample

	    def _load_reference_image(self, image_path):
	        """Load and preprocess a reference image.

	        When a lipsync instance is available its own preload routine is used
	        and a copy of the tensor it stored is returned. Otherwise the image is
	        read with OpenCV, resized to ``input_size`` square, normalized with
	        mean/std 0.5 and returned as a batch of one on ``self.device``.

	        Raises:
	            FileNotFoundError: fallback path only, when OpenCV cannot read the file.
	        """
	        if self.lipsync:
	            # Use the same crop/transform as float_lipsync for consistency
	            self.lipsync._preload_reference_image(image_path)
	            return self.lipsync.preprocessed_ref_image.clone()

	        # Fallback: basic resize without face crop
	        import albumentations as A
	        import albumentations.pytorch.transforms as A_pytorch

	        img = cv2.imread(image_path)
	        if img is None:
	            raise FileNotFoundError(f"Could not read image: {image_path}")
	        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	        transform = A.Compose([
	            A.Resize(height=self.input_size, width=self.input_size, interpolation=cv2.INTER_AREA),
	            A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
	            A_pytorch.ToTensorV2(),
	        ])
	        return transform(image=img)['image'].unsqueeze(0).to(self.device)

	    def generate(self, ref_image_path, output_dir, avatar_name,
	                 num_clips=NUM_CLIPS, progress_callback=None):
	        """
	        Generate idle video clips for an avatar.

	        Args:
	            ref_image_path: Path to reference face image
	            output_dir: Directory to save clips (e.g. /app/avatars/Darwin/idlevideos/)
	            avatar_name: Name for file naming (e.g. "Darwin")
	            num_clips: Number of clips to generate (default NUM_CLIPS). Clip
	                modes are taken from CLIP_MODES in order and wrap around when
	                num_clips exceeds its length.
	            progress_callback: Optional function(clip_index, total, status_text) for progress updates

	        Returns:
	            List of generated video file paths

	        Raises:
	            RuntimeError: if ffmpeg fails to encode a clip.
	        """
	        os.makedirs(output_dir, exist_ok=True)

	        total_frames = int(DURATION_SEC * FPS)
	        fade_frames = int(1.0 * FPS)

	        # Load reference image
	        if progress_callback:
	            progress_callback(0, num_clips, "Loading reference image...")

	        s = self._load_reference_image(ref_image_path)

	        # Silent audio (idle = no speech)
	        a = torch.zeros(1, int(DURATION_SEC * SAMPLE_RATE)).to(self.device)
	        data = {'s': s, 'a': a}

	        # Dampening curve read by the patched sample function. It is refilled
	        # in place for every clip, so the closure below always sees the
	        # values of the clip currently being generated.
	        dampening_curve = torch.zeros(1, total_frames, 1).to(self.device)

	        original_sample = self._original_sample

	        def dynamic_dampened_sample(*args, **kwargs):
	            # Forward positional and keyword arguments unchanged so the
	            # wrapper works however the model calls sample().
	            r_d = original_sample(*args, **kwargs)
	            return r_d * dampening_curve

	        # Monkey-patch the sample function
	        self.model.sample = dynamic_dampened_sample

	        generated_paths = []

	        try:
	            for clip_idx in range(num_clips):
	                clip_num = clip_idx + 1
	                mode = CLIP_MODES[clip_idx % len(CLIP_MODES)]

	                status = f"Generating clip {clip_num}/{num_clips} ({mode['label']})"
	                logger.info(f"[IDLE_GEN] {status}")
	                if progress_callback:
	                    progress_callback(clip_idx, num_clips, status)

	                t0 = time.time()

	                # Build pose control tensor
	                pose_control = torch.zeros(1, total_frames, 20).to(self.device)

	                # Randomize phases for variety
	                pitch_phase = random.uniform(0, 2 * math.pi)
	                yaw_phase = random.uniform(0, 2 * math.pi)
	                mouth_phase = random.uniform(0, 2 * math.pi)
	                blend_phase = random.uniform(0, 2 * math.pi)
	                damp_phase = random.uniform(0, 2 * math.pi)

	                for t in range(total_frames):
	                    # Dampening curve
	                    dampening_curve[0, t, 0] = mode["damp_mid"] + mode["damp_amp"] * math.sin(t * 0.06 + damp_phase)

	                    # Subtle head sway
	                    pose_control[:, t, 1] = 0.08 * math.sin(t * 0.05 + pitch_phase)
	                    pose_control[:, t, 2] = 0.04 * math.cos(t * 0.03 + yaw_phase)

	                    # Mouth clamp (peak 5.0, min 2.0)
	                    overall_tension = 3.5 + 1.5 * math.sin(t * 0.08 + mouth_phase)
	                    blend = (math.sin(t * 0.04 + blend_phase) + 1.0) / 2.0

	                    # Split the tension between channels 5 and 9 by `blend`.
	                    pose_control[:, t, 5] = overall_tension * blend
	                    pose_control[:, t, 9] = overall_tension * (1.0 - blend)

	                    # 1-second fade out at end; the multiplier reaches exactly
	                    # 0 on the last frame.
	                    if t >= total_frames - fade_frames:
	                        fade_multiplier = (total_frames - t - 1) / float(fade_frames)
	                        pose_control[:, t, :] *= fade_multiplier

	                # Run inference — pass pose_control via data['p']
	                data['p'] = pose_control
	                with torch.no_grad():
	                    out = self.model.inference(
	                        data,
	                        a_cfg_scale=1.0,
	                        r_cfg_scale=1.0,
	                        e_cfg_scale=1.0,
	                        emo='S2E',
	                        nfe=7,
	                        seed=random.randint(1, 10000),
	                    )

	                # Save video: map [-1, 1] output to uint8 [0, 255] and move
	                # the channel axis last before handing frames to ffmpeg.
	                d_hat = out['d_hat'].cpu().clamp(-1, 1)
	                d_hat = ((d_hat + 1) / 2 * 255).to(torch.uint8)
	                frames = d_hat.permute(0, 2, 3, 1).numpy()

	                out_name = f"{avatar_name}_idle_{clip_num}.mp4"
	                out_path = os.path.join(output_dir, out_name)
	                self._save_video(frames, out_path)

	                generated_paths.append(out_path)
	                elapsed = time.time() - t0
	                logger.info(f"[IDLE_GEN] ✓ {out_name} ({elapsed:.1f}s)")

	                if progress_callback:
	                    progress_callback(clip_idx + 1, num_clips, f"Clip {clip_num}/{num_clips} done ({elapsed:.0f}s)")

	        finally:
	            # Restore original sample function, even if a clip failed
	            self.model.sample = original_sample

	        logger.info(f"[IDLE_GEN] ✓ All {len(generated_paths)} clips generated")
	        return generated_paths

	    def _save_video(self, frames_np, output_path):
	        """Save numpy frames array to mp4 using ffmpeg.

	        Frame height and width are read from axes 1 and 2 of ``frames_np``;
	        the raw bytes are piped to ffmpeg as rgb24 at ``FPS`` and encoded
	        with libx264 / yuv420p.

	        Raises:
	            RuntimeError: if ffmpeg exits with a non-zero status.
	        """
	        import subprocess

	        height, width = frames_np.shape[1], frames_np.shape[2]

	        cmd = [
	            'ffmpeg', '-y',
	            '-f', 'rawvideo', '-vcodec', 'rawvideo',
	            '-s', f'{width}x{height}', '-pix_fmt', 'rgb24',
	            '-r', str(FPS),
	            '-i', 'pipe:0',
	            '-c:v', 'libx264', '-preset', 'fast', '-crf', '23',
	            '-pix_fmt', 'yuv420p',
	            output_path
	        ]

	        # stdout is never inspected, so discard it; stderr is kept for the
	        # error report below.
	        process = subprocess.Popen(
	            cmd, stdin=subprocess.PIPE,
	            stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
	        )
	        _, stderr = process.communicate(input=frames_np.tobytes())

	        if process.returncode != 0:
	            # errors="replace" keeps undecodable ffmpeg output from raising
	            # and hiding the real failure.
	            logger.error(f"[IDLE_GEN] ffmpeg error: {stderr.decode(errors='replace')[:300]}")
	            raise RuntimeError("ffmpeg encoding failed")


	# Module-level singleton: holds the one shared IdleVideoGenerator once built
	_generator = None


	def get_idle_generator():
	    """Return the shared idle video generator, building it on first use.

	    The generator is constructed from the model, device and input size of
	    the object returned by ``float_lipsync.get_lipsync()`` and then cached,
	    so later calls return the same instance.

	    Raises:
	        RuntimeError: if the lipsync object reports it is not ready.
	    """
	    global _generator

	    # Fast path: already built on an earlier call.
	    if _generator is not None:
	        return _generator

	    # Imported here rather than at module top, as in the original.
	    from float_lipsync import get_lipsync

	    engine = get_lipsync()
	    if not engine.ready:
	        raise RuntimeError("FLOAT not initialized — cannot generate idle videos")

	    _generator = IdleVideoGenerator(
	        float_model=engine.model,
	        device=engine.device,
	        input_size=engine.opt.input_size,
	        lipsync_instance=engine,
	    )
	    return _generator