Spaces:

ismailkattakath
/

BFS-Best-Face-Swap-Video

Running

App Files Files Community

BFS-Best-Face-Swap-Video / app.py

ismailkattakath

Fix: set api_name=False to suppress gradio_client schema TypeError on /api/info

18c57e9 verified about 1 month ago

Raw

History Blame Contribute Delete

7.62 kB

	"""
	BFS — Best Face Swap Video · Hugging Face Space
	"""

	from __future__ import annotations

	import os
	import tempfile

	import gradio as gr
	import numpy as np
	from PIL import Image

	from composer import compose_frames, crop_reserved_region
	from video_utils import (
	compute_target_size,
	extract_audio,
	frames_for_duration,
	load_video_frames,
	resize_frames,
	save_video,
	)

	# ---------------------------------------------------------------------------
	# GPU decorator — no-op locally, activates the ZeroGPU grant on HF Spaces
	# ---------------------------------------------------------------------------
	try:
	    import spaces
	    GPU = spaces.GPU
	except ImportError:
	    def GPU(fn=None, **kwargs):  # type: ignore
	        """Stand-in for ``spaces.GPU`` when the ``spaces`` package is missing.

	        Works both bare (``@GPU``) and parameterised (``@GPU(duration=...)``);
	        keyword options are accepted and ignored, and the decorated function
	        is handed back unchanged.
	        """
	        if fn is None:
	            # Called with options only: hand back a pass-through decorator.
	            def _passthrough(f):
	                return f

	            return _passthrough
	        # Called directly on a function: return it untouched.
	        return fn

	# ---------------------------------------------------------------------------
	# Global model state (loaded once per worker)
	# ---------------------------------------------------------------------------
	# Cached return value of ``load_pipeline``; stays ``None`` until the first
	# ``generate`` call populates it, after which it is reused.
	_pipeline_state: dict | None = None

	# Passed as ``region_size_px`` when composing and cropping the face strip.
	REGION_SIZE = 256
	# Initial value of the "FPS" slider.
	DEFAULT_FPS = 24.0
	# Initial value of the "Duration (seconds)" slider.
	DEFAULT_DURATION = 5.0
	# Resolution argument handed to ``compute_target_size`` when resizing frames.
	DEFAULT_RESOLUTION = 768

	# ---------------------------------------------------------------------------
	# Core processing function
	# ---------------------------------------------------------------------------

	@GPU(duration=300)
	def generate(
	    guide_video_path: str,
	    face_image: Image.Image,
	    prompt: str,
	    duration: float,
	    fps: float,
	    lora_strength: float,
	    seed: int,
	    hf_token: str = "",
	    progress: gr.Progress = gr.Progress(track_tqdm=True),
	) -> tuple[str, str]:
	    """Run the full head-swap pipeline on a guide video and a reference face.

	    Steps:
	      1. Load and resize the guide video frames.
	      2. Compose the chroma face strip (ReservedRegionFrameComposer).
	      3. Run LTX-2.3 diffusion.
	      4. Crop the face strip from the output.
	      5. Encode the result, muxing the original audio back in when present.

	    Args:
	        guide_video_path: Path of the uploaded guide video; read for both
	            frames and audio.
	        face_image: Reference face image handed to ``compose_frames``.
	        prompt: Text prompt forwarded to ``run_inference``.
	        duration: Requested clip length (the UI labels it in seconds); used
	            to derive the frame count and passed on as ``audio_duration``.
	        fps: Frame rate used for the frame count, inference and encoding.
	        lora_strength: Forwarded unchanged to ``run_inference``.
	        seed: Converted with ``int()`` and forwarded to ``run_inference``.
	        hf_token: Optional token; blank or ``None`` becomes ``None`` for
	            ``load_pipeline``. Only consulted on the first (model-loading) call.
	        progress: Gradio progress tracker used for status updates.

	    Returns:
	        ``(output_video_path, status_message)``. When validation fails or no
	        frames can be read, the path is ``""`` and the message says why.
	    """
	    global _pipeline_state

	    # ---- validate inputs early ----
	    # Falsy checks cover both None and "" so a cleared component cannot
	    # reach the pipeline or crash on ``.strip()``.
	    if not guide_video_path:
	        return "", "Please upload a guide video."
	    if face_image is None:
	        return "", "Please upload a reference face image."
	    if not prompt or not prompt.strip():
	        return "", "Please enter a text prompt."

	    # ---- lazy model load ----
	    if _pipeline_state is None:
	        from pipeline import load_pipeline

	        progress(0, desc="Loading models (first run only — ~5 min)…")
	        _pipeline_state = load_pipeline(
	            token=(hf_token or "").strip() or None,
	            progress_cb=lambda msg: progress(0, desc=msg),
	        )

	    progress(0.05, desc="Loading guide video…")
	    # The source frame rate is returned as well but is not needed here.
	    frames, _ = load_video_frames(guide_video_path)
	    # Explicit length test: ``frames`` is indexed like an array below, and
	    # array truthiness is ambiguous.
	    if len(frames) == 0:
	        return "", "Could not read frames from the guide video."

	    # The scratch directory is created securely and removed on exit (even on
	    # error), so the extracted audio never leaks. The file path inside it
	    # does not exist yet, matching what ``mktemp`` used to hand out.
	    with tempfile.TemporaryDirectory(prefix="bfs_") as scratch_dir:
	        # ---- extract audio before we do anything else ----
	        audio_tmp = os.path.join(scratch_dir, "audio.wav")
	        has_audio = extract_audio(guide_video_path, audio_tmp)

	        # ---- resize frames ----
	        progress(0.10, desc="Resizing frames…")
	        # assumes frames is laid out (frame, height, width, ...) — TODO confirm
	        orig_h, orig_w = frames.shape[1], frames.shape[2]
	        target_w, target_h = compute_target_size(orig_w, orig_h, DEFAULT_RESOLUTION)
	        frames = resize_frames(frames, target_w, target_h)

	        # ---- trim / pad to requested duration ----
	        n_frames = frames_for_duration(fps, duration)
	        missing = n_frames - len(frames)
	        if missing <= 0:
	            frames = frames[:n_frames]
	        else:
	            # Too short: hold (repeat) the last frame to fill the gap.
	            pad = np.repeat(frames[-1:], missing, axis=0)
	            frames = np.concatenate([frames, pad], axis=0)

	        # ---- compose chroma strip ----
	        progress(0.15, desc="Compositing reference face strip…")
	        composed = compose_frames(
	            frames,
	            face_image,
	            region_position="left",
	            region_size_px=REGION_SIZE,
	        )

	        # ---- run diffusion ----
	        progress(0.20, desc="Running LTX-2.3 diffusion…")
	        from pipeline import run_inference

	        generated = run_inference(
	            _pipeline_state,
	            composed,
	            prompt=prompt,
	            fps=fps,
	            lora_strength=lora_strength,
	            seed=int(seed),
	            progress_cb=lambda msg: progress(0.20, desc=msg),
	        )

	        # ---- crop face strip from output ----
	        progress(0.90, desc="Cropping reserved region…")
	        cropped = crop_reserved_region(
	            generated,
	            region_position="left",
	            region_size_px=REGION_SIZE,
	            output_size=(target_w, target_h),
	        )

	        # ---- save output video with audio ----
	        progress(0.95, desc="Encoding output video…")
	        # The result must outlive this function (its path is returned), so it
	        # goes into its own securely created directory that is not removed here.
	        out_path = os.path.join(tempfile.mkdtemp(prefix="bfs_out_"), "output.mp4")
	        save_video(
	            cropped,
	            fps=fps,
	            output_path=out_path,
	            audio_path=audio_tmp if has_audio else None,
	            audio_duration=duration,
	        )

	    progress(1.0, desc="Done.")
	    return out_path, "Generation complete."


	# ---------------------------------------------------------------------------
	# Gradio UI
	# ---------------------------------------------------------------------------

	# Markdown shown at the top of the page (rendered via gr.Markdown inside
	# the Blocks layout). It is a runtime string: edit the wording with care.
	DESCRIPTION = """
	# BFS — Best Face Swap Video

	Swap the identity in any video using the V3 persistent-template technique.
	The reference face is placed in a green chroma side-strip that persists across
	all frames, giving the model continuous identity conditioning throughout generation.

	Prompt format:
	```
	head_swap:
	FACE: Female, fair skin, ~25 years old, long wavy auburn hair, green eyes…
	ACTION: A person in a grey hoodie walks toward the camera indoors…
	```
	"""

	# Placeholder for example rows; currently empty and not referenced by the
	# visible code in this file.
	EXAMPLES: list[list] = [
	# Intended row layout:
	# [guide_video, face_image, prompt, duration, fps, lora_strength, seed]
	]

	# Two-column page: inputs and tunables on the left, results on the right.
	with gr.Blocks(title="BFS — Best Face Swap Video") as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Row():
	        # ---- left column: user inputs ----
	        with gr.Column(scale=1):
	            guide_video = gr.Video(sources=["upload"], label="Guide Video")
	            face_image = gr.Image(sources=["upload"], type="pil", label="Reference Face")
	            prompt = gr.Textbox(
	                lines=6,
	                label="Text Prompt",
	                placeholder="head_swap:\nFACE: ...\nACTION: ...",
	            )

	            # Advanced settings stay collapsed until the user opens them.
	            with gr.Accordion("Parameters", open=False):
	                duration = gr.Slider(
	                    minimum=1,
	                    maximum=15,
	                    value=DEFAULT_DURATION,
	                    step=0.5,
	                    label="Duration (seconds)",
	                )
	                fps = gr.Slider(
	                    minimum=8,
	                    maximum=30,
	                    value=DEFAULT_FPS,
	                    step=1.0,
	                    label="FPS",
	                )
	                lora_strength = gr.Slider(
	                    minimum=0.5,
	                    maximum=1.5,
	                    value=1.2,
	                    step=0.05,
	                    label="Face Swap Strength",
	                )
	                seed = gr.Number(label="Seed", value=42, precision=0)
	                hf_token = gr.Textbox(
	                    type="password",
	                    label="HF Token (optional)",
	                    placeholder="hf_… — only needed if the Space owner's token has no access to a gated model",
	                )

	            run_btn = gr.Button("Generate", variant="primary")

	        # ---- right column: read-only results ----
	        with gr.Column(scale=1):
	            output_video = gr.Video(interactive=False, label="Result")
	            status_text = gr.Textbox(interactive=False, label="Status")

	    # Input order mirrors the positional parameters of ``generate``.
	    # ``api_name=False`` keeps this handler out of the generated API.
	    run_btn.click(
	        fn=generate,
	        inputs=[
	            guide_video,
	            face_image,
	            prompt,
	            duration,
	            fps,
	            lora_strength,
	            seed,
	            hf_token,
	        ],
	        outputs=[output_video, status_text],
	        api_name=False,
	    )

	    # Footer: hardware, model credits and licence notice.
	    gr.Markdown("""
	---
	Hardware: A100 80 GB GPU required.
	Model: [Alissonerdx/BFS-Best-Face-Swap-Video](https://huggingface.co/Alissonerdx/BFS-Best-Face-Swap-Video) · Built on [LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3)
	License: For research and professional VFX use only. You must have explicit consent for any likeness you process.
	""")


	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()