""" BFS — Best Face Swap Video · Hugging Face Space """ from __future__ import annotations import os import tempfile import gradio as gr import numpy as np from PIL import Image from composer import compose_frames, crop_reserved_region from video_utils import ( compute_target_size, extract_audio, frames_for_duration, load_video_frames, resize_frames, save_video, ) # --------------------------------------------------------------------------- # GPU decorator — no-op locally, activates the ZeroGPU grant on HF Spaces # --------------------------------------------------------------------------- try: import spaces GPU = spaces.GPU except ImportError: def GPU(fn=None, **kwargs): # type: ignore return fn if fn is not None else lambda f: f # --------------------------------------------------------------------------- # Global model state (loaded once per worker) # --------------------------------------------------------------------------- _pipeline_state: dict | None = None REGION_SIZE = 256 DEFAULT_FPS = 24.0 DEFAULT_DURATION = 5.0 DEFAULT_RESOLUTION = 768 # --------------------------------------------------------------------------- # Core processing function # --------------------------------------------------------------------------- @GPU(duration=300) def generate( guide_video_path: str, face_image: Image.Image, prompt: str, duration: float, fps: float, lora_strength: float, seed: int, hf_token: str = "", progress: gr.Progress = gr.Progress(track_tqdm=True), ) -> tuple[str, str]: """ Full head-swap pipeline: 1. Load + resize guide video frames 2. Compose chroma face strip (ReservedRegionFrameComposer) 3. Run LTX-2.3 diffusion 4. Crop face strip from output 5. Mux original audio back in Returns (output_video_path, status_message). """ global _pipeline_state # ---- validate inputs early ---- if guide_video_path is None: return "", "Please upload a guide video." if face_image is None: return "", "Please upload a reference face image." if not prompt.strip(): return "", "Please enter a text prompt." # ---- lazy model load ---- if _pipeline_state is None: from pipeline import load_pipeline progress(0, desc="Loading models (first run only — ~5 min)…") _pipeline_state = load_pipeline( token=hf_token.strip() or None, progress_cb=lambda msg: progress(0, desc=msg), ) progress(0.05, desc="Loading guide video…") frames, source_fps = load_video_frames(guide_video_path) if len(frames) == 0: return "", "Could not read frames from the guide video." # ---- extract audio before we do anything else ---- audio_tmp = tempfile.mktemp(suffix=".wav") has_audio = extract_audio(guide_video_path, audio_tmp) # ---- resize frames ---- progress(0.10, desc="Resizing frames…") orig_h, orig_w = frames.shape[1], frames.shape[2] target_w, target_h = compute_target_size(orig_w, orig_h, DEFAULT_RESOLUTION) frames = resize_frames(frames, target_w, target_h) # ---- trim / pad to requested duration ---- n_frames = frames_for_duration(fps, duration) if len(frames) >= n_frames: frames = frames[:n_frames] else: # loop last frame pad = np.stack([frames[-1]] * (n_frames - len(frames))) frames = np.concatenate([frames, pad], axis=0) # ---- compose chroma strip ---- progress(0.15, desc="Compositing reference face strip…") composed = compose_frames( frames, face_image, region_position="left", region_size_px=REGION_SIZE, ) # ---- run diffusion ---- progress(0.20, desc="Running LTX-2.3 diffusion…") from pipeline import run_inference generated = run_inference( _pipeline_state, composed, prompt=prompt, fps=fps, lora_strength=lora_strength, seed=int(seed), progress_cb=lambda msg: progress(0.20, desc=msg), ) # ---- crop face strip from output ---- progress(0.90, desc="Cropping reserved region…") cropped = crop_reserved_region( generated, region_position="left", region_size_px=REGION_SIZE, output_size=(target_w, target_h), ) # ---- save output video with audio ---- progress(0.95, desc="Encoding output video…") out_path = tempfile.mktemp(suffix=".mp4") save_video( cropped, fps=fps, output_path=out_path, audio_path=audio_tmp if has_audio else None, audio_duration=duration, ) progress(1.0, desc="Done.") return out_path, "Generation complete." # --------------------------------------------------------------------------- # Gradio UI # --------------------------------------------------------------------------- DESCRIPTION = """ # BFS — Best Face Swap Video Swap the identity in any video using the **V3 persistent-template** technique. The reference face is placed in a green chroma side-strip that persists across all frames, giving the model continuous identity conditioning throughout generation. **Prompt format:** ``` head_swap: FACE: Female, fair skin, ~25 years old, long wavy auburn hair, green eyes… ACTION: A person in a grey hoodie walks toward the camera indoors… ``` """ EXAMPLES: list[list] = [ # [guide_video, face_image, prompt, duration, fps, lora_strength, seed] ] with gr.Blocks(title="BFS — Best Face Swap Video") as demo: gr.Markdown(DESCRIPTION) with gr.Row(): with gr.Column(scale=1): guide_video = gr.Video(label="Guide Video", sources=["upload"]) face_image = gr.Image(label="Reference Face", type="pil", sources=["upload"]) prompt = gr.Textbox( label="Text Prompt", placeholder="head_swap:\nFACE: ...\nACTION: ...", lines=6, ) with gr.Accordion("Parameters", open=False): duration = gr.Slider(1, 15, value=DEFAULT_DURATION, step=0.5, label="Duration (seconds)") fps = gr.Slider(8, 30, value=DEFAULT_FPS, step=1.0, label="FPS") lora_strength = gr.Slider(0.5, 1.5, value=1.2, step=0.05, label="Face Swap Strength") seed = gr.Number(value=42, label="Seed", precision=0) hf_token = gr.Textbox( label="HF Token (optional)", type="password", placeholder="hf_… — only needed if the Space owner's token has no access to a gated model", ) run_btn = gr.Button("Generate", variant="primary") with gr.Column(scale=1): output_video = gr.Video(label="Result", interactive=False) status_text = gr.Textbox(label="Status", interactive=False) run_btn.click( fn=generate, inputs=[guide_video, face_image, prompt, duration, fps, lora_strength, seed, hf_token], outputs=[output_video, status_text], api_name=False, ) gr.Markdown(""" --- **Hardware:** A100 80 GB GPU required. **Model:** [Alissonerdx/BFS-Best-Face-Swap-Video](https://huggingface.co/Alissonerdx/BFS-Best-Face-Swap-Video) · Built on [LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3) **License:** For research and professional VFX use only. You must have explicit consent for any likeness you process. """) if __name__ == "__main__": demo.launch()