Spaces:

ismailkattakath
/

BFS-Best-Face-Swap-Video

Running

File size: 7,622 Bytes

"""
BFS — Best Face Swap Video  ·  Hugging Face Space
"""

from __future__ import annotations

import os
import tempfile

import gradio as gr
import numpy as np
from PIL import Image

from composer import compose_frames, crop_reserved_region
from video_utils import (
    compute_target_size,
    extract_audio,
    frames_for_duration,
    load_video_frames,
    resize_frames,
    save_video,
)

# ---------------------------------------------------------------------------
# GPU decorator — no-op locally, activates the ZeroGPU grant on HF Spaces
# ---------------------------------------------------------------------------
try:
    import spaces
except ImportError:
    def GPU(fn=None, **kwargs):  # type: ignore
        """Stand-in decorator used when the ``spaces`` package is missing.

        Works both bare (``@GPU``) and parameterised (``@GPU(duration=...)``);
        in either form the decorated function is returned unchanged and any
        keyword options are ignored.
        """
        if fn is not None:
            # Bare usage: hand the function straight back.
            return fn

        def _identity(wrapped):
            # Parameterised usage: behave as a pass-through decorator.
            return wrapped

        return _identity
else:
    # Package available: use its GPU decorator directly.
    GPU = spaces.GPU

# ---------------------------------------------------------------------------
# Global model state (loaded once per worker)
# ---------------------------------------------------------------------------
# Holds whatever `pipeline.load_pipeline` returns. It stays None until the
# first `generate` call loads the models; later calls reuse the stored value.
_pipeline_state: dict | None = None

# Passed as `region_size_px` to both `compose_frames` and
# `crop_reserved_region`, so the strip that is added is the strip that is cut.
REGION_SIZE = 256
# Initial value of the "FPS" slider in the UI.
DEFAULT_FPS = 24.0
# Initial value of the "Duration (seconds)" slider in the UI.
DEFAULT_DURATION = 5.0
# Third argument to `compute_target_size` when resizing the guide frames
# (presumably a target resolution in pixels — confirm in video_utils).
DEFAULT_RESOLUTION = 768

# ---------------------------------------------------------------------------
# Core processing function
# ---------------------------------------------------------------------------

@GPU(duration=300)
def generate(
    guide_video_path: str,
    face_image: Image.Image,
    prompt: str,
    duration: float,
    fps: float,
    lora_strength: float,
    seed: int,
    hf_token: str = "",
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[str, str]:
    """
    Full head-swap pipeline:
      1. Load + resize guide video frames
      2. Compose chroma face strip (ReservedRegionFrameComposer)
      3. Run LTX-2.3 diffusion
      4. Crop face strip from output
      5. Mux original audio back in

    Args:
        guide_video_path: Path of the uploaded guide video.
        face_image: Reference face image handed to ``compose_frames``.
        prompt: Text prompt forwarded to ``run_inference``.
        duration: Requested clip length; used to derive the frame count and
            passed to ``save_video`` as ``audio_duration``.
        fps: Frame rate used for the frame count, inference and encoding.
        lora_strength: Forwarded unchanged to ``run_inference``.
        seed: Cast to ``int`` before being forwarded to ``run_inference``.
        hf_token: Optional token; stripped and passed to ``load_pipeline``
            (as ``None`` when blank). Only consulted on the first call, when
            the models are loaded.
        progress: Gradio progress tracker used for status updates.

    Returns (output_video_path, status_message). When an input is missing or
    the guide video yields no frames, the path is ``""`` and the message
    explains what went wrong.
    """
    global _pipeline_state

    # ---- validate inputs early ----
    # Falsy checks also cover an empty path; `or ""` guards against None
    # being passed for the text fields.
    if not guide_video_path:
        return "", "Please upload a guide video."
    if face_image is None:
        return "", "Please upload a reference face image."
    if not (prompt or "").strip():
        return "", "Please enter a text prompt."

    # ---- lazy model load ----
    if _pipeline_state is None:
        from pipeline import load_pipeline
        progress(0, desc="Loading models (first run only — ~5 min)…")
        _pipeline_state = load_pipeline(
            token=(hf_token or "").strip() or None,
            progress_cb=lambda msg: progress(0, desc=msg),
        )

    progress(0.05, desc="Loading guide video…")
    # NOTE(review): the source frame rate is not used below, so frames are
    # not resampled to `fps` here — confirm whether `load_video_frames` or
    # `save_video` accounts for this, otherwise audio and video may drift.
    frames, _source_fps = load_video_frames(guide_video_path)
    # `frames` is indexed via `.shape` below, so use len() rather than
    # truthiness (ambiguous for arrays).
    if len(frames) == 0:
        return "", "Could not read frames from the guide video."

    # A private scratch directory replaces the deprecated, race-prone
    # `tempfile.mktemp`, and guarantees the extracted audio is deleted once
    # the output has been encoded (or if any step raises).
    with tempfile.TemporaryDirectory(prefix="bfs_audio_") as scratch_dir:
        # ---- extract audio before we do anything else ----
        audio_tmp = os.path.join(scratch_dir, "audio.wav")
        has_audio = extract_audio(guide_video_path, audio_tmp)

        # ---- resize frames ----
        progress(0.10, desc="Resizing frames…")
        orig_h, orig_w = frames.shape[1], frames.shape[2]
        target_w, target_h = compute_target_size(orig_w, orig_h, DEFAULT_RESOLUTION)
        frames = resize_frames(frames, target_w, target_h)

        # ---- trim / pad to requested duration ----
        n_frames = frames_for_duration(fps, duration)
        if len(frames) >= n_frames:
            frames = frames[:n_frames]
        else:
            # Hold on the last frame until the requested length is reached.
            pad = np.repeat(frames[-1:], n_frames - len(frames), axis=0)
            frames = np.concatenate([frames, pad], axis=0)

        # ---- compose chroma strip ----
        progress(0.15, desc="Compositing reference face strip…")
        composed = compose_frames(
            frames,
            face_image,
            region_position="left",
            region_size_px=REGION_SIZE,
        )

        # ---- run diffusion ----
        progress(0.20, desc="Running LTX-2.3 diffusion…")
        from pipeline import run_inference
        generated = run_inference(
            _pipeline_state,
            composed,
            prompt=prompt,
            fps=fps,
            lora_strength=lora_strength,
            seed=int(seed),
            progress_cb=lambda msg: progress(0.20, desc=msg),
        )

        # ---- crop face strip from output ----
        progress(0.90, desc="Cropping reserved region…")
        cropped = crop_reserved_region(
            generated,
            region_position="left",
            region_size_px=REGION_SIZE,
            output_size=(target_w, target_h),
        )

        # ---- save output video with audio ----
        progress(0.95, desc="Encoding output video…")
        # The result must outlive this call so the UI can serve it, so it
        # gets its own freshly created directory that is not auto-removed.
        out_path = os.path.join(tempfile.mkdtemp(prefix="bfs_out_"), "output.mp4")
        save_video(
            cropped,
            fps=fps,
            output_path=out_path,
            audio_path=audio_tmp if has_audio else None,
            audio_duration=duration,
        )

    progress(1.0, desc="Done.")
    return out_path, "Generation complete."


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# Markdown rendered at the top of the page via `gr.Markdown(DESCRIPTION)`.
# It introduces the tool and shows the expected prompt layout.
DESCRIPTION = """
# BFS — Best Face Swap Video

Swap the identity in any video using the **V3 persistent-template** technique.
The reference face is placed in a green chroma side-strip that persists across
all frames, giving the model continuous identity conditioning throughout generation.

**Prompt format:**
```
head_swap:
FACE: Female, fair skin, ~25 years old, long wavy auburn hair, green eyes…
ACTION: A person in a grey hoodie walks toward the camera indoors…
```
"""

# Placeholder for example rows; currently empty and not referenced anywhere
# in the visible part of this file. The comment inside gives the row layout.
EXAMPLES: list[list] = [
    # [guide_video, face_image, prompt, duration, fps, lora_strength, seed]
]

# Footer shown beneath the two columns: hardware, model links and licence.
_footer_markdown = """
---
**Hardware:** A100 80 GB GPU required.
**Model:** [Alissonerdx/BFS-Best-Face-Swap-Video](https://huggingface.co/Alissonerdx/BFS-Best-Face-Swap-Video) · Built on [LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3)
**License:** For research and professional VFX use only. You must have explicit consent for any likeness you process.
"""

with gr.Blocks(title="BFS — Best Face Swap Video") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: every user input plus the trigger button.
        with gr.Column(scale=1):
            guide_video = gr.Video(
                label="Guide Video",
                sources=["upload"],
            )
            face_image = gr.Image(
                label="Reference Face",
                type="pil",
                sources=["upload"],
            )
            prompt = gr.Textbox(
                label="Text Prompt",
                placeholder="head_swap:\nFACE: ...\nACTION: ...",
                lines=6,
            )

            # Tuning knobs are tucked away in a collapsed accordion.
            with gr.Accordion("Parameters", open=False):
                duration = gr.Slider(
                    minimum=1,
                    maximum=15,
                    value=DEFAULT_DURATION,
                    step=0.5,
                    label="Duration (seconds)",
                )
                fps = gr.Slider(
                    minimum=8,
                    maximum=30,
                    value=DEFAULT_FPS,
                    step=1.0,
                    label="FPS",
                )
                lora_strength = gr.Slider(
                    minimum=0.5,
                    maximum=1.5,
                    value=1.2,
                    step=0.05,
                    label="Face Swap Strength",
                )
                seed = gr.Number(value=42, label="Seed", precision=0)
                hf_token = gr.Textbox(
                    label="HF Token (optional)",
                    type="password",
                    placeholder="hf_…  — only needed if the Space owner's token has no access to a gated model",
                )

            run_btn = gr.Button("Generate", variant="primary")

        # Right column: read-only result video and status line.
        with gr.Column(scale=1):
            output_video = gr.Video(label="Result", interactive=False)
            status_text = gr.Textbox(label="Status", interactive=False)

    # Order here must mirror the positional parameters of `generate`.
    _generate_inputs = [
        guide_video,
        face_image,
        prompt,
        duration,
        fps,
        lora_strength,
        seed,
        hf_token,
    ]
    _generate_outputs = [output_video, status_text]

    run_btn.click(
        fn=generate,
        inputs=_generate_inputs,
        outputs=_generate_outputs,
        api_name=False,
    )

    gr.Markdown(_footer_markdown)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()