# Web-viewer residue collapsed into this comment (the line-number gutter was dropped).
# File size: 7,622 Bytes · ids shown in the page header: 1405c30, 18c57e9
"""
BFS — Best Face Swap Video  ·  Hugging Face Space
"""

from __future__ import annotations

import os
import tempfile

import gradio as gr
import numpy as np
from PIL import Image

from composer import compose_frames, crop_reserved_region
from video_utils import (
    compute_target_size,
    extract_audio,
    frames_for_duration,
    load_video_frames,
    resize_frames,
    save_video,
)

# ---------------------------------------------------------------------------
# GPU decorator — no-op locally, activates the ZeroGPU grant on HF Spaces
# ---------------------------------------------------------------------------
try:
    # On Hugging Face Spaces the `spaces` package supplies the real decorator.
    import spaces
    GPU = spaces.GPU
except ImportError:
    def GPU(fn=None, **kwargs):  # type: ignore
        """Stand-in decorator used when `spaces` cannot be imported.

        Works both bare (``@GPU``) and parameterised (``@GPU(duration=...)``);
        keyword arguments are accepted and ignored, and the decorated
        function is returned unchanged.
        """
        if fn is None:
            # Called with keyword options only: hand back a pass-through
            # decorator that will receive the function next.
            def _identity(f):
                return f
            return _identity
        # Called directly on a function: nothing to wrap.
        return fn

# ---------------------------------------------------------------------------
# Global model state (loaded once per worker)
# ---------------------------------------------------------------------------
# Holds the value returned by pipeline.load_pipeline(); stays None until
# generate() runs for the first time and is reused by later calls.
_pipeline_state: dict | None = None

# Passed as region_size_px when composing and when cropping the reference-face
# strip (presumably pixels, going by the parameter name).
REGION_SIZE = 256
# Initial value of the "FPS" slider.
DEFAULT_FPS = 24.0
# Initial value of the "Duration (seconds)" slider.
DEFAULT_DURATION = 5.0
# Third argument to compute_target_size() when resizing the guide frames; its
# exact meaning (e.g. long vs. short side) is defined in video_utils.
DEFAULT_RESOLUTION = 768

# ---------------------------------------------------------------------------
# Core processing function
# ---------------------------------------------------------------------------

@GPU(duration=300)
def generate(
    guide_video_path: str,
    face_image: Image.Image,
    prompt: str,
    duration: float,
    fps: float,
    lora_strength: float,
    seed: int,
    hf_token: str = "",
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[str, str]:
    """
    Full head-swap pipeline:
      1. Load + resize guide video frames
      2. Compose chroma face strip (ReservedRegionFrameComposer)
      3. Run LTX-2.3 diffusion
      4. Crop face strip from output
      5. Mux original audio back in

    Args:
        guide_video_path: Path of the uploaded guide video.
        face_image: Reference face as a PIL image.
        prompt: Text prompt forwarded unchanged to ``run_inference``.
        duration: Requested clip length; together with ``fps`` it determines
            the frame count via ``frames_for_duration``.
        fps: Frame rate used for frame counting, inference and encoding.
        lora_strength: Forwarded to ``run_inference``.
        seed: Forwarded to ``run_inference`` after ``int()`` conversion.
        hf_token: Optional token, only used the first time the models are
            loaded in this process; blank or ``None`` means "no token".
        progress: Gradio progress reporter.

    Returns (output_video_path, status_message). On invalid input the path is
    an empty string and the message explains what is missing.
    """
    global _pipeline_state

    # ---- validate inputs early (None and empty values are both rejected) ----
    if not guide_video_path:
        return "", "Please upload a guide video."
    if face_image is None:
        return "", "Please upload a reference face image."
    if not (prompt or "").strip():
        return "", "Please enter a text prompt."

    # ---- lazy model load ----
    if _pipeline_state is None:
        from pipeline import load_pipeline
        progress(0, desc="Loading models (first run only — ~5 min)…")
        _pipeline_state = load_pipeline(
            token=(hf_token or "").strip() or None,
            progress_cb=lambda msg: progress(0, desc=msg),
        )

    progress(0.05, desc="Loading guide video…")
    # The source frame rate is returned by the loader but not used here.
    frames, _source_fps = load_video_frames(guide_video_path)
    if len(frames) == 0:
        return "", "Could not read frames from the guide video."

    # Intermediate files live in a private scratch directory that is removed
    # when the block exits, on success or failure. This replaces the
    # deprecated, race-prone tempfile.mktemp() and stops the extracted audio
    # from leaking on every call. The paths inside do not exist beforehand, so
    # the helpers still write to fresh locations as before.
    with tempfile.TemporaryDirectory(prefix="bfs_work_") as work_dir:
        # ---- extract audio before we do anything else ----
        audio_tmp = os.path.join(work_dir, "guide_audio.wav")
        has_audio = extract_audio(guide_video_path, audio_tmp)

        # ---- resize frames ----
        progress(0.10, desc="Resizing frames…")
        orig_h, orig_w = frames.shape[1], frames.shape[2]
        target_w, target_h = compute_target_size(orig_w, orig_h, DEFAULT_RESOLUTION)
        frames = resize_frames(frames, target_w, target_h)

        # ---- trim / pad to requested duration ----
        n_frames = frames_for_duration(fps, duration)
        missing = n_frames - len(frames)
        if missing <= 0:
            frames = frames[:n_frames]
        else:
            # Too short: hold the last frame for the remaining duration.
            pad = np.repeat(frames[-1:], missing, axis=0)
            frames = np.concatenate([frames, pad], axis=0)

        # ---- compose chroma strip ----
        progress(0.15, desc="Compositing reference face strip…")
        composed = compose_frames(
            frames,
            face_image,
            region_position="left",
            region_size_px=REGION_SIZE,
        )

        # ---- run diffusion ----
        progress(0.20, desc="Running LTX-2.3 diffusion…")
        from pipeline import run_inference
        generated = run_inference(
            _pipeline_state,
            composed,
            prompt=prompt,
            fps=fps,
            lora_strength=lora_strength,
            seed=int(seed),
            progress_cb=lambda msg: progress(0.20, desc=msg),
        )

        # ---- crop face strip from output ----
        progress(0.90, desc="Cropping reserved region…")
        cropped = crop_reserved_region(
            generated,
            region_position="left",
            region_size_px=REGION_SIZE,
            output_size=(target_w, target_h),
        )

        # ---- save output video with audio ----
        progress(0.95, desc="Encoding output video…")
        # The result must outlive this call so the UI can serve it, so it gets
        # its own directory outside the auto-cleaned scratch directory.
        out_dir = tempfile.mkdtemp(prefix="bfs_out_")
        out_path = os.path.join(out_dir, "result.mp4")
        try:
            save_video(
                cropped,
                fps=fps,
                output_path=out_path,
                audio_path=audio_tmp if has_audio else None,
                audio_duration=duration,
            )
        except Exception:
            # Do not leave a half-written result behind when encoding fails.
            import shutil
            shutil.rmtree(out_dir, ignore_errors=True)
            raise

    progress(1.0, desc="Done.")
    return out_path, "Generation complete."


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# Markdown rendered at the top of the page via gr.Markdown(DESCRIPTION).
DESCRIPTION = """
# BFS — Best Face Swap Video

Swap the identity in any video using the **V3 persistent-template** technique.
The reference face is placed in a green chroma side-strip that persists across
all frames, giving the model continuous identity conditioning throughout generation.

**Prompt format:**
```
head_swap:
FACE: Female, fair skin, ~25 years old, long wavy auburn hair, green eyes…
ACTION: A person in a grey hoodie walks toward the camera indoors…
```
"""

# Placeholder for example rows; currently empty and not referenced by the UI
# code visible in this file.
EXAMPLES: list[list] = [
    # [guide_video, face_image, prompt, duration, fps, lora_strength, seed]
]

# Build the page: a header, one row with two equal-scale columns (inputs and
# results), the click wiring, and a footer note.
with gr.Blocks(title="BFS — Best Face Swap Video") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # First column: the inputs passed to generate().
        with gr.Column(scale=1):
            guide_video = gr.Video(label="Guide Video", sources=["upload"])
            # type="pil" matches generate()'s face_image: Image.Image parameter.
            face_image   = gr.Image(label="Reference Face", type="pil", sources=["upload"])
            prompt       = gr.Textbox(
                label="Text Prompt",
                placeholder="head_swap:\nFACE: ...\nACTION: ...",
                lines=6,
            )

            # Tuning controls, collapsed by default (open=False).
            with gr.Accordion("Parameters", open=False):
                duration       = gr.Slider(1, 15, value=DEFAULT_DURATION, step=0.5,  label="Duration (seconds)")
                fps            = gr.Slider(8, 30,  value=DEFAULT_FPS,      step=1.0,  label="FPS")
                lora_strength  = gr.Slider(0.5, 1.5, value=1.2,            step=0.05, label="Face Swap Strength")
                seed           = gr.Number(value=42, label="Seed", precision=0)
                hf_token       = gr.Textbox(
                    label="HF Token (optional)",
                    type="password",
                    placeholder="hf_…  — only needed if the Space owner's token has no access to a gated model",
                )

            run_btn = gr.Button("Generate", variant="primary")

        # Second column: read-only result video and status message.
        with gr.Column(scale=1):
            output_video  = gr.Video(label="Result", interactive=False)
            status_text   = gr.Textbox(label="Status", interactive=False)

    # The order of `inputs` must match generate()'s positional parameters, and
    # `outputs` must match its (video_path, status_message) return tuple.
    # NOTE(review): api_name=False presumably keeps this endpoint out of the
    # public API — confirm against the Gradio event-listener docs.
    run_btn.click(
        fn=generate,
        inputs=[guide_video, face_image, prompt, duration, fps, lora_strength, seed, hf_token],
        outputs=[output_video, status_text],
        api_name=False,
    )

    # Footer: hardware, model and license notes.
    gr.Markdown("""
---
**Hardware:** A100 80 GB GPU required.
**Model:** [Alissonerdx/BFS-Best-Face-Swap-Video](https://huggingface.co/Alissonerdx/BFS-Best-Face-Swap-Video) · Built on [LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3)
**License:** For research and professional VFX use only. You must have explicit consent for any likeness you process.
""")


# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()