Spaces:

ltx-community
/

ltx-2.3-deblur

Running on Zero

App Files Files Community

linoyts HF Staff committed 9 days ago

Commit

fca22f1

verified ·

1 Parent(s): bb0db6b

fixes: natural-speed/aspect, audio, progress, prompts, examples

Browse files

Files changed (4) hide show

app.py +79 -64
examples/landscape_blur.mp4 +2 -2
examples/man_laughing_blur.mp4 +2 -2
examples/slicing_veggie_blur.mp4 +2 -2

app.py CHANGED Viewed

@@ -8,10 +8,11 @@ import random
 import tempfile
 import numpy as np
 import spaces
 import torch
 import gradio as gr
-from PIL import Image
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
@@ -30,10 +31,7 @@ NUM_STEPS = len(DISTILLED_SIGMA_VALUES)  # 8-step distilled schedule
 MAX_SEED = np.iinfo(np.int32).max
 HF_TOKEN = os.environ.get("HF_TOKEN")
-RES_PRESETS = {
-    "Fast (768×448)": (768, 448),
-    "Quality (960×544)": (960, 544),
-}
 FRAME_CHOICES = [49, 73, 97, 121]
 # --- Load pipeline once at module scope (ZeroGPU registers it) ---------------
@@ -47,40 +45,64 @@ pipe.set_adapters("deblur", LORA_SCALE)
 # --- Helpers ----------------------------------------------------------------
-def _resample(frames, n):
-    idx = np.linspace(0, len(frames) - 1, n).round().astype(int)
-    return [frames[i] for i in idx]
-def _pick_resolution(first_frame: Image.Image, preset: str):
     w, h = RES_PRESETS[preset]
     if first_frame.height > first_frame.width:
         w, h = h, w
     return w, h
-def _build_prompt(scene: str) -> str:
     scene = scene.strip() or "the scene"
-    return (
         f"Reference shows {scene}, heavily out of focus with soft defocused blur and no fine detail. "
         f"Edited shows the same scene in sharp focus with crisp detail and clean edges. "
         f"DEBLUR {scene}. "
         f"Subject identity, framing, and background geometry are identical to the reference; "
         f"only focus and sharpness differ between reference and edited."
     )
 def _duration(*args, **kwargs):
-    # args mirror the GPU fn: (video, scene, preset, num_frames, lora_scale, seed, randomize, [progress])
-    preset = args[2] if len(args) > 2 else "Fast"
-    num_frames = args[3] if len(args) > 3 else 73
     per_frame = 1.6 if "Quality" in str(preset) else 1.0
-    return int(50 + int(num_frames) * per_frame)
 # --- Inference --------------------------------------------------------------
 @spaces.GPU(duration=_duration)
-def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
            progress=gr.Progress(track_tqdm=True)):
     if video is None:
         raise gr.Error("Please upload an out-of-focus video to sharpen.")
@@ -88,43 +110,35 @@ def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
     if randomize:
         seed = random.randint(0, MAX_SEED)
     seed = int(seed)
-    frames = load_video(video)
-    if not frames:
-        raise gr.Error("Could not read any frames from that video.")
-    width, height = _pick_resolution(frames[0], preset)
     num_frames = int(num_frames)
-    ref = [f.convert("RGB").resize((width, height), Image.LANCZOS)
-           for f in _resample(frames, num_frames)]
     pipe.set_adapters("deblur", float(lora_scale))
-    prompt = _build_prompt(scene)
-    ref_cond = LTX2ReferenceCondition(frames=ref, strength=1.0)
-    video_out, _audio = pipe(
         prompt=prompt,
         negative_prompt="",
-        reference_conditions=[ref_cond],
         reference_downscale_factor=1,
-        width=width,
-        height=height,
-        num_frames=num_frames,
-        frame_rate=FPS,
-        num_inference_steps=NUM_STEPS,
-        sigmas=DISTILLED_SIGMA_VALUES,
-        guidance_scale=1.0,
-        stg_scale=0.0,
-        audio_guidance_scale=1.0,
-        audio_stg_scale=0.0,
         generator=torch.Generator(device="cuda").manual_seed(seed),
-        output_type="np",
-        return_dict=False,
     )
     out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
-    encode_video(video_out[0], fps=FPS, output_path=out_path)
     return out_path, seed
@@ -132,20 +146,18 @@ def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
 with gr.Blocks(title="LTX-2.3 Deblur") as demo:
     gr.Markdown(
         "# 🔎 LTX-2.3 Video Deblur\n"
-        "Restore sharpness to out-of-focus / defocused footage while keeping subject, "
-        "framing, and scene geometry intact. Upload a soft clip and get it back in focus. "
-        "(Spatial defocus only — not motion blur.) "
-        "IC-LoRA: [`linoyts/LTX-2.3-loras`](https://huggingface.co/linoyts/LTX-2.3-loras) · "
-        "base: distilled LTX-2.3."
     )
     with gr.Row():
         with gr.Column():
             video_in = gr.Video(label="Out-of-focus video")
-            scene = gr.Textbox(
-                label="Scene description (optional — the clip does most of the work)",
-                placeholder="a city street at dusk with passing cars and neon signs",
-                lines=2,
-            )
             with gr.Accordion("Settings", open=False):
                 preset = gr.Dropdown(list(RES_PRESETS), value="Fast (768×448)", label="Resolution")
                 num_frames = gr.Dropdown(FRAME_CHOICES, value=73, label="Frames (24fps)")
@@ -158,23 +170,26 @@ with gr.Blocks(title="LTX-2.3 Deblur") as demo:
             video_out = gr.Video(label="Sharpened result")
             used_seed = gr.Number(label="Seed used", interactive=False)
-    run.click(
-        deblur,
-        inputs=[video_in, scene, preset, num_frames, lora_scale, seed, randomize],
-        outputs=[video_out, used_seed],
-    )
     gr.Examples(
         examples=[
-            ["examples/man_laughing_blur.mp4", "a man laughing, close-up portrait", "Fast (768×448)", 49, 1.0, 42, False],
-            ["examples/slicing_veggie_blur.mp4", "hands slicing fresh vegetables on a wooden cutting board", "Fast (768×448)", 49, 1.0, 42, False],
-            ["examples/landscape_blur.mp4", "a misty green mountain landscape over still water", "Fast (768×448)", 49, 1.0, 42, False],
         ],
-        inputs=[video_in, scene, preset, num_frames, lora_scale, seed, randomize],
-        outputs=[video_out, used_seed],
-        fn=deblur,
-        cache_examples=True,
-        cache_mode="lazy",
     )
 if __name__ == "__main__":

 import tempfile
 import numpy as np
+import imageio.v3 as iio
 import spaces
 import torch
 import gradio as gr
+from PIL import Image, ImageOps
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 MAX_SEED = np.iinfo(np.int32).max
 HF_TOKEN = os.environ.get("HF_TOKEN")
+RES_PRESETS = {"Fast (768×448)": (768, 448), "Quality (960×544)": (960, 544)}
 FRAME_CHOICES = [49, 73, 97, 121]
 # --- Load pipeline once at module scope (ZeroGPU registers it) ---------------
 # --- Helpers ----------------------------------------------------------------
+def _src_fps(path, default=FPS):
+    try:
+        return float(iio.immeta(path, plugin="pyav").get("fps", default)) or default
+    except Exception:
+        return default
+def _load_frames(path, num_frames, width, height):
+    """Natural-speed (real-time at 24fps), aspect-preserving (center-crop) frames."""
+    frames = load_video(path)
+    if not frames:
+        return []
+    fps = _src_fps(path)
+    out = []
+    for i in range(num_frames):
+        idx = min(int(round(i / FPS * fps)), len(frames) - 1)
+        out.append(ImageOps.fit(frames[idx].convert("RGB"), (width, height), Image.LANCZOS))
+    return out
+def _pick_resolution(first_frame, preset):
     w, h = RES_PRESETS[preset]
     if first_frame.height > first_frame.width:
         w, h = h, w
     return w, h
+def _build_prompt(scene, audio):
     scene = scene.strip() or "the scene"
+    p = (
         f"Reference shows {scene}, heavily out of focus with soft defocused blur and no fine detail. "
         f"Edited shows the same scene in sharp focus with crisp detail and clean edges. "
         f"DEBLUR {scene}. "
         f"Subject identity, framing, and background geometry are identical to the reference; "
         f"only focus and sharpness differ between reference and edited."
     )
+    if audio.strip():
+        p += f" Audio: {audio.strip()}."
+    return p
+def _export(video_np, audio, path):
+    kw = {}
+    if audio is not None:
+        kw = dict(audio=audio[0].float().cpu(), audio_sample_rate=pipe.vocoder.config.output_sampling_rate)
+    encode_video(video_np, fps=FPS, output_path=path, **kw)
 def _duration(*args, **kwargs):
+    preset = next((a for a in args if a in RES_PRESETS), "Fast")
+    num_frames = next((a for a in args if a in FRAME_CHOICES), 73)
     per_frame = 1.6 if "Quality" in str(preset) else 1.0
+    return int(60 + int(num_frames) * per_frame)
 # --- Inference --------------------------------------------------------------
 @spaces.GPU(duration=_duration)
+def deblur(video, scene, audio, preset, num_frames, lora_scale, seed, randomize,
            progress=gr.Progress(track_tqdm=True)):
     if video is None:
         raise gr.Error("Please upload an out-of-focus video to sharpen.")
     if randomize:
         seed = random.randint(0, MAX_SEED)
     seed = int(seed)
     num_frames = int(num_frames)
+    probe = load_video(video)
+    if not probe:
+        raise gr.Error("Could not read any frames from that video.")
+    width, height = _pick_resolution(probe[0], preset)
+    ref = _load_frames(video, num_frames, width, height)
     pipe.set_adapters("deblur", float(lora_scale))
+    prompt = _build_prompt(scene, audio)
+    def _cb(p, i, t, kw):
+        progress((i + 1) / NUM_STEPS, desc=f"Deblurring — step {i + 1}/{NUM_STEPS}")
+        return {}
+    video_out, audio_out = pipe(
         prompt=prompt,
         negative_prompt="",
+        reference_conditions=[LTX2ReferenceCondition(frames=ref, strength=1.0)],
         reference_downscale_factor=1,
+        width=width, height=height, num_frames=num_frames, frame_rate=FPS,
+        num_inference_steps=NUM_STEPS, sigmas=DISTILLED_SIGMA_VALUES,
+        guidance_scale=1.0, stg_scale=0.0, audio_guidance_scale=1.0, audio_stg_scale=0.0,
         generator=torch.Generator(device="cuda").manual_seed(seed),
+        output_type="np", return_dict=False, callback_on_step_end=_cb,
     )
     out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+    _export(video_out[0], audio_out, out_path)
     return out_path, seed
 with gr.Blocks(title="LTX-2.3 Deblur") as demo:
     gr.Markdown(
         "# 🔎 LTX-2.3 Video Deblur\n"
+        "Restore sharpness to out-of-focus / defocused footage while keeping subject, framing, and scene "
+        "geometry intact. (Spatial defocus only — not motion blur.) Optionally describe the soundscape and "
+        "the model generates matching audio. "
+        "IC-LoRA: [`linoyts/LTX-2.3-loras`](https://huggingface.co/linoyts/LTX-2.3-loras) · base: distilled LTX-2.3."
     )
     with gr.Row():
         with gr.Column():
             video_in = gr.Video(label="Out-of-focus video")
+            scene = gr.Textbox(label="Scene description (optional — the clip does most of the work)", lines=2,
+                               placeholder="a city street at dusk with passing cars and neon signs")
+            audio = gr.Textbox(label="Sound / audio (optional)", lines=1,
+                               placeholder="city ambience, passing cars, distant chatter")
             with gr.Accordion("Settings", open=False):
                 preset = gr.Dropdown(list(RES_PRESETS), value="Fast (768×448)", label="Resolution")
                 num_frames = gr.Dropdown(FRAME_CHOICES, value=73, label="Frames (24fps)")
             video_out = gr.Video(label="Sharpened result")
             used_seed = gr.Number(label="Seed used", interactive=False)
+    run.click(deblur, inputs=[video_in, scene, audio, preset, num_frames, lora_scale, seed, randomize],
+              outputs=[video_out, used_seed])
     gr.Examples(
         examples=[
+            ["examples/man_laughing_blur.mp4",
+             "a close-up portrait of a man laughing warmly, his face crinkled with joy, soft natural light on his skin and hair",
+             "warm hearty laughter, a quiet room ambience",
+             "Fast (768×448)", 73, 1.0, 42, False],
+            ["examples/slicing_veggie_blur.mp4",
+             "hands slicing fresh green zucchini into thin rounds on a wooden cutting board, crisp vegetable detail and a glinting knife edge",
+             "crisp rhythmic chopping on a wooden board, gentle kitchen ambience",
+             "Fast (768×448)", 73, 1.0, 42, False],
+            ["examples/landscape_blur.mp4",
+             "a misty green mountain landscape over calm still water, fine detail in the trees and rippling reflections",
+             "gentle wind over water, distant birdsong",
+             "Fast (768×448)", 73, 1.0, 42, False],
         ],
+        inputs=[video_in, scene, audio, preset, num_frames, lora_scale, seed, randomize],
+        outputs=[video_out, used_seed], fn=deblur, cache_examples=True, cache_mode="lazy",
     )
 if __name__ == "__main__":

examples/landscape_blur.mp4 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41f6d7ce4adb7e16337d39b6d0eac349d88ef0c2aa057e0e9b5cce93f1427652
-size 186588

 version https://git-lfs.github.com/spec/v1
+oid sha256:a68dbd1095dccb3a3d1e9f4f9e7191820f98eaa7de18f373043afc896946624c
+size 312352

examples/man_laughing_blur.mp4 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ea111417bc2c04f55fb366baa74f5e72034c50cbe8c4adde8376a3e2c9cbb84a
-size 235092

 version https://git-lfs.github.com/spec/v1
+oid sha256:4ed0460ba306b13a815c7bb56eeb7a84eb824d351b5a6ef3f9848c7b76ce968a
+size 345836

examples/slicing_veggie_blur.mp4 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5aae50a15eba079473b01e8155346681af18dd882ae76e06306bced81a501f08
-size 232932

 version https://git-lfs.github.com/spec/v1
+oid sha256:19edbb8a77b03578b02dfa53f2b32f945120e6554def62511787004d7d9123c9
+size 347929