linoyts HF Staff committed on
Commit
fca22f1
·
verified ·
1 Parent(s): bb0db6b

fixes: natural-speed/aspect, audio, progress, prompts, examples

Browse files
app.py CHANGED
@@ -8,10 +8,11 @@ import random
8
  import tempfile
9
 
10
  import numpy as np
 
11
  import spaces
12
  import torch
13
  import gradio as gr
14
- from PIL import Image
15
  from huggingface_hub import hf_hub_download
16
  from safetensors.torch import load_file
17
 
@@ -30,10 +31,7 @@ NUM_STEPS = len(DISTILLED_SIGMA_VALUES) # 8-step distilled schedule
30
  MAX_SEED = np.iinfo(np.int32).max
31
  HF_TOKEN = os.environ.get("HF_TOKEN")
32
 
33
- RES_PRESETS = {
34
- "Fast (768×448)": (768, 448),
35
- "Quality (960×544)": (960, 544),
36
- }
37
  FRAME_CHOICES = [49, 73, 97, 121]
38
 
39
  # --- Load pipeline once at module scope (ZeroGPU registers it) ---------------
@@ -47,40 +45,64 @@ pipe.set_adapters("deblur", LORA_SCALE)
47
 
48
 
49
  # --- Helpers ----------------------------------------------------------------
50
- def _resample(frames, n):
51
- idx = np.linspace(0, len(frames) - 1, n).round().astype(int)
52
- return [frames[i] for i in idx]
 
 
53
 
54
 
55
def _pick_resolution(first_frame: Image.Image, preset: str):
    """Return the ``(width, height)`` to render at for ``preset``.

    The size is looked up in ``RES_PRESETS``; when ``first_frame`` is
    taller than it is wide the two dimensions are swapped.

    Raises:
        KeyError: if ``preset`` is not a key of ``RES_PRESETS``.
    """
    preset_w, preset_h = RES_PRESETS[preset]
    is_portrait = first_frame.height > first_frame.width
    if is_portrait:
        return preset_h, preset_w
    return preset_w, preset_h
60
 
61
 
62
- def _build_prompt(scene: str) -> str:
63
  scene = scene.strip() or "the scene"
64
- return (
65
  f"Reference shows {scene}, heavily out of focus with soft defocused blur and no fine detail. "
66
  f"Edited shows the same scene in sharp focus with crisp detail and clean edges. "
67
  f"DEBLUR {scene}. "
68
  f"Subject identity, framing, and background geometry are identical to the reference; "
69
  f"only focus and sharpness differ between reference and edited."
70
  )
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  def _duration(*args, **kwargs):
74
- # args mirror the GPU fn: (video, scene, preset, num_frames, lora_scale, seed, randomize, [progress])
75
- preset = args[2] if len(args) > 2 else "Fast"
76
- num_frames = args[3] if len(args) > 3 else 73
77
  per_frame = 1.6 if "Quality" in str(preset) else 1.0
78
- return int(50 + int(num_frames) * per_frame)
79
 
80
 
81
  # --- Inference --------------------------------------------------------------
82
  @spaces.GPU(duration=_duration)
83
- def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
84
  progress=gr.Progress(track_tqdm=True)):
85
  if video is None:
86
  raise gr.Error("Please upload an out-of-focus video to sharpen.")
@@ -88,43 +110,35 @@ def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
88
  if randomize:
89
  seed = random.randint(0, MAX_SEED)
90
  seed = int(seed)
91
-
92
- frames = load_video(video)
93
- if not frames:
94
- raise gr.Error("Could not read any frames from that video.")
95
-
96
- width, height = _pick_resolution(frames[0], preset)
97
  num_frames = int(num_frames)
98
 
99
- ref = [f.convert("RGB").resize((width, height), Image.LANCZOS)
100
- for f in _resample(frames, num_frames)]
 
 
101
 
 
102
  pipe.set_adapters("deblur", float(lora_scale))
103
- prompt = _build_prompt(scene)
104
- ref_cond = LTX2ReferenceCondition(frames=ref, strength=1.0)
 
 
 
105
 
106
- video_out, _audio = pipe(
107
  prompt=prompt,
108
  negative_prompt="",
109
- reference_conditions=[ref_cond],
110
  reference_downscale_factor=1,
111
- width=width,
112
- height=height,
113
- num_frames=num_frames,
114
- frame_rate=FPS,
115
- num_inference_steps=NUM_STEPS,
116
- sigmas=DISTILLED_SIGMA_VALUES,
117
- guidance_scale=1.0,
118
- stg_scale=0.0,
119
- audio_guidance_scale=1.0,
120
- audio_stg_scale=0.0,
121
  generator=torch.Generator(device="cuda").manual_seed(seed),
122
- output_type="np",
123
- return_dict=False,
124
  )
125
 
126
  out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
127
- encode_video(video_out[0], fps=FPS, output_path=out_path)
128
  return out_path, seed
129
 
130
 
@@ -132,20 +146,18 @@ def deblur(video, scene, preset, num_frames, lora_scale, seed, randomize,
132
  with gr.Blocks(title="LTX-2.3 Deblur") as demo:
133
  gr.Markdown(
134
  "# 🔎 LTX-2.3 Video Deblur\n"
135
- "Restore sharpness to out-of-focus / defocused footage while keeping subject, "
136
- "framing, and scene geometry intact. Upload a soft clip and get it back in focus. "
137
- "(Spatial defocus only not motion blur.) "
138
- "IC-LoRA: [`linoyts/LTX-2.3-loras`](https://huggingface.co/linoyts/LTX-2.3-loras) · "
139
- "base: distilled LTX-2.3."
140
  )
141
  with gr.Row():
142
  with gr.Column():
143
  video_in = gr.Video(label="Out-of-focus video")
144
- scene = gr.Textbox(
145
- label="Scene description (optional the clip does most of the work)",
146
- placeholder="a city street at dusk with passing cars and neon signs",
147
- lines=2,
148
- )
149
  with gr.Accordion("Settings", open=False):
150
  preset = gr.Dropdown(list(RES_PRESETS), value="Fast (768×448)", label="Resolution")
151
  num_frames = gr.Dropdown(FRAME_CHOICES, value=73, label="Frames (24fps)")
@@ -158,23 +170,26 @@ with gr.Blocks(title="LTX-2.3 Deblur") as demo:
158
  video_out = gr.Video(label="Sharpened result")
159
  used_seed = gr.Number(label="Seed used", interactive=False)
160
 
161
- run.click(
162
- deblur,
163
- inputs=[video_in, scene, preset, num_frames, lora_scale, seed, randomize],
164
- outputs=[video_out, used_seed],
165
- )
166
 
167
  gr.Examples(
168
  examples=[
169
- ["examples/man_laughing_blur.mp4", "a man laughing, close-up portrait", "Fast (768×448)", 49, 1.0, 42, False],
170
- ["examples/slicing_veggie_blur.mp4", "hands slicing fresh vegetables on a wooden cutting board", "Fast (768×448)", 49, 1.0, 42, False],
171
- ["examples/landscape_blur.mp4", "a misty green mountain landscape over still water", "Fast (768×448)", 49, 1.0, 42, False],
 
 
 
 
 
 
 
 
 
172
  ],
173
- inputs=[video_in, scene, preset, num_frames, lora_scale, seed, randomize],
174
- outputs=[video_out, used_seed],
175
- fn=deblur,
176
- cache_examples=True,
177
- cache_mode="lazy",
178
  )
179
 
180
  if __name__ == "__main__":
 
8
  import tempfile
9
 
10
  import numpy as np
11
+ import imageio.v3 as iio
12
  import spaces
13
  import torch
14
  import gradio as gr
15
+ from PIL import Image, ImageOps
16
  from huggingface_hub import hf_hub_download
17
  from safetensors.torch import load_file
18
 
 
31
  MAX_SEED = np.iinfo(np.int32).max
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
33
 
34
+ RES_PRESETS = {"Fast (768×448)": (768, 448), "Quality (960×544)": (960, 544)}
 
 
 
35
  FRAME_CHOICES = [49, 73, 97, 121]
36
 
37
  # --- Load pipeline once at module scope (ZeroGPU registers it) ---------------
 
45
 
46
 
47
  # --- Helpers ----------------------------------------------------------------
48
def _src_fps(path, default=FPS):
    """Best-effort lookup of the frame rate recorded in a video's metadata.

    Args:
        path: Video file to probe (read with imageio's ``pyav`` plugin).
        default: Value returned whenever no usable rate is available.

    Returns:
        The ``"fps"`` metadata entry as a float when it is a positive,
        finite number; otherwise ``default``.
    """
    try:
        fps = float(iio.immeta(path, plugin="pyav").get("fps", default))
    except Exception:
        # Deliberate best-effort: any probing or conversion failure means
        # "unknown rate", not an error the caller should see.
        return default
    # NaN fails both comparisons; zero, negative and infinite rates are
    # rejected as well, since callers use the value in frame-index arithmetic.
    return fps if 0 < fps < float("inf") else default
53
 
54
 
55
def _load_frames(path, num_frames, width, height):
    """Load ``num_frames`` reference frames at natural playback speed.

    Output frame ``i`` is taken from the source frame nearest to time
    ``i / FPS`` at the source's own frame rate, clamped to the last decoded
    frame when the clip is too short. Each frame is converted to RGB and
    resized to ``(width, height)`` with ``ImageOps.fit`` using LANCZOS
    resampling.

    Args:
        path: Video file to decode.
        num_frames: Number of frames to return.
        width: Target frame width in pixels.
        height: Target frame height in pixels.

    Returns:
        A list of ``num_frames`` images, or an empty list when no frames
        could be decoded.
    """
    frames = load_video(path)
    if not frames:
        return []
    fps = _src_fps(path)
    last = len(frames) - 1
    # Source indices repeat when the source rate is below FPS or once the
    # clip runs out and indices clamp to the last frame; resize each source
    # frame only once and reuse the result.
    # NOTE(review): repeated positions therefore share one image object;
    # assumes downstream code does not mutate frames in place — confirm.
    fitted = {}
    out = []
    for i in range(num_frames):
        idx = min(int(round(i / FPS * fps)), last)
        if idx not in fitted:
            fitted[idx] = ImageOps.fit(
                frames[idx].convert("RGB"), (width, height), Image.LANCZOS
            )
        out.append(fitted[idx])
    return out
66
+
67
+
68
def _pick_resolution(first_frame, preset):
    """Return ``(width, height)`` for ``preset``, swapped for tall input.

    The base size comes from ``RES_PRESETS[preset]``; if ``first_frame`` is
    taller than it is wide, width and height are exchanged.
    """
    base_w, base_h = RES_PRESETS[preset]
    portrait = first_frame.height > first_frame.width
    return (base_h, base_w) if portrait else (base_w, base_h)
73
 
74
 
75
+ def _build_prompt(scene, audio):
76
  scene = scene.strip() or "the scene"
77
+ p = (
78
  f"Reference shows {scene}, heavily out of focus with soft defocused blur and no fine detail. "
79
  f"Edited shows the same scene in sharp focus with crisp detail and clean edges. "
80
  f"DEBLUR {scene}. "
81
  f"Subject identity, framing, and background geometry are identical to the reference; "
82
  f"only focus and sharpness differ between reference and edited."
83
  )
84
+ if audio.strip():
85
+ p += f" Audio: {audio.strip()}."
86
+ return p
87
+
88
+
89
def _export(video_np, audio, path):
    """Write ``video_np`` to ``path`` via ``encode_video`` at ``FPS``.

    When ``audio`` is not ``None``, its first element (cast to float and
    moved to CPU) is forwarded together with the vocoder's configured
    output sampling rate; otherwise no audio arguments are passed.
    """
    if audio is None:
        audio_kwargs = {}
    else:
        audio_kwargs = {
            "audio": audio[0].float().cpu(),
            "audio_sample_rate": pipe.vocoder.config.output_sampling_rate,
        }
    encode_video(video_np, fps=FPS, output_path=path, **audio_kwargs)
94
 
95
 
def _duration(*args, **kwargs):
    """Estimate the GPU time budget, in seconds, for one deblur call.

    Accepts the same arguments as the GPU function. The preset and frame
    count are taken from the ``preset`` / ``num_frames`` keywords when
    given; otherwise they are recognised by value among the positional
    arguments (first string that is a ``RES_PRESETS`` key, first value found
    in ``FRAME_CHOICES``), which keeps the estimate independent of argument
    order.

    Returns:
        ``int(60 + num_frames * per_frame)`` where ``per_frame`` is 1.6 when
        the preset name contains ``"Quality"`` and 1.0 otherwise.
    """
    preset = kwargs.get("preset")
    if preset is None:
        # Only strings can be preset keys; checking the type first avoids a
        # TypeError from hashing an unhashable positional argument.
        preset = next(
            (a for a in args if isinstance(a, str) and a in RES_PRESETS), "Fast"
        )
    num_frames = kwargs.get("num_frames")
    if num_frames is None:
        # First positional value equal to a frame choice wins, so a later
        # argument that happens to equal one (e.g. a seed) is only picked up
        # when no earlier argument matched.
        num_frames = next((a for a in args if a in FRAME_CHOICES), 73)
    per_frame = 1.6 if "Quality" in str(preset) else 1.0
    return int(60 + int(num_frames) * per_frame)
101
 
102
 
103
  # --- Inference --------------------------------------------------------------
104
  @spaces.GPU(duration=_duration)
105
+ def deblur(video, scene, audio, preset, num_frames, lora_scale, seed, randomize,
106
  progress=gr.Progress(track_tqdm=True)):
107
  if video is None:
108
  raise gr.Error("Please upload an out-of-focus video to sharpen.")
 
110
  if randomize:
111
  seed = random.randint(0, MAX_SEED)
112
  seed = int(seed)
 
 
 
 
 
 
113
  num_frames = int(num_frames)
114
 
115
+ probe = load_video(video)
116
+ if not probe:
117
+ raise gr.Error("Could not read any frames from that video.")
118
+ width, height = _pick_resolution(probe[0], preset)
119
 
120
+ ref = _load_frames(video, num_frames, width, height)
121
  pipe.set_adapters("deblur", float(lora_scale))
122
+ prompt = _build_prompt(scene, audio)
123
+
124
+ def _cb(p, i, t, kw):
125
+ progress((i + 1) / NUM_STEPS, desc=f"Deblurring — step {i + 1}/{NUM_STEPS}")
126
+ return {}
127
 
128
+ video_out, audio_out = pipe(
129
  prompt=prompt,
130
  negative_prompt="",
131
+ reference_conditions=[LTX2ReferenceCondition(frames=ref, strength=1.0)],
132
  reference_downscale_factor=1,
133
+ width=width, height=height, num_frames=num_frames, frame_rate=FPS,
134
+ num_inference_steps=NUM_STEPS, sigmas=DISTILLED_SIGMA_VALUES,
135
+ guidance_scale=1.0, stg_scale=0.0, audio_guidance_scale=1.0, audio_stg_scale=0.0,
 
 
 
 
 
 
 
136
  generator=torch.Generator(device="cuda").manual_seed(seed),
137
+ output_type="np", return_dict=False, callback_on_step_end=_cb,
 
138
  )
139
 
140
  out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
141
+ _export(video_out[0], audio_out, out_path)
142
  return out_path, seed
143
 
144
 
 
146
  with gr.Blocks(title="LTX-2.3 Deblur") as demo:
147
  gr.Markdown(
148
  "# 🔎 LTX-2.3 Video Deblur\n"
149
+ "Restore sharpness to out-of-focus / defocused footage while keeping subject, framing, and scene "
150
+ "geometry intact. (Spatial defocus only not motion blur.) Optionally describe the soundscape and "
151
+ "the model generates matching audio. "
152
+ "IC-LoRA: [`linoyts/LTX-2.3-loras`](https://huggingface.co/linoyts/LTX-2.3-loras) · base: distilled LTX-2.3."
 
153
  )
154
  with gr.Row():
155
  with gr.Column():
156
  video_in = gr.Video(label="Out-of-focus video")
157
+ scene = gr.Textbox(label="Scene description (optional — the clip does most of the work)", lines=2,
158
+ placeholder="a city street at dusk with passing cars and neon signs")
159
+ audio = gr.Textbox(label="Sound / audio (optional)", lines=1,
160
+ placeholder="city ambience, passing cars, distant chatter")
 
161
  with gr.Accordion("Settings", open=False):
162
  preset = gr.Dropdown(list(RES_PRESETS), value="Fast (768×448)", label="Resolution")
163
  num_frames = gr.Dropdown(FRAME_CHOICES, value=73, label="Frames (24fps)")
 
170
  video_out = gr.Video(label="Sharpened result")
171
  used_seed = gr.Number(label="Seed used", interactive=False)
172
 
173
+ run.click(deblur, inputs=[video_in, scene, audio, preset, num_frames, lora_scale, seed, randomize],
174
+ outputs=[video_out, used_seed])
 
 
 
175
 
176
  gr.Examples(
177
  examples=[
178
+ ["examples/man_laughing_blur.mp4",
179
+ "a close-up portrait of a man laughing warmly, his face crinkled with joy, soft natural light on his skin and hair",
180
+ "warm hearty laughter, a quiet room ambience",
181
+ "Fast (768×448)", 73, 1.0, 42, False],
182
+ ["examples/slicing_veggie_blur.mp4",
183
+ "hands slicing fresh green zucchini into thin rounds on a wooden cutting board, crisp vegetable detail and a glinting knife edge",
184
+ "crisp rhythmic chopping on a wooden board, gentle kitchen ambience",
185
+ "Fast (768×448)", 73, 1.0, 42, False],
186
+ ["examples/landscape_blur.mp4",
187
+ "a misty green mountain landscape over calm still water, fine detail in the trees and rippling reflections",
188
+ "gentle wind over water, distant birdsong",
189
+ "Fast (768×448)", 73, 1.0, 42, False],
190
  ],
191
+ inputs=[video_in, scene, audio, preset, num_frames, lora_scale, seed, randomize],
192
+ outputs=[video_out, used_seed], fn=deblur, cache_examples=True, cache_mode="lazy",
 
 
 
193
  )
194
 
195
  if __name__ == "__main__":
examples/landscape_blur.mp4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41f6d7ce4adb7e16337d39b6d0eac349d88ef0c2aa057e0e9b5cce93f1427652
3
- size 186588
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a68dbd1095dccb3a3d1e9f4f9e7191820f98eaa7de18f373043afc896946624c
3
+ size 312352
examples/man_laughing_blur.mp4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea111417bc2c04f55fb366baa74f5e72034c50cbe8c4adde8376a3e2c9cbb84a
3
- size 235092
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed0460ba306b13a815c7bb56eeb7a84eb824d351b5a6ef3f9848c7b76ce968a
3
+ size 345836
examples/slicing_veggie_blur.mp4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5aae50a15eba079473b01e8155346681af18dd882ae76e06306bced81a501f08
3
- size 232932
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19edbb8a77b03578b02dfa53f2b32f945120e6554def62511787004d7d9123c9
3
+ size 347929