ahmaddarkhalil Claude Opus 4.8 committed on
Commit
265c11b
·
1 Parent(s): f3745b0

Add video tab, cap image height, tabbed UI (examples uploaded via API)

Browse files

- Cap gr.Image height (380) so an uploaded image no longer pushes the
examples/buttons below the fold; tabbed Image/Video UI, Soft theme, centered layout.
- New Video tab: predict_video runs HOI per frame (subsampled to <=120 frames
for ZeroGPU) and re-encodes to H.264 via ffmpeg; shared _annotate_bgr.
- Examples (5 original demo images + 2 clips) live under examples/ and are
uploaded via the Hub API (Xet/LFS), not git.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show
  1. .gitignore +16 -0
  2. app.py +179 -71
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ checkpoints/
6
+ *.pth
7
+ # local scratch from log fetching / debugging
8
+ build_error.txt
9
+ build_fetch.txt
10
+ build_log.txt
11
+ build_raw.txt
12
+ run_log.txt
13
+ run_raw.txt
14
+ run_best.txt
15
+ # original long-named source videos (clean copies live in examples/videos/)
16
+ YTDown_*.mp4
app.py CHANGED
@@ -149,6 +149,8 @@ _mmcv_mod.__version__ = _real_mmcv_ver
149
  del _mmcv_mod, _real_mmcv_ver
150
  # ─────────────────────────────────────────────────────────────────────────────
151
 
 
 
152
  import mmcv
153
  import numpy as np
154
  import gradio as gr
@@ -238,92 +240,198 @@ def _save_vis(bgr_image, source_path):
238
  return out_path
239
 
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  @spaces.GPU(duration=60)
242
- def predict(image_path, score_thr):
243
  # Empty/cleared input (e.g. a webcam frame that wasn't captured) arrives as
244
  # None; just clear the output instead of erroring out of mmcv.imread.
245
- if image_path is None or (isinstance(image_path, str) and not image_path):
246
  return None
247
  try:
248
- orig_img = mmcv.imread(image_path)
249
- dets, embeds = run_inference(
250
- model, test_pipeline, image_path,
251
- device=DEVICE, class_names=CLASS_NAMES,
252
- score_thr=score_thr, nms_iou=NMS_IOU,
253
- )
254
- if not dets:
255
- return _save_vis(orig_img, image_path)
256
-
257
- hands = [d for d in dets if d["class_id"] == 0]
258
- firsts = [d for d in dets if d["class_id"] == 1]
259
- seconds = [d for d in dets if d["class_id"] == 2]
260
-
261
- hf_inters, fs_inters = [], []
262
- for h in hands:
263
- for f in firsts:
264
- ok, prob = call_interaction(
265
- interaction_branch,
266
- embeds[h["query_idx"]], embeds[f["query_idx"]],
267
- )
268
- if ok:
269
- hf_inters.append((h, f, prob))
270
- for f in firsts:
271
- for so in seconds:
272
- ok, prob = call_interaction(
273
- interaction_branch,
274
- embeds[f["query_idx"]], embeds[so["query_idx"]],
275
- )
276
- if ok:
277
- fs_inters.append((f, so, prob))
278
-
279
- vis = orig_img.copy()
280
- draw_ui(vis, dets, hf_inters, fs_inters, compute_style(vis.shape),
281
- verbose_labels=True)
282
  return _save_vis(vis, image_path)
 
 
 
 
 
 
 
 
283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  except Exception as e:
285
  traceback.print_exc()
286
  raise gr.Error(f"{type(e).__name__}: {e}")
287
 
288
 
289
  # ── UI ────────────────────────────────────────────────────────────────────────
290
- _ex_dir = os.path.join(REPO, "demo", "example_images2")
291
- _examples = sorted(
292
- os.path.join(_ex_dir, f)
293
- for f in os.listdir(_ex_dir)
294
- if f.lower().endswith((".jpg", ".jpeg", ".png"))
295
- ) if os.path.isdir(_ex_dir) else []
296
-
297
- with gr.Blocks(title="HOI-DETR — Hand–Object Interaction Detection") as demo:
 
 
 
 
 
 
 
 
 
 
298
  gr.Markdown(
299
- "# HOI-DETR — Hand–Object Interaction Detection\n"
300
- "Detects hands, the first-order object held, and the second-order "
301
- "object it contacts. Hover the result image to download it."
 
302
  )
303
- with gr.Row():
304
- with gr.Column():
305
- img_in = gr.Image(type="filepath", label="Upload an image")
306
- thr = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
307
- label="Score threshold")
308
- run_btn = gr.Button("Predict", variant="primary")
309
- with gr.Column():
310
- img_out = gr.Image(type="filepath", label="HOI predictions")
311
-
312
- if _examples:
313
- gr.Examples(
314
- examples=[[p] for p in _examples],
315
- inputs=[img_in],
316
- outputs=img_out,
317
- fn=lambda p: predict(p, DEFAULT_THR),
318
- examples_per_page=max(1, len(_examples)),
319
- label="Example images — click to run",
320
- cache_examples=False,
321
- )
322
-
323
- run_btn.click(predict, inputs=[img_in, thr], outputs=img_out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  if __name__ == "__main__":
326
- # The example images live in the cloned repo (outside cwd / tmp); gradio 5
327
- # only serves allow-listed paths, so expose the examples dir explicitly.
328
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True,
329
- allowed_paths=[_ex_dir])
 
149
  del _mmcv_mod, _real_mmcv_ver
150
  # ─────────────────────────────────────────────────────────────────────────────
151
 
152
+ import math
153
+ import cv2
154
  import mmcv
155
  import numpy as np
156
  import gradio as gr
 
240
  return out_path
241
 
242
 
243
+ def _annotate_bgr(orig_img, score_thr):
244
+ """Run HOI detection on a BGR image array and return the annotated frame.
245
+
246
+ Shared by the image and video paths. run_inference loads from a file, so we
247
+ stage the frame to a temp jpg (reused across calls).
248
+ """
249
+ tmp = os.path.join(tempfile.gettempdir(), "hoi_frame_in.jpg")
250
+ mmcv.imwrite(orig_img, tmp)
251
+ dets, embeds = run_inference(
252
+ model, test_pipeline, tmp,
253
+ device=DEVICE, class_names=CLASS_NAMES,
254
+ score_thr=score_thr, nms_iou=NMS_IOU,
255
+ )
256
+ vis = orig_img.copy()
257
+ if not dets:
258
+ return vis
259
+
260
+ hands = [d for d in dets if d["class_id"] == 0]
261
+ firsts = [d for d in dets if d["class_id"] == 1]
262
+ seconds = [d for d in dets if d["class_id"] == 2]
263
+
264
+ hf_inters, fs_inters = [], []
265
+ for h in hands:
266
+ for f in firsts:
267
+ ok, prob = call_interaction(
268
+ interaction_branch,
269
+ embeds[h["query_idx"]], embeds[f["query_idx"]],
270
+ )
271
+ if ok:
272
+ hf_inters.append((h, f, prob))
273
+ for f in firsts:
274
+ for so in seconds:
275
+ ok, prob = call_interaction(
276
+ interaction_branch,
277
+ embeds[f["query_idx"]], embeds[so["query_idx"]],
278
+ )
279
+ if ok:
280
+ fs_inters.append((f, so, prob))
281
+
282
+ draw_ui(vis, dets, hf_inters, fs_inters, compute_style(vis.shape),
283
+ verbose_labels=True)
284
+ return vis
285
+
286
+
287
  @spaces.GPU(duration=60)
288
+ def predict_image(image_path, score_thr):
289
  # Empty/cleared input (e.g. a webcam frame that wasn't captured) arrives as
290
  # None; just clear the output instead of erroring out of mmcv.imread.
291
+ if not image_path:
292
  return None
293
  try:
294
+ vis = _annotate_bgr(mmcv.imread(image_path), score_thr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  return _save_vis(vis, image_path)
296
+ except Exception as e:
297
+ traceback.print_exc()
298
+ raise gr.Error(f"{type(e).__name__}: {e}")
299
+
300
+
301
+ # Cap processed frames so a long clip still fits ZeroGPU's per-call budget;
302
+ # longer videos are temporally subsampled (output fps lowered to match).
303
+ MAX_VIDEO_FRAMES = 120
304
 
305
+
306
+ @spaces.GPU(duration=180)
307
+ def predict_video(video_path, score_thr, progress=gr.Progress()):
308
+ if not video_path:
309
+ return None
310
+ try:
311
+ cap = cv2.VideoCapture(video_path)
312
+ in_fps = cap.get(cv2.CAP_PROP_FPS) or 24.0
313
+ total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
314
+ step = max(1, math.ceil(total / MAX_VIDEO_FRAMES)) if total > 0 else 1
315
+ out_fps = max(1.0, in_fps / step)
316
+ expected = (total // step) if total > 0 else None
317
+
318
+ work_dir = tempfile.mkdtemp(prefix="hoi_vid_")
319
+ raw_path = os.path.join(work_dir, "raw.mp4")
320
+ writer, idx, n_proc = None, 0, 0
321
+ while True:
322
+ ok, frame = cap.read()
323
+ if not ok:
324
+ break
325
+ if idx % step == 0:
326
+ vis = _annotate_bgr(frame, score_thr)
327
+ if writer is None:
328
+ h, w = vis.shape[:2]
329
+ writer = cv2.VideoWriter(
330
+ raw_path, cv2.VideoWriter_fourcc(*"mp4v"),
331
+ out_fps, (w, h))
332
+ writer.write(vis)
333
+ n_proc += 1
334
+ if expected:
335
+ progress(n_proc / expected, desc=f"Frame {n_proc}/{expected}")
336
+ if n_proc >= MAX_VIDEO_FRAMES:
337
+ break
338
+ idx += 1
339
+ cap.release()
340
+ if writer is not None:
341
+ writer.release()
342
+ if n_proc == 0:
343
+ return None
344
+
345
+ # Re-encode to H.264 so it plays in-browser (mp4v often won't).
346
+ out_path = os.path.join(work_dir, "out.mp4")
347
+ try:
348
+ subprocess.run(
349
+ ["ffmpeg", "-y", "-i", raw_path, "-c:v", "libx264",
350
+ "-pix_fmt", "yuv420p", "-movflags", "+faststart", out_path],
351
+ check=True, capture_output=True)
352
+ return out_path
353
+ except Exception as e: # noqa: BLE001
354
+ print(f"[video] ffmpeg re-encode failed ({e!r}); returning raw",
355
+ flush=True)
356
+ return raw_path
357
  except Exception as e:
358
  traceback.print_exc()
359
  raise gr.Error(f"{type(e).__name__}: {e}")
360
 
361
 
362
  # ── UI ────────────────────────────────────────────────────────────────────────
363
+ _APP_DIR = os.path.dirname(os.path.abspath(__file__))
364
+ _IMG_DIR = os.path.join(_APP_DIR, "examples", "images")
365
+ _VID_DIR = os.path.join(_APP_DIR, "examples", "videos")
366
+
367
+
368
+ def _list(d, exts):
369
+ return sorted(
370
+ os.path.join(d, f) for f in os.listdir(d) if f.lower().endswith(exts)
371
+ ) if os.path.isdir(d) else []
372
+
373
+
374
+ img_examples = _list(_IMG_DIR, (".jpg", ".jpeg", ".png"))
375
+ vid_examples = _list(_VID_DIR, (".mp4", ".mov", ".webm", ".avi"))
376
+
377
+ _CSS = ".gradio-container {max-width: 1100px !important; margin: auto;}"
378
+
379
+ with gr.Blocks(title="HOI-DETR — Hand–Object Interaction Detection",
380
+ theme=gr.themes.Soft(), css=_CSS) as demo:
381
  gr.Markdown(
382
+ "# 🖐️ HOI-DETR — Hand–Object Interaction Detection\n"
383
+ "Detects **hands**, the **first-order object** held, and the "
384
+ "**second-order object** it contacts, with their interaction links. "
385
+ "Try an example or upload your own image or video."
386
  )
387
+
388
+ with gr.Tabs():
389
+ # ── Image tab ────────────────────────────────────────────────
390
+ with gr.Tab("🖼️ Image"):
391
+ with gr.Row(equal_height=True):
392
+ with gr.Column():
393
+ img_in = gr.Image(type="filepath", label="Input image",
394
+ height=380)
395
+ img_thr = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
396
+ label="Score threshold")
397
+ img_btn = gr.Button("Detect", variant="primary")
398
+ with gr.Column():
399
+ img_out = gr.Image(label="HOI predictions", height=380)
400
+ if img_examples:
401
+ gr.Examples(
402
+ examples=[[p] for p in img_examples],
403
+ inputs=[img_in], outputs=img_out,
404
+ fn=lambda p: predict_image(p, DEFAULT_THR),
405
+ cache_examples=False,
406
+ examples_per_page=len(img_examples),
407
+ label="Example images — click to run",
408
+ )
409
+ img_btn.click(predict_image, [img_in, img_thr], img_out)
410
+
411
+ # ── Video tab ────────────────────────────────────────────────
412
+ with gr.Tab("🎬 Video"):
413
+ with gr.Row(equal_height=True):
414
+ with gr.Column():
415
+ vid_in = gr.Video(label="Input video", height=380)
416
+ vid_thr = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
417
+ label="Score threshold")
418
+ vid_btn = gr.Button("Process video", variant="primary")
419
+ gr.Markdown(
420
+ f"<sub>Processes up to {MAX_VIDEO_FRAMES} frames "
421
+ "(longer clips are subsampled). This can take a minute."
422
+ "</sub>")
423
+ with gr.Column():
424
+ vid_out = gr.Video(label="HOI predictions", height=380)
425
+ if vid_examples:
426
+ gr.Examples(
427
+ examples=[[p] for p in vid_examples],
428
+ inputs=[vid_in],
429
+ examples_per_page=len(vid_examples),
430
+ label="Example videos — click to load, then Process",
431
+ )
432
+ vid_btn.click(predict_video, [vid_in, vid_thr], vid_out)
433
+
434
 
435
  if __name__ == "__main__":
 
 
436
  demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True,
437
+ allowed_paths=[_IMG_DIR, _VID_DIR])