Spaces:

ahmaddarkhalil
/

hoi-detr-demo

Running on Zero

ahmaddarkhalil Claude Opus 4.8 committed 3 days ago

Commit

265c11b

1 Parent(s): f3745b0

Add video tab, cap image height, tabbed UI (examples uploaded via API)

- Cap gr.Image height (380) so an uploaded image no longer pushes examples/
buttons below the fold; tabbed Image/Video UI, Soft theme, centered layout.
- New Video tab: predict_video runs HOI per frame (subsampled to <=120 frames
for ZeroGPU) and re-encodes to H.264 via ffmpeg; shared _annotate_bgr.
- Examples (5 original demo images + 2 clips) live under examples/ and are
uploaded via the Hub API (Xet/LFS), not git.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show

.gitignore +16 -0
app.py +179 -71

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+.DS_Store
+__pycache__/
+*.py[cod]
+*.egg-info/
+checkpoints/
+*.pth
+# local scratch from log fetching / debugging
+build_error.txt
+build_fetch.txt
+build_log.txt
+build_raw.txt
+run_log.txt
+run_raw.txt
+run_best.txt
+# original long-named source videos (clean copies live in examples/videos/)
+YTDown_*.mp4

app.py CHANGED Viewed

@@ -149,6 +149,8 @@ _mmcv_mod.__version__ = _real_mmcv_ver
 del _mmcv_mod, _real_mmcv_ver
 # ─────────────────────────────────────────────────────────────────────────────
 import mmcv
 import numpy as np
 import gradio as gr
@@ -238,92 +240,198 @@ def _save_vis(bgr_image, source_path):
     return out_path
 @spaces.GPU(duration=60)
-def predict(image_path, score_thr):
     # Empty/cleared input (e.g. a webcam frame that wasn't captured) arrives as
     # None; just clear the output instead of erroring out of mmcv.imread.
-    if image_path is None or (isinstance(image_path, str) and not image_path):
         return None
     try:
-        orig_img = mmcv.imread(image_path)
-        dets, embeds = run_inference(
-            model, test_pipeline, image_path,
-            device=DEVICE, class_names=CLASS_NAMES,
-            score_thr=score_thr, nms_iou=NMS_IOU,
-        )
-        if not dets:
-            return _save_vis(orig_img, image_path)
-        hands   = [d for d in dets if d["class_id"] == 0]
-        firsts  = [d for d in dets if d["class_id"] == 1]
-        seconds = [d for d in dets if d["class_id"] == 2]
-        hf_inters, fs_inters = [], []
-        for h in hands:
-            for f in firsts:
-                ok, prob = call_interaction(
-                    interaction_branch,
-                    embeds[h["query_idx"]], embeds[f["query_idx"]],
-                )
-                if ok:
-                    hf_inters.append((h, f, prob))
-        for f in firsts:
-            for so in seconds:
-                ok, prob = call_interaction(
-                    interaction_branch,
-                    embeds[f["query_idx"]], embeds[so["query_idx"]],
-                )
-                if ok:
-                    fs_inters.append((f, so, prob))
-        vis = orig_img.copy()
-        draw_ui(vis, dets, hf_inters, fs_inters, compute_style(vis.shape),
-                verbose_labels=True)
         return _save_vis(vis, image_path)
     except Exception as e:
         traceback.print_exc()
         raise gr.Error(f"{type(e).__name__}: {e}")
 # ── UI ────────────────────────────────────────────────────────────────────────
-_ex_dir = os.path.join(REPO, "demo", "example_images2")
-_examples = sorted(
-    os.path.join(_ex_dir, f)
-    for f in os.listdir(_ex_dir)
-    if f.lower().endswith((".jpg", ".jpeg", ".png"))
-) if os.path.isdir(_ex_dir) else []
-with gr.Blocks(title="HOI-DETR — Hand–Object Interaction Detection") as demo:
     gr.Markdown(
-        "# HOI-DETR — Hand–Object Interaction Detection\n"
-        "Detects hands, the first-order object held, and the second-order "
-        "object it contacts. Hover the result image to download it."
     )
-    with gr.Row():
-        with gr.Column():
-            img_in  = gr.Image(type="filepath", label="Upload an image")
-            thr     = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
-                                label="Score threshold")
-            run_btn = gr.Button("Predict", variant="primary")
-        with gr.Column():
-            img_out = gr.Image(type="filepath", label="HOI predictions")
-    if _examples:
-        gr.Examples(
-            examples=[[p] for p in _examples],
-            inputs=[img_in],
-            outputs=img_out,
-            fn=lambda p: predict(p, DEFAULT_THR),
-            examples_per_page=max(1, len(_examples)),
-            label="Example images — click to run",
-            cache_examples=False,
-        )
-    run_btn.click(predict, inputs=[img_in, thr], outputs=img_out)
 if __name__ == "__main__":
-    # The example images live in the cloned repo (outside cwd / tmp); gradio 5
-    # only serves allow-listed paths, so expose the examples dir explicitly.
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True,
-                allowed_paths=[_ex_dir])

 del _mmcv_mod, _real_mmcv_ver
 # ─────────────────────────────────────────────────────────────────────────────
+import math
+import cv2
 import mmcv
 import numpy as np
 import gradio as gr
     return out_path
+def _annotate_bgr(orig_img, score_thr):
+    """Run HOI detection on a BGR image array and return the annotated frame.
+    Shared by the image and video paths. run_inference loads from a file, so we
+    stage the frame to a temp jpg (reused across calls).
+    """
+    tmp = os.path.join(tempfile.gettempdir(), "hoi_frame_in.jpg")
+    mmcv.imwrite(orig_img, tmp)
+    dets, embeds = run_inference(
+        model, test_pipeline, tmp,
+        device=DEVICE, class_names=CLASS_NAMES,
+        score_thr=score_thr, nms_iou=NMS_IOU,
+    )
+    vis = orig_img.copy()
+    if not dets:
+        return vis
+    hands   = [d for d in dets if d["class_id"] == 0]
+    firsts  = [d for d in dets if d["class_id"] == 1]
+    seconds = [d for d in dets if d["class_id"] == 2]
+    hf_inters, fs_inters = [], []
+    for h in hands:
+        for f in firsts:
+            ok, prob = call_interaction(
+                interaction_branch,
+                embeds[h["query_idx"]], embeds[f["query_idx"]],
+            )
+            if ok:
+                hf_inters.append((h, f, prob))
+    for f in firsts:
+        for so in seconds:
+            ok, prob = call_interaction(
+                interaction_branch,
+                embeds[f["query_idx"]], embeds[so["query_idx"]],
+            )
+            if ok:
+                fs_inters.append((f, so, prob))
+    draw_ui(vis, dets, hf_inters, fs_inters, compute_style(vis.shape),
+            verbose_labels=True)
+    return vis
 @spaces.GPU(duration=60)
+def predict_image(image_path, score_thr):
+    """Annotate one image file with HOI predictions.
+
+    Reads image_path with mmcv.imread, annotates it via _annotate_bgr and
+    returns whatever _save_vis returns for the annotated frame. A falsy
+    image_path returns None. Any exception is printed with its traceback and
+    re-raised as gr.Error carrying the exception type name and message.
+    """
     # Empty/cleared input (e.g. a webcam frame that wasn't captured) arrives as
     # None; just clear the output instead of erroring out of mmcv.imread.
+    if not image_path:
         return None
     try:
+        vis = _annotate_bgr(mmcv.imread(image_path), score_thr)
         return _save_vis(vis, image_path)
+    except Exception as e:
+        traceback.print_exc()
+        raise gr.Error(f"{type(e).__name__}: {e}")
+# Cap processed frames so a long clip still fits ZeroGPU's per-call budget;
+# longer videos are temporally subsampled (output fps lowered to match).
+MAX_VIDEO_FRAMES = 120
+@spaces.GPU(duration=180)
+def predict_video(video_path, score_thr, progress=gr.Progress()):
+    if not video_path:
+        return None
+    try:
+        cap = cv2.VideoCapture(video_path)
+        in_fps = cap.get(cv2.CAP_PROP_FPS) or 24.0
+        total  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
+        step   = max(1, math.ceil(total / MAX_VIDEO_FRAMES)) if total > 0 else 1
+        out_fps = max(1.0, in_fps / step)
+        expected = (total // step) if total > 0 else None
+        work_dir = tempfile.mkdtemp(prefix="hoi_vid_")
+        raw_path = os.path.join(work_dir, "raw.mp4")
+        writer, idx, n_proc = None, 0, 0
+        while True:
+            ok, frame = cap.read()
+            if not ok:
+                break
+            if idx % step == 0:
+                vis = _annotate_bgr(frame, score_thr)
+                if writer is None:
+                    h, w = vis.shape[:2]
+                    writer = cv2.VideoWriter(
+                        raw_path, cv2.VideoWriter_fourcc(*"mp4v"),
+                        out_fps, (w, h))
+                writer.write(vis)
+                n_proc += 1
+                if expected:
+                    progress(n_proc / expected, desc=f"Frame {n_proc}/{expected}")
+                if n_proc >= MAX_VIDEO_FRAMES:
+                    break
+            idx += 1
+        cap.release()
+        if writer is not None:
+            writer.release()
+        if n_proc == 0:
+            return None
+        # Re-encode to H.264 so it plays in-browser (mp4v often won't).
+        out_path = os.path.join(work_dir, "out.mp4")
+        try:
+            subprocess.run(
+                ["ffmpeg", "-y", "-i", raw_path, "-c:v", "libx264",
+                 "-pix_fmt", "yuv420p", "-movflags", "+faststart", out_path],
+                check=True, capture_output=True)
+            return out_path
+        except Exception as e:  # noqa: BLE001
+            print(f"[video] ffmpeg re-encode failed ({e!r}); returning raw",
+                  flush=True)
+            return raw_path
     except Exception as e:
         traceback.print_exc()
         raise gr.Error(f"{type(e).__name__}: {e}")
 # ── UI ────────────────────────────────────────────────────────────────────────
+_APP_DIR  = os.path.dirname(os.path.abspath(__file__))
+_IMG_DIR  = os.path.join(_APP_DIR, "examples", "images")
+_VID_DIR  = os.path.join(_APP_DIR, "examples", "videos")
+def _list(d, exts):
+    return sorted(
+        os.path.join(d, f) for f in os.listdir(d) if f.lower().endswith(exts)
+    ) if os.path.isdir(d) else []
+img_examples = _list(_IMG_DIR, (".jpg", ".jpeg", ".png"))
+vid_examples = _list(_VID_DIR, (".mp4", ".mov", ".webm", ".avi"))
+_CSS = ".gradio-container {max-width: 1100px !important; margin: auto;}"
+with gr.Blocks(title="HOI-DETR — Hand–Object Interaction Detection",
+               theme=gr.themes.Soft(), css=_CSS) as demo:
     gr.Markdown(
+        "# 🖐️ HOI-DETR — Hand–Object Interaction Detection\n"
+        "Detects **hands**, the **first-order object** held, and the "
+        "**second-order object** it contacts, with their interaction links. "
+        "Try an example or upload your own image or video."
     )
+    with gr.Tabs():
+        # ── Image tab ────────────────────────────────────────────────
+        with gr.Tab("🖼️ Image"):
+            with gr.Row(equal_height=True):
+                with gr.Column():
+                    img_in = gr.Image(type="filepath", label="Input image",
+                                      height=380)
+                    img_thr = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
+                                        label="Score threshold")
+                    img_btn = gr.Button("Detect", variant="primary")
+                with gr.Column():
+                    img_out = gr.Image(label="HOI predictions", height=380)
+            if img_examples:
+                gr.Examples(
+                    examples=[[p] for p in img_examples],
+                    inputs=[img_in], outputs=img_out,
+                    fn=lambda p: predict_image(p, DEFAULT_THR),
+                    cache_examples=False,
+                    examples_per_page=len(img_examples),
+                    label="Example images — click to run",
+                )
+            img_btn.click(predict_image, [img_in, img_thr], img_out)
+        # ── Video tab ────────────────────────────────────────────────
+        with gr.Tab("🎬 Video"):
+            with gr.Row(equal_height=True):
+                with gr.Column():
+                    vid_in = gr.Video(label="Input video", height=380)
+                    vid_thr = gr.Slider(0.0, 1.0, value=DEFAULT_THR, step=0.05,
+                                        label="Score threshold")
+                    vid_btn = gr.Button("Process video", variant="primary")
+                    gr.Markdown(
+                        f"<sub>Processes up to {MAX_VIDEO_FRAMES} frames "
+                        "(longer clips are subsampled). This can take a minute."
+                        "</sub>")
+                with gr.Column():
+                    vid_out = gr.Video(label="HOI predictions", height=380)
+            if vid_examples:
+                gr.Examples(
+                    examples=[[p] for p in vid_examples],
+                    inputs=[vid_in],
+                    examples_per_page=len(vid_examples),
+                    label="Example videos — click to load, then Process",
+                )
+            vid_btn.click(predict_video, [vid_in, vid_thr], vid_out)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True,
+                allowed_paths=[_IMG_DIR, _VID_DIR])