Spaces:

RealseeDeveloper
/

Argus

Running on Zero

App Files Files Community

lixi042 committed on 2 days ago

Commit

510e990

1 Parent(s): a4415c0

Initial commit: Argus metric panoramic 3D reconstruction demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
app.py +1499 -0
argus/__init__.py +2 -0
argus/heads/__init__.py +2 -0
argus/heads/camera_head.py +142 -0
argus/heads/dpt_head.py +474 -0
argus/heads/head_act.py +122 -0
argus/heads/utils.py +142 -0
argus/layers/__init__.py +8 -0
argus/layers/attention.py +93 -0
argus/layers/block.py +247 -0
argus/layers/drop_path.py +34 -0
argus/layers/layer_scale.py +22 -0
argus/layers/mlp.py +40 -0
argus/layers/patch_embed.py +85 -0
argus/layers/rope.py +188 -0
argus/layers/swiglu_ffn.py +67 -0
argus/layers/vision_transformer.py +401 -0
argus/models/__init__.py +2 -0
argus/models/aggregator.py +502 -0
argus/models/argus.py +234 -0
argus/utils/__init__.py +2 -0
argus/utils/data_io.py +152 -0
argus/utils/geometry.py +201 -0
argus/utils/normalization.py +65 -0
argus/utils/pose_enc.py +105 -0
argus/utils/rotation.py +118 -0
assets/argus_logo.png +3 -0
examples/far_4/0.jpg +3 -0
examples/far_4/1.jpg +3 -0
examples/far_4/2.jpg +3 -0
examples/far_4/3.jpg +3 -0
examples/scene_00008/1757748389.jpg +3 -0
examples/scene_00008/1757748429.jpg +3 -0
examples/scene_00008/1757748477.jpg +3 -0
examples/scene_00008/1757748528.jpg +3 -0
examples/scene_00008/1757748562.jpg +3 -0
examples/scene_00008/1757748600.jpg +3 -0
examples/scene_00008/1757748638.jpg +3 -0
examples/scene_00008/1757748685.jpg +3 -0
examples/scene_00008/1757748728.jpg +3 -0
examples/scene_00008/1757748770.jpg +3 -0
examples/scene_00008/1757748817.jpg +3 -0
examples/scene_00008/1757748866.jpg +3 -0
examples/scene_00008/1757748907.jpg +3 -0
examples/scene_00008/1757748959.jpg +3 -0
examples/scene_00008/1757749004.jpg +3 -0
examples/scene_00008/1757749043.jpg +3 -0
examples/scene_00008/1757749091.jpg +3 -0
examples/scene_00008/1757749140.jpg +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,1499 @@

+# Standard library imports
+import os
+import sys
+import shutil
+import glob
+import gc
+import time
+import base64
+import argparse
+import tempfile
+from datetime import datetime
+from pathlib import Path
+# Third-party library imports
+import cv2
+import torch
+import trimesh
+import numpy as np
+import gradio as gr
+import matplotlib
+import matplotlib.pyplot as plt
+from scipy.spatial.transform import Rotation
+# Custom module imports
+from argus.models.argus import Argus
+from argus.utils.pose_enc import pose_encoding_to_extri360
+from argus.utils.geometry import unproject_depth_to_world_points
+# -------------------------- Argument Parsing --------------------------
+def parse_args():
+    """
+    Build the command-line interface of the demo and parse the process arguments
+    Returns:
+        argparse.Namespace: Parsed options (model_path, img_size, crop_ratio, port,
+                            share, server_name, device, examples_dir, save_tmp)
+    """
+    # One (flag, add_argument keyword arguments) entry per option, in registration order
+    option_specs = [
+        (
+            "--model_path",
+            dict(
+                type=str,
+                default=None,
+                help="Path to pre-trained model weights (.pt file). "
+                     "If not specified, auto-downloads from HuggingFace.",
+            ),
+        ),
+        (
+            "--img_size",
+            dict(
+                type=int,
+                default=560,
+                help="Input panoramic image target width (height = width // 2)",
+            ),
+        ),
+        (
+            "--crop_ratio",
+            dict(
+                type=float,
+                default=0.15,
+                help="Vertical crop ratio for panoramic image preprocessing (0-0.5)",
+            ),
+        ),
+        (
+            "--port",
+            dict(type=int, default=7860, help="Port number for Gradio server"),
+        ),
+        (
+            "--share",
+            dict(
+                action="store_true",
+                default=False,
+                help="Enable Gradio public sharing link",
+            ),
+        ),
+        (
+            "--server_name",
+            dict(
+                type=str,
+                default="0.0.0.0",
+                help="Server host address (0.0.0.0 for all interfaces)",
+            ),
+        ),
+        (
+            "--device",
+            dict(
+                type=str,
+                default=None,
+                help="Device to use (cuda/cpu). Default: auto-detect",
+            ),
+        ),
+        (
+            "--examples_dir",
+            dict(
+                type=str,
+                default="examples",
+                help="Directory containing example scenes",
+            ),
+        ),
+        (
+            "--save_tmp",
+            dict(
+                type=str,
+                default=None,
+                help="Directory to persist intermediate files (images, predictions, GLB). "
+                     "If not set, uses system temp dir and cleans up automatically.",
+            ),
+        ),
+    ]
+    cli = argparse.ArgumentParser(description="Argus Gradio Demo")
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli.parse_args()
+# Parse the command line once at import time; the functions below read this
+# module-level namespace (e.g. handle_uploads reads args.save_tmp)
+args = parse_args()
+# -------------------------- Global Configuration --------------------------
+# Device configuration: use specified device or auto-detect
+# (an explicit --device wins; otherwise "cuda" when available, else "cpu")
+DEVICE = args.device if args.device else ("cuda" if torch.cuda.is_available() else "cpu")
+# Input panoramic image target size (ERP: W=img_size, H=img_size//2)
+IMG_SIZE = args.img_size
+# Vertical crop ratio for panoramic image preprocessing
+# (fraction of the image height removed from the top and again from the bottom)
+CROP_RATIO = args.crop_ratio
+def resolve_model_path(model_path: "str | None") -> str:
+    """
+    Resolve model path: if a local file is specified and exists, use it directly;
+    otherwise download from HuggingFace Hub.
+    Requires `hf auth login` for gated repos.
+    Args:
+        model_path (str | None): Local checkpoint path, or None to always download
+    Returns:
+        str: Path of an existing local checkpoint file
+    Raises:
+        RuntimeError: If the download is rejected because the repo is gated or the
+                      request is unauthenticated (HTTP 401)
+        Exception: Any other download failure is re-raised unchanged
+    """
+    if model_path is not None:
+        if os.path.isfile(model_path):
+            return model_path
+        # A path was given but does not exist: report it, then fall back to the Hub
+        print(f"Specified model path '{model_path}' not found.")
+    print("Downloading model from HuggingFace (RealseeTechnology/argus-realsee3d)...")
+    try:
+        # Imported lazily so a local checkpoint works without huggingface_hub installed
+        from huggingface_hub import hf_hub_download
+        downloaded_path = hf_hub_download(
+            repo_id="RealseeTechnology/argus-realsee3d",
+            filename="argus_realsee3d.pt",
+        )
+    except Exception as e:
+        # Turn authentication failures into an actionable message; detection is by
+        # exception class name / message text so the exception type need not be imported
+        if "GatedRepoError" in type(e).__name__ or "401" in str(e):
+            raise RuntimeError(
+                "Cannot access gated model repo. Please authenticate first:\n"
+                "  1. Run: hf auth login\n"
+                "  2. Accept the model license at: https://huggingface.co/RealseeTechnology/argus-realsee3d\n"
+                "  3. Re-run this script.\n"
+                "Or download manually and specify --model_path."
+            ) from e
+        raise
+    print(f"Model downloaded to: {downloaded_path}")
+    return downloaded_path
+# Pre-trained model path (auto-download if not found locally)
+MODEL_PATH = resolve_model_path(args.model_path)
+# -------------------------- Model Initialization --------------------------
+# The model is built and loaded once at import time and shared by all requests
+print("Initializing and loading Argus model...")
+# Initialize Argus model with metric scale and learning ref reorder
+model = Argus(reorder_by_learning_ref=True, restore_metric_scale=True)
+# Load model weights (non-strict to ignore unused parameters)
+# The checkpoint is expected to be a dict holding the state dict under the "model" key
+# NOTE(review): strict=False also tolerates keys that are missing from the checkpoint,
+#               so a mismatched checkpoint loads without an error - confirm this is intended
+# NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources
+#               (consider weights_only=True if the checkpoint format allows it)
+model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE)["model"], strict=False)
+# Set model to evaluation mode and move to target device
+model.eval()
+model = model.to(DEVICE)
+# -------------------------- Image Preprocessing --------------------------
+def load_and_preprocess_images(image_path_list, target_size=IMG_SIZE):
+    """
+    Load and preprocess panoramic images for model inference
+    Args:
+        image_path_list (list): List of input image file paths
+        target_size (int): Target width of panoramic image (height = target_size//2)
+    Returns:
+        torch.Tensor: Preprocessed tensor with shape (S, C, H, W)
+                      S: sequence length, C: 3(RGB), H/W: image size after vertical crop
+    Raises:
+        ValueError: If image_path_list is empty or an image cannot be read
+    """
+    if not image_path_list:
+        raise ValueError("No input images to preprocess.")
+    pano_W, pano_H = target_size, target_size // 2
+    images = []
+    # Load and resize each image
+    for image_path in image_path_list:
+        img = cv2.imread(image_path)  # Load as BGR (H, W, C)
+        # cv2.imread signals failure by returning None instead of raising
+        if img is None:
+            raise ValueError(f"Failed to read image file: {image_path}")
+        h, w = img.shape[:2]
+        if w != pano_W or h != pano_H:
+            img = cv2.resize(img, (pano_W, pano_H), interpolation=cv2.INTER_AREA)
+        images.append(img)
+    # Stack and preprocess: crop vertical → BGR2RGB → normalize → reshape
+    images = np.stack(images)  # (S, H, W, C)
+    # Crop CROP_RATIO of the height from the top and from the bottom,
+    # and reverse the channel axis to convert BGR to RGB
+    row_start = int(pano_H * CROP_RATIO)
+    row_stop = int(pano_H * (1 - CROP_RATIO))
+    images = np.ascontiguousarray(images[:, row_start:row_stop, :, ::-1])
+    # Convert to tensor and normalize to [0,1]
+    images = torch.from_numpy(images).float() / 255.0
+    # Reshape to (S, C, H, W) for PyTorch model input
+    images = images.permute(0, 3, 1, 2)
+    return images
+# -------------------------- Point Cloud Utils --------------------------
+def save_point_cloud_to_ply(points: np.ndarray, save_path: str):
+    """
+    Write an (N, 3) point cloud to an ASCII PLY file
+    Args:
+        points (np.ndarray): Point coordinates with shape [N, 3] (one x, y, z row per point)
+        save_path (str): Destination PLY file path
+    Raises:
+        ValueError: If points is not a 2-D array with exactly 3 columns
+    """
+    has_expected_shape = points.ndim == 2 and points.shape[1] == 3
+    if not has_expected_shape:
+        raise ValueError(f"Point cloud must be [N,3], got {points.shape}")
+    # Header declares the vertex count and three float properties per vertex
+    header_lines = [
+        "ply",
+        "format ascii 1.0",
+        f"element vertex {points.shape[0]}",
+        "property float x",
+        "property float y",
+        "property float z",
+        "end_header",
+    ]
+    with open(save_path, "w", encoding="utf-8") as ply_file:
+        ply_file.write("\n".join(header_lines) + "\n")
+        # One "x y z" line per point, six decimals each
+        np.savetxt(ply_file, points, fmt="%.6f %.6f %.6f")
+# -------------------------- Core Model Inference --------------------------
+def run_model(target_dir, model) -> tuple:
+    """
+    Run Argus model inference on images in target_dir/images
+    Args:
+        target_dir (str): Root directory containing 'images' subfolder
+        model (Argus): Pre-initialized Argus model
+    Returns:
+        tuple: (predictions, inference_time)
+               predictions (dict): Model outputs plus "extrinsic" and
+                                   "world_points_from_depth", tensors converted to numpy
+               inference_time (float): Wall-clock seconds spent in the forward pass
+    Raises:
+        ValueError: If CUDA unavailable or no images found in target_dir
+    """
+    print(f"Processing images from {target_dir}")
+    # Enforce CUDA for inference
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Inference requires GPU acceleration.")
+    model = model.to(DEVICE)
+    model.eval()
+    # Load and sort input images
+    image_names = sorted(glob.glob(os.path.join(target_dir, "images", "*")))
+    print(f"Found {len(image_names)} input images")
+    if not image_names:
+        raise ValueError("No images found in target_dir/images. Check your upload.")
+    # Preprocess images and move to device
+    images = load_and_preprocess_images(image_names, target_size=IMG_SIZE).to(DEVICE)
+    print(f"Preprocessed images shape: {images.shape}")
+    # Mixed precision inference for speed and memory efficiency
+    print("Running model inference...")
+    # bfloat16 on devices with compute capability >= 8, float16 otherwise
+    dtype = (
+        torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+    )
+    # Synchronize around the forward pass so the timing covers the GPU work
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=dtype):
+        predictions = model(images)
+    torch.cuda.synchronize()
+    inference_time = time.perf_counter() - t0
+    print(f"Inference time: {inference_time:.3f} s")
+    # Convert pose encoding to extrinsic matrices (second return value is unused here)
+    print("Converting pose encoding to extrinsic matrices...")
+    extrinsic, _ = pose_encoding_to_extri360(pose_encoding=predictions["pose_enc"])
+    # Keep only the first three rows of each matrix
+    predictions["extrinsic"] = extrinsic[:, :, :3, :]
+    # Unproject depth map to 3D world coordinates
+    print("Computing 3D world points from depth map...")
+    world_points = unproject_depth_to_world_points(
+        predictions["depth"], predictions["extrinsic"], size=IMG_SIZE
+    )
+    predictions["world_points_from_depth"] = world_points
+    # Convert all torch tensors to numpy arrays and remove batch dimension
+    print("Converting model outputs to numpy arrays...")
+    # Only existing keys are reassigned, so the dict size is stable during iteration
+    for key, value in predictions.items():
+        if isinstance(value, torch.Tensor):
+            predictions[key] = value.cpu().float().numpy().squeeze(0)
+        elif isinstance(value, list):
+            # Lists are converted in place, element by element
+            for i, item in enumerate(value):
+                if isinstance(item, torch.Tensor):
+                    value[i] = item.cpu().float().numpy().squeeze(0)
+    print(f"Model prediction keys: {predictions.keys()}")
+    # Clear CUDA cache to save memory
+    torch.cuda.empty_cache()
+    return predictions, inference_time
+# -------------------------- Upload File Handling --------------------------
+def handle_uploads(input_images):
+    """
+    Copy uploaded images into a fresh '<target_dir>/images' working directory.
+    The working directory lives under --save_tmp when that option is set,
+    otherwise it is a new system temp directory.
+    Args:
+        input_images: Gradio uploaded file data (file paths, or dicts with a "name" path)
+    Returns:
+        tuple: (target_dir, sorted_image_paths)
+    """
+    t_begin = time.time()
+    gc.collect()
+    torch.cuda.empty_cache()
+    persist_root = args.save_tmp
+    if persist_root:
+        # Persistent mode: timestamped sub-directory below the --save_tmp root
+        os.makedirs(persist_root, exist_ok=True)
+        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        target_dir = os.path.join(persist_root, f"input_images_{stamp}")
+        # Start from an empty directory if the name is already taken (edge case)
+        if os.path.exists(target_dir):
+            shutil.rmtree(target_dir)
+    else:
+        target_dir = tempfile.mkdtemp(prefix="argus_")
+    target_img_dir = os.path.join(target_dir, "images")
+    os.makedirs(target_dir, exist_ok=True)
+    os.makedirs(target_img_dir, exist_ok=True)
+    # Copy every upload next to the others, keeping only its base name
+    copied_paths = []
+    for entry in input_images if input_images is not None else ():
+        source_path = entry["name"] if isinstance(entry, dict) else entry
+        destination = os.path.join(target_img_dir, os.path.basename(source_path))
+        shutil.copy(source_path, destination)
+        copied_paths.append(destination)
+    # Sorted order keeps frame indices stable across runs
+    copied_paths.sort()
+    print(
+        f"Files copied to {target_img_dir} | Time cost: {time.time() - t_begin:.3f}s"
+    )
+    return target_dir, copied_paths
+def update_gallery_on_upload(input_images):
+    """
+    Stage freshly uploaded files and refresh the gallery right away
+    Args:
+        input_images: Gradio uploaded file data
+    Returns:
+        tuple: (None, target_dir, image_paths, status message), or four Nones
+               when nothing was uploaded
+    """
+    if input_images:
+        target_dir, image_paths = handle_uploads(input_images)
+        status = "Upload complete. Click 'Reconstruct' to begin 3D processing."
+        return None, target_dir, image_paths, status
+    return None, None, None, None
+# -------------------------- 3D Reconstruction Pipeline --------------------------
+def gradio_demo(
+    target_dir,
+    conf_thres=5.0,
+    frame_filter="All",
+    show_cam=True,
+    show_index=True,
+    ceiling_remove=25,
+):
+    """
+    Main 3D reconstruction pipeline for Gradio interface
+    Args:
+        target_dir (str): Directory with input images
+        conf_thres (float): Confidence threshold for point cloud filtering
+        frame_filter (str): Filter frames to show in 3D model
+        show_cam (bool): Whether to show camera poses in 3D model
+        show_index (bool): Whether to show frame indices in 3D model
+        ceiling_remove (float): Percentage of top Y-coordinate points to remove as ceiling (0-100, 0=disabled)
+    Returns:
+        tuple: (GLB file path, log message, frame-filter dropdown, predictions,
+                measure image, measure text, measure view selector)
+    """
+    # Validate target directory; the emptiness / "None" checks run first because
+    # os.path.isdir raises TypeError when given None
+    if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+        return (
+            None,
+            "No valid target directory. Please upload images first.",
+            None,
+            None,
+            None,
+            "",
+            None,
+        )
+    start_time = time.time()
+    gc.collect()
+    torch.cuda.empty_cache()
+    # Prepare frame filter dropdown options ("<index>: <filename>" per input image)
+    target_img_dir = os.path.join(target_dir, "images")
+    all_files = (
+        sorted(os.listdir(target_img_dir)) if os.path.isdir(target_img_dir) else []
+    )
+    all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
+    frame_filter_choices = ["All"] + all_files
+    # Run model inference (no_grad also covers the post-processing done in run_model)
+    with torch.no_grad():
+        predictions, inference_time = run_model(target_dir, model)
+    # Save predictions to NPZ for later visualization update
+    pred_save_path = os.path.join(target_dir, "predictions.npz")
+    np.savez(pred_save_path, **predictions)
+    # Default frame filter to All if None
+    frame_filter = frame_filter if frame_filter is not None else "All"
+    # Generate unique GLB filename with parameters
+    glb_filename = f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_index{show_index}_ceiling{ceiling_remove}.glb"
+    glbfile = os.path.join(target_dir, glb_filename)
+    # Convert model predictions to GLB 3D model
+    glbscene = predictions_to_glb(
+        predictions,
+        conf_thres=conf_thres,
+        filter_by_frames=frame_filter,
+        show_cam=show_cam,
+        show_index=show_index,
+        ceiling_remove=ceiling_remove,
+        target_dir=target_dir,
+    )
+    glbscene.export(file_obj=glbfile)
+    # Prepare measure view (first view is shown initially)
+    measure_img, _ = update_measure_view(predictions, 0)
+    # Create view selector based on number of input images
+    num_views = (
+        predictions["images"].shape[0] if predictions["images"].shape[0] > 0 else 1
+    )
+    view_choices = [f"View {i + 1}" for i in range(num_views)]
+    measure_selector = gr.Dropdown(choices=view_choices, value=view_choices[0])
+    # Clean up memory
+    gc.collect()
+    torch.cuda.empty_cache()
+    total_time = time.time() - start_time
+    log_msg = f"Reconstruction Success ({len(all_files)} frames). Inference: {inference_time:.2f}s | Total: {total_time:.2f}s"
+    print(f"Reconstruction complete | Inference: {inference_time:.2f}s | Total: {total_time:.2f}s")
+    return (
+        glbfile,
+        log_msg,
+        gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True),
+        predictions,
+        measure_img,
+        "",
+        measure_selector,
+    )
+# -------------------------- UI Utility Functions --------------------------
+def clear_fields():
+    """Return None so the Gradio 3D model viewer bound to this callback is emptied"""
+    cleared_value = None
+    return cleared_value
+def update_log():
+    """Return the status text displayed while reconstruction is in progress"""
+    progress_message = "Loading and Reconstructing..."
+    return progress_message
+def update_visualization(
+    target_dir,
+    conf_thres,
+    frame_filter,
+    show_cam,
+    show_index,
+    ceiling_remove,
+    is_example,
+):
+    """
+    Update 3D visualization when parameters change (without re-running model)
+    Args:
+        target_dir (str): Directory holding predictions.npz from a previous reconstruction
+        conf_thres (float): Confidence threshold for point cloud filtering
+        frame_filter (str): Filter frames to show in 3D model (None is treated as "All")
+        show_cam (bool): Whether to show camera poses in 3D model
+        show_index (bool): Whether to show frame indices in 3D model
+        ceiling_remove (float): Percentage of top Y-coordinate points to remove as ceiling
+        is_example (str): Whether it's example data (skip if "True")
+    Returns:
+        tuple: (GLB file path, log message)
+    """
+    # Skip if loading example data
+    if is_example == "True":
+        return (
+            None,
+            "No reconstruction available. Please click the Reconstruct button first.",
+        )
+    # Validate target directory and prediction file
+    if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+        return None, "No valid reconstruction. Please upload and reconstruct first."
+    pred_path = os.path.join(target_dir, "predictions.npz")
+    if not os.path.exists(pred_path):
+        return None, f"No prediction file found at {pred_path}. Run Reconstruct first."
+    # Default frame filter to All if None (same rule as gradio_demo)
+    frame_filter = frame_filter if frame_filter is not None else "All"
+    # The GLB name encodes every visualization parameter, so an existing file
+    # with this name can be reused as-is
+    glb_filename = f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_index{show_index}_ceiling{ceiling_remove}.glb"
+    glbfile = os.path.join(target_dir, glb_filename)
+    if not os.path.exists(glbfile):
+        # Load saved predictions only when a new GLB actually has to be built
+        key_list = [
+            "pose_enc",
+            "depth",
+            "depth_conf",
+            "images",
+            "extrinsic",
+            "world_points_from_depth",
+        ]
+        # np.load returns a lazily-read archive; the context manager closes the file
+        with np.load(pred_path) as loaded:
+            predictions = {
+                key: np.array(loaded[key]) for key in key_list if key in loaded
+            }
+        glbscene = predictions_to_glb(
+            predictions,
+            conf_thres=conf_thres,
+            filter_by_frames=frame_filter,
+            show_cam=show_cam,
+            show_index=show_index,
+            ceiling_remove=ceiling_remove,
+            target_dir=target_dir,
+        )
+        glbscene.export(file_obj=glbfile)
+    return glbfile, "Visualization updated successfully"
+# -------------------------- Metric Measurement --------------------------
+def update_measure_view(predictions, view_index):
+    """
+    Update measure view with depth confidence mask overlay
+    Args:
+        predictions (dict): Model predictions with "images" (indexed per view, channel-first)
+                            and "depth_conf" (per-view confidence map)
+        view_index (int): Index of the view to show
+    Returns:
+        tuple: (processed_image, empty_list)
+               processed_image is a uint8 (H, W, C) array
+    """
+    # Get image (channel-first -> channel-last) and depth confidence
+    image = predictions["images"][view_index].transpose(1, 2, 0).copy()
+    depth_conf = predictions["depth_conf"][view_index]
+    # Convert image to uint8 format; values within [.., 1.0] are treated as normalized
+    if image.dtype != np.uint8:
+        image = (
+            (image * 255).astype(np.uint8)
+            if image.max() <= 1.0
+            else image.astype(np.uint8)
+        )
+    # Create depth confidence mask (filter low confidence areas)
+    conf_min = depth_conf.min()
+    conf_range = depth_conf.max() - conf_min
+    if conf_range == 0:
+        # Uniform confidence: nothing is relatively "low", and normalizing would
+        # divide by zero, so no pixel is marked invalid
+        invalid_mask = np.zeros(depth_conf.shape, dtype=bool)
+    else:
+        # Invalid = not above 5% of the per-view min-max normalized confidence
+        invalid_mask = ~((depth_conf - conf_min) / conf_range > 0.05)
+    # Blend a light pink tint over invalid areas (low confidence)
+    if invalid_mask.any():
+        overlay_color = np.array([255, 220, 220], dtype=np.uint8)
+        alpha = 0.5  # Transparency
+        blended = (1 - alpha) * image + alpha * overlay_color
+        # assumes depth_conf is (H, W) so the mask broadcasts over channels — TODO confirm
+        image = np.where(invalid_mask[..., None], blended, image).astype(np.uint8)
+    return image, []
+def navigate_measure_view(processed_data, current_selector_value, direction):
+    """
+    Navigate between different measure views (previous/next)
+    Args:
+        processed_data (dict): Predictions dict with an "images" array (one entry per view);
+                               None when no reconstruction is available
+        current_selector_value (str): Current selector label, e.g. "View 3"
+        direction (int): -1 for previous, +1 for next
+    Returns:
+        tuple: (new_selector_value, measure_image, empty_points)
+    """
+    # Nothing to navigate without predictions or with zero views
+    if processed_data is None or processed_data["images"].shape[0] == 0:
+        return "View 1", None, []
+    # Parse current view index from selector ("View N" -> N - 1); fall back to the
+    # first view when the label is missing or malformed
+    try:
+        current_view = int(current_selector_value.split()[1]) - 1
+    except (AttributeError, IndexError, ValueError):
+        current_view = 0
+    # Calculate new view index (circular navigation)
+    num_views = processed_data["images"].shape[0]
+    new_view = (current_view + direction) % num_views
+    # Update selector and image
+    new_selector = f"View {new_view + 1}"
+    measure_image, _ = update_measure_view(processed_data, new_view)
+    return new_selector, measure_image, []
+def measure(
+    processed_data, measure_points, current_view_selector, event: gr.SelectData
+):
+    """
+    Core metric measurement function: click to select points and calculate 3D distance
+    Args:
+        processed_data (dict): Predictions dict ("images", "depth", "world_points_from_depth", ...)
+        measure_points (list): Points clicked so far as (x, y) pixel tuples; the new
+                               click is appended to this list
+        current_view_selector (str): Current selector label, e.g. "View 3"
+        event (gr.SelectData): Gradio click event data (image coordinates)
+    Returns:
+        tuple: (annotated_image, measure_points, measurement_text)
+               After the second point the returned point list is reset to empty
+    """
+    try:
+        # Get current view index ("View N" -> N - 1); malformed labels select view 0
+        try:
+            current_view = int(current_view_selector.split()[1]) - 1
+        except (AttributeError, IndexError, ValueError):
+            current_view = 0
+        # Validate view index
+        current_view = (
+            0
+            if current_view < 0 or current_view >= processed_data["images"].shape[0]
+            else current_view
+        )
+        # Get clicked 2D point as (x, y)
+        point2d = event.index[0], event.index[1]
+        measure_points.append(point2d)
+        print(f"Measuring: clicked point {point2d} (view {current_view + 1})")
+        # Get base image and 3D points
+        image, _ = update_measure_view(processed_data, current_view)
+        image = image.copy()
+        points3d = processed_data["world_points_from_depth"][current_view]
+        # Mark clicked points with circles of color (255, 0, 0)
+        # (red if the image is RGB — presumably, since preprocessing converts BGR to RGB)
+        for p in measure_points:
+            if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
+                image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
+        # Report a depth value for every selected point
+        depth_text = ""
+        depth = processed_data["depth"][current_view].squeeze(axis=-1)
+        for i, p in enumerate(measure_points):
+            try:
+                # Arrays are indexed [row, col] = [y, x]
+                if 0 <= p[1] < depth.shape[0] and 0 <= p[0] < depth.shape[1]:
+                    d = depth[p[1], p[0]]
+                    depth_text += f"- **P{i + 1} depth: {d:.2f}m.**\n"
+                else:
+                    # Outside the depth map: fall back to the norm of the 3D point
+                    d = np.linalg.norm(points3d[p[1], p[0]], ord=2)
+                    depth_text += f"- **P{i + 1} dist: {d:.2f}m.**\n"
+            except (IndexError, TypeError, ValueError):
+                depth_text += f"- **P{i + 1}: Depth unavailable**\n"
+        # Calculate 3D distance for two points
+        if len(measure_points) == 2:
+            p1, p2 = measure_points
+            # Connect the two points with a line of the same color as the circles
+            if all(
+                0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]
+                for p in [p1, p2]
+            ):
+                image = cv2.line(image, p1, p2, color=(255, 0, 0), thickness=2)
+            # Calculate 3D Euclidean distance
+            try:
+                p1_3d = points3d[p1[1], p1[0]]
+                p2_3d = points3d[p2[1], p2[0]]
+                distance = np.linalg.norm(p1_3d - p2_3d)
+                distance_text = f"- **Distance: {distance:.2f}m**"
+            except (IndexError, TypeError, ValueError):
+                distance_text = "- **Distance: Unable to compute**"
+            # Reset points after measurement
+            measure_points = []
+            return [image, measure_points, depth_text + distance_text]
+        return [image, measure_points, depth_text]
+    except Exception as e:
+        # UI boundary: report the failure in the measurement text instead of raising
+        print(f"Measurement error: {str(e)}")
+        return None, [], f"Measure error: {str(e)}"
+# -------------------------- Example Data Loader --------------------------
+def get_scene_info(examples_dir):
+    """
+    Load example scene information from examples directory
+    Args:
+        examples_dir (str): Directory containing example scenes (one sub-folder per scene)
+    Returns:
+        list: List of scene dicts with name, path, thumbnail, num_images, image_files;
+              empty when examples_dir is not an existing directory
+    """
+    scenes = []
+    if not os.path.isdir(examples_dir):
+        return scenes
+    img_exts = ("*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif")
+    # Iterate over example scene folders
+    for scene_folder in sorted(os.listdir(examples_dir)):
+        scene_path = os.path.join(examples_dir, scene_folder)
+        if not os.path.isdir(scene_path):
+            continue
+        # Collect lower- and upper-case extension matches into a set: where glob
+        # matching is case-insensitive both patterns return the same files
+        found = set()
+        for ext in img_exts:
+            found.update(glob.glob(os.path.join(scene_path, ext)))
+            found.update(glob.glob(os.path.join(scene_path, ext.upper())))
+        # Skip empty folders
+        if not found:
+            continue
+        # Sort images; the first one serves as the thumbnail
+        image_files = sorted(found)
+        scenes.append(
+            {
+                "name": scene_folder,
+                "path": scene_path,
+                "thumbnail": image_files[0],
+                "num_images": len(image_files),
+                "image_files": image_files,
+            }
+        )
+    return scenes
+def example_pipeline(
+    scene,
+    conf_thres=5.0,
+    show_cam=True,
+    show_index=True,
+    ceiling_remove=25,
+):
+    """
+    Stage the images of an example scene and reconstruct it with all frames shown
+    Args:
+        scene (dict): Example scene info from get_scene_info (uses "image_files")
+        conf_thres (float): Confidence threshold for point cloud filtering
+        show_cam (bool): Whether to show camera poses in 3D model
+        show_index (bool): Whether to show frame indices in 3D model
+        ceiling_remove (float): Percentage of top Y-coordinate points to remove as ceiling
+    Returns:
+        tuple: (GLB file path, log message, target_dir, frame-filter dropdown, image paths,
+                predictions, measure image, measure text, measure view selector)
+    """
+    target_dir, image_paths = handle_uploads(scene["image_files"])
+    # Examples always start with every frame visible
+    reconstruction = gradio_demo(
+        target_dir, conf_thres, "All", show_cam, show_index, ceiling_remove
+    )
+    (
+        glbfile,
+        log_msg,
+        dropdown,
+        predictions,
+        measure_img,
+        measure_text,
+        measure_selector,
+    ) = reconstruction
+    # Same values as gradio_demo, with the staging directory and image paths spliced in
+    return (
+        glbfile,
+        log_msg,
+        target_dir,
+        dropdown,
+        image_paths,
+        predictions,
+        measure_img,
+        measure_text,
+        measure_selector,
+    )
+# -------------------------- 3D Visualization Utilities --------------------------
+class SevenSegmentDigit:
+    """7-segment display definition for digital watch style 3D point cloud generation"""
+    # 7 segments definition: A(top), B(upper right), C(lower right), D(bottom), E(lower left), F(upper left), G(middle)
+    # Each segment is a dense grid of (x, y, 0) points: horizontal bars are 80 x 10 samples
+    # (0.8 wide, 0.1 tall), vertical bars are 10 x 80 samples (0.1 wide, 0.5 tall)
+    SEGMENTS = {
+        # Segment A uses the sampled y values so the top bar has the same thickness as D and G
+        'A': np.array([(x, y, 0) for x in np.linspace(-0.4, 0.4, 80) for y in np.linspace(0.45, 0.55, 10)]),
+        'B': np.array([(x, y, 0) for x in np.linspace(0.4, 0.5, 10) for y in np.linspace(0, 0.5, 80)]),
+        'C': np.array([(x, y, 0) for x in np.linspace(0.4, 0.5, 10) for y in np.linspace(-0.5, 0, 80)]),
+        'D': np.array([(x, y, 0) for x in np.linspace(-0.4, 0.4, 80) for y in np.linspace(-0.55, -0.45, 10)]),
+        'E': np.array([(x, y, 0) for x in np.linspace(-0.5, -0.4, 10) for y in np.linspace(-0.5, 0, 80)]),
+        'F': np.array([(x, y, 0) for x in np.linspace(-0.5, -0.4, 10) for y in np.linspace(0, 0.5, 80)]),
+        'G': np.array([(x, y, 0) for x in np.linspace(-0.4, 0.4, 80) for y in np.linspace(-0.05, 0.05, 10)])
+    }
+    # Segment mapping for standard 0-9 digits (specify lit segments for each digit)
+    DIGIT_SEGMENTS = {
+        0: ['A', 'B', 'C', 'D', 'E', 'F'],
+        1: ['B', 'C'],
+        2: ['A', 'B', 'G', 'E', 'D'],
+        3: ['A', 'B', 'G', 'C', 'D'],
+        4: ['F', 'G', 'B', 'C'],
+        5: ['A', 'F', 'G', 'C', 'D'],
+        6: ['A', 'F', 'G', 'C', 'D', 'E'],
+        7: ['A', 'B', 'C'],
+        8: ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
+        9: ['A', 'B', 'C', 'D', 'F', 'G']
+    }
+    @classmethod
+    def get_digit_points(cls, digit, scale=0.05):
+        """
+        Generate 3D point cloud for a single digital watch style digit (0-9)
+        Args:
+            digit (int): Target digit (0-9 only)
+            scale (float): Scale factor for point cloud size
+        Returns:
+            np.ndarray: N×3 array of 3D points for the digit (all in the z=0 plane)
+        Raises:
+            ValueError: If digit is not in 0-9 range
+        """
+        if not 0 <= digit <= 9:
+            raise ValueError(f"Digit must be 0-9, got {digit}")
+        # Combine lit segments for the target digit
+        segments = cls.DIGIT_SEGMENTS[digit]
+        points = np.vstack([cls.SEGMENTS[seg] for seg in segments])
+        # Scale point cloud and shift its centroid to the origin
+        points = points * scale
+        points -= points.mean(axis=0)
+        # Remove duplicate points (adjacent segments share boundary samples)
+        points = np.unique(points.round(6), axis=0)
+        # Supplement sparse points (ensure dense distribution)
+        # NOTE(review): every digit built from the segments above has well over 200 points,
+        #               so this branch looks unreachable; it also builds a Trimesh without
+        #               faces — confirm sample_surface would work before relying on it
+        if len(points) < 200:
+            points = trimesh.sample.sample_surface(trimesh.Trimesh(points), 500)[0]
+        return points
+def create_number_point_cloud(number, scale=0.05):
+    """
+    Generate 3D point cloud for multi-digit number (digital watch style), facing +Y axis
+    Args:
+        number (int): Non-negative target integer (any digit length)
+        scale (float): Scale factor for single digit point cloud size
+    Returns:
+        trimesh.PointCloud: Colored (red) 3D point cloud of the number
+    Raises:
+        ValueError: If number is negative or non-integer
+    """
+    if not isinstance(number, int) or number < 0:
+        raise ValueError(f"Number must be non-negative integer, got {number}")
+    # Split number into individual digits and handle 0 specially
+    digits = [int(d) for d in str(number)] if number != 0 else [0]
+    all_points, spacing = [], scale * 1.2
+    total_width = (len(digits)-1) * spacing
+    # Arrange digits horizontally and center the whole number
+    for idx, d in enumerate(digits):
+        digit_points = SevenSegmentDigit.get_digit_points(d, scale)
+        digit_points[:, 0] += -total_width/2 + idx * spacing
+        all_points.append(digit_points)
+    # Merge all digit points and apply rotation to face +Y axis
+    all_points = np.vstack(all_points)
+    rotation = np.array([[1, 0, 0],
+                        [0, 0, -1],
+                        [0, 1, 0]])
+    all_points = np.dot(all_points, rotation.T)
+    # Create red point cloud (classic digital watch color)
+    colors = np.full((len(all_points), 3), [255, 0, 0], dtype=np.uint8)
+    return trimesh.PointCloud(all_points, colors)
+def predictions_to_glb(
+    predictions,
+    conf_thres=50.0,
+    filter_by_frames="all",
+    show_cam=True,
+    show_index=True,
+    ceiling_remove=25,
+    target_dir=None,
+    prediction_mode="Predicted Pointmap",
+) -> trimesh.Scene:
+    """
+    Convert VGGT model predictions to a 3D trimesh Scene (exportable to GLB)
+    Integrates colored point cloud, camera meshes and digital camera indexes
+    Args:
+        predictions (dict): Model prediction dict with keys:
+            - world_points: 3D point coordinates (S, H, W, 3)
+            - world_points_conf: Confidence scores (S, H, W)
+            - images: Input images (S, H, W, 3)
+            - extrinsic: Camera extrinsic matrices (S, 3, 4)
+        conf_thres (float): Low-confidence point filter (percentile, 0-100)
+        filter_by_frames (str): Frame filter ("all" or specific frame index like "0:")
+        show_cam (bool): Whether to add camera mesh visualization to scene
+        show_index (bool): Whether to add digital index point cloud above cameras
+        ceiling_remove (float): Percentage of top Y-coordinate points to remove as ceiling (0-100, 0=disabled)
+        target_dir (str): Directory for intermediate files (images)
+        prediction_mode (str): Prediction branch ("Predicted Pointmap" / others for depth-based)
+    Returns:
+        trimesh.Scene: 3D scene with point cloud, cameras and indexes (if enabled)
+    Raises:
+        ValueError: If predictions is not a dictionary
+    """
+    if not isinstance(predictions, dict):
+        raise ValueError("predictions must be a dictionary")
+    conf_thres = 10.0 if conf_thres is None else conf_thres
+    print("Building GLB scene")
+    selected_frame_idx = None
+    # Parse selected frame index from filter string (e.g., "0:" -> 0)
+    if filter_by_frames not in ["all", "All"]:
+        try:
+            selected_frame_idx = int(filter_by_frames.split(":")[0])
+        except (ValueError, IndexError):
+            pass
+    # Select prediction branch (Pointmap direct / Depthmap derived)
+    if "Pointmap" in prediction_mode:
+        print("Using Pointmap Branch")
+        if "world_points" in predictions:
+            pred_world_points = predictions["world_points"]
+            pred_world_points_conf = predictions.get("world_points_conf", np.ones_like(pred_world_points[..., 0]))
+        else:
+            print("Warning: world_points not found, falling back to depth-based world points")
+            pred_world_points = predictions["world_points_from_depth"]
+            pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
+    else:
+        print("Using Depthmap and Camera Branch")
+        pred_world_points = predictions["world_points_from_depth"]
+        pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
+    # Extract core prediction data: images and camera extrinsic matrices
+    images = predictions["images"]
+    camera_matrices = predictions["extrinsic"]
+    # Filter prediction data to selected single frame if specified
+    if selected_frame_idx is not None:
+        pred_world_points = pred_world_points[selected_frame_idx][None]
+        pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
+        images = images[selected_frame_idx][None]
+        camera_matrices = camera_matrices[selected_frame_idx][None]
+    # Reshape 3D points and convert image colors to 8-bit RGB (match point cloud)
+    vertices_3d = pred_world_points.reshape(-1, 3)
+    if images.ndim == 4 and images.shape[1] == 3:  # Convert NCHW to NHWC format
+        colors_rgb = np.transpose(images, (0, 2, 3, 1))
+    else:  # Direct use if already NHWC format
+        colors_rgb = images
+    colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
+    # Filter points by confidence threshold (remove low-confidence points)
+    conf = pred_world_points_conf.reshape(-1)
+    conf_threshold = 0.0 if conf_thres == 0.0 else np.percentile(conf, conf_thres)
+    conf_mask = (conf >= conf_threshold) & (conf > 1e-5)
+    vertices_3d = vertices_3d[conf_mask]
+    colors_rgb = colors_rgb[conf_mask]
+    # Create dummy point if no valid points left (avoid scene empty error)
+    if vertices_3d is None or np.asarray(vertices_3d).size == 0:
+        vertices_3d = np.array([[1, 0, 0]])
+        colors_rgb = np.array([[255, 255, 255]])
+        scene_scale = 1
+    else:
+        # Calculate scene scale by 5th/95th percentile bounding box diagonal
+        lower_percentile = np.percentile(vertices_3d, 5, axis=0)
+        upper_percentile = np.percentile(vertices_3d, 95, axis=0)
+        scene_scale = np.linalg.norm(upper_percentile - lower_percentile)
+    # Initialize 3D scene and colormap for camera unique colors
+    colormap = matplotlib.colormaps.get_cmap("gist_rainbow")
+    scene_3d = trimesh.Scene()
+    # Filter out ceiling points (remove top N% of Y-coordinates by percentile)
+    if ceiling_remove > 0 and vertices_3d.size > 1:
+        y_coords = vertices_3d[:, 1]
+        y_percentile = np.percentile(y_coords, ceiling_remove)
+        mask = y_coords > y_percentile
+        vertices_3d = vertices_3d[mask]
+        colors_rgb = colors_rgb[mask]
+    # Add colored 3D point cloud to the scene
+    point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
+    scene_3d.add_geometry(point_cloud_data)
+    # Convert 3x4 camera extrinsics to 4x4 homogeneous matrices
+    num_cameras = len(camera_matrices)
+    extrinsics_matrices = np.zeros((num_cameras, 4, 4))
+    extrinsics_matrices[:, :3, :4] = camera_matrices
+    extrinsics_matrices[:, 3, 3] = 1
+    # Add camera meshes and digital index point clouds to the scene
+    for i in range(num_cameras):
+        camera_to_world = extrinsics_matrices[i]
+        rgba_color = colormap(i / num_cameras)  # Unique color for each camera
+        current_color = tuple(int(255 * x) for x in rgba_color[:3])
+        # Add camera mesh to scene
+        if show_cam:
+            integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
+        # Add digital index point cloud above each camera (red, digital watch style)
+        if show_index:
+            camera_center = camera_to_world[:3, 3]
+            y_offset = 0.5  # Y-axis offset for index position (above camera)
+            number_position = camera_center + np.array([0, y_offset, 0])
+            # Generate index point cloud and translate to target position
+            number_scale = 0.3
+            number_pc = create_number_point_cloud(number=i, scale=number_scale)
+            number_pc.apply_translation(number_position)
+            scene_3d.add_geometry(number_pc)
+    # Align the whole scene to the first camera's viewing perspective
+    scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
+    print("GLB Scene built successfully")
+    return scene_3d
+def integrate_camera_into_scene(
+    scene: trimesh.Scene, transform: np.ndarray, face_colors: tuple, scene_scale: float
+):
+    """
+    Add a 3D cone-shaped camera mesh to the 3D scene with specified transform and color
+    Args:
+        scene (trimesh.Scene): Target 3D scene to add camera mesh
+        transform (np.ndarray): 4x4 camera-to-world transformation matrix
+        face_colors (tuple): RGB color tuple (0-255) for camera mesh faces
+        scene_scale (float): Overall scale of the 3D scene (for camera size adaptation)
+    """
+    # Set camera mesh size based on scene scale
+    cam_width = scene_scale * 0.02
+    cam_height = scene_scale * 0.02
+    # 45° Z-axis rotation for camera cone shape and backward translation
+    rot_45_degree = np.eye(4)
+    rot_45_degree[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
+    rot_45_degree[2, 3] = -cam_height
+    # Combine OpenGL conversion, rotation and camera transform matrices
+    opengl_transform = get_opengl_conversion_matrix()
+    complete_transform = transform @ opengl_transform @ rot_45_degree
+    camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4)
+    # Slight Z-axis rotation for camera mesh detail enhancement
+    slight_rotation = np.eye(4)
+    slight_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()
+    # Combine original, scaled and rotated cone vertices for dense camera mesh
+    vertices_combined = np.concatenate(
+        [
+            camera_cone_shape.vertices,
+            0.95 * camera_cone_shape.vertices,
+            transform_points(slight_rotation, camera_cone_shape.vertices),
+        ]
+    )
+    vertices_transformed = transform_points(complete_transform, vertices_combined)
+    # Compute camera mesh faces from cone shape
+    mesh_faces = compute_camera_faces(camera_cone_shape)
+    # Create camera mesh with specified color and add to scene
+    camera_mesh = trimesh.Trimesh(vertices=vertices_transformed, faces=mesh_faces)
+    camera_mesh.visual.face_colors[:, :3] = face_colors
+    scene.add_geometry(camera_mesh)
+def apply_scene_alignment(
+    scene_3d: trimesh.Scene, extrinsics_matrices: np.ndarray
+) -> trimesh.Scene:
+    """
+    Align the 3D scene to the first camera's viewing perspective with OpenGL conversion
+    Args:
+        scene_3d (trimesh.Scene): Unaligned 3D scene
+        extrinsics_matrices (np.ndarray): N×4×4 camera extrinsic matrices
+    Returns:
+        trimesh.Scene: Aligned 3D scene
+    """
+    # Get OpenGL coordinate conversion matrix and 180° Y-axis rotation for alignment
+    opengl_conversion_matrix = get_opengl_conversion_matrix()
+    align_rotation = np.eye(4)
+    align_rotation[:3, :3] = Rotation.from_euler("y", 180, degrees=True).as_matrix()
+    # Combine transformation matrices and apply to the whole scene
+    initial_transformation = np.linalg.inv(extrinsics_matrices[0]) @ opengl_conversion_matrix @ align_rotation
+    scene_3d.apply_transform(initial_transformation)
+    return scene_3d
+def get_opengl_conversion_matrix() -> np.ndarray:
+    """
+    Create 4x4 OpenGL coordinate system conversion matrix (flip Y and Z axes)
+    Returns:
+        np.ndarray: 4x4 identity-based conversion matrix
+    """
+    matrix = np.identity(4)
+    matrix[1, 1] = -1  # Flip Y axis
+    matrix[2, 2] = -1  # Flip Z axis
+    return matrix
+def transform_points(
+    transformation: np.ndarray, points: np.ndarray, dim: int = None
+) -> np.ndarray:
+    """
+    Apply 4x4 homogeneous transformation matrix to a set of 3D points
+    Args:
+        transformation (np.ndarray): 4x4 transformation matrix
+        points (np.ndarray): N×3 array of 3D points to transform
+        dim (int, optional): Target dimension of output points (default: 3)
+    Returns:
+        np.ndarray: N×dim array of transformed points (same shape as input except last dim)
+    """
+    points = np.asarray(points)
+    initial_shape = points.shape[:-1]
+    dim = dim or points.shape[-1]
+    # Transpose matrix and apply affine transformation to points
+    transformation = transformation.swapaxes(-1, -2)
+    points = points @ transformation[..., :-1, :] + transformation[..., -1:, :]
+    # Reshape transformed points to original shape (excluding last dimension)
+    result = points[..., :dim].reshape(*initial_shape, dim)
+    return result
+def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
+    """
+    Compute face indices for camera mesh from original cone shape faces (enhance detail)
+    Args:
+        cone_shape (trimesh.Trimesh): Original cone mesh for camera base shape
+    Returns:
+        np.ndarray: M×3 array of face indices for the camera mesh
+    """
+    faces_list = []
+    num_vertices_cone = len(cone_shape.vertices)
+    # Generate enhanced faces from cone faces (skip origin vertex 0)
+    for face in cone_shape.faces:
+        if 0 in face:
+            continue
+        v1, v2, v3 = face
+        v1_offset, v2_offset, v3_offset = face + num_vertices_cone
+        v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone
+        # Add multiple face variations for dense camera mesh
+        faces_list.extend(
+            [
+                (v1, v2, v2_offset),
+                (v1, v1_offset, v3),
+                (v3_offset, v2, v3),
+                (v1, v2, v2_offset_2),
+                (v1, v1_offset_2, v3),
+                (v3_offset_2, v2, v3),
+            ]
+        )
+    # Add reversed faces for double-sided rendering
+    faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
+    return np.array(faces_list)
+# -------------------------- Gradio UI Construction --------------------------
+if __name__ == "__main__":
+    # Script entry point: builds the Gradio Blocks UI, wires all event handlers, then launches the server.
+    # NOTE(review): `args` and the handler functions referenced below (example_pipeline, gradio_demo,
+    # update_visualization, measure, ...) are defined outside this section - presumably earlier in the file.
+    # Gradio theme configuration
+    theme = gr.themes.Ocean()
+    theme.set(
+        checkbox_label_background_fill_selected="*button_primary_background_fill",
+        checkbox_label_text_color_selected="*button_primary_text_color",
+    )
+    with gr.Blocks(
+        theme=theme,
+        title="Argus - 3D Reconstruction",
+        css="""
+        .custom-log * {
+            font-style: italic;
+            font-size: 20px !important;
+            background-image: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            -webkit-background-clip: text;
+            background-clip: text;
+            font-weight: 600 !important;
+            color: transparent !important;
+            text-align: center !important;
+        }
+        .example-log * {
+            font-size: 15px !important;
+            background-image: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            -webkit-background-clip: text;
+            background-clip: text;
+            color: transparent !important;
+            font-weight: 500 !important;
+        }
+        .header-banner {
+            background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
+            border-radius: 16px;
+            padding: 32px 24px 24px;
+            margin-bottom: 16px;
+            border: 1px solid #e2e8f0;
+            text-align: center;
+        }
+        .header-banner h1 {
+            font-size: 28px;
+            font-weight: 700;
+            color: #1e293b;
+            margin: 12px 0 8px;
+        }
+        .header-banner .links {
+            margin-top: 12px;
+            font-size: 15px;
+        }
+        .header-banner .links a {
+            margin: 0 10px;
+            color: #4f46e5;
+            text-decoration: none;
+            font-weight: 500;
+        }
+        .header-banner .links a:hover {
+            text-decoration: underline;
+        }
+        .instructions {
+            font-size: 14px;
+            color: #475569;
+            line-height: 1.7;
+            padding: 12px 20px;
+            background: #f8fafc;
+            border-radius: 10px;
+            border: 1px solid #e2e8f0;
+        }
+        .instructions ol {
+            padding-left: 20px;
+            margin: 8px 0;
+        }
+        .instructions li {
+            margin-bottom: 4px;
+        }
+        .param-group {
+            padding: 8px 0;
+        }
+        footer {visibility: hidden;}
+        """,
+    ) as demo:
+        # Hidden state components for data passing
+        # is_example / target_dir_output are invisible textboxes holding the strings "None"/"False"/a path
+        is_example = gr.Textbox(label="is_example", visible=False, value="None")
+        processed_data_state = gr.State(value=None)
+        measure_points_state = gr.State(value=[])
+        target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
+        # Load and display logo (base64 encoded)
+        # The image is inlined as a data URI so the header HTML needs no separate file route
+        root_dir = Path(__file__).parent
+        logo_path = root_dir / "assets" / "argus_logo.png"
+        if logo_path.exists():
+            with open(logo_path, "rb") as f:
+                logo_base64 = base64.b64encode(f.read()).decode()
+                logo_src = f"data:image/png;base64,{logo_base64}"
+        else:
+            logo_src = ""  # Fallback if logo not found
+        # UI Header and Instructions
+        gr.HTML(
+            f"""
+        <div class="header-banner">
+            <div style="display: flex; justify-content: center;">
+                <img src="{logo_src}" alt="Argus Logo" style="height: 72px; border-radius: 8px;">
+            </div>
+            <h1>Argus: Metric Panoramic 3D Reconstruction for Indoor Scenes</h1>
+            <div class="links">
+                <a href="https://github.com/realsee-developer/Argus" target="_blank">🌟 GitHub</a>
+                <a href="https://argus-paper.realsee.ai" target="_blank">🚀 Project Page</a>
+                <a href="https://arxiv.org/abs/2606.30047" target="_blank">📄 Paper</a>
+            </div>
+        </div>
+        <div class="instructions">
+            <ol>
+                <li><strong>Upload</strong> a set of ERP panoramic images on the left.</li>
+                <li><strong>Click "Reconstruct"</strong> to run the 3D reconstruction pipeline.</li>
+                <li><strong>Explore</strong> the 3D model — rotate, pan, zoom, and download the GLB.</li>
+                <li><strong>Measure</strong> — switch to the Metric tab and click two points to measure real-world distance.</li>
+            </ol>
+        </div>
+        """
+        )
+        # Main UI Layout (2 columns: upload/gallery | 3D model/measurement)
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2, min_width=280):
+                input_images = gr.File(
+                    file_count="multiple", label="📁 Upload Panoramic Images", interactive=True
+                )
+                image_gallery = gr.Gallery(
+                    label="Preview",
+                    columns=3,
+                    height="280px",
+                    object_fit="contain",
+                    preview=True,
+                )
+            with gr.Column(scale=5):
+                # Log output
+                log_output = gr.Markdown(
+                    "Upload panoramic images (ERP), then click Reconstruct.",
+                    elem_classes=["custom-log"],
+                )
+                # Tabbed interface: 3D Model + Metric Measure
+                with gr.Tabs():
+                    with gr.Tab("🏠 3D Model"):
+                        reconstruction_output = gr.Model3D(
+                            height=540, zoom_speed=0.5, pan_speed=0.5
+                        )
+                    with gr.Tab("📏 Metric Measure"):
+                        gr.Markdown(
+                            "Click two points on the panorama to measure the real-world distance between them."
+                        )
+                        with gr.Row():
+                            prev_measure_btn = gr.Button(
+                                "◀ Prev", size="sm", scale=1
+                            )
+                            # Starts with a single placeholder choice; it is also an output of the
+                            # reconstruction handlers below, which can replace it
+                            measure_view_selector = gr.Dropdown(
+                                choices=["View 1"],
+                                value="View 1",
+                                label="Select View",
+                                scale=3,
+                                interactive=True,
+                                allow_custom_value=True,
+                            )
+                            next_measure_btn = gr.Button("Next ▶", size="sm", scale=1)
+                        measure_image = gr.Image(
+                            type="numpy",
+                            show_label=False,
+                            format="webp",
+                            interactive=False,
+                            sources=[],
+                        )
+                        measure_text = gr.Markdown("")
+                # Action buttons
+                with gr.Row():
+                    submit_btn = gr.Button("🔨 Reconstruct", scale=2, variant="primary")
+                    # NOTE(review): the clear list omits processed_data_state and the measure
+                    # components, so those keep their previous values - confirm this is intended
+                    clear_btn = gr.ClearButton(
+                        [
+                            input_images,
+                            reconstruction_output,
+                            log_output,
+                            target_dir_output,
+                            image_gallery,
+                        ],
+                        value="🗑️ Clear",
+                        scale=1,
+                    )
+                # Reconstruction parameters
+                gr.Markdown("**Visualization Settings**")
+                with gr.Row():
+                    # Slider positional arguments: minimum, maximum, initial value, step
+                    conf_thres = gr.Slider(
+                        0, 100, 5, 1, label="Confidence Threshold (%)"
+                    )
+                    ceiling_remove = gr.Slider(
+                        0, 100, 25, 1, label="Ceiling Remove (%)"
+                    )
+                with gr.Row():
+                    frame_filter = gr.Dropdown(
+                        ["All"], "All", label="Show Points from Frame", scale=2
+                    )
+                    show_cam = gr.Checkbox(True, label="Show Camera")
+                    show_index = gr.Checkbox(True, label="Show Index")
+        # Example Scenes Section
+        gr.Markdown("---")
+        gr.Markdown("### 🖼️ Example Scenes")
+        gr.Markdown("Click any thumbnail to load and reconstruct.", elem_classes=["example-log"])
+        example_scenes = get_scene_info(args.examples_dir)
+        # Create 4-column example thumbnail grid
+        if example_scenes:
+            for i in range(0, len(example_scenes), 4):
+                with gr.Row():
+                    for j in range(4):
+                        idx = i + j
+                        if idx < len(example_scenes):
+                            scene = example_scenes[idx]
+                            with gr.Column(scale=1):
+                                # Per-thumbnail State so each click handler receives its own scene dict
+                                scene_state = gr.State(value=scene)
+                                scene_img = gr.Image(
+                                    value=scene["thumbnail"],
+                                    height=150,
+                                    interactive=False,
+                                    show_label=False,
+                                    sources=[],
+                                )
+                                gr.Markdown(
+                                    f"**{scene['name']}** \n {scene['num_images']} images"
+                                )
+                                # Bind thumbnail click to example pipeline
+                                scene_img.select(
+                                    example_pipeline,
+                                    [scene_state],
+                                    [
+                                        reconstruction_output,
+                                        log_output,
+                                        target_dir_output,
+                                        frame_filter,
+                                        image_gallery,
+                                        processed_data_state,
+                                        measure_image,
+                                        measure_text,
+                                        measure_view_selector,
+                                    ],
+                                )
+                        else:
+                            with gr.Column(scale=1):
+                                pass  # Empty column for grid alignment
+        # -------------------------- Gradio Event Bindings --------------------------
+        # Reconstruct button logic
+        # Chain: clear the 3D viewer -> refresh the log text -> run reconstruction -> set is_example to "False"
+        submit_btn.click(clear_fields, [], [reconstruction_output]).then(
+            update_log, [], [log_output]
+        ).then(
+            gradio_demo,
+            [
+                target_dir_output,
+                conf_thres,
+                frame_filter,
+                show_cam,
+                show_index,
+                ceiling_remove,
+            ],
+            [
+                reconstruction_output,
+                log_output,
+                frame_filter,
+                processed_data_state,
+                measure_image,
+                measure_text,
+                measure_view_selector,
+            ],
+        ).then(
+            lambda: "False", [], [is_example]
+        )
+        # Real-time parameter update for 3D visualization
+        # Every control passes the full set of current settings, whichever one changed
+        for param in [conf_thres, frame_filter, show_cam, show_index, ceiling_remove]:
+            param.change(
+                update_visualization,
+                [
+                    target_dir_output,
+                    conf_thres,
+                    frame_filter,
+                    show_cam,
+                    show_index,
+                    ceiling_remove,
+                    is_example,
+                ],
+                [reconstruction_output, log_output],
+            )
+        # Auto-update gallery on file upload
+        input_images.change(
+            update_gallery_on_upload,
+            [input_images],
+            [reconstruction_output, target_dir_output, image_gallery, log_output],
+        )
+        # Metric measure event bindings
+        measure_image.select(
+            measure,
+            [processed_data_state, measure_points_state, measure_view_selector],
+            [measure_image, measure_points_state, measure_text],
+        )
+        # Measure view navigation
+        # The lambdas fix the step direction: -1 for "Prev", +1 for "Next"
+        prev_measure_btn.click(
+            lambda d, s: navigate_measure_view(d, s, -1),
+            [processed_data_state, measure_view_selector],
+            [measure_view_selector, measure_image, measure_points_state],
+        )
+        next_measure_btn.click(
+            lambda d, s: navigate_measure_view(d, s, 1),
+            [processed_data_state, measure_view_selector],
+            [measure_view_selector, measure_image, measure_points_state],
+        )
+        # Update measure view when selector changes
+        # The label's second word is parsed as a 1-based view number ("View 3" -> index 2);
+        # an empty selection yields no image and an empty point list.
+        # NOTE(review): the dropdown sets allow_custom_value=True, so free text that is not
+        # "<word> <integer>" would raise here - confirm whether that needs guarding
+        measure_view_selector.change(
+            lambda d, s: (
+                update_measure_view(d, int(s.split()[1]) - 1) if s else (None, [])
+            ),
+            [processed_data_state, measure_view_selector],
+            [measure_image, measure_points_state],
+        )
+        # Footer acknowledgement
+        gr.HTML(
+            """
+        <hr style="margin-top: 40px; margin-bottom: 20px; border-color: #e2e8f0;">
+        <div style="text-align: center; font-size: 13px; color: #94a3b8; margin-bottom: 20px;">
+            <p style="margin-bottom: 8px; font-weight: 500; color: #64748b;">Acknowledgements</p>
+            <p>Built upon
+                <a href="https://github.com/facebookresearch/vggt" style="color: #6366f1;">VGGT</a> &
+                <a href="https://github.com/facebookresearch/map-anything" style="color: #6366f1;">Map-Anything</a>
+            </p>
+        </div>
+        """
+        )
+    # Launch Gradio demo
+    # The request queue is capped at 20 entries; sharing, host and port come from the CLI args
+    demo.queue(max_size=20).launch(
+        show_error=True,
+        share=args.share,
+        server_name=args.server_name,
+        server_port=args.port,
+    )

argus/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright 2026 Realsee. All rights reserved.
2	+ # Licensed under the Apache License, Version 2.0.

argus/heads/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright 2026 Realsee. All rights reserved.
2	+ # Licensed under the Apache License, Version 2.0.

argus/heads/camera_head.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from argus.layers import Mlp
+from argus.layers.block import Block
+from argus.heads.head_act import activate_pose
+class CameraHead(nn.Module):
+    """
+    CameraHead predicts camera parameters from token representations using iterative refinement.
+    It applies a series of transformer blocks (the "trunk") to dedicated camera tokens.
+    """
+    def __init__(
+        self,
+        dim_in: int = 2048,
+        trunk_depth: int = 4,
+        num_heads: int = 16,
+        mlp_ratio: int = 4,
+        init_values: float = 0.01,
+        trans_act: str = "linear",
+        quat_act: str = "linear",
+    ) -> None:
+        """
+        Build the transformer trunk, normalization layers and output branches of the camera head.
+        Args:
+            dim_in (int): Channel width of the camera tokens; used as the feature size of every layer here.
+            trunk_depth (int): Number of transformer Blocks stacked in the trunk.
+            num_heads (int): Passed to each Block as num_heads.
+            mlp_ratio (int): Passed to each Block as mlp_ratio.
+            init_values (float): Passed to each Block as init_values.
+            trans_act (str): Stored as self.trans_act; not used inside __init__.
+            quat_act (str): Stored as self.quat_act; not used inside __init__.
+        """
+        super().__init__()
+        # Length of the pose encoding vector predicted per camera token
+        self.target_dim = 9
+        self.trans_act = trans_act
+        self.quat_act = quat_act
+        self.trunk_depth = trunk_depth
+        # Build the trunk using a sequence of transformer blocks.
+        self.trunk = nn.Sequential(
+            *[
+                Block(dim=dim_in, num_heads=num_heads, mlp_ratio=mlp_ratio, init_values=init_values)
+                for _ in range(trunk_depth)
+            ]
+        )
+        # Normalizations for camera token and trunk output.
+        self.token_norm = nn.LayerNorm(dim_in)
+        self.trunk_norm = nn.LayerNorm(dim_in)
+        # Learnable empty camera pose token (initialized to zeros), embedded to dim_in by embed_pose.
+        self.empty_pose_tokens = nn.Parameter(torch.zeros(1, 1, self.target_dim))
+        self.embed_pose = nn.Linear(self.target_dim, dim_in)
+        # Module for producing modulation parameters: shift, scale, and a gate.
+        # The 3 * dim_in outputs are later split into those three dim_in-wide chunks.
+        self.poseLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim_in, 3 * dim_in, bias=True))
+        # Adaptive layer normalization without affine parameters.
+        self.adaln_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
+        # Maps trunk features to a target_dim-sized pose encoding update
+        self.pose_branch = Mlp(in_features=dim_in, hidden_features=dim_in // 2, out_features=self.target_dim, drop=0)
+        # conf branch for T and R (two outputs per token)
+        self.conf_branch = Mlp(in_features=dim_in, hidden_features=dim_in // 2, out_features=2, drop=0)
+    def forward(self, aggregated_tokens_list: list, num_iterations: int = 4) -> list:
+        """
+        Forward pass to predict camera parameters.
+        Args:
+            aggregated_tokens_list (list): List of token tensors from the network;
+                the last tensor is used for prediction.
+            num_iterations (int, optional): Number of iterative refinement steps. Defaults to 4.
+        Returns:
+            list: A list of predicted camera encodings (post-activation) from each iteration.
+        """
+        # Use tokens from the last block for camera prediction.
+        tokens = aggregated_tokens_list[-1]
+        # Extract the camera tokens
+        pose_tokens = tokens[:, :, 0]
+        pose_tokens = self.token_norm(pose_tokens)
+        pred_pose_enc_list = self.trunk_fn(pose_tokens, num_iterations)
+        return pred_pose_enc_list
+    def trunk_fn(self, pose_tokens: torch.Tensor, num_iterations: int) -> list:
+        """
+        Iteratively refine camera pose predictions.
+        Each iteration conditions the camera tokens on the previous pose estimate (or on the
+        learned empty pose for the first pass), runs the trunk, and adds the predicted delta
+        to the running pose encoding and to the running confidence logits.
+        Args:
+            pose_tokens (torch.Tensor): Normalized camera tokens with shape [B, S, C].
+            num_iterations (int): Number of refinement iterations.
+        Returns:
+            list: One tensor per iteration: the activated pose encoding with the activated
+                confidence (``1 + exp(logit)``) concatenated on the last dimension.
+        """
+        B, S, _ = pose_tokens.shape
+        pred_pose_enc = None
+        pred_pose_enc_conf = None
+        pred_pose_enc_list = []
+        for _ in range(num_iterations):
+            # Use a learned empty pose for the first iteration.
+            if pred_pose_enc is None:
+                module_input = self.embed_pose(self.empty_pose_tokens.expand(B, S, -1))
+            else:
+                # Detach the previous prediction to avoid backprop through time.
+                # Only the pose encoding is detached; the confidence logits are not.
+                pred_pose_enc = pred_pose_enc.detach()
+                module_input = self.embed_pose(pred_pose_enc)
+            # Generate modulation parameters and split them into shift, scale, and gate components.
+            shift_msa, scale_msa, gate_msa = self.poseLN_modulation(module_input).chunk(3, dim=-1)
+            # Adaptive layer normalization and modulation, followed by a residual connection.
+            pose_tokens_modulated = gate_msa * modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
+            pose_tokens_modulated = pose_tokens_modulated + pose_tokens
+            pose_tokens_modulated = self.trunk(pose_tokens_modulated)
+            # Normalize the trunk output once and share it between the pose and confidence
+            # branches instead of running the same LayerNorm twice.
+            trunk_features = self.trunk_norm(pose_tokens_modulated)
+            # Compute the delta updates for the pose encoding and its confidence logits.
+            pred_pose_enc_delta = self.pose_branch(trunk_features)
+            pred_pose_enc_conf_delta = self.conf_branch(trunk_features)
+            if pred_pose_enc is None:
+                pred_pose_enc = pred_pose_enc_delta
+                pred_pose_enc_conf = pred_pose_enc_conf_delta
+            else:
+                pred_pose_enc = pred_pose_enc + pred_pose_enc_delta
+                pred_pose_enc_conf = pred_pose_enc_conf + pred_pose_enc_conf_delta
+            # Apply final activation functions for translation, quaternion
+            activated_pose = activate_pose(
+                    pred_pose_enc, trans_act=self.trans_act, quat_act=self.quat_act
+                )
+            # Confidence is mapped to (1, inf) via 1 + exp(logit).
+            activated_conf = 1 + pred_pose_enc_conf.exp()
+            activated_pose = torch.cat([activated_pose, activated_conf], dim=-1)
+            pred_pose_enc_list.append(activated_pose)
+        return pred_pose_enc_list
+def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    """
+    Apply an affine modulation to ``x``: scale it by ``1 + scale`` and then add ``shift``.
+    """
+    # modified from https://github.com/facebookresearch/DiT/blob/796c29e532f47bba17c5b9c5eb39b9354b8b7c64/models.py#L19
+    scaled = x * (1 + scale)
+    return scaled + shift

argus/heads/dpt_head.py ADDED Viewed

	@@ -0,0 +1,474 @@

+import os
+from typing import List, Dict, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .head_act import activate_head
+from .utils import create_uv_grid, position_grid_to_embed
+class DPTHead(nn.Module):
+    """
+    DPT head for dense prediction tasks.
+    This implementation follows the architecture described in "Vision Transformers for Dense Prediction"
+    (https://arxiv.org/abs/2103.13413). The DPT head processes features from a vision transformer
+    backbone and produces dense predictions by fusing multi-scale features.
+    Args:
+        dim_in (int): Input dimension (channels).
+        patch_size (int, optional): Patch size. Default is 14.
+        output_dim (int, optional): Number of output channels. Default is 4.
+            The last channel is split off as confidence by ``activate_head``.
+        activation (str, optional): Activation type. Default is "inv_log".
+        conf_activation (str, optional): Confidence activation type. Default is "expp1".
+        features (int, optional): Feature channels for intermediate representations. Default is 256.
+        out_channels (List[int], optional): Output channels for each intermediate layer.
+            Four entries are expected: indices 0, 1 and 3 are read explicitly when the resize
+            layers are built, and ``scratch_forward`` unpacks exactly four feature maps.
+        intermediate_layer_idx (List[int], optional): Indices of layers from aggregated tokens used for DPT.
+            The i-th index is paired with the i-th projection / resize layer.
+        pos_embed (bool, optional): Whether to use positional embedding. Default is True.
+        feature_only (bool, optional): If True, return features only without the last several layers and activation head. Default is False.
+        down_ratio (int, optional): Downscaling factor for the output resolution. Default is 1.
+    """
+    # NOTE: the list defaults below are shared objects. They are only read in __init__, but
+    # `intermediate_layer_idx` is stored on the instance as-is, so mutating that attribute in
+    # place would also change the default seen by later instances.
+    def __init__(
+        self,
+        dim_in: int,
+        patch_size: int = 14,
+        output_dim: int = 4,
+        activation: str = "inv_log",
+        conf_activation: str = "expp1",
+        features: int = 256,
+        out_channels: List[int] = [256, 512, 1024, 1024],
+        intermediate_layer_idx: List[int] = [4, 11, 17, 23],
+        pos_embed: bool = True,
+        feature_only: bool = False,
+        down_ratio: int = 1,
+    ) -> None:
+        super(DPTHead, self).__init__()
+        self.patch_size = patch_size
+        self.activation = activation
+        self.conf_activation = conf_activation
+        self.pos_embed = pos_embed
+        self.feature_only = feature_only
+        self.down_ratio = down_ratio
+        self.intermediate_layer_idx = intermediate_layer_idx
+        # A single LayerNorm is shared by all selected token layers.
+        self.norm = nn.LayerNorm(dim_in)
+        # Projection layers for each output channel from tokens.
+        self.projects = nn.ModuleList(
+            [nn.Conv2d(in_channels=dim_in, out_channels=oc, kernel_size=1, stride=1, padding=0) for oc in out_channels]
+        )
+        # Resize layers for upsampling feature maps.
+        # In order: 4x upsample, 2x upsample, identity, and a stride-2 (downsampling) convolution,
+        # so the four feature maps end up at four different spatial scales.
+        self.resize_layers = nn.ModuleList(
+            [
+                nn.ConvTranspose2d(
+                    in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
+                ),
+                nn.ConvTranspose2d(
+                    in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
+                ),
+                nn.Identity(),
+                nn.Conv2d(
+                    in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
+                ),
+            ]
+        )
+        # `scratch` holds the per-scale 3x3 convolutions that map every scale to `features` channels.
+        self.scratch = _make_scratch(out_channels, features, expand=False)
+        # Attach additional modules to scratch.
+        self.scratch.stem_transpose = None
+        self.scratch.refinenet1 = _make_fusion_block(features)
+        self.scratch.refinenet2 = _make_fusion_block(features)
+        self.scratch.refinenet3 = _make_fusion_block(features)
+        # The coarsest fusion block has no skip input, hence no first residual unit.
+        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)
+        head_features_1 = features
+        head_features_2 = 32
+        if feature_only:
+            # Feature-only mode keeps the channel count and creates no output_conv2.
+            self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1)
+        else:
+            # Prediction mode halves the channels, then output_conv2 maps them to `output_dim`.
+            self.scratch.output_conv1 = nn.Conv2d(
+                head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
+            )
+            conv2_in_channels = head_features_1 // 2
+            self.scratch.output_conv2 = nn.Sequential(
+                nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
+            )
+    def forward(
+        self,
+        aggregated_tokens_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frames_chunk_size: int = 8,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Forward pass through the DPT head, supports processing by chunking frames.
+        Args:
+            aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
+            images (Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
+            patch_start_idx (int): Starting index for patch tokens in the token sequence.
+                Used to separate patch tokens from other tokens (e.g., camera or register tokens).
+            frames_chunk_size (int, optional): Number of frames to process in each chunk.
+                If None or greater than or equal to S, all frames are processed at once. Default: 8.
+        Returns:
+            Tensor or Tuple[Tensor, Tensor]:
+                - If feature_only=True: Feature maps with shape [B, S, C, H', W'], where H' and W'
+                  are the image size rounded down to a multiple of patch_size and divided by down_ratio
+                - Otherwise: Tuple of (predictions, confidence) as produced by ``activate_head``,
+                  which moves channels last and splits off the final channel: predictions have
+                  shape [B, S, H', W', output_dim - 1] and confidence has shape [B, S, H', W']
+        """
+        B, S, _, H, W = images.shape
+        # If frames_chunk_size is not specified or greater than S, process all frames at once
+        if frames_chunk_size is None or frames_chunk_size >= S:
+            return self._forward_impl(aggregated_tokens_list, images, patch_start_idx)
+        # Otherwise, process frames in chunks to manage memory usage
+        assert frames_chunk_size > 0
+        # Process frames in batches
+        all_preds = []
+        all_conf = []
+        for frames_start_idx in range(0, S, frames_chunk_size):
+            frames_end_idx = min(frames_start_idx + frames_chunk_size, S)
+            # Process batch of frames
+            if self.feature_only:
+                chunk_output = self._forward_impl(
+                    aggregated_tokens_list, images, patch_start_idx, frames_start_idx, frames_end_idx
+                )
+                all_preds.append(chunk_output)
+            else:
+                chunk_preds, chunk_conf = self._forward_impl(
+                    aggregated_tokens_list, images, patch_start_idx, frames_start_idx, frames_end_idx
+                )
+                all_preds.append(chunk_preds)
+                all_conf.append(chunk_conf)
+        # Concatenate results along the sequence dimension
+        if self.feature_only:
+            return torch.cat(all_preds, dim=1)
+        else:
+            return torch.cat(all_preds, dim=1), torch.cat(all_conf, dim=1)
+    def _forward_impl(
+        self,
+        aggregated_tokens_list: List[torch.Tensor],
+        images: torch.Tensor,
+        patch_start_idx: int,
+        frames_start_idx: int = None,
+        frames_end_idx: int = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Implementation of the forward pass through the DPT head.
+        This method processes a specific chunk of frames from the sequence.
+        Args:
+            aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
+            images (Tensor): Input images with shape [B, S, 3, H, W].
+            patch_start_idx (int): Starting index for patch tokens.
+            frames_start_idx (int, optional): Starting index for frames to process.
+            frames_end_idx (int, optional): Ending index for frames to process.
+                The frame slice is applied only when both indices are given.
+        Returns:
+            Tensor or Tuple[Tensor, Tensor]: Feature maps or (predictions, confidence).
+        """
+        if frames_start_idx is not None and frames_end_idx is not None:
+            images = images[:, frames_start_idx:frames_end_idx].contiguous()
+        # From here on S is the number of frames in the current chunk.
+        B, S, _, H, W = images.shape
+        patch_h, patch_w = H // self.patch_size, W // self.patch_size
+        out = []
+        dpt_idx = 0
+        for layer_idx in self.intermediate_layer_idx:
+            # Drop the leading non-patch tokens.
+            x = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]
+            # Select frames if processing a chunk
+            if frames_start_idx is not None and frames_end_idx is not None:
+                x = x[:, frames_start_idx:frames_end_idx]
+            # Fold frames into the batch axis: (B * S, tokens, channels).
+            x = x.reshape(B * S, -1, x.shape[-1])
+            x = self.norm(x)
+            # Tokens -> 2D feature map (B * S, channels, patch_h, patch_w).
+            x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
+            x = self.projects[dpt_idx](x)
+            if self.pos_embed:
+                x = self._apply_pos_embed(x, W, H)
+            x = self.resize_layers[dpt_idx](x)
+            out.append(x)
+            dpt_idx += 1
+        # Fuse features from multiple layers.
+        out = self.scratch_forward(out)
+        # Interpolate fused output to match target image resolution.
+        out = custom_interpolate(
+            out,
+            (int(patch_h * self.patch_size / self.down_ratio), int(patch_w * self.patch_size / self.down_ratio)),
+            mode="bilinear",
+            align_corners=True,
+        )
+        if self.pos_embed:
+            out = self._apply_pos_embed(out, W, H)
+        if self.feature_only:
+            # Unfold the batch axis back into (B, S, ...).
+            return out.view(B, S, *out.shape[1:])
+        out = self.scratch.output_conv2(out)
+        preds, conf = activate_head(out, activation=self.activation, conf_activation=self.conf_activation)
+        preds = preds.view(B, S, *preds.shape[1:])
+        conf = conf.view(B, S, *conf.shape[1:])
+        return preds, conf
+    def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
+        """
+        Apply positional embedding to tensor x.
+        The embedding is built on a UV grid matching x's spatial size (with the image aspect
+        ratio W / H), scaled by `ratio`, and added to x for every batch element.
+        """
+        patch_w = x.shape[-1]
+        patch_h = x.shape[-2]
+        pos_embed = create_uv_grid(patch_w, patch_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
+        # One embedding channel per feature channel of x.
+        pos_embed = position_grid_to_embed(pos_embed, x.shape[1])
+        pos_embed = pos_embed * ratio
+        # (H, W, C) -> (1, C, H, W), then broadcast over the batch without copying.
+        pos_embed = pos_embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
+        return x + pos_embed
+    def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Forward pass through the fusion blocks.
+        Features are fused from the last entry to the first; each fusion step resizes its
+        output to the spatial size of the next feature map.
+        Args:
+            features (List[Tensor]): List of feature maps from different layers (exactly four).
+        Returns:
+            Tensor: Fused feature map.
+        """
+        layer_1, layer_2, layer_3, layer_4 = features
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        out = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
+        # Drop local references as soon as a scale has been consumed.
+        del layer_4_rn, layer_4
+        out = self.scratch.refinenet3(out, layer_3_rn, size=layer_2_rn.shape[2:])
+        del layer_3_rn, layer_3
+        out = self.scratch.refinenet2(out, layer_2_rn, size=layer_1_rn.shape[2:])
+        del layer_2_rn, layer_2
+        # No explicit size here, so the block falls back to its own default resizing.
+        out = self.scratch.refinenet1(out, layer_1_rn)
+        del layer_1_rn, layer_1
+        out = self.scratch.output_conv1(out)
+        return out
+################################################################################
+# Modules
+################################################################################
+def _make_fusion_block(features: int, size: int = None, has_residual: bool = True, groups: int = 1) -> nn.Module:
+    """
+    Build a FeatureFusionBlock with the fixed settings used by this head.
+    The activation is an in-place ReLU; deconv, bn and expand are disabled and
+    align_corners is enabled. ``size``, ``has_residual`` and ``groups`` are forwarded as given.
+    """
+    fixed_options = dict(deconv=False, bn=False, expand=False, align_corners=True)
+    caller_options = dict(size=size, has_residual=has_residual, groups=groups)
+    return FeatureFusionBlock(features, nn.ReLU(inplace=True), **fixed_options, **caller_options)
+def _make_scratch(in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False) -> nn.Module:
+    """
+    Create a container module holding one bias-free 3x3 convolution per input scale.
+    The convolutions are attached as ``layer1_rn`` .. ``layer3_rn``, plus ``layer4_rn`` when
+    ``in_shape`` has at least four entries. With ``expand`` the output widths are
+    ``out_shape`` times 1, 2, 4 and 8; otherwise every layer outputs ``out_shape`` channels.
+    """
+    scratch = nn.Module()
+    width_multipliers = (1, 2, 4, 8) if expand else (1, 1, 1, 1)
+    stage_count = 4 if len(in_shape) >= 4 else 3
+    for stage in range(stage_count):
+        reassemble = nn.Conv2d(
+            in_shape[stage],
+            out_shape * width_multipliers[stage],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            groups=groups,
+        )
+        # Attribute assignment registers the convolution as a child, in stage order.
+        setattr(scratch, f"layer{stage + 1}_rn", reassemble)
+    return scratch
+class ResidualConvUnit(nn.Module):
+    """Residual unit: two activation + 3x3 convolution stages added back onto the input."""
+    def __init__(self, features, activation, bn, groups=1):
+        """Build the unit.
+        Args:
+            features (int): channel count; both convolutions keep it unchanged
+            activation: callable applied before each convolution
+            bn: stored on the instance; no normalization layer is created from it here
+            groups (int): ``groups`` argument passed to both convolutions
+        """
+        super().__init__()
+        self.bn = bn
+        self.groups = groups
+        conv_options = dict(kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+        self.conv1 = nn.Conv2d(features, features, **conv_options)
+        self.conv2 = nn.Conv2d(features, features, **conv_options)
+        # Normalization slots stay empty, so forward() skips them.
+        self.norm1 = None
+        self.norm2 = None
+        self.activation = activation
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, x):
+        """Run the residual branch and add it to the input.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: branch output plus ``x``
+        """
+        branch = self.conv1(self.activation(x))
+        if self.norm1 is not None:
+            branch = self.norm1(branch)
+        branch = self.conv2(self.activation(branch))
+        if self.norm2 is not None:
+            branch = self.norm2(branch)
+        # `x` is read again here, after the activation has already been applied to it above.
+        return self.skip_add.add(branch, x)
+class FeatureFusionBlock(nn.Module):
+    """Fuse a running feature map with an optional skip input, resize it, and project it."""
+    def __init__(
+        self,
+        features,
+        activation,
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+        size=None,
+        has_residual=True,
+        groups=1,
+    ):
+        """Build the block.
+        Args:
+            features (int): number of input features
+            activation: activation handed to the residual units
+            deconv: stored on the instance only
+            bn: forwarded to the residual units
+            expand: when equal to True, the 1x1 output convolution halves the channel count
+            align_corners: passed to the interpolation in forward()
+            size: default output size used when forward() receives none
+            has_residual: whether a residual unit for a second (skip) input is created
+            groups: ``groups`` argument for all convolutions
+        """
+        super().__init__()
+        self.deconv = deconv
+        self.align_corners = align_corners
+        self.groups = groups
+        self.expand = expand
+        out_features = features // 2 if self.expand == True else features
+        self.out_conv = nn.Conv2d(
+            features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=self.groups
+        )
+        # The skip-path residual unit exists only when a second input is expected.
+        if has_residual:
+            self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=self.groups)
+        self.has_residual = has_residual
+        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=self.groups)
+        self.skip_add = nn.quantized.FloatFunctional()
+        self.size = size
+    def forward(self, *xs, size=None):
+        """Fuse the inputs and resize the result.
+        ``xs[0]`` is the running feature map; ``xs[1]`` is the skip input and is only read when
+        the block was built with ``has_residual``. The output size is, in priority order, the
+        ``size`` argument, the ``size`` given at construction, or a 2x upscale.
+        Returns:
+            tensor: output
+        """
+        fused = xs[0]
+        if self.has_residual:
+            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))
+        fused = self.resConfUnit2(fused)
+        if size is not None:
+            resize_kwargs = {"size": size}
+        elif self.size is not None:
+            resize_kwargs = {"size": self.size}
+        else:
+            resize_kwargs = {"scale_factor": 2}
+        fused = custom_interpolate(fused, **resize_kwargs, mode="bilinear", align_corners=self.align_corners)
+        return self.out_conv(fused)
+def custom_interpolate(
+    x: torch.Tensor,
+    size: Tuple[int, int] = None,
+    scale_factor: float = None,
+    mode: str = "bilinear",
+    align_corners: bool = True,
+) -> torch.Tensor:
+    """
+    Interpolate ``x`` to ``size``, splitting the batch into chunks when the output is huge.
+    When ``size`` is omitted it is derived from the last two dimensions of ``x`` multiplied
+    by ``scale_factor`` (truncated to int). If the output element count exceeds the
+    threshold below, ``x`` is chunked along dim 0, each chunk is interpolated on its own,
+    and the results are concatenated; this avoids INT_MAX issues in
+    nn.functional.interpolate.
+    """
+    target_size = size
+    if target_size is None:
+        target_size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
+    INT_MAX = 1610612736
+    output_elements = target_size[0] * target_size[1] * x.shape[0] * x.shape[1]
+    if output_elements <= INT_MAX:
+        return nn.functional.interpolate(x, size=target_size, mode=mode, align_corners=align_corners)
+    resized_parts = []
+    for part in torch.chunk(x, chunks=(output_elements // INT_MAX) + 1, dim=0):
+        resized_parts.append(
+            nn.functional.interpolate(part, size=target_size, mode=mode, align_corners=align_corners)
+        )
+    return torch.cat(resized_parts, dim=0).contiguous()

argus/heads/head_act.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import torch
+import torch.nn.functional as F
+def activate_pose(pred_pose_enc, trans_act="linear", quat_act="linear"):
+    """
+    Activate the translation and quaternion parts of a pose encoding.
+    Args:
+        pred_pose_enc: Tensor whose last dimension starts with [translation (3), quaternion (4)];
+            any further entries are dropped
+        trans_act: Activation type for the translation component
+        quat_act: Activation type for the quaternion component
+    Returns:
+        Tensor with the activated translation and quaternion concatenated on the last dimension
+    """
+    translation = base_pose_act(pred_pose_enc[..., :3], trans_act)
+    quaternion = base_pose_act(pred_pose_enc[..., 3:7], quat_act)
+    # Everything past index 7 is intentionally left out of the result.
+    return torch.cat([translation, quaternion], dim=-1)
+def base_pose_act(pose_enc, act_type="linear"):
+    """
+    Apply one named activation to a pose tensor.
+    Args:
+        pose_enc: Tensor containing encoded pose parameters
+        act_type: One of "linear", "inv_log", "exp", "relu", "expp1", "expp0", "sigmoid"
+    Returns:
+        Activated pose parameters
+    Raises:
+        ValueError: If ``act_type`` is not one of the names above
+    """
+    if act_type == "linear":
+        return pose_enc
+    if act_type == "inv_log":
+        return inverse_log_transform(pose_enc)
+    # "exp" and "expp0" are two names for the same plain exponential.
+    if act_type == "exp":
+        return torch.exp(pose_enc)
+    if act_type == "relu":
+        return F.relu(pose_enc)
+    if act_type == "expp1":
+        return 1 + pose_enc.exp()
+    if act_type == "expp0":
+        return pose_enc.exp()
+    if act_type == "sigmoid":
+        return torch.sigmoid(pose_enc)
+    raise ValueError(f"Unknown act_type: {act_type}")
+def activate_head(out, activation="norm_exp", conf_activation="expp1"):
+    """
+    Split a dense network output into activated point values and a confidence map.
+    Args:
+        out: Network output tensor (B, C, H, W)
+        activation: Activation type applied to the first C-1 channels
+        conf_activation: Activation type applied to the last channel
+    Returns:
+        Tuple of (points tensor (B, H, W, C-1), confidence tensor (B, H, W))
+    Raises:
+        ValueError: If ``activation`` or ``conf_activation`` is not a known name
+    """
+    # Move channels from dim 1 to the last dim: (B, C, H, W) -> (B, H, W, C).
+    channels_last = out.permute(0, 2, 3, 1)
+    # The last channel is the confidence logit; everything before it is the point encoding.
+    xyz = channels_last[:, :, :, :-1]
+    conf = channels_last[:, :, :, -1]
+    if activation == "linear":
+        pts3d = xyz
+    elif activation == "norm_exp":
+        # Keep the direction and replace the length d with expm1(d).
+        d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
+        pts3d = (xyz / d) * torch.expm1(d)
+    elif activation == "norm":
+        pts3d = xyz / xyz.norm(dim=-1, keepdim=True)
+    elif activation == "inv_log":
+        pts3d = inverse_log_transform(xyz)
+    elif activation == "xy_inv_log":
+        # Only z goes through the inverse log; x and y are then scaled by that z.
+        xy, z = xyz.split([2, 1], dim=-1)
+        z = inverse_log_transform(z)
+        pts3d = torch.cat([xy * z, z], dim=-1)
+    elif activation == "exp":
+        pts3d = torch.exp(xyz)
+    elif activation == "relu":
+        pts3d = F.relu(xyz)
+    elif activation == "sigmoid":
+        pts3d = torch.sigmoid(xyz)
+    else:
+        raise ValueError(f"Unknown activation: {activation}")
+    if conf_activation == "sigmoid":
+        conf_out = torch.sigmoid(conf)
+    elif conf_activation == "expp0":
+        conf_out = conf.exp()
+    elif conf_activation == "expp1":
+        conf_out = 1 + conf.exp()
+    else:
+        raise ValueError(f"Unknown conf_activation: {conf_activation}")
+    return pts3d, conf_out
+def inverse_log_transform(y):
+    """
+    Apply the sign-preserving inverse log transform: sign(y) * (exp(|y|) - 1).
+    Args:
+        y: Input tensor
+    Returns:
+        Tensor of the same shape with the transform applied element-wise
+    """
+    magnitude = torch.expm1(torch.abs(y))
+    return torch.sign(y) * magnitude

argus/heads/utils.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import torch
+def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
+    """
+    Convert a 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)
+    Args:
+        pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
+        embed_dim: Output channel dimension for embeddings; half is used per coordinate
+        omega_0: Frequency base forwarded to make_sincos_pos_embed
+    Returns:
+        Tensor of shape (H, W, embed_dim) with positional embeddings
+    """
+    H, W, grid_dim = pos_grid.shape
+    assert grid_dim == 2
+    coords = pos_grid.reshape(-1, grid_dim)  # (H*W, 2)
+    # Embed the two coordinate columns independently; each result is (H*W, embed_dim // 2).
+    per_axis = [
+        make_sincos_pos_embed(embed_dim // 2, coords[:, axis], omega_0=omega_0) for axis in range(grid_dim)
+    ]
+    # Concatenate the first-coordinate block and the second-coordinate block on the channel axis.
+    combined = torch.cat(per_axis, dim=-1)  # (H*W, embed_dim)
+    return combined.view(H, W, embed_dim)
+def make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100) -> torch.Tensor:
+    """
+    Build a 1D sine/cosine positional embedding for a set of positions.
+    Args:
+    - embed_dim: The embedding dimension (must be even).
+    - pos: The positions to embed; flattened to shape (M,).
+    - omega_0: Base of the geometric frequency schedule.
+    Returns:
+    - emb: float32 tensor of shape (M, embed_dim): all sines first, then all cosines.
+    """
+    assert embed_dim % 2 == 0
+    device = pos.device
+    # Frequencies are built in double precision, except on MPS where float32 is used.
+    freq_dtype = torch.float32 if device.type == "mps" else torch.double
+    exponents = torch.arange(embed_dim // 2, dtype=freq_dtype, device=device)
+    exponents = exponents / (embed_dim / 2.0)
+    inv_freq = 1.0 / omega_0**exponents  # (D/2,)
+    flat_pos = pos.reshape(-1)  # (M,)
+    angles = torch.einsum("m,d->md", flat_pos, inv_freq)  # (M, D/2), outer product
+    sin_part = torch.sin(angles)
+    cos_part = torch.cos(angles)
+    return torch.cat([sin_part, cos_part], dim=1).float()  # (M, D)
+# Inspired by https://github.com/microsoft/moge
+def create_uv_grid(
+    width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None
+) -> torch.Tensor:
+    """
+    Create a normalized UV grid with ``width`` columns and ``height`` rows.
+    The horizontal and vertical spans follow the aspect ratio and are normalized by the
+    diagonal of the plane. Samples sit at cell centers, so the first and last coordinates
+    are pulled in from (-span, +span) by a factor of (n - 1) / n along each axis.
+    Args:
+        width (int): Number of points horizontally.
+        height (int): Number of points vertically.
+        aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
+        dtype (torch.dtype, optional): Data type of the resulting tensor.
+        device (torch.device, optional): Device on which the tensor is created.
+    Returns:
+        torch.Tensor: A (height, width, 2) tensor of UV coordinates (the meshgrid uses
+            "xy" indexing); the last dimension holds (u, v) = (x, y).
+    """
+    # Derive aspect ratio if not explicitly provided
+    if aspect_ratio is None:
+        aspect_ratio = float(width) / float(height)
+    # Half-extents along each axis, normalized by the plane diagonal
+    diagonal = (aspect_ratio**2 + 1.0) ** 0.5
+    half_u = aspect_ratio / diagonal
+    half_v = 1.0 / diagonal
+    # Outermost sample positions along each axis
+    u_edge = half_u * (width - 1) / width
+    v_edge = half_v * (height - 1) / height
+    # 1D coordinate axes
+    u_axis = torch.linspace(-u_edge, u_edge, steps=width, dtype=dtype, device=device)
+    v_axis = torch.linspace(-v_edge, v_edge, steps=height, dtype=dtype, device=device)
+    # "xy" indexing yields (height, width) grids
+    grid_u, grid_v = torch.meshgrid(u_axis, v_axis, indexing="xy")
+    return torch.stack((grid_u, grid_v), dim=-1)
+def reorder_by_reference(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
+    """Reorder tensor views to place the selected reference view at the first position (index 0),
+    while keeping the remaining views in their original order (excluding the reference view).
+    Args:
+        x: Input tensor with shape (B, S, ...) where B = batch size, S = number of views,
+           and trailing dimensions can be arbitrary (e.g., N, C for patch tokens).
+        b_idx: 1D tensor of shape (B,) containing the index of the reference view for each batch element,
+               each value must be in the range [0, S-1].
+    Returns:
+        Reordered tensor with the same shape as input, where the reference view is at position 0
+        and other views retain their original order (skipping the reference view).
+        When S <= 1 the input tensor itself is returned.
+    Example:
+        If B=1, S=5, b_idx=[2], input view order is [0,1,2,3,4],
+        output order becomes [2,0,1,3,4].
+    """
+    # Extract batch size (B) and number of views (S) from input shape
+    B, S = x.shape[0], x.shape[1]
+    # No reordering needed if only one view exists
+    if S <= 1:
+        return x
+    # Keep every index tensor on the same device as `x` so that the comparison, the
+    # concatenation and the final gather never mix devices.
+    b_idx = b_idx.to(x.device)
+    # Generate base index matrix (B, S): each row is [0, 1, ..., S-1] (same across batches)
+    idx = torch.arange(S, device=x.device).expand(B, -1)
+    # Create mask to exclude reference view indices (True for non-reference positions)
+    mask = idx != b_idx.unsqueeze(1)
+    # Build reorder indices: [reference_idx] + [all non-reference indices in original order]
+    # Each row of the mask drops exactly one entry, so the survivors reshape to (B, S-1)
+    reorder_idx = torch.cat([b_idx.unsqueeze(1), idx[mask].reshape(B, S - 1)], dim=1)
+    # Advanced indexing to reorder: batch indices (B,1) paired with reorder indices (B,S)
+    batch_idx = torch.arange(B, device=x.device).unsqueeze(1)
+    return x[batch_idx, reorder_idx]

argus/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .mlp import Mlp
+from .patch_embed import PatchEmbed

argus/layers/attention.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import logging
+import os
+import warnings
+from torch import Tensor
+from torch import nn
+import torch.nn.functional as F
+XFORMERS_AVAILABLE = False
+class Attention(nn.Module):
+    """Multi-head self-attention with optional q/k normalization and rotary embedding.
+    ``fused_attn`` selects F.scaled_dot_product_attention; otherwise attention is
+    computed explicitly (scale, softmax, dropout, weighted sum).
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        norm_layer: nn.Module = nn.LayerNorm,
+        qk_norm: bool = False,
+        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
+        rope=None,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        head_dim = dim // num_heads
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = head_dim**-0.5
+        self.fused_attn = fused_attn
+        # A single linear layer produces q, k and v stacked on the channel axis.
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        # Per-head normalization of q and k is optional; otherwise both are pass-through.
+        self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.rope = rope
+    def forward(self, x: Tensor, pos=None) -> Tensor:
+        """Apply self-attention to ``x`` of shape (B, N, C); ``pos`` is passed to ``rope`` if set."""
+        batch, seq_len, channels = x.shape
+        # (B, N, 3C) -> (3, B, heads, N, head_dim)
+        stacked = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
+        q, k, v = stacked.permute(2, 0, 3, 1, 4).unbind(0)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self.rope is not None:
+            q = self.rope(q, pos)
+            k = self.rope(k, pos)
+        if not self.fused_attn:
+            scores = (q * self.scale) @ k.transpose(-2, -1)
+            weights = self.attn_drop(scores.softmax(dim=-1))
+            context = weights @ v
+        else:
+            # Dropout is only active in training mode, mirroring nn.Dropout.
+            context = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0
+            )
+        # (B, heads, N, head_dim) -> (B, N, C)
+        merged = context.transpose(1, 2).reshape(batch, seq_len, channels)
+        return self.proj_drop(self.proj(merged))
+class MemEffAttention(Attention):
+    """Attention variant with an optional xFormers memory-efficient path.
+    While ``XFORMERS_AVAILABLE`` is false the call is delegated to ``Attention.forward``
+    and ``attn_bias`` must be ``None``.
+    """
+    def forward(self, x: Tensor, attn_bias=None, pos=None) -> Tensor:
+        # Positional input is not supported by this variant.
+        assert pos is None
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        # NOTE(review): `unbind` and `memory_efficient_attention` are not imported in the
+        # visible part of this file (presumably xformers.ops helpers - confirm). This branch
+        # only runs if XFORMERS_AVAILABLE is changed to True.
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = unbind(qkv, 2)
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

argus/layers/block.py ADDED Viewed

	@@ -0,0 +1,247 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import logging
+import os
+from typing import Callable, List, Any, Tuple, Dict
+import warnings
+import torch
+from torch import nn, Tensor
+from .attention import Attention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+# xFormers support is hard-disabled in this module: NestedTensorBlock.forward raises
+# for list inputs while this flag is False.
+XFORMERS_AVAILABLE = False
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and feed-forward residual branches.
+    Each branch is ``norm -> sublayer -> LayerScale`` and is added back to the input,
+    with optional stochastic depth during training.
+    Args:
+        dim: Channel width of the tokens.
+        num_heads: Number of attention heads.
+        mlp_ratio: Hidden width of the feed-forward layer as a multiple of ``dim``.
+        qkv_bias: Bias flag forwarded to the attention q/k/v projection.
+        proj_bias: Bias flag forwarded to the attention output projection.
+        ffn_bias: Bias flag forwarded to the feed-forward layer.
+        drop: Dropout probability for the attention projection and the feed-forward layer.
+        attn_drop: Dropout probability for the attention weights.
+        init_values: LayerScale initial value; a falsy value disables LayerScale.
+        drop_path: Stochastic-depth rate.
+        act_layer: Activation factory for the feed-forward layer.
+        norm_layer: Normalisation factory called with ``dim``.
+        attn_class: Attention implementation to instantiate.
+        ffn_layer: Feed-forward implementation to instantiate.
+        qk_norm: Forwarded to the attention layer.
+        fused_attn: Forwarded to the attention layer.
+        rope: Forwarded to the attention layer.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+        qk_norm: bool = False,
+        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
+        rope=None,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            qk_norm=qk_norm,
+            fused_attn=fused_attn,
+            rope=rope,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, bias=ffn_bias
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.sample_drop_ratio = drop_path
+    def forward(self, x: Tensor, pos=None) -> Tensor:
+        """Apply both residual branches to ``x``; ``pos`` is forwarded to the attention layer."""
+        def attn_residual_func(x: Tensor, pos=None) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x), pos=pos))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x, pos=pos, residual_func=attn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+            x = drop_add_residual_stochastic_depth(
+                x, residual_func=ffn_residual_func, sample_drop_ratio=self.sample_drop_ratio
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x, pos=pos))
+            # The feed-forward branch uses its own drop-path module (was drop_path1).
+            x = x + self.drop_path2(ffn_residual_func(x))
+        else:
+            x = x + attn_residual_func(x, pos=pos)
+            x = x + ffn_residual_func(x)
+        return x
+def drop_add_residual_stochastic_depth(
+    x: Tensor, residual_func: Callable[[Tensor], Tensor], sample_drop_ratio: float = 0.0, pos=None
+) -> Tensor:
+    """Add ``residual_func`` output to a random subset of batch rows, rescaled.
+    Args:
+        x: Input of shape (b, n, d).
+        residual_func: Callable producing the residual for the selected rows; it also
+            receives ``pos=...`` when ``pos`` is given.
+        sample_drop_ratio: Fraction of batch rows whose residual is skipped.
+        pos: Optional per-row tensor, indexed with the same row selection.
+    Returns:
+        Tensor shaped like ``x`` with the scaled residual added to the selected rows.
+    """
+    batch_size, _, _ = x.shape
+    # At least one row always keeps its residual.
+    kept = max(int(batch_size * (1 - sample_drop_ratio)), 1)
+    keep_idx = torch.randperm(batch_size, device=x.device)[:kept]
+    # Forward ``pos`` (restricted to the same rows) only when one was supplied.
+    extra = {} if pos is None else {"pos": pos[keep_idx]}
+    branch_out = residual_func(x[keep_idx], **extra)
+    # alpha rescales the kept residuals by batch_size / kept.
+    summed = torch.index_add(
+        x.flatten(1), 0, keep_idx, branch_out.flatten(1).to(dtype=x.dtype), alpha=batch_size / kept
+    )
+    return summed.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    """Pick a random subset of batch rows and the factor used to rescale their residuals.
+    Returns:
+        ``(indices, scale)`` where ``indices`` is a 1-D tensor of selected row indices
+        (at least one) and ``scale`` is ``batch_size / len(indices)``.
+    """
+    batch_size, _, _ = x.shape
+    keep = max(int(batch_size * (1 - sample_drop_ratio)), 1)
+    chosen = torch.randperm(batch_size, device=x.device)[:keep]
+    return chosen, batch_size / keep
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    """Add ``residual`` (scaled) onto the rows of ``x`` selected by ``brange``.
+    Without ``scaling_vector`` the result is computed on ``x`` flattened from dim 1, so it
+    has shape (b, n*d); callers reshape it back.
+    """
+    if scaling_vector is not None:
+        # NOTE(review): `scaled_index_add` is not defined in the visible part of this file
+        # (presumably an xFormers op - confirm).
+        return scaled_index_add(
+            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+        )
+    flat_residual = residual.flatten(1).to(dtype=x.dtype)
+    return torch.index_add(x.flatten(1), 0, brange, flat_residual, alpha=residual_scale_factor)
+# Attention-bias objects keyed by the tuple of (batch_size, seq_len) pairs of the inputs.
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    Select rows (when ``branges`` is given), concatenate all tensors into a single
+    batch-size-1 sequence, and return ``(attn_bias, concatenated)`` with the bias taken
+    from ``attn_bias_cache``.
+    """
+    if branges is None:
+        batch_sizes = [x.shape[0] for x in x_list]
+    else:
+        batch_sizes = [b.shape[0] for b in branges]
+    cache_key = tuple((bs, x.shape[1]) for bs, x in zip(batch_sizes, x_list))
+    if cache_key not in attn_bias_cache:
+        # One sequence-length entry per sample of every input tensor.
+        seqlens = [x.shape[1] for bs, x in zip(batch_sizes, x_list) for _ in range(bs)]
+        # NOTE(review): `fmha` is not imported in the visible part of this file
+        # (presumably xformers.ops.fmha - confirm).
+        bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        bias._batch_sizes = batch_sizes
+        attn_bias_cache[cache_key] = bias
+    if branges is None:
+        merged = torch.cat(tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list), dim=1)
+    else:
+        # NOTE(review): `index_select_cat` is likewise not defined in the visible file.
+        flat_inputs = [x.flatten(1) for x in x_list]
+        merged = index_select_cat(flat_inputs, branges).view(1, -1, x_list[0].shape[-1])
+    return attn_bias_cache[cache_key], merged
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> List[Tensor]:
+    """Stochastic-depth residual update for a list of tensors processed as one sequence.
+    Args:
+        x_list: Input tensors, each of shape (b_i, n_i, d).
+        residual_func: Called as ``residual_func(x_cat, attn_bias=attn_bias)`` on the
+            concatenated selection.
+        sample_drop_ratio: Fraction of rows of each tensor whose residual is skipped.
+        scaling_vector: Optional scaling forwarded to ``add_residual``.
+    Returns:
+        A list with one updated tensor per input, each shaped like its input.
+    """
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    # 4) add each residual back onto its own input and restore the input shape
+    return [
+        add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)
+        for x, brange, residual, residual_scale_factor in zip(
+            x_list, branges, residual_list, residual_scale_factors
+        )
+    ]
+class NestedTensorBlock(Block):
+    """Block that also accepts a list of tensors, processed as one concatenated sequence.
+    The list path is rejected by ``forward`` while ``XFORMERS_AVAILABLE`` is false.
+    """
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        # Imported here because the module-level imports only bring in ``Attention``;
+        # without it the assertion below raises NameError instead of checking the type.
+        from .attention import MemEffAttention
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            # LayerScale is not applied inside these residual functions; its gamma is
+            # handed over as ``scaling_vector`` instead.
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=(self.ls1.gamma if isinstance(self.ls1, LayerScale) else None),
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                # Check the module whose gamma is actually read (was self.ls1).
+                scaling_vector=(self.ls2.gamma if isinstance(self.ls2, LayerScale) else None),
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list):
+        """Dispatch on input type: a Tensor uses ``Block.forward``, a list uses ``forward_nested``.
+        Raises:
+            AssertionError: For list input while xFormers is unavailable, or for any other type.
+        """
+        if isinstance(x_or_x_list, Tensor):
+            return super().forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            if not XFORMERS_AVAILABLE:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError

argus/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    """Randomly zero whole samples of ``x`` (stochastic depth) and rescale the survivors.
+    Args:
+        x: Input tensor; the first dimension is treated as the sample dimension.
+        drop_prob: Probability of dropping a sample. ``None`` is treated like ``0.0``.
+        training: Dropping is only applied when this is true.
+    Returns:
+        ``x`` itself when nothing is dropped, otherwise a new tensor of the same shape.
+    """
+    # ``not drop_prob`` covers both 0.0 and None (the default of DropPath), which would
+    # otherwise fail at ``1 - drop_prob`` in training mode.
+    if not drop_prob or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    # Skip the rescale when every sample is dropped to avoid dividing by zero.
+    if keep_prob > 0.0:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+class DropPath(nn.Module):
+    """Module wrapper around ``drop_path``: per-sample stochastic depth for residual branches.
+    The stored ``drop_prob`` and the module's ``training`` flag are passed to ``drop_path``.
+    """
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        is_training = self.training
+        return drop_path(x, drop_prob=self.drop_prob, training=is_training)

argus/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,22 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import Tensor
+from torch import nn
+class LayerScale(nn.Module):
+    """Learnable per-channel scale applied to the last dimension of the input.
+    Args:
+        dim: Number of channels (length of the learnable ``gamma`` vector).
+        init_values: Initial value of every ``gamma`` entry.
+        inplace: Multiply the input in place instead of returning a new tensor.
+    """
+    def __init__(self, dim: int, init_values: Union[float, Tensor] = 1e-5, inplace: bool = False) -> None:
+        super().__init__()
+        self.inplace = inplace
+        initial_gamma = init_values * torch.ones(dim)
+        self.gamma = nn.Parameter(initial_gamma)
+    def forward(self, x: Tensor) -> Tensor:
+        if self.inplace:
+            return x.mul_(self.gamma)
+        return x * self.gamma

argus/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, Optional
+from torch import Tensor, nn
+class Mlp(nn.Module):
+    """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.
+    Args:
+        in_features: Input width.
+        hidden_features: Hidden width; falls back to ``in_features`` when not given.
+        out_features: Output width; falls back to ``in_features`` when not given.
+        act_layer: Activation factory, called without arguments.
+        drop: Dropout probability (the same dropout module is used after both layers).
+        bias: Whether both linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: Tensor) -> Tensor:
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))

argus/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, Optional, Tuple, Union
+from torch import Tensor
+import torch.nn as nn
+def make_2tuple(x):
+    """Return ``x`` as a pair: a 2-tuple passes through unchanged, an int is duplicated.
+    Raises:
+        AssertionError: If ``x`` is a tuple of another length, or neither a tuple nor an int.
+    """
+    if not isinstance(x, tuple):
+        assert isinstance(x, int)
+        return (x, x)
+    assert len(x) == 2
+    return x
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    Args:
+        img_size: Image size.
+        patch_size: Patch token size.
+        in_chans: Number of input image channels.
+        embed_dim: Number of linear projection output channels.
+        norm_layer: Normalization layer.
+        flatten_embedding: When false, the output is reshaped to (B, H', W', D) instead of (B, N, D).
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (image_HW[0] // patch_HW[0], image_HW[1] // patch_HW[1])
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        # Non-overlapping patches: kernel size and stride both equal the patch size.
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        """Embed an image batch; height and width must be multiples of the patch size."""
+        _, _, H, W = x.shape
+        patch_H, patch_W = self.patch_size
+        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+    def flops(self) -> float:
+        """Return a multiply count for the projection, plus one pass per element for a real norm layer."""
+        Ho, Wo = self.patches_resolution
+        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+        # ``self.norm`` is nn.Identity (never None) when no norm_layer was given, so the
+        # normalisation cost is only counted for an actual norm layer.
+        if self.norm is not None and not isinstance(self.norm, nn.Identity):
+            flops += Ho * Wo * self.embed_dim
+        return flops

argus/layers/rope.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# Implementation of 2D Rotary Position Embeddings (RoPE).
+# This module provides a clean implementation of 2D Rotary Position Embeddings,
+# which extends the original RoPE concept to handle 2D spatial positions.
+# Inspired by:
+#         https://github.com/meta-llama/codellama/blob/main/llama/model.py
+#         https://github.com/naver-ai/rope-vit
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple
+class PositionGetter:
+    """Generates and caches 2D spatial positions for patches in a grid.
+    This class manages the generation of spatial coordinates for patches in a 2D grid,
+    caching results to avoid redundant computations.
+    Attributes:
+        position_cache: Dictionary storing precomputed position tensors, keyed by
+            ``(height, width, str(device))``.
+    """
+    def __init__(self):
+        """Initializes the position generator with an empty cache."""
+        self.position_cache: Dict[Tuple, torch.Tensor] = {}
+    def __call__(self, batch_size: int, height: int, width: int, device: torch.device) -> torch.Tensor:
+        """Generates spatial positions for a batch of patches.
+        Args:
+            batch_size: Number of samples in the batch.
+            height: Height of the grid in patches.
+            width: Width of the grid in patches.
+            device: Target device for the position tensor.
+        Returns:
+            Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
+            for each position in the grid, repeated for each batch item.
+        """
+        # The device is part of the key: a grid cached for one device must not be
+        # handed back when positions are requested on another device.
+        cache_key = (height, width, str(device))
+        if cache_key not in self.position_cache:
+            y_coords = torch.arange(height, device=device)
+            x_coords = torch.arange(width, device=device)
+            positions = torch.cartesian_prod(y_coords, x_coords)
+            self.position_cache[cache_key] = positions
+        cached_positions = self.position_cache[cache_key]
+        # clone() gives the caller an independent tensor rather than a view of the cache.
+        return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
+class RotaryPositionEmbedding2D(nn.Module):
+    """2D Rotary Position Embedding implementation.
+    This module applies rotary position embeddings to input tokens based on their
+    2D spatial positions. It handles the position-dependent rotation of features
+    separately for vertical and horizontal dimensions.
+    Args:
+        frequency: Base frequency for the position embeddings. Default: 100.0
+        scaling_factor: Scaling factor for frequency computation. Default: 1.0
+    Attributes:
+        base_frequency: Base frequency for computing position embeddings.
+        scaling_factor: Stored scaling factor (not read by the code in this class).
+        frequency_cache: Cache for storing precomputed frequency components.
+    """
+    def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0):
+        """Initializes the 2D RoPE module."""
+        super().__init__()
+        self.base_frequency = frequency
+        self.scaling_factor = scaling_factor
+        self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}
+    def _compute_frequency_components(
+        self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Computes frequency components for rotary embeddings.
+        Args:
+            dim: Feature dimension (must be even).
+            seq_len: Maximum sequence length.
+            device: Target device for computations.
+            dtype: Data type for the computed tensors.
+        Returns:
+            Tuple of (cosine, sine) tensors for frequency components.
+        """
+        cache_key = (dim, seq_len, device, dtype)
+        if cache_key not in self.frequency_cache:
+            # Compute frequency bands
+            exponents = torch.arange(0, dim, 2, device=device).float() / dim
+            inv_freq = 1.0 / (self.base_frequency**exponents)
+            # Generate position-dependent frequencies
+            positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+            angles = torch.einsum("i,j->ij", positions, inv_freq)
+            # Compute and cache frequency components
+            angles = angles.to(dtype)
+            angles = torch.cat((angles, angles), dim=-1)
+            cos_components = angles.cos().to(dtype)
+            sin_components = angles.sin().to(dtype)
+            self.frequency_cache[cache_key] = (cos_components, sin_components)
+        return self.frequency_cache[cache_key]
+    @staticmethod
+    def _rotate_features(x: torch.Tensor) -> torch.Tensor:
+        """Performs feature rotation by splitting and recombining feature dimensions.
+        Args:
+            x: Input tensor to rotate.
+        Returns:
+            Rotated feature tensor.
+        """
+        feature_dim = x.shape[-1]
+        x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+    def _apply_1d_rope(
+        self, tokens: torch.Tensor, positions: torch.Tensor, cos_comp: torch.Tensor, sin_comp: torch.Tensor
+    ) -> torch.Tensor:
+        """Applies 1D rotary position embeddings along one dimension.
+        Args:
+            tokens: Input token features.
+            positions: Position indices.
+            cos_comp: Cosine components for rotation.
+            sin_comp: Sine components for rotation.
+        Returns:
+            Tokens with applied rotary position embeddings.
+        """
+        # Look up per-position components; the inserted axis broadcasts over heads.
+        cos = F.embedding(positions, cos_comp)[:, None, :, :]
+        sin = F.embedding(positions, sin_comp)[:, None, :, :]
+        # Apply rotation
+        return (tokens * cos) + (self._rotate_features(tokens) * sin)
+    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
+        """Applies 2D rotary position embeddings to input tokens.
+        Args:
+            tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim).
+                   The feature dimension (dim) must be divisible by 4.
+            positions: Position tensor of shape (batch_size, n_tokens, 2) containing
+                      the y and x coordinates for each token.
+        Returns:
+            Tensor of same shape as input with applied 2D rotary position embeddings.
+        Raises:
+            AssertionError: If input dimensions are invalid or positions are malformed.
+        """
+        # Validate inputs. The features are halved once per spatial axis and once more
+        # for the rotation pairs, so the width has to be a multiple of 4; a width with
+        # dim % 4 == 2 would otherwise fail later with a shape mismatch.
+        assert tokens.size(-1) % 4 == 0, "Feature dimension must be divisible by 4"
+        assert positions.ndim == 3 and positions.shape[-1] == 2, "Positions must have shape (batch_size, n_tokens, 2)"
+        # Compute feature dimension for each spatial direction
+        feature_dim = tokens.size(-1) // 2
+        # Get frequency components
+        max_position = int(positions.max()) + 1
+        cos_comp, sin_comp = self._compute_frequency_components(feature_dim, max_position, tokens.device, tokens.dtype)
+        # Split features for vertical and horizontal processing
+        vertical_features, horizontal_features = tokens.chunk(2, dim=-1)
+        # Apply RoPE separately for each dimension
+        vertical_features = self._apply_1d_rope(vertical_features, positions[..., 0], cos_comp, sin_comp)
+        horizontal_features = self._apply_1d_rope(horizontal_features, positions[..., 1], cos_comp, sin_comp)
+        # Combine processed features
+        return torch.cat((vertical_features, horizontal_features), dim=-1)

argus/layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import os
+from typing import Callable, Optional
+import warnings
+from torch import Tensor, nn
+import torch.nn.functional as F
+class SwiGLUFFN(nn.Module):
+    """SwiGLU feed-forward layer: ``w3(silu(a) * b)`` with ``a, b`` the two halves of ``w12(x)``.
+    Args:
+        in_features: Input width.
+        hidden_features: Width of each half of the gated projection; falls back to ``in_features``.
+        out_features: Output width; falls back to ``in_features``.
+        act_layer: Accepted for signature compatibility; not used here.
+        drop: Accepted for signature compatibility; not used here.
+        bias: Whether both linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        # A single projection produces both the gate and the value halves.
+        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+    def forward(self, x: Tensor) -> Tensor:
+        gate, value = self.w12(x).chunk(2, dim=-1)
+        return self.w3(F.silu(gate) * value)
+# True unless the XFORMERS_DISABLED environment variable is set (to any value).
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+# The optional xFormers import below is commented out, so `SwiGLU` is always the
+# pure-PyTorch SwiGLUFFN and XFORMERS_AVAILABLE is always False.
+# try:
+#     if XFORMERS_ENABLED:
+#         from xformers.ops import SwiGLU
+#         XFORMERS_AVAILABLE = True
+#         warnings.warn("xFormers is available (SwiGLU)")
+#     else:
+#         warnings.warn("xFormers is disabled (SwiGLU)")
+#         raise ImportError
+# except ImportError:
+SwiGLU = SwiGLUFFN
+XFORMERS_AVAILABLE = False
+# warnings.warn("xFormers is not available (SwiGLU)")
+class SwiGLUFFNFused(SwiGLU):
+    """SwiGLU layer whose hidden width is scaled by 2/3 and rounded up to a multiple of 8.
+    ``act_layer`` and ``drop`` are accepted but not forwarded to the base class.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        requested_hidden = hidden_features or in_features
+        # Scale by 2/3, then round up to the next multiple of 8.
+        rounded_hidden = (int(requested_hidden * 2 / 3) + 7) // 8 * 8
+        super().__init__(
+            in_features=in_features,
+            hidden_features=rounded_hidden,
+            out_features=out_features or in_features,
+            bias=bias,
+        )

argus/layers/vision_transformer.py ADDED Viewed

	@@ -0,0 +1,401 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from torch.nn.init import trunc_normal_
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFNFused
+from .attention import MemEffAttention
+from .block import NestedTensorBlock as Block
+logger = logging.getLogger("dinov2")
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+    """Call ``fn(module=..., name=...)`` on the descendants of ``module`` with dotted names.
+    Args:
+        fn: Callback invoked with keyword arguments ``module`` and ``name``.
+        module: Root of the traversal.
+        name: Dotted-name prefix for ``module``.
+        depth_first: Visit children before their parent when true, after it otherwise.
+        include_root: Whether ``fn`` is also called on ``module`` itself (children are
+            always visited).
+    Returns:
+        ``module``, unchanged by this function itself.
+    """
+    visit_before_children = include_root and not depth_first
+    visit_after_children = include_root and depth_first
+    if visit_before_children:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        qualified = f"{name}.{child_name}" if name else child_name
+        named_apply(fn=fn, module=child_module, name=qualified, depth_first=depth_first, include_root=True)
+    if visit_after_children:
+        fn(module=module, name=name)
+    return module
+class BlockChunk(nn.ModuleList):
+    """ModuleList that can be called like a module: applies its entries in order."""
+    def forward(self, x):
+        out = x
+        for layer in self:
+            out = layer(out)
+        return out
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        ffn_bias=True,
+        proj_bias=True,
+        drop_path_rate=0.0,
+        drop_path_uniform=False,
+        init_values=None,  # for layerscale: None or 0 => no layerscale
+        embed_layer=PatchEmbed,
+        act_layer=nn.GELU,
+        block_fn=Block,
+        ffn_layer="mlp",
+        block_chunks=1,
+        num_register_tokens=0,
+        interpolate_antialias=False,
+        interpolate_offset=0.1,
+        qk_norm=False,
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            proj_bias (bool): enable bias for proj in attn if True
+            ffn_bias (bool): enable bias for ffn if True
+            drop_path_rate (float): stochastic depth rate
+            drop_path_uniform (bool): apply uniform drop rate across blocks
+            weight_init (str): weight init scheme
+            init_values (float): layer-scale init values
+            embed_layer (nn.Module): patch embedding layer
+            act_layer (nn.Module): MLP activation layer
+            block_fn (nn.Module): transformer block class
+            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
+            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+        """
+        super().__init__()
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 1
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        self.interpolate_antialias = interpolate_antialias
+        self.interpolate_offset = interpolate_offset
+        self.use_reentrant = False # hardcoded to False
+        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        assert num_register_tokens >= 0
+        self.register_tokens = (
+            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+        )
+        if drop_path_uniform is True:
+            dpr = [drop_path_rate] * depth
+        else:
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        if ffn_layer == "mlp":
+            logger.info("using MLP layer as FFN")
+            ffn_layer = Mlp
+        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+            logger.info("using SwiGLU layer as FFN")
+            ffn_layer = SwiGLUFFNFused
+        elif ffn_layer == "identity":
+            logger.info("using Identity layer as FFN")
+            def f(*args, **kwargs):
+                return nn.Identity()
+            ffn_layer = f
+        else:
+            raise NotImplementedError
+        blocks_list = [
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                ffn_layer=ffn_layer,
+                init_values=init_values,
+                qk_norm=qk_norm,
+            )
+            for i in range(depth)
+        ]
+        if block_chunks > 0:
+            self.chunked_blocks = True
+            chunked_blocks = []
+            chunksize = depth // block_chunks
+            for i in range(0, depth, chunksize):
+                # this is to keep the block index consistent if we chunk the block list
+                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+        else:
+            self.chunked_blocks = False
+            self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Identity()
+        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+        self.init_weights()
+    def init_weights(self):
+        trunc_normal_(self.pos_embed, std=0.02)
+        nn.init.normal_(self.cls_token, std=1e-6)
+        if self.register_tokens is not None:
+            nn.init.normal_(self.register_tokens, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+    def interpolate_pos_encoding(self, x, w, h):
+        previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.pos_embed
+        pos_embed = self.pos_embed.float()
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_size
+        h0 = h // self.patch_size
+        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
+        assert N == M * M
+        kwargs = {}
+        if self.interpolate_offset:
+            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
+            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
+            sx = float(w0 + self.interpolate_offset) / M
+            sy = float(h0 + self.interpolate_offset) / M
+            kwargs["scale_factor"] = (sx, sy)
+        else:
+            # Simply specify an output size instead of a scale factor
+            kwargs["size"] = (w0, h0)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
+            mode="bicubic",
+            antialias=self.interpolate_antialias,
+            **kwargs,
+        )
+        assert (w0, h0) == patch_pos_embed.shape[-2:]
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+    def prepare_tokens_with_masks(self, x, masks=None):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, w, h)
+        if self.register_tokens is not None:
+            x = torch.cat((x[:, :1], self.register_tokens.expand(x.shape[0], -1, -1), x[:, 1:]), dim=1)
+        return x
+    def forward_features_list(self, x_list, masks_list):
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
+            else:
+                x = blk(x)
+        all_x = x
+        output = []
+        for x, masks in zip(all_x, masks_list):
+            x_norm = self.norm(x)
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm[:, 0],
+                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+    def forward_features(self, x, masks=None):
+        if isinstance(x, list):
+            return self.forward_features_list(x, masks)
+        x = self.prepare_tokens_with_masks(x, masks)
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
+            else:
+                x = blk(x)
+        x_norm = self.norm(x)
+        return {
+            "x_norm_clstoken": x_norm[:, 0],
+            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+            "x_prenorm": x,
+            "masks": masks,
+        }
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks. If it's a list, take them
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        if self.chunked_blocks:
+            outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        class_tokens = [out[:, 0] for out in outputs]
+        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+    def forward(self, *args, is_training=True, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+    """
+    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+    """
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1536,
+        depth=40,
+        num_heads=24,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model

argus/models/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright 2026 Realsee. All rights reserved.
2	+ # Licensed under the Apache License, Version 2.0.

argus/models/aggregator.py ADDED Viewed

	@@ -0,0 +1,502 @@

+import logging
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from typing import Optional, Tuple, Union, List, Dict, Any
+from argus.layers import Mlp
+from argus.layers import PatchEmbed
+from argus.layers.block import Block
+from argus.layers.rope import RotaryPositionEmbedding2D, PositionGetter
+from argus.layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
+from argus.heads.utils import reorder_by_reference
+logger = logging.getLogger(__name__)
+# Per-channel mean and std (3 values each). Aggregator registers them as buffers and
+# uses them to normalize input images before patch embedding.
+_RESNET_MEAN = [0.485, 0.456, 0.406]
+_RESNET_STD = [0.229, 0.224, 0.225]
+class Aggregator(nn.Module):
+    """
+    Args:
+        img_size (int): Image size in pixels.
+        patch_size (int): Size of each patch for PatchEmbed.
+        embed_dim (int): Dimension of the token embeddings.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
+        num_register_tokens (int): Number of register tokens.
+        block_fn (nn.Module): The block type used for attention (Block by default).
+        qkv_bias (bool): Whether to include bias in QKV projections.
+        proj_bias (bool): Whether to include bias in the output projection.
+        ffn_bias (bool): Whether to include bias in MLP layers.
+        patch_embed (str): Type of patch embed. e.g., "conv" or "dinov2_vitl14_reg".
+        aa_order (list[str]): The order of alternating attention, e.g. ["frame", "global"].
+        aa_block_size (int): How many blocks to group under each attention type before switching. If not necessary, set to 1.
+        qk_norm (bool): Whether to apply QK normalization.
+        rope_freq (int): Base frequency for rotary embedding. -1 to disable.
+        init_values (float): Init scale for layer scale.
+        reorder_by_learning_ref (bool): Whether to reorder features by learning reference view index.
+        ref_aa_block_num (int): Number of aa blocks for reference view learning.
+    """
+    def __init__(
+        self,
+        img_size=518,
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        num_register_tokens=4,
+        block_fn=Block,
+        qkv_bias=True,
+        proj_bias=True,
+        ffn_bias=True,
+        patch_embed="dinov2_vitl14_reg",
+        aa_order=["frame", "global"],
+        aa_block_size=1,
+        qk_norm=True,
+        rope_freq=100,
+        init_values=0.01,
+        reorder_by_learning_ref=True,
+        ref_aa_block_num=2,
+        save_inference_memory=True,
+    ):
+        super().__init__()
+        self.reorder_by_learning_ref = reorder_by_learning_ref
+        self.save_inference_memory = save_inference_memory
+        self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)
+        # Initialize rotary position embedding if frequency > 0
+        self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
+        self.position_getter = PositionGetter() if self.rope is not None else None
+        self.frame_blocks = nn.ModuleList(
+            [
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    proj_bias=proj_bias,
+                    ffn_bias=ffn_bias,
+                    init_values=init_values,
+                    qk_norm=qk_norm,
+                    rope=self.rope,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.global_blocks = nn.ModuleList(
+            [
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    proj_bias=proj_bias,
+                    ffn_bias=ffn_bias,
+                    init_values=init_values,
+                    qk_norm=qk_norm,
+                    rope=self.rope,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.depth = depth
+        self.aa_order = aa_order
+        self.patch_size = patch_size
+        self.aa_block_size = aa_block_size
+        # Validate that depth is divisible by aa_block_size
+        if self.depth % self.aa_block_size != 0:
+            raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")
+        self.aa_block_num = self.depth // self.aa_block_size
+        # Reference Learning Network
+        if self.reorder_by_learning_ref:
+            self.ref_aa_block_num = ref_aa_block_num
+            self.ref_frame_blocks = nn.ModuleList(
+                [
+                    block_fn(
+                        dim=embed_dim,
+                        num_heads=num_heads,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        proj_bias=proj_bias,
+                        ffn_bias=ffn_bias,
+                        init_values=init_values,
+                        qk_norm=qk_norm,
+                        rope=self.rope,
+                    )
+                    for _ in range(self.ref_aa_block_num)
+                ]
+            )
+            self.ref_global_blocks = nn.ModuleList(
+                [
+                    block_fn(
+                        dim=embed_dim,
+                        num_heads=num_heads,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        proj_bias=proj_bias,
+                        ffn_bias=ffn_bias,
+                        init_values=init_values,
+                        qk_norm=qk_norm,
+                        rope=self.rope,
+                    )
+                    for _ in range(self.ref_aa_block_num)
+                ]
+            )
+        # Note: We have two camera tokens, one for the first frame and one for the rest
+        # The same applies for register tokens
+        self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
+        self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))
+        if self.reorder_by_learning_ref:
+            # describe the covisibility of the current frame with other frames
+            self.covisibility_token = nn.Parameter(torch.randn(1, 1, 1, embed_dim))
+        # The patch tokens start after the camera and register tokens
+        self.patch_start_idx = 1 + num_register_tokens
+        # Initialize parameters with small values
+        nn.init.normal_(self.camera_token, std=1e-6)
+        nn.init.normal_(self.register_token, std=1e-6)
+        if self.reorder_by_learning_ref:
+            nn.init.normal_(self.covisibility_token, std=1e-6)
+        # Register normalization constants as buffers
+        for name, value in (("_resnet_mean", _RESNET_MEAN), ("_resnet_std", _RESNET_STD)):
+            self.register_buffer(name, torch.FloatTensor(value).view(1, 1, 3, 1, 1), persistent=False)
+        self.use_reentrant = False # hardcoded to False
+    def __build_patch_embed__(
+        self,
+        patch_embed,
+        img_size,
+        patch_size,
+        num_register_tokens,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        block_chunks=0,
+        init_values=1.0,
+        embed_dim=1024,
+    ):
+        """
+        Build the patch embed layer. If 'conv', we use a
+        simple PatchEmbed conv layer. Otherwise, we use a vision transformer.
+        """
+        if "conv" in patch_embed:
+            self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=3, embed_dim=embed_dim)
+        else:
+            vit_models = {
+                "dinov2_vitl14_reg": vit_large,
+                "dinov2_vitb14_reg": vit_base,
+                "dinov2_vits14_reg": vit_small,
+                "dinov2_vitg2_reg": vit_giant2,
+            }
+            self.patch_embed = vit_models[patch_embed](
+                img_size=img_size,
+                patch_size=patch_size,
+                num_register_tokens=num_register_tokens,
+                interpolate_antialias=interpolate_antialias,
+                interpolate_offset=interpolate_offset,
+                block_chunks=block_chunks,
+                init_values=init_values,
+            )
+            # Disable gradient updates for mask token
+            if hasattr(self.patch_embed, "mask_token"):
+                # self.patch_embed.mask_token.requires_grad_(False)
+                del self.patch_embed.mask_token
+        # covisibility head
+        if self.reorder_by_learning_ref:
+            self.token_norm = nn.LayerNorm(embed_dim * 2)
+            self.covisibility_head = Mlp(in_features=embed_dim * 2, hidden_features=embed_dim * 2 // 2, out_features=1, drop=0)
+    def forward(self, images: torch.Tensor) -> Tuple[List[torch.Tensor], int]:
+        """
+        Args:
+            images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
+                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width
+        Returns:
+            (list[torch.Tensor], int):
+                The list of outputs from the attention blocks,
+                and the patch_start_idx indicating where patch tokens begin.
+        """
+        B, S, C_in, H, W = images.shape
+        if C_in != 3:
+            raise ValueError(f"Expected 3 input channels, got {C_in}")
+        # Normalize images and reshape for patch embed
+        images = (images - self._resnet_mean) / self._resnet_std
+        # Reshape to [B*S, C, H, W] for patch embedding
+        images = images.view(B * S, C_in, H, W)
+        patch_tokens = self.patch_embed(images)
+        if isinstance(patch_tokens, dict):
+            patch_tokens = patch_tokens["x_norm_patchtokens"]
+        _, P, C = patch_tokens.shape
+        ################# ref learning
+        covisibility_scores = None
+        ref_idx = None
+        if self.reorder_by_learning_ref:
+            # expand covisibility token to match batch size and sequence length
+            covisibility_token = self.covisibility_token.expand(B, S, 1, C).view(B * S, 1, C).contiguous()
+            # Concatenate covisibility token with patch tokens
+            covisibility_patch_tokens = torch.cat([covisibility_token, patch_tokens], dim=1) # [BS,1+HW,C]
+            covisibility_pos = None
+            if self.rope is not None:
+                covisibility_pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)
+            # do not use position embedding for special covisibility_token
+            # so set pos to 0 for the special tokens
+            covisibility_pos = covisibility_pos + 1
+            covisibility_pos_special = torch.zeros(B * S, 1, 2).to(images.device).to(covisibility_pos.dtype)
+            covisibility_pos = torch.cat([covisibility_pos_special, covisibility_pos], dim=1) # [BS, 1+HW, 2]
+            # update P because we added special tokens
+            _, P_covis, C_covis = covisibility_patch_tokens.shape
+            frame_idx = 0
+            global_idx = 0
+            output_list = []
+            for ref_block_i in range(self.ref_aa_block_num):
+                for attn_type in self.aa_order:
+                    if attn_type == "frame":
+                        covisibility_patch_tokens, frame_idx, frame_intermediates = self._ref_process_frame_attention(
+                            covisibility_patch_tokens, B, S, P_covis, C_covis, frame_idx, pos=covisibility_pos
+                        )
+                    elif attn_type == "global":
+                        covisibility_patch_tokens, global_idx, global_intermediates = self._ref_process_global_attention(
+                            covisibility_patch_tokens, B, S, P_covis, C_covis, global_idx, pos=covisibility_pos
+                        )
+                    else:
+                        raise ValueError(f"Unknown attention type: {attn_type}")
+                for i in range(len(frame_intermediates)):
+                    # concat frame and global intermediates, [B x S x P x 2C]
+                    concat_inter = torch.cat([frame_intermediates[-1], global_intermediates[-1]], dim=-1)
+                    output_list.append(concat_inter)
+            last_covisibility_patch_tokens = output_list[-1][:,:,0,:] # [B, S, C]
+            # normalize
+            last_covisibility_patch_tokens = self.token_norm(last_covisibility_patch_tokens)
+            covisibility_scores = self.covisibility_head(last_covisibility_patch_tokens).squeeze(-1) # [B, S]
+            # # cos
+            # feat_norm = F.normalize(covisibility_features, p=2, dim=-1, eps=1e-8)  # [B, S, D]
+            # covisibility_scores = feat_norm @ feat_norm.transpose(-1, -2)
+            ref_idx = covisibility_scores.argmax(-1) # [B, S] -> [B]
+            patch_tokens = patch_tokens.view(B,S,P,C)
+            patch_tokens = reorder_by_reference(patch_tokens, ref_idx)
+            patch_tokens = patch_tokens.view(B*S,P,C).contiguous()
+        ####################
+        # Expand camera and register tokens to match batch size and sequence length
+        camera_token = slice_expand_and_flatten(self.camera_token, B, S)
+        register_token = slice_expand_and_flatten(self.register_token, B, S)
+        # Concatenate special tokens with patch tokens
+        tokens = torch.cat([camera_token, register_token, patch_tokens], dim=1) # [BS,1+4+HW,C]
+        pos = None
+        if self.rope is not None:
+            pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)
+        if self.patch_start_idx > 0:
+            # do not use position embedding for special tokens (camera and register tokens)
+            # so set pos to 0 for the special tokens
+            pos = pos + 1
+            pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(images.device).to(pos.dtype)
+            pos = torch.cat([pos_special, pos], dim=1) # [BS, 1+4+HW, 2]
+        # update P because we added special tokens
+        _, P, C = tokens.shape
+        frame_idx = 0
+        global_idx = 0
+        output_list = []
+        for block_i in range(self.aa_block_num):
+            for attn_type in self.aa_order:
+                if attn_type == "frame":
+                    tokens, frame_idx, frame_intermediates = self._process_frame_attention(
+                        tokens, B, S, P, C, frame_idx, pos=pos
+                    )
+                elif attn_type == "global":
+                    tokens, global_idx, global_intermediates = self._process_global_attention(
+                        tokens, B, S, P, C, global_idx, pos=pos
+                    )
+                else:
+                    raise ValueError(f"Unknown attention type: {attn_type}")
+            for i in range(len(frame_intermediates)):
+                concat_inter = torch.cat([frame_intermediates[i], global_intermediates[i]], dim=-1)
+                if (not self.training ) and (self.save_inference_memory) and (block_i not in [4,11,17,23]):
+                    # only save the useful indices of intermediates
+                        output_list.append(torch.tensor(0))
+                else:
+                    # concat frame and global intermediates, [B x S x P x 2C]
+                    output_list.append(concat_inter)
+        del concat_inter
+        del frame_intermediates
+        del global_intermediates
+        return output_list, self.patch_start_idx, covisibility_scores, ref_idx
+    def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
+        """
+        Process frame attention blocks. We keep tokens in shape (B*S, P, C).
+        """
+        # If needed, reshape tokens or positions:
+        if tokens.shape != (B * S, P, C):
+            tokens = tokens.view(B, S, P, C).view(B * S, P, C)
+        if pos is not None and pos.shape != (B * S, P, 2):
+            pos = pos.view(B, S, P, 2).view(B * S, P, 2)
+        intermediates = []
+        # by default, self.aa_block_size=1, which processes one block at a time
+        for _ in range(self.aa_block_size):
+            if self.training:
+                tokens = checkpoint(self.frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
+            else:
+                tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
+            frame_idx += 1
+            intermediates.append(tokens.view(B, S, P, C))
+        return tokens, frame_idx, intermediates
+    def _process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None):
+        """
+        Process global attention blocks. We keep tokens in shape (B, S*P, C).
+        """
+        if tokens.shape != (B, S * P, C):
+            tokens = tokens.view(B, S, P, C).view(B, S * P, C)
+        if pos is not None and pos.shape != (B, S * P, 2):
+            pos = pos.view(B, S, P, 2).view(B, S * P, 2)
+        intermediates = []
+        # by default, self.aa_block_size=1, which processes one block at a time
+        for _ in range(self.aa_block_size):
+            if self.training:
+                tokens = checkpoint(self.global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
+            else:
+                tokens = self.global_blocks[global_idx](tokens, pos=pos)
+            global_idx += 1
+            intermediates.append(tokens.view(B, S, P, C))
+        return tokens, global_idx, intermediates
+    def _ref_process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
+        """
+        Process frame attention blocks. We keep tokens in shape (B*S, P, C).
+        """
+        # If needed, reshape tokens or positions:
+        if tokens.shape != (B * S, P, C):
+            tokens = tokens.view(B, S, P, C).view(B * S, P, C)
+        if pos is not None and pos.shape != (B * S, P, 2):
+            pos = pos.view(B, S, P, 2).view(B * S, P, 2)
+        intermediates = []
+        # by default, self.aa_block_size=1, which processes one block at a time
+        for _ in range(self.aa_block_size):
+            if self.training:
+                tokens = checkpoint(self.ref_frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
+            else:
+                tokens = self.ref_frame_blocks[frame_idx](tokens, pos=pos)
+            frame_idx += 1
+            intermediates.append(tokens.view(B, S, P, C))
+        return tokens, frame_idx, intermediates
+    def _ref_process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None):
+        """
+        Process global attention blocks. We keep tokens in shape (B, S*P, C).
+        """
+        if tokens.shape != (B, S * P, C):
+            tokens = tokens.view(B, S, P, C).view(B, S * P, C)
+        if pos is not None and pos.shape != (B, S * P, 2):
+            pos = pos.view(B, S, P, 2).view(B, S * P, 2)
+        intermediates = []
+        # by default, self.aa_block_size=1, which processes one block at a time
+        for _ in range(self.aa_block_size):
+            if self.training:
+                tokens = checkpoint(self.ref_global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
+            else:
+                tokens = self.ref_global_blocks[global_idx](tokens, pos=pos)
+            global_idx += 1
+            intermediates.append(tokens.view(B, S, P, C))
+        return tokens, global_idx, intermediates
+def slice_expand_and_flatten(token_tensor, B, S):
+    """
+    Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
+    1) Uses the first position (index=0) for the first frame only
+    2) Uses the second position (index=1) for all remaining frames (S-1 frames)
+    3) Expands both to match batch size B
+    4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
+       followed by (S-1) second-position tokens
+    5) Flattens to (B*S, X, C) for processing
+    Returns:
+        torch.Tensor: Processed tokens with shape (B*S, X, C)
+    """
+    # Slice out the "query" tokens => shape (1, 1, ...)
+    query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
+    # Slice out the "other" tokens => shape (1, S-1, ...)
+    others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
+    # Concatenate => shape (B, S, ...)
+    combined = torch.cat([query, others], dim=1)
+    # Finally flatten => shape (B*S, ...)
+    combined = combined.view(B * S, *combined.shape[2:])
+    return combined

argus/models/argus.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import torch
+import torch.nn as nn
+from typing import Optional, Dict
+from huggingface_hub import PyTorchModelHubMixin
+# Import model components
+from argus.models.aggregator import Aggregator
+from argus.heads.camera_head import CameraHead
+from argus.heads.dpt_head import DPTHead
+from argus.heads.utils import reorder_by_reference
class Argus(nn.Module, PyTorchModelHubMixin):
    """
    Multi-task panoramic model: one shared ``Aggregator`` backbone feeding optional heads.

    Every head consumes the aggregated token list produced by the backbone. A head is
    constructed only when its flag is set; otherwise the attribute is ``None`` and the
    corresponding outputs are absent from the prediction dictionary.

    Heads:
    - ``camera_head`` (``CameraHead``): camera pose encoding
    - ``depth_head`` (``DPTHead``, 2 output channels, "exp" activation): depth + confidence
    - ``cam_point_head`` / ``rotated_point_head`` / ``point_head``
      (``DPTHead``, 4 output channels, "inv_log" activation): 3D points + confidence

    Args:
        img_size: Input image size forwarded to the backbone (default: 518)
        patch_size: Patch size forwarded to the backbone (default: 14)
        embed_dim: Backbone embedding dimension; heads receive ``2 * embed_dim`` (default: 1024)
        enable_camera: Build the camera pose head (default: True)
        enable_depth: Build the depth head (default: True)
        enable_cam_point: Build the camera-frame point head (default: False)
        enable_rotated_point: Build the rotated point head (default: False)
        enable_point: Build the world point head (default: False; the original author
            asks that it is not enabled during training)
        reorder_by_learning_ref: Stored on the model and forwarded to the backbone (default: True)
        restore_metric_scale: In eval mode, multiply pose translation, depth and point
            outputs by 10.0 before returning them (default: False)
    """

    def __init__(
        self,
        img_size: int = 518,
        patch_size: int = 14,
        embed_dim: int = 1024,
        enable_camera: bool = True,
        enable_depth: bool = True,
        enable_cam_point: bool = False,
        enable_rotated_point: bool = False,
        enable_point: bool = False,
        reorder_by_learning_ref: bool = True,
        restore_metric_scale: bool = False
    ) -> None:
        super().__init__()

        # Inference-time switches
        self.restore_metric_scale = restore_metric_scale
        self.reorder_by_learning_ref = reorder_by_learning_ref

        # Shared backbone
        self.aggregator = Aggregator(
            img_size=img_size,
            patch_size=patch_size,
            embed_dim=embed_dim,
            reorder_by_learning_ref=reorder_by_learning_ref,
        )

        # All heads take features of width 2 * embed_dim. They are built eagerly here,
        # in a fixed order, and left as None when disabled.
        head_dim = 2 * embed_dim
        self.camera_head: Optional[CameraHead] = None
        if enable_camera:
            self.camera_head = CameraHead(dim_in=head_dim)

        self.depth_head: Optional[DPTHead] = None
        if enable_depth:
            self.depth_head = DPTHead(
                dim_in=head_dim,
                output_dim=2,
                activation="exp",
                conf_activation="expp1",
            )

        # The three point heads share one configuration and differ only in what
        # their output is taken to mean.
        point_head_kwargs = dict(
            dim_in=head_dim,
            output_dim=4,
            activation="inv_log",
            conf_activation="expp1",
        )
        self.cam_point_head: Optional[DPTHead] = DPTHead(**point_head_kwargs) if enable_cam_point else None
        self.rotated_point_head: Optional[DPTHead] = DPTHead(**point_head_kwargs) if enable_rotated_point else None
        self.point_head: Optional[DPTHead] = DPTHead(**point_head_kwargs) if enable_point else None

    def _dense_heads(self):
        """Return ``(head, prediction_key, confidence_key)`` for each DPT head, in output order."""
        return (
            (self.depth_head, "depth", "depth_conf"),
            (self.cam_point_head, "cam_points", "cam_points_conf"),
            (self.rotated_point_head, "rotated_points", "rotated_points_conf"),
            (self.point_head, "world_points", "world_points_conf"),
        )

    def forward(
        self,
        images: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """
        Run the backbone and every enabled head on a sequence (or batch of sequences) of images.

        Args:
            images: RGB images shaped [S, 3, H, W] or [B, S, 3, H, W]; a missing batch
                axis is added automatically. The original author documents values in [0, 1].
        Returns:
            Dictionary that may contain (shapes as documented by the original author;
            the heads themselves are defined elsewhere):
                - covisibility_scores, ref_idx: backbone outputs, when not None
                - pose_enc [B, S, 9], pose_enc_list: camera head (final / all iterations)
                - depth [B, S, H, W, 1], depth_conf [B, S, H, W]: depth head
                - cam_points / rotated_points / world_points [B, S, H, W, 3] with the
                  matching ``*_conf`` [B, S, H, W]: point heads
                - images: the input images, eval mode only, passed through
                  ``reorder_by_reference`` when ``ref_idx`` is present
        """
        # [S, 3, H, W] -> [1, S, 3, H, W]
        if images.dim() == 4:
            images = images.unsqueeze(0)

        (
            aggregated_tokens_list,
            patch_start_idx,
            covisibility_scores,
            ref_idx,
        ) = self.aggregator(images)

        predictions: Dict[str, torch.Tensor] = {}

        # Heads run with CUDA autocast disabled.
        with torch.amp.autocast("cuda", enabled=False):
            if covisibility_scores is not None:
                predictions["covisibility_scores"] = covisibility_scores
            if ref_idx is not None:
                predictions["ref_idx"] = ref_idx

            if self.camera_head is not None:
                pose_enc_list = self.camera_head(aggregated_tokens_list)
                predictions["pose_enc"] = pose_enc_list[-1]   # final iteration
                predictions["pose_enc_list"] = pose_enc_list  # multi-layer supervision

            for head, pred_key, conf_key in self._dense_heads():
                if head is None:
                    continue
                pred, conf = head(
                    aggregated_tokens_list,
                    images=images,
                    patch_start_idx=patch_start_idx,
                )
                predictions[pred_key] = pred
                predictions[conf_key] = conf

        if self.training:
            return predictions

        # Eval only: expose the input images, reordered by the reference indices when available.
        predictions["images"] = images
        if "ref_idx" in predictions:
            frame_order = predictions["ref_idx"].detach()
            predictions["images"] = reorder_by_reference(predictions["images"], frame_order)

        if self.restore_metric_scale:
            # NOTE(review): 10.0 matches the "abs" metric_scale divisor in
            # argus/utils/normalization.py — presumably this undoes it; confirm.
            abs_scale = 10.0
            if self.camera_head is not None:
                # In place: only the translation part of the pose encoding is scaled.
                predictions["pose_enc"][..., :3] *= abs_scale
            for head, pred_key, _ in self._dense_heads():
                if head is not None:
                    predictions[pred_key] *= abs_scale

        return predictions

argus/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright 2026 Realsee. All rights reserved.
2	+ # Licensed under the Apache License, Version 2.0.

argus/utils/data_io.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# Copyright 2026 Realsee. All rights reserved.
+# Licensed under the Apache License, Version 2.0.
+"""
+Shared I/O and preprocessing utilities for panoramic image data.
+These functions are used by both evaluation and training pipelines.
+"""
+import os
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2
+import numpy as np
def read_image_cv2_360(path: str, rgb: bool = True, shape=(560, 280)) -> np.ndarray:
    """Read and resize a 360 panorama image.
    Args:
        path: Path to the image file.
        rgb: If True, convert OpenCV's BGR channel order to RGB (default: True).
            If False, the image is returned in BGR order as loaded.
        shape: Target (width, height) tuple.
    Returns:
        Image as numpy array with shape (H, W, 3).
    Raises:
        OSError: If the file is missing or cannot be decoded.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Failed to read image: {path}")
    if rgb:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Only the width is compared; an image that already has the target width is
    # returned unchanged whatever its height.
    if img.shape[1] != shape[0]:
        img = cv2.resize(img, shape, interpolation=cv2.INTER_AREA)
    return img
def read_depth_360(path: str, depth_scale=5000.0, shape=(560, 280)) -> np.ndarray:
    """Read and normalize a 360 depth map.
    Args:
        path: Path to the depth image file.
        depth_scale: Divisor applied to the raw stored values (documented by the
            original author as converting raw depth to meters).
        shape: Target (width, height) tuple.
    Returns:
        Depth map as float32 numpy array with shape (H, W).
    Raises:
        OSError: If the file is missing or cannot be decoded.
    """
    # IMREAD_UNCHANGED keeps the stored bit depth and channel layout.
    d = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    # cv2.imread signals failure by returning None instead of raising.
    if d is None:
        raise OSError(f"Failed to read depth map: {path}")
    # Only the width is compared. Nearest-neighbour resizing avoids blending
    # depth values across object boundaries.
    if d.shape[1] != shape[0]:
        d = cv2.resize(d, shape, interpolation=cv2.INTER_NEAREST)
    d = d.astype(np.float32) / depth_scale
    return d
def random_rotate_theta(W=560, max_shift_percent=0.5):
    """Draw a random yaw angle that corresponds to a whole-pixel horizontal shift.
    Args:
        W: Panorama width in pixels.
        max_shift_percent: Largest shift, as a fraction of the width, in either direction.
    Returns:
        Rotation angle in radians, equal to ``shift * 2*pi / W`` for an integer
        ``shift`` drawn uniformly from ``[-max_shift, max_shift]``.
    """
    half_range = int(W * max_shift_percent)
    # randint excludes the upper bound, hence +1 to keep the range symmetric.
    offset = np.random.randint(-half_range, half_range + 1)
    # A full panorama width corresponds to a full turn.
    return (offset * 2 * np.pi) / W
def rotate_y(theta):
    """Build the 3x3 matrix for a rotation about the Y-axis.
    Args:
        theta: Rotation angle in radians.
    Returns:
        float64 numpy array ``[[c, 0, -s], [0, 1, 0], [s, 0, c]]`` with
        ``c = cos(theta)`` and ``s = sin(theta)``.
    """
    c = np.cos(theta)
    s = np.sin(theta)
    rows = [
        [c, 0, -s],
        [0, 1, 0],
        [s, 0, c],
    ]
    return np.array(rows, dtype=np.float64)
def pano_depth_to_points(depth_map, pano_shape=(560, 280), crop=True, crop_ratio=0.15):
    """Convert a panorama depth map to 3D point cloud.
    Each pixel is mapped to a latitude/longitude on the full (uncropped) panorama
    and pushed out along that ray by its depth value.
    Args:
        depth_map: 2D depth map (H, W) or flattened array, in row-major order.
        pano_shape: Original panorama (width, height) tuple.
        crop: Whether the depth map has been vertically cropped.
        crop_ratio: Crop ratio applied to top and bottom.
    Returns:
        Point cloud as numpy array with shape (N, 3), columns (x, y, z).
    """
    w, h = pano_shape
    # Rows of the full panorama that the depth map covers.
    if crop:
        rows = np.arange(int(crop_ratio * h), int((1 - crop_ratio) * h))
    else:
        rows = np.arange(0, int(h))
    # Derive the column and row index vectors from the same row range so they
    # always have equal length; computing the row count a second time from
    # h * (1 - 2 * crop_ratio) can disagree after integer truncation.
    px = np.tile(np.arange(w), rows.size)
    py = rows.repeat(w)
    dist = depth_map.reshape(-1)
    # Image centre maps to latitude 0 / longitude 0.
    lat = (py / h - 0.5) * np.pi
    long = (px / w - 0.5) * np.pi * 2.0
    y = dist * np.sin(lat)
    tmp = dist * np.cos(lat)  # length of the ray's projection onto the x-z plane
    x = tmp * np.sin(long)
    z = tmp * np.cos(long)
    point_map = np.stack((x, y, z), axis=-1)
    return point_map  # (N, 3)
def crop_panorama(pano, crop_ratio=0.15):
    """Remove an equal band of rows from the top and the bottom of a panorama.
    Args:
        pano: Input panorama array with shape (H, W, ...).
        crop_ratio: Fraction of the height removed at each end.
    Returns:
        Rows ``int(crop_ratio * H)`` up to ``H - int(crop_ratio * H)`` of the input.
    """
    height, _ = pano.shape[:2]
    margin = int(crop_ratio * height)
    return pano[margin:height - margin, ...]
def rotate_panorama(panorama, theta):
    """Horizontally rotate a panorama by shifting pixels.
    The angle is converted to the nearest whole number of columns
    (``theta / (2*pi) * W``) and the image is rolled by that amount along axis 1.
    Args:
        panorama: Input panorama array with shape (H, W, ...).
        theta: Rotation angle in radians.
    Returns:
        Shifted panorama.
    """
    _, W = panorama.shape[:2]
    # Round to nearest rather than truncate: an angle built as k * 2*pi / W can
    # come back as k - epsilon after the float round trip, and truncation would
    # then shift by k - 1 columns.
    shift_pixels = int(np.rint((theta * W) / (2 * np.pi)))
    shifted = np.roll(panorama, shift_pixels, axis=1)
    return shifted

argus/utils/geometry.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import torch
+import numpy as np
def closed_form_inverse_se3(se3, R=None, T=None):
    """
    Invert a batch of rigid transforms using the closed form [R^T | -R^T t].

    Works on NumPy arrays and torch tensors; the branch is chosen from the type of `se3`.
    `R` and `T`, when given, are used instead of the blocks sliced from `se3` and must
    describe the same transforms.
    Args:
        se3: Nx4x4 or Nx3x4 array or tensor of SE3 matrices.
        R (optional): Nx3x3 array or tensor of rotation matrices.
        T (optional): Nx3x1 array or tensor of translation vectors.
    Returns:
        Nx4x4 inverse matrices. For tensors the result takes the dtype and device of
        the rotation block; for arrays it is built from `np.eye(4)`.
    Raises:
        ValueError: If the last two dimensions of `se3` are neither (4, 4) nor (3, 4).
    """
    if se3.shape[-2:] not in ((4, 4), (3, 4)):
        raise ValueError(f"se3 must be of shape (N,4,4), got {se3.shape}.")

    rot = se3[:, :3, :3] if R is None else R    # (N,3,3)
    trans = se3[:, :3, 3:] if T is None else T  # (N,3,1)

    if isinstance(se3, np.ndarray):
        rot_t = np.transpose(rot, (0, 2, 1))
        inv_trans = -np.matmul(rot_t, trans)
        inverse = np.tile(np.eye(4), (len(rot), 1, 1))
    else:
        rot_t = rot.transpose(1, 2)           # (N,3,3)
        inv_trans = -torch.bmm(rot_t, trans)  # (N,3,1)
        inverse = torch.eye(4, 4)[None].repeat(len(rot), 1, 1)
        inverse = inverse.to(dtype=rot.dtype, device=rot.device)

    # Start from identity so the bottom row stays [0, 0, 0, 1].
    inverse[:, :3, :3] = rot_t
    inverse[:, :3, 3:] = inv_trans
    return inverse
def pano_depth_to_points(depth_map, original_pano_shape=(560, 280), crop_ratio=0.15):
    """
    Lift batched, vertically cropped panoramic depth maps to per-pixel 3D points (torch).

    Row indices are offset by the cropped-away top band so that latitudes refer to the
    original, uncropped panorama.
    Args:
        depth_map (torch.Tensor): Cropped depth map, shape [B, S, H_crop, W, 1]
        original_pano_shape (tuple): Uncropped panorama size (W_ori, H_ori), default (560, 280)
        crop_ratio (float): Fraction cropped from the top (and bottom), default 0.15
    Returns:
        torch.Tensor: Points (x, y, z) with shape [B, S, H_crop, W, 3]
    """
    assert depth_map.dim() == 5 and depth_map.shape[-1] == 1, \
        f"Input must be [B, S, H_crop, W, 1], got {depth_map.shape}"
    _, _, rows_crop, cols, _ = depth_map.shape
    pano_w, pano_h = original_pano_shape
    dev = depth_map.device

    # Per-pixel row/column index grids, both of shape (H_crop, W).
    row_idx, col_idx = torch.meshgrid(
        torch.arange(rows_crop, device=dev),
        torch.arange(cols, device=dev),
        indexing='ij',
    )
    # Shift rows back into the coordinate frame of the uncropped panorama.
    row_idx = row_idx + int(crop_ratio * pano_h)

    # Image centre maps to latitude 0 / longitude 0.
    latitude = (row_idx / pano_h - 0.5) * torch.pi
    longitude = (col_idx / pano_w - 0.5) * 2 * torch.pi

    radius = depth_map.squeeze(-1)                # [B, S, H_crop, W]
    horizontal = radius * torch.cos(latitude)     # projection onto the x-z plane
    return torch.stack(
        (
            horizontal * torch.sin(longitude),    # x
            radius * torch.sin(latitude),         # y
            horizontal * torch.cos(longitude),    # z
        ),
        dim=-1,
    )
def points_to_pano_depth(points):
    """
    Reduce per-pixel 3D points to panoramic ray depth, i.e. each point's Euclidean norm.

    Only the length of each point vector is used; its direction is ignored.
    Args:
        points (torch.Tensor): 3D points, shape [B, S, H, W, 3]
    Returns:
        torch.Tensor: Ray depth, shape [B, S, H, W, 1]
    """
    assert points.dim() == 5 and points.shape[-1] == 3, \
        f"Input point cloud must be [B, S, H, W, 3], got {points.shape}"
    # sqrt(x^2 + y^2 + z^2), keeping a trailing singleton channel
    return points.norm(dim=-1, keepdim=True)
def camera_points_to_rotated_points(cam_points, R):
    """
    Apply one rotation matrix per (batch, frame) to every point of that frame: p' = R @ p.
    Args:
        cam_points (torch.Tensor): Camera-frame points, shape [B, S, H, W, 3]
        R (torch.Tensor): Rotation matrices, shape [B, S, 3, 3]
    Returns:
        torch.Tensor: Rotated points, shape [B, S, H, W, 3]
    """
    assert cam_points.dim() == 5 and cam_points.shape[-1] == 3, \
        f"Camera points must be [B, S, H, W, 3], got {cam_points.shape}"
    assert R.dim() == 4 and R.shape[2:] == (3, 3), \
        f"Rotation matrices R must be [B, S, 3, 3], got {R.shape}"
    assert cam_points.shape[:2] == R.shape[:2], \
        f"Batch/Sequence dim mismatch: cam_points {cam_points.shape[:2]} vs R {R.shape[:2]}"

    column_vectors = cam_points[..., None]      # [B, S, H, W, 3, 1]
    per_frame_R = R[:, :, None, None, :, :]     # [B, S, 1, 1, 3, 3], broadcast over H and W
    return torch.matmul(per_frame_R, column_vectors).squeeze(-1)
def rotated_points_to_world_points(rotated_points, t):
    """
    Add one translation vector per (batch, frame) to every point of that frame.
    Args:
        rotated_points (torch.Tensor): Rotated points, shape [B, S, H, W, 3]
        t (torch.Tensor): Translation vectors, shape [B, S, 3]
    Returns:
        torch.Tensor: Translated points, shape [B, S, H, W, 3]
    """
    assert rotated_points.dim() == 5 and rotated_points.shape[-1] == 3, \
        f"Rotated points must be [B, S, H, W, 3], got {rotated_points.shape}"
    assert t.dim() == 3 and t.shape[-1] == 3, \
        f"Translation t must be [B, S, 3], got {t.shape}"
    assert rotated_points.shape[:2] == t.shape[:2], \
        f"Batch/Sequence dim mismatch: rotated_points {rotated_points.shape[:2]} vs t {t.shape[:2]}"

    # [B, S, 3] -> [B, S, 1, 1, 3] so the offset broadcasts over H and W.
    return rotated_points + t[:, :, None, None, :]
def unproject_depth_to_world_points(depth, extrinsic, size=560):
    '''
    Chain depth -> camera points -> rotated points -> translated (world) points.

    The helpers called here assert batched inputs, and `extrinsic` is indexed with two
    leading axes, so both arguments need a batch and a frame dimension.
    Args:
        depth: [B, S, H, W, 1] panoramic depth
        extrinsic: [B, S, 4, 4]; the top-left 3x3 block is used as the rotation and the
            first three rows of the last column as the translation
        size: width of the original panorama; its height is taken as size // 2
    Returns:
        world_points: [B, S, H, W, 3]
    '''
    points_cam = pano_depth_to_points(depth, original_pano_shape=(size, size // 2))
    rotation = extrinsic[:, :, :3, :3]
    points_rot = camera_points_to_rotated_points(points_cam, rotation)
    translation = extrinsic[:, :, :3, 3]
    return rotated_points_to_world_points(points_rot, translation)

argus/utils/normalization.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+from typing import Optional, Tuple
+from argus.utils.geometry import closed_form_inverse_se3
def cal_scale_by_points(points: torch.Tensor, point_masks: torch.Tensor) -> torch.Tensor:
    """
    Mean distance from the origin of the masked-in points, one value per batch element.

    Args:
        points: Points whose last axis holds the coordinates; axes 1-3 of the resulting
            norm are reduced (assumes a [B, S, H, W, 3] layout — TODO confirm with callers).
        point_masks: Mask/weights multiplied onto the per-point distances, same leading shape.
    Returns:
        Tensor of shape [B] clamped to [1e-6, 1e6]. The 1e-3 added to the count keeps the
        division finite when no point is valid.
    """
    reduce_dims = [1, 2, 3]
    radii = points.norm(dim=-1)
    masked_total = (radii * point_masks).sum(dim=reduce_dims)
    num_valid = point_masks.sum(dim=reduce_dims)
    mean_radius = masked_total / (num_valid + 1e-3)
    return mean_radius.clamp(min=1e-6, max=1e6)
def normalize_camera_extrinsics_and_points_batch(
    extrinsics: torch.Tensor,
    cam_points: torch.Tensor,
    depths: torch.Tensor,
    point_masks: torch.Tensor,
    scale_mode: str = "none",
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Re-express poses relative to the first frame and optionally rescale the scene.

    Args:
        extrinsics: (B, S, 4, 4) pose matrices.
        cam_points: Camera-frame points, broadcast against (B, S, 1, 1, 3, 3) rotations,
            i.e. (B, S, H, W, 3).
        depths: Depth maps; scaled with a (B, 1, 1, 1) factor, i.e. (B, S, H, W).
        point_masks: Validity mask used only by the "avg_dist" scale estimate.
        scale_mode: "none" (no scaling), "abs" (divide by the constant 10.0) or
            "avg_dist" (divide by the per-batch mean distance of valid world points).
    Returns:
        (new_extrinsics, new_cam_points, new_rotated_points, new_world_points, new_depths).
        Inputs are not modified; scaling is applied to clones / freshly computed tensors.
    Raises:
        ValueError: If `scale_mode` is not one of the three supported values.
    """
    assert extrinsics.ndim == 4 and extrinsics.shape[2:] == (4, 4), \
        f"Extrinsics must be (B, S, 4, 4), got {extrinsics.shape}"

    # Left-multiply every pose by the inverse of frame 0, so frame 0 becomes identity.
    first_pose_inv = closed_form_inverse_se3(extrinsics[:, 0, :, :])       # (B, 4, 4)
    rel_extrinsics = torch.matmul(first_pose_inv.unsqueeze(1), extrinsics)  # (B, S, 4, 4)

    # Work on copies so the caller's tensors survive the in-place scaling below.
    scaled_depths = depths.clone()
    scaled_cam_points = cam_points.clone()

    rotation = rel_extrinsics[:, :, :3, :3]   # (B, S, 3, 3)
    translation = rel_extrinsics[:, :, :3, 3]  # (B, S, 3)
    # (B,S,1,1,3,3) @ (B,S,H,W,3,1) -> (B,S,H,W,3)
    rotated = torch.matmul(
        rotation.unsqueeze(2).unsqueeze(3), scaled_cam_points.unsqueeze(-1)
    ).squeeze(-1)
    world = rotated + translation.unsqueeze(2).unsqueeze(3)

    # Pick one divisor per tensor rank; None means "leave everything as is".
    if scale_mode == "avg_dist":
        avg_scale = cal_scale_by_points(world, point_masks)  # (B,)
        divisors = (
            avg_scale.view(-1, 1, 1),        # translations (B, S, 3)
            avg_scale.view(-1, 1, 1, 1),     # depths (B, S, H, W)
            avg_scale.view(-1, 1, 1, 1, 1),  # points (B, S, H, W, 3)
        )
    elif scale_mode == "abs":
        metric_scale = 10.0
        divisors = (metric_scale, metric_scale, metric_scale)
    elif scale_mode == "none":
        divisors = None
    else:
        raise ValueError(f"Unknown scale_mode: {scale_mode}")

    if divisors is not None:
        trans_div, depth_div, point_div = divisors
        rel_extrinsics[:, :, :3, 3] /= trans_div
        scaled_depths /= depth_div
        scaled_cam_points /= point_div
        rotated /= point_div
        world /= point_div

    return rel_extrinsics, scaled_cam_points, rotated, world, scaled_depths

argus/utils/pose_enc.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import torch
+from typing import Tuple, Union
+from .rotation import quat_to_mat, mat_to_quat
def extri_to_pose_encoding360(
    extrinsics: torch.Tensor,
    pose_encoding_type: str = "absT_quaR"
) -> torch.Tensor:
    """
    Convert camera extrinsic matrices to a compact pose encoding (translation + quaternion).

    The translation column is copied as-is and the 3x3 rotation block is converted with
    `mat_to_quat`; no coordinate-frame conversion is performed here.
    Args:
        extrinsics: Extrinsic matrices with shape [B, S, 3, 4] or [B, S, 4, 4]
            - B: Batch size
            - S: Sequence length (number of frames)
            Only the top-left 3x3 block (R) and the first three rows of column 3 (t) are read.
        pose_encoding_type: Encoding format; only "absT_quaR" is supported:
            - "absT_quaR": translation (3D) followed by quaternion rotation (4D)
    Returns:
        float32 tensor with shape [B, S, 7]
            - [:3]: Translation vector
            - [3:7]: Rotation quaternion as returned by `mat_to_quat`
    Raises:
        NotImplementedError: If an unsupported pose encoding type is provided
    """
    # Reject unknown encodings up front, before touching the data.
    if pose_encoding_type != "absT_quaR":
        raise NotImplementedError(f"Pose encoding type '{pose_encoding_type}' not supported. Only 'absT_quaR' is implemented.")
    # The same slices work for 3x4 and 4x4 inputs.
    R = extrinsics[:, :, :3, :3]  # [B, S, 3, 3] - rotation matrix
    T = extrinsics[:, :, :3, 3]   # [B, S, 3]    - translation vector
    quat = mat_to_quat(R)         # [B, S, 4]
    pose_encoding = torch.cat([T, quat], dim=-1).float()
    return pose_encoding
def pose_encoding_to_extri360(
    pose_encoding: torch.Tensor,
    pose_encoding_type: str = "absT_quaR"
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Convert a compact pose encoding back to homogeneous extrinsic matrices.

    Rebuilds [R|t; 0 0 0 1] from the translation and quaternion parts of the encoding and
    returns the last two channels of the encoding as confidence scores.
    Args:
        pose_encoding: Encoded pose tensor with shape [..., 9] (typically [B, S, 9])
            - [:3]: Translation vector (T)
            - [3:7]: Rotation quaternion (quat), converted with `quat_to_mat`
            - [-2:]: Confidence scores for translation and rotation
        pose_encoding_type: Encoding format; only "absT_quaR" is supported:
            - "absT_quaR": translation (3D) followed by quaternion rotation (4D)
    Returns:
        Tuple containing:
            1. extrinsics: Matrices with shape [..., 4, 4]
            2. conf: The last two channels of `pose_encoding`, shape [..., 2]
    Raises:
        NotImplementedError: If unsupported pose encoding type is provided
    """
    if pose_encoding_type != "absT_quaR":
        raise NotImplementedError(f"Pose encoding type '{pose_encoding_type}' not supported. Only 'absT_quaR' is implemented.")
    T = pose_encoding[..., :3]       # [..., 3] - translation vector
    quat = pose_encoding[..., 3:7]   # [..., 4] - rotation quaternion
    R = quat_to_mat(quat)            # [..., 3, 3]
    # [R|t]
    extri_3x4 = torch.cat([R, T[..., None]], dim=-1)  # [..., 3, 4]
    # Bottom row [0, 0, 0, 1], broadcast over whatever leading dims the input has.
    homogeneous_row = torch.tensor(
        [0, 0, 0, 1],
        device=extri_3x4.device,
        dtype=extri_3x4.dtype
    ).expand(*extri_3x4.shape[:-2], 1, 4)  # [..., 1, 4]
    extrinsics = torch.cat((extri_3x4, homogeneous_row), dim=-2)  # [..., 4, 4]
    # NOTE: with fewer than 9 channels this slice overlaps the quaternion.
    conf = pose_encoding[..., -2:]  # [..., 2]
    return extrinsics, conf

argus/utils/rotation.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import torch
+import numpy as np
+import torch.nn.functional as F
def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Convert scalar-last quaternions (XYZW / ijkr order) to rotation matrices.

    The input is normalized to unit length first, so non-unit quaternions are accepted.
    Args:
        quaternions: quaternions with real part last,
            as tensor of shape (..., 4).
    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    unit = F.normalize(quaternions, dim=-1)
    x, y, z, w = torch.unbind(unit, -1)
    # 2 / |q|^2; approximately 2 after normalization.
    gain = 2.0 / (unit * unit).sum(-1)

    row0 = (
        1 - gain * (y * y + z * z),
        gain * (x * y - z * w),
        gain * (x * z + y * w),
    )
    row1 = (
        gain * (x * y + z * w),
        1 - gain * (x * x + z * z),
        gain * (y * z - x * w),
    )
    row2 = (
        gain * (x * z - y * w),
        gain * (y * z + x * w),
        1 - gain * (x * x + y * y),
    )
    # Nine entries in row-major order, folded into a trailing 3x3.
    flat = torch.stack(row0 + row1 + row2, -1)
    return flat.reshape(unit.shape[:-1] + (3, 3))
def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Convert rotation matrices to scalar-last quaternions (XYZW / ijkr order).

    Four algebraically equivalent candidates are formed, one per quaternion component
    used as the divisor, and the candidate with the largest divisor is kept for each
    matrix. The result is passed through `standardize_quaternion`.
    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).
    Returns:
        quaternions with real part last, as tensor of shape (..., 4).
    Raises:
        ValueError: If the last two dimensions are not (3, 3).
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    lead = matrix.shape[:-2]
    flat = matrix.reshape(lead + (9,))
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(flat, dim=-1)

    # Twice the magnitudes |r|, |i|, |j|, |k| (negative radicands clamp to 0).
    radicands = torch.stack(
        [
            1.0 + m00 + m11 + m22,
            1.0 + m00 - m11 - m22,
            1.0 - m00 + m11 - m22,
            1.0 - m00 - m11 + m22,
        ],
        dim=-1,
    )
    q_abs = _sqrt_positive_part(radicands)

    # Row n holds the (r, i, j, k) quaternion multiplied by its n-th component.
    candidate_rows = [
        [q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01],
        [m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20],
        [m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21],
        [m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2],
    ]
    quat_by_rijk = torch.stack(
        [torch.stack(row, dim=-1) for row in candidate_rows],
        dim=-2,
    )

    # The 0.1 floor only protects candidates that will not be selected anyway;
    # its exact value does not matter.
    floor = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(floor))

    # Keep, per matrix, the best-conditioned candidate (largest divisor).
    best = F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5
    picked = quat_candidates[best, :].reshape(lead + (4,))

    # rijk -> ijkr (scalar-last), then force a non-negative real part.
    picked = picked[..., [1, 2, 3, 0]]
    return standardize_quaternion(picked)
def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Compute sqrt(max(0, x)) elementwise, with a zero subgradient where x is 0.

    When autograd is enabled the square root is evaluated only on the strictly
    positive entries, so the remaining entries contribute no gradient.
    """
    is_positive = x > 0
    if not torch.is_grad_enabled():
        return torch.where(is_positive, torch.sqrt(x), torch.zeros_like(x))
    out = torch.zeros_like(x)
    out[is_positive] = torch.sqrt(x[is_positive])
    return out
def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Flip the sign of scalar-last quaternions so that the real part is non-negative.

    q and -q encode the same rotation, so this only selects a canonical representative.
    Args:
        quaternions: Quaternions with real part last,
            as tensor of shape (..., 4).
    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    # Keep the last axis (3:4, not 3) so the mask broadcasts over all four components.
    negative_real = quaternions[..., 3:4] < 0
    return torch.where(negative_real, -quaternions, quaternions)